001package org.biopax.paxtools.pattern.util; 002 003import org.biopax.paxtools.controller.PathAccessor; 004import org.biopax.paxtools.io.SimpleIOHandler; 005import org.biopax.paxtools.model.Model; 006import org.biopax.paxtools.model.level3.SimplePhysicalEntity; 007import org.biopax.paxtools.model.level3.SmallMoleculeReference; 008 009import java.io.FileInputStream; 010import java.io.FileNotFoundException; 011import java.util.*; 012 013/** 014 * This class is used for finding a standard name for a small molecule. During detection of 015 * ubiquitous small molecules, we map the duplicated small molecules to one standard name, otherwise 016 * their degree would be divided and this would spoil the detection method. 017 * 018 * @author Ozgun Babur 019 */ 020public class ChemicalNameNormalizer 021{ 022 /** 023 * Mapping from the a small molecule to the one that contains the standard name. 024 */ 025 Map<SmallMoleculeReference, SmallMoleculeReference> map; 026 027 public static void main(String[] args) throws FileNotFoundException 028 { 029 SimpleIOHandler reader = new SimpleIOHandler(); 030 Model model = reader.convertFromOWL(new FileInputStream( 031 "/home/ozgun/Projects/biopax-pattern/All-Data.owl")); 032 033 new ChemicalNameNormalizer(model); 034 } 035 036 /** 037 * Gets the standard name of the small molecule. 038 * @param smr the molecule to check standard name 039 * @return standard name 040 */ 041 public String getName(SmallMoleculeReference smr) 042 { 043 if (map.containsKey(smr)) return map.get(smr).getDisplayName(); 044 else return smr.getDisplayName(); 045 } 046 047 /** 048 * Constructor that also infers all the mapping. 049 * @param model the big picture 050 */ 051 public ChemicalNameNormalizer(Model model) 052 { 053 map = new HashMap<SmallMoleculeReference, SmallMoleculeReference>(); 054 055 Set<SmallMoleculeReference> standard = new HashSet<SmallMoleculeReference>(); 056 Set<SmallMoleculeReference> other = new HashSet<SmallMoleculeReference>(); 057 058 for (SmallMoleculeReference smr : model.getObjects(SmallMoleculeReference.class)) 059 { 060 if (smr.getRDFId().startsWith("http://identifiers")) standard.add(smr); 061 else other.add(smr); 062 } 063 064 System.out.println("Standard smr = " + standard.size()); 065 System.out.println("Other smr = " + other.size()); 066 067 Map<SmallMoleculeReference, Set<String>> smrNames = collectNames(false, standard, other); 068 Map<SmallMoleculeReference, Set<String>> smNames = collectNames(true, standard, other); 069 070 071 // Unify names of standards 072 073 Map<SmallMoleculeReference, Set<SmallMoleculeReference>> standardSelfMatch = 074 getSelfMatching(standard, smrNames, smNames, true); 075 076 for (SmallMoleculeReference smr : standardSelfMatch.keySet()) 077 { 078 Set<SmallMoleculeReference> matches = standardSelfMatch.get(smr); 079 if (matches.size() == 1) 080 { 081 SmallMoleculeReference m = matches.iterator().next(); 082 083 if (smr.getDisplayName().length() <= m.getDisplayName().length()) 084 { 085 map.put(smr, m); 086 standard.remove(smr); 087 } 088 } 089 else 090 { 091 System.out.print(smr.getDisplayName() + " matched more than one"); 092 for (SmallMoleculeReference match : matches) 093 { 094 System.out.print("\t" + match.getDisplayName()); 095 } 096 System.out.println(); 097 } 098 } 099 100 Map<SmallMoleculeReference, Set<SmallMoleculeReference>> selfMatch = 101 getSelfMatching(other, smrNames, smNames, false); 102 103 enrichNamesWithMatchings(selfMatch, smrNames); 104 enrichNamesWithMatchings(selfMatch, smNames); 105 106 Set<SmallMoleculeReference> missed = new HashSet<SmallMoleculeReference>(); 107 Map<SmallMoleculeReference, Set<SmallMoleculeReference>> multiMap = 108 new HashMap<SmallMoleculeReference, Set<SmallMoleculeReference>>(); 109 110 for (SmallMoleculeReference smr : other) 111 { 112 Set<SmallMoleculeReference> matching = getMatching(smr, standard, smrNames, smNames); 113 114 if (matching.size() == 1) 115 { 116 map.put(smr, matching.iterator().next()); 117 } 118 else if (matching.size() > 1) 119 { 120 multiMap.put(smr, matching); 121 } 122 else 123 { 124 missed.add(smr); 125 } 126 } 127 128 for (SmallMoleculeReference smr : multiMap.keySet()) 129 { 130 if (isGeneric(smr)) continue; 131 132 Set<SmallMoleculeReference> matches = multiMap.get(smr); 133 SmallMoleculeReference rep = selectRepresentative(matches, map); 134 135 map.put(smr, rep); 136 137 for (SmallMoleculeReference match : matches) 138 { 139 if (match == rep) continue; 140 if (map.containsKey(match)) 141 { 142 if (map.get(match) == rep) continue; 143 144 System.out.println("Already matched " + match.getDisplayName() + " to " + 145 map.get(match).getDisplayName() + ". This one is " + rep.getDisplayName()); 146 } 147 else if (map.values().contains(match)) 148 { 149 System.out.println(match.getDisplayName() + " was mapped from another chem"); 150 } 151 else map.put(match, rep); 152 } 153 } 154 155 Iterator<SmallMoleculeReference> iter = multiMap.keySet().iterator(); 156 while (iter.hasNext()) 157 { 158 SmallMoleculeReference smr = iter.next(); 159 if (map.containsKey(smr)) iter.remove(); 160 } 161 162 System.out.println("matchCnt = " + map.size()); 163 System.out.println("multiCnt = " + multiMap.size()); 164 System.out.println("missCnt = " + missed.size()); 165 System.out.println(); 166 167// printTopPart("Multi match", multiMap, 50); 168// printTopPart("Miss-match", missed, 50); 169 } 170 171 private Map<SmallMoleculeReference, Set<String>> collectNames(boolean peLevel, 172 Set<SmallMoleculeReference>... sets) 173 { 174 Map<SmallMoleculeReference, Set<String>> map = 175 new HashMap<SmallMoleculeReference, Set<String>>(); 176 177 for (Set<SmallMoleculeReference> set : sets) 178 { 179 for (SmallMoleculeReference smr : set) 180 { 181 map.put(smr, new HashSet<String>()); 182 183 if (!peLevel) 184 { 185 for (String name : smr.getName()) 186 { 187 map.get(smr).add(name.toLowerCase()); 188 } 189 } 190 else 191 { 192 for (SimplePhysicalEntity sm : smr.getEntityReferenceOf()) 193 { 194 for (String name : sm.getName()) 195 { 196 map.get(smr).add(name.toLowerCase()); 197 } 198 } 199 } 200 } 201 } 202 return map; 203 } 204 205 private Set<SmallMoleculeReference> getNameNormalizedMatching(SmallMoleculeReference smr, 206 Set<SmallMoleculeReference> smrs) 207 { 208 String name = null; 209 String dispName = smr.getDisplayName().toLowerCase(); 210 if (dispName.endsWith("-)") || dispName.endsWith("+)")) 211 { 212 name = dispName.substring(0, dispName.lastIndexOf("(")).trim(); 213 } 214 else if (dispName.endsWith(" zwitterion")) 215 { 216 name = dispName.substring(0, dispName.lastIndexOf(" ")).trim(); 217 } 218 219 if (name == null) return Collections.emptySet(); 220 221 Set<SmallMoleculeReference> matching = new HashSet<SmallMoleculeReference>(); 222 223 for (SmallMoleculeReference ref : smrs) 224 { 225 if (ref.getDisplayName().toLowerCase().equals(name)) matching.add(ref); 226 } 227 228 return matching; 229 } 230 231 private Set<SmallMoleculeReference> getMatching(SmallMoleculeReference smr, 232 Set<SmallMoleculeReference> standard, Map<SmallMoleculeReference, Set<String>> smrNames, 233 Map<SmallMoleculeReference, Set<String>> smNames) 234 { 235 Set<SmallMoleculeReference> matching = new HashSet<SmallMoleculeReference>(); 236 237 for (SmallMoleculeReference std : standard) 238 { 239 if (std.getDisplayName() != null && smr.getDisplayName() != null && 240 std.getDisplayName().toLowerCase().equals(smr.getDisplayName().toLowerCase())) 241 matching.add(std); 242 } 243 244 if (!matching.isEmpty()) return matching; 245 246 for (SmallMoleculeReference std : standard) 247 { 248 for (String name : smrNames.get(smr)) 249 { 250 if(smrNames.get(std).contains(name)) matching.add(std); 251 } 252 } 253 254 if (!matching.isEmpty()) return matching; 255 256 for (SmallMoleculeReference std : standard) 257 { 258 for (String name : smrNames.get(smr)) 259 { 260 if(smNames.get(std).contains(name)) matching.add(std); 261 } 262 } 263 264 if (!matching.isEmpty()) return matching; 265 266 for (SmallMoleculeReference std : standard) 267 { 268 for (String name : smNames.get(smr)) 269 { 270 if(smrNames.get(std).contains(name)) matching.add(std); 271 } 272 } 273 274 if (!matching.isEmpty()) return matching; 275 276 for (SmallMoleculeReference std : standard) 277 { 278 for (String name : smNames.get(smr)) 279 { 280 if(smNames.get(std).contains(name)) matching.add(std); 281 } 282 } 283 284 return matching; 285 } 286 287 private Map<SmallMoleculeReference, Set<SmallMoleculeReference>> getSelfMatching( 288 Set<SmallMoleculeReference> smrs, Map<SmallMoleculeReference, Set<String>> smrNames, 289 Map<SmallMoleculeReference, Set<String>> smNames, boolean normalizeName) 290 { 291 Map<SmallMoleculeReference, Set<SmallMoleculeReference>> map = 292 new HashMap<SmallMoleculeReference, Set<SmallMoleculeReference>>(); 293 294 for (SmallMoleculeReference smr : smrs) 295 { 296 Set<SmallMoleculeReference> matching = normalizeName ? 297 getNameNormalizedMatching(smr, smrs) : 298 getMatching(smr, smrs, smrNames, smNames); 299 300 assert normalizeName || !matching.isEmpty(); // it should at least detect itself 301 302 matching.remove(smr); 303 if (!matching.isEmpty()) map.put(smr, matching); 304 } 305 306 return map; 307 } 308 309 private static final PathAccessor INTER_ACC = 310 new PathAccessor("SmallMoleculeReference/entityReferenceOf/participantOf"); 311 312 private Map<SmallMoleculeReference, Integer> getInteractionCounts( 313 Set<SmallMoleculeReference>... smrSets) 314 { 315 Map<SmallMoleculeReference, Integer> cnt = new HashMap<SmallMoleculeReference, Integer>(); 316 317 for (Set<SmallMoleculeReference> smrSet : smrSets) 318 { 319 for (SmallMoleculeReference smr : smrSet) 320 { 321 if (cnt.containsKey(smr)) continue; 322 323 cnt.put(smr, INTER_ACC.getValueFromBean(smr).size()); 324 } 325 } 326 return cnt; 327 } 328 329 private List<SmallMoleculeReference> getSortedList(Collection<SmallMoleculeReference> smrs, 330 final Map<SmallMoleculeReference, Integer> cnt) 331 { 332 List<SmallMoleculeReference> list = new ArrayList<SmallMoleculeReference>(smrs); 333 Collections.sort(list, new Comparator<SmallMoleculeReference>() 334 { 335 @Override 336 public int compare(SmallMoleculeReference o1, SmallMoleculeReference o2) 337 { 338 return cnt.get(o2).compareTo(cnt.get(o1)); 339 } 340 }); 341 342 return list; 343 } 344 345 private void printTopPart(String listName, Set<SmallMoleculeReference> smrs, int upTo) 346 { 347 Map<SmallMoleculeReference, Integer> cnt = getInteractionCounts(smrs); 348 List<SmallMoleculeReference> list = getSortedList(smrs, cnt); 349 350 int i = 0; 351 352 System.out.println(listName + "\n--------------"); 353 for (SmallMoleculeReference smr : list) 354 { 355 System.out.println(cnt.get(smr) + "\t" + smr.getDisplayName()); 356 357 if (++i == upTo) break; 358 } 359 System.out.println(); 360 } 361 362 private void printTopPart(String listName, 363 Map<SmallMoleculeReference, Set<SmallMoleculeReference>> smrMap, int upTo) 364 { 365 Map<SmallMoleculeReference, Integer> cnt = getInteractionCounts(smrMap.keySet()); 366 List<SmallMoleculeReference> list = getSortedList(smrMap.keySet(), cnt); 367 368 int i = 0; 369 370 System.out.println(listName + "\n--------------"); 371 for (SmallMoleculeReference smr : list) 372 { 373 System.out.print(cnt.get(smr) + "\t" + smr.getDisplayName() + "\t"); 374 375 for (SmallMoleculeReference match : smrMap.get(smr)) 376 { 377 System.out.print("\t" + match.getDisplayName()); 378 } 379 System.out.println(); 380 381 if (++i == upTo) break; 382 } 383 System.out.println(); 384 } 385 386 private void enrichNamesWithMatchings( 387 Map<SmallMoleculeReference, Set<SmallMoleculeReference>> matchMap, 388 Map<SmallMoleculeReference, Set<String>> names) 389 { 390 for (SmallMoleculeReference smr : matchMap.keySet()) 391 { 392 for (SmallMoleculeReference match : matchMap.get(smr)) 393 { 394 names.get(smr).addAll(names.get(match)); 395 } 396 } 397 } 398 399 private boolean isGeneric(SmallMoleculeReference smr) 400 { 401 if (!smr.getMemberEntityReference().isEmpty()) return true; 402 403 for (SimplePhysicalEntity sm : smr.getEntityReferenceOf()) 404 { 405 if (!sm.getMemberPhysicalEntity().isEmpty()) return true; 406 } 407 408 return false; 409 } 410 411 private SmallMoleculeReference selectRepresentative(Set<SmallMoleculeReference> smrs, 412 final Map<SmallMoleculeReference, SmallMoleculeReference> map) 413 { 414 List<SmallMoleculeReference> list = new ArrayList<SmallMoleculeReference>(smrs); 415 final Map<SmallMoleculeReference, Integer> cnt = getInteractionCounts(smrs); 416 417 Collections.sort(list, new Comparator<SmallMoleculeReference>() 418 { 419 @Override 420 public int compare(SmallMoleculeReference o1, SmallMoleculeReference o2) 421 { 422 if (map.containsValue(o1)) 423 { 424 if (!map.containsValue(o2)) return -1; 425 } 426 else 427 { 428 if (map.containsValue(o2)) return 1; 429 } 430 431 if (!cnt.get(o1).equals(cnt.get(o2))) return cnt.get(o2).compareTo(cnt.get(o1)); 432 433 if (o1.getDisplayName().endsWith(")")) 434 { 435 if (!o2.getDisplayName().endsWith(")")) return -1; 436 } 437 else if (o2.getDisplayName().endsWith(")")) return 1; 438 439 return o1.getDisplayName().compareTo(o2.getDisplayName()); 440 } 441 }); 442 443 return list.get(0); 444 } 445}