001package org.biopax.paxtools.pattern.util;
002
003import org.biopax.paxtools.controller.PathAccessor;
004import org.biopax.paxtools.io.SimpleIOHandler;
005import org.biopax.paxtools.model.Model;
006import org.biopax.paxtools.model.level3.SimplePhysicalEntity;
007import org.biopax.paxtools.model.level3.SmallMoleculeReference;
008
009import java.io.FileInputStream;
010import java.io.FileNotFoundException;
011import java.util.*;
012
013/**
014 * This class is used for finding a standard name for a small molecule. During detection of
015 * ubiquitous small molecules, we map the duplicated small molecules to one standard name, otherwise
016 * their degree would be divided and this would spoil the detection method.
017 *
018 * @author Ozgun Babur
019 */
020public class ChemicalNameNormalizer
021{
022        /**
023         * Mapping from the a small molecule to the one that contains the standard name.
024         */
025        Map<SmallMoleculeReference, SmallMoleculeReference> map;
026
027        public static void main(String[] args) throws FileNotFoundException
028        {
029                SimpleIOHandler reader = new SimpleIOHandler();
030                Model model = reader.convertFromOWL(new FileInputStream(
031                        "/home/ozgun/Projects/biopax-pattern/All-Data.owl"));
032
033                new ChemicalNameNormalizer(model);
034        }
035
036        /**
037         * Gets the standard name of the small molecule.
038         * @param smr the molecule to check standard name
039         * @return standard name
040         */
041        public String getName(SmallMoleculeReference smr)
042        {
043                if (map.containsKey(smr)) return map.get(smr).getDisplayName();
044                else return smr.getDisplayName();
045        }
046
047        /**
048         * Constructor that also infers all the mapping.
049         * @param model the big picture
050         */
051        public ChemicalNameNormalizer(Model model)
052        {
053                map = new HashMap<SmallMoleculeReference, SmallMoleculeReference>();
054
055                Set<SmallMoleculeReference> standard = new HashSet<SmallMoleculeReference>();
056                Set<SmallMoleculeReference> other = new HashSet<SmallMoleculeReference>();
057
058                for (SmallMoleculeReference smr : model.getObjects(SmallMoleculeReference.class))
059                {
060                        if (smr.getRDFId().startsWith("http://identifiers")) standard.add(smr);
061                        else other.add(smr);
062                }
063
064                System.out.println("Standard smr = " + standard.size());
065                System.out.println("Other smr = " + other.size());
066
067                Map<SmallMoleculeReference, Set<String>> smrNames = collectNames(false, standard, other);
068                Map<SmallMoleculeReference, Set<String>> smNames = collectNames(true, standard, other);
069
070
071                // Unify names of standards
072
073                Map<SmallMoleculeReference, Set<SmallMoleculeReference>> standardSelfMatch =
074                        getSelfMatching(standard, smrNames, smNames, true);
075
076                for (SmallMoleculeReference smr : standardSelfMatch.keySet())
077                {
078                        Set<SmallMoleculeReference> matches = standardSelfMatch.get(smr);
079                        if (matches.size() == 1)
080                        {
081                                SmallMoleculeReference m = matches.iterator().next();
082
083                                if (smr.getDisplayName().length() <= m.getDisplayName().length())
084                                {
085                                        map.put(smr, m);
086                                        standard.remove(smr);
087                                }
088                        }
089                        else
090                        {
091                                System.out.print(smr.getDisplayName() + " matched more than one");
092                                for (SmallMoleculeReference match : matches)
093                                {
094                                        System.out.print("\t" + match.getDisplayName());
095                                }
096                                System.out.println();
097                        }
098                }
099
100                Map<SmallMoleculeReference, Set<SmallMoleculeReference>> selfMatch =
101                        getSelfMatching(other, smrNames, smNames, false);
102
103                enrichNamesWithMatchings(selfMatch, smrNames);
104                enrichNamesWithMatchings(selfMatch, smNames);
105
106                Set<SmallMoleculeReference> missed = new HashSet<SmallMoleculeReference>();
107                Map<SmallMoleculeReference, Set<SmallMoleculeReference>> multiMap =
108                        new HashMap<SmallMoleculeReference, Set<SmallMoleculeReference>>();
109
110                for (SmallMoleculeReference smr : other)
111                {
112                        Set<SmallMoleculeReference> matching = getMatching(smr, standard, smrNames, smNames);
113
114                        if (matching.size() == 1)
115                        {
116                                map.put(smr, matching.iterator().next());
117                        }
118                        else if (matching.size() > 1)
119                        {
120                                multiMap.put(smr, matching);
121                        }
122                        else
123                        {
124                                missed.add(smr);
125                        }
126                }
127
128                for (SmallMoleculeReference smr : multiMap.keySet())
129                {
130                        if (isGeneric(smr)) continue;
131
132                        Set<SmallMoleculeReference> matches = multiMap.get(smr);
133                        SmallMoleculeReference rep = selectRepresentative(matches, map);
134
135                        map.put(smr, rep);
136
137                        for (SmallMoleculeReference match : matches)
138                        {
139                                if (match == rep) continue;
140                                if (map.containsKey(match))
141                                {
142                                        if (map.get(match) == rep) continue;
143
144                                        System.out.println("Already matched " + match.getDisplayName() + " to " +
145                                                map.get(match).getDisplayName() + ". This one is " + rep.getDisplayName());
146                                }
147                                else if (map.values().contains(match))
148                                {
149                                        System.out.println(match.getDisplayName() + " was mapped from another chem");
150                                }
151                                else map.put(match, rep);
152                        }
153                }
154
155                Iterator<SmallMoleculeReference> iter = multiMap.keySet().iterator();
156                while (iter.hasNext())
157                {
158                        SmallMoleculeReference smr = iter.next();
159                        if (map.containsKey(smr)) iter.remove();
160                }
161
162                System.out.println("matchCnt = " + map.size());
163                System.out.println("multiCnt = " + multiMap.size());
164                System.out.println("missCnt = " + missed.size());
165                System.out.println();
166
167//              printTopPart("Multi match", multiMap, 50);
168//              printTopPart("Miss-match", missed, 50);
169        }
170
171        private Map<SmallMoleculeReference, Set<String>> collectNames(boolean peLevel,
172                Set<SmallMoleculeReference>... sets)
173        {
174                Map<SmallMoleculeReference, Set<String>> map =
175                        new HashMap<SmallMoleculeReference, Set<String>>();
176
177                for (Set<SmallMoleculeReference> set : sets)
178                {
179                        for (SmallMoleculeReference smr : set)
180                        {
181                                map.put(smr, new HashSet<String>());
182
183                                if (!peLevel)
184                                {
185                                        for (String name : smr.getName())
186                                        {
187                                                map.get(smr).add(name.toLowerCase());
188                                        }
189                                }
190                                else
191                                {
192                                        for (SimplePhysicalEntity sm : smr.getEntityReferenceOf())
193                                        {
194                                                for (String name : sm.getName())
195                                                {
196                                                        map.get(smr).add(name.toLowerCase());
197                                                }
198                                        }
199                                }
200                        }
201                }
202                return map;
203        }
204
205        private Set<SmallMoleculeReference> getNameNormalizedMatching(SmallMoleculeReference smr,
206                Set<SmallMoleculeReference> smrs)
207        {
208                String name = null;
209                String dispName = smr.getDisplayName().toLowerCase();
210                if (dispName.endsWith("-)") || dispName.endsWith("+)"))
211                {
212                        name = dispName.substring(0, dispName.lastIndexOf("(")).trim();
213                }
214                else if (dispName.endsWith(" zwitterion"))
215                {
216                        name = dispName.substring(0, dispName.lastIndexOf(" ")).trim();
217                }
218
219                if (name == null) return Collections.emptySet();
220
221                Set<SmallMoleculeReference> matching = new HashSet<SmallMoleculeReference>();
222
223                for (SmallMoleculeReference ref : smrs)
224                {
225                        if (ref.getDisplayName().toLowerCase().equals(name)) matching.add(ref);
226                }
227
228                return matching;
229        }
230
231        private Set<SmallMoleculeReference> getMatching(SmallMoleculeReference smr,
232                Set<SmallMoleculeReference> standard, Map<SmallMoleculeReference, Set<String>> smrNames,
233                Map<SmallMoleculeReference, Set<String>> smNames)
234        {
235                Set<SmallMoleculeReference> matching = new HashSet<SmallMoleculeReference>();
236
237                for (SmallMoleculeReference std : standard)
238                {
239                        if (std.getDisplayName() != null && smr.getDisplayName() != null &&
240                                std.getDisplayName().toLowerCase().equals(smr.getDisplayName().toLowerCase()))
241                                matching.add(std);
242                }
243
244                if (!matching.isEmpty()) return matching;
245
246                for (SmallMoleculeReference std : standard)
247                {
248                        for (String name : smrNames.get(smr))
249                        {
250                                if(smrNames.get(std).contains(name)) matching.add(std);
251                        }
252                }
253
254                if (!matching.isEmpty()) return matching;
255
256                for (SmallMoleculeReference std : standard)
257                {
258                        for (String name : smrNames.get(smr))
259                        {
260                                if(smNames.get(std).contains(name)) matching.add(std);
261                        }
262                }
263
264                if (!matching.isEmpty()) return matching;
265
266                for (SmallMoleculeReference std : standard)
267                {
268                        for (String name : smNames.get(smr))
269                        {
270                                if(smrNames.get(std).contains(name)) matching.add(std);
271                        }
272                }
273
274                if (!matching.isEmpty()) return matching;
275
276                for (SmallMoleculeReference std : standard)
277                {
278                        for (String name : smNames.get(smr))
279                        {
280                                if(smNames.get(std).contains(name)) matching.add(std);
281                        }
282                }
283
284                return matching;
285        }
286
287        private Map<SmallMoleculeReference, Set<SmallMoleculeReference>> getSelfMatching(
288                Set<SmallMoleculeReference> smrs, Map<SmallMoleculeReference, Set<String>> smrNames,
289                Map<SmallMoleculeReference, Set<String>> smNames, boolean normalizeName)
290        {
291                Map<SmallMoleculeReference, Set<SmallMoleculeReference>> map =
292                        new HashMap<SmallMoleculeReference, Set<SmallMoleculeReference>>();
293
294                for (SmallMoleculeReference smr : smrs)
295                {
296                        Set<SmallMoleculeReference> matching = normalizeName ?
297                                getNameNormalizedMatching(smr, smrs) :
298                                getMatching(smr, smrs, smrNames, smNames);
299
300                        assert normalizeName || !matching.isEmpty(); // it should at least detect itself
301
302                        matching.remove(smr);
303                        if (!matching.isEmpty()) map.put(smr, matching);
304                }
305
306                return map;
307        }
308
309        private static final PathAccessor INTER_ACC =
310                new PathAccessor("SmallMoleculeReference/entityReferenceOf/participantOf");
311
312        private Map<SmallMoleculeReference, Integer> getInteractionCounts(
313                Set<SmallMoleculeReference>... smrSets)
314        {
315                Map<SmallMoleculeReference, Integer> cnt = new HashMap<SmallMoleculeReference, Integer>();
316
317                for (Set<SmallMoleculeReference> smrSet : smrSets)
318                {
319                        for (SmallMoleculeReference smr : smrSet)
320                        {
321                                if (cnt.containsKey(smr)) continue;
322
323                                cnt.put(smr, INTER_ACC.getValueFromBean(smr).size());
324                        }
325                }
326                return cnt;
327        }
328
329        private List<SmallMoleculeReference> getSortedList(Collection<SmallMoleculeReference> smrs,
330                final Map<SmallMoleculeReference, Integer> cnt)
331        {
332                List<SmallMoleculeReference> list = new ArrayList<SmallMoleculeReference>(smrs);
333                Collections.sort(list, new Comparator<SmallMoleculeReference>()
334                {
335                        @Override
336                        public int compare(SmallMoleculeReference o1, SmallMoleculeReference o2)
337                        {
338                                return cnt.get(o2).compareTo(cnt.get(o1));
339                        }
340                });
341
342                return list;
343        }
344
345        private void printTopPart(String listName, Set<SmallMoleculeReference> smrs, int upTo)
346        {
347                Map<SmallMoleculeReference, Integer> cnt = getInteractionCounts(smrs);
348                List<SmallMoleculeReference> list = getSortedList(smrs, cnt);
349
350                int i = 0;
351
352                System.out.println(listName + "\n--------------");
353                for (SmallMoleculeReference smr : list)
354                {
355                        System.out.println(cnt.get(smr) + "\t" + smr.getDisplayName());
356
357                        if (++i == upTo) break;
358                }
359                System.out.println();
360        }
361
362        private void printTopPart(String listName,
363                Map<SmallMoleculeReference, Set<SmallMoleculeReference>> smrMap, int upTo)
364        {
365                Map<SmallMoleculeReference, Integer> cnt = getInteractionCounts(smrMap.keySet());
366                List<SmallMoleculeReference> list = getSortedList(smrMap.keySet(), cnt);
367
368                int i = 0;
369
370                System.out.println(listName + "\n--------------");
371                for (SmallMoleculeReference smr : list)
372                {
373                        System.out.print(cnt.get(smr) + "\t" + smr.getDisplayName() + "\t");
374
375                        for (SmallMoleculeReference match : smrMap.get(smr))
376                        {
377                                System.out.print("\t" + match.getDisplayName());
378                        }
379                        System.out.println();
380
381                        if (++i == upTo) break;
382                }
383                System.out.println();
384        }
385
386        private void enrichNamesWithMatchings(
387                Map<SmallMoleculeReference, Set<SmallMoleculeReference>> matchMap,
388                Map<SmallMoleculeReference, Set<String>> names)
389        {
390                for (SmallMoleculeReference smr : matchMap.keySet())
391                {
392                        for (SmallMoleculeReference match : matchMap.get(smr))
393                        {
394                                names.get(smr).addAll(names.get(match));
395                        }
396                }
397        }
398
399        private boolean isGeneric(SmallMoleculeReference smr)
400        {
401                if (!smr.getMemberEntityReference().isEmpty()) return true;
402
403                for (SimplePhysicalEntity sm : smr.getEntityReferenceOf())
404                {
405                        if (!sm.getMemberPhysicalEntity().isEmpty()) return true;
406                }
407
408                return false;
409        }
410
411        private SmallMoleculeReference selectRepresentative(Set<SmallMoleculeReference> smrs,
412                final Map<SmallMoleculeReference, SmallMoleculeReference> map)
413        {
414                List<SmallMoleculeReference> list = new ArrayList<SmallMoleculeReference>(smrs);
415                final Map<SmallMoleculeReference, Integer> cnt = getInteractionCounts(smrs);
416
417                Collections.sort(list, new Comparator<SmallMoleculeReference>()
418                {
419                        @Override
420                        public int compare(SmallMoleculeReference o1, SmallMoleculeReference o2)
421                        {
422                                if (map.containsValue(o1))
423                                {
424                                        if (!map.containsValue(o2)) return -1;
425                                }
426                                else
427                                {
428                                        if (map.containsValue(o2)) return 1;
429                                }
430
431                                if (!cnt.get(o1).equals(cnt.get(o2))) return cnt.get(o2).compareTo(cnt.get(o1));
432
433                                if (o1.getDisplayName().endsWith(")"))
434                                {
435                                        if (!o2.getDisplayName().endsWith(")")) return -1;
436                                }
437                                else if (o2.getDisplayName().endsWith(")")) return 1;
438
439                                return o1.getDisplayName().compareTo(o2.getDisplayName());
440                        }
441                });
442
443                return list.get(0);
444        }
445}