001package org.biopax.paxtools.io.gsea;
002
003import org.apache.commons.collections15.CollectionUtils;
004import org.apache.commons.lang.StringUtils;
005import org.apache.commons.logging.Log;
006import org.apache.commons.logging.LogFactory;
007import org.biopax.paxtools.controller.*;
008import org.biopax.paxtools.converter.LevelUpgrader;
009import org.biopax.paxtools.model.BioPAXElement;
010import org.biopax.paxtools.model.BioPAXLevel;
011import org.biopax.paxtools.model.Model;
012import org.biopax.paxtools.model.level3.*;
013
014import java.io.IOException;
015import java.io.OutputStream;
016import java.io.OutputStreamWriter;
017import java.io.Writer;
018import java.util.*;
019import java.util.concurrent.ExecutorService;
020import java.util.concurrent.Executors;
021import java.util.concurrent.TimeUnit;
022
023/**
024 * Converts a BioPAX model to the GMT format (used by GSEA software).
025 * 
026 * It creates GSEA entries from the protein reference (PR) xrefs 
027 * in the BioPAX model as follows: 
028 * <ul>
029 * <li>Each entry (row) consists of three columns (tab separated): 
030 * name (e.g., "taxonomyID: pathway_name"), 
031 * description (e.g. "datasource: pid;reactome; organism: 9606 id type: uniprot"), and
032 * the list of identifiers (of the same type). For all PRs not associated with any pathway,
033 * "Not pathway" is used instead of the pathway name.</li>
034 * <li>The "id type" is what specified by Constructor parameter 'database'. 
035 * </li>
036 * <li>The list may have one or more IDs of the same type per PR, 
037 * e.g., UniProt IDs or HGNC Symbols; PRs not having an xref of 
038 * given db/id type are ignored. If there are less than three protein 
039 * referencesper entry, it will not be printed.</li>
040 * </ul>
041 * 
042 * Note, to effectively enforce cross-species violation, 
043 * 'organism' property of PRs and pathways must be set 
044 * to a BioSource object that has a valid unification xref: 
045 * db="Taxonomy" and some taxonomy id.
046 *
047 * Note, this code assumes that the model has successfully been validated
048 * and perhaps even normalized (using the BioPAX Validator/Normalizer). 
049 * A BioPAX L1 or L2 model is first converted to the L3 
050 * (this is a lossless conversion if there are no BioPAX errors).
051 */
052public class GSEAConverter
053{
054        private final static Log LOG = LogFactory.getLog(GSEAConverter.class);
055        
056        private final String database;
057        private final boolean crossSpeciesCheckEnabled;
058        private final boolean skipSubPathways;
059        private final Set<Provenance> skipSubPathwaysOf;
060
061        /**
062         * Constructor.
063         */
064        public GSEAConverter()
065        {
066                this("", true);
067        }
068
069        /**
070         * Constructor.
071         *
072         * See class declaration for more information.
073         * @param database - identifier type, name of the resource, either the string value 
074         *                   of the most of EntityReference's xref.db properties in the BioPAX data,
075         *                   e.g., "HGNC Symbol", "NCBI Gene", "RefSeq", "UniProt" or "UniProt knowledgebase",
076         *                   or the &lt;namespace&gt; part in normalized EntityReference URIs 
077         *                   http://identifiers.org/&lt;namespace&gt;/&lt;ID&gt;
078         *                   (it depends on the actual data; so double-check before using in this constructor).
079         * @param crossSpeciesCheckEnabled - if true, enforces no cross species participants in output
080         */
081        public GSEAConverter(String database, boolean crossSpeciesCheckEnabled)
082        {
083                this(database, crossSpeciesCheckEnabled, false);
084        }
085
086        /**
087         * Constructor.
088         *
089         * See class declaration for more information.
090         * @param database - identifier type, name of the resource, either the string value
091         *                   of the most of EntityReference's xref.db properties in the BioPAX data,
092         *                   e.g., "HGNC Symbol", "NCBI Gene", "RefSeq", "UniProt" or "UniProt knowledgebase",
093         *                   or the &lt;namespace&gt; part in normalized EntityReference URIs
094         *                   http://identifiers.org/&lt;namespace&gt;/&lt;ID&gt;
095         *                   (it depends on the actual data; so double-check before using in this constructor).
096         * @param crossSpeciesCheckEnabled - if true, enforces no cross species participants in output
097         * @param skipSubPathways - if true, do not traverse into any sub-pathways to collect entity references
098         *                       (useful when a model, such as converted to BioPAX KEGG data, has lots of sub-pathways, loops.)
099         */
100        public GSEAConverter(String database, boolean crossSpeciesCheckEnabled, boolean skipSubPathways)
101        {
102                this.database = database;
103                this.crossSpeciesCheckEnabled = crossSpeciesCheckEnabled;
104                this.skipSubPathways = skipSubPathways;
105                this.skipSubPathwaysOf = Collections.emptySet();
106        }
107
108        /**
109         * Constructor.
110         *
111         * See class declaration for more information.
112         * @param database - identifier type, name of the resource, either the string value
113         *                   of the most of EntityReference's xref.db properties in the BioPAX data,
114         *                   e.g., "HGNC Symbol", "NCBI Gene", "RefSeq", "UniProt" or "UniProt knowledgebase",
115         *                   or the &lt;namespace&gt; part in normalized EntityReference URIs
116         *                   http://identifiers.org/&lt;namespace&gt;/&lt;ID&gt;, such as 'hgnc.symbol', 'uniprot'
117         *                   (it depends on the actual data; so double-check before using in this constructor).
118         * @param crossSpeciesCheckEnabled - if true, enforces no cross species participants in output
119         * @param skipSubPathwaysOf - do not look inside sub-pathways of pathways of given data sources to collect entity references
120         *                       (useful when a model, such as converted to BioPAX KEGG data, has lots of sub-pathways, loops.)
121         */
122        public GSEAConverter(String database, boolean crossSpeciesCheckEnabled, Set<Provenance> skipSubPathwaysOf)
123        {
124                this.database = database;
125                this.crossSpeciesCheckEnabled = crossSpeciesCheckEnabled;
126
127                if(skipSubPathwaysOf == null)
128                        skipSubPathwaysOf = Collections.emptySet();
129                this.skipSubPathwaysOf = skipSubPathwaysOf;
130
131                this.skipSubPathways = false;
132        }
133
134
135        /**
136         * Converts model to GSEA (GMT) and writes to out.
137         * See class declaration for more information.
138         *
139         * @param model Model
140         * @param out output stream to write the result to
141         * @throws IOException when there's an output stream error
142         */
143        public void writeToGSEA(final Model model, OutputStream out) throws IOException
144        {
145                Collection<GSEAEntry> entries = convert(model);
146                if (entries.size() > 0)
147                {
148                        Writer writer = new OutputStreamWriter(out);
149                        for (GSEAEntry entry : entries)
150                        {
151                                writer.write(entry.toString() + "\n");
152                        }
153                        writer.flush();
154                }
155        }
156
157        /**
158         * Creates GSEA entries from the pathways contained in the model.
159         * @param model Model
160         * @return a set of GSEA entries
161         */
162        public Collection<GSEAEntry> convert(final Model model)
163        {
164                final Collection<GSEAEntry> toReturn = new TreeSet<GSEAEntry>(new Comparator<GSEAEntry>() {
165                        @Override
166                        public int compare(GSEAEntry o1, GSEAEntry o2) {
167                                return o1.toString().compareTo(o2.toString());
168                        }
169                });
170
171                Model l3Model;
172                // convert to level 3 in necessary
173                if (model.getLevel() == BioPAXLevel.L1 || model.getLevel() == BioPAXLevel.L2)
174                        l3Model = (new LevelUpgrader()).filter(model);
175                else
176                        l3Model = model;
177                
178                //a modifiable copy of the set of all PRs in the model - 
179                //simply to keep, after all, all the PRs that do not belong to any pathway
180                final Set<ProteinReference> prs = Collections.synchronizedSet(
181                                new HashSet<ProteinReference>(l3Model.getObjects(ProteinReference.class))
182                );
183
184                ExecutorService exe = Executors.newFixedThreadPool(10);
185                final Set<Pathway> pathways = l3Model.getObjects(Pathway.class);
186                for (Pathway pathway : pathways) 
187                {
188                        String name = (pathway.getDisplayName() == null)
189                                        ? pathway.getStandardName() : pathway.getDisplayName();
190                        
191                        if(name == null || name.isEmpty()) 
192                                name = pathway.getRDFId();
193
194                        final Pathway currentPathway = pathway;
195                        final String currentPathwayName = name;
196                        final boolean ignoreSubPathways = skipSubPathways ||
197                                (!skipSubPathwaysOf.isEmpty() && shareSomeObjects(currentPathway.getDataSource(), skipSubPathwaysOf));
198
199                        exe.submit(new Runnable() {
200                                @Override
201                                public void run() {
202                                        LOG.info("Begin converting " + currentPathwayName + " pathway, uri=" + currentPathway.getRDFId());
203
204                                        final Set<ProteinReference> pathwayProteinRefs = new HashSet<ProteinReference>();
205
206                                        Traverser traverser = new AbstractTraverser(SimpleEditorMap.L3,
207                                                        Fetcher.nextStepFilter, Fetcher.objectPropertiesOnlyFilter)
208                                        {
209                                                protected void visit(Object range, BioPAXElement domain, Model model, PropertyEditor editor)
210                                                {
211                                                        //by design (- objectPropertiesOnlyFilter is used), it'll visit only object properties.
212                                                        BioPAXElement bpe = (BioPAXElement) range;
213
214                                                        if(bpe instanceof ProteinReference) {
215                                                                pathwayProteinRefs.add((ProteinReference) bpe);
216                                                        }
217
218                                                        if(bpe instanceof Pathway) {
219                                                                Pathway subPathway = (Pathway) bpe;
220                                                                if(ignoreSubPathways)
221                                                                {       //do not traverse into the sub-pathway; log
222                                                                        LOG.info("Skipping sub-pathway: " + subPathway.getRDFId());
223                                                                } else {
224                                                                        traverse(subPathway, null);
225                                                                }
226                                                        } else {
227                                                                traverse(bpe, null);
228                                                        }
229                                                }
230                                        };
231                                        //run it - collect all PRs from the pathway
232                                        traverser.traverse(currentPathway, null);
233
234
235                                        LOG.info("- fetched PRs: " + pathwayProteinRefs.size());
236                                        if(!pathwayProteinRefs.isEmpty()) {
237                                                LOG.info("- grouping the PRs by organism...");
238                                                Map<String,Set<ProteinReference>> orgToPrsMap = organismToProteinRefsMap(pathwayProteinRefs);
239                                                // create GSEA/GMT entries - one entry per organism (null organism also makes one)
240                                                String dataSource = getDataSource(currentPathway.getDataSource());
241                                                LOG.info("- creating GSEA/GMT entries...");
242                                                Collection<GSEAEntry> entries = createGseaEntries(currentPathwayName, dataSource, orgToPrsMap);
243                                                if(!entries.isEmpty())
244                                                        toReturn.addAll(entries);
245                                                prs.removeAll(pathwayProteinRefs);//there left not yet processed PRs (PR can be processed multiple times anyway)
246                                                LOG.info("- collected " + entries.size() + "entries.");
247                                        }
248                                }
249                        });
250                }
251
252                exe.shutdown();
253                try {
254                        exe.awaitTermination(48, TimeUnit.HOURS);
255                } catch (InterruptedException e) {
256                        throw new RuntimeException("Interrupted unexpectedly!");
257                }
258                
259                //when there're no pathways, only empty pathays, pathways w/o PRs, then use all/rest of PRs -
260                //organize PRs by species (GSEA s/w can handle only same species identifiers in a data row)
261                LOG.info("Creating entries for the rest fo (unused) PRs...");
262                if(!prs.isEmpty()) { //all or not processed above
263                        Map<String,Set<ProteinReference>> orgToPrsMap = organismToProteinRefsMap(prs);
264                        if(!orgToPrsMap.isEmpty()) {
265                                // create GSEA/GMT entries - one entry per organism (null organism also makes one) 
266                                toReturn.addAll(createGseaEntries("Not pathway", 
267                                        getDataSource(l3Model.getObjects(Provenance.class)), orgToPrsMap));     
268                        }
269                }
270                                        
271                return toReturn;
272        }
273
274        
275        private Collection<GSEAEntry> createGseaEntries(final String name, final String dataSource, 
276                        final Map<String, Set<ProteinReference>> orgToPrsMap) 
277        {
278                // generate GSEA entries for each taxId in parallel threads; await till all done (before returning)
279                final Collection<GSEAEntry> toReturn = Collections.synchronizedList(new ArrayList<GSEAEntry>());        
280                ExecutorService exe = Executors.newFixedThreadPool(5);
281                for (final String org : orgToPrsMap.keySet()) {
282                        if(orgToPrsMap.get(org).size() > 0) {
283                                exe.submit(new Runnable() {
284                                        @Override
285                                        public void run() {
286                                                LOG.info("adding " + database + " IDs of " + org + 
287                                                        " proteins (PRs) from '" + name + "', " + 
288                                                        dataSource + " pathway...");
289                                                GSEAEntry gseaEntry = new GSEAEntry(name, org, database, "datasource: " + dataSource);
290                                                processProteinReferences(orgToPrsMap.get(org), gseaEntry);
291                                                toReturn.add(gseaEntry);
292                                        }
293                                });
294                        }
295                }
296                exe.shutdown();
297                try {
298                        exe.awaitTermination(4, TimeUnit.HOURS);
299                } catch (InterruptedException e) {
300                        throw new RuntimeException("Interrupted unexpectedly!");
301                }
302                
303                return toReturn;
304        }
305
306        
307        //warn: there can be many equivalent BioSource objects (same taxonomy id, different URIs)
308        private Map<String, Set<ProteinReference>> organismToProteinRefsMap(
309                        Set<ProteinReference> proteinRefs) 
310        {
311                Map<String,Set<ProteinReference>> map = new HashMap<String, Set<ProteinReference>>();
312
313                if(proteinRefs.isEmpty())
314                        throw new IllegalArgumentException("Empty set");
315                
316                if (crossSpeciesCheckEnabled) {
317                        for (ProteinReference r : proteinRefs) {
318                                String key = getOrganismKey(r.getOrganism()); // null org. is ok (key == "")
319                                Set<ProteinReference> prs = map.get(key);
320                                if (prs == null) {
321                                        prs = new HashSet<ProteinReference>();
322                                        map.put(key, prs);
323                                }
324                                prs.add(r);
325                        }
326                } else {
327                        map.put("", proteinRefs); //all PRs
328                }
329                                
330                return map;
331        }
332
333
334        void processProteinReferences(Set<ProteinReference> prs, GSEAEntry targetEntry)
335        {
336                for (ProteinReference aProteinRef : prs)
337                {
338                        // we only process PRs that belong to the same species (as for targetEntry) if crossSpeciesCheckEnabled==true
339                        if (crossSpeciesCheckEnabled && !targetEntry.taxID().equals(getOrganismKey(aProteinRef.getOrganism())))
340                                continue;
341                                
342                        if (database != null && !database.isEmpty())
343                        {
344                                String lowercaseDb = database.toLowerCase();
345                                // a shortcut if we are converting validated normalized BioPAX model:
346                                // get the primary ID from the URI of the ProteinReference
347                                final String lowcaseUri = aProteinRef.getRDFId().toLowerCase();
348                                if (lowcaseUri.startsWith("http://identifiers.org/")
349                                        && lowcaseUri.contains(lowercaseDb))
350                                {
351                                        String accession = aProteinRef.getRDFId();
352                                        accession = accession.substring(accession.lastIndexOf("/") + 1);
353                                        targetEntry.getIdentifiers().add(accession);
354                                } 
355                                else { // simply pick one xref with matching db value (any one)
356                                        TreeSet<Xref> orderedXrefs = new TreeSet<Xref>(new Comparator<Xref>() {
357                                                @Override
358                                                public int compare(Xref o1, Xref o2) {
359                                                        return o1.toString().compareTo(o2.toString());
360                                                }
361                                        });
362                                        
363                                        orderedXrefs.addAll(aProteinRef.getXref());
364                                        for (Xref aXref : orderedXrefs)
365                                        {
366                                                if (aXref.getId() != null && aXref.getDb() != null && 
367                                                        (aXref.getDb().toLowerCase().startsWith(lowercaseDb) 
368                                                        || aXref.getId().toLowerCase().startsWith(lowercaseDb + ":")
369                                                        || aXref.getId().toLowerCase().startsWith(lowercaseDb + "_"))
370                                                ) {
371                                                        targetEntry.getIdentifiers().add(aXref.getId());
372                                                }
373                                        }
374                                }
375                        } else {
376                                // use URI (not really useful for GSEA software, but good for testing/hacking)
377                                targetEntry.getIdentifiers().add(aProteinRef.getRDFId());
378                        }
379                }
380        }
381
382        /*
383         * Gets datasource names, if any, in a consistent way/order, excl. duplicates
384         */
385        private String getDataSource(Set<Provenance> provenances)
386        {
387                if(provenances.isEmpty()) return "N/A";
388                
389                Set<String> dsNames = new TreeSet<String>();
390                for (Provenance provenance : provenances)
391                {
392                        String name = provenance.getDisplayName();
393                        if(name == null) 
394                                name = provenance.getStandardName();
395                        if(name == null && !provenance.getName().isEmpty()) 
396                                name = provenance.getName().iterator().next();
397                        if (name != null && name.length() > 0)
398                                dsNames.add(name.toLowerCase());
399                }
400                
401                return StringUtils.join(dsNames, ";");
402        }
403
404        
405        private String getOrganismKey(BioSource org) {
406                String key = ""; //default value: unspecified/all species
407
408                if (org != null) {
409                        Set<Xref> xrefs = org.getXref();
410
411                        if(!xrefs.isEmpty()) {
412                                for (Xref xref : xrefs) {
413                                        if (xref instanceof UnificationXref
414                                                && xref.getDb().equalsIgnoreCase("taxonomy")) {
415                                                        if(key.isEmpty())
416                                                                key = xref.getId();
417                                                        else
418                                                                LOG.warn("BioSource " + org + " has multiple taxonomy ID unification xrefs; " +
419                                                                                "I use " + key + " and ignore other, but the conversion might go wrong...");
420                                        }
421                                }
422                        }
423
424                        //when there're no Taxonomy xrefs - use a name
425                        if(key.isEmpty()) {
426                                if (org.getStandardName()!=null) {
427                                        key = org.getStandardName().toLowerCase();
428                                } else if(org.getDisplayName()!=null) {
429                                        key = org.getDisplayName().toLowerCase();
430                                } else if(!org.getName().isEmpty()) {
431                                        key = org.getName().iterator().next().toLowerCase();
432                                }
433                        }
434                }
435
436                return key;
437        }
438
439        private boolean shareSomeObjects(Set<?> setA, Set<?> setB) {
440                return (!setA.isEmpty() && !setB.isEmpty())     ? !CollectionUtils.intersection(setA, setB).isEmpty() : false;
441        }
442}