001package org.biopax.paxtools.io.gsea; 002 003import org.apache.commons.collections15.CollectionUtils; 004import org.apache.commons.lang.StringUtils; 005import org.apache.commons.logging.Log; 006import org.apache.commons.logging.LogFactory; 007import org.biopax.paxtools.controller.*; 008import org.biopax.paxtools.converter.LevelUpgrader; 009import org.biopax.paxtools.model.BioPAXElement; 010import org.biopax.paxtools.model.BioPAXLevel; 011import org.biopax.paxtools.model.Model; 012import org.biopax.paxtools.model.level3.*; 013 014import java.io.IOException; 015import java.io.OutputStream; 016import java.io.OutputStreamWriter; 017import java.io.Writer; 018import java.util.*; 019import java.util.concurrent.ExecutorService; 020import java.util.concurrent.Executors; 021import java.util.concurrent.TimeUnit; 022 023/** 024 * Converts a BioPAX model to the GMT format (used by GSEA software). 025 * 026 * It creates GSEA entries from the protein reference (PR) xrefs 027 * in the BioPAX model as follows: 028 * <ul> 029 * <li>Each entry (row) consists of three columns (tab separated): 030 * name (e.g., "taxonomyID: pathway_name"), 031 * description (e.g. "datasource: pid;reactome; organism: 9606 id type: uniprot"), and 032 * the list of identifiers (of the same type). For all PRs not associated with any pathway, 033 * "Not pathway" is used instead of the pathway name.</li> 034 * <li>The "id type" is what specified by Constructor parameter 'database'. 035 * </li> 036 * <li>The list may have one or more IDs of the same type per PR, 037 * e.g., UniProt IDs or HGNC Symbols; PRs not having an xref of 038 * given db/id type are ignored. If there are less than three protein 039 * referencesper entry, it will not be printed.</li> 040 * </ul> 041 * 042 * Note, to effectively enforce cross-species violation, 043 * 'organism' property of PRs and pathways must be set 044 * to a BioSource object that has a valid unification xref: 045 * db="Taxonomy" and some taxonomy id. 046 * 047 * Note, this code assumes that the model has successfully been validated 048 * and perhaps even normalized (using the BioPAX Validator/Normalizer). 049 * A BioPAX L1 or L2 model is first converted to the L3 050 * (this is a lossless conversion if there are no BioPAX errors). 051 */ 052public class GSEAConverter 053{ 054 private final static Log LOG = LogFactory.getLog(GSEAConverter.class); 055 056 private final String database; 057 private final boolean crossSpeciesCheckEnabled; 058 private final boolean skipSubPathways; 059 private final Set<Provenance> skipSubPathwaysOf; 060 061 /** 062 * Constructor. 063 */ 064 public GSEAConverter() 065 { 066 this("", true); 067 } 068 069 /** 070 * Constructor. 071 * 072 * See class declaration for more information. 073 * @param database - identifier type, name of the resource, either the string value 074 * of the most of EntityReference's xref.db properties in the BioPAX data, 075 * e.g., "HGNC Symbol", "NCBI Gene", "RefSeq", "UniProt" or "UniProt knowledgebase", 076 * or the <namespace> part in normalized EntityReference URIs 077 * http://identifiers.org/<namespace>/<ID> 078 * (it depends on the actual data; so double-check before using in this constructor). 079 * @param crossSpeciesCheckEnabled - if true, enforces no cross species participants in output 080 */ 081 public GSEAConverter(String database, boolean crossSpeciesCheckEnabled) 082 { 083 this(database, crossSpeciesCheckEnabled, false); 084 } 085 086 /** 087 * Constructor. 088 * 089 * See class declaration for more information. 090 * @param database - identifier type, name of the resource, either the string value 091 * of the most of EntityReference's xref.db properties in the BioPAX data, 092 * e.g., "HGNC Symbol", "NCBI Gene", "RefSeq", "UniProt" or "UniProt knowledgebase", 093 * or the <namespace> part in normalized EntityReference URIs 094 * http://identifiers.org/<namespace>/<ID> 095 * (it depends on the actual data; so double-check before using in this constructor). 096 * @param crossSpeciesCheckEnabled - if true, enforces no cross species participants in output 097 * @param skipSubPathways - if true, do not traverse into any sub-pathways to collect entity references 098 * (useful when a model, such as converted to BioPAX KEGG data, has lots of sub-pathways, loops.) 099 */ 100 public GSEAConverter(String database, boolean crossSpeciesCheckEnabled, boolean skipSubPathways) 101 { 102 this.database = database; 103 this.crossSpeciesCheckEnabled = crossSpeciesCheckEnabled; 104 this.skipSubPathways = skipSubPathways; 105 this.skipSubPathwaysOf = Collections.emptySet(); 106 } 107 108 /** 109 * Constructor. 110 * 111 * See class declaration for more information. 112 * @param database - identifier type, name of the resource, either the string value 113 * of the most of EntityReference's xref.db properties in the BioPAX data, 114 * e.g., "HGNC Symbol", "NCBI Gene", "RefSeq", "UniProt" or "UniProt knowledgebase", 115 * or the <namespace> part in normalized EntityReference URIs 116 * http://identifiers.org/<namespace>/<ID>, such as 'hgnc.symbol', 'uniprot' 117 * (it depends on the actual data; so double-check before using in this constructor). 118 * @param crossSpeciesCheckEnabled - if true, enforces no cross species participants in output 119 * @param skipSubPathwaysOf - do not look inside sub-pathways of pathways of given data sources to collect entity references 120 * (useful when a model, such as converted to BioPAX KEGG data, has lots of sub-pathways, loops.) 121 */ 122 public GSEAConverter(String database, boolean crossSpeciesCheckEnabled, Set<Provenance> skipSubPathwaysOf) 123 { 124 this.database = database; 125 this.crossSpeciesCheckEnabled = crossSpeciesCheckEnabled; 126 127 if(skipSubPathwaysOf == null) 128 skipSubPathwaysOf = Collections.emptySet(); 129 this.skipSubPathwaysOf = skipSubPathwaysOf; 130 131 this.skipSubPathways = false; 132 } 133 134 135 /** 136 * Converts model to GSEA (GMT) and writes to out. 137 * See class declaration for more information. 138 * 139 * @param model Model 140 * @param out output stream to write the result to 141 * @throws IOException when there's an output stream error 142 */ 143 public void writeToGSEA(final Model model, OutputStream out) throws IOException 144 { 145 Collection<GSEAEntry> entries = convert(model); 146 if (entries.size() > 0) 147 { 148 Writer writer = new OutputStreamWriter(out); 149 for (GSEAEntry entry : entries) 150 { 151 writer.write(entry.toString() + "\n"); 152 } 153 writer.flush(); 154 } 155 } 156 157 /** 158 * Creates GSEA entries from the pathways contained in the model. 159 * @param model Model 160 * @return a set of GSEA entries 161 */ 162 public Collection<GSEAEntry> convert(final Model model) 163 { 164 final Collection<GSEAEntry> toReturn = new TreeSet<GSEAEntry>(new Comparator<GSEAEntry>() { 165 @Override 166 public int compare(GSEAEntry o1, GSEAEntry o2) { 167 return o1.toString().compareTo(o2.toString()); 168 } 169 }); 170 171 Model l3Model; 172 // convert to level 3 in necessary 173 if (model.getLevel() == BioPAXLevel.L1 || model.getLevel() == BioPAXLevel.L2) 174 l3Model = (new LevelUpgrader()).filter(model); 175 else 176 l3Model = model; 177 178 //a modifiable copy of the set of all PRs in the model - 179 //simply to keep, after all, all the PRs that do not belong to any pathway 180 final Set<ProteinReference> prs = Collections.synchronizedSet( 181 new HashSet<ProteinReference>(l3Model.getObjects(ProteinReference.class)) 182 ); 183 184 ExecutorService exe = Executors.newFixedThreadPool(10); 185 final Set<Pathway> pathways = l3Model.getObjects(Pathway.class); 186 for (Pathway pathway : pathways) 187 { 188 String name = (pathway.getDisplayName() == null) 189 ? pathway.getStandardName() : pathway.getDisplayName(); 190 191 if(name == null || name.isEmpty()) 192 name = pathway.getRDFId(); 193 194 final Pathway currentPathway = pathway; 195 final String currentPathwayName = name; 196 final boolean ignoreSubPathways = skipSubPathways || 197 (!skipSubPathwaysOf.isEmpty() && shareSomeObjects(currentPathway.getDataSource(), skipSubPathwaysOf)); 198 199 exe.submit(new Runnable() { 200 @Override 201 public void run() { 202 LOG.info("Begin converting " + currentPathwayName + " pathway, uri=" + currentPathway.getRDFId()); 203 204 final Set<ProteinReference> pathwayProteinRefs = new HashSet<ProteinReference>(); 205 206 Traverser traverser = new AbstractTraverser(SimpleEditorMap.L3, 207 Fetcher.nextStepFilter, Fetcher.objectPropertiesOnlyFilter) 208 { 209 protected void visit(Object range, BioPAXElement domain, Model model, PropertyEditor editor) 210 { 211 //by design (- objectPropertiesOnlyFilter is used), it'll visit only object properties. 212 BioPAXElement bpe = (BioPAXElement) range; 213 214 if(bpe instanceof ProteinReference) { 215 pathwayProteinRefs.add((ProteinReference) bpe); 216 } 217 218 if(bpe instanceof Pathway) { 219 Pathway subPathway = (Pathway) bpe; 220 if(ignoreSubPathways) 221 { //do not traverse into the sub-pathway; log 222 LOG.info("Skipping sub-pathway: " + subPathway.getRDFId()); 223 } else { 224 traverse(subPathway, null); 225 } 226 } else { 227 traverse(bpe, null); 228 } 229 } 230 }; 231 //run it - collect all PRs from the pathway 232 traverser.traverse(currentPathway, null); 233 234 235 LOG.info("- fetched PRs: " + pathwayProteinRefs.size()); 236 if(!pathwayProteinRefs.isEmpty()) { 237 LOG.info("- grouping the PRs by organism..."); 238 Map<String,Set<ProteinReference>> orgToPrsMap = organismToProteinRefsMap(pathwayProteinRefs); 239 // create GSEA/GMT entries - one entry per organism (null organism also makes one) 240 String dataSource = getDataSource(currentPathway.getDataSource()); 241 LOG.info("- creating GSEA/GMT entries..."); 242 Collection<GSEAEntry> entries = createGseaEntries(currentPathwayName, dataSource, orgToPrsMap); 243 if(!entries.isEmpty()) 244 toReturn.addAll(entries); 245 prs.removeAll(pathwayProteinRefs);//there left not yet processed PRs (PR can be processed multiple times anyway) 246 LOG.info("- collected " + entries.size() + "entries."); 247 } 248 } 249 }); 250 } 251 252 exe.shutdown(); 253 try { 254 exe.awaitTermination(48, TimeUnit.HOURS); 255 } catch (InterruptedException e) { 256 throw new RuntimeException("Interrupted unexpectedly!"); 257 } 258 259 //when there're no pathways, only empty pathays, pathways w/o PRs, then use all/rest of PRs - 260 //organize PRs by species (GSEA s/w can handle only same species identifiers in a data row) 261 LOG.info("Creating entries for the rest fo (unused) PRs..."); 262 if(!prs.isEmpty()) { //all or not processed above 263 Map<String,Set<ProteinReference>> orgToPrsMap = organismToProteinRefsMap(prs); 264 if(!orgToPrsMap.isEmpty()) { 265 // create GSEA/GMT entries - one entry per organism (null organism also makes one) 266 toReturn.addAll(createGseaEntries("Not pathway", 267 getDataSource(l3Model.getObjects(Provenance.class)), orgToPrsMap)); 268 } 269 } 270 271 return toReturn; 272 } 273 274 275 private Collection<GSEAEntry> createGseaEntries(final String name, final String dataSource, 276 final Map<String, Set<ProteinReference>> orgToPrsMap) 277 { 278 // generate GSEA entries for each taxId in parallel threads; await till all done (before returning) 279 final Collection<GSEAEntry> toReturn = Collections.synchronizedList(new ArrayList<GSEAEntry>()); 280 ExecutorService exe = Executors.newFixedThreadPool(5); 281 for (final String org : orgToPrsMap.keySet()) { 282 if(orgToPrsMap.get(org).size() > 0) { 283 exe.submit(new Runnable() { 284 @Override 285 public void run() { 286 LOG.info("adding " + database + " IDs of " + org + 287 " proteins (PRs) from '" + name + "', " + 288 dataSource + " pathway..."); 289 GSEAEntry gseaEntry = new GSEAEntry(name, org, database, "datasource: " + dataSource); 290 processProteinReferences(orgToPrsMap.get(org), gseaEntry); 291 toReturn.add(gseaEntry); 292 } 293 }); 294 } 295 } 296 exe.shutdown(); 297 try { 298 exe.awaitTermination(4, TimeUnit.HOURS); 299 } catch (InterruptedException e) { 300 throw new RuntimeException("Interrupted unexpectedly!"); 301 } 302 303 return toReturn; 304 } 305 306 307 //warn: there can be many equivalent BioSource objects (same taxonomy id, different URIs) 308 private Map<String, Set<ProteinReference>> organismToProteinRefsMap( 309 Set<ProteinReference> proteinRefs) 310 { 311 Map<String,Set<ProteinReference>> map = new HashMap<String, Set<ProteinReference>>(); 312 313 if(proteinRefs.isEmpty()) 314 throw new IllegalArgumentException("Empty set"); 315 316 if (crossSpeciesCheckEnabled) { 317 for (ProteinReference r : proteinRefs) { 318 String key = getOrganismKey(r.getOrganism()); // null org. is ok (key == "") 319 Set<ProteinReference> prs = map.get(key); 320 if (prs == null) { 321 prs = new HashSet<ProteinReference>(); 322 map.put(key, prs); 323 } 324 prs.add(r); 325 } 326 } else { 327 map.put("", proteinRefs); //all PRs 328 } 329 330 return map; 331 } 332 333 334 void processProteinReferences(Set<ProteinReference> prs, GSEAEntry targetEntry) 335 { 336 for (ProteinReference aProteinRef : prs) 337 { 338 // we only process PRs that belong to the same species (as for targetEntry) if crossSpeciesCheckEnabled==true 339 if (crossSpeciesCheckEnabled && !targetEntry.taxID().equals(getOrganismKey(aProteinRef.getOrganism()))) 340 continue; 341 342 if (database != null && !database.isEmpty()) 343 { 344 String lowercaseDb = database.toLowerCase(); 345 // a shortcut if we are converting validated normalized BioPAX model: 346 // get the primary ID from the URI of the ProteinReference 347 final String lowcaseUri = aProteinRef.getRDFId().toLowerCase(); 348 if (lowcaseUri.startsWith("http://identifiers.org/") 349 && lowcaseUri.contains(lowercaseDb)) 350 { 351 String accession = aProteinRef.getRDFId(); 352 accession = accession.substring(accession.lastIndexOf("/") + 1); 353 targetEntry.getIdentifiers().add(accession); 354 } 355 else { // simply pick one xref with matching db value (any one) 356 TreeSet<Xref> orderedXrefs = new TreeSet<Xref>(new Comparator<Xref>() { 357 @Override 358 public int compare(Xref o1, Xref o2) { 359 return o1.toString().compareTo(o2.toString()); 360 } 361 }); 362 363 orderedXrefs.addAll(aProteinRef.getXref()); 364 for (Xref aXref : orderedXrefs) 365 { 366 if (aXref.getId() != null && aXref.getDb() != null && 367 (aXref.getDb().toLowerCase().startsWith(lowercaseDb) 368 || aXref.getId().toLowerCase().startsWith(lowercaseDb + ":") 369 || aXref.getId().toLowerCase().startsWith(lowercaseDb + "_")) 370 ) { 371 targetEntry.getIdentifiers().add(aXref.getId()); 372 } 373 } 374 } 375 } else { 376 // use URI (not really useful for GSEA software, but good for testing/hacking) 377 targetEntry.getIdentifiers().add(aProteinRef.getRDFId()); 378 } 379 } 380 } 381 382 /* 383 * Gets datasource names, if any, in a consistent way/order, excl. duplicates 384 */ 385 private String getDataSource(Set<Provenance> provenances) 386 { 387 if(provenances.isEmpty()) return "N/A"; 388 389 Set<String> dsNames = new TreeSet<String>(); 390 for (Provenance provenance : provenances) 391 { 392 String name = provenance.getDisplayName(); 393 if(name == null) 394 name = provenance.getStandardName(); 395 if(name == null && !provenance.getName().isEmpty()) 396 name = provenance.getName().iterator().next(); 397 if (name != null && name.length() > 0) 398 dsNames.add(name.toLowerCase()); 399 } 400 401 return StringUtils.join(dsNames, ";"); 402 } 403 404 405 private String getOrganismKey(BioSource org) { 406 String key = ""; //default value: unspecified/all species 407 408 if (org != null) { 409 Set<Xref> xrefs = org.getXref(); 410 411 if(!xrefs.isEmpty()) { 412 for (Xref xref : xrefs) { 413 if (xref instanceof UnificationXref 414 && xref.getDb().equalsIgnoreCase("taxonomy")) { 415 if(key.isEmpty()) 416 key = xref.getId(); 417 else 418 LOG.warn("BioSource " + org + " has multiple taxonomy ID unification xrefs; " + 419 "I use " + key + " and ignore other, but the conversion might go wrong..."); 420 } 421 } 422 } 423 424 //when there're no Taxonomy xrefs - use a name 425 if(key.isEmpty()) { 426 if (org.getStandardName()!=null) { 427 key = org.getStandardName().toLowerCase(); 428 } else if(org.getDisplayName()!=null) { 429 key = org.getDisplayName().toLowerCase(); 430 } else if(!org.getName().isEmpty()) { 431 key = org.getName().iterator().next().toLowerCase(); 432 } 433 } 434 } 435 436 return key; 437 } 438 439 private boolean shareSomeObjects(Set<?> setA, Set<?> setB) { 440 return (!setA.isEmpty() && !setB.isEmpty()) ? !CollectionUtils.intersection(setA, setB).isEmpty() : false; 441 } 442}