001package org.biopax.paxtools.pattern.miner; 002 003import org.biopax.paxtools.model.BioPAXElement; 004import org.biopax.paxtools.model.Model; 005import org.biopax.paxtools.model.level3.SmallMoleculeReference; 006import org.biopax.paxtools.pattern.util.Blacklist; 007import org.biopax.paxtools.pattern.util.ChemicalNameNormalizer; 008import org.biopax.paxtools.pattern.util.RelType; 009 010import java.util.*; 011 012/** 013 * This class generates a blacklist for the given model. It is important that the given model is the 014 * very big integrated corpus. It won't work on tiny little model. 015 * 016 * @author Ozgun Babur 017 */ 018public class BlacklistGenerator 019{ 020 /** 021 * For deciding if the molecule is ubiquitous and for determining the score and context. 022 */ 023 private Decider decider; 024 025 /** 026 * Constructor with decider. This decider should be optimized for the specific resource that the 027 * user deals with. 028 * @param decider decides if the molecule is ubique 029 */ 030 public BlacklistGenerator(Decider decider) 031 { 032 this.decider = decider; 033 } 034 035 /** 036 * Default constructor. 037 */ 038 public BlacklistGenerator() 039 { 040 this(new Decider() 041 { 042 @Override 043 public boolean isUbique(int neighborSize, int upstrOnly, int dwstrOnly) 044 { 045 return neighborSize >= 30; 046 } 047 048 @Override 049 public int getScore(int neighborSize, int upstrOnly, int dwstrOnly) 050 { 051 return neighborSize; 052 } 053 054 @Override 055 public RelType getContext(int neighborSize, int upstrOnly, int dwstrOnly) 056 { 057 if (upstrOnly > 10 * dwstrOnly) return RelType.OUTPUT; 058 else if (dwstrOnly > 10 * upstrOnly) return RelType.INPUT; 059 else return null; 060 } 061 }); 062 } 063 064 /** 065 * Generates the blacklist. 066 * @param model model to use 067 * @return the blacklist 068 */ 069 public Blacklist generateBlacklist(Model model) 070 { 071 ChemicalNameNormalizer normalizer = new ChemicalNameNormalizer(model); 072 SIFSearcher searcher = new SIFSearcher(new Fetcher(normalizer), SIFEnum.USED_TO_PRODUCE); 073 074 Set<SIFInteraction> sifs = searcher.searchSIF(model); 075 076 // read interactions into maps 077 078 Map<String, Set<String>> upstrMap = new HashMap<String, Set<String>>(); 079 Map<String, Set<String>> dwstrMap = new HashMap<String, Set<String>>(); 080 Map<String, Set<String>> neighMap = new HashMap<String, Set<String>>(); 081 082 for (SIFInteraction sif : sifs) 083 { 084 String source = sif.sourceID; 085 String target = sif.targetID; 086 087 if (!neighMap.containsKey(source)) neighMap.put(source, new HashSet<String>()); 088 if (!neighMap.containsKey(target)) neighMap.put(target, new HashSet<String>()); 089 if (!dwstrMap.containsKey(source)) dwstrMap.put(source, new HashSet<String>()); 090 if (!dwstrMap.containsKey(target)) dwstrMap.put(target, new HashSet<String>()); 091 if (!upstrMap.containsKey(source)) upstrMap.put(source, new HashSet<String>()); 092 if (!upstrMap.containsKey(target)) upstrMap.put(target, new HashSet<String>()); 093 094 neighMap.get(source).add(target); 095 neighMap.get(target).add(source); 096 dwstrMap.get(source).add(target); 097 upstrMap.get(target).add(source); 098 } 099 100 // remove intersection of upstream and downstream 101 102 for (String name : neighMap.keySet()) 103 { 104 if (!upstrMap.containsKey(name) || !dwstrMap.containsKey(name)) continue; 105 106 Set<String> upstr = upstrMap.get(name); 107 Set<String> dwstr = dwstrMap.get(name); 108 109 Set<String> temp = new HashSet<String>(upstr); 110 upstr.removeAll(dwstr); 111 dwstr.removeAll(temp); 112 } 113 114 115 Blacklist blacklist = new Blacklist(); 116 117 // populate the blacklist 118 119 for (SmallMoleculeReference smr : model.getObjects(SmallMoleculeReference.class)) 120 { 121 String name = normalizer.getName(smr); 122 123 int neighSize = neighMap.containsKey(name) ? neighMap.get(name).size() : 0; 124 int upstrOnly = upstrMap.containsKey(name) ? upstrMap.get(name).size() : 0; 125 int dwstrOnly = dwstrMap.containsKey(name) ? dwstrMap.get(name).size() : 0; 126 127 if (decider.isUbique(neighSize, upstrOnly, dwstrOnly)) 128 { 129 blacklist.addEntry(smr.getRDFId(), 130 decider.getScore(neighSize, upstrOnly, dwstrOnly), 131 decider.getContext(neighSize, upstrOnly, dwstrOnly)); 132 } 133 } 134 135 blacklist.write("blacklist.txt"); 136 137 return blacklist; 138 } 139 140 /** 141 * Class to fetch the ID of the small molecule. 142 */ 143 class Fetcher implements IDFetcher 144 { 145 ChemicalNameNormalizer normalizer; 146 147 Fetcher(ChemicalNameNormalizer normalizer) 148 { 149 this.normalizer = normalizer; 150 } 151 152 @Override 153 public Set<String> fetchID(BioPAXElement ele) 154 { 155 if (ele instanceof SmallMoleculeReference) 156 { 157 return Collections.singleton(normalizer.getName((SmallMoleculeReference) ele)); 158 } 159 160 return null; 161 } 162 } 163 164 /** 165 * The class to decide if a molecule is ubique, its score and its context of ubiquity. 166 */ 167 static interface Decider 168 { 169 /** 170 * Tells if the molecule is ubique in at least one context. 171 * @param neighborSize number of neighbors in the used-to-produce network 172 * @param upstrOnly number of upstream neighbors in the used-to-produce network, that are not also at downstream 173 * @param dwstrOnly number of downstream neighbors in the used-to-produce network, that are not also at upstream 174 */ 175 public boolean isUbique(int neighborSize, int upstrOnly, int dwstrOnly); 176 177 /** 178 * Gets the ubiquity score of the ubique molecule. This score is used for comparing ubiques 179 * and deciding the most essential reactants of a reaction if all reactants are ubique. 180 * @param neighborSize number of neighbors in the used-to-produce network 181 * @param upstrOnly number of upstream neighbors in the used-to-produce network, that are not also at downstream 182 * @param dwstrOnly number of downstream neighbors in the used-to-produce network, that are not also at upstream 183 */ 184 public int getScore(int neighborSize, int upstrOnly, int dwstrOnly); 185 186 /** 187 * Gets the context of ubiquity. A molecule can be ubiquitously consumed, or can be 188 * ubiquitously produced, or both. When it is both, this method has to return null. 189 * @param neighborSize number of neighbors in the used-to-produce network 190 * @param upstrOnly number of upstream neighbors in the used-to-produce network, that are not also at downstream 191 * @param dwstrOnly number of downstream neighbors in the used-to-produce network, that are not also at upstream 192 */ 193 public RelType getContext(int neighborSize, int upstrOnly, int dwstrOnly); 194 } 195 196}