package org.biopax.paxtools.search;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.CachingWrapperFilter;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryWrapperFilter;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.SearcherFactory;
import org.apache.lucene.search.SearcherManager;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.util.Version;
import org.biopax.paxtools.controller.Fetcher;
import org.biopax.paxtools.controller.ModelUtils;
import org.biopax.paxtools.controller.SimpleEditorMap;
import org.biopax.paxtools.model.BioPAXElement;
import org.biopax.paxtools.model.Model;
import org.biopax.paxtools.model.level3.BioSource;
import org.biopax.paxtools.model.level3.Level3Element;
import org.biopax.paxtools.model.level3.Named;
import org.biopax.paxtools.model.level3.Pathway;
import org.biopax.paxtools.model.level3.Process;
import org.biopax.paxtools.model.level3.Provenance;
import org.biopax.paxtools.model.level3.UnificationXref;
import org.biopax.paxtools.model.level3.XReferrable;
import org.biopax.paxtools.model.level3.Xref;
import org.biopax.paxtools.util.ClassFilterSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
 * A full-text searcher/indexer for BioPAX L3 models.
 *
 * The engine keeps one Lucene index directory per instance: {@link #index()}
 * rebuilds that index from the in-memory model, and
 * {@link #search(String, int, Class, String[], String[])} queries it and maps
 * the matching documents back to the model's objects (by URI).
 *
 * Search hits are reported by writing {@link HitAnnotation} entries into the
 * annotations map of the matching model objects, i.e. searching mutates
 * the model's objects.
 *
 * @author rodche
 */
public class SearchEngine implements Indexer, Searcher {
    private static final Logger LOG = LoggerFactory.getLogger(SearchEngine.class);

    // search fields
    public static final String FIELD_URI = "uri";
    public static final String FIELD_KEYWORD = "keyword"; //anything, e.g., names, terms, comments, incl. - from child elements
    public static final String FIELD_NAME = "name"; // standardName, displayName, other names
    public static final String FIELD_XREFDB = "xrefdb"; //xref.db
    public static final String FIELD_XREFID = "xrefid"; //xref.id
    public static final String FIELD_PATHWAY = "pathway"; //parent/owner pathways; to be inferred from the whole biopax model
    public static final String FIELD_SIZE = "size";
    // Full-text search/filter fields (case sensitive) -
    //index organism names, cell/tissue type (term), taxonomy id, but only store BioSource URIs
    public static final String FIELD_ORGANISM = "organism";
    //index data source names, but only URIs are stored in the index
    public static final String FIELD_DATASOURCE = "datasource";
    public static final String FIELD_TYPE = "type";

    //Default fields to use with the MultiFieldQueryParser;
    //one can still search in other fields directly, e.g.,
    //pathway:some_keywords datasource:"pid", etc.
    public final static String[] DEFAULT_FIELDS =
    {
        FIELD_KEYWORD, //includes all data type properties (names, terms, comments),
        // also from child elements up to given depth (3), also includes pathway names (inferred)
        FIELD_NAME, // standardName, displayName, other names
        FIELD_XREFID, //xref.id (also direct child's xref.id, i.e., can find both xref and its owners using a xrefid:<id> query string)
        FIELD_SIZE, // find entities with a given no. child/associated processes...
        // Deliberately NOT default fields:
        //  FIELD_PATHWAY - only this/parent pathway URIs are stored in the index,
        //    not indexed/analyzed; names get indexed but not stored;
        //  FIELD_ORGANISM, FIELD_DATASOURCE, FIELD_TYPE - these are for filtering only.
    };

    /**
     * A Key for the value in a
     * BioPAX element's annotations map
     * where additional information about
     * corresponding search hit will be stored.
     */
    public enum HitAnnotation
    {
        HIT_EXCERPT,
        HIT_SIZE,
        HIT_ORGANISM,
        HIT_DATASOURCE,
        HIT_PATHWAY,
    }

    public final static int DEFAULT_MAX_HITS_PER_PAGE = 100;

    /** Number of worker threads used by {@link #index()}. */
    private static final int INDEXING_THREADS = 30;

    /** Detects filter values that consist of several words. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s");

    private final Model model;
    private int maxHitsPerPage;
    private final Analyzer analyzer;
    private final File indexFile;
    private SearcherManager searcherManager;

    /**
     * Main Constructor.
     *
     * If the index directory already exists, a searcher is opened on it
     * right away; otherwise searching becomes possible after {@link #index()}.
     *
     * @param model BioPAX object model to be indexed or searched.
     * @param indexLocation full path to the index directory
     */
    public SearchEngine(Model model, String indexLocation) {
        this.model = model;
        this.indexFile = new File(indexLocation);
        this.maxHitsPerPage = DEFAULT_MAX_HITS_PER_PAGE;
        this.analyzer = new StandardAnalyzer();
        initSearcherManager();
    }

    /**
     * Opens a searcher manager on the index directory, if that exists.
     * Failures are logged rather than thrown, which leaves
     * {@code searcherManager} unset (null).
     */
    private void initSearcherManager() {
        try {
            if(indexFile.exists())
                this.searcherManager =
                    new SearcherManager(MMapDirectory.open(indexFile), new SearcherFactory());
            else
                LOG.info(indexFile.getPath() + " does not exist.");
        } catch (IOException e) {
            //pass the exception as the last argument to keep the stack trace
            LOG.warn("Could not create a searcher: " + e, e);
        }
    }

    /**
     * Sets the maximum no. hits per search results page (pagination).
     *
     * @param maxHitsPerPage positive int value; otherwise - unlimited
     */
    public void setMaxHitsPerPage(int maxHitsPerPage) {
        this.maxHitsPerPage = maxHitsPerPage;
    }

    /**
     * Gets the maximum no. hits per search results page (pagination parameter).
     * @return int value
     */
    public int getMaxHitsPerPage() {
        return maxHitsPerPage;
    }

    /**
     * Full-text search in the index.
     *
     * The special query "*" is not parsed by Lucene; instead, it finds all
     * objects of the given BioPAX type (any Level3Element if the type is null).
     *
     * @param query a Lucene query string, or "*"
     * @param page search results page number (the first page is 0;
     *             negative values are treated as 0)
     * @param filterByType optional (can be null) BioPAX type to filter by; sub-types match too
     * @param datasources optional filter values: data source names or URIs (joined with OR)
     * @param organisms optional filter values: organism names, taxonomy ids, etc. (joined with OR)
     * @return search result: hits (model objects annotated with {@link HitAnnotation} values),
     *         page number, totals
     * @throws IllegalStateException when there is no index to search in
     * @throws RuntimeException when the query cannot be parsed or the index cannot be read
     */
    public SearchResult search(String query, int page,
            Class<? extends BioPAXElement> filterByType, String[] datasources,
            String[] organisms)
    {
        if(searcherManager == null) {
            throw new IllegalStateException("Cannot search: there is no index at "
                + indexFile.getPath() + " (build it with index() first).");
        }

        LOG.debug("search: " + query + ", page: " + page
            + ", filterBy: " + filterByType
            + "; extra filters: ds in (" + Arrays.toString(datasources)
            + "), org. in (" + Arrays.toString(organisms) + ")");

        SearchResult response;
        IndexSearcher searcher = null;

        try {
            searcher = searcherManager.acquire();

            if(!query.trim().equals("*")) { //if not "*" query, which is not supported out-of-the-box, then
                QueryParser queryParser = new MultiFieldQueryParser(DEFAULT_FIELDS, analyzer);
                queryParser.setAllowLeadingWildcard(true);//TODO do we really want leading wildcards (e.g. *sulin)?

                //create the lucene query
                //(do NOT call searcher.rewrite(luceneQuery) here: with Lucene 4.1,
                // scoring/highlighting would not work for wildcard queries)
                Query luceneQuery = queryParser.parse(query);
                LOG.debug("parsed lucene query is " + luceneQuery.getClass().getSimpleName());

                //create filter: type AND (d OR d...) AND (o OR o...)
                Filter filter = createFilter(filterByType, datasources, organisms);

                //transform docs to hits, use a highlighter to get excerpts
                response = transform(luceneQuery, searcher, true,
                    findPage(searcher, luceneQuery, filter, page));

            } else { //find ALL objects of a particular BioPAX class (+ filters by organism, datasource)
                final Class<? extends BioPAXElement> type =
                    (filterByType != null) ? filterByType : Level3Element.class;

                //replace q="*" with a search for the class or its sub-class name in the TYPE field
                BooleanQuery luceneQuery = new BooleanQuery();
                for(Class<? extends BioPAXElement> subType : SimpleEditorMap.L3.getKnownSubClassesOf(type)) {
                    luceneQuery.add(new TermQuery(new Term(FIELD_TYPE, subType.getSimpleName().toLowerCase())), Occur.SHOULD);
                }
                //the type is already in the query, so filter by datasource/organism only
                Filter filter = createFilter(null, datasources, organisms);

                //convert (nothing to highlight here)
                response = transform(luceneQuery, searcher, false,
                    findPage(searcher, luceneQuery, filter, page));
            }

        } catch (ParseException e) {
            throw new RuntimeException("getTopDocs: failed to parse the query string.", e);
        } catch (IOException e) {
            throw new RuntimeException("getTopDocs: failed.", e);
        } finally {
            if(searcher != null) {
                try {
                    searcherManager.release(searcher);
                } catch (IOException e) {
                    //must not mask the search result or the original exception
                    LOG.warn("search: failed to release the index searcher.", e);
                }
            }
        }

        response.setPage(page);

        return response;
    }

    /**
     * Runs the query once and returns the requested page of top hits.
     *
     * A non-positive {@link #getMaxHitsPerPage()} means "unlimited": then the
     * page size is the number of documents in the index, i.e. all hits are on page 0.
     *
     * @param searcher index searcher
     * @param query lucene query
     * @param filter optional (can be null) filter
     * @param page page number; the first page is 0 (or less)
     * @return top docs of the page (no docs if the page is beyond the last hit)
     * @throws IOException when the index cannot be read
     */
    private TopDocs findPage(IndexSearcher searcher, Query query, Filter filter, int page)
            throws IOException
    {
        //there cannot be more hits than documents in the index
        final int maxDoc = Math.max(1, searcher.getIndexReader().maxDoc());
        final int pageSize = (maxHitsPerPage > 0) ? maxHitsPerPage : maxDoc;

        if(page <= 0) {
            return searcher.search(query, filter, pageSize);
        }

        //The collector pre-allocates its hit queue, so cap its size at maxDoc;
        //long arithmetic guards against int overflow for large page numbers.
        final int numHits = (int) Math.min(((long) page + 1) * pageSize, (long) maxDoc);
        //when the page starts beyond the queue, 'from' equals numHits, which yields no docs
        final int from = (int) Math.min((long) page * pageSize, (long) numHits);

        TopScoreDocCollector collector = TopScoreDocCollector.create(numHits, true);
        searcher.search(query, filter, collector);
        return collector.topDocs(from, pageSize);
    }


    /**
     * Returns a SearchResult
     * that contains a List<BioPAXElement>,
     * some parameters, totals, etc.
     *
     * Each hit is the model's object that has the same URI as the index document;
     * the object's annotations map gets the {@link HitAnnotation} values.
     * Documents that have no corresponding object in the model are skipped.
     *
     * @param query the lucene query that produced the top docs
     * @param searcher index searcher to load the documents with
     * @param highlight whether to generate excerpts with highlighted matches
     * @param topDocs one page of top docs
     */
    private SearchResult transform(Query query, IndexSearcher searcher, boolean highlight, TopDocs topDocs)
            throws CorruptIndexException, IOException
    {
        final SearchResult response = new SearchResult();
        final List<BioPAXElement> hits = new ArrayList<BioPAXElement>();

        response.setMaxHitsPerPage(maxHitsPerPage);
        response.setHits(hits);

        for(ScoreDoc scoreDoc : topDocs.scoreDocs) {
            Document doc = searcher.doc(scoreDoc.doc);
            String uri = doc.get(FIELD_URI);
            BioPAXElement bpe = model.getByID(uri);
            LOG.debug("transform: doc:" + scoreDoc.doc + ", uri:" + uri);

            if(bpe == null) {
                //the index and the model are out of sync
                LOG.warn("transform: skipped the hit, because there is no object in the model; uri: " + uri);
                continue;
            }

            // use the highlighter (get matching fragments)
            // for this to work, all keywords were stored in the index field
            String excerpt = null;
            if (highlight && doc.get(FIELD_KEYWORD) != null) {
                excerpt = highlightKeywords(query, doc);
            } else if(highlight) {
                LOG.warn("Highlighter skipped, because KEYWORD field was null; hit: "
                    + uri + ", " + bpe.getModelInterface().getSimpleName());
            }

            // extract organisms (URI only) if not done before
            annotateOnce(bpe, HitAnnotation.HIT_ORGANISM, doc, FIELD_ORGANISM, null);

            // extract data sources (URI only) if not previously done
            annotateOnce(bpe, HitAnnotation.HIT_DATASOURCE, doc, FIELD_DATASOURCE, null);

            // extract parent pathway URIs (excluding itself) if not previously done
            annotateOnce(bpe, HitAnnotation.HIT_PATHWAY, doc, FIELD_PATHWAY, uri);

            //store the no. processes in the sub-network if not previously done
            if(doc.get(FIELD_SIZE)!=null && !bpe.getAnnotations().containsKey(HitAnnotation.HIT_SIZE.name()))
                bpe.getAnnotations().put(HitAnnotation.HIT_SIZE.name(), Integer.valueOf(doc.get(FIELD_SIZE)));

            //Store the Lucene's score and explanation along with the excerpt.
            //The excerpt is built anew for every hit (never appended to a value
            //left in the annotations by a previous search).
            if(excerpt == null) excerpt = "";
            excerpt += " -SCORE- " + scoreDoc.score + " -EXPLANATION- " + searcher.explain(query, scoreDoc.doc);
            bpe.getAnnotations().put(HitAnnotation.HIT_EXCERPT.name(), excerpt);

            hits.add(bpe);
        }

        //set total no. hits
        response.setTotalHits(topDocs.totalHits);

        return response;
    }

    /**
     * Makes an excerpt from the stored 'keyword' field values of the document,
     * where the fragments that match the query are wrapped with
     * {@code <span class='hitHL'>...</span>}.
     *
     * @param query the lucene query
     * @param doc index document (its 'keyword' values must be stored)
     * @return best matching fragments joined with "..."; can be empty
     */
    private String highlightKeywords(Query query, Document doc) {
        // use a Highlighter (store.YES must be enabled for 'keyword' field)
        QueryScorer scorer = new QueryScorer(query, FIELD_KEYWORD);
        //this fixes scoring/highlighting for all-field wildcard queries like q=insulin*
        //but not for term/prefix queries, i.e, q=name:insulin*, q=pathway:brca2. TODO
        scorer.setExpandMultiTermQuery(true);

        //TODO use PostingsHighlighter once it's stable (see http://lucene.apache.org/core/4_10_0/highlighter/org/apache/lucene/search/postingshighlight/PostingsHighlighter.html)
        SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<span class='hitHL'>", "</span>");
        Highlighter highlighter = new Highlighter(formatter, scorer);
        highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 80));

        final String text = StringUtils.join(doc.getValues(FIELD_KEYWORD), " ");
        try {
            TokenStream tokenStream = analyzer.tokenStream("", new StringReader(text));
            return highlighter.getBestFragments(tokenStream, text, 7, "...");
        } catch (Exception e) {
            throw new RuntimeException("Failed to highlight the hit: " + doc.get(FIELD_URI), e);
        }
    }

    /**
     * Copies the stored values of the index field to the annotations of the hit,
     * as a sorted set, unless the field has no stored values or
     * the annotation is already there.
     *
     * @param bpe the hit
     * @param key annotation key
     * @param doc index document
     * @param field index field name
     * @param exclude a value to leave out (can be null - to copy all)
     */
    private static void annotateOnce(BioPAXElement bpe, HitAnnotation key,
            Document doc, String field, String exclude)
    {
        if(doc.get(field) == null || bpe.getAnnotations().containsKey(key.name()))
            return;

        //note: only URIs are stored in these index fields
        Set<String> uniqueVals = new TreeSet<String>();
        for(String v : doc.getValues(field)) {
            if(!v.equals(exclude))
                uniqueVals.add(v);
        }
        bpe.getAnnotations().put(key.name(), uniqueVals);
    }


    /**
     * (Re-)builds the index: removes all the documents from the index,
     * then adds a document for every object of the model (using
     * a pool of worker threads), and finally opens a new searcher.
     *
     * An object that fails to index is logged and skipped.
     *
     * @throws RuntimeException when the index cannot be opened or closed,
     *         or when the calling thread gets interrupted
     */
    public void index() {
        final int numObjects = model.getObjects().size();
        LOG.info("index(), there are " + numObjects + " BioPAX objects to be (re-)indexed.");

        final IndexWriter indexWriter = openEmptyIndex();
        final ExecutorService exec = Executors.newFixedThreadPool(INDEXING_THREADS);
        final AtomicInteger numLeft = new AtomicInteger(numObjects);

        try {
            for(final BioPAXElement bpe : model.getObjects()) {
                // prepare & index each element in a separate thread
                exec.execute(new Runnable() {
                    public void run() {
                        try {
                            annotateAndIndex(bpe, indexWriter);
                        } catch (RuntimeException e) {
                            //do not let it kill the worker thread unnoticed
                            LOG.error("index(), failed to index: " + bpe.getRDFId(), e);
                        } finally {
                            //count, log a progress message
                            int left = numLeft.decrementAndGet();
                            if(left % 10000 == 0)
                                LOG.info("index(), biopax objects left to index: " + left);
                        }
                    }
                });
            }

            exec.shutdown(); //stop accepting new tasks
            exec.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS); //wait
        } catch (InterruptedException e) {
            exec.shutdownNow();
            Thread.currentThread().interrupt(); //restore the interrupted status
            closeQuietly(indexWriter); //release the index write lock
            throw new RuntimeException("Interrupted!", e);
        }

        try {
            indexWriter.close(); //wait for pending op., auto-commit, close.
        } catch (IOException e) {
            throw new RuntimeException("Failed to close IndexWriter.", e);
        }

        //finally, create a new searcher manager
        initSearcherManager();
    }

    /**
     * Closes the current searcher manager, if any, and
     * opens an index writer on the index directory,
     * having all the existing documents deleted.
     *
     * @return index writer
     * @throws RuntimeException when failed (the writer, if was opened, gets closed)
     */
    private IndexWriter openEmptyIndex() {
        IndexWriter iw = null;
        try {
            //close the searcher manager if the old index exists
            if(searcherManager != null) {
                searcherManager.close();
                searcherManager = null;
            }
            IndexWriterConfig conf = new IndexWriterConfig(Version.LATEST, analyzer);
            iw = new IndexWriter(FSDirectory.open(indexFile), conf);
            //cleanup
            iw.deleteAll();
            iw.commit();
            return iw;
        } catch (IOException e) {
            closeQuietly(iw);
            throw new RuntimeException("Failed to create a new IndexWriter.", e);
        }
    }

    /**
     * Closes the index writer on an error path, where
     * a failure to close must not mask the original problem.
     *
     * @param writer index writer; can be null
     */
    private static void closeQuietly(IndexWriter writer) {
        if(writer == null)
            return;
        try {
            writer.close();
        } catch (IOException e) {
            LOG.warn("Failed to close IndexWriter.", e);
        }
    }

    /**
     * Pre-computes the values to index (keywords, data sources, organisms,
     * parent pathways, size) for the BioPAX object, temporarily puts them
     * into the object's annotations map, and adds the object to the index.
     *
     * @param bpe BioPAX object
     * @param indexWriter index writer
     */
    private void annotateAndIndex(BioPAXElement bpe, IndexWriter indexWriter) {
        // get or infer some important values if possible from this, child or parent objects:
        Set<String> keywords = ModelUtils.getKeywords(bpe, 3); //TODO use Filter<DataPropertyEditor>... args

        // a hack to remove special (debugging) biopax comments
        // (iterate over a copy, for the set is being modified)
        for(String s : new HashSet<String>(keywords)) {
            //exclude additional comments generated by normalizer, merger, etc.
            if(s.startsWith("REPLACED ") || s.contains("ADDED"))
                keywords.remove(s);
        }

        bpe.getAnnotations().put(FIELD_KEYWORD, keywords);
        bpe.getAnnotations().put(FIELD_DATASOURCE, ModelUtils.getDatasources(bpe));
        bpe.getAnnotations().put(FIELD_ORGANISM, ModelUtils.getOrganisms(bpe));
        bpe.getAnnotations().put(FIELD_PATHWAY, ModelUtils.getParentPathways(bpe)); //- includes itself if bpe is a pathway

        // for bio processes, also save the total number of member interactions or pathways:
        if(bpe instanceof org.biopax.paxtools.model.level3.Process) {
            int size = new Fetcher(SimpleEditorMap.L3, Fetcher.nextStepFilter)
                .fetch(bpe, Process.class).size();
            bpe.getAnnotations().put(FIELD_SIZE, Integer.toString(size));
        }

        index(bpe, indexWriter);
    }


    // internal methods

    /**
     * Creates a new Lucene Document that corresponds to a BioPAX object.
     * It does not check whether the document exists (should not be there,
     * because the {@link #index()} method cleans up the index)
     *
     * Some fields also include biopax data type property values not only from
     * the biopax object but also from its child elements, up to some depth
     * (using key-value pairs in the pre-computed bpe.annotations map;
     * these temporary annotations are removed from the map here):
     *
     *  'uri' - biopax object's absolute URI, index=no, store=yes;
     *
     *  'type' - biopax class (interface) simple name in lower case, analyze=no, store=yes;
     *
     *  'name' - names, analyze=yes, store=no; boosted;
     *
     *  'keyword' - infer from this bpe and its child objects' data properties,
     *            such as Score.value, structureData, structureFormat, chemicalFormula,
     *            availability, term, comment, patoData, author, source, title, url, published,
     *            up to given depth/level, analyze=yes, store=yes; the names and ids of
     *            the 'pathway' field are also stored here (for highlighting only, not indexed);
     *
     *  'datasource', 'organism' and 'pathway' - infer from this bpe and its child objects
     *            up to given depth/level; URIs are stored (the data source URI
     *            is also indexed, not analyzed); names, ids, terms - analyze=yes, store=no;
     *
     *  'size' - number of child processes, an integer; store=yes;
     *
     *  'xrefid' - xref ids in lower case, analyze=no, store=no;
     *
     *  'xrefdb' - xref db name in lower case (for Xref objects only), analyze=yes, store=no.
     *
     * @param bpe BioPAX object
     * @param indexWriter index writer
     * @throws RuntimeException when failed to add the document to the index
     */
    @SuppressWarnings("unchecked") //the annotation values are put by this class, in annotateAndIndex
    void index(BioPAXElement bpe, IndexWriter indexWriter) {
        // create a new document
        final Document doc = new Document();

        // save URI (not indexed field)
        Field field = new StoredField(FIELD_URI, bpe.getRDFId());
        doc.add(field);

        // index and store but not analyze/tokenize the biopax class name:
        field = new StringField(FIELD_TYPE, bpe.getModelInterface().getSimpleName().toLowerCase(), Field.Store.YES);
        doc.add(field);

        // make index fields from the annotations map (of pre-calculated/inferred values)
        if(!bpe.getAnnotations().isEmpty()) {
            if(bpe.getAnnotations().containsKey(FIELD_PATHWAY)) {
                addPathways((Set<Pathway>)bpe.getAnnotations().get(FIELD_PATHWAY), doc);
            }
            if(bpe.getAnnotations().containsKey(FIELD_ORGANISM)) {
                addOrganisms((Set<BioSource>)bpe.getAnnotations().get(FIELD_ORGANISM), doc);
            }
            if(bpe.getAnnotations().containsKey(FIELD_DATASOURCE)) {
                addDatasources((Set<Provenance>)bpe.getAnnotations().get(FIELD_DATASOURCE), doc);
            }
            if(bpe.getAnnotations().containsKey(FIELD_KEYWORD)) {
                addKeywords((Set<String>)bpe.getAnnotations().get(FIELD_KEYWORD), doc);
            }
            if(bpe.getAnnotations().containsKey(FIELD_SIZE)) {
                field = new IntField(FIELD_SIZE,
                    Integer.parseInt((String)bpe.getAnnotations()
                    .get(FIELD_SIZE)), Field.Store.YES);
                doc.add(field);
            }
        }
        // the temporary values are not needed anymore
        bpe.getAnnotations().remove(FIELD_KEYWORD);
        bpe.getAnnotations().remove(FIELD_DATASOURCE);
        bpe.getAnnotations().remove(FIELD_ORGANISM);
        bpe.getAnnotations().remove(FIELD_PATHWAY);
        bpe.getAnnotations().remove(FIELD_SIZE);

        // name (the standard name gets the highest boost, then - display name, other names)
        if(bpe instanceof Named) {
            Named named = (Named) bpe;
            if(named.getStandardName() != null) {
                field = new TextField(FIELD_NAME, named.getStandardName(), Field.Store.NO);
                field.setBoost(3.0f);
                doc.add(field);
            }
            if(named.getDisplayName() != null && !named.getDisplayName().equalsIgnoreCase(named.getStandardName())) {
                field = new TextField(FIELD_NAME, named.getDisplayName(), Field.Store.NO);
                field.setBoost(2.5f);
                doc.add(field);
            }
            for(String name : named.getName()) {
                //skip the names that were already added above
                if(name.equalsIgnoreCase(named.getDisplayName()) || name.equalsIgnoreCase(named.getStandardName()))
                    continue;
                field = new TextField(FIELD_NAME, name.toLowerCase(), Field.Store.NO);
                field.setBoost(2.0f);
                doc.add(field);
            }
        }

        // XReferrable.xref - build 'xrefid' index field from all Xrefs
        if(bpe instanceof XReferrable) {
            XReferrable xr = (XReferrable) bpe;
            for(Xref xref : xr.getXref()) {
                if (xref.getId() != null) {
                    //the field is not_analyzed; so in order to make search case-insensitive
                    //(when searcher uses standard analyzer), we turn the value to lowercase.
                    //(boost cannot be set for such field/store type)
                    field = new StringField(FIELD_XREFID, xref.getId().toLowerCase(), Field.Store.NO);
                    doc.add(field);
                }
            }
        }

        // Xref db/id (these are for a precise search by standard bio ID)
        if(bpe instanceof Xref) {
            Xref xref = (Xref) bpe;
            if (xref.getId() != null) {
                field = new StringField(FIELD_XREFID, xref.getId().toLowerCase(), Field.Store.NO);
                doc.add(field);
            }
            if (xref.getDb() != null) {
                field = new TextField(FIELD_XREFDB, xref.getDb().toLowerCase(), Field.Store.NO);
                doc.add(field);
            }
        }

        // write
        try {
            indexWriter.addDocument(doc);
        } catch (IOException e) {
            throw new RuntimeException("Failed to index; " + bpe.getRDFId(), e);
        }
    }

    /**
     * Adds the keywords (in lower case) to the 'keyword' field
     * of the document; analyzed, stored.
     */
    private void addKeywords(Set<String> keywords, Document doc) {
        for (String keyword : keywords) {
            Field f = new TextField(FIELD_KEYWORD, keyword.toLowerCase(), Field.Store.YES);
            doc.add(f);
        }
    }

    /**
     * Adds the URIs (indexed as is, stored) and names (in lower case,
     * analyzed, not stored) of the data sources to the 'datasource' field of the document.
     */
    private void addDatasources(Set<Provenance> set, Document doc) {
        for (Provenance p : set) {
            // Index and store URI (untokenized) -
            // required to accurately calculate no. entities or to filter by data source (diff. datasources may share same names)
            doc.add(new StringField(FIELD_DATASOURCE, p.getRDFId(), Field.Store.YES));
            // index names as well
            for (String s : p.getName())
                doc.add(new TextField(FIELD_DATASOURCE, s.toLowerCase(), Field.Store.NO));
        }
    }

    /**
     * Adds the URIs (stored only), and also names, unification xref ids,
     * tissue and cell type terms (in lower case, analyzed, not stored)
     * of the bio sources to the 'organism' field of the document.
     */
    private void addOrganisms(Set<BioSource> set, Document doc) {
        for(BioSource bs : set) {
            // store URI as is (not indexed, untokenized)
            doc.add(new StoredField(FIELD_ORGANISM, bs.getRDFId()));

            // add organism names
            for(String s : bs.getName()) {
                doc.add(new TextField(FIELD_ORGANISM, s.toLowerCase(), Field.Store.NO));
            }
            // add taxonomy
            for(UnificationXref x :
                new ClassFilterSet<Xref,UnificationXref>(bs.getXref(), UnificationXref.class)) {
                if(x.getId() != null)
                    doc.add(new TextField(FIELD_ORGANISM, x.getId().toLowerCase(), Field.Store.NO));
            }
            // include tissue type terms
            if (bs.getTissue() != null) {
                for (String s : bs.getTissue().getTerm())
                    doc.add(new TextField(FIELD_ORGANISM, s.toLowerCase(), Field.Store.NO));
            }
            // include cell type terms
            if (bs.getCellType() != null) {
                for (String s : bs.getCellType().getTerm()) {
                    doc.add(new TextField(FIELD_ORGANISM, s.toLowerCase(), Field.Store.NO));
                }
            }
        }
    }

    /**
     * Adds the URIs (stored only), and also names and unification xref ids
     * (in lower case, analyzed, not stored) of the pathways to the 'pathway'
     * field of the document; the names and ids are also stored (not indexed)
     * in the 'keyword' field.
     */
    private void addPathways(Set<Pathway> set, Document doc) {
        for(Pathway pw : set) {
            //add URI as is (do not lowercase; do not index; store=yes - required to report hits, e.g., as xml)
            doc.add(new StoredField(FIELD_PATHWAY, pw.getRDFId()));

            // add names to the 'pathway' (don't store) and 'keywords' (store, don't index) fields
            for (String s : pw.getName()) {
                doc.add(new TextField(FIELD_PATHWAY, s.toLowerCase(), Field.Store.NO));
                doc.add(new StoredField(FIELD_KEYWORD, s.toLowerCase()));//for highlighting only, not indexed
            }

            // add unification xref IDs too
            for (UnificationXref x : new ClassFilterSet<Xref, UnificationXref>(
                    pw.getXref(), UnificationXref.class)) {
                if (x.getId() != null) {
                    // index in both 'pathway' (don't store) and 'keywords' (store, don't index)
                    doc.add(new TextField(FIELD_PATHWAY, x.getId().toLowerCase(), Field.Store.NO));
                    doc.add(new StoredField(FIELD_KEYWORD, x.getId().toLowerCase()));//for highlighting only, not indexed
                }
            }
        }
    }


    /**
     * Gets the id of the first unification xref of the bio source,
     * which db name is "taxonomy" (ignoring case).
     * (This method is currently not called from within this class.)
     *
     * @param bioSource organism
     * @return the id or null if there is no such xref
     */
    private String getTaxonId(BioSource bioSource) {
        String id = null;
        if(!bioSource.getXref().isEmpty()) {
            Set<UnificationXref> uxs = new
                ClassFilterSet<Xref,UnificationXref>(bioSource.getXref(),
                        UnificationXref.class);
            for(UnificationXref ux : uxs) {
                if("taxonomy".equalsIgnoreCase(ux.getDb())) {
                    id = ux.getId();
                    break;
                }
            }
        }
        return id;
    }

    /**
     * Creates a search filter like
     * type AND (datasource OR datasource...)
     *      AND (organism OR organism OR...)
     *
     * Both names (partial or full) and URIs should work as filter values.
     * NOTE(review): organism URIs are stored but not indexed (see addOrganisms),
     * so filtering by an organism URI cannot match - to be confirmed.
     *
     * @param type optional (can be null) biopax type; its known sub-types match too
     * @param datasources optional (can be null or empty) data source filter values
     * @param organisms optional (can be null or empty) organism filter values
     * @return the filter or null when there is nothing to filter by
     */
    private Filter createFilter(Class<? extends BioPAXElement> type,
            String[] datasources, String[] organisms) {

        BooleanQuery filterQuery = new BooleanQuery();

        //AND datasources
        if (datasources != null && datasources.length > 0) {
            filterQuery.add(subQuery(datasources, FIELD_DATASOURCE), Occur.MUST);
        }
        //AND organisms
        if (organisms != null && organisms.length > 0) {
            filterQuery.add(subQuery(organisms, FIELD_ORGANISM), Occur.MUST);
        }
        //AND type
        if(type != null) { //add biopax class filter
            BooleanQuery query = new BooleanQuery();
            query.add(new TermQuery(new Term(FIELD_TYPE, type.getSimpleName().toLowerCase())), Occur.SHOULD);//OR
            //for each biopax subclass (interface), add the name to the filter query
            for(Class<? extends BioPAXElement> subType : SimpleEditorMap.L3.getKnownSubClassesOf(type)) {
                query.add(new TermQuery(new Term(FIELD_TYPE, subType.getSimpleName().toLowerCase())), Occur.SHOULD);//OR
            }
            filterQuery.add(query, Occur.MUST);
        }

        if(filterQuery.clauses().isEmpty()) {
            return null;
        }

        LOG.debug("filterQuery: " + filterQuery.toString());
        return new CachingWrapperFilter( new QueryWrapperFilter(filterQuery) ); //TODO why CachingWrapperFilter, QueryWrapperFilter?
    }

    /**
     * Filter values here are joint with 'OR' operator,
     * but if a value has internal whitespace symbols, this also makes a sub-query,
     * in which terms are joint with 'AND'. This is to allow filtering
     * by datasource/organism's full name, partial name, or uri
     * and allowing multiple datasources/organisms.
     *
     * A single-word value is matched both in lower case (names, ids are indexed
     * in lower case) and as is (URIs are indexed without changing the case).
     *
     * @param filterValues filter values (not null)
     * @param filterField index field name
     * @return the query
     */
    private Query subQuery(String[] filterValues, String filterField) {
        BooleanQuery query = new BooleanQuery();
        for(String v : filterValues) {
            if(WHITESPACE.matcher(v).find()) {
                //v has whitespace chars (several words): make a "word1 AND word2 AND..." subquery
                query.add(allTokensQuery(v, filterField), Occur.SHOULD);
            } else {
                final String lower = v.toLowerCase();
                query.add(new TermQuery(new Term(filterField, lower)), Occur.SHOULD);
                if(!lower.equals(v)) {
                    //e.g., a case-sensitive URI that was indexed as is
                    query.add(new TermQuery(new Term(filterField, v)), Occur.SHOULD);
                }
            }
        }

        return query;
    }

    /**
     * Makes a "token1 AND token2 AND..." query from the text.
     *
     * The text is split with the same analyzer as was used for indexing
     * (simply splitting by whitespace was a bug: a text with 'of', 'for',
     * 'and', etc., did not match anything, because the standard analyzer
     * never produces such tokens).
     *
     * @param text filter value that consists of several words
     * @param filterField index field name
     * @return the query (has no clauses if the analyzer produced no tokens)
     */
    private BooleanQuery allTokensQuery(String text, String filterField) {
        BooleanQuery bq = new BooleanQuery();
        TokenStream tokenStream = null;
        try {
            tokenStream = analyzer.tokenStream(filterField, new StringReader(text));
            CharTermAttribute chattr = tokenStream.addAttribute(CharTermAttribute.class);
            tokenStream.reset();
            while(tokenStream.incrementToken()) {
                String token = chattr.toString();
                bq.add(new TermQuery(new Term(filterField, token)), Occur.MUST);
            }
            tokenStream.end();
        } catch (IOException e) {
            //should never happen as we use StringReader
            throw new RuntimeException("Failed to open a token stream; "
                + "field:" + filterField + ", value:" + text,e);
        } finally {
            //always close, or the analyzer's reused token stream gets unusable
            if(tokenStream != null) {
                try {
                    tokenStream.close();
                } catch (IOException e) {
                    LOG.warn("Failed to close a token stream; field:" + filterField, e);
                }
            }
        }
        return bq;
    }

}