Source code

001package org.biopax.paxtools.search;
002
003import java.io.File;
004import java.io.IOException;
005import java.io.StringReader;
006import java.util.ArrayList;
007import java.util.Arrays;
008import java.util.HashSet;
009import java.util.List;
010import java.util.Set;
011import java.util.TreeSet;
012import java.util.concurrent.ExecutorService;
013import java.util.concurrent.Executors;
014import java.util.concurrent.TimeUnit;
015import java.util.concurrent.atomic.AtomicInteger;
016import java.util.regex.Pattern;
017
018import org.apache.commons.lang.StringUtils;
019import org.apache.lucene.analysis.Analyzer;
020import org.apache.lucene.analysis.TokenStream;
021import org.apache.lucene.analysis.standard.StandardAnalyzer;
022import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
023import org.apache.lucene.document.Document;
024import org.apache.lucene.document.Field;
025import org.apache.lucene.document.IntField;
026import org.apache.lucene.document.StoredField;
027import org.apache.lucene.document.StringField;
028import org.apache.lucene.document.TextField;
029import org.apache.lucene.index.CorruptIndexException;
030import org.apache.lucene.index.IndexWriter;
031import org.apache.lucene.index.IndexWriterConfig;
032import org.apache.lucene.index.Term;
033import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
034import org.apache.lucene.queryparser.classic.ParseException;
035import org.apache.lucene.queryparser.classic.QueryParser;
036import org.apache.lucene.search.BooleanQuery;
037import org.apache.lucene.search.CachingWrapperFilter;
038import org.apache.lucene.search.Filter;
039import org.apache.lucene.search.IndexSearcher;
040import org.apache.lucene.search.Query;
041import org.apache.lucene.search.QueryWrapperFilter;
042import org.apache.lucene.search.ScoreDoc;
043import org.apache.lucene.search.SearcherFactory;
044import org.apache.lucene.search.SearcherManager;
045import org.apache.lucene.search.TermQuery;
046import org.apache.lucene.search.TopDocs;
047import org.apache.lucene.search.TopScoreDocCollector;
048import org.apache.lucene.search.BooleanClause.Occur;
049import org.apache.lucene.search.highlight.Highlighter;
050import org.apache.lucene.search.highlight.QueryScorer;
051import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
052import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
053import org.apache.lucene.store.FSDirectory;
054import org.apache.lucene.store.MMapDirectory;
055import org.apache.lucene.util.Version;
056import org.biopax.paxtools.controller.Fetcher;
057import org.biopax.paxtools.controller.ModelUtils;
058import org.biopax.paxtools.controller.SimpleEditorMap;
059import org.biopax.paxtools.model.BioPAXElement;
060import org.biopax.paxtools.model.Model;
061import org.biopax.paxtools.model.level3.BioSource;
062import org.biopax.paxtools.model.level3.Level3Element;
063import org.biopax.paxtools.model.level3.Named;
064import org.biopax.paxtools.model.level3.Pathway;
065import org.biopax.paxtools.model.level3.Process;
066import org.biopax.paxtools.model.level3.Provenance;
067import org.biopax.paxtools.model.level3.UnificationXref;
068import org.biopax.paxtools.model.level3.XReferrable;
069import org.biopax.paxtools.model.level3.Xref;
070import org.biopax.paxtools.util.ClassFilterSet;
071import org.slf4j.Logger;
072import org.slf4j.LoggerFactory;
073
074
075/**
076 * A full-text searcher/indexer for BioPAX L3 models.
077 * 
078 * @author rodche
079 */
080public class SearchEngine implements Indexer, Searcher {
        // class-wide SLF4J logger
        private static final Logger LOG = LoggerFactory.getLogger(SearchEngine.class);
        
        // search fields (names of the Lucene index fields written by the index methods of this class)
        // 'uri' is written as a stored-only field (not searchable); it maps a hit back to the model object
        public static final String FIELD_URI = "uri";
        public static final String FIELD_KEYWORD = "keyword"; //anything, e.g., names, terms, comments, incl. - from child elements 
        public static final String FIELD_NAME = "name"; // standardName, displayName, other names
        public static final String FIELD_XREFDB = "xrefdb"; //xref.db
        public static final String FIELD_XREFID = "xrefid"; //xref.id
        public static final String FIELD_PATHWAY = "pathway"; //parent/owner pathways; to be inferred from the whole biopax model
        // 'size' - for Process objects only: the number of processes fetched from the object (stored int field)
        public static final String FIELD_SIZE = "size"; 
        // Full-text search/filter fields (case sensitive) -
        //index organism names, cell/tissue type (term), taxonomy id, but only store BioSource URIs     
        public static final String FIELD_ORGANISM = "organism";
        //index data source names, but only URIs are stored in the index
        public static final String FIELD_DATASOURCE = "datasource";
        // 'type' - lower-cased simple name of the BioPAX model interface (not analyzed, stored)
        public static final String FIELD_TYPE = "type";
        
        //Default fields to use with the MultiFieldQueryParser;
        //one can still search in other fields directly, e.g.,
        //pathway:some_keywords datasource:"pid", etc.
        //NOTE(review): this is a public mutable array - callers are presumably expected not to modify it.
        public final static String[] DEFAULT_FIELDS = 
        {
                        FIELD_KEYWORD, //includes all data type properties (names, terms, comments), 
                        // also from child elements up to given depth (3), also includes pathway names (inferred)
                        FIELD_NAME, // standardName, displayName, other names
                        FIELD_XREFID, //xref.id (also direct child's xref.id, i.e., can find both xref and its owners using a xrefid:<id> query string)
                        FIELD_SIZE, // find entities with a given no. child/associated processes...
//                      FIELD_PATHWAY, // only this/parent pathway URIs are stored in the index, not indexed/analyzed; names get indexed but not stored
// the following fields are for filtering only (thus excluded):
//                      FIELD_ORGANISM, 
//                      FIELD_DATASOURCE, 
//                      FIELD_TYPE,
        };
114        
        /**
         * Keys for the values in a BioPAX element's annotations map
         * where additional information about the corresponding search hit
         * is stored. The enum constant's {@code name()} is used as the map key.
         */
        public enum HitAnnotation 
        {
                // text excerpt (highlighted fragments; the score and explanation are appended to it)
                HIT_EXCERPT,
                // value of the 'size' index field, as an Integer
                HIT_SIZE,
                // set of the stored 'organism' field values
                HIT_ORGANISM,
                // set of the stored 'datasource' field values
                HIT_DATASOURCE,
                // set of the stored 'pathway' field values, except for the hit's own URI
                HIT_PATHWAY,
        }
129                
        // the BioPAX model that gets indexed; search hits are looked up in it by URI
        private final Model model;
        // page size used by the search; initialized to DEFAULT_MAX_HITS_PER_PAGE
        private int maxHitsPerPage;
        // the analyzer used for indexing, query parsing and highlighting
        private final Analyzer analyzer;
        // the index directory location
        private final File indexFile;
        // stays null if the index directory does not exist or could not be opened
        private SearcherManager searcherManager;

        // the default page size (no. hits per page)
        public final static int DEFAULT_MAX_HITS_PER_PAGE = 100;
137        
138        /**
139         * Main Constructor.
140         *
141         * @param model BioPAX object model to be indexed or searched.
142         * @param indexLocation full path to the index directory
143         */
144        public SearchEngine(Model model, String indexLocation) {
145                this.model = model;
146                this.indexFile = new File(indexLocation);
147                initSearcherManager();
148                this.maxHitsPerPage = DEFAULT_MAX_HITS_PER_PAGE;
149                this.analyzer = new StandardAnalyzer();
150        }
151
152        private void initSearcherManager() {
153                try {
154                        if(indexFile.exists()) 
155                                this.searcherManager = 
156                                        new SearcherManager(MMapDirectory.open(indexFile), new SearcherFactory());
157                        else 
158                                LOG.info(indexFile.getPath() + " does not exist.");
159                } catch (IOException e) {
160                        LOG.warn("Could not create a searcher: " + e);
161                }
162        }
163
        /**
         * Sets the maximum no. hits per search results page (pagination).
         * The value is stored as is, without any validation.
         *
         * @param maxHitsPerPage positive int value; otherwise - unlimited
         *        (NOTE(review): the search code visible in this file passes the value
         *        to Lucene unchanged - confirm that non-positive values are really
         *        treated as "unlimited")
         */
        public void setMaxHitsPerPage(int maxHitsPerPage) {
                this.maxHitsPerPage = maxHitsPerPage;
        }
172
        /**
         * Gets the maximum no. hits per search results page (pagination parameter).
         * It is {@link #DEFAULT_MAX_HITS_PER_PAGE} unless changed with the setter.
         *
         * @return int value
         */
         public int getMaxHitsPerPage() {
                return maxHitsPerPage;
        }
180
181        public SearchResult search(String query, int page,
182                        Class<? extends BioPAXElement> filterByType, String[] datasources,
183                        String[] organisms) 
184        {
185                SearchResult response = null;
186                
187                LOG.debug("search: " + query + ", page: " + page 
188                        + ", filterBy: " + filterByType
189                        + "; extra filters: ds in (" + Arrays.toString(datasources)
190                        + "), org. in (" + Arrays.toString(organisms) + ")");
191                
192                IndexSearcher searcher = null;
193        
194                try {   
195                        QueryParser queryParser = new MultiFieldQueryParser(DEFAULT_FIELDS, analyzer);
196                        queryParser.setAllowLeadingWildcard(true);//TODO do we really want leading wildcards (e.g. *sulin)?
197                        
198                        searcher = searcherManager.acquire();   
199                        
200                        //find and transform top docs to search hits (beans), considering pagination...
201                        if(!query.trim().equals("*")) { //if not "*" query, which is not supported out-of-the-box, then
202                                //create the lucene query
203                                Query luceneQuery = queryParser.parse(query);
204//do NOT (Lucene 4.1), or scoring/highlighting won't work for wildcard queries...                               
205//luceneQuery = searcher.rewrite(luceneQuery); 
206                                LOG.debug("parsed lucene query is " + luceneQuery.getClass().getSimpleName());
207                                
208                                //create filter: type AND (d OR d...) AND (o OR o...)
209                                Filter filter = createFilter(filterByType, datasources, organisms);
210                                
211                                //get the first page of top hits
212                                TopDocs topDocs = searcher.search(luceneQuery, filter, maxHitsPerPage);
213                                //get the required hits page if page>0
214                                if(page>0) {
215                                        TopScoreDocCollector collector = TopScoreDocCollector.create(maxHitsPerPage*(page+1), true);  
216                                        searcher.search(luceneQuery, filter, collector);
217                                        topDocs = collector.topDocs(page * maxHitsPerPage, maxHitsPerPage);
218                                }
219                                
220                                //transform docs to hits, use a highlighter to get excerpts
221                                response = transform(luceneQuery, searcher, true, topDocs);
222        
223                        } else { //find ALL objects of a particular BioPAX class (+ filters by organism, datasource)
224                                if(filterByType==null) 
225                                        filterByType = Level3Element.class;
226
227                                //replace q="*" with a search for the class or its sub-class name in the TYPE field
228                                BooleanQuery luceneQuery = new BooleanQuery();
229                                for(Class<? extends BioPAXElement> subType : SimpleEditorMap.L3.getKnownSubClassesOf(filterByType)) {
230                                        luceneQuery.add(new TermQuery(new Term(FIELD_TYPE, subType.getSimpleName().toLowerCase())), Occur.SHOULD);
231                                }
232                                Filter filter = createFilter(null, datasources, organisms);
233                                
234                                //get the first page of top hits
235                                TopDocs topDocs = searcher.search(luceneQuery, filter, maxHitsPerPage);
236                                //get the required hits page if page>0
237                                if(page>0) {
238                                        TopScoreDocCollector collector = TopScoreDocCollector.create(maxHitsPerPage*(page+1), true);  
239                                        searcher.search(luceneQuery, filter, collector);
240                                        topDocs = collector.topDocs(page * maxHitsPerPage, maxHitsPerPage);     
241                                }
242                                
243                                //convert
244                                response = transform(luceneQuery, searcher, false, topDocs);                    
245                        }       
246                        
247                } catch (ParseException e) {
248                        throw new RuntimeException("getTopDocs: failed to parse the query string.", e);
249                } catch (IOException e) {
250                        throw new RuntimeException("getTopDocs: failed.", e);
251                } finally {
252                        try {
253                                if(searcher!=null) {
254                                        searcherManager.release(searcher);
255                                        searcher = null;
256                                }
257                        } catch (IOException e) {}      
258                }
259                
260                response.setPage(page);
261                
262                return response;
263        }
264
265        
266        /**
267         * Returns a SearchResult
268         * that contains a List<BioPAXElement>,
269         * some parameters, totals, etc.
270         */
271        private SearchResult transform(Query query, IndexSearcher searcher, boolean highlight, TopDocs topDocs) 
272                        throws CorruptIndexException, IOException 
273        {       
274                final SearchResult response = new SearchResult();
275                final List<BioPAXElement> hits = new ArrayList<BioPAXElement>();
276                
277                response.setMaxHitsPerPage(maxHitsPerPage);
278                response.setHits(hits);
279                
280                for(ScoreDoc scoreDoc : topDocs.scoreDocs) {                    
281                        Document doc = searcher.doc(scoreDoc.doc);
282                        String uri = doc.get(FIELD_URI);
283                        BioPAXElement bpe = model.getByID(uri);                 
284                        LOG.debug("transform: doc:" + scoreDoc.doc + ", uri:" + uri);
285                        
286                        // use the highlighter (get matching fragments)
287                        // for this to work, all keywords were stored in the index field
288                        if (highlight && doc.get(FIELD_KEYWORD) != null) {                              
289                                // use a Highlighter (store.YES must be enabled for 'keyword' field)
290                                QueryScorer scorer = new QueryScorer(query, FIELD_KEYWORD); 
291                                //this fixes scoring/highlighting for all-field wildcard queries like q=insulin* 
292                                //but not for term/prefix queries, i.e, q=name:insulin*, q=pathway:brca2. TODO
293                                scorer.setExpandMultiTermQuery(true);   
294                                
295                                //TODO use PostingsHighlighter once it's stable (see http://lucene.apache.org/core/4_10_0/highlighter/org/apache/lucene/search/postingshighlight/PostingsHighlighter.html)                              
296                                SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<span class='hitHL'>", "</span>");
297                                Highlighter highlighter = new Highlighter(formatter, scorer);
298                                highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 80));
299                                                                                
300                                final String text = StringUtils.join(doc.getValues(FIELD_KEYWORD), " ");
301                                try {
302                                        TokenStream tokenStream = analyzer.tokenStream("", new StringReader(text));
303                                        String res = highlighter.getBestFragments(tokenStream, text, 7, "...");
304
305                                        if(res != null && !res.isEmpty()) {
306                                                bpe.getAnnotations().put(HitAnnotation.HIT_EXCERPT.name(), res);
307                                        }
308
309                                } catch (Exception e) {throw new RuntimeException(e);}
310
311                        } else if(highlight) {
312                                LOG.warn("Highlighter skipped, because KEYWORD field was null; hit: " 
313                                                + uri + ", " + bpe.getModelInterface().getSimpleName());
314                        }
315                                                                                        
316                        // extract organisms (URI only) if not done before
317                        if(doc.get(FIELD_ORGANISM) != null && !bpe.getAnnotations().containsKey(HitAnnotation.HIT_ORGANISM.name())) {
318                                Set<String> uniqueVals = new TreeSet<String>();
319                                for(String o : doc.getValues(FIELD_ORGANISM)) {
320                                        //note: only URIS are stored in the index                                       
321                                        uniqueVals.add(o);
322                                }
323                                bpe.getAnnotations().put(HitAnnotation.HIT_ORGANISM.name(), uniqueVals);
324                        }
325                        
326                        // extract values form the index if not previously done
327                        if(doc.get(FIELD_DATASOURCE) != null && !bpe.getAnnotations().containsKey(HitAnnotation.HIT_DATASOURCE.name())) {
328                                Set<String> uniqueVals = new TreeSet<String>();
329                                for(String d : doc.getValues(FIELD_DATASOURCE)) {
330                                        //note: only URIS are stored in the index
331                                        uniqueVals.add(d);
332                                }
333                                bpe.getAnnotations().put(HitAnnotation.HIT_DATASOURCE.name(), uniqueVals);
334                        }       
335                        
336                        // extract only pathway URIs if not previously done
337                        //(because names and IDs used to be stored in the index field as well)
338                        if(doc.get(FIELD_PATHWAY) != null && !bpe.getAnnotations().containsKey(HitAnnotation.HIT_PATHWAY.name())) {
339                                Set<String> uniqueVals = new TreeSet<String>();
340                                for(String d : doc.getValues(FIELD_PATHWAY)) {
341                                        //only URIs were stored (though all names/ids were indexed/analyzed)
342                                        if(!d.equals(uri)) //exclude itself
343                                                uniqueVals.add(d);
344                                }
345                                bpe.getAnnotations().put(HitAnnotation.HIT_PATHWAY.name(), uniqueVals);
346                        }
347                        
348                        //store the no. processes in the sub-network if not previously done
349                        if(doc.get(FIELD_SIZE)!=null && !bpe.getAnnotations().containsKey(HitAnnotation.HIT_SIZE.name()))
350                                bpe.getAnnotations().put(HitAnnotation.HIT_SIZE.name(), Integer.valueOf(doc.get(FIELD_SIZE))); 
351                        
352                        //store the Lucene's score and explanation.
353                        String excerpt = (String) bpe.getAnnotations().get(HitAnnotation.HIT_EXCERPT.name());
354                        if(excerpt == null) excerpt = "";
355                        excerpt += " -SCORE- " + scoreDoc.score + " -EXPLANATION- " + searcher.explain(query, scoreDoc.doc);
356                        bpe.getAnnotations().put(HitAnnotation.HIT_EXCERPT.name(), excerpt);
357                        
358                        hits.add(bpe);
359                }
360                                                
361                //set total no. hits    
362                response.setTotalHits(topDocs.totalHits);
363                
364                return response;
365        }
366
367
368        public void index() {
369                final int numObjects =  model.getObjects().size();
370                LOG.info("index(), there are " + numObjects + " BioPAX objects to be (re-)indexed.");           
371                IndexWriter iw;         
372                try {
373                        //close the searcher manager if the old index exists
374                        if(searcherManager != null) {
375                                searcherManager.close();
376                                searcherManager = null;
377                        }
378                        IndexWriterConfig conf = new IndexWriterConfig(Version.LATEST, analyzer);
379                        iw = new IndexWriter(FSDirectory.open(indexFile), conf);
380                        //cleanup
381                        iw.deleteAll();
382                        iw.commit();
383                } catch (IOException e) {
384                        throw new RuntimeException("Failed to create a new IndexWriter.", e);
385                }               
386                final IndexWriter indexWriter = iw;
387
388                ExecutorService exec = Executors.newFixedThreadPool(30);
389                
390                final AtomicInteger numLeft = new AtomicInteger(numObjects);
391                for(final BioPAXElement bpe : model.getObjects()) {     
392                        // prepare & index each element in a separate thread
393                        exec.execute(new Runnable() {
394                                public void run() {                                     
395                                        // get or infer some important values if possible from this, child or parent objects:
396                                        Set<String> keywords = ModelUtils.getKeywords(bpe, 3); //TODO use Filter<DataPropertyEditor>... args
397                                        
398                                        // a hack to remove special (debugging) biopax comments
399                                        for(String s : new HashSet<String>(keywords)) {
400                                                //exclude additional comments generated by normalizer, merger, etc.
401                                                if(s.startsWith("REPLACED ") || s.contains("ADDED"))
402                                                        keywords.remove(s);
403                                        }
404                                        
405                                        bpe.getAnnotations().put(FIELD_KEYWORD, keywords);
406                                        bpe.getAnnotations().put(FIELD_DATASOURCE, ModelUtils.getDatasources(bpe));
407                                        bpe.getAnnotations().put(FIELD_ORGANISM, ModelUtils.getOrganisms(bpe));
408                                        bpe.getAnnotations().put(FIELD_PATHWAY, ModelUtils.getParentPathways(bpe)); //- includes itself if bpe is a pathway
409
410                                        // for bio processes, also save the total number of member interactions or pathways:
411                                        if(bpe instanceof org.biopax.paxtools.model.level3.Process) {
412                                                int size = new Fetcher(SimpleEditorMap.L3, Fetcher.nextStepFilter)
413                                                                .fetch(bpe, Process.class).size();                                              
414                                                bpe.getAnnotations().put(FIELD_SIZE, Integer.toString(size)); 
415                                        }
416
417                                        index(bpe, indexWriter);
418                                        
419                                        //count, log a progress message
420                                        int left = numLeft.decrementAndGet();
421                                        if(left % 10000 == 0)
422                                                LOG.info("index(), biopax objects left to index: " + left);
423                                }
424                        });
425                }
426                
427                exec.shutdown(); //stop accepting new tasks     
428                try { //wait
429                        exec.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS);
430                } catch (InterruptedException e) {
431                        throw new RuntimeException("Interrupted!", e);
432                }
433                
434                try {
435                        indexWriter.close(); //wait for pending op., auto-commit, close.
436                } catch (IOException e) {
437                        throw new RuntimeException("Failed to close IndexWriter.", e);
438                } 
439                
440                //finally, create a new searcher manager
441                initSearcherManager();
442        }
443        
444        
445        // internal methods
446        
447        /**
448         * Creates a new Lucene Document that corresponds to a BioPAX object.
449         * It does not check whether the document exists (should not be there,
450         * because the {@link #index()} method cleans up the index)
451         * 
452         * Some fields also include biopax data type property values not only from 
453         * the biopax object but also from its child elements, up to some depth 
454         * (using key-value pairs in the pre-computed bpe.annotations map):
455         * 
456         *  'uri' - biopax object's absolute URI, analyze=no, store=yes;
457         * 
458         *  'name' - names, analyze=yes, store=yes; boosted;
459         * 
460         *  'keyword' - infer from this bpe and its child objects' data properties,
461         *            such as Score.value, structureData, structureFormat, chemicalFormula, 
462         *            availability, term, comment, patoData, author, source, title, url, published, 
463         *            up to given depth/level; and also all 'pathway' field values are included here; 
464         *            analyze=yes, store=yes;
465         *  
466         *  'datasource', 'organism' and 'pathway' - infer from this bpe and its child objects 
467         *                                                                        up to given depth/level, analyze=no, store=yes;
468         *  
469         *  'size' - number of child processes, an integer as string; analyze=no, store=yes
470         * 
471         * @param bpe BioPAX object
472         * @param indexWriter index writer
473        */
474        void index(BioPAXElement bpe, IndexWriter indexWriter) {                
475                // create a new document
476                final Document doc = new Document();
477                
478                // save URI (not indexed field)
479                Field field = new StoredField(FIELD_URI, bpe.getRDFId());
480                doc.add(field);
481                
482                // index and store but not analyze/tokenize the biopax class name:
483                field = new StringField(FIELD_TYPE, bpe.getModelInterface().getSimpleName().toLowerCase(), Field.Store.YES);
484                doc.add(field);
485                
486                // make index fields from the annotations map (of pre-calculated/inferred values)
487                if(!bpe.getAnnotations().isEmpty()) {
488                        if(bpe.getAnnotations().containsKey(FIELD_PATHWAY)) {
489                                addPathways((Set<Pathway>)bpe.getAnnotations().get(FIELD_PATHWAY), doc);
490                        }
491                        if(bpe.getAnnotations().containsKey(FIELD_ORGANISM)) {
492                                addOrganisms((Set<BioSource>)bpe.getAnnotations().get(FIELD_ORGANISM), doc);
493                        }
494                        if(bpe.getAnnotations().containsKey(FIELD_DATASOURCE)) {
495                                addDatasources((Set<Provenance>)bpe.getAnnotations().get(FIELD_DATASOURCE), doc);
496                        }
497                        if(bpe.getAnnotations().containsKey(FIELD_KEYWORD)) {
498                                addKeywords((Set<String>)bpe.getAnnotations().get(FIELD_KEYWORD), doc);
499                        }
500                        if(bpe.getAnnotations().containsKey(FIELD_SIZE)) {
501                                field = new IntField(FIELD_SIZE, 
502                                        Integer.parseInt((String)bpe.getAnnotations()
503                                        .get(FIELD_SIZE)), Field.Store.YES);
504                                doc.add(field);
505                        }
506                }
507                bpe.getAnnotations().remove(FIELD_KEYWORD);
508                bpe.getAnnotations().remove(FIELD_DATASOURCE);
509                bpe.getAnnotations().remove(FIELD_ORGANISM);
510                bpe.getAnnotations().remove(FIELD_PATHWAY);
511                bpe.getAnnotations().remove(FIELD_SIZE);
512                        
513                // name
514                if(bpe instanceof Named) {
515                        Named named = (Named) bpe;
516                        if(named.getStandardName() != null) {
517                                field = new TextField(FIELD_NAME, named.getStandardName(), Field.Store.NO);
518                                field.setBoost(3.0f);
519                                doc.add(field);
520                        }
521                        if(named.getDisplayName() != null && !named.getDisplayName().equalsIgnoreCase(named.getStandardName())) {
522                                field = new TextField(FIELD_NAME, named.getDisplayName(), Field.Store.NO);
523                                field.setBoost(2.5f);
524                                doc.add(field);
525                        }
526                        for(String name : named.getName()) {
527                                if(name.equalsIgnoreCase(named.getDisplayName()) || name.equalsIgnoreCase(named.getStandardName()))
528                                        continue;
529                                field = new TextField(FIELD_NAME, name.toLowerCase(), Field.Store.NO);
530                                field.setBoost(2.0f);
531                                doc.add(field);
532                        }
533                }
534                
535                // XReferrable.xref - build 'xrefid' index field from all Xrefs)
536                if(bpe instanceof XReferrable) {
537                        XReferrable xr = (XReferrable) bpe;
538                        for(Xref xref : xr.getXref()) {
539                                if (xref.getId() != null) {
540                                        //the filed is not_analyzed; so in order to make search case-insensitive 
541                                        //(when searcher uses standard analyzer), we turn the value to lowercase.
542                                        field = new StringField(FIELD_XREFID, xref.getId().toLowerCase(), Field.Store.NO);
543//                                      field.setBoost(1.5f); //cannot do for such field/store type
544                                        doc.add(field);
545                                }
546                        }
547                }
548                
549                // Xref db/id (these are for a precise search by standard bio ID)
550                if(bpe instanceof Xref) {
551                        Xref xref = (Xref) bpe;
552                        if (xref.getId() != null) {
553                                field = new StringField(FIELD_XREFID, xref.getId().toLowerCase(), Field.Store.NO);
554                                doc.add(field);
555                        }
556                        if (xref.getDb() != null) {
557                                field = new TextField(FIELD_XREFDB, xref.getDb().toLowerCase(), Field.Store.NO);
558                                doc.add(field);
559                        }
560                }
561                
562                // write
563                try {
564                        indexWriter.addDocument(doc);
565                } catch (IOException e) {
566                        throw new RuntimeException("Failed to index; " + bpe.getRDFId(), e);
567                }
568        }
569
570        private void addKeywords(Set<String> keywords, Document doc) {
571                for (String keyword : keywords) {
572                        Field f = new TextField(FIELD_KEYWORD, keyword.toLowerCase(), Field.Store.YES);
573                        doc.add(f);
574                }
575        }
576
577        private void addDatasources(Set<Provenance> set, Document doc) {
578                for (Provenance p : set) {
579                        // Index and store URI (untokinized) - 
580                        // required to accurately calculate no. entities or to filter by data source (diff. datasources may share same names)
581                        doc.add(new StringField(FIELD_DATASOURCE, p.getRDFId(), Field.Store.YES));
582                        // index names as well
583                        for (String s : p.getName())
584                                doc.add(new TextField(FIELD_DATASOURCE, s.toLowerCase(), Field.Store.NO));
585                }
586        }
587
588        private void addOrganisms(Set<BioSource> set, Document doc) {   
589                for(BioSource bs : set) {
590                        // store URI as is (not indexed, untokinized)
591                        doc.add(new StoredField(FIELD_ORGANISM, bs.getRDFId()));
592                                
593                        // add organism names
594                        for(String s : bs.getName()) {
595                                doc.add(new TextField(FIELD_ORGANISM, s.toLowerCase(), Field.Store.NO));
596                        }
597                        // add taxonomy
598                        for(UnificationXref x : 
599                                new ClassFilterSet<Xref,UnificationXref>(bs.getXref(), UnificationXref.class)) {
600                                if(x.getId() != null)
601                                        doc.add(new TextField(FIELD_ORGANISM, x.getId().toLowerCase(), Field.Store.NO));
602                        }
603                        // include tissue type terms
604                        if (bs.getTissue() != null) {
605                                for (String s : bs.getTissue().getTerm())
606                                        doc.add(new TextField(FIELD_ORGANISM, s.toLowerCase(), Field.Store.NO));
607                        }
608                        // include cell type terms
609                        if (bs.getCellType() != null) {
610                                for (String s : bs.getCellType().getTerm()) {
611                                        doc.add(new TextField(FIELD_ORGANISM, s.toLowerCase(), Field.Store.NO));
612                                }
613                        }
614                }
615        }
616
617        private void addPathways(Set<Pathway> set, Document doc) {
618                for(Pathway pw : set) {
619                        //add URI as is (do not lowercase; do not index; store=yes - required to report hits, e.g., as xml)
620                        doc.add(new StoredField(FIELD_PATHWAY, pw.getRDFId()));
621                        
622                        // add names to the 'pathway' (don't store) and 'keywords' (store, don't index) fields
623                        for (String s : pw.getName()) {
624                                doc.add(new TextField(FIELD_PATHWAY, s.toLowerCase(), Field.Store.NO));
625                                doc.add(new StoredField(FIELD_KEYWORD, s.toLowerCase()));//for highlighting only, not indexed
626                        }
627                        
628                        // add unification xref IDs too
629                        for (UnificationXref x : new ClassFilterSet<Xref, UnificationXref>(
630                                        pw.getXref(), UnificationXref.class)) {
631                                if (x.getId() != null) {
632                                        // index in both 'pathway' (don't store) and 'keywords' (store, don't index)
633                                        doc.add(new TextField(FIELD_PATHWAY, x.getId().toLowerCase(), Field.Store.NO));
634                                        doc.add(new StoredField(FIELD_KEYWORD, x.getId().toLowerCase()));//for highlighting only, not indexed
635                                }
636                        }
637                }
638        }
639
640        
641        private String getTaxonId(BioSource bioSource) {
642                String id = null;
643                if(!bioSource.getXref().isEmpty()) {
644                        Set<UnificationXref> uxs = new 
645                                ClassFilterSet<Xref,UnificationXref>(bioSource.getXref(), 
646                                                UnificationXref.class);
647                        for(UnificationXref ux : uxs) {
648                                if("taxonomy".equalsIgnoreCase(ux.getDb())) {
649                                        id = ux.getId();
650                                        break;
651                                }
652                        }
653                }
654                return id;
655        }
656        
657        /** 
658         * Creates a search filter like 
659         * type AND (datasource OR datasource...) 
660         *      AND (organism OR organism OR...)
661         * 
662         * Both names (partial or full) and URIs should work as filter values. 
663         * 
664         * @param type
665         * @param datasources
666         * @param organisms
667         */     
668        private Filter createFilter(Class<? extends BioPAXElement> type, 
669                        String[] datasources, String[] organisms) {
670                
671                BooleanQuery filterQuery = new BooleanQuery();
672                
673                //AND datasources       
674                if (datasources != null && datasources.length > 0) {
675                        filterQuery.add(subQuery(datasources, FIELD_DATASOURCE), Occur.MUST);
676                }
677                //AND organisms
678                if (organisms != null && organisms.length > 0) {
679                        filterQuery.add(subQuery(organisms, FIELD_ORGANISM), Occur.MUST);
680                }               
681                //AND type      
682                if(type != null) { //add biopax class filter
683                        BooleanQuery query = new BooleanQuery();
684                        query.add(new TermQuery(new Term(FIELD_TYPE, type.getSimpleName().toLowerCase())), Occur.SHOULD);//OR
685                        //for each biopax subclass (interface), add the name to the filter query
686                        for(Class<? extends BioPAXElement> subType : SimpleEditorMap.L3.getKnownSubClassesOf(type)) {
687                                query.add(new TermQuery(new Term(FIELD_TYPE, subType.getSimpleName().toLowerCase())), Occur.SHOULD);//OR
688                        }               
689                        filterQuery.add(query, Occur.MUST);
690                }
691                
692                if(!filterQuery.clauses().isEmpty()) {
693                        LOG.debug("filterQuery: " + filterQuery.toString());
694                        return new CachingWrapperFilter( new QueryWrapperFilter(filterQuery) ); //TODO why CachingWrapperFilter, QueryWrapperFilter?
695                } else 
696                        return null;
697        }
698
699        /**
700         * Filter values here are joint with 'OR' operator, 
701         * but if a value has internal whitespace symbols, this also makes a sub-query,
702         * in which terms are joint with 'AND'. This is to allow filtering
703         * by datasource/organism's full name, partial name, or uri 
704         * and allowing multiple datasources/organisms.
705         * 
706         * @param filterValues
707         * @param filterField
708         * @return
709         */
710        private Query subQuery(String[] filterValues, String filterField) {
711                BooleanQuery query = new BooleanQuery();        
712                final Pattern pattern = Pattern.compile("\\s");         
713                for(String v : filterValues) {
714                        //if v has whitespace chars (several words), make a "word1 AND word2 AND..." subquery
715                        if(pattern.matcher(v).find()) {
716                                BooleanQuery bq = new BooleanQuery();
717//was bug: text with spaces and 'of', 'for', 'and', etc., did not match anything (we have to use the same analyzer as during indexing!)
718//                              for(String w : v.split("\\s+")) {
719//                                      bq.add(new TermQuery(new Term(filterField, w.toLowerCase())), Occur.MUST);
720//                                      LOG.debug("subQuery, add part: " + w.toLowerCase());
721//                              }
722                                try {
723                                        //use the same analyser as when indexing
724                                        TokenStream tokenStream = analyzer.tokenStream(filterField, new StringReader(v));
725                                        CharTermAttribute chattr = tokenStream.addAttribute(CharTermAttribute.class);
726                                        tokenStream.reset();
727                                        while(tokenStream.incrementToken()) {
728                                                //'of', 'and', 'for',.. never occur as tokens (this is how the std. analyzer works)
729                                                String token = chattr.toString();
730                                                bq.add(new TermQuery(new Term(filterField, token)), Occur.MUST);
731                                        }
732                                        tokenStream.end(); 
733                                        tokenStream.close();
734                                } catch (IOException e) {
735                                        //should never happen as we use StringReader
736                                        throw new RuntimeException("Failed to open a token stream; "
737                                                        + "field:" + filterField + ", value:" + v,e);
738                                }
739                                query.add(bq, Occur.SHOULD);
740                        } else {
741                                query.add(new TermQuery(new Term(filterField, v.toLowerCase())), Occur.SHOULD);
742                        }                       
743                }
744                
745                return query;
746        }
747        
748}