LuceneのBooleanClause Occur MUSTとSHOULDの組み合わせテスト


テストバージョンLucene 2.9.4
====================================
LuceneのBooleanQueryは、AND(かつ)とOR(または)を組み合わせた複合検索を実現します。
BooleanClauseは、ブールクエリにおける句(クエリ句)同士の関係を表すクラスです。関係の種類は BooleanClause.Occur.MUST、BooleanClause.Occur.MUST_NOT、BooleanClause.Occur.SHOULD の3種類で、それぞれ「必ず含む」「含んではならない」「含んでもよい」を意味します。組み合わせは次の6通りです。
1. MUSTとMUST: 2つのクエリ句の検索結果の積集合(共通部分)を取得します。
2. MUSTとMUST_NOT: MUST句の検索結果から、MUST_NOT句に一致する検索結果を除外します。
3. SHOULDとMUST_NOT: 併用した場合、機能はMUSTとMUST_NOTの組み合わせと同じです。
4. SHOULDとMUST: 併用した場合、検索結果はMUST句の検索結果になりますが、SHOULD句はスコア(並び順)に影響を与えます。
5. SHOULDとSHOULD: 「または」の関係を表し、最終的な検索結果はすべてのクエリ句の検索結果の和集合になります。
6. MUST_NOTとMUST_NOT: 意味がなく、検索結果は得られません。
テストコード:
package org.apache.lucene.search;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LuceneTestCase;


public class TestBooleanClauseOccur extends LuceneTestCase {

  private IndexSearcher searcher;
  private IndexReader reader;
  private Directory directory;
  private Analyzer analyzer;
  
  private static final String DOC_TEXT_LINES[] = {
    "Well, this is just some plain text we use for creating the ", // 0
    "test documents. It used to be a text from an online collection ", // 1
    "devoted to first aid, but if there was there an (online) lawyers ", // 2
    "first aid collection with legal advices, \"it\" might have quite ", // 3
    "probably advised one not to include \"it\"'s text or the text of ", // 4
    "any other online collection in one's code, unless one has money ",// 5
    "that one don't need and one is happy to donate for lawyers ", // 6
    "charity. Anyhow at some point, rechecking the usage of this text, ", // 7
    "it became uncertain that this text is free to use, because ", //8
    "the web site in the disclaimer of he eBook containing that text ", //9
    "was not responding anymore, and at the same time, in projGut, ", //10
    "searching for first aid no longer found that eBook as well. ", // 11
    "So here we are, with a perhaps much less interesting ", // 12
    "text for the test, but oh much much safer. ", //13
  };
  
  @Override
  protected void setUp() throws Exception {
    // TODO Auto-generated method stub
    super.setUp();
    directory = new RAMDirectory();
    analyzer = new Analyzer() {
      @Override
      public TokenStream tokenStream(String fieldName, Reader reader) {
        // TODO Auto-generated method stub
        return new WhitespaceTokenizer(reader);
      }

      @Override
      public int getPositionIncrementGap(String fieldName) {
        // TODO Auto-generated method stub
        return 100;
      }
    };
    
    IndexWriter writer = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
    Document doc = null;
    
    for (int i = 0; i < DOC_TEXT_LINES.length; ++i) {
      doc = new Document();
      doc.add(new Field("title", DOC_TEXT_LINES[i], Field.Store.YES, Field.Index.ANALYZED));
      writer.addDocument(doc);
    }
    
    reader = writer.getReader();
    writer.close();

    searcher = new IndexSearcher(directory);
  }

  @Override
  protected void tearDown() throws Exception {
    // TODO Auto-generated method stub
    searcher.close();
    reader.close();
    directory.close();
    analyzer.close();
    super.tearDown();
  }
  
  public void testOccurCombination1() throws Exception {
    BooleanQuery query = new BooleanQuery();
    
    TermQuery termQuery = new TermQuery(new Term("title", "text"));  
    termQuery.setBoost(3.0f);
    query.add(termQuery, BooleanClause.Occur.MUST);
    
    termQuery = new TermQuery(new Term("title", "test"));
    //query.add(termQuery, BooleanClause.Occur.MUST);
    
    termQuery = new TermQuery(new Term("title", "some"));
    termQuery.setBoost(1.2f);
    query.add(termQuery, BooleanClause.Occur.SHOULD);
    
    termQuery = new TermQuery(new Term("title", "the"));
    termQuery.setBoost(1.3f);
    query.add(termQuery, BooleanClause.Occur.SHOULD);
    
    termQuery = new TermQuery(new Term("title", "that"));
    query.add(termQuery, BooleanClause.Occur.SHOULD);
    
    ScoreDoc[] docs = searcher.search(query, null, 10).scoreDocs;
    
    System.out.println("=== query {text(must) some(should) the(should) that(should) } ===");
    for (ScoreDoc doc : docs) {
      //Explanation explanation = searcher.explain(query, scoreDoc.doc);
      System.out.println("docID : " + doc.doc + ", score: " + doc.score + ", content :" + reader.document(doc.doc).get("title").toString());
    }
  }
  
  public void testOccurCombination2() throws Exception {
    
    BooleanQuery query1 = new BooleanQuery();
    TermQuery termQuery = new TermQuery(new Term("title", "text"));  
    termQuery.setBoost(3.0f);
    query1.add(termQuery, BooleanClause.Occur.MUST);
    
    //termQuery = new TermQuery(new Term("title", "test"));
    //query1.add(termQuery, BooleanClause.Occur.MUST);
    
    BooleanQuery query2 = new BooleanQuery();
    
    termQuery = new TermQuery(new Term("title", "some"));
    termQuery.setBoost(1.2f);
    query2.add(termQuery, BooleanClause.Occur.SHOULD);
    
    termQuery = new TermQuery(new Term("title", "the"));
    termQuery.setBoost(1.3f);
    query2.add(termQuery, BooleanClause.Occur.SHOULD);
    
    termQuery = new TermQuery(new Term("title", "that"));
    query2.add(termQuery, BooleanClause.Occur.SHOULD);
    
    //query.add(query1, BooleanClause.Occur.MUST);
    query1.add(query2, BooleanClause.Occur.MUST);
    
    TopDocs docs = searcher.search(query1, null, 10);
    System.out.println("=== query { text(must) { some(should) the(should) that(should) }(must) } ===");
    
    for (ScoreDoc doc : docs.scoreDocs) {
      //Explanation explanation = searcher.explain(query, scoreDoc.doc);
      System.out.println("docID: " + doc.doc + ",  score: " + doc.score + ", content :" + reader.document(doc.doc).get("title").toString());
    }
  }
  
public void testOccurCombination3() throws Exception {
    
    BooleanQuery query = new BooleanQuery();
    TermQuery termQuery = new TermQuery(new Term("title", "text"));  
    termQuery.setBoost(3.0f);
    query.add(termQuery, BooleanClause.Occur.SHOULD);
    
    termQuery = new TermQuery(new Term("title", "some"));
    termQuery.setBoost(1.2f);
    query.add(termQuery, BooleanClause.Occur.SHOULD);
    
    termQuery = new TermQuery(new Term("title", "the"));
    termQuery.setBoost(1.3f);
    query.add(termQuery, BooleanClause.Occur.SHOULD);
    
    termQuery = new TermQuery(new Term("title", "that"));
    query.add(termQuery, BooleanClause.Occur.SHOULD);
    
    TopDocs docs = searcher.search(query, null, 10);
    System.out.println("=== query { text(should) some(should) the(should) that(should) } ===");
    
    for (ScoreDoc doc : docs.scoreDocs) {
      //Explanation explanation = searcher.explain(query, scoreDoc.doc);
      System.out.println("docID: " + doc.doc + ",  score: " + doc.score + ", content :" + reader.document(doc.doc).get("title").toString());
    }
  }
}

結果出力
=== query {text(must) some(should) the(should) that(should) } ===
docID : 0, score: 0.5669826, content :Well, this is just some plain text we use for creating the 
docID : 9, score: 0.50818175, content :the web site in the disclaimer of he eBook containing that text 
docID : 4, score: 0.29922757, content :probably advised one not to include "it"'s text or the text of 
docID : 13, score: 0.29017726, content :text for the test, but oh much much safer. 
docID : 8, score: 0.23953491, content :it became uncertain that this text is free to use, because 
docID : 1, score: 0.0809797, content :test documents. It used to be a text from an online collection 
=== query { text(must) { some(should) the(should) that(should) }(must) } ===
docID: 0,  score: 0.6119575, content :Well, this is just some plain text we use for creating the 
docID: 9,  score: 0.55969, content :the web site in the disclaimer of he eBook containing that text 
docID: 4,  score: 0.5048786, content :probably advised one not to include "it"'s text or the text of 
docID: 13,  score: 0.46338382, content :text for the test, but oh much much safer. 
docID: 8,  score: 0.3756358, content :it became uncertain that this text is free to use, because 
=== query { text(should) some(should) the(should) that(should) } ===
docID: 0,  score: 0.5669826, content :Well, this is just some plain text we use for creating the 
docID: 9,  score: 0.50818175, content :the web site in the disclaimer of he eBook containing that text 
docID: 4,  score: 0.29922757, content :probably advised one not to include "it"'s text or the text of 
docID: 13,  score: 0.29017726, content :text for the test, but oh much much safer. 
docID: 8,  score: 0.23953491, content :it became uncertain that this text is free to use, because 
docID: 7,  score: 0.21602902, content :charity. Anyhow at some point, rechecking the usage of this text, 
docID: 1,  score: 0.0809797, content :test documents. It used to be a text from an online collection 
docID: 6,  score: 0.03878776, content :that one don't need and one is happy to donate for lawyers 
docID: 11,  score: 0.03878776, content :searching for first aid no longer found that eBook as well. 
docID: 10,  score: 0.035091203, content :was not responding anymore, and at the same time, in projGut,