Luceneでのカスタムソートの実装


Luceneを使ってコンテンツを検索する場合、検索結果の表示順序は当然ながら重要です。Luceneに組み込み(built-in)で用意されているソート定義は、多くの場合、私たちの用途には合いません。自分のアプリケーションの利用場面に合わせるには、ソート機能をカスタマイズするしかありません。この節では、Luceneでカスタムソート機能をどのように実装するかを見ていきます。
     Luceneのカスタムソートは、Javaのコレクションにおけるカスタムソートと実装方法に大きな違いはなく、どちらも比較用のインタフェースを実装する必要があります。JavaではComparableインタフェースを実装すれば済みますが、LuceneではSortComparatorSourceインタフェースとScoreDocComparatorインタフェースの2つを実装します。具体的な実装方法に入る前に、この2つのインタフェースの定義を見ておきましょう。
    SortComparatorSourceインタフェースの役割は、ScoreDocをソートするためのcomparatorを返すことです(Expert: returns a comparator for sorting ScoreDocs)。このインタフェースが定義するメソッドは、次の1つだけです。

/**
 * Creates a comparator for the field in the given index.
 *
 * @param reader index to create the comparator for
 * @param fieldname field to create the comparator for
 * @return comparator of ScoreDoc objects
 * @throws IOException if an error occurs reading the index
 */
public ScoreDocComparator newComparator(IndexReader reader,String fieldname) throws IOException  

/**
 * Creates a comparator for the field in the given index.
 *
 * @param reader index to create the comparator for
 * @param fieldname field to create the comparator for
 * @return comparator of ScoreDoc objects
 * @throws IOException if an error occurs reading the index
 */
public ScoreDocComparator newComparator(IndexReader reader,String fieldname) throws IOException     
      このメソッドは、ソートを行うためのScoreDocComparatorインスタンスを1つ生成して返すだけです。そのため、ScoreDocComparatorインタフェースも併せて実装する必要があります。ScoreDocComparatorインタフェースの役割は、2つのScoreDocオブジェクトを比較して順序を決めることです(Compares two ScoreDoc objects for sorting)。このインタフェースには、Luceneが実装を提供する静的インスタンスが2つ定義されています。次のとおりです。
 
/**
 * Special comparator for sorting hits according to computed relevance (document score).
 * Excerpt only: the initializer is not shown in this listing.
 */
public static final ScoreDocComparator RELEVANCE;   
       
/**
 * Special comparator for sorting hits according to index order (document number).
 * Excerpt only: the initializer is not shown in this listing.
 */
public static final ScoreDocComparator INDEXORDER;  

/**
 * Special comparator for sorting hits according to computed relevance (document score).
 * Excerpt only: the initializer is not shown in this listing.
 */
public static final ScoreDocComparator RELEVANCE;
	
/**
 * Special comparator for sorting hits according to index order (document number).
 * Excerpt only: the initializer is not shown in this listing.
 */
public static final ScoreDocComparator INDEXORDER;    
 
   ソートに関連するメソッドは3つあり、いずれも実装する必要があります。定義は次のとおりです。
  
/**
 * Compares two ScoreDoc objects and returns a result indicating their sort order.
 *
 * @param i first ScoreDoc
 * @param j second ScoreDoc
 * @return -1 if i should come before j;
 *         1 if i should come after j;
 *         0 if they are equal
 */
public int compare(ScoreDoc i,ScoreDoc j);   
  
/**
 * Returns the value used to sort the given document. The object returned must
 * implement the java.io.Serializable interface. This is used by multisearchers
 * to determine how to collate results from their searchers.
 *
 * NOTE(review): the declared return type is Comparable while the text above
 * requires Serializable, so a returned value has to satisfy both.
 *
 * @param i document
 * @return Serializable object
 */
public Comparable sortValue(ScoreDoc i);   
  
/**
 * Returns the type of sort. Should return SortField.SCORE, SortField.DOC,
 * SortField.STRING, SortField.INTEGER, SortField.FLOAT or SortField.CUSTOM.
 * It is not valid to return SortField.AUTO. This is used by multisearchers to
 * determine how to collate results from their searchers.
 *
 * @return one of the constants in SortField
 */
public int sortType();  

               /**
	 * Compares two ScoreDoc objects and returns a result indicating their sort order.
	 *
	 * @param i first ScoreDoc
	 * @param j second ScoreDoc
	 * @return -1 if i should come before j;
	 *         1 if i should come after j;
	 *         0 if they are equal
	 */
	public int compare(ScoreDoc i,ScoreDoc j);

	/**
	 * Returns the value used to sort the given document. The object returned must
	 * implement the java.io.Serializable interface. This is used by multisearchers
	 * to determine how to collate results from their searchers.
	 *
	 * NOTE(review): the declared return type is Comparable while the text above
	 * requires Serializable, so a returned value has to satisfy both.
	 *
	 * @param i document
	 * @return Serializable object
	 */
	public Comparable sortValue(ScoreDoc i);

	/**
	 * Returns the type of sort. Should return SortField.SCORE, SortField.DOC,
	 * SortField.STRING, SortField.INTEGER, SortField.FLOAT or SortField.CUSTOM.
	 * It is not valid to return SortField.AUTO. This is used by multisearchers to
	 * determine how to collate results from their searchers.
	 *
	 * @return one of the constants in SortField
	 */
	public int sortType();   
    例を見てみましょう。
    この例は『Lucene in Action』に掲載されている実装で、指定した地点から最も近いレストランを検索するために使います。レストランの座標は「x,y」という形式の文字列で格納されます。
  
package com.nikee.lucene;   
  
import java.io.IOException;   
  
import org.apache.lucene.index.IndexReader;   
import org.apache.lucene.index.Term;   
import org.apache.lucene.index.TermDocs;   
import org.apache.lucene.index.TermEnum;   
import org.apache.lucene.search.ScoreDoc;   
import org.apache.lucene.search.ScoreDocComparator;   
import org.apache.lucene.search.SortComparatorSource;   
import org.apache.lucene.search.SortField;   
  
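// Sorts search hits by their distance from a reference point. Locations are indexed as "x,y" strings.
// DistanceComparatorSource implements the SortComparatorSource interface.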
/**
 * Sort source that orders hits by their Euclidean distance from a fixed
 * reference point. The sort field is expected to hold one un-tokenized term
 * per document of the form {@code "x,y"} with integer coordinates.
 */
public class DistanceComparatorSource implements SortComparatorSource {
    private static final long serialVersionUID = 1L;

    // Coordinates of the reference point that distances are measured from.
    private final int x;
    private final int y;

    /**
     * @param x x coordinate of the reference point
     * @param y y coordinate of the reference point
     */
    public DistanceComparatorSource(int x, int y) {
        this.x = x;
        this.y = y;
    }

    /**
     * Builds the comparator that does the actual ordering for {@code fieldname}.
     *
     * @param reader index to read the location terms from
     * @param fieldname name of the field holding the {@code "x,y"} terms
     * @return a comparator backed by a per-document distance table
     * @throws IOException if reading the index fails
     */
    public ScoreDocComparator newComparator(IndexReader reader, String fieldname) throws IOException {
        return new DistanceScoreDocLookupComparator(reader, fieldname, x, y);
    }

    /**
     * ScoreDocComparator that pre-computes the distance of every document once,
     * in its constructor, and afterwards only performs array look-ups.
     */
    private static class DistanceScoreDocLookupComparator implements ScoreDocComparator {
        // Distance from the reference point, indexed by document number.
        // Documents without a term in the field keep the default 0.0f.
        private float[] distances;

        /**
         * Walks all terms of {@code fieldname} and fills the distance table.
         *
         * @throws IOException if reading the index fails
         * @throws RuntimeException if the term enumeration has no current term;
         *         a malformed term surfaces as NumberFormatException or
         *         ArrayIndexOutOfBoundsException from the parsing below
         */
        public DistanceScoreDocLookupComparator(IndexReader reader, String fieldname, int x, int y) throws IOException {
            System.out.println("fieldName2="+fieldname);
            // Positioned at the first term >= (fieldname, ""), i.e. the first term of the field.
            final TermEnum enumerator = reader.terms(new Term(fieldname, ""));
            try {
                System.out.println("maxDoc="+reader.maxDoc());
                distances = new float[reader.maxDoc()];
                if (distances.length > 0) {
                    TermDocs termDocs = reader.termDocs();
                    try {
                        if (enumerator.term() == null) {
                            throw new RuntimeException("no terms in field " + fieldname);
                        }
                        // Counters exist only for the trace output below.
                        int i = 0, j = 0;
                        do {
                            System.out.println("in do-while :" + i ++);
                            Term term = enumerator.term();
                            // Stop once the enumeration leaves the requested field (or runs out).
                            // Compare by value, not by reference, so the check does not depend
                            // on both strings being the same interned instance.
                            if (term == null || !fieldname.equals(term.field()))
                                break;

                            // Position termDocs on the documents containing the current term.
                            termDocs.seek(enumerator);
                            while (termDocs.next()) {
                                System.out.println("    in while :" + j ++);
                                System.out.println("    in while ,Term :" + term.toString());

                                // Term text is "x,y"; split it into the two coordinates.
                                String[] xy = term.text().split(",");
                                int deltax = Integer.parseInt(xy[0]) - x;
                                int deltay = Integer.parseInt(xy[1]) - y;
                                // Square in double precision so large deltas cannot overflow int.
                                distances[termDocs.doc()] =
                                        (float) Math.sqrt((double) deltax * deltax + (double) deltay * deltay);
                            }
                        }
                        while (enumerator.next());
                    } finally {
                        termDocs.close();
                    }
                }
            } finally {
                // The enumeration is a resource too; release it on every path.
                enumerator.close();
            }
        }

        /** Orders two hits by ascending distance: nearer documents sort first. */
        public int compare(ScoreDoc i, ScoreDoc j) {
            if (distances[i.doc] < distances[j.doc])
                return -1;
            if (distances[i.doc] > distances[j.doc])
                return 1;
            return 0;
        }

        /** Returns the pre-computed distance of the hit as a Float. */
        public Comparable sortValue(ScoreDoc i) {
            return new Float(distances[i.doc]);
        }

        /** Sort values are floats, so report SortField.FLOAT. */
        public int sortType() {
            return SortField.FLOAT;
        }
    }

    public String toString() {
        return "Distance from (" + x + "," + y + ")";
    }
}
 
package com.nikee.lucene;

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.ScoreDocComparator;
import org.apache.lucene.search.SortComparatorSource;
import org.apache.lucene.search.SortField;

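// Sorts search hits by their distance from a reference point. Locations are indexed as "x,y" strings.
// DistanceComparatorSource implements the SortComparatorSource interface.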
/**
 * Sort source that orders hits by their Euclidean distance from a fixed
 * reference point. The sort field is expected to hold one un-tokenized term
 * per document of the form {@code "x,y"} with integer coordinates.
 */
public class DistanceComparatorSource implements SortComparatorSource {
    private static final long serialVersionUID = 1L;

    // Coordinates of the reference point that distances are measured from.
    private final int x;
    private final int y;

    /**
     * @param x x coordinate of the reference point
     * @param y y coordinate of the reference point
     */
    public DistanceComparatorSource(int x, int y) {
        this.x = x;
        this.y = y;
    }

    /**
     * Builds the comparator that does the actual ordering for {@code fieldname}.
     *
     * @param reader index to read the location terms from
     * @param fieldname name of the field holding the {@code "x,y"} terms
     * @return a comparator backed by a per-document distance table
     * @throws IOException if reading the index fails
     */
    public ScoreDocComparator newComparator(IndexReader reader, String fieldname) throws IOException {
        return new DistanceScoreDocLookupComparator(reader, fieldname, x, y);
    }

    /**
     * ScoreDocComparator that pre-computes the distance of every document once,
     * in its constructor, and afterwards only performs array look-ups.
     */
    private static class DistanceScoreDocLookupComparator implements ScoreDocComparator {
        // Distance from the reference point, indexed by document number.
        // Documents without a term in the field keep the default 0.0f.
        private float[] distances;

        /**
         * Walks all terms of {@code fieldname} and fills the distance table.
         *
         * @throws IOException if reading the index fails
         * @throws RuntimeException if the term enumeration has no current term;
         *         a malformed term surfaces as NumberFormatException or
         *         ArrayIndexOutOfBoundsException from the parsing below
         */
        public DistanceScoreDocLookupComparator(IndexReader reader, String fieldname, int x, int y) throws IOException {
            System.out.println("fieldName2="+fieldname);
            // Positioned at the first term >= (fieldname, ""), i.e. the first term of the field.
            final TermEnum enumerator = reader.terms(new Term(fieldname, ""));
            try {
                System.out.println("maxDoc="+reader.maxDoc());
                distances = new float[reader.maxDoc()];
                if (distances.length > 0) {
                    TermDocs termDocs = reader.termDocs();
                    try {
                        if (enumerator.term() == null) {
                            throw new RuntimeException("no terms in field " + fieldname);
                        }
                        // Counters exist only for the trace output below.
                        int i = 0, j = 0;
                        do {
                            System.out.println("in do-while :" + i ++);
                            Term term = enumerator.term();
                            // Stop once the enumeration leaves the requested field (or runs out).
                            // Compare by value, not by reference, so the check does not depend
                            // on both strings being the same interned instance.
                            if (term == null || !fieldname.equals(term.field()))
                                break;

                            // Position termDocs on the documents containing the current term.
                            termDocs.seek(enumerator);
                            while (termDocs.next()) {
                                System.out.println("    in while :" + j ++);
                                System.out.println("    in while ,Term :" + term.toString());

                                // Term text is "x,y"; split it into the two coordinates.
                                String[] xy = term.text().split(",");
                                int deltax = Integer.parseInt(xy[0]) - x;
                                int deltay = Integer.parseInt(xy[1]) - y;
                                // Square in double precision so large deltas cannot overflow int.
                                distances[termDocs.doc()] =
                                        (float) Math.sqrt((double) deltax * deltax + (double) deltay * deltay);
                            }
                        }
                        while (enumerator.next());
                    } finally {
                        termDocs.close();
                    }
                }
            } finally {
                // The enumeration is a resource too; release it on every path.
                enumerator.close();
            }
        }

        /** Orders two hits by ascending distance: nearer documents sort first. */
        public int compare(ScoreDoc i, ScoreDoc j) {
            if (distances[i.doc] < distances[j.doc])
                return -1;
            if (distances[i.doc] > distances[j.doc])
                return 1;
            return 0;
        }

        /** Returns the pre-computed distance of the hit as a Float. */
        public Comparable sortValue(ScoreDoc i) {
            return new Float(distances[i.doc]);
        }

        /** Sort values are floats, so report SortField.FLOAT. */
        public int sortType() {
            return SortField.FLOAT;
        }
    }

    public String toString() {
        return "Distance from (" + x + "," + y + ")";
    }
}

  これが上記2つのインタフェースを実装した2つのクラスです。コメントを付けてあるので、カスタムソートがそれほど難しくないことがわかるでしょう。この実装が正しく動作するかどうか、次のテストコードで確認してみましょう。
  
package com.nikee.lucene.test;   
  
import java.io.IOException;   
  
import junit.framework.TestCase;   
  
import org.apache.lucene.analysis.WhitespaceAnalyzer;   
import org.apache.lucene.document.Document;   
import org.apache.lucene.document.Field;   
import org.apache.lucene.index.IndexWriter;   
import org.apache.lucene.index.Term;   
import org.apache.lucene.search.FieldDoc;   
import org.apache.lucene.search.Hits;   
import org.apache.lucene.search.IndexSearcher;   
import org.apache.lucene.search.Query;   
import org.apache.lucene.search.ScoreDoc;   
import org.apache.lucene.search.Sort;   
import org.apache.lucene.search.SortField;   
import org.apache.lucene.search.TermQuery;   
import org.apache.lucene.search.TopFieldDocs;   
import org.apache.lucene.store.RAMDirectory;   
  
import com.nikee.lucene.DistanceComparatorSource;   
  
/**
 * JUnit 3 tests for DistanceComparatorSource. Four restaurants are indexed
 * in an in-memory directory and sorted by distance from two reference points.
 */
public class DistanceComparatorSourceTest extends TestCase {
    private RAMDirectory directory;

    private IndexSearcher searcher;
    private Query query;

    /** Builds the in-memory index and the searcher/query shared by the tests. */
    protected void setUp() throws Exception {
        directory = new RAMDirectory();
        IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true);
        try {
            addPoint(writer, "El Charro", "restaurant", 1, 2);
            addPoint(writer, "Cafe Poca Cosa", "restaurant", 5, 9);
            addPoint(writer, "Los Betos", "restaurant", 9, 6);
            addPoint(writer, "Nico's Taco Shop", "restaurant", 3, 8);
        } finally {
            // Release the writer even when adding a document fails.
            writer.close();
        }

        searcher = new IndexSearcher(directory);
        query = new TermQuery(new Term("type", "restaurant"));
    }

    /** Releases the searcher and directory opened in setUp so tests do not leak them. */
    protected void tearDown() throws Exception {
        try {
            if (searcher != null) {
                searcher.close();
            }
        } finally {
            if (directory != null) {
                directory.close();
            }
            super.tearDown();
        }
    }

    /**
     * Indexes one place. The location is stored as a single un-tokenized
     * "x,y" term, which is the format DistanceComparatorSource parses.
     */
    private void addPoint(IndexWriter writer, String name, String type, int x, int y) throws IOException {
        Document doc = new Document();
        doc.add(new Field("name", name, Field.Store.YES, Field.Index.TOKENIZED));
        doc.add(new Field("type", type, Field.Store.YES, Field.Index.TOKENIZED));
        doc.add(new Field("location", x + "," + y, Field.Store.YES, Field.Index.UN_TOKENIZED));
        writer.addDocument(doc);
    }

    /** Sorting from (0,0): El Charro (1,2) is nearest, Los Betos (9,6) is furthest. */
    public void testNearestRestaurantToHome() throws Exception {
        // Sort on the "location" field using the custom comparator source.
        Sort sort = new Sort(new SortField("location", new DistanceComparatorSource(0, 0)));
        Hits hits = searcher.search(query, sort);

        // Guard the index-based look-ups below: all four restaurants must match.
        assertEquals(4, hits.length());
        assertEquals("closest", "El Charro", hits.doc(0).get("name"));
        assertEquals("furthest", "Los Betos", hits.doc(3).get("name"));
    }

    /** Sorting from (10,10): Los Betos (9,6) is nearest at distance sqrt(17). */
    public void testNeareastRestaurantToWork() throws Exception {
        Sort sort = new Sort(new SortField("location", new DistanceComparatorSource(10, 10)));
        // Ask for at most 3 hits; the returned TopFieldDocs carries the sort
        // values next to each hit, which the assertions below inspect.
        TopFieldDocs docs = searcher.search(query, null, 3, sort);

        assertEquals(4, docs.totalHits);
        assertEquals(3, docs.scoreDocs.length);

        // Each entry is a FieldDoc; fields[0] is the value of the first (only) sort field.
        FieldDoc fieldDoc = (FieldDoc) docs.scoreDocs[0];

        assertEquals("(10,10) -> (9,6) = sqrt(17)", new Float(Math.sqrt(17)), fieldDoc.fields[0]);
        Document document = searcher.doc(fieldDoc.doc);
        assertEquals("Los Betos", document.get("name"));
        dumpDocs(sort, docs);
    }

    /** Prints the sort description and each returned hit with its location and distance. */
    private void dumpDocs(Sort sort, TopFieldDocs docs) throws IOException {
        System.out.println("Sorted by: " + sort);
        ScoreDoc[] scoreDocs = docs.scoreDocs;
        for (int i = 0; i < scoreDocs.length; i++) {
            FieldDoc fieldDoc = (FieldDoc) scoreDocs[i];
            Float distance = (Float) fieldDoc.fields[0];
            Document doc = searcher.doc(fieldDoc.doc);
            System.out.println("   " + doc.get("name") + " @ (" + doc.get("location") + ") -> " + distance);
        }
    }
}
  
package com.nikee.lucene.test;

import java.io.IOException;

import junit.framework.TestCase;

import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.store.RAMDirectory;

import com.nikee.lucene.DistanceComparatorSource;

/**
 * JUnit 3 tests for DistanceComparatorSource. Four restaurants are indexed
 * in an in-memory directory and sorted by distance from two reference points.
 */
public class DistanceComparatorSourceTest extends TestCase {
    private RAMDirectory directory;

    private IndexSearcher searcher;
    private Query query;

    /** Builds the in-memory index and the searcher/query shared by the tests. */
    protected void setUp() throws Exception {
        directory = new RAMDirectory();
        IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true);
        try {
            addPoint(writer, "El Charro", "restaurant", 1, 2);
            addPoint(writer, "Cafe Poca Cosa", "restaurant", 5, 9);
            addPoint(writer, "Los Betos", "restaurant", 9, 6);
            addPoint(writer, "Nico's Taco Shop", "restaurant", 3, 8);
        } finally {
            // Release the writer even when adding a document fails.
            writer.close();
        }

        searcher = new IndexSearcher(directory);
        query = new TermQuery(new Term("type", "restaurant"));
    }

    /** Releases the searcher and directory opened in setUp so tests do not leak them. */
    protected void tearDown() throws Exception {
        try {
            if (searcher != null) {
                searcher.close();
            }
        } finally {
            if (directory != null) {
                directory.close();
            }
            super.tearDown();
        }
    }

    /**
     * Indexes one place. The location is stored as a single un-tokenized
     * "x,y" term, which is the format DistanceComparatorSource parses.
     */
    private void addPoint(IndexWriter writer, String name, String type, int x, int y) throws IOException {
        Document doc = new Document();
        doc.add(new Field("name", name, Field.Store.YES, Field.Index.TOKENIZED));
        doc.add(new Field("type", type, Field.Store.YES, Field.Index.TOKENIZED));
        doc.add(new Field("location", x + "," + y, Field.Store.YES, Field.Index.UN_TOKENIZED));
        writer.addDocument(doc);
    }

    /** Sorting from (0,0): El Charro (1,2) is nearest, Los Betos (9,6) is furthest. */
    public void testNearestRestaurantToHome() throws Exception {
        // Sort on the "location" field using the custom comparator source.
        Sort sort = new Sort(new SortField("location", new DistanceComparatorSource(0, 0)));
        Hits hits = searcher.search(query, sort);

        // Guard the index-based look-ups below: all four restaurants must match.
        assertEquals(4, hits.length());
        assertEquals("closest", "El Charro", hits.doc(0).get("name"));
        assertEquals("furthest", "Los Betos", hits.doc(3).get("name"));
    }

    /** Sorting from (10,10): Los Betos (9,6) is nearest at distance sqrt(17). */
    public void testNeareastRestaurantToWork() throws Exception {
        Sort sort = new Sort(new SortField("location", new DistanceComparatorSource(10, 10)));
        // Ask for at most 3 hits; the returned TopFieldDocs carries the sort
        // values next to each hit, which the assertions below inspect.
        TopFieldDocs docs = searcher.search(query, null, 3, sort);

        assertEquals(4, docs.totalHits);
        assertEquals(3, docs.scoreDocs.length);

        // Each entry is a FieldDoc; fields[0] is the value of the first (only) sort field.
        FieldDoc fieldDoc = (FieldDoc) docs.scoreDocs[0];

        assertEquals("(10,10) -> (9,6) = sqrt(17)", new Float(Math.sqrt(17)), fieldDoc.fields[0]);
        Document document = searcher.doc(fieldDoc.doc);
        assertEquals("Los Betos", document.get("name"));
        dumpDocs(sort, docs);
    }

    /** Prints the sort description and each returned hit with its location and distance. */
    private void dumpDocs(Sort sort, TopFieldDocs docs) throws IOException {
        System.out.println("Sorted by: " + sort);
        ScoreDoc[] scoreDocs = docs.scoreDocs;
        for (int i = 0; i < scoreDocs.length; i++) {
            FieldDoc fieldDoc = (FieldDoc) scoreDocs[i];
            Float distance = (Float) fieldDoc.fields[0];
            Document doc = searcher.doc(fieldDoc.doc);
            System.out.println("   " + doc.get("name") + " @ (" + doc.get("location") + ") -> " + distance);
        }
    }
}