Lucene Notes


Reference: http://blog.csdn.net/zzpchina/archive/2006/01/15/579875.aspx
IR (Information Retrieval) is the term used to describe search tools like Lucene.
The second edition of Lucene in Action was not yet available on Amazon.
The source code can be downloaded directly: http://www.manning.com/hatcher3/LIAsourcecode.zip
The jar used here is lucene-core-3.0.2.jar.
---------------------------------
Evaluating search quality:
D.5.1 Precision and recall
Precision and recall are standard metrics in the information retrieval community for objectively measuring
relevance of search results. Precision measures what subset of the documents returned for each query were
relevant. For example, if a query has 20 hits and only 1 is relevant, precision is 0.05. If only 1 hit was returned
and it was relevant, precision is 1.0. Recall measures what percentage of the relevant documents for that query
was actually returned. So if the query listed 8 documents as being relevant, but 6 were in the result set, that’s a
recall of 0.75.
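As a quick illustration of these two measures, here is a minimal Java sketch (not from the book; the doc-id values are made up) that computes precision and recall for one query from the returned list and the set of known relevant documents:

import java.util.*;

public class PrecisionRecallDemo {
  // precision: what fraction of the returned documents are relevant
  static double precision(List<String> returned, Set<String> relevant) {
    if (returned.isEmpty()) return 0.0;
    int hits = 0;
    for (String doc : returned) if (relevant.contains(doc)) hits++;
    return (double) hits / returned.size();
  }
  // recall: what fraction of the relevant documents were returned
  static double recall(List<String> returned, Set<String> relevant) {
    if (relevant.isEmpty()) return 0.0;
    int hits = 0;
    for (String doc : returned) if (relevant.contains(doc)) hits++;
    return (double) hits / relevant.size();
  }
  public static void main(String[] args) {
    // the book's example: 8 relevant documents, 6 of them in the result set
    Set<String> relevant = new HashSet<String>(Arrays.asList("d1","d2","d3","d4","d5","d6","d7","d8"));
    List<String> returned = Arrays.asList("d1","d2","x1","d3","d4","x2","d5","d6");
    System.out.println(precision(returned, relevant)); // 0.75 (6 of the 8 returned are relevant)
    System.out.println(recall(returned, relevant));    // 0.75 (6 of the 8 relevant were returned)
  }
}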
In a properly configured search application, these two measures are naturally at odds with one another. Let’s
say, on one extreme, you only show the user the very best (top 1) document matching their query. With such an
approach, your precision will typically be high, because the first result has a good chance of being relevant, while
your recall would be very low, because if there are many relevant documents for a given query you have only
returned one of them. If we increase top 1 to top 10, then suddenly we will be returning many documents for each
query. The precision will necessarily drop because most likely you are now allowing some non-relevant documents
into the result set. But recall should increase because each query should return a larger subset of its relevant
documents.
Still, you’d like the relevant documents to be higher up in the ranking. To measure this, average precision is
computed. This measure computes precision at each of the N cutoffs, where N ranges from 1 to a maximum value,
and then takes the average. So this measure is higher if your search application generally returns relevant
documents earlier in the result set. Mean average precision, or MAP, then measures the mean of average precision
across a set of queries. A related measure, mean reciprocal rank or MRR, measures 1/M where M is the first rank
that had a relevant document. You want both of these numbers to be as high as possible!
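Average precision and reciprocal rank can be sketched the same way (this is one common formulation; rel[] and totalRelevant are illustrative inputs, not Lucene API):

  // rel[i] is true if the result at rank i+1 is relevant;
  // totalRelevant is how many relevant documents exist for the query
  static double averagePrecision(boolean[] rel, int totalRelevant) {
    double sum = 0.0;
    int hits = 0;
    for (int i = 0; i < rel.length; i++) {
      if (rel[i]) {
        hits++;
        sum += (double) hits / (i + 1); // precision at this cutoff
      }
    }
    return totalRelevant == 0 ? 0.0 : sum / totalRelevant;
  }
  // 1/M, where M is the 1-based rank of the first relevant result
  static double reciprocalRank(boolean[] rel) {
    for (int i = 0; i < rel.length; i++)
      if (rel[i]) return 1.0 / (i + 1);
    return 0.0;
  }

The listing below instead lets Lucene do this work: it drives the quality package from the contrib/benchmark module against TREC-format topics and judgments.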
import java.io.File;
import java.io.PrintWriter;
import java.io.BufferedReader;
import java.io.FileReader;
import org.apache.lucene.search.*;
import org.apache.lucene.store.*;
import org.apache.lucene.benchmark.quality.*;
import org.apache.lucene.benchmark.quality.utils.*;
import org.apache.lucene.benchmark.quality.trec.*;
public class PrecisionRecall {
  public static void main(String[] args) throws Throwable {
    File topicsFile = new File("D:/Workspaces/suanfa/sohu3/src/lia/benchmark/topics.txt");
    File qrelsFile = new File("D:/Workspaces/suanfa/sohu3/src/lia/benchmark/qrels.txt");
    Directory dir = FSDirectory.open(new File("indexes/MeetLucene"));
    org.apache.lucene.search.Searcher searcher = new IndexSearcher(dir, true);
    String docNameField = "filename"; 
    PrintWriter logger = new PrintWriter(System.out, true); 
    TrecTopicsReader qReader = new TrecTopicsReader();   //#1 read TREC topics as QualityQuery[]
    QualityQuery qqs[] = qReader.readQueries(new BufferedReader(new FileReader(topicsFile))); //#1
    Judge judge = new TrecJudge(new BufferedReader(new FileReader(qrelsFile))); //#2 create a Judge from the TREC qrels file
    judge.validateData(qqs, logger);                     //#3 verify that topics and judgments match up
    QualityQueryParser qqParser = new SimpleQQParser("title", "contents");  //#4 turn each topic's title into a query on the contents field
    QualityBenchmark qrun = new QualityBenchmark(qqs, qqParser, searcher, docNameField);
    SubmissionReport submitLog = null;
    QualityStats stats[] = qrun.execute(judge, submitLog, logger); //#5 run the benchmark
    QualityStats avg = QualityStats.average(stats);      //#6 average the stats across all queries
    avg.log("SUMMARY", 2, logger, "  ");
    dir.close();
  }
}
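For reference, TrecTopicsReader parses topics in the TREC <top> format, and TrecJudge reads qrels lines of the form "topicID iteration docName relevance", where docName must match the values of the indexed filename field. A minimal hand-written pair could look like this (the content is illustrative, not the book's actual files):

topics.txt:
<top>
<num> Number: 0
<title> apache
<desc> Description:
<narr> Narrative:
</top>

qrels.txt:
0 0 apache1.0.txt 1
0 0 apache2.0.txt 1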
------------------------------------------
Below is a simplified version of
LIAsource\lia2e\src\lia\meetlucene\Indexer.java:
import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
public class Indexer {
	public static void main(String[] args) throws IOException {
		String indexDir = "D:\\Workspaces\\suanfa\\sohu3\\src\\lia\\meetlucene\\index";// args[0];
		String dataDir = "D:\\Workspaces\\suanfa\\sohu3\\src\\lia\\meetlucene\\data";// args[1];
		long start = System.currentTimeMillis();
		Directory dir = FSDirectory.open(new File(indexDir));
		IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_30), true, IndexWriter.MaxFieldLength.UNLIMITED); // create a writer, overwriting any existing index
		int numIndexed = 0;
		try {
			TextFilesFilter filter = new TextFilesFilter();
			File[] files = new File(dataDir).listFiles();
			for (File f : files) {
				if (!f.isDirectory() && !f.isHidden() && f.exists() && f.canRead() && filter.accept(f)) {
					System.out.println("Indexing " + f.getCanonicalPath());
					Document doc = new Document();
					doc.add(new Field("contents", new FileReader(f))); // file content: tokenized, not stored
					doc.add(new Field("filename", f.getName(), Field.Store.YES, Field.Index.NOT_ANALYZED)); // file name: stored, indexed as a single token
					doc.add(new Field("fullpath", f.getCanonicalPath(), Field.Store.YES, Field.Index.NOT_ANALYZED)); // full path: stored, indexed as a single token
					writer.addDocument(doc);
					numIndexed = writer.numDocs();
				}
			}
		} finally {
			writer.close();
		}
		long end = System.currentTimeMillis();
		System.out.println("Indexing " + numIndexed + " files took "+ (end - start) + " milliseconds");
	}
	private static class TextFilesFilter implements FileFilter {
		public boolean accept(File path) {
			return path.getName().toLowerCase().endsWith(".txt"); // index only .txt files
		}
	}
}
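The matching Searcher, simplified in the same way: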
import org.apache.lucene.document.Document;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;
import java.io.File;
import java.io.IOException;
public class Searcher {
  public static void main(String[] args) throws IllegalArgumentException,IOException, ParseException {
    String indexDir = "D:\\Workspaces\\suanfa\\sohu3\\src\\lia\\meetlucene\\index";//args[0]; index directory
    String q = "Redistri*";//args[1]; query string (a prefix query)
    Directory dir = FSDirectory.open(new File(indexDir)); // open the index directory
    IndexSearcher is = new IndexSearcher(dir);             // open a searcher over it
    QueryParser parser = new QueryParser(Version.LUCENE_30, "contents", new StandardAnalyzer(Version.LUCENE_30)); // parse user queries against the contents field
    Query query = parser.parse(q);
    long start = System.currentTimeMillis();
    TopDocs hits = is.search(query, 10); // collect the top 10 hits
    long end = System.currentTimeMillis();
    System.err.println("Found " + hits.totalHits + " document(s) (in " + (end - start) + " milliseconds) that matched query '" + q + "':");
    for(ScoreDoc scoreDoc : hits.scoreDocs) {
      Document doc = is.doc(scoreDoc.doc);     // retrieve the matching document
      System.out.println(doc.get("fullpath")); // print its stored fullpath field
    }
    is.close();  
  }
}
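QueryParser expands Redistri* into a prefix query (lowercasing the term first, which matches what StandardAnalyzer did at index time). The same queries can also be built programmatically; a small sketch against the Lucene 3.0 API, reusing the is searcher from the listing above:

import org.apache.lucene.index.Term;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.TermQuery;

Query byTerm = new TermQuery(new Term("contents", "redistribution")); // exact, already-analyzed term
Query byPrefix = new PrefixQuery(new Term("contents", "redistri"));   // equivalent of Redistri*
TopDocs docs = is.search(byPrefix, 10);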
--------------------
Counting the occurrences of each word in a text file directly, without using Lucene:
package com.hao;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class UserTreeMap {
	/**
	 * @param args
	 * @throws Exception
	 */
	public static void main(String[] args) throws Exception {
		//test();
		Map<String, Integer> map = getMapFromFile("D:\\Workspaces\\suanfa\\sohu3\\src\\english.txt");
		for (Map.Entry<String, Integer> entry : map.entrySet()) {
			System.out.println(entry.getKey() + "--" + entry.getValue());
		}
	}
	public static Map<String, Integer> getMapFromFile(String filepath) throws Exception {
		BufferedReader buf = new BufferedReader(new FileReader(filepath));
		StringBuffer sbuf = new StringBuffer(); // accumulates the whole file
		String line = null;
		while ((line = buf.readLine()) != null) {
			sbuf.append(line); // append each line
		}
		buf.close(); // close the reader
		Pattern expression = Pattern.compile("[a-zA-Z]+"); // a word is a run of letters
		String string1 = sbuf.toString(); // .toLowerCase() here would make counting case-insensitive
		Matcher matcher = expression.matcher(string1); // scan string1 for words
		TreeMap<String, Integer> myTreeMap = new TreeMap<String, Integer>(); // word -> count, sorted by word
		while (matcher.find()) { // for each word found
			String word = matcher.group();
			Integer count = myTreeMap.get(word);
			myTreeMap.put(word, count == null ? 1 : count + 1); // increment the word's count
		}
		return myTreeMap;
	}
	public static void test() throws Exception {
		BufferedReader buf = new BufferedReader(new FileReader("D:\\sohu3\\english.txt"));
		System.out.println("Reading english.txt under this dir");
		StringBuffer sbuf = new StringBuffer(); // accumulates the whole file
		String line = null;
		while ((line = buf.readLine()) != null) {
			sbuf.append(line); // append each line
		}
		buf.close(); // close the reader
		Pattern expression = Pattern.compile("[a-zA-Z]+"); // a word is a run of letters
		String string1 = sbuf.toString().toLowerCase(); // count case-insensitively
		Matcher matcher = expression.matcher(string1); // scan string1 for words
		TreeMap<String, Integer> myTreeMap = new TreeMap<String, Integer>(); // word -> count, sorted by word
		int n = 0; // total number of word occurrences
		while (matcher.find()) { // for each word found
			String word = matcher.group();
			n++; // count every occurrence
			Integer count = myTreeMap.get(word);
			myTreeMap.put(word, count == null ? 1 : count + 1); // increment the word's count
		}
		System.out.println("Statistics:");
		System.out.println("The text contains " + n + " words in total");
		System.out.println("Writing the per-word counts to result.txt");
		BufferedWriter bufw = new BufferedWriter(new FileWriter("result.txt"));
		for (String key : myTreeMap.keySet()) { // TreeMap iterates in sorted key order
			bufw.write(key + ":" + myTreeMap.get(key)); // word:count, one per line
			bufw.newLine();
		}
		bufw.write("english.txt contains " + n + " words in total");
		bufw.newLine();
		bufw.write("english.txt contains " + myTreeMap.size() + " distinct words");
		bufw.close();
	}
}