luceneの基本的な検索機能


ネット上で広く紹介されている Lucene の使用方法に、いくつか修正を加えました。ネット上のサンプルは、あるディレクトリの直下にあるすべての HTML ファイルをインデックス化して検索するものですが、複数階層のディレクトリ(サブディレクトリ)の検索には対応していません。そこで、サブディレクトリも再帰的に処理できるように少し修正しました。コードの大部分はネット上のものをそのまま利用しています。
Constants.java
package testlucene;

public class Constants {
        //           
        public final static String INDEX_FILE_PATH = "c:\\dataDir";
        
        //       
        public final static String INDEX_STORE_PATH = "c:\\indexDir";
}

LuceneIndex.java
package testlucene;
import java.io.*;
import java.util.*;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.mira.lucene.analysis.IK_CAnalyzer;

/**
 * Builds a Lucene index from every regular file found under
 * {@link Constants#INDEX_FILE_PATH}, descending into sub-directories.
 *
 * <p>Each file becomes one document with two fields: {@code contents}
 * (fed from a reader over the file) and {@code path} (the canonical path,
 * stored in the index).
 */
public class LuceneIndex {
    private IndexWriter writer = null;

    /**
     * Opens an index writer on {@link Constants#INDEX_STORE_PATH}.
     *
     * <p>A failure is only printed; {@code writer} then stays {@code null}
     * and {@link #writeToIndex()} reports the problem when it is called.
     */
    public LuceneIndex() {
        try {
            // NOTE(review): the trailing 'true' presumably tells Lucene to create
            // (overwrite) the index rather than append - confirm against the
            // IndexWriter version in use.
            writer = new IndexWriter(Constants.INDEX_STORE_PATH, new IK_CAnalyzer(), true);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Creates the Lucene document for one file.
     *
     * @param f file to index
     * @return a document with {@code contents} and {@code path} fields, or an
     *         empty document when {@code f} is not a regular file
     * @throws Exception if the file cannot be opened or its canonical path
     *                   cannot be resolved
     */
    @SuppressWarnings("deprecation")
    private Document getDocument(File f) throws Exception {
        Document doc = new Document();
        if (f.isFile()) {
            // The reader is handed to the field unread, so it must stay open here.
            // It decodes with the platform default charset.
            FileInputStream is = new FileInputStream(f);
            Reader reader = new BufferedReader(new InputStreamReader(is));
            doc.add(new Field("contents", reader));
            doc.add(new Field("path", f.getCanonicalPath(), Field.Store.YES, Field.Index.TOKENIZED));
        }
        return doc;
    }

    /**
     * Adds one document per file found under {@link Constants#INDEX_FILE_PATH}.
     * Does nothing when that path is not a directory.
     *
     * @throws IllegalStateException if the index writer could not be opened
     *                               by the constructor
     * @throws Exception             if reading a file or writing to the index fails
     */
    public void writeToIndex() throws Exception {
        if (writer == null) {
            // Fail with a meaningful message instead of a bare NullPointerException.
            throw new IllegalStateException(
                    "IndexWriter could not be opened on " + Constants.INDEX_STORE_PATH);
        }
        File folder = new File(Constants.INDEX_FILE_PATH);
        if (!folder.isDirectory()) {
            return;
        }
        File[] files = getFileList(folder);
        for (File file : files) {
            Document doc = getDocument(file);
            // The message text is whitespace-only in this copy of the source; kept as-is.
            System.out.println("        (" + file + ")     ...");
            writer.addDocument(doc);
        }
    }

    /**
     * Closes the index writer, if one was opened.
     *
     * @throws Exception if the writer fails to close
     */
    public void close() throws Exception {
        if (writer != null) {
            writer.close();
        }
    }

    /**
     * Indexes the configured directory and prints the elapsed milliseconds.
     *
     * @param args unused
     * @throws Exception if indexing fails
     */
    public static void main(String[] args) throws Exception {
        LuceneIndex indexer = new LuceneIndex();
        try {
            Date start = new Date();
            indexer.writeToIndex();
            Date end = new Date();
            System.out.println("       " + (end.getTime() - start.getTime()) + "  ");
        } finally {
            // Release the writer even when indexing throws.
            indexer.close();
        }
    }

    /**
     * Lists every regular file at or below {@code file}, depth-first, in the
     * order returned by {@link File#listFiles()}.
     *
     * @param file a file or directory
     * @return the regular files found; empty when there are none
     */
    private File[] getFileList(File file) {
        List<File> found = new ArrayList<File>();
        collectFiles(file, found);
        return found.toArray(new File[found.size()]);
    }

    /** Appends {@code file}, or every regular file beneath it, to {@code found}. */
    private void collectFiles(File file, List<File> found) {
        if (file.isFile()) {
            found.add(file);
            return;
        }
        if (!file.isDirectory()) {
            return;
        }
        File[] children = file.listFiles();
        if (children == null) {
            // listFiles() returns null on an I/O error or an unreadable directory.
            return;
        }
        for (File child : children) {
            collectFiles(child, found);
        }
    }

}
package testlucene;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.util.*;
import org.apache.lucene.document.*;
import org.apache.lucene.queryParser.*;
import org.apache.lucene.search.*;
import org.mira.lucene.analysis.IK_CAnalyzer;

public class LuceneSearch {
	private IndexSearcher searcher = null;
	private Query query = null;
	private  File shopInfoTxt = null;
	private RandomAccessFile bw;
	

	
	public LuceneSearch() {
		try {
			searcher = new IndexSearcher(Constants.INDEX_STORE_PATH);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	@SuppressWarnings("deprecation")
	public final Hits Search(String keyword) {
		System.out.println("        " + keyword);
		try {
			query = new QueryParser("contents", new IK_CAnalyzer())
					.parse(keyword);
			Date start = new Date();
			Hits hits = searcher.search(query);
			Date end = new Date();
			System.out.println("    ,  " + (end.getTime() - start.getTime())
					+ "  ");
			return hits;
		} catch (Exception e) {
			e.printStackTrace();
			return null;
		}
	}

	@SuppressWarnings("deprecation")
	public String printResult(Hits h, String test) {//             , |  
		if (h.length() == 0) {
			System.out.println("   ,         。");
			return "";
		} else {
			for (int i = 0; i < h.length(); i++) {
				try {
					Document doc = h.doc(i);
					System.out.print("   " + (i + 1) + "       ,     :");
					System.out.println(doc.get("path"));

					BufferedReader br = new BufferedReader(new FileReader(doc.get("path")));
					String line = null;
					int lineNum = 0;
					while ((line = br.readLine()) != null) {
						lineNum++;

						if (line.indexOf(test) != -1)
							return doc.get("path")+"|"+lineNum;
					}

				} catch (Exception e) {
					e.printStackTrace();
				}
			}
		}
		return "";
	}

	public static void main(String[] args) throws Exception {
		
		LuceneSearch temp = new LuceneSearch();
		
		
		//      
		String content = temp.getInnerContent3("nav_w","main_w");
		content = content.replaceAll("&nbsp;", "");
		content = content.replaceAll("&gt;", ">");
		System.out.println("==========    :=============
"+content+"
================"); String typeinfo = content; // String shopInfo = ""; content = temp.getInnerContent3("shopInfo","shopRemark"); shopInfo = content; //DataOutputStream write = new DataOutputStream(new FileOutputStream(temp.getShopInfoTxt())); //write.write(typeinfo.getBytes()); temp.getBw().write(typeinfo.getBytes()); System.out.println(" :
"+content); // int addStart = content.indexOf(" "); int addEnd = content.indexOf(" "); if(addStart>0&&addEnd>0){ content = content.substring(addStart+3, addEnd); content = content.replaceAll("&nbsp;", ""); System.out.println("========== :=============
"+content+"
==============="); } //write.write(content.getBytes()); temp.getBw().write(content.getBytes()); temp.getBw().write("
".getBytes()); // content = shopInfo; int telStart = content.indexOf(" "); int telEnd = content.indexOf(" "); if(telStart>0&&telEnd>0){ content = content.substring(telStart+3,telEnd); content = content.replaceAll("&nbsp;", ""); System.out.println("================= :================
"+content+"
=================="); //write.write(content.getBytes()); temp.getBw().write(content.getBytes()); temp.getBw().write("
".getBytes()); } // content = shopInfo; int introStart = content.indexOf(" "); int introEnd = content.indexOf(" "); if(introStart>0&&introEnd>0){ content = content.substring(introStart+5,introEnd); content = content.replaceAll("&nbsp;", ""); System.out.println("================= :===============
"+content+"
==================="); //write.write(content.getBytes()); temp.getBw().write(content.getBytes()); temp.getBw().write("
".getBytes()); } // content = shopInfo; int typeStart = content.indexOf(" "); int typeEnd = content.indexOf(" "); if(typeStart>0&&typeEnd>0){ content = content.substring(typeStart+4,typeEnd); content = content.replaceAll("&nbsp;", ""); System.out.println("================= :===============
"+content+"
==================="); //write.write(content.getBytes()); temp.getBw().write(content.getBytes()); temp.getBw().write("
".getBytes()); } // content = shopInfo; int suggestEnd = content.lastIndexOf(")"); if(typeEnd>0&&suggestEnd>0){ content = content.substring(typeEnd+4,suggestEnd+1); content = content.replaceAll("&nbsp;", ""); System.out.println("================= :===============
"+content+"
==================="); //write.write(content.getBytes()); //write.close(); temp.getBw().write(content.getBytes()); temp.getBw().write("
".getBytes()); } temp.getBw().close(); } @SuppressWarnings("deprecation") public String getInnerContent3(String first,String sec) throws Exception, FileNotFoundException{ // , LuceneSearch test = new LuceneSearch(); Hits h = null; String scrrenString = ""; //first = "nav_w";// h = test.Search(first); String start = test.printResult(h, first); String fileName = start.substring(0, start.indexOf("|")); int startLine = Integer.parseInt(start.substring(start.indexOf("|")+1, start.length())); //sec = "main_w";// h = test.Search(sec); String end = test.printResult(h, sec); String fileName2 = start.substring(0, start.indexOf("|")); int endLine = Integer.parseInt(end.substring(end.indexOf("|")+1, end.length())); if(fileName2.equalsIgnoreCase(fileName)){ String tempFileName = ""; tempFileName = fileName.substring(fileName.lastIndexOf("\\")).replace(".html", ".txt"); tempFileName = fileName.substring(fileName.lastIndexOf("\\")).replace(".htm", ".txt"); shopInfoTxt = new File("c:/temp/",tempFileName); if(!shopInfoTxt.exists())shopInfoTxt.createNewFile(); bw = new RandomAccessFile(shopInfoTxt,"rw"); BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(fileName),"utf-8")); String line = null; int lineNum = 0; System.out.println("sentences from "+startLine+" to "+endLine); while ((line = br.readLine()) != null) { lineNum++; if (lineNum>=startLine&&lineNum<endLine)scrrenString+=line; } scrrenString = getShortFormat(scrrenString); scrrenString = scrrenString.substring(scrrenString.indexOf(":")+1); } return scrrenString; } public File getShopInfoTxt() { return shopInfoTxt; } public RandomAccessFile getBw() { return bw; } private static String getShortFormat(String content){// <> String finalString = content.trim(); int first = finalString.indexOf("<"); int end = finalString.indexOf(">"); if(first>-1&&end>-1){ finalString = finalString.substring(0, first).trim()+finalString.substring(end+1, finalString.length()).trim(); finalString = 
getShortFormat(finalString); } return finalString; } }