luceneの基本的な検索機能
ネット上で広く紹介されている Lucene の使用例に、いくつか修正を加えました。ネット上の例は、あるディレクトリの直下にあるすべての HTML ファイルをインデックス化して検索するものですが、サブディレクトリを含む複数階層のディレクトリには対応していません。ここではその点を少し修正しています。コードの大部分はネット上のものをそのまま利用しています。
Constants.java
LuceneIndex.java
Constants.java
package testlucene;
/**
 * Filesystem locations shared by the indexing and search classes.
 */
public class Constants {
    /** Root directory whose files are fed to the indexer. */
    public static final String INDEX_FILE_PATH = "c:\\dataDir";

    /** Directory in which the Lucene index is written and later opened for searching. */
    public static final String INDEX_STORE_PATH = "c:\\indexDir";
}
LuceneIndex.java
package testlucene;
import java.io.*;
import java.util.*;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.mira.lucene.analysis.IK_CAnalyzer;
/**
 * Builds a Lucene index over every regular file found under
 * {@link Constants#INDEX_FILE_PATH}, descending into sub-directories.
 *
 * <p>Each file becomes one document with two fields: {@code contents}
 * (the file text, supplied as a reader) and {@code path} (the canonical
 * path, stored so that search results can report it).
 */
public class LuceneIndex {
    /** Writer on the index directory; stays {@code null} if the constructor could not open it. */
    private IndexWriter writer = null;

    /**
     * Opens an index writer on {@link Constants#INDEX_STORE_PATH}.
     *
     * <p>A failure is printed and otherwise ignored, which leaves {@link #writer}
     * {@code null}; {@link #writeToIndex()} reports that state explicitly.
     */
    public LuceneIndex() {
        try {
            // Third argument is IndexWriter's "create" flag: true asks Lucene to start a
            // fresh index, replacing any existing one (see the IndexWriter Javadoc).
            writer = new IndexWriter(Constants.INDEX_STORE_PATH, new IK_CAnalyzer(), true);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Builds the Lucene document for one file.
     *
     * @param f the file to index
     * @return a document with {@code contents} and {@code path} fields, or an
     *         empty document when {@code f} is not a regular file
     * @throws Exception if the file cannot be opened or its canonical path resolved
     */
    @SuppressWarnings("deprecation")
    private Document getDocument(File f) throws Exception {
        Document doc = new Document();
        if (f.isFile()) {
            FileInputStream is = new FileInputStream(f);
            // The reader is intentionally left open: per the Lucene Field Javadoc a
            // Reader-valued field is only consumed when the document is added to the index.
            // NOTE(review): the platform default charset is used here - confirm it matches the data.
            Reader reader = new BufferedReader(new InputStreamReader(is));
            doc.add(new Field("contents", reader));
            doc.add(new Field("path", f.getCanonicalPath(), Field.Store.YES, Field.Index.TOKENIZED));
        }
        return doc;
    }

    /**
     * Adds every file below {@link Constants#INDEX_FILE_PATH} to the index.
     * Does nothing when that path is not a directory.
     *
     * @throws IllegalStateException if the index writer could not be opened by the constructor
     * @throws Exception if reading a file or writing to the index fails
     */
    public void writeToIndex() throws Exception {
        File folder = new File(Constants.INDEX_FILE_PATH);
        if (!folder.isDirectory()) {
            return;
        }
        if (writer == null) {
            throw new IllegalStateException(
                    "index writer was not opened: " + Constants.INDEX_STORE_PATH);
        }
        for (File file : getFileList(folder)) {
            Document doc = getDocument(file);
            System.out.println(" (" + file + ") ...");
            writer.addDocument(doc);
        }
    }

    /**
     * Closes the index writer; a no-op when the writer was never opened.
     *
     * @throws Exception if the writer fails to close
     */
    public void close() throws Exception {
        if (writer != null) {
            writer.close();
        }
    }

    /**
     * Indexes the data directory and prints the elapsed time in milliseconds.
     *
     * @param args unused
     * @throws Exception if indexing or closing the writer fails
     */
    public static void main(String[] args) throws Exception {
        LuceneIndex indexer = new LuceneIndex();
        try {
            long start = System.currentTimeMillis();
            indexer.writeToIndex();
            long end = System.currentTimeMillis();
            System.out.println(" " + (end - start) + " ");
        } finally {
            // Close even when indexing fails so the writer is not left open.
            indexer.close();
        }
    }

    /**
     * Lists the regular files reachable from {@code file}.
     *
     * @param file a file or directory
     * @return {@code file} itself when it is a regular file; otherwise every
     *         regular file below it, in directory-listing order with
     *         sub-directories expanded in place; empty when nothing is found
     */
    private File[] getFileList(File file) {
        if (file.isFile()) {
            return new File[] { file };
        }
        List<File> found = new ArrayList<File>();
        collectFiles(file, found);
        return found.toArray(new File[found.size()]);
    }

    /** Appends every regular file below {@code dir} to {@code sink}, depth-first. */
    private void collectFiles(File dir, List<File> sink) {
        File[] children = dir.listFiles();
        if (children == null) {
            // listFiles() yields null when dir is not a directory or cannot be read.
            return;
        }
        for (File child : children) {
            if (child.isFile()) {
                sink.add(child);
            } else if (child.isDirectory()) {
                collectFiles(child, sink);
            }
        }
    }
}
package testlucene;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.util.*;
import org.apache.lucene.document.*;
import org.apache.lucene.queryParser.*;
import org.apache.lucene.search.*;
import org.mira.lucene.analysis.IK_CAnalyzer;
public class LuceneSearch {
/** Searcher over the index at Constants.INDEX_STORE_PATH; remains null if the constructor failed to open it. */
private IndexSearcher searcher = null;
/** Query parsed by the most recent call to Search. */
private Query query = null;
/** Output text file chosen by getInnerContent3; null until that method has created it. */
private File shopInfoTxt = null;
/** Read/write handle on shopInfoTxt, opened by getInnerContent3; null until then. */
private RandomAccessFile bw;
/**
 * Opens a searcher on the index stored at Constants.INDEX_STORE_PATH.
 * Any failure is printed and swallowed, which leaves searcher null.
 */
public LuceneSearch() {
try {
searcher = new IndexSearcher(Constants.INDEX_STORE_PATH);
} catch (Exception e) {
e.printStackTrace();
}
}
/**
 * Runs {@code keyword} as a query against the {@code contents} field and
 * prints the keyword and the elapsed search time in milliseconds.
 *
 * <p>The parsed query is also kept in the {@code query} field.
 *
 * @param keyword query text handed to the query parser
 * @return the hits for the query, or {@code null} if parsing or searching
 *         threw (the stack trace is printed)
 */
@SuppressWarnings("deprecation")
public final Hits Search(String keyword) {
    System.out.println(" " + keyword);
    Hits found;
    try {
        QueryParser parser = new QueryParser("contents", new IK_CAnalyzer());
        query = parser.parse(keyword);
        long startedAt = System.currentTimeMillis();
        found = searcher.search(query);
        long finishedAt = System.currentTimeMillis();
        System.out.println(" , " + (finishedAt - startedAt) + " ");
    } catch (Exception e) {
        e.printStackTrace();
        return null;
    }
    return found;
}
/**
 * Prints the path of each hit and locates the first line containing
 * {@code test} in the hit files, scanning hits in order.
 *
 * <p>Files are read with the platform default charset. A hit whose file
 * cannot be read is reported on stderr and skipped.
 *
 * @param h    hits to scan; {@code null} (what {@code Search} returns on
 *             failure) is treated like an empty result
 * @param test text to look for inside each hit file
 * @return {@code path|lineNumber} (1-based) of the first matching line, or
 *         the empty string when there are no hits or no line matches
 */
@SuppressWarnings("deprecation")
public String printResult(Hits h, String test) {
    if (h == null || h.length() == 0) {
        System.out.println(" , 。");
        return "";
    }
    for (int i = 0; i < h.length(); i++) {
        BufferedReader br = null;
        try {
            Document doc = h.doc(i);
            String path = doc.get("path");
            System.out.print(" " + (i + 1) + " , :");
            System.out.println(path);
            br = new BufferedReader(new FileReader(path));
            String line;
            int lineNum = 0;
            while ((line = br.readLine()) != null) {
                lineNum++;
                if (line.indexOf(test) != -1) {
                    return path + "|" + lineNum;
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Release the file handle on every path, including the early return above.
            if (br != null) {
                try {
                    br.close();
                } catch (java.io.IOException closeFailure) {
                    closeFailure.printStackTrace();
                }
            }
        }
    }
    return "";
}
/**
 * Extracts several sections of an indexed page and appends them to the
 * file handle returned by getBw().
 *
 * Two regions are pulled with getInnerContent3: the text between the
 * nav_w and main_w markers, and the text between the shopInfo and
 * shopRemark markers. The second region is then cut into sub-sections
 * by searching for label strings, and each sub-section is printed and
 * written out.
 *
 * NOTE(review): several string literals below are split across physical
 * lines and every indexOf argument is a single blank, so the start and end
 * markers of each section are identical. This looks like text lost during
 * extraction (probably non-ASCII labels and escaped newlines) - restore the
 * original literals before relying on this method.
 *
 * @param args unused
 * @throws Exception if searching, reading the page or writing the output fails
 */
public static void main(String[] args) throws Exception {
LuceneSearch temp = new LuceneSearch();
// Region between the nav_w and main_w markers.
String content = temp.getInnerContent3("nav_w","main_w");
content = content.replaceAll(" ", "");
// NOTE(review): pattern and replacement are identical here - presumably an HTML entity was decoded in the pattern; confirm.
content = content.replaceAll(">", ">");
System.out.println("========== :=============
"+content+"
================");
String typeinfo = content;
// Region between the shopInfo and shopRemark markers; kept in shopInfo so each section below can restart from it.
String shopInfo = "";
content = temp.getInnerContent3("shopInfo","shopRemark");
shopInfo = content;
//DataOutputStream write = new DataOutputStream(new FileOutputStream(temp.getShopInfoTxt()));
//write.write(typeinfo.getBytes());
// getBw() is the handle opened by the most recent getInnerContent3 call.
temp.getBw().write(typeinfo.getBytes());
System.out.println(" :
"+content);
// First sub-section (presumably the address, judging by the addStart/addEnd names - confirm).
int addStart = content.indexOf(" ");
int addEnd = content.indexOf(" ");
if(addStart>0&&addEnd>0){
// Skips 3 characters past the start marker, presumably the length of the lost label.
content = content.substring(addStart+3, addEnd);
content = content.replaceAll(" ", "");
System.out.println("========== :=============
"+content+"
===============");
}
// Unlike the later sections, this write happens even when the markers were not found,
// in which case content is still the whole shopInfo region.
//write.write(content.getBytes());
temp.getBw().write(content.getBytes());
temp.getBw().write("
".getBytes());
// Second sub-section (presumably the telephone number, judging by the telStart/telEnd names - confirm).
content = shopInfo;
int telStart = content.indexOf(" ");
int telEnd = content.indexOf(" ");
if(telStart>0&&telEnd>0){
content = content.substring(telStart+3,telEnd);
content = content.replaceAll(" ", "");
System.out.println("================= :================
"+content+"
==================");
//write.write(content.getBytes());
temp.getBw().write(content.getBytes());
temp.getBw().write("
".getBytes());
}
// Third sub-section (presumably an introduction, judging by the introStart/introEnd names - confirm).
content = shopInfo;
int introStart = content.indexOf(" ");
int introEnd = content.indexOf(" ");
if(introStart>0&&introEnd>0){
content = content.substring(introStart+5,introEnd);
content = content.replaceAll(" ", "");
System.out.println("================= :===============
"+content+"
===================");
//write.write(content.getBytes());
temp.getBw().write(content.getBytes());
temp.getBw().write("
".getBytes());
}
// Fourth sub-section (presumably the category or type, judging by the typeStart/typeEnd names - confirm).
content = shopInfo;
int typeStart = content.indexOf(" ");
int typeEnd = content.indexOf(" ");
if(typeStart>0&&typeEnd>0){
content = content.substring(typeStart+4,typeEnd);
content = content.replaceAll(" ", "");
System.out.println("================= :===============
"+content+"
===================");
//write.write(content.getBytes());
temp.getBw().write(content.getBytes());
temp.getBw().write("
".getBytes());
}
// Last sub-section: starts 4 characters after the previous section's end marker (typeEnd)
// and runs through the last closing parenthesis in the region.
content = shopInfo;
int suggestEnd = content.lastIndexOf(")");
if(typeEnd>0&&suggestEnd>0){
content = content.substring(typeEnd+4,suggestEnd+1);
content = content.replaceAll(" ", "");
System.out.println("================= :===============
"+content+"
===================");
//write.write(content.getBytes());
//write.close();
temp.getBw().write(content.getBytes());
temp.getBw().write("
".getBytes());
}
// Not closed if anything above throws.
temp.getBw().close();
}
/**
 * Returns the tag-stripped text found between two marker strings of one
 * indexed file.
 *
 * <p>Each marker is searched through the index and located as a line number
 * with {@code printResult}. When both markers resolve to the same file, the
 * lines from the {@code first} marker's line (inclusive) up to the
 * {@code sec} marker's line (exclusive) are read as UTF-8, joined without
 * separators, stripped of {@code <...>} tags, and cut after the first
 * {@code ':'}.
 *
 * <p>Side effects: {@code shopInfoTxt} is pointed at
 * {@code c:/temp/<page name>.txt} (created if missing) and {@code bw} is
 * reopened on it; a handle left from an earlier call is closed first.
 *
 * @param first marker that starts the region
 * @param sec   marker that ends the region
 * @return the extracted text, or the empty string when the two markers were
 *         found in different files
 * @throws FileNotFoundException if a marker is not found in any indexed
 *         file, or the matched file cannot be opened
 * @throws Exception if reading the page or creating the output file fails
 */
@SuppressWarnings("deprecation")
public String getInnerContent3(String first,String sec) throws Exception, FileNotFoundException{
    // Search with this instance; opening a second searcher per call is unnecessary.
    String start = printResult(Search(first), first);
    int startSep = start.lastIndexOf("|");
    if (startSep < 0) {
        throw new FileNotFoundException("no indexed file contains marker: " + first);
    }
    String end = printResult(Search(sec), sec);
    int endSep = end.lastIndexOf("|");
    if (endSep < 0) {
        throw new FileNotFoundException("no indexed file contains marker: " + sec);
    }

    String fileName = start.substring(0, startSep);
    int startLine = Integer.parseInt(start.substring(startSep + 1));
    // The end marker's file comes from the END result, so the check below is meaningful.
    String fileName2 = end.substring(0, endSep);
    int endLine = Integer.parseInt(end.substring(endSep + 1));
    if (!fileName2.equalsIgnoreCase(fileName)) {
        return "";
    }

    // Do not leak the handle opened by a previous call.
    if (bw != null) {
        bw.close();
    }
    shopInfoTxt = new File("c:/temp/", toTxtName(fileName));
    if (!shopInfoTxt.exists()) {
        shopInfoTxt.createNewFile();
    }
    bw = new RandomAccessFile(shopInfoTxt, "rw");

    StringBuilder collected = new StringBuilder();
    BufferedReader br = new BufferedReader(
            new InputStreamReader(new FileInputStream(fileName), "utf-8"));
    try {
        System.out.println("sentences from "+startLine+" to "+endLine);
        String line;
        int lineNum = 0;
        while ((line = br.readLine()) != null) {
            lineNum++;
            if (lineNum >= endLine) {
                break; // nothing at or past the end marker is wanted
            }
            if (lineNum >= startLine) {
                collected.append(line);
            }
        }
    } finally {
        br.close();
    }

    String scrrenString = getShortFormat(collected.toString());
    // indexOf returns -1 when there is no ':', which keeps the whole string.
    return scrrenString.substring(scrrenString.indexOf(":") + 1);
}

/**
 * Derives the output file name for a page: the last path component with a
 * trailing {@code .html} or {@code .htm} replaced by {@code .txt}. Other
 * names are returned unchanged.
 */
private static String toTxtName(String path) {
    // Accept either separator so the path need not be a Windows one.
    int cut = Math.max(path.lastIndexOf('\\'), path.lastIndexOf('/'));
    String name = path.substring(cut + 1);
    String lower = name.toLowerCase(Locale.ENGLISH);
    // Test the longer extension first; otherwise "x.html" would become "x.txtl".
    if (lower.endsWith(".html")) {
        return name.substring(0, name.length() - ".html".length()) + ".txt";
    }
    if (lower.endsWith(".htm")) {
        return name.substring(0, name.length() - ".htm".length()) + ".txt";
    }
    return name;
}
/**
 * Returns the output text file selected by {@code getInnerContent3}.
 *
 * @return the file, or {@code null} if no file has been selected yet
 */
public File getShopInfoTxt() {
    return this.shopInfoTxt;
}
/**
 * Returns the read/write handle opened on the output text file by
 * {@code getInnerContent3}.
 *
 * @return the handle, or {@code null} if it has not been opened yet
 */
public RandomAccessFile getBw() {
    return this.bw;
}
/**
 * Removes every {@code <...>} span from {@code content}.
 *
 * <p>The input is trimmed first, and the text on either side of each removed
 * span is trimmed before being joined, so whitespace next to a tag
 * disappears. A {@code '<'} with no later {@code '>'} ends processing and
 * the rest is kept as is; a {@code '>'} that appears before any
 * {@code '<'} is ordinary text.
 *
 * @param content text that may contain markup; {@code null} yields ""
 * @return the text with all complete {@code <...>} spans removed
 */
private static String getShortFormat(String content){
    if (content == null) {
        return "";
    }
    String finalString = content.trim();
    int first = finalString.indexOf('<');
    while (first > -1) {
        // Look for the closing bracket only AFTER the opening one. Pairing '<' with an
        // earlier '>' duplicated text on every pass and never terminated.
        int end = finalString.indexOf('>', first + 1);
        if (end == -1) {
            break;
        }
        finalString = finalString.substring(0, first).trim()
                + finalString.substring(end + 1).trim();
        first = finalString.indexOf('<');
    }
    return finalString;
}
}