LuceneインデックスをHadoopに書き込むにはどうすればいいですか?

8745 ワード

mapreduce hadoop Solr Lucene 全文検索

転載の際は、必ず出典(元記事のURL)を明記してください。ご協力ありがとうございます。http://qindongliang1922.iteye.com/blog/2088076
HadoopはもともとLuceneのサブプロジェクトとして生まれ、現在は盛んに発展しています。では、Hadoopの分散処理能力を利用して、Luceneのインデックス構築の効率を高めるにはどうすればよいでしょうか。それができれば、HDFSのすべての利点を十分に活用できます。しかし、よく知られているように、HDFSはランダム読み取りとの相性が良くありません。一方、Luceneのような全文検索フレームワークでは、ほとんどすべての検索操作にランダムな読み書きが欠かせません。では、どうすればLuceneをhadoopとうまく組み合わせられるのでしょうか。実はhadoopのディストリビューションには、contribのツールパッケージの中にLuceneインデックス用のツールクラスが含まれていますが、使っている人は少ないようです。散仙もこれを使ったことがないので、ここでは論評を控えます。solr 4.4以降のプロジェクトには、HDFSにインデックスを書き込むためのjarパッケージが統合されています。solrを使っているのであれば、solrconfig.xmlでDirectoryの実装クラスをHdfsDirectoryに設定するだけで、簡単にインデックスをHDFS上に構築できます。ただし、solr 4.4のjarがサポートしているのは最新版のhadoop、つまり2.0以降だけです。そのまま1.xのhadoopで使用すると例外が発生します。これは、2.xと1.xでhadoopのAPIが変わったためです。散仙が一部のソースコードを修正したところ、1.xのhadoopでもインデックスの作成と検索ができるようになりました。文末にこれらのクラスをアップロードしておきますので、使用する際はこれらのクラスをプロジェクトに取り込むだけで済みます。では、散仙のテストdemoのソースコードを見てみましょう。

package  indexhadoop;

import hdfs.HdfsDirectory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;
 

/**
 * Demo that writes, deletes from and searches a Lucene 4.6 index stored on
 * HDFS through {@code HdfsDirectory} (intended for Hadoop 1.x).
 *
 * <p>Each helper opens its own writer or reader and releases it before
 * returning, including when an exception is thrown, so that a failed run does
 * not leave the index write lock held by a leaked {@link IndexWriter}.
 *
 * @author qindongliang
 */
public class MyIndex {

    /** HDFS location of the index that {@link #getIndexWriter()} writes to. */
    private static final String WRITE_INDEX_URI = "hdfs://192.168.75.130:9000/root/index";

    /**
     * HDFS location of the index that {@link #query(String)} reads.
     * NOTE(review): this differs from {@link #WRITE_INDEX_URI}; presumably it
     * is an index produced by a separate job -- confirm before relying on it.
     */
    private static final String SEARCH_INDEX_URI = "hdfs://192.168.75.130:9000/root/output/map1";

    /** Maximum number of hits requested from each search. */
    private static final int MAX_HITS = 100;

    /**
     * Entry point of the demo. Runs a single query; the commented-out lines
     * show how to time a bulk load or delete a document by id instead.
     *
     * @param args unused
     * @throws Exception if the index cannot be opened or searched
     */
    public static void main(String[] args) throws Exception {
        // long a=System.currentTimeMillis();
        // add();
        // long b=System.currentTimeMillis();
        // System.out.println("  : "+(b-a)+"  ");
        query("  ");
        // delete("3"); // delete the document whose id is "3"
    }

    /**
     * Opens an {@link IndexWriter} on the HDFS-backed index at
     * {@link #WRITE_INDEX_URI}, analysing text with
     * {@link SmartChineseAnalyzer}.
     *
     * <p>The caller owns the returned writer and must close it (or roll it
     * back) when done.
     *
     * @return a new writer on the HDFS index directory
     * @throws Exception if the directory or the writer cannot be created
     */
    public static IndexWriter getIndexWriter() throws Exception {
        Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_46);
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46, analyzer);
        Configuration conf = new Configuration();
        Path path = new Path(WRITE_INDEX_URI);
        HdfsDirectory directory = new HdfsDirectory(path, conf);
        return new IndexWriter(directory, config);
    }

    /**
     * Bulk-loads generated documents with ids 6 (inclusive) to 10000
     * (exclusive), committing whenever the id is a multiple of 1000, then
     * merges the index down to one segment and commits again.
     *
     * @throws Exception if indexing fails; uncommitted changes are rolled back
     */
    public static void add() throws Exception {
        IndexWriter writer = getIndexWriter();
        boolean success = false;
        try {
            for (int i = 6; i < 10000; i++) {
                Document doc = new Document();
                doc.add(new StringField("id", i + "", Store.YES));
                doc.add(new StringField("name", "lucene              " + i, Store.YES));
                doc.add(new TextField("content", "       " + i, Store.YES));
                writer.addDocument(doc);
                // Commit in batches so a failure loses at most the current batch.
                if (i % 1000 == 0) {
                    writer.commit();
                }
            }
            writer.forceMerge(1);
            writer.commit();
            System.out.println("  10000       !");
            success = true;
        } finally {
            release(writer, success);
        }
    }

    /**
     * Adds a single document, merges the index down to one segment and
     * commits. Prints the same status line as {@link #add()}.
     *
     * @param d the document to index
     * @throws Exception if indexing fails; the uncommitted change is rolled back
     */
    public static void add(Document d) throws Exception {
        IndexWriter writer = getIndexWriter();
        boolean success = false;
        try {
            writer.addDocument(d);
            writer.forceMerge(1);
            writer.commit();
            System.out.println("  10000       !");
            success = true;
        } finally {
            release(writer, success);
        }
    }

    /**
     * Deletes every document whose {@code id} field equals the given value,
     * merges the index down to one segment and commits.
     *
     * @param id value of the {@code id} term to delete
     * @throws Exception if the delete fails; the uncommitted change is rolled back
     */
    public static void delete(String id) throws Exception {
        IndexWriter writer = getIndexWriter();
        boolean success = false;
        try {
            writer.deleteDocuments(new Term("id", id));
            writer.forceMerge(1);
            writer.commit();
            System.out.println("id "+id+"         .........");
            success = true;
        } finally {
            // The original never closed this writer, leaking it and its lock.
            release(writer, success);
        }
    }

    /**
     * Searches the {@code city} field of the index at
     * {@link #SEARCH_INDEX_URI} twice with the same query string and prints
     * the hit count and elapsed milliseconds of each pass, so the first and
     * the repeated search can be compared.
     *
     * @param queryTerm query string in Lucene classic query-parser syntax
     * @throws Exception if the index cannot be opened or the query cannot be
     *         parsed or executed
     */
    public static void query(String queryTerm) throws Exception {
        System.out.println("      :  "+queryTerm);
        Configuration conf = new Configuration();
        Path path = new Path(SEARCH_INDEX_URI);
        Directory directory = new HdfsDirectory(path, conf);
        try {
            IndexReader reader = DirectoryReader.open(directory);
            try {
                System.out.println("    : "+reader.numDocs());
                // The first measurement deliberately includes searcher and
                // parser construction, as in the original demo.
                long firstStart = System.currentTimeMillis();
                IndexSearcher searcher = new IndexSearcher(reader);
                QueryParser parser = new QueryParser(Version.LUCENE_46, "city",
                        new SmartChineseAnalyzer(Version.LUCENE_46));
                searchAndReport(searcher, parser, queryTerm, firstStart);
                System.out.println("============================================");
                searchAndReport(searcher, parser, queryTerm, System.currentTimeMillis());
            } finally {
                reader.close();
            }
        } finally {
            directory.close();
        }
        System.out.println("    ...............");
    }

    /**
     * Parses and runs one search, then prints the total hit count and the
     * milliseconds elapsed since {@code startMillis}.
     */
    private static void searchAndReport(IndexSearcher searcher, QueryParser parser,
            String queryTerm, long startMillis) throws Exception {
        Query query = parser.parse(queryTerm);
        TopDocs docs = searcher.search(query, MAX_HITS);
        System.out.println("      :   "+docs.totalHits+"   " );
        // Uncomment to print every hit:
        // for(ScoreDoc sc:docs.scoreDocs){
        //     System.out.println("  :  "+sc.score+"  id : "+searcher.doc(sc.doc).get("id")+"  name:   "+searcher.doc(sc.doc).get("name")+"       : "+searcher.doc(sc.doc).get("content"));
        // }
        long endMillis = System.currentTimeMillis();
        System.out.println("     :"+(endMillis-startMillis)+"   ");
    }

    /**
     * Releases a writer: closes it after a successful run, otherwise rolls it
     * back so that changes made since the last commit are discarded.
     */
    private static void release(IndexWriter writer, boolean success) throws Exception {
        if (success) {
            writer.close();
        } else {
            writer.rollback();
        }
    }
}

上記は散仙がテストに使った例です。テストした結果、HDFS上のluceneインデックスの削除や変更に問題はありませんでした。luceneをhadoopと組み合わせると、確かにインデックス構築の速度は大幅に向上しますが、検索に関しては何の優位性もありません。検索自体は可能ですが、速度は遅くなります。現在のストレージ実装はblock cacheのキャッシュ特性を利用しており、検索性能の低下をある程度は抑えられますが、データ量が大きくなると検索性能は非常に悪くなります。luceneやsolrにHbaseのようなデータ構造が追加されない限り、検索についてはこの先も良い解決方法はなさそうです。上記のコードで、インデックスを1.xのhadoopに書き込むことができます。今後、散仙はhadoop 2.xでインデックスを構築する例と、MapReduceを使ってインデックスを並列に構築する方法を紹介する予定です。
転載の際は、必ず出典(元記事のURL)を明記してください。ご協力ありがとうございます。http://qindongliang1922.iteye.com/blog/2088076

10個の最も優秀なコードコメント

URLのパラメータでプラス記号がスペースになってしまう問題(URL特殊文字)