Luceneのトークナイザー(分かち書き器)の分析
LuceneのAnalyzerクラスには主に次のメソッドが含まれています。
Analyzerを実装するクラスはすべてこのメソッドをオーバーライドし、自分で作成したTokenizer(TokenizerはTokenStreamを継承しています)を返します。分かち書きの中心となるのはTokenStreamが提供するnext()メソッドであり、TokenStreamを継承するすべてのクラスはこのnext()メソッドをオーバーライドして、独自のトークナイザーを実装しなければなりません。
Analyzerクラスの定義:
public abstract class Analyzer{
public abstract TokenStream tokenStream(String fieldName, Reader reader);
}
-------------------------------------------------------------------
public abstract class TokenStream {
......
public Token next(final Token reusableToken) throws IOException {
// We don't actually use inputToken, but still add this assert
assert reusableToken != null;
return next();
}
......
}
Analyzerを実装するクラスはすべてこのメソッドをオーバーライドし、自分で作成したTokenizer(TokenizerはTokenStreamを継承しています)を返します。分かち書きの中心となるのはTokenStreamが提供するnext()メソッドであり、TokenStreamを継承するすべてのクラスはこのnext()メソッドをオーバーライドして、独自のトークナイザーを実装しなければなりません。
// StandAnalyzer, Analyzer,StandAnalyzer tokenStream :
public class StandardAnalyzer extends Analyzer {
......
public TokenStream tokenStream(String fieldName, Reader reader) {
StandardTokenizer tokenStream = new StandardTokenizer(reader, replaceInvalidAcronym); // StandardTokenizer
tokenStream.setMaxTokenLength(maxTokenLength);
TokenStream result = new StandardFilter(tokenStream);
result = new LowerCaseFilter(result);
result = new StopFilter(result, stopSet);
return result;
}
}
-----------------------------------------------------------------
//StandardTokenizer next :
public class StandardTokenizer extends Tokenizer {
.......
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
int posIncr = 1;
while(true) {
int tokenType = scanner.getNextToken();
if (tokenType == StandardTokenizerImpl.YYEOF) {
return null;
}
if (scanner.yylength() <= maxTokenLength) {
reusableToken.clear();
reusableToken.setPositionIncrement(posIncr);
scanner.getText(reusableToken);
final int start = scanner.yychar();
reusableToken.setStartOffset(start);
reusableToken.setEndOffset(start+reusableToken.termLength());
// This 'if' should be removed in the next release. For now, it converts
// invalid acronyms to HOST. When removed, only the 'else' part should
// remain.
if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
if (replaceInvalidAcronym) {
reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
reusableToken.setTermLength(reusableToken.termLength() - 1); // remove extra '.'
} else {
reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
}
} else {
reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
}
return reusableToken;
} else
// When we skip a too-long term, we still increment the
// position increment
posIncr++;
}
}
......
}
テストコード:
public class TestAnalyzer {
final static String TEXT = "hadoop is very good,xina a tooken finished!";
@SuppressWarnings("deprecation")
public void testTokenizer() throws IOException{
Reader reader = new StringReader(TEXT);
StandardTokenizer tokenizer = new StandardTokenizer(reader);
for (Token t = tokenizer.next();t!=null;t = tokenizer.next()) {
System.out.println(t.startOffset() +"--" + t.endOffset() + ": "+ t.termText());
}
}
public static void main(String[] args) throws IOException {
TestAnalyzer t = new TestAnalyzer();
t.testTokenizer();
}
}
//lucene2.4.1 API :
TokenStream ts = analyzer.tokenStream("text", new StringReader(txt));
for(Token t= new Token(); (t=ts.next(t)) !=null;) {
System.out.println(t);
}