ElasticSearch Notes (3): The Java API and Chinese Analysis with ES


[TOC]
pom.xml
The test project for the ES Java API is built with Maven; the dependencies it uses are listed below.

<dependency>
    <groupId>org.elasticsearch</groupId>
    <artifactId>elasticsearch</artifactId>
    <version>2.3.0</version>
</dependency>
<dependency>
    <groupId>com.fasterxml.jackson.core</groupId>
    <artifactId>jackson-databind</artifactId>
    <version>2.7.0</version>
</dependency>
<dependency>
    <groupId>org.dom4j</groupId>
    <artifactId>dom4j</artifactId>
    <version>2.0.0</version>
</dependency>
<dependency>
    <groupId>org.projectlombok</groupId>
    <artifactId>lombok</artifactId>
    <version>1.16.10</version>
</dependency>
Basic CRUD with the ES API
The tests use JUnit; the shared fields and the setUp method are shown below.
private TransportClient client;
private String index = "bigdata";   // name of the index to operate on: "bigdata"
private String type = "product";    // name of the type inside that index: "product"

@Before
public void setup() throws UnknownHostException {
    // specify the name of the ES cluster to join; without it the client cannot find the cluster
    Settings settings = Settings.builder().put("cluster.name", "bigdata-08-28").build();
    client = TransportClient.builder().settings(settings).build();
    TransportAddress ta1 = new InetSocketTransportAddress(InetAddress.getByName("uplooking01"), 9300);
    TransportAddress ta2 = new InetSocketTransportAddress(InetAddress.getByName("uplooking02"), 9300);
    TransportAddress ta3 = new InetSocketTransportAddress(InetAddress.getByName("uplooking03"), 9300);
    client.addTransportAddresses(ta1, ta2, ta3);
    /*settings = client.settings();
        Map asMap = settings.getAsMap();
        for(Map.Entry setting : asMap.entrySet()) {
            System.out.println(setting.getKey() + "::" + setting.getValue());
        }*/
}
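The client should also be closed once the tests finish. A minimal teardown mirroring the setup above (this @After method is not part of the original notes and needs the org.junit.After import):

@After
public void cleanUp() {
    // release the TransportClient's sockets and thread pools
    if (client != null) {
        client.close();
    }
}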
Adding a document: JSON
/**
     * Note: es supports four ways of building the document source:
     * 1. JSON
     * 2. Map
     * 3. Java Bean
     * 4. XContentBuilder
     *
     * 1. The JSON way
     */
@Test
public void testAddJSON() {
    String source = "{\"name\":\"sqoop\", \"author\": \"apache\", \"version\": \"1.4.6\"}";
    IndexResponse response = client.prepareIndex(index, type, "4").setSource(source).get();
    System.out.println(response.isCreated());
}
Adding a document: Map
/**
     * Adding a document:
     * 2. The Map way
     */
@Test
public void testAddMap() {
    Map<String, Object> source = new HashMap<String, Object>();
    source.put("name", "flume");
    source.put("author", "Cloudera");
    source.put("version", "1.8.0");
    IndexResponse response = client.prepareIndex(index, type, "5").setSource(source).get();
    System.out.println(response.isCreated());
}
Adding a document: Java Bean
/**
     * Adding a document:
     * 3. The Java Bean way
     *
     * The bean must be serialized to JSON first; passing the bean object itself fails with:
     * The number of object passed must be even but was [1]
     */
@Test
public void testAddObj() throws JsonProcessingException {
    Product product = new Product("kafka", "linkedIn", "0.10.0.1", "kafka.apache.org");
    ObjectMapper objectMapper = new ObjectMapper();
    String json = objectMapper.writeValueAsString(product);
    System.out.println(json);
    IndexResponse response = client.prepareIndex(index, type, "6").setSource(json).get();
    System.out.println(response.isCreated());
}
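The Product bean used above is not listed in these notes. A minimal sketch that matches the constructor call, using Lombok (already declared in the pom), might look like this; the field names become the JSON keys produced by Jackson:

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

@Data
@AllArgsConstructor
@NoArgsConstructor
public class Product {
    private String name;
    private String author;
    private String version;
    private String url;
}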
Adding a document: XContentBuilder
/**
     * Adding a document:
     * 4. The XContentBuilder way
     */
@Test
public void testAddXContentBuilder() throws IOException {
    XContentBuilder source = XContentFactory.jsonBuilder();
    source.startObject()
        .field("name", "redis")
        .field("author", "redis")
        .field("version", "3.2.0")
        .field("url", "redis.cn")
        .endObject();
    IndexResponse response = client.prepareIndex(index, type, "7").setSource(source).get();
    System.out.println(response.isCreated());
}
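To inspect the JSON the builder produces, it can be serialized before indexing. A one-line sketch that could be added inside the test above, right before prepareIndex (XContentBuilder#string throws IOException, which the test already declares):

    System.out.println(source.string());   // prints {"name":"redis","author":"redis","version":"3.2.0","url":"redis.cn"}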
Getting a document
/**
     * Get a single document by index, type and id
     */
@Test
public void testGet() {
    GetResponse response = client.prepareGet(index, type, "6").get();
    Map<String, Object> map = response.getSource();
    /*for(Map.Entry<String, Object> me : map.entrySet()) {
            System.out.println(me.getKey() + "=" + me.getValue());
        }*/
    // lambda expression, requires JDK 1.8+
    map.forEach((k, v) -> System.out.println(k + "=" + v));
    // map.keySet().forEach(key -> System.out.println(key + "xxx"));
}
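If the requested id does not exist, getSource() returns null. A small guard that would sit right after the prepareGet call above avoids a NullPointerException (a sketch; the message text is illustrative):

    if (!response.isExists()) {        // true only when the document was found
        System.out.println("document not found");
        return;
    }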
Updating a document
/**
     * The update is equivalent to the following curl command:
     * curl -XPOST http://uplooking01:9200/bigdata/product/AWA184kojrSrzszxL-Zs/_update -d' {"doc":{"name":"sqoop", "author":"apache"}}'
     *
     * Note: updates go through prepareUpdate, not prepareIndex
     */
@Test
public void testUpdate() throws Exception {
    /*String source = "{\"doc\":{\"url\": \"http://flume.apache.org\"}}";
        UpdateResponse response = client.prepareUpdate(index, type, "4").setSource(source.getBytes()).get();*/
    // another way: pass only the changed fields and hand them to setDoc
    String source = "{\"url\": \"http://flume.apache.org\"}";
    UpdateResponse response = client.prepareUpdate(index, type, "4").setDoc(source.getBytes()).get();
    System.out.println(response.getVersion());
}
Deleting a document
/**
     * Delete a document by id
     */
@Test
public void testDelete() {
    DeleteResponse response = client.prepareDelete(index, type, "5").get();
    System.out.println(response.getVersion());
}
Bulk operations
/**
     * Bulk operations
     */
@Test
public void testBulk() {
    IndexRequestBuilder indexRequestBuilder = client.prepareIndex(index, type, "8")
        .setSource("{\"name\":\"elasticsearch\", \"url\":\"http://www.elastic.co\"}");
    UpdateRequestBuilder updateRequestBuilder = client.prepareUpdate(index, type, "1").setDoc("{\"url\":\"http://hadoop.apache.org\"}");
    BulkRequestBuilder bulk = client.prepareBulk();
    BulkResponse bulkResponse = bulk.add(indexRequestBuilder).add(updateRequestBuilder).get();
    Iterator<BulkItemResponse> it = bulkResponse.iterator();
    while(it.hasNext()) {
        BulkItemResponse response = it.next();
        System.out.println(response.getId() + "" + response.getVersion());
    }
}
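A bulk request can partially fail while still returning normally. A short check that could follow the get() call above, using the response's failure helpers (a sketch):

    // report any items that failed instead of silently iterating over them
    if (bulkResponse.hasFailures()) {
        System.out.println(bulkResponse.buildFailureMessage());
    }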
Counting documents in an index
/**
     * Count the documents in an index
     */
@Test
public void testCount() {
    CountResponse response = client.prepareCount(index).get();
    System.out.println("     :" + response.getCount());
}
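prepareCount can also be restricted to a query. A sketch counting only the documents whose author is apache (the field and value come from the earlier inserts; setQuery on the count builder is assumed to be available in this client version):

@Test
public void testCountWithQuery() {
    CountResponse response = client.prepareCount(index)
        .setQuery(QueryBuilders.termQuery("author", "apache"))   // count only matching documents
        .get();
    System.out.println("matching documents: " + response.getCount());
}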
Advanced queries with the ES API
These tests are also based on JUnit; the setUp and showResult methods used throughout are listed first.
Shared fields and setUp:
private TransportClient client;
private String index = "bigdata";
private String type = "product";
private String[] indics = {"bigdata", "bank"};

@Before
public void setUp() throws UnknownHostException {
    Settings settings = Settings.builder().put("cluster.name", "bigdata-08-28").build();
    client = TransportClient.builder().settings(settings).build();
    TransportAddress ta1 = new InetSocketTransportAddress(InetAddress.getByName("uplooking01"), 9300);
    TransportAddress ta2 = new InetSocketTransportAddress(InetAddress.getByName("uplooking02"), 9300);
    TransportAddress ta3 = new InetSocketTransportAddress(InetAddress.getByName("uplooking03"), 9300);
    client.addTransportAddresses(ta1, ta2, ta3);
}
showResult:
/**
     * Print the hits contained in a search response
     * @param response
     */
private void showResult(SearchResponse response) {
    SearchHits searchHits = response.getHits();
    float maxScore = searchHits.getMaxScore();  // highest score among the matched documents
    System.out.println("maxScore: " + maxScore);
    long totalHits = searchHits.getTotalHits(); // total number of matching documents
    System.out.println("totalHits: " + totalHits);
    SearchHit[] hits = searchHits.getHits();    // the documents actually returned (current page)
    System.out.println("documents returned: " + hits.length);
    for (SearchHit hit : hits) {
        long version = hit.version();
        String id = hit.getId();
        String index = hit.getIndex();
        String type = hit.getType();
        float score = hit.getScore();
        System.out.println("===================================================");
        String source = hit.getSourceAsString();
        System.out.println("version: " + version);
        System.out.println("id: " + id);
        System.out.println("index: " + index);
        System.out.println("type: " + type);
        System.out.println("score: " + score);
        System.out.println("source: " + source);
    }
}
ES search types
There are four search types:
query and fetch (fastest; every shard returns size hits, so up to size × number-of-shards documents come back)
query then fetch (the default search type)
DFS query and fetch
DFS query then fetch (the most accurate scoring, but also the slowest)
The javadoc of the API describes them as follows:
/**
     * Same as {@link #QUERY_THEN_FETCH}, except for an initial scatter phase which goes and computes the distributed
     * term frequencies for more accurate scoring.
     */
DFS_QUERY_THEN_FETCH((byte) 0),
/**
     * The query is executed against all shards, but only enough information is returned (not the document content).
     * The results are then sorted and ranked, and based on it, only the relevant shards are asked for the actual
     * document content. The return number of hits is exactly as specified in size, since they are the only ones that
     * are fetched. This is very handy when the index has a lot of shards (not replicas, shard id groups).
     */
QUERY_THEN_FETCH((byte) 1),
/**
     * Same as {@link #QUERY_AND_FETCH}, except for an initial scatter phase which goes and computes the distributed
     * term frequencies for more accurate scoring.
     */
DFS_QUERY_AND_FETCH((byte) 2),
/**
     * The most naive (and possibly fastest) implementation is to simply execute the query on all relevant shards
     * and return the results. Each shard returns size results. Since each shard already returns size hits, this
     * type actually returns size times number of shards results back to the caller.
     */
QUERY_AND_FETCH((byte) 3),
About DFS:
What does DFS stand for?
D is Distributed, F is the frequency of terms and documents, and S is Scatter: before scoring, the term and
document frequencies are gathered from every shard and scattered back, so scoring uses cluster-wide statistics.

Why is this initial scatter phase needed?
When es scores a hit, each shard by default only knows its own local term frequencies, so the same document can be
scored differently depending on which shard holds it, and the overall ranking can be slightly off. With the DFS
variants the distributed frequencies are collected first and the scores are computed against the whole index, which
gives the most accurate ranking; DFS_QUERY_THEN_FETCH therefore adds one more phase (three instead of two) and is
the slowest option. In short, DFS trades performance for scoring accuracy.
Summary:
For speed, QUERY_AND_FETCH is the fastest and DFS_QUERY_THEN_FETCH the slowest; for scoring accuracy, the DFS
variants are more precise than their non-DFS counterparts.
Exact query
/**
     * 1. Exact query
     * termQuery
     * a term query matches the exact term; the query string is not analyzed
     */
@Test
public void testSearch1() {
    SearchRequestBuilder searchQuery = client.prepareSearch(indics)    // prepareSearch() accepts one or more index names; the query runs against all of them
        .setSearchType(SearchType.DEFAULT)  // search type: QUERY_AND_FETCH / QUERY_THEN_FETCH / DFS_QUERY_AND_FETCH / DFS_QUERY_THEN_FETCH
        .setQuery(QueryBuilders.termQuery("author", "apache"))// the query itself; termQuery takes a field name and the exact value it must equal
        ;
    // the request is only executed when get() is called on the builder
    SearchResponse response = searchQuery.get();

    showResult(response);
}
Fuzzy query
/**
     * 2. Fuzzy query
     * prefixQuery
     */
@Test
public void testSearch2() {
    SearchResponse response = client.prepareSearch(indics).setSearchType(SearchType.QUERY_THEN_FETCH)
        .setQuery(QueryBuilders.prefixQuery("name", "h"))
        .get();
    showResult(response);
}
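prefixQuery only matches the beginning of a term. When the pattern can appear anywhere in the term, a wildcardQuery is an alternative; a sketch (the pattern is illustrative, and wildcard queries are more expensive than prefix queries):

@Test
public void testSearchWildcard() {
    SearchResponse response = client.prepareSearch(indics).setSearchType(SearchType.QUERY_THEN_FETCH)
        // * matches any character sequence, ? a single character
        .setQuery(QueryBuilders.wildcardQuery("name", "*doop"))
        .get();
    showResult(response);
}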
Paged query
/**
     * 3. Paged query
     *    query the bank index
     *    for accounts whose age is in (25, 35]
     *
     *    about paging:
     *    when no paging parameters are given,
     *    es returns the first 10 documents by default
     *
     *    e.g. to show page 4 with 10 documents per page:
     *        setFrom(30 = (4 - 1) * size)
     *        setSize(10)
     *    start offset of page N: (N - 1) * pageSize
     */
@Test
public void testSearch3() {
    // note the difference between QUERY_THEN_FETCH and QUERY_AND_FETCH: with size 10 and 5 shards, QUERY_AND_FETCH can return up to 50 documents
    SearchResponse response = client.prepareSearch(indics).setSearchType(SearchType.DFS_QUERY_THEN_FETCH)
        .setQuery(QueryBuilders.rangeQuery("age").gt(25).lte(35))
        // setFrom and setSize implement the paging
        .setFrom(0)
        .setSize(5)
        .get();
    showResult(response);
}
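Applying the formula from the comment above, fetching an arbitrary page only changes the from value. A sketch for page 4 with 10 documents per page (both numbers are illustrative):

@Test
public void testSearchPage4() {
    int page = 4;        // 1-based page number
    int pageSize = 10;   // documents per page
    SearchResponse response = client.prepareSearch(indics).setSearchType(SearchType.DFS_QUERY_THEN_FETCH)
        .setQuery(QueryBuilders.rangeQuery("age").gt(25).lte(35))
        .setFrom((page - 1) * pageSize)   // start offset = (N - 1) * pageSize
        .setSize(pageSize)
        .get();
    showResult(response);
}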
Highlighted query
/**
     * 4. Highlighted query
     *    requirement:
     *    search for apache in the author field or in the url field (name could be added the same way)
     *    author OR url ---> boolQuery with should clauses
     *    AND           ---> boolQuery with must clauses
     *    NOT           ---> boolQuery with mustNot clauses
     *    a match query could be used as well; note that match analyzes the keyword while term does not, so the results can differ
     */
@Test
public void testSearch4() {
    SearchResponse response = client.prepareSearch(indics).setSearchType(SearchType.DEFAULT)
        //                .setQuery(QueryBuilders.multiMatchQuery("apache", "author", "url"))
        //                .setQuery(QueryBuilders.regexpQuery("url", ".*apache.*"))
        //                .setQuery(QueryBuilders.termQuery("author", "apache"))
        .setQuery(QueryBuilders.boolQuery()
                  .should(QueryBuilders.regexpQuery("url", ".*apache.*"))
                  .should(QueryBuilders.termQuery("author", "apache")))
        // highlighting ---> the tags wrapped around every matched fragment (the HTML tags here are illustrative)
        .setHighlighterPreTags("<span style=\"color:red\">")
        .setHighlighterPostTags("</span>")
        // fields to highlight
        .addHighlightedField("author")
        .addHighlightedField("url")
        .get();
    SearchHits searchHits = response.getHits();
    float maxScore = searchHits.getMaxScore();  // highest score among the matched documents
    System.out.println("maxScore: " + maxScore);
    long totalHits = searchHits.getTotalHits(); // total number of matching documents
    System.out.println("totalHits: " + totalHits);
    SearchHit[] hits = searchHits.getHits();    // the documents actually returned (current page)
    System.out.println("documents returned: " + hits.length);
    for(SearchHit hit : hits) {
        System.out.println("========================================================");
        Map<String, HighlightField> highlightFields = hit.getHighlightFields();
        for(Map.Entry<String, HighlightField> me : highlightFields.entrySet()) {
            System.out.println("--------------------------------------");
            String key = me.getKey();
            HighlightField highlightField = me.getValue();
            String name = highlightField.getName();
            System.out.println("key: " + key + ", name: " + name);
            Text[] texts = highlightField.fragments();
            String value = "";
            for(Text text : texts) {
                // System.out.println("text: " + text.toString());
                value += text.toString();
            }
            System.out.println("value: " + value);
        }
    }
}
Sorted query
/**
     * 5. Sorted query
     *    same range query as in the paging example,
     *    sorted by balance in descending order
     */
@Test
public void testSearch5() {
    // note the difference between QUERY_THEN_FETCH and QUERY_AND_FETCH: with size 10 and 5 shards, QUERY_AND_FETCH can return up to 50 documents
    SearchResponse response = client.prepareSearch(indics).setSearchType(SearchType.DFS_QUERY_THEN_FETCH)
        .setQuery(QueryBuilders.rangeQuery("age").gt(25).lte(35))
        .addSort("balance", SortOrder.DESC)
        // setFrom and setSize implement the paging
        .setFrom(0)
        .setSize(5)
        .get();
    showResult(response);
}
Aggregation query: computing an average
/**
     * 6. Aggregation query: compute the average (and maximum) of a field
     */
@Test
public void testSearch6() {
    indics = new String[]{"bank"};
    // note the difference between QUERY_THEN_FETCH and QUERY_AND_FETCH: with size 10 and 5 shards, QUERY_AND_FETCH can return up to 50 documents
    SearchResponse response = client.prepareSearch(indics).setSearchType(SearchType.DFS_QUERY_THEN_FETCH)
        .setQuery(QueryBuilders.rangeQuery("age").gt(25).lte(35))
        /*
            analogous to SQL: select avg(age) as avg_name from person;
            the string passed to avg(...) is the alias (like avg_name) under which the result appears in the response
         */
        .addAggregation(AggregationBuilders.avg("avg_balance").field("balance"))
        .addAggregation(AggregationBuilders.max("max").field("balance"))
        .get();
    //        System.out.println(response);
    /*
        the aggregations appear in the response as:
            "aggregations" : {
                "max" : {
                  "value" : 49741.0
                },
                "avg_balance" : {
                  "value" : 25142.137373737372
                }
              }
        and a single aggregation as:
              {
                  "value" : 49741.0
              }
     */
    Aggregations aggregations = response.getAggregations();
    List<Aggregation> aggregationList = aggregations.asList();
    for(Aggregation aggregation : aggregationList) {
        System.out.println("========================================");
        String name = aggregation.getName();
        // Map map = aggregation.getMetaData();
        System.out.println("name: " + name);
        // System.out.println(map);
        Object obj = aggregation.getProperty("value");
        System.out.println(obj);
    }
    /*Aggregation avgBalance = aggregations.get("avg_balance");
        Object obj = avgBalance.getProperty("value");
        System.out.println(obj);*/
}
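Instead of the generic getProperty("value") loop above, the two aggregations can also be read through their typed result classes; a sketch assuming the metrics classes of this client version (org.elasticsearch.search.aggregations.metrics.avg.Avg and org.elasticsearch.search.aggregations.metrics.max.Max):

import org.elasticsearch.search.aggregations.metrics.avg.Avg;
import org.elasticsearch.search.aggregations.metrics.max.Max;

    // typed access, using the names passed to addAggregation above
    Avg avgBalance = response.getAggregations().get("avg_balance");
    Max maxBalance = response.getAggregations().get("max");
    System.out.println("avg balance: " + avgBalance.getValue());
    System.out.println("max balance: " + maxBalance.getValue());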
Integrating the IK Chinese analyzer into ES
If our data contains Chinese text we usually want searches to use Chinese word segmentation, but the analyzers ES inherits from Lucene do not handle Chinese well. Here we use the IK Chinese analyzer; the steps to integrate it into ES are the following:
  1) Download the source:
    https://github.com/medcl/elasticsearch-analysis-ik
  2) Build it with Maven (mvn clean install -DskipTests) (or mvn package)
  3) Unpack the zip found under target/releases into ES_HOME/plugins/analysis-ik on every node
  4) Copy the ik configuration directory conf/ik into ES_HOME/config
  5) Edit ES_HOME/config/elasticsearch.yml and add: index.analysis.analyzer.default.type: ik
  (this makes IK the default analyzer; it can also be configured per index or per field instead)
  6) Restart es
  7) Verify the analyzer (see the sketch right after this list)
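A quick way to verify the analyzer from the Java API is the indices analyze request; a sketch (the sample Chinese text is illustrative):

// ask the cluster to analyze a sample Chinese sentence with the ik analyzer
AnalyzeResponse response = client.admin().indices()
        .prepareAnalyze("中华人民共和国")
        .setAnalyzer("ik")
        .get();
for (AnalyzeResponse.AnalyzeToken token : response.getTokens()) {
    System.out.println(token.getTerm());   // with IK this prints whole words, not single characters
}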
Note that existing documents are not re-analyzed: any index that should use the IK analyzer has to be rebuilt and its data inserted again.
The test code is as follows.
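A minimal sketch of (re)creating such an index from the Java API, mapping a content field to the ik analyzer (the type and field names are illustrative; the chinese index name matches the one queried in the test class below):

// create an index whose "content" field is analyzed with IK at index and search time
client.admin().indices().prepareCreate("chinese")
    .addMapping("product",
        "{\"product\": {\"properties\": {"
            + "\"content\": {\"type\": \"string\", \"analyzer\": \"ik\", \"search_analyzer\": \"ik\"}"
            + "}}}")
    .get();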
package cn.xpleaf.bigdata.elasticsearch;

import org.elasticsearch.action.search.SearchRequestBuilder;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.SearchHits;
import org.elasticsearch.search.aggregations.Aggregation;
import org.elasticsearch.search.aggregations.AggregationBuilders;
import org.elasticsearch.search.aggregations.Aggregations;
import org.elasticsearch.search.highlight.HighlightField;
import org.elasticsearch.search.sort.SortOrder;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.List;
import java.util.Map;

/**
 * Using the Java API against the es cluster
 * Transport client
 * Test of Chinese (IK) analysis: the queries run against the "chinese" index
 * prepareSearch
 */
public class ElasticSearchTest3 {

    private TransportClient client;
    private String index = "bigdata";
    private String type = "product";
    private String[] indics = {"chinese"};

    @Before
    public void setUp() throws UnknownHostException {
        // connect to the cluster, same as in the previous test classes
        Settings settings = Settings.builder().put("cluster.name", "bigdata-08-28").build();
        client = TransportClient.builder().settings(settings).build();
        TransportAddress ta1 = new InetSocketTransportAddress(InetAddress.getByName("uplooking01"), 9300);
        TransportAddress ta2 = new InetSocketTransportAddress(InetAddress.getByName("uplooking02"), 9300);
        TransportAddress ta3 = new InetSocketTransportAddress(InetAddress.getByName("uplooking03"), 9300);
        client.addTransportAddresses(ta1, ta2, ta3);
    }

    /**
     * Querying Chinese content:
     * with the standard analyzer every Chinese character is indexed on its own, so a multi-character
     * Chinese term or prefix usually returns 0 hits, while an English sentence such as
     * "China is the greatest country~" is still matched; once the IK analyzer is installed and the
     * index is rebuilt, Chinese words are indexed as whole terms and the same queries match.
     */
    @Test
    public void testSearch1() {
        SearchResponse response = client.prepareSearch(indics)     // prepareSearch() accepts one or more index names
            .setSearchType(SearchType.DEFAULT)  // search type: QUERY_AND_FETCH / QUERY_THEN_FETCH / DFS_QUERY_AND_FETCH / DFS_QUERY_THEN_FETCH
            // .setQuery(QueryBuilders.regexpQuery("content", ".* .*"))
            .setQuery(QueryBuilders.prefixQuery("content", " "))    // prefix of the Chinese term to search for
            .get();
        showResult(response);
    }

    /**
     * Print the hits contained in a search response
     * @param response
     */
    private void showResult(SearchResponse response) {
        SearchHits searchHits = response.getHits();
        float maxScore = searchHits.getMaxScore();  // highest score among the matched documents
        System.out.println("maxScore: " + maxScore);
        long totalHits = searchHits.getTotalHits(); // total number of matching documents
        System.out.println("totalHits: " + totalHits);
        SearchHit[] hits = searchHits.getHits();    // the documents actually returned (current page)
        System.out.println("documents returned: " + hits.length);
        for (SearchHit hit : hits) {
            long version = hit.version();
            String id = hit.getId();
            String index = hit.getIndex();
            String type = hit.getType();
            float score = hit.getScore();
            System.out.println("===================================================");
            String source = hit.getSourceAsString();
            System.out.println("version: " + version);
            System.out.println("id: " + id);
            System.out.println("index: " + index);
            System.out.println("type: " + type);
            System.out.println("score: " + score);
            System.out.println("source: " + source);
        }
    }

    @After
    public void cleanUp() {
        client.close();
    }
}

The related test code has been uploaded to GitHub: https://github.com/xpleaf/elasticsearch-study