Elasticsearchのインストールと入門

22162 ワード

Elasticsearch      

 、    :

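  Node: a node is one ES instance; typically one node is deployed per host
  Cluster: a cluster is made up of several nodes; talking to any node is equivalent to talking to the cluster
  Shard: an index is split into multiple shards for storage; the shard count cannot be changed after the index is created
  Replica: a replica is a copy of a shard, kept to improve fault tolerance and search efficiency
  Index: comparable to a database
  Type: comparable to a database table
  Document: comparable to a database row; contains one or more Fields
  Field: the smallest unit of search; a Mapping can define its properties (for example whether it is searchable)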

Transport:       
discovery.zen:         p2p  
river:   
gateway:    


 、  :
1.    JDK,   JAVA_HOME    (export JAVA_HOME=.../java8),

  Java  
java -version

2.    Elasticsearch。
    :https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-5.0.1.zip

     :
     bin\elasticsearch.bat     
       http://localhost:9200/        

     :
  ,cmd  bin  :
service install
service start
       http://localhost:9200/        


  elasticsearch:
service stop

3.  head  
head          、       。
   https://github.com/mobz/elasticsearch-head   ,    ./plugins/head    。


cmd  \bin  ,
    :
plugin install mobz/elasticsearch-head


      http://localhost:9200/_plugin/head/   


 、  :

1)        
  /home/zkpk/elasticsearch-node1/config    elasticsearch.yml   

           :
#      
cluster.name: cswuyg_qa_pair_test

#  node  
node.name: xxx-node


#             master     。        3   。
discovery.zen.minimum_master_nodes: 2


#      
network.host: 192.168.137.100

#            ,   true。
discovery.zen.ping.multicast.enabled: false


#       ping      
discovery.zen.ping.timeout: 40s

#     ip  
network.bind_host: 192.168.137.100

#             ip  。
network.publish_host: 192.168.137.100

#    bind_host publish_host      。
network.host: 192.168.137.100

#           HTTP    
transport.tcp.port: 9302
http.port: 8302


#discovery.zen.ping.unicast.hosts:["  1  ip","  2  ip","  3 ip"]
#          master   ip,  es               。
discovery.zen.ping.unicast.hosts: ["192.168.137.100",  "192.168.137.101","192.168.137.100:9301"]


#        
index.analysis.analyzer.ik.type: "ik"


2)        

   elasticsearch-node1      ,   elasticsearch-node2
        es  ,        transport.tcp.port  http.port             。
  elasticsearch.yml          :
node.name: "es-node3"
transport.tcp.port: 9301
http.port: 9201



  :

      ES     :export ES_HEAP_SIZE=10g。

elasticsearch -d          ,    ,          head        , :

http://host_name_xxx:port_yyy/_plugin/head/


 、  :
1.    index:
POST http://192.168.137.100:9200/test_index

     index:
GET  http://192.168.137.100:9200/_cat/indices?v


      
DELETE  http://192.168.137.100:9200/customer?pretty


2.    :
POST http://xxxhost:8201/qa_xx2/qa_xx3/1234

{
    "title":"cswuyg test",
    "content":"this is test by cswuyg。。。"
}

{
    "_index":"qa_xx2",
    "_type":"qa_xx3",
    "_id":"1234",
    "_version":1,
    "_shards":{
    "total":2,
    "successful":2,
    "failed":0
    },
    "created":true

}


3.    :

(1)  id   :
GET http://xxxhost:8201/qa_xx2/qa_xx3/1235

(2) DSL search:
   url POST    :
URL  :http://xxxhost:8201/qa_xx2/qa_xx3/_search


a.   title    cswuyg     。Highlight        。POST   body:
{
    "query": {
        "match": {
            "title": {
                "query": "cswuyg "
            }
        }
    },
    "highlight": {
        "fields": {
            "title": {

            }
        }
    }
}

b. bool    ,      title       “  ”、“  ”、“  ”,       ,            。“  ”      。
POST   body:


           ,   match   match_phrase。


  :        “  ” “  ",“  ”      


c.         (boost)。POST   body:


d. filter  ,   kv  ,           ,title       ,               ,    ,             , “    ”,     “  ” “  ”,  filter         “  ”  “  ”    。POST   body:


e.         ,    “   ”               ,            (match    )。POST   body:


 (3)  
a.     ,           ,        

PUT http://192.168.137.100:8202/qa_pair2/_settings
{
   "number_of_replicas" : 0
}



4.  :  ik      

Elasticsearch              ,         ,          。

  github  :https://github.com/medcl/elasticsearch-analysis-ik
    :       ,   windows      , elasticsearch-analysis-ik-1.9.3.zip   linux   ./plugin/head     。
  :     ./config/elasticsearch.yml      : index.analysis.analyzer.ik.type: "ik"

  ik  :http://host_name_xx:port_yyy/qa_pair/_analyze?analyzer=ik&pretty=true&text=     "



5.  python  Elasticsearch
  pip  elasticsearch


1)            :
#!/home/work/bin/python
#-*-coding:utf8-*-
"""
Bulk-import the lines of a text file into ES.
Usage: python insert_demo.py xxx_file_name

Authors: cswuyg
Date: 2016.06.18
"""

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from elasticsearch import exceptions
import traceback
import datetime
import sys

# Python 2 only: reload() brings back sys.setdefaultencoding (it is removed
# from the module at interpreter startup) so that the implicit str/unicode
# conversions use UTF-8 instead of ASCII.
reload(sys)
sys.setdefaultencoding('utf-8')

# Create the index together with its settings and mappings
def _create_index(es, index_name="cswuyg", doc_type_name="cswuyg"):
    my_settingss = {
            'number_of_shards': 18,
            'number_of_replicas': 0
            }
    my_mappings = {
            "cswuyg": {
                '_all': {
                    'enabled': 'false'
                    },
                "properties": {
                    "title": {
                        'type': 'string',
                        'store': 'no',
                        'term_vector': 'with_positions_offsets',
                        'analyzer': 'ik_max_word',
                        'boost': 8
                        },
                    "url": {
                        "type": "string",
                        'index': 'not_analyzed'
                        },
                    'content': {
                        'type': 'string',
                        'store': 'no',
                        'term_vector': 'with_positions_offsets',
                        'analyzer': 'ik_max_word',
                        'boost': 8
                        }
                    }
                }
            }
    settings = {
            'settings': my_settingss, 
            'mappings': my_mappings
            }

    create_index = es.indices.create(index=index_name, body=settings)

# Read the input file and bulk-index its lines into ES
def _save_data(es, input_file):
    """Bulk-index every line of ``input_file`` into the 'cswuyg' index.

    Each line is stored unmodified (trailing newline included) as the
    ``title`` of a new document of type 'cswuyg'.  Documents are sent in
    batches of 100; a progress line is printed after every batch and a final
    line once the whole file has been sent.

    Args:
        es: Elasticsearch client handed to ``bulk``.
        input_file: path of the text file to import.

    ``bulk`` is called with ``raise_on_error=True``, so indexing errors are
    not swallowed here and reach the caller.
    """
    pending = []
    count = 0
    with open(input_file) as f_r:
        for line in f_r:
            count += 1
            pending.append({
                '_index': 'cswuyg',
                '_type': 'cswuyg',
                '_source': {
                    'title': line
                    }
                })
            if len(pending) == 100:
                bulk(es, pending, index='cswuyg', raise_on_error=True)
                pending = []
                print('{1}: finish {0}'.format(count, input_file))
    # Flush whatever is left over from the last, incomplete batch.
    if pending:
        bulk(es, pending, index='cswuyg', raise_on_error=True)
        print('{1}: finish {0}'.format(count, input_file))
    print('{0}: finish all'.format(input_file))

def _insert_data(es, file_name):
    """Import ``file_name`` through ``_save_data`` and print the elapsed time."""
    began_at = datetime.datetime.now()
    _save_data(es, file_name)
    elapsed = datetime.datetime.now() - began_at
    print('all cost time{0}'.format(elapsed))

def _main():
    """Script entry point: create the index, then import the given file.

    Expects exactly one command-line argument, the path of the file to
    import.  A ``RequestError`` from index creation is printed and the
    import still proceeds (presumably so that an already existing index
    does not abort the run -- confirm).
    """
    args = sys.argv[1:]
    if len(args) != 1:
        print('need file argument')
        return
    es = Elasticsearch(hosts=["10.200.100.80:8301"], timeout=500)
    try:
        _create_index(es)
    except exceptions.RequestError:
        print(traceback.format_exc())
    _insert_data(es, args[0])

# Run the import only when this file is executed as a script.
if __name__ == '__main__':
    _main()

        :
     
    


2)    (          :        Term):

#!/home/work/bin/python
#-*-coding:utf8-*-
"""
Search ES.

Overview:
    Each line of the input file describes one query whose terms are split
    into "must" terms and "should" terms, every term carrying its own score.
    Must terms are searched with match_phrase, should terms with match, and
    both groups are combined in a bool query, boosted by the term scores.

Usage: python search_demo.py test_file
output:
es_query
query\ttitle\tall_score

output demo:
{'query': {'bool': {'should': [{'match': {'title': {'query': '\xe6\x88\x91', 'boost': 0.2}}}], 'must': [{'match_phrase': {'title': {'query': '\xe4\xb8\xad\xe5\x9b\xbd', 'boost': 0.69}}}, {'match_phrase': {'title': {'query': '\xe7\x88\xb1', 'boost': 0.11}}}]}}}
<query>\t<title 1>\t{"should": ["我"], "score": {"我": 0.2, "中国": 0.69, "爱": 0.11}, "must": ["中国", "爱"]}
<query>\t<title 2>\t{"should": ["我"], "score": {"我": 0.2, "中国": 0.69, "爱": 0.11}, "must": ["中国", "爱"]}
Authors: cswuyg
Date: 2016.06.18
"""
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from elasticsearch import exceptions
import sys
import json
from log import logger
import traceback

# Python 2 only: reload() brings back sys.setdefaultencoding (it is removed
# from the module at interpreter startup) so that the implicit str/unicode
# conversions use UTF-8 instead of ASCII.
reload(sys)
sys.setdefaultencoding('utf-8')

def _release_es_query_by_file_info(file_info):
    """Build an ES bool query from one tab-separated line of the input file.

    Must terms become ``match_phrase`` clauses and should terms become
    ``match`` clauses on ``title``; each clause is boosted by the score found
    for that term in the JSON score column.

    Args:
        file_info: one input line, already stripped of its line ending.

    Returns:
        A tuple ``(raw_query, es_query, summary)`` where ``es_query`` is the
        request body for the search and ``summary`` is a JSON string holding
        the must terms, the should terms and the score map.

    Raises:
        IndexError: the line has fewer than six tab-separated columns.
        ValueError: the score column is not valid JSON.
        KeyError: a term has no entry in the score map.
    """
    # Input format: raw_query\tmust_term\tshould_term\tall_score
    # NOTE(review): the columns actually read are 0, 3, 4 and 5, i.e. at least
    # six columns are required although the format above lists four --
    # confirm against the real input file.
    logger.debug('file_info:{0}'.format(file_info))
    columns = file_info.split('\t')
    print(columns)
    raw_query = columns[0]
    # split() without an argument ignores repeated or missing separators, so
    # an empty term field yields no clause instead of an '' term that would
    # fail the score lookup; it also keeps the query and the summary in sync.
    must_terms = columns[3].split()
    should_terms = columns[4].split()
    json_score = json.loads(columns[5], encoding='utf-8')

    summary = {}
    summary['must'] = must_terms
    summary['should'] = should_terms
    summary['score'] = json_score

    bool_query = {
        'must': [
            {'match_phrase': {'title': {'query': term, 'boost': json_score[unicode(term)]}}}
            for term in must_terms
        ],
        'should': [
            {'match': {'title': {'query': term, 'boost': json_score[unicode(term)]}}}
            for term in should_terms
        ],
    }

    es_query = {'query': {'bool': bool_query}}
    print(es_query)
    return raw_query, es_query, json.dumps(summary, ensure_ascii=False)

def _do_query_use_file_info(es, file_info):
    """Run the query described by one input line and print one row per hit.

    Each printed row is ``raw_query<TAB>title<TAB>all_score``.  When the
    search returns no hits a single row with an empty title is printed.

    Args:
        es: Elasticsearch client used for the search.
        file_info: one raw line of the input file.
    """
    raw_query, query, all_score = _release_es_query_by_file_info(file_info.strip('\r\n'))
    res = es.search(index='cswuyg', doc_type='cswuyg', body=query, size=100)
    hits = res['hits']['hits']
    if not hits:
        logger.debug('len(res["hits"]["hits"]) == 0')
        print("{0}\t{1}\t{2}".format(raw_query, "", all_score))
        return
    for item in hits:
        try:
            print("{0}\t{1}\t{2}".format(raw_query, item['_source']['title'].strip('\r\n'), all_score))
        except Exception:
            # Best effort: log the hit that could not be printed and go on.
            logger.debug(traceback.format_exc())
            logger.debug(item['_source']['title'])
    # NOTE(review): the original listing lost its line breaks; this separator
    # is assumed to follow the loop (once per query) -- confirm.
    print('\r\n')


def _main():
    """Script entry point: run every line of the file given as argv[1] as a query."""
    if len(sys.argv) != 2:
        print('argv error')
        return
    print('argv[1] = {0}'.format(sys.argv[1]))
    es = Elasticsearch(hosts=["10.200.100.80:8301"], timeout=5000)
    with open(sys.argv[1]) as f_r:
        for item in f_r:
            try:
                _do_query_use_file_info(es, item)
            except Exception:
                # One failing line must not stop the remaining queries.
                logger.debug(traceback.format_exc())


if __name__ == '__main__':
    _main()

# Text that followed the listing on the same line in the article (its labels
# were lost in extraction; the JSON is restored from the escaped demo output
# in the module docstring):
# {"我": 0.20, "中国": 0.69, "爱": 0.11}
# Links (presumably the article's references):
# http://blog.csdn.net/sinat_28224453/article/details/51134978
# http://www.cnblogs.com/cswuyg/p/5651620.html
# http://blog.csdn.net/ebw123/article/details/46707559
# http://blog.csdn.net/napoay/article/details/52201558
# http://rockelixir.iteye.com/blog/1890879