Installing and Getting Started with Elasticsearch
I. Basic Concepts:
Node: a single running ES instance; one physical server can host several nodes.
Cluster: a group of nodes sharing the same cluster name that together hold all of the data.
Shard: a horizontal slice of an index's data, which lets an index scale out across nodes.
Replica: a copy of a shard, used for failover and to increase read throughput.
Index: a collection of documents with similar characteristics, roughly comparable to a database.
Type: a logical category of documents within an index, roughly comparable to a table.
Document: the basic unit of indexable data; a document is made up of Fields.
Field: the smallest unit inside a document; the Mapping defines how each field is analyzed and stored (a concrete example follows this list).
Transport: the module nodes use to communicate with each other inside the cluster.
discovery.zen: the built-in p2p mechanism nodes use to discover each other and elect a master.
river: a plugin mechanism for pulling data from an external source into ES.
gateway: how ES persists index snapshots so that indices can be recovered when the cluster restarts.
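To make the index/type/document/field hierarchy concrete, here is a small example request; the index name blog, type article, id 1 and the two fields are illustrative only, not taken from the article:
POST http://localhost:9200/blog/article/1
{
    "title": "hello elasticsearch",
    "views": 10
}
Here blog is the index, article is the type, the JSON body is one document, and title and views are its fields; the index's mapping decides how those fields are analyzed and stored.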
II. Installation and Startup:
1. Install the JDK and set JAVA_HOME (e.g. export JAVA_HOME=.../java8), then check the Java version:
java -version
2. Install Elasticsearch.
Download: https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-5.0.1.zip
Unpack it and start it:
bin\elasticsearch.bat
Check that http://localhost:9200/ responds.
To run it as a Windows service instead, open cmd in the bin directory and run:
service install
service start
Then check http://localhost:9200/ again.
To stop the elasticsearch service:
service stop
3. Install the head plugin
head provides a web front end for browsing and managing the cluster.
Either download https://github.com/mobz/elasticsearch-head and unpack it into ./plugins/head,
or open cmd in the \bin directory and run:
plugin install mobz/elasticsearch-head
Then open http://localhost:9200/_plugin/head/
III. Cluster Configuration:
1) Configure the first node
Edit elasticsearch.yml under /home/zkpk/elasticsearch-node1/config.
The key settings:
# cluster name; all nodes of one cluster must use the same value
cluster.name: cswuyg_qa_pair_test
# node name
node.name: xxx-node
# minimum number of master-eligible nodes needed to elect a master; for a 3-node cluster set it to 2
discovery.zen.minimum_master_nodes: 2
# ip address of this node
network.host: 192.168.137.100
# disable multicast discovery (defaults to true)
discovery.zen.ping.multicast.enabled: false
# discovery ping timeout
discovery.zen.ping.timeout: 40s
# ip address to bind to
network.bind_host: 192.168.137.100
# ip address published to the other nodes
network.publish_host: 192.168.137.100
# network.host sets bind_host and publish_host at the same time
network.host: 192.168.137.100
# transport (node-to-node) port and HTTP port
transport.tcp.port: 9302
http.port: 8302
#discovery.zen.ping.unicast.hosts: ["node 1 ip", "node 2 ip", "node 3 ip"]
# addresses of the master-eligible nodes; new es nodes join the cluster through them
discovery.zen.ping.unicast.hosts: ["192.168.137.100", "192.168.137.101","192.168.137.100:9301"]
# register the ik analyzer
index.analysis.analyzer.ik.type: "ik"
2) Configure the remaining nodes
Copy elasticsearch-node1 to elasticsearch-node2 (and so on for each additional node).
For extra es nodes only the node name, transport.tcp.port and http.port need to differ.
For example, in the copy's elasticsearch.yml:
node.name: "es-node3"
transport.tcp.port: 9301
http.port: 9201
Notes:
The ES heap size can be set before startup with: export ES_HEAP_SIZE=10g.
Start each node with elasticsearch -d; once all nodes are up, open the head plugin to inspect the cluster:
http://host_name_xxx:port_yyy/_plugin/head/
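To confirm that the nodes really formed one cluster, the _cat APIs can be queried on any node's HTTP port (a sketch; host and port stand in for the values configured above):
GET http://host_name_xxx:port_yyy/_cat/health?v
GET http://host_name_xxx:port_yyy/_cat/nodes?v
The health line should show the configured cluster.name with green or yellow status, and the nodes line should list every node that was started.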
IV. Basic Usage:
1. Create an index:
POST http://192.168.137.100:9200/test_index
List indices:
GET http://192.168.137.100:9200/_cat/indices?v
Delete an index:
DELETE http://192.168.137.100:9200/customer?pretty
2. Insert a document:
POST http://xxxhost:8201/qa_xx2/qa_xx3/1234
{
    "title": "cswuyg test",
    "content": "this is test by cswuyg。。。"
}
Response:
{
    "_index": "qa_xx2",
    "_type": "qa_xx3",
    "_id": "1234",
    "_version": 1,
    "_shards": {
        "total": 2,
        "successful": 2,
        "failed": 0
    },
    "created": true
}
3. Query documents:
(1) Get a document by id:
GET http://xxxhost:8201/qa_xx2/qa_xx3/1235
(2) Query with the query DSL:
Send the query JSON as the body of a POST request to the _search URL:
URL: http://xxxhost:8201/qa_xx2/qa_xx3/_search
a. Match documents whose title contains "cswuyg", with highlighting on the title field. POST body:
{
    "query": {
        "match": {
            "title": {
                "query": "cswuyg"
            }
        }
    },
    "highlight": {
        "fields": {
            "title": {
            }
        }
    }
}
b. Bool query: terms that must appear in the title go into a must clause, optional terms go into a should clause; documents that also contain the optional terms score higher.
Note that required terms use match_phrase while optional terms use match: match analyzes the query text and hits documents that contain any of the resulting terms, whereas match_phrase requires the terms to appear together as a phrase.
POST body:
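A minimal sketch of such a body, with hypothetical terms term_a and term_b required and term_c optional:
{
    "query": {
        "bool": {
            "must": [
                { "match_phrase": { "title": "term_a" } },
                { "match_phrase": { "title": "term_b" } }
            ],
            "should": [
                { "match": { "title": "term_c" } }
            ]
        }
    }
}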
c. Weight the individual query terms with boost. POST body:
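A minimal sketch, reusing the same hypothetical terms and attaching a boost to each clause:
{
    "query": {
        "bool": {
            "must": [
                { "match_phrase": { "title": { "query": "term_a", "boost": 2 } } },
                { "match_phrase": { "title": { "query": "term_b", "boost": 1 } } }
            ],
            "should": [
                { "match": { "title": { "query": "term_c", "boost": 0.5 } } }
            ]
        }
    }
}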
d. Add a filter clause to the bool query: a filter matches documents exactly and does not contribute to the relevance score, so it narrows the result set without changing the ranking of the remaining matches. POST body:
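A minimal sketch, scoring on one hypothetical term while filtering on another; the filter clause selects documents but does not change their score:
{
    "query": {
        "bool": {
            "must": [
                { "match": { "title": "term_a" } }
            ],
            "filter": [
                { "term": { "title": "term_b" } }
            ]
        }
    }
}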
e. A looser variant that does not require the terms to appear as an exact phrase (plain match query). POST body:
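A minimal sketch, a plain match on a hypothetical term:
{
    "query": {
        "match": {
            "title": "term_a"
        }
    }
}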
(3) Update index settings
a. For example, change the number of replicas to 0:
PUT http://192.168.137.100:8202/qa_pair2/_settings
{
"number_of_replicas" : 0
}
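The change can be verified by reading the settings back (same host and index as above):
GET http://192.168.137.100:8202/qa_pair2/_settings
The response should report "number_of_replicas": "0" for the index.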
4. Chinese analysis: the ik plugin
Elasticsearch's built-in analyzers do not segment Chinese text well, so a dedicated Chinese analyzer such as ik is needed.
GitHub project: https://github.com/medcl/elasticsearch-analysis-ik
Installation: obtain the plugin package (e.g. elasticsearch-analysis-ik-1.9.3.zip) and unpack it into the plugins directory on the linux machine, the same way the head plugin was installed.
Configuration: add the following to ./config/elasticsearch.yml: index.analysis.analyzer.ik.type: "ik"
Test the ik analyzer: http://host_name_xx:port_yyy/qa_pair/_analyze?analyzer=ik&pretty=true&text= followed by the text to analyze.
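The analyzer can also be exercised with a request body instead of URL parameters (a sketch; the sample text is illustrative):
GET http://host_name_xx:port_yyy/qa_pair/_analyze
{
    "analyzer": "ik",
    "text": "中华人民共和国"
}
The response lists the tokens that the ik analyzer produced for the text.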
5. Using Elasticsearch from python
Install the client: pip install elasticsearch
1) Insert data:
#!/home/work/bin/python
# -*- coding: utf8 -*-
"""
Read a file and bulk-insert its lines into es as documents.
Usage: python insert_demo.py xxx_file_name
Authors: cswuyg
Date: 2016.06.18
"""
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from elasticsearch import exceptions
import traceback
import datetime
import sys
reload(sys)
sys.setdefaultencoding('utf-8')


# Create the index with explicit settings and mappings.
def _create_index(es, index_name="cswuyg", doc_type_name="cswuyg"):
    my_settings = {
        'number_of_shards': 18,
        'number_of_replicas': 0
    }
    my_mappings = {
        "cswuyg": {
            '_all': {
                'enabled': 'false'
            },
            "properties": {
                "title": {
                    'type': 'string',
                    'store': 'no',
                    'term_vector': 'with_positions_offsets',
                    'analyzer': 'ik_max_word',
                    'boost': 8
                },
                "url": {
                    "type": "string",
                    'index': 'not_analyzed'
                },
                'content': {
                    'type': 'string',
                    'store': 'no',
                    'term_vector': 'with_positions_offsets',
                    'analyzer': 'ik_max_word',
                    'boost': 8
                }
            }
        }
    }
    settings = {
        'settings': my_settings,
        'mappings': my_mappings
    }
    create_index = es.indices.create(index=index_name, body=settings)


# Read the input file and bulk-insert its lines into es.
def _save_data(es, input_file):
    # Buffer documents and flush a bulk request every 100 lines.
    all_data = list()
    count = 0
    with open(input_file) as f_r:
        for line in f_r:
            count += 1
            all_data.append({
                '_index': 'cswuyg',
                '_type': 'cswuyg',
                '_source': {
                    'title': line
                }
            })
            if len(all_data) == 100:
                success, _ = bulk(es, all_data, index='cswuyg', raise_on_error=True)
                all_data = list()
                print('{1}: finish {0}'.format(count, input_file))
    if len(all_data) != 0:
        success, _ = bulk(es, all_data, index='cswuyg', raise_on_error=True)
        all_data = list()
        print('{1}: finish {0}'.format(count, input_file))
    print('{0}: finish all'.format(input_file))


def _insert_data(es, file_name):
    start_time = datetime.datetime.now()
    _save_data(es, file_name)
    cost_time = datetime.datetime.now() - start_time
    print('all cost time{0}'.format(cost_time))


def _main():
    if len(sys.argv) != 2:
        print('need file argument')
        return
    es = Elasticsearch(hosts=["10.200.100.80:8301"], timeout=500)
    try:
        _create_index(es)
    except exceptions.RequestError:
        print(traceback.format_exc())
    _insert_data(es, sys.argv[1])


if __name__ == '__main__':
    _main()
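After the script finishes, the number of indexed documents can be checked directly against the cluster (a sketch; host and index match the script above):
GET http://10.200.100.80:8301/cswuyg/cswuyg/_count
The count field of the response should equal the number of lines in the input file.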
2) Query (weighted queries built from Terms):
#!/home/work/bin/python
# -*- coding: utf8 -*-
"""
Query es with weighted bool queries built from an input file.
Each line supplies a raw query, its must terms, should terms and per-term scores;
must terms are queried with match_phrase, should terms with match, combined in a bool query.
Usage: python search_demo.py test_file
output:
    es_query
    query\ttitle\tall_score
output demo:
    {'query': {'bool': {'should': [{'match': {'title': {'query': '\xe6\x88\x91', 'boost': 0.2}}}], 'must': [{'match_phrase': {'title': {'query': '\xe4\xb8\xad\xe5\x9b\xbd', 'boost': 0.69}}}, {'match_phrase': {'title': {'query': '\xe7\x88\xb1', 'boost': 0.11}}}]}}}
    {"should": ["我"], "score": {"我": 0.2, "中国": 0.69, "爱": 0.11}, "must": ["中国", "爱"]}
Authors: cswuyg
Date: 2016.06.18
"""
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from elasticsearch import exceptions
import sys
import json
from log import logger  # project-local logging helper
import traceback
reload(sys)
sys.setdefaultencoding('utf-8')


def _release_es_query_by_file_info(file_info):
    # Input line format: raw_query\t...\t...\tmust_term\tshould_term\tall_score
    # (only columns 1, 4, 5 and 6 are used below).
    logger.debug('file_info:{0}'.format(file_info))
    file_info_list = file_info.split('\t')
    print file_info_list
    raw_query = file_info_list[0]
    must_term = file_info_list[3]
    should_term = file_info_list[4]
    all_score = file_info_list[5]
    json_score = json.loads(all_score, encoding='utf-8')
    ret_obj = {}
    ret_obj['must'] = must_term.split()
    ret_obj['should'] = should_term.split()
    ret_obj['score'] = json_score
    bool_query = dict()
    must_query = list()
    should_query = list()
    # must terms: exact phrase match, weighted with the per-term score
    for item in must_term.split(' '):
        must_query.append({'match_phrase': {'title': {'query': item, 'boost': json_score[unicode(item)]}}})
    bool_query['must'] = must_query
    # should terms: plain match, weighted with the per-term score
    for item in should_term.split(' '):
        should_query.append({'match': {'title': {'query': item, 'boost': json_score[unicode(item)]}}})
    bool_query['should'] = should_query
    es_query = {'query': {'bool': bool_query}}
    print es_query
    return raw_query, es_query, json.dumps(ret_obj, ensure_ascii=False)


def _do_query_use_file_info(es, file_info):
    raw_query, query, all_score = _release_es_query_by_file_info(file_info.strip('\r\n'))
    res = es.search(index='cswuyg', doc_type='cswuyg', body=query, size=100)
    if len(res['hits']['hits']) == 0:
        logger.debug('len(res["hits"]["hits"]) == 0')
        print("{0}\t{1}\t{2}".format(raw_query, "", all_score))
        return
    for item in res['hits']['hits']:
        try:
            print("{0}\t{1}\t{2}".format(raw_query, item['_source']['title'].strip('\r\n'), all_score))
        except:
            logger.debug(traceback.format_exc())
            logger.debug(item['_source']['title'])
    print('\r\n')


def _main():
    if len(sys.argv) != 2:
        print('argv error')
        return
    else:
        print('argv[1] = {0}'.format(sys.argv[1]))
    es = Elasticsearch(hosts=["10.200.100.80:8301"], timeout=5000)
    with open(sys.argv[1]) as f_r:
        for item in f_r:
            try:
                _do_query_use_file_info(es, item)
            except:
                logger.debug(traceback.format_exc())


if __name__ == '__main__':
    _main()
The all_score field in each input line is a JSON map from term to weight, for example:
{"我": 0.20, "中国": 0.69, "爱": 0.11}
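Putting it together, a full input line for search_demo.py is tab-separated; the script only reads columns 1, 4, 5 and 6, so the second and third columns are placeholders here and the raw query is illustrative:
我爱中国\tunused\tunused\t中国 爱\t我\t{"我": 0.20, "中国": 0.69, "爱": 0.11}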