Sphinx: Python と xmlpipe2 データソースを使用してインデックスを生成する

3897 ワード

# Data source "testxml": an xmlpipe2 source, i.e. the documents come from the
# output of the command configured below.
source testxml
{
    type = xmlpipe2
    # Command that is run to produce the XML stream (the Python script shown
    # further down is expected to live at /tmp/testx.py).
    xmlpipe_command = python /tmp/testx.py
}

# Index "testxml": built from the "testxml" source; index files are written
# under the /data/sphinx/testxml path prefix.
index testxml
{
        source                                  = testxml
        path                                    = /data/sphinx/testxml
        docinfo                                 = extern
        morphology                              = none
        min_word_len                            = 1
        charset_type                            = utf-8
        min_prefix_len                          = 0
        html_strip                              = 1
        # Accepted characters: digits, ASCII letters (upper case folded to
        # lower case), underscore, and U+410..U+44F (upper range folded to
        # the lower range).
        charset_table                           = 0..9, A..Z->a..z, _, a..z, U+410..U+42F->U+430..U+44F, U+430..U+44F
        # Characters in U+3000..U+2FA1F are indexed as 1-character n-grams
        # (presumably to support CJK text without word segmentation -- confirm).
        ngram_len                               = 1
        ngram_chars                             = U+3000..U+2FA1F
}

# Settings for the indexer program: memory limit of 128M.
indexer
{
        mem_limit                               = 128M
}

# Settings for the searchd daemon: listening port, log / query-log / pid file
# locations under /data/log/sphinxsearch, and runtime limits.
searchd
{
        port                                    = 4412
        log                                     = /data/log/sphinxsearch/searchd.log
        query_log                               = /data/log/sphinxsearch/query.log
        read_timeout                            = 5
        max_children                            = 30
        pid_file                                = /data/log/sphinxsearch/searchd.pid
        max_matches                             = 1000
        seamless_rotate                         = 1
        preopen_indexes                         = 0
        unlink_old                              = 1
}

Python コードは以下の通りです。
# coding=utf-8

from loxun import XmlWriter
from StringIO import StringIO
import pymssql

# Build a Sphinx xmlpipe2 document stream from the MapObjectInfo table and
# print it to stdout, where the indexer's xmlpipe_command picks it up.

# NOTE(review): the database credentials are hard-coded below; consider
# loading them from the environment or from an unpublished config file.
conn = pymssql.connect(host=r'MyServer2k', user='citymap', password='city@map@com', database='CitycomeMap',as_dict=True,charset='utf8')

cur = conn.cursor()

# The XML is accumulated in memory and printed in one go at the end.
out = StringIO()
xml = XmlWriter(out)

xml.addNamespace("sphinx","http://www.beihai365.com")
# --- docset
xml.startTag("sphinx:docset")
# --- schema
xml.startTag("sphinx:schema")
# Full-text fields.
xml.tag("sphinx:field",{"name":"myname"})
xml.tag("sphinx:field",{"name":"myaddress"})
xml.tag("sphinx:field",{"name":"mykeyword"})
# Integer attribute.
xml.tag("sphinx:attr",{"name":"AID","type":"int"})
xml.endTag()
# --- /schema

# (document child element, source column) pairs written for every row.
_FIELD_COLUMNS = (
    ("myname", 'Name'),
    ("myaddress", 'Address'),
    ("mykeyword", 'Keyword'),
)

# --- documents
try:
    cur.execute('SELECT COUNT(*) FROM MapObjectInfo')
    tj = cur.fetchone()[0]  # total number of rows in the table

    pNum = 1000  # page size
    cutSqlNum = pNum  # number of rows requested for the current page
    _p = 1  # 1-based page number
    zNum = 0  # rows covered once the current page is fetched (pNum * _p)

    while True:
        # The last page may be shorter than a full page.
        if (tj - zNum) < pNum:
            cutSqlNum = tj - zNum

        zNum = pNum * _p
        # Paging via nested TOP: take the zNum highest IDs, keep the
        # cutSqlNum lowest of those, and return them in descending order.
        cur.execute('SELECT * FROM (select top %d* from (select top %d* from MapObjectInfo ORDER BY ID DESC)t1 ORDER BY ID)t2 ORDER BY ID DESC' % (cutSqlNum,zNum))

        row = cur.fetchone_asdict()
        while row:
            # Rows with ID 0 are skipped (presumably because Sphinx needs
            # non-zero document ids -- confirm).
            if row['ID'] != 0:
                xml.startTag("sphinx:document",{"id":row['ID']})
                for tag_name, column in _FIELD_COLUMNS:
                    xml.startTag(tag_name)
                    xml.text(row[column])
                    xml.endTag()
                xml.startTag("AID")
                xml.text(str(row['ID']))
                xml.endTag()
                xml.endTag()  # /sphinx:document
            row = cur.fetchone_asdict()

        _p = _p + 1
        # Stop once every row has been covered; ">=" avoids an extra empty
        # "top 0" query when the row count is an exact multiple of pNum.
        if zNum >= tj:
            break
finally:
    # Release the connection even if a query fails part-way through.
    conn.close()
# --- /documents

xml.endTag()
# --- /docset
xml.close()
print(out.getvalue())