# Capturing MeSH keywords with Python multithreading


# -*- coding: utf-8 -*-
"""
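Sinomed MeSH subject-tree crawler          by: (author name lost)
======================================
Pass one root category code ('A','B',...,'TL','TN') to main to crawl it with one thread per sub-tree.
Results are saved as JSON; each record has the keys name, node, p_node:
node   : tree number of the entry (empty string for a leaf)
p_node : tree number of the parent entry
name   : display text of the entry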
"""
import urllib2
import cookielib
import json
import threading
import xml.etree.ElementTree as ET

# Output file for the JSON dump of every collected record.
SAVE_PATH = 'mesh.json'


# Scheme and host; the relative 'src' links found in the XML are appended to it.
url_prefix = 'http://www.sinomed.ac.cn'

# Listing endpoint; append a tree number to fetch the children of that node.
tree_root_prefix = (
    url_prefix
    + '/cross/subjectSearch.do'
    + '?method=xml&db=me_MESHE&treenumber='
)
# Root category codes accepted by thread_key().
tree_root_list = (
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'N', 'V', 'Z',
    'TA', 'TB', 'TC', 'TD', 'TE', 'TF', 'TG', 'TH', 'TI', 'TK', 'TL', 'TN',
)

# Shared accumulator: every get_tree() call appends its records here.
ResultContent = list()

# One cookie jar and one opener for the whole module; get_tree() sends all of
# its requests through this opener, so cookies set by the site are replayed.
cj = cookielib.CookieJar()
opener = urllib2.build_opener(
    urllib2.HTTPCookieProcessor(cookiejar=cj)
)

def get_tree(currentNode,url):
    '''Recursively crawl the XML listing at *url* into ResultContent.

    Every child element of the fetched XML root yields one record
    {'name', 'node', 'p_node'} appended to the module-level ResultContent:

    * name   -- the element's 'text' attribute
    * node   -- the part of the element's 'src' attribute that follows
                '&treenumber=' ('' when the element has no 'src', or when
                'src' carries no such parameter)
    * p_node -- currentNode, i.e. the node whose listing is being read

    Elements with a non-empty 'src' are followed recursively, using
    url_prefix + src as the next URL.

    currentNode -- tree number of the node whose children live at *url*
    url         -- absolute URL returning that node's XML listing

    Raises whatever the opener or the XML parser raises, and KeyError if
    an element lacks the 'text' attribute.
    '''
    request = urllib2.Request(url)
    response = opener.open(request)
    try:
        xml = response.read()
    finally:
        # Release the connection held by this response; closing the shared
        # opener would not do that.
        response.close()

    tree = ET.fromstring(xml)
    for e in tree:
        name = e.attrib['text']
        src = e.attrib.get('src')

        if not src:
            # No (or empty) 'src': nothing below this entry to fetch.
            ResultContent.append({'name':name, 'node':'', 'p_node':currentNode})
            continue

        # partition() gives '' when the marker is absent, instead of slicing
        # 'src' at an arbitrary offset.
        node = src.partition('&treenumber=')[2]
        # Record the parent before descending so it precedes its children.
        ResultContent.append({'name':name, 'node':node, 'p_node':currentNode})
        get_tree(node, url_prefix+src)

def thread_key(prefix):
    '''Return the sub-tree keys found directly under root category *prefix*.

    The root listing for *prefix* is downloaded and its child elements are
    counted; the result is [prefix+'01', prefix+'02', ...] with one key
    per child, zero-padded to at least two digits.
    NOTE(review): this assumes the children are numbered consecutively
    from 01 -- confirm against the service.

    prefix -- one of the codes in tree_root_list

    Raises AssertionError for an unknown prefix (the check is skipped when
    Python runs with -O), plus whatever urllib2 or the XML parser raises.
    '''
    assert prefix in tree_root_list
    response = urllib2.urlopen(tree_root_prefix+prefix)
    try:
        xml = response.read()
    finally:
        # Always release the HTTP connection, even if read() fails.
        response.close()
    tree = ET.fromstring(xml)
    return ['%s%02d' % (prefix, number) for number in range(1, len(tree)+1)]

def main(prefix):
    '''Crawl root category *prefix* using one thread per sub-tree.

    The sub-tree keys come from thread_key(prefix); each key gets its own
    thread running get_tree(key, tree_root_prefix + key). The call returns
    only after every thread has finished.
    '''
    sub_keys = thread_key(prefix)

    print('defining thread...')
    workers = [
        threading.Thread(target=get_tree, args=(sub_key, tree_root_prefix+sub_key))
        for sub_key in sub_keys
    ]
    print('done!')

    print('start...')
    for worker in workers:
        worker.start()

    # Block until every worker is done.
    for worker in workers:
        worker.join()

def write_to(content, file_path):
    '''Write *content* to *file_path*, replacing any existing file.

    content   -- the already-serialized text to store
    file_path -- destination path, opened in 'w' mode

    Progress messages are printed before and after the write. Raises
    IOError/OSError if the file cannot be opened or written.
    '''
    print('writing...')
    # The context manager closes the handle even when write() raises.
    with open(file_path, 'w') as f:
        f.write(content)
    print('done!')

if __name__ == '__main__':
    # Crawl every sub-tree of root category 'C'; the threads fill ResultContent.
    # (To crawl a single sub-tree instead: get_tree('C01', tree_root_prefix+'C01').)
    main('C')

    # Compact JSON with non-ASCII text kept as-is, then encoded to UTF-8 bytes.
    serialized = json.dumps(
        ResultContent,
        ensure_ascii=False,
        separators=(',', ':'),
    )
    write_to(serialized.encode('utf-8'), SAVE_PATH)