Python crawler: Sina Finance (新浪财经)


from pymongo import MongoClient
from requests_html import HTMLSession
import time
import random
from threading import Thread
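# third-party dependencies: pymongo and requests-html
# (pip install pymongo requests-html)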


session = HTMLSession()
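# pool of request headers; a random one is sent with each list-page request so
# the crawler does not present one fixed client fingerprint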
headers = [{'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
           'Accept-Encoding':'gzip, deflate, sdch',
           'Accept-Language':'zh-CN,zh;q=0.8',
           'Connection':'keep-alive',
           'Host':'vip.stock.finance.sina.com.cn',
           'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
           },
            {'user-agent' : "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)",},
           {'user-agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",},
           {'user-agent':'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0',},
           {'user-agent':'Mozilla/5.0 (iPad; CPU OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3',},
           ]
# stypes maps the first three digits of a stock code to its exchange prefix
# ('sh' = Shanghai, 'sz' = Shenzhen)
stypes = {'300': 'sz', '600': 'sh', '601': 'sh', '603': 'sh',
          '900': 'sh', '000': 'sz', '200': 'sz', '002': 'sz'}
# wztypes maps each category code that appears in Sina's URLs to a readable label
wztypes = {'AllNewsStock': 'stock news', 'stockIndustryNews': 'industry news',
           'FinManDiv': 'advisor opinions',
           'gzbc': 'share-reform supplement', 'gszc': 'articles of association',
           'gqfzgg': 'share-reform announcement', 'hfbg': 'follow-up report',
           'lsgg': 'interim announcement', 'ndbg': 'annual report',
           'ndbgzy': 'annual report (summary)', 'pgsms': 'rights-issue prospectus',
           'qzssgg': 'warrant listing announcement', 'qzsms': 'warrant prospectus',
           'sjdbg': 'Q3 report', 'sjdbgzy': 'Q3 report (summary)',
           'ssggs': 'listing announcement', 'yjdbg': 'Q1 report',
           'yjdbgzy': 'Q1 report (summary)', 'zgsmssbg': 'IPO prospectus (draft)',
           'zgsmsyxs': 'IPO prospectus / letter of intent', 'zqbg': 'semi-annual report',
           'zqbgzy': 'semi-annual report (summary)'
           }
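# the non-news keys double as Sina's ftype query parameter on the bulletin pages
# (see get_bulletin_data below)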
def create_db():
    # connect to the local MongoDB instance and return the basic-info and
    # article-detail collections
    client = MongoClient('localhost',27017)
    db = client.sina_finance
    col_basic = db.basic
    col_detail = db.detail
    return col_basic,col_detail

def get_basic_data(html, scode):
    # record the stock's code and name; the name is the quote-page h1 text
    # minus its trailing ticker suffix
    sname = html.find('div.hq_title', first=True).find('h1', first=True).text[:-11]
    col_basic.replace_one({'_id': scode}, {'_id': scode, 'sname': sname}, upsert=True)

def exist_next_page(html):
    # check whether the pagination block offers a "next page" (下一页) link
    p = html.find("div[style = 'margin-top:10px;float:right;margin-right:100px;']", first=True)
    if p is None:
        return False
    return '下一页' in p.text

def get_wztype(url):
    # map a URL to its article-type label (stock news, industry news,
    # advisor opinions, or one of the bulletin categories)
    for key, label in wztypes.items():
        if key in url:
            return label
    # implicitly returns None when no known key appears in the URL

def get_news_data(url, scode):
    # crawl the news-list pages (stock news / industry news) for one stock
    wztype = get_wztype(url)
    i = 1
    html = session.get(url, headers=random.choice(headers)).html
    while 1:
        # walk the paginated list until there is no next page
        datelist = html.find('div.datelist', first=True)
        if datelist is None:
            # a page without the datelist block is empty or failed to load
            break
        else:
            print('crawling %s [%s] page %s ...' % (scode, wztype, i))
            datelist = datelist.find('a')
            for item in datelist:
                # visit every article link on the current list page
                title = item.text
                zx_url = item.attrs['href']
                zx_html = session.get(zx_url).html
#                time.sleep(random.uniform(0,1))
                contents = zx_html.find("div[id = 'artibody']",first =True)
                if contents is not None:
                    contents = contents.find('p')
                    content =''
                    for c in contents:
                        content += c.text
                else:
                    # the article page failed to parse (dead link or an
                    # unexpected layout); log the URL and skip the article
                    with open('d:\\sina_broken_link.txt', 'a') as f:
                        f.write(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + ':' + zx_url + '\n')
                    continue
                source = zx_html.find("meta[name = 'mediaid']",first = True)
                if source is not None:
                    source = source.attrs['content']
                else:
                    source = ''
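                # publication date: prefer the weibo meta tag and fall back to
                # the visible pub_date span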
                date = zx_html.find("meta[name = 'weibo: article:create_at']",first = True)
                if date is not None:
                    date = date.attrs['content']
                elif zx_html.find("span[id = 'pub_date']",first = True) is not None:
                    date = zx_html.find("span[id = 'pub_date']",first = True).text.replace(' ','')
                else : date = ''
                keywords = zx_html.find("meta[name ='keywords']",first = True)
                if keywords is not None:
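                    # remove the article title from the keyword string before splitting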
                    keywords = keywords.attrs['content'].replace(title,'').split(',')
                else:
                    keywords = ''
                # upsert keyed on the article URL; replace_one(..., upsert=True)
                # stands in for the Collection.save removed in PyMongo 4
                col_detail.replace_one(
                    {'_id': zx_url},
                    {'_id': zx_url, 'scode': scode, 'wztype': wztype, 'title': title,
                     'date': date, 'source': source, 'keywords': keywords,
                     'content': content,
                     'grabtime': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))},
                    upsert=True)
            if exist_next_page(html):
                url = html.find("div[style = 'margin-top:10px;float:right;margin-right:100px;']", first=True).find('a')[-1].attrs['href']
                html = session.get(url, headers=random.choice(headers)).html
                i += 1
            else:
                break

def get_fmd_data(url, scode):
    # crawl the financial-advisor opinion (FinManDiv) pages for one stock
    wztype = get_wztype(url)
    i = 1
    html = session.get(url, headers=random.choice(headers)).html
    while 1:
        datelist = html.find('div.datelist', first=True)
        if datelist is None:
            break
        else:
            print('crawling %s [%s] page %s ...' % (scode, wztype, i))
            # only every other anchor in the datelist is an article link
            datelist = datelist.find('a')[1::2]
            for item in datelist:
                title = item.text
                fmd_url = item.attrs['href']
                fmd_html = session.get(fmd_url).html
#                time.sleep(random.uniform(0,1))
                contents = fmd_html.find("div.p_article",first = True)
                if contents is not None:
                    content =''
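                    # some articles open with a quoted block (div.p_quote);
                    # capture it so it can be prepended to the body text below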
                    if fmd_html.find('div.p_quote',first = True) is None:
                        quote = ''
                    else:
                        quote = fmd_html.find('div.p_quote',first = True).text
                    contents = contents.find('p')
                    for c in contents:
                        content += c.text.replace('\u200d','').replace('\xa0','')
                    content = quote+ ' '+content
                else:
                    # log unparsable article pages and skip them
                    with open('d:\\sina_broken_link.txt', 'a') as f:
                        f.write(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + ':' + fmd_url + '\n')
                    continue
                source = fmd_html.find("span.p_info_package",first = True)
                if source is not None:
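                    # drop the three-character label prefix, then split on '、'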
                    source = source.text[3:].split('、')
                else:
                    source = ''
                date = fmd_html.find("time.p_info_time",first = True)
                if date is not None:
                    date = date.text
                else:
                    date = ''
                keywords = fmd_html.find("meta[name ='keywords']",first = True)
                if keywords is not None:
                    keywords = keywords.attrs['content'].split(',')
                else:
                    keywords = ''
                tag = fmd_html.find('span.p_info_tag',first = True)
                if tag is not None:
                    tag = tag.text[3:]
                else:
                    tag = ''
                # upsert keyed on the article URL
                col_detail.replace_one(
                    {'_id': fmd_url},
                    {'_id': fmd_url, 'scode': scode, 'wztype': wztype, 'title': title,
                     'date': date, 'source': source, 'keywords': keywords,
                     'content': content, 'tag': tag,
                     'grabtime': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))},
                    upsert=True)
            if exist_next_page(html):
                url = html.find("div[style = 'margin-top:10px;float:right;margin-right:100px;']", first=True).find('a')[-1].attrs['href']
                html = session.get(url, headers=random.choice(headers)).html
                i += 1
            else:
                break

def get_bulletin_data(url, scode):
    # crawl the bulletin (announcement) pages for one stock
    tp = list(wztypes.keys())
    # the three news categories are handled elsewhere; the rest are ftype values
    tp.remove('AllNewsStock')
    tp.remove('stockIndustryNews')
    tp.remove('FinManDiv')
    # urls covers every bulletin category for this stock
    urls = [url + '?ftype=' + i for i in tp]
    def get_gg_info(url, scode):
        # crawl one bulletin category, page by page
        wztype = get_wztype(url)
        i = 1
        html = session.get(url,headers = random.choice(headers)).html
        while 1:
            if html.find('div.datelist',first = True) is None:break
            else:
                print('crawling %s [%s] page %s ...' % (scode, wztype, i))
                datelist = html.find('div.datelist',first = True).find('a')
                for item in datelist:
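                    # bulletin hrefs are relative, so prefix the host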
                    gg_url = 'http://vip.stock.finance.sina.com.cn'+item.attrs['href']
                    title = item.text
                    gg_html = session.get(gg_url).html
#                    time.sleep(random.uniform(0,1))
                    content = gg_html.find("div[id = 'content']",first = True)
                    if content is None:
                        # log unparsable bulletin pages and skip them
                        with open('d:\\sina_broken_link.txt', 'a') as f:
                            f.write(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + ':' + gg_url + '\n')
                        continue
                    else:
                        content = content.text
                    keywords = gg_html.find("meta[name ='Keywords']",first = True)
                    if keywords is not None:
                        keywords = keywords.attrs['content'].split(',')
                    else:
                        keywords = ''
                    date = gg_html.find("td.graybgH2",first = True)
                    if date is not None: 
                        date = date.text[5:]
                    else:
                        date = ''
                    # upsert keyed on the bulletin URL
                    col_detail.replace_one(
                        {'_id': gg_url},
                        {'_id': gg_url, 'scode': scode, 'wztype': wztype, 'title': title,
                         'date': date, 'keywords': keywords, 'content': content,
                         'grabtime': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))},
                        upsert=True)
                if exist_next_page(html):
                    url = html.find("div[style = 'margin-top:10px;float:right;margin-right:100px;']", first=True).find('a')[-1].attrs['href']
                    html = session.get(url, headers=random.choice(headers)).html
                    i += 1
                else:
                    break
    for url in urls:
        get_gg_info(url, scode)

def main(line):
    # build the entry URL for one stock code, collect the per-category links
    # from the page menu, then dispatch the matching crawler for each link
    links = []
    url = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vCB_AllNewsStock/symbol/'+stypes[line[:3]]+line[:6]+'.phtml'
    html = session.get(url,headers = random.choice(headers)).html
    try:
        get_basic_data(html,line[:6])
        for url in html.find('ul.r-menu',first = True).find('a'):
            links.append(url.attrs['href'])
            # links now holds the href of every category item in the side menu
        try:
            for li in links:
                if 'AllNewsStock' in li or 'stockIndustryNews' in li:      
                    get_news_data(li,line[:6])
                if 'FinManDiv' in li :
                    get_fmd_data(li,line[:6])
                if 'AllBulletin' in li:
                    get_bulletin_data(li,line[:6])
        except Exception as e:
            print(e)
    except Exception as e1:
        # the entry URL itself failed (delisted or invalid code, etc.);
        # record the stock code in sina_dead_code.txt and give up on it
        print(e1)
        with open('d:\\sina_dead_code.txt', 'a') as f:
            f.write(line[:6] + ',')

if __name__ == '__main__':
    col_basic,col_detail = create_db()  
    isalive = []
    complete = False
    temp = []
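    # each line of datasource.csv is expected to begin with a 6-digit stock code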
    for line in open('e:\\datasource.csv','r'):
        temp.append(line)
        
    while 1:
        # prune finished threads; iterate over a copy so removing items
        # does not skip elements of the live list
        for t in isalive[:]:
            if not t.is_alive():
                isalive.remove(t)
                print('one complete')
        
        if len(isalive) < 20:
            try:
                # top the pool back up to 20 worker threads
                for _ in range(20 - len(isalive)):
                    t = Thread(target=main, args=(temp.pop(),))
                    t.start()
                    isalive.append(t)
                    print('one start')
            except IndexError:
                # temp is exhausted: every stock code has been dispatched
                complete = True
      
        if complete:
            break
        time.sleep(60)
    
#    Single-threaded fallback, for debugging without the thread pool:
#    while 1:
#        try:
#            line = temp.pop()
#            main(line)
#        except IndexError:
#            break