Python web crawler: Sina Finance
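The full crawler is below. It reads stock codes from a local CSV, uses requests_html to scrape each stock's news, column, and bulletin pages on vip.stock.finance.sina.com.cn, stores the results in a local MongoDB database (sina_finance), and runs up to 20 crawler threads at a time.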
from pymongo import MongoClient
from requests_html import HTMLSession
import time
import random
from threading import Thread
session = HTMLSession()
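# pool of request-header sets; a random one is sent with each list-page request to vary the User-Agent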
headers = [
    {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
     'Accept-Encoding': 'gzip, deflate, sdch',
     'Accept-Language': 'zh-CN,zh;q=0.8',
     'Connection': 'keep-alive',
     'Host': 'vip.stock.finance.sina.com.cn',
     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'},
    {'user-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'},
    {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'},
    {'user-agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0'},
    {'user-agent': 'Mozilla/5.0 (iPad; CPU OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3'},
]
# stypes maps the first three digits of a stock code to its exchange
# (sh = Shanghai, sz = Shenzhen); used to build the Sina page URL
stypes = {'300': 'sz', '600': 'sh', '601': 'sh', '603': 'sh',
          '900': 'sh', '000': 'sz', '200': 'sz', '002': 'sz'}
# wztypes maps each Sina column / ftype key to a display label. The original
# Chinese labels were garbled in the source; the English labels below are a
# best-effort reconstruction from the pinyin keys, and keys that could not be
# identified are left empty.
wztypes = {'AllNewsStock': 'all news', 'stockIndustryNews': 'industry news',
           'FinManDiv': '', 'gzbc': '', 'gszc': 'articles of association',
           'gqfzgg': 'share-reform announcement', 'hfbg': '',
           'lsgg': 'interim announcement', 'ndbg': 'annual report',
           'ndbgzy': 'annual report (summary)', 'pgsms': 'rights-issue prospectus',
           'qzssgg': 'warrant listing announcement', 'qzsms': 'warrant prospectus',
           'sjdbg': 'third-quarter report', 'sjdbgzy': 'third-quarter report (summary)',
           'ssggs': 'listing announcement', 'yjdbg': 'first-quarter report',
           'yjdbgzy': 'first-quarter report (summary)', 'zgsmssbg': 'IPO prospectus (filing draft)',
           'zgsmsyxs': 'IPO prospectus / letter of intent', 'zqbg': 'interim report',
           'zqbgzy': 'interim report (summary)'}
def create_db():
    # connect to the local MongoDB and get the two collections:
    # 'basic' for per-stock info, 'detail' for the crawled articles
    client = MongoClient('localhost', 27017)
    db = client.sina_finance
    col_basic = db.basic
    col_detail = db.detail
    return col_basic, col_detail
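# (col_basic and col_detail are created once under __main__ and used as
# module-level globals by the crawl functions below)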
def get_basic_data(html, scode):
    # grab the stock's display name from the quote-page title,
    # dropping the fixed-length code suffix
    sname = html.find('div.hq_title', first=True).find('h1', first=True).text[:-11]
    # upsert keyed on the stock code (replaces the deprecated Collection.save)
    col_basic.replace_one({'_id': scode}, {'_id': scode, 'sname': sname}, upsert=True)
def exist_next_page(html):
    # the pagination bar lives in an inline-styled div; a '下一页' ("next page")
    # link inside it means there are more list pages
    p = html.find("div[style='margin-top:10px;float:right;margin-right:100px;']", first=True)
    if p is None:
        return False
    return '下一页' in p.text
def get_wztype(url):
    # derive the article type (news / column / bulletins, ...) from the URL
    for key in wztypes:
        if key in url:
            return wztypes[key]
    return ''  # no known key in the URL; avoids an unbound-variable error
def get_news_data(url, scode):
    # crawl the news-list pages (AllNewsStock / stockIndustryNews)
    wztype = get_wztype(url)
    i = 1
    html = session.get(url, headers=random.choice(headers)).html
    while 1:
        # loop over list pages until there is no news list or no next page
        datelist = html.find('div.datelist', first=True)
        if datelist is None:
            break
        print('Crawling %s [%s], page %s ...' % (scode, wztype, i))
        datelist = datelist.find('a')
        for item in datelist:
            # fetch every article linked from the list page
            title = item.text
            zx_url = item.attrs['href']
            zx_html = session.get(zx_url).html
            # time.sleep(random.uniform(0, 1))
            contents = zx_html.find("div[id='artibody']", first=True)
            if contents is not None:
                contents = contents.find('p')
                content = ''
                for c in contents:
                    content += c.text
            else:
                # no article body: usually a dead or redirected link.
                # Log the URL to sina_broken_link.txt and skip this article.
                with open('d:\\sina_broken_link.txt', 'a') as f:
                    f.write(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + ':' + zx_url + '\n')
                continue
            source = zx_html.find("meta[name='mediaid']", first=True)
            if source is not None:
                source = source.attrs['content']
            else:
                source = ''
            date = zx_html.find("meta[name='weibo: article:create_at']", first=True)
            if date is not None:
                date = date.attrs['content']
            elif zx_html.find("span[id='pub_date']", first=True) is not None:
                date = zx_html.find("span[id='pub_date']", first=True).text.replace(' ', '')
            else:
                date = ''
            keywords = zx_html.find("meta[name='keywords']", first=True)
            if keywords is not None:
                keywords = keywords.attrs['content'].replace(title, '').split(',')
            else:
                keywords = ''
            # upsert keyed on the article URL (replace_one instead of the deprecated save)
            col_detail.replace_one({'_id': zx_url},
                                   {'_id': zx_url, 'scode': scode, 'wztype': wztype, 'title': title,
                                    'date': date, 'source': source, 'keywords': keywords, 'content': content,
                                    'grabtime': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))},
                                   upsert=True)
        if exist_next_page(html):
            url = html.find("div[style='margin-top:10px;float:right;margin-right:100px;']", first=True).find('a')[-1].attrs['href']
            html = session.get(url, headers=random.choice(headers)).html
            i += 1
        else:
            break
def get_fmd_data(url, scode):
    # crawl the FinManDiv column pages
    wztype = get_wztype(url)
    i = 1
    html = session.get(url, headers=random.choice(headers)).html
    while 1:
        if html.find('div.datelist', first=True) is None:
            break
        print('Crawling %s [%s], page %s ...' % (scode, wztype, i))
        # the list interleaves date and title links; [1::2] keeps the title links
        datelist = html.find('div.datelist', first=True).find('a')[1::2]
        for item in datelist:
            title = item.text
            fmd_url = item.attrs['href']
            fmd_html = session.get(fmd_url).html
            # time.sleep(random.uniform(0, 1))
            contents = fmd_html.find('div.p_article', first=True)
            if contents is not None:
                content = ''
                if fmd_html.find('div.p_quote', first=True) is None:
                    quote = ''
                else:
                    quote = fmd_html.find('div.p_quote', first=True).text
                contents = contents.find('p')
                for c in contents:
                    # strip zero-width joiners and non-breaking spaces
                    content += c.text.replace('\u200d', '').replace('\xa0', '')
                content = quote + ' ' + content
            else:
                # no article body: log the broken link and skip
                with open('d:\\sina_broken_link.txt', 'a') as f:
                    f.write(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + ':' + fmd_url + '\n')
                continue
            source = fmd_html.find('span.p_info_package', first=True)
            if source is not None:
                source = source.text[3:].split('、')
            else:
                source = ''
            date = fmd_html.find('time.p_info_time', first=True)
            if date is not None:
                date = date.text
            else:
                date = ''
            keywords = fmd_html.find("meta[name='keywords']", first=True)
            if keywords is not None:
                keywords = keywords.attrs['content'].split(',')
            else:
                keywords = ''
            tag = fmd_html.find('span.p_info_tag', first=True)
            if tag is not None:
                tag = tag.text[3:]
            else:
                tag = ''
            # upsert keyed on the article URL
            col_detail.replace_one({'_id': fmd_url},
                                   {'_id': fmd_url, 'scode': scode, 'wztype': wztype, 'title': title,
                                    'date': date, 'source': source, 'keywords': keywords, 'content': content,
                                    'grabtime': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())),
                                    'tag': tag},
                                   upsert=True)
        if exist_next_page(html):
            url = html.find("div[style='margin-top:10px;float:right;margin-right:100px;']", first=True).find('a')[-1].attrs['href']
            html = session.get(url, headers=random.choice(headers)).html
            i += 1
        else:
            break
def get_bulletin_data(url, scode):
    # crawl the bulletin (AllBulletin) pages: build one list URL per report type,
    # skipping the three keys that are news/column types rather than ftypes
    tp = list(wztypes.keys())
    tp.remove('AllNewsStock')
    tp.remove('stockIndustryNews')
    tp.remove('FinManDiv')
    urls = [url + '?ftype=' + i for i in tp]
    # urls now holds one bulletin-list URL per report type
    def get_gg_info(url, scode):
        # crawl one bulletin-list URL and all of its pages
        wztype = get_wztype(url)
        i = 1
        html = session.get(url, headers=random.choice(headers)).html
        while 1:
            if html.find('div.datelist', first=True) is None:
                break
            print('Crawling %s [%s], page %s ...' % (scode, wztype, i))
            datelist = html.find('div.datelist', first=True).find('a')
            for item in datelist:
                # bulletin links are relative, so prepend the host
                gg_url = 'http://vip.stock.finance.sina.com.cn' + item.attrs['href']
                title = item.text
                gg_html = session.get(gg_url).html
                # time.sleep(random.uniform(0, 1))
                content = gg_html.find("div[id='content']", first=True)
                if content is None:
                    # no bulletin body: log the broken link and skip
                    with open('d:\\sina_broken_link.txt', 'a') as f:
                        f.write(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + ':' + gg_url + '\n')
                    continue
                content = content.text
                keywords = gg_html.find("meta[name='Keywords']", first=True)
                if keywords is not None:
                    keywords = keywords.attrs['content'].split(',')
                else:
                    keywords = ''
                date = gg_html.find('td.graybgH2', first=True)
                if date is not None:
                    date = date.text[5:]
                else:
                    date = ''
                # upsert keyed on the bulletin URL
                col_detail.replace_one({'_id': gg_url},
                                       {'_id': gg_url, 'scode': scode, 'wztype': wztype, 'title': title,
                                        'date': date, 'keywords': keywords, 'content': content,
                                        'grabtime': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))},
                                       upsert=True)
            if exist_next_page(html):
                url = html.find("div[style='margin-top:10px;float:right;margin-right:100px;']", first=True).find('a')[-1].attrs['href']
                html = session.get(url, headers=random.choice(headers)).html
                i += 1
            else:
                break
    for url in urls:
        get_gg_info(url, scode)
def main(line):
    # build the stock's news-page URL from its code, collect the column
    # links from the page menu, and dispatch each one to the right crawler
    links = []
    url = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vCB_AllNewsStock/symbol/' + stypes[line[:3]] + line[:6] + '.phtml'
    html = session.get(url, headers=random.choice(headers)).html
    try:
        get_basic_data(html, line[:6])
        for url in html.find('ul.r-menu', first=True).find('a'):
            links.append(url.attrs['href'])
        # links now holds the column URLs for this stock
        try:
            for li in links:
                if 'AllNewsStock' in li or 'stockIndustryNews' in li:
                    get_news_data(li, line[:6])
                if 'FinManDiv' in li:
                    get_fmd_data(li, line[:6])
                if 'AllBulletin' in li:
                    get_bulletin_data(li, line[:6])
        except Exception as e:
            print(e)
    except Exception as e1:
        # the stock page itself failed (bad code or dead page):
        # log the code to sina_dead_code.txt and move on
        print(e1)
        with open('d:\\sina_dead_code.txt', 'a') as f:
            f.write(line[:6] + ',')
if __name__ == '__main__':
    col_basic, col_detail = create_db()
    isalive = []       # currently running crawler threads
    complete = False
    temp = []          # stock codes still waiting to be crawled
    for line in open('e:\\datasource.csv', 'r'):
        temp.append(line)
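    # Assumed e:\datasource.csv layout (inferred from the line[:3] / line[:6]
    # slicing in main): one stock per line, six-digit code first, e.g.
    #   600000,...
    #   000001,...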
    while 1:
        # drop finished threads; iterate over a copy, since removing
        # items while iterating the same list would skip elements
        for t in isalive[:]:
            if not t.is_alive():
                isalive.remove(t)
                print('one complete')
        if len(isalive) < 20:
            try:
                # top the pool back up to 20 threads
                for _ in range(20 - len(isalive)):
                    t = Thread(target=main, args=(temp.pop(),))
                    t.start()
                    isalive.append(t)
                    print('one start')
            except IndexError:
                # temp is empty: every stock code has been handed out
                complete = True
        if complete:
            break
        time.sleep(60)  # poll the pool once a minute
    # single-threaded fallback, kept from the original:
    # while 1:
    #     try:
    #         line = temp.pop()
    #         main(line)
    #     except IndexError:
    #         break
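To sanity-check a run, you can peek at what landed in MongoDB from a separate script. A minimal sketch, assuming the same local instance and collection names used above ('600000' is just a hypothetical stock code):

from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client.sina_finance
print(db.basic.count_documents({}), 'stocks,', db.detail.count_documents({}), 'articles')
# latest three articles for one (hypothetical) stock code
for doc in db.detail.find({'scode': '600000'}).sort('grabtime', -1).limit(3):
    print(doc['date'], doc['wztype'], doc['title'])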