# Multithreaded crawler that scrapes Douban book lists by tag.
import requests as reqs
import threading
import time
#Some User Agents
hds={'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
'User-Agent':'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11',
'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'}
from scrapy import Selector


def html_parse(html_str, xpath_expr):
    """Evaluate *xpath_expr* against the HTML text *html_str*.

    Returns the list of extracted matches (strings) from scrapy's Selector.

    Bug fixed: the original body referenced the undefined names
    ``htmltext`` and ``xpathstr`` instead of the actual parameters,
    raising NameError on every call.
    """
    sel = Selector(text=html_str)
    return sel.xpath(xpath_expr).extract()
class myThread_detail(threading.Thread):
    """Worker thread: fetch one book detail page and append its
    rating-people count to a shared result list.

    NOTE(review): workers append in thread-completion order, so the shared
    ``people_nums`` list is NOT guaranteed to align index-for-index with the
    list of detail links that spawned the workers — confirm whether callers
    rely on positional pairing.
    """

    # One lock shared by ALL workers.
    # Bug fixed: the original created a fresh Lock() inside run(), which each
    # thread owned privately, so it synchronized nothing.
    _append_lock = threading.Lock()

    def __init__(self, url, people_nums):
        super(myThread_detail, self).__init__()
        self.url = url                  # detail-page URL to fetch
        self.people_nums = people_nums  # shared output list (appended under lock)

    def run(self):
        data = reqs.get(self.url, headers=hds).content.decode()
        matches = html_parse(data, '//a[@class="rating_people"]//span/text()')
        # 'with' guarantees the lock is released even if append raises.
        with self._append_lock:
            if matches:
                self.people_nums.append(matches[0])
            else:
                # Page without a rating count: keep a placeholder so
                # downstream lists stay the same length.
                self.people_nums.append(" ")
def book_spider(book_tag):
    """Crawl the first 10 listing pages (15 books each) for *book_tag*.

    Yields one tuple per listing page: ``(booknames, descs, people_nums)``.

    NOTE(review): ``people_nums`` is filled by worker threads in completion
    order and may not align positionally with ``booknames`` — confirm before
    zipping the three lists together.
    """
    base_url = r'http://www.douban.com/tag/' + book_tag + '/book?start='
    # Douban paginates 15 books per page; walk offsets 0, 15, ..., 135.
    for start in range(0, 15 * 10, 15):
        page_url = base_url + str(start)
        print(page_url)
        data = reqs.get(page_url, headers=hds).content.decode()
        # Per-page fields scraped from the listing markup.
        booknames = html_parse(data, '//a[@class="title"]/text()')
        descs = html_parse(data, '//div[@class="desc"]/text()')
        detail_links = html_parse(data, '//a[@class="title"]/@href')
        # Fan out one worker per detail page to fetch the rating counts.
        people_nums = []
        workers = []
        for link in detail_links:
            worker = myThread_detail(link, people_nums)
            worker.start()
            # Throttle thread launches so the site doesn't ban the IP.
            time.sleep(0.1)
            workers.append(worker)
        for worker in workers:
            worker.join()
        yield (booknames, descs, people_nums)
# Standalone helper (not used by the threaded path above).
# Example detail page: https://book.douban.com/subject/6082808/?from=tag_all
def get_people_num(url):
    """Fetch *url* and return the list of rating-people strings on the page."""
    response = reqs.get(url, headers=hds)
    page_text = response.content.decode()
    return html_parse(page_text, '//a[@class="rating_people"]//span/text()')
def do_spider(book_tag_lists):
    """Crawl every tag in *book_tag_lists* and return the collected pages.

    Returns a list of ``(booknames, descs, people_nums)`` tuples, one per
    listing page crawled across all tags.

    Bug fixed: the original printed each page and returned the meaningless
    constant 0, discarding all scraped data — the commented-out
    ``return book_lists`` showed the intended behavior, restored here.
    """
    book_lists = []
    for tag in book_tag_lists:
        for page in book_spider(tag):
            print(page)  # progress/debug output, kept from the original
            book_lists.append(page)
    return book_lists
def print_book_lists_excel(book_lists, book_tag_lists):
    """Placeholder: export *book_lists*, grouped by *book_tag_lists*,
    to an Excel file. Not implemented yet."""
    pass
if __name__ == '__main__':
    # Earlier experiments crawled other tag sets (programming, web, linux,
    # android, ...); those commented-out lists are condensed into this note.
    # Only the final single-tag list is active.
    book_tag_lists = [' ']
    book_lists = do_spider(book_tag_lists)
    print_book_lists_excel(book_lists, book_tag_lists)
###########################################################################
# NOTE: heavy crawling can get the client IP banned by Douban — throttle
# requests and/or rotate proxy IPs when scraping at scale.                 ##
# Reference:
# https://github.com/lanbing510/DouBanSpider/blob/master/doubanSpider.py