Python Web Scraping (Part 2): Improving Scraper Efficiency
References: I. Concurrent Execution; II. How do you optimize the speed of a Python scraper?
1 Problem Statement
When scraping a large amount of data, how do we keep the total run time from growing too long? The four common approaches are multiprocessing, multithreading, coroutines, and hybrid modes that combine them.
2 Methods
2.1 Multithreading - threading
See the official threading documentation for details.
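The walkthrough in section 3 does not use threading directly, so here is a minimal sketch of applying it to page fetches; fetch_page() and the example.com URLs are illustrative placeholders, not part of the project code.

import threading
import requests

def fetch_page(url):
    # Fetching is I/O-bound: the GIL is released while waiting on the
    # network, so several threads can overlap their waits.
    response = requests.get(url, timeout=5)
    print(url, len(response.text))

urls = ['http://example.com/page{}'.format(i) for i in range(1, 6)]
threads = [threading.Thread(target=fetch_page, args=(url,)) for url in urls]
for t in threads:
    t.start()
for t in threads:
    t.join()  # wait until every thread has finished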
2.2 Multiprocessing - multiprocessing
See the official multiprocessing documentation for details.
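A minimal multiprocessing sketch; square() is an illustrative stand-in for any task you want to parallelize.

import multiprocessing

def square(x):
    # Work scales across processes because each worker process has
    # its own interpreter and its own GIL.
    return x * x

if __name__ == '__main__':
    with multiprocessing.Pool(processes=4) as pool:
        print(pool.map(square, range(10)))  # [0, 1, 4, 9, ..., 81]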
2.3 Coroutines - asyncio
See the official asyncio documentation for details.
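A minimal asyncio sketch; work() stands in for any awaitable I/O task.

import asyncio

async def work(i):
    await asyncio.sleep(1)  # awaitable wait: the loop runs other coroutines meanwhile
    return i

async def main():
    results = await asyncio.gather(*(work(i) for i in range(5)))
    print(results)  # all five finish in about 1 second, not 5

asyncio.run(main())  # Python 3.7+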
3 Hands-On Walkthrough
3.1 Complete Code
As a worked example, we extract bid announcements from the Bibi electronic bidding information platform (www.bitbid.cn); the fields collected are the full page text, the project name, the link URL, and the publish date.
import requests
from lxml import etree
from bs4 import BeautifulSoup
import re
import time
import csv
import multiprocessing
import aiohttp
import asyncio
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/81.0.4044.113 Safari/537.36'}

# Create the CSV file and write the header row; the with block closes the file.
with open('data.csv', 'w', newline='', encoding='utf-8') as f:
    csvwriter = csv.writer(f, dialect='excel')
    csvwriter.writerow(['detail', 'name', 'url', 'publishDate'])
# --------------- collect detail-page URLs from one list page --------------- #
def get_link_url(link_url):
    time.sleep(3)  # pause between requests
    response = requests.get(link_url, headers=header, timeout=5)
    # The announcement links sit inside the results table; the tag text in
    # this pattern was lost in extraction, so <table>...</table> is a reconstruction.
    table = re.findall('<table.*?>(.*?)</table>', response.text, re.S | re.M)[0]
    urls = re.findall('(?<=href=").*?(?=")', table, re.S | re.M)
    return urls
# --------------- scrape one detail page --------------- #
def get_link_info(url):
    time.sleep(3)
    with open('data.csv', 'a', newline='', encoding='utf-8') as f:
        response = requests.get(url, headers=header, timeout=5)
        soup = BeautifulSoup(response.text, 'lxml')  # parser for the BeautifulSoup approach
        content = etree.HTML(response.text)          # parser for the XPath approach
        html = soup.get_text()                       # full visible text of the page
        # --------- strip whitespace from the full text --------- #
        # detail = html.replace(' ', '')   # removes spaces only
        # detail = ''.join(html.split())   # removes all whitespace
        detail = re.sub(r'\s', '', html)   # removes all whitespace (spaces, newlines, tabs)
        # Three interchangeable ways to extract the project name and publish date:
        # ---------- XPath ---------- #
        # name = content.xpath('//h3/text()')[0]
        # publishDate = content.xpath('//p[contains(@class,"sum")]/text()')[0]
        # ---------- BeautifulSoup ---------- #
        # name = soup.select('h3')[0].get_text()
        # publishDate = soup.select('.sum')[0].get_text()
        # ---------- re ---------- #
        # The literal anchors in these patterns were lost in extraction;
        # the <h3> tags and the publish-date label are reconstructions.
        name = re.findall('(?<=<h3>).*?(?=</h3>)', response.text, re.S | re.M)[0]
        name = ''.join(re.split(r'\s', name))
        publishDate = re.findall(r'(?<=发布时间:).*?(?=\s)', html, re.S | re.M)[0]
        # Note: f.write() does no CSV quoting, so commas inside detail shift columns.
        f.write('{},{},{},{}\n'.format(detail, name, url, publishDate))
async def get_asy_link_info(url):
    await asyncio.sleep(3)  # time.sleep() would block the whole event loop
    # requests is synchronous and cannot be awaited under asyncio; aiohttp replaces it.
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=header) as response:
            text = await response.text()
    soup = BeautifulSoup(text, 'lxml')  # parser for the BeautifulSoup approach
    # content = etree.HTML(text)        # parser for the XPath approach
    html = soup.get_text()              # full visible text of the page
    detail = re.sub(r'\s', '', html)    # strip all whitespace
    # ---------- XPath ---------- #
    # name = content.xpath('//h3/text()')[0]
    # publishDate = content.xpath('//p[contains(@class,"sum")]/text()')[0]
    # ---------- BeautifulSoup ---------- #
    # name = soup.select('h3')[0].get_text()
    # publishDate = soup.select('.sum')[0].get_text()
    # ---------- re ---------- #
    # Same reconstructed patterns as in get_link_info().
    name = re.findall('(?<=<h3>).*?(?=</h3>)', text, re.S | re.M)[0]
    name = ''.join(re.split(r'\s', name))
    publishDate = re.findall(r'(?<=发布时间:).*?(?=\s)', html, re.S | re.M)[0]
    with open('data.csv', 'a', newline='', encoding='utf-8') as f:
        f.write('{},{},{},{}\n'.format(detail, name, url, publishDate))
if __name__ == '__main__':
    # Build the list-page URLs for pages 1-5.
    link_urls = ['http://www.bitbid.cn/ggWeb!zhaobiaogg.action?gongShiType=1&currentPage={}&ggName=&type=&startDate=&endDate=&shengID=0'.format(i) for i in range(1, 6)]
    url_list = []
    for link_url in link_urls:
        urls = get_link_url(link_url)
        url_list = url_list + urls  # accumulate the detail-page URLs from pages 1-5
    print(url_list)
    # ********************** 1. Serial ********************** #
    t11 = time.time()  # start time
    for url in url_list:
        get_link_info(url)
    t12 = time.time()  # end time
    print('Serial elapsed:', t12 - t11)
    # ********************** 2. Multiprocessing ********************** #
    # ------------ process pool ------------ #
    t21 = time.time()  # start time
    pool = multiprocessing.Pool(processes=4)  # or processes=multiprocessing.cpu_count()
    # for url in url_list:
    #     pool.apply_async(get_link_info, args=(url,))  # asynchronous submission
    pool.map(get_link_info, url_list)  # blocks until every task has finished
    pool.close()
    pool.join()
    t22 = time.time()  # end time
    print('Process pool elapsed:', t22 - t21)
    # ------------ one process per URL ------------ #
    t31 = time.time()  # start time
    for url in url_list:
        p = multiprocessing.Process(target=get_link_info, args=(url,))
        p.start()
        p.join()  # joining inside the loop makes the processes run one at a time
    t32 = time.time()  # end time
    print('Per-URL process elapsed:', t32 - t31)
    # ********************** 3. Coroutines ********************** #
    t51 = time.time()  # start time
    loop = asyncio.get_event_loop()  # get the event loop
    tasks = [get_asy_link_info(url) for url in url_list]
    loop.run_until_complete(asyncio.wait(tasks))  # run until all coroutines finish
    loop.close()  # close the event loop
    t52 = time.time()  # end time
    print('Coroutine elapsed:', t52 - t51)
3.2 Results
Elapsed times in seconds, in the order the script prints them:
Serial: 237.6622188091278
Process pool (4 workers): 65.96817064285278
One process per URL: 348.5716996192932
Coroutines: 235.63298511505127
The 4-worker pool delivers a near-4x speedup. One process per URL is slower than the serial baseline because join() inside the loop keeps the work sequential while still paying process start-up costs. The coroutine run barely beats serial because the original code used blocking calls (time.sleep and requests) inside the coroutines, which stall the event loop; with awaitable I/O (asyncio.sleep and aiohttp, as in the corrected function above) the network waits can overlap.
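The fourth approach from section 1, hybrid mode, is not timed above. A minimal sketch, reusing get_link_info() and url_list from the code in section 3.1: split the URLs across worker processes, and let each process fan its share out to a small thread pool, so processes add parallelism while threads overlap the network waits. The chunk count and pool sizes here are illustrative choices, not tuned values.

from concurrent.futures import ThreadPoolExecutor
import multiprocessing

def crawl_chunk(urls):
    # Each worker process runs its own thread pool; the threads overlap
    # the I/O waits of get_link_info() within that process.
    with ThreadPoolExecutor(max_workers=4) as executor:
        list(executor.map(get_link_info, urls))

if __name__ == '__main__':
    chunks = [url_list[i::4] for i in range(4)]  # round-robin split into 4 chunks
    with multiprocessing.Pool(processes=4) as pool:
        pool.map(crawl_chunk, chunks)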