Python Web Scraping (Part 2): Improving Scraper Efficiency
References: I. Concurrent Execution; II. How do you optimize the speed of a Python scraper?
1 Problem Statement
When scraping a large amount of data, how do we keep the total run time from growing too long? The four common approaches are multiprocessing, multithreading, coroutines, and hybrid modes that combine them.
2 Methods
2.1 Multithreading - threading
See the official threading documentation for details.
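The walkthrough in section 3 does not use threading directly, so here is a minimal sketch of applying it to page fetches; fetch_page() and the example.com URLs are illustrative placeholders, not part of the project code.

import threading
import requests

def fetch_page(url):
    # Fetching is I/O-bound: the GIL is released while waiting on the
    # network, so several threads can overlap their waits.
    response = requests.get(url, timeout=5)
    print(url, len(response.text))

urls = ['http://example.com/page{}'.format(i) for i in range(1, 6)]
threads = [threading.Thread(target=fetch_page, args=(url,)) for url in urls]
for t in threads:
    t.start()
for t in threads:
    t.join()  # wait until every thread has finished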
2.2 Multiprocessing - multiprocessing
See the official multiprocessing documentation for details.
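A minimal multiprocessing sketch; square() is an illustrative stand-in for any task you want to parallelize.

import multiprocessing

def square(x):
    # Work scales across processes because each worker process has
    # its own interpreter and its own GIL.
    return x * x

if __name__ == '__main__':
    with multiprocessing.Pool(processes=4) as pool:
        print(pool.map(square, range(10)))  # [0, 1, 4, 9, ..., 81]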
2.3 Coroutines - asyncio
See the official asyncio documentation for details.
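A minimal asyncio sketch; work() stands in for any awaitable I/O task.

import asyncio

async def work(i):
    await asyncio.sleep(1)  # awaitable wait: the loop runs other coroutines meanwhile
    return i

async def main():
    results = await asyncio.gather(*(work(i) for i in range(5)))
    print(results)  # all five finish in about 1 second, not 5

asyncio.run(main())  # Python 3.7+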
3 Hands-On Walkthrough
3.1 Complete Code
As a worked example, we extract bid announcements from the Bibi electronic bidding information platform (www.bitbid.cn); the fields collected are the full page text, the project name, the link URL, and the publish date.
import requests
from lxml import etree
from bs4 import BeautifulSoup
import re
import time
import csv
import multiprocessing
import aiohttp
import asyncio
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/81.0.4044.113 Safari/537.36'}

# Create the CSV file and write the header row; the with block closes the file.
with open('data.csv', 'w', newline='', encoding='utf-8') as f:
    csvwriter = csv.writer(f, dialect='excel')
    csvwriter.writerow(['detail', 'name', 'url', 'publishDate'])
# --------------- collect detail-page URLs from one list page --------------- #
def get_link_url(link_url):
    time.sleep(3)  # pause between requests
    response = requests.get(link_url, headers=header, timeout=5)
    # The announcement links sit inside the results table; the tag text in
    # this pattern was lost in extraction, so <table>...</table> is a reconstruction.
    table = re.findall('<table.*?>(.*?)</table>', response.text, re.S | re.M)[0]
    urls = re.findall('(?<=href=").*?(?=")', table, re.S | re.M)
    return urls
# --------------- scrape one detail page --------------- #
def get_link_info(url):
    time.sleep(3)
    with open('data.csv', 'a', newline='', encoding='utf-8') as f:
        response = requests.get(url, headers=header, timeout=5)
        soup = BeautifulSoup(response.text, 'lxml')  # parser for the BeautifulSoup approach
        content = etree.HTML(response.text)          # parser for the XPath approach
        html = soup.get_text()                       # full visible text of the page
        # --------- strip whitespace from the full text --------- #
        # detail = html.replace(' ', '')   # removes spaces only
        # detail = ''.join(html.split())   # removes all whitespace
        detail = re.sub(r'\s', '', html)   # removes all whitespace (spaces, newlines, tabs)
        # Three interchangeable ways to extract the project name and publish date:
        # ---------- XPath ---------- #
        # name = content.xpath('//h3/text()')[0]
        # publishDate = content.xpath('//p[contains(@class,"sum")]/text()')[0]
        # ---------- BeautifulSoup ---------- #
        # name = soup.select('h3')[0].get_text()
        # publishDate = soup.select('.sum')[0].get_text()
        # ---------- re ---------- #
        # The literal anchors in these patterns were lost in extraction;
        # the <h3> tags and the publish-date label are reconstructions.
        name = re.findall('(?<=<h3>).*?(?=</h3>)', response.text, re.S | re.M)[0]
        name = ''.join(re.split(r'\s', name))
        publishDate = re.findall(r'(?<=发布时间:).*?(?=\s)', html, re.S | re.M)[0]
        # Note: f.write() does no CSV quoting, so commas inside detail shift columns.
        f.write('{},{},{},{}\n'.format(detail, name, url, publishDate))
async def get_asy_link_info(url):
    await asyncio.sleep(3)  # time.sleep() would block the whole event loop
    # requests is synchronous and cannot be awaited under asyncio; aiohttp replaces it.
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=header) as response:
            text = await response.text()
    soup = BeautifulSoup(text, 'lxml')  # parser for the BeautifulSoup approach
    # content = etree.HTML(text)        # parser for the XPath approach
    html = soup.get_text()              # full visible text of the page
    detail = re.sub(r'\s', '', html)    # strip all whitespace
    # ---------- XPath ---------- #
    # name = content.xpath('//h3/text()')[0]
    # publishDate = content.xpath('//p[contains(@class,"sum")]/text()')[0]
    # ---------- BeautifulSoup ---------- #
    # name = soup.select('h3')[0].get_text()
    # publishDate = soup.select('.sum')[0].get_text()
    # ---------- re ---------- #
    # Same reconstructed patterns as in get_link_info().
    name = re.findall('(?<=<h3>).*?(?=</h3>)', text, re.S | re.M)[0]
    name = ''.join(re.split(r'\s', name))
    publishDate = re.findall(r'(?<=发布时间:).*?(?=\s)', html, re.S | re.M)[0]
    with open('data.csv', 'a', newline='', encoding='utf-8') as f:
        f.write('{},{},{},{}\n'.format(detail, name, url, publishDate))
if __name__ == '__main__':
    # Build the list-page URLs for pages 1-5.
    link_urls = ['http://www.bitbid.cn/ggWeb!zhaobiaogg.action?gongShiType=1&currentPage={}&ggName=&type=&startDate=&endDate=&shengID=0'.format(i) for i in range(1, 6)]
    url_list = []
    for link_url in link_urls:
        urls = get_link_url(link_url)
        url_list = url_list + urls  # accumulate the detail-page URLs from pages 1-5
    print(url_list)
    # ********************** 1. Serial ********************** #
    t11 = time.time()  # start time
    for url in url_list:
        get_link_info(url)
    t12 = time.time()  # end time
    print('Serial elapsed:', t12 - t11)
    # ********************** 2. Multiprocessing ********************** #
    # ------------ process pool ------------ #
    t21 = time.time()  # start time
    pool = multiprocessing.Pool(processes=4)  # or processes=multiprocessing.cpu_count()
    # for url in url_list:
    #     pool.apply_async(get_link_info, args=(url,))  # asynchronous submission
    pool.map(get_link_info, url_list)  # blocks until every task has finished
    pool.close()
    pool.join()
    t22 = time.time()  # end time
    print('Process pool elapsed:', t22 - t21)
    # ------------ one process per URL ------------ #
    t31 = time.time()  # start time
    for url in url_list:
        p = multiprocessing.Process(target=get_link_info, args=(url,))
        p.start()
        p.join()  # joining inside the loop makes the processes run one at a time
    t32 = time.time()  # end time
    print('Per-URL process elapsed:', t32 - t31)
    # ********************** 3. Coroutines ********************** #
    t51 = time.time()  # start time
    loop = asyncio.get_event_loop()  # get the event loop
    tasks = [get_asy_link_info(url) for url in url_list]
    loop.run_until_complete(asyncio.wait(tasks))  # run until all coroutines finish
    loop.close()  # close the event loop
    t52 = time.time()  # end time
    print('Coroutine elapsed:', t52 - t51)
3.2 Results
Elapsed times in seconds, in the order the script prints them:
Serial: 237.6622188091278
Process pool (4 workers): 65.96817064285278
One process per URL: 348.5716996192932
Coroutines: 235.63298511505127
The 4-worker pool delivers a near-4x speedup. One process per URL is slower than the serial baseline because join() inside the loop keeps the work sequential while still paying process start-up costs. The coroutine run barely beats serial because the original code used blocking calls (time.sleep and requests) inside the coroutines, which stall the event loop; with awaitable I/O (asyncio.sleep and aiohttp, as in the corrected function above) the network waits can overlap.
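The fourth approach from section 1, hybrid mode, is not timed above. A minimal sketch, reusing get_link_info() and url_list from the code in section 3.1: split the URLs across worker processes, and let each process fan its share out to a small thread pool, so processes add parallelism while threads overlap the network waits. The chunk count and pool sizes here are illustrative choices, not tuned values.

from concurrent.futures import ThreadPoolExecutor
import multiprocessing

def crawl_chunk(urls):
    # Each worker process runs its own thread pool; the threads overlap
    # the I/O waits of get_link_info() within that process.
    with ThreadPoolExecutor(max_workers=4) as executor:
        list(executor.map(get_link_info, urls))

if __name__ == '__main__':
    chunks = [url_list[i::4] for i in range(4)]  # round-robin split into 4 chunks
    with multiprocessing.Pool(processes=4) as pool:
        pool.map(crawl_chunk, chunks)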