Python crawlers (2): improving crawler efficiency


Table of contents
  • 1 Problem statement
  • 2 Methods
  • 2.1 Multithreading - threading
  • 2.2 Multiprocessing - multiprocessing
  • 2.3 Coroutines - asyncio
  • 3 Hands-on walkthrough
  • 3.1 Complete code
  • 3.2 Results
1 Problem statement
How do we keep the run time from growing too long when crawling a large amount of data? The common approaches fall into four categories: multiprocessing, multithreading, coroutines, and hybrid combinations of these.
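The hybrid mode is not demonstrated later in the article; a minimal sketch of one possible combination, a process pool in which each worker drives its own asyncio event loop, could look like the following. fetch_one, fetch_all and crawl_chunk are hypothetical names, and the sleep stands in for a real asynchronous request.

import asyncio
import multiprocessing

async def fetch_one(url):
    await asyncio.sleep(1)                  # stands in for a real asynchronous request
    return url

async def fetch_all(urls):
    return await asyncio.gather(*(fetch_one(u) for u in urls))

def crawl_chunk(urls):
    return asyncio.run(fetch_all(urls))     # each worker process runs its own event loop

if __name__ == '__main__':
    urls = ['http://www.example.com/page{}'.format(i) for i in range(8)]
    chunks = [urls[i::4] for i in range(4)]                 # split the urls into 4 chunks
    with multiprocessing.Pool(processes=4) as pool:
        results = pool.map(crawl_chunk, chunks)
    print(results)

This pairs the parallelism of processes with the cheap concurrency of coroutines inside each process.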
2 Methods
2.1 Multithreading - threading
See the official threading documentation for details.
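The article gives no threading example; a minimal sketch of crawling a list of urls with one thread per url might look like this. fetch and crawl_with_threads are hypothetical helpers, not part of the article's code.

import threading
import requests

def fetch(url):
    # placeholder worker: any per-url crawling function can go here
    response = requests.get(url, timeout=5)
    print(url, response.status_code)

def crawl_with_threads(url_list):
    threads = [threading.Thread(target=fetch, args=(url,)) for url in url_list]
    for t in threads:
        t.start()               # all requests run concurrently
    for t in threads:
        t.join()                # wait for every thread to finish

if __name__ == '__main__':
    crawl_with_threads(['http://www.example.com'] * 3)

Because crawling is I/O-bound, threads help despite the GIL; for long url lists a concurrent.futures.ThreadPoolExecutor caps the number of threads.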
2.2 Multiprocessing - multiprocessing
See the official multiprocessing documentation for details.
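As a quick illustration of the same idea with processes, here is a minimal Pool sketch, again with a hypothetical fetch worker; the full crawler uses the same pattern in section 3.1.

import multiprocessing
import requests

def fetch(url):
    # placeholder worker: replace with the real per-url crawling function
    response = requests.get(url, timeout=5)
    return url, response.status_code

if __name__ == '__main__':      # the guard matters on Windows, where child processes re-import the module
    url_list = ['http://www.example.com'] * 3
    with multiprocessing.Pool(processes=4) as pool:
        results = pool.map(fetch, url_list)     # distribute the urls over 4 worker processes
    print(results)

Processes sidestep the GIL entirely, at the cost of heavier startup and inter-process communication.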
2.3 Coroutines - asyncio
See the official asyncio documentation for details.
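A minimal coroutine sketch with aiohttp, assuming hypothetical fetch and main helpers; the crawler version is in section 3.1.

import asyncio
import aiohttp

async def fetch(session, url):
    # while one request waits on the server, the event loop runs the others
    async with session.get(url) as response:
        return url, response.status

async def main(url_list):
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch(session, url) for url in url_list))

if __name__ == '__main__':
    print(asyncio.run(main(['http://www.example.com'] * 3)))

Blocking calls such as time.sleep or requests.get must not be used inside coroutines, or the event loop stalls; this is why the article switches to aiohttp for the coroutine version.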
3 Hands-on walkthrough
3.1 Complete code
The example crawls the bid announcements on the Bibi electronic tendering platform (bitbid.cn) and, for each announcement, extracts the full page text, the project name, the link URL, and the publish date.
import requests
from lxml import etree
from bs4 import BeautifulSoup

import re
import time
import csv

import multiprocessing

import aiohttp
import asyncio


header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/81.0.4044.113 Safari/537.36'}

# create the csv file and write the header row
# (note: on Windows, multiprocessing re-imports this module in every child process,
#  so this top-level block runs again there)
with open('data.csv', 'w', newline='', encoding='utf-8') as f:
    csvwriter = csv.writer(f, dialect='excel')
    csvwriter.writerow(['detail', 'name', 'url', 'publishDate'])


# --------------- collect the detail-page urls from one listing page -------------- #
def get_link_url(link_url):
    time.sleep(3)                                       # be polite to the server
    response = requests.get(link_url, headers=header, timeout=5)
    # print(response.text)
    # the literal tag pattern was garbled in the source; the pattern below is an assumption
    table = re.findall(r'<table.*?>(.*?)</table>', response.text, re.S | re.M)[0]
    urls = re.findall(r'(?<=href=").*?(?=")', table, re.S | re.M)
    # print(urls)
    return urls


# --------------- extract the fields from one detail page -------------- #
def get_link_info(url):
    time.sleep(3)
    with open('data.csv', 'a', newline='', encoding='utf-8') as f:
        response = requests.get(url, headers=header, timeout=5)
        soup = BeautifulSoup(response.text, 'lxml')     # for css selectors
        content = etree.HTML(response.text)             # for xpath
        html = soup.get_text()                          # full page text
        # --------- strip whitespace from the full text -------- #
        # detail = html.replace(' ', '')
        # detail = ''.join(html.split())
        detail = re.sub(r'\s', '', html)
        # ---------- xpath ---------- #
        # name = content.xpath('//h3/text()')[0]
        # publishDate = content.xpath('//p[contains(@class,"sum")]/text()')[0]
        # ---------- BeautifulSoup ---------- #
        # name = soup.select('h3')[0].get_text()
        # ---------- re ---------- #
        name = re.findall(r'(?<=<h3>).*?(?=</h3>)', response.text, re.S | re.M)[0]
        name = ''.join(re.split(r'\s', name))
        # the anchor text of the original regex for the date was garbled in the source,
        # so the css-selector variant is used instead
        publishDate = soup.select('.sum')[0].get_text()
        f.write('{},{},{},{}\n'.format(detail, name, url, publishDate))


# --------------- coroutine version of get_link_info -------------- #
async def get_asy_link_info(url):
    await asyncio.sleep(3)                              # time.sleep() would block the event loop
    # requests cannot be awaited inside asyncio, so aiohttp is used for the request
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=header) as response:
            text = await response.text()
    with open('data.csv', 'a', newline='', encoding='utf-8') as f:
        soup = BeautifulSoup(text, 'lxml')
        html = soup.get_text()
        detail = re.sub(r'\s', '', html)
        name = re.findall(r'(?<=<h3>).*?(?=</h3>)', text, re.S | re.M)[0]
        name = ''.join(re.split(r'\s', name))
        publishDate = soup.select('.sum')[0].get_text()
        f.write('{},{},{},{}\n'.format(detail, name, url, publishDate))


if __name__ == '__main__':
    # listing pages 1~5
    link_urls = ['http://www.bitbid.cn/ggWeb!zhaobiaogg.action?gongShiType=1&currentPage={}&ggName=&type=&startDate=&endDate=&shengID=0'.format(i) for i in range(1, 6)]
    url_list = []
    for link_url in link_urls:
        urls = get_link_url(link_url)
        url_list = url_list + urls                      # all detail-page urls from pages 1~5
    print(url_list)

    # ********************** 1 serial ********************** #
    t11 = time.time()
    for url in url_list:
        get_link_info(url)
    t12 = time.time()
    print('serial:', t12 - t11)

    # ********************** 2 multiprocessing ********************** #
    # ------------ process pool ------------ #
    t21 = time.time()
    pool = multiprocessing.Pool(processes=4)            # or multiprocessing.cpu_count()
    # for url in url_list:
    #     pool.apply_async(get_link_info, args=(url,))
    pool.map(get_link_info, url_list)                   # map blocks until every url is done
    t22 = time.time()
    pool.close()
    pool.join()
    print('process pool:', t22 - t21)

    # ------------ one Process per url ------------ #
    t31 = time.time()
    for url in url_list:
        p = multiprocessing.Process(target=get_link_info, args=(url,))
        p.start()
        p.join()                                        # joining inside the loop makes this run serially
    t32 = time.time()
    print('Process:', t32 - t31)

    # ********************** 3 coroutines ********************** #
    t51 = time.time()
    loop = asyncio.get_event_loop()
    tasks = [get_asy_link_info(url) for url in url_list]
    loop.run_until_complete(asyncio.wait(tasks))        # run all coroutines and wait for completion
    loop.close()
    t52 = time.time()
    print('coroutines:', t52 - t51)


3.2 Results
Timings printed by the script, in seconds, in the order they are measured above:
  • serial: 237.6622188091278
  • process pool: 65.96817064285278
  • one Process per url: 348.5716996192932
  • coroutines: 235.63298511505127