pyppeteer:Webページをレンダリングできる、asyncioベースの非同期クローラーライブラリ

1828 ワード

概要
seleniumと同様に、pyppeteerもWebページをレンダリングできますが、こちらは非同期で動作します。
使用方法
インストール方法:pip install pyppeteer
# python 3.7.5

import asyncio

from pyppeteer import launch
from pyquery import PyQuery as pq


async def main():
    """Load the quotes demo page in a browser and print how many quotes it has.

    Launches a browser, opens one page, navigates to the demo URL, parses
    the resulting HTML with PyQuery and prints the number of elements that
    match the ``.quote`` selector. The browser is always closed on exit.
    """
    browser = await launch()
    try:
        page = await browser.newPage()
        await page.goto("http://quotes.toscrape.com/js/")
        # page.content() returns the page's current HTML as a string.
        doc = pq(await page.content())
        print("Quotes:", doc(".quote").length)
    finally:
        # Close the browser even when navigation or parsing fails, so the
        # browser launched above is not left running.
        await browser.close()


# Entry point of the first example: run main() to completion on a new event loop.
asyncio.run(main())

複雑なケース:CSS、画像、フォントなどのリクエストをブロックする
import asyncio

from pyppeteer import launch
from pyquery import PyQuery as pq


class Global:
    """Namespace for state shared by the coroutines in this script."""

    # Browser instance; None until main() assigns the launched browser.
    browser = None


async def intercept_request(req):
    """Abort requests for non-essential resources and let all others proceed.

    A request whose ``resourceType`` is one of image, media, eventsource,
    websocket, stylesheet or font is aborted; any other request is
    continued unchanged.

    Args:
        req: The intercepted request; must expose ``resourceType`` and the
            awaitable methods ``abort()`` and ``continue_()``.
    """
    blocked_types = ("image", "media", "eventsource", "websocket", "stylesheet", "font")
    should_block = req.resourceType in blocked_types
    if not should_block:
        await req.continue_()
        return
    await req.abort()


async def fetch():
    """Open a page in the shared browser, load the juejin timeline and print its link count.

    Uses ``Global.browser``, which must already hold a launched browser.
    Every request made by the page is routed through ``intercept_request``.
    The page is closed on exit whether or not loading succeeded.
    """
    page = await Global.browser.newPage()
    try:
        await page.setUserAgent(
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299"
        )  # override the default user agent
        await page.setViewport({"width": 1080, "height": 960})

        # Turn interception on, then register the handler that decides
        # per request whether to abort or continue it.
        await page.setRequestInterception(True)
        page.on("request", intercept_request)

        await page.goto("https://juejin.im/timeline")
        # Fixed pause before reading the DOM — presumably to give
        # client-side rendering time to finish; confirm the value suits the site.
        await asyncio.sleep(3)
        doc = pq(await page.content())
        # NOTE: the label says "Quotes:" but the value is the number of <a> elements.
        print("Quotes:", doc("a").length)
    finally:
        # Close the page even if a step above raised, so failed fetches do
        # not leave pages open in the shared browser.
        await page.close()


async def main():
    """Launch one browser, run ten fetch() coroutines concurrently, then close it.

    The launched browser is stored on ``Global.browser`` so that every
    fetch() call uses the same instance. The browser is closed even if
    one of the fetches raises.
    """
    Global.browser = await launch()
    try:
        # Ten fetches run concurrently against the single shared browser.
        await asyncio.gather(*[fetch() for _ in range(10)])
    finally:
        await Global.browser.close()


# Entry point of the second example. asyncio.run() creates its own event loop
# and closes it afterwards, so it does not depend on a pre-existing loop.
asyncio.run(main())