puppeteerで複数のURLをクロールして画像情報を取得します。


基本的な使い方
'use strict';
const puppeteer = require('puppeteer');

/**
 * Opens one page in headless Chromium and, when the page fires
 * `domcontentloaded`, collects size information for every <img> that has a
 * non-zero rendered width and height. The collected list is printed once the
 * browser has been shut down.
 *
 * The listener must be registered before `page.goto()`; otherwise the event
 * can fire before anything is listening for it.
 */
(async () => {
  const browser = await puppeteer.launch();
  let imgArr = [];
  try {
    const page = await browser.newPage();

    // The event emitter does not wait for async listeners, so keep a handle
    // on the in-flight scan and await it explicitly before closing the browser.
    let pendingScan = Promise.resolve();
    page.on('domcontentloaded', () => {
      pendingScan = page
        // This callback is executed inside the page, so it must be self-contained.
        .$$eval('img', imgs => imgs
          .map(img => ({
            width: img.width,
            naturalWidth: img.naturalWidth,
            height: img.height,
            naturalHeight: img.naturalHeight,
            // false when the intrinsic size is at least 10x the rendered size
            // in either dimension.
            isStandard: !(img.width * 10 <= img.naturalWidth || img.height * 10 <= img.naturalHeight),
            url: img.src,
            level: 3,
            imageUrl: img.src,
            describeUrl: '',
            summary: `    ${img.width}x${img.height}          ${img.naturalWidth}x${img.naturalHeight}   `,
          }))
          // Drop images that are rendered with a zero width or height.
          .filter(info => info.width && info.height))
        .then(found => {
          imgArr = found;
        })
        .catch(err => {
          // A failed scan is reported instead of becoming an unhandled rejection.
          console.error('failed to collect <img> data:', err);
        });
    });

    await page.goto('https://www.npmjs.com/package/puppeteer', { waitUntil: 'networkidle0' });
    await pendingScan;
  } finally {
    // Always release the browser process, even when navigation fails.
    await browser.close();
  }
  console.log('imgArr: ', imgArr);
})().catch(err => {
  console.error(err);
  process.exitCode = 1;
});
以下の順序は変えられません。
  • await puppeteer.launch() でブラウザを起動する
  • await browser.newPage() でページを開く
  • page.on() でイベントリスナーを登録する
  • await page.goto() でページに遷移する
  • await browser.close() でブラウザを閉じる
  • この順番を変えると、page.on() で登録したリスナーがイベントを捕捉できなくなります。
    複数のURLを扱う場合の使い方
    配列に入っているすべてのURLから画像をすべてクロールし、それぞれの実際の幅と高さを返します。
    /* eslint-disable no-undef */
    'use strict';
    const puppeteer = require('puppeteer');
    
    /**
     * Visits each URL in `urls` sequentially with a single page and gathers the
     * rendered and intrinsic dimensions of every <img> on it. The combined list
     * is printed after the browser has been shut down.
     */
    (async () => {
      const urls = [ 'https://www.npmjs.com/package/puppeteer', 'https://www.iconfont.cn/search/index?searchType=icon&q=test' ];

      const browser = await puppeteer.launch();
      let arr = [];
      try {
        const page = await browser.newPage();

        for (const url of urls) {
          await page.goto(url, { waitUntil: 'domcontentloaded' });

          try {
            await page.waitForSelector('img', { timeout: 3000 });
          } catch (err) {
            // Only a timeout means "no <img> on this page"; anything else is a real failure.
            if (err.name !== 'TimeoutError') {
              throw err;
            }
            // One image-less page should not abort the remaining URLs.
            console.warn(`no <img> appeared within 3000ms, skipping: ${url}`);
            continue;
          }

          // This callback is executed inside the page, so it must be self-contained.
          const doms = await page.$$eval('img', imgs => imgs.map(v => ({
            naturalWidth: v.naturalWidth,
            naturalHeight: v.naturalHeight,
            width: v.width,
            height: v.height,
          })));
          arr = [ ...arr, ...doms ];
        }
      } finally {
        // Always release the browser process, even when a navigation fails.
        await browser.close();
      }
      console.log('arr: ', arr);
    })().catch(err => {
      console.error(err);
      process.exitCode = 1;
    });
    
    この方法は主にStack Overflowの以下の回答を参考にしました。
  • Crawling multiple URLs in a loop using Puppeteer
  • Looping through a set of urls in Puppeteer
  • Puppeteer - Proper way to loop through multiple URLs