puppeteerクローラー入門チュートリアル

4612 ワード

puppeteerを使って古詩文網(https://www.gushiwen.org/shiwen/)のコンテンツと音声をクロールし、mysqljsでデータベースに保存します。
コアコードは次のとおりです.
'use strict';
const puppeteer = require('puppeteer');
var request = require('request');
var fs = require('fs');
const gushiwen = require('./sql');

/**
 * Entry point: launches a browser and crawls the listing pages
 * default_0AA1.aspx through default_0AA998.aspx one after another.
 *
 * Each page is closed as soon as it has been processed and the browser is
 * closed when the loop ends (or fails), so tabs do not accumulate and the
 * process is not kept alive by a lingering browser.
 */
(async () => {
  const browser = await puppeteer.launch({
    // headless: false
  });

  try {
    for (let i = 1; i < 999; i++) {
      const url = `https://www.gushiwen.org/shiwen/default_0AA${i}.aspx`;
      // A local page handle is enough; nothing else needs to see it.
      const page = await browser.newPage();

      try {
        // Awaited on purpose: pages are crawled strictly one at a time.
        await click(page, url);
      } finally {
        await page.close();
      }
    }
  } finally {
    await browser.close();
  }
})().catch((err) => {
  // Surface the failure instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});

/**
 * Evaluates `pick` against the first descendant of `handle` matching
 * `selector`, treating a failed lookup as "not present".
 *
 * @param {object} handle - Element handle exposing `$eval(selector, fn)`.
 * @param {string} selector - CSS selector to look up below `handle`.
 * @param {Function} pick - Function evaluated against the matched element.
 * @returns {Promise<*>} The picked value, or null when `$eval` rejects
 *   (the error is logged).
 */
async function evalOptional(handle, selector, pick) {
  try {
    return await handle.$eval(selector, pick);
  } catch (err) {
    console.error(err);
    return null;
  }
}

/**
 * Loads one listing page, extracts every `.left .sons` entry, downloads its
 * audio file when one is present, and passes the entry to
 * `gushiwen.insertGushiwen`.
 *
 * @param {object} page - Puppeteer page; only `goto` and `$$` are used here.
 * @param {string} url - Address of the listing page to load.
 * @returns {Promise<void>} Resolves after every entry on the page has been
 *   handed to `gushiwen.insertGushiwen`.
 */
async function click(page, url) {
  await page.goto(url);

  // Click every speaker icon before reading the entries; presumably this is
  // what makes the site add the <audio> elements read below - confirm.
  const speakerIcons = await page.$$('img[id*="speakerimg"]');
  for (const icon of speakerIcons) {
    await icon.click();
  }

  const entries = await page.$$('.left .sons');
  for (const entry of entries) {
    const title = await entry.$eval('.sons .cont a', (el) => el.innerText);
    console.debug(title);

    // The source text is split on ':' into dynasty (first part) and author
    // (second part).
    const source = await entry.$eval('.source', (el) => el.innerText);
    const [dynasty, author] = source.split(':');

    // Drop the first 7 characters of the element id
    // (presumably a "contson" prefix - confirm against the page markup).
    const id = (await entry.$eval('.contson', (el) => el.id)).substring(7);

    const contson = await entry.$eval('.contson', (el) => el.innerHTML);

    // Tags are optional; a missing `.tag` element is stored as null.
    // NOTE(review): the second replace swaps ',' for ',' and is a no-op as
    // written; the pattern may originally have been a full-width comma - confirm.
    const rawTag = await evalOptional(entry, '.tag', (el) => el.innerText);
    const tag = rawTag == null
      ? null
      : rawTag.replace(/[\r\n]/g, '').replace(/,/g, ',');

    const scores = (await entry.$eval('.good', (el) => el.innerText)).trim();

    // Audio is optional as well; without it the stored filename stays ''.
    const audiosrc = await evalOptional(entry, 'audio', (el) => el.src);
    let filename = '';
    if (audiosrc != null) {
      // The sixth '/'-separated piece of the URL is used as the file name.
      filename = './mp3/' + audiosrc.split('/')[5];
      await downloadFile(audiosrc, filename, () => {
        console.debug(filename + ' ');
      });
    }

    gushiwen.insertGushiwen(id, title, author, contson, dynasty, filename, scores, tag);
  }
}

/**
 * Downloads `url` into `filename` unless that file already exists.
 *
 * The file is created exclusively ('wx') and the resulting descriptor is
 * handed to the write stream, so the file is opened only once and the
 * descriptor is released when the stream closes.
 *
 * @param {string} url - Address of the remote file.
 * @param {string} filename - Local path to write to.
 * @param {Function} [callback] - Invoked with no arguments after the file has
 *   been written and closed; not invoked when the download is skipped or fails.
 * @returns {Promise<boolean>} true when the file was written, false when it
 *   already existed. Rejects on any other open, write or request error.
 */
function downloadFile(url, filename, callback) {
  return new Promise((resolve, reject) => {
    fs.open(filename, 'wx', (openErr, fd) => {
      if (openErr) {
        if (openErr.code === 'EEXIST') {
          console.error(filename + ' already exists');
          resolve(false);
          return;
        }
        reject(openErr);
        return;
      }

      const stream = fs.createWriteStream(filename, { fd });
      let failure = null;

      // Record the first error and tear the stream down; the 'close' handler
      // below finishes the clean-up and settles the promise.
      const fail = (err) => {
        if (failure) {
          return;
        }
        failure = err;
        stream.destroy();
      };

      stream.on('error', fail);
      stream.on('close', () => {
        if (failure) {
          // Best effort: remove the partial file so a later run is not
          // skipped by the EEXIST check above.
          fs.unlink(filename, () => reject(failure));
          return;
        }
        if (typeof callback === 'function') {
          callback();
        }
        resolve(true);
      });

      try {
        request(url).on('error', fail).pipe(stream);
      } catch (err) {
        fail(err);
      }
    });
  });
}
mysqljsを使ったデータベースへの挿入処理は次のとおりです。
const mysql = require('mysql');

// One connection object shared by every insert issued from this module.
// No explicit connect() call is made in this block.
const connection = mysql.createConnection({
  host: 'localhost',
  user: 'root',
  password: 'root',
  database: 'nichuiniu',
});

/**
 * Inserts one record into tbl_nichuiniu_gushiwen using INSERT IGNORE.
 *
 * The arguments map one-to-one onto the columns num, title, author, content,
 * dynasty, audiourl, scores and tag. A query error is rethrown from inside
 * the query callback; on success the insertId reported by the driver is
 * logged.
 *
 * @param {*} num - Value for the `num` column.
 * @param {*} title - Value for the `title` column.
 * @param {*} author - Value for the `author` column.
 * @param {*} content - Value for the `content` column.
 * @param {*} dynasty - Value for the `dynasty` column.
 * @param {*} audiourl - Value for the `audiourl` column.
 * @param {*} scores - Value for the `scores` column.
 * @param {*} tag - Value for the `tag` column.
 * @returns {void}
 */
function insertGushiwen(num, title, author, content, dynasty, audiourl, scores, tag) {
  console.log('insert Gushiwen into tables');

  const row = { num, title, author, content, dynasty, audiourl, scores, tag };

  const onResult = (err, results, fields) => {
    if (err) {
      throw err;
    }
    console.log('The affect row is: ' + results.insertId);
  };

  connection.query('INSERT ignore INTO tbl_nichuiniu_gushiwen SET ?', row, onResult);
}

module.exports = { insertGushiwen };


//          
// connection.connect();

// var post  = {num: 2, title: 'title',author:'author',content: 'content', 
// dynasty: 'dynasty',audiourl:'audiourl',scores: 1, tag: 'title'};

// connection.query('INSERT ignore INTO tbl_nichuiniu_gushiwen SET ?', post, function (error, results, fields) {
//   if (error) throw error;
//   console.log('The solution is: ' + results.insertId);
// });

// connection.end();
GitHubアドレス:https://github.com/libp/gushiwenpuppeteer