Node.jsの定期実行クローラー:継続的にデータを取得する

9594 ワード

第一歩:nodejsプロジェクトを作成する
mkdir <プロジェクト名>
cd <プロジェクト名>
npm init -y
npm install -D eslint
npx eslint --init     # 注:いくつかの設定を選択します。popular -> airbnb -> no react -> yml の順に選び、残りはデフォルトのままにします。
次に、プロジェクトで使用するモジュールをインストールします(例:npm install request request-promise)。その他のモジュールはここでは列挙しません。
.gitignoreファイルを作成し、Gitにアップロードする必要のないファイルを記載します。記載したファイルはGitに無視されます。
自動生成された設定ファイルのうち、.eslintrc.ymlの内容を以下のように置き換えます。
# Parse sources as ECMAScript 2017 (ecmaVersion 8), the first edition that
# includes async functions.
parserOptions:
  ecmaVersion: 8
# Enable the predefined globals of each environment the code runs in.
env:
  es6: true
  node: true
  mocha: true
# Extra project-level global; `true` marks `Service` as writable.
globals:
  Service: true
# Start from ESLint's built-in recommended set; the `rules` section below
# adds to it and overrides it.
extends: 'eslint:recommended'
rules:
  indent:
    - warn
    - 2
    - SwitchCase: 1
      VariableDeclarator:
        var: 2
        let: 2
        const: 3
  linebreak-style:
    - error
    - unix
  quotes:
    - warn
    - single
  semi:
    - error
    - always
  comma-dangle:
    - warn
    - always-multiline
  no-dupe-keys: error
  no-dupe-args: error
  use-isnan: error
  valid-typeof: error
  curly: error
  default-case: error
  eqeqeq:
    - error
    - allow-null
  guard-for-in: warn
  no-else-return: warn
  no-fallthrough: error
  no-floating-decimal: warn
  no-multi-str: error
  no-octal: error
  no-octal-escape: error
  no-redeclare: error
  no-with: error
  no-void: error
  radix: error
  strict: error
  no-delete-var: error
  array-bracket-spacing:
    - error
    - never
  block-spacing: error
  brace-style:
    - error
    - 1tbs
    - allowSingleLine: true
  comma-spacing: error
  comma-style:
    - error
    - last
  computed-property-spacing: error
  camelcase: warn
  key-spacing:
    - error
    - beforeColon: false
      afterColon: true
  keyword-spacing: error
  max-params:
    - warn
    - 6
  new-cap:
    - error
    - newIsCap: true
      capIsNew: false
      properties: true
  no-array-constructor: error
  no-spaced-func: error
  no-whitespace-before-property: error
  no-trailing-spaces:
    - error
    - skipBlankLines: true
  operator-linebreak: off
  space-before-blocks:
    - error
    - always
  space-before-function-paren:
    - error
    - anonymous: never
      named: never
      asyncArrow: always
  space-in-parens:
    - error
    - never
  space-infix-ops: error
  space-unary-ops: error
  spaced-comment:
    - warn
    - always
  arrow-spacing: error
  semi-spacing: error
  constructor-super: error
  generator-star-spacing: warn
  yield-star-spacing: warn
  no-const-assign: error
  no-dupe-class-members: error
  no-this-before-super: error
  no-var: error
  no-unused-vars:
    - warn
    - vars: local
      args: none
  no-use-before-define:
    - error
    - functions: false
      classes: false
      variables: false
  prefer-arrow-callback: warn
  prefer-const: off
  prefer-rest-params: warn
  prefer-spread: warn
  prefer-template: warn
  template-curly-spacing:
    - warn
    - never
  object-curly-spacing:
    - warn
    - always
  no-multi-spaces:
    - warn
    - ignoreEOLComments: true
  valid-jsdoc: off
  no-global-assign: error
  no-unsafe-negation: error
  require-yield: off
  no-warning-comments:
    - warn
    - location: start
      terms:
        - todo
        - fixme
        - xxx
        - hack
        - review
 
 
 
index.jsのクローラーコード
'use strict';
const rp = require('request-promise');
const log = require('xxd-log');
const bluebird = require('bluebird');

// HTML parser with a jQuery-like selector API
const cheerio = require('cheerio');
// Hashing (used to derive SHA-1 based file names)
const crypto = require('crypto');

// File system access
const fs = require('fs');
// Upload settings (baseUrl and ticket) loaded from a local JSON file
const ticket = require('./ticket.json');
// Cron-style job scheduler
const schedule = require('node-schedule');

// HTTP client used for crawling. All calls share one cookie jar, request
// gzip-compressed responses, and send a desktop Chrome user-agent string.
const request = rp.defaults({
  jar: rp.jar(),
  gzip: true,
  headers: {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
  },
});
// Pre-configured POST client used to upload crawled articles. Every call
// targets `/news/import` relative to `ticket.baseUrl`, sends the ticket from
// ticket.json in the `x-xxd-ticket` header, and has `json: true` set so the
// response body is handled as JSON.
const post = rp.defaults({
  baseUrl: ticket.baseUrl,
  method: 'post',
  uri: '/news/import',
  headers: {
    'x-xxd-ticket': ticket.ticket,
  },
  json: true,
});

// Hash helper: returns the SHA-1 digest of `str` as a lowercase hex string.
// Used to turn article URLs into file-system-safe file names.
function sha1(str) {
  const hasher = crypto.createHash('sha1');
  hasher.update(str);
  return hasher.digest('hex');
}

// Return the HTML of `url`, backed by an on-disk cache in ./tmpRecord.
//
// The cache file is named after the SHA-1 of the URL. On a cache hit the file
// content is returned without any network access; on a miss the page is
// downloaded with `request`, written to the cache, and returned.
//
// Returns the HTML string, or `null` when the URL is unusable or the
// download/cache write fails (the failure is logged, never thrown), so callers
// can simply skip `null` results.
async function getTmpRecord(url) {
  // A link without an href yields `undefined`. Hashing that would throw a
  // TypeError outside the try/catch below and reject the caller's whole batch,
  // so treat it like any other failed fetch.
  if (typeof url !== 'string' || url === '') {
    log.error(`getTmpRecord: skipped invalid url: ${String(url)}`);
    return null;
  }

  const sha1Url = sha1(url);
  const fileUrl = `${__dirname}/tmpRecord/${sha1Url}.txt`;
  if (fs.existsSync(fileUrl)) {
    return fs.readFileSync(fileUrl, 'utf-8');
  }

  try {
    const html = await request(url);
    fs.writeFileSync(fileUrl, html, 'utf-8');
    return html;
  } catch (err) {
    // Error messages from the HTTP client can embed the whole response body;
    // keep the log line short.
    log.error(err.message.slice(0, 200));
    return null;
  }
}

// Pause helper: returns a Promise that resolves (with no value) after
// `milliseconds` ms, defaulting to 1000, so callers can `await sleep(n)`.
function sleep(milliseconds = 1000) {
  return new Promise((done) => setTimeout(done, milliseconds));
}


// Run one complete crawl-and-upload cycle.
//
//   1. For every configured section, fetch its first 10 list pages and, for
//      each `.news-title` entry, load the article (through the ./tmpRecord
//      cache), strip images/links/inline styling, and collect
//      { section, url, title, time, content }.
//   2. Delete files in ./tmpRecord and ./successRecord that do not correspond
//      to an article collected in this run.
//   3. Upload every collected article that has no marker file in
//      ./successRecord, one at a time with a 500 ms pause, and create the
//      marker once the server accepts it.
//
// Errors are logged and never thrown: a failing list page skips that page, a
// failing article skips that article, a failing upload skips that upload.
// The returned Promise resolves only after all uploads have been attempted.
async function main() {
  const dataList = [];
  const sections = [
    {
      sectionName: '    ',
      section: 'tiaojian',
    },
    {
      sectionName: '    ',
      section: 'yuanxiaozhuanye/guojia',
    },
    {
      sectionName: '    ',
      section: 'yuanxiaozhuanye/jiaoyutixi',
    },
    {
      sectionName: '    ',
      section: 'yuanxiaozhuanye/zhuanyezixun',
    },
    {
      sectionName: '    ',
      section: 'yuanxiaozhuanye/remenzhuanye',
    },
  ];
  const pagesPerSection = 10;

  // Iterate over the list itself so adding or removing a section cannot
  // desynchronise a hard-coded loop bound.
  for (const { section: sectionPath, sectionName } of sections) {
    for (let page = 0; page < pagesPerSection; page += 1) {
      try {
        // The first list page has no file name; later ones are 2.html, 3.html, ...
        const url = `https://www.liuxue86.com/${sectionPath}/${(page === 0) ? '' : `${page + 1}.html`}`;
        log.trace('    -->', url);
        const html = await request(url);
        const $ = cheerio.load(html, { decodeEntities: false });
        // bluebird.map bounds how many articles are fetched concurrently.
        await bluebird.map($('.news-title').get(), async (element) => {
          const contentUrl = $('a', element).attr('href');
          // Isolate failures per article: one malformed article must not
          // reject the whole batch and discard the rest of the page.
          try {
            const contentHtml = await getTmpRecord(contentUrl);
            if (contentHtml == null) {
              // Inside a map callback `return` plays the role of `continue`.
              return;
            }
            const dollar = cheerio.load(`${contentHtml}`, { decodeEntities: false });
            const data = {};
            data.section = sectionName;
            data.url = contentUrl;
            data.title = dollar('h1').text();
            log.trace('    --->', data.title);
            data.time = dollar('.conter_main_one_nav').children('p').text();
            // Clean the article body: drop images, unwrap links (keep their
            // inner HTML), and remove inline styles and class names.
            dollar('#article-content img').remove();
            dollar('#article-content a').replaceWith(function() { return dollar(this).html(); });
            dollar('#article-content [style]').removeAttr('style');
            dollar('#article-content [class]').removeAttr('class');
            // Drop the marker paragraph and everything after it.
            dollar('p:contains(    :)').nextAll().remove();
            dollar('p:contains(    :)').remove();
            const content = dollar('#article-content').html();
            if (content == null) {
              // .html() returns null when the selector matches nothing.
              log.error(`no #article-content found, skipped: ${contentUrl}`);
              return;
            }
            data.content = content.trim().replace(/     /g, '   ');
            dataList.push(data);
          } catch (err) {
            log.error(err.stack);
          }
        }, { concurrency: 4 });
      } catch (err) {
        log.error(err.stack);
      }
    }
  }
  log.trace('      ');

  // File names (sha1(url).txt) of every article seen in this run; a Set gives
  // constant-time membership checks during the cleanup below.
  // NOTE(review): an article whose list page failed to load in this run is
  // treated as stale and its cache/marker files are removed - confirm that
  // this is intended.
  const dataListUrlSet = new Set(dataList.map((data) => `${sha1(data.url)}.txt`));
  const tmpRecordList = fs.readdirSync(`${__dirname}/tmpRecord`)
    .filter((x) => !dataListUrlSet.has(x));
  const successRecordList = fs.readdirSync(`${__dirname}/successRecord`)
    .filter((x) => !dataListUrlSet.has(x));
  // Remove the stale files.
  tmpRecordList.forEach((element) => {
    fs.unlinkSync(`${__dirname}/tmpRecord/${element}`);
  });
  successRecordList.forEach((element) => {
    fs.unlinkSync(`${__dirname}/successRecord/${element}`);
  });

  // Only upload articles that have no success marker yet.
  const sendList = dataList.filter((x) => !fs.existsSync(`${__dirname}/successRecord/${sha1(x.url)}.txt`));

  // Await the upload pipeline so main() only resolves once it has finished.
  try {
    await bluebird.map(sendList, async (item) => {
      try {
        // Throttle uploads.
        await sleep(500);
        // Send the article as multipart form data.
        const res = await post({
          formData: {
            title: item.title,
            content: item.content,
            source: `liuxue86-    -${item.section}`,
          },
        });
        // A non-zero code is an error unless the message is the one tolerated
        // reply compared against here.
        if (res.code !== 0 && res.msg !== '      ') {
          throw new Error(res.msg);
        }
        // Create an empty marker file. The data must be a string or buffer:
        // passing null throws ERR_INVALID_ARG_TYPE on current Node versions,
        // which would leave the article unmarked and re-uploaded on every run.
        fs.writeFileSync(`${__dirname}/successRecord/${sha1(item.url)}.txt`, '');
        log.trace(item.title, '    ');
      } catch (err) {
        log.error(err.message);
      }
    }, { concurrency: 1 });
  } catch (err) {
    log.fatal(err.stack);
  }
}
// Make sure both working directories exist next to this script before any
// crawl runs: tmpRecord holds cached article HTML, successRecord holds the
// markers of uploaded articles.
for (const dirName of ['tmpRecord', 'successRecord']) {
  const dirPath = `${__dirname}/${dirName}`;
  if (!fs.existsSync(dirPath)) {
    fs.mkdirSync(dirPath);
  }
}

// Register the recurring crawl. The six-field cron expression
// (second minute hour day-of-month month day-of-week) '0 0 4 * * *' fires at
// 04:00:00 every day. Any error escaping main() is logged as fatal so a
// failed run cannot leave an unhandled rejection behind.
schedule.scheduleJob('0 0 4 * * *', function runScheduledCrawl() {
  main().catch((error) => {
    log.fatal(error.stack);
  });
});

// main().catch((err) => {
//   log.fatal(err.stack);
// });