Node.jsの定期実行クローラー：継続的にデータを取得し続ける
9594 ワード
第一歩:nodejsプロジェクトを作成する
mkdir <project-name>
cd <project-name>
npm init -y
npm install -D eslint
npx eslint --init # 注: いくつかの設定を選択します。popular -> airbnb -> no react -> yml の順に選び、残りはデフォルトのままにします。
次に、プロジェクトで使用するモジュールをインストールします（例: npm install request request-promise など。ここではすべてを列挙しません）。
.gitignoreファイルを作成し、gitにアップロードする必要のないファイルを記載します。記載したファイルはgitに無視されます。
自動生成された設定ファイル .eslintrc.yml の内容を、以下の内容に置き換えます。
index.js のクローラーコード
mkdir <project-name>
cd <project-name>
npm init -y
npm install -D eslint
npx eslint --init # 注: いくつかの設定を選択します。popular -> airbnb -> no react -> yml の順に選び、残りはデフォルトのままにします。
次に、プロジェクトで使用するモジュールをインストールします（例: npm install request request-promise など。ここではすべてを列挙しません）。
.gitignoreファイルを作成し、gitにアップロードする必要のないファイルを記載します。記載したファイルはgitに無視されます。
自動生成された設定ファイル .eslintrc.yml の内容を、以下の内容に置き換えます。
# ESLint configuration (.eslintrc.yml).
# Nesting matters in YAML: every rule must sit under `rules`, and every option
# object must sit under the list item it belongs to.

# Parse ES2017 syntax (async/await).
parserOptions:
  ecmaVersion: 8

# Predefined global variables for each runtime environment.
env:
  es6: true
  node: true
  mocha: true

# Extra project-specific globals (true = may be reassigned).
globals:
  Service: true

extends: 'eslint:recommended'

rules:
  # --- Basic formatting ---
  indent:
    - warn
    - 2
    - SwitchCase: 1
      VariableDeclarator:
        var: 2
        let: 2
        const: 3
  linebreak-style:
    - error
    - unix
  quotes:
    - warn
    - single
  semi:
    - error
    - always
  comma-dangle:
    - warn
    - always-multiline

  # --- Possible errors ---
  no-dupe-keys: error
  no-dupe-args: error
  use-isnan: error
  valid-typeof: error

  # --- Best practices ---
  curly: error
  default-case: error
  eqeqeq:
    - error
    - allow-null
  guard-for-in: warn
  no-else-return: warn
  no-fallthrough: error
  no-floating-decimal: warn
  no-multi-str: error
  no-octal: error
  no-octal-escape: error
  no-redeclare: error
  no-with: error
  no-void: error
  radix: error
  strict: error
  no-delete-var: error

  # --- Stylistic rules ---
  array-bracket-spacing:
    - error
    - never
  block-spacing: error
  brace-style:
    - error
    - 1tbs
    - allowSingleLine: true
  comma-spacing: error
  comma-style:
    - error
    - last
  computed-property-spacing: error
  camelcase: warn
  key-spacing:
    - error
    - beforeColon: false
      afterColon: true
  keyword-spacing: error
  max-params:
    - warn
    - 6
  new-cap:
    - error
    - newIsCap: true
      capIsNew: false
      properties: true
  no-array-constructor: error
  no-spaced-func: error
  no-whitespace-before-property: error
  no-trailing-spaces:
    - error
    - skipBlankLines: true
  operator-linebreak: off
  space-before-blocks:
    - error
    - always
  space-before-function-paren:
    - error
    - anonymous: never
      named: never
      asyncArrow: always
  space-in-parens:
    - error
    - never
  space-infix-ops: error
  space-unary-ops: error
  spaced-comment:
    - warn
    - always

  # --- ES6+ ---
  arrow-spacing: error
  semi-spacing: error
  constructor-super: error
  generator-star-spacing: warn
  yield-star-spacing: warn
  no-const-assign: error
  no-dupe-class-members: error
  no-this-before-super: error
  no-var: error
  no-unused-vars:
    - warn
    - vars: local
      args: none
  no-use-before-define:
    - error
    - functions: false
      classes: false
      variables: false
  prefer-arrow-callback: warn
  prefer-const: off
  prefer-rest-params: warn
  prefer-spread: warn
  prefer-template: warn
  template-curly-spacing:
    - warn
    - never
  object-curly-spacing:
    - warn
    - always
  no-multi-spaces:
    - warn
    - ignoreEOLComments: true
  valid-jsdoc: off
  no-global-assign: error
  no-unsafe-negation: error
  require-yield: off

  # Warn on comments that start with one of these marker words.
  no-warning-comments:
    - warn
    - location: start
      terms:
        - todo
        - fixme
        - xxx
        - hack
        - review
index.js のクローラーコード
'use strict';
// Promise-based HTTP client; the crawl and upload clients below are built from it.
const rp = require('request-promise');
// Logger; the trace, error and fatal levels are used in this file.
const log = require('xxd-log');
// Promise utilities; used for `map` with a concurrency limit.
const bluebird = require('bluebird');
// HTML parser with a jQuery-like selector API.
const cheerio = require('cheerio');
// Node's crypto module; used to SHA-1 hash article URLs.
const crypto = require('crypto');
// File system access for the tmpRecord / successRecord directories.
const fs = require('fs');
// Local JSON config; `baseUrl` and `ticket` are read from it.
const ticket = require('./ticket.json');
// Job scheduler; used to run the crawl on a cron-style schedule.
const schedule = require('node-schedule');
// Headers sent with every crawl request: a desktop Chrome User-Agent string.
const crawlHeaders = {
  'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
};

// HTTP client used for crawling: gzip enabled, its own cookie jar, and the
// browser-like headers above.
const request = rp.defaults({
  headers: crawlHeaders,
  gzip: true,
  jar: rp.jar(),
});

// Headers for the import endpoint; the ticket value comes from ticket.json.
const importHeaders = {
  'x-xxd-ticket': ticket.ticket,
};

// HTTP client used for uploading: always POSTs to `/news/import` under the
// configured base URL and parses the response body as JSON.
const post = rp.defaults({
  baseUrl: ticket.baseUrl,
  uri: '/news/import',
  method: 'post',
  json: true,
  headers: importHeaders,
});
/**
 * Compute the SHA-1 digest of the given text.
 *
 * @param {string} str - Text to hash.
 * @returns {string} The digest encoded as a hexadecimal string.
 */
function sha1(str) {
  const hasher = crypto.createHash('sha1');
  hasher.update(str);
  return hasher.digest('hex');
}
/**
 * Return the HTML of `url`, using the `tmpRecord` directory as an on-disk cache.
 *
 * The cache file is named after the SHA-1 of the URL. On a cache hit the file
 * content is returned without any network access; on a miss the page is
 * downloaded with `request`, written to the cache, and returned.
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {Promise<string|null>} The page HTML, or `null` when the URL is
 *   unusable or the download / cache write fails (the failure is logged).
 */
async function getTmpRecord(url) {
  // A link without an href yields `undefined`; hashing that would throw
  // before the try block and reject instead of returning null.
  if (typeof url !== 'string' || url === '') {
    log.error(`getTmpRecord: invalid url: ${String(url)}`);
    return null;
  }

  const cachePath = `${__dirname}/tmpRecord/${sha1(url)}.txt`;
  if (fs.existsSync(cachePath)) {
    return fs.readFileSync(cachePath, 'utf-8');
  }

  try {
    const html = await request(url);
    fs.writeFileSync(cachePath, html, 'utf-8');
    return html;
  } catch (err) {
    // Truncate: HTTP errors can carry the whole response body in the message.
    const message = (err && err.message) ? err.message : String(err);
    log.error(message.slice(0, 200));
    return null;
  }
}
/**
 * Pause for a while; meant to be used as `await sleep(ms)`.
 *
 * @param {number} [milliseconds=1000] - Delay before the promise resolves.
 * @returns {Promise<undefined>} Resolves (never rejects) once the timer fires.
 */
function sleep(milliseconds = 1000) {
  return new Promise((done) => {
    setTimeout(() => done(), milliseconds);
  });
}
/**
 * Run one full crawl-and-upload cycle.
 *
 * 1. For every configured section, fetch its first 10 listing pages and, for
 *    each `.news-title` entry, load the article (via the tmpRecord cache),
 *    clean up its `#article-content` markup and collect it.
 * 2. Delete files in `tmpRecord` and `successRecord` whose name does not
 *    correspond to an article collected in this run.
 * 3. POST every collected article that has no `successRecord` marker yet and
 *    write the marker once the server accepts it.
 *
 * Failures of a single listing page or a single upload are logged and do not
 * stop the cycle.
 *
 * @returns {Promise<void>} Resolves after all uploads have been attempted.
 */
async function main() {
  const dataList = [];
  const sections = [
    {
      sectionName: ' ',
      section: 'tiaojian',
    },
    {
      sectionName: ' ',
      section: 'yuanxiaozhuanye/guojia',
    },
    {
      sectionName: ' ',
      section: 'yuanxiaozhuanye/jiaoyutixi',
    },
    {
      sectionName: ' ',
      section: 'yuanxiaozhuanye/zhuanyezixun',
    },
    {
      sectionName: ' ',
      section: 'yuanxiaozhuanye/remenzhuanye',
    },
  ];
  const pagesPerSection = 10;

  // Iterate the list itself so adding or removing a section needs no other change.
  for (const { sectionName, section } of sections) {
    for (let page = 0; page < pagesPerSection; page += 1) {
      try {
        // The first listing page is the section root; later ones are "2.html", "3.html", ...
        const pageSuffix = (page === 0) ? '' : `${page + 1}.html`;
        const url = `https://www.liuxue86.com/${section}/${pageSuffix}`;
        log.trace(' -->', url);
        const html = await request(url);
        const $ = cheerio.load(html, { decodeEntities: false });

        // Load the articles of this listing page, at most 4 at a time.
        await bluebird.map($('.news-title').get(), async (element) => {
          const contentUrl = $('a', element).attr('href');
          if (!contentUrl) {
            // Entry without a link: nothing to fetch.
            return;
          }
          const contentHtml = await getTmpRecord(contentUrl);
          if (contentHtml == null) {
            // Inside a map callback, `return` skips this item (like `continue`).
            return;
          }

          const dollar = cheerio.load(`${contentHtml}`, { decodeEntities: false });
          const data = {};
          data.section = sectionName;
          data.url = contentUrl;
          data.title = dollar('h1').text();
          log.trace(' --->', data.title);
          data.time = dollar('.conter_main_one_nav').children('p').text();

          // Strip images, unwrap links (keeping their inner HTML) and drop
          // inline styling from the article body.
          dollar('#article-content img').remove();
          dollar('#article-content a').replaceWith(function() { return dollar(this).html(); });
          dollar('#article-content [style]').removeAttr('style');
          dollar('#article-content [class]').removeAttr('class');
          // Remove the matching marker paragraph and everything after it.
          dollar('p:contains( :)').nextAll().remove();
          dollar('p:contains( :)').remove();

          const articleHtml = dollar('#article-content').html();
          if (articleHtml == null) {
            // No article container: skip this item instead of throwing, which
            // would abort the remaining articles of the page.
            log.error(`no #article-content found in ${contentUrl}`);
            return;
          }
          data.content = articleHtml.trim();
          // NOTE(review): as written this replaces a space with a space; the
          // pattern may originally have targeted `&nbsp;` - confirm.
          data.content = data.content.replace(/ /g, ' ');
          dataList.push(data);
        }, { concurrency: 4 });
      } catch (err) {
        log.error(err.stack);
      }
    }
  }
  log.trace(' ');

  // File names (sha1(url) + ".txt") of every article seen in this run; a Set
  // gives constant-time membership checks while pruning.
  const dataListUrlSet = new Set(dataList.map((data) => `${sha1(data.url)}.txt`));

  // Prune cache files and success markers of articles no longer listed.
  // NOTE(review): a run that collects nothing (e.g. site unreachable) removes
  // every record - confirm this is intended.
  for (const dirName of ['tmpRecord', 'successRecord']) {
    const dirPath = `${__dirname}/${dirName}`;
    fs.readdirSync(dirPath)
      .filter((fileName) => !dataListUrlSet.has(fileName))
      .forEach((fileName) => {
        fs.unlinkSync(`${dirPath}/${fileName}`);
      });
  }

  // Only articles without a success marker still need to be uploaded.
  const sendList = dataList.filter((x) => !fs.existsSync(`${__dirname}/successRecord/${sha1(x.url)}.txt`));

  try {
    // Awaited so that main() only resolves once every upload was attempted.
    await bluebird.map(sendList, async (item) => {
      try {
        // Throttle: wait half a second before each upload.
        await sleep(500);
        const res = await post({
          formData: {
            title: item.title,
            content: item.content,
            source: `liuxue86- -${item.section}`,
          },
        });
        // A non-zero code is tolerated only when it carries this exact message.
        if (res.code !== 0 && res.msg !== ' ') {
          throw new Error(res.msg);
        }
        // Empty marker file: only its existence matters. Passing `null` as
        // data throws on current Node versions, so write an empty string.
        fs.writeFileSync(`${__dirname}/successRecord/${sha1(item.url)}.txt`, '');
        log.trace(item.title, ' ');
      } catch (err) {
        log.error(err.message);
      }
    }, { concurrency: 1 });
  } catch (err) {
    log.fatal(err.stack);
  }
}
// Make sure both record directories exist before the first run.
['tmpRecord', 'successRecord'].forEach((dirName) => {
  const dirPath = `${__dirname}/${dirName}`;
  if (!fs.existsSync(dirPath)) {
    fs.mkdirSync(dirPath);
  }
});

// Any error escaping main() is reported at fatal level with its stack.
const reportFatal = (err) => {
  log.fatal(err.stack);
};

// Cron-style spec with six fields (the first one is seconds): fire every day
// at 04:00:00 and start a crawl cycle.
schedule.scheduleJob('0 0 4 * * *', () => {
  main().catch(reportFatal);
});

// For a one-off manual run, call main() directly instead of scheduling it:
// main().catch((err) => {
//   log.fatal(err.stack);
// });