あるサイトのURL(http://www.***.com)が分かっているとき、そのサイトに掲載されているすべてのニュースを取得します。

11301 ワード

app.jsファイル:
const fs = require('fs');
const request = require('superagent');
const cheerio = require('cheerio');
const mapLimit = require('async/mapLimit');
const url = 'http://www.hiynn.com/hy-zh';
let counter = 0

// const fetch = require('whatwg-fetch');

/** Matches every `<img ...>` tag in an HTML string. */
const IMG_TAG_PATTERN = /<img.*?(?:>|\/>)/gi;

/** Captures the value of a `src` attribute in group 1 (quotes optional). */
const IMG_SRC_PATTERN = /src=['"]?([^'"]*)['"]?/i;

/** Image path stored when an article contains no usable `<img src>`. */
const DEFAULT_IMG_SRC = './images/news161222/3.png';

/** Candidate values for the `type` field; one is picked at random per article. */
const NEWS_TYPES = [1, 2];

/** Maximum number of article requests in flight at the same time. */
const MAX_CONCURRENT_REQUESTS = 10;

/** Pause (ms) before an article is reported as done, as in the original script. */
const PER_ARTICLE_DELAY_MS = 1000;

/**
 * Downloads the news index page and collects the article links found in
 * `h3 > a` elements.
 *
 * Anchors without an `href`, or whose `href` contains `#`, are skipped.
 *
 * @returns {Promise<string[]>} the collected `href` values, in page order;
 *     rejects with the request error if the download fails.
 */
function fetchNewsLinks() {
    return new Promise((resolve, reject) => {
        request.get(`${url}/news.html`).end((err, res) => {
            if (err) {
                reject(err);
                return;
            }

            const $ = cheerio.load(res.text);
            const links = [];
            $('h3 > a').each((index, item) => {
                const href = $(item).attr('href');

                // Test for a missing href first: calling a string method on
                // `undefined` would throw.
                if (!href || href.includes('#')) {
                    return;
                }
                links.push(href);
            });

            resolve(links);
        });
    });
}

/**
 * Builds the `YY-MM-DD` style date from characters 5..10 of the link and
 * prefixes it with the century.
 *
 * NOTE(review): assumes links look like `newsYYMMDD...` (the default image
 * path `news161222` suggests so) - confirm against the real link format.
 *
 * @param {string} link - article link taken from the index page.
 * @returns {string} date string such as `2016-12-22`.
 */
function parseCreateTime(link) {
    const digits = link.slice(5, 11);
    const parts = [digits.slice(0, 2), digits.slice(2, 4), digits.slice(4, 6)];
    return `20${parts.join('-')}`;
}

/**
 * Returns the `src` of the last `<img>` tag in the given HTML, or the
 * default image path when there is no image or no `src` could be read.
 *
 * The original loop overwrote its result on every iteration, so the last
 * image won; that choice is preserved here.
 *
 * @param {string} html - HTML fragment to scan.
 * @returns {string} image source path.
 */
function extractImageSrc(html) {
    const imgTags = html.match(IMG_TAG_PATTERN);
    if (!imgTags) {
        return DEFAULT_IMG_SRC;
    }

    const srcMatch = imgTags[imgTags.length - 1].match(IMG_SRC_PATTERN);
    return srcMatch ? srcMatch[1] : DEFAULT_IMG_SRC;
}

/**
 * Reads the article title from the parsed page.
 *
 * NOTE(review): the original regular expression lost its tag name during
 * extraction, so the exact element is unknown. This assumes the title is the
 * first heading inside `#tab1` - confirm against the page markup.
 *
 * @param {Function} $ - cheerio instance loaded with the article page.
 * @returns {string} trimmed title text, or an empty string if none is found.
 */
function extractTitle($) {
    return $('#tab1').find('h1, h2, h3, h4, h5, h6').first().text().trim();
}

/**
 * Converts one downloaded article page into a news record.
 *
 * @param {string} link - article link taken from the index page.
 * @param {string} pageHtml - raw HTML of the article page.
 * @param {number} index - sequence number stored in the record's `counter`.
 * @returns {Object} the news record that is later serialized to links.js.
 */
function buildNewsItem(link, pageHtml, index) {
    const $ = cheerio.load(pageHtml, {
        xmlMode: true,
        decodeEntities: false,
        normalizeWhitespace: true,
        withDomLvl1: false
    });
    const contentHtml = $('#tab1').html();

    return {
        counter: index,
        author: ' ',
        content: '',
        creat_time: parseCreateTime(link),
        link: link,
        content_html: contentHtml,
        deleted_flag: 0,
        important: 2,
        title: extractTitle($),
        img: extractImageSrc(contentHtml || ''),
        // Index with floor so the result is always an element of NEWS_TYPES
        // (ceil on a random number can produce 0).
        type: NEWS_TYPES[Math.floor(Math.random() * NEWS_TYPES.length)]
    };
}

/**
 * Downloads a single article and hands the resulting record to `cb`.
 *
 * Increments the module-level `counter` and logs progress as
 * `<done>/<total>`. Errors are passed to `cb` instead of being thrown, so
 * they reach the final `mapLimit` callback.
 *
 * @param {string} link - article link taken from the index page.
 * @param {number} total - total number of links, used for the progress log.
 * @param {Function} cb - node-style callback `(err, newsItem)`.
 */
function fetchNewsItem(link, total, cb) {
    request.get(`${url}/${link}`).end((err, res) => {
        if (err) {
            cb(err);
            return;
        }

        let item;
        try {
            item = buildNewsItem(link, res.text, counter);
        } catch (parseErr) {
            cb(parseErr);
            return;
        }

        counter++;
        console.log(`${counter}/${total}`);

        // The callback must be invoked exactly once per link; the delay
        // keeps the original one-second pause before freeing the slot.
        setTimeout(() => cb(null, item), PER_ARTICLE_DELAY_MS);
    });
}

/**
 * Crawls the news index, downloads every linked article (at most
 * MAX_CONCURRENT_REQUESTS at a time) and writes all records as a JSON array
 * to `./links.js`.
 *
 * @returns {Promise<Object[]>} resolves with the news records once the file
 *     has been written; rejects on any request, parse or write error.
 */
function getNews() {
    return fetchNewsLinks()
        .then(links => new Promise((resolve, reject) => {
            // mapLimit(collection, limit, iteratee(item, cb), done(err, results))
            mapLimit(
                links,
                MAX_CONCURRENT_REQUESTS,
                (link, cb) => fetchNewsItem(link, links.length, cb),
                (err, news) => {
                    if (err) {
                        reject(err);
                        return;
                    }
                    resolve(news);
                }
            );
        }))
        .then(news => new Promise((resolve, reject) => {
            fs.writeFile('./links.js', JSON.stringify(news), 'utf8', (err) => {
                if (err) {
                    reject(err);
                    return;
                }
                console.log(' ');
                resolve(news);
            });
        }));
}

getNews().catch((err) => {
    // Report the failure and keep a non-zero exit status, as the original
    // uncaught throw would have produced.
    console.error(err);
    process.exitCode = 1;
});
package.json
{
  "name": "Crawler",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1",
    "start": "node app.js"
  },
  "keywords": [],
  "author": "",
  "license": "ISC",
  "dependencies": {
    "async": "^2.6.0",
    "cheerio": "^1.0.0-rc.2",
    "superagent": "^3.8.1",
    "whatwg-fetch": "^2.0.3"
  }
}
最後にlinks.jsファイルを生成します。ファイルの内容はすべてのニュースを含む配列です。