Node.jsで書いたWebページクローラーの例(リンク切れ率の計算)

30363 ワード

仕事で必要になったので、Node.jsで簡単なクローラーの例を書きました。これまでNode.jsを使ったことがなく、環境構築も含めて5日ほどかかりました。粗末な出来ではありますが、後の自分のためにここに残しておきます。
    全体の要件は、有効なURLを与えると、そのページに含まれる無効なリンクの割合(リンク切れ率)を返すことです。
    1つ目のファイル:リンク切れ率を計算する urlSpider.js
  1 /*================================================

  2 @author MissUU

  3   4 

  5 1.       

  6 2.       <a>

  7 3.      href   ,     “   ,  http      (javascript    )

  8 4.         URL  

  9 ================================================*/

 10 var http = require('http');

 11 var async = require('async');

 12 var dbHandle = require('./dbHandle.js');

 13 

 14 //   

 15 var runUrlSpider = function(obj, callback){

 16     //10s timeout

 17     var request_timer = setTimeout(function() {

 18                         req.abort();

 19                         console.log('Request Timeout.');

 20                         }, 10000);

 21     

 22     var urlBadLink = new UrlBadLink();

 23     var html='';   

 24     var req = http.get(obj.url, function(res) {

 25 

 26         clearTimeout(request_timer);

 27         

 28         res.setEncoding('utf8');

 29         res.on('data', function (chunk) {

 30         html += chunk;

 31         }).on('end', function(){

 32            console.log('*******          ******');

 33            console.log(new Date().toLocaleString());

 34            console.log(obj.url);

 35            urlBadLink.host = obj.url;

 36            urlBadLink.id = obj.id;

 37            matchURL(html, urlBadLink, function(){

 38                callback();

 39            });

 40            });

 41      });

 42 

 43      req.on('error', function(e) {

 44          console.log('problem with request: ' + e.message);

 45          callback();

 46      });

 47 }

 48 

 49 //this is the entrance of code

 50 var main = function(){

 51      var urlArray = dbHandle.showUrls(1, function(result){

 54      async.eachSeries(result, runUrlSpider, function(err){

 55          console.log('******this is the end, haha*******');

 56     });

 57    });

 58   //  console.log(urlArray);

 59     

 60 };

 61 

 62 main();

 63 

 64 /*

 65 *       get  

 66 *

 67 * @param {string} content       

 68 * @param {string} host    

 69 */

 70 function matchURL(content, urlBadLink, callend){

 71    var host = urlBadLink.host;

 72    var anchor = /<a\s[^>]*>/g;

 73    var matches = content.match(anchor);

 74    var badLink = 0;

 75    var flag = 0;

 76    var HttpGet = function(url,callback){

 77         //10s timeout

 78        var request_timer = setTimeout(function() {

 79                            req.abort();

 80                            console.log('Request Timeout.');

 81                            }, 10000);

 82     

 83        var req = http.get(url, function(res) {

 84             clearTimeout(request_timer);

 85            

 86             res.on('data', function () {         

 87             }).on('end', function(){

 88                console.log(++flag + ": " + url + ' response status: ' + res.statusCode);

 89           

 90                if(!(res.statusCode >= 200 && res.statusCode < 400)){

 91                console.log('-----------------------');

 92                badLink++;                  

 93                }

 94                

 95                callback();

 96               });         

 97              });

 98             req.on('error', function(err){

 99                console.log(++flag + ": " + 'problem with request: ' + err.message);

100                badLink++;  

101                callback();

102            });

103       };

104 

105    var urls = filterUrl(matches,host);

106    

107    if(urls !== null){

108       var totalLink = urls.length;

109    //console.log(urls); 

110       async.eachSeries(urls, HttpGet, function(err){

111      // var urlBadLink = new UrlBadLink(host,totalLink, badLink);

112      // console.log("     : " + urlBadLink.badCounts);

113      // console.log("    : " + urlBadLink.getRate());  

114            urlBadLink.total = totalLink;

115            urlBadLink.badCounts = badLink;

116       //data store puts here

117            dbHandle.updateBadLink(urlBadLink);            

118            callend();    

119    });

120   }else{

121         console.log('no links found');

122         urlBadLink.total = 10;

123         urlBadLink.badCounts = 0;

124         dbHandle.updateBadLink(urlBadLink);

125         callend();

126   }

127 }

128 

/**
 * Extracts the href value from an anchor tag and turns it into an address
 * that can be requested.
 *
 * - Values with an explicit scheme (http:, ftp:, javascript:, mailto:, ...)
 *   are returned unchanged; IsURL decides later whether they are usable.
 * - Protocol-relative values ("//example.com/x") get the scheme of `host`,
 *   or "http:" when `host` has none.
 * - Everything else is appended to `host` with exactly one "/" in between.
 *   NOTE(review): this treats `host` as the site root; relative links on a
 *   page that lives in a sub-directory are not resolved against that
 *   directory - confirm that callers only pass root URLs.
 *
 * @param {string} strUrl - Text of one tag, e.g. '<a href="/news">'.
 * @param {string} host - Address of the page the tag was found on.
 * @returns {?string} The formatted address, or null when the tag has no href.
 */
function URLFommat(strUrl,host){
    // Double-quoted, single-quoted or unquoted attribute value.
    var urlPatten = /(?:^|[\s"'])href\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/i;
    var temp = urlPatten.exec(strUrl);

    if(temp === null){
        return null;
    }

    var raw = temp[1];
    if(raw === undefined){
        raw = temp[2];
    }
    if(raw === undefined){
        raw = temp[3];
    }
    var url = raw.trim();

    // Already absolute, or not a web address at all: hand back untouched.
    if(/^[a-z][a-z0-9+.\-]*:/i.test(url)){
        return url;
    }

    // Protocol-relative link: borrow the scheme of the page.
    if(url.slice(0, 2) === '//'){
        var scheme = /^([a-z][a-z0-9+.\-]*:)\/\//i.exec(host);
        return (scheme ? scheme[1] : 'http:') + url;
    }

    // Join host and path with a single slash, whichever side supplied one.
    var base = String(host).replace(/\/+$/, '');
    var path = url.replace(/^\/+/, '');
    return base + '/' + path;
}

153 

154 176 //test URLFommat

177  //var test = "http://baidu.com";

178 // var test1 = " \"http://baidu.com";

179  //var test2 = "/wenhao";

180 //console.log(URLFommat(test,"www.sina.com.cn"));

181  //console.log(URLFommat(test1,"www.sina.com.cn"));

182  //console.log(URLFommat(test2,"www.sina.com.cn"));

183 

184 

/**
 * Tells whether a string looks like a web address this crawler can request.
 *
 * Accepts an optional "http://" or "ftp://" prefix, a dotted host name
 * ending in a known TLD, a two-letter TLD or a number up to 255, and an
 * optional path. "https://" is intentionally not accepted: links are fetched
 * with the plain `http` module only.
 *
 * @param {?string} strUrl - Candidate address; null/undefined yield false.
 * @returns {boolean} true when the whole string matches the pattern.
 */
function IsURL(strUrl) {
    if(strUrl == null){
        return false;
    }

    // The scheme group is spelled "http" (not "http?"), so a mistyped
    // "htt://" is no longer treated as valid.
    var regular = /^\b(((http|ftp):\/\/)?[-a-z0-9]+(\.[-a-z0-9]+)*\.(?:com|edu|gov|int|mil|net|org|biz|info|name|museum|asia|coop|aero|[a-z][a-z]|((25[0-5])|(2[0-4]\d)|(1\d\d)|([1-9]\d)|\d))\b(\/[-a-z0-9_:\@&?=+,.!\/~%\$]*)?)$/i;
    return regular.test(strUrl);
}

198 

199 

/**
 * Result record for one crawled page.
 *
 * @constructor
 * @param {*} id - Identifier of the site row the result belongs to.
 * @param {string} host - Address of the crawled page.
 * @param {number} total - Number of links that were checked.
 * @param {number} badCounts - Number of those links that were bad.
 */
function UrlBadLink(id, host, total, badCounts){
    this.id = id;
    this.host = host;
    this.total = total;
    this.badCounts = badCounts;
}

/**
 * Formats badCounts / total as a percentage with two decimals.
 *
 * @returns {string} For example "25.00%". When the ratio is not a finite
 *     number (total is 0 or a count is missing) the result is "0.00%"
 *     rather than "NaN%".
 */
UrlBadLink.prototype.getRate = function(){
    var ratio = this.badCounts / this.total;
    if(!isFinite(ratio)){
        ratio = 0;
    }
    return (Math.round(ratio * 10000) / 100).toFixed(2) + '%';
};

214 

/**
 * Converts a list of anchor tags into the list of addresses worth checking.
 *
 * Each tag is run through URLFommat; only results accepted by IsURL are kept.
 *
 * @param {?Array.<string>} arr - Anchor tags, or null when the page had none.
 * @param {string} host - Address of the page, forwarded to URLFommat.
 * @returns {?Array.<string>} Accepted addresses in their original order, or
 *     null when `arr` is null.
 */
function filterUrl(arr,host){
    if(arr === null){
        return null;
    }

    return arr
        .map(function(tag){
            return URLFommat(tag, host);
        })
        .filter(function(candidate){
            return IsURL(candidate);
        });
}
      2つ目のファイル:データをデータベースに保存する dbHandle.js
/** 

 * @author MissUU

 * @des MySql     

 * API:     https://github.com/felixge/node-mysql 

 */  

  

var mysql = require('mysql');  

  

// NOTE(review): the connection object created here is neither stored nor
// used by any code visible in this file; all queries below go through `pool`.
// Looks like a leftover - confirm before removing.
mysql.createConnection('mysql://root:apple@localhost/test?debug=false');  

  

// Shared connection pool used by showUrls and updateBadLink.
// NOTE(review): host and credentials are hard-coded; '10.102.1.00' looks
// like a placeholder address - confirm the real value per deployment.
var pool  = mysql.createPool({  

  host     : '10.102.1.00',  

  user     : 'root',  

  password : 'root',  

  database : 'test',  

  // Upper bound on connections the pool opens at once (node-mysql option).
  connectionLimit: 15  

});  





//  urls

exports.showUrls = function (groupId, callback){

  

  console.log('this is showUrl()');

  pool.getConnection(function(err, conn){



      if (err) {

        console.log("connection error!");

        console.log(err);

      }



      conn.query('SELECT id,realurl as url FROM t_site WHERE siteGroupId = ?',groupId, function(err, result){

          if(err){

             console.log(err.message);

          }



          conn.release();

          if(result.length){

            // console.log(result instanceof Array);          

              callback(result);

              return true;           

          }else{

             callback('');

             return false;

          }

      });

  });

}; 



exports.updateBadLink = function (urlBadLink){

  //         

  if (!!urlBadLink) {



     pool.getConnection(function(err, conn){

       

     if (err) {

        console.log("connection error!");

        console.log(err);

      }

      

      var updateSql = "UPDATE a_qualityinfo SET brokenRate = '"+ urlBadLink.getRate() +"' WHERE siteId = " + urlBadLink.id;

     

      console.log(updateSql);



      conn.query(updateSql, function(err, result){

          if(err){

             console.log(err.message);

             console.log('update fail');

          }



          conn.release();

          console.log('update success');

      });// conn.query

     });//pool.getConnection

  } 

};
     コードは今後さらに修正する予定です。ここで注意すべき点が2つあります。
     1、http.getは応答をいつまでも待ち続けるため、タイムアウトをエラーとして扱わないと、プログラムがそこで止まってしまいます。
     2、callbackの使い方に注意してください。そうしないと実行順序を制御するのが難しくなります。Node.jsを使ったことがある人なら分かるはずです。