Pythonキューベースのマルチスレッドクローラー

2874 ワード

Wpsec-H4rdy http://www.h4rdy.me/post/1cba6492_6b61da3
Wpsec-p0di http://zone.wooyun.org/content/11888
#!/usr/bin/python
# -*- coding: utf-8 -*-
#author:Erie

import re
import sys
import requests
import Queue
import threading
from bs4 import BeautifulSoup
from urlparse import urljoin
from urlparse import urlparse
from urlparse import urlunparse
from posixpath import normpath

reload(sys)
# Force the process-wide default encoding (Python 2 only).
# BUG FIX: the original called sys.setdefaultencoding('utf-8') and then
# immediately sys.setdefaultencoding('gbk'); the first call was dead code
# because the second one overrides it, so only the effective 'gbk' call
# is kept.
sys.setdefaultencoding('gbk')

# Work queue of URLs waiting to be crawled, shared by all worker threads.
VisitedUrl = Queue.Queue()
# URLs that have already been dequeued for crawling.  A plain list, so
# membership tests are O(n) -- acceptable for small crawls.
VisitedLinks = []

#Spider Function()
class Spider(threading.Thread):
    def __init__ (self,queue,links):
        threading.Thread.__init__(self)
        tmp = urlparse(links)
        self.queue = queue
        self.host = tmp.netloc
        self.pro = tmp.scheme
        self.path = tmp.path

    #chongzu Url
    def myjoin (self,base, url):
        url1 = urljoin(base, url)
        arr = urlparse(url1)
        path = normpath(arr[2])
        return urlunparse((arr.scheme, arr.netloc, path, arr.params, arr.query, arr.fragment))
    def getpage (self,url):
        Response = requests.get(url)
        try:
            Htmlpage = Response.content
            soup = BeautifulSoup(Htmlpage)
            all_href = soup.findAll("a")
            for href in all_href:
                tmp = str(href)
                if tmp.find('href') != -1:
                    if href['href'].find("http://") != -1:
                        if urlparse(href['href']).hostname==self.host:
                            UnvisitedHref = href['href']
                            if UnvisitedHref not in VisitedLinks:
                                self.queue.put(UnvisitedHref)
                    else:
                        UnvisitedHref = self.myjoin(url,href['href'])
                        if UnvisitedHref not in VisitedLinks and urlparse(UnvisitedHref).hostname==self.host  and (urlparse(UnvisitedHref).path.count('/')-self.path.count('/'))<=3 and UnvisitedHref.find('#') == -1 and UnvisitedHref.find('.js' )== -1 and UnvisitedHref.find('.jpg') == -1 and UnvisitedHref.find('bmp') == -1 and UnvisitedHref.find('.png') == -1 and UnvisitedHref.find('.gif') == -1:
                            self.queue.put(UnvisitedHref)
        except :
            pass

    def run (self):
        while True:
            Urling =self.queue.get()
            print Urling
            VisitedLinks.append(Urling)
            self.getpage(Urling)
            self.queue.task_done()
#main Function()
#main Function()
def main(Url):
    """Start five crawler threads seeded with *Url* and block until the
    queue of discovered links has been fully processed."""
    #Url = sys.argv[1]
    for _ in range(5):
        spider = Spider(VisitedUrl, Url)
        # BUG FIX: the workers loop forever, so they must be daemon
        # threads; otherwise the process never exits after
        # VisitedUrl.join() returns.
        spider.setDaemon(True)
        spider.start()
    VisitedUrl.put(Url)
    VisitedUrl.join()

if __name__ == "__main__":
    main('http://www.sta.edu.cn')