クローラーノート(二)-Scrapyダウンロードミドルウェア(IPプロキシ)について

4151 ワード

プロキシサイト


よく使うプロキシ提供サイト:西刺免費代理IP、IPRENT、米扑代理

コード


このコードにはバグがあります。テストしたところ、review_ips 関数は有効に機能していないようです。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
# @Time    : 2017/4/27 18:58
# @Author  : Spareribs
# @File    : xicidaili.py
"""

import requests
from bs4 import BeautifulSoup
import threading
import Queue


class Get_ips():
    """Scrape candidate proxies from xicidaili and keep the ones that answer a test request.

    Typical use is ``Get_ips(pages).main()``, which returns the list of proxy
    strings (``"<TYPE>://<ip>:<port>"``) whose test request returned HTTP 200.
    """

    def __init__(self, page):
        """Build the listing URLs and the state shared by the worker threads.

        :param page: number of listing pages to fetch; URLs are built for the
            indices ``0 .. page - 1``.
        """
        # Validated proxies; worker threads append to this list under self.Lock.
        self.ips = []
        # NOTE(review): numbering starts at 0 -- confirm the site does not
        # expect the first page to be 1.
        self.urls = ["http://www.xicidaili.com/nn/" + str(i) for i in range(page)]
        self.header = {"User-Agent": 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'}
        # Candidates waiting to be validated (filled by get_ips, drained by review_ips).
        self.q = Queue.Queue()
        self.Lock = threading.Lock()
        self.cookies = {"user_trace_token": "20170502200739-07d687303c1e44fa9c7f0259097266d6", }
        # URL requested through each candidate proxy to decide whether it works.
        self.base_url = "https://www.baidu.com"

    def get_ips(self):
        """Download every listing page and queue each proxy as ``"<TYPE>://<ip>:<port>"``.

        Cells 5, 1 and 2 of each table row supply the type, address and port.
        Rows that do not have those cells populated are skipped.
        """
        for url in self.urls:
            res = requests.get(url, headers=self.header)
            soup = BeautifulSoup(res.text, 'lxml')
            rows = soup.find_all('tr')
            # The first <tr> is skipped (presumably the table header).
            for row in rows[1:]:
                tds = row.find_all("td")
                # Guard against separator/malformed rows, which previously
                # raised IndexError and aborted the whole scrape.
                if len(tds) < 6 or not (tds[5].contents and tds[1].contents and tds[2].contents):
                    continue
                ip_temp = "{0}://{1}:{2}".format(tds[5].contents[0], tds[1].contents[0], tds[2].contents[0])
                self.q.put(str(ip_temp))

    def _proxy_mapping(self, ip):
        """Return the ``proxies`` dict for *ip*, keyed by its lower-cased scheme.

        Requests selects a proxy by the lower-case scheme of the target URL,
        so a key such as ``"HTTP"`` is never matched and the request silently
        goes out without any proxy.
        """
        scheme = ip.split(":")[0].lower()
        return {scheme: ip}

    def review_ips(self):
        """Worker loop: test queued proxies until the queue is empty.

        A proxy is appended to ``self.ips`` when the request to
        ``self.base_url`` made with it returns status 200. Any failure of the
        request simply discards that proxy.
        """
        while True:
            # get_nowait() avoids the empty()/get() race: with several workers
            # the queue can be drained between the check and a blocking get().
            try:
                ip = self.q.get_nowait()
            except Queue.Empty:
                return
            proxy = self._proxy_mapping(ip)
            print(proxy)
            # NOTE(review): the mapping is only consulted when base_url's
            # scheme equals its key -- confirm base_url matches the proxy
            # types that are meant to be exercised.
            try:
                res = requests.get(self.base_url, proxies=proxy, timeout=1)
            except Exception:
                # Deliberately broad: whatever went wrong, the proxy is unusable.
                continue
            if res.status_code == 200:
                # The lock is held only around the append and always released;
                # the original leaked it on every non-200 response.
                with self.Lock:
                    self.ips.append(ip)

    def main(self):
        """Scrape the listings, validate them with 40 threads, return the working proxies."""
        self.get_ips()
        threads = [threading.Thread(target=self.review_ips, args=[]) for _ in range(40)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        return self.ips


def get_ip():
    """Collect working proxies from two listing pages and save them to ``iplist.txt``.

    Each proxy is written on its own line. The same list is returned to the caller.
    """
    my = Get_ips(2)
    getips_list = my.main()
    # The with-block closes the file, so no explicit close() is needed.
    with open("iplist.txt", "w") as f:
        for getip in getips_list:
            f.write(str(getip) + "\n")
    return getips_list


if __name__ == "__main__":
    get_ip()

Scrapyにおけるプロキシミドルウェアの使用方法


middlewares.py設定
class ProxyMiddleware(object):
    """Downloader middleware that assigns a randomly chosen proxy to every request."""

    # Candidate proxies. The list is hard-coded here; it could instead be read
    # from a text file with one proxy per line.
    proxy_list = [
        "HTTP://171.13.37.172:808",
        "HTTPS://221.229.44.79:808",
    ]

    def process_request(self, request, spider):
        """Pick one entry of ``proxy_list`` and store it in ``request.meta['proxy']``.

        Returns ``None`` implicitly.
        """
        ip = random.choice(self.proxy_list)
        # Function-call form prints the same text on Python 2 and also parses on Python 3.
        print(ip)
        request.meta['proxy'] = ip


settings.py ファイル設定
# Register the project's ProxyMiddleware as a downloader middleware with order value 110.
DOWNLOADER_MIDDLEWARES = {'lagou.middlewares.ProxyMiddleware': 110}

Requestsでのプロキシのテスト方法

import requests

# Requests chooses the proxy by the scheme of the target URL, and the keys must
# be lower-case. base_url below is https://, so an "https" entry is required --
# with only an "http" key the request would go out directly, not via the proxy.
proxy = {
    "http": "http://110.80.142.147:808",
    "https": "http://110.80.142.147:808",
}
base_url = "https://www.lagou.com/jobs/positionAjax.json?city=%E5%B9%BF%E5%B7%9E&needAddtionalResult=false&kd=python&pn=1"
# Alternative target:
# base_url = "http://icanhazip.com"
res = requests.get(base_url, proxies=proxy)
# Function-call form prints the same text on Python 2 and also parses on Python 3.
print(res.status_code)
print(res.text)


問題

  • マルチスレッド処理
  • Requestsでproxiesによるプロキシ指定が効かない問題:痛い教訓として、proxy={"http":"http://110.80.142.147:808"} のようにキー(スキーム名)は必ず小文字で書くこと!!!