A brief introduction to the common ways of adding an IP proxy in Python


Article overview:
The commonly used proxy options are:
1. A purchased dynamic IP tunnel, e.g. the Abuyun dynamic tunnel: the service does not hand you an IP; your request is forwarded through the tunnel and you simply get the response back.
2. Private proxy IPs: you obtain concrete IP values (valid for a limited time), build a proxy pool out of them, and then send your requests through that pool.
3. Scraping free proxy IPs yourself and building your own proxy pool; if you are interested, see https://blog.csdn.net/Owen_goodman/article/details/100074822
Common scenarios for using a proxy:
1. requests scripts: get/post requests
2. scrapy: get/post requests, with the proxy added in a middleware
3. automation scripts: selenium + webdriver + proxy
Details:
I. Using a proxy with the requests library
  • Abuyun dynamic tunnel
  • import time
    import requests

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
    }
    # Abuyun dynamic tunnel
    # proxy server host and port
    proxyHost = "http-dyn.abuyun.com"
    proxyPort = "9020"
    # proxy tunnel authentication info
    proxyUser = "****"  # tunnel username
    proxyPass = "****"  # tunnel password
    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": proxyHost,
        "port": proxyPort,
        "user": proxyUser,
        "pass": proxyPass,
    }
    proxies = {
        "http": proxyMeta,
        "https": proxyMeta,
    }
    time.sleep(0.1)  # the dynamic tunnel limits how frequently requests may be sent
    url = 'http://www.baidu.com'

    # GET request through the proxy
    response = requests.get(url=url, proxies=proxies, headers=headers)

    # POST request through the proxy
    data = {}
    response = requests.post(url=url, proxies=proxies, headers=headers, data=data)
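A quick way to confirm that the tunnel is actually being used is to request an IP echo service and look at the address it reports. A minimal sketch reusing the proxies and headers built above; httpbin.org/ip is just an assumed echo endpoint, any page that displays your IP works the same way:

    import requests

    check_url = "http://httpbin.org/ip"  # echoes back the origin IP the server sees

    try:
        resp = requests.get(check_url, proxies=proxies, headers=headers, timeout=10)
        print("exit IP through the tunnel:", resp.json()["origin"])
    except requests.exceptions.ProxyError as err:
        print("proxy refused the connection (check user/pass and plan limits):", err)
    except requests.exceptions.Timeout:
        print("request timed out; the dynamic tunnel throttles request frequency")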
    
  • Private proxy IPs (here a personal Qingting proxy account)
  • class dandd():

        def spider(self):
            # Python 3: use requests to send an HTTP GET to the proxy provider's API
            import requests
            import random
            # API endpoint that returns the private proxy IP list
            targetUrl = "***"  # your proxy API URL
            resp = requests.get(targetUrl)
            print(resp.status_code)
            print(resp.text)
            # save the returned IPs locally, then pick one at random
            with open("./ip.txt", "w+") as f:
                f.write(resp.text)
                f.seek(0)
                iplist = f.readlines()
            ipList = iplist[0::2]
            self.count = len(ipList)
            ip = random.choice(ipList)
            proxies = {
                "http": 'http://' + ip.replace("\n", "")
            }
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
            }
            url = 'http://www.baidu.com'
            res = requests.get(url=url, proxies=proxies, headers=headers)
            res = requests.post(url=url, proxies=proxies, headers=headers)


    if __name__ == '__main__':
        d1 = dandd()
        d1.spider()

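Because private proxy IPs expire after a while, it helps to filter out dead entries before picking one from ip.txt. A minimal sketch under the same file layout as above (one ip:port on every other line); load_live_proxies and the httpbin.org test URL are illustrative names, not part of the original code:

    import random
    import requests

    def load_live_proxies(path="./ip.txt", test_url="http://httpbin.org/ip"):
        # read the saved list (every other line holds an ip:port entry, as above)
        with open(path, "r") as f:
            candidates = [line.strip() for line in f.readlines()[0::2] if line.strip()]
        live = []
        for ip in candidates:
            proxies = {"http": "http://" + ip}
            try:
                requests.get(test_url, proxies=proxies, timeout=5)
                live.append(ip)
            except requests.exceptions.RequestException:
                continue  # expired or unreachable proxy: drop it
        return live

    pool = load_live_proxies()
    if pool:
        proxies = {"http": "http://" + random.choice(pool)}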
II. Using a proxy in the Scrapy framework
  • Override the start_requests method
    from scrapy import Request


    def start_requests(self, *args):
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
        }
        url = 'http://www.baidu.com'
        # ip: a private proxy fetched beforehand; meta['proxy'] expects a URL string
        proxy = 'http://' + ip.replace("\n", "")
        # POST requests work the same way (use scrapy.FormRequest)
        request = Request(url, callback=self.parse, dont_filter=True,
                          headers=headers, meta={'proxy': proxy})
        # time.sleep(0.5)
        yield request
  • Add the proxy in a downloader middleware
  • # middlewares.py
    # Abuyun dynamic tunnel proxy
    import base64
    import random
    import re
    import time
    import requests
    from scrapy import signals
    from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


    proxyServer = "http://http-dyn.abuyun.com:9020"
    # proxy tunnel authentication info
    proxyUser = "****"
    proxyPass = "****"

    '''
    # for Python2
    proxyAuth = "Basic " + base64.b64encode(proxyUser + ":" + proxyPass)

    '''
    # for Python3
    proxyAuth = "Basic " + base64.urlsafe_b64encode(bytes((proxyUser + ":" + proxyPass), "ascii")).decode("utf8")
    
    class ProxyMiddleware(object):
        # override the process_request(self, request, spider) method
        # option 1: every request from every spider goes through the proxy
        def process_request(self, request, spider):
            request.meta["proxy"] = proxyServer
            print('sending request through the Abuyun tunnel')
            request.headers["Proxy-Authorization"] = proxyAuth

        # option 2: when the project has several spiders, apply the proxy only
        # to the spiders listed here (keep just one of the two methods)
        def process_request(self, request, spider):
            if spider.name in ["name1", "name2", "name3"]:
                request.meta["proxy"] = proxyServer
                request.headers["Proxy-Authorization"] = proxyAuth


    # Private proxy IPs (Qingting proxy)
    # use whichever middleware you need, and remember to enable it in settings.py
    class qingTingMiddleware(object):
        # Not all methods need to be defined. If a method is not defined,
        # scrapy acts as if the downloader middleware does not modify the
        # passed objects.
        num = 1  # (unused) counter
        count = 0  # number of IPs fetched from the API
        index = 0  # index of the IP currently in use
        now = 0  # timestamp of the last fetch

        def getIP(self):  # fetch a proxy IP
            if self.count - 1 == self.index or self.index == 0:  # all IPs used up, or first run: fetch a new batch
                pre = self.now
                self.now = time.time()
                if int((self.now - pre)) < 6:
                    time.sleep(6 - int((self.now - pre)))  # the API only allows one extraction every few seconds
                    self.now = time.time()
                print("fetching a new batch of IPs")
                getAllIp = "your api"
                res = requests.get(url=getAllIp)
                res.encoding = "utf-8"
                with open("./ip.txt", "w") as f:
                    f.write(res.text)
                if self.index != 0:
                    self.index = 0
                ip = re.findall(r'(\d+\.\d+\.\d+\.\d+:\d+)', res.text)[self.index]
                self.index += 1
            else:
                with open("./ip.txt", "r") as f:
                    iplist = f.readlines()
                ipList = iplist[0::2]
                self.count = len(ipList)
                ip = ipList[self.index]
                self.index += 1
            return 'http://' + ip.replace("\n", "")

        @classmethod
        def from_crawler(cls, crawler):
            # This method is used by Scrapy to create your spiders.
            s = cls()
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
            return s

        # override process_request(self, request, spider)
        # option 1: every request goes through a private proxy
        def process_request(self, request, spider):
            ip = self.getIP()
            request.meta['proxy'] = ip

        # option 2: only the listed spiders use the proxy (keep just one of the two methods)
        def process_request(self, request, spider):
            if spider.name in ["name1", "name2", "name3"]:
                ip = self.getIP()
                request.meta['proxy'] = ip
            else:
                return None

        # def process_request(self, request, spider):
        #     # Called for each request that goes through the downloader
        #     # middleware.
        #     # Must either:
        #     # - return None: continue processing this request
        #     # - or return a Response object
        #     # - or return a Request object
        #     # - or raise IgnoreRequest: process_exception() methods of
        #     #   installed downloader middleware will be called
        #     return None

        def process_response(self, request, response, spider):
            # Called with the response returned from the downloader.
            # Must either;
            # - return a Response object
            # - return a Request object
            # - or raise IgnoreRequest
            return response

        def process_exception(self, request, exception, spider):
            # Called when a download handler or a process_request()
            # (from other downloader middleware) raises an exception.
            # Must either:
            # - return None: continue processing this exception
            # - return a Response object: stops process_exception() chain
            # - return a Request object: stops process_exception() chain
            #
            # 2020-05-26: retry the request when the proxy times out, if needed
            # if isinstance(exception, TimeoutError):
            #     return request
            pass

        def spider_opened(self, spider):
            spider.logger.info('Spider opened: %s' % spider.name)

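Whichever middleware you pick, it only takes effect after being enabled in settings.py. A minimal sketch, where 'yourproject' stands in for your actual Scrapy project name and the priority numbers are typical values, not mandatory ones:

    # settings.py
    DOWNLOADER_MIDDLEWARES = {
        # dotted path to the middleware class defined above; the number is its priority
        'yourproject.middlewares.ProxyMiddleware': 543,
        # or, for the private-proxy pool version:
        # 'yourproject.middlewares.qingTingMiddleware': 544,
    }

For a tunnel proxy such as Abuyun you can also skip the middleware and put the authenticated tunnel URL straight into meta['proxy'] in start_requests; depending on your Scrapy version you may still need to set the Proxy-Authorization header yourself, as ProxyMiddleware does above.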
III. The automation tool selenium
  • The proxy can be placed directly in a downloader middleware, exactly as in section II.
  • It can also be written into the request itself, as in the parse method below.
        # requires: from selenium import webdriver
        #           from selenium.webdriver.chrome.options import Options
        #           from lxml import etree
        #           import random
        def parse(self, response):
            url = "http://.aspx"
            chrome_options = Options()
            # pick a proxy at random from your own pool, e.g.:
            # proxies = random.choice([
            #     "116.239.105.250:40049",
            #     "117.26.88.235:23525",
            #     "60.182.178.192:30221",
            #     "123.163.184.232:43565",
            #     "113.120.62.57:43358",
            #     "1.199.187.37:41380",
            #     "117.87.139.65:49842",
            #     "113.128.26.228:31984",
            #     "125.117.146.134:48840",
            #     "113.120.63.82:42216",
            # ])

            # set the proxy (proxies must hold an "ip:port" string, see above)
            chrome_options.add_argument('--proxy-server=%s' % proxies)

            # chrome_options.add_argument('--headless')  # headless mode
            chrome_options.add_argument('--disable-gpu')  # disable GPU acceleration to work around a known Chrome bug
            chrome_options.add_argument('--no-sandbox')  # disable the sandbox
            chrome_options.add_argument("--test-type")
            chrome_options.add_argument(
                'user-agent="MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"')
            driver = webdriver.Chrome(chrome_options=chrome_options)  # newer Selenium versions use options=chrome_options
            driver.get(url)
            page = driver.page_source
            res = etree.HTML(page)  # parse the page source with lxml, then extract data via XPath
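As with requests, it is worth confirming that the browser really exits through the proxy, for example by loading an IP echo page with the same driver. A minimal sketch; httpbin.org/ip is an assumed echo endpoint, and find_element_by_tag_name is the old Selenium 3 style API matching the chrome_options usage above:

    # (continuing inside parse, after res = etree.HTML(page))
    driver.get("http://httpbin.org/ip")  # the page body shows the IP the server sees
    print(driver.find_element_by_tag_name("body").text)
    driver.quit()  # release the browser and the proxy when finished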