A quick introduction to the common ways of adding an IP proxy in Python
Article overview:
Commonly used proxies:
1. Purchased dynamic IP tunnels, e.g. the Abuyun dynamic tunnel: it never hands you an IP; the tunnel forwards each request on your behalf and returns the response;
2. Private proxy IPs: the provider returns concrete IP values (valid for a limited time); you build a proxy pool from them and issue requests through it (a minimal sketch follows these lists);
3. Scraping free proxy IPs yourself to build your own proxy pool; if you are interested, see https://blog.csdn.net/Owen_goodman/article/details/100074822
Common proxy usage scenarios:
1. requests scripts: GET/POST requests
2. Scrapy: GET/POST requests, with the proxy added in a downloader middleware
3. Automation scripts: Selenium + WebDriver + proxy
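To make the "build a proxy pool and issue requests through it" idea concrete before the detailed sections, here is a minimal sketch; the ip:port entries and the target URL are placeholders, not real proxies:

import random

import requests

# A tiny proxy pool: in practice these entries come from a paid API or a free-proxy scraper
PROXY_POOL = ["1.2.3.4:8080", "5.6.7.8:3128"]  # placeholder ip:port entries


def fetch(url):
    # pick a proxy at random for each request, so no single exit IP is overused
    proxy = random.choice(PROXY_POOL)
    proxies = {"http": "http://" + proxy, "https": "http://" + proxy}
    return requests.get(url, proxies=proxies, timeout=10)


# resp = fetch("http://www.baidu.com")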
Details:
I. Using proxies with the requests library: Abuyun dynamic tunnel / private proxy IPs (here a personal QingTing proxy account)
II. Using proxies in the Scrapy framework: overriding the request method (start_requests) / adding the proxy in a DOWNLOADER middleware
III. The automation tool Selenium: the proxy is set directly in the browser options when the driver is created.
I. Using proxies with the requests library
import time

import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
}

# Abuyun dynamic tunnel: host and port of the proxy entry point
proxyHost = "http-dyn.abuyun.com"
proxyPort = "9020"

# Tunnel credentials (issued when you purchase the tunnel)
proxyUser = "****"  # tunnel username
proxyPass = "****"  # tunnel password

proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
    "host": proxyHost,
    "port": proxyPort,
    "user": proxyUser,
    "pass": proxyPass,
}
proxies = {
    "http": proxyMeta,
    "https": proxyMeta,
}

time.sleep(0.1)  # the dynamic tunnel is rate-limited, so pause briefly between requests
url = 'http://www.baidu.com'

# GET request through the tunnel
response = requests.get(url=url, proxies=proxies, headers=headers)

# POST request through the tunnel
data = {}
response = requests.post(url=url, proxies=proxies, headers=headers, data=data)
# Private proxy IPs: fetch a batch from the provider's extraction API and build a small pool
import random

import requests


class dandd():
    def spider(self):
        # Python 3: fetch the private-proxy IP list with an HTTP GET
        targetUrl = "***"  # your provider's IP-extraction API (elided)
        resp = requests.get(targetUrl)
        print(resp.status_code)
        print(resp.text)

        # Cache the returned IP list locally, then read it back
        with open("./ip.txt", "w+") as f:
            f.write(resp.text)
            f.seek(0)
            iplist = f.readlines()
        ipList = iplist[0::2]       # every other line holds an "ip:port" entry
        self.count = len(ipList)
        ip = random.choice(ipList)  # pick one proxy at random from the pool
        proxies = {
            "http": 'http://' + ip.replace("\n", "")
        }
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
        }
        url = 'http://www.baidu.com'
        # GET and POST requests through the chosen proxy
        res = requests.get(url=url, proxies=proxies, headers=headers)
        res = requests.post(url=url, proxies=proxies, headers=headers)


if __name__ == '__main__':
    d1 = dandd()
    d1.spider()
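Free or private proxy entries often die before you get to use them, so it can help to probe a candidate before trusting it. A minimal sketch, assuming the pool entries are "ip:port" strings; the echo endpoint http://httpbin.org/ip and the 5-second timeout are arbitrary choices, not part of the original code:

import requests


def is_alive(proxy_addr, timeout=5):
    """Return True if the proxy answers a simple GET within the timeout."""
    proxies = {"http": "http://" + proxy_addr}
    try:
        resp = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=timeout)
        return resp.status_code == 200
    except requests.RequestException:
        return False


# Usage: keep only the proxies that still respond
# alive = [p.strip() for p in ipList if is_alive(p.strip())]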
II. Using proxies in the Scrapy framework
# Method 1: set the proxy per request by overriding start_requests in your Spider class
from scrapy import Request


def start_requests(self, *args):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
    }
    url = 'http://www.baidu.com'
    # ip is an "ip:port" string taken from your proxy pool (e.g. read from ip.txt as in section I)
    proxy = 'http://' + ip.replace("\n", "")
    # Scrapy expects the proxy as a string in request.meta['proxy']
    request = Request(url, callback=self.parse, dont_filter=True, headers=headers, meta={'proxy': proxy})
    # time.sleep(0.5)
    yield request
# Method 2: add the proxy in a downloader middleware (middlewares.py),
# then enable the middleware in settings.py (see the sketch after this section's code)
import base64

proxyServer = "http://http-dyn.abuyun.com:9020"

# Abuyun tunnel credentials
proxyUser = "****"
proxyPass = "****"

'''
# for Python2
proxyAuth = "Basic " + base64.b64encode(proxyUser + ":" + proxyPass)
'''
# for Python3
proxyAuth = "Basic " + base64.urlsafe_b64encode(bytes((proxyUser + ":" + proxyPass), "ascii")).decode("utf8")


class ProxyMiddleware(object):
    # Variant 1: process_request(self, request, spider)
    # attaches the tunnel to every outgoing request
    def process_request(self, request, spider):
        request.meta["proxy"] = proxyServer
        request.headers["Proxy-Authorization"] = proxyAuth

    # Variant 2: only proxy the spiders listed by name
    # (keep only one of the two process_request definitions)
    def process_request(self, request, spider):
        if spider.name in ["name1", "name2", "name3"]:
            request.meta["proxy"] = proxyServer
            request.headers["Proxy-Authorization"] = proxyAuth
# Method 3: private proxy IPs (here a QingTing proxy API) rotated from a downloader
# middleware; this one also has to be enabled in settings.py
import re
import time

import requests
from scrapy import signals


class qingTingMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.
    num = 1    # request counter
    count = 0  # number of proxy IPs currently cached
    index = 0  # position of the next IP to hand out
    now = 0    # timestamp of the last API call

    def getIP(self):  # return the next proxy IP, refreshing the pool when it runs out
        if self.count - 1 == self.index or self.index == 0:  # pool exhausted (or first call): fetch fresh IPs
            pre = self.now
            self.now = time.time()
            if int((self.now - pre)) < 6:
                # the extraction API is rate-limited, wait before calling it again
                time.sleep(6 - int((self.now - pre)))
                self.now = time.time()
            print("fetching a new batch of proxy IPs")
            getAllIp = "your api"  # your provider's IP-extraction API
            res = requests.get(url=getAllIp)
            res.encoding = "utf-8"
            with open("./ip.txt", "w") as f:
                f.write(res.text)
            if self.index != 0:
                self.index = 0
            ip = re.findall(r'(\d+\.\d+\.\d+\.\d+:\d+)', res.text)[self.index]
            self.index += 1
        else:
            with open("./ip.txt", "r") as f:
                iplist = f.readlines()
            ipList = iplist[0::2]
            self.count = len(ipList)
            ip = ipList[self.index]
            self.index += 1
        return 'http://' + ip.replace("\n", "")

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    # Variant 1: process_request(self, request, spider)
    # attaches a proxy from the pool to every outgoing request
    def process_request(self, request, spider):
        ip = self.getIP()
        request.meta['proxy'] = ip

    # Variant 2: only proxy the spiders listed by name (keep only one variant)
    def process_request(self, request, spider):
        if spider.name in ["name1", "name2", "name3"]:
            ip = self.getIP()
            request.meta['proxy'] = ip
        else:
            return None

    # def process_request(self, request, spider):
    #     # Called for each request that goes through the downloader
    #     # middleware.
    #
    #     # Must either:
    #     # - return None: continue processing this request
    #     # - or return a Response object
    #     # - or return a Request object
    #     # - or raise IgnoreRequest: process_exception() methods of
    #     #   installed downloader middleware will be called
    #     return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain

        # 2020-05-26: optionally retry the request with a fresh proxy when the
        # old one times out, e.g.:
        # if isinstance(exception, TimeoutError):
        #     return request
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
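Neither middleware takes effect until it is registered in the project's settings.py. A minimal sketch, assuming the project package is called myproject (both the module path and the priority value 543 are placeholders to adapt):

# settings.py -- enable one of the proxy middlewares defined in middlewares.py
DOWNLOADER_MIDDLEWARES = {
    # class path is "<project>.middlewares.<ClassName>"; 543 is an arbitrary priority
    "myproject.middlewares.ProxyMiddleware": 543,
    # or, for the private-proxy pool variant:
    # "myproject.middlewares.qingTingMiddleware": 543,
}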
III. The automation tool Selenium
import random

from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


def parse(self, response):
    url = "http://.aspx"  # target page (elided)
    chrome_options = Options()
    # pick a random proxy from the pool (sample ip:port entries shown)
    proxies = random.choice([
        "116.239.105.250:40049",
        "117.26.88.235:23525",
        "60.182.178.192:30221",
        "123.163.184.232:43565",
        "113.120.62.57:43358",
        "1.199.187.37:41380",
        "117.87.139.65:49842",
        "113.128.26.228:31984",
        "125.117.146.134:48840",
        "113.120.63.82:42216",
    ])
    # route the browser's traffic through the chosen proxy
    chrome_options.add_argument('--proxy-server=%s' % proxies)
    # chrome_options.add_argument('--headless')  # run without a visible browser window
    chrome_options.add_argument('--disable-gpu')  # works around a Chrome rendering bug
    chrome_options.add_argument('--no-sandbox')   # needed when running as root (e.g. in Docker)
    chrome_options.add_argument("--test-type")
    chrome_options.add_argument(
        'user-agent="MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"')
    # Selenium 3 syntax; Selenium 4 uses webdriver.Chrome(options=chrome_options)
    driver = webdriver.Chrome(chrome_options=chrome_options)
    driver.get(url)
    page = driver.page_source
    res = etree.HTML(page)  # parse the rendered HTML with lxml, then query it with XPath from /html
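To confirm that Chrome really exits through the chosen proxy, you can load an IP-echo page before doing real work, and always release the driver afterwards. A short sketch (http://httpbin.org/ip is just one convenient echo endpoint, not part of the original code):

# After building `driver` as above, check the exit IP and always clean up
try:
    driver.get("http://httpbin.org/ip")  # echoes the IP the server sees
    print(driver.page_source)            # should show the proxy's IP, not yours
finally:
    driver.quit()                        # close the browser and free the chromedriver process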