クローラ開発Pythonキットの紹介(4)

13949 ワード

本文は網易雲コミュニティから来た
作者:王涛
ここでは、get、post(json、フォーム)、証明書付きアクセス:Getリクエストなど、一般的なコードの例をいくつか示します.
@gen.coroutine
def fetch_url():
    try:
        c = CurlAsyncHTTPClient()  #     httpclient
        myheaders = {
            "Host": "weixin.sogou.com",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5 ",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"
        }
        url = "http://weixin.sogou.com/weixin?type=1&s_from=input&query=%E4%BA%BA%E6%B0%91%E6%97%A5%E6%8A%A5&ie=utf8&_sug_=n&_sug_type_="

        req = HTTPRequest(url=url, method="GET", headers=myheaders, follow_redirects=True, request_timeout=20, connect_timeout=10,
                          proxy_host="127.0.0.1",
                          proxy_port=8888)
        response = yield c.fetch(req)  #     
        print response.code
        print response.body
        IOLoop.current().stop()  #   ioloop  
    except:
        print traceback.format_exc()

Fiddlerが捕まえたメッセージ要求ヘッダ:
POST JSONデータ要求
@gen.coroutine
def fetch_url():
    """  url"""
    try:
        c = CurlAsyncHTTPClient()  #     httpclient
        myheaders = {
            "Host": "weixin.sogou.com",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5 ",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Content-Type": "Application/json",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"
        }
        url = "http://127.0.0.1?type=1&s_from=input&query=%E4%BA%BA%E6%B0%91%E6%97%A5%E6%8A%A5&ie=utf8&_sug_=n&_sug_type_="
        body =json.dumps({"key1": "value1", "key2": "value2"})  # Json    

        req = HTTPRequest(url=url, method="POST", headers=myheaders, follow_redirects=True, request_timeout=20, connect_timeout=10,
                          proxy_host="127.0.0.1",proxy_port=8888,body=body)
        response = yield c.fetch(req)  #     
        print response.code
        print response.body
        IOLoop.current().stop()  #   ioloop  
    except:
        print traceback.format_exc()

Fiddlerが捕まえたメッセージ要求ヘッダ:
POST Formデータ要求
@gen.coroutine
def fetch_url():
    """  url"""
    try:
        c = CurlAsyncHTTPClient()  #     httpclient
        myheaders = {
            "Host": "weixin.sogou.com",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5 ",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            # "Content-Type": "Application/json",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"
        }
        import urllib
        url = "http://127.0.0.1?type=1&s_from=input&query=%E4%BA%BA%E6%B0%91%E6%97%A5%E6%8A%A5&ie=utf8&_sug_=n&_sug_type_="
        body =urllib.urlencode({"key1": "value1", "key2": "value2"})  #   form  

        req = HTTPRequest(url=url, method="POST", headers=myheaders, follow_redirects=True, request_timeout=20, connect_timeout=10,
                          proxy_host="127.0.0.1",proxy_port=8888,body=body)
        response = yield c.fetch(req)  #     
        print response.code
        print response.body
        IOLoop.current().stop()  #   ioloop  
    except:
        print traceback.format_exc()

Fiddlerが捕まえたメッセージ要求ヘッダ:
証明書アクセスの追加
def fetch_url():
    """  url"""
    try:
        c = CurlAsyncHTTPClient()  #     httpclient
        myheaders = {
            "Host": "www.amazon.com",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/68.0.3440.106 Safari/537.36"),
            "Accept": ("text/html,application/xhtml+xml,"
               "application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"),
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"
        }
        import urllib
        url = "https://www.amazon.com/"

        req = HTTPRequest(url=url, method="GET", headers=myheaders, follow_redirects=True, request_timeout=20, connect_timeout=10,proxy_host="127.0.0.1",
        proxy_port=8888,ca_certs="FiddlerRoot.pem")  #     
        response = yield c.fetch(req)  #     
        print response.code
        print response.body
        IOLoop.current().stop()  #   ioloop  
    except:
        print traceback.format_exc()

Fiddlerがキャプチャしたメッセージ(正常にアクセスできることを示します)
四、まとめ
キャプチャ量が少ない場合はrequestsを使用することをお勧めします.簡単で使いやすいです.同時リクエスト量が大きい場合は、tornadoを使用することをお勧めします.シングルスレッドで高い並行性能を持ち、効率的でプログラミングしやすいです.
以上、requestsとFiddlerでよく使われるインタフェースとパラメータの説明を示しました.クローラが直面する大部分の問題を解決することができます.並行クロール、日常的なアンチクローラ対応、httpsサイトのクロールが含まれています.
自分のよく使うコードロジックを添付します.
import random
from tornado.ioloop import IOLoop
from tornado import gen
from tornado.queues import Queue


import random
import time

import tornado.util
from tornado import gen
from tornado.ioloop import IOLoop, PeriodicCallback
from tornado.queues import Queue


# Shared task queue the url_fetcher workers block on; bounded at 1000
# pending tasks so producers back-pressure instead of growing unbounded.
TASK_QUE = Queue(maxsize=1000)


def response_handler(res):
    """Process one fetched response body.

    Placeholder — presumably parses ``res`` and pushes any newly discovered
    URLs back onto the task queue (original docstring was garbled; confirm
    intent against the caller).
    """
    pass


@gen.coroutine
def url_fetcher_without_param():
    """Placeholder worker coroutine spawned without arguments (see main())."""
    pass


@gen.coroutine
def url_fetcher(*args,**kwargs):
    global TASK_QUE
    c = CurlAsyncHTTPClient()

    while 1:
        #console_show_log("Let's spider")
        try: 
            param = TASK_QUE.get(time.time() + 300) # 5     
        except tornado.util.TimeoutError::
            yield gen.sleep(random.randint(10,100))
            continue

        try:
            req = HTTPRequest(url,method=,headers=,....) #       
            response = yield c.fetch(req) 
            if response.coe==200:
                response_handler(response.body)
        except Exception:
            yield gen.sleep(10)
            continue
        finally:
            print "I am a slow spider"
            yield gen.sleep(random.randint(10,100))

@gen.coroutine
def period_callback():
    """Placeholder for the periodic task scheduled in main() via PeriodicCallback."""
    pass

def main():
    """Spawn the worker coroutines and a periodic task, then run the IOLoop."""
    io_loop = IOLoop.current()
    # Two parameterized workers (ids 1 and 2) plus one without arguments.
    io_loop.spawn_callback(url_fetcher, 1)
    io_loop.spawn_callback(url_fetcher, 2)
    io_loop.spawn_callback(url_fetcher_without_param)

    # Periodic task. PeriodicCallback is a class in tornado.ioloop, not a
    # method on the IOLoop instance, and it requires the interval argument
    # (milliseconds) — the original called io_loop.PeriodicCallback(...) and
    # left PERIOD_CALLBACK_MILSEC unused.
    PERIOD_CALLBACK_MILSEC = 10  # interval in ms
    PeriodicCallback(period_callback, PERIOD_CALLBACK_MILSEC).start()
    io_loop.start()

# Script entry point.
if __name__ == "__main__":
    main()

以上、討論交流を歓迎します
五、参考:
  • requests快速入門:docs.python-requests.org/zh_CN/lates…
  • requests高度応用:docs.python-requests.org/en/master/u…
  • CAとはBUNDLE:www.namecheap.com/support/kno…
  • requestsで画像をダウンロードする方法:stackoverflow.com/questions/1…
  • tornado AsyncHttpClient: www.tornadoweb.org/en/stable/h…
  • 100 Continueステータスコード:developer.mozilla.org/zh-CN/docs/…
  • HTTP認証:developer.mozilla.org/en-US/docs/…
  • 証明書変換:www.alibabacloud.com/help/zh/faq…

  • 網易雲無料体験館、0コスト体験20+クラウド製品!
    より多くの網易研究開発、製品、運営経験を共有するには、網易クラウドコミュニティにアクセスしてください.
    関連文章:【推薦】AOPのクーポンに基づいて異常歩哨監視を送信【推薦】伝統的な文字型検証安全現状及び網易盾検証コードの優位性