Python Scrapy: adding random request headers with the fake_useragent module

Spider file
Lagou pages cannot be accessed without request headers.
# -*- coding: utf-8 -*-
import scrapy

class TestSpider(scrapy.Spider):
    name = 'test'
    # allowed_domains = ['www.baidu.com']
    start_urls = ['https://www.lagou.com/jobs/3145309.html']

    # Per-spider settings: register the custom downloader middleware
    # and supply the default request headers Lagou expects
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES' : {
           'Test_C.middlewares.Random_UA': 1,
        },
        'DEFAULT_REQUEST_HEADERS': {
           'Accept': 'application/json, text/javascript, */*; q=0.01',
           'Accept-Encoding': 'gzip, deflate, br',
           'Accept-Language': 'zh-CN,zh;q=0.8',
           'Connection': 'keep-alive',
           'cookie': '_ga=GA1.2.1809666113.1526002829; user_trace_token=20180511094034-4c4d62d7-54bc-11e8-949f-525400f775ce; LGUID=20180511094034-4c4d6608-54bc-11e8-949f-525400f775ce; LG_LOGIN_USER_ID=537d2089412cae011d73a44bb8911986e2cf8ecc81522b3c; JSESSIONID=ABAAABAAAGFABEF2F6A133027057686F2D420CEB60B7F87; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1541598090,1542114329,1542774094,1543386087; _gid=GA1.2.118340539.1543386087; index_location_city=%E5%85%A8%E5%9B%BD; X_HTTP_TOKEN=fb416e9ede186e36ef1a080ebf43ceba; sajssdk_2015_cross_new_user=1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22167593fd949988-079e956e17a40f-4313362-2073600-167593fd94a1f2%22%2C%22%24device_id%22%3A%22167593fd949988-079e956e17a40f-4313362-2073600-167593fd94a1f2%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; LGSID=20181128222414-482f1608-f319-11e8-8c3d-5254005c3644; TG-TRACK-CODE=index_navigation; SEARCH_ID=8304e9b43803439494c1c91c06395eca; _gat=1; LGRID=20181128233044-924a1586-f322-11e8-8c40-5254005c3644; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1543419045',
           'Host': 'www.lagou.com',
           'Origin': 'https://www.lagou.com',
           'Referer': 'https://www.lagou.com/',
           # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
       }

    }

    def parse(self, response):
        print('*_'*20)
        print(response.css('.position-content-l span::text').extract())
        print(response.status)
        print(response.headers)
        print('*_'*20)
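
Because the middleware is registered through custom_settings, it applies only to this spider. A minimal sketch of the project-wide alternative in settings.py, assuming the Scrapy project module is named Test_C as in the middleware path above:

# settings.py -- project-wide alternative to per-spider custom_settings
DOWNLOADER_MIDDLEWARES = {
    # Disable Scrapy's built-in UserAgentMiddleware so it cannot
    # overwrite the header set by the custom middleware.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    # 543 is the conventional slot for custom downloader middlewares.
    'Test_C.middlewares.Random_UA': 543,
}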


Custom middleware (middlewares.py)

from fake_useragent import UserAgent


class Random_UA(object):
    def __init__(self):
        # Build the UserAgent instance once; constructing it on every
        # request is slow, since fake_useragent loads its browser data
        # at creation time.
        self.ua = UserAgent()

    def process_request(self, request, spider):
        # setdefault only fills the header when it is absent, so a
        # User-Agent set explicitly on a request is left untouched.
        request.headers.setdefault('User-Agent', self.ua.random)
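
Each access to .random yields a fresh real-world User-Agent string. A quick standalone check of the module's behavior (install it first with pip install fake-useragent):

from fake_useragent import UserAgent

ua = UserAgent()
print(ua.random)  # a different browser UA string on each access
print(ua.chrome)  # a random Chrome User-Agent

To confirm the middleware took effect, print response.request.headers inside parse and look for the rotated User-Agent value.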