Python crawler (Scrapy): extract URLs, then send a request for the next URL with a scrapy.Request object



project_name/spiders/spider_name.py (the spider: extracts the data and URLs with XPath, then sends a request for the next URL):
# -*- coding: utf-8 -*-
import scrapy
from tencent.items import TencentItem

class HrSpider(scrapy.Spider):
    name = 'hr'  # spider name
    allowed_domains = ['tencent.com']
    start_urls = ['http://hr.tencent.com/position.php']

    def parse(self, response):
        tr_list = response.xpath("//table[@class='tablelist']/tr")[1:-1]  # drop the first and last rows of the table
        for tr in tr_list:
            item = TencentItem()
            item["title"] = tr.xpath("./td[1]/a/text()").extract_first()
            item["position"] = tr.xpath("./td[2]/text()").extract_first()
            item["publish_date"] = tr.xpath("./td[5]/text()").extract_first()
            yield item  # hand the item off to the pipelines

        # find the url of the next page
        next_url = response.xpath("//a[@id='next']/@href").extract_first()
        if next_url != "javascript:;":
            next_url = "http://hr.tencent.com/" +next_url
            #      url    ,            。
            yield scrapy.Request(
                                    next_url,
                                    callback=self.parse,  # function that will parse the response of this request
                                    # method="GET",  # defaults to GET
                                    # headers={},  # request headers; do not put cookies here. Defaults to None
                                    # cookies={},  # defaults to None
                                    # meta={"mydata": item},  # pass data to the callback function through meta
                                    # dont_filter=False,  # defaults to False (Scrapy filters out urls it has already requested)
                                )


    # def parse1(self, response):
    #     response.meta["mydata"]  # data passed in through meta can be read from response.meta in the callback
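
The spider above imports TencentItem from tencent.items, which the post does not show. Below is a minimal sketch of what that items.py could look like, assuming it declares only the three fields the spider assigns; for the yielded items to actually be processed, the corresponding pipeline class also has to be enabled under ITEM_PIPELINES in settings.py.

# tencent/items.py -- minimal sketch, not shown in the original post
import scrapy

class TencentItem(scrapy.Item):
    # field names match what the spider assigns above
    title = scrapy.Field()
    position = scrapy.Field()
    publish_date = scrapy.Field()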