Pythonクローラー(scrapy):URLアドレスを抽出し、次のURLへのリクエストとしてscrapy.Requestオブジェクトを送信する
1800 ワード
プロジェクト名/spiders/クローラー名.py(クローラー:xpathなどでデータとURLを抽出し、次のURLへのリクエストを送信する):
# -*- coding: utf-8 -*-
import scrapy
from tencent.items import TencentItem
class HrSpider(scrapy.Spider):
    """Spider that scrapes job postings from the Tencent HR listing pages.

    For every row of the listing table it yields a ``TencentItem`` with
    title / position / publish_date, then follows the "next page" link
    until that link points at ``javascript:;`` (the last page).
    """

    name = 'hr'  # unique spider name, used by `scrapy crawl hr`
    allowed_domains = ['tencent.com']
    start_urls = ['http://hr.tencent.com/position.php']

    def parse(self, response):
        """Parse one listing page: yield one item per row, then the next-page request."""
        # Slice off the header row ([0]) and the trailing pagination row ([-1]).
        tr_list = response.xpath("//table[@class='tablelist']/tr")[1:-1]
        for tr in tr_list:
            item = TencentItem()
            item["title"] = tr.xpath("./td[1]/a/text()").extract_first()
            item["position"] = tr.xpath("./td[2]/text()").extract_first()
            item["publish_date"] = tr.xpath("./td[5]/text()").extract_first()
            yield item  # handed to the configured item pipelines

        # Follow the "next page" link. On the last page the href is
        # "javascript:;"; extract_first() also returns None when the anchor
        # is absent, which previously crashed the string concatenation --
        # guard against both before building the request.
        next_url = response.xpath("//a[@id='next']/@href").extract_first()
        if next_url and next_url != "javascript:;":
            yield scrapy.Request(
                # Resolve the (possibly relative) href against the current page URL.
                response.urljoin(next_url),
                callback=self.parse,  # the same parser handles every listing page
                # method="GET",          # default HTTP method
                # headers={},            # extra request headers (cookies go below); None for defaults
                # cookies={},            # cookie dict; None to let the cookie middleware manage them
                # meta={"mydata": item}, # pass data to the callback via response.meta
                # dont_filter=False,     # False (default): scrapy deduplicates already-seen URLs
            )

    # Example of reading data passed through Request(meta=...):
    # def parse1(self, response):
    #     response.meta["mydata"]  # value set in the originating Request's meta dict