scrapyノート【3】[例を挙げる]
3331 ワード
————例:テンセント採用サイトの自動ページ送りクロール————
#scrapy startproject xxx(プロジェクト名)
scrapy startproject tcent
#scrapy genspider mytianya(スパイダー名) "bbs.tianya.cn"(許可ドメイン)
scrapy genspider tencent "tencent.com"
収集するフィールド:職種名、職種カテゴリ、募集人数、勤務地、公開日
class TencentItem(scrapy.Item):
    """Container for one Tencent HR job posting scraped from the list page."""
    # Job title (link text of the first table cell).
    jobTitle = scrapy.Field()
    # BUG FIX: the spider also stores item["jobLink"] (the posting URL);
    # scrapy.Item raises KeyError for undeclared fields, so it must be
    # declared here.
    jobLink = scrapy.Field()
    # Job category column.
    jobCategories = scrapy.Field()
    # Number of openings column.
    number = scrapy.Field()
    # Work location column.
    location = scrapy.Field()
    # Publish date column.
    releasetime = scrapy.Field()
# -*- coding: utf-8 -*-
import re
import scrapy
from Tencent import items
class MytencentSpider(scrapy.Spider):
    """Crawl Tencent HR job listings, following every list page."""

    name = 'myTencent'
    allowed_domains = ['hr.tencent.com']
    start_urls = ['https://hr.tencent.com/position.php?lid=2218&start=0#a']

    def parse(self, response):
        """Extract one item per table row, then queue the remaining pages."""
        # Data rows alternate between class="even" and class="odd".
        for row in response.xpath("//tr[@class=\"even\"] | //tr[@class=\"odd\"]"):
            item = items.TencentItem()
            item["jobTitle"] = row.xpath("./td[1]/a/text()")[0].extract()
            item["jobLink"] = row.xpath("./td[1]/a/@href")[0].extract()
            # BUG FIX: the original copied td[1]/a/text() (the title) into
            # jobCategories and then read every later column one cell too
            # early.  The list table layout is: td[1] title, td[2] category,
            # td[3] headcount, td[4] location, td[5] publish date.
            item["jobCategories"] = row.xpath("./td[2]/text()")[0].extract()
            item["number"] = row.xpath("./td[3]/text()")[0].extract()
            item["location"] = row.xpath("./td[4]/text()")[0].extract()
            item["releasetime"] = row.xpath("./td[5]/text()")[0].extract()
            yield item
        # Queue the remaining list pages (10 rows per page).  Scrapy's
        # built-in duplicate filter drops already-seen URLs, so re-yielding
        # these from every parse() call schedules each page only once.
        for page in range(1, 200):
            newurl = "https://hr.tencent.com/position.php?lid=2218&start=%d#a" % (page * 10)
            yield scrapy.Request(newurl, callback=self.parse)
class TencentPipeline(object):
    """Item pipeline that appends each scraped item to tencent.txt."""

    def __init__(self):
        # Opened once when Scrapy instantiates the pipeline; closed
        # deterministically in close_spider() below.
        self.file = open("tencent.txt", "w", encoding="utf-8")

    def process_item(self, item, spider):
        """Write the item's repr as one CRLF-terminated line and pass it on."""
        # BUG FIX: the source garbled the string literal across two physical
        # lines; the intended terminator is "\r\n".
        line = str(item) + "\r\n"
        self.file.write(line)
        # Flush per item so partial results survive a crashed crawl.
        self.file.flush()
        return item

    def close_spider(self, spider):
        # Scrapy calls this hook when the spider finishes — deterministic,
        # unlike the original __del__-based cleanup, which may never run.
        self.file.close()
# Enable the pipeline defined above (settings.py).  BUG FIX: the dotted path
# must match the real project package (tcent) and class (TencentPipeline);
# the original text pointed at a non-existent mySpider.TencentJsonPipeline.
ITEM_PIPELINES = {
    "tcent.pipelines.TencentPipeline": 300,
}
scrapy crawl myTencent
8、parse()メソッドの動作メカニズム: