Distributed crawler: scraping Dangdang book data
Scraping Dangdang book data
Distributed crawler
How to run it:
1. Launch the spider file with runspider (several instances can be started, on one machine or many); the spider(s) then idle, waiting for start URLs:
scrapy runspider myspider_redis.py
2. On the Master, push a start URL into Redis from redis-cli, in this format:
$redis > lpush myspider:start_urls http://www.dmoz.org/
3. The Slaver spiders pick up the request and start crawling.
To inspect the queued start URLs:
lrange mycrawler:start_url 0 -1
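Before the spider below can run distributed, the project's settings.py has to route scheduling and deduplication through Redis. A minimal sketch of the scrapy-redis settings (the Redis address and the dangdang module path are assumptions; adjust them to your project):
# settings.py -- minimal scrapy-redis wiring (sketch; values are assumptions)
SCHEDULER = "scrapy_redis.scheduler.Scheduler"               # schedule requests through Redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"   # shared request fingerprints
SCHEDULER_PERSIST = True                                     # keep queue/fingerprints across restarts
REDIS_URL = "redis://127.0.0.1:6379"                         # assumed local Redis instance
ITEM_PIPELINES = {
    "dangdang.pipelines.DangdangPipeline": 300,              # pipeline shown later in this post
}
With this in place, every spider process connected to the same Redis shares one request queue, which is what makes the Master/Slaver split work.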
spider.py
# -*- coding: utf-8 -*-
import scrapy
from copy import deepcopy
from scrapy_redis.spiders import RedisSpider


class DdSpider(RedisSpider):
    name = 'dd'
    allowed_domains = ['dangdang.com']
    # start_urls = ['http://book.dangdang.com/']
    # seed the queue instead: lpush dangdang http://book.dangdang.com/
    redis_key = "dangdang"

    def parse(self, response):
        # top-level category blocks; the first and last div are not categories
        div_list = response.xpath("//div[@class='con flq_body']/div")[1:-1]
        for div in div_list:
            item = {}
            # big category name
            item["b_cate"] = div.xpath(".//dl[contains(@class,'primary_dl')]/dt//text()").extract()
            # middle-level category groups inside this block
            dl_list = div.xpath(".//dl[@class='inner_dl']")
            for dl in dl_list:
                # middle category name
                item["m_cate"] = dl.xpath("./dt//text()").extract()
                # small categories: one link each
                a_list = dl.xpath("./dd/a")
                for a in a_list:
                    # small category name
                    item["s_cate"] = a.xpath("./text()").extract_first()
                    # small category URL
                    item["s_href"] = a.xpath("./@href").extract_first()
                    # follow the small category URL to its book list;
                    # deepcopy so concurrent requests don't share one dict
                    if item["s_href"] is not None:
                        yield scrapy.Request(
                            response.urljoin(item["s_href"]),
                            callback=self.parse_book_list,
                            meta={"item": deepcopy(item)}
                        )

    def parse_book_list(self, response):  # extract the book list of one category
        item = response.meta["item"]
        # each li is one book
        li_list = response.xpath("//ul[@class='bigimg']/li")
        for li in li_list:
            item["book_name"] = li.xpath("./a/@title").extract_first()
            item["book_href"] = li.xpath("./a/@href").extract_first()
            item["book_author"] = li.xpath(".//p[@class='search_book_author']/span[1]/a/@title").extract()
            item["book_pub_data"] = li.xpath(".//p[@class='search_book_author']/span[2]/text()").extract_first()
            item["book_press"] = li.xpath(".//p[@class='search_book_author']/span[3]/a/@title").extract_first()
            item["book_desc"] = li.xpath(".//p[@class='detail']/text()").extract_first()
            item["book_price"] = li.xpath(".//p[@class='price']/span[1]/text()").extract_first()
            item["book_store_name"] = li.xpath(".//span[@class='new_lable']/span[1]/text()").extract_first()
            # self-operated books carry no store label; fall back to a placeholder
            item["book_store_name"] = " " if item["book_store_name"] is None else item["book_store_name"]
            yield item
        # follow pagination within the same category
        next_url = response.xpath("//li[@class='next']/a/@href").extract_first()
        if next_url is not None:
            yield scrapy.Request(
                response.urljoin(next_url),
                callback=self.parse_book_list,
                meta={"item": item}
            )
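Once a few spider processes are waiting, the queue can be seeded from the Master either with redis-cli (as in the comment above) or from Python via redis-py. A small sketch, assuming Redis on 127.0.0.1:6379 and the dangdang key defined by redis_key:
# seed_start_url.py -- push the start URL the waiting spiders will pick up (sketch)
import redis

r = redis.Redis(host="127.0.0.1", port=6379)
r.lpush("dangdang", "http://book.dangdang.com/")  # same key as DdSpider.redis_key
print(r.lrange("dangdang", 0, -1))                # peek at the queue
Note the deepcopy in parse above: many requests are scheduled before any response comes back, so each request needs its own copy of the partially-filled item, or they would all share (and overwrite) one dict.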
pipeline data processing
import re


class DangdangPipeline(object):
    def process_item(self, item, spider):
        # the category fields come out of xpath as lists of raw strings
        item["b_cate"] = self.process_content(item["b_cate"])
        item["m_cate"] = self.process_content(item["m_cate"])
        print(item)
        return item

    def process_content(self, content):  # clean a list of extracted strings
        # strip whitespace (\r\n, spaces) and non-breaking spaces (\xa0)
        content = [re.sub(r"\s|\xa0", "", i) for i in content]
        # drop entries that are now empty
        content = [i for i in content if len(i) > 0]
        return content
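To see what process_content actually does, here is a hypothetical before/after (the sample strings are made up, shaped like what extract() returns for the category fields):
# hypothetical input: raw text nodes, with stray whitespace and \xa0
raw = ["\r\n    小说\xa0", "  ", "文学"]
cleaned = DangdangPipeline().process_content(raw)
print(cleaned)  # ['小说', '文学']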