Distributed crawler ---- Dangdang book data spider


This example uses a distributed (scrapy-redis) spider to scrape book data from dangdang.com.

How a scrapy-redis spider is started:
Run the spider's .py file with runspider (this can be repeated on several machines); the spider(s) then sit idle, waiting for start URLs:

scrapy runspider myspider_redis.py
On the Master, push a start URL onto the spider's queue with redis-cli, for example:

$redis > lpush myspider:start_urls http://www.dmoz.org/
The Slaver spiders then pick up the request and start crawling. You can check the start-URL list in Redis at any time with:

lrange myspider:start_urls 0 -1
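
For the spider to wait on Redis like this, the project must enable the scrapy_redis scheduler and dupefilter in settings.py. A minimal sketch is shown below; the project name dangdang and the Redis address are assumptions, so adjust them to your own project:

# settings.py -- minimal scrapy_redis configuration (sketch)
SCHEDULER = "scrapy_redis.scheduler.Scheduler"               # schedule requests through Redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"   # keep request fingerprints in Redis
SCHEDULER_PERSIST = True                                     # keep the queue/fingerprints after the spider closes
REDIS_URL = "redis://127.0.0.1:6379"                         # address of the shared Redis (assumption)

ITEM_PIPELINES = {
    "dangdang.pipelines.DangdangPipeline": 300,              # the pipeline shown below (assumed project name)
    # "scrapy_redis.pipelines.RedisPipeline": 400,           # optional: also store items in Redis
}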

spider.py
# -*- coding: utf-8 -*-
import scrapy
from copy import deepcopy
from scrapy_redis.spiders import RedisSpider


class DdSpider(RedisSpider):
    name = 'dd'
    allowed_domains = ['dangdang.com']
    # start_urls = ['http://book.dangdang.com/']
    # seed the start URL from redis-cli:  lpush dangdang http://book.dangdang.com/
    redis_key = "dangdang"

    def parse(self, response):
        # top-level category blocks (skip the first and last div, which are not categories)
        dl_list = response.xpath("//div[@class='con flq_body']/div")[1:-1]
        for dl in dl_list:
            item = {}
            # big category name
            item["b_cate"] = dl.xpath(".//dl[contains(@class,'primary_dl')]/dt//text()").extract()

            # middle category blocks inside the current big category
            dll_list = dl.xpath(".//dl[@class='inner_dl']")
            for dll in dll_list:
                # middle category name
                item["m_cate"] = dll.xpath("./dt//text()").extract()
                # anchor tags of the small categories
                a_list = dll.xpath("./dd/a")
                for a in a_list:
                    # small category name
                    item["s_cate"] = a.xpath("./text()").extract_first()
                    # small category url
                    item["s_href"] = a.xpath("./@href").extract_first()
                    # request the small-category page to get its book list
                    yield scrapy.Request(
                        item["s_href"],
                        callback=self.parse_book_list,
                        meta={"item": deepcopy(item)}
                    )

    def parse_book_list(self, response):  # parse one page of a small category's book list
        # item carried over from parse() via meta
        item = response.meta["item"]
        # one li per book
        li_list = response.xpath("//ul[@class='bigimg']/li")
        for li in li_list:
            item["book_name"] = li.xpath("./a/@title").extract_first()
            item["book_href"] = li.xpath("./a/@href").extract_first()
            item["book_author"] = li.xpath(".//p[@class='search_book_author']/span[1]/a/@title").extract()
            item["book_pub_data"] = li.xpath(".//p[@class='search_book_author']/span[2]/text()").extract_first()
            item["book_press"] = li.xpath(".//p[@class='search_book_author']/span[3]/a/@title").extract_first()
            item["book_desc"] = li.xpath(".//p[@class='detail']/text()").extract_first()
            item["book_price"] = li.xpath(".//p[@class='price']/span[1]/text()").extract_first()
            item["book_store_name"] = li.xpath(".//span[@class='new_lable']/span[1]/text()").extract_first()
            item["book_store_name"] = "    " if item["book_store_name"] is None else item["book_store_name"]
            yield item

        # next page of the book list
        next_url = response.xpath("//li[@class='next']/a/@href").extract_first()
        if next_url is not None:
            # the href is relative, so build an absolute url before requesting it
            yield scrapy.Request(
                response.urljoin(next_url),
                callback=self.parse_book_list,
                meta={"item": item}
            )
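
Putting it together for this spider, a typical run looks like the following. The file name spider.py and the project layout are assumptions; the Redis key dangdang matches the redis_key declared above:

# on every Slaver machine: start the spider; it blocks, waiting for a start URL
scrapy runspider spider.py        # or: scrapy crawl dd  (inside the project)

# on the Master: seed the Redis list named in redis_key
redis-cli lpush dangdang http://book.dangdang.com/

# check what is still queued (the Slavers pop it once they pick it up)
redis-cli lrange dangdang 0 -1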

pipelines.py: processing the scraped item
import re
class DangdangPipeline(object):
    def process_item(self, item, spider):
        item["b_cate"] = self.process_content(item["b_cate"])
        item["m_cate"] = self.process_content(item["m_cate"])
        print(item)
        return item

    def process_content(self, content):  # clean a list of extracted text fragments
        # strip \r\n, non-breaking spaces (\xa0) and other whitespace
        content = [re.sub(r"\r\n|\xa0|\s", "", i) for i in content]
        # drop fragments that are now empty
        content = [i for i in content if len(i) > 0]
        return content