(2018-05-23. Python from Zero to One) 7. (Spiders) scrapy-redis in Practice_1.7.6 Trying to Rewrite the Sina News Category Spider, Part 2


Convert the existing Sina news category Scrapy spider project into a distributed scrapy-redis spider project based on the RedisSpider class.
Note: the item data is stored directly in the Redis database, a feature scrapy-redis implements itself through its RedisPipeline. Unless you need extra processing, such as also writing the data to a local database, there is no need to write any code in pipelines.py.
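Concretely, RedisPipeline serializes every yielded item (JSON by default) and pushes it onto a Redis list named after the spider; with scrapy-redis's default %(spider)s:items key pattern, that is sina:items here. Below is a minimal consumer sketch, assuming pip install redis and the default key name:

# process_items.py -- drain the items that RedisPipeline stored in Redis.
# Minimal sketch; assumes the default sina:items key and `pip install redis`.
import json
import redis

r = redis.StrictRedis(host="192.168.13.26", port=6379)

while True:
    # blocking pop: waits until a serialized item arrives on sina:items
    key, data = r.blpop(["sina:items"])
    item = json.loads(data)
    print(item.get("head"), item.get("sonUrls"))

From here the items could be written to MongoDB, MySQL, or plain files, which is exactly the "extra processing" the note above refers to.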
The items.py file:
# items.py

# -*- coding: utf-8 -*-

import scrapy

import sys
reload(sys)                        # Python 2 only: re-expose setdefaultencoding
sys.setdefaultencoding("utf-8")    # avoid UnicodeDecodeError with Chinese text

class SinaItem(scrapy.Item):
    # Title and URL of the top-level (parent) category
    parentTitle = scrapy.Field()
    parentUrls = scrapy.Field()

    # Title and URL of the sub-category
    subTitle = scrapy.Field()
    subUrls = scrapy.Field()

    # Local storage directory for the sub-category (disabled)
    # subFilename = scrapy.Field()

    # Article URLs found under the sub-category
    sonUrls = scrapy.Field()

    # Article headline and body
    head = scrapy.Field()
    content = scrapy.Field()

The settings.py file:

SPIDER_MODULES = ['Sina.spiders']
NEWSPIDER_MODULE = 'Sina.spiders'

USER_AGENT = 'scrapy-redis (+https://github.com/rolando/scrapy-redis)'

DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"  # dedupe requests through Redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"              # share the request queue through Redis
SCHEDULER_PERSIST = True                                    # keep queue and dupefilter across restarts
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"  # priority queue (default)
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"   # FIFO alternative
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"   # LIFO alternative

ITEM_PIPELINES = {
#    'Sina.pipelines.SinaPipeline': 300,
    # store every scraped item in Redis (no local pipelines.py needed)
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

LOG_LEVEL = 'DEBUG'

# Introduce an artificial per-request delay; the parallelism across slave
# nodes keeps the overall crawl rate up while staying polite to the site.
DOWNLOAD_DELAY = 1

# Redis instance on the master node, shared by all slaves
REDIS_HOST = "192.168.13.26"
REDIS_PORT = 6379
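Everything the cluster shares now lives in that one Redis instance: with the defaults above, pending requests sit in the sorted set sina:requests, request fingerprints in the set sina:dupefilter, and scraped items in the list sina:items. A quick monitoring sketch, assuming the scrapy-redis default key names and pip install redis:

# inspect_crawl.py -- peek at the keys scrapy-redis maintains.
# Sketch only; key names assume the scrapy-redis defaults.
import redis

r = redis.StrictRedis(host="192.168.13.26", port=6379)

print("pending requests :", r.zcard("sina:requests"))    # priority queue = sorted set
print("seen fingerprints:", r.scard("sina:dupefilter"))  # dupefilter = set
print("items collected  :", r.llen("sina:items"))        # pipeline output = list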

The spiders/sina.py file:
# sina.py

# -*- coding: utf-8 -*-

from Sina.items import SinaItem
from scrapy_redis.spiders import RedisSpider
#from scrapy.spiders import Spider
import scrapy

import sys
reload(sys)                        # Python 2 only: re-expose setdefaultencoding
sys.setdefaultencoding("utf-8")    # avoid UnicodeDecodeError with Chinese text

#class SinaSpider(Spider):
class SinaSpider(RedisSpider):
    name = "sina"
    # the spider waits idle until start URLs are pushed to this Redis key
    redis_key = "sinaspider:start_urls"
    #allowed_domains = ["sina.com.cn"]
    #start_urls = [
    #    "http://news.sina.com.cn/guide/"
    #]  # the list of start URLs is now fed through redis_key instead

    def __init__(self, *args, **kwargs):
        domain = kwargs.pop('domain', '')
        self.allowed_domains = filter(None, domain.split(','))
        super(SinaSpider, self).__init__(*args, **kwargs)
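    # The domain filter can be supplied when launching the spider, e.g.
    # (assumption: comma-separated, following the scrapy-redis README pattern):
    #   scrapy runspider sina.py -a domain=sina.com.cn
    # With no -a domain argument, allowed_domains stays empty and the
    # OffsiteMiddleware filters nothing.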


    def parse(self, response):
        items = []

        # URLs and titles of all parent (top-level) categories
        parentUrls = response.xpath('//div[@id="tab01"]/div/h3/a/@href').extract()
        parentTitle = response.xpath('//div[@id="tab01"]/div/h3/a/text()').extract()

        # URLs and titles of all sub-categories
        subUrls = response.xpath('//div[@id="tab01"]/div/ul/li/a/@href').extract()
        subTitle = response.xpath('//div[@id="tab01"]/div/ul/li/a/text()').extract()

        # Walk every parent category
        for i in range(0, len(parentTitle)):

            # Directory name for this parent category (filesystem version, disabled)
            #parentFilename = "./Data/" + parentTitle[i]

            # Create the directory if it does not exist
            #if(not os.path.exists(parentFilename)):
            #    os.makedirs(parentFilename)

            # Walk every sub-category
            for j in range(0, len(subUrls)):
                item = SinaItem()

                # Keep the parent category's title and URL on the item
                item['parentTitle'] = parentTitle[i]
                item['parentUrls'] = parentUrls[i]

                # True when the sub-category URL starts with the parent URL,
                # e.g. sports.sina.com.cn/nba belongs under sports.sina.com.cn
                if_belong = subUrls[j].startswith(item['parentUrls'])

                # Only keep sub-categories that belong to this parent category
                if(if_belong):
                    #subFilename = parentFilename + '/' + subTitle[j]

                    # Create the sub-directory if it does not exist
                    #if(not os.path.exists(subFilename)):
                    #    os.makedirs(subFilename)

                    # Keep the sub-category's URL and title (and, if enabled, filename)
                    item['subUrls'] = subUrls[j]
                    item['subTitle'] = subTitle[j]
                    #item['subFilename'] = subFilename

                    items.append(item)

        # Request each sub-category URL; the item rides along in meta and is
        # handed to the second_parse callback together with the Response
        for item in items:
            yield scrapy.Request(url=item['subUrls'], meta={'meta_1': item}, callback=self.second_parse)

    # For each sub-category page, collect the article links it contains
    def second_parse(self, response):
        # Retrieve the item passed along in the Response's meta
        meta_1 = response.meta['meta_1']

        # Every link on the sub-category page
        sonUrls = response.xpath('//a/@href').extract()

        items = []
        for i in range(0, len(sonUrls)):
            # True when the link starts with the parent URL and ends in .shtml,
            # i.e. it is an article page belonging to this category
            if_belong = sonUrls[i].endswith('.shtml') and sonUrls[i].startswith(meta_1['parentUrls'])

            # If it belongs, copy the category fields into a fresh item
            if(if_belong):
                item = SinaItem()
                item['parentTitle'] = meta_1['parentTitle']
                item['parentUrls'] = meta_1['parentUrls']
                item['subUrls'] = meta_1['subUrls']
                item['subTitle'] = meta_1['subTitle']
                #item['subFilename'] = meta_1['subFilename']
                item['sonUrls'] = sonUrls[i]
                items.append(item)

        # Request each article URL; the item rides along in meta and is handed
        # to the detail_parse callback together with the Response
        for item in items:
            yield scrapy.Request(url=item['sonUrls'], meta={'meta_2': item}, callback=self.detail_parse)

    # Parse the article page: extract the headline and the body text
    def detail_parse(self, response):
        item = response.meta['meta_2']
        content = ""
        head = response.xpath('//h1[@id="main_title"]/text()').extract()
        content_list = response.xpath('//div[@id="artibody"]/p/text()').extract()

        # Concatenate the text of every <p> tag into the article body
        for content_one in content_list:
            content += content_one

        item['head'] = head[0] if len(head) > 0 else "NULL"
        item['content'] = content

        yield item

Running the crawl:
On each slave node:
scrapy runspider sina.py

On the master node, push the start URL to kick everything off:
redis-cli> lpush sinaspider:start_urls http://news.sina.com.cn/guide/
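The same seeding step can also be done from Python with redis-py; a minimal sketch, assuming pip install redis and that the master's Redis is the one configured in settings.py:

# seed_start_url.py -- equivalent to the redis-cli lpush above
import redis

r = redis.StrictRedis(host="192.168.13.26", port=6379)

# Every idle RedisSpider polls this key; pushing a URL starts the crawl
r.lpush("sinaspider:start_urls", "http://news.sina.com.cn/guide/")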