Migrating Scrapy from standalone to distributed (scrapy-redis)


1、Machines
Standalone Scrapy: one machine.
Distributed Scrapy: at least two machines.
2、Linux environment setup
2.1 Installing Redis
For details, see the separate guide on the full Redis installation procedure for Linux.
2.2 Installing the Python environment
For details on managing Python with Anaconda, see the separate guide on installing and configuring Anaconda on Linux.
3、Code changes
3.1 settings.py changes
Add the following to settings.py:

DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = False           # False: clear the Redis queue and dupefilter when the spider closes
SCHEDULER_FLUSH_ON_START = True     # True: flush the Redis queue when the spider starts
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"    # FIFO queue (SpiderQueue is an older name for FifoQueue)
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.FifoQueue"
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.PriorityQueue"
# REDIS_URL = 'redis://172.16.1.101:6379'
REDIS_HOST = '172.16.1.101'
REDIS_PORT = 6379

HTTPERROR_ALLOWED_CODES = [403]     # let 403 responses reach the spider callbacks instead of being dropped
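If the Redis instance requires authentication, the commented-out REDIS_URL form can carry the credentials in place of REDIS_HOST / REDIS_PORT. A minimal sketch; the password and the trailing database number are placeholders:

# Single connection URL as an alternative to REDIS_HOST / REDIS_PORT.
# 'your_password' and the '/0' database number are placeholder values.
REDIS_URL = 'redis://:your_password@172.16.1.101:6379/0'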


3.2 Spider code
1、Change the spider's base class to RedisCrawlSpider
# pip install scrapy-redis

from scrapy_redis.spiders import RedisCrawlSpider

class SampleSpider(RedisCrawlSpider):
	pass

2、Remove allowed_domains, start_urls, and start_requests; the start URLs are read from the Redis list named by redis_key instead (see 4.4).
3、The spider's entry point for the first URL must be parse() or custom rules, as in the stub below (a fuller sketch follows it).
    def parse(self, response):
    	pass
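
Putting these changes together, a minimal RedisCrawlSpider might look like the sketch below. The spider name, redis_key, link-extraction rule, and parse_item logic are illustrative assumptions, not this project's actual code.

# pip install scrapy-redis
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider


class SampleSpider(RedisCrawlSpider):
    name = 'sample'                    # matches "scrapy crawl sample" in run_scrapy.sh
    redis_key = 'sample:start_urls'    # the Redis list that run_scrapy_redis_start_url.sh pushes to
    # no start_urls / start_requests: start URLs are popped from the Redis list above

    rules = (
        # illustrative rule: follow detail-page links and hand them to parse_item
        Rule(LinkExtractor(allow=r'/detail/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # placeholder extraction logic
        yield {'url': response.url, 'title': response.css('title::text').get()}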

3.3 Extension: close the spider when it has been idle for a long time
Close the spider once it has been idle for one hour.
1、Add to settings.py:
MYEXT_ENABLED = True        # enable the custom idle-close extension
IDLE_NUMBER = 720           # idle threshold in idle-signal ticks; the idle signal fires about every 5 s, so 720 * 5 s = 1 hour

# Register the extension in EXTENSIONS to activate it ('project' is the Scrapy project package; adjust to your own)
EXTENSIONS = {
    'project.extensions.RedisSpiderSmartIdleClosedExensions': 500,
}


2、extensions.py: see the appendix (5.1) for the full code.
4、Task scheduling
4.1 crontab jobs on the Redis machine
# 00:05 daily: start the sample spider; 00:15 daily: push its start_url into Redis (the spider is already waiting when the URL arrives)

# comic_redis
5 0 * * * sh /home/hilqiqi0/workspace/crawler_redis/run_scrapy.sh sample >> /home/hilqiqi0/workspace/crawler_redis/log.txt 2>&1

15 0 * * * sh /home/hilqiqi0/workspace/crawler_redis/run_scrapy_redis_start_url.sh


4.2 crontab jobs on the other crawler machines
# 00:05 daily: start the spider on this machine

# comic_redis
5 0 * * * sh /home/hilqiqi0/workspace/crawler_redis/run_scrapy.sh qq >> /home/hilqiqi0/workspace/crawler_redis/log.txt 2>&1


4.3 Spider launch script: run_scrapy.sh
#!/bin/bash

# Activate the Python virtual environment if it exists; otherwise fall back to the system environment

# -f: true if the given file exists and is a regular file
if [ -f "/data2/hilqiqi0/venv/bin/activate" ]; then
  echo /data2/hilqiqi0/venv/bin/activate
  source /data2/hilqiqi0/venv/bin/activate
fi

if [ ! -f "/data2/hilqiqi0/venv/bin/activate" ]; then
  echo /etc/profile
  source /etc/profile
#  echo conda activate
#  conda activate
fi

cur_dateTime="`date +%Y-%m-%d,%H:%M:%S`"    # timestamp for the log
echo $cur_dateTime
cd /home/hilqiqi0/workspace/crawler_redis
echo $1                                     # $1 is the spider name passed in by crontab
echo log/log_$1.txt
scrapy crawl $1 > log/log_$1.txt 2>&1 &     # run the spider in the background, one log file per spider


/data2/hilqiqi0/venv/bin/activate: the activate script of the Python virtual environment.
4.4 Start-URL push script (kicks off the crawl): run_scrapy_redis_start_url.sh
#!/bin/bash
source /etc/profile

redis-cli lpush sample:start_urls http://www.sample.com    # the URL must include the scheme (http/https) or Scrapy will reject it
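
The same push can be done, and verified, from Python with the redis-py client. A minimal sketch, assuming redis-py is installed and the host and key match settings.py and the spider's redis_key:

# Push a start URL and check the queue length; host, port, and key mirror the settings above.
import redis

r = redis.Redis(host='172.16.1.101', port=6379)
r.lpush('sample:start_urls', 'http://www.sample.com')    # same effect as the redis-cli line above
print(r.llen('sample:start_urls'))                       # number of start URLs still waiting in the queue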



5、Appendix
5.1 extensions.py
# -*- coding: utf-8 -*-

# Custom extension: close a Redis-fed spider after it has been idle for too long


from scrapy import signals
from scrapy.exceptions import NotConfigured


class RedisSpiderSmartIdleClosedExensions(object):

    def __init__(self, idle_number, crawler):
        self.crawler = crawler
        self.idle_number = idle_number
        self.idle_list = []
        self.idle_count = 0

    @classmethod
    def from_crawler(cls, crawler):
        # first check if the extension should be enabled and raise
        # NotConfigured otherwise
        if not crawler.settings.getbool('MYEXT_ENABLED'):
            raise NotConfigured

        # this extension only makes sense for spiders that read start URLs from Redis
        if 'redis_key' not in crawler.spidercls.__dict__:
            raise NotConfigured('Only supports RedisSpider')

        # get the idle threshold from settings
        idle_number = crawler.settings.getint('IDLE_NUMBER', 360)

        # instantiate the extension object
        ext = cls(idle_number, crawler)

        # connect the extension object to signals
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(ext.spider_idle, signal=signals.spider_idle)

        return ext

    def spider_opened(self, spider):
        spider.logger.info("opened spider {}, Allow waiting time:{} second".format(spider.name, self.idle_number*5))

    def spider_closed(self, spider):
        spider.logger.info("closed spider {}, Waiting time exceeded {} second".format(spider.name, self.idle_number*5))

    def spider_idle(self, spider):
        # the idle signal fires roughly every 5 seconds while the spider has nothing to do;
        # count how long the redis_key has been missing and close the spider once the
        # threshold (IDLE_NUMBER) is exceeded
        if not spider.server.exists(spider.redis_key):
            self.idle_count += 1
        else:
            self.idle_count = 0

        if self.idle_count > self.idle_number:
            # idle for too long: shut the spider down
            self.crawler.engine.close_spider(spider, 'Waiting time exceeded')