Spiderクラスのソースコードリファレンス(注釈多数・超詳細)

9872 ワード

2020年に別れを告げる
「ネズミ」年は本当に大変だった
2021年を迎える
「牛」年で運気を好転させよう
# Base class for spiders: subclasses provide a ``name`` and override ``parse``.
class Spider(object_ref):
    """Base class for spiders.

    A spider has a ``name``, a list of ``start_urls`` and a ``parse``
    callback.  ``start_requests`` turns every start URL into a ``Request``;
    subclasses must override ``parse`` to handle the resulting responses.
    """

    # Identifier of the spider.  It must end up truthy, either as a class
    # attribute on a subclass or through the ``name`` constructor argument;
    # ``__init__`` raises ``ValueError`` otherwise.
    name = None

    def __init__(self, name=None, **kwargs):
        """Initialise the spider.

        :param name: optional name overriding the class-level ``name``.
        :param kwargs: arbitrary extra attributes, copied verbatim onto the
            instance.
        :raises ValueError: if no name is given and the class does not
            define a truthy ``name``.
        """
        if name is not None:
            self.name = name
        # No explicit name: fall back to the class attribute, which must be
        # set to something truthy.
        elif not getattr(self, 'name', None):
            raise ValueError("%s must have a name" % type(self).__name__)

        # Every extra keyword argument becomes an instance attribute.
        self.__dict__.update(kwargs)

        # Guarantee that ``start_urls`` exists so ``start_requests`` can
        # always iterate over it, even when neither the subclass nor the
        # keyword arguments supplied one.
        if not hasattr(self, 'start_urls'):
            self.start_urls = []

    def log(self, message, level=log.DEBUG, **kw):
        """Forward ``message`` to ``log.msg`` with this spider attached.

        :param message: the message to log.
        :param level: log level, ``log.DEBUG`` by default.
        :param kw: extra keyword arguments passed through to ``log.msg``.
        """
        log.msg(message, spider=self, level=level, **kw)

    def set_crawler(self, crawler):
        """Bind this spider to ``crawler``; may only be done once.

        :raises AssertionError: if a crawler has already been set.
        """
        assert not hasattr(self, '_crawler'), "Spider already bounded to %s" % crawler
        self._crawler = crawler

    @property
    def crawler(self):
        """The crawler set through ``set_crawler``.

        :raises AssertionError: if no crawler has been set yet.
        """
        assert hasattr(self, '_crawler'), "Spider not bounded to any crawler"
        return self._crawler

    @property
    def settings(self):
        """Shortcut for ``self.crawler.settings``."""
        return self.crawler.settings

    def start_requests(self):
        """Yield one request per entry of ``start_urls``.

        Each request is built by ``make_requests_from_url``, so subclasses
        can customise the initial requests by overriding either method.
        """
        for url in self.start_urls:
            yield self.make_requests_from_url(url)

    def make_requests_from_url(self, url):
        """Build the ``Request`` for a single start URL.

        The request is created with ``dont_filter=True``.
        NOTE(review): presumably this exempts start URLs from duplicate
        filtering -- confirm against the ``Request`` documentation.
        """
        return Request(url, dont_filter=True)

    def parse(self, response):
        """Handle a response; must be overridden by subclasses.

        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    @classmethod
    def handles_request(cls, request):
        """Return ``url_is_from_spider(request.url, cls)``.

        NOTE(review): the helper is defined outside this class; presumably
        it tells whether the URL belongs to this spider -- confirm.
        """
        return url_is_from_spider(request.url, cls)

    def __str__(self):
        """Return ``<ClassName 'name' at 0x...>`` for logs and debugging."""
        # The format string needs one placeholder per tuple element; with an
        # empty template the ``%`` operator raises TypeError.
        return "<%s %r at 0x%0x>" % (type(self).__name__, self.name, id(self))

    __repr__ = __str__