Spiderクラスソースコードリファレンス(超多注釈超詳細)
2020に別れを告げる
実の「ネズミ」は容易ではない
2021を迎える
「牛」が乾坤に転じる
実の「ネズミ」は容易ではない、2021を迎える、「牛」が乾坤に転じる。
# Spider base class; subclasses are expected to override parse().
class Spider(object_ref):
    """Base spider holding a name, a list of start URLs and a bound crawler.

    Subclasses are expected to provide ``name`` (as a class attribute or via
    the constructor) and to override :meth:`parse`, which raises
    ``NotImplementedError`` in this base class.
    """

    # Identifier of the spider. Must be truthy after construction, either
    # set on the subclass or passed to __init__; otherwise ValueError.
    name = None

    def __init__(self, name=None, **kwargs):
        """Initialise the spider.

        :param name: optional spider name; overrides the class-level
            ``name`` when it is not ``None``.
        :param kwargs: arbitrary extra attributes, copied verbatim onto the
            instance.
        :raises ValueError: if no name was passed and the class/instance has
            no truthy ``name`` attribute.
        """
        if name is not None:
            self.name = name
        elif not getattr(self, 'name', None):
            # Neither the argument nor the class attribute supplies a name.
            raise ValueError("%s must have a name" % type(self).__name__)
        # Expose every extra keyword argument as an instance attribute.
        self.__dict__.update(kwargs)
        # Guarantee start_urls exists so start_requests() can iterate it,
        # without clobbering a value from the subclass or from kwargs.
        if not hasattr(self, 'start_urls'):
            self.start_urls = []

    # NOTE: inside this method (and in the default argument) ``log`` resolves
    # to the module-level ``log`` object, not to this method: class scope is
    # not searched from function bodies, and the default is evaluated before
    # the method name is bound in the class namespace.
    def log(self, message, level=log.DEBUG, **kw):
        """Forward ``message`` to ``log.msg`` tagged with this spider.

        :param message: the message to log.
        :param level: log level, ``log.DEBUG`` by default.
        :param kw: extra keyword arguments passed through to ``log.msg``.
        """
        log.msg(message, spider=self, level=level, **kw)

    def set_crawler(self, crawler):
        """Bind ``crawler`` to this spider; may only be done once.

        :raises AssertionError: if a crawler has already been bound.
        """
        assert not hasattr(self, '_crawler'), "Spider already bounded to %s" % crawler
        self._crawler = crawler

    @property
    def crawler(self):
        """The crawler bound via :meth:`set_crawler`.

        :raises AssertionError: if no crawler has been bound yet.
        """
        assert hasattr(self, '_crawler'), "Spider not bounded to any crawler"
        return self._crawler

    @property
    def settings(self):
        """Shortcut for ``self.crawler.settings`` (requires a bound crawler)."""
        return self.crawler.settings

    def start_requests(self):
        """Yield one request per entry in ``start_urls``.

        Each URL is converted with :meth:`make_requests_from_url`, so
        subclasses can customise the initial requests by overriding either
        method.
        """
        for url in self.start_urls:
            yield self.make_requests_from_url(url)

    def make_requests_from_url(self, url):
        """Build the ``Request`` used by :meth:`start_requests` for ``url``.

        ``dont_filter=True`` is passed to ``Request``; presumably this
        exempts start URLs from duplicate filtering -- confirm against the
        Request API.
        """
        return Request(url, dont_filter=True)

    def parse(self, response):
        """Handle a downloaded ``response``; must be overridden by subclasses.

        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    @classmethod
    def handles_request(cls, request):
        """Return ``url_is_from_spider(request.url, cls)`` for ``request``."""
        return url_is_from_spider(request.url, cls)

    def __str__(self):
        """Return ``<ClassName 'name' at 0xID>`` for logs and debugging."""
        # The original format string was empty, so the % operator raised
        # TypeError ("not all arguments converted") on every str()/repr().
        return "<%s %r at 0x%0x>" % (type(self).__name__, self.name, id(self))

    __repr__ = __str__