scrapy-redis 分散クローラーの事例(房天下)
8711 ワード
実行結果のスクリーンショット.
簡単な説明:この例では、Linux マシンで Redis サーバを動かし、2 台の Windows マシンで分散クローラーを実行します。クローラーはクロール対象の URL を Redis のキューから取得します。同時に Redis はキュー内の URL の重複排除を担当し、クローラーが何らかの原因で一時停止または終了した場合でも、次回の起動時に前回完了していない URL から自動的にクロールを再開するため、最初からクロールし直すことはありません。Redis 内の URL をすべてクロールし終えるとクローラーは待機状態になります。待機する回数を設定してしばらく待たせることができ、その間にクロール対象の URL がキューに追加されなかった場合は自動的にクローラーを終了させることができます。これにより、クローラーが待機状態のまま資源を占有し続けることを避けられます。
sfw.py
pipelines.py
settings.py
簡単な説明:この例では、Linux マシンで Redis サーバを動かし、2 台の Windows マシンで分散クローラーを実行します。クローラーはクロール対象の URL を Redis のキューから取得します。同時に Redis はキュー内の URL の重複排除を担当し、クローラーが何らかの原因で一時停止または終了した場合でも、次回の起動時に前回完了していない URL から自動的にクロールを再開するため、最初からクロールし直すことはありません。Redis 内の URL をすべてクロールし終えるとクローラーは待機状態になります。待機する回数を設定してしばらく待たせることができ、その間にクロール対象の URL がキューに追加されなかった場合は自動的にクローラーを終了させることができます。これにより、クローラーが待機状態のまま資源を占有し続けることを避けられます。
sfw.py
import scrapy
import re
from fang.items import NewHouseItem, OldHouseItem
from scrapy_redis.spiders import RedisSpider
class SfwSpider(RedisSpider):
    """Distributed fang.com spider whose start URLs come from a Redis key.

    ``parse`` walks the province/city table and schedules one new-house and
    one second-hand listing request per city.  ``parse_newhouse`` and
    ``parse_oldhouse`` turn listing pages into items and follow pagination.
    """

    name = 'sfw'
    allowed_domains = ['fang.com']
    # The non-Redis variant of this spider used
    # start_urls = ['https://www.fang.com/SoufunFamily.htm'];
    # here start URLs are read from the Redis key below instead.
    redis_key = 'fang:start_url'

    # NOTE(review): in the listing this spider was taken from, the marker
    # literals below were blank / whitespace-only.  They are compared against
    # text that has already had all whitespace removed, so they could never
    # match and the corresponding fields were never filled.  They look like
    # non-ASCII text lost in transcription and are restored here to the
    # markers used on fang.com pages -- confirm against the live markup.
    _SKIP_PROVINCE = '其它'       # table row that is skipped entirely
    _ROOM_SUFFIX = '居'           # suffix of room-layout entries (new houses)
    _PRICE_NOISE = r"\s|广告"     # whitespace and the ad label in price text
    _ROOMS_MARK = '厅'            # second-hand: rooms field
    _AREA_MARK = '㎡'             # second-hand: area field
    _FLOOR_MARK = '层'            # second-hand: floor field
    _TOWARD_MARK = '向'           # second-hand: orientation field
    _YEAR_MARK = '年建'           # second-hand: build-year field (stripped)

    def parse(self, response):
        """Yield listing requests for every city in the province/city table.

        Each request carries ``meta['info'] = (province, city)`` so the
        listing callbacks can tag their items.
        """
        rows = response.xpath("//div[@class='outCont']//tr")
        province = None
        for tr in rows:
            tds = tr.xpath(".//td[not(@class)]")
            if len(tds) < 2:
                # Not a province/city row; indexing would raise IndexError.
                continue
            province_text = tds[0].xpath(".//text()").get() or ''
            province_text = re.sub(r'\s', '', province_text)
            # An empty province cell means the row continues the previous
            # province, so the last non-empty value is kept.
            if province_text:
                province = province_text
            if province == self._SKIP_PROVINCE:
                continue
            for city_link in tds[1].xpath(".//a"):
                city = city_link.xpath(".//text()").get()
                city_url = city_link.xpath(".//@href").get()
                if not city_url or '//' not in city_url:
                    continue
                scheme, domain = city_url.split('//', 1)
                # Drop a trailing "/" so the joined path does not become
                # "//house/s/".
                domain = domain.rstrip('/')
                if 'bj.' in domain:
                    # Beijing uses the bare listing hosts.
                    new_house_url = 'https://newhouse.fang.com/house/s/'
                    old_house_url = 'https://esf.fang.com'
                else:
                    # Host names are case-insensitive; lower case is used to
                    # stay consistent with the Beijing URL above.
                    new_house_url = scheme + '//' + 'newhouse.' + domain + '/house/s/'
                    old_house_url = scheme + '//' + 'esf.' + domain
                info = (province, city)
                yield scrapy.Request(url=new_house_url,
                                     callback=self.parse_newhouse,
                                     meta={'info': info})
                yield scrapy.Request(url=old_house_url,
                                     callback=self.parse_oldhouse,
                                     meta={'info': info})

    def parse_newhouse(self, response):
        """Yield a ``NewHouseItem`` per listing, then the next-page request."""
        province, city = response.meta.get('info')
        for li in response.xpath("//div[contains(@class,'nl_con')]/ul/li"):
            name = li.xpath(".//div[@class='nlcd_name']/a/text()").get()
            if name is not None:
                name = name.strip()

            house_types = [
                re.sub(r"\s", "", text)
                for text in li.xpath(
                    ".//div[contains(@class, 'house_type')]/a/text()").getall()
            ]
            rooms = [t for t in house_types if t.endswith(self._ROOM_SUFFIX)]

            area = " ".join(
                li.xpath(".//div[contains(@class, 'house_type')]/text()").getall())
            area = re.sub(r"\s|-|/", "", area)

            address = li.xpath(".//div[@class='address']/a/@title").get()
            # The district is the text inside "[...]" in the address link.
            district_text = "".join(
                li.xpath(".//div[@class='address']/a//text()").getall())
            district_match = re.search(r".*\[(.+)\].*", district_text)
            district = district_match.group(1) if district_match else None

            sale = li.xpath(
                ".//div[contains(@class,'fangyuan')]/span/text()").get()
            price = "".join(
                li.xpath(".//div[@class = 'nhouse_price']//text()").getall())
            price = re.sub(self._PRICE_NOISE, "", price)

            url = li.xpath(".//div[@class = 'nlcd_name']/a/@href").get()
            if url is not None:
                # The href has no scheme, so "https:" is prefixed.
                url = "https:" + url

            yield NewHouseItem(province=province, city=city, name=name,
                               rooms=rooms, area=area, district=district,
                               address=address, price=price, sale=sale,
                               url=url)

        next_url = response.xpath("//a[@class='next']/@href").get()
        if next_url:
            # The request must be yielded; merely constructing it schedules
            # nothing, so pagination was silently skipped before.
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_newhouse,
                                 meta={'info': (province, city)})

    def parse_oldhouse(self, response):
        """Yield an ``OldHouseItem`` per listing, then the next-page request."""
        province, city = response.meta.get('info')
        dls = response.xpath(
            "//div[contains(@class, 'shop_list')]/dl[contains(@dataflag,'bg')]")
        for dl in dls:
            item = OldHouseItem(province=province, city=city)
            name = dl.xpath(".//p[@class='add_shop']/a/text()").get()
            # A missing name node is stored as None instead of raising.
            item['name'] = name.strip() if name is not None else None

            infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
            for info in (re.sub(r"\s", "", text) for text in infos):
                if self._ROOMS_MARK in info:
                    item['rooms'] = info
                elif self._AREA_MARK in info:
                    item['area'] = info
                elif self._FLOOR_MARK in info:
                    item['floor'] = info
                elif self._TOWARD_MARK in info:
                    item['toward'] = info
                elif self._YEAR_MARK in info:
                    item['year'] = info.replace(self._YEAR_MARK, "")

            item['address'] = dl.xpath(
                ".//p[@class='add_shop']/span/text()").get()
            item['price'] = "".join(dl.xpath(
                ".//dd[@class='price_right']/span[1]//text()").getall())
            item['unit'] = dl.xpath(
                ".//dd[@class='price_right']/span[2]//text()").get()
            detail_url = dl.xpath(".//h4/a/@href").get()
            item['url'] = response.urljoin(detail_url)
            yield item

        next_url = response.xpath(
            "//div[@class='page_al']/p[1]/a/@href").get()
        if next_url is not None:
            # Yield the request so the next page is actually crawled.
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_oldhouse,
                                 meta={'info': (province, city)})
pipelines.py
from scrapy.exporters import JsonLinesItemExporter
class FangPipeline(object):
    """Export scraped house items to JSON-lines files, one file per item type.

    New-house items go to ``newHouse.json``; every other item goes to
    ``oldHouse.json``.
    """

    def __init__(self):
        # Imported here so this class does not rely on a module-level import
        # line outside the block.  Only the new-house type is needed because
        # all other items are routed to the second-hand file.
        from fang.items import NewHouseItem
        self._new_house_item_cls = NewHouseItem
        self.newHouse_fp = open('newHouse.json', 'wb')
        self.oldHouse_fp = open('oldHouse.json', 'wb')
        self.newHouse_exporter = JsonLinesItemExporter(
            self.newHouse_fp, ensure_ascii=False)
        self.oldHouse_exporter = JsonLinesItemExporter(
            self.oldHouse_fp, ensure_ascii=False)

    def process_item(self, item, spider):
        """Write ``item`` to the file matching its type and return it.

        Previously every item was written to both files, so the two outputs
        were identical mixtures of new and second-hand listings.
        """
        if isinstance(item, self._new_house_item_cls):
            self.newHouse_exporter.export_item(item)
        else:
            self.oldHouse_exporter.export_item(item)
        return item

    def close_spider(self, spider=None):
        """Close both output files when the spider finishes.

        Scrapy calls this hook with the spider as an argument; the previous
        zero-argument signature raised ``TypeError`` at shutdown.  The
        default keeps argument-less calls working.
        """
        self.newHouse_fp.close()
        self.oldHouse_fp.close()
settings.py
# Scrapy-Redis configuration.
# Use the scrapy-redis scheduler class, so request scheduling goes through Redis.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Use the scrapy-redis duplicate filter for request de-duplication.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Send scraped items through the scrapy-redis item pipeline (order 300).
# NOTE(review): only RedisPipeline is registered here; a project pipeline
# such as FangPipeline would not run unless it is added -- confirm intent.
ITEM_PIPELINES = {
'scrapy_redis.pipelines.RedisPipeline': 300
}
# Keep the scrapy-redis state in Redis when the spider stops, so an
# interrupted crawl can resume instead of starting over.
# NOTE(review): semantics taken from the scrapy-redis documentation -- verify.
SCHEDULER_PERSIST = True
# Redis server connection.
# Local alternative: REDIS_HOST = '127.0.0.1'
REDIS_HOST = '192.168.150.134'
# NOTE(review): 8888 is not Redis's default port (6379) -- confirm the server
# is configured to listen on it.
REDIS_PORT = 8888
# Switch for the custom extension registered in EXTENSIONS below.
# NOTE(review): presumably read by fang.extensions, which is not shown here;
# the accompanying text says it closes the spider after it has idled on an
# empty Redis queue for a configured number of checks.
MYEXT_ENABLED = True
# Number of idle units to wait before closing.
# Earlier value: IDLE_NUMBER = 360 (the original note mentions 360 units and
# "5s", presumably 5 seconds per unit -- confirm in fang.extensions).
IDLE_NUMBER = 60
# Register the idle-close extension (order 500).  The dotted path must match
# the class name as spelled in fang.extensions.
EXTENSIONS = {
'fang.extensions.RedisSpiderSmartIdleClosedExensions': 500,
}