Scrapy爬取データをMongodbに格納
リード
今回の内容: エントリファイルの作成(ファイル名と実装コードは以下に示す)
itemsフィールド定義
ミドルウェア新規ランダムuser-agent
pipelinesを定義してMongodbにデータを格納
その他の知識点 Mongodb windowsのインストールと使用:https://www.cnblogs.com/hongwest/p/7298257.html Mongodbコマンドライン操作データベース:https://www.cnblogs.com/lecaf/p/mongodb.html xpath Googleブラウザプラグイン:
今回は scrapy を用いて豆瓣(Douban)映画 Top250 のデータを抽出し、csvファイルおよびjsonファイルに格納し、最後にこれらのデータをすべてMongoDBに格納した。関連する知識点は pipeline、yield、ミドルウェア、xpath、items の使用である。

エントリファイルの作成
douban_spider.py
# -*- coding: utf-8 -*-
import scrapy
from douban.items import DoubanItem


class DoubanSpiderSpider(scrapy.Spider):
    """Crawl the Douban Movie Top 250 list and yield one DoubanItem per movie.

    Follows the "next page" link until the last page is reached.
    """

    # Unique spider name used by `scrapy crawl douban_spider`.
    name = 'douban_spider'
    # Off-site requests are filtered against this domain list.
    allowed_domains = ['movie.douban.com']
    # First page of the Top 250 list.
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        """Extract every movie entry on the page, then follow pagination."""
        movie_list = response.xpath('//div[@class="article"]//ol[@class="grid_view"]/li')
        for it in movie_list:
            douban_item = DoubanItem()
            douban_item['serial_number'] = it.xpath(".//div[@class='item']//em/text()").extract_first()
            douban_item['movie_name'] = it.xpath('.//div[@class="hd"]//a/span[1]/text()').extract_first()
            content = it.xpath('.//div[@class="bd"]//p[1]/text()').extract()
            # BUG FIX: the original loop reassigned 'introduce' on every
            # iteration, so only the LAST text node survived. Normalize the
            # whitespace of each line and join them all instead.
            douban_item['introduce'] = ';'.join("".join(c.split()) for c in content)
            douban_item['star'] = it.xpath('.//div[@class="star"]/span[@class="rating_num"]/text()').extract_first()
            douban_item['evaluate'] = it.xpath('.//div[@class="star"]/span[4]/text()').extract_first()
            douban_item['describe'] = it.xpath('.//p[@class="quote"]/span/text()').extract_first()
            yield douban_item
        # The last page has no "next" span, which ends the crawl naturally.
        next_link = response.xpath('//span[@class="next"]/a/@href').extract()
        if next_link:
            # next_link[0] is a relative query string like '?start=25&filter='.
            yield scrapy.Request("https://movie.douban.com/top250" + next_link[0],
                                 callback=self.parse)
itemsフィールド定義
import scrapy


class DoubanItem(scrapy.Item):
    """Fields for one movie scraped from the Douban Top 250 list."""

    # Ranking position within the Top 250 (from the <em> tag).
    serial_number = scrapy.Field()
    # Movie title (first <span> inside the "hd" block).
    movie_name = scrapy.Field()
    # Whitespace-normalized description line(s) from the "bd" block.
    introduce = scrapy.Field()
    # Rating number text (the "rating_num" span).
    star = scrapy.Field()
    # Evaluation-count text (fourth span of the "star" block).
    evaluate = scrapy.Field()
    # Short quote text from the "quote" paragraph.
    describe = scrapy.Field()
ミドルウェア新規ランダムuser-agent
class my_user_agent(object):
    """Downloader middleware that sets a random User-Agent on each request."""

    def process_request(self, request, spider):
        """Pick a random browser User-Agent and attach it to the request.

        Returning None lets Scrapy continue processing the request normally.
        """
        # Function-scope import: the original snippet used `random` without
        # importing it anywhere, which raised NameError at runtime.
        import random
        # Pool of browser User-Agent strings to rotate through.
        USER_AGENT_LIST = [
            'MSIE (MSIE 6.0; X11; Linux; i686) Opera 7.23',
            'Opera/9.20 (Macintosh; Intel Mac OS X; U; en)',
            'Opera/9.0 (Macintosh; PPC Mac OS X; U; en)',
            'iTunes/9.0.3 (Macintosh; U; Intel Mac OS X 10_6_2; en-ca)',
            'Mozilla/4.76 [en_jp] (X11; U; SunOS 5.8 sun4u)',
            'iTunes/4.2 (Macintosh; U; PPC Mac OS X 10.2)',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:5.0) Gecko/20100101 Firefox/5.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:9.0) Gecko/20100101 Firefox/9.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/20120813 Firefox/16.0',
            'Mozilla/4.77 [en] (X11; I; IRIX;64 6.5 IP30)',
            'Mozilla/4.8 [en] (X11; U; SunOS; 5.7 sun4u)'
        ]
        # BUG FIX: the header name must be 'User-Agent' (hyphen). The original
        # 'User_Agent' key is not the standard header, so servers (and
        # Scrapy's own UA handling) would ignore it.
        request.headers['User-Agent'] = random.choice(USER_AGENT_LIST)
pipelinesを定義してMongodbにデータを格納
import pymongo
from douban.settings import mongo_host,mongo_port,mongo_db_name,mongo_db_collecttion
class DoubanPipeline(object):
    """Item pipeline that writes every scraped item into a MongoDB collection.

    Connection parameters come from the project's settings module
    (mongo_host, mongo_port, mongo_db_name, mongo_db_collecttion — the
    misspelled setting name is kept as-is to match the settings file).
    """

    def __init__(self):
        # Open the client once per pipeline instance and keep only the
        # collection handle around for process_item.
        client = pymongo.MongoClient(host=mongo_host, port=mongo_port)
        mydb = client[mongo_db_name]
        self.post = mydb[mongo_db_collecttion]

    def process_item(self, item, spider):
        """Persist one item and pass it on to the next pipeline stage."""
        data = dict(item)
        # BUG FIX: Collection.insert() was deprecated in PyMongo 3 and
        # removed in PyMongo 4; insert_one() is the supported equivalent
        # for a single document.
        self.post.insert_one(data)
        return item
その他の知識点: xpath 用の Google Chrome ブラウザプラグインとして XPath Helper を利用できる。