Scrapy学習ノート1 --- クロールの完全な例
一、プロジェクトの作成
scrapy startproject dmoz
二、dmoz_spider.py を作成する
三、items.py を書き換える
四、pipelines.py を書き換える
五、dmozフォルダのルートディレクトリで実行する
scrapy crawl dmoz -o dmoz.json
spiderの実行
scrapy startproject dmoz
二、dmoz_spider.py を作成する
from scrapy.spider import Spider
from scrapy.selector import Selector
from dmoz.items import DmozItem
class DmozSpider(Spider):
    """Spider that scrapes link entries from two dmoz.org Python category pages.

    Crawling is restricted to ``dmoz.org`` and starts from the "Books" and
    "Resources" listings given in ``start_urls``.
    """

    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/",
    ]

    def parse(self, response):
        """Build one ``DmozItem`` per ``<li>`` of the ``directory-url`` list.

        The lines below are a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html

        @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        @scrapes name

        Args:
            response: The downloaded page handed to the callback.

        Returns:
            A list of ``DmozItem`` objects with ``name``, ``url`` and
            ``description`` filled in. Each field holds the list of matches
            returned by the selector, not a single string.
        """
        sel = Selector(response)
        sites = sel.xpath('//ul[@class="directory-url"]/li')
        items = []
        for site in sites:
            item = DmozItem()
            item['name'] = site.xpath('a/text()').extract()
            item['url'] = site.xpath('a/@href').extract()
            # Raw string so the regex engine, not the Python parser, interprets
            # the escapes: match "- " followed by everything up to the
            # carriage return that ends the line.
            item['description'] = site.xpath('text()').re(r'-\s[^\n]*\r')
            items.append(item)
        return items
三、items.py を書き換える
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
from scrapy.item import Item, Field
class DmozItem(Item):
    """Container for one directory entry scraped by the dmoz spider."""

    # Text of the entry's link (filled from the 'a/text()' XPath).
    name = Field()
    # Text following the link that matches the spider's description regex.
    description = Field()
    # Target of the entry's link (filled from the 'a/@href' XPath).
    url = Field()
四、pipelines.py を書き換える
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
from scrapy.item import Item, Field
class DmozItem(Item):
    """Scraped-item model declaring the three fields of a dmoz entry.

    NOTE(review): this listing sits under the pipeline heading but is
    identical to the items.py definition -- confirm which file it belongs to.
    """

    name = Field()         # link text of the entry
    description = Field()  # descriptive text extracted next to the link
    url = Field()          # href of the entry's link
五、dmozフォルダのルートディレクトリで実行する
scrapy crawl dmoz -o dmoz.json
spiderの実行