Scrapyを使って糗事百科(Qiushibaike)をスクレイピングする
3045 ワード
まず scrapy startproject tut01 を実行する
scrapy genspider qsbk www.qsbk.com
qsbk.py
scrapy genspider qsbk www.qsbk.com
qsbk.py
# -*- coding: utf-8 -*-
import scrapy
# Scheme + host that is prepended to site-relative "next page" links.
QSBK_HOST = u"http://www.qiushibaike.com"
# Module-wide counter; the spider increments it once per listing page it handles.
COUNT = 0
def debug(msg):
    """Log *msg* at DEBUG level through ``scrapy.log``, encoded as GB2312.

    *msg* may be a UTF-8 byte string or a text (unicode) string.  Bytes are
    decoded first; text is used as-is.  Characters that cannot be decoded or
    that have no GB2312 representation are replaced instead of raising, so a
    logging call can never abort the spider callback that issued it.
    """
    if isinstance(msg, bytes):
        # Byte strings are assumed to be UTF-8, as in the original helper.
        msg = msg.decode("utf-8", "replace")
    scrapy.log.msg(msg.encode("gb2312", "replace"), level=scrapy.log.DEBUG)
#
"""
"""
class QBItem(scrapy.Item):
    """Item holding the three text fields scraped for one post."""

    author = scrapy.Field()   # author text
    content = scrapy.Field()  # post body text
    ctr = scrapy.Field()      # counter text; printed under the label "clickcount"

    def __str__(self):
        """Return ``author:<a> content:<c> clickcount:<n>`` with GB2312 values.

        Fields that have not been set are rendered as empty strings, and
        characters without a GB2312 representation are replaced, so printing
        a partially filled or unusual item does not raise.
        """
        def _gb(field):
            # Missing field -> empty text; unencodable characters -> "?".
            return self.get(field, u"").encode("GB2312", "replace")

        return "%s:%s %s:%s %s:%s" % (
            'author',
            _gb('author'),
            'content',
            _gb('content'),
            'clickcount',
            _gb('ctr'),
        )
"""
USER_AGENT HEADERS,
"""
class QsbkSpider(scrapy.Spider):
    """Spider that walks qiushibaike.com listing pages and yields ``QBItem``s."""

    name = "qsbk"
    allowed_domains = ["www.qiushibaike.com"]
    start_urls = (
        'http://www.qiushibaike.com/',
    )

    def read_QBItems(self, response):
        """Yield one ``QBItem`` per post on the page, then follow the next page.

        Posts that lack any of the three expected elements are skipped and the
        page URL is logged.  If the page has a site-relative "next" link, a
        request for it is yielded with this method as its callback; when there
        is no such link the generator simply ends.
        """
        global COUNT
        COUNT += 1
        debug(" {} ".format(COUNT))

        css_block = "div.article.block"
        css_author = "div.author h2::text"
        css_content = "div.content::text"
        css_ctr = "div > span > i::text"

        for b in response.selector.css(css_block):
            try:
                # extract() returns a list; [0] raises IndexError when the
                # selector matched nothing inside this post.
                qb = QBItem()
                qb['author'] = b.css(css_author).extract()[0]
                qb['content'] = b.css(css_content).extract()[0]
                qb['ctr'] = b.css(css_ctr).extract()[0]
            except IndexError:
                debug(" {}".format(response.url.encode("GB2312")))
                continue
            yield qb

        css_next = "div.pageto a.next::attr(href)"
        next_links = response.selector.css(css_next).extract()
        # The last page has no "next" link, so indexing blindly would raise.
        # Only site-relative links are followed.
        if next_links and next_links[0].startswith(u'/'):
            next_url = QSBK_HOST + next_links[0]
            yield scrapy.Request(next_url, self.read_QBItems)

    def parse(self, response):
        """Entry callback for ``start_urls``; delegates to ``read_QBItems``.

        Logs a start message when no page has been counted yet.
        """
        if not COUNT:
            debug(" ...")
        return self.read_QBItems(response)
tut01/settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for tut01 project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
# Project name, and the single package that holds existing and newly
# generated spiders.
BOT_NAME = 'tut01'
NEWSPIDER_MODULE = 'tut01.spiders'
SPIDER_MODULES = [NEWSPIDER_MODULE]

# Crawl responsibly by identifying yourself (and your website) on the user-agent.
# A desktop Chrome User-Agent is sent instead of Scrapy's default one
# (presumably because QSBK rejects the default -- confirm).
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 6.1) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/45.0.2454.101 Safari/537.36'
)
'''
DEFAULT_REQUEST_HEADERS = {
'Proxy-Connection': 'keep-alive',
'Cache-Control': 'max-age=0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, sdch',
'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
}
'''