Scrapyを使って糗事百科（Qiushibaike）をクロールする

3045 ワード

まず次のコマンドでプロジェクトとスパイダーを作成する： scrapy startproject tut01
scrapy genspider qsbk www.qsbk.com
qsbk.py

# -*- coding: utf-8 -*-
import scrapy

# Scheme + host that QsbkSpider.read_QBItems prepends to relative
# "next page" links (hrefs starting with "/").
QSBK_HOST = u"http://www.qiushibaike.com"
# Module-level counter of responses handled by QsbkSpider.read_QBItems;
# it is incremented once per call and used only in debug output.
COUNT = 0

def debug(msg):
	"""Log *msg* at DEBUG level through ``scrapy.log`` as GB2312 bytes.

	Args:
		msg: Either a byte string holding UTF-8 text or a unicode string.

	Byte strings are decoded as UTF-8 first; unicode strings are used
	as-is (calling ``.decode`` on them would trigger an implicit ASCII
	encode under Python 2 and fail on any non-ASCII character).
	Undecodable bytes and characters that GB2312 cannot represent are
	replaced rather than raising, so a debug call can never abort the
	crawl because of an encoding error.
	"""
	if isinstance(msg, bytes):
		# ``bytes`` is an alias of ``str`` on Python 2.6+.
		msg = msg.decode("utf-8", "replace")
	scrapy.log.msg(msg.encode("gb2312", "replace"), level=scrapy.log.DEBUG)

# Item definition for one scraped post (the original comment text was lost).
"""
  
  
   
"""
class QBItem(scrapy.Item):
	"""Scrapy item with three fields: ``author``, ``content`` and ``ctr``."""
	author = scrapy.Field()
	content = scrapy.Field()
	# Printed under the label 'clickcount' by __str__.
	ctr = scrapy.Field()

	def __str__(self):
		"""Return ``author:<a> content:<c> clickcount:<n>`` with GB2312 values.

		Each field value is encoded to GB2312 with the "replace" error
		handler: scraped text may contain characters GB2312 cannot
		represent, and a strict encode would raise UnicodeEncodeError
		just from printing the item.

		Raises:
			KeyError: if any of the three fields has not been set.
		"""
		return "%s:%s %s:%s %s:%s" % (
			'author',
			self['author'].encode("GB2312", "replace"),
			'content',
			self['content'].encode("GB2312", "replace"),
			'clickcount',
			self['ctr'].encode("GB2312", "replace"),
		)
"""
      
    Note: USER_AGENT / HEADERS are configured in settings.py (the rest of this note was lost).
"""

class QsbkSpider(scrapy.Spider):
	"""Spider that starts at the qiushibaike.com front page and follows
	the "next page" link, yielding one QBItem per post block found.
	"""
	name = "qsbk"
	allowed_domains = ["www.qiushibaike.com",]
	start_urls = (
		'http://www.qiushibaike.com/',
	)

	def read_QBItems(self, response):
		"""Yield the QBItems found in *response*, then a Request for the next page.

		Args:
			response: The downloaded listing page.

		Yields:
			QBItem: one per ``div.article.block`` whose author, content
				and counter selectors all match.
			scrapy.Request: for the next page, with this method as the
				callback, when a relative "next" link is present.

		Side effects:
			Increments the module-level COUNT and logs it via debug().
		"""
		global COUNT
		COUNT += 1
		debug("     {} ".format(COUNT))

		css_block = "div.article.block"
		css_author = "div.author h2::text"
		css_content = "div.content::text"
		css_ctr = "div > span > i::text"

		for b in response.selector.css(css_block):
			try:
				qb = QBItem()
				qb['author'] = b.css(css_author).extract()[0]
				qb['content'] = b.css(css_content).extract()[0]
				qb['ctr'] = b.css(css_ctr).extract()[0]
			except Exception:
				# Best effort: a block missing any of the three parts is
				# skipped and the page URL is logged, so the remaining
				# blocks and the next-page request are still produced.
				debug("    {}".format(response.url.encode("GB2312")))
			else:
				yield qb

		css_next = "div.pageto a.next::attr(href)"
		next_links = response.selector.css(css_next).extract()
		# The last page has no "next" link; indexing the empty result
		# unconditionally would raise IndexError inside the callback.
		if next_links and next_links[0].startswith(u'/'):
			next_url = QSBK_HOST + next_links[0]
			yield scrapy.Request(next_url, self.read_QBItems)

	def parse(self, response):
		"""Default Scrapy callback: log once before the first page, then
		delegate to read_QBItems and return its generator.
		"""
		# COUNT is still 0 only before read_QBItems has run for the first time.
		if not COUNT:
			debug("        ...")
		return self.read_QBItems(response)

tut01/settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for tut01 project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'tut01'

SPIDER_MODULES = ['tut01.spiders']
NEWSPIDER_MODULE = 'tut01.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent

# Send a desktop Chrome User-Agent string instead of Scrapy's default one.
# NOTE(review): the original comment text was lost; presumably QSBK rejects
# requests carrying the default Scrapy user agent -- confirm.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
# The triple-quoted string below is a bare expression statement, so the
# DEFAULT_REQUEST_HEADERS inside it are NOT applied; it only preserves the
# headers as disabled configuration.
'''
DEFAULT_REQUEST_HEADERS = {
	'Proxy-Connection': 'keep-alive',
	'Cache-Control': 'max-age=0',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
	'Accept-Encoding': 'gzip, deflate, sdch',
	'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
}
'''

How to get AVFrame(ffmpeg)from NSImage/UImage

zip中国語のファイル名の圧縮（圧縮ファイルまたはフォルダの対応）