Pythonベースの簡単なWebクローラー(糗事百科(Qiushibaike)クローラー)
3882 ワード
# -*- coding: utf-8 -*-
import urllib2
from bs4 import BeautifulSoup
import thread
import time
class qiushibaike(object):
    """Console reader for the "hot" listing pages of m.qiushibaike.com.

    A background thread downloads listing pages into a small buffer
    (``self.pages``) while the calling thread takes pages out of that
    buffer, extracts every post and prints its author, time and text.

    The code targets Python 2.7 (``urllib2``, ``thread``).  Every ``print``
    is written in the single-argument parenthesised form, which Python 2
    treats as an ordinary print statement.
    """

    def __init__(self):
        self.page = 1  # number of the next listing page to download
        self.pages = []  # downloaded pages (raw HTML) not yet printed
        self.enable = False  # LoadPage and Start keep looping while True
        self.url = 'http://m.qiushibaike.com/hot/page/'

    def LoadPage(self):
        """Keep up to five downloaded pages buffered in ``self.pages``.

        Loops until ``self.enable`` becomes false.  ``Start`` runs this
        method in a background thread.
        """
        while self.enable:
            if len(self.pages) >= 5:
                # Buffer is full: wait for the consumer to take a page.
                time.sleep(1)
                continue
            url = self.url + str(self.page)
            try:
                newPage = self.GetHtml(url)
            except Exception:
                # Best effort: report the failure and retry the same page.
                # Pause first so a persistent failure does not turn into a
                # tight request loop.
                print(' !')
                time.sleep(1)
                continue
            self.page += 1
            self.pages.append(newPage)

    def ParseHtml(self, html):
        """Print author, time and text of every post found in ``html``."""
        for item in self.GetContenBlock(html):
            content = self.ParseContent(item)
            try:
                # Same literals as before; the format string supplies the
                # single spaces a multi-item print statement inserts.
                print(u"%s %s %s %s" % (
                    u" ", content['author'], u" :", content["time"]))
                print(content["content"])
                print('------------------------------------------')
            except (UnicodeError, IOError):
                # The console could not encode or accept the text; skip
                # this post instead of aborting the whole page.
                print(u' !')

    def GetHtml(self, url):
        """Download ``url`` and return the raw response body.

        Errors raised by ``urllib2`` propagate to the caller.
        """
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        req = urllib2.Request(url, None, headers)
        response = urllib2.urlopen(req)
        try:
            return response.read()
        finally:
            # Release the connection even when read() fails.
            response.close()

    def GetContenBlock(self, html):
        """Return the ``div`` elements that each hold one post."""
        soup = BeautifulSoup(str(html))
        return soup.findAll('div', {'class': 'article block untagged mb15'})

    def ParseContent(self, item):
        """Extract one post into a dict.

        Returns a dict with the keys ``"content"`` (stripped text of the
        ``content`` div), ``"time"`` (its stripped ``title`` attribute) and
        ``"author"`` (see ``ParseAuthor``).  Any value that cannot be
        found is ``None``; a missing ``title`` no longer discards the text.
        """
        result = {"content": None, "time": None}
        content = item.find('div', {'class': 'content'})
        if content is not None:
            text = content.text
            if text is not None:
                result["content"] = text.strip()
            title = content.get("title")
            if title is not None:
                result["time"] = title.strip()
        result['author'] = self.ParseAuthor(item)
        return result

    def ParseAuthor(self, item):
        """Return the text of the second link in the author div, or None.

        ``None`` is returned when ``item`` is not searchable, has no
        ``author clearfix`` div, or that div has fewer than two links.
        """
        try:
            author = item.find('div', {'class': 'author clearfix'})
            if author is None:
                return None
            return author.findAll('a')[1].text
        except (AttributeError, IndexError):
            return None

    def Start(self):
        """Start the downloader thread and print pages as they arrive.

        Runs until ``self.enable`` is cleared or an exception (for example
        ``KeyboardInterrupt``) ends the loop.  ``self.enable`` is always
        reset on the way out so the downloader thread stops as well.
        """
        self.enable = True
        print(u' ......')
        thread.start_new_thread(self.LoadPage, ())
        try:
            while self.enable:
                if self.pages:
                    self.ParseHtml(self.pages.pop(0))
                else:
                    # Nothing buffered yet: yield the CPU instead of
                    # spinning.
                    time.sleep(0.1)
        finally:
            self.enable = False
# Banner printed once when the script starts.
BANNER = u"""
---------------------------------------
:
:0.1
:zz
:2013-05-15
:Python 2.7
:
---------------------------------------
"""


def main():
    """Show the banner, wait for the user to press Enter, then crawl."""
    print(BANNER)
    print(u' :')
    raw_input(' ')
    myModel = qiushibaike()
    myModel.Start()


# Run only when executed as a script, so importing this module neither
# blocks on input nor starts the crawl.
if __name__ == '__main__':
    main()