Pythonベースの簡単なクローラー(糗事百科クローラー)

3882 ワード

# -*- coding: utf-8 -*-
import urllib2
from bs4 import BeautifulSoup
import thread
import time
class qiushibaike:
    """Minimal crawler for the "hot" listing pages of m.qiushibaike.com.

    A worker thread (``LoadPage``) downloads listing pages into ``self.pages``
    while the thread that called ``Start`` takes pages off that list, parses
    them and prints every post found.

    NOTE(review): several printed label strings below consist only of spaces.
    They look like non-ASCII text that was lost before this file reached us;
    they are kept byte-for-byte here -- restore them from the original source
    if it is available.
    """

    def __init__(self):
        self.page = 1        # number of the next listing page to download
        self.pages = []      # raw HTML of pages downloaded but not yet printed
        self.enable = False  # LoadPage and Start keep looping while this is True
        self.url = 'http://m.qiushibaike.com/hot/page/'

    def LoadPage(self):
        """Keep up to five downloaded pages buffered in ``self.pages``.

        Intended to run in the worker thread started by ``Start``; loops until
        ``self.enable`` becomes false.  A failed download is reported and the
        same page is retried after a short pause.
        """
        while self.enable:
            if len(self.pages) >= 5:
                # Buffer is full: wait for the consumer to catch up.
                time.sleep(1)
                continue
            try:
                newPage = self.GetHtml(self.url + str(self.page))
            except Exception:
                # Thread boundary: any download error is reported, not fatal.
                print('        !')
                # Back off so a dead network/server is not hammered in a
                # tight retry loop.
                time.sleep(1)
                continue
            self.page += 1
            self.pages.append(newPage)

    def ParseHtml(self, html):
        """Print author, time and text of every post found in *html*."""
        for item in self.GetContenBlock(html):
            content = self.ParseContent(item)
            try:
                # Single-argument print() calls produce the same output as the
                # Python 2 print statement and also parse under Python 3.
                print(u"%s %s %s %s" % (u"  ", content['author'],
                                        u"  :", content["time"]))
                print(content["content"])
                print('------------------------------------------')
            except (UnicodeError, IOError):
                # The console could not encode or write the post; skip it.
                print(u'        !')

    def GetHtml(self, url, timeout=10):
        """Download *url* and return the raw response body.

        :param url: absolute URL to fetch.
        :param timeout: socket timeout in seconds, so that a stalled
            connection cannot block the caller forever.
        :raises: whatever ``urllib2.urlopen`` / ``read`` raise on failure.
        """
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        req = urllib2.Request(url, None, headers)
        response = urllib2.urlopen(req, timeout=timeout)
        try:
            return response.read()
        finally:
            # Always release the connection, even if read() fails.
            response.close()

    def GetContenBlock(self, html):
        """Return the ``div`` elements that wrap one post each in *html*."""
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(html, 'html.parser')
        return soup.findAll('div', {'class': 'article block untagged mb15'})

    def ParseContent(self, item):
        """Extract one post from its wrapping element.

        :param item: element returned by ``GetContenBlock``.
        :returns: dict with keys ``"content"``, ``"time"`` and ``"author"``;
            each value is ``None`` when that piece is missing.
        """
        result = {"content": None, "time": None}
        content = item.find('div', {'class': 'content'})
        if content is not None:
            # The two fields are extracted independently: a missing ``title``
            # attribute must not discard the post text.
            text = content.text
            if text is not None:
                result["content"] = text.strip()
            title = content.get("title")
            if title is not None:
                result["time"] = title.strip()
        result['author'] = self.ParseAuthor(item)
        return result

    def ParseAuthor(self, item):
        """Return the text of the second link in the author header, or None."""
        header = item.find('div', {'class': 'author clearfix'})
        if header is None:
            return None
        links = header.findAll('a')
        if len(links) < 2:
            return None
        return links[1].text

    def Start(self):
        """Start the download thread, then print pages as they arrive.

        Loops until ``self.enable`` is set to false.  If the loop is left by
        an exception (for example Ctrl-C), ``self.enable`` is cleared so the
        worker thread stops as well.
        """
        self.enable = True
        print(u'        ......')
        # Downloads run in a separate thread so printing never waits on the
        # network once a page has been buffered.
        thread.start_new_thread(self.LoadPage, ())
        try:
            while self.enable:
                if self.pages:
                    self.ParseHtml(self.pages.pop(0))
                else:
                    # Nothing buffered yet: yield instead of busy-waiting.
                    time.sleep(0.1)
        finally:
            self.enable = False

# Startup banner.  The label text in front of each colon is whitespace only
# in this copy of the file; it is reproduced exactly as found.
print(u"""
---------------------------------------
     :    
     :0.1
     :zz
     :2013-05-15
     :Python 2.7
     :               
---------------------------------------
""")


# Show the prompt and wait for Enter before the crawl begins.
print(u'              :')
raw_input(' ')
# Build the crawler and hand control to its print loop.
myModel = qiushibaike()
myModel.Start()