クローラー(スクレイパー)のページ送り機能について、ページを分けてお聞きします.


Pythonの正規表現の勉強を終えて、主にCQCのブログを参考にしました.http://cuiqingcai.com/990.html.
機能:投稿の作者、投稿時間、ポイント数、本文と添付画像をすべて取得します.前のページ・次のページへの移動、指定ページへのジャンプ、終了の各機能があります.
修正後のコードは以下の通りです.
#!/usr/bin/env python
#-*-coding:utf-8 -*-
__author__ = "PS"
"""
modified from CQC
http://cuiqingcai.com/990.html
python version : 2.7.9 
"""
import urllib
import urllib2
import re
import time
class Scrapy_qiushibaike():
    def __init__(self):
        self.pageIndex = 1
        self.user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
        self.headers  = {'User-Agent':self.user_agent}
        self.stories = []
        self.enable = True
    
    def get_page(self,pageIndex):
        try:
            url = 'http://www.qiushibaike.com/hot/page/' + str(pageIndex)
            request = urllib2.Request(url,headers=self.headers)
            response = urllib2.urlopen(request)
            pageCode = response.read().decode('utf-8')
            return pageCode
        
        except urllib2.URLError,e:
            if hasattr(e, "reason"):
                print "connect to the web error",e.reason
                return None
           
    def get_page_items(self,pageIndex):
        page_code = self.get_page(pageIndex)
        if not page_code:
            print "response failure"
            return None
        pattern = re.compile('<div.*?article.*?'+
                             '<h2>(.*?)</h2>.*?' +
                             '<div class="content">(.*?)' +
                             '<!--(.*?)-->.*?'+
                             'div>(.*?)class="stats".*?' +
                             'class="number">(.*?)</i>', re.S)
        #item[0]:name,item[1]:content,item[2]:time,itme[3]:img,item[4]:support number
        items = re.findall(pattern,page_code)
        page_stories = []
        for item in items:
            haveImg = re.search("img",item[3])
            if haveImg:
                pattern_img = re.compile('<img src="(.*?)"')
                img_url = ''.join(re.findall(pattern_img,item[3]))
            else:
                img_url = 'no image'
            replaceBR = re.compile('<br/>')
            text = re.sub(replaceBR,"
",item[1])             time_float = time.gmtime(float(item[2]))             time_formated =  time.strftime('%Y-%m-%d %H:%M:%S',time_float)             author = item[0]             support_number = item[4]             page_stories.append([author.strip(), text.strip(),time_formated.strip(),                                  img_url,support_number.strip()])         return page_stories                                def load_page(self):         if self.enable == True:             if len(self.stories) <= 2:                 page_stories = self.get_page_items(self.pageIndex)                                 # add to global variable stories             if page_stories:                 self.stories.append(page_stories)                                       def get_one_page_story(self):         self.load_page()         for story in self.stories[0]:             print "page%d
author:%s
time:%s
support_number:%s
%s
%s
" %(self.pageIndex,story[0],story[2],story[4],story[1],story[3])         del self.stories[0]                        def start(self):         while self.enable:             self.get_one_page_story()              input  = raw_input("'n' -> next page, 'p' -> previous page, number -> that page, q/Q -> quit,others -> current page:")             if input == 'q':                 self.enable = False                 return None             elif input == 'f':                 self.pageIndex += 1             elif input == 'b':                 self.pageIndex -= 1             elif input.isdigit():                 self.pageIndex = int(input)                 print self.pageIndex            if __name__ == '__main__':     spider = Scrapy_qiushibaike()     spider.start()