極客学院 シングルスレッド・クローラー(Python 3.x 版)


極客学院の「シングルスレッド・クローラー実戦編」では、講師は Python 2.x を使っていますが、私は Python 3.5 を使っているため、文字列のエンコード/デコードの扱いに違いがあります。そこで、講師のコードを参考に自分で一通り書き直し、ここに置いておきます。
# tips: Python 3.x のファイル操作には writelines が無いように見えました(あるのかもしれませんが、私は見つけられませんでした。※実際には Python 3 のファイルオブジェクトにも writelines は存在します)。読み込み側には readline(文字列を返す)と readlines(リストを返す)があり、テキストの各行の区切りは改行文字 '\n' を探すことで判定されます。
# -*- coding: utf-8 -*-
""" Created on Fri May 13 2016 @author: s """
import requests
import re

class spider(object):
    """Single-threaded crawler for a paginated course-list page.

    The methods form a small pipeline: build the list of page URLs, download
    each page, cut the page into per-course HTML fragments, extract the
    fields of every course, and finally write all records to a text file.
    """

    # Output file used when save_info() is called without an explicit path.
    # Raw string: the backslashes are literal path separators, not escapes.
    DEFAULT_OUTPUT_PATH = r'E:\pyProject\jikexueyuan_code\practise\JK_course_list2.txt'

    def __init__(self):
        print('Start...')

    def get_html(self, url, timeout=10):
        """Download ``url`` and return the response body as text.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests`` may block forever on a stalled server.

        Returns:
            The decoded response body (``Response.text``).
        """
        # Send a browser User-Agent instead of the default requests one.
        header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36'}
        response = requests.get(url, headers=header, timeout=timeout)
        return response.text

    def get_pages_url(self, url, total_page_nums):
        """Build the URLs of every page from the one in ``url`` to the last.

        Args:
            url: A page URL containing ``pageNum=<number>``; that number is
                the first page to generate.
            total_page_nums: Number of the last page (inclusive).

        Returns:
            A list of URLs, one per page number in
            ``[current page, total_page_nums]``; empty when the current page
            is already past ``total_page_nums``.

        Raises:
            AttributeError: If ``url`` contains no ``pageNum=<number>``.
        """
        now_page_num = int(re.search(r'pageNum=(\d+)', url).group(1))
        # NOTE: the 4th positional argument of re.sub is ``count``, not
        # ``flags``; no flags are needed for this pattern, so none are passed
        # and every occurrence is replaced.
        return [
            re.sub(r'pageNum=\d+', 'pageNum=%d' % i, url)
            for i in range(now_page_num, total_page_nums + 1)
        ]

    def get_class(self, page_html):
        """Split a page into per-course HTML fragments.

        Args:
            page_html: HTML text of one course-list page.

        Returns:
            A list with the text found between each ``<li i`` and the next
            ``</li>`` (non-greedy, may span several lines).
        """
        return re.findall(r'<li i(.*?)</li>', page_html, re.S)

    def get_info(self, eachclass):
        """Extract the fields of one course from its HTML fragment.

        Args:
            eachclass: One fragment as returned by :meth:`get_class`.

        Returns:
            A dict with the keys ``title``, ``content``, ``classtime``,
            ``classlevel`` and ``learnnum``; all values are the raw captured
            strings (whitespace is not stripped here).

        Raises:
            AttributeError: If the title, content or learn-number pattern is
                not present in the fragment.
            IndexError: If the fragment has fewer than two plain ``<em>``
                elements.
        """
        info = {}
        info['title'] = re.search('title="(.*?)"', eachclass, re.S).group(1)
        info['content'] = re.search(' display: none;">(.*?)</p>', eachclass, re.S).group(1)
        # The first two attribute-less <em> elements hold duration and level.
        time_and_level = re.findall('<em>(.*?)</em>', eachclass, re.S)
        info['classtime'] = time_and_level[0]
        info['classlevel'] = time_and_level[1]
        info['learnnum'] = re.search('"learn-number">(.*?)</em>', eachclass, re.S).group(1)
        return info

    def save_info(self, classinfo, path=None):
        """Write all course records to a UTF-8 text file, five lines each.

        The file is overwritten. ``content`` is stripped of surrounding
        whitespace; ``classtime`` and ``classlevel`` have all whitespace
        removed.

        Args:
            classinfo: Iterable of dicts as returned by :meth:`get_info`.
            path: Destination file; defaults to ``DEFAULT_OUTPUT_PATH``.
        """
        if path is None:
            path = self.DEFAULT_OUTPUT_PATH
        # Binary mode keeps '\n' as-is on every platform; the context manager
        # closes the file even if a record is malformed.
        with open(path, 'wb') as f:
            for each in classinfo:
                record = (
                    'titles: {}\n'
                    'content: {}\n'
                    'classtime: {}\n'
                    'classlevel: {}\n'
                    'learnnum: {}\n'
                ).format(
                    each['title'],
                    str(each['content']).strip(),
                    ''.join(str(each['classtime']).split()),
                    ''.join(each['classlevel'].split()),
                    each['learnnum'],
                )
                f.write(record.encode('utf-8'))


# Script entry point: crawl the course list and save it.
if __name__ == '__main__':
    testspider = spider()
    url = 'http://www.jikexueyuan.com/course/?pageNum=1'
    page_html = testspider.get_pages_url(url, 1)
    classinfo = []
    for link in page_html:
        print('ing...' + link)
        html = testspider.get_html(link)
        eachclass = testspider.get_class(html)
        for each in eachclass:
            info = testspider.get_info(each)
            classinfo.append(info)
    testspider.save_info(classinfo)