python Webページデータのキャプチャ

5284 ワード

mainメソッド
# -*- coding:utf-8 -*-
__author__ = 'zhaochaoye'

import utlis
import urllib
import MySQLdb
import jieba



if __name__ == "__main__":
    # generate start url list
    start_urls = []
    for i in range(100, 3750):
        url = "http://zxyxpt.suda.edu.cn/Detail.aspx?id="+str(i)
        start_urls.append(url)

    newsSpider = utlis.NewsSpider(start_urls)
    newsSpider.parse()




    host, user, pwd, db = "localhost", "root", "root", "databaseA"
    conn = MySQLdb.connect(host, user, pwd, db, charset='utf8')

    #   cursor       
    cursor = conn.cursor()
    #          
    cursor.execute("SET NAMES utf8")
    cursor.execute("SET CHARACTER_SET_CLIENT=utf8")
    cursor.execute("SET CHARACTER_SET_RESULTS=utf8")

    for record in newsSpider.records:
        print record
        #  sql  
        sql = "INSERT INTO yixiao VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % (record.ID, record.category, record.state,record.time,record.TitleLB,record.ContentLB,record.ReplyLB,record. Retime)

        #  sql  
        try:
            cursor.execute(sql)
            conn.commit()
        except:
            pass



    #      
    cursor.close()
    conn.close()

utlis:
# -*- coding:utf-8 -*-
__author__ = 'zhaochaoye'

from bs4 import BeautifulSoup
import urllib
import re
import time
import random

class Record(object):
    """One scraped forum post; attribute order mirrors the `yixiao` table columns."""

    # Fields (after ID) that are unicode text and get GBK-encoded for display.
    _TEXT_FIELDS = ("category", "state", "time",
                    "TitleLB", "ContentLB", "ReplyLB", "Retime")

    def __init__(self, ID, category, state, time, TitleLB, ContentLB,
                 ReplyLB, Retime):
        self.ID = ID
        self.category = category
        self.state = state
        self.time = time
        self.TitleLB = TitleLB
        self.ContentLB = ContentLB
        self.ReplyLB = ReplyLB
        self.Retime = Retime

    def __str__(self):
        # Comma-joined dump: plain ID first, then each text field encoded
        # as GBK for console display (same output as the old hand-written
        # concatenation chain, without the duplication).
        parts = [str(self.ID)]
        for name in self._TEXT_FIELDS:
            parts.append(str(getattr(self, name).encode("gbk")))
        return ",".join(parts)

    def __repr__(self):
        # repr and str were duplicated verbatim; delegate instead.
        return self.__str__()


class UrlParser(object):
    """Fetches one Detail.aspx page and extracts the record fields from it."""

    def __init__(self, url):
        """Fetch `url` once and keep its headers, status code and body.

        The previous version called urllib.urlopen(url) three separate
        times, issuing three HTTP requests whose headers, status and body
        could come from different responses.
        """
        self.url = url
        response = urllib.urlopen(url)
        self.head_info = response.info()
        self.status = response.getcode()
        self.content = response.read()
        # Parse the HTML a single time and reuse the tree for every field
        # (each extract_* used to rebuild a BeautifulSoup from scratch).
        self._soup = BeautifulSoup(self.content)

    def url_extractor(self):
        """Build a Record from the page fields.

        Field order: [id / category / state / time / title / content /
        reply / reply-time].
        """
        return Record(self.extract_ID(),
                      self.extract_category(),
                      self.extract_state(),
                      self.extract_time(),
                      self.extract_TitleLB(),
                      self.extract_ContentLB(),
                      self.extract_ReplyLB(),
                      self.extract_Retime())

    def _text_of(self, span_id):
        # Every field lives in an element identified by a unique HTML id.
        return self._soup.find(id=span_id).get_text()

    def extract_ID(self):
        return self._text_of("idlb")

    def extract_category(self):
        return self._text_of("list_idLB")

    def extract_state(self):
        return self._text_of("stateLB")

    def extract_time(self):
        return self._text_of("TimeLB")

    def extract_TitleLB(self):
        return self._text_of("TitleLB")

    def extract_ContentLB(self):
        return self._text_of("ContentLB")

    def extract_ReplyLB(self):
        return self._text_of("ReplyLB")

    def extract_Retime(self):
        return self._text_of("RetimeLB")

class NewsSpider(object):

    #               URL
    def __init__(self, crawl_urls):
        self.crawl_urls = crawl_urls
        self.records = []

    #           URL
    def parse(self):
        print "crawling travel urls"
        cout = 0
        for url in self.crawl_urls:
            url_parser = UrlParser(url)
            self.records.append(url_parser.url_extractor())
            time.sleep(random.random()/10)
            cout += 1
            print cout, url