Pythonクローラー:クローラーのデモ

16414 ワード

# -*- coding: utf-8 -*-

import urllib
import urllib2
import re

def getDetailUrl(name):
    """Split an HTML anchor element into its link target and link text.

    Args:
        name: HTML fragment expected to contain an ``<a href=...>text</a>``
            element (in this file: the raw contents of a table cell).

    Returns:
        A 2-tuple ``(href, text)``: the value of the ``href`` attribute and
        the markup between the opening and closing ``<a>`` tags.

    Raises:
        ValueError: if ``name`` contains no anchor element with an ``href``.
    """
    # NOTE(review): the pattern in the pasted source was reduced to
    # r'(.*?)' (its HTML markup was stripped), which always matches the
    # empty string and yields a 1-tuple.  The caller reads index 0 as a
    # relative URL and index 1 as a display name, so an anchor pattern with
    # two groups is reconstructed here -- confirm against the real page markup.
    reg = r'<a\b[^>]*?\bhref\s*=\s*["\']?([^"\'\s>]*)["\']?[^>]*>(.*?)</a>'
    pattern = re.compile(reg, re.I | re.S)
    match = pattern.search(name)
    if match is None:
        # Fail with a descriptive error instead of an AttributeError on None.
        raise ValueError("no <a href=...> link found in %r" % (name,))
    return match.groups()
    
#         
# Base URL of the listing pages; also the prefix for relative detail links.
ROOT_URL = "http://www.jnfdc.gov.cn/kfqy/"

# Destination of the plain-text dump written by the script.
OUTPUT_PATH = "d:\\entinfo.txt"

# NOTE(review): every HTML pattern below was blank in the pasted source (the
# markup inside the string literals was stripped).  They are reconstructed
# from the variable names used there (tablereg / theadreg / head / td) --
# confirm against the real page markup.
_TABLE_RE = re.compile(r'<table\b.*?</table>', re.I | re.S)
_THEAD_RE = re.compile(r'<thead\b.*?</thead>', re.I | re.S)
_HEAD_CELL_RE = re.compile(r'<th\b[^>]*>(.*?)</th>', re.I | re.S)
_BODY_CELL_RE = re.compile(r'<td\b[^>]*>(.*?)</td>', re.I | re.S)

# NOTE(review): the source removes "something" from the table before parsing,
# but the pattern text is missing (r'').  An empty pattern makes the
# substitution a no-op; fill in the real pattern once it is known.
_NOISE_RE = re.compile(r'', re.I)

# NOTE(review): header text of the column whose cells hold the detail link.
# The original (non-ASCII) text was lost in the source; ' ' is what remains.
NAME_HEADER = ' '


def getlist(page):
    """Parse the first HTML table in ``page`` into a list of row dicts.

    The header cells found inside ``<thead>`` become the dict keys; the
    table's ``<td>`` cells are consumed left to right in groups of
    ``len(headers)``, one group per row.  A trailing incomplete group is
    ignored.  For the column whose header equals ``NAME_HEADER`` the cell is
    split with ``getDetailUrl``: the link text is stored under the header
    and the absolute link under ``'url'``.

    Args:
        page: HTML source of one listing page.

    Returns:
        A list of dicts, one per table row.  Empty when the page has no
        table, no ``<thead>``, no header cells, or no complete row.
    """
    table_match = _TABLE_RE.search(page)
    if table_match is None:
        # No table: report "no rows" so the crawl loop can stop cleanly.
        return []
    table = _NOISE_RE.sub("", table_match.group())

    thead_match = _THEAD_RE.search(table)
    if thead_match is None:
        return []
    head = _HEAD_CELL_RE.findall(thead_match.group())
    if not head:
        # Without headers the original row loop never advanced (infinite loop).
        return []

    cells = _BODY_CELL_RE.findall(table)
    width = len(head)
    res = []
    for start in range(0, len(cells) - width + 1, width):
        td = {}
        for e, cell in zip(head, cells[start:start + width]):
            if e == NAME_HEADER:
                url_name = getDetailUrl(cell)
                td[e] = url_name[1]
                td['url'] = ROOT_URL + url_name[0]
            else:
                # Only plain cells are stored verbatim; previously this
                # assignment also overwrote the link text extracted above.
                td[e] = cell
        res.append(td)
    return res


def _crawl(start_page=21):
    """Download listing pages from ``start_page`` on until one has no rows.

    Page 0 is ``index.shtml``; page N (N > 0) is ``index_N.shtml``.
    Returns the concatenated row dicts of all pages visited.
    """
    # NOTE(review): the source starts at page 21 although it special-cases
    # page 0 -- possibly a debugging leftover; kept as the default.
    # levelno=-1: the explanation in the source comment was lost -- TODO confirm.
    values = {"entname": "", "levelno": "-1"}
    data = urllib.urlencode(values)

    entlist = []
    pageNum = start_page
    while True:
        param1 = "" if pageNum == 0 else "_" + str(pageNum)
        geturl = ROOT_URL + "index" + param1 + ".shtml" + "?" + data
        pageNum += 1

        response = urllib2.urlopen(urllib2.Request(geturl))
        try:
            page = response.read()
        finally:
            response.close()

        res = getlist(page)
        if not res:
            break
        entlist += res  # reuse the parsed rows instead of parsing twice
    return entlist


def _write_entries(entlist, path):
    """Write one ``key:value, key:value`` line per entry to ``path``."""
    with open(path, 'w') as f:  # the source wrote ``f.close`` and never closed
        for e in entlist:
            le = ", ".join(key + ":" + value for key, value in e.items())
            f.write(le + '\n')


def main():
    """Crawl the listing, open the MySQL connection, and dump rows to a file."""
    entlist = _crawl()

    # Third-party driver, imported only when the script actually runs.
    import MySQLdb
    ip = 'localhost'
    username = 'root'
    password = '***'
    dbname = 'test'
    conn = MySQLdb.connect(ip, username, password, dbname, charset='utf8')
    try:
        cursor = conn.cursor()
        try:
            if len(entlist) > 1:  # avoid IndexError on short results
                print(entlist[1])
            # TODO(review): the bulk insert was commented out in the source and
            # its placeholder names were lost:
            #   sql = "insert into fdc_ent_info value (%( )s, %( )s, %( )s, %( )s, %( )s, %(url)s)"
            #   cursor.executemany(sql, entlist); conn.commit()
            #   (on error: traceback.print_exc(); conn.rollback())
        finally:
            cursor.close()
    finally:
        conn.close()

    print('file...')
    _write_entries(entlist, OUTPUT_PATH)
    print('done')


if __name__ == "__main__":
    main()