# Python web crawler: crawler demo
# 16414 words
# -*- coding: utf-8 -*-
import urllib
import urllib2
import re
def getDetailUrl(name):
    """Split the HTML of a link cell into its target and its visible text.

    Args:
        name: HTML fragment of one table cell, expected to contain an
            ``<a href="...">text</a>`` element.

    Returns:
        A 2-tuple ``(href, text)``. When the fragment contains no anchor,
        ``('', name)`` is returned so the caller can still index both
        elements instead of crashing on a missing match.
    """
    # NOTE(review): the original pattern literal lost its markup when this
    # file was extracted (only ``(.*?)`` survived, i.e. a single group).
    # The caller reads element 0 as a relative URL and element 1 as a name,
    # so the pattern is reconstructed as "anchor href + anchor text".
    # Confirm against the real page markup.
    reg = r'<a\b[^>]*?href=["\'](.*?)["\'][^>]*>(.*?)</a>'
    # re.S lets the link text span several lines inside the cell.
    pattern = re.compile(reg, re.I | re.S)
    match = pattern.search(name)
    if match is None:
        return ('', name)
    return match.groups()
#
def getlist(page):
    """Parse the listing table of one page into a list of row dicts.

    The first ``<table>`` in ``page`` is located, its ``<thead>`` cells are
    used as column names, and the ``<td>`` cells of the table are grouped
    into rows of ``len(head)`` cells each. A trailing incomplete row is
    dropped.

    Args:
        page: HTML source of one listing page.

    Returns:
        A list with one dict per table row, mapping header text to cell
        HTML. For the column whose header equals ``' '`` the cell is split
        with ``getDetailUrl``: the link text is stored under ``' '`` and the
        absolute detail URL under ``'url'``. An empty list is returned when
        the page has no table, no header, or no complete row.
    """
    # NOTE(review): every pattern literal in this function lost its markup
    # when the file was extracted. The table/thead/th/td patterns below are
    # reconstructed from the variable names and from how the matches are
    # used; confirm them against the real page markup.
    flags = re.I | re.S

    # Locate the table.
    table_match = re.search(r'<table\b.*?</table>', page, flags)
    if table_match is None:
        return []
    table = table_match.group()

    # Cleanup step.
    # NOTE(review): the text of this pattern was lost; as written it is
    # empty and removes nothing. Restore the intended pattern or drop the
    # step once the original intent is known.
    pattern = re.compile(r'', re.I)
    table = re.sub(pattern, "", table)

    # Column names come from the header cells.
    thead_match = re.search(r'<thead\b.*?</thead>', table, flags)
    if thead_match is None:
        return []
    # ``\b`` keeps ``<th`` from matching the ``<thead>`` tag itself.
    head = re.findall(r'<th\b[^>]*>(.*?)</th>', thead_match.group(), flags)
    if not head:
        # With zero columns the row loop could never advance.
        return []

    # Data cells of the whole table, in document order.
    cells = re.findall(r'<td\b[^>]*>(.*?)</td>', table, flags)

    res = []
    width = len(head)
    # Step through the cells one full row at a time; an incomplete trailing
    # row is skipped.
    for start in range(0, len(cells) - width + 1, width):
        td = {}
        for e, cell in zip(head, cells[start:start + width]):
            # NOTE(review): the header label compared here was a non-ASCII
            # literal that extraction reduced to a single space; it is kept
            # as found. Restore the real column title.
            if e == ' ':
                url_name = getDetailUrl(cell)
                td[' '] = url_name[1]
                td['url'] = 'http://www.jnfdc.gov.cn/kfqy/' + url_name[0]
            else:
                # Only plain columns keep the raw cell, so the parsed link
                # text above is not overwritten.
                td[e] = cell
        res.append(td)
    return res
#
# Crawl the paginated enterprise listing, then dump every parsed row to a
# text file. This is a Python 2 script (urllib2 / urllib.urlencode); the
# print calls use the single-argument parenthesised form, which prints the
# same text on Python 2.
rooturl = "http://www.jnfdc.gov.cn/kfqy/"
# levelno=-1: the original explanation of this value was lost.
# NOTE(review): presumably "all levels" -- confirm against the site.
values = {"entname": "", "levelno": "-1"}
data = urllib.urlencode(values)

# NOTE(review): the crawl starts at page 21, so the ``pageNum == 0`` branch
# below (the un-suffixed first page, index.shtml) is never taken. Kept as
# found; set to 0 if the whole listing is wanted.
pageNum = 21
entlist = []
while True:
    if pageNum == 0:
        param1 = ""
    else:
        param1 = "_" + str(pageNum)
    url = rooturl + "index" + param1 + ".shtml"
    pageNum += 1
    geturl = url + "?" + data
    request = urllib2.Request(geturl)
    response = urllib2.urlopen(request)
    try:
        page = response.read()
    finally:
        # Release the HTTP connection even if the read fails.
        response.close()
    res = getlist(page)
    # A page that yields no rows ends the crawl.
    if not res:
        break
    # Reuse the rows already parsed instead of parsing the page twice.
    entlist += res

import MySQLdb
ip = 'localhost'
username = 'root'
password = '***'
dbname = 'test'
conn = MySQLdb.connect(ip, username, password, dbname, charset='utf8')
cursor = conn.cursor()
try:
    # Sample output; guarded because a short crawl may yield under two rows.
    if len(entlist) > 1:
        print(entlist[1])
    # The bulk insert below is disabled in the original and kept for
    # reference. NOTE(review): the named placeholders in the SQL were lost
    # in extraction (they read "%( )s"); restore the real column keys and
    # wrap in try/except with conn.rollback() before enabling.
    # sql = "insert into fdc_ent_info value (%( )s, %( )s, %( )s, %( )s, %( )s, %(url)s)"
    # cursor.executemany(sql, entlist)
    # conn.commit()
finally:
    # The original never closed the cursor or the connection.
    cursor.close()
    conn.close()

print('file...')
# ``with`` guarantees the file is flushed and closed; the original wrote
# ``f.close`` without parentheses, which never called it.
with open("d:\\entinfo.txt", 'w') as f:
    for e in entlist:
        # One line per row: "key:value, key:value".
        f.write(", ".join(key + ":" + value for key, value in e.items()) + '\n')
print('done')