A Python web-crawler script that grabs the online novel "Daomu Biji" (盗墓笔记) from nanpaisanshu.org and downloads it to local text files.
It needs two libraries: BeautifulSoup and requests.
(I have tried to write the comments in as much detail as possible.)
The program runs very slowly; I would be grateful if someone more experienced could suggest how to optimize it.
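One likely source of the slowness is that every requests.get() call opens a fresh connection. Below is a minimal sketch of one possible optimization (my own assumption, not part of the original script): reuse a single requests.Session so the connection to nanpaisanshu.org is kept alive between page fetches; the fetch() helper is a name I made up for illustration.

# sketch only: reuse one session for all requests to the same host
import requests

session = requests.Session()

def fetch(url):
    # fetch a page once; return the HTML, or None if the request failed
    resp = session.get(url)
    if resp.status_code != 200:
        return None
    return resp.content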
#-*-coding:utf8-*-
from bs4 import BeautifulSoup
import requests
import re
import os
# collect the URLs of the volume index pages from the front page
r = requests.get('http://www.nanpaisanshu.org/').content # fetch the front page HTML
content=BeautifulSoup(r).findAll('a',href=re.compile(r'\Ahttp://www.nanpaisanshu.org/[a-z]+\Z')) # <a> tags whose href matches an index-page URL
sc = str(content) # convert the tag list to a string
lists=[]
lists = sc.split(',')
lists = list(set(lists)) # remove duplicates
lisy=[]
for line in lists:
    p=line.split('"')[1] # split on the double quote; the URL is the second field
    lisy.append(p) # collect the index-page URLs
    #print p
#print lisy
# fetch each index-page URL and save the raw html to a file
s = os.getcwd() # current working directory
d = os.sep # path separator for this OS
namef='aaa' # name of the download folder
#b = os.path.exists( s+d+namef)
f=os.path.exists(s+d+namef) # does the folder already exist?
if f==False:
    os.mkdir(s+d+namef) # create the folder
else:
    print u'folder already exists: '+namef
filenm = s+d+namef+d # path prefix for all output files
i=1
for line in lisy:
    r = requests.get(line) # fetch one index page
    print r.content
    print '\n'
    tfile=open(filenm+'neirong'+str(i)+'.html','w')
    i=i+1
    tfile.write(r.content) # save the raw html
    tfile.close() # close each file as soon as it is written
# extract the chapter URLs from each saved html file and write them to a txt file
for i in range(1,len(lisy)+1):
    fp = open(filenm+'neirong'+str(i)+'.html', "r")
    of = open(filenm+'neirong'+str(i)+'.txt','w')
    content = fp.read() # read the saved html
    p=re.compile(r'http://www\.nanpaisanshu\.org/.*?\.html') # pattern for chapter URLs
    #print p.findall(content)
    #print type(p.findall(content))
    for line in p.findall(content):
        #print line+'\n'
        #if line !='http://www.nanpaisanshu.org/9701.html':
        of.write(line+'\n') # one chapter URL per line
        #else:
        #continue
    #of.write(str(p.findall(content)))
    of.close()
    fp.close()
# read the chapter URLs back from each txt file, fetch every chapter and append it to one big txt
for i in range(1,len(lisy)+1):
    ot=open(filenm+'neirong'+str(i)+'.txt','r')
    outfile=open(filenm+'quanbu'+str(i)+'.txt','a+')
    li=[]
    for line in ot:
        line = line.replace('\n','')
        li.append(line) # collect the chapter URLs
    li = sorted(li) # sort them so the chapters come out in order
    for line in li:
        print line
        #line = line.replace('\n','')
        r = requests.get(line).content # fetch one chapter page
        title=BeautifulSoup(r).find("div",{'class':"post_title"}).h2 # chapter title
        content=BeautifulSoup(r).findAll("div",{'class':"post_entry"}) # chapter body
        sti=str(title).replace('<h2>','').replace('</h2>','') # strip the h2 tags from the title
        # turn the body html into plain text
        scon = str(content).replace('<p>',' ').replace('</p>',' ').replace('<br/>','\n')
        #print str(urllist)
        scon = re.sub("<.*>", "", scon) # strip any remaining tags
        scon = re.sub("(.*?);","",scon) # drop text up to each ';' (removes leftover html entities)
        #scon = scon.strip()
        scon = '\n'.join(scon.split())
        print scon
        outfile.write(sti+'\n'+line+'\n'+scon) # title, URL, then the chapter text
        #i=i+1
        #print
        #print urllist
        print '=========================done======================='
    # close the files for this volume
    outfile.close()
    ot.close()
# clean up: delete the intermediate neirongN.html / neirongN.txt files
targetDir=s+d+namef
for line in os.listdir(targetDir):
    p=re.compile(r'neirong[0-9]{1}') # intermediate file names start with neirong plus a digit
    if p.match(line)!=None:
        print "deleting "+s+d+namef+d+line+'!!'
        os.remove(s+d+namef+d+line) # remove the file with os.remove()
    else:
        print 'keeping this file!'
        continue
The connection sometimes fails and the program throws an error, so I need to check requests.get(url).status_code != 200. But after adding that check it runs even more slowly, since every page now gets tested; then again, it may just be that my network speed here is only a few KB/s, which is why it seems so abnormal.
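Part of the extra slowdown is probably that the revised version below calls requests.get(line) several times per chapter: once or twice for the status check and once more for the content. A small sketch of how the same check could be done with a single request per chapter (my own rewrite, assuming the same loop variable `line`; it is not the code I actually used below):

# sketch only: fetch each chapter URL once and reuse the response object
resp = requests.get(line)
if resp.status_code != 200:
    print 'connection failed, skipping this chapter!'
else:
    r = resp.content # reuse the already-downloaded HTML
    # ... then parse r with BeautifulSoup exactly as in the script below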
Below is the revised "complete" version. Use it with caution: it is extremely slow, because of the added status checks and the extra requests they cost:
#-*-coding:utf8-*-
#
#2014-10-14
#ZJL
from bs4 import BeautifulSoup
import requests
import re
import os
# collect the URLs of the volume index pages from the front page
r = requests.get('http://www.nanpaisanshu.org/').content # fetch the front page HTML
content=BeautifulSoup(r).findAll('a',href=re.compile(r'\Ahttp://www.nanpaisanshu.org/[a-z]+\Z')) # <a> tags whose href matches an index-page URL
sc = str(content) # convert the tag list to a string
lists=[]
lists = sc.split(',')
lists = list(set(lists)) # remove duplicates
lisy=[]
for line in lists:
    p=line.split('"')[1] # split on the double quote; the URL is the second field
    lisy.append(p) # collect the index-page URLs
    #print p
#print lisy
# fetch each index-page URL and save the raw html to a file
s = os.getcwd() # current working directory
d = os.sep # path separator for this OS
namef='aaa' # name of the download folder
#b = os.path.exists( s+d+namef)
f=os.path.exists(s+d+namef) # does the folder already exist?
if f==False:
    os.mkdir(s+d+namef) # create the folder
else:
    print u'folder already exists: '+namef
filenm = s+d+namef+d # path prefix for all output files
i=1
for line in lisy:
    r = requests.get(line) # fetch one index page
    print r.content
    print '\n'
    tfile=open(filenm+'neirong'+str(i)+'.html','w')
    i=i+1
    tfile.write(r.content) # save the raw html
    tfile.close() # close each file as soon as it is written
# extract the chapter URLs from each saved html file and write them to a txt file
for i in range(1,len(lisy)+1):
    fp = open(filenm+'neirong'+str(i)+'.html', "r")
    of = open(filenm+'neirong'+str(i)+'.txt','w')
    content = fp.read() # read the saved html
    p=re.compile(r'http://www\.nanpaisanshu\.org/.*?\.html') # pattern for chapter URLs
    #print p.findall(content)
    #print type(p.findall(content))
    for line in p.findall(content):
        #print line+'\n'
        #if line !='http://www.nanpaisanshu.org/9701.html':
        of.write(line+'\n') # one chapter URL per line
        #else:
        #continue
    #of.write(str(p.findall(content)))
    of.close()
    fp.close()
# read the chapter URLs back from each txt file, fetch every chapter and append it to one big txt
for i in range(1,len(lisy)+1):
    ot=open(filenm+'neirong'+str(i)+'.txt','r')
    if os.path.exists(filenm+'quanbu'+str(i)+'.txt')==True:
        print "file already exists, deleting "+filenm+'quanbu'+str(i)+'.txt'
        os.remove(filenm+'quanbu'+str(i)+'.txt')
        outfile=open(filenm+'quanbu'+str(i)+'.txt','a+') # recreate it (append mode)
    else:
        print "creating "+filenm+'quanbu'+str(i)+'.txt'
        outfile=open(filenm+'quanbu'+str(i)+'.txt','a+')
    li=[]
    for line in ot:
        line = line.replace('\n','')
        li.append(line) # collect the chapter URLs
    li = sorted(li) # sort them so the chapters come out in order
    for line in li:
        #print line
        #line = line.replace('\n','')
        if requests.get(line).status_code != 200:
            print 'connection failed, skipping this chapter!'
            outfile.write('[this chapter could not be downloaded]\n') # leave a marker so the gap is visible
        elif requests.get(line).status_code == 200:
            print 'connection OK!'
            r = requests.get(line).content # fetch the chapter page
            title=BeautifulSoup(r).find("div",{'class':"post_title"}).h2 # chapter title
            content=BeautifulSoup(r).findAll("div",{'class':"post_entry"}) # chapter body
            sti=str(title).replace('<h2>','').replace('</h2>','') # strip the h2 tags from the title
            # turn the body html into plain text
            scon = str(content).replace('<p>',' ').replace('</p>',' ').replace('<br/>','\n')
            #print str(urllist)
            scon = re.sub("<.*>", "", scon) # strip any remaining tags
            scon = re.sub("(.*?);","",scon) # drop text up to each ';' (removes leftover html entities)
            #scon = scon.strip()
            scon = '\n'.join(scon.split())
            print scon
            outfile.write(sti+'\n'+line+'\n'+scon) # title, URL, then the chapter text
            #i=i+1
            #print
            #print urllist
            print '=========================done======================='
    # close the files for this volume
    outfile.close()
    ot.close()
# clean up: delete the intermediate neirongN.html / neirongN.txt files
targetDir=s+d+namef
for line in os.listdir(targetDir):
    p=re.compile(r'neirong[0-9]{1}') # intermediate file names start with neirong plus a digit
    if p.match(line)!=None:
        print "deleting "+s+d+namef+d+line+'!!'
        os.remove(s+d+namef+d+line) # remove the file with os.remove()
    else:
        print 'keeping this file.'
        continue