Pythonのウェブクローラー:オンラインの盗墓小説をローカルにダウンロードするスクリプト

8671 ワード

最近とても暇で小説が読みたくなり、南派三叔の小説がすべて揃っているサイトを見つけたので、全部ダウンロードして読むことにしました。そこで、多くのQQグループの達人たちの助けを借りて(私は正規表現がとても苦手で、プログラム中の複雑な正規表現はすべて達人たちに教えてもらったものです)、3、4日かけてスクリプトを1本書きました。
BeautifulSoupとrequestsの2つのライブラリが必要です
(コメントはできるだけ詳しく書きました)
このプログラムは実行速度がとても遅いので、最適化の方法を達人の方に教えていただきたいです。

#-*-coding:utf8-*-

from bs4 import BeautifulSoup
import requests
import re
import os


#        URL           
# Fetch the site's front page and keep only the <a> tags whose href is a
# category (book index) URL. The dots in the pattern are escaped so that
# they match a literal '.' only.
r = requests.get('http://www.nanpaisanshu.org/').content
content = BeautifulSoup(r).findAll(
    'a', href=re.compile(r'\Ahttp://www\.nanpaisanshu\.org/[a-z]+\Z'))

# Read each href straight from its tag instead of splitting str(content)
# on ',' and '"': that text parsing raised IndexError when no link matched
# or when a link text contained a comma.
# set() drops duplicate URLs; sorted() keeps the neirong<N> numbering used
# further down the same from one run to the next.
lisy = sorted(set(a['href'] for a in content))


#     URL    ，        html   

# Everything is stored in a sub-folder of the current working directory.
s = os.getcwd()
d = os.sep
namef = 'aaa'  # name of the output folder

# Create the folder when it is missing; otherwise print the message
# followed by the folder name.
f = os.path.exists(s + d + namef)
if f:
    print(u'    ' + namef)
else:
    os.mkdir(s + d + namef)

# Prefix shared by every file written below (ends with the separator).
filenm = s + d + namef + d

# Download every category page into neirong<N>.html (N starts at 1).
# enumerate() replaces the hand-maintained counter, and "with" closes each
# file as soon as it has been written. Previously the last page was still
# open (and possibly unflushed) while the next loop was reading it back.
for i, line in enumerate(lisy, 1):
    r = requests.get(line)
    print(r.content)
    print('\n')
    with open(filenm+'neirong'+str(i)+'.html', 'w') as tfile:
        tfile.write(r.content)

# Pull the chapter URLs out of each saved page and store them, one per
# line, in neirong<N>.txt. The pattern is compiled once, outside the loop.
p = re.compile(r'http://www\.nanpaisanshu\.org/.*?\.html')
for i in range(1, len(lisy)+1):
    with open(filenm+'neirong'+str(i)+'.html', "r") as fp:
        content = fp.read()
    with open(filenm+'neirong'+str(i)+'.txt', 'w') as of:
        for line in p.findall(content):
            # The '\n' escape had been turned into a real line break inside
            # the quotes, which is a syntax error.
            of.write(line+'\n')


# txt

# For every category, download each chapter listed in neirong<N>.txt and
# write its title, URL and cleaned-up text to quanbu<N>.txt.
for i in range(1, len(lisy)+1):
    # "with" closes both files at the end of every pass; before, only the
    # handles of the last category were closed, after the loop.
    # Mode 'w' replaces 'a+' so that running the script again does not
    # append a second copy of every chapter to an existing file.
    with open(filenm+'neirong'+str(i)+'.txt', 'r') as ot:
        with open(filenm+'quanbu'+str(i)+'.txt', 'w') as outfile:
            # One URL per line. The set drops URLs listed more than once so
            # that no chapter is downloaded twice; blank lines are ignored.
            urls = set(line.strip() for line in ot)
            urls.discard('')
            li = sorted(urls)

            for line in li:
                print(line)
                r = requests.get(line).content
                soup = BeautifulSoup(r)  # parse the page once, use it twice

                title_div = soup.find("div", {'class': "post_title"})
                if title_div is None:
                    # Without this guard a page lacking the title block
                    # aborted the whole run with an AttributeError.
                    print('skipped (no post_title block): ' + line)
                    continue
                title = title_div.h2
                content = soup.findAll("div", {'class': "post_entry"})
                sti = str(title).replace('<h2>', '').replace('</h2>', '')

                # Turn paragraph tags into spaces and <br/> into line breaks,
                # then drop the remaining markup.
                scon = str(content).replace('<p>', '  ').replace('</p>', '  ').replace('<br/>', '\n')
                scon = re.sub("<.*>", "", scon)
                # NOTE(review): this removes everything from the start of a
                # line up to each ';' - presumably meant to strip HTML
                # entities; confirm before narrowing the pattern.
                scon = re.sub("(.*?);", "", scon)
                # Collapse all whitespace runs into single line breaks.
                scon = '\n'.join(scon.split())

                print(scon)
                # The trailing '\n' keeps the next chapter's title from being
                # glued to the last line of this chapter.
                outfile.write(sti+'\n'+line+'\n'+scon+'\n')

print('=========================    =======================')



#              
# Delete every entry of the output folder whose name starts with 'neirong'
# followed by a digit; print a message for every other entry.
targetDir = s+d+namef
p = re.compile(r'neirong[0-9]{1}')
for line in os.listdir(targetDir):
    if p.match(line) is None:
        print('    ！')
        continue
    print("       "+s+d+namef+d+line+'!!')
    os.remove(s+d+namef+d+line)

接続に失敗してプログラムがエラーを報告する場合があります。そのため requests.get(url).status_code != 200 を判定する必要があります。ただ、判定を追加したところ実行がさらに遅くなりました。すべてのページで判定しているためです。私のところの回線速度が数K程度しかないのが原因かもしれません。
以下は修正後の完全版です。判定とリクエスト回数が増えたため速度は非常に遅いので、慎重に使用してください:

#-*-coding:utf8-*-

#        
#2014-10-14
#ZJL

from bs4 import BeautifulSoup
import requests
import re
import os


#        URL           
# Fetch the site's front page and keep only the <a> tags whose href is a
# category (book index) URL. The dots in the pattern are escaped so that
# they match a literal '.' only.
r = requests.get('http://www.nanpaisanshu.org/').content
content = BeautifulSoup(r).findAll(
    'a', href=re.compile(r'\Ahttp://www\.nanpaisanshu\.org/[a-z]+\Z'))

# Read each href straight from its tag instead of splitting str(content)
# on ',' and '"': that text parsing raised IndexError when no link matched
# or when a link text contained a comma.
# set() drops duplicate URLs; sorted() keeps the neirong<N> numbering used
# further down the same from one run to the next.
lisy = sorted(set(a['href'] for a in content))


#     URL    。        html   

# Everything is stored in a sub-folder of the current working directory.
s = os.getcwd()
d = os.sep
namef = 'aaa'  # name of the output folder

# Create the folder when it is missing; otherwise print the message
# followed by the folder name.
f = os.path.exists(s + d + namef)
if f:
    print(u'    ' + namef)
else:
    os.mkdir(s + d + namef)

# Prefix shared by every file written below (ends with the separator).
filenm = s + d + namef + d

# Download every category page into neirong<N>.html (N starts at 1).
# enumerate() replaces the hand-maintained counter, and "with" closes each
# file as soon as it has been written. Previously the last page was still
# open (and possibly unflushed) while the next loop was reading it back.
for i, line in enumerate(lisy, 1):
    r = requests.get(line)
    print(r.content)
    print('\n')
    with open(filenm+'neirong'+str(i)+'.html', 'w') as tfile:
        tfile.write(r.content)

# Pull the chapter URLs out of each saved page and store them, one per
# line, in neirong<N>.txt. The pattern is compiled once, outside the loop.
p = re.compile(r'http://www\.nanpaisanshu\.org/.*?\.html')
for i in range(1, len(lisy)+1):
    with open(filenm+'neirong'+str(i)+'.html', "r") as fp:
        content = fp.read()
    with open(filenm+'neirong'+str(i)+'.txt', 'w') as of:
        for line in p.findall(content):
            # The '\n' escape had been turned into a real line break inside
            # the quotes, which is a syntax error.
            of.write(line+'\n')


# txt

# For every category, download each chapter listed in neirong<N>.txt and
# write its title, URL and cleaned-up text to quanbu<N>.txt. Chapters that
# cannot be fetched are reported and marked with a placeholder in the file.
for i in range(1, len(lisy)+1):
    # Same progress messages as before, depending on whether the output
    # file is already there.
    if os.path.exists(filenm+'quanbu'+str(i)+'.txt'):
        print("    "+filenm+'quanbu'+str(i)+'.txt'+'       ')
    else:
        print("  "+filenm+'quanbu'+str(i)+'.txt')

    # "with" closes both files at the end of every pass; before, only the
    # handles of the last category were closed, after the loop.
    # Mode 'w' empties an existing file, which is what os.remove() followed
    # by open(..., 'a+') achieved.
    with open(filenm+'neirong'+str(i)+'.txt', 'r') as ot:
        with open(filenm+'quanbu'+str(i)+'.txt', 'w') as outfile:
            # One URL per line. The set drops URLs listed more than once so
            # that no chapter is downloaded twice; blank lines are ignored.
            urls = set(line.strip() for line in ot)
            urls.discard('')
            li = sorted(urls)

            for line in li:
                # Fetch the page ONCE and reuse the response. The status
                # check used to issue its own requests, so every chapter was
                # downloaded three times.
                try:
                    resp = requests.get(line)
                except requests.exceptions.RequestException:
                    # A failed connection raises instead of returning a
                    # status code; handle it like any other failed download.
                    resp = None

                if resp is None or resp.status_code != 200:
                    print('      ，      !')
                    outfile.write('      。      '+'\n')
                    continue

                print('    ！')
                r = resp.content
                soup = BeautifulSoup(r)  # parse the page once, use it twice

                title_div = soup.find("div", {'class': "post_title"})
                if title_div is None:
                    # Without this guard a page lacking the title block
                    # aborted the whole run with an AttributeError.
                    print('skipped (no post_title block): ' + line)
                    continue
                title = title_div.h2
                content = soup.findAll("div", {'class': "post_entry"})
                sti = str(title).replace('<h2>', '').replace('</h2>', '')

                # Turn paragraph tags into spaces and <br/> into line breaks,
                # then drop the remaining markup.
                scon = str(content).replace('<p>', '  ').replace('</p>', '  ').replace('<br/>', '\n')
                scon = re.sub("<.*>", "", scon)
                # NOTE(review): this removes everything from the start of a
                # line up to each ';' - presumably meant to strip HTML
                # entities; confirm before narrowing the pattern.
                scon = re.sub("(.*?);", "", scon)
                # Collapse all whitespace runs into single line breaks.
                scon = '\n'.join(scon.split())

                print(scon)
                # The trailing '\n' keeps the next chapter's title from being
                # glued to the last line of this chapter.
                outfile.write(sti+'\n'+line+'\n'+scon+'\n')

print('=========================    =======================')



#              
# Delete every entry of the output folder whose name starts with 'neirong'
# followed by a digit; print a message for every other entry.
targetDir = s+d+namef
p = re.compile(r'neirong[0-9]{1}')
for line in os.listdir(targetDir):
    if p.match(line) is None:
        print('    。')
        continue
    print("       "+s+d+namef+d+line+'!!')
    os.remove(s+d+namef+d+line)

RotateAnimationクラスRotateAnimationクラスRotateAnimationクラス:回転変化アニメーションクラス

Android Cannot add task 'reportSourceSetTransformTest' as a task with that name already exists