Pythonベース－クローラーで小説をダウンロード

2555 ワード

Pythonベース－クローラーで小説をダウンロード
import requests
from bs4 import BeautifulSoup


def getnevel(content_url, i):
    """Download one chapter from `content_url`, append it to the global
    output file `f`, then recurse into the next-chapter link.

    Args:
        content_url: URL of the chapter page to fetch.
        i: number of chapters downloaded before this call.

    Returns:
        False once the next link leaves yidukk.com or more than 80
        chapters have been fetched; otherwise the recursive call's result.
    """
    i = i + 1
    header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    res = requests.get(content_url, headers=header, timeout=10)
    res.encoding = 'gbk'  # the site serves GBK-encoded pages
    soup = BeautifulSoup(res.text, 'html.parser')
    # BUG FIX: str.lstrip()/rstrip() treat their argument as a character
    # *set*, not a literal prefix/suffix, so lstrip('style5();') would also
    # eat leading letters of the real text. Remove the exact markers instead.
    title = soup.select('b')[0].text.lstrip()
    content = soup.select('.ART')[0].text
    if content.startswith('style5();'):
        content = content[len('style5();'):]
    if content.endswith('style6();'):
        content = content[:-len('style6();')]
    both = title + content
    next_url = 'https://yidukk.com/' + soup.select('.MC .btsc')[2]['href']
    print(both, file=f)  # f is the module-level output file handle
    print(i)
    # Stop when the "next" link leaves the site or the chapter cap is hit.
    if next_url.split('/')[2] != 'yidukk.com' or i > 80:
        return False
    return getnevel(next_url, i)


# Script entry (version 1): open the output file and start the crawl.
# A context manager guarantees the file is closed even if getnevel raises
# (the original open()/close() pair leaked the handle on any exception).
# `with ... as f` at module level still binds the global `f` that
# getnevel() writes to.
i = 0
with open("《       》.txt", 'w+', encoding='utf-8') as f:
    getnevel('https://yidukk.com/read_1045847_32261.html', i)
print('ok!')

import requests
from bs4 import BeautifulSoup
# Version 2 of the crawler: configurable constants + a shared counter.


# --- Configuration ---
start_url = "https://yidukk.com/read_1045847_32261.html" # URL of the first chapter to download
file_name = "《       》.txt"  # output file name (the novel's title)
max = 80  # maximum number of chapters to fetch; NOTE(review): shadows the builtin max()

header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
count = 0 # number of chapters downloaded so far (mutated by getContent)


# function:                 
# function: download one chapter, write it to the output file, and recurse
def getContent(content_url):
    """Fetch the chapter at `content_url`, append its title and body to the
    global file `f`, then follow the next-chapter link recursively.

    Returns False when the next link leaves yidukk.com or the global
    chapter cap `max` is reached; otherwise the recursive call's result.
    """
    global count
    count = count + 1  # one more chapter fetched

    res = requests.get(content_url, headers=header, timeout=10)
    res.encoding = 'gbk'  # site pages are GBK-encoded

    soup = BeautifulSoup(res.text, 'html.parser')
    # BUG FIX: lstrip()/rstrip() take a character *set*, not a literal
    # prefix/suffix, so lstrip('style5();') could also strip leading letters
    # of the actual chapter text. Remove the exact marker strings instead.
    title = soup.select('b')[0].text.lstrip()  # chapter title
    content = soup.select('.ART')[0].text      # chapter body
    if content.startswith('style5();'):
        content = content[len('style5();'):]
    if content.endswith('style6();'):
        content = content[:-len('style6();')]
    both = title + content

    print(both, file=f)  # write the chapter to the output file
    print("     "+str(count)+" ")  # progress: number of chapters done

    next_url = 'https://yidukk.com/' + soup.select('.MC .btsc')[2]['href']  # next-chapter URL
    print(next_url)  # debug: show the next URL
    # `>=` instead of `==` so the cap holds even if count ever jumps past
    # `max` (NOTE(review): `max` here is the module-level cap, which shadows
    # the builtin). Also stop when the link leaves the site.
    if next_url.split('/')[2] != 'yidukk.com' or count >= max:
        return False
    return getContent(next_url)


#MAIN
if __name__ == '__main__':
    f = open(file_name, 'a+',encoding='utf-8')
    getContent(start_url)
    f.close()
    print('      !')