Python基礎 - クローラーで小説をダウンロード
2555 ワード
Python基礎 - クローラーで小説をダウンロード
import requests
from bs4 import BeautifulSoup
def getnevel(content_url, i):
    """Download consecutive chapter pages, starting at *content_url*, into ``f``.

    Each page is fetched, decoded as GBK, and its title (text of the first
    ``<b>`` element) plus body (text of the first ``.ART`` element) are
    printed to the module-level file object ``f``. The crawl then follows
    the ``href`` of the third ``.MC .btsc`` element (used as the next-page
    link) until that link is missing, points outside yidukk.com, or the
    chapter counter exceeds 80.

    Args:
        content_url: URL of the first chapter page to fetch.
        i: Counter value before the first fetch. It is incremented once per
            page and echoed to stdout as a progress indicator.

    Returns:
        Always ``False`` once the crawl stops.

    Raises:
        requests.HTTPError: If a page answers with an HTTP error status.
        IndexError: If a page has no ``<b>`` or no ``.ART`` element.
    """
    from urllib.parse import urljoin, urlparse

    site_root = 'https://yidukk.com/'
    leading_marker = 'style5();'
    trailing_marker = 'style6();'
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

    # Iterate instead of recursing once per chapter, so the number of pages
    # is not tied to the interpreter's recursion limit.
    while True:
        i = i + 1
        res = requests.get(content_url, headers=header, timeout=10)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        res.raise_for_status()
        res.encoding = 'gbk'
        soup = BeautifulSoup(res.text, 'html.parser')

        title = soup.select('b')[0].text.lstrip(' ')
        content = soup.select('.ART')[0].text
        # str.lstrip/rstrip treat their argument as a *set of characters*,
        # not as a prefix/suffix, so they can also eat genuine text that
        # begins or ends with any of those characters. Remove the exact
        # markers instead.
        if content.startswith(leading_marker):
            content = content[len(leading_marker):]
        if content.endswith(trailing_marker):
            content = content[:-len(trailing_marker)]

        # Save the chapter before looking for the next link, so a page
        # without one is still written out.
        print(title + content, file=f)
        print(i)

        nav_links = soup.select('.MC .btsc')
        href = nav_links[2].get('href') if len(nav_links) > 2 else None
        if not href:
            # No usable next-page link: treat as the end of the crawl.
            return False

        # urljoin copes with relative, root-relative and absolute hrefs;
        # plain string concatenation always produced a yidukk.com URL,
        # which made the host check below impossible to trigger.
        next_url = urljoin(site_root, href)
        if urlparse(next_url).netloc != 'yidukk.com' or i > 80:
            return False
        content_url = next_url
# Script driver for getnevel(). The output file is bound to the module-level
# name ``f`` because getnevel() prints to that global. The ``with`` block
# guarantees the file is closed even if the crawl raises part-way through.
with open("《 》.txt", 'w+', encoding='utf-8') as f:
    i = 0
    getnevel('https://yidukk.com/read_1045847_32261.html', i)
print('ok!')
import requests
from bs4 import BeautifulSoup
# Crawl settings: getContent() reads ``header``, ``count`` and ``max``;
# the ``__main__`` block reads ``start_url`` and ``file_name``.

# Page of the first chapter to fetch.
start_url = "https://yidukk.com/read_1045847_32261.html"

# Path of the text file the script opens for output.
file_name = "《 》.txt"

# Chapter limit checked by getContent().
# NOTE: this name shadows the builtin max(); it is kept because
# getContent() looks the limit up under exactly this name.
max = 80

# Request headers sent with every page fetch (browser-like User-Agent).
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/50.0.2661.102 Safari/537.36'
    ),
}

# Running number of pages fetched; getContent() updates it via ``global``.
count = 0
# function: crawl chapter pages one after another, starting from a given URL
def getContent(content_url):
    """Download consecutive chapter pages, starting at *content_url*, into ``f``.

    Uses the module-level settings ``header`` (request headers), ``max``
    (chapter limit) and ``count`` (running page counter, updated in place),
    and prints every chapter to the module-level file object ``f``.

    For each page the title is the text of the first ``<b>`` element and
    the body is the text of the first ``.ART`` element. The crawl follows
    the ``href`` of the third ``.MC .btsc`` element (used as the next-page
    link) until that link is missing, points outside yidukk.com, or
    ``count`` has reached ``max``.

    Args:
        content_url: URL of the first chapter page to fetch.

    Returns:
        Always ``False`` once the crawl stops.

    Raises:
        requests.HTTPError: If a page answers with an HTTP error status.
        IndexError: If a page has no ``<b>`` or no ``.ART`` element.
    """
    global count
    from urllib.parse import urljoin, urlparse

    site_root = 'https://yidukk.com/'
    leading_marker = 'style5();'
    trailing_marker = 'style6();'

    # Iterate instead of recursing once per chapter, so the number of pages
    # is not tied to the interpreter's recursion limit.
    while True:
        count = count + 1  # one more page fetched
        res = requests.get(content_url, headers=header, timeout=10)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        res.raise_for_status()
        res.encoding = 'gbk'
        soup = BeautifulSoup(res.text, 'html.parser')

        title = soup.select('b')[0].text.lstrip(' ')  # chapter title
        content = soup.select('.ART')[0].text  # chapter body
        # str.lstrip/rstrip treat their argument as a *set of characters*,
        # not as a prefix/suffix, so they can also eat genuine text that
        # begins or ends with any of those characters. Remove the exact
        # markers instead.
        if content.startswith(leading_marker):
            content = content[len(leading_marker):]
        if content.endswith(trailing_marker):
            content = content[:-len(trailing_marker)]

        print(title + content, file=f)  # write the chapter to the file
        print(" "+str(count)+" ")  # progress output on stdout

        nav_links = soup.select('.MC .btsc')
        href = nav_links[2].get('href') if len(nav_links) > 2 else None
        if not href:
            # No usable next-page link: treat as the end of the crawl.
            return False

        # urljoin copes with relative, root-relative and absolute hrefs;
        # plain string concatenation always produced a yidukk.com URL,
        # which made the host check below impossible to trigger.
        next_url = urljoin(site_root, href)
        print(next_url)  # show which URL would be fetched next

        # ``>=`` rather than ``==`` so the crawl still stops if ``count``
        # was already past the limit when this function was called.
        if urlparse(next_url).netloc != 'yidukk.com' or count >= max:
            return False
        content_url = next_url
# MAIN
if __name__ == '__main__':
    # The output file is bound to the module-level name ``f`` because
    # getContent() prints to that global. The ``with`` block guarantees the
    # file is closed even if the crawl raises part-way through.
    with open(file_name, 'a+', encoding='utf-8') as f:
        getContent(start_url)
    print(' !')