【Python】一键爬取金庸连载版/三联版/世纪新修版小说
前回『射鵰英雄伝』の連載版をクロールしてから1年ぶりに『倚天屠龍記』の連載版を読もうとしたところ、以前ネットで見つけたリソースには意外にも文字化けがあり、かといって他のリソースも見つけにくく、そのうえネット上のリソースの多くは取り込んで読むと目次が表示されないことに前々から不満だったので、いっそ自分で作ることにしました。実のところ、本を読むならやはり紙の本のほうがよいです。携帯電話で読むのはあまり味わいがなく、目も痛めます。自分でクロールしたくない場合は、整理済みの小説のコードを探してもかまいません。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Time : 2019/12/14 10:20
@Author : YuJinNeng
@Site :
@File : .py
@Software: PyCharm
"""
import random
import time
from bs4 import BeautifulSoup
from urllib import request
import re
def down_txt(href_content, book_name):
    """Download every chapter of one book and save it as ``<book_name>.txt``.

    The book's index page is fetched from jinyongwang.com, the author line is
    written at the top of the file, and then every chapter linked from the
    ``.mlist li`` entries is fetched and appended as title followed by body.

    Args:
        href_content: Site-relative path of the book's index page; it is
            appended to ``http://www.jinyongwang.com``.
        book_name: Output file name without the ``.txt`` extension.

    Raises:
        urllib.error.URLError: If a page cannot be fetched.
        IndexError: If a chapter page has no ``.bg0`` element.
    """
    from urllib.parse import urljoin

    # Inline ad snippet found in the chapter text; the chapter body is the
    # piece that follows its first occurrence.
    ad_marker = "(adsbygoogle = window.adsbygoogle || []).push({});"

    def fetch_soup(url):
        # Close the HTTP response deterministically instead of leaking it.
        with request.urlopen(url) as response:
            return BeautifulSoup(response.read(), 'html.parser')

    html_addr = "http://www.jinyongwang.com" + href_content
    soup = fetch_soup(html_addr)

    # NOTE(review): the regex patterns that used to extract the author label
    # and name were lost from this copy of the script (and ``author_name`` was
    # never defined), so the visible text of the ``.author`` element is used
    # instead -- confirm this matches the intended header line.
    author_tags = soup.select('.author')
    author_info = author_tags[0].get_text(strip=True) if author_tags else ''

    with open('%s.txt' % book_name, 'w+', encoding='utf-8') as f:
        f.write(author_info)
        f.write('\n' * 3)
        for one_content in soup.select(".mlist li"):
            link = one_content.find('a')
            if link is None or not link.get('href'):
                # List entry without a chapter link: nothing to download.
                continue
            chapter_name = link.get_text()
            # Resolves both absolute-path and relative hrefs against the
            # book's index URL.
            new_chapter_url = urljoin(html_addr, link['href'])
            one_soup = fetch_soup(new_chapter_url)
            content = one_soup.select(".bg0")[0].text
            parts = content.split(ad_marker)
            # Fall back to the full text when the ad marker is absent rather
            # than failing with IndexError in the middle of a book.
            chapter_content = parts[1] if len(parts) > 1 else content
            f.write(chapter_name)
            f.write(chapter_content)
def get_addr_url():
    """Crawl the site's book index and download every listed book.

    Fetches ``http://www.jinyongwang.com/book/``, walks each
    ``.booklist .pu_bookrotate`` entry, builds an output name from the cover
    image's ``alt`` text, prints it, and hands the book's link to
    ``down_txt``. A random 3-10 second pause follows each book.

    Raises:
        urllib.error.URLError: If the index page cannot be fetched.
    """
    url_addr = "http://www.jinyongwang.com/book/"
    # Close the HTTP response deterministically instead of leaking it.
    with request.urlopen(url_addr) as response:
        html_content = response.read()
    soup = BeautifulSoup(html_content, 'html.parser')
    story_list = soup.select('.booklist .pu_bookrotate')
    for one_story in story_list:
        # First child of the entry is the <a> tag; its first child carries
        # the title in its "alt" attribute.
        a_tag = one_story.contents[0]
        href_content = a_tag.get("href")
        alt_content = a_tag.contents[0].get("alt")

        # The three branches previously produced the same name, so different
        # editions of one title were written to the same file and overwrote
        # each other. Prefix the name with an edition label to keep them
        # apart.
        # NOTE(review): the labels come from the three editions named in the
        # article title; the mapping of the "n"/"o" link prefixes to editions
        # is an assumption -- TODO confirm against the site.
        slug = href_content.replace("/", "")
        if slug.startswith("n"):
            edition = "世纪新修版"
        elif slug.startswith("o"):
            edition = "连载版"
        else:
            edition = "三联版"
        alt_content = edition + " " + alt_content.strip(" ")

        print(alt_content)
        down_txt(href_content, alt_content)
        # Random pause between books, presumably to go easy on the server.
        time.sleep(random.randint(3, 10))
# Script entry point: start the crawl only when executed directly.
if __name__ == '__main__':
    get_addr_url()