Pythonはコラム記事を読みPDFを生成する
29601 ワード
指定したコラムの掲載記事を、元のWebサイトのスタイルを保ったままPDFとして生成・保存します。
articleUtils.py
./zhihu.css(サイトのスタイルファイル)は https://download.csdn.net/download/mbh12333/11999306 にアップロード済みです。ファイルが大きすぎてここには添付できないため、必要な方は私に連絡してください。
# Standard library.
import io
import json
import sys

# Third-party.
import requests
from bs4 import BeautifulSoup

# Project-local: `headers` is the shared request-header dict; articleUtils
# supplies safe_file_name / save_html / save_pdf used below.
from Novel import headers
from articleUtils import *

# `sys` was previously only available because the star-import above leaks it
# from articleUtils; import it explicitly so this module stands on its own.
# Re-wrap stdout for a GBK console so CJK text prints without UnicodeEncodeError.
# NOTE(review): assumes the console really is GBK — confirm.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='GBK')
def __find_next_page(url):
    """Walk a column's paginated article-list API and save all articles as one PDF.

    url: a zhihu column articles endpoint (already containing the /api/ segment).
    Side effects: writes one HTML file per article, then a single ./out.pdf.

    The original implementation recursed for each page and called save_pdf at
    every recursion level, so each page batch overwrote ./out.pdf and the final
    PDF held only the first page's articles; it also never processed the
    `data` of the response whose `is_end` was true. This version iterates,
    accumulates every article (including the last page), and saves once.
    """
    htmls = []
    while True:
        result = json.loads(requests.get(url, headers=headers).content.decode())
        for item in result['data']:
            htmls.append(__open_article(item['id']))
        if result['paging']['is_end']:
            break
        # The API's `next` link omits the /api/ path segment; re-insert it.
        parts = result['paging']['next'].split("/")
        parts.insert(3, "api")
        url = '/'.join(parts)
    if htmls:
        save_pdf(htmls)
        print(' !')
    else:
        print(' ')
def __open_article(article_id):
    """Fetch one zhihu column article and save it as a styled HTML file.

    article_id: numeric id taken from the article-list API or the page URL.
    Returns whatever save_html returns (the path of the saved HTML file).
    """
    article_url = f'https://zhuanlan.zhihu.com/p/{article_id}'
    # Explicit parser: avoids bs4's GuessedAtParserWarning and makes parsing
    # independent of which optional parsers happen to be installed.
    soup = BeautifulSoup(
        requests.get(article_url, headers=headers).content.decode(),
        'html.parser')
    title = safe_file_name(soup.select_one(".Post-Title").text)
    author = safe_file_name(
        soup.select_one('.AuthorInfo>meta[itemprop="name"]').attrs['content'])
    content = soup.select_one('.Post-RichTextContainer').prettify()
    # The lazy-load attributes hold the real image URLs on zhihu pages.
    return save_html(soup, content, title, author, './zhihu.css',
                     convert_img_attrs=['data-actualsrc', 'data-original'])
def down(url):
    """Download a single article by its public URL and convert it to PDF.

    url: e.g. https://zhuanlan.zhihu.com/p/26252318
    The PDF is written next to the saved HTML file, with a .pdf extension.
    """
    article_id = url.split('/')[-1]
    html = __open_article(article_id)
    if html:
        # Replace only the final extension. The original used
        # split('.')[0] on the basename, which truncated any title
        # that itself contained a dot.
        name = html.rsplit('.', 1)[0] + ".pdf"
        save_pdf(html, name)
        print(' !')
    else:
        print(' ')
def zhuanlan_down(s):
    """Download every article of a column.

    s: the column URL (or anything whose last path segment is the column slug),
    e.g. https://zhuanlan.zhihu.com/crossin
    """
    column_slug = str(s).strip().split('/')[-1]
    # NOTE(review): offset=10 looks like it skips the first page of results —
    # confirm against the API before changing.
    api_url = ('https://zhuanlan.zhihu.com/api/columns/{}/articles'
               '?data%5B*%5D.upvoted_followees%2Cadmin_closed_comment'
               '&limit=10&offset=10').format(column_slug)
    __find_next_page(api_url)
if __name__ == '__main__':
    # Indentation was lost in the article extraction, which made this block a
    # syntax error; restored here. Example usage: download a whole column,
    # then one standalone article.
    zhuanlan_down('https://zhuanlan.zhihu.com/crossin')
    down('https://zhuanlan.zhihu.com/p/26252318')
articleUtils.py
import io
import os
import re
import base64
import sys
import requests
import pdfkit
# Force stdout to UTF-8 so non-ASCII article titles print without errors.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='UTF-8')
def safe_file_name(file_name):
    """Strip characters that are invalid in Windows file names.

    The original character class was a garbled run of '|' pseudo-alternations
    (a character class needs none) and had lost '<' and '>' — presumably
    stripped as HTML during extraction. Restored to the canonical set of
    characters Windows forbids in file names: \\ / : * ? " < > |
    """
    return re.sub(r'[\\/:*?"<>|]', "", file_name)
def get_image_file_as_base64_data(img_src):
    """Download *img_src* and return it as a base64 ``data:`` URI.

    Returns None when img_src is falsy (e.g. a missing attribute).
    """
    if not img_src:
        return None
    raw = requests.get(img_src).content
    encoded = base64.b64encode(raw).decode('utf-8')
    return f'data:image/jpeg;base64,{encoded}'
def regex_str(str):
    """Escape '.' and '+' in *str* so it can be embedded in a regex pattern.

    The original did ``re.sub(r'\\.', r'\\.', s)`` — relying on the obscure
    fact that an unknown non-letter escape in a replacement template is kept
    literally. Plain ``str.replace`` does the same thing transparently.
    The parameter keeps its (builtin-shadowing) name ``str`` so keyword
    callers are unaffected.
    """
    escaped = str
    for ch in ('.', '+'):
        escaped = escaped.replace(ch, '\\' + ch)
    return escaped
def convert_img_tag(content, soup, attrs):
    """Inline every <img> of *soup* into *content* as a base64 data URI.

    attrs: attribute names, in priority order, that may hold the real image
    URL (e.g. lazy-load attributes such as 'data-actualsrc').
    Returns the updated HTML string; tags without a usable URL are left alone.
    """
    for img in soup.select('img'):
        if not img or not img.attrs:
            continue
        # Snapshot the tag's markup before mutating it, so the textual
        # replacement below can still locate the original form in *content*.
        original_markup = img.prettify()
        src = next(
            (img.attrs[a] for a in attrs if a in img.attrs and img.attrs[a]),
            None)
        if not src:
            continue
        # Protocol-relative URLs can't be fetched directly; pin to http.
        if src.startswith("//"):
            src = "http:" + src
        img.attrs['src'] = get_image_file_as_base64_data(src)
        content = content.replace(original_markup, img.prettify())
    return content
def save_file(content, title, user_name):
    """Write *content* to ./<user_name>/<title>.html and return that path.

    Acts as a simple cache: an existing file is left untouched and its path
    is returned unchanged.
    """
    path = './%s' % user_name
    # makedirs tolerates an already-existing directory and creates missing
    # parents; the original os.mkdir raised in both situations.
    os.makedirs(path, exist_ok=True)
    file = '%s/%s.html' % (path, title)
    if not os.path.exists(file):
        # Drop any characters that cannot survive a UTF-8 round trip
        # (e.g. lone surrogates) before using the name on disk.
        file = file.encode('UTF-8', errors='ignore').decode(encoding='UTF-8')
        with open(file, 'w', encoding='utf-8') as f:
            f.write(content)
    return file
def html_convert(soup, content, title, author, css_file_path=None,
                 convert_img_attrs=None):
    """Assemble a standalone HTML document for one article.

    soup/content: the parsed page and the article body's markup.
    title/author: used for the document title and heading.
    css_file_path: optional stylesheet to embed via __get_css.
    convert_img_attrs: attribute names holding real image URLs; defaults to
    ['src']. (Was a mutable default argument; now a None sentinel.)
    Returns the full HTML document as a string.

    NOTE(review): the original f-string template had its HTML tags stripped
    during article extraction (it emitted bare text); reconstructed here as a
    minimal standalone document — confirm against the original source.
    """
    if convert_img_attrs is None:
        convert_img_attrs = ['src']
    content = convert_img_tag(content, soup, attrs=convert_img_attrs)
    html = f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>{title} {author}</title>
{__get_css(css_file_path)}
</head>
<body>
<h1>{title}</h1>
{content}
</body>
</html>"""
    return html
def save_html(soup, content, title, author, css_file_path=None,
              convert_img_attrs=None):
    """Render the article to a standalone HTML document and persist it.

    Returns the saved file's path (from save_file).

    Fix: a None convert_img_attrs (this function's own default) was passed
    straight through to convert_img_tag, which then crashed iterating it;
    it now defaults to the plain 'src' attribute.
    """
    if convert_img_attrs is None:
        convert_img_attrs = ['src']
    html = html_convert(soup, content, title, author, css_file_path,
                        convert_img_attrs)
    return save_file(html, title, author)
def __get_css(css_file_path):
    """Return the file's CSS wrapped in a <style> block, or '' for no path.

    Fix: the original read the file but returned ``f''`` — an empty string —
    discarding the content. The <style> wrapper was evidently stripped as
    HTML during article extraction; restored here so the CSS is embedded.
    """
    if not css_file_path:
        return ''
    with open(css_file_path, 'r', encoding='utf-8') as f2:
        return f'<style>{f2.read()}</style>'
def save_pdf(htmls, output_path='./out.pdf',
             wkhtmltopdf_path=r'D:\Program Files (x86)\wkhtmltopdf\bin\wkhtmltopdf.exe'):
    """Convert HTML file(s) into a single PDF via wkhtmltopdf.

    htmls: one path or a list of paths, as accepted by pdfkit.from_file.
    output_path: destination PDF file.
    wkhtmltopdf_path: location of the wkhtmltopdf executable — previously a
    hard-coded constant (annotated with a mojibake comment, presumably
    Chinese for "install location"); now an overridable parameter with the
    same default, so existing callers are unaffected.
    """
    config = pdfkit.configuration(wkhtmltopdf=wkhtmltopdf_path)
    options = {
        'encoding': "UTF-8",
        'custom-header': [
            ('Accept-Encoding', 'gzip')
        ]
    }
    pdfkit.from_file(htmls, output_path, configuration=config, options=options)
./zhihu.css(サイトのスタイルファイル)は https://download.csdn.net/download/mbh12333/11999306 にアップロード済みです。ファイルが大きすぎてここには添付できないため、必要な方は私に連絡してください。