Reading Column Articles with Python and Generating PDFs

The script downloads a column's published articles and saves each as a PDF that keeps the original website's styling. The main script below relies on a helper module, articleUtils.py, which is listed afterwards.
import io
import json
import sys

import requests
from bs4 import BeautifulSoup

from Novel import headers  # shared request headers defined in the Novel module
from articleUtils import *

# Re-wrap stdout so console output uses GBK (for a Chinese Windows console)
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='GBK')

def __find_next_page(url):
    htmls = []
    # Walk the column's article list page by page until the API reports the end
    while True:
        article_result = json.loads(requests.get(url, headers=headers).content.decode())
        for d in article_result['data']:
            htmls.append(__open_article(d['id']))
        if article_result['paging']['is_end']:
            break
        # The "next" URL returned by the API lacks the "/api" path segment; re-insert it
        parts = article_result['paging']['next'].split("/")
        parts.insert(3, "api")
        url = '/'.join(parts)
    if htmls:
        save_pdf(htmls)
        print('Done!')
    else:
        print('No articles to download')

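The paging logic above assumes a response of roughly this shape, inferred from the fields the code reads rather than from Zhihu's API documentation:

article_result = {
    'data': [{'id': 26252318}, ...],   # one entry per article on this page
    'paging': {
        'is_end': False,               # True once the last page is reached
        # note: no "/api" segment, which is why the code re-inserts it
        'next': 'https://zhuanlan.zhihu.com/columns/crossin/articles?limit=10&offset=20',
    },
}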

def __open_article(article_id):
    article_url = f'https://zhuanlan.zhihu.com/p/{article_id}'
    soup = BeautifulSoup(requests.get(article_url, headers=headers).content.decode(), 'html.parser')
    # Title, author and body are pulled from Zhihu's post markup
    title = safe_file_name(soup.select_one(".Post-Title").text)
    author = safe_file_name(soup.select_one('.AuthorInfo>meta[itemprop="name"]').attrs['content'])
    content = soup.select_one('.Post-RichTextContainer').prettify()

    # Lazy-loaded images keep their real URL in data-actualsrc / data-original
    return save_html(soup, content, title, author, './zhihu.css',
                     convert_img_attrs=['data-actualsrc', 'data-original'])

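For reference, here is a minimal sketch of the page structure those selectors assume (reconstructed from the selectors themselves, not captured from a live Zhihu page):

from bs4 import BeautifulSoup

# Simplified stand-in for a Zhihu post page
sample = '''
<h1 class="Post-Title">Article title</h1>
<div class="AuthorInfo"><meta itemprop="name" content="Author"></div>
<div class="Post-RichTextContainer">...body...</div>
'''
soup = BeautifulSoup(sample, 'html.parser')
print(soup.select_one('.Post-Title').text)                                    # Article title
print(soup.select_one('.AuthorInfo>meta[itemprop="name"]').attrs['content'])  # Author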

def down(url):
    # Download a single article given its URL
    article_id = url.split('/')[-1]
    html = __open_article(article_id)
    if html:
        # Swap the .html extension for .pdf, keeping the directory and name
        name = html.rsplit('.', 1)[0] + '.pdf'
        save_pdf(html, name)
        print('Done!')
    else:
        print('Failed to download the article')


def zhuanlan_down(s):
    # Build the column's article-list API URL from the column URL or slug
    url = 'https://zhuanlan.zhihu.com/api/columns/{}/articles?data%5B*%5D.upvoted_followees%2Cadmin_closed_comment&limit=10&offset=10'.format(
        str(s).strip().split('/')[-1])
    __find_next_page(url)


if __name__ == '__main__':
    # Download a whole column...
    zhuanlan_down('https://zhuanlan.zhihu.com/crossin')

    # ...or a single article by its URL
    down('https://zhuanlan.zhihu.com/p/26252318')


articleUtils.py
import base64
import io
import os
import re
import sys

import pdfkit
import requests

# Re-wrap stdout so console output uses UTF-8
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='UTF-8')


def safe_file_name(file_name):
    # Strip the characters Windows forbids in file names: \ / : * ? " < > |
    return re.sub(r'[\\/:*?"<>|]', "", file_name)


def get_image_file_as_base64_data(img_src):
    # Download the image and return it as a data: URI so the HTML is self-contained
    if img_src:
        return f'data:image/jpeg;base64,{base64.b64encode(requests.get(img_src).content).decode("utf-8")}'

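A quick illustration of the data: URI format this produces, using in-memory bytes in place of a downloaded image:

fake_jpeg_bytes = b'\xff\xd8\xff\xe0fake'  # stand-in for real image content
data_uri = f'data:image/jpeg;base64,{base64.b64encode(fake_jpeg_bytes).decode("utf-8")}'
print(data_uri)  # data:image/jpeg;base64,/9j/4GZha2U=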

def regex_str(s):
    # Escape '.' and '+' so the string can be used inside a regex pattern
    # (re.escape is the standard-library equivalent covering all metacharacters)
    for pattern in [r'\.', r'\+']:
        s = re.sub(pattern, pattern, s)
    return s


def convert_img_tag(content, soup, attrs):
    # Rewrite every <img> so its src is an inline base64 data: URI.
    # `attrs` lists the attributes that may hold the real image URL
    # (e.g. data-actualsrc / data-original for lazy-loaded Zhihu images).
    img_tags = soup.select('img')
    for img_tag in img_tags:
        if not img_tag or not img_tag.attrs:
            continue
        img_src = None
        old_tag = img_tag.prettify()
        for attr in attrs:
            if attr in img_tag.attrs and img_tag.attrs[attr]:
                img_src = img_tag.attrs[attr]
                break
        if img_src:
            # Complete protocol-relative URLs ("//pic1.zhimg.com/...")
            if img_src.startswith("//"):
                img_src = "http:" + img_src
            img_tag.attrs['src'] = get_image_file_as_base64_data(img_src)
            content = content.replace(old_tag, img_tag.prettify())
    return content

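A minimal sketch of the attribute lookup and URL completion this relies on, assuming a lazy-loaded tag like Zhihu's (the tag and URL here are made up):

from bs4 import BeautifulSoup

tag = BeautifulSoup('<img data-actualsrc="//pic1.zhimg.com/example.jpg">',
                    'html.parser').img
img_src = None
for attr in ['data-actualsrc', 'data-original']:
    if attr in tag.attrs and tag.attrs[attr]:
        img_src = tag.attrs[attr]
        break
if img_src.startswith("//"):
    img_src = "http:" + img_src
print(img_src)  # http://pic1.zhimg.com/example.jpg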

def save_file(content, title, user_name):
    # Write the HTML under ./<user_name>/<title>.html, skipping existing files
    path = './%s' % user_name
    if not os.path.exists(path):
        os.mkdir(path)
    file = '%s/%s.html' % (path, title)
    if not os.path.exists(file):
        with open(file, 'w', encoding='utf-8') as f:
            f.write(content)
    return file



def html_convert(soup, content, title, author, css_file_path=None, convert_img_attrs=None):
    # Wrap the article body in a minimal standalone page, with images inlined
    # and the site stylesheet embedded so wkhtmltopdf renders it faithfully
    content = convert_img_tag(content, soup, attrs=convert_img_attrs or ['src'])
    html = f"""<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <title>{title} {author}</title>
</head>
<body>
<h1>{title}</h1>
{content}
{__get_css(css_file_path)}
</body>
</html>"""
    return html


def save_html(soup, content, title, author, css_file_path=None, convert_img_attrs=None):
    html = html_convert(soup, content, title, author, css_file_path, convert_img_attrs)
    return save_file(html, title, author)


def __get_css(css_file_path):
    # Inline the stylesheet as a <style> block so the page needs no external files
    if css_file_path:
        with open(css_file_path, 'r', encoding='utf-8') as f2:
            return f'<style>{f2.read()}</style>'
    else:
        return ''


def save_pdf(htmls, output_path='./out.pdf'):
    path_wk = r'D:\Program Files (x86)\wkhtmltopdf\bin\wkhtmltopdf.exe'  # wkhtmltopdf install location
    config = pdfkit.configuration(wkhtmltopdf=path_wk)
    options = {
        'encoding': "UTF-8",
        'custom-header': [
            ('Accept-Encoding', 'gzip')
        ]
    }
    pdfkit.from_file(htmls, output_path, configuration=config, options=options)
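
Usage sketch (the file names here are hypothetical): save_pdf accepts either one saved HTML file or a list of them, since pdfkit.from_file merges a list of inputs into a single PDF:

save_pdf('./crossin/some-article.html', './some-article.pdf')
save_pdf(['./crossin/post-1.html', './crossin/post-2.html'], './column.pdf')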

The site's ./zhihu.css stylesheet has already been uploaded: https://download.csdn.net/download/mbh12333/11999306. If you need anything else, contact me; the files are too large to upload.