Pythonクローラー実戦:原創力文檔(book118)の有料ドキュメントをダウンロード---スライド式(最適化編)

21564 ワード

python 有料ダウンロード selenium オートメーション pythonクローラー クローラー

原創力文檔(book118)の有料ドキュメントのダウンロード-スライド式(最適化編)
クロール速度を大幅に向上させ、インタフェースを最適化し、安全性と信頼性を向上させ、リソース消費を大幅に低減
一、プロジェクトの需要:
ターゲットサイトから有料ドキュメントをダウンロードし、Word形式(.docx)のドキュメントとして保存します。
二、考え方

1.seleniumで非同期ロードされるページを読み込み、画像のurlを取得する

2.画像をダウンロードする

3.wordドキュメントに画像を書き込む

4.wordドキュメントをPDFに変換
三、技術点

1.python+selenium自動化

2.python + docx

3.python + pywin32

四、環境
python3.6 + selenium + docx + pywin32

  インストール(清華大学のPyPIミラーを使用):
   pip install selenium -i https://pypi.tuna.tsinghua.edu.cn/simple/
   pip install python-docx -i https://pypi.tuna.tsinghua.edu.cn/simple/
   pip install pywin32 -i https://pypi.tuna.tsinghua.edu.cn/simple/

五、コード

import time

from selenium import webdriver
from selenium.webdriver.common import keys
import requests
from docx import Document
from docx.shared import Inches
from win32com.client import constants, gencache

class YuanLC:
    """Scrape the preview pages of a book118 document into a .docx file.

    Instantiating the class runs the whole job: it opens the document page
    in Chrome, expands and scrolls through the preview so the page images
    are loaded, downloads every page image and appends it to a Word
    document saved in the current working directory.

    Attributes:
        url: Address of the document page to scrape.
        filename: Name of the .docx file; set by ``get_src``.
        session: ``requests`` session used to download the page images.
        driver: Selenium Chrome driver; only alive during ``__init__``.
        doc: python-docx ``Document`` that collects the page images.
    """

    # Vertical scroll distance treated as one preview page by get_src.
    _PAGE_HEIGHT = 1260
    # Step, in pixels, of each scrollTo call while sweeping the preview.
    _SCROLL_STEP = 50

    def __init__(self, url):
        """Set up the HTTP session and browser, then run the scrape.

        Args:
            url: Address of the document page to scrape.

        The browser is always closed before this returns, including when
        the scrape raises.
        """
        self.filename = None
        self.url = url

        # Headers sent with every image request made through the session.
        headers = {

            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,a"
                      "pplication/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Cookie": "CLIENT_SYS_UN_ID=3rvgCl9XYO1u41DVBzG/Ag==; s_v=cdh%3D%3E27a30245%7C%7C%7Cvid%3D%3E1599561"
                      "968279953439%7C%7C%7Cfsts%3D%3E1599561968%7C%7C%7Cdsfs%3D%3E0%7C%7C%7Cnps%3D%3E1; s_s=cdh%3"
                      "D%3E27a30245%7C%7C%7Clast_req%3D%3E1599561968%7C%7C%7Csid%3D%3E1599561968685697441%7C%7C%7Cd"
                      "sps%3D%3E0; __cfduid=dcce463c0931f0014f9ed1b030e9c47981599561968",
            "Host": "view-cache.book118.com",
            "Pragma": "no-cache",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/8"
                          "3.0.4103.106 Safari/537.36",
        }
        self.session = requests.session()
        self.session.headers = headers

        # Word document that receives one picture per preview page.
        self.doc = Document()

        # Browser used to render the page and trigger the lazy loading.
        self.driver = webdriver.Chrome()
        try:
            self.driver.implicitly_wait(10)
            self.run()
        finally:
            # Close Chrome even when the scrape fails, so no browser
            # process is left behind.
            self.driver.quit()

    def _scroll_range(self, start, stop, pause):
        """Scroll the window from ``start`` to ``stop`` in fixed steps.

        Args:
            start: First vertical offset passed to ``window.scrollTo``.
            stop: Exclusive upper bound of the offsets.
            pause: Seconds to sleep after each scroll step.
        """
        for offset in range(start, stop, self._SCROLL_STEP):
            self.driver.execute_script("window.scrollTo(0, %s)" % str(offset))
            time.sleep(pause)

    def get_src(self):
        """Load the document page and return its preview page elements.

        Also sets ``self.filename`` from the page heading.

        Returns:
            The elements matching ``//div[@class="webpreview-item"]``.
        """
        driver = self.driver

        # Locators are passed as plain strings ("xpath", "id"), which
        # find_element accepts on Selenium 3.x and 4.x alike; the
        # find_element_by_* helpers were removed in Selenium 4.3.
        driver.get(self.url)
        title = driver.find_element("xpath", '//*[@id="main"]/div[1]/div[1]/h1').text
        # Drop the last 7 characters of the heading and append 'docx'.
        # Presumably the heading ends in a dot-extension plus a fixed
        # suffix - TODO confirm against the live page.
        self.filename = title[:-7] + 'docx'

        # Keep clicking the button area until its text matches the marker.
        # NOTE(review): the marker below is four spaces; it looks like the
        # original non-ASCII text was lost. Restore it before running, or
        # this loop cannot end through the break.
        while True:
            tag = driver.find_element("xpath", '//div[@class="btns"]')
            if tag.text == '    ':
                break
            # Bring the button into view before clicking it.
            driver.execute_script("arguments[0].scrollIntoView();", tag)
            tag.click()

        # Page count: the element text minus its last character.
        page = driver.find_element("id", 'pagenumber').text[:-1]
        bottom = int(page) * self._PAGE_HEIGHT

        # Jump back to the top, then sweep down the preview twice.
        # Presumably the second, slower sweep catches images the first
        # one missed - TODO confirm.
        driver.execute_script("var q=document.documentElement.scrollTop=0")
        self._scroll_range(self._PAGE_HEIGHT, bottom, 0.02)
        self._scroll_range(self._PAGE_HEIGHT * 6, bottom, 0.035)

        return driver.find_elements("xpath", '//div[@class="webpreview-item"]')

    def download(self, src, timeout=30):
        """Download the image shown inside one preview page element.

        Args:
            src: Preview page element containing an ``img`` tag.
            timeout: Seconds to wait for the image request before giving
                up, so a stalled connection cannot hang the scrape.

        Returns:
            The ``requests`` response, or ``None`` when the image has no
            ``src`` attribute.
        """
        url = src.find_element("css selector", 'img').get_attribute('src')

        print(url)
        if not url:
            return None
        return self.session.get(url, timeout=timeout)

    def createword(self, res):
        """Append one downloaded page image to the Word document.

        Args:
            res: Response returned by ``download``; ``None`` is skipped
                without touching the document.

        The document is saved after every page, so pages already written
        survive a later failure.
        """
        if res is None:
            return

        # Best effort: a page that cannot be written is reported and
        # skipped instead of aborting the whole document.
        try:
            # python-docx reads the picture from this scratch file.
            with open('1.png', 'wb') as f:
                f.write(res.content)

            # 6 x 8 inches is the picture size used for every page.
            self.doc.add_picture("1.png", width=Inches(6), height=Inches(8))

        except Exception as e:
            print('1.png     ，   ：%s' % str(e))

        self.doc.save(self.filename)

    def createpdf(self):
        """Convert the saved Word document to PDF through Microsoft Word.

        The PDF is written to ``self.pdfPath`` when the caller has set
        that attribute, otherwise next to the .docx file with a ``.pdf``
        extension. Requires Windows with Microsoft Word installed.
        """
        import os

        # Word resolves relative paths against its own working directory,
        # so hand it absolute paths.
        docx_path = os.path.abspath(self.filename)
        pdf_path = getattr(self, 'pdfPath', None)
        if not pdf_path:
            pdf_path = os.path.splitext(docx_path)[0] + '.pdf'
        pdf_path = os.path.abspath(pdf_path)

        word = gencache.EnsureDispatch('Word.Application')
        try:
            # Early-bound COM names are case-sensitive: Documents and
            # wdExportDocumentWithMarkup must be spelled exactly.
            document = word.Documents.Open(docx_path, ReadOnly=1)
            document.ExportAsFixedFormat(
                pdf_path,
                constants.wdExportFormatPDF,
                Item=constants.wdExportDocumentWithMarkup,
                CreateBookmarks=constants.wdExportCreateHeadingBookmarks,
            )
        finally:
            # Always shut Word down so no hidden instance is left running.
            word.Quit(constants.wdDoNotSaveChanges)

    def run(self):
        """Download every preview page and write it into the document.

        PDF conversion is not part of the run; call ``createpdf`` once
        afterwards if a PDF is wanted.
        """
        for src in self.get_src():
            self.createword(self.download(src))


if __name__ == "__main__":
    # Run the scrape only when this file is executed as a script, so that
    # importing the module does not launch a browser.
    YuanLC(
        url='https://max.book118.com/html/2021/0106/6232223004003045.shtm',
    )

^_^役に立つならいいね~~~
フルスクリーン閲覧式点この原版スライド式点

こうやって01本

AIDL---Androidのリモートインタフェース(1)