Pythonクローラー実戦:原創力文档(book118)の有料ドキュメントをダウンロード --- スクロール式(最適化編)


原創力文档の有料ドキュメントのダウンロード - スクロール式(最適化編)
クロール速度を大幅に向上させ、インタフェースを最適化し、安全性・信頼性を高め、リソース消費を大幅に削減しました。
一、プロジェクトの需要:
ターゲットサイトから有料ドキュメントをダウンロードし、Word形式のファイルとして保存します。
二、考え方
  • 1.seleniumで非同期ロードされるページを読み込み、画像のurlを取得する
  • 2.画像をクロール(ダウンロード)する
  • 3.wordドキュメントに画像を書き込む
  • 4.wordドキュメントをPDFに変換する
    三、技術点
  • 1.python+selenium自動化
  • 2.python + docx
  • 3.python + pywin32

    四、環境
    python3.6 + selenium + docx + pywin32
      (インストールコマンド):
       pip install selenium -i https://pypi.tuna.tsinghua.edu.cn/simple/
       pip install python-docx -i https://pypi.tuna.tsinghua.edu.cn/simple/
       pip install pywin32 -i https://pypi.tuna.tsinghua.edu.cn/simple/
    

    五、コード
    import time
    
    from selenium import webdriver
    from selenium.webdriver.common import keys
    import requests
    from docx import Document
    from docx.shared import Inches
    from win32com.client import constants, gencache
    
    class YuanLC:
        """Scrape the page images of a book118 document preview into a .docx file.

        Constructing an instance performs the whole job: it opens the preview
        page in Chrome, expands and scrolls it so that the page images get
        loaded, downloads each image through a ``requests`` session and appends
        it to a python-docx document that is saved under ``self.filename``.

        Attributes:
            filename: Output .docx name; set by ``get_src`` from the page heading.
            session: ``requests`` session used for the image downloads.
            driver: Selenium Chrome driver; it is quit before ``__init__`` returns.
            url: Address of the document preview page.
            doc: python-docx ``Document`` the images are appended to.
        """

        def __init__(self, url):
            """Set up the HTTP session, browser and document, then run the scrape.

            Args:
                url: Address of the document preview page to download.
            """
            self.filename = None

            # Browser-like headers sent with every image request made through the
            # session. NOTE(review): the Cookie and Host values are hard-coded;
            # confirm they are still accepted by the image server.
            headers = {

                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,a"
                          "pplication/signed-exchange;v=b3;q=0.9",
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "zh-CN,zh;q=0.9",
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "Cookie": "CLIENT_SYS_UN_ID=3rvgCl9XYO1u41DVBzG/Ag==; s_v=cdh%3D%3E27a30245%7C%7C%7Cvid%3D%3E1599561"
                          "968279953439%7C%7C%7Cfsts%3D%3E1599561968%7C%7C%7Cdsfs%3D%3E0%7C%7C%7Cnps%3D%3E1; s_s=cdh%3"
                          "D%3E27a30245%7C%7C%7Clast_req%3D%3E1599561968%7C%7C%7Csid%3D%3E1599561968685697441%7C%7C%7Cd"
                          "sps%3D%3E0; __cfduid=dcce463c0931f0014f9ed1b030e9c47981599561968",
                "Host": "view-cache.book118.com",
                "Pragma": "no-cache",
                "Sec-Fetch-Dest": "document",
                "Sec-Fetch-Mode": "navigate",
                "Sec-Fetch-Site": "none",
                "Sec-Fetch-User": "?1",
                "Upgrade-Insecure-Requests": "1",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/8"
                              "3.0.4103.106 Safari/537.36",
            }
            self.session = requests.session()
            self.session.headers = headers

            self.url = url

            # Empty Word document that receives one picture per page.
            self.doc = Document()

            # The browser is created last so that nothing between its creation
            # and the try block can fail and leave a Chrome process behind.
            self.driver = webdriver.Chrome()
            try:
                self.driver.implicitly_wait(10)
                self.run()
            finally:
                # Always release the browser, even when the scrape raised.
                self.driver.quit()

        def get_src(self):
            """Open the preview page, load all of it and return the page containers.

            Side effect: sets ``self.filename`` from the page heading.

            Returns:
                The list of elements matching ``//div[@class="webpreview-item"]``;
                ``download`` looks for an ``img`` inside each of them.
            """
            # Locators are passed as (strategy, value) pairs because the
            # find_element_by_* helpers no longer exist in current Selenium;
            # 'xpath', 'id' and 'css selector' are the values of By.XPATH, By.ID
            # and By.CSS_SELECTOR.
            self.driver.get(self.url)
            heading = self.driver.find_element('xpath', '//*[@id="main"]/div[1]/div[1]/h1').text
            # Drops the last 7 characters of the heading and appends 'docx'.
            # NOTE(review): presumably the heading ends in a fixed 7-character
            # suffix - confirm against the live page.
            self.filename = heading[:-7] + 'docx'

            # Keep clicking the button area until its text equals the sentinel.
            # NOTE(review): the sentinel literal below is kept verbatim from the
            # original; it looks like its characters were lost (it is only
            # spaces), so this comparison may never succeed - restore the
            # intended button text.
            while True:
                tag = self.driver.find_element('xpath', '//div[@class="btns"]')
                if tag.text == '    ':
                    break
                # Bring the button into view before clicking it.
                self.driver.execute_script("arguments[0].scrollIntoView();", tag)
                tag.click()

            # Page count: the element text without its last character.
            page = self.driver.find_element('id', 'pagenumber').text[:-1]
            bottom = int(page) * 1260  # 1260 scroll units are used per page

            # Jump back to the top, then walk down the page in 50-unit steps.
            self.driver.execute_script("var q=document.documentElement.scrollTop=0")
            for offset in range(1260, bottom, 50):
                self.driver.execute_script("window.scrollTo(0, %s)" % str(offset))
                time.sleep(0.02)
            # Second, slower pass that starts at the sixth page.
            # NOTE(review): presumably this gives late images time to load - confirm.
            for offset in range(1260 * 6, bottom, 50):
                self.driver.execute_script("window.scrollTo(0, %s)" % str(offset))
                time.sleep(0.035)

            return self.driver.find_elements('xpath', '//div[@class="webpreview-item"]')

        def download(self, src):
            """Fetch the image shown inside one page container.

            Args:
                src: Page container element as returned by ``get_src``.

            Returns:
                The ``requests`` response for the image, or ``None`` when the
                ``img`` element has no ``src`` value.
            """
            url = src.find_element('css selector', 'img').get_attribute('src')

            print(url)
            if not url:
                return None
            return self.session.get(url)

        def createword(self, res):
            """Append one downloaded image to the document and save the document.

            Args:
                res: Response returned by ``download``; ``None`` is ignored.
            """
            if res is None:
                return

            try:
                # The image goes through a scratch file that is overwritten for
                # every page.
                with open('1.png', 'wb') as f:
                    f.write(res.content)

                # Fixed 6 x 8 inch picture size for every page.
                self.doc.add_picture("1.png", width=Inches(6), height=Inches(8))

            except Exception as e:
                # Best effort: a page that cannot be written is reported and
                # skipped so the remaining pages are still processed.
                print('1.png     ,   :%s' % str(e))

            # Saved after every page, so the file on disk always holds the pages
            # collected so far.
            self.doc.save(self.filename)

        @staticmethod
        def _pdf_path(docx_path):
            """Return the absolute path of *docx_path* with a ``.pdf`` extension."""
            import os  # function-scope import: the file's import block has no os

            stem, _ext = os.path.splitext(os.path.abspath(docx_path))
            return stem + '.pdf'

        def createpdf(self):
            """Export the saved .docx to PDF through Microsoft Word (COM).

            The PDF is written to ``self.pdfPath`` when that attribute has been
            set, otherwise next to the .docx with the extension replaced by
            ``.pdf``. Both paths are handed to Word as absolute paths.
            """
            import os  # function-scope import: the file's import block has no os

            # The original read self.pdfPath unconditionally although nothing in
            # the class assigns it; fall back to a path derived from the .docx.
            target = getattr(self, 'pdfPath', None) or self._pdf_path(self.filename)

            word = gencache.EnsureDispatch('Word.Application')
            try:
                # Names on early-bound COM objects are case-sensitive, hence
                # Documents and wdExportDocumentWithMarkup. The COM document is
                # kept in a local so that self.doc stays the python-docx document.
                source = word.Documents.Open(os.path.abspath(self.filename), ReadOnly=1)
                try:
                    source.ExportAsFixedFormat(
                        target,
                        constants.wdExportFormatPDF,
                        Item=constants.wdExportDocumentWithMarkup,
                        CreateBookmarks=constants.wdExportCreateHeadingBookmarks
                    )
                finally:
                    source.Close(constants.wdDoNotSaveChanges)
            finally:
                # Quit Word even when opening or exporting failed.
                word.Quit(constants.wdDoNotSaveChanges)

        def run(self):
            """Download every page image and append it to the document.

            PDF export is not part of this loop; ``createpdf`` has to be called
            separately.
            """
            for src in self.get_src():
                res = self.download(src)
                self.createword(res)
    
    
    # Script entry point: building the object performs the entire download,
    # because the constructor itself drives the browser and saves the .docx.
    YuanLC(url='https://max.book118.com/html/2021/0106/6232223004003045.shtm')
    
    
    
    ^_^ 役に立ったら「いいね」をお願いします~~~
    フルスクリーン閲覧式はこちら、オリジナル版のスクロール式はこちら