macOS 上で Python のクローラーを使い、巨潮資訊網(cninfo)の公告ファイルを自動的にダウンロードします。

7361 ワード

環境設定
ファイルのダウンロードには Python + selenium + wget + Safari の環境を選びました。本来は PhantomJS を使いたかったのですが、PhantomJS ではリンクをクリックして開いたページが空白になり、ファイルをダウンロードできませんでした。
Safari を使うときに遭遇するエラー: selenium.common.exceptions.WebDriverException: Message: Could not create a session: You must enable the 'Allow Remote Automation' option in Safari's Develop menu to control Safari via WebDriver. このエラーが出た場合は、Safari の「開発」メニューで「リモートオートメーションを許可」を有効にする必要があります。
ソースコード

#!/usr/bin/python

# -*- coding: utf-8 -*- 
__metaclass__ = type

import io
from selenium import webdriver
import time
import sys
import re
import os
from selenium.webdriver.common.keys import Keys
import wget
import urllib
from urllib import request
import shutil
#from selenium.webdriver import ActionChains
#from selenium.webdriver.common.keys import Keys

'''class: DownloadFromCninfo'''
class DownloadFromCninfo(object):
    """Download the announcement PDFs listed for one stock on cninfo.com.cn.

    A Safari WebDriver session opens the disclosure page, searches for the
    stock number, expands the announcement list and saves every linked PDF
    under ``./download/<stock number>/`` using ``wget``.

    Args:
        stockNumberStr: Stock number as a string of digits (it is passed to
            ``int()`` to pick the listing URL).
        maxNumber: Upper bound on how many announcements are expanded and
            downloaded.

    Raises:
        ValueError: If ``stockNumberStr`` is not a valid integer string.
    """

    # XPath expressions for the elements this scraper reads on the page.
    _STOCK_LINK_XPATH = "//ul[@id='stock_list']/li[1]/a"
    _NAME_XPATH = "//div[@id='plus-tag-div']/a/span"
    _STAT_XPATH = "//div[@id='con-div-his-fulltext']/div[@class='stat-right']"
    _SHOW_MORE_XPATH = "//div[@id='con-div-his-fulltext']/div[@class='show-more']/a"
    _ITEM_DATE_XPATH = "//ul[@id='ul_his_fulltext']/li[%d]/div[@class='t3']/dd/span[@class='d3']"
    _ITEM_LINK_XPATH = "//ul[@id='ul_his_fulltext']/li[%d]/div[@class='t3']/dd/span/a"

    def __init__(self, stockNumberStr, maxNumber=10000):
        self.stockNumber = stockNumberStr
        self.RecordDownloadIndex = 1
        self.maxDownloadNumber = maxNumber
        # Numbers from 600000 upwards use the "sse" listing, all others the
        # "szse" listing (presumably Shanghai vs. Shenzhen -- confirm).
        # int() runs before the browser is launched so that an invalid stock
        # number does not leave a Safari session behind.
        if int(stockNumberStr) >= 600000:
            self.dst_url = 'http://www.cninfo.com.cn/cninfo-new/disclosure/sse'
        else:
            self.dst_url = 'http://www.cninfo.com.cn/cninfo-new/disclosure/szse'
        # makedirs also creates the parent "./download/" directory, which a
        # plain mkdir would fail on when it does not exist yet.
        prefixpath = "./download/"
        self.prefixpathname = prefixpath + self.stockNumber + "/"
        os.makedirs(self.prefixpathname, exist_ok=True)
        self.driver = webdriver.Safari()
        #self.driver = webdriver.PhantomJS(executable_path='/usr/local/phantomjs/bin/phantomjs')

    @staticmethod
    def _parse_counts(text):
        """Return the first two integers found in *text*, or None if fewer exist."""
        numbers = re.findall(r'\d+', text)
        if len(numbers) < 2:
            return None
        return int(numbers[0]), int(numbers[1])

    @staticmethod
    def _safe_filename(name):
        """Replace path separators in *name* so it is usable as one file name."""
        return re.sub(r'[\\/]', '_', name)

    def _find_xpath(self, xpath):
        """Return the first element matching *xpath* in the current window."""
        from selenium.webdriver.common.by import By
        return self.driver.find_element(By.XPATH, xpath)

    def _find_class(self, class_name):
        """Return the first element carrying *class_name* in the current window."""
        from selenium.webdriver.common.by import By
        return self.driver.find_element(By.CLASS_NAME, class_name)

    def _switch_to_window_containing(self, needles, log_format='current url:%s'):
        """Switch through the open windows until one URL contains a needle.

        Returns True when a matching window is now current. Returns False
        when none matched, in which case the last window handle is current.
        """
        for handle in self.driver.window_handles:
            self.driver.switch_to.window(handle)
            current_url = self.driver.current_url
            print(log_format % current_url)
            if any(needle in current_url for needle in needles):
                return True
        return False

    def _read_counts(self):
        """Print the list status text and return the two counts parsed from it."""
        stat_text = self._find_xpath(self._STAT_XPATH).text
        print('%s' % stat_text)
        return self._parse_counts(stat_text)

    def _download_item(self, index):
        """Open the *index*-th (1-based) announcement and save its PDF."""
        # Return to the listing window; the previous item may have left the
        # driver on a window that has since been closed.
        self._switch_to_window_containing(("show",))
        time.sleep(1)

        date_text = self._find_xpath(self._ITEM_DATE_XPATH % index).text
        print('timestr %s' % date_text)

        link_xpath = self._ITEM_LINK_XPATH % index
        print('%s' % link_xpath)
        link = self._find_xpath(link_xpath)
        title = link.text
        print('%s' % title)

        link.click()
        time.sleep(5)
        found_pdf = self._switch_to_window_containing(("pdf", "PDF"), '%s')
        time.sleep(1)
        print('%s' % self.driver.current_url)

        if not found_pdf:
            # No PDF window opened: skip instead of downloading an unrelated
            # page and closing a window that may be the listing itself.
            print('    ！ignore')
            return

        filename = '%s%s.pdf' % (self._safe_filename(date_text.strip()),
                                 self._safe_filename(title))
        wget.download(self.driver.current_url,
                      os.path.join(self.prefixpathname, filename))
        # Close only the PDF window; the listing window stays open.
        self.driver.close()

    def _download_all(self):
        """Search the stock, expand the announcement list and fetch each PDF."""
        self.driver.set_page_load_timeout(10)
        self.driver.get(self.dst_url)
        self.driver.maximize_window()
        time.sleep(2)
        print('%s' % self.driver.current_url)

        self._find_class("input-stock").send_keys(self.stockNumber)
        # ENTER is sent instead of click(): the original author kept click()
        # commented out and notes that Enter is needed with Safari.
        self._find_xpath(self._STOCK_LINK_XPATH).send_keys(Keys.ENTER)
        time.sleep(5)
        self._switch_to_window_containing(("show",))
        time.sleep(1)

        counts = self._read_counts()
        name = self._find_xpath(self._NAME_XPATH).text
        print('%s' % name)

        # The second number is the count of list items currently shown (it
        # bounds the li[...] index below); the first is presumably the total.
        # Keep expanding until they agree or the configured limit is reached.
        while counts is not None and counts[0] != counts[1]:
            if counts[1] >= self.maxDownloadNumber:
                break
            self._find_xpath(self._SHOW_MORE_XPATH).click()
            time.sleep(1)
            counts = self._read_counts()

        if counts is None:
            print('Could not read announcement counts, nothing to download')
            return

        # Never fetch more than maxDownloadNumber items, even when the page
        # already lists more than that.
        item_count = min(counts[1], self.maxDownloadNumber)
        for index in range(1, item_count + 1):
            self._download_item(index)

    def downloadPDF(self):
        """Download every listed announcement PDF, then shut the browser down.

        The browser created in ``__init__`` is replaced by a fresh session
        first. The session is always quit afterwards, including when
        scraping raises.
        """
        self.driver.quit()
        #self.driver = webdriver.PhantomJS(executable_path='/usr/local/phantomjs/bin/phantomjs')
        self.driver = webdriver.Safari()
        try:
            self._download_all()
        finally:
            self.driver.quit()
        
if __name__ == "__main__":
    # Script entry point: the first command-line argument is the stock
    # number; at most 20 announcements are requested per run.
    if len(sys.argv) < 2:
        # Report the usage error on stderr and exit non-zero so callers
        # (shells, schedulers) can detect the failure.
        print("Input stock number error!", file=sys.stderr)
        print("usage: %s <stock number>" % sys.argv[0], file=sys.stderr)
        sys.exit(1)
    downloadHandle = DownloadFromCninfo(sys.argv[1], 20)
    downloadHandle.downloadPDF()

残された課題
PhantomJS が使えない具体的な原因は分かっていません。また、PhantomJS から Safari に切り替えたところ、click() が効かない場合があることが分かったため、Enter キーを送信する方式を使う必要があります。

Flask Web開発入門(二):Flask-Login の使用

Spring AOP におけるポイントカットの高度な使い方