『Python金融ビッグデータマイニングと分析全プロセス詳細』第10章PDFテキスト解析ノート整理

4322 ワード

1、PDF一括ダウンロード
手順:
(1)ブラウザ操作をシミュレートして,ダウンロードしたいPDFのキーワードを検索し,検索結果ページの内容を取得する.
(2)タイトル、リンク、日付を抽出し,2018-2019年のものだけを残す.
(3)ダウンロードリンクにアクセスし,ダウンロードボタンのクリックをシミュレートしてダウンロードする.ここで,ダウンロードの待ち時間として time.sleep() を追加することに注意する.

# =============================================================================
# 10.1               by    
# =============================================================================

from selenium import webdriver
import re
import time
# Open the cninfo full-text search page in Chrome and collect the HTML source
# of the first result page plus the pages reached by three pagination clicks.
# The whole session runs inside try/finally so that a failed lookup or click
# does not leave a Chrome process running.
browser = webdriver.Chrome()
try:
    # Search URL for the PDF keyword.
    # NOTE(review): the keyWord value is blank here - the search term appears
    # to have been lost from this literal; confirm the intended keyword.
    url = 'http://www.cninfo.com.cn/new/fulltextSearch?notautosubmit=&keyWord=  '
    browser.get(url)
    time.sleep(3)  # give the page time to load before reading its source
    data = browser.page_source
    p_count = 'id="page-info-title">   (.*?) '
    # findall returns a list, so the first capture is taken as the result
    # count; this raises IndexError when the pattern does not match.
    count = re.findall(p_count, data)[0]
    # Page count derived from the result count (presumably 10 results per page).
    pages = int(count) // 10

    # 1. Collect the page source of every visited result page.
    datas = [data]  # start with the first page, which is already loaded
    # Only three further pages are fetched here; `pages` is computed above
    # but not used by this loop.
    for _ in range(3):
        # Click the 12th pagination item (presumably the "next page" button).
        # find_element('xpath', ...) is used because the find_element_by_*
        # helpers no longer exist in current Selenium releases.
        browser.find_element('xpath', '//*[@id="pagination_title"]/ul/li[12]').click()
        time.sleep(2)
        data = browser.page_source
        datas.append(data)
        time.sleep(1)
finally:
    browser.quit()
# Merge all page sources into one string so the patterns can be applied once.
alldata = "".join(datas)

# 2. Extract titles, links and dates from the combined page source with
#    regular expressions.
# NOTE(review): all three patterns look like they lost their surrounding HTML
# markup (tag text on either side of the capture group). As written they can
# only match empty strings, so each findall() returns a list of '' entries.
# Restore the patterns against the live page markup before relying on them.
# Pattern for the announcement title.
p_title = '(.*?)'
# Pattern for the announcement link.
# NOTE(review): unlike the other two, this pattern has no capture group.
p_href = '.*?'
# Pattern for the announcement date.
p_date = '(.*?)'
title = re.findall(p_title, alldata)
href = re.findall(p_href, alldata)
date = re.findall(p_date, alldata)

# 3. Clean the extracted values in place and print them.
# The three lists are parallel, so they are walked together with zip();
# entries beyond the shortest list are left untouched instead of raising
# IndexError part-way through.
for i, (raw_title, raw_href, raw_date) in enumerate(zip(title, href, date)):
    # Strip every HTML tag from the title. The non-greedy '<.*?>' also removes
    # closing and multi-character tags, which '<.>' did not.
    title[i] = re.sub('<.*?>', '', raw_title)
    # Make the link absolute and drop 'amp;' so that '&amp;' becomes '&'.
    href[i] = ('http://www.cninfo.com.cn' + raw_href).replace('amp;', '')
    # Keep only the text before the first space of the date string.
    date[i] = raw_date.split(' ')[0]
    print(str(i + 1) + '.' + title[i] + ' - ' + date[i])
    print(href[i])

# 4. Keep only the entries whose date contains '2018' or '2019'.
# title, href and date are parallel lists, so they are filtered together as
# rows. Blanking entries and then removing '' from each list separately would
# shift the lists against each other whenever a kept row has an empty title.
_kept_rows = [
    row
    for row in zip(title, href, date)
    if '2018' in row[2] or '2019' in row[2]
]
# Slice assignment keeps the original list objects, as the in-place version did.
title[:] = [row[0] for row in _kept_rows]
href[:] = [row[1] for row in _kept_rows]
date[:] = [row[2] for row in _kept_rows]

# 5. Download each PDF by opening its page in a fresh Chrome instance and
#    clicking a link on that page.
# NOTE(review): the two status messages below differ only in leading spaces;
# their wording appears to have been lost, so success and failure are hard to
# tell apart in the output. Confirm the intended text.
for i, (pdf_title, pdf_url) in enumerate(zip(title, href)):
    browser = webdriver.Chrome()
    try:
        browser.get(pdf_url)
        # Click the fourth link in the page header area (presumably the
        # download button). find_element('xpath', ...) is used because the
        # find_element_by_* helpers no longer exist in current Selenium.
        browser.find_element('xpath', '/html/body/div/div[1]/div[2]/div[1]/div/a[4]').click()
        time.sleep(3)  # wait for the download before closing the browser
        print(str(i + 1) + '.' + pdf_title + ' PDF  ')
    except Exception:
        # Best effort: report the failed entry and continue with the next one.
        print(pdf_title + '  PDF  ')
    finally:
        # Always close the browser; previously a failed click left it open.
        browser.quit()

# Variant 1: run Chrome in headless mode (no browser window)
# for i in range(len(href)):
#     chrome_options = webdriver.ChromeOptions()
#     chrome_options.add_argument('--headless')
#     browser = webdriver.Chrome(options=chrome_options)
#     browser.get(href[i])
#     try:
#         browser.find_element_by_xpath('/html/body/div/div[1]/div[2]/div[1]/div/a[4]').click()
#         time.sleep(3)  #       ，          
#         browser.quit()
#         print(str(i+1) + '.' + title[i] + ' PDF  ')
#     except:
#         print(title[i] + '  PDF  ')


# Variant 2: download the PDFs into a custom download directory
# for i in range(len(href)):
#     chrome_options = webdriver.ChromeOptions()
#     prefs = {'profile.default_content_settings.popups': 0, 'download.default_directory': 'd:\\  '} #              
#     chrome_options.add_experimental_option('prefs', prefs)
#     browser = webdriver.Chrome(chrome_options=chrome_options)
#     browser.get(href[i])
#     try:
#         browser.find_element_by_xpath('/html/body/div/div[1]/div[2]/div[1]/div/a[4]').click()
#         time.sleep(3) #       ，          
#         print(str(i+1) + '.' + title[i] + '    ')
#         browser.quit()
#     except:
#         print(title[i] + '  PDF')

IDFラボ:シーザー暗号化

5つの方法pythonプログラムコードの性能分析とタイミング統計を行う