Scraping East Money (eastmoney.com) financial reports with Python


East Money (eastmoney.com) carries a large amount of information about every listed company's financial reports. By scraping this information, we can analyze the data and obtain the results we are after.
This article uses the requests, re and json modules to fetch the site's data and save it as csv files. The full code is shown below.
import requests
import re
import json
import csv
import os
import time

# Set the working directory where the csv files will be saved
file_path = '/Users/username/Desktop'
if not os.path.exists(file_path):
    os.mkdir(file_path)
os.chdir(file_path)


# 1. Set the report date, table type and request parameters
def set_table():
    print('*' * 80)
    print('\t\t\t\tEast Money financial report download')
    print('--------------')

    # 1. Enter the report year and quarter
    # int() cannot parse a float string directly, so go through float() first
    # https://stackoverflow.com/questions/1841565/valueerror-invalid-literal-for-int-with-base-10
    year = int(float(input('Enter the report year (2008-2019):\n')))
    while (year < 2008 or year > 2019):
        year = int(float(input('Year out of range, please re-enter:\n')))

    quarter = int(float(input('Enter the quarter (1: Q1, 2: half-year, 3: Q3, 4: annual):\n')))
    while (quarter < 1 or quarter > 4):
        quarter = int(float(input('Quarter out of range, please re-enter:\n')))

    # Convert the quarter to a two-digit month, left-padded with 0
    # http://www.runoob.com/python/att-string-format.html
    quarter = '{:02d}'.format(quarter * 3)
    # quarter = '%02d' %(int(month)*3)

    # June and September end on day 30, the other report months on day 31
    if (quarter == '06') or (quarter == '09'):
        day = 30
    else:
        day = 31
    date = '{}-{}-{}'.format(year, quarter, day)
    # print('date:', date)  # ok

    # 2. Choose which table to download
    tables = int(input(
        'Choose a table (1-performance report; 2-performance express report; 3-performance forecast; 4-appointment disclosure; 5-balance sheet; 6-income statement; 7-cash flow statement):\n'))
    dict_tables = {1: 'performance report', 2: 'performance express report',
                   3: 'performance forecast', 4: 'appointment disclosure',
                   5: 'balance sheet', 6: 'income statement', 7: 'cash flow statement'}
    dict = {1: 'YJBB', 2: 'YJKB', 3: 'YJYG',
            4: 'YYPL', 5: 'ZCFZB', 6: 'LRB', 7: 'XJLLB'}
    category = dict[tables]

    # The js 'type' prefix: tables 1-4 use 'YJBB20_', the last three use 'CWBB_'
    # Set the type, st, sr and filter parameters for the chosen table
    if tables == 1:
        category_type = 'YJBB20_'
        st = 'latestnoticedate'
        sr = -1
        filter = "(securitytypecode in ('058001001','058001002'))(reportdate=^%s^)" % (date)
    elif tables == 2:
        category_type = 'YJBB20_'
        st = 'ldate'
        sr = -1
        filter = "(securitytypecode in ('058001001','058001002'))(rdate=^%s^)" % (date)
    elif tables == 3:
        category_type = 'YJBB20_'
        st = 'ndate'
        sr = -1
        filter = " (IsLatest='T')(enddate=^2018-06-30^)"
    elif tables == 4:
        category_type = 'YJBB20_'
        st = 'frdate'
        sr = 1
        filter = "(securitytypecode ='058001001')(reportdate=^%s^)" % (date)
    else:
        category_type = 'CWBB_'
        st = 'noticedate'
        sr = -1
        filter = '(reportdate=^%s^)' % (date)
    category_type = category_type + category
    # print(category_type)

    # Hand the parameters built here over to get_table()
    yield {
        'date': date,
        'category': dict_tables[tables],
        'category_type': category_type,
        'st': st,
        'sr': sr,
        'filter': filter
    }


# 2. Choose the page range to download
def page_choose(page_all):
    # Start page and number of pages to crawl
    start_page = int(input('Enter the start page:\n'))
    nums = input('Enter the number of pages to crawl (press Enter to download all pages):\n')
    print('*' * 80)

    # Determine the end page
    if nums.isdigit():
        end_page = start_page + int(nums)
    elif nums == '':
        end_page = int(page_all.group(1))
    else:
        print('Invalid input, please re-enter')

    # Hand the start and end pages over to the main loop
    yield {
        'start_page': start_page,
        'end_page': end_page
    }


# 3. Download one page of table data
def get_table(date, category_type, st, sr, filter, page):
    # Query parameters of the js interface
    params = {
        # 'type': 'CWBB_LRB',
        'type': category_type,
        # 'token': '70f12f2f4f091e459a279469fe49eca5',
        'st': st,
        'sr': sr,
        'p': page,
        'ps': 50,
        # 'js': 'var LFtlXDqn={pages:(tp),data: (x)}',
        'filter': filter,
        # 'rt': 51294261
    }
    url = 'http://dcfm.eastmoney.com/em_mutisvcexpandinterface/api/js/get?'
    # print(url)
    response = requests.get(url, params=params).text
    # print(response)

    # Extract the total number of pages
    pat = re.compile('var.*?{pages:(\d+),data:.*?')
    page_all = re.search(pat, response)
    print(page_all.group(1))  # ok

    # Extract the data part; it is a JSON string, so parse it with json.loads
    # pattern = re.compile('var.*?data: \[(.*)]}', re.S)
    pattern = re.compile('var.*?data: (.*)}', re.S)
    items = re.search(pattern, response)
    # items = re.findall(pattern, response)
    # print(items[0])
    data = items.group(1)
    data = json.loads(data)
    # data = json.dumps(data, ensure_ascii=False)
    return page_all, data, page


# Write the csv header once, using the keys of the first record
def write_header(data, category):
    with open('{}.csv'.format(category), 'a', encoding='utf_8_sig', newline='') as f:
        headers = list(data[0].keys())
        # print(headers)  # ok
        writer = csv.writer(f)
        writer.writerow(headers)


# Append one page of records to the csv file, row by row
def write_table(data, page, category):
    print('\nDownloading page %s' % page)
    for d in data:
        with open('{}.csv'.format(category), 'a', encoding='utf_8_sig', newline='') as f:
            w = csv.writer(f)
            w.writerow(d.values())


def main(date, category_type, st, sr, filter, page):
    func = get_table(date, category_type, st, sr, filter, page)
    data = func[1]
    page = func[2]
    write_table(data, page, category)


if __name__ == '__main__':
    # Collect the user's choices, then download the chosen pages one by one
    for i in set_table():
        date = i.get('date')
        category = i.get('category')
        category_type = i.get('category_type')
        st = i.get('st')
        sr = i.get('sr')
        filter = i.get('filter')
        constant = get_table(date, category_type, st, sr, filter, 1)
        page_all = constant[0]
        for i in page_choose(page_all):
            start_page = i.get('start_page')
            end_page = i.get('end_page')
            # Write the header row before downloading the data pages
            write_header(constant[1], category)
            start_time = time.time()
            for page in range(start_page, end_page):
                main(date, category_type, st, sr, filter, page)
            end_time = time.time() - start_time
            print('Download finished')
            print('Elapsed time: {:.1f} s'.format(end_time))
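Before wiring everything together, it can help to see what the js interface actually returns for a single page. Below is a minimal sketch of the request that get_table() sends, using the income statement parameters as an example. The endpoint and parameter names come from the code above; the response wrapper format (var ...={pages:N,data:[...]}) is an assumption taken from the commented js parameter, so treat this as a sketch rather than a stable contract.

import requests
import re
import json

# One page of the income statement (CWBB_LRB) for the 2018 annual reports.
# Endpoint and parameter names are taken from get_table() above; the exact
# response wrapper is assumed and may change on the site's side.
params = {
    'type': 'CWBB_LRB',
    'st': 'noticedate',
    'sr': -1,
    'p': 1,
    'ps': 50,
    'filter': '(reportdate=^2018-12-31^)',
}
url = 'http://dcfm.eastmoney.com/em_mutisvcexpandinterface/api/js/get?'
response = requests.get(url, params=params).text

# The body looks roughly like: var xxxx={pages:<total>,data:[{...},{...}]}
pages = re.search(r'pages:(\d+)', response).group(1)
records = json.loads(re.search(r'data: (.*)}', response, re.S).group(1))
print('total pages:', pages)
print('keys of the first record:', list(records[0].keys()))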

If you want to download income statement data for every period (2008 through 2019), that is easy: comment out the date-based filter parameter in params, which means no date filtering is applied and data for all periods is downloaded. With the filter line commented out, the total page count page_all grows from the 72 pages of the 2019 interim reports to 2528 pages, and once the download completes the table holds more than 120,000 rows. Based on this data, we can then attempt some worthwhile analysis.
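Concretely, the change described above is just dropping the 'filter' entry from the params dict built in get_table(). A hedged sketch of the modified dict, followed by a quick pandas check on the resulting csv (the file name is whatever category label set_table() produced, so adjust it to yours):

# Inside get_table(): with 'filter' commented out, no report date is applied,
# so the interface returns data for every period and page_all grows accordingly.
params = {
    'type': category_type,
    'st': st,
    'sr': sr,
    'p': page,
    'ps': 50,
    # 'filter': filter,   # commented out -> no date filter, all periods
}

# Quick sanity check on the downloaded file
import pandas as pd
df = pd.read_csv('income statement.csv')   # adjust to your category's file name
print(len(df), 'rows')
print(df.head())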
Reference: https://www.makcyun.top/2018/10/12/web_scraping_withpython6.html
Selenium can also be used to capture the data, although it is slower that way. The main point of using Selenium here is to become familiar with it: sometimes a page is rendered with JavaScript rather than loaded through Ajax, and then this library is what you need.
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import time
import pandas as pd
import os

# Use the Chrome browser (PhantomJS also works)
# browser = webdriver.Chrome()

# Run Chrome in headless mode
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
browser = webdriver.Chrome(chrome_options=chrome_options)

# browser = webdriver.PhantomJS()  # PhantomJS can be used instead, but Chrome is preferred here
browser.maximize_window()  # maximize the browser window
wait = WebDriverWait(browser, 10)


def index_page(page):
    try:
        print('Crawling page %s' % page)
        wait.until(
            EC.presence_of_element_located((By.ID, "dt_1")))
        # Page 1 is loaded by default; for later pages, jump via the page-number input box
        if page > 1:
            # Locate the page-number input box and enter the target page
            input = wait.until(EC.presence_of_element_located(
                (By.XPATH, '//*[@id="PageContgopage"]')))
            input.click()
            input.clear()
            input.send_keys(page)
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#PageCont > a.btn_link')))
            submit.click()
            time.sleep(2)
        # Confirm the jump: wait until the highlighted page number equals the target page
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#PageCont > span.at'), str(page)))
    except Exception:
        return None


def parse_table():
    # Option 1: wait until the table with id dt_1 is present
    # element = wait.until(EC.presence_of_element_located((By.ID, "dt_1")))
    # Option 2: locate the table directly with a CSS selector
    element = browser.find_element_by_css_selector('#dt_1')

    # Grab the text of every td cell in the table
    td_content = element.find_elements_by_tag_name("td")
    lst = []
    for td in td_content:
        # print(type(td.text)) # str
        lst.append(td.text)

    # Count the number of columns from the first row
    col = len(element.find_elements_by_css_selector('tr:nth-child(1) td'))
    # The td texts form one flat list; split it into one sub-list per row using the column count
    lst = [lst[i:i + col] for i in range(0, len(lst), col)]

    # One column holds announcement links (a.red); collect each link's URL into a separate list
    lst_link = []
    links = element.find_elements_by_css_selector('#dt_1 a.red')
    for link in links:
        url = link.get_attribute('href')
        lst_link.append(url)

    lst_link = pd.Series(lst_link)
    # Convert the list of rows into a DataFrame
    df_table = pd.DataFrame(lst)
    # Append the announcement URLs as an extra column
    df_table['url'] = lst_link

    # print(df_table.head())
    return df_table


# Save the table to a csv file
def write_to_file(df_table, category):
    # Default save location: an eastmoney folder on the desktop (created below if missing)
    file_path = '/Users/username/Desktop/eastmoney'
    if not os.path.exists(file_path):
        os.mkdir(file_path)
    os.chdir(file_path)
    df_table.to_csv('{}.csv' .format(category), mode='a',
                    encoding='utf_8_sig', index=0, header=0)


# Prompt for the report date, table type and page range
def set_table():
    print('*' * 80)
    print('\t\t\t\tEast Money financial report download')
    print('--------------')

    # 1. Enter the report year and quarter
    # int() cannot parse a float string directly, so go through float() first
    # https://stackoverflow.com/questions/1841565/valueerror-invalid-literal-for-int-with-base-10
    year = int(float(input('Enter the report year (2007-2018):\n')))
    while (year < 2007 or year > 2018):
        year = int(float(input('Year out of range, please re-enter:\n')))

    quarter = int(float(input('Enter the quarter (1: Q1, 2: half-year, 3: Q3, 4: annual):\n')))
    while (quarter < 1 or quarter > 4):
        quarter = int(float(input('Quarter out of range, please re-enter:\n')))

    # Convert the quarter to a two-digit month, left-padded with 0
    # http://www.runoob.com/python/att-string-format.html
    quarter = '{:02d}'.format(quarter * 3)
    # quarter = '%02d' %(int(month)*3)
    date = '{}{}'.format(year, quarter)
    # print(date)  # ok

    # 2. Choose which table to download
    tables = int(input(
        'Choose a table (1-performance report; 2-performance express report; 3-performance forecast; 4-appointment disclosure; 5-balance sheet; 6-income statement; 7-cash flow statement):\n'))
    dict_tables = {1: 'performance report', 2: 'performance express report',
                   3: 'performance forecast', 4: 'appointment disclosure',
                   5: 'balance sheet', 6: 'income statement', 7: 'cash flow statement'}
    dict = {1: 'yjbb', 2: 'yjkb/13', 3: 'yjyg',
            4: 'yysj', 5: 'zcfz', 6: 'lrb', 7: 'xjll'}
    category = dict[tables]

    # 3. Build the page url
    # e.g. url = 'http://data.eastmoney.com/bbsj/201803/lrb.html'
    url = 'http://data.eastmoney.com/{}/{}/{}.html'.format('bbsj', date, category)

    # 4. Choose the page range to download
    start_page = int(input('Enter the start page:\n'))
    nums = input('Enter the number of pages to crawl (press Enter to download all pages):\n')
    print('*' * 80)

    # Open the page and read the total number of pages from the pager
    browser.get(url)
    try:
        page = browser.find_element_by_css_selector('.next+ a')  # the a node right after the "next" button
    except:
        page = browser.find_element_by_css_selector('.at+ a')
    # else:
    #     print('Pager node not found')
    # try/except is used because most pagers expose '.next+ a', but a few tables
    # have only 2 pages and therefore no '.next+ a' node
    end_page = int(page.text)

    if nums.isdigit():
        end_page = start_page + int(nums)
    elif nums == '':
        end_page = end_page
    else:
        print('Invalid input')

    # Confirm what will be downloaded
    print('Downloading: {}-{}'.format(date, dict_tables[tables]))
    print(url)

    yield {
        'url': url,
        'category': dict_tables[tables],
        'start_page': start_page,
        'end_page': end_page
    }


def main(category, page):
    try:
        index_page(page)
        # parse_table()  # only print the parsed table
        df_table = parse_table()
        write_to_file(df_table, category)
        print('Page %s saved' % page)
        print('--------------')
    except Exception:
        print('Failed to crawl this page, skipping it')


if __name__ == '__main__':
    for i in set_table():
        # url = i.get('url')
        category = i.get('category')
        start_page = i.get('start_page')
        end_page = i.get('end_page')
        for page in range(start_page, end_page):
            # for page in range(44, pageall + 1):  # resume from a given page after an interruption
            main(category, page)
    print('All tables downloaded')
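One caveat if you run this script today: the find_element_by_* helpers and the chrome_options= argument used above belong to Selenium 3 and were removed in Selenium 4. A minimal sketch of the equivalent Selenium 4 calls, using the same selectors as the code above:

from selenium import webdriver
from selenium.webdriver.common.by import By

options = webdriver.ChromeOptions()
options.add_argument('--headless')
browser = webdriver.Chrome(options=options)   # Selenium 4: pass options=, not chrome_options=

browser.get('http://data.eastmoney.com/bbsj/201803/lrb.html')
table = browser.find_element(By.CSS_SELECTOR, '#dt_1')        # was find_element_by_css_selector
cells = table.find_elements(By.TAG_NAME, 'td')                # was find_elements_by_tag_name
links = table.find_elements(By.CSS_SELECTOR, '#dt_1 a.red')   # was find_elements_by_css_selector
print(len(cells), 'cells,', len(links), 'announcement links')
browser.quit()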

Reference: https://www.makcyun.top/2018/10/02/web_scraping_withpython5.html