Headless で リストで与えられたページを取り込む (python3)


リストファイルに列挙された URL のページを、Headless の Firefox (Selenium) で順に取り込んで HTML ファイルとして保存する方法です。

fetch_html_list.py
#! /usr/bin/python
# ------------------------------------------------------------------
#
#   fetch_html_list.py
#
#                       Aug/24/2018
# ------------------------------------------------------------------
import  sys
from selenium.webdriver import Firefox
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as expected
from selenium.webdriver.support.wait import WebDriverWait

from selenium.webdriver.firefox.options import Options

# ------------------------------------------------------------------
def file_write_proc(file_name,str_out):
    """Write text to a file as UTF-8, replacing any existing content.

    Args:
        file_name: Path of the file to create or truncate.
        str_out: Text to write.
    """
    # The context manager closes the handle even when write() raises,
    # which the manual open()/close() pair did not guarantee.
    with open(file_name, mode='w', encoding='utf-8') as fp_out:
        fp_out.write(str_out)
#
# ------------------------------------------------------------------
def fetch_single_proc(url_target,file_html):
    """Load one URL in the shared browser and save its HTML to a file.

    Relies on the module-level ``driver`` object. Every call also writes a
    screenshot to "out.png", so only the most recent screenshot remains.

    Args:
        url_target: URL to open; also echoed to stderr.
        file_html: Path handed to file_write_proc for the page source.
    """
    message = "url_target = " + url_target + "\n"
    sys.stderr.write(message)

    driver.get(url_target)
    # The screenshot is taken before the page source is read.
    driver.save_screenshot("out.png")

    file_write_proc(file_html, driver.page_source)
#
# ------------------------------------------------------------------
# Script body: read a list file of "URL output_file" pairs and fetch each
# page with a headless Firefox, saving the HTML to the named file.
sys.stderr.write("*** 開始 ***\n")
if len(sys.argv) < 2:
    # Exit with a usage hint instead of a bare IndexError.
    sys.exit("Usage: fetch_html_list.py list_url.txt")
file_list = sys.argv[1]
with open(file_list,encoding='utf-8') as fp_in:
    lines = fp_in.readlines()
#
options = Options()
options.add_argument('-headless')
# `options=` replaces the deprecated `firefox_options=` keyword.
# NOTE(review): newer Selenium 4 releases appear to have dropped
# `executable_path` in favour of a Service object - confirm against the
# installed Selenium version.
driver = Firefox(executable_path='/usr/bin/geckodriver', options=options)
try:
    ttx = 100
    # Not used below; kept so the module-level name `wait` still exists.
    wait = WebDriverWait(driver, timeout=ttx)
    #
    for line in lines:
        print(line)
        # split() already discards the trailing newline; the former
        # line[:-1] cut a real character off a last line without one.
        cols = line.split()
        print(len(cols))
        if len(cols) < 2:
            # Blank or incomplete lines are skipped instead of crashing.
            if cols:
                sys.stderr.write("skipped (no output file): " + cols[0] + "\n")
            continue
        fetch_single_proc(cols[0],cols[1])
finally:
    # Always shut the browser down, even when a fetch raises.
    driver.quit()
#
sys.stderr.write("*** 終了 ***\n")
# ------------------------------------------------------------------

使い方

./fetch_html_list.py list_url.txt
list_url.txt (1 行に「URL 出力ファイル名」を空白区切りで記述します)
https://example.com/aaa/ a001.html
https://example.com/bbb/ a002.html
https://example.com/ccc/ a003.html
https://example.com/ddd/ a004.html