selenium同時実行
12289 ワード
forループとマルチスレッド+selenium
インスタンス1
forループ
# -*- coding: utf-8 -*-
"""
Datetime: 2019/6/22
Author: Zhang Yafei
Description:
"""
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from concurrent.futures import ThreadPoolExecutor
import functools
# Shared headless-Chrome configuration, reused by every driver created below.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument('--disable-gpu')
def timeit(func):
    """Decorator that prints the wall-clock time each call to *func* takes.

    :param func: callable to time
    :return: wrapper that forwards all arguments, prints the elapsed
             seconds (or minutes + seconds when >= 60s) and returns the
             wrapped callable's result unchanged
    """
    @functools.wraps(func)
    def inner(*args, **kwargs):
        start = time.time()
        ret = func(*args, **kwargs)
        end = time.time() - start
        if end < 60:
            print(f' :\t{round(end, 2)} ')
        else:
            # renamed from `min` to avoid shadowing the builtin min()
            minutes, sec = divmod(end, 60)
            print(f' \t{round(minutes)} \t{round(sec, 2)} ')
        return ret
    return inner
class PolicyUrlDownload(object):
    """Download policy link URLs from paginated list pages into a text file."""

    def __init__(self, url, pages_num, output_file, a_xpath, headless: bool = True):
        """
        :param url: page URL template containing one ``{}`` placeholder
        :param pages_num: number of pages to fetch (1..pages_num)
        :param output_file: path of the tab-separated output file
        :param a_xpath: XPath selecting the <a> elements to harvest
        :param headless: use the shared headless Chrome options when True
        """
        self.url_list = [url.format(page) for page in range(1, pages_num + 1)]
        self.output_file = output_file
        self.a_xpath = a_xpath
        if headless:
            self.driver = webdriver.Chrome(options=chrome_options)
        else:
            self.driver = webdriver.Chrome()

    def start(self, page, url):
        """Fetch one listing page and append '<page>\\t<href>' lines to the output file."""
        with open(self.output_file, mode='a', encoding='utf-8') as file:
            print(f"make request to {url}")
            self.driver.get(url)
            # NOTE(review): find_elements_by_xpath is the Selenium 3 API;
            # Selenium 4 removed it in favour of find_elements(By.XPATH, ...).
            titles = self.driver.find_elements_by_xpath(self.a_xpath)
            for title in titles:
                href = title.get_attribute('href')
                # fixed: the line separator must be the '\n' escape,
                # not a literal line break inside the f-string
                file.write(f'{page}\t{href}\n')
            print(f'{url} download completed')

    def run(self):
        """Download every page sequentially, then shut the browser down."""
        for page, url in enumerate(self.url_list):
            self.start(page + 1, url)
        # quit() (not close()) terminates the chromedriver process and
        # avoids leaking a browser per run
        self.driver.quit()
@timeit
def main(setting):
    """Build a PolicyUrlDownload from the *setting* dict and run the crawl."""
    PolicyUrlDownload(**setting).run()
if __name__ == '__main__':
    start_time = time.time()
    print('######################## #########################')
    # each dict describes one paginated listing to crawl
    settings = [
        {
            'output_file': ' .txt',
            'url': 'http://cdsip.nhfpc.gov.cn/work/0-{}.html',
            'pages_num': 8,
            'a_xpath': '//div[@id="active0"]/ul/li/a'
        },
        {
            'output_file': ' .txt',
            'url': 'http://cdsip.nhfpc.gov.cn/policy/0-{}-0.html',
            'pages_num': 9,
            'a_xpath': '//div[@class="infoContent box-body"]/ul/li/a'
        },
    ]
    # sequential baseline: crawl one listing after the other
    for cfg in settings:
        main(cfg)
    print(' , ', round(time.time() - start_time, 2), ' ')
結果
, 28.46
マルチスレッド
# -*- coding: utf-8 -*-
"""
Datetime: 2019/6/22
Author: Zhang Yafei
Description:
"""
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from concurrent.futures import ThreadPoolExecutor
import functools
# Shared headless-Chrome configuration, reused by every driver created below.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument('--disable-gpu')
def timeit(func):
    """Report the wall-clock duration of each call to *func*.

    The wrapped callable's result is returned untouched; the elapsed
    time is printed in seconds, or minutes + seconds when >= 60s.
    """
    @functools.wraps(func)
    def inner(*args, **kwargs):
        start = time.time()
        try:
            return func(*args, **kwargs)
        finally:
            end = time.time() - start
            if end < 60:
                print(f' :\t{round(end, 2)} ')
            else:
                min, sec = divmod(end, 60)
                print(f' \t{round(min)} \t{round(sec, 2)} ')
    return inner
class PolicyUrlDownload(object):
    """Collect policy article links from paginated list pages into a text file."""

    def __init__(self, url, pages_num, output_file, a_xpath, headless: bool = True):
        """
        :param url: URL template with a ``{}`` page-number placeholder
        :param pages_num: how many pages to crawl, starting at 1
        :param output_file: tab-separated output file path
        :param a_xpath: XPath matching the link elements to extract
        :param headless: use the shared headless Chrome options when True
        """
        self.url_list = [url.format(page) for page in range(1, pages_num + 1)]
        self.output_file = output_file
        self.a_xpath = a_xpath
        if headless:
            self.driver = webdriver.Chrome(options=chrome_options)
        else:
            self.driver = webdriver.Chrome()

    def start(self, page, url):
        """Append one '<page>\\t<href>' line per matched link on *url*."""
        with open(self.output_file, mode='a', encoding='utf-8') as file:
            print(f"make request to {url}")
            self.driver.get(url)
            # Selenium 3 API; Selenium 4 replaced it with find_elements(By.XPATH, ...)
            titles = self.driver.find_elements_by_xpath(self.a_xpath)
            for title in titles:
                href = title.get_attribute('href')
                # fixed: newline must be the '\n' escape, not a literal line break
                file.write(f'{page}\t{href}\n')
            print(f'{url} download completed')

    def run(self):
        """Crawl all pages in order, then terminate the browser process."""
        for page, url in enumerate(self.url_list):
            self.start(page + 1, url)
        # quit() ends the chromedriver process; close() would leak it
        self.driver.quit()
@timeit
def main(setting):
    """Run one crawl described by the *setting* dict, reporting its duration."""
    downloader = PolicyUrlDownload(**setting)
    downloader.run()
if __name__ == '__main__':
    start_time = time.time()
    print('######################## #########################')
    # each dict describes one paginated listing to crawl
    settings = [
        {
            'output_file': ' .txt',
            'url': 'http://cdsip.nhfpc.gov.cn/work/0-{}.html',
            'pages_num': 8,
            'a_xpath': '//div[@id="active0"]/ul/li/a'
        },
        {
            'output_file': ' .txt',
            'url': 'http://cdsip.nhfpc.gov.cn/policy/0-{}-0.html',
            'pages_num': 9,
            'a_xpath': '//div[@class="infoContent box-body"]/ul/li/a'
        },
    ]
    # one worker thread per settings dict; each gets its own Chrome driver,
    # so the two listings download concurrently
    with ThreadPoolExecutor() as pool:
        pool.map(main, settings)
    print(' , ', round(time.time() - start_time, 2), ' ')
結果
: 18.04
インスタンス2
シーケンシャル実行（逐次実行）
# -*- coding: utf-8 -*-
import os
import time
from concurrent.futures import ThreadPoolExecutor
from hashlib import md5
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
import numpy as np
class PolicyPageDownload(object):
    """Download the HTML of each URL into *dir_name*, one md5-named file per URL."""

    def __init__(self, file, dir_name, url_list):
        """
        :param file: path of the input URL-list file (kept for reference)
        :param dir_name: directory receiving the downloaded pages
        :param url_list: iterable of page URLs to fetch
        """
        self.file = file
        self.dir_name = dir_name
        self.urls = url_list
        # per-instance headless Chrome so parallel instances don't share a driver
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(options=self.chrome_options)

    def start(self, url):
        """Fetch *url* and write its page source to <dir_name>/<md5(url)>.html.

        :param url: page URL to download
        """
        self.driver.get(url)
        response = self.driver.page_source
        print(f'make request to {url}')
        file_name = md5(bytes(url, encoding='utf-8')).hexdigest() + '.html'
        with open(f'{self.dir_name}/{file_name}', 'w', encoding='utf-8') as file:
            file.write(response)
        print(f'{url} download completed')

    def run(self):
        """Download every URL sequentially, then shut the browser down."""
        # plain loop instead of a side-effect list comprehension;
        # the stray debug print('11111111111') was removed
        for url in self.urls:
            self.start(url)
        self.driver.quit()
def filter_urls(dir_name, urls):
    """Return the URLs whose pages have not been downloaded yet.

    A URL counts as downloaded when a non-empty file named
    ``md5(url).html`` exists inside *dir_name*.

    :param dir_name: directory holding previously downloaded pages
    :param urls: candidate URLs
    :return: list of URLs still to download, original order preserved
    """
    # hash every URL exactly once instead of re-hashing inside the filter
    name_of = {url: md5(bytes(url, encoding='utf-8')).hexdigest() + '.html' for url in urls}
    has_file = [file for file in os.listdir(dir_name) if os.path.getsize(os.path.join(dir_name, file)) > 0]
    encode_urls = set(name_of.values()) - set(has_file)
    down_urls = [url for url in urls if name_of[url] in encode_urls]
    print(f' {len(set(urls))}\t {len(set(has_file))}\t {len(encode_urls)}')
    return down_urls
def run(url_list):
    """Crawl *url_list*; file/dir config comes from the module-level ``setting``."""
    PolicyPageDownload(url_list=url_list, **setting).run()
def main(file, dir_name):
    """Read 'page\\turl' lines from *file* and download every pending page.

    :param file: input file with one tab-separated 'page\\turl' per line
    :param dir_name: output directory, created if missing
    """
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)
    # close the input file deterministically (the original leaked the handle)
    with open(file, 'r', encoding='utf-8') as inputfile:
        urls = [line.strip().split('\t')[1] for line in inputfile]
    # dir_name is guaranteed to exist here, so always resume-filter
    urls = filter_urls(dir_name, urls)
    run(urls)
if __name__ == '__main__':
    start_time = time.time()
    # NOTE: run() also reads this module-level dict, so the name must stay `setting`
    setting = {
        'file': ' .txt',
        'dir_name': ' '
    }
    main(**setting)
    print(' , ', round(time.time() - start_time, 2), ' ')
マルチスレッド
# -*- coding: utf-8 -*-
import os
import time
from concurrent.futures import ThreadPoolExecutor
from hashlib import md5
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
import numpy as np
class PolicyPageDownload(object):
    """Save the page source of each URL as <dir_name>/<md5(url)>.html."""

    def __init__(self, file, dir_name, url_list):
        """
        :param file: path of the input URL-list file (kept for reference)
        :param dir_name: directory receiving the downloaded pages
        :param url_list: iterable of page URLs to fetch
        """
        self.file = file
        self.dir_name = dir_name
        self.urls = url_list
        # each instance owns its driver, so one instance per worker thread is safe
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(options=self.chrome_options)

    def start(self, url):
        """Fetch *url* and persist its HTML under an md5-derived file name.

        :param url: page URL to download
        """
        self.driver.get(url)
        response = self.driver.page_source
        print(f'make request to {url}')
        file_name = md5(bytes(url, encoding='utf-8')).hexdigest() + '.html'
        with open(f'{self.dir_name}/{file_name}', 'w', encoding='utf-8') as file:
            file.write(response)
        print(f'{url} download completed')

    def run(self):
        """Download the assigned URLs one by one, then quit the browser."""
        # explicit loop replaces the side-effect list comprehension;
        # the leftover debug print('11111111111') was removed
        for url in self.urls:
            self.start(url)
        self.driver.quit()
def filter_urls(dir_name, urls):
    """Drop every URL whose md5-named page file already exists non-empty in *dir_name*.

    :param dir_name: directory holding previously downloaded pages
    :param urls: candidate URLs
    :return: URLs still to download, in their original order
    """
    has_file = []
    for entry in os.listdir(dir_name):
        if os.path.getsize(os.path.join(dir_name, entry)) > 0:
            has_file.append(entry)
    hashed = {md5(bytes(u, encoding='utf-8')).hexdigest() + '.html' for u in urls}
    encode_urls = hashed - set(has_file)
    down_urls = [u for u in urls
                 if md5(bytes(u, encoding='utf-8')).hexdigest() + '.html' in encode_urls]
    print(f' {len(set(urls))}\t {len(set(has_file))}\t {len(encode_urls)}')
    return down_urls
def run(url_list):
    """Download one chunk of URLs; config comes from the module-level ``setting``."""
    downloader = PolicyPageDownload(url_list=url_list, **setting)
    downloader.run()
def main(file, dir_name):
    """Download all pending pages from *file*, split over four parallel drivers.

    :param file: input file with one tab-separated 'page\\turl' per line
    :param dir_name: output directory, created if missing
    """
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)
    # close the input file deterministically (the original leaked the handle)
    with open(file, 'r', encoding='utf-8') as inputfile:
        urls = [line.strip().split('\t')[1] for line in inputfile]
    # dir_name is guaranteed to exist here, so always resume-filter
    urls = filter_urls(dir_name, urls)
    # four chunks -> four browser instances crawling concurrently
    with ThreadPoolExecutor() as pool:
        pool.map(run, np.array_split(urls, 4))
if __name__ == '__main__':
    start_time = time.time()
    # NOTE: run() also reads this module-level dict, so the name must stay `setting`
    setting = {
        'file': ' .txt',
        'dir_name': ' '
    }
    main(**setting)
    print(' , ', round(time.time() - start_time, 2), ' ')
実行結果
# 50 for : , 48.62
# 150 for : 150.22
# 150 : 80.84
転載先:https://www.cnblogs.com/zhangyafei/p/11075243.html