selenium


After installing Selenium through the IDE's Interpreter -> Add Package menu:
# https://chromedriver.chromium.org/downloads
# Download the Chrome driver from the URL above
from selenium import webdriver
import os
import time
from selenium.webdriver.chrome.options import Options

# options = Options()
# options.binary_location = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"
# driver = webdriver.Chrome("chromedriver.exe", options = options)
driver = webdriver.Chrome("../tools/chromedriver_win32/chromedriver.exe")
driver.get("https://www.naver.com")
if not os.path.exists("./driverImage"):
    os.mkdir("./driverImage")

# capture the page and save it (save_screenshot always writes PNG data,
# so use a .png extension)
driver.save_screenshot('./driverImage/naver.png')

time.sleep(2)

driver.get("https://www.daum.net")
driver.save_screenshot('./driverImage/daum.png')
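Note that this post uses the Selenium 3 API throughout (a positional driver path plus the find_element_by_* helpers). Those calls were removed in Selenium 4, where the driver path goes through a Service object and lookups use By. A minimal sketch of the same screenshot step in that style, assuming the same chromedriver path:
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Selenium 4 style: the driver path is wrapped in a Service object
service = Service("../tools/chromedriver_win32/chromedriver.exe")
driver = webdriver.Chrome(service=service)

driver.get("https://www.naver.com")
os.makedirs("./driverImage", exist_ok=True)
driver.save_screenshot("./driverImage/naver.png")
driver.quit()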


Searching (you can find the element ids with the browser's developer tools)
from selenium import webdriver
import time

driver = webdriver.Chrome("../tools/chromedriver_win32/chromedriver.exe")

driver.get("https://www.naver.com")

time.sleep(0.5)

elem_search = driver.find_element_by_id("query")
elem_search.clear()
elem_search.send_keys("상선약수")

time.sleep(0.5)

elem_search_btn = driver.find_element_by_id("search_btn")
elem_search_btn.click()

time.sleep(0.5)

elem_water_word = driver.find_element_by_xpath("""//*[@id="main_pack"]/section[1]/div/div[2]/div[1]/dl/dt/a/span""")
elem_water_word.click()
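The fixed time.sleep pauses work, but Selenium's explicit waits are more robust: they poll until the element is actually ready instead of sleeping a fixed amount. A sketch of the same search-box lookup using WebDriverWait:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome("../tools/chromedriver_win32/chromedriver.exe")
driver.get("https://www.naver.com")

# block until the search box exists, up to a 5-second timeout
elem_search = WebDriverWait(driver, 5).until(
    EC.presence_of_element_located((By.ID, "query"))
)
elem_search.send_keys("상선약수")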


The reason to use Selenium is that it can collect data even from dynamic pages, i.e. pages whose address stays the same while the elements inside change (such as data fetched via Ajax).
from selenium import webdriver

driver = webdriver.Chrome("../tools/chromedriver_win32/chromedriver.exe")

driver.get("https://www.opinet.co.kr/")
driver.implicitly_wait(1)       # implicit wait: poll up to 1 second when locating elements (not a fixed pause like time.sleep(1))
driver.get("https://www.opinet.co.kr/searRgSelect.do")

# gu_list_raw = driver.find_element_by_id("SIGUNGU_NM0")
# gu_list_raw = driver.find_element_by_css_selector("#SIGUNGU_NM0")
gu_list_raw = driver.find_element_by_xpath("""//*[@id="SIGUNGU_NM0"]""")
gu_list = gu_list_raw.find_elements_by_tag_name("option")
gu_names = [gu_obj.get_attribute("value") for gu_obj in gu_list]
for gu in gu_names:
    print(gu)
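Since SIGUNGU_NM0 is a <select> element, the same list can also be read through Selenium's Select helper, which wraps the option handling; a sketch:
from selenium import webdriver
from selenium.webdriver.support.ui import Select

driver = webdriver.Chrome("../tools/chromedriver_win32/chromedriver.exe")
driver.get("https://www.opinet.co.kr/")
driver.implicitly_wait(1)
driver.get("https://www.opinet.co.kr/searRgSelect.do")

# Select wraps the <select> element; .options lists its <option> children
gu_select = Select(driver.find_element_by_id("SIGUNGU_NM0"))
gu_names = [opt.get_attribute("value") for opt in gu_select.options if opt.get_attribute("value")]
print(gu_names)
# gu_select.select_by_value(gu_names[0])   # would also switch the page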



Getting the gu (district) names
gu_names.remove("")     # drop the empty entry
print(gu_names)

Now switch the page to each gu option in turn, pausing a moment or two between changes,

and add a click on the "Save to Excel" button each time:
for gu in gu_names:
    element = driver.find_element_by_id("SIGUNGU_NM0")
    element.send_keys(gu)
    driver.implicitly_wait(1)
    excel_btn = driver.find_element_by_xpath("""//*[@id="glopopd_excel"]/span""")
    excel_btn.click()
    driver.implicitly_wait(1)
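Each pass of this loop triggers a file download. By default Chrome drops the files into the user's Downloads folder; to collect them somewhere predictable, the download directory can be set through Chrome's preferences before the driver is created. A minimal sketch, assuming a local ./downloads folder:
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# point Chrome's downloads at a local folder so the Excel files land in a
# known place; the ./downloads path is an assumption for this sketch
download_dir = os.path.abspath("./downloads")
os.makedirs(download_dir, exist_ok=True)

options = Options()
options.add_experimental_option("prefs", {"download.default_directory": download_dir})
driver = webdriver.Chrome("../tools/chromedriver_win32/chromedriver.exe", options=options)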
    
    
This downloads one Excel file per district.
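Once they are downloaded, the per-district files can be stitched together with pandas. A sketch, assuming the files landed in ./downloads as .xls and that the header sits on the third row (both are assumptions; inspect the real files, and note that reading .xls needs the xlrd package):
import glob
import pandas as pd

# combine every downloaded Excel file into one DataFrame; the glob pattern
# and header row are guesses for this sketch
frames = [pd.read_excel(f, header=2) for f in glob.glob("./downloads/*.xls")]
stations = pd.concat(frames, ignore_index=True)
print(len(stations))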

Using BeautifulSoup


Run the code below to save the page source to an HTML file; the saved HTML can then be loaded and parsed with bs (BeautifulSoup).
# save the page source to a file
element = driver.find_element_by_id("SIGUNGU_NM0")
element.send_keys(gu_names[0])
driver.implicitly_wait(1)
html = driver.page_source
with open("opi.html", "w", encoding="utf-8") as f:
    f.write(html)
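The saved file can be read straight back into BeautifulSoup, which makes it easy to experiment with selectors offline; a quick sketch:
from bs4 import BeautifulSoup

# parse the page that was saved above without touching the browser again
with open("opi.html", "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")
print(soup.title)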
    
    
    
Add code that uses bs to pull the elements out of the HTML and collect the following station information:
from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Chrome("../tools/chromedriver_win32/chromedriver.exe")

driver.get("https://www.opinet.co.kr/")
driver.implicitly_wait(1)       # implicit wait: poll up to 1 second when locating elements (not a fixed pause like time.sleep(1))
driver.get("https://www.opinet.co.kr/searRgSelect.do")

# gu_list_raw = driver.find_element_by_id("SIGUNGU_NM0")
# gu_list_raw = driver.find_element_by_css_selector("#SIGUNGU_NM0")
gu_list_raw = driver.find_element_by_xpath("""//*[@id="SIGUNGU_NM0"]""")
gu_list = gu_list_raw.find_elements_by_tag_name("option")
gu_names = [gu_obj.get_attribute("value") for gu_obj in gu_list]
gu_names.remove("")

for gu in gu_names:
    element = driver.find_element_by_id("SIGUNGU_NM0")
    element.send_keys(gu)
    driver.implicitly_wait(1)
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    a_stations = soup.select("#body1 > tr > td.rlist > a")
    station_names = [a.text for a in a_stations]
    print(station_names)




from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Chrome("../tools/chromedriver_win32/chromedriver.exe")

driver.get("https://www.opinet.co.kr/")
driver.implicitly_wait(1)       # implicit wait: poll up to 1 second when locating elements (not a fixed pause like time.sleep(1))
driver.get("https://www.opinet.co.kr/searRgSelect.do")

# gu_list_raw = driver.find_element_by_id("SIGUNGU_NM0")
# gu_list_raw = driver.find_element_by_css_selector("#SIGUNGU_NM0")
gu_list_raw = driver.find_element_by_xpath("""//*[@id="SIGUNGU_NM0"]""")
gu_list = gu_list_raw.find_elements_by_tag_name("option")
gu_names = [gu_obj.get_attribute("value") for gu_obj in gu_list]
gu_names.remove("")

station_names = []
for gu in gu_names:
    print(gu + " - collecting gas stations")
    element = driver.find_element_by_id("SIGUNGU_NM0")
    element.send_keys(gu)
    driver.implicitly_wait(0.1)
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    a_stations = soup.select("#body1 > tr > td.rlist > a")
    names = [a.text for a in a_stations]
    station_names.extend(names)

# replace tab characters and strip the surrounding whitespace
station_names = [name.replace('\t', '').strip() for name in station_names]
for station in station_names:
    print(station)
print("서울 총 주유소 개수 : ", len(station_names))



If you change the printing code to lay the names out three per line, it looks like this:
station_names = [name.replace('\t', '').strip() for name in station_names]
for i, station in enumerate(station_names, 1):
    print(station, end="\t")
    if i % 3 == 0:          # start a new line after every third name
        print()
print("Total number of gas stations in Seoul:", len(station_names))