Python Scraping get_title
目標
- 投票サイトのランキングから、映画のタイトルを取得する
ソースコード
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait
import time
print("Input file_name:")
file_name = input()
Source_file = "/Users/micksmith/home/work/eBay/Python/" + file_name +".csv"
def scroll_down():
    """Click the "LOAD MORE" button until it disappears, loading every entry.

    Operates on the module-level ``driver``. Returns when the pagination
    button is no longer present (all ranking items are loaded) or when a
    Selenium lookup/click fails.
    """
    try:
        while True:
            # Bring the "LOAD MORE" button into view, then scroll back up
            # slightly so it is not hidden behind a sticky header.
            load_more = driver.find_element_by_id('pagination')
            driver.execute_script('arguments[0].scrollIntoView(true);', load_more)
            driver.execute_script('scrollBy(0, -150)')
            time.sleep(1)
            # No pagination button left means every entry has been loaded.
            if not driver.find_elements_by_id('pagination'):
                return
            load_more.click()
            time.sleep(1)
    except Exception:
        # Was a bare ``except:`` — narrowed so KeyboardInterrupt/SystemExit
        # are no longer swallowed. Any Selenium error (stale element, click
        # intercepted, ...) simply ends the scrolling loop.
        return
def get_title(title_Eng):
    """Append every visible movie title to ``title_Eng`` and return it.

    First scrolls/clicks through all "LOAD MORE" pages, then collects the
    text of each ``listItem__title`` element. The list passed in is
    mutated in place and also returned for convenience.
    """
    scroll_down()
    items = driver.find_elements_by_class_name('listItem__title')
    # Manual append loop replaced with the equivalent extend().
    title_Eng.extend(item.text for item in items)
    print("title_Eng:", title_Eng)
    return title_Eng
if __name__ == "__main__":
# Open Browser
options = Options()
#options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(executable_path="/Users/micksmith/home/work/eBay/Python/chromedriver", chrome_options=options)
#url = "https://www.ranker.com/list/most-popular-anime-today/ranker-anime"
url = "https://www.ranker.com/crowdranked-list/top-50-greatest-animated-films-of-all-time?ref=browse_list&l=1"
title_Eng = []
print("Page_Num:")
Page_Num = int(input())
print("MIN_Price:")
MIN_Price = int(input())
print("MAX_Price:")
MAX_Price = int(input())
driver.get(url)
df = pd.DataFrame()
df["Title_Eng"] = get_title(title_Eng)
df["Page_Num"] = [Page_Num for i in range(len(df))]
df["MIN_Price"] = [MIN_Price for i in range(len(df))]
df["MAX_Price"] = [MAX_Price for i in range(len(df))]
df.columns = ["Title_Eng","Page_Num","MIN_Price","MAX_Price"]
df.to_csv(Source_file, index=False)
# df.to_csv(Source_file)
driver.quit()
# res = requests.get(url)
# soup = BeautifulSoup(res.text)
# for title in soup.find_all(class_="listItem__data"):
# title_Eng.append(title.text)
結果
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait
import time
print("Input file_name:")
file_name = input()
Source_file = "/Users/micksmith/home/work/eBay/Python/" + file_name +".csv"
def scroll_down():
    """Click the "LOAD MORE" button until it disappears, loading every entry.

    Operates on the module-level ``driver``. Returns when the pagination
    button is no longer present (all ranking items are loaded) or when a
    Selenium lookup/click fails.
    """
    try:
        while True:
            # Bring the "LOAD MORE" button into view, then scroll back up
            # slightly so it is not hidden behind a sticky header.
            load_more = driver.find_element_by_id('pagination')
            driver.execute_script('arguments[0].scrollIntoView(true);', load_more)
            driver.execute_script('scrollBy(0, -150)')
            time.sleep(1)
            # No pagination button left means every entry has been loaded.
            if not driver.find_elements_by_id('pagination'):
                return
            load_more.click()
            time.sleep(1)
    except Exception:
        # Was a bare ``except:`` — narrowed so KeyboardInterrupt/SystemExit
        # are no longer swallowed. Any Selenium error (stale element, click
        # intercepted, ...) simply ends the scrolling loop.
        return
def get_title(title_Eng):
    """Append every visible movie title to ``title_Eng`` and return it.

    First scrolls/clicks through all "LOAD MORE" pages, then collects the
    text of each ``listItem__title`` element. The list passed in is
    mutated in place and also returned for convenience.
    """
    scroll_down()
    items = driver.find_elements_by_class_name('listItem__title')
    # Manual append loop replaced with the equivalent extend().
    title_Eng.extend(item.text for item in items)
    print("title_Eng:", title_Eng)
    return title_Eng
if __name__ == "__main__":
# Open Browser
options = Options()
#options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(executable_path="/Users/micksmith/home/work/eBay/Python/chromedriver", chrome_options=options)
#url = "https://www.ranker.com/list/most-popular-anime-today/ranker-anime"
url = "https://www.ranker.com/crowdranked-list/top-50-greatest-animated-films-of-all-time?ref=browse_list&l=1"
title_Eng = []
print("Page_Num:")
Page_Num = int(input())
print("MIN_Price:")
MIN_Price = int(input())
print("MAX_Price:")
MAX_Price = int(input())
driver.get(url)
df = pd.DataFrame()
df["Title_Eng"] = get_title(title_Eng)
df["Page_Num"] = [Page_Num for i in range(len(df))]
df["MIN_Price"] = [MIN_Price for i in range(len(df))]
df["MAX_Price"] = [MAX_Price for i in range(len(df))]
df.columns = ["Title_Eng","Page_Num","MIN_Price","MAX_Price"]
df.to_csv(Source_file, index=False)
# df.to_csv(Source_file)
driver.quit()
# res = requests.get(url)
# soup = BeautifulSoup(res.text)
# for title in soup.find_all(class_="listItem__data"):
# title_Eng.append(title.text)
結果
分析
- ページ下部までスクロールした後、出現する "LOAD MORE" ボタンをクリックしないと、次のランキングが表示されない仕様
- scroll_down 関数
- 要素位置を取得した後、スクロール位置を微調整する (JavaScript)
- "LOAD MORE" ボタンが出現する限り、ランキングを表示させる
- 表示された全ての映画タイトルを取得する
資料
- 使用した投票サイト
- scroll_down 関数
- 要素位置を取得した後、スクロール位置を微調整する (JavaScript)
- "LOAD MORE" ボタンが出現する限り、ランキングを表示させる
- 表示された全ての映画タイトルを取得する
- 使用した投票サイト
Author And Source
この問題について(Python Scraping get_title), 我々は、より多くの情報をここで見つけました: https://qiita.com/kganddl/items/91f9232ae28e4b30a73d 。著者帰属: 元の著者の情報は、元のURLに含まれています。著作権は原作者に属する。
Content is automatically searched and collected through network algorithms . If there is a violation . Please contact us . We will adjust (correct author information ,or delete content ) as soon as possible .