Pythonで豆瓣(Douban)の映画タイトルと評価スコアを分析する
1685 ワード
Pythonを用いて豆瓣(Douban)のサイトからデータを抽出・分析し、映画タイトルと評価スコアを取得します。コードはそのまま実行できます。
学習用のみ!
import datetime
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
# Scrape the Douban movie listing with Selenium and export it to Excel.
#
# This first step launches Chrome, opens the listing URL in a new tab and
# leaves the driver focused on that tab.
# NOTE(review): the `executable_path` keyword is deprecated in Selenium 4 and
# is no longer accepted by recent 4.x releases; pass a `Service` object
# instead when upgrading — confirm against the installed Selenium version.
driver = webdriver.Chrome(executable_path="D:\\work\\python\\chromedriver.exe")
driver.maximize_window()
# `switch_to.window` replaces the `switch_to_window` helper that Selenium 4
# removed; this spelling is also available on Selenium 3.
driver.switch_to.window(driver.window_handles[0])

# Tag filter encoded in the URL: movie / 2010s / mainland China, with
# range=2,10 (presumably the score range — verify against the site).
url = 'https://movie.douban.com/tag/#/?sort=U&range=2,10&tags=%E7%94%B5%E5%BD%B1,2010%E5%B9%B4%E4%BB%A3,%E4%B8%AD%E5%9B%BD%E5%A4%A7%E9%99%86'

# Open the listing in a new tab through JavaScript rather than driver.get().
js = 'window.open("' + url + '")'
driver.execute_script(js)

# Close the tab the driver started on, then attach to the first remaining
# handle, which is expected to be the tab opened above.
driver.close()
driver.switch_to.window(driver.window_handles[0])
# Expand the listing by repeatedly clicking the "more" button.
#
# `max_more_clicks` caps the number of clicks; None means "keep going until
# the button can no longer be found or clicked".  An earlier, disabled variant
# of this loop stopped after 3 clicks — set the cap to 3 to get that behaviour.
max_more_clicks = None
more_clicks = 0

# Scroll far past the end of the document so the "more" button is in view.
js = "var q=document.documentElement.scrollTop=10000000"

while max_more_clicks is None or more_clicks < max_more_clicks:
    try:
        driver.execute_script(js)
        driver.find_element(By.CLASS_NAME, 'more').click()
    except WebDriverException:
        # Any Selenium failure (button missing, hidden, not clickable, ...)
        # is treated as "nothing more to load".  Unlike a bare `except`,
        # this lets Ctrl-C and genuine programming errors propagate.
        break
    # Give the page time to append the next batch of results.
    time.sleep(2)
    more_clicks += 1
# Collect the visible text of every title and rating element, plus the link
# target of every listing entry, then write them out as one spreadsheet.
# `find_elements(By.CLASS_NAME, ...)` replaces the `find_elements_by_class_name`
# helper, which newer Selenium 4 releases no longer provide.
# NOTE(review): DataFrame construction requires the three lists to have the
# same length; presumably each listing entry yields exactly one 'title',
# 'rate' and 'item' element — verify against the page markup.
name = [k.text for k in driver.find_elements(By.CLASS_NAME, 'title')]
score = [k.text for k in driver.find_elements(By.CLASS_NAME, 'rate')]
url = [k.get_attribute('href') for k in driver.find_elements(By.CLASS_NAME, 'item')]

# NOTE(review): the output file name is a single space plus ".xlsx"; the
# intended base name looks to have been lost — confirm before relying on it.
pd.DataFrame({'name': name, 'score': score, 'url': url}).to_excel(' .xlsx')
学習用のみ!