Crawls movies of every genre on Douban (douban.com) using the requests and BeautifulSoup libraries, plus Selenium to render the JavaScript-built chart page.
Code:
# -*-coding:utf-8-*-
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from selenium import webdriver
import re
import requests
import time
import json
import random
# HTTP request headers: a desktop Chrome User-Agent plus a Douban Referer so
# requests look like they come from a normal browser session on the site.
Hostreferer = {
'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36',
'Referer':'https://movie.douban.com/subject/'
}
# Pool of HTTP proxy endpoints (host:port) used to rotate the outgoing IP.
# NOTE(review): hard-coded public proxies like these go stale quickly — verify before use.
ip_list = ['61.155.164.109:3128', '117.158.57.2:3128', '123.207.25.143:3128', '61.155.164.107:3128','61.155.164.111:3128','61.4.184.180:3128']
def use_requests():
    """Fetch this client's public IP through a random proxy from ip_list and print it.

    Serves as a quick sanity check that proxy rotation works before crawling.
    Failures are reported, not raised, so a dead proxy does not abort the crawl.
    """
    proxy = 'http://' + random.choice(ip_list)
    try:
        # fix: added a timeout — public proxies frequently hang, and without a
        # timeout this call could block the whole crawler forever
        html = requests.get('http://icanhazip.com',
                            proxies={'http': proxy},
                            timeout=10).text
        print(html)
    except requests.RequestException as e:
        # fix: a dead proxy used to raise and kill the run; log and continue
        print('proxy check failed for {0}: {1}'.format(proxy, e))
def driverHtml(url):
    """Load *url* in headless Chrome and return the rendered DOM as BeautifulSoup.

    Returns "" on any failure so callers can test for an empty result
    (this preserves the original error contract).
    """
    driver = None
    try:
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        driver = webdriver.Chrome(chrome_options=chrome_options)
        driver.get(url)
        sourcePage = driver.page_source
        return BeautifulSoup(sourcePage, "lxml")
    except Exception:
        # narrowed from a bare except; "" still signals failure to callers
        return ""
    finally:
        # fix: the original never called quit(), leaking a Chrome process
        # (and its driver) on every invocation, success or failure
        if driver is not None:
            driver.quit()
def getMoviceHref(soup):
    """Extract genre-ranking links and their ``type=N`` ids from the chart page.

    Looks inside the div with class 'types', pulls every href fragment ending
    in ``action=``, then extracts each ``type=<digits>`` query part.
    Returns the tuple (HrefList, typeNum).
    """
    types_div = soup.find('div', attrs={'class': 'types'})
    href_pattern = r'/[^\s]*.action='
    HrefList = re.findall(href_pattern, str(types_div))
    type_pattern = 'type=[0-9]+'
    typeNum = re.findall(type_pattern, str(HrefList))
    print(typeNum)
    return HrefList, typeNum
def getTypeHtml(typeNum):
    """Crawl every genre listed in *typeNum* via Douban's JSON chart API.

    For each genre id, pages 1-7 (20 movies per page) are fetched; every
    movie's detail page is downloaded and handed to getMovieInfo together
    with the (genre, page, item) indices and a running row counter.

    NOTE(review): iteration starts at index 8 of typeNum — presumably a manual
    resume point from an earlier interrupted run; confirm before reuse.
    """
    index = 0
    for i in range(8, len(typeNum)):
        use_requests()  # sanity-check proxy rotation once per genre
        for j in range(1, 8):
            offset = 20 * j
            time.sleep(5)  # throttle page requests
            typeurl = JsonStart_url + str(typeNum[i]) + JsonMid_url + str(offset) + JsonEnd_url
            print(typeurl)
            tx = getHTMLtext(typeurl)
            if tx == "":
                break
            svalues = json.loads(tx)
            for k in range(len(svalues)):
                print(svalues[k]['url'])
                MovUrl = svalues[k]['url'].replace('\/', '/')
                time.sleep(5)  # throttle detail-page requests
                htmlText = getHTMLtext(MovUrl)
                if htmlText == "":
                    continue
                soups = BeautifulSoup(htmlText, 'html.parser')
                index = index + 1
                getMovieInfo(i, j, k, index, soups)
def getHTMLtext(url, code='utf-8'):
    """GET *url* with the shared Douban headers and return its text.

    The response is decoded using *code* (default utf-8).  Returns "" on any
    network or HTTP error — callers test for "" to detect failure.
    """
    try:
        # fix: added a timeout so a stalled connection cannot hang the crawl
        r = requests.get(url, headers=Hostreferer, timeout=15)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except requests.RequestException:
        # fix: narrowed from a bare except so only request/HTTP failures
        # (connection errors, timeouts, non-2xx statuses) map to ""
        return ""
def getMovieInfo(i, j, k, index, soups):
    """Parse one movie detail page (*soups*) and append a CSV row to the
    global file handle ``fw``.

    i/j/k identify genre / page / item for progress logging; *index* is the
    running count of rows written so far.
    """
    try:
        Infos = soups.find('div', attrs={'id': 'info'})
        # fix: BeautifulSoup's find() returns None (never "") when the block
        # is missing; the original `Infos == ""` test could not fire, so such
        # pages crashed into the swallowing except instead of being skipped
        if Infos is None:
            print(" {0} {1} {2} ...".format(i + 1, j, k + 1))
            return
        print(" {0} {1} {2} ...".format(i + 1, j, k + 1))
        # director
        director = Infos.find_all(attrs={'rel': 'v:directedBy'})[0].text.split()[0]
        # title
        name = soups.find('span', attrs={'property': 'v:itemreviewed'}).text.split()[0]
        # starring actors, '/'-joined
        actors = soups.find_all(attrs={'rel': 'v:starring'})
        atr = ""
        for at in actors:
            atr = atr + at.text.split()[0] + '/'
        # genres, '/'-joined (renamed from `type` to avoid shadowing the builtin)
        genres = soups.find_all('span', attrs={'property': 'v:genre'})
        ts = ""
        for t in genres:
            ts = ts + t.text.split()[0] + '/'
        # release dates, '/'-joined (renamed from `time` to avoid shadowing the module)
        dates = soups.find_all(attrs={'property': 'v:initialReleaseDate'})
        ti = ""
        for t in dates:
            ti = ti + t.text.split()[0] + '/'
        # runtime(s), '/'-joined
        runtime = soups.find_all(attrs={'property': 'v:runtime'})
        rt = ""
        for r in runtime:
            rt = rt + r.text.split()[0] + '/'
        # rating average, vote count, and five-star percentage
        average = soups.find_all(attrs={'property': 'v:average'})[0].text.split()[0]
        rating_vote = soups.find_all(attrs={'property': 'v:votes'})[0].text.split()[0] + " "
        starts5 = soups.find_all(attrs={'class': 'rating_per'})[0].text.split()[0]
        row = [director, name, atr, ts, ti, rt, average, rating_vote, starts5]
        # fix: the original source had a literal line break inside this string
        # literal (a syntax error); write an explicit newline instead
        fw.write(",".join(row) + "\n")
        print(" {0} ".format(index))
        print("---------------------------------------------------------")
    except Exception as e:
        # fix: was a silent bare `except: pass`; keep the best-effort skip
        # behavior but say which page failed and why
        print("skip {0}-{1}-{2}: {3}".format(i + 1, j, k + 1, e))
def main():
    """Drive the crawl: render the chart page, extract genre ids, crawl them."""
    # render the JavaScript-built chart page with headless Chrome
    chart_soup = driverHtml(url)
    # pull the genre link fragments and their numeric type ids
    href_list, type_ids = getMoviceHref(chart_soup)
    # page through every genre and scrape each movie's detail page
    getTypeHtml(type_ids)
if __name__ == '__main__':
    index = 1
    # entry URL for the chart page, plus the fragments used to assemble the
    # JSON chart-API URLs (type id and start offset are inserted between them)
    url = 'https://movie.douban.com/chart'
    start_url = 'https://movie.douban.com'
    MoviceUrl = []
    JsonStart_url = 'https://movie.douban.com/j/chart/top_list?'
    JsonMid_url = '&interval_id=100%3A90&action=&start='
    JsonEnd_url = '&limit=20'
    # fix: explicit encoding so the CSV is UTF-8 regardless of locale.
    # NOTE: 'a+' appends, so this header row repeats on every run (kept from
    # the original behavior — confirm whether a first-run-only header is wanted)
    fw = open("douban.csv", 'a+', encoding='utf-8')
    row = [" ", " ", " ", " ", " ", " ", " ", " ", " "]
    # fix: the original source had a literal line break inside this string
    # literal (a syntax error); write an explicit newline instead
    fw.write(",".join(row) + "\n")
    try:
        main()
    finally:
        # fix: the file handle was never closed, risking lost buffered rows
        fw.close()