BeautifulSoupを使用して豆瓣（Douban）の映画情報をスクレイピングする
6473 ワード
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import chardet
import re
import xlwt
#
def getHtml(index=0):
    """Download one page of the Douban Top 250 list and return it as text.

    Args:
        index: Zero-based page number. Each page holds 25 entries, so the
            request uses ``start=index*25``.

    Returns:
        The response body decoded to ``str`` using the encoding that
        ``chardet`` detects, falling back to UTF-8 when detection fails.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status.
    """
    print(' {} '.format(index+1))
    url = 'https://movie.douban.com/top250?start='+str(index*25)+'&filter='
    # NOTE(review): Douban has been reported to refuse the default
    # python-requests User-Agent (HTTP 418); a browser-like one is sent here.
    # Confirm this is still needed and acceptable for your use.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/120.0 Safari/537.36',
    }
    # Without a timeout, requests can block forever on a stalled connection.
    r = requests.get(url, headers=headers, timeout=10)
    # Fail here with a clear HTTP error instead of parsing an error page later.
    r.raise_for_status()
    # chardet reports encoding=None when it cannot decide; decode(None) would
    # raise TypeError, so fall back to UTF-8.
    code = chardet.detect(r.content)['encoding'] or 'utf-8'
    return r.content.decode(code)
def _parseFilmItem(item, yearPattern, countPattern):
    """Extract one film's fields from a single ``<li>`` tag.

    Args:
        item: A BeautifulSoup tag for one list entry.
        yearPattern: Compiled regex whose group 1 captures a 4-digit run.
        countPattern: Compiled regex matching a run of digits.

    Returns:
        ``[title, year, score, ratingCount, quote]``, or ``None`` when the
        tag has no ``div.hd`` header and therefore is not a film entry.
        ``year``, ``ratingCount`` and ``quote`` are ``''`` when absent.
    """
    header = item.find('div', attrs={'class': 'hd'})
    if header is None:
        # Not a film entry; skip it instead of failing on None.find(...).
        return None
    filmName = header.find('span', attrs={'class': 'title'}).string

    # The greedy leading ".*" (which does not cross newlines) makes the
    # capture the last 4-digit run on the first line that contains one.
    infoText = item.find('div', attrs={'class': 'bd'}).find('p').get_text()
    yearMatch = yearPattern.search(infoText)
    filmTime = yearMatch.group(1) if yearMatch else ''

    # Inside div.star the second <span> is read as the score and the fourth
    # as the rating-count text; only the digits of the latter are kept.
    starSpans = item.find('div', attrs={'class': 'star'}).find_all('span')
    filmScore = starSpans[1].get_text()
    countMatch = countPattern.search(starSpans[3].get_text())
    discussNum = countMatch.group(0) if countMatch else ''

    # The one-line quote is optional; look it up once and default to ''.
    quoteTag = item.find('p', attrs={'class': 'quote'})
    quoteSpan = quoteTag.find('span') if quoteTag is not None else None
    filmReview = quoteSpan.get_text() if quoteSpan is not None else ''

    return [filmName, filmTime, filmScore, discussNum, filmReview]


def getData(page):
    """Scrape the first ``page`` list pages and return one row per film.

    Args:
        page: Number of pages to fetch; pages ``0 .. page-1`` are requested
            through ``getHtml``.

    Returns:
        A list of rows, each ``[title, year, score, ratingCount, quote]``,
        with every field a string. ``<li>`` elements under ``div#content``
        that lack a ``div.hd`` header are skipped.
    """
    # Compile once, as raw strings: "\d" in a normal string literal is an
    # invalid escape sequence and triggers a warning on modern Python.
    yearPattern = re.compile(r'.*(\d{4}).*')
    countPattern = re.compile(r'\d+')

    dataList = []
    for i in range(page):
        soup = BeautifulSoup(getHtml(i), 'html.parser')
        # Restrict the search to the main content container.
        parent = soup.find('div', attrs={'id': 'content'})
        for each in parent.find_all('li'):
            data = _parseFilmItem(each, yearPattern, countPattern)
            if data is not None:
                dataList.append(data)
    return dataList
# print(getData())
def saveToExcel(page,filename):
    """Scrape ``page`` list pages via ``getData`` and write them to an .xls file.

    Row 0 of the single worksheet holds the header cells; each scraped
    record follows on its own row, one field per column.

    Args:
        page: Number of list pages to scrape (passed straight to ``getData``).
        filename: Path of the workbook to save.

    NOTE(review): the header list has four entries while ``getData`` in this
    file builds five fields per record, so the fifth column has no header.
    """
    workbook = xlwt.Workbook()
    worksheet = workbook.add_sheet(' ')
    records = getData(page)
    headings = [" ", " ", " ", " "]

    # Header row.
    for col, heading in enumerate(headings):
        worksheet.write(0, col, heading)

    # Data rows start at row 1, directly below the header.
    for rowIdx, record in enumerate(records, start=1):
        for col, cell in enumerate(record):
            worksheet.write(rowIdx, col, cell)

    workbook.save(filename)
# Script entry: scrape the first 10 list pages into a spreadsheet, then
# print a completion message.
pageCount = 10
outputName = ' .xls'
saveToExcel(pageCount, outputName)
print(' ')