BeautifulSoupを使用して豆瓣(Douban)の映画情報をスクレイピングする

6473 ワード

# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import chardet
import re
import xlwt

# Fetch one page of the Douban Top 250 list
def getHtml(index=0):
    """Download one page of the Douban Top 250 list and return it as text.

    Args:
        index: Zero-based page number. The ``start`` query offset sent to
            the site is ``index * 25``.

    Returns:
        The response body decoded to ``str`` using the encoding guessed by
        chardet (UTF-8 when chardet cannot guess).

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx HTTP status.
    """
    print('     {}   '.format(index+1))
    url = 'https://movie.douban.com/top250?start=' + str(index * 25) + '&filter='
    # NOTE(review): a browser-like User-Agent is sent because the site is
    # reported to reject the default python-requests one -- confirm.
    headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/120.0 Safari/537.36'),
    }
    # Without an explicit timeout requests may wait indefinitely on a
    # server that stops responding.
    r = requests.get(url, headers=headers, timeout=10)
    # Fail here with a clear HTTP error instead of passing an error page
    # on to the HTML parser.
    r.raise_for_status()
    # chardet reports an encoding of None when it cannot make a guess;
    # decode(None) would raise TypeError, so fall back to UTF-8.
    code = chardet.detect(r.content)['encoding'] or 'utf-8'
    # A stray undecodable byte should not abort the whole scrape.
    return r.content.decode(code, errors='replace')

# Compiled once at import time instead of once per list item.  Raw strings
# keep ``\d`` from being read as an (invalid) string escape sequence.
_YEAR_RE = re.compile(r'\d{4}')
_COUNT_RE = re.compile(r'\d+')


def _first_match(pattern, text):
    """Return the first match of *pattern* in *text*, or '' if none."""
    found = pattern.search(text)
    return found.group(0) if found is not None else ''


def _parse_item(each):
    """Convert one ``<li>`` tag into ``[name, year, score, votes, review]``.

    Returns None when the tag has no ``div.hd`` / ``span.title`` pair, i.e.
    it is not a film entry.  Any other missing piece yields '' for that
    field rather than raising.
    """
    hd = each.find('div', attrs={'class': 'hd'})
    title = hd.find('span', attrs={'class': 'title'}) if hd is not None else None
    if title is None:
        return None
    # get_text() always returns a str, whereas .string is None as soon as
    # the span holds more than a single text node.
    filmName = title.get_text()

    # Year: the first run of four digits in the first <p> of div.bd.  The
    # previous greedy pattern picked the *last* run on the line, so entries
    # listing several release years reported the latest one.
    bd = each.find('div', attrs={'class': 'bd'})
    info = bd.find('p') if bd is not None else None
    filmTime = _first_match(_YEAR_RE, info.get_text()) if info is not None else ''

    # Score is the 2nd <span> inside div.star, the vote count the 4th.
    star = each.find('div', attrs={'class': 'star'})
    spans = star.find_all('span') if star is not None else []
    film_score = spans[1].get_text() if len(spans) > 1 else ''
    discussNum = _first_match(_COUNT_RE, spans[3].get_text()) if len(spans) > 3 else ''

    # The one-line review is optional; entries without it get ''.
    quote = each.find('p', attrs={'class': 'quote'})
    quote_span = quote.find('span') if quote is not None else None
    filmReview = quote_span.get_text() if quote_span is not None else ''

    return [filmName, filmTime, film_score, discussNum, filmReview]


def getData(page):
    """Scrape the first *page* list pages and return one record per film.

    Args:
        page: Number of pages to fetch; pages ``0 .. page-1`` are requested
            through getHtml.

    Returns:
        A list of ``[name, year, score, votes, review]`` lists of strings,
        in page order.

    Raises:
        RuntimeError: If a fetched page has no ``<div id="content">``
            container, so there is nothing to parse.
    """
    dataList = []
    for i in range(page):
        soup = BeautifulSoup(getHtml(i), 'html.parser')
        # Every entry lives under the element with id="content".
        parent = soup.find('div', attrs={'id': 'content'})
        if parent is None:
            raise RuntimeError(
                'page {} has no <div id="content"> container'.format(i + 1))
        for each in parent.find_all('li'):
            data = _parse_item(each)
            if data is not None:
                dataList.append(data)
    return dataList
# print(getData())

def saveToExcel(page,filename):
    """Scrape *page* pages via getData and write the records to an .xls file.

    Row 0 holds the column headers; every following row is one record from
    getData, written field by field in order.

    Args:
        page: Number of list pages to scrape (passed straight to getData).
        filename: Destination path handed to ``Workbook.save``.
    """
    wbk = xlwt.Workbook()
    sheet = wbk.add_sheet('      ')
    dataList = getData(page)
    # getData produces five fields per record, but only four header labels
    # were defined, leaving the review column untitled.
    # NOTE(review): the first four labels are kept verbatim; adjust the
    # wording of the added fifth label to match them.
    title_list = ["    ", "    ", "    ", "    ", "Review"]
    # Header row.
    for col, title in enumerate(title_list):
        sheet.write(0, col, title)
    # Data rows start at row 1, directly below the header.
    for row, record in enumerate(dataList, start=1):
        for col, value in enumerate(record):
            sheet.write(row, col, value)
    wbk.save(filename)

# Script entry: scrape ten pages (start offsets 0, 25, ..., 225) into a
# spreadsheet, then print a completion message.
saveToExcel(page=10, filename='       .xls')
print('  ')