requestsとBeautifulSoupライブラリで豆瓣（Douban）の各タイプの映画ランキングをクロールします


コード:
# -*-coding:utf-8-*-
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from selenium import webdriver
import re
import requests
import time
import json
import random


# HTTP request headers sent with every requests.get() call: a desktop
# Chrome User-Agent plus a Douban Referer so requests look like normal
# browser traffic.
Hostreferer = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36',
    'Referer':'https://movie.douban.com/subject/'
               }


# Candidate HTTP proxies (host:port); use_requests() picks one at random.
# NOTE(review): these are hard-coded public proxies and may be dead — verify.
ip_list = ['61.155.164.109:3128', '117.158.57.2:3128', '123.207.25.143:3128', '61.155.164.107:3128','61.155.164.111:3128','61.4.184.180:3128']
def use_requests(timeout=10):
    """Print the caller's apparent public IP as seen through a random proxy.

    Fetches http://icanhazip.com via one proxy chosen at random from the
    module-level ``ip_list`` so the operator can confirm proxy rotation works.

    Args:
        timeout: seconds to wait for the proxy/site before giving up.  The
            original call had no timeout and could hang forever on a dead
            proxy; the default keeps the call backward-compatible.
    """
    proxy = {'http': 'http://' + random.choice(ip_list)}
    html = requests.get('http://icanhazip.com', proxies=proxy, timeout=timeout).text
    print(html)
    
    
def driverHtml(url):
    """Render *url* in headless Chrome and return it as a BeautifulSoup tree.

    Douban's chart page builds its genre list with JavaScript, so a plain
    requests.get() would miss it; Selenium executes the page first.

    Returns:
        A BeautifulSoup tree (lxml parser) on success, or "" on any failure
        (callers compare the result against "").
    """
    driver = None
    try:
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        driver = webdriver.Chrome(chrome_options=chrome_options)
        driver.get(url)
        sourcePage = driver.page_source
        return BeautifulSoup(sourcePage, "lxml")
    except Exception:
        # Narrowed from a bare except; "" is the established failure value.
        return ""
    finally:
        # The original never called quit() and leaked a Chrome process per call.
        if driver is not None:
            driver.quit()
    
def getMoviceHref(soup):
    HrefInfo = soup.find('div',attrs={'class':'types'})
    tmp = r'/[^\s]*.action='
    HrefList = re.findall(tmp,str(HrefInfo))
    #print(HrefList)
    pattern = 'type=[0-9]+'
    typeNum = re.findall(pattern,str(HrefList))
    print(typeNum)
    return HrefList,typeNum


def getTypeHtml(typeNum):
    """Scrape every movie in each genre's top list and record it via getMovieInfo.

    For each genre token in *typeNum* — starting at index 8, i.e. the first
    eight genres are deliberately skipped (resume point?  TODO confirm) — it
    pages through the JSON top-list API 20 movies at a time (pages 1..7),
    fetches each movie's detail page, and hands the parsed page to
    getMovieInfo().

    Args:
        typeNum: list of 'type=N' genre tokens from getMoviceHref().
    """
    index = 0
    total = len(typeNum)  # renamed from `max`, which shadowed the builtin
    for i in range(8, total):
        use_requests()  # show the proxy-visible IP once per genre
        for j in range(1, 8):
            num = 20 * j  # paging offset for the JSON API
            time.sleep(5)  # throttle so Douban does not block us
            typeurl = JsonStart_url + str(typeNum[i]) + JsonMid_url + str(num) + JsonEnd_url
            print(typeurl)
            tx = getHTMLtext(typeurl)
            if tx == "":
                break  # listing fetch failed; give up on this genre
            svalues = json.loads(tx)
            for k in range(len(svalues)):
                print(svalues[k]['url'])
                # json.loads already unescapes \/, so this replace is a
                # harmless no-op kept for compatibility.
                MovUrl = svalues[k]['url'].replace('\/', '/')
                time.sleep(5)  # throttle between detail-page fetches
                htmlText = getHTMLtext(MovUrl)
                if htmlText == "":
                    continue  # one bad movie page should not stop the run
                soups = BeautifulSoup(htmlText, 'html.parser')
                index = index + 1
                getMovieInfo(i, j, k, index, soups)




def getHTMLtext(url, code='utf-8'):
    """GET *url* with the scraper's browser headers and return the body text.

    Args:
        url: absolute URL to fetch.
        code: encoding forced onto the response (default 'utf-8').

    Returns:
        The decoded body on success, or "" on any network/HTTP error —
        callers treat "" as a soft failure and skip the page.
    """
    try:
        # timeout added: the original could hang forever on a dead host
        r = requests.get(url, headers=Hostreferer, timeout=15)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except Exception:
        # Narrowed from a bare except; "" is the established failure value.
        return ""


def getMovieInfo(i, j, k, index, soups):
    """Extract one movie's metadata from its detail page and append it to fw.

    Writes one CSV row (director, title, actors, genres, release dates,
    runtime, average rating, vote count, 5-star percentage) to the global
    file handle ``fw``.  Any parse failure is swallowed so a single odd
    page cannot stop the crawl.

    Args:
        i, j, k: genre index, page number, item index (for log messages only).
        index:   running count of records written.
        soups:   BeautifulSoup tree of the movie detail page.
    """
    try:
        InfoList = []
        Infos = soups.find('div', attrs={'id': 'info'})
        # BUG FIX: find() returns None (not "") on a miss; the original
        # compared against "" so the miss branch was dead and the subsequent
        # crash was silently eaten by the bare except.
        if Infos is None:
            print("type {0} page {1} item {2}: no info block, skipping...".format(i + 1, j, k + 1))
        else:
            print("parsing type {0} page {1} item {2}...".format(i + 1, j, k + 1))
            # Director: first v:directedBy node
            director = Infos.find_all(attrs={'rel': 'v:directedBy'})[0].text.split()[0]
            # Title
            name = soups.find('span', attrs={'property': 'v:itemreviewed'}).text.split()[0]
            # Leading actors, '/'-joined
            actors = soups.find_all(attrs={'rel': 'v:starring'})
            atr = ""
            for at in actors:
                atr = atr + at.text.split()[0] + '/'
            # Genres, '/'-joined (renamed from `type`, which shadowed the builtin)
            genres = soups.find_all('span', attrs={'property': 'v:genre'})
            ts = ""
            for t in genres:
                ts = ts + t.text.split()[0] + '/'
            # Release dates, '/'-joined (renamed from `time`, which shadowed the module)
            dates = soups.find_all(attrs={'property': 'v:initialReleaseDate'})
            ti = ""
            for t in dates:
                ti = ti + t.text.split()[0] + '/'
            # Runtimes, '/'-joined
            runtimes = soups.find_all(attrs={'property': 'v:runtime'})
            rt = ""
            for r in runtimes:
                rt = rt + r.text.split()[0] + '/'
            # Average rating
            average = soups.find_all(attrs={'property': 'v:average'})[0].text.split()[0]
            # Vote count (trailing spaces kept: original suffix text was lost in transit)
            rating_vote = soups.find_all(attrs={'property': 'v:votes'})[0].text.split()[0] + "   "
            # Percentage of 5-star ratings (first rating_per entry)
            starts5 = soups.find_all(attrs={'class': 'rating_per'})[0].text.split()[0]

            InfoList.append([director, name, atr, ts, ti, rt, average, rating_vote, starts5])
            fw.write(",".join(InfoList[0]) + "\n")
            print("wrote record {0}".format(index))
            print("---------------------------------------------------------")
    except Exception:
        # Best-effort crawl: malformed pages are skipped (was a bare except).
        pass


def main():
    """Drive the crawl: render the chart page, extract genres, scrape them."""
    # Selenium renders the JS-built chart page into a soup.
    soup = driverHtml(url)
    # Extract the per-genre 'type=N' tokens.
    HrefList, typeNum = getMoviceHref(soup)
    # Page through each genre's JSON top list and scrape every movie.
    getTypeHtml(typeNum)


if __name__ == '__main__':
    index = 1
    # Entry page listing every movie genre.
    url = 'https://movie.douban.com/chart'
    start_url = 'https://movie.douban.com'
    MoviceUrl = []
    # Pieces of the JSON top-list API URL: start + 'type=N' + mid + offset + end.
    JsonStart_url = 'https://movie.douban.com/j/chart/top_list?'
    JsonMid_url = '&interval_id=100%3A90&action=&start='
    JsonEnd_url = '&limit=20'
    fw = open("douban.csv", 'a+')
    # CSV header (original non-ASCII header text was lost in transit;
    # restored with English column names).
    row = ["director", "name", "actors", "genres", "release_date", "runtime", "average", "votes", "stars5"]
    fw.write(",".join(row) + "\n")
    try:
        main()
    finally:
        fw.close()  # was never closed; buffered rows could be lost