Python Crawler: Scraping the Douban Reading Top 250

# -*- coding:utf-8 -*-
#  author: yukun
import requests
from bs4 import BeautifulSoup

# Fetch the HTML of a page
def get_html(url):
    # Masquerade as a browser so the site serves the normal page
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    resp = requests.get(url, headers=headers).text
    return resp
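
As written, get_html sets no timeout and never checks the HTTP status, so a blocked or failing request either hangs or hands an error page to the parser. A slightly hardened variant (a sketch, not part of the original script; the name get_html_robust is mine, while raise_for_status and apparent_encoding are standard requests features):

def get_html_robust(url, timeout=10):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    resp = requests.get(url, headers=headers, timeout=timeout)
    # Raise on 4xx/5xx instead of silently parsing an error page
    resp.raise_for_status()
    # Let requests infer the charset from the body, not just the headers
    resp.encoding = resp.apparent_encoding
    return resp.text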

# Parse every page and extract the data
def html_parse():
    # Loop over the URL of each page returned by all_page()
    for url in all_page():
        # Build a BeautifulSoup tree from the page's HTML
        soup = BeautifulSoup(get_html(url), 'lxml')
        # Book titles
        alldiv = soup.find_all('div', class_='pl2')
        names = [a.find('a')['title'] for a in alldiv]
        # Author / publisher info
        allp = soup.find_all('p', class_='pl')
        authors = [p.get_text() for p in allp]
        # Ratings
        starspan = soup.find_all('span', class_='rating_nums')
        scores = [s.get_text() for s in starspan]
        # One-line summaries
        sumspan = soup.find_all('span', class_='inq')
        sums = [i.get_text() for i in sumspan]
        for name, author, score, summary in zip(names, authors, scores, sums):
            name = 'Title: ' + str(name) + '\n'
            author = 'Author: ' + str(author) + '\n'
            score = 'Score: ' + str(score) + '\n'
            summary = 'Summary: ' + str(summary) + '\n'
            data = name + author + score + summary
            # Write one record, then a separator line
            f.writelines(data + '=======================' + '\n')

# Build the URLs of all ten pages of the Top 250 list
def all_page():
    base_url = 'https://book.douban.com/top250?start='
    urllist = []
    # start= runs from 0 to 225 in steps of 25 (25 books per page)
    for page in range(0, 250, 25):
        allurl = base_url + str(page)
        urllist.append(allurl)
    return urllist

# Output file
filename = 'douban_top250.txt'
f = open(filename, 'w', encoding='utf-8')
# Run the crawler
html_parse()
f.close()
print('Scraping complete.')
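
One caveat with the zip above: not every book on the Top 250 carries a one-line quote (the span with class inq), so sums can come out shorter than the other lists and zip will silently drop or misalign entries. Parsing each book as a unit avoids this; a minimal sketch, assuming each book sits in its own tr with class item (how the page is laid out at the time of writing):

# Per-item parsing: one lookup per book, so a missing field cannot shift the rest
def parse_items(html):
    soup = BeautifulSoup(html, 'lxml')
    books = []
    for item in soup.find_all('tr', class_='item'):
        name = item.find('div', class_='pl2').find('a')['title']
        author = item.find('p', class_='pl').get_text()
        score = item.find('span', class_='rating_nums').get_text()
        inq = item.find('span', class_='inq')  # may be absent for some books
        summary = inq.get_text() if inq else ''
        books.append((name, author, score, summary))
    return books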