Pythonクローラーで豆瓣(Douban)Top 250の書籍情報を取得し、ファイルに保存する

2547 ワード

Pythonクローラーで豆瓣(Douban)Top 250の書籍情報を取得し、ファイルに保存する
import requests
from bs4 import BeautifulSoup

# Download and parse the listing page at start=0 as soon as the module loads.
# NOTE(review): none of the functions visible in this file read these
# module-level names (get_html and html_parse bind their own local `resp` /
# `soup`), so this request looks like leftover exploratory code -- confirm
# nothing else depends on it before removing.
resp = requests.get('https://book.douban.com/top250?start=0')
soup = BeautifulSoup(resp.text, 'lxml')


# Download a page and return its HTML text.
def get_html(url, timeout=10):
    """Fetch ``url`` and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns:
        The response body decoded to ``str`` (``Response.text``).

    Raises:
        requests.exceptions.RequestException: On connection failures,
            including ``Timeout`` when the server does not answer in time.
    """
    # Send a browser-like User-Agent rather than the default requests one.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    response = requests.get(url, headers=headers, timeout=timeout)

    return response.text
# Build the URL of every listing page.
def all_page():
    """Return the list of listing-page URLs for start offsets 0, 25, ..., 225."""
    prefix = 'https://book.douban.com/top250?start='
    # range(0, 250, 25) yields the ten offsets 0 through 225 in steps of 25.
    return [prefix + str(offset) for offset in range(0, 250, 25)]

# Parse every listing page and write one record per book to the output file.
def html_parse():
    """Scrape each listing page and write the extracted records to ``f``.

    For every URL returned by ``all_page()``, the page is downloaded with
    ``get_html()`` and parsed with BeautifulSoup. Four parallel lists are
    collected from the page and zipped into per-book records; each record is
    written followed by a separator line.

    Output goes to the module-level file object ``f``, which must already be
    open for writing when this function is called.
    """
    for url in all_page():
        soup = BeautifulSoup(get_html(url), 'lxml')
        # Names: the ``title`` attribute of the first link in each div.pl2.
        alldiv = soup.find_all('div', class_='pl2')
        names = [a.find('a')['title'] for a in alldiv]
        # Authors: the text of each p.pl element.
        allp = soup.find_all('p', class_='pl')
        authors = [p.get_text() for p in allp]
        # Scores: the text of each span.rating_nums element.
        starspan = soup.find_all('span', class_='rating_nums')
        scores = [s.get_text() for s in starspan]
        # Summaries: the text of each span.inq element.
        sumspan = soup.find_all('span', class_='inq')
        sums = [i.get_text() for i in sumspan]
        # NOTE(review): zip() stops at the shortest list and pairs items purely
        # by position. If some entry on a page lacks one of these elements
        # (presumably possible for span.inq -- verify against the live page),
        # later records will be mispaired or dropped.
        for name, author, score, summary in zip(names, authors, scores, sums):
            # NOTE(review): the label text before each ':' appears to have been
            # lost when this file was extracted; the literals are kept exactly
            # as found. Restore the intended labels if they are known.
            record = (
                '  :' + str(name) + '\n'
                + ' :' + str(author) + '\n'
                + ' :' + str(score) + '\n'
                + ' :' + str(summary) + '\n'
            )
            # Write the record followed by a separator line.
            f.write(record + '=======================' + '\n')


# Output file name.
# NOTE(review): the leading space suggests a prefix was lost in extraction.
filename = ' Top250.txt'
# Open the output file as the module-level ``f`` that html_parse() writes to.
# The with-block closes it even if scraping raises part-way through.
with open(filename, 'w', encoding='utf-8') as f:
    html_parse()
print(' 。')

# find_all() returns every matching element. Because ``class`` is a Python
# keyword, the keyword argument is spelled with a trailing underscore:
#     alldiv = soup.find_all('div', class_='pl2')
#     for a in alldiv:
#         names = a.find('a')['title']
#         print('find_all():', names)
# find() returns only the first matching element:
#     alldiv2 = soup.find('div', class_='pl2')
#     names2 = alldiv2.find('a')['title']
#     print('find():', names2)

初心者のクローラー入門、初めての試みです。