Pythonクローラーで豆瓣（Douban）Top 250の書籍情報を取得し、ファイルに保存する

2547 ワード

import requests
from bs4 import BeautifulSoup

# Module-level fetch of the first Top 250 listing page, parsed with the lxml
# backend. This runs as soon as the module is executed or imported.
# NOTE(review): in the visible file, `resp` and `soup` are referenced only by
# the commented-out find()/find_all() examples near the end, and this request
# sends no User-Agent header, unlike get_html() -- confirm it is still needed.
resp = requests.get('https://book.douban.com/top250?start=0')
soup = BeautifulSoup(resp.text, 'lxml')


# Fetch a page and return its HTML text.
def get_html(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.RequestException: On connection failures, timeouts, or an
            HTTP error status (4xx/5xx).
    """
    # Browser-like User-Agent sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    resp = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error status instead of handing an error page to the
    # parser, which would silently produce no records.
    resp.raise_for_status()

    return resp.text
# Build the URLs of all listing pages.
def all_page():
    """Return the list of Top 250 listing-page URLs.

    The ``start`` query value runs from 0 to 225 in steps of 25, so the
    result contains ten URLs in ascending order.
    """
    prefix = 'https://book.douban.com/top250?start='
    return [prefix + str(offset) for offset in range(0, 250, 25)]

# Parse every listing page and write each book's record to the output file.
def html_parse():
    """Scrape every listing page and write one text record per book.

    Each URL from ``all_page()`` is fetched with ``get_html()`` and parsed
    with BeautifulSoup. Four parallel lists are extracted per page (titles,
    author lines, ratings, one-line quotes) and combined by position into
    records, each followed by a separator line.

    Relies on the module-level name ``f`` being a text file opened for
    writing; returns nothing.
    """
    for url in all_page():
        soup = BeautifulSoup(get_html(url), 'lxml')
        # Titles: the ``title`` attribute of the first link in each div.pl2.
        names = [div.find('a')['title']
                 for div in soup.find_all('div', class_='pl2')]
        # Author lines: text of each p.pl.
        authors = [p.get_text() for p in soup.find_all('p', class_='pl')]
        # Ratings: text of each span.rating_nums.
        scores = [s.get_text()
                  for s in soup.find_all('span', class_='rating_nums')]
        # Quotes: text of each span.inq.
        quotes = [q.get_text() for q in soup.find_all('span', class_='inq')]
        # NOTE(review): the four lists are matched purely by position and
        # zip() stops at the shortest one, so a page where some entry lacks
        # one of these elements would misalign or drop records -- confirm
        # against the live markup.
        for name, author, score, quote in zip(names, authors, scores, quotes):
            record = (
                '  ：' + str(name) + '\n'
                + '  ：' + str(author) + '\n'
                + '  ：' + str(score) + '\n'
                + '  ：' + str(quote) + '\n'
            )
            # Append the record followed by a separator line.
            f.write(record + '=======================' + '\n')



# Output file name.
filename = '    Top250.txt'
# Open the output file as UTF-8 text. The with-block closes it even if the
# scrape raises part-way through. html_parse() writes through the
# module-level name `f` bound here.
with open(filename, 'w', encoding='utf-8') as f:
    html_parse()
print('    。')





# Using find_all():
# `class` is a reserved word in Python, so the keyword argument takes a trailing underscore (class_):
# alldiv = soup.find_all('div', class_='pl2')
# for a in alldiv:
#     names = a.find('a')['title']
#     print('find_all():', names)

# Using find():
# alldiv2 = soup.find('div', class_='pl2')
# names2 = alldiv2.find('a')['title']
# print('find():', names2 )

初心者のクローラー入門、初めての挑戦。

どうすれば良いプログラムが書けるようになるのか？

python機械学習sklearn一般化線形モデル