Pythonクローラー：大学ランキング

2591 ワード

#!/usr/bin/env python
# -*- coding: utf_8 -*-

import bs4
import requests
from bs4 import BeautifulSoup

'''
    Python?     【     】      Python  !
'''


# 1. Fetch the HTML text of a page
def getHTMLText(url):
    """Download *url* and return the response body as text.

    This is a best-effort helper: any error raised by ``requests``
    (connection failure, timeout, malformed URL, HTTP error status)
    results in an empty string rather than an exception.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response body, or ``""`` if the request failed.
    """
    try:
        # Give up after 30 seconds instead of hanging indefinitely
        r = requests.get(url, timeout=30)
        # Convert an HTTP error status into an exception
        r.raise_for_status()
        # Decode with the encoding detected from the body content
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request-related failures are swallowed; interrupts and
        # programming errors still propagate to the caller
        return ""


# 2. Extract the table rows of the page into a list
def fillUnivList(ulist, html):
    """Append the first four cell values of every table row to *ulist*.

    The rows are the tag children of the first ``<tbody>`` in *html*.
    Each appended entry is ``[td0, td1, td2, td3]`` holding the
    ``.string`` of the row's first four ``<td>`` cells.
    NOTE(review): presumably rank, name, province and score -- the column
    labels were lost from this file; only index 2 is known to be compared
    against the province in printUnivList.

    Args:
        ulist: List that is extended in place.
        html: Page source; an empty value leaves *ulist* untouched.

    Returns:
        None. Results are delivered through *ulist*.
    """
    # getHTMLText returns "" on failure; there is nothing to parse then
    if not html:
        return
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    # A page without a <tbody> has no ranking rows to collect
    if tbody is None:
        return
    for tr in tbody.children:
        # .children also yields text nodes between the rows; keep tags only
        if not isinstance(tr, bs4.element.Tag):
            continue
        # Calling a tag is shorthand for find_all(): every <td> in the row
        tds = tr('td')
        # Skip rows that do not carry the four expected cells
        if len(tds) < 4:
            continue
        ulist.append([td.string for td in tds[:4]])


# 3. Print the collected rows for one province
def printUnivList(ulist, province):
    """Print a title, a header row and every entry whose ``u[2]`` equals *province*.

    NOTE(review): several string literals below contain only spaces; their
    original non-ASCII text appears to have been stripped from this file.
    They are kept as found.

    Args:
        ulist: Entries as produced by fillUnivList.
        province: Value matched against index 2 of each entry.
    """
    # Title line; note that center() is applied before format() fills in
    # the province, so the final width varies with the province length
    print(" 2019({} )".center(45, '-').format(province))
    # {4} supplies the fill character for column 1: chr(12288) is the
    # full-width (ideographic) space, used so CJK names stay aligned
    tplt = "{0:^10}\t{1:{4}^10}\t{2:^10}\t{3:^10}"
    # Header row
    print(tplt.format(" ", " ", " ", " ", chr(12288)))
    # Hard-coded extra row for this particular province value
    if province == ' ':
        print(tplt.format(1, ' ', ' ', 99.9, chr(12288)))
    for u in ulist:
        # Only rows of the requested province are shown
        if u[2] == province:
            print(tplt.format(u[0], u[1], u[2], u[3], chr(12288)))


# Entry point: fetch, parse and print
def main(province=' '):
    """Fetch the 2019 ranking page and print the rows for *province*.

    Args:
        province: Province value passed through to printUnivList.
    """
    # Accumulates one entry per table row
    uinfo = []
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2019.html'
    html = getHTMLText(url)
    fillUnivList(uinfo, html)
    printUnivList(uinfo, province=province)


# Run only when executed as a script, so importing has no network side effect
if __name__ == "__main__":
    main(province=' ')

ポイントのまとめ:
  • Webページのソースを解析する
  • タグの内容を検索する
  • 正確な内容を再編成する