Pythonクローラー(requests + bs4 の基本的な使い方)

8540 ワード

コードをそのまま貼ります。すべてコメントで説明を付けています。
# -*- coding: utf-8 -*-
# author:Gary
# Step 1: import the required libraries
import requests
from bs4 import BeautifulSoup

# Step 2: fetch the page
url = 'https://gary666.com/'  # address of the page to scrape
# requests applies no timeout by default, so without one the script could
# block forever if the server never answers.
html = requests.get(url, timeout=10)  # send the GET request
# Fail here on a 4xx/5xx response instead of parsing an error page below.
html.raise_for_status()
# print(html.status_code)  # uncomment to inspect the status code (200 means success)
# Decode the body with the encoding detected from its content, so the text is
# not garbled when the response headers declare no (or a wrong) charset.
html.encoding = html.apparent_encoding
# print(html.text)  # uncomment to dump the raw page source
content = html.text

# Step 3: parse the page
# The built-in html.parser is used here; lxml can be used instead but has to be installed separately
soup = BeautifulSoup(content, "html.parser")
# 1. Access by tag name
# soup.<tag> returns only the first tag with that name
print('            ', soup.a)  # first <a> tag in the document

# 2. Attributes and text
# soup.<tag>.attrs holds all attributes of the tag as a dict
print('                    ', soup.a.attrs)  # every attribute/value pair of the first <a>
# soup.<tag>.attrs["<name>"] reads a single attribute value
print('     href        ', soup.a.attrs["href"])  # href of the first <a>
# soup.<tag>["<name>"] is the shorthand for the same lookup
print('     href           ', soup.a["href"])  # href of the first <a>, short form
# soup.<tag>.string is the text held directly by the tag
print('               ', soup.a.string)  # text directly inside the first <a>
# soup.<tag>.text is the text of the tag including its descendants
print('                           ', soup.a.text)  # all text under the first <a>

# 3. Methods
# soup.<tag>.get_text() returns the same text as soup.<tag>.text
print(soup.a.get_text())  # same as soup.a.text
# soup.find("<tag>") returns the first matching tag, like soup.<tag>
print(soup.find("a"))  # same as soup.a
# Keyword arguments filter on attributes: first <a> whose href is exactly this value
print(soup.find("a", href="/detail?dbname=life&num=35"))
# Note: to filter on the class attribute use class_, because class is a Python keyword
# soup.find_all("<tag>") returns every matching tag as a list
a_list = soup.find_all("a")  # all <a> tags
for a in a_list:  # print "text:url" for every link
    # .string is None for links without a single text child (e.g. empty or
    # image-only links); those are skipped.
    if a.string is None:
        continue
    # An <a> without an href makes a.get("href") return None, which would
    # break the concatenation with a TypeError; fall back to an empty string.
    print(a.string + ":" + a.get("href", ""))
# soup.find_all(["<tag1>", "<tag2>"]) matches any of the listed tag names and returns a list
soup.find_all(["a", "div"])  # all <a> and <div> tags; the result is discarded here (illustration only)
# soup.find_all("<tag>", limit=n) stops after the first n matches
print(soup.find_all("a", limit=2))  # first two <a> tags

#    ,