Python クローラー(requests + bs4 の基本的な使い方)
8540 ワード
コードをそのまま掲載します。すべての処理にコメントで説明を付けています。
# -*- coding: utf-8 -*-
# author:Gary
# Third-party libraries: requests fetches the page, BeautifulSoup parses it.
import requests
from bs4 import BeautifulSoup
# Download the page whose HTML the examples below inspect.
url = 'https://gary666.com/'
# A timeout keeps the script from blocking forever on an unresponsive server.
html = requests.get(url, timeout=10)
# Fail loudly on a 4xx/5xx reply instead of parsing an error page
# (html.status_code is 200 on a normal success).
html.raise_for_status()
# Decode the body with the charset detected from the content itself,
# which avoids mojibake when the declared charset is missing or wrong.
html.encoding = html.apparent_encoding
content = html.text  # decoded HTML source as a str

# Build the parse tree. "html.parser" ships with Python; "lxml" is an
# alternative backend that has to be installed separately.
soup = BeautifulSoup(content, "html.parser")

# 1. Tag access
# soup.<tag> returns the first matching tag in the document, or None.
print(' ', soup.a)

# The remaining single-tag examples dereference soup.a, so they only make
# sense when the page actually contains an <a> tag.
if soup.a is not None:
    # 2. Attributes and text of a tag
    # .attrs is a dict of every attribute on the tag.
    print(' ', soup.a.attrs)
    # Subscripting (via .attrs or the tag itself) raises KeyError when the
    # attribute is absent; use .get() when it may be missing.
    print(' href ', soup.a.attrs["href"])
    print(' href ', soup.a["href"])
    # .string is the tag's single child string, or None when the tag has
    # more than one child.
    print(' ', soup.a.string)
    # .text concatenates all text found inside the tag.
    print(' ', soup.a.text)

    # 3. Common methods
    # get_text() is the method form of .text.
    print(soup.a.get_text())

# find("<tag>") returns the first match, the same as soup.<tag>.
print(soup.find("a"))
# Keyword arguments filter by attribute value.
print(soup.find("a", href="/detail?dbname=life&num=35"))
# Note: "class" is a reserved word in Python, so filter on it with class_=...

# find_all("<tag>") returns every match as a list.
a_list = soup.find_all("a")
for a in a_list:
    # Anchors without a single text child have nothing to label the URL with.
    if a.string is None:
        continue
    # The "" default keeps an anchor that has no href from raising TypeError
    # in the string concatenation.
    print(a.string + ":" + a.get("href", ""))

# Passing a list of names matches any of them; the result is not used here,
# the call only demonstrates the syntax.
soup.find_all(["a", "div"])
# limit=n stops the search after the first n matches.
print(soup.find_all("a", limit=2))
# ,