Pythonでよく使うクローラー（爬虫）コードまとめ（早見表）

7230 ワード

BeautifulSoupによるページ解析

'''
    Python？Python     ：973783996      ，          ，      ！
'''
import re

from bs4 import BeautifulSoup

# Build a soup with the lxml parser. `htmltxt` (the HTML text to parse) is not
# defined in this snippet; the surrounding code must provide it.
soup = BeautifulSoup(htmltxt, "lxml")

# The parsers that can be selected by name:
# - "html.parser": Python's built-in parser.
soup = BeautifulSoup("", "html.parser")
# - "lxml": needs the third-party lxml package.
soup = BeautifulSoup("", "lxml")
# - "html5lib": needs the html5lib package; it repairs incomplete markup the
#   way a browser does (see the Beautiful Soup docs on parser differences).
soup = BeautifulSoup("", "html5lib")

# Find one tag by name, class, id and an arbitrary attribute (alog-action).
soup.find("a", class_="title", id="t1", attrs={"alog-action": "qb-ask-uname"})

# Read the `content` attribute of <meta itemprop="datePublished">.
# find() returns None when nothing matches, so guard before touching .attrs.
meta = soup.find("meta", attrs={"itemprop": "datePublished"})
pubtime = meta.attrs["content"] if meta is not None else None

# Print the text of every element whose class is "title".
for i in soup.find_all(class_="title"):
    print(i.get_text())

# Same, but stop after the first two matches.
for i in soup.find_all(class_="title", limit=2):
    print(i.get_text())

# get_text(separator, strip=True) joins the text pieces with the separator and
# strips the whitespace around each piece.
# NOTE(review): the markup of this example was lost when the page was scraped;
# it is reconstructed here so that it yields the documented result below.
soup = BeautifulSoup(
    '<p class="title" id="p1"><b> The Dormouses story </b>'
    '<b>The Dormouses story</b></p>',
    "html5lib",
)
soup.find(class_="title").get_text("|", strip=True)
# Result: The Dormouses story|The Dormouses story

# Read the id attribute of the element whose class is "title".
soup.find(class_="title").get("id")

# Match class names with a regular expression (any class containing "tit").
soup.find_all(class_=re.compile("tit"))

# recursive=False makes find_all() look only at the direct children of the
# tag it is called on instead of all of its descendants.
soup = BeautifulSoup('abc', 'lxml')
soup.html.find_all("title", recursive=False)

Unicodeエスケープ（\uXXXX）を中国語の文字に変換

# Text that carries literal "\uXXXX" escape sequences (as seen in raw JSON or
# JavaScript responses). A raw string keeps the backslashes; a normal Python 3
# literal would already be decoded and the conversion below would garble it.
content = r"\u65f6\u75c7\u5b85"
# "backslashreplace" turns any real non-ASCII character into an escape first,
# so text that mixes escapes and real characters survives the round trip.
content = content.encode("ascii", "backslashreplace").decode("unicode_escape")

URLエンコードの符号化と復号

from urllib import parse
# Encode: percent-encode the text so it can be placed in a URL
x = "    "
y = parse.quote(x)
print(y)
# Decode: turn the percent-encoded text back into the original string
x = parse.unquote(y)
print(x)

htmlエスケープ文字の復号

import html
from html.parser import HTMLParser

# HTMLParser().unescape() was deprecated in Python 3.4 and removed in 3.9;
# html.unescape() is the supported replacement.
htmls = ""
txt = html.unescape(htmls)
print(txt)  # entities such as "&lt;" or "&amp;" are converted back to characters
 
  
# base64 encoding and decoding
import base64

# Encode: str -> UTF-8 bytes -> base64 bytes -> ASCII str
content = "      123"
contents_base64 = base64.b64encode(content.encode('utf-8', 'ignore')).decode("utf-8")
# Decode: b64decode() returns bytes ...
contents = base64.b64decode(contents_base64)
# ... so decode them as UTF-8 to get the original text back.
contents_text = contents.decode("utf-8")
  
# Filter emoji
import re

# Every code point outside the Basic Multilingual Plane (U+10000..U+10FFFF).
# Compiled once at import time instead of on every call.
_ASTRAL_CHARS = re.compile('[\U00010000-\U0010ffff]')


def filter_emoji(desstr, restr=''):
    """Replace every character above U+FFFF in *desstr* with *restr*.

    Most emoji live above U+FFFF, so this removes them. Two limits follow
    from the pattern: emoji inside the Basic Multilingual Plane (for example
    U+263A) are kept, and non-emoji characters above U+FFFF are removed too.

    Args:
        desstr: The text to clean.
        restr: Replacement for each matched character (default: remove it).

    Returns:
        The cleaned text.
    """
    # The original surrogate-pair fallback only mattered on Python 2 "narrow"
    # builds; on Python 3 the pattern above always compiles.
    return _ASTRAL_CHARS.sub(restr, desstr)
  
# Remove <script> and <style> tags together with their contents
import requests
from bs4 import BeautifulSoup

# `htmls` is the HTML text to clean; it must already be defined by the
# surrounding code (for example the body of a fetched page).
soup = BeautifulSoup(htmls, "lxml")
# Calling the soup object is shorthand for soup.find_all(...).
for tag in soup(["script", "style"]):
    # extract() detaches the tag from the tree.
    tag.extract()
print(soup)
  
# Strip the HTML tags but keep the text between them
import re

# NOTE(review): the tags of this sample (and the "<[^>" part of the pattern)
# were lost when the page was scraped; both are reconstructed so that the
# snippet prints the documented result.
htmls = "<div><p>abc</p></div>"
# "<", then one or more characters that are not ">", then ">": one whole tag.
dr = re.compile(r'<[^>]+>', re.S)
htmls2 = dr.sub('', htmls)
print(htmls2)  # abc
      （    json）
rollback({
 "response": {
 "code": "0",
 "msg": "Success",
 "dext": ""
 },
 "data": {
 "count": 3,
 "page": 1,
 "article_info": [{
  "title": "“   ”：                  ",
  "url": "http:\/\/sports.qq.com\/a\/20180704\/035378.htm",
  "time": "2018-07-04 16:58:36",
  "column": "NBA",
  "img": "",
  "desc": ""
 }, {
  "title": "                        ",
  "url": "http:\/\/sports.qq.com\/a\/20180704\/034698.htm",
  "time": "2018-07-04 16:34:44",
  "column": "    ",
  "img": "",
  "desc": ""
 }...]
 }
})
import re

# Pull every (title, url) pair out of a JSON-like response text.
# `htmls` holds that response text and must be defined by the surrounding code.
# (.*?) is a non-greedy capture group; the bare .*? skips whatever lies
# between the two fields. \s* accepts both compact ("title":"x") and
# pretty-printed ("title": "x") responses.
reg_str = r'"title":\s*"(.*?)",.*?"url":\s*"(.*?)"'
# DOTALL lets .*? run across line breaks between the two fields.
pattern = re.compile(reg_str, re.DOTALL)
items = pattern.findall(htmls)
for title, url in items:
    # The response escapes slashes JSON-style ("http:\/\/..."); undo that so
    # the URL can be requested as is.
    url = url.replace("\\/", "/")
  
    '''
    Python？Python     ：973783996      ，          ，      ！
'''
import datetime
import time

# Today's date
today = datetime.date.today()
print(today)   # e.g. 2018-07-05
# Current local time as a formatted string
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
print(time_now)   # e.g. 2018-07-05 14:20:55
# Unix timestamp -> formatted local time
a = 1502691655
time_a = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(a)))
print(time_a)    # 2017-08-14 14:20:55 when the local zone is UTC+8
# Formatted string -> datetime object
# (the variable used to be called `str`, which shadowed the builtin, and the
# call then referred to an undefined name `st`)
time_str = "2018-07-01 00:00:00"
parsed_time = datetime.datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S")
# Formatted string -> Unix timestamp (the string is read as local time)
time_line = "2018-07-16 10:38:50"
time_tuple = time.strptime(time_line, "%Y-%m-%d %H:%M:%S")
time_line2 = int(time.mktime(time_tuple))
# Tomorrow's date
today = datetime.date.today()
tomorrow = today + datetime.timedelta(days=1)
print(tomorrow)   # e.g. 2018-07-06
# The date and time three days ago
today = datetime.datetime.today()
three_days_ago = today + datetime.timedelta(days=-3)
print(three_days_ago)   # e.g. 2018-07-02 13:37:00.107703
# Time elapsed since a given moment
start = "2018-07-03 00:00:00"
time_now = datetime.datetime.now()
b = datetime.datetime.strptime(start, '%Y-%m-%d %H:%M:%S')
elapsed = time_now - b
# timedelta.seconds is only the part below one whole day, so the whole days
# have to be added back to get the total number of minutes.
minutes = elapsed.seconds / 60
days = elapsed.days
all_minutes = days * 24 * 60 + minutes
print(minutes)   # e.g. 821.7666666666667
print(days)   # e.g. 2
print(all_minutes)   # e.g. 3701.7666666666664
  
データベース  '''
    Python？Python     ：973783996      ，          ，      ！
'''
import pymysql

conn = pymysql.connect(host='10.0.8.81', port=3306, user='root', passwd='root', db='xxx', charset='utf8')
cur = conn.cursor()
# %s are driver placeholders: the values are passed separately and escaped by
# PyMySQL, so they must not be quoted or formatted into the SQL text.
insert_sql = "insert into tbl_name(id,name,age) values(%s,%s,%s)"
row_id = 1
name = "like"
age = 26
data_list = []
data = (row_id, name, age)
# Insert a single row
cur.execute(insert_sql, data)
conn.commit()
# Insert many rows with one call
data_list.append(data)
cur.executemany(insert_sql, data_list)
conn.commit()
# Values containing quotes or other special characters need no manual
# escaping when they are passed as parameters. Calling escape_string() on a
# bound parameter would escape it twice, and pymysql.escape_string was removed
# from the top-level module in PyMySQL 1.0.
data = (row_id, name, age)
# Update one row. `content` is the new column value and must be defined by the
# surrounding code. Both values are bound as parameters instead of being
# formatted into the statement, which closes the SQL-injection hole.
update_sql = "update tbl_name set content = %s where id = %s"
cur.execute(update_sql, (content, row_id))
conn.commit()
# Batched update: collect rows and flush them once more than 500 are pending.
# `contents`, `title`, `is_spider`, `one_new` and `update_data_list` come from
# the surrounding crawl loop and are not defined in this snippet.
update_sql = "UPDATE tbl_recieve SET content = %s ,title = %s , is_spider = %s WHERE id = %s"
update_data = (contents, title, is_spider, one_new[0])
update_data_list.append(update_data)
if len(update_data_list) > 500:
    try:
        cur.executemany(update_sql, update_data_list)
        conn.commit()
    except pymysql.MySQLError:
        # Undo the partial batch and let the caller see the failure.
        conn.rollback()
        raise
    else:
        # The batch is stored; start collecting the next one.
        update_data_list.clear()

C++のIOライブラリの概要と操作ノート

python 3-文字列操作