Scraping a classical-Chinese-poetry website: writing the crawler and fetching the page content
1. Below is the crawler code for the classical-poetry site. Take a look:
# encoding:utf-8
import requests
import re


def parse_page(url):
    # 1. Download the page (a browser User-Agent avoids trivial bot blocking).
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    text = response.text

    # 2. Pull the fields out with regular expressions.
    # NOTE: the HTML tags inside the original patterns were lost when this
    # article was published; the patterns below are illustrative placeholders
    # and must be adapted to the site's actual markup.
    titles = re.findall(r'<b>(.*?)</b>', text, re.DOTALL)
    times = re.findall(r'<p class="source"><a.*?>(.*?)</a>', text, re.DOTALL)
    authors = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    poems_ret = re.findall(r'<div class="contson".*?>(.*?)</div>', text, re.DOTALL)

    # Strip any tags left inside the poem bodies (e.g. <br/>, <p>).
    # The original pattern "<.>" only matched three-character tags like <b>;
    # "<.*?>" matches tags of any length, non-greedily.
    poems = []
    for poem in poems_ret:
        temp = re.sub(r"<.*?>", "", poem)
        poems.append(temp.strip())

    # 3. zip the four parallel lists together, one dict per poem.
    results = []
    for title, time, author, poem in zip(titles, times, authors, poems):
        result = {
            "title": title,
            "dynasty": time,
            "author": author,
            "content": poem,
        }
        print(result["content"])
        results.append(result)
    return results


def main():
    # NOTE: the original paging URL contained a "{}" page-number placeholder
    # that was lost in publishing; without it, .format(i) is a no-op and
    # every iteration fetches the same page.
    url_base = "https://www.xzslx.net/gushi/"
    for i in range(1, 11):
        url = url_base.format(i)
        print("*" * 20 + " page %d " % i + "*" * 20)
        parse_page(url)
        print("*" * 50)


if __name__ == '__main__':
    main()
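The core extraction step, findall with re.DOTALL followed by a tag-stripping re.sub, can be tried offline on an inline HTML snippet. The markup below is a hypothetical sample, not the live site's actual structure:

import re

# Hypothetical markup imitating a poem entry.
html = ('<div class="cont"><b>静夜思</b>'
        '<div class="contson">床前明月光，<br/>疑是地上霜。</div></div>')

# Non-greedy groups capture just the text between the tags.
title = re.findall(r'<b>(.*?)</b>', html, re.DOTALL)[0]
body = re.findall(r'<div class="contson">(.*?)</div>', html, re.DOTALL)[0]
body = re.sub(r'<.*?>', '', body)  # remove leftover inline tags such as <br/>

print(title)  # 静夜思
print(body)   # 床前明月光，疑是地上霜。

re.DOTALL matters because the poem body spans multiple lines in real pages, and "." does not match newlines without it.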
2. Sample run output:
C:\DDD\python22\python.exe C:/PyCharm/dytt_spider/poems.py
**************************************************
[... poem titles, dynasties, authors, and verse text printed here; the
Chinese characters were stripped when the article was published, leaving
only punctuation ...]
**************************************************
Process finished with exit code 0