# Practical crawler: scrape qiushibaike.com jokes (via their detail pages).
# First grab the detail-page links from a listing page and join them into
# full URLs; then fetch each detail page and strip out unwanted characters.
# When the regex result is not unique, slice it to get the needed data.
# NOTE(review): everything below (down to the file header) is a corrupted
# duplicate of the tail of this script that was pasted at the top of the
# file. Executed at import time it would fail: `joke` is undefined here,
# and the duplicated entry guard calls main() before `requests`,
# `detail_pages` and `parse_page` have been imported/defined. The canonical
# definitions live further down, so this duplicate is disabled rather than
# deleted.
#
# joke = re.sub(r'| | ', '', joke[0])
# # jokes = jokes[0].replace('','')
# print(joke)
# time.sleep(2)
#
# def main():
#     base_url = 'https://www.qiushibaike.com/text/page/{}/'
#     for i in range(1, 11):
#         page_url = base_url.format(i)
#         detai_page_list = detail_pages(page_url)
#         parse_page(detai_page_list)
#         time.sleep(2)
#         print(i)
#         break
#
# if __name__ == '__main__':
#     main()
#!/usr/bin/python
# Filename: qiushibaike_spider.py
# Date    : 2020/06/15
# Author  : --king--
# (PyCharm reformat shortcut: Ctrl+Alt+L)
import requests
import re
import time
# Collect joke detail-page URLs from one listing page.
def detail_pages(url):
    """Fetch one qiushibaike listing page and return the absolute URLs of
    the joke detail pages it links to.

    :param url: URL of a listing page, e.g.
        https://www.qiushibaike.com/text/page/1/
    :return: list of absolute detail-page URLs (empty if nothing matched).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
    response = requests.get(url, headers=headers)
    # NOTE(review): the original regex literal was corrupted in this file
    # (unterminated string, so the module could not even be parsed).
    # Reconstructed from the site's markup, where joke links look like
    # <a href="/article/123456..."> — TODO confirm against a live page.
    links = re.findall(r'<a\s+href="(/article/\d+)"', response.text)
    # The matched links are site-relative; prepend the root to make them
    # absolute before handing them to parse_page().
    return ['https://www.qiushibaike.com' + link for link in links]
# Fetch each joke detail page and print the joke text.
def parse_page(urls):
    """Fetch every detail page in *urls*, extract the joke text and print it.

    Sleeps 2 seconds between requests to throttle traffic to the server.

    :param urls: iterable of absolute detail-page URLs.
    """
    # Hoisted out of the loop: the headers never change between requests.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
    for url in urls:
        response = requests.get(url, headers=headers)
        text = response.text
        # NOTE(review): both original regex literals were corrupted in this
        # file (unterminated strings). Reconstructed from the site's markup,
        # where the joke body sits inside <div class="content"> — TODO
        # confirm against a live detail page.
        jokes = re.findall(r'<div class="content">\s*(.*?)\s*</div>',
                           text, re.DOTALL)
        if not jokes:
            # Layout changed or the request was blocked; skip this page
            # instead of crashing on jokes[0] (the original would raise
            # IndexError here).
            continue
        # The extracted text can contain <br> line breaks and embedded
        # smiley <img> tags (the original comment notes the output had
        # emoticon links that needed replacing); strip them out.
        joke = re.sub(r'<br\s*/?>|<img[^>]*>', '', jokes[0])
        print(joke)
        time.sleep(2)
def main():
    """Scrape listing pages 1-10 of the text section and print each joke.

    Sleeps 2 seconds between listing pages in addition to the per-joke
    delay inside parse_page().
    """
    base_url = 'https://www.qiushibaike.com/text/page/{}/'
    for page in range(1, 11):
        page_url = base_url.format(page)
        detail_page_list = detail_pages(page_url)
        parse_page(detail_page_list)
        # Throttle between listing pages.
        time.sleep(2)
        # Progress indicator: which listing page was just finished.
        print(page)
        # NOTE(review): removed a leftover debug `break` that stopped the
        # loop after the first iteration, defeating the range(1, 11) above.
# Run the scraper only when executed as a script, not on import.
if __name__ == '__main__':
    main()