Python: scraping Qiushibaike (糗事百科)
3331 ワード
#
import requests
import re
from bs4 import BeautifulSoup
def getHTMLText(url):
    """Download ``url`` and return the response body as text.

    The request is sent with a desktop-Firefox ``User-Agent`` header and a
    30-second timeout, and the body is decoded as UTF-8.

    Args:
        url: Address of the page to download.

    Returns:
        The response body as a ``str``.

    Raises:
        Any exception raised by ``requests`` propagates unchanged, e.g. the
        ``HTTPError`` produced by ``raise_for_status()`` for an HTTP error
        status.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'}
    # No try/except: the previous handler only re-raised, so letting the
    # exception propagate directly is equivalent and easier to read.
    r = requests.get(url, timeout=30, headers=headers)
    r.raise_for_status()
    # Force UTF-8 instead of relying on the encoding guessed from the headers.
    r.encoding = 'utf-8'
    return r.text
def parseHTML(ilt, html):
    """Extract posts from ``html`` and append them to ``ilt``.

    Every ``<div>`` found by searching for the class string
    ``article block untagged mb15`` is treated as one post. For each post a
    ``[title, article, joy, comment]`` list is appended to ``ilt``:

    * ``title``   - ``.string`` of the post's first ``<h2>``
    * ``article`` - text of the post's first ``<span>``
    * ``joy``     - text of the ``<span class="stats-vote">`` element
    * ``comment`` - text of the ``<a class="qiushi_comments">`` element

    Posts that lack any of these elements are skipped.

    Args:
        ilt: List that receives the extracted posts (mutated in place).
        html: HTML source of a listing page.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for item in soup.find_all('div', attrs={'class': 'article block untagged mb15'}):
        try:
            title = item.h2.string
            article = item.span.get_text()
            joy = item.find('span', attrs={'class': 'stats-vote'}).get_text()
            comment = item.find('a', attrs={'class': 'qiushi_comments'}).get_text()
        except AttributeError:
            # A missing tag yields None, and the attribute access on None
            # fails; skip such incomplete posts. Anything else (including
            # KeyboardInterrupt) is no longer swallowed.
            continue
        ilt.append([title, article, joy, comment])
def printArticle(ilt):
    """Print every post in ``ilt`` and append its text to ``joke.txt``.

    Each entry is indexed as ``[title, article, joy, comment]``. The entry is
    printed as title, article and ``joy`` + ``comment`` on separate lines, and
    the article text followed by two newlines is appended to ``joke.txt``
    (UTF-8) in the current working directory.

    Args:
        ilt: Sequence of 4-item entries; item 1 must be a ``str``.
    """
    if not ilt:
        # Nothing to output; returning early also avoids creating an empty
        # joke.txt, which the per-item open never did either.
        return
    pattern = u'{0}\n{1}\n{2}{3}\n'
    # Open the output file once instead of reopening it for every entry.
    with open('joke.txt', 'a', encoding='utf-8') as f:
        for every in ilt:
            print(pattern.format(every[0], every[1], every[2], every[3]))
            f.write(every[1] + '\n' * 2)
def main():
    """Fetch page 1 of the "hot" listing, parse it and output the posts.

    Builds the page URL, downloads it with ``getHTMLText``, collects the posts
    with ``parseHTML`` and hands them to ``printArticle``.
    """
    base_url = 'http://www.qiushibaike.com/hot/page/'
    page = 1
    page_html = getHTMLText(base_url + str(page))
    posts = []
    parseHTML(posts, page_html)
    printArticle(posts)
# Run the function-based scraper; this call is unguarded, so it executes
# whenever the module is run or imported.
main()
#
import re
import requests
from bs4 import BeautifulSoup
class QSBK(object):
    """Scraper for a single page of the Qiushibaike "hot" listing.

    Typical use: create an instance, call ``parsePage()`` to download and
    parse the page, then ``printJoke()`` to output the collected posts.
    """

    def __init__(self):
        """Set up the target URL, request headers and the result list."""
        # Page of the listing to download.
        self.pageNumber = 2
        self.start_url = 'http://www.qiushibaike.com/hot/page/'
        # Full URL of the page; built once here from start_url + pageNumber.
        self.url = self.start_url + str(self.pageNumber)
        # Collected posts, each a [title, article, joy, comment] list.
        self.ilt = []
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'}

    def getHTMLText(self):
        """Download ``self.url`` and return its body decoded as UTF-8.

        Returns:
            The page text, or the string ``'ERROR'`` when the request fails
            or the server answers with an HTTP error status.
        """
        try:
            r = requests.get(self.url, timeout=20, headers=self.headers)
            r.raise_for_status()
            r.encoding = 'utf-8'
            return r.text
        except requests.RequestException:
            # Best-effort: callers receive a sentinel instead of an exception.
            # Only request failures are caught, so unrelated errors and
            # KeyboardInterrupt are no longer hidden.
            return 'ERROR'

    def parsePage(self):
        """Download the page and append its posts to ``self.ilt``.

        Every ``<div>`` found by searching for the class string
        ``article block untagged mb15`` is treated as one post; its title
        (first ``<h2>``), text (first ``<span>``), ``stats-vote`` text and
        ``qiushi_comments`` text are stored as one list. Posts missing any of
        these elements are skipped.
        """
        soup = BeautifulSoup(self.getHTMLText(), 'html.parser')
        for item in soup.find_all('div', attrs={'class': 'article block untagged mb15'}):
            try:
                title = item.h2.string
                article = item.span.get_text()
                joy = item.find('span', attrs={'class': 'stats-vote'}).get_text()
                comment = item.find('a', attrs={'class': 'qiushi_comments'}).get_text()
            except AttributeError:
                # A missing tag yields None and the attribute access fails;
                # skip the incomplete post.
                continue
            self.ilt.append([title, article, joy, comment])

    def printJoke(self):
        """Print every collected post and append its text to ``joke.txt``.

        Each post is printed as title, article and ``joy`` + ``comment`` on
        separate lines; the article text followed by two newlines is appended
        to ``joke.txt`` (UTF-8) in the current working directory.
        """
        if not self.ilt:
            # Nothing collected: do not create an empty joke.txt.
            return
        pattern = u'{0}\n{1}\n{2}{3}\n'
        # Open the output file once rather than once per post.
        with open('joke.txt', 'a', encoding='utf-8') as f:
            for every in self.ilt:
                print(pattern.format(every[0], every[1], every[2], every[3]))
                f.write(every[1] + '\n' * 2)
# Run the class-based scraper. parsePage() downloads the page itself through
# getHTMLText(), so a separate getHTMLText() call here would only repeat the
# HTTP request and throw the result away.
spider = QSBK()
spider.parsePage()
spider.printJoke()