# Scraping Qiushibaike (糗事百科) with Python

# 3331 words

           

#      
import requests
import re
from bs4 import BeautifulSoup


def getHTMLText(url):
    """Download ``url`` and return the response body decoded as UTF-8.

    The request is sent with a desktop Firefox User-Agent header and a
    30 second timeout.  Any exception raised while fetching -- including
    the HTTP error raised by ``raise_for_status()`` -- propagates unchanged
    to the caller.
    """
    user_agent = ('Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) '
                  'Gecko/20100101 Firefox/52.0')
    response = requests.get(url, timeout=30, headers={'User-Agent': user_agent})
    response.raise_for_status()
    # Force UTF-8 instead of relying on the encoding requests would pick.
    response.encoding = 'utf-8'
    return response.text

def parseHTML(ilt, html):
    """Extract jokes from a listing page and append them to ``ilt``.

    Every ``div`` found with the class ``article block untagged mb15``
    contributes one ``[title, article, joy, comment]`` entry, where the
    fields come from the div's first ``h2``, its first ``span``, the
    ``span.stats-vote`` element and the ``a.qiushi_comments`` element.

    Args:
        ilt: List that receives the entries; it is mutated in place.
        html: HTML source of the page to parse.

    Returns:
        None.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for item in soup.find_all('div', attrs={'class': 'article block untagged mb15'}):
        try:
            title = item.h2.string
            article = item.span.get_text()
            joy = item.find('span', attrs={'class': 'stats-vote'}).get_text()
            comment = item.find('a', attrs={'class': 'qiushi_comments'}).get_text()
        except AttributeError:
            # A missing sub-element is returned as None, so the attribute
            # access on it fails; such incomplete entries are skipped.
            continue
        ilt.append([title, article, joy, comment])
        

def printArticle(ilt):
    """Print every joke and append its text to ``joke.txt``.

    Args:
        ilt: Sequence of ``[title, article, joy, comment]`` entries.  Each
            entry is printed in full; only the article text (index 1),
            followed by two newlines, is appended to ``joke.txt`` (UTF-8).
    """
    if not ilt:
        # Nothing to output; returning early also avoids creating an
        # empty joke.txt.
        return
    pattern = u'{0}\n{1}\n{2}{3}\n\n'
    # Open the output file once for the whole batch, not once per joke.
    with open('joke.txt', 'a', encoding='utf-8') as f:
        for every in ilt:
            print(pattern.format(every[0], every[1], every[2], every[3]))
            f.write(every[1] + '\n' * 2)


def main():
    """Scrape page 1 of the hot list, print it and save it to ``joke.txt``."""
    start_url = 'http://www.qiushibaike.com/hot/page/'
    pageNumber = 1
    url = start_url + str(pageNumber)
    html = getHTMLText(url)
    ilt = []
    parseHTML(ilt, html)
    printArticle(ilt)


class QSBK(object):
    """Object-oriented variant of the scraper for a single hot-list page.

    Attributes are set in ``__init__``: ``pageNumber``, ``start_url``,
    ``url`` (``start_url`` + page number), ``ilt`` (collected
    ``[title, article, joy, comment]`` entries) and ``headers``.
    """

    def __init__(self, pageNumber=2):
        """Prepare the scraper for ``pageNumber`` (defaults to page 2)."""
        self.pageNumber = pageNumber
        self.start_url = 'http://www.qiushibaike.com/hot/page/'
        self.url = self.start_url + str(self.pageNumber)
        self.ilt = []
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
        }

    def getHTMLText(self):
        """Return the page at ``self.url`` as text, or ``'ERROR'`` on failure.

        The request uses ``self.headers`` and a 20 second timeout; the
        body is decoded as UTF-8.
        """
        try:
            r = requests.get(self.url, timeout=20, headers=self.headers)
            r.raise_for_status()
        except requests.RequestException:
            return 'ERROR'
        r.encoding = 'utf-8'
        return r.text

    def parsePage(self):
        """Fetch the page and append its jokes to ``self.ilt``."""
        soup = BeautifulSoup(self.getHTMLText(), 'html.parser')
        for item in soup.find_all('div', attrs={'class': 'article block untagged mb15'}):
            try:
                title = item.h2.string
                article = item.span.get_text()
                joy = item.find('span', attrs={'class': 'stats-vote'}).get_text()
                comment = item.find('a', attrs={'class': 'qiushi_comments'}).get_text()
            except AttributeError:
                # A missing sub-element is returned as None; skip entries
                # that lack any of the expected parts.
                continue
            self.ilt.append([title, article, joy, comment])

    def printJoke(self):
        """Print the collected jokes and append their text to ``joke.txt``."""
        printArticle(self.ilt)


if __name__ == '__main__':
    # Procedural version: page 1.
    main()
    # Object-oriented version: page 2.  parsePage() fetches the page
    # itself, so no separate getHTMLText() call is needed.
    spider = QSBK()
    spider.parsePage()
    spider.printJoke()