Pythonで知乎(Zhihu)の回答内容を取得する

3322 ワード

import urllib2
import re
from bs4 import BeautifulSoup


class Spider():
    """Tiny page scraper: download a URL and print author links and answers.

    The instance only stores the User-Agent string that is sent with every
    request made through ``getContentFromHost``.
    """

    def __init__(self, user_agent):
        """Store *user_agent*, the value used for the ``User-Agent`` header."""
        self.user_agent = user_agent

    def analyzeHtml(self, content):
        """Parse *content* as HTML and print the pieces of interest.

        Printed, in order:
          1. the raw *content*;
          2. every ``<a>`` found with ``class_="author-link"``, followed by
             each of its text strings;
          3. each text string of every ``<div>`` found with
             ``class_="zm-editable-content clearfix"``;
          4. the last matched answer ``<div>`` itself, if there was one.

        If *content* is ``None``, prints ``Empty`` and returns without
        parsing. Always returns ``None``.
        """
        if content is None:
            print("Empty")
            # Nothing to parse; handing None to BeautifulSoup would raise.
            return

        print(content)
        bs = BeautifulSoup(content, "html.parser")

        # find_all returns an empty result (never None) when nothing matches,
        # so the loops below simply do not run in that case.
        for author_link in bs.find_all("a", class_="author-link"):
            print(author_link)
            for author_name in author_link.strings:
                print(author_name)

        answers = bs.find_all("div", class_="zm-editable-content clearfix")
        for answer in answers:
            for answer_detail in answer.strings:
                print(answer_detail)

        # The original printed the loop variable after the loop, i.e. the last
        # answer element; guard it so a page without answers does not raise.
        if answers:
            print(answers[-1])

    def getContentFromHost(self, url):
        """Fetch *url* with the stored User-Agent and return the response body.

        Errors raised by ``urllib2.urlopen`` are not caught here and propagate
        to the caller.
        """
        header = {"User-Agent": self.user_agent}
        request = urllib2.Request(url, headers=header)
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            # Release the underlying connection even if read() fails.
            response.close()


if __name__ == "__main__":
    # Script entry point: download one hard-coded question page and print
    # what Spider.analyzeHtml extracts from it.
    user_agent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
    host = "https://www.zhihu.com/question/48554642"
    spider = Spider(user_agent)
    # Fetch first, then analyse; same order as the original nested call.
    page_content = spider.getContentFromHost(host)
    spider.analyzeHtml(page_content)