python解析url
10283 ワード
dive into pythonの例を抜粋しました
HTMLParserとSGMLParserの2つの方法があります
1つ目:
2つ目の方法:
まず基礎クラスで、上記の方法と同じです.
次に、第2の方法の具体的な応用は、新浪の特定のblogの文章の内容とタイトルコードを以下のように解析した.
HTMLParserとSGMLParserの2つの方法があります
1つ目:
#-*-coding:utf-8-*-
import HTMLParser
#html , HTMLParser
class MyHTMLParser(HTMLParser.HTMLParser):
def _init(self):
HTMLParser.HTMLParser.__init__(self);
# -- finish processing of start+end tag: <tag.../>
def handle_startendtag(self, tag, attrs):
self.handle_starttag(tag, attrs)
self.handle_endtag(tag)
#handle start tag
# a href
def handle_starttag(self,tag, attrs):
if tag=='a':
for name,value in attrs:
if name=='href':
print value
# , </xx> -- handle end tag
def handle_endtag(self,tag):
pass;
# , &# , -- handle character reference
def handle_charref(self, name):
pass
# , & , -- handle entity reference
def handle_entityref(self, name):
pass
# , <xx>data</xx> -- handle data
def handle_data(self, data):
pass
# -- handle comment
def handle_comment(self, data):
pass
# <! , <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" -- handle declaration
def handle_decl(self, decl):
pass
# <?instruction> -- handle processing instruction
def handle_pi(self, data):
pass
# Demo: feed a tiny document; the parser prints the anchor target
# ("www.163.com") from handle_starttag.
sample_html = '<body><a href="www.163.com">test</a></body>'
print(sample_html)
demo_parser = MyHTMLParser()
demo_parser.feed(sample_html)
2つ目の方法:
まず基礎クラスで、上記の方法と同じです.
#!/usr/bin/env python
#-*-coding:utf-8-*-
from sgmllib import SGMLParser
import htmlentitydefs
class BaseHTMLProcessor(SGMLParser):
    """SGMLParser subclass that reconstructs the HTML it parses, verbatim.

    Every handler re-emits the original markup into self.pieces; output()
    joins the pieces back into one string.  Subclasses override individual
    handlers to transform the document.
    """

    def reset(self):
        # extend (called by SGMLParser.__init__)
        self.pieces = []
        SGMLParser.reset(self)

    # Called for any start tag (<html>, <body>, <pre>, <br>, <img>, ...)
    # for which no specific start_tagname/do_tagname method is defined.
    def unknown_starttag(self, tag, attrs):
        # attrs is a list of (attr, value) tuples,
        # e.g. for <pre class="screen">, tag="pre", attrs=[("class", "screen")]
        # Ideally we would like to reconstruct the original tag and attributes,
        # but we may end up quoting attribute values that weren't quoted in the
        # source document, or we may change the type of quotes around the
        # attribute value (single to double quotes).
        # Note that improperly embedded non-HTML code (like client-side
        # Javascript) may be parsed incorrectly by the ancestor, causing
        # runtime script errors.  All non-HTML code must be enclosed in HTML
        # comment tags (<!-- code -->) to ensure that it passes through this
        # parser unaltered (via handle_comment).
        strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs])
        self.pieces.append("<%(tag)s%(strattrs)s>" % locals())

    # Called for any end tag (</html>, </body>, </pre>, ...) for which no
    # specific end_tagname method is defined; e.g. for </pre>, tag is "pre".
    def unknown_endtag(self, tag):
        # Reconstruct the original end tag.
        self.pieces.append("</%(tag)s>" % locals())

    # Called for each numeric character reference, e.g. "&#160;" -> ref "160".
    def handle_charref(self, ref):
        # Reconstruct the original character reference.
        self.pieces.append("&#%(ref)s;" % locals())

    # Called for each entity reference, e.g. "&copy;" -> ref "copy".
    def handle_entityref(self, ref):
        # Reconstruct the original entity reference.
        self.pieces.append("&%(ref)s" % locals())
        # standard HTML entities are closed with a semicolon; other entities
        # are not.  FIX: membership test instead of the deprecated has_key().
        if ref in htmlentitydefs.entitydefs:
            self.pieces.append(";")

    # Called for each block of plain text, i.e. outside of any tag and not
    # containing any character or entity references.
    def handle_data(self, text):
        # Store the original text verbatim.
        self.pieces.append(text)

    # Called for each HTML comment, e.g. <!-- insert Javascript code here -->.
    def handle_comment(self, text):
        # Reconstruct the original comment.  It is especially important that
        # the source document enclose client-side code (like Javascript)
        # within comments so it can pass through this processor undisturbed;
        # see comments in unknown_starttag for details.
        self.pieces.append("<!--%(text)s-->" % locals())

    # Called for each processing instruction, e.g. <?instruction>.
    def handle_pi(self, text):
        # Reconstruct the original processing instruction.
        self.pieces.append("<?%(text)s>" % locals())

    # Called for the DOCTYPE, if present, e.g.
    # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
    #   "http://www.w3.org/TR/html4/loose.dtd">
    def handle_decl(self, text):
        # Reconstruct the original DOCTYPE.
        self.pieces.append("<!%(text)s>" % locals())

    def output(self):
        """Return processed HTML as a single string"""
        return "".join(self.pieces)
次に、第2の方法の具体的な応用は、新浪の特定のblogの文章の内容とタイトルコードを以下のように解析した.
#!/usr/bin/env python
#coding:utf8
import re
from BaseHTMLProcessor import BaseHTMLProcessor
import urllib
class Dialectizer(BaseHTMLProcessor):
    """Strips markup, keeping only text plus a few marker strings.

    <title>/</title> become the literal marker "title" on either side of the
    page title; <div> emits its attribute values (used by the caller to find
    the "articleBody" anchor) and </div> emits the marker "div"; each <p>
    becomes a newline.  Non-verbatim text is run through the
    (fromPattern, toPattern) regex pairs in `subs`.
    """

    subs = ()

    def reset(self):
        # extend (called from __init__ in the ancestor)
        # Reset all data attributes.
        self.verbatim = 0
        BaseHTMLProcessor.reset(self)

    # Any tag without a dedicated handler is dropped entirely.
    def unknown_starttag(self, tag, attrs):
        self.pieces.append("")

    def unknown_endtag(self, tag):
        self.pieces.append("")

    # Title markers: the caller locates the title between the two "title"
    # strings emitted here.
    def start_title(self, attrs):
        self.pieces.append("title")

    def end_title(self):
        self.pieces.append("title")

    # A paragraph boundary becomes a bare newline.
    def start_p(self, attrs):
        self.pieces.append("\n")

    def end_p(self):
        self.pieces.append("")

    # <div> keeps its attribute values (e.g. a class such as "articleBody");
    # </div> emits the "div" end marker the caller searches for.
    def start_div(self, attrs):
        attr_values = [value for key, value in attrs]
        self.pieces.append("".join(attr_values))

    def end_div(self):
        self.pieces.append("div")

    def handle_data(self, text):
        # In verbatim mode pass text straight through; otherwise apply the
        # configured substitutions (and/or short-circuit kept as-is).
        self.pieces.append(self.verbatim and text or self.process(text))

    def process(self, text):
        result = text
        for fromPattern, toPattern in self.subs:
            result = re.sub(fromPattern, toPattern, result)
        return result
def translate(url):
    """Download `url` and return its HTML filtered through Dialectizer.

    Returns the parser output: tags stripped or replaced by the marker
    strings Dialectizer defines.
    """
    # FIX: removed the redundant function-local `import urllib` (the module
    # is already imported at file level) and ensured the socket is closed
    # even if read() raises.
    sock = urllib.urlopen(url)
    try:
        htmlSource = sock.read()
    finally:
        sock.close()
    parser = Dialectizer()
    # parser.subs = ((r"from", r"to"),)  # optional regex substitutions
    parser.feed(htmlSource)
    parser.close()
    return parser.output()
def test(url, filename):
    """Fetch a Sina blog post, extract its title and body, write to a file.

    Extraction relies on the marker strings Dialectizer emits: the title
    sits between two literal "title" markers, and the body follows the
    "articleBody" attribute value and ends at a "div" marker.
    """
    htmlSource = translate(url)
    # Title: text between the two "title" markers; the trailing marker is
    # 5 characters long, hence end() - 5.
    title = htmlSource[re.search("title", htmlSource).end():]
    title = title[:re.search("title", title).end() - 5]
    # Body: starts 2 characters past the "articleBody" anchor, stops 3
    # characters short of the end of the "div" marker.
    content = htmlSource[re.search("articleBody", htmlSource).end() + 2:]
    content = content[:re.search("div", content).end() - 3]
    # NOTE(review): these two substitutions look like a two-step scrub of
    # "&nbsp;" leftovers (a space-like character, then the bare text
    # "nbsp;"); patterns kept byte-identical -- confirm against real input.
    content = re.sub(" ", "", content)
    content = re.sub("nbsp;", "", content)
    # FIX: dropped the unused `fileName = title` assignment; the file handle
    # is now closed even if write() raises.
    fileContent = title + "\n" + content
    fsock = open(filename, "wb")
    try:
        fsock.write(fileContent)
    finally:
        fsock.close()
if __name__ == "__main__":
    # Example run: parse one Sina blog post into test.txt.
    blog_url = "http://blog.sina.com.cn/s/blog_4bd7b9a20100cpgb.html"
    test(blog_url, 'test.txt')