python解析url
10283 ワード
dive into pythonの例を抜粋しました
HTMLParserとSGMLParserの2つの方法があります
1つ目:
2つ目の方法:
まず基礎クラスで、上記の方法と同じです.
次に、第2の方法の具体的な応用は、新浪の特定のblogの文章の内容とタイトルコードを以下のように解析した.
HTMLParserとSGMLParserの2つの方法があります
1つ目:
#-*-coding:utf-8-*-
import HTMLParser
#html , HTMLParser
class MyHTMLParser(HTMLParser.HTMLParser):
def _init(self):
HTMLParser.HTMLParser.__init__(self);
# -- finish processing of start+end tag: <tag.../>
def handle_startendtag(self, tag, attrs):
self.handle_starttag(tag, attrs)
self.handle_endtag(tag)
#handle start tag
# a href
def handle_starttag(self,tag, attrs):
if tag=='a':
for name,value in attrs:
if name=='href':
print value
# , </xx> -- handle end tag
def handle_endtag(self,tag):
pass;
# , &# , -- handle character reference
def handle_charref(self, name):
pass
# , & , -- handle entity reference
def handle_entityref(self, name):
pass
# , <xx>data</xx> -- handle data
def handle_data(self, data):
pass
# -- handle comment
def handle_comment(self, data):
pass
# <! , <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" -- handle declaration
def handle_decl(self, decl):
pass
# <?instruction> -- handle processing instruction
def handle_pi(self, data):
pass
# Demo: feed a tiny document; the parser prints the anchor target
# ("www.163.com") from handle_starttag.
sample_html = '<body><a href="www.163.com">test</a></body>'
print(sample_html)
demo_parser = MyHTMLParser()
demo_parser.feed(sample_html)
2つ目の方法:
まず基礎クラスで、上記の方法と同じです.
#!/usr/bin/env python
#-*-coding:utf-8-*-
from sgmllib import SGMLParser
import htmlentitydefs
class BaseHTMLProcessor(SGMLParser):
    """SGMLParser subclass that reconstructs the HTML it parses, verbatim.

    Every handler re-emits the original markup into self.pieces; output()
    joins the pieces back into one string.  Subclasses override individual
    handlers to transform the document.
    """

    def reset(self):
        # extend (called by SGMLParser.__init__)
        self.pieces = []
        SGMLParser.reset(self)

    # Called for any start tag (<html>, <body>, <pre>, <br>, <img>, ...)
    # for which no specific start_tagname/do_tagname method is defined.
    def unknown_starttag(self, tag, attrs):
        # attrs is a list of (attr, value) tuples,
        # e.g. for <pre class="screen">, tag="pre", attrs=[("class", "screen")]
        # Ideally we would like to reconstruct the original tag and attributes,
        # but we may end up quoting attribute values that weren't quoted in the
        # source document, or we may change the type of quotes around the
        # attribute value (single to double quotes).
        # Note that improperly embedded non-HTML code (like client-side
        # Javascript) may be parsed incorrectly by the ancestor, causing
        # runtime script errors.  All non-HTML code must be enclosed in HTML
        # comment tags (<!-- code -->) to ensure that it passes through this
        # parser unaltered (via handle_comment).
        strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs])
        self.pieces.append("<%(tag)s%(strattrs)s>" % locals())

    # Called for any end tag (</html>, </body>, </pre>, ...) for which no
    # specific end_tagname method is defined; e.g. for </pre>, tag is "pre".
    def unknown_endtag(self, tag):
        # Reconstruct the original end tag.
        self.pieces.append("</%(tag)s>" % locals())

    # Called for each numeric character reference, e.g. "&#160;" -> ref "160".
    def handle_charref(self, ref):
        # Reconstruct the original character reference.
        self.pieces.append("&#%(ref)s;" % locals())

    # Called for each entity reference, e.g. "&copy;" -> ref "copy".
    def handle_entityref(self, ref):
        # Reconstruct the original entity reference.
        self.pieces.append("&%(ref)s" % locals())
        # standard HTML entities are closed with a semicolon; other entities
        # are not.  FIX: membership test instead of the deprecated has_key().
        if ref in htmlentitydefs.entitydefs:
            self.pieces.append(";")

    # Called for each block of plain text, i.e. outside of any tag and not
    # containing any character or entity references.
    def handle_data(self, text):
        # Store the original text verbatim.
        self.pieces.append(text)

    # Called for each HTML comment, e.g. <!-- insert Javascript code here -->.
    def handle_comment(self, text):
        # Reconstruct the original comment.  It is especially important that
        # the source document enclose client-side code (like Javascript)
        # within comments so it can pass through this processor undisturbed;
        # see comments in unknown_starttag for details.
        self.pieces.append("<!--%(text)s-->" % locals())

    # Called for each processing instruction, e.g. <?instruction>.
    def handle_pi(self, text):
        # Reconstruct the original processing instruction.
        self.pieces.append("<?%(text)s>" % locals())

    # Called for the DOCTYPE, if present, e.g.
    # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
    #   "http://www.w3.org/TR/html4/loose.dtd">
    def handle_decl(self, text):
        # Reconstruct the original DOCTYPE.
        self.pieces.append("<!%(text)s>" % locals())

    def output(self):
        """Return processed HTML as a single string"""
        return "".join(self.pieces)
次に、第2の方法の具体的な応用は、新浪の特定のblogの文章の内容とタイトルコードを以下のように解析した.
#!/usr/bin/env python
#coding:utf8
import re
from BaseHTMLProcessor import BaseHTMLProcessor
import urllib
class Dialectizer(BaseHTMLProcessor):
    """Strips markup, keeping only text plus a few marker strings.

    <title>/</title> become the literal marker "title" on either side of the
    page title; <div> emits its attribute values (used by the caller to find
    the "articleBody" anchor) and </div> emits the marker "div"; each <p>
    becomes a newline.  Non-verbatim text is run through the
    (fromPattern, toPattern) regex pairs in `subs`.
    """

    subs = ()

    def reset(self):
        # extend (called from __init__ in the ancestor)
        # Reset all data attributes.
        self.verbatim = 0
        BaseHTMLProcessor.reset(self)

    # Any tag without a dedicated handler is dropped entirely.
    def unknown_starttag(self, tag, attrs):
        self.pieces.append("")

    def unknown_endtag(self, tag):
        self.pieces.append("")

    # Title markers: the caller locates the title between the two "title"
    # strings emitted here.
    def start_title(self, attrs):
        self.pieces.append("title")

    def end_title(self):
        self.pieces.append("title")

    # A paragraph boundary becomes a bare newline.
    def start_p(self, attrs):
        self.pieces.append("\n")

    def end_p(self):
        self.pieces.append("")

    # <div> keeps its attribute values (e.g. a class such as "articleBody");
    # </div> emits the "div" end marker the caller searches for.
    def start_div(self, attrs):
        attr_values = [value for key, value in attrs]
        self.pieces.append("".join(attr_values))

    def end_div(self):
        self.pieces.append("div")

    def handle_data(self, text):
        # In verbatim mode pass text straight through; otherwise apply the
        # configured substitutions (and/or short-circuit kept as-is).
        self.pieces.append(self.verbatim and text or self.process(text))

    def process(self, text):
        result = text
        for fromPattern, toPattern in self.subs:
            result = re.sub(fromPattern, toPattern, result)
        return result
def translate(url):
    """Download `url` and return its HTML filtered through Dialectizer.

    Returns the parser output: tags stripped or replaced by the marker
    strings Dialectizer defines.
    """
    # FIX: removed the redundant function-local `import urllib` (the module
    # is already imported at file level) and ensured the socket is closed
    # even if read() raises.
    sock = urllib.urlopen(url)
    try:
        htmlSource = sock.read()
    finally:
        sock.close()
    parser = Dialectizer()
    # parser.subs = ((r"from", r"to"),)  # optional regex substitutions
    parser.feed(htmlSource)
    parser.close()
    return parser.output()
def test(url, filename):
    """Fetch a Sina blog post, extract its title and body, write to a file.

    Extraction relies on the marker strings Dialectizer emits: the title
    sits between two literal "title" markers, and the body follows the
    "articleBody" attribute value and ends at a "div" marker.
    """
    htmlSource = translate(url)
    # Title: text between the two "title" markers; the trailing marker is
    # 5 characters long, hence end() - 5.
    title = htmlSource[re.search("title", htmlSource).end():]
    title = title[:re.search("title", title).end() - 5]
    # Body: starts 2 characters past the "articleBody" anchor, stops 3
    # characters short of the end of the "div" marker.
    content = htmlSource[re.search("articleBody", htmlSource).end() + 2:]
    content = content[:re.search("div", content).end() - 3]
    # NOTE(review): these two substitutions look like a two-step scrub of
    # "&nbsp;" leftovers (a space-like character, then the bare text
    # "nbsp;"); patterns kept byte-identical -- confirm against real input.
    content = re.sub(" ", "", content)
    content = re.sub("nbsp;", "", content)
    # FIX: dropped the unused `fileName = title` assignment; the file handle
    # is now closed even if write() raises.
    fileContent = title + "\n" + content
    fsock = open(filename, "wb")
    try:
        fsock.write(fileContent)
    finally:
        fsock.close()
if __name__ == "__main__":
    # Example run: parse one Sina blog post into test.txt.
    blog_url = "http://blog.sina.com.cn/s/blog_4bd7b9a20100cpgb.html"
    test(blog_url, 'test.txt')