Pythonで静的なWebページを取得するWebクローラーを実装する【コード】
1407 ワード
#---------------------------------import---------------------------------------
#coding:utf-8
import urllib2;
from BeautifulSoup import BeautifulSoup;
#------------------------------------------------------------------------------
def main():
    """Fetch a Baidu Tieba user page and print the first target="_blank" tag.

    Downloads the hard-coded user home page, prints the raw response body,
    hands it to BeautifulSoup with GBK as the declared source encoding, then
    prints the first tag whose ``target`` attribute equals ``_blank`` and,
    when such a tag exists, that tag's ``.string``.

    Takes no arguments and returns nothing; all results go to stdout.
    Errors raised by ``urllib2`` while fetching are not caught here.
    """
    # Every print below is a single parenthesised expression, so the line
    # emits the same text whether ``print`` is a statement or a function.
    userMainUrl = "http://tieba.baidu.com/home/main?id=38b94c4ed8add8bcccabd7d31b22&fr=userbar"

    # Download the page body.
    resp = urllib2.urlopen(userMainUrl)
    try:
        respHtml = resp.read()
    finally:
        # Release the connection even if reading the body fails.
        resp.close()
    print("respHtml= %s" % (respHtml,))

    # Parse the HTML; the source encoding is passed explicitly as GBK.
    htmlEncoding = "GBK"
    soup = BeautifulSoup(respHtml, fromEncoding=htmlEncoding)

    # find() returns the first matching tag, or None when nothing matches.
    foundClassH1user = soup.find(attrs={"target": "_blank"})
    # The value is interpolated with %, so the output no longer contains a
    # literal "%s" followed by the tag.
    print("foundClassH1user=%s" % (foundClassH1user,))

    if foundClassH1user is not None:
        h1userStr = foundClassH1user.string
        print("h1userStr= %s" % (h1userStr,))
###############################################################################
# Run the scraper only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
タグの取得方法1：タグ名を指定して取得する
#eg:siteUrls=soup.findAll('a')
タグの取得方法2：属性を指定して取得する
#eg:foundClassH1user = soup.find(attrs={"target":"_blank"});
タグの取得方法2（別の例）：class属性を指定して取得する
#foundClassH1user = soup.find(attrs={"class":"h1user"});