PythonでWebクローラーを実装し、静的なWebページを取得する【コード】

1407 ワード

#---------------------------------import---------------------------------------
#coding:utf-8
import urllib2;
from BeautifulSoup import BeautifulSoup;

#------------------------------------------------------------------------------
def main():
    """Download a Baidu Tieba user page and print one element from it.

    Steps performed:
      1. Fetch the hard-coded user home page URL with urllib2.
      2. Print the raw response body.
      3. Parse the body with BeautifulSoup and look up the first element
         that has the attribute target="_blank".
      4. Print that element and, when one was found, its ``.string`` value.

    Returns:
        None. All results are written to standard output.

    Raises:
        Whatever urllib2.urlopen() raises on network or HTTP failure; no
        exception is caught here.
    """
    # Page to scrape (a Tieba user home page).
    userMainUrl = "http://tieba.baidu.com/home/main?id=38b94c4ed8add8bcccabd7d31b22&fr=userbar"
    req = urllib2.Request(userMainUrl)
    resp = urllib2.urlopen(req)
    try:
        respHtml = resp.read()
    finally:
        # Release the connection even if reading the body fails.
        resp.close()
    # Single-argument print(...) keeps the output identical on Python 2
    # while avoiding the statement-only syntax.
    print("respHtml= %s" % (respHtml,))

    # Encoding hint handed to the parser.
    # NOTE(review): assumes the page is served as GBK - confirm against the
    # charset declared by the page itself.
    htmlEncoding = "GBK"
    soup = BeautifulSoup(respHtml, fromEncoding=htmlEncoding)

    # Look up an element by attribute value rather than by tag name.
    foundClassH1user = soup.find(attrs={"target": "_blank"})
    # The original passed the value as a second print argument, so the "%s"
    # placeholder was printed literally; interpolate it instead.
    print("foundClassH1user=%s" % (foundClassH1user,))
    if foundClassH1user is not None:
        h1userStr = foundClassH1user.string
        print("h1userStr= %s" % (h1userStr,))

###############################################################################
# Run the scraper only when this file is executed directly as a script.
if __name__ == "__main__":
    main()

1種類目:タグ名を指定してタグを取得する
     #eg:siteUrls=soup.findAll('a') 
2種類目:属性(target)を指定してタグを取得する
    #eg:foundClassH1user = soup.find(attrs={"target":"_blank"});
2種類目(別の例):class 属性を指定してタグを取得する
    #foundClassH1user = soup.find(attrs={"class":"h1user"});