あるページの画像をダウンロードします。
5967 ワード
Beautiful Soup というライブラリを使用するため、あらかじめダウンロードしてインストールしておく必要があります。ダウンロード先: http://www.crummy.com/software/BeautifulSoup/
config.py
downloadPictrues.py
config.py
# Address of the HTML page whose <img> tags are scanned for images.
url = "http://www.baidu.com"
# Local directory that receives the downloaded image files.
folder = "d:\\test"
View CodedownloadPictrues.py
import config
from bs4 import BeautifulSoup
import urlparse
from urllib2 import urlopen
from urllib import urlretrieve
import os
###########################################
# Workaround for character-encoding errors (Python 2 only): force the
# interpreter's default string encoding to UTF-8.
# NOTE(review): this is a process-wide setting, not local to this script --
# confirm nothing else relies on the stock default encoding.
import sys
# Must run before the call below; presumably this re-exposes
# sys.setdefaultencoding, which is normally unavailable after startup --
# TODO confirm against the Python 2 docs.
reload(sys)
sys.setdefaultencoding('utf8')
def main(url, out_folder):
    """Download every image referenced by the page at 'url' into out_folder.

    The page is fetched and parsed with BeautifulSoup. For each <img> tag
    the 'src' attribute is resolved against 'url' and the image is saved in
    out_folder under the last component of the resolved URL's path.

    Args:
        url: Address of the HTML page to scan for <img> tags.
        out_folder: Directory that receives the image files; it is not
            created here.

    Returns:
        None.

    Errors raised by urlopen, BeautifulSoup or urlretrieve are not caught
    and propagate to the caller.
    """
    page_file = urlopen(url)  # file-like object holding the page
    try:
        soup = BeautifulSoup(page_file)
    finally:
        # Release the connection even when parsing fails.
        page_file.close()

    for image in soup.find_all("img"):
        src = image.get("src")
        if not src:
            # An <img> without a src has nothing to download; indexing
            # image['src'] here would raise KeyError and abort the run.
            continue

        image_url = urlparse.urljoin(url, src)  # resolve relative src

        # Take the name from the URL path only, so a query string or
        # fragment ("pic.png?v=2") never ends up in the file name.
        filename = urlparse.urlparse(image_url).path.split("/")[-1]
        if not filename:
            # Path ends in "/": no usable file name, so skip it rather
            # than hand the folder itself to urlretrieve.
            continue

        outpath = os.path.join(out_folder, filename)
        print("%s -> %s" % (image_url, outpath))
        urlretrieve(image_url, outpath)  # download the picture
if __name__ == "__main__":
    # Script entry point: take the page address and the output directory
    # from config, make sure the directory exists, then download.
    url = config.url
    folder = config.folder
    if not os.path.exists(folder):
        os.makedirs(folder)
    else:
        print('ok')  # output directory was already present
    main(url, folder)