あるページの画像をダウンロードします。

5967 ワード

Beautiful Soup というライブラリを使用するため、あらかじめダウンロードしてインストールしておく必要があります。ダウンロード先: http://www.crummy.com/software/BeautifulSoup/
config.py

# Address of the page whose <img> images are downloaded.
url = "http://www.baidu.com"

# Local directory that receives the downloaded image files.
folder = "d:\\test"

View Code
downloadPictrues.py

import config

from bs4 import BeautifulSoup

import urlparse

from urllib2 import urlopen

from urllib import urlretrieve

import os



###########################################

# Python 2-only workaround for implicit str/unicode conversion errors:
# reload(sys) brings back sys.setdefaultencoding (normally removed from the
# sys namespace at interpreter startup), which is then used to make UTF-8
# the process-wide default encoding.

import sys

reload(sys)

sys.setdefaultencoding('utf8')



def main(url, out_folder):
    """Download every image referenced by an <img> tag at 'url' to out_folder.

    Args:
        url: Address of the page to scan. Relative image sources are
            resolved against it.
        out_folder: Directory that receives the files. It is not created
            here. Each file is named after the last path segment of its
            image URL, so images sharing a name are written to the same path.

    <img> tags without a 'src' attribute, and sources whose URL path has no
    file name (for example one ending in '/'), are skipped instead of
    aborting the run.
    """
    pageFile = urlopen(url)                  # file-like HTTP response
    try:
        soup = BeautifulSoup(pageFile)       # parse the whole page
    finally:
        # The response is only needed while parsing; close it even when
        # parsing fails so the connection is not leaked.
        pageFile.close()

    elements = urlparse.urlparse(url)        # 6-tuple, printed as diagnostics
    print(elements)

    for image in soup.find_all("img"):
        src = image.get("src")
        if not src:
            # No usable source on this tag: nothing to download.
            continue

        print("%s %s %s" % (image, src, type(image)))

        image_url = urlparse.urljoin(url, src)   # absolute URL of the image

        # Take the name from the URL path only, so a query string or
        # fragment (e.g. "logo.png?v=1") does not end up in the file name.
        filename = urlparse.urlparse(image_url).path.split("/")[-1]
        if not filename:
            continue

        outpath = os.path.join(out_folder, filename)
        urlretrieve(image_url, outpath)          # download the picture



if __name__ == "__main__":
    # Script entry point: take the page address and the target directory
    # from config.py, make sure the directory exists, then download.
    url = config.url
    folder = config.folder

    if not os.path.exists(folder):
        os.makedirs(folder)
    else:
        print('ok')        # target directory was already present

    main(url, folder)

Tune TCP time wait interval for high-load environments（HP-UX，Linux，Solaris，Windows）

Report-Yahoo YUI