requests と bs4 でネット上の画像をクロールする

4059 ワード

Webアドレス:http://www.mm131.com/qingchun/
簡単に言えば、imgタグのsrc属性、すなわち画像のアドレスを取得するだけです。

サンプルコード

**    **
1、      
2、      ，         
3、          ，

# CrawBeaGirlImage.py

import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Fetch the page at the given url and return its text
def getHtmlText(url, code):
    """Fetch ``url`` and return the response body decoded with ``code``.

    Args:
        url: Address of the page to download.
        code: Name of the text encoding used to decode the body,
            e.g. ``'utf-8'``.

    Returns:
        The decoded page text, or ``''`` when the request fails or the
        server answers with an HTTP error status.
    """
    try:
        # A timeout keeps a stalled server from hanging the crawler forever.
        r = requests.get(url, timeout=10)
        r.raise_for_status()
    except requests.RequestException:
        # Best effort: network and HTTP failures yield an empty page
        # instead of aborting the whole crawl.
        return ''
    r.encoding = code
    return r.text

# Parse the page and collect the src attribute of every img tag
def parseHtml(images, url):
    """Append the address of every ``<img>`` found on the page to ``images``.

    The page is downloaded with ``getHtmlText`` (decoded as UTF-8) and parsed
    with the ``lxml`` parser. Each ``src`` value is resolved against ``url``
    so that relative and protocol-relative addresses become fetchable
    absolute URLs; addresses that are already absolute are left unchanged.

    Args:
        images: List that receives the image URLs (mutated in place).
        url: Address of the page to scan.
    """
    html = getHtmlText(url, 'utf-8')
    soup = BeautifulSoup(html, 'lxml')
    for img in soup.find_all('img'):
        src = img.attrs.get('src')
        if not src:
            # No usable address on this tag: keep the blank placeholder
            # line in the output and move on.
            print('')
            continue
        image_url = urljoin(url, src)
        images.append(image_url)
        print(image_url)
# Download every collected image url and save it to the local disk
def storeImages(images, root='/Users/xiaolian/images/'):
    """Download every URL in ``images`` and save each one under ``root``.

    Files are named after the URL's position in the list (``0.jpg``,
    ``1.jpg``, ...), so a failed download leaves a gap in the numbering.
    A failure is reported on stdout and the remaining URLs are still tried.

    Args:
        images: Sequence of image URLs to download.
        root: Directory that receives the files; it is created, together
            with any missing parent directories, when it does not exist.
    """
    os.makedirs(root, exist_ok=True)
    for i, image_url in enumerate(images):
        path = os.path.join(root, str(i) + '.jpg')
        try:
            r = requests.get(image_url, timeout=10)
            # Do not store an HTML error page under a .jpg name.
            r.raise_for_status()
            # Binary mode is required: r.content is bytes, and text mode
            # raises "TypeError: write() argument must be str, not bytes".
            with open(path, 'wb') as f:
                f.write(r.content)
        except (requests.RequestException, OSError) as exc:
            # Report the failure before moving on to the next image.
            print('failed to save', image_url, 'to', path, '-', exc)
            continue
        print('      ')

def main():
    """Collect the image URLs of the site's front page and download them."""
    collected = []
    parseHtml(collected, 'http://www.mm131.com/')
    storeImages(collected)

# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

LeetCode 486は勝者HERODINGのLeetCodeの道を予測する

cssでテキストの両端を揃える方法