requests と bs4 でネット画像をクロールする
4059 ワード
Webアドレス:http://www.mm131.com/qingchun/
簡単に言えば、imgタグのsrc属性、すなわち画像のアドレスを取得するだけです。
サンプルコード
簡単に言えば、imgタグのsrc属性、すなわち画像のアドレスを取得するだけです。
サンプルコード
**実装手順**
1、URLからWebページのHTMLテキストを取得する
2、HTMLを解析し、imgタグのsrc属性(画像のアドレス)を集める
3、画像のアドレスから画像をダウンロードし、ローカルに保存する
# CrawBeaGirlImage.py
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Fetch the HTML text of the page at a URL.
def getHtmlText(url, code, timeout=10):
    """Download a page and return its body decoded with the given encoding.

    Args:
        url: Address of the page to download.
        code: Encoding name assigned to the response before it is decoded.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the script forever.

    Returns:
        The decoded response text, or '' when the request fails or the
        server answers with an HTTP error status.
    """
    try:
        r = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx answers into an exception so error pages are not
        # returned as if they were the requested document.
        r.raise_for_status()
        r.encoding = code
        return r.text
    except requests.RequestException:
        # Only network/HTTP failures are treated as "no content";
        # KeyboardInterrupt and programming errors still propagate.
        return ''
# Parse the page and collect the address of every image on it.
def parseHtml(images, url):
    """Append the address of every <img> found on a page to ``images``.

    The page is fetched with getHtmlText (decoded as UTF-8) and parsed
    with BeautifulSoup using the lxml parser. Each ``src`` value is
    resolved against ``url`` so relative and protocol-relative addresses
    become absolute URLs that can be downloaded directly.

    Args:
        images: List that receives the image URLs (mutated in place).
        url: Address of the page to scan; also the base for relative
            ``src`` values.
    """
    html = getHtmlText(url, 'utf-8')
    soup = BeautifulSoup(html, 'lxml')
    for img in soup.find_all('img'):
        src = img.get('src')
        if not src:
            # Tag without a usable src attribute: nothing to download.
            # The placeholder output of the original is kept.
            print('')
            continue
        # urljoin leaves absolute URLs untouched and completes relative ones.
        absolute = urljoin(url, src)
        images.append(absolute)
        print(absolute)
# Download each image URL and store it on disk.
def storeImages(images, root='/Users/xiaolian/images/'):
    """Download every image URL and save it as ``<index>.jpg`` under ``root``.

    Args:
        images: Sequence of image URLs; the position of each URL in the
            sequence becomes its file name.
        root: Directory that receives the files. It is created, including
            any missing parent directories, when it does not exist yet.

    A URL that cannot be downloaded (network error, HTTP error status) or
    written to disk is skipped so one bad image does not stop the rest.
    """
    # makedirs with exist_ok avoids both the exists/mkdir race and the
    # failure of mkdir when a parent directory is missing.
    os.makedirs(root, exist_ok=True)
    for index, image_url in enumerate(images):
        path = os.path.join(root, str(index) + '.jpg')
        try:
            r = requests.get(image_url, timeout=10)
            # Do not store an HTML error page under a .jpg name.
            r.raise_for_status()
            # Binary mode is required: r.content is bytes, and text mode
            # raises "write() argument must be str, not bytes".
            with open(path, 'wb') as f:
                f.write(r.content)
        except (requests.RequestException, OSError):
            continue
        else:
            # Per-image progress output (message text kept as in the original).
            print(' ')
    # Final output once every URL has been processed.
    print(' ')
def main():
    """Scan the site's front page for images and download all of them."""
    collected = []
    parseHtml(collected, 'http://www.mm131.com/')
    storeImages(collected)


# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()