斗図とか表情包ノートを取りに行きます.
1268 ワード
斗図とか表情包ノートを取りに行きます.
1
:
1、 xpath で要素を取得し、要素の get("属性名") で属性値を取り出す
2、 os.path.splitext(url)[1] で url の拡張子を取得する
3、 re.sub()
4、 urllib の request.urlretrieve(url, ファイル名) で画像を保存する
5、 format
import requests
from lxml import etree
from urllib import request
import os
import re
# Browser-like User-Agent header passed to requests.get for the list pages.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/78.0.3904.70 Safari/537.36'
    ),
}
def parse_index(url):
    """Download every image listed on one index page into ``image/``.

    Fetches ``url``, selects the ``<img>`` elements inside the
    ``page-content text-center`` container and saves each one as
    ``image/<alt text><extension>``. Images without a ``data-original``
    attribute are skipped. Images that end up with the same file name
    overwrite each other.

    Args:
        url: Address of the list page to scrape.

    Raises:
        requests.RequestException: If the index page cannot be fetched
            within the 10-second timeout.
        OSError: If creating ``image/`` or downloading/writing an image
            fails.
    """
    # Local import: the file only imports ``urllib.request`` at the top.
    from urllib.parse import urlsplit

    # A timeout keeps one stalled connection from hanging the whole crawl.
    resp = requests.get(url, headers=headers, timeout=10)
    tree = etree.HTML(resp.text)
    imgs = tree.xpath('//div[@class="page-content text-center"]//img')

    # urlretrieve does not create missing directories.
    os.makedirs('image', exist_ok=True)

    for img in imgs:
        # The real image address is read from data-original; elements
        # without it have nothing to download.
        img_url = img.get('data-original')
        if not img_url:
            continue

        # Take the extension from the URL path only, so a query string or
        # fragment never ends up inside the file name.
        url_path = urlsplit(img_url).path
        suffix = os.path.splitext(url_path)[1]

        # Strip characters that are illegal in file names plus sentence
        # punctuation (ASCII and full-width) from the alt text.
        alt = re.sub(r'[\\/:*?"<>|?.。!!\r\n\t]', '', img.get('alt') or '')
        if not alt:
            # No usable alt text: fall back to the image's own base name.
            alt = os.path.splitext(os.path.basename(url_path))[0]

        filename = alt + suffix
        request.urlretrieve(img_url, os.path.join('image', filename))
def main():
    """Scrape list pages 1 through 99 by passing each page URL to parse_index."""
    page_template = 'https://www.doutula.com/photo/list/?page=%d'
    for page_number in range(1, 100):
        parse_index(page_template % page_number)
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()