Python crawler: using XPath with multithreading plus gevent coroutines to bulk-download images from the "meizitu" picture site.
(7897 words)
The code below runs under Python 3.x. Note: because of link rot, the URLs used here may be changed on the target site over time — before running, verify that the corresponding URLs are still reachable.
import gevent # ,
from lxml import etree # xml xpath ,
import os,threading # os
import urllib.request #
import time
#
headers = {
'User-Agent':'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/530.9 (KHTML, like Gecko) Chrome/ Safari/530.9 '
}
# url
baseurl = 'http://www.meizitu.com/a/more'
'''
:
, url
url
'''
# //div[@class="pic"]/a/@href URL
# //div[@id="picture"]/p/img/@alt
# //div[@id="picture"]/p/img/@src
#
def download_img(image_url_list, image_name_list, image_fenye_path):
    """Download every image in image_url_list into directory image_fenye_path.

    Args:
        image_url_list: list of image URLs to fetch.
        image_name_list: list of base file names, parallel to image_url_list
            (the URL's own extension is appended).
        image_fenye_path: target directory; created if it does not exist.

    Failures on individual images are reported and skipped (best-effort).
    """
    # Create the target directory; tolerate it already existing.
    os.makedirs(image_fenye_path, exist_ok=True)
    for image_url, image_name in zip(image_url_list, image_name_list):
        # Keep the original file extension (e.g. '.jpg').
        extension = os.path.splitext(image_url)[-1]
        file_name = image_name + extension
        save_path = os.path.join(image_fenye_path, file_name)
        print(image_url)
        try:
            # Fetch with a browser-like User-Agent; some hosts reject the
            # default urllib agent, which is why urlretrieve is not used.
            request = urllib.request.Request(url=image_url, headers=headers)
            with urllib.request.urlopen(request) as response:
                data = response.read()
            # 'with' guarantees the file is closed even if the write fails.
            with open(save_path, 'wb') as f:
                f.write(data)
            print('%s saved' % save_path)
        except Exception as e:
            # Best-effort: report and continue with the next image.
            print('%s download failed: %s' % (save_path, e))
def read_get_url(sure_url, image_fenye_path):
    """Fetch one listing page and download every gallery linked from it.

    Each gallery URL found on the page is handed to down_load() in its own
    gevent greenlet; this function blocks until the whole batch finishes.
    """
    req = urllib.request.Request(url=sure_url, headers=headers)
    # The site serves GBK-encoded pages.
    page = urllib.request.urlopen(req).read().decode('gbk')
    tree = etree.HTML(page)
    # Every gallery preview on the listing page links to its detail page.
    gallery_urls = tree.xpath('//div[@class="pic"]/a/@href')
    # One greenlet per gallery; joinall waits for all of them.
    tasks = [gevent.spawn(down_load, gallery_url, image_fenye_path)
             for gallery_url in gallery_urls]
    gevent.joinall(tasks)
# Fetch one gallery page and download all of its images.
def down_load(read_url, image_fenye_path):
    """Parse a single gallery page and download the images it lists.

    Args:
        read_url: URL of one gallery (detail) page.
        image_fenye_path: directory the images are saved into.

    A failing gallery is reported and skipped so one bad page cannot kill
    the whole crawl (this runs inside a gevent greenlet).
    """
    try:
        request = urllib.request.Request(url=read_url, headers=headers)
        response = urllib.request.urlopen(request)
        # The site serves GBK-encoded pages.
        html = response.read().decode('gbk')
        html_tree = etree.HTML(html)
        # Image captions double as file names.
        image_name_list = html_tree.xpath('//div[@id="picture"]/p/img/@alt')
        # Direct URLs of the full-size images.
        image_url_list = html_tree.xpath('//div[@id="picture"]/p/img/@src')
        download_img(image_url_list, image_name_list, image_fenye_path)
    except Exception as e:
        # Best-effort, but never swallow the failure silently.
        print('failed to process %s: %s' % (read_url, e))
# Entry point: ask for a page range and crawl it with one thread per page.
def main(baseurl):
    """Crawl listing pages start..end inclusive, one worker thread per page.

    Args:
        baseurl: listing-page URL prefix; '_<page>.html' is appended per page.

    Side effects: sets the module-level `father_path` and creates the
    download root directory '<script dir>/meizitufiles'.
    """
    start_page = int(input('start page: '))
    end_page = int(input('end page: '))
    global father_path
    # Directory containing this script; all downloads go below it.
    father_path = os.path.dirname(os.path.abspath(__file__))
    root_dir = os.path.join(father_path, 'meizitufiles')
    # Create the download root; tolerate it already existing.
    os.makedirs(root_dir, exist_ok=True)
    print('downloading ...')
    t_list = []
    for page_num in range(start_page, end_page + 1):
        # Concrete listing-page URL for this page number.
        sure_url = baseurl + '_' + str(page_num) + '.html'
        # Each listing page gets its own sub-directory.
        image_fenye_path = os.path.join(root_dir, ' %s ' % page_num)
        t = threading.Thread(target=read_get_url, args=(sure_url, image_fenye_path))
        t.start()
        t_list.append(t)
    # Wait for every page worker to finish.
    for t in t_list:
        t.join()
    print('done!')
if __name__ == '__main__':
    # Time the whole crawl and report the elapsed seconds at the end.
    t0 = time.time()
    main(baseurl)
    elapsed = time.time() - t0
    print(' :%s' % elapsed)