Python crawler: using XPath with multiple threads and gevent coroutines to scrape and bulk-download images from meizitu


Let's go straight to the code. Note that it runs under Python 3.x. Also note: because sites change over time, the URLs below may later be altered by the meizitu site, so verify that the corresponding URLs are still reachable before running.
import gevent  # coroutine library: one greenlet per detail page
from gevent import monkey
monkey.patch_all(thread=False)  # patch blocking I/O so greenlets actually yield; thread=False keeps the real threads used below
from lxml import etree  # parse the HTML and query it with XPath
import os, threading  # os for directories and paths, threading for the per-page threads
import urllib.request  # send HTTP requests and read the responses
import time

# Request headers: fake a browser User-Agent so the site does not reject us
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/530.9 (KHTML, like Gecko) Chrome/ Safari/530.9'
}
# Base URL of the paginated list pages
baseurl = 'http://www.meizitu.com/a/more'
'''
Approach:
    Each paginated list page links to a number of detail pages; first collect those detail-page URLs,
    then open each detail-page URL and extract the image names and image addresses from it.
'''
# //div[@class="pic"]/a/@href      extracts the detail-page URLs from a list page
# //div[@id="picture"]/p/img/@alt  extracts the image names (titles)
# //div[@id="picture"]/p/img/@src  extracts the image addresses
# Download all images of one detail page
def download_img(image_url_list, image_name_list, image_fenye_path):
    try:
        # Create the per-page directory (ignore the error if it already exists)
        os.mkdir(image_fenye_path)
    except Exception:
        pass
    for i in range(len(image_url_list)):
        # File extension of the image
        houzhui = (os.path.splitext(image_url_list[i]))[-1]
        # File name: image title plus extension
        file_name = image_name_list[i] + houzhui
        # Full path to save the image to
        save_path = os.path.join(image_fenye_path, file_name)
        # Show which image is being fetched
        print(image_url_list[i])
        try:
            # urlretrieve would also work, but it cannot take headers directly
            # urllib.request.urlretrieve(image_url_list[i], save_path)
            newrequest = urllib.request.Request(url=image_url_list[i], headers=headers)
            newresponse = urllib.request.urlopen(newrequest)
            data = newresponse.read()
            with open(save_path, 'wb') as f:  # 'with' closes the file automatically
                f.write(data)
            print('%s downloaded successfully' % save_path)
        except Exception:
            print('%s xxxxxxxx download failed' % save_path)
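
# The commented-out urlretrieve alternative above cannot pass headers directly;
# a standard-library workaround (a sketch, not part of the crawler's flow) is
# to install a global opener carrying the User-Agent, then call urlretrieve:
def _urlretrieve_with_headers(url, save_path):
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-Agent', headers['User-Agent'])]
    urllib.request.install_opener(opener)  # affects all subsequent urllib.request calls
    urllib.request.urlretrieve(url, save_path)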

# Read one list page and collect its detail-page URLs
def read_get_url(sure_url, image_fenye_path):
    request = urllib.request.Request(url=sure_url, headers=headers)
    response = urllib.request.urlopen(request)
    html = response.read().decode('gbk')
    html_tree = etree.HTML(html)
    need_url_list = html_tree.xpath('//div[@class="pic"]/a/@href')
    # print(need_url_list)
    # Spawn one coroutine (greenlet) per detail-page URL
    xiecheng = []
    for down_url in need_url_list:
        # Create a greenlet for this detail page
        xiecheng.append(gevent.spawn(down_load, down_url, image_fenye_path))
    # Wait for all greenlets to finish
    gevent.joinall(xiecheng)
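
# The spawn/joinall pattern used above, shown in isolation: gevent.spawn
# schedules one greenlet per task and joinall blocks until all of them finish.
# Without the monkey patching done at the top, blocking urllib calls would
# never yield and the greenlets would effectively run one after another.
# Illustration only, never called by the crawler:
def _gevent_demo():
    def task(n):
        gevent.sleep(0.1)  # a cooperative yield point, like patched network I/O
        print('task %d done' % n)
    gevent.joinall([gevent.spawn(task, i) for i in range(3)])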

# Parse one detail page and download its images
def down_load(read_url, image_fenye_path):
    # print(read_url, image_fenye_path)
    try:
        request = urllib.request.Request(url=read_url, headers=headers)
        response = urllib.request.urlopen(request)
        html = response.read().decode('gbk')
        html_tree = etree.HTML(html)
        # Image names (titles)
        image_name_list = html_tree.xpath('//div[@id="picture"]/p/img/@alt')
        # Image URLs
        image_url_list = html_tree.xpath('//div[@id="picture"]/p/img/@src')
        # print(image_url_list, image_name_list)
        download_img(image_url_list, image_name_list, image_fenye_path)
    except Exception:
        pass

# Entry point: fan out one thread per list page
def main(baseurl):
    start_page = int(input('Enter the start page number: '))
    end_page = int(input('Enter the end page number: '))
    # Create the top-level download directory
    try:
        global father_path
        # Absolute path of the directory this script lives in
        father_path = os.path.dirname(os.path.abspath(__file__))
        # Directory that will hold all downloaded images
        mkdir_name = os.path.join(father_path, 'meizitufiles')
        os.mkdir(mkdir_name)
    except Exception as e:
        print(e)
    print('Downloading...')
    t_list = []
    # One thread per list page in the requested range
    for page_num in range(start_page, end_page + 1):
        # Build the list-page URL, e.g. http://www.meizitu.com/a/more_1.html
        sure_url = baseurl + '_' + str(page_num) + '.html'
        # Per-page save directory, e.g. meizitufiles/page_1
        image_fenye_path = os.path.join(father_path, 'meizitufiles', 'page_%s' % page_num)
        t = threading.Thread(target=read_get_url, args=(sure_url, image_fenye_path))
        t.start()
        t_list.append(t)
    # Wait for every page thread to finish
    for j in t_list:
        j.join()
    print('Download finished!')

if __name__ == '__main__':
    start_time = time.time()
    main(baseurl)
    print('Total time taken: %s' % (time.time() - start_time))
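
To try it out, run the script under Python 3.x and enter the start and end page numbers at the prompts. Each list page gets its own thread, each detail page its own greenlet, and the images land under meizitufiles/page_N/ next to the script.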