Pythonで「妹子図」(mmjpg.com)の画像をクロールし、Alibaba Cloud OSSを利用してWordPressにアップロードする

23261 ワード

#!/usr/bin/env python
# coding=utf-8

import os
import time
import threading, datetime, hashlib
import oss2
import phpserialize
from multiprocessing import Pool, cpu_count
import requests
import pymysql
from bs4 import BeautifulSoup

# Timestamp captured once at import time; it is reused for the upload
# path (uploads/<year>/<month>/...) and for the post dates written to MySQL.
now = datetime.datetime.now()

# HTTP headers sent with every requests.get() call in this script.
HEADERS = {
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Referer': "http://www.mmjpg.com"
}

DIR_PATH = r"/var/www/python/mmjpg"  # root directory under which one folder per gallery is created

# An Alibaba Cloud root-account AccessKey can call every API, which is risky;
# prefer a RAM sub-account for API access (see https://ram.console.aliyun.com).
# NOTE(review): 'ak' / 'sk' look like placeholders for the real key pair - confirm.
auth = oss2.Auth('ak', 'sk')
# The endpoint has to match the region of the bucket; adjust it for other regions.
bucket = oss2.Bucket(auth, 'http://oss-cn-shenzhen-internal.aliyuncs.com', 'bucket')


def save_pic(pic_src, pic_cnt, folder_name):
    """Download one image, store it locally and upload it to the OSS bucket.

    Args:
        pic_src: URL of the image to download.
        pic_cnt: Zero-based position of the image inside its gallery; the
            file is named ``pic_cnt_<pic_cnt + 1>.jpg``.
        folder_name: Name of the gallery folder below ``DIR_PATH``. It is
            also used, without a separator, as the prefix of the object name
            in the bucket.

    Returns:
        The public OSS URL of the uploaded image, or ``None`` when the
        download, the local write or the upload failed (the error is
        printed, matching the best-effort style of the script).
    """
    img_name = "pic_cnt_{}.jpg".format(pic_cnt + 1)
    # Use an absolute path so the function no longer depends on the current
    # working directory having been changed by make_dir().
    local_path = os.path.join(DIR_PATH, folder_name, img_name)
    object_key = ('uploads/' + now.strftime('%Y') + '/' + now.strftime('%m')
                  + '/' + folder_name + img_name)
    try:
        img = requests.get(pic_src, headers=HEADERS, timeout=10)
        # Do not store and publish an HTML error page as a JPEG.
        img.raise_for_status()
        # 'wb' instead of 'ab': appending to a file left over from an earlier
        # run would produce a corrupt image.
        with open(local_path, 'wb') as f:
            f.write(img.content)
        bucket.put_object_from_file(object_key, local_path)
        print(img_name)
        # Public address of the uploaded object.
        return 'http://123.oss-cn-shenzhen.aliyuncs.com/' + object_key
    except Exception as e:
        print(e)
        return None


def make_dir(folder_name):
    """Create the download folder for one gallery and change into it.

    Args:
        folder_name: Folder name relative to ``DIR_PATH``.

    Returns:
        ``True`` when the folder was created by this call (the process's
        working directory is then switched to it), ``False`` when the path
        already existed, which callers treat as "gallery already handled".
    """
    path = os.path.join(DIR_PATH, folder_name)
    try:
        # Create-and-detect in one step: checking os.path.exists() first
        # leaves a window in which another worker can create the same folder.
        os.makedirs(path)
    except FileExistsError:
        print("Folder has existed!")
        return False
    print(path)
    os.chdir(path)
    return True


def delete_empty_dir(save_dir):
    """Recursively remove empty directories below (and including) save_dir.

    Sub-directories are visited first, so a directory that only contained
    empty directories is removed as well. Directories holding at least one
    file are left untouched.

    Args:
        save_dir: Path to clean up. A missing path only prints a hint; a
            path that is a regular file is ignored.
    """
    if not os.path.exists(save_dir):
        print("Please start your performance!")  # nothing has been downloaded yet
        return
    if not os.path.isdir(save_dir):
        # os.listdir() on a regular file raises NotADirectoryError.
        return
    for entry in os.listdir(save_dir):
        path = os.path.join(save_dir, entry)
        if os.path.isdir(path):
            delete_empty_dir(path)  # depth-first: clean children before the parent
    if not os.listdir(save_dir):
        os.rmdir(save_dir)
        print("remove the empty dir: {}".format(save_dir))


# Mutex taken by urls_crawler() around folder creation and downloading.
# NOTE(review): a threading.Lock only synchronizes threads of one process;
# the multiprocessing.Pool workers each hold their own copy - confirm whether
# cross-process locking was intended.
lock = threading.Lock()


# Column list shared by the post row and the attachment rows in wdposts.
_POST_COLUMNS = (
    'post_author', 'post_date', 'post_date_gmt', 'post_content',
    'post_title', 'post_excerpt', 'post_status', 'comment_status',
    'ping_status', 'post_password', 'post_name', 'to_ping', 'pinged',
    'post_modified', 'post_modified_gmt', 'post_content_filtered',
    'post_parent', 'guid', 'menu_order', 'post_type', 'post_mime_type',
    'comment_count',
)
_INSERT_POST_SQL = 'INSERT INTO `wordpress`.`wdposts` ({}) VALUES ({})'.format(
    ', '.join('`{}`'.format(column) for column in _POST_COLUMNS),
    ', '.join(['%s'] * len(_POST_COLUMNS)),
)
_INSERT_META_SQL = ('insert into wdpostmeta(post_id,meta_key,meta_value) '
                    'values (%s,%s,%s)')

# Page-builder markup wrapped around the gallery title in post_content.
_POST_CONTENT_HEAD = '[vc_row][vc_column][vc_column_text] \n\n'
_POST_CONTENT_TAIL = (
    '\n\n[/vc_column_text][/vc_column][/vc_row]'
    '[vc_row full_width="stretch_row_content" '
    'css=".vc_custom_1444807993418{padding-right: 35px !important;'
    'padding-left: 35px !important;}"]'
    '[vc_column][ultimate_spacer height="60"][/vc_column][/vc_row]'
    '[vc_row full_width="stretch_row_content" '
    'css=".vc_custom_1444987361109{padding-right: 35px !important;'
    'padding-left: 35px !important;}"]'
    '[vc_column][royal_portfolio portfolio_display_filters="yes" '
    'portfolio_display_title="yes" portfolio_display_testimonial="yes" '
    'portfolio_posts_number="15" portfolio_columns_rate="+1" '
    'portfolio_gutter_horz="17" portfolio_gutter_vert="17" '
    'portfolio_stretch_container="yes"][/vc_column][/vc_row]'
)
_SHORTCODES_CUSTOM_CSS = (
    '.vc_custom_1444807993418{padding-right: 35px !important;'
    'padding-left: 35px !important;}'
    '.vc_custom_1444987361109{padding-right: 35px !important;'
    'padding-left: 35px !important;}'
)

# term_taxonomy_id values attached to every post.
# NOTE(review): hard-coded in the original script, presumably the site's
# fixed categories - verify against the target database.
_DEFAULT_TERM_TAXONOMY_IDS = (43, 30, 227941)

# Size variants listed in each attachment's metadata: (name, width, height).
_ATTACHMENT_SIZES = (
    ('thumbnail', 150, 150),
    ('medium', 200, 300),
    ('medium_large', 768, 1152),
    ('large', 683, 1024),
    ('royal-similar-items', 350, 350),
    ('royal-search-results', 150, 150),
    ('royal-blog-post', 750, 450),
    ('royal-portfolio-post', 500, 340),
    ('post-thumbnail', 800, 450),
    ('detail', 150, 150),
)


def _fetch_html(page_url):
    """Download page_url and return its markup decoded as UTF-8."""
    response = requests.get(page_url, headers=HEADERS, timeout=10)
    # The original repaired titles with .encode('ISO-8859-1').decode('utf-8'),
    # i.e. the pages are UTF-8 that requests decoded as ISO-8859-1. Forcing
    # the encoding yields the same text without the round trip.
    response.encoding = 'utf-8'
    return response.text


def _collect_image_urls(url, soup):
    """Return the image URL of every page of the gallery at url."""
    # The second-to-last pager link carries the number of the last page.
    max_count = int(
        soup.find('div', class_='page').find_all('a')[-2].get_text())
    img_urls = []
    for page_no in range(1, max_count + 1):
        page = BeautifulSoup(
            _fetch_html(url + "/" + str(page_no)), 'lxml')
        content = page.find('div', class_='content')
        if page_no < max_count:
            # Every page but the last wraps the image in a link.
            img_urls.append(content.find('a').img['src'])
        else:
            img_urls.append(content.find('img')['src'])
    return img_urls


def _attachment_metadata(img_file_name, img_name):
    """Build the dict serialized into an image's _wp_attachment_metadata."""
    sizes = {}
    for size_name, width, height in _ATTACHMENT_SIZES:
        sizes[size_name] = {
            "file": "{}-{}x{}.jpg".format(img_file_name, width, height),
            "width": str(width),
            "height": str(height),
            "mime-type": "image/jpeg",
        }
    return {
        "width": "800",
        "height": "1200",
        "hwstring_small": "height='96' width='64'",
        "file": now.strftime('%Y') + '/' + now.strftime('%m') + '/' + img_name,
        "sizes": sizes,
        "image_meta": {
            "aperture": "0",
            "credit": "",
            "camera": "",
            "caption": "",
            "created_timestamp": "0",
            "copyright": "",
            "focal_length": "0",
            "iso": "0",
            "shutter_speed": "0",
            "title": "",
            "orientation": "0",
            "keywords": {},
        },
    }


def _post_meta_rows(post_id, folder_name, path_name, img_post_ids, oss_urls):
    """Return the (post_id, meta_key, meta_value) rows of the gallery post."""
    # The original indexed [1] and [-2] unconditionally, which raises
    # IndexError for a gallery with a single image; fall back to that image.
    second = img_post_ids[1] if len(img_post_ids) > 1 else img_post_ids[0]
    second_last = (img_post_ids[-2] if len(img_post_ids) > 1
                   else img_post_ids[0])
    project_url = 'http://meizg.louislivi.com/?post=' + path_name
    pairs = [
        ('_thumbnail_id', str(img_post_ids[-1])),
        ('_vc_post_settings', 'a:1:{s:10:"vc_grid_id";a:0:{}}'),
        ('slide_template', 'default'),
        ('rf_metro_post_width', '1x'),
        ('rf_exc_featured_img', str(second)),
        ('rf_audio_type', 'embed'),
        ('rf_audio_embed', ''),
        ('rf_audio_self_mp3', ''),
        ('rf_audio_self_ogg', ''),
        ('rf_video_type', 'embed'),
        ('rf_video_embed', ''),
        ('rf_video_self_mp4', ''),
        ('rf_video_self_ogv', ''),
        ('rf_gallery_type', 'stacked'),
        ('rf_gallery_img_ids', ",".join(str(i) for i in img_post_ids)),
        ('rf_gallery_imgs_src', ",".join(oss_urls)),
        ('rf_back_link', str(int(post_id) - 1)),
        ('rf_project_desc_title', folder_name),
        ('rf_project_description', folder_name),
        ('rf_project_details_title', ''),
        ('rf_project_client', 'meizg.com'),
        ('rf_project_url', project_url),
        ('rf_testimonial_author', 'meizg.com'),
        ('rf_testimonial_content', ''),
        ('rf_revslider_shortcode', ''),
        ('rf_revslider_select', 'none'),
        ('rf_project_info_sticky', 'no'),
        ('second_featured_img_id', str(second_last)),
        ('_wpb_vc_js_status', 'true'),
        ('_wpb_shortcodes_custom_css', _SHORTCODES_CUSTOM_CSS),
        ('rf_enable_project_info', 'yes'),
        ('_edit_lock', '1535778316:1'),
        ('_edit_last', '1'),
        ('rf_project_info_offset', '0'),
        ('rf_project_ext_url', ''),
        ('_wp_trash_meta_status', 'publish'),
        ('_wp_trash_meta_time', '1535773952'),
        ('_wp_desired_post_slug', 'gallery-slideshow-3'),
    ]
    return [(post_id, key, value) for key, value in pairs]


def _publish_gallery(folder_name, path_name, post_tags, uploaded):
    """Write one gallery (post, tags, attachments, meta) to WordPress.

    ``uploaded`` is a list of ``(index, oss_url)`` pairs, where ``index`` is
    the zero-based position of the image in the gallery. All statements run
    in a single transaction: either the whole gallery is stored or nothing.
    """
    # Keyword arguments: recent PyMySQL releases reject positional ones.
    # utf8mb4 so that the (Chinese) titles and tags can be transmitted.
    db = pymysql.connect(host="127.0.0.1", user="root", password="123456",
                         database="wordpress", charset="utf8mb4")
    try:
        cursor = db.cursor()
        now_time = now.strftime('%Y-%m-%d %H:%M:%S')

        # 1. The gallery post itself. Values are passed as parameters so that
        #    quotes in scraped titles cannot break or inject into the SQL.
        cursor.execute(_INSERT_POST_SQL, (
            1, now_time, now_time,
            _POST_CONTENT_HEAD + folder_name + _POST_CONTENT_TAIL,
            folder_name, '', 'publish', 'open', 'closed', '', path_name,
            '', '', now_time, now_time, '', 0,
            'http://meizg.louislivi.com/?post=' + path_name,
            0, 'post', '', 0))
        post_id = cursor.lastrowid

        # 2. Link the post to the fixed terms and to every known tag.
        tag_taxonomy_ids = []
        if post_tags:  # "name in ()" is not valid SQL
            cursor.execute(
                "select term_taxonomy_id from wdterms right join "
                "wdterm_taxonomy on wdterm_taxonomy.term_id=wdterms.term_id "
                "where name in ({})".format(
                    ",".join(['%s'] * len(post_tags))),
                post_tags)
            tag_taxonomy_ids = [row[0] for row in cursor.fetchall()]
        cursor.executemany(
            "insert into wdterm_relationships(object_id,term_taxonomy_id) "
            "VALUES (%s,%s)",
            [(post_id, term_taxonomy_id) for term_taxonomy_id
             in list(_DEFAULT_TERM_TAXONOMY_IDS) + tag_taxonomy_ids])
        if tag_taxonomy_ids:
            # The collected ids are term_taxonomy_id values, so filter on
            # that column (the original compared them against term_id).
            cursor.execute(
                "update wdterm_taxonomy set count=count+1 "
                "where term_taxonomy_id in ({})".format(
                    ",".join(['%s'] * len(tag_taxonomy_ids))),
                tag_taxonomy_ids)

        # 3. One attachment post per uploaded image.
        attachment_rows = []
        for index, oss_url in uploaded:
            title = folder_name + str(index)
            attachment_rows.append((
                1, now_time, now_time, title, title, title, 'inherit',
                'open', 'closed', '', title, '', '', now_time, now_time, '',
                post_id, oss_url, 0, 'attachment', 'image/jpeg', 0))
        cursor.executemany(_INSERT_POST_SQL, attachment_rows)

        cursor.execute(
            "select ID from wdposts where post_parent=%s order by id asc",
            (post_id,))
        img_post_ids = [row[0] for row in cursor.fetchall()]

        # 4. Attachment meta: serialized metadata plus the file location.
        attachment_meta = []
        for img_post_id, (index, _oss_url) in zip(img_post_ids, uploaded):
            # Same naming as save_pic(): <folder>pic_cnt_<index + 1>.jpg
            img_file_name = path_name + 'pic_cnt_' + str(index + 1)
            img_name = img_file_name + '.jpg'
            meta_value = phpserialize.dumps(
                _attachment_metadata(img_file_name, img_name))
            attachment_meta.append(
                (img_post_id, '_wp_attachment_metadata',
                 meta_value.decode('utf-8')))
            attachment_meta.append(
                (img_post_id, '_wp_attached_file',
                 '/var/www/html/wp-content/uploads/' + now.strftime('%Y')
                 + '/' + now.strftime('%m') + '/' + img_name))
        cursor.executemany(_INSERT_META_SQL, attachment_meta)

        # 5. Meta of the gallery post (thumbnail, theme settings, ...).
        cursor.executemany(
            _INSERT_META_SQL,
            _post_meta_rows(post_id, folder_name, path_name, img_post_ids,
                            [oss_url for _index, oss_url in uploaded]))

        # Single commit: a failure anywhere above leaves no partial post.
        db.commit()
    except Exception as e:
        # Undo everything written for this gallery and report the cause.
        db.rollback()
        print(e)
    finally:
        db.close()


def urls_crawler(url):
    """Crawl one gallery, mirror its images to OSS and publish it.

    Steps: download the gallery page, read its title (``h2``) and tags,
    create a local folder named after a slice of the title's MD5 digest,
    download and upload every image via ``save_pic`` and finally insert the
    post, its tag relations, the attachments and all meta rows into the
    WordPress database.

    A gallery whose folder already exists is skipped. Every error is caught
    and printed so that one broken gallery does not stop the pool.

    Args:
        url: Address of the first page of the gallery.
    """
    try:
        # Parse the page once (the original parsed the same markup 3 times).
        soup = BeautifulSoup(_fetch_html(url), 'lxml')
        folder_name = soup.find('h2').text
        post_tags = [tag.text for tag
                     in soup.find('div', class_='tags').find_all('a')]
        path_name = hashlib.md5(
            folder_name.encode('utf-8')).hexdigest()[8:-8]

        with lock:
            if not make_dir(path_name):
                return
            uploaded = []
            for index, img_url in enumerate(_collect_image_urls(url, soup)):
                oss_url = save_pic(img_url, index, path_name)
                # save_pic() returns None on failure; keep the index so the
                # recorded file names still match the uploaded objects.
                if oss_url:
                    uploaded.append((index, oss_url))

        if not uploaded:
            print("No image uploaded for {}".format(url))
            return
        _publish_gallery(folder_name, path_name, post_tags, uploaded)
    except Exception as e:
        print(e)


if __name__ == "__main__":
    # Incremental mode (disabled): crawl only the newest gallery of the day.
    #today = datetime.date.today()
    #today_time = int(time.mktime(today.timetuple()))
    #cnt_num = int((today_time - 28800 - 1535644800)/86400+1459)
    urls = ['http://mmjpg.com/mm/{cnt}'.format(cnt=cnt)
            for cnt in range(1, 1459)]
    #urls = ['http://mmjpg.com/mm/{cnt}'.format(cnt=cnt)
    #        for cnt in range(cnt_num-1, cnt_num)]
    pool = Pool(processes=cpu_count())
    try:
        try:
            delete_empty_dir(DIR_PATH)
            pool.map(urls_crawler, urls)
        except Exception:
            # One retry after a pause; galleries that were already stored
            # are skipped because their folders exist.
            time.sleep(30)
            delete_empty_dir(DIR_PATH)
            pool.map(urls_crawler, urls)
    finally:
        # Release the worker processes once crawling is over.
        pool.close()
        pool.join()

 
Webサイトのプレビュー