Scraping product and comment data from JD Fresh (京東生鮮)


A while ago I crawled JD's fresh-food pages, and now I am putting the code up to share.
A quick overview first: the product data is extracted by driving Chrome with Selenium so the dynamically generated pages get rendered, and the comments are loaded through an AJAX request.
The details are explained below.
What we need to capture:
product sub-category name (apples, oranges, and so on)
product name (e.g. "Yantai Red Fuji apples, 5 kg, grade 1, platinum large fruit, 230-320 g per fruit, fresh fruit")
total number of comments on the product
product positive-rating percentage
star rating of each comment
comment length
number of likes (useful votes) on the comment
number of replies to the comment
comment text
reviewer level
days between the comment being posted and being captured (days)
for comments that have a follow-up review: the follow-up text, and the time between the follow-up and the original comment
Those are the requirements for this task.
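For orientation, here is how the per-comment requirements map onto the fields of JD's comment JSON that the code below actually reads (the product-level fields come from the rendered product page instead):

star rating of the comment   ->  comment['score']
comment length               ->  len(comment['content'])
likes on the comment         ->  comment['usefulVoteCount']
replies to the comment       ->  comment['replyCount']
comment text                 ->  comment['content']
reviewer level               ->  comment['userLevelId']
days since the comment       ->  computed from comment['creationTime']
follow-up text and timing    ->  comment['afterUserComment']['hAfterUserComment']['content'] and comment['afterUserComment']['created']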
Most of the information on these pages is rendered dynamically, which is why Selenium is used.
The comments do not show up among the usual XHR requests but under JS; the request whose name starts with productPageComments is the comment data:
Request URL: https://sclub.jd.com/comment/...
Request Method: GET
Status Code: 200
Remote Address: 117.148.129.129:443
Referrer Policy: no-referrer-when-downgrade
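One thing to note before the parameters: the response is JSONP, not plain JSON. The body comes back wrapped in the callback named by the callback parameter, i.e. fetchJSON_comment98vv7490({...});, so the wrapper has to be stripped before json.loads can parse it. The script below does that with a fixed slice (response.text[26:-2]); a slightly more tolerant sketch that does not depend on the callback's length could look like this (strip_jsonp is my own helper, not part of the original script):

import json
import re

def strip_jsonp(text):
    # drop everything up to the first '(' and the trailing ');'
    match = re.match(r'^[^(]*\((.*)\)\s*;?\s*$', text, re.S)
    return json.loads(match.group(1) if match else text)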
In this URL, most of the query-string parameters are not actually required:
def make_url(baseurl, page=0, score=0, productId='3756271'):
    data1 = {
        'callback': 'fetchJSON_comment98vv7490',
        'productId': productId,
        'score': score,
        'sortType': '6',
        'page': page,
        'pageSize': '10',
        'isShadowSku': '0',
        'fold': '1',
    }
    url = baseurl + urlencode(data1)
    return url
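As a quick sanity check, calling the helper produces a URL like the following (the product ID is the one used throughout this post, and page/score are just example values):

baseurl = 'https://sclub.jd.com/comment/productPageComments.action?'
print(make_url(baseurl, page=0, score=0, productId='3756271'))
# https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv7490&productId=3756271&score=0&sortType=6&page=0&pageSize=10&isShadowSku=0&fold=1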

All of this is spelled out in the code.
The full code is pasted below; if you would rather not copy it, you can also download it from GitHub.
# https://www.jd.com/allSort.aspx
import requests
from pyquery import PyQuery as pq
from prettyprinter import cpprint
import json
from urllib.parse import urlencode
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
import datetime
import sys


def get_ajax(url):
    headers = {
        'referer': 'https://item.jd.com/3756271.html',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
    }
    response = requests.get(url, headers=headers)
    # the response is JSONP; strip the 'fetchJSON_comment98vv7490(' prefix (26 chars) and the trailing ');'
    return json.loads(response.text[26:-2])


def make_url(baseurl, page=0, score=0, productId='3756271'):
    data1 = {
        'callback': 'fetchJSON_comment98vv7490',
        'productId': productId,
        'score': score,
        'sortType': '6',
        'page': page,
        'pageSize': '10',
        'isShadowSku': '0',
        'fold': '1',
    }
    url = baseurl + urlencode(data1)
    return url


def parse_json(rjson, url=None):
    for comment in rjson.get('comments'):
        item = {}
        item['url'] = url
        item['star_rating'] = comment.get('score')
        item['comment_length'] = len(comment.get('content'))
        item['useful_vote_count'] = comment.get('usefulVoteCount')
        item['reply_count'] = comment.get('replyCount')
        item['comment_text'] = comment.get('content')
        item['reviewer_level'] = comment.get('userLevelId')
        try:
            # days between the comment being posted and the capture time
            date1 = time.strptime(comment.get('creationTime'), "%Y-%m-%d %H:%M:%S")
            date2 = time.localtime(time.time())
            date1 = datetime.datetime(date1[0], date1[1], date1[2])
            date2 = datetime.datetime(date2[0], date2[1], date2[2])
            item['days_since_posted'] = str((date2 - date1).days)
        except Exception as error:
            print('error is >>>', error)
            item['days_since_posted'] = ''
        # follow-up review, if any; JD fills in a default placeholder text when the
        # user left the follow-up empty, which is treated as no follow-up here
        if comment.get('afterUserComment', {}).get('hAfterUserComment', {}).get('content', '') == '          ':
            item['follow_up_text'] = ''
        else:
            item['follow_up_text'] = comment.get('afterUserComment', {}).get('hAfterUserComment', {}).get('content', '')
        try:
            # days between the follow-up being posted and the capture time
            date1 = time.strptime(comment.get('afterUserComment', {}).get('created', ''), "%Y-%m-%d %H:%M:%S")
            date2 = time.localtime(time.time())
            date1 = datetime.datetime(date1[0], date1[1], date1[2])
            date2 = datetime.datetime(date2[0], date2[1], date2[2])
            item['days_since_follow_up'] = str((date2 - date1).days)
        except Exception:
            item['days_since_follow_up'] = ''
        if item['follow_up_text'] == '':
            item['days_since_follow_up'] = ''
        yield item


def save_csv_merinfo(item):
    with open(FILENAME_MER, 'a', encoding=ENCODING, newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames_merinfo)
        # writer.writeheader()
        writer.writerow(item)


def save_csv_cominfo(item):
    with open(FILENAME_COM, 'a', encoding=ENCODING, newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames_cominfo)
        # writer.writeheader()
        writer.writerow(item)


def get_page(url):
    browser.get(url)
    # wait for the product-page tab bar; li[5] is the tab that gets clicked below (the comment tab)
    submit = wait.until(EC.presence_of_element_located((By.XPATH, '//div[contains(@class,"tab-main")]/ul/li[5]')))
    time.sleep(2)
    # scroll down in small steps so the lazily loaded parts of the page render
    for i in range(30):
        browser.execute_script("window.scrollBy(0,50)")
        time.sleep(0.1)
    submit.click()
    time.sleep(3)
    return browser.page_source


def parse_page(html, url):
    page_item = {}
    doc = pq(html, parser='html')
    page_item['url'] = url
    page_item['subcategory'] = doc('#crumb-wrap > div > div.crumb.fl.clearfix > div:nth-child(5) > a').text()
    page_item['product_name'] = doc('div.itemInfo-wrap div.sku-name').text()
    page_item['total_comment_count'] = doc('#detail > div.tab-main.large > ul > li.current > s').text().replace('(', '').replace(')', '')
    page_item['positive_rate'] = doc('#comment > div.mc > div.comment-info.J-comment-info > div.comment-percent > div').text()
    return page_item


def csv_create():
    with open(FILENAME_MER, 'w', encoding=ENCODING, newline='') as f:
        writer = csv.writer(f)
        writer.writerow(fieldnames_merinfo)

    with open(FILENAME_COM, 'w', encoding=ENCODING, newline='') as f:
        writer = csv.writer(f)
        writer.writerow(fieldnames_cominfo)


def crawl_all_page_url():
    global ALL_PAGE_URL
    browser = webdriver.Chrome()
    wait = WebDriverWait(browser, 20)

    browser.get('https://www.jd.com/allSort.aspx')
    wait.until(EC.presence_of_element_located(
        (By.XPATH, '/html/body/div[5]/div[2]/div[1]/div[2]/div[2]/div[9]/div[2]/div[3]')))

    # XPaths of the sub-category links on the category page; each dl[] block is one
    # group of fresh-food sub-categories and range() is the number of links in it
    CASE = []
    for i in range(10):
        initcase = '/html/body/div[5]/div[2]/div[1]/div[2]/div[2]/div[9]/div[2]/div[3]/dl[2]/dd/a[{}]'.format(i + 1)
        CASE.append(initcase)
    for i in range(4):
        initcase = '/html/body/div[5]/div[2]/div[1]/div[2]/div[2]/div[9]/div[2]/div[3]/dl[3]/dd/a[{}]'.format(i + 1)
        CASE.append(initcase)
    for i in range(8):
        initcase = '/html/body/div[5]/div[2]/div[1]/div[2]/div[2]/div[9]/div[2]/div[3]/dl[4]/dd/a[{}]'.format(i + 1)
        CASE.append(initcase)
    for i in range(4):
        initcase = '/html/body/div[5]/div[2]/div[1]/div[2]/div[2]/div[9]/div[2]/div[3]/dl[5]/dd/a[{}]'.format(i + 1)
        CASE.append(initcase)
    for i in range(6):
        initcase = '/html/body/div[5]/div[2]/div[1]/div[2]/div[2]/div[9]/div[2]/div[3]/dl[6]/dd/a[{}]'.format(i + 1)
        CASE.append(initcase)
    # to crawl other categories, change the dl[] index and the range() above

    for case in CASE:
        print('>>>>>>>>>')
        submit = wait.until(EC.element_to_be_clickable(
            (By.XPATH, case)))
        submit.click()

        print(browser.current_url)

        handle = browser.current_window_handle
        handles = browser.window_handles
        for newhandle in handles:
            if newhandle != handle:
                browser.switch_to.window(newhandle)
        time.sleep(1.5)
        wait.until(EC.presence_of_element_located((By.XPATH, '//div[@id="plist"]/ul[contains(@class,"gl-warp")]')))
        doc = pq(browser.page_source, parser='html')
        # take the first 10 products in this sub-category; the comment-count link
        # points at '...#comment', which is stripped off to get the product page URL
        for li in list(doc('div#plist ul.gl-warp li').items())[:10]:
            res = 'https:' + str(li('div div.p-commit-n strong a').attr('href')).replace('#comment', '')
            print(res)
            ALL_PAGE_URL.append(res)
        time.sleep(1.5)
        browser.close()
        browser.switch_to.window(handle)


def load_all_page_url():
    global ALL_PAGE_URL
    with open(FILENAME_CACHE, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        for item in reader:
            ALL_PAGE_URL.append(item[0])
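

# Note: the script reads cache.csv through load_all_page_url() but the listing never
# writes it; this small helper (my own addition, matching the one-URL-per-row layout
# that load_all_page_url() expects) persists the URLs gathered by crawl_all_page_url().
def save_all_page_url():
    with open(FILENAME_CACHE, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        for page_url in ALL_PAGE_URL:
            writer.writerow([page_url])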


if __name__ == '__main__':
    # setup >>>>>>>>>>
    browser = webdriver.Chrome()  # Selenium-driven browser used for the product pages
    wait = WebDriverWait(browser, 20)
    MAXINDEX = 7  # comment-page budget: the score-5 filter gets MAXINDEX pages, the other score filters get MAXINDEX / 7 pages each

    # settings ********************************
    TIMESLEEP = 2  # delay between AJAX requests, in seconds
    FILENAME_MER = 'merinfo_test.csv'  # output file for product info
    FILENAME_COM = 'cominfo_test.csv'  # output file for comment info
    FILENAME_CACHE = 'cache.csv'  # cached list of product page URLs
    ENCODING = 'UTF-8'  # encoding of the CSV files
    # **********************************************

    # CSV header fields
    fieldnames_merinfo = ['url', 'subcategory', 'product_name', 'total_comment_count', 'positive_rate']
    fieldnames_cominfo = ['url', 'star_rating', 'comment_length', 'useful_vote_count', 'reply_count', 'comment_text',
                          'reviewer_level', 'days_since_posted', 'follow_up_text', 'days_since_follow_up']
    # <<<<<<<<<<<<<<<<<

    start = time.time()

    # csv_create()  # first run only: create the two CSV files and write their headers
    # resume support >>>
    URLSET = []  # product URLs that have already been crawled (read back from the product CSV)
    with open(FILENAME_MER, 'r', encoding=ENCODING) as f:
        reader = csv.reader(f)
        for res in reader:
            URLSET.append(res[0])
    print('URLSET is', URLSET)
    # collect the product page URLs to crawl
    ALL_PAGE_URL = []  # list of product page URLs
    load_all_page_url()  # pick one of the two: load_all_page_url() reads the cached list from cache.csv (fast),
    # crawl_all_page_url()  # while crawl_all_page_url() re-crawls the category page to rebuild the list (slow)
    for page_url in ALL_PAGE_URL:
        if page_url not in URLSET:
            URLSET.append(page_url)  # mark this product page as crawled
            try:
                html = get_page(page_url)  # render the product page with Selenium
                item_mer = parse_page(html, url=page_url)  # parse the product info with pyquery
                cpprint(item_mer)

                # fetch the comments via AJAX
                Flag = 0  # number of comments collected for this product
                ITEMS = []
                baseurl = 'https://sclub.jd.com/comment/productPageComments.action?'
                for score in [5, 3, 2, 1]:  # score filter: 0 = all, 5 = has follow-up, 3 = positive, 2 = neutral, 1 = negative
                    if score == 5:
                        MAXINDEX_TEMP = MAXINDEX
                    else:
                        MAXINDEX_TEMP = int(MAXINDEX / 7)  # page budget ratio of roughly 7:1:1:1 across the score filters
                    for index in range(MAXINDEX_TEMP):
                        time.sleep(TIMESLEEP)
                        url = make_url(baseurl, page=index, score=score,
                                       productId=''.join(list(filter(str.isdigit, page_url))))  # the product id is the digits in the page URL
                        try:
                            json_ = get_ajax(url)  # fetch and unwrap the JSONP response
                            if len(json_.get('comments')) != 0:
                                for item in parse_json(json_, url=page_url):  # parse the comment JSON
                                    cpprint(item)
                                    ITEMS.append(item)
                                    Flag += 1
                            else:
                                break
                        except Exception as error:
                            print('AJAX request failed: {} >>>'.format(error))
                            print('url is {}'.format(url))
                            print(str(datetime.datetime.now()))
                            sys.exit(0)  # an AJAX failure usually means we got blocked, so stop the whole program
                # only write to disk once the product and all of its comments have been fetched, so no partial rows are saved
                save_csv_merinfo(item_mer)  # save the product info

                for item in ITEMS:  # save the comment info
                    try:
                        save_csv_cominfo(item)
                    except Exception as error:
                        print(error)
                print("saved {} comments".format(Flag))
            except Exception as error:
                print('failed to crawl the product page: {} >>>'.format(error))
        print('finished this product page >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
        # time.sleep(TIMESLEEP)

    end = time.time()
    print('total time: {} seconds'.format(end - start))
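One usage note: on the very first run the two output CSVs and cache.csv do not exist yet, so the setup in the main block has to be flipped around, roughly like this (save_all_page_url is the optional caching helper sketched earlier):

csv_create()           # create the two CSVs and write their headers
ALL_PAGE_URL = []
crawl_all_page_url()   # slow: collect the product page URLs from the category page
save_all_page_url()    # cache them so later runs can simply call load_all_page_url()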

I did write a fair number of comments in the code; if anything is unclear, feel free to ask in the comments section. That is it for now.