JD(京東)の検索結果から、売上順で上位3件の商品名とコメント数を取得する

3337 ワード

参考ブログ:
https://blog.csdn.net/menghuannvxia/article/details/51333689
コード:
import requests
import re
from urllib.parse import quote
from lxml import etree
import sys


# Number of leading characters dropped from each comment-anchor ``id`` to get
# the SKU. NOTE(review): presumably the ids look like "J_comment_<sku>"
# (a 10-character prefix) -- confirm against the live search page.
_JD_ID_PREFIX_LEN = 10

# Seconds to wait for JD before giving up, so a stalled connection cannot
# block the caller indefinitely.
_JD_TIMEOUT = 10

# Extracts the value of the "commentCountStr" field from the comment payload.
_JD_COMMENT_COUNT_RE = re.compile(r'commentCountStr":"(.*?)","')

# Browser-like headers sent with every request.
# NOTE(review): the cookie is a hard-coded, captured session value; it is kept
# unchanged here but should probably come from configuration.
_JD_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0',
    'Cookie':'__jda=122270672.1551510490895255034850.1551510491.1551510491.1551510491.1; __jdb=122270672.12.1551510490895255034850|1.1551510491; __jdc=122270672; __jdv=122270672|direct|-|none|-|1551510490916; __jdu=1551510490895255034850; ipLoc-djd=1-72-2799-0; shshshfp=f21d0cc732d79f86048c96127c37924b; shshshfpa=20192d5f-27f0-cfdd-6be7-6b8564d8956b-1551510516; shshshsID=48476bb6cedc7b2515036bd5adb494f7_9_1551511402939; shshshfpb=fWfTtVViJK0Phfe8j4vlqfw%3D%3D; PCSYCityID=2; 3AB9D23F7A4B3C9B=4QG3GNC2A4EH3Q5AOYTTJ6N644PJZZWSXEJVD5A4FHRPXFN2KSIQJ4WRIICWOY2ON4UV2A2HXWCWHZGYSVUGDSBT64',
    # The standard HTTP header name is "Referer"; the previous key "Refer"
    # is not a recognised header.
    'Referer': 'https://www.jd.com/',
}


def _jd_fetch(url, encoding, errors='strict'):
    """Download *url* with the shared headers and return the decoded body."""
    response = requests.get(url=url, headers=_JD_HEADERS, timeout=_JD_TIMEOUT)
    return response.content.decode(encoding, errors)


def _jd_top_skus(keyword, top_n):
    """Return the SKU strings of the first *top_n* items on the search page."""
    encoded = quote(keyword)
    # NOTE(review): psort=3 presumably requests sales-volume ordering (per the
    # file's title) -- confirm against JD's search parameters.
    url = "https://search.jd.com/Search?keyword={}&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&bs=1&wq={}&stock=1&psort=3&click=0".format(encoded, encoded)
    tree = etree.HTML(_jd_fetch(url, 'utf-8'))
    if tree is None:
        return []
    anchor_ids = tree.xpath(".//li[@class='gl-item']//div[@class='p-commit']//a/@id")
    # Slicing (rather than indexing 0..2) tolerates pages with fewer hits.
    return [anchor_id[_JD_ID_PREFIX_LEN:] for anchor_id in anchor_ids[:top_n]]


def _jd_product_name(sku):
    """Return the product name shown on the item page of *sku* ('' if absent)."""
    page = _jd_fetch('https://item.jd.com/{}.html'.format(sku), 'gbk', 'ignore')
    tree = etree.HTML(page)
    if tree is None:
        return ''
    parts = tree.xpath("//div[@class='sku-name']/text()")
    # Stripping every text node already discards whitespace-only nodes such as
    # bare newlines, so no separate newline filter is needed.
    return ''.join(part.strip() for part in parts)


def _jd_comment_count(sku):
    """Return the ``commentCountStr`` value for *sku*, or None if not found."""
    body = _jd_fetch(
        'http://club.jd.com/productpage/p-{}-s-0-t-3-p-0.html'.format(sku),
        'gbk',
        'ignore',
    )
    match = _JD_COMMENT_COUNT_RE.search(body)
    return match.group(1) if match else None


def get_jd_data(keyword, top_n=3):
    """Collect name and comment count for the first search hits on JD.

    Queries the JD search page for *keyword*, takes the first *top_n* listed
    items, and for each one fetches its item page (for the product name) and
    its comment page (for the comment count). The result is printed and
    returned.

    Args:
        keyword: Search term; it is URL-quoted before being sent.
        top_n: Maximum number of items to look up. Defaults to 3. Values
            less than or equal to zero yield an empty result without any
            network access.

    Returns:
        A list with one ``[name, comment_count]`` list per item, in search
        order. ``name`` is the concatenated, stripped text of the item page's
        ``sku-name`` element; ``comment_count`` is the string captured from
        ``commentCountStr``, or ``None`` when the comment payload does not
        contain it. Fewer than *top_n* entries are returned when the search
        page lists fewer items.

    Raises:
        requests.RequestException: If a request fails or times out.
        UnicodeDecodeError: If the search page is not valid UTF-8.
    """
    skus = _jd_top_skus(keyword, top_n) if top_n > 0 else []
    result_jddata = [
        [_jd_product_name(sku), _jd_comment_count(sku)] for sku in skus
    ]
    print(result_jddata)
    return result_jddata

if __name__ == '__main__':
    # Script entry point: run a single search and keep the returned rows.
    # NOTE(review): the keyword is a single space -- presumably a placeholder
    # for a real search term; confirm.
    keyword = ' '
    result = get_jd_data(keyword=keyword)