JD(京東)で売上上位3件の商品の商品名とコメント数を取得する
3337 ワード
参考ブログ:
https://blog.csdn.net/menghuannvxia/article/details/51333689
コード:
https://blog.csdn.net/menghuannvxia/article/details/51333689
コード:
import requests
import re
from urllib.parse import quote
from lxml import etree
import sys
def get_jd_data(keyword, top_n=3, timeout=10):
    """Return name and comment count for the first products of a JD search.

    Requests the search.jd.com result page for ``keyword``, takes the first
    ``top_n`` products listed on it, and for each one fetches the display
    name from its item page and the comment-count string from the
    club.jd.com comment endpoint. The collected list is printed before it is
    returned.

    Args:
        keyword: Search term; it is percent-encoded before being placed in
            the search URL.
        top_n: Maximum number of products to look up. Defaults to 3. Fewer
            entries are returned when the result page lists fewer products.
        timeout: Seconds to wait on each HTTP request before ``requests``
            gives up instead of blocking indefinitely.

    Returns:
        A list of ``[name, comment_count]`` lists in result-page order.
        ``comment_count`` is the raw ``commentCountStr`` text from the
        response, or ``''`` when the response does not contain that field.

    Raises:
        requests.RequestException: If a request fails or times out.
        UnicodeDecodeError: If the search page body is not valid UTF-8.
    """
    encoded_keyword = quote(keyword)
    # NOTE(review): psort=3 presumably selects ordering by sales volume --
    # confirm against the site before relying on it.
    search_url = "https://search.jd.com/Search?keyword={}&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&bs=1&wq={}&stock=1&psort=3&click=0".format(encoded_keyword, encoded_keyword)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0',
        # NOTE(review): hard-coded cookie string; presumably captured from a
        # browser session and may no longer be valid -- confirm.
        'Cookie':'__jda=122270672.1551510490895255034850.1551510491.1551510491.1551510491.1; __jdb=122270672.12.1551510490895255034850|1.1551510491; __jdc=122270672; __jdv=122270672|direct|-|none|-|1551510490916; __jdu=1551510490895255034850; ipLoc-djd=1-72-2799-0; shshshfp=f21d0cc732d79f86048c96127c37924b; shshshfpa=20192d5f-27f0-cfdd-6be7-6b8564d8956b-1551510516; shshshsID=48476bb6cedc7b2515036bd5adb494f7_9_1551511402939; shshshfpb=fWfTtVViJK0Phfe8j4vlqfw%3D%3D; PCSYCityID=2; 3AB9D23F7A4B3C9B=4QG3GNC2A4EH3Q5AOYTTJ6N644PJZZWSXEJVD5A4FHRPXFN2KSIQJ4WRIICWOY2ON4UV2A2HXWCWHZGYSVUGDSBT64',
        # The standard HTTP header is spelled "Referer"; the previous key
        # 'Refer' is not a recognised header name.
        'Referer': 'https://www.jd.com/',
    }

    search_html = _fetch_text(search_url, headers, timeout, 'utf-8')
    tree = etree.HTML(search_html)

    # Each product's comment link carries an id attribute; the SKU id is
    # everything after its first 10 characters (presumably a fixed
    # "J_comment_"-style prefix -- verify against the live markup).
    comment_anchor_ids = tree.xpath(".//li[@class='gl-item']//div[@class='p-commit']//a/@id")
    # Slicing (rather than indexing 0..2) tolerates pages with fewer results.
    sku_ids = [anchor_id[10:] for anchor_id in comment_anchor_ids[:max(top_n, 0)]]

    result_jddata = [
        [
            _fetch_product_name(sku_id, headers, timeout),
            _fetch_comment_count(sku_id, headers, timeout),
        ]
        for sku_id in sku_ids
    ]
    print(result_jddata)
    return result_jddata


def _fetch_text(url, headers, timeout, encoding, errors='strict'):
    """Download ``url`` and return its body decoded with ``encoding``."""
    response = requests.get(url=url, headers=headers, timeout=timeout)
    return response.content.decode(encoding, errors)


def _fetch_product_name(sku_id, headers, timeout):
    """Return the stripped, concatenated text of the item page's sku-name div."""
    item_url = 'https://item.jd.com/{}.html'.format(sku_id)
    item_html = _fetch_text(item_url, headers, timeout, 'gbk', 'ignore')
    name_parts = etree.HTML(item_html).xpath("//div[@class='sku-name']/text()")
    # strip() already drops whitespace-only fragments such as newlines, so no
    # separate newline check is needed.
    return ''.join(part.strip() for part in name_parts)


def _fetch_comment_count(sku_id, headers, timeout):
    """Return the commentCountStr value for ``sku_id``, or '' if it is absent."""
    comment_url = 'http://club.jd.com/productpage/p-{}-s-0-t-3-p-0.html'.format(sku_id)
    comment_body = _fetch_text(comment_url, headers, timeout, 'gbk', 'ignore')
    match = re.search('commentCountStr":"(.*?)","', comment_body)
    return match.group(1) if match else ''
if __name__ == '__main__':
    # Script entry point: run a single search and keep the returned list.
    # NOTE(review): the search term is a single space, which looks like a
    # placeholder -- confirm the intended keyword.
    result = get_jd_data(keyword=' ')