PythonのScrapyを使って京東（JD.com）のノートパソコンのデータをスクレイピングし、簡単な処理と分析を行います。

16378 ワード

一、環境準備

python 3.8.3

pycharm

プロジェクトに必要な第三者パッケージ


pip install scrapy fake-useragent requests selenium virtualenv -i https://pypi.douban.com/simple

1.1仮想環境の作成
仮想環境を作成したいディレクトリに移動してから、次のコマンドを実行します。


virtualenv .venv

作成後は仮想環境を有効にしてください。
1.2 プロジェクトの作成


scrapy startproject lianjia

1.3 pycharmを使ってプロジェクトを開けて、作成した仮想環境をプロジェクトに配置します。
1.4 京東用のspiderを作成する


scrapy genspider ji_computer_detail search.jd.com

1.5 spiderのallowed_domains（アクセスを許可するドメイン名）を修正し、先頭の「https://」を削除してドメイン名だけにします。
二、問題分析
データ取得の方針は、まず検索結果（一覧）ページから商品の基本情報を取得し、次に各商品の詳細ページから詳細情報を取得することです。京東の検索結果ページは、通常のリクエストでは40件分のデータしか返しません。そこで本記事では、Scrapyのダウンローダーミドルウェアの中でSeleniumを使ってページをスクロールし、ページ上のすべてのデータを含むレスポンスを返すようにします。
取得するフィールドは以下のとおりです。
商品の価格
商品評価数
商品の販売店舗名
商品SKU（京東で直接検索すると、該当する商品を見つけられます）
商品のタイトル
商品の詳細
三、spider


import re
import scrapy


from lianjia.items import jd_detailItem


class JiComputerDetailSpider(scrapy.Spider):
    """Crawl JD laptop search results and complete each hit from its detail page.

    ``parse`` reads price, comment count and shop name from every product
    tile of a search-result page and follows the tile's link;
    ``detail_parse`` adds SKU, title and the parameter list, then emits the
    finished item.
    """

    name = 'ji_computer_detail'
    allowed_domains = ['search.jd.com', 'item.jd.com']
    start_urls = [
        'https://search.jd.com/Search?keyword=%E7%AC%94%E8%AE%B0%E6%9C%AC%E7%94%B5%E8%84%91&suggest=1.def.0.base&wq=%E7%AC%94%E8%AE%B0%E6%9C%AC%E7%94%B5%E8%84%91&page=1&s=1&click=0']

    # Meta key set on the follow-up result pages so that they do not schedule
    # the same list of follow-up pages again.
    _FOLLOW_UP_FLAG = 'is_follow_up_page'

    @staticmethod
    def _remove_whitespace(text):
        """Return ``text`` with every whitespace character removed."""
        return re.sub(r'\s', '', text)

    def parse(self, response):
        """Yield one detail-page request per product tile, then the other result pages.

        Each request carries a partially filled ``jd_detailItem`` in
        ``meta['item']``. Tiles without a link are skipped: concatenating
        ``'https:'`` with ``None`` used to raise ``TypeError`` and abort the
        whole generator, including the pagination below.
        """
        for tile in response.xpath('//ul[@class="gl-warp clearfix"]/li'):
            detail_href = tile.xpath('.//div[@class="p-img"]/a/@href').extract_first()
            if not detail_href:
                continue
            if detail_href.startswith('//'):
                # Scheme-relative link: give it the scheme the original code prepended.
                detail_href = 'https:' + detail_href

            item = jd_detailItem()
            item['computer_price'] = tile.xpath('.//div[@class="p-price"]/strong/i/text()').extract_first()
            item['computer_commit'] = tile.xpath('.//div[@class="p-commit"]/strong/a/text()').extract_first()
            item['computer_p_shop'] = tile.xpath('.//div[@class="p-shop"]/span/a/text()').extract_first()
            yield scrapy.Request(
                url=response.urljoin(detail_href),
                callback=self.detail_parse,
                meta={'item': item},
            )

        # Schedule the remaining result pages only from a page that is not
        # itself a follow-up page; otherwise every page would re-yield the
        # same 99 requests.
        if not response.meta.get(self._FOLLOW_UP_FLAG):
            for page in range(2, 200, 2):
                next_page_url = f'https://search.jd.com/Search?keyword=%E7%AC%94%E8%AE%B0%E6%9C%AC%E7%94%B5%E8%84%91&suggest=1.def.0.base&wq=%E7%AC%94%E8%AE%B0%E6%9C%AC%E7%94%B5%E8%84%91&page={page}&s=116&click=0'
                yield scrapy.Request(
                    url=next_page_url,
                    callback=self.parse,
                    meta={self._FOLLOW_UP_FLAG: True},
                )

    def detail_parse(self, response):
        """Add SKU, title and parameter text to the item from ``meta`` and yield it.

        Title and parameter text are stored with all whitespace removed. A
        missing title node yields an empty string instead of raising
        ``AttributeError`` on ``None.strip()``.
        """
        item = response.meta.get('item')
        item['computer_sku'] = response.xpath('//a[@class="notice J-notify-sale"]/@data-sku').extract_first()

        raw_title = response.xpath('//div[@class="sku-name"]/text()').extract_first(default='')
        item['computer_title'] = self._remove_whitespace(raw_title)

        raw_detail = response.xpath('string(//ul[@class="parameter2 p-parameter-list"])').extract_first(default='')
        item['computer_detail'] = self._remove_whitespace(raw_detail)
        yield item

四、アイテム


class jd_detailItem(scrapy.Item):
    """Container for the six values collected for one laptop listing."""
    # One scrapy.Field per value; the spider in this file fills them as noted.
    computer_sku = scrapy.Field()  # "data-sku" attribute read on the detail page
    computer_price = scrapy.Field()  # text of the "p-price" element on the result page
    computer_title = scrapy.Field()  # "sku-name" text from the detail page, whitespace removed
    computer_commit = scrapy.Field()  # text of the "p-commit" element on the result page
    computer_p_shop = scrapy.Field()  # text of the "p-shop" element on the result page
    computer_detail = scrapy.Field()  # parameter-list text from the detail page, whitespace removed

五、settings


import random


from fake_useragent import UserAgent
# A random User-Agent string is picked once, when this module is imported,
# and that single value is assigned to USER_AGENT.
ua = UserAgent()
USER_AGENT = ua.random
# Scrapy setting: do not apply robots.txt rules to requests.
ROBOTSTXT_OBEY = False
# The delay is drawn once at import time, so it is one fixed value between
# 0.5 and 1 for the whole run; this line does not re-randomise it per request.
DOWNLOAD_DELAY = random.uniform(0.5, 1)
# Enable the downloader middleware at 'lianjia.middlewares.jdDownloaderMiddleware' with order 543.
DOWNLOADER_MIDDLEWARES = {
    'lianjia.middlewares.jdDownloaderMiddleware': 543
}
# Enable the item pipeline at 'lianjia.pipelines.jd_csv_Pipeline' with order 300.
ITEM_PIPELINES = {
    'lianjia.pipelines.jd_csv_Pipeline': 300
}

六、pipelines


class jd_csv_Pipeline:
    """Write every scraped item as one tab-separated line of a UTF-8 text file.

    NOTE(review): the output path ends in ``.xlsx`` but the content is plain
    tab-separated text, not an Excel workbook. The path is left unchanged so
    that anything already reading this file keeps working.
    """

    # Column order shared by the header row and every data row. Rows used to
    # follow the order in which the spider assigned the fields, which did not
    # match the header.
    FIELD_NAMES = (
        'computer_sku',
        'computer_title',
        'computer_p_shop',
        'computer_price',
        'computer_commit',
        'computer_detail',
    )

    # Tabs and line breaks inside a value would break the row layout.
    _CELL_CLEANER = str.maketrans('\t\r\n', '   ')

    def open_spider(self, spider):
        """Open the output file and write the header row."""
        self.fp = open('./jd_computer_message.xlsx', mode='w+', encoding='utf-8')
        self.fp.write('\t'.join(self.FIELD_NAMES) + '\n')

    def process_item(self, item, spider):
        """Append ``item`` as one row and return it unchanged.

        A field that is unset or ``None`` becomes an empty cell, so the row is
        still written instead of being silently dropped by a bare ``except``.
        """
        cells = []
        for field in self.FIELD_NAMES:
            value = item.get(field)
            text = '' if value is None else str(value)
            cells.append(text.translate(self._CELL_CLEANER))
        self.fp.write('\t'.join(cells) + '\n')
        return item

    def close_spider(self, spider):
        """Close the output file."""
        self.fp.close()

七、middlewares


class jdDownloaderMiddleware:
    """Render JD result pages in headless Chrome so the scrolled-in content is present.

    NOTE(review): ``time``, ``webdriver``, ``ChromeOptions`` and
    ``HtmlResponse`` must be imported at module level; those imports are not
    part of this snippet.
    """

    def process_request(self, request, spider):
        """Return a browser-rendered response for result pages, else ``None``.

        Only requests of the ``ji_computer_detail`` spider whose URL does not
        contain ``item.jd.com`` are rendered. Returning ``None`` for
        everything else leaves the request to be handled without the browser.
        """
        if spider.name != 'ji_computer_detail' or 'item.jd.com' in request.url:
            return None

        options = ChromeOptions()
        options.add_argument("--headless")
        driver = webdriver.Chrome(options=options)
        try:
            driver.get(request.url)
            # Scroll down in three steps, pausing after each one.
            for offset in range(0, 15000, 5000):
                driver.execute_script(f'window.scrollTo(0, {offset})')
                time.sleep(0.5)
            # Wait before the snapshot; the original slept after it was taken,
            # where the pause could no longer affect the captured page.
            time.sleep(1)
            body = driver.page_source.encode('utf-8')
        finally:
            # One browser is started per request; without quit() each one
            # stays alive after the response is built.
            driver.quit()
        return HtmlResponse(url=request.url, body=body, encoding='utf-8', request=request)

八、jupyterを使って簡単な処理と分析を行います。
その他に必要なファイル：百度（Baidu）のストップワード辞書、簡体字フォントファイル（simhei.ttf）。
第三者パッケージをダウンロード


!pip install seaborn jieba wordcloud pillow  -i https://pypi.douban.com/simple

8.1第三者パッケージの導入


import re
import os
import jieba
import wordcloud
import pandas as pd
import numpy as np
from PIL import Image
import seaborn as sns
from docx import Document
from docx.shared import Inches
import matplotlib.pyplot as plt
from pandas import DataFrame,Series

8.2可視化されたデフォルトのフォントとseabornのスタイルを設定します。


# Use seaborn's 'darkgrid' style for the plots that follow.
sns.set_style('darkgrid')
# Make SimHei the sans-serif font (assumes the font is installed on this machine — TODO confirm).
plt.rcParams['font.sans-serif'] = ['SimHei']
# Draw minus signs with the ASCII hyphen rather than the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

8.3データを読み出す


# Load the scraped data into a DataFrame.
# NOTE(review): the pipeline in this file writes tab-separated text to
# './jd_computer_message.xlsx'; how './jd_shop.xlsx' is produced is not shown — confirm.
df_jp = pd.read_excel('./jd_shop.xlsx')

8.4 Intel i5、i7、i9 プロセッサの情報を抽出する


def convert_one(s):
    """Return 'i5', 'i7' or 'i9' if that tag occurs in ``str(s)``, else ``None``.

    The tags are tried in this fixed order, so a text that mentions several
    of them yields the first one in the list, not the first one in the text.
    """
    text = str(s)
    for pattern in (r'.*?(i5).*', r'.*?(i7).*', r'.*?(i9).*'):
        hits = re.findall(pattern, text)
        if hits:
            return hits[0]
    return None
# New column: the processor tag found by convert_one in each row's 'computer_detail' (None when absent).
df_jp['computer_intel'] = df_jp['computer_detail'].map(convert_one)

8.5ノートパソコンの画面サイズ範囲をフィルタする


def convert_two(s):
    """Return the first screen-size range found in ``str(s)``, else ``None``.

    A range is two decimal numbers joined by ``-``, each followed by two
    literal spaces, exactly as written in the pattern.
    NOTE(review): the two-space runs look like a unit word lost in
    extraction — confirm against the real data.
    """
    hits = re.findall(r'.*?(\d+\.\d+  -\d+\.\d+  ).*', str(s))
    return hits[0] if hits else None
# New column: the screen-size range found by convert_two in each row's 'computer_detail' (None when absent).
df_jp['computer_in'] = df_jp['computer_detail'].map(convert_two)

8.6 コメント数を整数に変換する


def convert_three(s):
    """Convert a scraped comment-count text to an ``int``.

    * digits followed by at least one space -> that number times 10000
    * otherwise, the first run of digits -> that number
    * no digits at all -> ``None``

    Both numeric branches return ``int``; the second branch used to return
    the matched ``str``.
    NOTE(review): the space in the first pattern looks like a "ten thousand"
    unit character lost in extraction — confirm against the real data.
    """
    text = str(s)
    scaled = re.search(r'(\d+) +', text)
    if scaled:
        return int(scaled.group(1)) * 10000
    plain = re.search(r'(\d+)+', text)
    if plain:
        return int(plain.group(1))
    return None
# Replace the raw comment-count text with the value returned by convert_three.
df_jp['computer_commit'] = df_jp['computer_commit'].map(convert_three)

8.7分析が必要なブランドを選別する


def find_computer(name, s):
    """Return the text captured for ``name`` inside ``str(s)``.

    ``name`` is placed into the pattern unescaped. Raises ``IndexError``
    when ``str(s)`` contains no match.
    """
    captured = re.findall(f'.*({name}).*', str(s))
    return captured[0]


def convert(s):
    """Return the first candidate name that occurs in ``str(s)``, else ``None``.

    Candidates are tried in order and the first hit wins.
    NOTE(review): every candidate below consists only of spaces, which looks
    like brand names lost in extraction — restore the real names before use.
    """
    candidates = ('  ', '  ', '  ', '  ', '  ', '  ', '  ', '  ', '   ')
    for name in candidates:
        if re.findall(f'.*({name}).*', str(s)):
            return find_computer(name, s)
    return None
# Replace each shop name with the value returned by convert (None when no candidate matches).
df_jp['computer_p_shop'] = df_jp['computer_p_shop'].map(convert)

8.8指定フィールドが空の値のデータを削除する


# Remove, column by column, every row that has a null in a required column.
required_columns = ['computer_price', 'computer_commit', 'computer_p_shop', 'computer_sku', 'computer_detail', 'computer_intel', 'computer_in']
for n in required_columns:
    missing_mask = df_jp[[n]].isnull().any(axis=1)
    index_ls = df_jp[missing_mask].index
    df_jp.drop(index=index_ls, inplace=True)

8.9各ブランドの平均価格を確認する


plt.figure(figsize=(10, 8), dpi=100)
# Mean price per 'computer_p_shop' value, built once and used for both the bars and their labels.
brand_mean_price = df_jp.groupby(by='computer_p_shop')[['computer_price']].mean().reset_index()
ax = sns.barplot(x='computer_p_shop', y='computer_price', data=brand_mean_price)
# Write each mean, rounded to two decimals, slightly above its bar.
for index, row in brand_mean_price.iterrows():
    mean_price = row['computer_price']
    ax.text(row.name, mean_price + 2, round(mean_price, 2), color="black", ha="center")
ax.set_xlabel('  ')
ax.set_ylabel('    ')
ax.set_title('       ')
# Save the figure that owns these axes.
boxplot_fig = ax.get_figure()
boxplot_fig.savefig('       .png', dpi=400)

8.10各ブランドの価格区間を確認する


# Letter-value (boxen) plot of price per 'computer_p_shop' value,
# restricted to rows whose price is above 500.
plt.figure(figsize=(10, 8), dpi=100)
ax = sns.boxenplot(x='computer_p_shop', y='computer_price', data=df_jp.query('computer_price>500'))
# NOTE(review): the label, title and file-name literals below contain only
# spaces; the original text appears to have been lost — confirm.
ax.set_xlabel('  ')
ax.set_ylabel('    ')
ax.set_title('       ')
# Save the figure that owns these axes.
boxplot_fig = ax.get_figure()
boxplot_fig.savefig('       .png', dpi=400)

8.11価格とコメント数の関係を見る


# Cast the comment counts to int64 before plotting.
df_jp['computer_commit'] = df_jp['computer_commit'].astype('int64')
# Joint plot of comment count (x) against price (y) with a regression fit (kind="reg").
ax = sns.jointplot(x="computer_commit", y="computer_price", data=df_jp, kind="reg", truncate=False,color="m", height=10)
# jointplot returns a grid object; its figure is saved here.
ax.fig.savefig('         .png')

8.12商品のタイトルに出てくるキーワードを確認する


import imageio

# Collect all product titles as a list
ls = df_jp['computer_title'].to_list()
# Replace every run of characters other than ASCII letters and U+4E00-U+9FA5 with one space
feature_points = [re.sub(r'[^a-zA-Z\u4E00-\u9FA5]+',' ',str(feature)) for feature in ls]
# Read the stop-word file (one column named 'stopwords') into a list
stop_world = list(pd.read_csv('./      .txt', engine='python', encoding='utf-8', names=['stopwords'])['stopwords'])
feature_points2 = []
for feature in feature_points:  # one cleaned title at a time
    words = jieba.lcut(feature) # split the title into a list of words with jieba
    ind1 = np.array([len(word) > 1 for word in words])  # True where the word is longer than one character
    ser1 = pd.Series(words)
    ser2 = ser1[ind1] # keep only the words longer than one character
    ind2 = ~ser2.isin(stop_world)  # True where the word is not in the stop-word list
    ser3 = ser2[ind2].unique()  # drop the stop words and de-duplicate within this title
    if len(ser3) > 0:
        feature_points2.append(list(ser3))
# Flatten the per-title word lists into a single list
wordlist = [word for feature in feature_points2 for word in feature]
# Join all words into one space-separated string for WordCloud.generate
feature_str =  ' '.join(wordlist)   
# Font file used to draw the words, and the image used as the cloud's mask
font_path = r'./simhei.ttf'
shoes_box_jpg = imageio.imread('./home.jpg')
wc=wordcloud.WordCloud(
    background_color='black',
    mask=shoes_box_jpg,
    font_path = font_path,
    min_font_size=5,
    max_font_size=50,
    width=260,
    height=260,
)
wc.generate(feature_str)
# Show the cloud without axes and save the figure
plt.figure(figsize=(10, 8), dpi=100)
plt.imshow(wc)
plt.axis('off')
plt.savefig('       ')

8.13 価格が4000〜5000で、Lenovo（聯想）ブランド、プロセッサがi5、画面サイズが15インチ以上のデータを抽出して、価格を確認します。


# Rows priced from 4000 to 5000 (inclusive) whose shop, processor tag ("i5")
# and screen-size range equal the literals below; .copy() detaches the result from df_jp.
# NOTE(review): the shop literal is two spaces; the brand name appears to have been lost — confirm.
df_jd_query = df_jp.loc[(df_jp['computer_price'] <=5000) & (df_jp['computer_price']>=4000) & (df_jp['computer_p_shop']=="  ") & (df_jp['computer_intel']=="i5") & (df_jp['computer_in']=="15.0  -15.9  "), :].copy()
plt.figure(figsize=(20, 10), dpi=100)
# One bar per SKU showing its price.
ax = sns.barplot(x='computer_sku', y='computer_price', data=df_jd_query)
ax.set_xlabel('    SKU')
ax.set_ylabel('  ')
ax.set_title('  i5     15    SKU   ')
# Save the figure that owns these axes.
boxplot_fig = ax.get_figure()
boxplot_fig.savefig('  i5     15    SKU   .png', dpi=400)

8.14 価格が4000〜5000で、デル（Dell）ブランド、プロセッサがi7、画面サイズが15インチ以上のデータを抽出して、価格を確認します。


# Rows priced from 4000 to 5000 (inclusive) whose shop, processor tag ("i7")
# and screen-size range equal the literals below; .copy() detaches the result from df_jp.
# NOTE(review): the shop literal is two spaces; the brand name appears to have been lost — confirm.
df_jp_daier = df_jp.loc[(df_jp['computer_price'] <=5000) & (df_jp['computer_price']>=4000) & (df_jp['computer_p_shop']=="  ") & (df_jp['computer_intel']=="i7") & (df_jp['computer_in']=="15.0  -15.9  "), :].copy()
plt.figure(figsize=(10, 8), dpi=100)
# One bar per SKU showing its price.
ax = sns.barplot(x='computer_sku', y='computer_price', data=df_jp_daier)
ax.set_xlabel('    SKU')
ax.set_ylabel('  ')
ax.set_title('  i7     15    SKU   ')
# Save the figure that owns these axes.
boxplot_fig = ax.get_figure()
boxplot_fig.savefig('  i7     15    SKU   .png', dpi=400)

8.15異なるIntelプロセッサのブランドの価格


# Bar plot of price per 'computer_p_shop' value, with one bar colour per processor tag (hue='computer_intel').
plt.figure(figsize=(10, 8), dpi=100)
ax = sns.barplot(x='computer_p_shop', y='computer_price', data=df_jp, hue='computer_intel')
# NOTE(review): the label, title and file-name literals below contain only
# spaces; the original text appears to have been lost — confirm.
ax.set_xlabel('  ')
ax.set_ylabel('  ')
ax.set_title('            ')
# Save the figure that owns these axes.
boxplot_fig = ax.get_figure()
boxplot_fig.savefig('            .png', dpi=400)

8.16サイズ別ブランドの価格


# Bar plot of price per 'computer_p_shop' value, with one bar colour per screen-size range (hue='computer_in').
plt.figure(figsize=(10, 8), dpi=100)
ax = sns.barplot(x='computer_p_shop', y='computer_price', data=df_jp, hue='computer_in')
# NOTE(review): the label, title and file-name literals below contain only
# spaces; the original text appears to have been lost — confirm.
ax.set_xlabel('  ')
ax.set_ylabel('  ')
ax.set_title('         ')
# Save the figure that owns these axes.
boxplot_fig = ax.get_figure()
boxplot_fig.savefig('         .png', dpi=400)

以上が、PythonのScrapyを使って京東のノートパソコンのデータを取得し、簡単な処理と分析を行う方法の詳細です。Pythonによる京東データの取得についてさらに知りたい方は、当サイトの他の関連記事もご覧ください。

C++constキーワードの実例的な使い方

C++ vectorのシミュレーション実装コード