Pythonデータ分析 - Google Play Storeのアプリデータ分析
35237 ワード
#!/usr/bin/env python
# coding: utf-8
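# Goal: explore the Google Play Store app data set. Questions examined below:
# - which 10 categories contain the most apps;
# - which categories have the highest average install count;
# - which categories contain the most paid apps;
# - how free and paid apps compare on average (including reviews per install);
# - how the numeric columns average out per category and free/paid type;
# - which categories attract the most reviews, and how the numeric columns correlate.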
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load only the first eight CSV columns (positions 0-7). Per the preview
# printed below they are App, Category, Rating, Reviews, Size, Installs,
# Type and Price.
df = pd.read_csv('./googleplaystore.csv', usecols=list(range(8)))
df.head()  # preview the first five rows
App
Category
Rating
Reviews
Size
Installs
Type
Price
0
Photo Editor & Candy Camera & Grid & ScrapBook
ART_AND_DESIGN
4.1
159
19M
10,000+
Free
0
1
Coloring book moana
ART_AND_DESIGN
3.9
967
14M
500,000+
Free
0
2
U Launcher Lite – FREE Live Cool Themes, Hide ...
ART_AND_DESIGN
4.7
87510
8.7M
5,000,000+
Free
0
3
Sketch - Draw & Paint
ART_AND_DESIGN
4.5
215644
25M
50,000,000+
Free
0
4
Pixel Draw - Number Art Coloring Book
ART_AND_DESIGN
4.3
967
2.8M
100,000+
Free
0
# Column dtypes and non-null counts.
df.info()
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 8 columns):
App 10841 non-null object
Category 10841 non-null object
Rating 9367 non-null float64
Reviews 10841 non-null object
Size 10841 non-null object
Installs 10841 non-null object
Type 10840 non-null object
Price 10841 non-null object
dtypes: float64(1), object(7)
memory usage: 677.7+ KB
# (rows, columns) of the loaded frame.
df.shape
(10841, 8)
# Non-null count per column.
df.count()
# --> Rating and Type have fewer non-null values than the other columns, i.e. they contain missing data (the original note also lists Android Ver, which is not among the eight loaded columns).
App 10841
Category 10841
Rating 9367
Reviews 10841
Size 10841
Installs 10841
Type 10840
Price 10841
dtype: int64
# Number of rows that are exact duplicates of an earlier row.
len(df[df.duplicated()])
485
# Number of distinct app names (the App column repeats some names).
len(df['App'].unique())
# To inspect the repeated names instead, filter with
# df['App'].duplicated() or tabulate with df['App'].value_counts().
9660
# Category: the frequency table exposes a stray value '1.9'; drop the
# row(s) carrying it.
df['Category'].value_counts(dropna=False)
df.drop(df[df['Category'] == '1.9'].index, inplace=True)
# Rating: the frequency table shows 1474 NaN entries (df.count() reports
# the same gap). Fill them with the column mean.
df['Rating'].value_counts(dropna=False)
# Assign the result back instead of calling fillna(inplace=True) on the
# column selection: the in-place form operates on an intermediate object,
# is deprecated, and leaves df untouched under pandas Copy-on-Write.
df['Rating'] = df['Rating'].fillna(df['Rating'].mean())
# Reviews: stored as strings. Count how many entries are purely numeric
# before casting (the recorded run printed 10840).
print(df['Reviews'].str.isnumeric().sum())
# Cast Reviews from str to 64-bit integers.
df['Reviews'] = df['Reviews'].astype('i8')
10840
# Size: inspect the raw values first.
df['Size'].value_counts(dropna=False)
# Rewrite the unit suffixes as exponents so the strings read as
# scientific notation: 'M' becomes 'e+6' and 'k' becomes 'e+3'.
size_text = df['Size'].str.replace('M', 'e+6')
df['Size'] = size_text.str.replace('k', 'e+3')
# A direct str -> float cast, df['Size'].astype('f8'), is not possible yet:
# ValueError: could not convert string to float: 'Varies with device'
# The remaining non-numeric entries are located and handled next.
def is_convertable(v):
    """Return True when ``float(v)`` succeeds, False otherwise.

    Used to find the entries of a string column that cannot be cast to a
    floating-point number.

    Args:
        v: Any value; typically a string taken from a DataFrame column.

    Returns:
        bool: True if ``float(v)`` does not raise, else False.
    """
    try:
        float(v)
    except (TypeError, ValueError):
        # ValueError: text that is not a number (e.g. 'Varies with device').
        # TypeError: objects float() does not accept at all (e.g. None).
        return False
    return True
# Which Size strings still cannot be parsed as a float?
temp = df['Size'].apply(is_convertable)
df.loc[~temp, 'Size'].value_counts()
# Recorded result: Varies with device 1695 (Name: Size, dtype: int64)
# Substitute '0' for that placeholder text so every entry is numeric.
df['Size'] = df['Size'].str.replace('Varies with device', '0')
# Repeat the check; no unparsable value should remain.
temp = df['Size'].apply(is_convertable)
df.loc[~temp, 'Size'].value_counts()
# Recorded result: Series([], Name: Size, dtype: int64)
Series([], Name: Size, dtype: int64)
# Size: turn the scientific-notation strings into integers. Text such as
# '19e+6' cannot be parsed as an integer directly, so cast to float ('f8')
# first and then to 64-bit int ('i8').
df['Size'] = df['Size'].astype('f8').astype('i8')
# Replace the 0 placeholders with the column mean.
# NOTE(review): the mean is computed over the whole column, placeholder
# zeros included, so it is lower than the mean of the known sizes. Kept
# as-is because the statistics recorded below were produced this way.
df['Size'] = df['Size'].replace(0, df['Size'].mean())
# Installs: inspect the raw values first.
df['Installs'].value_counts(dropna=False)
# Strip the trailing '+' and the thousands separators. regex=False makes
# the match literal on every pandas version ('+' is a regex metacharacter).
df['Installs'] = df['Installs'].str.replace('+', '', regex=False)
df['Installs'] = df['Installs'].str.replace(',', '', regex=False)
# Cast the cleaned strings to 64-bit integers.
df['Installs'] = df['Installs'].astype('i8')
df.describe()
Rating
Reviews
Size
Installs
count
10840.000000
1.084000e+04
1.084000e+04
1.084000e+04
mean
4.191757
4.441529e+05
2.099045e+07
1.546434e+07
std
0.478907
2.927761e+06
2.078345e+07
8.502936e+07
min
1.000000
0.000000e+00
8.500000e+03
0.000000e+00
25%
4.100000
3.800000e+01
5.900000e+06
1.000000e+03
50%
4.200000
2.094000e+03
1.800000e+07
1.000000e+05
75%
4.500000
5.477550e+04
2.600000e+07
5.000000e+06
max
5.000000
7.815831e+07
1.000000e+08
1.000000e+09
# Type: df.count() showed a missing value in this column; the frequency
# table confirms the NaN. Drop the affected row(s).
df['Type'].value_counts(dropna=False)
df.drop(df[df['Type'].isnull()].index, inplace=True)
# Price: stored as strings with a leading dollar sign (the original note
# cites '$394.99').
df['Price'].value_counts()
# regex=False makes the match literal on every pandas version; as a
# regular expression '$' would mean "end of string".
df['Price'] = df['Price'].str.replace('$', '', regex=False)
df['Price']
# Cast the cleaned strings to float.
df['Price'] = df['Price'].astype('f8')
# Summary statistics now that every numeric column has a numeric dtype.
df.describe()
Rating
Reviews
Size
Installs
Price
count
10839.000000
1.083900e+04
1.083900e+04
1.083900e+04
10839.000000
mean
4.191757
4.441939e+05
2.099071e+07
1.546577e+07
1.027463
std
0.478929
2.927893e+06
2.078439e+07
8.503315e+07
15.950436
min
1.000000
0.000000e+00
8.500000e+03
0.000000e+00
0.000000
25%
4.100000
3.800000e+01
5.900000e+06
3.000000e+03
0.000000
50%
4.200000
2.094000e+03
1.800000e+07
1.000000e+05
0.000000
75%
4.500000
5.478300e+04
2.600000e+07
5.000000e+06
0.000000
max
5.000000
7.815831e+07
1.000000e+08
1.000000e+09
400.000000
# Number of distinct categories.
df.Category.unique().size
33
# The 10 categories containing the most apps.
rows_per_category = df.groupby('Category').count()
ranked_by_apps = rows_per_category.sort_values(by='App', ascending=False)
df_Cate = ranked_by_apps['App'].head(10)
df_Cate
Category
FAMILY 1971
GAME 1144
TOOLS 843
MEDICAL 463
BUSINESS 460
PRODUCTIVITY 424
PERSONALIZATION 392
COMMUNICATION 387
SPORTS 384
LIFESTYLE 382
Name: App, dtype: int64
# Mean install count per category, highest first (top five shown).
# Select the column before aggregating: calling mean() on the whole group
# would also try to average the text columns, which raises TypeError on
# pandas >= 2.0.
df_Ins = df.groupby('Category')['Installs'].mean()
df_Ins.sort_values(ascending=False).head()
Category
COMMUNICATION 8.435989e+07
SOCIAL 4.769447e+07
VIDEO_PLAYERS 3.555430e+07
PRODUCTIVITY 3.343418e+07
GAME 3.066960e+07
Name: Installs, dtype: float64
# Paid apps only: number of apps per category, and the five categories
# with the most paid apps.
is_paid = df['Type'] == 'Paid'
df_Paid = df[is_paid]
paid_counts = df_Paid.groupby('Category', as_index=False).count()
df_Paid_Cate = paid_counts[['Category', 'App']].rename(columns={'App': 'App_num'})
df_Paid_Cate_head = df_Paid_Cate.sort_values('App_num', ascending=False).head()
print(df_Paid_Cate_head)
# NOTE(review): this ratio is the number of rows shown (the head) divided
# by the number of categories that have paid apps; it is not a share of apps.
head_ratio = len(df_Paid_Cate_head) / len(df_Paid_Cate)
print(' ' + str(head_ratio))
Category App_num
9 FAMILY 191
17 MEDICAL 109
12 GAME 83
20 PERSONALIZATION 83
26 TOOLS 78
0.16666666666666666
# Free vs. paid: mean of every numeric column per Type.
# numeric_only=True skips the text columns (App, Category); without it
# pandas >= 2.0 raises TypeError when it tries to average them.
df_Type = df.groupby('Type').mean(numeric_only=True)
# RIO: mean reviews divided by mean installs for each Type.
df_Type['RIO'] = df_Type['Reviews'] / df_Type['Installs']
df_Type
# --> in the recorded output, RIO is higher for Paid (0.128) than for Free (0.029).
Rating
Reviews
Size
Installs
Price
RIO
Type
Free
4.186933
478661.096026
2.113359e+07
1.669095e+07
0.000000
0.028678
Paid
4.252299
11673.312500
1.919770e+07
9.119510e+04
13.920837
0.128004
# Mean of every numeric column per (Category, Type) pair, i.e. free and
# paid apps compared within each category.
# numeric_only=True skips the text column App; without it pandas >= 2.0
# raises TypeError when it tries to average it.
df_Cate_Type = df.groupby(['Category', 'Type'], as_index=False).mean(numeric_only=True)
df_Cate_Type
Category
Type
Rating
Reviews
Size
Installs
Price
0
ART_AND_DESIGN
Free
4.331859
27617.322581
1.299768e+07
2.005195e+06
0.0000
1
ART_AND_DESIGN
Paid
4.733333
722.000000
5.200000e+06
5.333333e+03
1.9900
2
AUTO_AND_VEHICLES
Free
4.185580
14140.280488
1.991652e+07
6.473178e+05
0.0000
3
AUTO_AND_VEHICLES
Paid
4.327838
1387.666667
1.705070e+07
1.671667e+04
4.4900
4
BEAUTY
Free
4.260553
7476.226415
1.428892e+07
5.131519e+05
0.0000
...
...
...
...
...
...
...
...
58
TRAVEL_AND_LOCAL
Paid
4.130586
1506.083333
3.795035e+07
1.525500e+04
4.1625
59
VIDEO_PLAYERS
Free
4.074131
645420.005848
1.666267e+07
3.638557e+07
0.0000
60
VIDEO_PLAYERS
Paid
4.100000
3341.750000
1.411407e+07
1.775000e+04
2.6150
61
WEATHER
Free
4.227710
195517.486486
1.471595e+07
5.747142e+06
0.0000
62
WEATHER
Paid
4.348970
17055.125000
1.411302e+07
1.015000e+05
4.0525
63 rows × 7 columns
# Mean Reviews and Rating per category, most-reviewed categories first.
# Changes from the original cell:
# - the bare df['Reviews'].astype('f8') discarded its result and is removed;
# - the two columns are selected before aggregating, so the text columns
#   are never averaged (that raises TypeError on pandas >= 2.0);
# - sort_values returns a new frame instead of sorting a column slice in
#   place, which avoids SettingWithCopyWarning.
df_Cate_Rev_Rat = (
    df.groupby('Category', as_index=False)[['Reviews', 'Rating']]
    .mean()
    .sort_values('Reviews', ascending=False)
)
df_Cate_Rev_Rat.head()
Category
Reviews
Rating
6
COMMUNICATION
2.107138e+06
4.163601
27
SOCIAL
2.105903e+06
4.247808
14
GAME
1.385859e+06
4.282441
24
PHOTOGRAPHY
6.373631e+05
4.192094
31
VIDEO_PLAYERS
6.307439e+05
4.074722
# Pairwise correlation of the numeric columns. The original note cites the
# values 0.5 and 0.3 -- presumably the cut-offs used for strong / weak
# correlation; TODO confirm.
# Restrict to numeric dtypes explicitly: on pandas >= 2.0, corr() raises
# when the frame still holds text columns such as App or Category.
df.select_dtypes(include='number').corr()
Rating
Reviews
Size
Installs
Price
Rating
1.000000
0.068018
0.070416
0.051221
-0.020190
Reviews
0.068018
1.000000
0.118931
0.643121
-0.009668
Size
0.070416
0.118931
1.000000
0.068670
-0.021610
Installs
0.051221
0.643121
0.068670
1.000000
-0.011691
Price
-0.020190
-0.009668
-0.021610
-0.011691
1.000000
ReviewsとInstallsの間に強い相関(相関係数 約0.64)があることがわかります。