Pythonデータ分析-google play storeのappデータ分析

35237 ワード

#!/usr/bin/env python
# coding: utf-8
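# Goal: explore Google Play Store app data by category, installs, reviews and ratings
# Top 10 categories by number of APPs; category with the highest average installs
# Categories with the most paid apps, and their share
# Free vs. paid apps: reviews relative to installs
# Per app category / type (free or paid): mean installs, reviews and rating
# Categories with the most reviews, and their ratings
# Import packages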
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Columns of interest: App, Category, Type, Size, Price, Reviews, Installs, Rating
# Only the first eight CSV columns (positions 0-7) are loaded.
df = pd.read_csv('./googleplaystore.csv', usecols=list(range(8)))
df.head()  # preview the first rows

App
Category
Rating
Reviews
Size
Installs
Type
Price
0
Photo Editor & Candy Camera & Grid & ScrapBook
ART_AND_DESIGN
4.1
159
19M
10,000+
Free
0
1
Coloring book moana
ART_AND_DESIGN
3.9
967
14M
500,000+
Free
0
2
U Launcher Lite – FREE Live Cool Themes, Hide ...
ART_AND_DESIGN
4.7
87510
8.7M
5,000,000+
Free
0
3
Sketch - Draw & Paint
ART_AND_DESIGN
4.5
215644
25M
50,000,000+
Free
0
4
Pixel Draw - Number Art Coloring Book
ART_AND_DESIGN
4.3
967
2.8M
100,000+
Free
0
# Inspect column dtypes and non-null counts
df.info()

RangeIndex: 10841 entries, 0 to 10840
Data columns (total 8 columns):
App         10841 non-null object
Category    10841 non-null object
Rating      9367 non-null float64
Reviews     10841 non-null object
Size        10841 non-null object
Installs    10841 non-null object
Type        10840 non-null object
Price       10841 non-null object
dtypes: float64(1), object(7)
memory usage: 677.7+ KB
# Number of rows and columns
df.shape
(10841, 8)
# Non-null count per column
df.count()
# --> Rating and Type have fewer non-null entries than the other loaded columns
App         10841
Category    10841
Rating       9367
Reviews     10841
Size        10841
Installs    10841
Type        10840
Price       10841
dtype: int64
# Number of rows that exactly repeat an earlier row
int(df.duplicated().sum())
485
# App column: how many distinct app names there are
pd.unique(df['App']).size  # number of unique App values
# Scratch notes from exploration:
# df[df['App'].duplicated()]  # rows whose App name repeats
# df['App'].value_counts()
9660
# Category column
df['Category'].value_counts(dropna=False)  # exposes an anomalous value, '1.9'
# Remove every row carrying that malformed category
df.drop(index=df.index[df['Category'] == '1.9'], inplace=True)
# Rating column
df['Rating'].value_counts(dropna=False)  # shows 1474 NaN ratings
# This is the shortfall df.count() reports for Rating.
# Fill the missing ratings with the column mean. The result is assigned back
# rather than calling fillna(inplace=True) on df['Rating']: that in-place call
# operates on an intermediate Series and is not guaranteed to update df.
df['Rating'] = df['Rating'].fillna(df['Rating'].mean())
# Reviews column
# df['Reviews'].value_counts(dropna=False)  # too many distinct values to scan by eye
# Instead, count how many entries are made up of digits only.
reviews_all_digits = df['Reviews'].str.isnumeric()
print(reviews_all_digits.sum())  # prints 10840 (the raw file had 10841 rows)
# Convert the Reviews strings to 64-bit integers
df['Reviews'] = df['Reviews'].astype('int64')
10840
# Size column
df['Size'].value_counts(dropna=False)  # Size strings carry unit suffixes
# Rewrite each unit suffix as an exponent so the text parses as a float:
# 'M' (mega) first, then 'k' (kilo).
for unit_suffix, exponent_text in (('M', 'e+6'), ('k', 'e+3')):
    df['Size'] = df['Size'].str.replace(unit_suffix, exponent_text)
# Next step would be str -> float:
# df['Size'] = df['Size'].astype('f8')
# but that fails with ValueError: could not convert string to float: 'Varies with device'
# so the unconvertible values have to be located first.

#                   
def is_convertable(v):
    """Return True when ``float(v)`` succeeds, False otherwise.

    Both ValueError (unparseable text such as 'Varies with device') and
    TypeError (objects float() does not accept, e.g. None) count as
    "not convertible", so the helper never raises when applied to a column.
    """
    try:
        float(v)
    except (TypeError, ValueError):
        return False
    return True
    
# Flag which Size strings can be parsed as floats
temp = df['Size'].apply(is_convertable)
# `~` is the boolean negation for a mask; .loc avoids chained indexing
df.loc[~temp, 'Size'].value_counts()  # the values that cannot be converted
# Result: Varies with device    1695  Name: Size, dtype: int64

# Swap the placeholder text for '0' so the column can be cast to a number
df['Size'] = df['Size'].str.replace('Varies with device', '0')

# Re-check: nothing unconvertible should remain
temp = df['Size'].apply(is_convertable)
df.loc[~temp, 'Size'].value_counts()  # the values that cannot be converted
# Result: Series([], Name: Size, dtype: int64)
Series([], Name: Size, dtype: int64)
# Size column - numeric conversion
# Strings such as '8.7e+6' cannot be cast to int directly, so go through float.
size_values = df['Size'].astype('f8').astype('i8')

# A Size of 0 is the stand-in written for 'Varies with device'. Fill those
# rows with the mean of the known sizes only: averaging over the placeholder
# zeros as well would pull the fill value down.
size_known = size_values != 0
df['Size'] = size_values.where(size_known, size_values[size_known].mean())
# Installs column
# Inspect the raw values
df['Installs'].value_counts(dropna=False)
# Strip the '+' and ',' characters. regex=False makes pandas treat '+' as a
# literal character instead of a regex quantifier, whatever the pandas version.
df['Installs'] = (
    df['Installs']
    .str.replace('+', '', regex=False)
    .str.replace(',', '', regex=False)
)
# Cast to 64-bit integers
df['Installs'] = df['Installs'].astype('i8')
df.describe()

Rating
Reviews
Size
Installs
count
10840.000000
1.084000e+04
1.084000e+04
1.084000e+04
mean
4.191757
4.441529e+05
2.099045e+07
1.546434e+07
std
0.478907
2.927761e+06
2.078345e+07
8.502936e+07
min
1.000000
0.000000e+00
8.500000e+03
0.000000e+00
25%
4.100000
3.800000e+01
5.900000e+06
1.000000e+03
50%
4.200000
2.094000e+03
1.800000e+07
1.000000e+05
75%
4.500000
5.477550e+04
2.600000e+07
5.000000e+06
max
5.000000
7.815831e+07
1.000000e+08
1.000000e+09
# Type column
# df.count() showed Type has fewer non-null entries than the other columns
df['Type'].value_counts(dropna=False)  # confirms a NaN entry
# Discard the rows whose Type is missing
df.dropna(subset=['Type'], inplace=True)
# Price column
df['Price'].value_counts()  # values look like $394.99
# Remove the currency sign. regex=False keeps '$' from being read as the
# end-of-string regex anchor, whatever the pandas version.
df['Price'] = df['Price'].str.replace('$', '', regex=False)
df['Price']
# Cast to float
df['Price'] = df['Price'].astype('f8')
# Cleaning finished:
# summary statistics of the numeric columns
df.describe()

Rating
Reviews
Size
Installs
Price
count
10839.000000
1.083900e+04
1.083900e+04
1.083900e+04
10839.000000
mean
4.191757
4.441939e+05
2.099071e+07
1.546577e+07
1.027463
std
0.478929
2.927893e+06
2.078439e+07
8.503315e+07
15.950436
min
1.000000
0.000000e+00
8.500000e+03
0.000000e+00
0.000000
25%
4.100000
3.800000e+01
5.900000e+06
3.000000e+03
0.000000
50%
4.200000
2.094000e+03
1.800000e+07
1.000000e+05
0.000000
75%
4.500000
5.478300e+04
2.600000e+07
5.000000e+06
0.000000
max
5.000000
7.815831e+07
1.000000e+08
1.000000e+09
400.000000
# Number of distinct categories
df.Category.unique().size
33
# Rank the categories by how many apps they hold and keep the ten largest
apps_per_category = df.groupby('Category')['App'].count()
df_Cate = apps_per_category.sort_values(ascending=False).head(10)
df_Cate
Category
FAMILY             1971
GAME               1144
TOOLS               843
MEDICAL             463
BUSINESS            460
PRODUCTIVITY        424
PERSONALIZATION     392
COMMUNICATION       387
SPORTS              384
LIFESTYLE           382
Name: App, dtype: int64
# Mean installs per category, largest first
# 'Installs' is selected before aggregating: taking mean() of the whole frame
# would also try to average the text columns (App, Type), which newer pandas
# versions reject with an error.
df_Ins = df.groupby('Category')['Installs'].mean()
df_Ins.sort_values(ascending=False).head()
Category
COMMUNICATION    8.435989e+07
SOCIAL           4.769447e+07
VIDEO_PLAYERS    3.555430e+07
PRODUCTIVITY     3.343418e+07
GAME             3.066960e+07
Name: Installs, dtype: float64
# Paid apps: which categories contain the most of them
df_Paid = df[df['Type'] == 'Paid']
# One row per category with the number of paid apps in it
df_Paid_Cate = (
    df_Paid.groupby('Category', as_index=False)
    .count()[['Category', 'App']]
    .rename(columns={'App': 'App_num'})
)
# Five categories with the most paid apps
df_Paid_Cate_head = df_Paid_Cate.sort_values('App_num', ascending=False).head()
print(df_Paid_Cate_head)
print('                     ' + str(len(df_Paid_Cate_head)/len(df_Paid_Cate)))
           Category  App_num
9            FAMILY      191
17          MEDICAL      109
12             GAME       83
20  PERSONALIZATION       83
26            TOOLS       78
                     0.16666666666666666
# Free vs. paid: per-Type means plus the reviews-to-installs ratio
# numeric_only=True limits the mean to numeric columns; without it newer
# pandas versions raise on the text columns (App, Category).
df_Type = df.groupby('Type').mean(numeric_only=True)
df_Type['RIO'] = df_Type['Reviews'] / df_Type['Installs']
df_Type
# --> Paid apps show the higher reviews-per-install ratio

Rating
Reviews
Size
Installs
Price
RIO
Type
Free
4.186933
478661.096026
2.113359e+07
1.669095e+07
0.000000
0.028678
Paid
4.252299
11673.312500
1.919770e+07
9.119510e+04
13.920837
0.128004
# Per category and type (Free / Paid): mean of every numeric column
# df.head()
# numeric_only=True keeps the text column App out of the mean, which newer
# pandas versions would otherwise reject with an error.
df_Cate_Type = df.groupby(['Category', 'Type'], as_index=False).mean(numeric_only=True)
df_Cate_Type

Category
Type
Rating
Reviews
Size
Installs
Price
0
ART_AND_DESIGN
Free
4.331859
27617.322581
1.299768e+07
2.005195e+06
0.0000
1
ART_AND_DESIGN
Paid
4.733333
722.000000
5.200000e+06
5.333333e+03
1.9900
2
AUTO_AND_VEHICLES
Free
4.185580
14140.280488
1.991652e+07
6.473178e+05
0.0000
3
AUTO_AND_VEHICLES
Paid
4.327838
1387.666667
1.705070e+07
1.671667e+04
4.4900
4
BEAUTY
Free
4.260553
7476.226415
1.428892e+07
5.131519e+05
0.0000
...
...
...
...
...
...
...
...
58
TRAVEL_AND_LOCAL
Paid
4.130586
1506.083333
3.795035e+07
1.525500e+04
4.1625
59
VIDEO_PLAYERS
Free
4.074131
645420.005848
1.666267e+07
3.638557e+07
0.0000
60
VIDEO_PLAYERS
Paid
4.100000
3341.750000
1.411407e+07
1.775000e+04
2.6150
61
WEATHER
Free
4.227710
195517.486486
1.471595e+07
5.747142e+06
0.0000
62
WEATHER
Paid
4.348970
17055.125000
1.411302e+07
1.015000e+05
4.0525
63 rows × 7 columns
# Mean reviews and rating per category, most-reviewed categories first
# - Only Reviews and Rating are aggregated, so mean() never touches text columns.
# - sort_values returns a new frame instead of sorting a column slice in place.
# - The former bare df['Reviews'].astype('f8') discarded its result and is dropped.
df_Cate_Rev_Rat = (
    df.groupby('Category', as_index=False)[['Reviews', 'Rating']]
    .mean()
    .sort_values('Reviews', ascending=False)
)
df_Cate_Rev_Rat.head()

Category
Reviews
Rating
6
COMMUNICATION
2.107138e+06
4.163601
27
SOCIAL
2.105903e+06
4.247808
14
GAME
1.385859e+06
4.282441
24
PHOTOGRAPHY
6.373631e+05
4.192094
31
VIDEO_PLAYERS
6.307439e+05
4.074722
# Correlation matrix (the original note cites 0.5 and 0.3 as reference thresholds)
# Restrict to numeric columns first: corr() on a frame that still holds text
# columns raises in newer pandas versions.
df.select_dtypes(include='number').corr()

Rating
Reviews
Size
Installs
Price
Rating
1.000000
0.068018
0.070416
0.051221
-0.020190
Reviews
0.068018
1.000000
0.118931
0.643121
-0.009668
Size
0.070416
0.118931
1.000000
0.068670
-0.021610
Installs
0.051221
0.643121
0.068670
1.000000
-0.011691
Price
-0.020190
-0.009668
-0.021610
-0.011691
1.000000
ReviewsとInstallsの間に強い相関があることがわかります