映画評価データ

9417 ワード

import pandas as pd
import os #  os  
# Text encoding passed to every read_csv call below.
encoding = 'latin1'

# Locations of the three MovieLens data files. os.path.expanduser only
# expands a leading "~" or "~user"; these relative paths pass through as-is.
upath = os.path.expanduser('ch02/movielens/users.dat')
rpath = os.path.expanduser('ch02/movielens/ratings.dat')
mpath = os.path.expanduser('ch02/movielens/movies.dat')

# Column names assigned to each table (header=None below means pandas does
# not take them from the files).
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
mnames = ['movie_id', 'title', 'genres']

# Fields are separated by "::". A separator longer than one character is
# interpreted as a regex, which the C parser does not support, so the python
# engine is requested explicitly instead of relying on the implicit fallback
# (which emits a ParserWarning on every call).
users = pd.read_csv(upath, sep='::', header=None, names=unames,
                    encoding=encoding, engine='python')
ratings = pd.read_csv(rpath, sep='::', header=None, names=rnames,
                      encoding=encoding, engine='python')
movies = pd.read_csv(mpath, sep='::', header=None, names=mnames,
                     encoding=encoding, engine='python')
/Users/zhongyaode/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  if __name__ == '__main__':
/Users/zhongyaode/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:2: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  from ipykernel import kernelapp as app
/Users/zhongyaode/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:3: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  app.launch_new_instance()
users[:20]  # slice syntax; displays the first 20 rows of the DataFrame
ratings[:5]
movies[:5]
ラティングス
ここではpandasを利用して、まずratingsとusersを統合し、さらにその結果とmoviesを統合します。
pandasは列名の重複から、どの列が結合(マージ)キーであるかを推定します。
# Merge ratings with users, then merge the result with movies. No `on`
# argument is given, so pd.merge joins on the column names the frames share.
data = pd.merge(pd.merge(ratings, users), movies)
data
data['rating'].mean()
3.5815644530293169
data.loc[1]  # row whose index label is 1 (.ix has been removed from pandas)
user_id                                            2
movie_id                                        1193
rating                                             5
timestamp                                  978298413
gender                                             M
age                                               56
occupation                                        16
zip                                            70072
title         One Flew Over the Cuckoo's Nest (1975)
genres                                         Drama
Name: 1, dtype: object
性別ごとに各映画の平均評価を計算します。これには pivot_table を使用できます。
# Mean rating per title, with one column per gender value.
mean_ratings = data.pivot_table('rating', index='title', columns='gender', aggfunc='mean')
mean_ratings[:7]
# Count the ratings each title received: groupby('title').size() returns a
# Series indexed by title (used below to keep titles with >= 250 ratings).
ratings_by_title = data.groupby('title').size()
ratings_by_title[:10]
title
$1,000,000 Duck (1971)                37
'Night Mother (1986)                  70
'Til There Was You (1997)             52
'burbs, The (1989)                   303
...And Justice for All (1979)        199
1-900 (1994)                           2
10 Things I Hate About You (1999)    700
101 Dalmatians (1961)                565
101 Dalmatians (1996)                364
12 Angry Men (1957)                  616
dtype: int64
# Index of the titles whose rating count is at least 250.
active_titles=ratings_by_title.index[ratings_by_title>=250]
active_titles
Index([''burbs, The (1989)', '10 Things I Hate About You (1999)',
       '101 Dalmatians (1961)', '101 Dalmatians (1996)', '12 Angry Men (1957)',
       '13th Warrior, The (1999)', '2 Days in the Valley (1996)',
       '20,000 Leagues Under the Sea (1954)', '2001: A Space Odyssey (1968)',
       '2010 (1984)',
       ...
       'X-Men (2000)', 'Year of Living Dangerously (1982)',
       'Yellow Submarine (1968)', 'You've Got Mail (1998)',
       'Young Frankenstein (1974)', 'Young Guns (1988)',
       'Young Guns II (1990)', 'Young Sherlock Holmes (1985)',
       'Zero Effect (1998)', 'eXistenZ (1999)'],
      dtype='object', name='title', length=1216)
# Keep only the rows of mean_ratings whose title is in active_titles.
mean_ratings = mean_ratings.loc[active_titles]
mean_ratings
女性に最も高く評価されている映画を調べるために、F列を降順に並べ替えます。
# Sort by the 'F' column, highest mean rating first.
# NOTE: sort_index(by=...) is deprecated; sort_values is used instead.
top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)
top_female_ratings[:10]
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

data['age'].mean()
29.738313692438279
data['age'].max()
56
data['age'].min()
1
data['age'].var()#var  std   
138.10909427256377
# Histogram of the 'age' column using 5 bins.
x = data['age']
numBins = 5
fig, ax = plt.subplots()  # one figure holding a single axes
ax.hist(x, numBins, color='red', alpha=0.8, rwidth=0.5)
plt.title(u'age')
plt.show()

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
%matplotlib inline
%config InlineBackend.figure_format='retina'
def normfun(x, mu, sigma):
    """Return the normal (Gaussian) probability density at *x*.

    Computes exp(-(x - mu)**2 / (2 * sigma**2)) / (sigma * sqrt(2 * pi)).

    Args:
        x: Point(s) at which to evaluate the density; a scalar or a NumPy
            array (the expression is evaluated element-wise).
        mu: Mean of the distribution.
        sigma: Standard deviation of the distribution.

    Returns:
        The density value(s), with the same shape as *x*.
    """
    pdf = np.exp(-((x - mu) ** 2) / (2 * sigma ** 2)) / (sigma * np.sqrt(2 * np.pi))
    return pdf
p = data['age']
# mean/std of the ages must exist before the pdf is evaluated; in this file
# they were otherwise only assigned further down.
mean = p.mean()
std = p.std()
# Evaluate the normal pdf on the integer grid 1..59 and draw the curve.
x = np.arange(1, 60, 1)
y = normfun(x, mean, std)
plt.plot(x, y)
# bins: number of histogram bars; rwidth: relative bar width;
# density=True normalizes the histogram so it is comparable with the pdf
# (the old `normed` keyword has been removed from matplotlib).
plt.hist(p, bins=6, rwidth=0.9, density=True)
plt.title("age")
plt.xlabel("stakes")
plt.ylabel("Probability")
plt.show()
# Evaluate the normal pdf on the integer grid 1..59 and draw the curve.
x = np.arange(1, 60, 1)
y = normfun(x, mean, std)
plt.plot(x, y)
# Same histogram with only 2 bins. density=True normalizes the bar heights
# (the old `normed` keyword has been removed from matplotlib).
plt.hist(p, bins=2, rwidth=0.9, density=True)
plt.title("time")
plt.xlabel("stakes")
plt.ylabel("Probability")
plt.show()
# Evaluate the normal pdf on the integer grid 1..59 and draw the curve.
# NOTE(review): mean and std are assigned further down in this file —
# confirm the intended cell execution order.
x=np.arange(1,60,1)
y=normfun(x,mean,std)
plt.plot(x,y)
![output_34_0.png](http://upload-images.jianshu.io/upload_images/2007820-408b82f75a63a3f3.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
![](http://upload-images.jianshu.io/upload_images/2007820-cb7ffce4a59d0504.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
![](http://upload-images.jianshu.io/upload_images/2007820-a8fce3d1a47c8184.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
![](http://upload-images.jianshu.io/upload_images/2007820-9271b213266a0748.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
![](http://upload-images.jianshu.io/upload_images/2007820-0554285cc748af49.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
# Histogram of p with 6 bins; rwidth is the relative bar width.
# density=True normalizes the bar heights (the old `normed` keyword has been
# removed from matplotlib).
plt.hist(p, bins=6, rwidth=0.9, density=True)
plt.title("time")
plt.xlabel("stakes")
plt.ylabel("Probability")
plt.show()
len(p)#     
1000209
std=p.std()
std
11.751982567744209
mean=p.mean()
std=p.std()
    :        29.74,        20~30  。
    11.75,     68%     29.74-11.75 29.74+11.75  
    10       ,       20~60  
# First 100000 ages, roughly 10% of the 1000209 rows. This is a leading
# slice, not a random sample.
a = p[:100000]
# Evaluate the normal pdf on the integer grid 1..59 and draw the curve.
x = np.arange(1, 60, 1)
y = normfun(x, mean, std)
plt.plot(x, y)
# density=True normalizes the bar heights (the old `normed` keyword has been
# removed from matplotlib).
plt.hist(a, bins=6, rwidth=0.9, density=True)
plt.title("age")
plt.xlabel("stakes")
plt.ylabel("Probability")
plt.show()
# First 10000 ages, roughly 1% of the 1000209 rows. This is a leading
# slice, not a random sample.
c = p[:10000]
# Evaluate the normal pdf on the integer grid 1..59 and draw the curve.
x = np.arange(1, 60, 1)
y = normfun(x, mean, std)
plt.plot(x, y)
# density=True normalizes the bar heights (the old `normed` keyword has been
# removed from matplotlib).
plt.hist(c, bins=6, rwidth=0.9, density=True)
plt.title("age")
plt.xlabel("stakes")
plt.ylabel("Probability")
plt.show()
       :
##        ,