映画評価データ

9417 ワード

import pandas as pd
import os #  os  
# Text encoding handed to every read_csv call below.
encoding = 'latin1'

# pandas.read_csv / pandas.read_table load a delimited text file into a
# pandas DataFrame (a 2-D table whose individual columns are Series).

# os.path helpers for building file paths:
#   os.path.expanduser(path) replaces a leading "~" or "~user" with that
#   user's home directory;
#   os.path.expandvars(path) expands "$name" / "${name}" environment variables.
# The paths below contain no "~", so expanduser returns them unchanged and
# they are resolved relative to the current working directory.
upath = os.path.expanduser('ch02/movielens/users.dat')
rpath = os.path.expanduser('ch02/movielens/ratings.dat')
mpath = os.path.expanduser('ch02/movielens/movies.dat')

# Column names assigned to each table (the calls below pass header=None).
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
mnames = ['movie_id', 'title', 'genres']

# sep='::' is longer than one character, so pandas treats it as a regular
# expression, which only the Python parsing engine supports.  Requesting
# engine='python' explicitly avoids the "Falling back to the 'python' engine"
# ParserWarning that is otherwise emitted once per call.
# header=None: take the column labels from `names` instead of the first row.
users = pd.read_csv(upath, sep='::', header=None, names=unames,
                    encoding=encoding, engine='python')
ratings = pd.read_csv(rpath, sep='::', header=None, names=rnames,
                      encoding=encoding, engine='python')
movies = pd.read_csv(mpath, sep='::', header=None, names=mnames,
                     encoding=encoding, engine='python')

/Users/zhongyaode/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  if __name__ == '__main__':
/Users/zhongyaode/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:2: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  from ipykernel import kernelapp as app
/Users/zhongyaode/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:3: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  app.launch_new_instance()

users[:20]  # slice syntax: display the first 20 rows of the DataFrame
ratings[:5]
movies[:5]
ratings
# Use pandas to merge ratings with users, then merge movies into the result.
# pandas infers the join columns from the column names the tables share.
data = pd.merge(pd.merge(ratings, users), movies)
data

data['rating'].mean()

3.5815644530293169

data.loc[1]  # row labelled 1; .loc replaces the .ix indexer, which pandas has removed

user_id                                            2
movie_id                                        1193
rating                                             5
timestamp                                  978298413
gender                                             M
age                                               56
occupation                                        16
zip                                            70072
title         One Flew Over the Cuckoo's Nest (1975)
genres                                         Drama
Name: 1, dtype: object

# Compute each movie's mean rating broken down by gender; pivot_table does
# this directly (rows = title, columns = gender, cell = mean of 'rating').
mean_ratings = data.pivot_table('rating', index='title', columns='gender', aggfunc='mean')
mean_ratings[:7]

# To filter out movies with fewer than 250 ratings, first group the rows by
# title; size() then yields a Series holding the number of rows in each group.
ratings_by_title = data.groupby('title').size()

ratings_by_title[:10]

title
$1,000,000 Duck (1971)                37
'Night Mother (1986)                  70
'Til There Was You (1997)             52
'burbs, The (1989)                   303
...And Justice for All (1979)        199
1-900 (1994)                           2
10 Things I Hate About You (1999)    700
101 Dalmatians (1961)                565
101 Dalmatians (1996)                364
12 Angry Men (1957)                  616
dtype: int64

# Titles whose group size is at least 250.
active_titles = ratings_by_title[ratings_by_title >= 250].index
active_titles

Index([''burbs, The (1989)', '10 Things I Hate About You (1999)',
       '101 Dalmatians (1961)', '101 Dalmatians (1996)', '12 Angry Men (1957)',
       '13th Warrior, The (1999)', '2 Days in the Valley (1996)',
       '20,000 Leagues Under the Sea (1954)', '2001: A Space Odyssey (1968)',
       '2010 (1984)',
       ...
       'X-Men (2000)', 'Year of Living Dangerously (1982)',
       'Yellow Submarine (1968)', 'You've Got Mail (1998)',
       'Young Frankenstein (1974)', 'Young Guns (1988)',
       'Young Guns II (1990)', 'Young Sherlock Holmes (1985)',
       'Zero Effect (1998)', 'eXistenZ (1999)'],
      dtype='object', name='title', length=1216)

# Restrict the pivot table to the titles selected in active_titles.
mean_ratings = mean_ratings.loc[active_titles]
mean_ratings
# To see which movies female viewers rated highest, sort by the 'F' column
# in descending order.
# sort_index(by=...) is no longer supported; sort_values is its replacement.
top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)
top_female_ratings[:10]

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

data['age'].mean()

29.738313692438279

data['age'].max()

data['age'].min()

data['age'].var()  # var() is the variance; std() is the standard deviation

138.10909427256377

# Histogram of the 'age' column drawn with five bins.
x = data['age']
numBins = 5
fig, ax = plt.subplots()
ax.hist(x, bins=numBins, color='red', alpha=0.8, rwidth=0.5)
ax.set_title(u'age')
plt.show()


import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
%matplotlib inline
%config InlineBackend.figure_format='retina'
def normfun(x, mu, sigma):
    """Return the normal (Gaussian) probability density evaluated at x.

    Computes exp(-(x - mu)**2 / (2 * sigma**2)) / (sigma * sqrt(2 * pi)).

    Args:
        x: Point(s) at which to evaluate the density; a scalar or a NumPy
            array (the arithmetic is applied element-wise).
        mu: Mean of the distribution.
        sigma: Standard deviation of the distribution.

    Returns:
        The density value(s), with the same shape as ``x``.
    """
    pdf = np.exp(-((x - mu) ** 2) / (2 * sigma ** 2)) / (sigma * np.sqrt(2 * np.pi))
    return pdf

p = data['age']

# Evaluate the normal density on the integer grid 1..59, using the mean and
# standard deviation of the ages as its parameters.  They are computed inline
# because the module-level `mean` / `std` names are only assigned further down.
x = np.arange(1, 60, 1)
y = normfun(x, p.mean(), p.std())
plt.plot(x, y)
# bins: number of histogram bars; rwidth: bar width relative to the bin width.
# density=True normalises the histogram so it shares a scale with the density
# curve; it replaces `normed`, which current Matplotlib no longer accepts.
plt.hist(p, bins=6, rwidth=0.9, density=True)
plt.title("age")
plt.xlabel("stakes")
plt.ylabel("Probability")
plt.show()

# Same overlay as before, with the histogram reduced to two bins.
# The curve parameters are computed inline from p because the module-level
# `mean` / `std` names are only assigned further down.
x = np.arange(1, 60, 1)
y = normfun(x, p.mean(), p.std())
plt.plot(x, y)
# bins: number of histogram bars; rwidth: bar width relative to the bin width.
# density=True replaces `normed`, which current Matplotlib no longer accepts.
plt.hist(p, bins=2, rwidth=0.9, density=True)
plt.title("time")
plt.xlabel("stakes")
plt.ylabel("Probability")
plt.show()

# Normal density on the integer grid 1..59, parameterised by the mean and
# standard deviation of p (computed inline; `mean` / `std` are assigned later).
x = np.arange(1, 60, 1)
y = normfun(x, p.mean(), p.std())
plt.plot(x, y)
![output_34_0.png](http://upload-images.jianshu.io/upload_images/2007820-408b82f75a63a3f3.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
![](http://upload-images.jianshu.io/upload_images/2007820-cb7ffce4a59d0504.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
![](http://upload-images.jianshu.io/upload_images/2007820-a8fce3d1a47c8184.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
![](http://upload-images.jianshu.io/upload_images/2007820-9271b213266a0748.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
![](http://upload-images.jianshu.io/upload_images/2007820-0554285cc748af49.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
# bins: number of histogram bars; rwidth: bar width relative to the bin width.
# density=True normalises the histogram; it replaces `normed`, which current
# Matplotlib no longer accepts.
plt.hist(p, bins=6, rwidth=0.9, density=True)
plt.title("time")
plt.xlabel("stakes")
plt.ylabel("Probability")
plt.show()

len(p)#

std=p.std()
std

11.751982567744209

# Mean and standard deviation of the age series; the plotting cells below
# pass them to normfun as mu and sigma.
mean=p.mean()
std=p.std()

    結論：平均年齢は 29.74 歳で、ユーザーの年齢は主に 20~30 歳に集中している。
    標準偏差は 11.75 なので、正規分布を仮定すると約 68% のユーザーが 29.74-11.75 から 29.74+11.75 の間に入る。
    10       ，       20～60

a = p[:100000]  # first 100000 rows (noted as about 10% of the data in the original comment)

# Overlay the normal density (parameters: mean / std of the full series) on a
# histogram of the subsample.
x = np.arange(1, 60, 1)
y = normfun(x, mean, std)
plt.plot(x, y)
# bins: number of histogram bars; rwidth: bar width relative to the bin width.
# density=True replaces `normed`, which current Matplotlib no longer accepts.
plt.hist(a, bins=6, rwidth=0.9, density=True)
plt.title("age")
plt.xlabel("stakes")
plt.ylabel("Probability")
plt.show()

c = p[:10000]  # first 10000 rows (noted as about 1% of the data in the original comment)

# Overlay the normal density (parameters: mean / std of the full series) on a
# histogram of the subsample.
x = np.arange(1, 60, 1)
y = normfun(x, mean, std)
plt.plot(x, y)
# bins: number of histogram bars; rwidth: bar width relative to the bin width.
# density=True replaces `normed`, which current Matplotlib no longer accepts.
plt.hist(c, bins=6, rwidth=0.9, density=True)
plt.title("age")
plt.xlabel("stakes")
plt.ylabel("Probability")
plt.show()

       ：
##        ，

(二)二分木の抽象データ型の定義と走査