映画評価データ
9417 ワード
import pandas as pd
import os # os
# Text encoding handed to every read_csv call below (overrides the UTF-8 default).
encoding = 'latin1'

# pandas.read_table / pandas.read_csv load delimited text into a pandas DataFrame.
# os.path.expanduser(path) replaces a leading "~" or "~user" in path with that
# user's home directory; os.path.expandvars(path) substitutes "$name" and
# "${name}" with environment-variable values.  The relative paths below contain
# no "~", so expanduser returns them unchanged.
upath = os.path.expanduser('ch02/movielens/users.dat')
rpath = os.path.expanduser('ch02/movielens/ratings.dat')
mpath = os.path.expanduser('ch02/movielens/movies.dat')

# Column names for each table; they are supplied here because the files are
# read with header=None (no header row).
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
mnames = ['movie_id', 'title', 'genres']

# sep='::' is the field separator (a comma-separated file would use sep=',').
# A separator longer than one character is interpreted as a regular
# expression, which only the Python parsing engine supports; passing
# engine='python' explicitly avoids the ParserWarning about falling back
# from the C engine.
users = pd.read_csv(upath, sep='::', header=None, names=unames,
                    encoding=encoding, engine='python')
ratings = pd.read_csv(rpath, sep='::', header=None, names=rnames,
                      encoding=encoding, engine='python')
movies = pd.read_csv(mpath, sep='::', header=None, names=mnames,
                     encoding=encoding, engine='python')
/Users/zhongyaode/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
if __name__ == '__main__':
/Users/zhongyaode/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:2: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
from ipykernel import kernelapp as app
/Users/zhongyaode/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:3: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
app.launch_new_instance()
users[:20]  # slice syntax: display the first 20 rows of the DataFrame
ratings[:5]
movies[:5]
ratings
まず pandas を利用して ratings と users を統合し、さらに movies を統合します。
pandas は列名の重複から、どの列が結合キー(マージ列)かを推測します。
# Merge ratings with users, then merge the result with movies.  No `on=` is
# given, so pandas joins on the column names the two frames have in common.
data = pd.merge(pd.merge(ratings, users), movies)
data
data['rating'].mean()
3.5815644530293169
data.loc[1]  # row with index label 1 (.ix was removed from pandas; .loc is the label-based accessor)
user_id 2
movie_id 1193
rating 5
timestamp 978298413
gender M
age 56
occupation 16
zip 70072
title One Flew Over the Cuckoo's Nest (1975)
genres Drama
Name: 1, dtype: object
# Compute the mean rating of every movie, split by gender, using pivot_table:
# one row per title, one column per gender value.
mean_ratings = data.pivot_table('rating', index='title', columns='gender', aggfunc='mean')
mean_ratings[:7]
# Count how many ratings each title received: group the rows by title and
# take size(), which yields a Series indexed by title.  The counts are used
# to keep only titles with at least 250 ratings.
ratings_by_title = data.groupby(by='title').size()
ratings_by_title.head(10)
title
$1,000,000 Duck (1971) 37
'Night Mother (1986) 70
'Til There Was You (1997) 52
'burbs, The (1989) 303
...And Justice for All (1979) 199
1-900 (1994) 2
10 Things I Hate About You (1999) 700
101 Dalmatians (1961) 565
101 Dalmatians (1996) 364
12 Angry Men (1957) 616
dtype: int64
# Titles that received at least 250 ratings.
active_titles = ratings_by_title[ratings_by_title >= 250].index
active_titles
Index([''burbs, The (1989)', '10 Things I Hate About You (1999)',
'101 Dalmatians (1961)', '101 Dalmatians (1996)', '12 Angry Men (1957)',
'13th Warrior, The (1999)', '2 Days in the Valley (1996)',
'20,000 Leagues Under the Sea (1954)', '2001: A Space Odyssey (1968)',
'2010 (1984)',
...
'X-Men (2000)', 'Year of Living Dangerously (1982)',
'Yellow Submarine (1968)', 'You've Got Mail (1998)',
'Young Frankenstein (1974)', 'Young Guns (1988)',
'Young Guns II (1990)', 'Young Sherlock Holmes (1985)',
'Zero Effect (1998)', 'eXistenZ (1999)'],
dtype='object', name='title', length=1216)
# Keep only the titles listed in active_titles (label-based selection;
# .ix was removed from pandas, so .loc is used).
mean_ratings = mean_ratings.loc[active_titles]
mean_ratings

# To see which movies female raters scored highest, sort the 'F' column in
# descending order.  sort_index(by=...) is deprecated; use sort_values.
top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)
top_female_ratings[:10]
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
data['age'].mean()
29.738313692438279
data['age'].max()
56
data['age'].min()
1
data['age'].var()  # var() is the variance; std() is the standard deviation
138.10909427256377
# Histogram of the 'age' column, drawn with 5 bins on a single subplot.
x = data['age']
numBins = 5
fig = plt.figure()
ax = fig.add_subplot(111)
ax.hist(x, bins=numBins, color='red', alpha=0.8, rwidth=0.5)
plt.title('age')
plt.show()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'
def normfun(x, mu, sigma):
    """Return the normal (Gaussian) probability density evaluated at *x*.

    Args:
        x: Point or NumPy array of points at which to evaluate the density.
        mu: Mean of the distribution.
        sigma: Standard deviation of the distribution.

    Returns:
        exp(-(x - mu)**2 / (2 * sigma**2)) / (sigma * sqrt(2 * pi)), with
        the same shape as *x*.
    """
    pdf = np.exp(-((x - mu) ** 2) / (2 * sigma ** 2)) / (sigma * np.sqrt(2 * np.pi))
    return pdf
# Overlay a normal density curve on the normalized histogram of ages.
p = data['age']
# Compute the curve parameters here so this cell does not rely on a later
# cell having already defined `mean` and `std`.
mean = p.mean()
std = p.std()
x = np.arange(1, 60, 1)  # evaluation grid: integers 1..59
y = normfun(x, mean, std)
plt.plot(x, y)
# bins: number of bars; rwidth: bar width relative to the bin width;
# density=True scales the bars so the histogram integrates to 1
# (replaces the removed `normed` argument).
plt.hist(p, bins=6, rwidth=0.9, density=True)
plt.title("age")
plt.xlabel("stakes")
plt.ylabel("Probability")
plt.show()
# Same overlay as before, but with the histogram collapsed to 2 bins.
x = np.arange(1, 60, 1)  # evaluation grid: integers 1..59
# Take the curve parameters straight from `p` so this cell does not rely on
# `mean` / `std` having been defined by another cell.
y = normfun(x, p.mean(), p.std())
plt.plot(x, y)
# density=True scales the bars so the histogram integrates to 1
# (replaces the removed `normed` argument).
plt.hist(p, bins=2, rwidth=0.9, density=True)
plt.title("time")
plt.xlabel("stakes")
plt.ylabel("Probability")
plt.show()
# Plot the normal density curve on its own.
x = np.arange(1, 60, 1)  # evaluation grid: integers 1..59
# Take the curve parameters straight from `p` so this cell does not rely on
# `mean` / `std` having been defined by another cell.
y = normfun(x, p.mean(), p.std())
plt.plot(x, y)
![output_34_0.png](http://upload-images.jianshu.io/upload_images/2007820-408b82f75a63a3f3.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
![](http://upload-images.jianshu.io/upload_images/2007820-cb7ffce4a59d0504.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
![](http://upload-images.jianshu.io/upload_images/2007820-a8fce3d1a47c8184.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
![](http://upload-images.jianshu.io/upload_images/2007820-9271b213266a0748.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
![](http://upload-images.jianshu.io/upload_images/2007820-0554285cc748af49.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
# Normalized histogram of `p` with 6 bins.
# bins: number of bars; rwidth: bar width relative to the bin width;
# density=True scales the bars so the histogram integrates to 1
# (replaces the removed `normed` argument).
plt.hist(p, bins=6, rwidth=0.9, density=True)
plt.title("time")
plt.xlabel("stakes")
plt.ylabel("Probability")
plt.show()
len(p)#
1000209
std=p.std()
std
11.751982567744209
mean=p.mean()
std=p.std()
平均値: 29.74 で、20~30 の範囲にあります。
標準偏差: 11.75。正規分布であれば、約 68% のデータが 29.74-11.75 から 29.74+11.75 の間に入ります。
10 , 20~60
# First 100,000 rows of `p` (about 10% of the 1,000,209 rows).  This is a
# leading slice, not a random sample.
a = p[:100000]
x = np.arange(1, 60, 1)  # evaluation grid: integers 1..59
y = normfun(x, mean, std)  # curve still uses the full-data mean and std
plt.plot(x, y)
# density=True scales the bars so the histogram integrates to 1
# (replaces the removed `normed` argument).
plt.hist(a, bins=6, rwidth=0.9, density=True)
plt.title("age")
plt.xlabel("stakes")
plt.ylabel("Probability")
plt.show()
# First 10,000 rows of `p` (about 1% of the 1,000,209 rows).  This is a
# leading slice, not a random sample.
c = p[:10000]
x = np.arange(1, 60, 1)  # evaluation grid: integers 1..59
y = normfun(x, mean, std)  # curve still uses the full-data mean and std
plt.plot(x, y)
# density=True scales the bars so the histogram integrates to 1
# (replaces the removed `normed` argument).
plt.hist(c, bins=6, rwidth=0.9, density=True)
plt.title("age")
plt.xlabel("stakes")
plt.ylabel("Probability")
plt.show()
:
## ,