Python（pandas）によるデータテーブルの集計

18374 ワード


テーブル（データファイル）の読み込み
import pandas as pd
# Load the three MovieLens tables (users, ratings, movies).
# Every file is '::'-delimited and has no header row, so the column names are
# passed explicitly. engine='python' is requested because a multi-character
# separator is not supported by the default C parser; without it pandas emits
# a ParserWarning and falls back to the python engine anyway.
unames=['user_id','gender','age','occupation','zip']
users=pd.read_table('E:/DataAnalysis/pydata-book/pydata-book-1st-edition/ch02/movielens/users.dat',sep='::',header=None,names=unames,engine='python')

rnames=['user_id','movie_id','rating','timestamp']
ratings=pd.read_table('E:/DataAnalysis/pydata-book/pydata-book-1st-edition/ch02/movielens/ratings.dat',sep='::',header=None,names=rnames,engine='python')

# NOTE: the third column here is 'genres' -- not 'gender', which is a column of the users table.
mnames=['movie_id','title','genres']
# movies is required by the merge that follows, so this must be an executed statement.
movies=pd.read_table('E:/DataAnalysis/pydata-book/pydata-book-1st-edition/ch02/movielens/movies.dat',sep='::',header=None,names=mnames,engine='python')

テーブルの結合（マージ）
data=pd.merge(pd.merge(ratings,users),movies)

性別ごとの平均評価を求め、さらに評価件数が250件以上の映画だけに絞り込みます
mean_ratings=data.pivot_table('rating',index='title',columns='gender',aggfunc='mean')#         

mean_ratings[:5]
Out[41]: 
gender                                F         M
title                                            
$1,000,000 Duck (1971)         3.375000  2.761905
'Night Mother (1986)           3.388889  3.352941
'Til There Was You (1997)      2.675676  2.733333
'burbs, The (1989)             2.793478  2.962085
...And Justice for All (1979)  3.828571  3.689024

# Keep only films with at least 250 ratings: group by title, then size() returns a Series of rating counts per title

rating_by_title=data.groupby('title').size()

rating_by_title[:10]
Out[44]: 
title
$1,000,000 Duck (1971)                37
'Night Mother (1986)                  70
'Til There Was You (1997)             52
'burbs, The (1989)                   303
...And Justice for All (1979)        199
1-900 (1994)                           2
10 Things I Hate About You (1999)    700
101 Dalmatians (1961)                565
101 Dalmatians (1996)                364
12 Angry Men (1957)                  616
dtype: int64

active_titles=rating_by_title.index[rating_by_title>=250]

active_titles
Out[46]: 
Index([''burbs, The (1989)', '10 Things I Hate About You (1999)',
       '101 Dalmatians (1961)', '101 Dalmatians (1996)', '12 Angry Men (1957)',
       '13th Warrior, The (1999)', '2 Days in the Valley (1996)',
       '20,000 Leagues Under the Sea (1954)', '2001: A Space Odyssey (1968)',
       '2010 (1984)',
       ...
       'X-Men (2000)', 'Year of Living Dangerously (1982)',
       'Yellow Submarine (1968)', 'You've Got Mail (1998)',
       'Young Frankenstein (1974)', 'Young Guns (1988)',
       'Young Guns II (1990)', 'Young Sherlock Holmes (1985)',
       'Zero Effect (1998)', 'eXistenZ (1999)'],
      dtype='object', name='title', length=1216)

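# Select the rows for the titles with at least 250 ratings (.ix is deprecated; the .loc call further below is the working form)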

mean_ratings=mean_ratings.ix[active_titles]
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.

mean_ratings=mean_ratings.loc[active_titles]

mean_ratings
Out[50]: 
gender                                                     F         M
title                                                                 
'burbs, The (1989)                                  2.793478  2.962085
10 Things I Hate About You (1999)                   3.646552  3.311966
101 Dalmatians (1961)                               3.791444  3.500000
101 Dalmatians (1996)                               3.240000  2.911215
12 Angry Men (1957)                                 4.184397  4.328421
13th Warrior, The (1999)                            3.112000  3.168000
2 Days in the Valley (1996)                         3.488889  3.244813
20,000 Leagues Under the Sea (1954)                 3.670103  3.709205
2001: A Space Odyssey (1968)                        3.825581  4.129738
2010 (1984)                                         3.446809  3.413712
28 Days (2000)                                      3.209424  2.977707
39 Steps, The (1935)                                3.965517  4.107692
54 (1998)                                           2.701754  2.782178
7th Voyage of Sinbad, The (1958)                    3.409091  3.658879
8MM (1999)                                          2.906250  2.850962
About Last Night... (1986)                          3.188679  3.140909
Absent Minded Professor, The (1961)                 3.469388  3.446809
Absolute Power (1997)                               3.469136  3.327759
Abyss, The (1989)                                   3.659236  3.689507
Ace Ventura: Pet Detective (1994)                   3.000000  3.197917
Ace Ventura: When Nature Calls (1995)               2.269663  2.543333
Addams Family Values (1993)                         3.000000  2.878531
Addams Family, The (1991)                           3.186170  3.163498
Adventures in Babysitting (1987)                    3.455782  3.208122
Adventures of Buckaroo Bonzai Across the 8th Di...  3.308511  3.402321
Adventures of Priscilla, Queen of the Desert, T...  3.989071  3.688811
Adventures of Robin Hood, The (1938)                4.166667  3.918367
African Queen, The (1951)                           4.324232  4.223822
Age of Innocence, The (1993)                        3.827068  3.339506
Agnes of God (1985)                                 3.534884  3.244898
                                                     ...       ...
White Men Can't Jump (1992)                         3.028777  3.231061
Who Framed Roger Rabbit? (1988)                     3.569378  3.713251
Who's Afraid of Virginia Woolf? (1966)              4.029703  4.096939
Whole Nine Yards, The (2000)                        3.296552  3.404814
Wild Bunch, The (1969)                              3.636364  4.128099
Wild Things (1998)                                  3.392000  3.459082
Wild Wild West (1999)                               2.275449  2.131973
William Shakespeare's Romeo and Juliet (1996)       3.532609  3.318644
Willow (1988)                                       3.658683  3.453543
Willy Wonka and the Chocolate Factory (1971)        4.063953  3.789474
Witness (1985)                                      4.115854  3.941504
Wizard of Oz, The (1939)                            4.355030  4.203138
Wolf (1994)                                         3.074074  2.899083
Women on the Verge of a Nervous Breakdown (1988)    3.934307  3.865741
Wonder Boys (2000)                                  4.043796  3.913649
Working Girl (1988)                                 3.606742  3.312500
World Is Not Enough, The (1999)                     3.337500  3.388889
Wrong Trousers, The (1993)                          4.588235  4.478261
Wyatt Earp (1994)                                   3.147059  3.283898
X-Files: Fight the Future, The (1998)               3.489474  3.493797
X-Men (2000)                                        3.682310  3.851702
Year of Living Dangerously (1982)                   3.951220  3.869403
Yellow Submarine (1968)                             3.714286  3.689286
You've Got Mail (1998)                              3.542424  3.275591
Young Frankenstein (1974)                           4.289963  4.239177
Young Guns (1988)                                   3.371795  3.425620
Young Guns II (1990)                                2.934783  2.904025
Young Sherlock Holmes (1985)                        3.514706  3.363344
Zero Effect (1998)                                  3.864407  3.723140
eXistenZ (1999)                                     3.098592  3.289086

女性に最も評価の高い映画を求めるため、F列で降順に並べ替えます
# Top films among female viewers: sort by the F column in descending order

top_female_ratings=mean_ratings.sort_index(by='F',ascending='False')
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: by argument to sort_index is deprecated, please use .sort_values(by=...)
  Entry point for launching an IPython kernel.
# Two fixes are needed here: 1) replace sort_index with sort_values; 2) ascending must be the boolean False, not the string 'False'

top_female_ratings=mean_ratings.sort_values(by='F',ascending=False)
top_female_ratings[:10]

評価の分かれ（男女間の評価差）の計算
# Measure the rating disagreement between male and female viewers


mean_ratings
Out[62]: 
gender                                                     F         M
title                                                                 
'burbs, The (1989)                                  2.793478  2.962085
10 Things I Hate About You (1999)                   3.646552  3.311966
101 Dalmatians (1961)                               3.791444  3.500000
101 Dalmatians (1996)                               3.240000  2.911215
12 Angry Men (1957)                                 4.184397  4.328421
13th Warrior, The (1999)                            3.112000  3.168000
2 Days in the Valley (1996)                         3.488889  3.244813
20,000 Leagues Under the Sea (1954)                 3.670103  3.709205
2001: A Space Odyssey (1968)                        3.825581  4.129738
2010 (1984)                                         3.446809  3.413712
28 Days (2000)                                      3.209424  2.977707
39 Steps, The (1935)                                3.965517  4.107692
54 (1998)                                           2.701754  2.782178
7th Voyage of Sinbad, The (1958)                    3.409091  3.658879
8MM (1999)                                          2.906250  2.850962
About Last Night... (1986)                          3.188679  3.140909
Absent Minded Professor, The (1961)                 3.469388  3.446809
Absolute Power (1997)                               3.469136  3.327759
Abyss, The (1989)                                   3.659236  3.689507
Ace Ventura: Pet Detective (1994)                   3.000000  3.197917
Ace Ventura: When Nature Calls (1995)               2.269663  2.543333
Addams Family Values (1993)                         3.000000  2.878531
Addams Family, The (1991)                           3.186170  3.163498
Adventures in Babysitting (1987)                    3.455782  3.208122
Adventures of Buckaroo Bonzai Across the 8th Di...  3.308511  3.402321
Adventures of Priscilla, Queen of the Desert, T...  3.989071  3.688811
Adventures of Robin Hood, The (1938)                4.166667  3.918367
African Queen, The (1951)                           4.324232  4.223822
Age of Innocence, The (1993)                        3.827068  3.339506
Agnes of God (1985)                                 3.534884  3.244898
                                                     ...       ...
White Men Can't Jump (1992)                         3.028777  3.231061
Who Framed Roger Rabbit? (1988)                     3.569378  3.713251
Who's Afraid of Virginia Woolf? (1966)              4.029703  4.096939
Whole Nine Yards, The (2000)                        3.296552  3.404814
Wild Bunch, The (1969)                              3.636364  4.128099
Wild Things (1998)                                  3.392000  3.459082
Wild Wild West (1999)                               2.275449  2.131973
William Shakespeare's Romeo and Juliet (1996)       3.532609  3.318644
Willow (1988)                                       3.658683  3.453543
Willy Wonka and the Chocolate Factory (1971)        4.063953  3.789474
Witness (1985)                                      4.115854  3.941504
Wizard of Oz, The (1939)                            4.355030  4.203138
Wolf (1994)                                         3.074074  2.899083
Women on the Verge of a Nervous Breakdown (1988)    3.934307  3.865741
Wonder Boys (2000)                                  4.043796  3.913649
Working Girl (1988)                                 3.606742  3.312500
World Is Not Enough, The (1999)                     3.337500  3.388889
Wrong Trousers, The (1993)                          4.588235  4.478261
Wyatt Earp (1994)                                   3.147059  3.283898
X-Files: Fight the Future, The (1998)               3.489474  3.493797
X-Men (2000)                                        3.682310  3.851702
Year of Living Dangerously (1982)                   3.951220  3.869403
Yellow Submarine (1968)                             3.714286  3.689286
You've Got Mail (1998)                              3.542424  3.275591
Young Frankenstein (1974)                           4.289963  4.239177
Young Guns (1988)                                   3.371795  3.425620
Young Guns II (1990)                                2.934783  2.904025
Young Sherlock Holmes (1985)                        3.514706  3.363344
Zero Effect (1998)                                  3.864407  3.723140
eXistenZ (1999)                                     3.098592  3.289086

[1216 rows x 2 columns]

# Add a 'diff' column holding the difference in mean rating (M minus F)

mean_ratings['diff']=mean_ratings['M']-mean_ratings['F']

# Sort by diff in ascending order: films rated higher by women than by men come first

sorted_by_diff=mean_ratings.sort_values(by='diff')

sorted_by_diff[:15]
Out[67]: 
gender                                        F         M      diff
title                                                              
Dirty Dancing (1987)                   3.790378  2.959596 -0.830782
Jumpin' Jack Flash (1986)              3.254717  2.578358 -0.676359
Grease (1978)                          3.975265  3.367041 -0.608224
Little Women (1994)                    3.870588  3.321739 -0.548849
Steel Magnolias (1989)                 3.901734  3.365957 -0.535777
Anastasia (1997)                       3.800000  3.281609 -0.518391
Rocky Horror Picture Show, The (1975)  3.673016  3.160131 -0.512885
Color Purple, The (1985)               4.158192  3.659341 -0.498851
Age of Innocence, The (1993)           3.827068  3.339506 -0.487561
Free Willy (1993)                      2.921348  2.438776 -0.482573
French Kiss (1995)                     3.535714  3.056962 -0.478752
Little Shop of Horrors, The (1960)     3.650000  3.179688 -0.470312
Guys and Dolls (1955)                  4.051724  3.583333 -0.468391
Mary Poppins (1964)                    4.197740  3.730594 -0.467147
Patch Adams (1998)                     3.473282  3.008746 -0.464536

# Reverse the row order and take the first 15 rows: films rated higher by men than by women

sprted_by_diff[::-1][:15]
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
 in ()
----> 1 sprted_by_diff[::-1][:15]

NameError: name 'sprted_by_diff' is not defined

sorted_by_diff[::-1][:15]
Out[70]: 
gender                                         F         M      diff
title                                                               
Good, The Bad and The Ugly, The (1966)  3.494949  4.221300  0.726351
Kentucky Fried Movie, The (1977)        2.878788  3.555147  0.676359
Dumb & Dumber (1994)                    2.697987  3.336595  0.638608
Longest Day, The (1962)                 3.411765  4.031447  0.619682
Cable Guy, The (1996)                   2.250000  2.863787  0.613787
Evil Dead II (Dead By Dawn) (1987)      3.297297  3.909283  0.611985
Hidden, The (1987)                      3.137931  3.745098  0.607167
Rocky III (1982)                        2.361702  2.943503  0.581801
Caddyshack (1980)                       3.396135  3.969737  0.573602
For a Few Dollars More (1965)           3.409091  3.953795  0.544704
Porky's (1981)                          2.296875  2.836364  0.539489
Animal House (1978)                     3.628906  4.167192  0.538286
Exorcist, The (1973)                    3.537634  4.067239  0.529605
Fright Night (1985)                     2.973684  3.500000  0.526316
Barb Wire (1996)                        1.585366  2.100386  0.515020

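# Find the films with the most disagreement among viewers, regardless of gender

# Standard deviation of the ratings, grouped by title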

rating_std_by_title=data.groupby('title')['rating'].std()

# Filter down to active_titles

rating_std_by_title=rating_std_by_title.loc[active_titles]

# Sort in descending order; Series has neither order nor sort in this pandas version, so use sort_values

rating_std_by_title.order(ascending=False)[:10]
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
 in ()
----> 1 rating_std_by_title.order(ascending=False)[:10]

D:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in __getattr__(self, name)
   4370             if self._info_axis._can_hold_identifiers_and_holds_name(name):
   4371                 return self[name]
-> 4372             return object.__getattribute__(self, name)
   4373 
   4374     def __setattr__(self, name, value):

AttributeError: 'Series' object has no attribute 'order'

rating_std_by_title.sort(ascending=False)[:10]
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
 in ()
----> 1 rating_std_by_title.sort(ascending=False)[:10]

D:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in __getattr__(self, name)
   4370             if self._info_axis._can_hold_identifiers_and_holds_name(name):
   4371                 return self[name]
-> 4372             return object.__getattribute__(self, name)
   4373 
   4374     def __setattr__(self, name, value):

AttributeError: 'Series' object has no attribute 'sort'

rating_std_by_title.sort_values(ascending=False)[:10]
Out[80]: 
title
Dumb & Dumber (1994)                     1.321333
Blair Witch Project, The (1999)          1.316368
Natural Born Killers (1994)              1.307198
Tank Girl (1995)                         1.277695
Rocky Horror Picture Show, The (1975)    1.260177
Eyes Wide Shut (1999)                    1.259624
Evita (1996)                             1.253631
Billy Madison (1995)                     1.249970
Fear and Loathing in Las Vegas (1998)    1.246408
Bicentennial Man (1999)                  1.245533
Name: rating, dtype: float64