# 4. Affinity analysis algorithms


# -*- coding: utf-8 -*-
"""
Created on Thu Sep 27 10:59:39 2018

@author: asus
"""
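#4 Recommending movies using affinity analysis

#4.1 Affinity analysis
# Affinity analysis looks for items that frequently occur together.
# Typical uses: fraud detection; customer segmentation; software optimisation; product recommendation.

#4.1.1 Algorithms for affinity analysis
# The Apriori algorithm
# Alternatives include Eclat and frequent-pattern growth (FP-growth)

#4.1.2 Choosing parameters
# Frequent itemsets are generated first, here with the Apriori algorithm, which needs a
# minimum support threshold; association rules are then selected by their confidence.

#4.2 The movie recommendation problem
#4.2.1 Obtaining the dataset
# The Grouplens research group publishes movie-rating datasets (MovieLens).
# Download from http://grouplens.org/datasets/movielens/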
import pandas as pd

#4.2.2 Loading with pandas
# u.data is tab-separated and has no header row (header=None), so the column
# names are supplied explicitly.
all_ratings = pd.read_csv(
    "u.data",
    delimiter="\t",
    header=None,
    names=["UserID", "MovieID", "Rating", "Datetime"],
)
# The raw Datetime values are interpreted as a number of seconds (unit="s")
# and converted to pandas timestamps.
all_ratings["Datetime"] = pd.to_datetime(all_ratings["Datetime"], unit="s")
# Bare expression: shows the first five rows when run interactively and has
# no effect when the file is executed as a script.
all_ratings.head(5)

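#4.2.3 Sparse data formats
# Row 0, for example, says that user (UserID) #196 gave movie (MovieID) #242
# a rating of 3 (out of 5) on 4 December 1997.
# A user/movie combination that does not appear in the dataset has no rating.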

#4.3 The Apriori implementation
# Goal: derive rules of the form "a user who recommends these movies will
# also recommend that movie".

# A rating strictly above 3 counts as favourable; the flag is stored in a new
# boolean column named Favorable.
all_ratings["Favorable"] = all_ratings["Rating"].gt(3)
all_ratings[10:15]
# Restrict the training data to users whose UserID is in range(200), which
# keeps the Apriori search space small.
ratings = all_ratings.loc[all_ratings["UserID"].isin(range(200))]
# Only the favourable rows of that subset are needed from here on.
favorable_ratings = ratings.loc[ratings["Favorable"]]
# Map each user to the movies they rated favourably. A frozenset gives fast
# membership/subset tests and is hashable.
favorable_reviews_by_users = {
    user_id: frozenset(movie_ids.values)
    for user_id, movie_ids in favorable_ratings.groupby("UserID")["MovieID"]
}
# Number of favourable ratings per movie: summing the boolean Favorable
# column counts its True values.
num_favorable_by_movie = (
    ratings[["MovieID", "Favorable"]].groupby("MovieID").sum()
)
# Bare expression: the five movies with the most favourable ratings
# (only displayed when run interactively).
num_favorable_by_movie.sort_values("Favorable", ascending=False).head(5)

#4.3.1 The Apriori algorithm
# Frequent itemsets are grown one level at a time, starting from single
# movies. The helper find_frequent_itemsets, defined further down, takes:
#   favorable_reviews_by_users: mapping of user -> frozenset of MovieIDs the
#                               user rated favourably
#   k_1_itemsets:               the frequent itemsets of the previous level
#   min_support:                the minimum support threshold
# and returns:
#   dict(itemset -> frequency) for the next level
frequent_itemsets = {}
min_support = 50  # minimum number of favourable reviews for an itemset
# Level 1: every single movie that reaches the minimum support. The test is
# ">=" so that a movie with exactly min_support favourable reviews is kept,
# matching the ">=" threshold applied by find_frequent_itemsets.
frequent_itemsets[1] = {
    frozenset((movie_id,)): row["Favorable"]
    for movie_id, row in num_favorable_by_movie.iterrows()
    if row["Favorable"] >= min_support
}
# The next function builds the candidate supersets of one level and counts
# how often each occurs.
#          ,    ,      
from collections import defaultdict
def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets,
                           min_support):
    """Grow frequent itemsets by one item and keep the frequent ones.

    For every user, each itemset in ``k_1_itemsets`` that is fully contained
    in the user's reviews is extended with each of the user's other reviewed
    movies. The support of a resulting superset is the number of users whose
    reviews produce it.

    Args:
        favorable_reviews_by_users: Mapping of user -> set/frozenset of the
            movies that user reviewed favourably.
        k_1_itemsets: Iterable of frozensets, the frequent itemsets of the
            previous level (a dict keyed by itemset works).
        min_support: Minimum number of users required for a superset to be
            returned.

    Returns:
        dict mapping each superset (frozenset) with support >= min_support
        to its support count.
    """
    counts = defaultdict(int)
    for reviews in favorable_reviews_by_users.values():
        # Gather this user's supersets in a set first. The same superset can
        # be reached from several of its subsets (e.g. {A, B} from {A} and
        # from {B}); counting per extension would credit one user several
        # times and inflate the support.
        user_supersets = set()
        for itemset in k_1_itemsets:
            if itemset.issubset(reviews):
                for other_reviewed_movie in reviews - itemset:
                    user_supersets.add(
                        itemset | frozenset((other_reviewed_movie,)))
        for current_superset in user_supersets:
            counts[current_superset] += 1
    return {itemset: frequency for itemset, frequency in counts.items()
            if frequency >= min_support}
# Run Apriori level by level: each pass grows the frequent itemsets of the
# previous level by one movie and stores the result under its length k.
for k in range(2, 20):
    cur_fequent_itemsets = find_frequent_itemsets(favorable_reviews_by_users,
                                                  frequent_itemsets[k-1],
                                                  min_support)
    frequent_itemsets[k] = cur_fequent_itemsets
    if not cur_fequent_itemsets:
        # No itemset of this length is frequent, so no longer one can be
        # built from it; stop here.
        print("Did not find any frequent itemsets of length {}".format(k))
        break
    print("I found {} frequent itemsets of length {}".
          format(len(cur_fequent_itemsets), k))
# Drop the length-1 itemsets: a single-movie itemset would only produce a
# rule with an empty premise.
del frequent_itemsets[1]

#4.4 Extracting association rules
# A frequent itemset only says which movies are liked together. A rule is
# obtained by taking one movie of the itemset as the conclusion and the
# remaining movies as the premise: "a user who recommends every movie in the
# premise will also recommend the conclusion".

# Build every (premise, conclusion) pair from every frequent itemset.
candidate_rules = []
for itemset_counts in frequent_itemsets.values():
    for itemset in itemset_counts:
        candidate_rules.extend(
            (itemset - {conclusion}, conclusion) for conclusion in itemset
        )
print(candidate_rules[:5])
# In the printed pairs the first element (a frozenset) holds the MovieIDs of
# the premise and the second element is the MovieID of the conclusion.

# Next, compute the confidence of every candidate rule.
# correct_counts tallies the users for whom a rule holds (premise and
# conclusion both present); incorrect_counts tallies the users who match the
# premise but not the conclusion.
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
# Check every rule against every user's favourable reviews. Users who do not
# match the whole premise say nothing about the rule and are skipped.
for reviews in favorable_reviews_by_users.values():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if not premise.issubset(reviews):
            continue
        tally = correct_counts if conclusion in reviews else incorrect_counts
        tally[candidate_rule] += 1
# Confidence = times the rule held / times the premise applied.
rule_confidence = {}
for candidate_rule in candidate_rules:
    n_correct = correct_counts[candidate_rule]
    n_applied = n_correct + incorrect_counts[candidate_rule]
    rule_confidence[candidate_rule] = n_correct / float(n_applied)
# Sort the rules by confidence, highest first, and print the five best.
from operator import itemgetter
# Rank the rules by their confidence, highest first.
sorted_confidence = sorted(rule_confidence.items(),
                           key=itemgetter(1), reverse=True)
# Slice instead of indexing 0..4 so that fewer than five rules does not raise
# IndexError.
for index, (rule, confidence) in enumerate(sorted_confidence[:5]):
    premise, conclusion = rule
    print("Rule #{0}".format(index + 1))
    print("Rule: If a person recommends {0} they will also recommend {1}".
          format(premise, conclusion))
    # Single-line literal: a backslash continuation inside the string would
    # embed the source indentation in the printed text.
    print(" - Confidence: {0:.3f}".format(confidence))
    print("")
# MovieIDs alone are hard to read, so load the movie titles as well. They are
# stored in u.item, a headerless file whose fields are separated by "|".
# NOTE(review): ml-100k's u.item is commonly read as latin-1 - confirm that
# mac-roman is really the intended encoding.
movie_name_data = pd.read_csv("u.item", delimiter="|", header=None,
                              encoding="mac-roman")
# Five descriptive columns, one unnamed column, then one column per genre.
movie_name_data.columns = (
    ["MovieID", "Title", "Release Date", "Video Release", "IMDB", ""]
    + ["Action", "Adventure", "Animation", "Children's", "Comedy", "Crime"]
    + ["Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical"]
    + ["Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]
)
# Helper that turns a MovieID into the movie's title.
def get_movie_name(movie_id):
    """Return the Title of the first row in movie_name_data with this MovieID.

    Raises:
        IndexError: if no row of movie_name_data has the given MovieID.
    """
    is_requested_movie = movie_name_data["MovieID"] == movie_id
    matching_titles = movie_name_data.loc[is_requested_movie, "Title"]
    return matching_titles.values[0]

# Print the best rules again, this time with movie titles instead of IDs.
# Slice instead of indexing 0..4 so that fewer than five rules does not raise
# IndexError.
for index, (rule, confidence) in enumerate(sorted_confidence[:5]):
    premise, conclusion = rule
    print("Rule #{0}".format(index + 1))
    premise_names = ", ".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print("Rule: If a person recommends {0} they will also recommend {1}".
          format(premise_names, conclusion_name))
    # Single-line literal: a backslash continuation inside the string would
    # embed the source indentation in the printed text.
    print(" - Confidence: {0:.3f}".format(confidence))
    print("")

# Evaluate the rules on held-out data: every user whose UserID is NOT in
# range(200), i.e. the complement of the training selection.
test_dataset = all_ratings[~all_ratings["UserID"].isin(range(200))]
test_favorable = test_dataset[test_dataset["Favorable"]]
test_favorable_by_users = {
    user_id: frozenset(movie_ids.values)
    for user_id, movie_ids in test_favorable.groupby("UserID")["MovieID"]
}
# Count, per rule, how often it holds / fails among the test users who match
# its premise.
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for reviews in test_favorable_by_users.values():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1
# Confidence of every rule on the test users. A premise that no test user
# matches has no defined confidence; record NaN for it instead of failing
# with ZeroDivisionError.
test_confidence = {}
for candidate_rule in candidate_rules:
    n_correct = correct_counts[candidate_rule]
    n_applied = n_correct + incorrect_counts[candidate_rule]
    test_confidence[candidate_rule] = (
        n_correct / float(n_applied) if n_applied else float("nan"))

# Finally, show the best rules (ranked by training confidence) with movie
# titles and with both their training and their test confidence.
# Slice instead of indexing 0..4 so that fewer than five rules does not raise
# IndexError.
for index, (rule, train_confidence) in enumerate(sorted_confidence[:5]):
    premise, conclusion = rule
    print("Rule #{0}".format(index + 1))
    premise_names = ", ".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print("Rule: If a person recommends {0} they will also recommend {1}".
          format(premise_names, conclusion_name))
    # Distinct labels: with two identical " - Confidence:" lines the reader
    # cannot tell which value is which.
    print(" - Train Confidence: {0:.3f}".format(train_confidence))
    print(" - Test Confidence: {0:.3f}".
          format(test_confidence[(premise, conclusion)]))
    print("")