# 4. Affinity analysis algorithms
# (8858 words)
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 27 10:59:39 2018
@author: asus
"""
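# 4
# 4.1
# NOTE: the explanatory text of the original comments in this header was
# lost (only section numbers and a few keywords survived); the notes below
# are limited to what those keywords show.
# 4.1.1 Algorithms mentioned: Apriori,
#       Eclat and FP-growth.
# 4.1.2 Notes on Apriori (original text lost).
# 4.2
# 4.2.1 The dataset comes from the Grouplens group.
#       Download page: http://grouplens.org/datasets/movielens/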
import pandas as pd
# 4.2.2 Loading the ratings with pandas
# u.data is read as a tab-separated file without a header row
# (header=None), so the four column names are supplied explicitly.
all_ratings = pd.read_csv(
    "u.data",
    sep="\t",
    header=None,
    names=["UserID", "MovieID", "Rating", "Datetime"],
)
# Convert the raw Datetime values, interpreted as seconds (unit="s"),
# into pandas timestamps.
all_ratings["Datetime"] = pd.to_datetime(all_ratings["Datetime"], unit="s")
# Notebook-style peek at the first five rows; has no effect in a script.
all_ratings.head(5)
# 4.2.3
# The surviving fragment of the original note describes row 0 as user #196
# rating movie #242 with a 3 (out of 5) on 4 Dec 1997.
# 4.3 Apriori
# Flag a rating as "Favorable" when it is strictly greater than 3.
all_ratings["Favorable"] = all_ratings["Rating"] > 3
# Notebook-style peek at rows 10..14; has no effect in a script.
all_ratings[10:15]
# Training subset: rows whose UserID is one of range(200), i.e. 0..199.
ratings = all_ratings[all_ratings["UserID"].isin(range(200))]
# Keep only the favourable rows of the training subset.
favorable_ratings = ratings[ratings["Favorable"]]
# Map each UserID to the frozenset of MovieIDs that user rated favourably.
# A frozenset is hashable and supports the subset tests used later on.
favorable_reviews_by_users = {
    user_id: frozenset(movie_ids.values)
    for user_id, movie_ids in favorable_ratings.groupby("UserID")["MovieID"]
}
# Number of favourable ratings per movie (summing the boolean flag).
num_favorable_by_movie = (
    ratings[["MovieID", "Favorable"]].groupby("MovieID").sum()
)
# Notebook-style peek at the five movies with the most favourable ratings.
num_favorable_by_movie.sort_values("Favorable", ascending=False)[:5]
# 4.3.1 Apriori
# Names shared with find_frequent_itemsets (defined further down):
#   favorable_reviews_by_users: UserID -> frozenset of favourably rated
#                               MovieIDs
#   k_1_itemsets:               frequent itemsets of the previous length
#   min_support:                minimum count for an itemset to be kept
#   returns:                    dict mapping itemset -> count
frequent_itemsets = {}  # itemset length -> {itemset: count}
min_support = 50  # minimum number of favourable ratings to be "frequent"
# Seed level 1: one single-movie itemset per movie with enough favourable
# ratings. The comparison is ">=" so that the threshold means the same thing
# here as in find_frequent_itemsets, which also keeps counts >= min_support.
frequent_itemsets[1] = {
    frozenset((movie_id,)): row["Favorable"]
    for movie_id, row in num_favorable_by_movie.iterrows()
    if row["Favorable"] >= min_support
}
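# Helper that extends the frequent (k-1)-itemsets by one movie and keeps
# the resulting itemsets whose count reaches min_support.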
from collections import defaultdict
def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets,
                           min_support):
    """Grow frequent (k-1)-itemsets into frequent k-itemsets.

    Args:
        favorable_reviews_by_users: mapping of user -> frozenset of the
            items that user reviewed favourably.
        k_1_itemsets: iterable of frozensets (e.g. the keys of the previous
            level's dict) that are already known to be frequent.
        min_support: minimum number of users that must contain an itemset
            for it to be returned.

    Returns:
        dict mapping each k-itemset (frozenset) to the number of users whose
        favourable reviews contain it, restricted to counts >= min_support.
    """
    counts = defaultdict(int)
    for reviews in favorable_reviews_by_users.values():
        # Gather this user's candidate supersets in a set first: the same
        # k-itemset is reachable from each of its (k-1)-subsets, and it must
        # contribute exactly one count per user, not one per subset.
        user_supersets = set()
        for itemset in k_1_itemsets:
            if not itemset.issubset(reviews):
                continue
            for other_reviewed_movie in reviews - itemset:
                user_supersets.add(
                    itemset | frozenset((other_reviewed_movie,)))
        for current_superset in user_supersets:
            counts[current_superset] += 1
    return {itemset: frequency for itemset, frequency in counts.items()
            if frequency >= min_support}
# Run Apriori level by level: build the itemsets of length k from those of
# length k-1 and stop at the first level that yields nothing.
for k in range(2, 20):
    cur_fequent_itemsets = find_frequent_itemsets(
        favorable_reviews_by_users, frequent_itemsets[k - 1], min_support)
    frequent_itemsets[k] = cur_fequent_itemsets
    if not cur_fequent_itemsets:
        # The original message text was lost and printed only the number.
        print("Did not find any frequent itemsets of length {}".format(k))
        break
    print("I found {} frequent itemsets of length {}".format(
        len(cur_fequent_itemsets), k))
# Drop the length-1 itemsets: a one-item itemset would only produce a rule
# with an empty premise.
del frequent_itemsets[1]
# 4.4
# Build candidate association rules from the frequent itemsets: every member
# of an itemset takes a turn as the conclusion, and the remaining members
# form the premise.
candidate_rules = []
for itemset_counts in frequent_itemsets.values():
    for itemset in itemset_counts:
        candidate_rules.extend(
            (itemset - {conclusion}, conclusion) for conclusion in itemset
        )
print(candidate_rules[:5])
# Each candidate rule is a (premise, conclusion) pair where the premise is a
# frozenset of movie ids and the conclusion is a single movie id.
# For every rule, count the users for whom it held (premise and conclusion
# both favourably reviewed) and those for whom it did not (premise reviewed
# favourably but not the conclusion).
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in favorable_reviews_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        # Users who do not satisfy the premise say nothing about the rule.
        if not premise.issubset(reviews):
            continue
        if conclusion in reviews:
            correct_counts[candidate_rule] += 1
        else:
            incorrect_counts[candidate_rule] += 1
# Confidence = times the rule held / times its premise applied.
rule_confidence = {}
for candidate_rule in candidate_rules:
    times_correct = correct_counts[candidate_rule]
    times_incorrect = incorrect_counts[candidate_rule]
    rule_confidence[candidate_rule] = (
        times_correct / float(times_correct + times_incorrect)
    )
# Rank the rules from highest to lowest confidence and show the top five.
from operator import itemgetter
sorted_confidence = sorted(
    rule_confidence.items(), key=itemgetter(1), reverse=True
)
for index in range(5):
    print("Rule #{0}".format(index + 1))
    premise, conclusion = sorted_confidence[index][0]
    rule_text = "Rule: If a person recommends {0} they will also recommend {1}"
    print(rule_text.format(premise, conclusion))
    confidence_text = " - Confidence:{0:.3f}"
    print(confidence_text.format(rule_confidence[(premise, conclusion)]))
    print("")
# Load the movie metadata from u.item so that rules can be shown with titles
# instead of ids. The file is "|"-separated, has no header row, and is
# decoded as mac-roman.
movie_name_data = pd.read_csv(
    "u.item",
    sep="|",
    header=None,
    encoding="mac-roman",
)
# Name the columns: five descriptive fields, one column with an empty name,
# then one column per genre.
movie_name_data.columns = [
    "MovieID", "Title", "Release Date", "Video Release", "IMDB",
    "",
    "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
    "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
    "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]
# Look up a movie title from its id.
def get_movie_name(movie_id):
    """Return the Title of the first movie_name_data row matching movie_id.

    Raises IndexError when no row has the given MovieID.
    """
    matches = movie_name_data["MovieID"] == movie_id
    titles = movie_name_data.loc[matches, "Title"]
    return titles.values[0]
# Print the five most confident rules again, this time with movie titles
# in place of the raw ids.
for index in range(5):
    print("Rule #{0}".format(index + 1))
    premise, conclusion = sorted_confidence[index][0]
    premise_names = ", ".join(map(get_movie_name, premise))
    conclusion_name = get_movie_name(conclusion)
    rule_text = "Rule: If a person recommends {0} they will also recommend {1}"
    print(rule_text.format(premise_names, conclusion_name))
    confidence_text = " - Confidence:{0:.3f}"
    print(confidence_text.format(rule_confidence[(premise, conclusion)]))
    print("")
# Evaluate the rules on held-out data: every rating whose UserID is NOT in
# range(200), i.e. the complement of the training subset.
test_dataset = all_ratings[~all_ratings["UserID"].isin(range(200))]
test_favorable = test_dataset[test_dataset["Favorable"]]
# UserID -> frozenset of favourably rated MovieIDs, for the test users.
test_favorable_by_users = {
    user_id: frozenset(movie_ids.values)
    for user_id, movie_ids in test_favorable.groupby("UserID")["MovieID"]
}
# Re-count, on the test users, how often each rule held or failed.
# NOTE: this rebinds correct_counts / incorrect_counts, replacing the
# training tallies.
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in test_favorable_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if not premise.issubset(reviews):
            continue
        if conclusion in reviews:
            correct_counts[candidate_rule] += 1
        else:
            incorrect_counts[candidate_rule] += 1
# Test confidence per rule. A rule whose premise no test user satisfies has
# no defined confidence; record NaN instead of dividing by zero.
test_confidence = {}
for candidate_rule in candidate_rules:
    times_correct = correct_counts[candidate_rule]
    times_applied = times_correct + incorrect_counts[candidate_rule]
    if times_applied:
        test_confidence[candidate_rule] = times_correct / float(times_applied)
    else:
        test_confidence[candidate_rule] = float("nan")
# Show the top rules (ranked by training confidence) with both scores.
# Slicing rather than indexing copes with fewer than five rules.
for index, (rule, _) in enumerate(sorted_confidence[:5]):
    print("Rule #{0}".format(index + 1))
    premise, conclusion = rule
    premise_names = ", ".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print("Rule: If a person recommends {0} they will also recommend {1}".
          format(premise_names, conclusion_name))
    # Label the two figures so train and test can be told apart.
    print(" - Train Confidence: {0:.3f}".format(
        rule_confidence[(premise, conclusion)]))
    print(" - Test Confidence: {0:.3f}".format(
        test_confidence[(premise, conclusion)]))
    print("")