Pythonによる協調フィルタリング推薦アルゴリズムの実装
16046 ワード
テストデータ: http://grouplens.org/datasets/movielens/
#!/usr/bin/python3
# -*- coding: utf-8 -*-
from numpy import *
import time
from texttable import Texttable
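# Collaborative-filtering recommendation algorithms fall into two main families:
# 1. User-based: use the preferences of similar ("neighbouring") users to predict items the current user has not touched yet, and recommend a ranked item list.
# 2. Item-based: if users who like item A also like item C, then A and C are highly similar; a user who likes A can therefore be expected to like C as well.
# Implementations differ with the data and the author, but the core steps are the same:
# 1. Collect user preferences
#    1) group the different kinds of behaviour
#    2) weight the groups to compute each user's overall preference
#    3) de-noise and normalise the data
# 2. Find similar users (user-based) or similar items (item-based)
# 3. Compute the similarities, sort them, and recommend according to similarity.
# Steps of this example:
# 1. Initialise the data
#    load movies and ratings
#    build userDict: all movie ratings of each user, normalised by dividing by 5
#    build ItemUser: all users who rated each movie
# 2. Compute the similarity between userId and the other users
#    find every user whose rated movies overlap with those of userId
#    loop over those users and compute each one's similarity to userId
#    merge the ratings of userId and user A as {'movie ID': [userId's rating, A's rating]}, using 0 for a missing rating
#    compute the cosine similarity between user A and userId; larger means more similar
# 3. Build the recommended-movie list from the similarities
# 4. Print the recommendation list and the precision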
class CF:
    """User-based collaborative-filtering recommender.

    ``ratings`` is a sequence of rows whose first three fields are
    ``[userId, movieId, rating]``; ``movies`` is a sequence of rows whose
    first field is the movie id.  Ratings are divided by 5 before use.

    Typical use: ``CF(movies, ratings, k=20).recommendByUser(userId)`` and
    then read ``neighbors``, ``recommandList`` and ``cost`` or call
    ``showTable()``.
    """

    def __init__(self, movies, ratings, k=5, n=10):
        """Store the raw data and reset every derived structure.

        Args:
            movies: rows describing movies; field 0 is the movie id.
            ratings: rows of ``[userId, movieId, rating, ...]``.
            k: number of nearest neighbours kept by ``getNearestNeighbor``.
            n: maximum number of recommendations kept by
                ``getrecommandList`` (``recommendByUser`` overrides it).
        """
        self.movies = movies
        self.ratings = ratings
        self.k = k
        self.n = n
        # {userId: [(movieId, rating / 5), ...]}
        self.userDict = {}
        # {movieId: [userId, ...]}
        self.ItemUser = {}
        # [[similarity, userId], ...] in descending order
        self.neighbors = []
        # [[score, movieId], ...] in descending order
        self.recommandList = []
        # Overlap ratio computed by getPrecision
        self.cost = 0.0

    def recommendByUser(self, userId):
        """Run the whole pipeline for ``userId``.

        Rebuilds the lookup tables, then fills ``neighbors``,
        ``recommandList`` and ``cost``.

        Raises:
            KeyError: if ``userId`` has no row in ``ratings``.
        """
        self.formatRate()
        # The recommendation count is set to the number of movies this user
        # has rated, replacing whatever ``n`` was passed to the constructor.
        self.n = len(self.userDict[userId])
        self.getNearestNeighbor(userId)
        self.getrecommandList(userId)
        self.getPrecision(userId)

    def getrecommandList(self, userId):
        """Score every movie rated by a neighbour and keep the top ``n``.

        A movie's score is the sum of the similarities of the neighbours
        that rated it.  ``userId`` is accepted for interface compatibility
        but is not read.
        """
        scores = {}
        for similarity, neighborId in self.neighbors:
            for movieId, _rating in self.userDict[neighborId]:
                scores[movieId] = scores.get(movieId, 0) + similarity
        ranked = [[score, movieId] for movieId, score in scores.items()]
        # Lists compare element-wise: ties on score fall back to the movie id.
        ranked.sort(reverse=True)
        self.recommandList = ranked[:self.n]

    def formatRate(self):
        """Rebuild ``userDict`` and ``ItemUser`` from ``ratings``."""
        self.userDict = {}
        self.ItemUser = {}
        for row in self.ratings:
            userId, movieId = row[0], row[1]
            # Build the entry first so a malformed rating raises before
            # either table is touched for this row.
            entry = (movieId, float(row[2]) / 5)
            self.userDict.setdefault(userId, []).append(entry)
            self.ItemUser.setdefault(movieId, []).append(userId)

    def getNearestNeighbor(self, userId):
        """Fill ``neighbors`` with the ``k`` users most similar to ``userId``.

        Only users that rated at least one movie in common with ``userId``
        are considered.
        """
        self.neighbors = []
        candidates = []
        # A set makes the duplicate check O(1); seeding it with userId
        # excludes the user from their own neighbourhood.
        seen = {userId}
        for movieId, _rating in self.userDict[userId]:
            for otherId in self.ItemUser[movieId]:
                if otherId not in seen:
                    seen.add(otherId)
                    candidates.append(otherId)
        ranked = [[self.getCost(userId, otherId), otherId]
                  for otherId in candidates]
        ranked.sort(reverse=True)
        self.neighbors = ranked[:self.k]

    def formatuserDict(self, userId, l):
        """Merge the ratings of two users.

        Returns:
            ``{movieId: [rating by userId, rating by l]}`` with ``0`` where
            a user has not rated the movie.
        """
        merged = {}
        for movieId, rating in self.userDict[userId]:
            merged[movieId] = [rating, 0]
        for movieId, rating in self.userDict[l]:
            if movieId in merged:
                merged[movieId][1] = rating
            else:
                merged[movieId] = [0, rating]
        return merged

    def getCost(self, userId, l):
        """Return the cosine similarity between users ``userId`` and ``l``.

        Returns ``0.0`` when the dot product of the two rating vectors is
        zero, which also avoids dividing by a zero norm.
        """
        squaredA = 0.0
        squaredB = 0.0
        dot = 0.0
        for pair in self.formatuserDict(userId, l).values():
            a = float(pair[0])
            b = float(pair[1])
            squaredA += a * a
            squaredB += b * b
            dot += a * b
        if dot == 0.0:
            return 0.0
        return dot / sqrt(squaredA * squaredB)

    def getPrecision(self, userId):
        """Store in ``cost`` the overlap between recommendations and ratings.

        The members of the shorter of the two lists (movies rated by
        ``userId`` vs. recommended movies) are looked up in the other one;
        ``cost`` is the fraction found.  ``cost`` is ``0.0`` when either
        list is empty instead of raising ``ZeroDivisionError``.
        """
        watched = [entry[0] for entry in self.userDict[userId]]
        recommended = [entry[1] for entry in self.recommandList]
        if not watched or not recommended:
            self.cost = 0.0
            return
        if len(watched) >= len(recommended):
            shorter, lookup = recommended, set(watched)
        else:
            shorter, lookup = watched, set(recommended)
        hits = 0.0
        for movieId in shorter:
            if movieId in lookup:
                hits += 1.0
        self.cost = hits / len(shorter)

    def _tableRows(self):
        """Build the header and data rows printed by ``showTable``.

        Each data row is a copy of the movie's row plus the list of
        neighbours that rated it, so ``self.movies`` is never modified.
        """
        neighborIds = {entry[1] for entry in self.neighbors}
        # Index movies by id once; setdefault keeps the first occurrence.
        moviesById = {}
        for movie in self.movies:
            if len(movie) == 0:
                continue
            moviesById.setdefault(movie[0], movie)
        rows = [[u"movie ID", u"Name", u"release", u"from userID"]]
        for item in self.recommandList:
            movieId = item[1]
            movie = moviesById.get(movieId)
            if movie is None:
                # Unknown movie: show the id with empty descriptive columns
                # rather than reusing the previous row.
                movie = [movieId, "", ""]
            fromID = [uid for uid in self.ItemUser[movieId]
                      if uid in neighborIds]
            rows.append(list(movie) + [fromID])
        return rows

    def showTable(self):
        """Print the recommended movies as a text table."""
        table = Texttable()
        table.set_deco(Texttable.HEADER)
        table.set_cols_dtype(["t", "t", "t", "t"])
        table.set_cols_align(["l", "l", "l", "l"])
        table.add_rows(self._tableRows())
        print(table.draw())
#
def readFile(filename, encoding="utf-8"):
    """Read a ``::``-delimited data file into a list of field lists.

    Args:
        filename: path of the file to read.
        encoding: text encoding of the file.  Defaults to UTF-8; pass e.g.
            ``"iso-8859-15"`` for files that are not valid UTF-8.

    Returns:
        One list of string fields per non-blank line, in file order.
    """
    data = []
    # The context manager closes the file even if decoding fails midway.
    with open(filename, "r", encoding=encoding) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # A blank line would yield [''] and break consumers that
                # index fields 1 and 2.
                continue
            data.append(line.split("::"))
    return data
# ------------------------- -------------------------------
# Driver: load the MovieLens files, recommend for user "100", print the
# result table, the number of rating rows, the precision and the elapsed time.
# time.clock() was removed in Python 3.8; perf_counter() is the replacement
# for measuring elapsed intervals.
start = time.perf_counter()
movies = readFile("/home/hadoop/Python/CF/movies.dat")
ratings = readFile("/home/hadoop/Python/CF/ratings.dat")
demo = CF(movies, ratings, k=20)
demo.recommendByUser("100")
print(" :")
demo.showTable()
print(" %d " % (len(demo.ratings)))
print(" : %.2f %%" % (demo.cost * 100))
end = time.perf_counter()
print(" : %f s" % (end - start))