# Implementing a collaborative-filtering recommendation algorithm in Python


# Test data: http://grouplens.org/datasets/movielens/  (figure 1 of the original article appeared here)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
from numpy import *
import time
from texttable import Texttable

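# Collaborative-filtering recommenders come in two main flavours:
# 1. User-based: use the preferences of similar ("neighbouring") users to predict items the current user has not rated yet, and recommend a ranked list of them.
# 2. Item-based: if users who like item A also like item C, then A and C are highly similar, so a user who likes A will probably like C as well.
# Implementations differ with the data set and the author, but the core steps are the same:
#   1. Collect user preferences
#       1) group the different kinds of behaviour
#       2) weight the groups to obtain each user's overall preference
#       3) de-noise and normalise the data
#   2. Find similar users (user-based) or similar items (item-based)
#   3. Compute the similarities and rank them, then recommend accordingly
# Steps of this example:
# 1. Initialise the data
#       load movies and ratings
#       build userDict: every movie a user rated with its score, each score divided by 5 to normalise it
#       build ItemUser: for every movie, the users who rated it
# 2. Compute the similarity between userId and the other users
#       find every user who shares at least one rated movie with userId
#       for each of those users compute the similarity to userId
#           merge both rating sets as {'movie ID': [rating by userId, rating by user A]}, using 0 for a missing rating
#           compute the cosine similarity of user A and userId; larger means more similar
# 3. Build the recommended-movie list from the similarities
# 4. Print the recommendation list and the precision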


class CF:
    """User-based collaborative-filtering recommender.

    Ratings are turned into two lookup tables (``userDict`` and
    ``ItemUser``), the ``k`` users most similar to a target user are
    selected by cosine similarity, and every movie those neighbours rated
    is scored by the summed similarity of the neighbours who rated it.

    Movies the target user has already rated are *not* removed from the
    recommendation list: ``cost`` is the share of the recommendation that
    overlaps with the user's own rated movies.

    NOTE: this module does ``from numpy import *``, which shadows builtins
    such as ``sum``/``min``/``max``; the methods below deliberately avoid
    those names.
    """

    def __init__(self, movies, ratings, k=5, n=10):
        """Store the raw data and the tuning parameters.

        Args:
            movies: rows whose first field is the movie ID (the remaining
                fields are only displayed by ``showTable``).
            ratings: rows of ``[userID, movieID, rating, ...]``.
            k: number of nearest neighbours to keep.
            n: number of recommendations to keep.  ``recommendByUser``
                overwrites this value (see there).
        """
        self.movies = movies
        self.ratings = ratings
        # Number of nearest neighbours.
        self.k = k
        # Number of recommendations.
        self.n = n
        # {userID: [(movieID, normalised rating), ...]}
        self.userDict = {}
        # {movieID: [userID, ...]}
        self.ItemUser = {}
        # [[similarity, userID], ...] sorted by descending similarity.
        self.neighbors = []
        # [[score, movieID], ...] sorted by descending score.
        self.recommandList = []
        # Precision of the last recommendation, in [0, 1].
        self.cost = 0.0

    def recommendByUser(self, userId):
        """Run the whole pipeline for ``userId``.

        Fills ``neighbors``, ``recommandList`` and ``cost``.

        Raises:
            KeyError: if ``userId`` has no rating in ``ratings``.
        """
        self.formatRate()
        # The recommendation is made as long as the user's own rating list
        # so that the precision compares two lists of the same size; this
        # replaces whatever ``n`` was passed to the constructor.
        self.n = len(self.userDict[userId])
        self.getNearestNeighbor(userId)
        self.getrecommandList(userId)
        self.getPrecision(userId)

    def getrecommandList(self, userId):
        """Score the neighbours' movies and keep the ``n`` best.

        A movie's score is the sum of the similarities of the neighbours
        who rated it.  ``userId`` itself is not consulted here.
        """
        scores = {}
        for neighbor in self.neighbors:
            similarity = neighbor[0]
            for movie in self.userDict[neighbor[1]]:
                movieId = movie[0]
                if movieId in scores:
                    scores[movieId] += similarity
                else:
                    scores[movieId] = similarity

        # Highest score first; ties fall back to the movie ID.
        ranked = sorted(([score, movieId] for movieId, score in scores.items()),
                        reverse=True)
        self.recommandList = ranked[:self.n]

    def formatRate(self):
        """Rebuild ``userDict`` and ``ItemUser`` from ``ratings``.

        Ratings are divided by 5 to normalise them.  Rows with fewer than
        three fields (e.g. the ``['']`` produced by a blank input line) are
        skipped instead of raising ``IndexError``.
        """
        self.userDict = {}
        self.ItemUser = {}
        for row in self.ratings:
            if len(row) < 3:
                continue
            raterId = row[0]
            movieId = row[1]
            entry = (movieId, float(row[2]) / 5)
            self.userDict.setdefault(raterId, []).append(entry)
            self.ItemUser.setdefault(movieId, []).append(raterId)

    def getNearestNeighbor(self, userId):
        """Select the ``k`` users most similar to ``userId``.

        Candidates are the users who rated at least one movie that
        ``userId`` rated.  The result is stored in ``neighbors`` as
        ``[similarity, userID]`` pairs, most similar first.
        """
        # A set makes the duplicate check O(1); seeding it with userId also
        # excludes the target user from its own neighbourhood.
        seen = {userId}
        candidates = []
        for movie in self.userDict[userId]:
            for other in self.ItemUser[movie[0]]:
                if other not in seen:
                    seen.add(other)
                    candidates.append(other)

        scored = [[self.getCost(userId, other), other] for other in candidates]
        scored.sort(reverse=True)
        self.neighbors = scored[:self.k]

    def formatuserDict(self, userId, l):
        """Merge the ratings of two users.

        Returns:
            ``{movieID: [rating by userId, rating by l]}`` where a movie
            rated by only one of the two users gets 0 for the other.
        """
        merged = {}
        for movieId, rating in self.userDict[userId]:
            merged[movieId] = [rating, 0]
        for movieId, rating in self.userDict[l]:
            if movieId in merged:
                merged[movieId][1] = rating
            else:
                merged[movieId] = [0, rating]
        return merged

    def getCost(self, userId, l):
        """Return the cosine similarity between users ``userId`` and ``l``.

        The vectors are the two users' normalised ratings over the union of
        the movies they rated (0 where a rating is missing).  Returns 0.0
        when the dot product is zero.
        """
        # Local import: the module only provides ``sqrt`` through a numpy
        # star import, which this method should not depend on.
        import math

        pairs = self.formatuserDict(userId, l)
        x = 0.0  # squared norm of userId's vector
        y = 0.0  # squared norm of l's vector
        z = 0.0  # dot product
        for first, second in pairs.values():
            first = float(first)
            second = float(second)
            x += first * first
            y += second * second
            z += first * second
        if z == 0.0:
            return 0.0
        return z / math.sqrt(x * y)

    def getPrecision(self, userId):
        """Store in ``cost`` the overlap between recommendation and history.

        The number of movies present in both the recommendation list and
        the user's rated movies is divided by the length of the shorter of
        the two lists.  ``cost`` is 0.0 when either list is empty (the
        original code raised ``ZeroDivisionError`` for a user without
        neighbours).
        """
        watched = [movie[0] for movie in self.userDict[userId]]
        recommended = [item[1] for item in self.recommandList]
        if not watched or not recommended:
            self.cost = 0.0
            return

        if len(watched) >= len(recommended):
            probe, pool = recommended, set(watched)
        else:
            probe, pool = watched, set(recommended)
        hits = len([movieId for movieId in probe if movieId in pool])
        self.cost = float(hits) / len(probe)

    def showTable(self):
        """Print the recommendation list as a text table.

        Each row shows the movie's fields from ``movies`` followed by the
        neighbours who rated it.  The rows held in ``movies`` are copied,
        never modified, so the method can be called repeatedly.  A movie ID
        that is missing from ``movies`` is shown with empty name/release
        cells.
        """
        neighbors_id = {entry[1] for entry in self.neighbors}

        # Index the movie rows once; setdefault keeps the first row for an
        # ID, matching a front-to-back linear search.
        movieById = {}
        for row in self.movies:
            if row:
                movieById.setdefault(row[0], row)

        table = Texttable()
        table.set_deco(Texttable.HEADER)
        table.set_cols_dtype(["t", "t", "t", "t"])
        table.set_cols_align(["l", "l", "l", "l"])
        rows = [[u"movie ID", u"Name", u"release", u"from userID"]]
        for item in self.recommandList:
            movieId = item[1]
            fromID = [user for user in self.ItemUser[movieId]
                      if user in neighbors_id]
            movie = movieById.get(movieId)
            if movie is None:
                rows.append([movieId, "", "", fromID])
            else:
                # Copy so that self.movies is left untouched.
                rows.append(list(movie) + [fromID])
        table.add_rows(rows)
        print(table.draw())


#     
def readFile(filename, encoding="utf-8"):
    """Read a "::"-delimited text file into a list of rows.

    Args:
        filename: path of the file to read.
        encoding: text encoding of the file.  Defaults to UTF-8; pass
            e.g. ``"iso-8859-15"`` for files that fail to decode as UTF-8.

    Returns:
        A list with one entry per non-blank line, each entry being the
        list of string fields obtained by splitting the stripped line on
        ``"::"``.  Blank lines are skipped.
    """
    data = []
    # The context manager closes the file even if decoding fails midway.
    with open(filename, "r", encoding=encoding) as handle:
        for line in handle:
            record = line.strip()
            if not record:
                continue
            data.append(record.split("::"))
    return data

# -------------------------  -------------------------------
# Time the whole run.  time.clock() was removed in Python 3.8, so the
# high-resolution wall-clock counter time.perf_counter() is used instead.
start = time.perf_counter()
movies = readFile("/home/hadoop/Python/CF/movies.dat")
ratings = readFile("/home/hadoop/Python/CF/ratings.dat")
# Recommend for user "100" using the 20 most similar users.
demo = CF(movies, ratings, k=20)
demo.recommendByUser("100")
print("     :")
demo.showTable()
# Number of rating rows that were loaded.
print("      %d " % (len(demo.ratings)))
# demo.cost is a fraction in [0, 1]; shown as a percentage.
print("   : %.2f %%" % (demo.cost * 100))
end = time.perf_counter()
# Elapsed seconds for loading, recommending and printing.
print("    : %f s" % (end - start))

# (Figures 2 and 3 of the original article appeared here.)