Pythonベースの映画推薦システムの実現

62560 ワード

最近、映画・ドラマのウェブサイトを作る中で、サイトに推薦システムを組み込むことを検討しました。ネット上の資料をいくつか調べ、最終的に実現できたので、以下にその原理と実装の過程を記録として残します。
1、映画とテレビの類似度計算
この推薦システムは、主にユーザの視聴履歴に基づいて類似する映像作品を推薦するものです。そのため、最終的にコンテンツベースの協調フィルタリングアルゴリズムで実現し、映像作品どうしの類似度の尺度としてユークリッド距離を採用しました。コードは次のとおりです。
# Compute the similarity score between two titles (called "euclidean" in this code)
def calculate_euclidean(movie1, movie2, types, weight_types_dic):
    """Score how alike two titles are as a weighted sum of field similarities.

    Despite the name, the result is not a geometric distance: for every field
    in ``types`` the two values are compared with ``get_equal_rate_1`` (a
    ``difflib`` ratio), the ratio is multiplied by that field's weight, and
    the weighted ratios are added up and divided by 10.

    Args:
        movie1: Mapping for the first title; must contain ``'name'`` and every
            field listed in ``types``.
        movie2: Mapping for the second title, with the same keys.
        types: Field names to compare.
        weight_types_dic: Mapping from field name to its numeric weight.

    Returns:
        The weighted similarity divided by 10. The weight tables used by
        ``get_recommendations`` in this file each add up to 10, so with those
        tables the score lies between 0 and 1 (higher means more similar).

    Raises:
        KeyError: If a field is missing from a title or from the weights.
    """
    weighted_total = 0
    for field in types:
        weight = weight_types_dic[field]
        value1 = movie1[field]
        value2 = movie2[field]
        # Multi-valued fields are flattened into one '|'-separated string so
        # they can be compared as text.
        if isinstance(value1, list) and isinstance(value2, list):
            value1 = '|'.join(value1)
            value2 = '|'.join(value2)
        weighted_total += weight * get_equal_rate_1(value1, value2)
    # 10 is the fixed normaliser; it is not derived from weight_types_dic.
    euclidean = weighted_total / 10
    print(movie1['name'] + '   ' + movie2['name'] + '      ' + str(euclidean) + '\n')
    return euclidean

2、映画とテレビの類似度マトリックス計算
サーバリソースに制限があるため、メッセージキューを使って映像作品の類似度を計算することはできません。そこで、類似度の計算を主に次の2種類に分けています。
1、全量
主に推薦システムの初期化段階で用います。現在サーバ上にあるすべての映像作品間の類似度を計算します。全量計算は1回だけ行い、以降は増分計算のみを行います。
2、増分
毎日、前日に新しくクロールして取得した作品と他の作品との類似度を計算し、同時に、他の作品から見た前日の新規作品との類似度も計算したうえで、映像作品の類似度行列を更新します。コードは次のとおりです。
# Deduplicate a list of dicts by the given key
def distinct(items, key):
    """Return one dict per distinct value of ``key``, ordered by that value.

    The input is sorted by ``key`` (stable sort), so among entries sharing a
    value the one that appeared first in ``items`` is the one kept.
    """
    pick = itemgetter(key)
    ordered = sorted(items, key=pick)
    firsts = []
    for _, same_key in groupby(ordered, key=pick):
        # Each group is non-empty, so next() always yields its first member.
        firsts.append(next(same_key))
    return firsts
# Get yesterday's date
def get_yesterday():
    """Return yesterday's local date as a ``YYYY-MM-DD`` string."""
    return str(datetime.date.today() - datetime.timedelta(days=1))
# Compute the similarity ratio of two strings
def get_equal_rate_1(str1, str2):
    """Return a case-insensitive similarity ratio for two field values.

    Each argument is normalised to text on its own before comparison:
    ``None`` becomes an empty string, a list is joined with ``'|'``, and any
    other non-string value is converted with ``str()``. Normalising the two
    sides independently means a plain string is never split into characters
    just because the other side happens to be a list.

    Args:
        str1: First value (string, list of strings, ``None`` or other scalar).
        str2: Second value, same accepted kinds.

    Returns:
        ``difflib.SequenceMatcher.quick_ratio()`` of the two lower-cased
        texts: a float between 0 and 1, where 1.0 means the texts contain the
        same characters. Two empty texts compare as 1.0.
    """
    def _as_text(value):
        # Normalise one field value to a plain string.
        if value is None:
            return ''
        if isinstance(value, list):
            return '|'.join(value)
        if isinstance(value, str):
            return value
        return str(value)

    text1 = _as_text(str1).lower()
    text2 = _as_text(str2).lower()
    return difflib.SequenceMatcher(None, text1, text2).quick_ratio()
# Build the recommendation data
# Fields compared for the 'movie' collection: name, type, type2, region, language, release_date, directors, actors
# Maximum number of recommendation records kept per title.
_RECOMMENDATION_LIMIT = 20


def _refresh_recommendations(movie1, candidates, progress, types, weight_types_dic,
                             merge_stored):
    """Recompute and store the best matches for ``movie1`` among ``candidates``.

    Args:
        movie1: Document of the title whose recommendation list is refreshed.
        candidates: Result of ``MongoDbUtils.find`` holding the titles to
            compare against; it is asked for ``count()`` and then iterated.
        progress: Text such as ``"3/120"`` giving the outer-loop position; it
            only appears in the progress output.
        types: Field names handed to ``calculate_euclidean``.
        weight_types_dic: Per-field weights handed to ``calculate_euclidean``.
        merge_stored: When true, the records already stored for ``movie1``
            compete with the newly scored ones (incremental update); when
            false, the list is rebuilt from the newly scored ones only.
    """
    store = MongoDbUtils('recommendations')
    query = {'temp_id': movie1['_id']}
    stored = list(store.find(query).sort([('euclidean', -1)]))
    # `stored` is ordered best-first, so its last entry is the weakest match
    # kept so far. Once the list is full, lower scores cannot enter it.
    if len(stored) < _RECOMMENDATION_LIMIT:
        min_euclidean = 0
    else:
        min_euclidean = stored[-1]['euclidean']
    total2 = candidates.count() + 1
    recommendations = list(stored) if merge_stored else []
    for j, movie2 in enumerate(candidates):
        # A title is never recommended for itself.
        if movie2['_id'] == movie1['_id']:
            continue
        position = (progress + ' ' + str(j + 1) + '/' + str(total2) + ' ' +
                    movie1['name'] + ' ' + movie2['name'])
        print('     ' + position)
        euclidean = calculate_euclidean(movie1, movie2, types, weight_types_dic)
        if euclidean < min_euclidean:
            print('   ' + position)
            continue
        recommendations.append(
            {'temp_id': movie1['_id'], 'temp_id2': movie2['_id'], 'euclidean': euclidean})
    recommendations = distinct(recommendations, 'temp_id2')
    recommendations = sorted(
        recommendations, key=lambda x: x['euclidean'], reverse=True)[:_RECOMMENDATION_LIMIT]
    # Nothing qualified: keep whatever is stored instead of indexing an empty
    # list or wiping the existing records.
    if not recommendations:
        return
    # Same weakest score as before is treated as "list unchanged".
    if stored and recommendations[-1]['euclidean'] == stored[-1]['euclidean']:
        print(movie1['name'] + '         ')
        return
    if merge_stored:
        print(str(movie1['_id']) + ' ' + str(min_euclidean) + ' ' +
              str(recommendations[-1]['euclidean']))
    # Replace the stored list. A failure for one title is reported and must
    # not abort the whole batch.
    try:
        store.delete(query)
        store.insert(recommendations)
    except Exception as err:
        print('failed to store recommendations for ' + str(movie1['_id']) + ': ' + str(err))


def get_recommendations(movie_type, type):
    """Compute pairwise similarity and store the top matches for each title.

    Args:
        movie_type: Which collection to process: ``'movie'``, ``'drama'`` or
            ``'piece'``. It selects the compared fields and their weights.
        type: ``'all'`` compares every title with every other title.
            ``'latest'`` first scores the titles whose ``acquisition_time``
            contains yesterday's date against all titles, then, if any such
            title exists, merges those new titles into the stored list of
            every title.

    Raises:
        ValueError: If ``movie_type`` or ``type`` is not one of the values above.
    """
    if movie_type == 'movie':
        types = ['name', 'type', 'type2', 'region', 'language', 'release_date', 'directors', 'actors']
        weight_types_dic = {'name': 2.5, 'type': 0.5, 'type2': 0.5, 'region': 0.5, 'language': 0.5,
                            'release_date': 0.5, 'directors': 2.5, 'actors': 2.5}
        collection = 'movie'
    elif movie_type == 'drama':
        types = ['name', 'type']
        weight_types_dic = {'name': 6, 'type': 4}
        collection = 'drama'
    elif movie_type == 'piece':
        types = ['name', 'type', 'type2', 'description']
        weight_types_dic = {'name': 4, 'type': 2, 'type2': 2, 'description': 2}
        collection = 'piece'
    else:
        raise ValueError('unknown movie_type: ' + repr(movie_type))

    if type == 'all':
        # Both sides of the comparison cover the whole collection.
        dic = {}
        dic2 = {}
    elif type == 'latest':
        # Left side: titles acquired yesterday; right side: every title.
        dic = {'acquisition_time': {'$regex': '.*' + get_yesterday() + '.*'}}
        dic2 = {}
    else:
        raise ValueError('unknown type: ' + repr(type))

    db_utils = MongoDbUtils(collection)
    db_utils2 = MongoDbUtils(collection)

    # Pass 1: build the list of every selected title against all titles.
    movies = db_utils.find(dic)
    # NOTE(review): relies on the find() result exposing count(); confirm that
    # MongoDbUtils / the installed MongoDB driver still provides it.
    total = movies.count() + 1
    for i, movie1 in enumerate(movies):
        progress = str(i + 1) + '/' + str(total)
        # The candidate query is re-issued per title because iterating it
        # consumes the result.
        _refresh_recommendations(movie1, db_utils2.find(dic2), progress, types,
                                 weight_types_dic, False)

    # Pass 2 (incremental only): let yesterday's titles enter the stored list
    # of every title. `total` is count + 1, so > 1 means at least one new title.
    if type == 'latest' and total > 1:
        movies = db_utils.find(dic2)
        total = movies.count() + 1
        for i, movie1 in enumerate(movies):
            progress = str(i + 1) + '/' + str(total)
            _refresh_recommendations(movie1, db_utils2.find(dic), progress, types,
                                     weight_types_dic, True)

3、テスト例と結果
共通部分:
# Shared setup for the sample comparisons: the field list and weights of the
# 'movie' collection, then two titles looked up by name and scored.
types = [
    'name',
    'type',
    'type2',
    'region',
    'language',
    'release_date',
    'directors',
    'actors',
]
weight_types_dic = {
    'name': 2.5,
    'type': 0.5,
    'type2': 0.5,
    'region': 0.5,
    'language': 0.5,
    'release_date': 0.5,
    'directors': 2.5,
    'actors': 2.5,
}
collection = 'movie'
db_utils = MongoDbUtils(collection)
# Take the first document matching each name.
drama1 = db_utils.find({'name': '    '})[0]
drama2 = db_utils.find({'name': '    (  )'})[0]
calculate_euclidean(drama1, drama2, types, weight_types_dic)

結果:1、 および ( )
name          (  ) 1.6666666666666665
type        0.2
type2         0.0
region       0.5
language       0.5
release_date 2014 2014 0.5
directors         0.0
actors    |  |  |  |     �W|   |   |   |    0.5714285714285714
           (  )      0.3938095238095237

2、 および
name             2.0
type        0.2
type2         0.0
region       0.0
language       0.0
release_date 2014 2019 0.375
directors     Sakon Tiacharoen 0.0
actors    |  |  |  |      ·     |    ·      |   ·     0.22222222222222224
                   0.27972222222222226

3、 および
name               1.6666666666666665
type        0.2
type2         0.0
region       0.5
language       0.5
release_date 2014 2017 0.375
directors       /   0.0
actors    |  |  |  |       /   /    0.18518518518518517
                     0.34268518518518515

4、 ( )および
name     (  )        1.4285714285714284
type         0.5
type2         0.16666666666666666
region       0.0
language       0.0
release_date 2014 2019 0.375
directors     Sakon Tiacharoen 0.0
actors  �W|   |   |   |      ·     |    ·      |   ·     0.20833333333333331
    (  )               0.26785714285714285

5、 ( )および
name     (  )          1.25
type         0.5
type2         0.5
region       0.5
language       0.5
release_date 2014 2017 0.375
directors       /   0.0
actors  �W|   |   |   |       /   /    0.0
    (  )                 0.3625

6、 および
name                 1.4285714285714284
type         0.5
type2         0.16666666666666666
region       0.0
language       0.0
release_date 2019 2017 0.375
directors Sakon Tiacharoen   /   0.0
actors   ·     |    ·      |   ·        /   /    0.0
                       0.24702380952380948

完全なコード:https://github.com/wpwbb510582246/PocketFilm/tree/master/Recommender
親プロジェクト:https://github.com/wpwbb510582246/PocketFilm