SimHashを使用した大量テキストの重複除去

25288 ワード

データ構造 NLP simhash

私の新しいブログへようこそ:転送ドア
一、参考資料
SimHashを使用した大量テキストの重複除去 by Pollのノート

simhashの主な流れは:

分詞

重み付け

合併

次元ダウン

simhashによる重複除去の重要な考え方は次のとおりです。

hashmapの思想に倣ってsimhashに適したデータ構造を再構築する

64ビットのhashを複数の段(例えば16ビット×4段)に分割し、各段をmapのkeyとし、連結リストをmapのvalueとして、そこにテキストのhash値を格納する

次に、インスタンス要件について具体的に実装する.
二、実例
インスタンスシーン

現在、5千万件のテキストデータがmongodbに格納されており、重複したテキストや大部分が類似したテキストが多く含まれているため、重複除去が必要です。(大部分が類似しているテキストも削除する必要がある)

simhashはこの問題の解決にちょうど適しています。私は主にjieba分詞とTF-IDFを使ってキーワードの抽出と重み付けを行い、それからdictとlistを利用して、simhashの特徴に合ったhashmapを自分で構築します。

jieba分詞使用参考資料:Pythonのjieba分詞およびTF-IDFおよびTextRankアルゴリズム抽出キーワード

インスタンスコード

simhash

# -*- coding: utf-8 -*-
from jieba import lcut, analyse

def count_diff_str(str1, str2):
    """Count the positions at which two strings hold different characters.

    Only the overlapping prefix is compared: the comparison stops at the
    end of the shorter string, so any extra tail on the longer one is
    ignored.
    """
    return sum(1 for left, right in zip(str1, str2) if left != right)

def count_diff_int(value1, value2):
    """Return how many of the low 64 bits differ between two integers.

    Bits above position 63 are ignored. Negative inputs are handled through
    Python's two's-complement view of integers under bitwise operators.
    """
    low64_mask = (1 << 64) - 1
    differing_bits = (value1 ^ value2) & low64_mask
    return bin(differing_bits).count('1')

def simhash_(sentence):
    """Compute a 64-bit SimHash fingerprint of ``sentence``.

    Up to 30 weighted keywords are taken from ``analyse.extract_tags``.
    Every keyword is hashed to 64 bits and votes on each bit position with
    its weight: the weight is added where the keyword's bit is 1 and
    subtracted where it is 0. A fingerprint bit is 1 when its accumulated
    score is strictly positive.

    Args:
        sentence: Text to fingerprint.

    Returns:
        A ``(value_str, value_int)`` tuple. ``value_str`` is a 64-character
        string of '0'/'1' whose character at index ``i`` is bit ``i``
        (least significant first) of ``value_int``.
    """
    # Imported locally so the function does not depend on file-level imports.
    import hashlib

    keywords = analyse.extract_tags(sentence, topK=30, withWeight=True)
    bit_scores = [0] * 64
    for word, weight in keywords:
        # The built-in hash() of a str is salted per interpreter process, so
        # fingerprints would differ between runs. A digest-based hash keeps
        # stored fingerprints comparable across runs and machines.
        digest = hashlib.md5(word.encode('utf-8')).digest()
        value = int.from_bytes(digest[:8], 'big')
        for i in range(64):
            if (value >> i) & 1:
                bit_scores[i] += weight
            else:
                bit_scores[i] -= weight

    bits = ['1' if score > 0 else '0' for score in bit_scores]
    value_str = ''.join(bits)
    value_int = sum(1 << i for i, bit in enumerate(bits) if bit == '1')
    return value_str, value_int

重複除去

import pymongo

MONGO_CONNECT = 'xxx' # MongoDB connection URI passed to MongoClient (placeholder value)
MONGO_DBS = 'xxx' # MongoDB database name
MONGO_SOURCE_COLLECTION = 'xxx' # name of the source collection the documents are read from
MONGO_TARGET_COLLECTION = 'xxx' # name of the target collection that receives the kept documents
THRESHOLD = 3  # two hashes count as duplicates when fewer than this many bits differ

# Strip markup from a question's raw text and normalise its whitespace.
def clean_ques(ques_txt, end_marker="  ："):
    """Return ``ques_txt`` without tag contents, spaces or CRLF line breaks.

    Everything between a '<' and its matching '>' is dropped, as are the
    angle brackets themselves. ASCII spaces and "\\r\\n" sequences are then
    removed, and the text is cut just before the first ``end_marker``.

    Args:
        ques_txt: Raw text, possibly containing HTML/CSS-style tags.
        end_marker: Text that marks where the useful content ends. When it
            is empty or does not occur, nothing is cut off.
            NOTE(review): the default literal appears to have lost its
            original non-ASCII characters, and because spaces are removed
            before the search it can no longer match; pass the real marker
            explicitly once it is known.

    Returns:
        The cleaned text.
    """
    depth = 0
    kept = []
    for ch in ques_txt:
        if ch == '<':
            depth += 1
        elif ch == '>':
            # Clamp at zero so a stray '>' cannot push the depth negative
            # and suppress all of the text that follows it.
            depth = max(depth - 1, 0)
        elif depth == 0:
            kept.append(ch)
    res = "".join(kept).replace(" ", "").replace("\r\n", "")
    # str.find returns -1 on a miss; slicing with -1 would silently drop the
    # last character, so only cut when the marker is really present.
    cut = res.find(end_marker) if end_marker else -1
    if cut != -1:
        res = res[:cut]
    return res

# Convert a '0'/'1' string into an int (character i maps to bit i).
def str2int(data):
    """Build an integer from a bit string written least significant bit first.

    Every character equal to '1' sets the bit at its index; any other
    character leaves that bit cleared.
    """
    return sum(1 << pos for pos, bit in enumerate(data) if bit == '1')

# Convert an int into a 64-character '0'/'1' string (bit i maps to character i).
def int2str(data):
    """Render the low 64 bits of ``data`` as a string, least significant bit first."""
    return ''.join('1' if data & (1 << pos) else '0' for pos in range(64))


# Index of the fingerprints kept so far:
# (segment position, 16-bit segment value) -> list of full 64-bit fingerprints.
hash_map = dict()

if __name__ == '__main__':

    repeated_count = 0
    count = 0
    client = pymongo.MongoClient(MONGO_CONNECT)
    db = client[MONGO_DBS]
    source_col = db[MONGO_SOURCE_COLLECTION]
    target_col = db[MONGO_TARGET_COLLECTION]
    # Collection.count() was removed in PyMongo 4; count_documents replaces it.
    sum_count = source_col.count_documents({})
    # The cursor is capped at 1000 documents, presumably for trial runs;
    # drop the limit() call to process the whole collection.
    source_cursor = source_col.find().limit(1000)

    for item in source_cursor:
        count += 1
        if count % 100000 == 0:
            print("count: ", count)
            print("rate of progress: ", "%.2f%%" % (count/sum_count*100))
        ques = clean_ques(item['data'])
        old_id = item['_id']
        hash_value_str, hash_value_int = simhash_(ques)

        # Split the 64-bit fingerprint into four 16-bit segments. The segment
        # position is part of the key so that equal values found at different
        # positions do not end up in the same bucket.
        keys = [(i, str2int(hash_value_str[i*16:(i+1)*16])) for i in range(4)]

        # Two fingerprints that differ in fewer than four bits must agree on
        # at least one whole segment, so only those buckets need scanning.
        is_duplicate = any(
            count_diff_int(seen, hash_value_int) < THRESHOLD
            for key in keys
            for seen in hash_map.get(key, ())
        )
        if is_duplicate:
            repeated_count += 1
            continue

        # Register a kept fingerprint under every one of its segments so that
        # later near-duplicates can find it through any matching segment.
        for key in keys:
            hash_map.setdefault(key, []).append(hash_value_int)
        data = {'ques': ques, 'old_id': old_id, 'hash_value': str(hash_value_int)}
        # Collection.insert() was removed in PyMongo 4; insert_one replaces it.
        target_col.insert_one(data)

    print("total documents: ", sum_count)
    print("total count: ", count)
    print("repeated count: ", repeated_count)
    print("valid count: ", count - repeated_count)
    # An empty cursor would otherwise raise ZeroDivisionError here.
    if count:
        print("valid rate: ", 1 - repeated_count/count)
        print("repeated rate: ", repeated_count/count)

simhash test

# Smoke test: fingerprint two different questions, print both bit strings,
# then print the number of positions at which they differ.
sentence1 = "I don't like the first one．I like _________．（　　）A．twoB．secondC．the second "
sentence2 = 'To stay awake， he coffee and ordered _______．（　　）A．．otherC．the othersD．another           '

str1 = simhash_(sentence1)[0]
str2 = simhash_(sentence2)[0]
for fingerprint in (str1, str2):
    print(fingerprint)

print(count_diff_str(str1, str2))

獣医師が必死にバイオインフォマティクスPythonによる実践レシピをwin10で動かす（その3）

『猪弟アーチJava』連載番外編:Javaエージェント(中)