ハンgmanゲーム

6171 ワード

挑戦はコード死刑囚のアルゴリズムでこのゲームをします.あなたのアルゴリズムはbaselineアルゴリズムよりも、私たちはあなたと理想に50%を超える精度を提供するべきです.このメールでは、トレーニング辞書ファイルとJupyterノートテンプレートを添付して、コード、実行、アルゴリズムの提出方法を示しています.
ゲームのルールは、単語の長さを与えて、単語を当てさせて、当てたアルファベットは保留することができて、当て間違えたアルファベットはあなたの命を使って、すべての単語はあなたが6つの命を持っています.
モデルの背景には、ゲームのルールが知られており、トレーニングセットとして単語辞書もあります.
このゲームには2つの考え方があり,1つはGPUハードウェア施設を比較的に食べること,すなわちRNNの方法で辞書内の情報を訓練してnlp予測を行い,もう1つはゲームルールに基づいて訓練された辞書からいくつかの統計情報を得,これらの先行情報に基づいて辞書外の単語を予測することである.ハードウェアの問題で、私は後者を使っていますが、単語の長さや単語を当てた後、アルファベットの対応する位置、当てられなかったアルファベットの対応する位置などを私の特徴として訓練しています.
// An highlighted block

 #my code
def update_counter(lm, data, order):
    for i in range(len(data)-order):
        history, char = data[i:i+order], data[i+order]
        if history.find(' ') == -1 and char != ' ':
            lm[history][char]+=1
    return lm

def train_char_lm(dic, order=3):
    lm = defaultdict(Counter)
    data = dic
    # calculate the probability of letter appearnce in a specified gram (length decided by order)
    for i in range(len(data)-order):
        gram = data[i:i+order]
        histories = []
        chars = []
        if gram.find(' ') == -1 and gram.find('_') == -1:
            replace_space = order - 1
            while(replace_space>0):
                gram_list = list(gram)
                comb = combinations(range(len(gram)), replace_space)
                for i in list(comb):
                    keeped_letter = [x if idx not in list(i) else '_' for idx, x in enumerate(gram_list)]
                    replaced_letter = [x for idx, x in enumerate(gram_list) if idx in list(i)]
                    histories += ["".join(keeped_letter)]
                    chars += [Counter("".join(replaced_letter)).most_common()[0][0]]               
                replace_space -= 1
        for history, char in zip(histories, chars):
            if history.find(' ') == -1 and char != ' ':
                lm[history][char]+=1
    
    def normalize(counter):
        s = float(sum(counter.values()))
        return [(c,cnt/s) for c,cnt in counter.items()]
    outlm = {hist:normalize(chars) for hist, chars in lm.items()}
    return outlm
def check_guessed(sorted_letter_count, guessed_letters):
    flag = False
    for letter, instance_prob in sorted_letter_count:
        if letter not in guessed_letters and letter is not '_':
            flag = True
            break
    if flag:
        return (letter, instance_prob)
    else:
        return

def update_candidate(candidate_list, curr_guess):
    if curr_guess is not None:
        if curr_guess[0] in candidate_list:
            candidate_list[curr_guess[0]] += curr_guess[1]
        else:
            candidate_list[curr_guess[0]] = curr_guess[1]
    return candidate_list

def generate_letter(full_dict, guess, lm, d, guessed_letters, order = 3):
    candidate_list = {} 
    print (guess)
     ## the first time guess is based on probablity of all words with the length of target word
    if len(set(guess))== 1 and guess[0] == '_':
        curr_guess = check_guessed(collections.Counter("".join(d[len(guess)])).most_common(), guessed_letters)
        candidate_list = update_candidate(candidate_list, curr_guess)
    
    ## get the gram of current word and get the probablity of letters from language model
    for i in range(len(guess)):
        stem = guess[i:i+order]
        if stem in lm:
            curr_guess = check_guessed(sorted(lm[stem], key=lambda item:item[1], reverse=True), guessed_letters)
            candidate_list = update_candidate(candidate_list, curr_guess)
            
    ## if the gram is not in the language model, then use the default probability of all words with the length of target word
    if (len(candidate_list) == 0):
        curr_guess = check_guessed(collections.Counter("".join(d[len(guess)])).most_common(), guessed_letters)
        candidate_list = update_candidate(candidate_list, curr_guess)
    
    letter = max(candidate_list, key=lambda k: candidate_list[k])
    return letter

def play(answer, lm,  d, order, nTrials=6):
    guess = "_ " * int(len(answer)/2)
    guess_clean = guess[::2].replace(" ", "")
    full_dict = full_dictionary
    guessed_letters = []
    errors = 0
    count = 0
    flag = False
    while(errors < nTrials):
        c = generate_letter(full_dict, guess_clean, lm, d, guessed_letters, order)
        guessed_letters += [c]
        if answer.find(c)!=-1:
            idx = [pos for pos, char in enumerate(answer) if char == c]
            for j in idx:
                guess = '%s%s%s'%(guess[:j],c,guess[j+1:])
        else:
            errors += 1
        print ("-------------")   
        print (count, errors, c, ':  ', guess_clean)
        print ("-------------")
        guess_clean = guess[::2]
        if guess_clean.find('_') == -1:
            flag = True
            break
        count += 1
    return guess, flag
def build_dictionary(dictionary_file_location):
    text_file = open(dictionary_file_location,"r")
    full_dictionary = text_file.read().splitlines()
    text_file.close()
    return full_dictionary

full_dictionary_location = "words_250000_train.txt"
full_dictionary = build_dictionary(full_dictionary_location)
#95% train and 5% test
full_dic, answers = train_test_split(full_dictionary, test_size = 0.05)
lm = train_char_lm(" ".join(full_dic), 5)
d=defaultdict(list)
for word in full_dic:
    d[len(word)].append(word)
N = len(answers)
success = 0
for answer in answers:
    answer = " ".join(answer) + " "
    res, flag = play(answer, lm, d, order = 5)
    if flag:
        success += 1
        print ("success!, the answer is " + res)
    else:
        print("failed!, the answer is " + answer + " the guess is " + res)
acc = success/(N*1.0)*100
print ("success rate is %0.2f%%"%acc)
# here ahout is 51%
, 50% , , 6 live , trex ,