TensorFlowでオセロに挑戦


概要

TensorFlowでオセロをやるために、OpenAI Gymのオセロ環境を作ってみた。
手動で動作を確認するためのサンプルコードを載せる。
誰かが、TensorFlowで負かせてくれる事を望む。

写真

環境

windows 7 sp1 64bit
anaconda3
tensorflow 1.0
OpenAI Gym 0.5

オセロ環境の概要

observe:

8*8の盤面を0から63までの配列にして返す

 0  1  2  3  4  5  6  7 
 8  9 10 11 12 13 14 15
16 17 18 19 20 21 22 23
24 25 26 27 28 29 30 31
32 33 34 35 36 37 38 39
40 41 42 43 44 45 46 47
48 49 50 51 52 53 54 55
56 57 58 59 60 61 62 63

0 なし
1 白
-1 黒

reward:

その手で裏返した（獲った）石の数

gameOver:

どちらかの色の石が無くなったか、盤が埋まったとき。

action:

0から63で指定。
-1を指定すると、相手（黒）が自動で一手指す。

確認用のサンプルコード

from __future__ import print_function
import math
import sys
import gym
import gym.spaces
import numpy as np
from gym import core, spaces
from gym.utils import seeding
import time
import random
import os.path

class oseroEnv(gym.Env):
    """Othello (Reversi) environment built on ``gym.Env``.

    The board is a flat array of 64 cells indexed row by row
    (index = row * 8 + column).  A cell holds 0 (empty), 1 (white) or
    -1 (black).

    Actions accepted by ``_step``:
        0..63  place a white stone on that cell.
        -1     let the built-in opponent place a black stone.

    The reward of a step is the number of stones flipped by that move.
    """

    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second' : 10
    }

    def __init__(self):
        self.viewer = None
        self.gridSize = 8
        self.nbStates = self.gridSize * self.gridSize
        # Start from the opening position instead of uninitialised memory so
        # that stepping or rendering before the first reset is well defined.
        self.state = self._initialBoard()
        self.count = 0

    def _initialBoard(self):
        """Return a fresh board holding the four opening stones."""
        board = np.zeros(self.nbStates, dtype = np.int8)
        board[27] = 1
        board[28] = -1
        board[35] = -1
        board[36] = 1
        return board

    def check(self, put, d):
        """Tell whether stepping from cell ``put`` by offset ``d`` leaves the board.

        Args:
            put: cell index (0..63) to step from.
            d: index offset of the step (-9, -8, -7, -1, 1, 7, 8 or 9).

        Returns:
            1 if the step would cross a board edge or leave the 0..63 range,
            otherwise 0.
        """
        x = put % 8
        y = put // 8
        # Offsets that move left / right / up / down wrap around the flat
        # array when taken from the matching edge, so they are rejected.
        if x == 0 and d in (-9, -1, 7):
            return 1
        if x == 7 and d in (-7, 1, 9):
            return 1
        if y == 0 and d in (-9, -8, -7):
            return 1
        if y == 7 and d in (7, 8, 9):
            return 1
        target = put + d
        if target < 0 or target > 63:
            return 1
        return 0

    def oku(self, put, iro):
        """Place a stone of colour ``iro`` on cell ``put`` and flip captures.

        Nothing happens when the cell is occupied or the move captures no
        stone.  Every flipped stone increments ``self.count``.

        Args:
            put: cell index (0..63).
            iro: colour of the stone being placed (1 white, -1 black).

        Returns:
            -1 if at least one direction captured stones, otherwise 0.
        """
        res = 0
        # Colour of the stones that can be captured by this move.
        turn = -1 if iro == 1 else 1
        if self.state[put] != 0:
            return res
        for step in (-9, -8, -7, -1, 1, 7, 8, 9):
            # Walk over the run of opposing stones in this direction.
            count = 0
            tugi = put
            while self.check(tugi, step) != 1:
                count += 1
                tugi += step
                if self.state[tugi] != turn:
                    break
            # count > 1 means at least one opposing stone was crossed before
            # the walk stopped; it is a capture only if it stopped on our own
            # colour.
            if count > 1 and self.state[tugi] == iro:
                res = -1
                tugi = put
                while True:
                    self.state[tugi] = iro
                    tugi += step
                    if self.state[tugi] != turn:
                        break
                    self.count += 1
        return res

    def sasu(self):
        """Choose a cell for the built-in black player.

        Empty cells are tried in the fixed priority order ``suji``; the first
        one that would capture at least one white stone is chosen.

        Returns:
            The chosen cell index, or 0 when no capturing cell exists (0 is
            also a real cell index, so the two cases are not distinguishable
            from the return value alone).
        """
        suji = [0, 7, 56, 63, 18, 21, 42, 45, 2, 16, 5, 23, 40, 58, 47, 61, 3, 4, 11, 12, 19, 20, 24, 25, 26, 32, 33, 34, 29, 30, 31, 37, 38, 39, 43, 44, 51, 52, 59, 60, 1, 8, 9, 10, 17, 6, 13, 14, 15, 22, 41, 48, 49, 50, 57, 46, 53, 54, 55, 62]
        iro = -1
        turn = 1
        steps = (-9, 9, -7, 7, -1, 1, -8, 8)
        for put in suji:
            if self.state[put] != 0:
                continue
            captured = 0
            for step in steps:
                if self.check(put, step) != 0:
                    continue
                count = 0
                tugi = put + step
                # Count consecutive white stones, stopping at the board edge.
                while self.state[tugi] == turn:
                    count += 1
                    if self.check(tugi, step) == 1:
                        break
                    tugi += step
                if count > 0 and self.state[tugi] == iro:
                    captured += count
            if captured > 0:
                return put
        return 0

    def getState(self):
        """Return the internal board array (not a copy)."""
        return self.state

    def getReward(self):
        """Return the number of stones flipped by the most recent move."""
        return self.count

    def isGameOver(self):
        """Return True when the board is full or one colour has no stones.

        NOTE: a position where neither side has a capturing move is not
        detected here.
        """
        board = self.state
        boardFull = not (board == 0).any()
        noWhite = not (board == 1).any()
        noBlack = not (board == -1).any()
        return boardFull or noWhite or noBlack

    def updateState(self, action):
        """Apply ``action`` to the board.

        Args:
            action: 0..63 places a white stone on that cell; -1 lets the
                built-in black player move.  Any other value leaves the board
                untouched.
        """
        # Reset first so that an ignored action reports a reward of 0 instead
        # of repeating the previous move's capture count.
        self.count = 0
        if action < -1 or action > 63:
            return
        if action == -1:
            self.oku(self.sasu(), -1)
        else:
            self.oku(action, 1)

    def observe(self):
        """Return the internal board array (not a copy)."""
        return self.state

    def _reset(self):
        """Restore the opening position and return the observation."""
        self.state = self._initialBoard()
        self.count = 0
        return self.observe()

    def _step(self, action):
        """Apply ``action`` and return ``(observation, reward, gameOver, info)``."""
        self.updateState(action)
        reward = self.getReward()
        gameOver = self.isGameOver()
        return self.observe(), reward, gameOver, {}

    def _render(self, mode = 'human', close = False):
        """Draw the board with the classic-control viewer.

        Args:
            mode: 'rgb_array' asks the viewer to return the frame as an
                array; any other value renders normally.
            close: when True, close the viewer (if any) and return None.

        Returns:
            Whatever ``viewer.render`` returns for the requested mode.
        """
        if close:
            if self.viewer is not None:
                self.viewer.close()
                self.viewer = None
            return
        # Imported lazily so the environment can be used without a display.
        from gym.envs.classic_control import rendering
        if self.viewer is None:
            self.viewer = rendering.Viewer(500, 500)
            self.viewer.set_bounds(-2.2, 2.2, -2.2, 2.2)
        black = (0, 0, 0)
        # Green playing field with a black border.
        self.viewer.draw_polygon([(-2.0, -2.0), (2.0, -2.0), (2.0, 2.0), (-2.0, 2.0)], color = (0, 1, 0))
        self.viewer.draw_line((-2.0, 2.0), (2.0, 2.0), color = black)
        self.viewer.draw_line((-2.0, 2.0), (-2.0, -2.0), color = black)
        self.viewer.draw_line((2.0, 2.0), (2.0, -2.0), color = black)
        self.viewer.draw_line((-2.0, -2.0), (2.0, -2.0), color = black)
        # Seven inner grid lines in each direction, 0.5 apart.
        for i in range(7):
            pos = i * 0.5 - 1.5
            self.viewer.draw_line((pos, -2.0), (pos, 2.0), color = black)
            self.viewer.draw_line((-2.0, pos), (2.0, pos), color = black)
        for i in range(64):
            stone = self.state[i]
            if stone == 1:
                color = (1, 1, 1)
            elif stone == -1:
                color = black
            else:
                continue
            # Cell centre: column grows to the right, row 0 is at the top.
            x = (i % 8) * 0.5 - 1.75
            y = 1.75 - (i // 8) * 0.5
            transform0 = rendering.Transform(translation = (x, y))
            self.viewer.draw_circle(0.2, 20, color = color).add_attr(transform0)
        return self.viewer.render(return_rgb_array = mode == 'rgb_array')

# Module-level environment instance used by the interactive test below.
env = oseroEnv()

def test(env):
    """Play the environment by hand from standard input.

    Each input line is one action: 0..63 places a stone on that cell and
    -1 lets the opponent move.  After every step the observation and reward
    are printed and the board is rendered.  The loop ends when the
    environment reports game over or when standard input is closed.

    Args:
        env: environment exposing ``reset``, ``step`` and ``render``.
    """
    env.reset()
    env.render()
    while True:
        try:
            i = int(input())
        except ValueError:
            # A typo should not abort the whole game; ask again.
            print ("please enter an integer from -1 to 63")
            continue
        except EOFError:
            # Input stream closed: nothing more to play.
            break
        observe, reward, gameOver, info = env.step(i)
        print (observe, reward)
        env.render()
        if gameOver:
            print ("game over!")
            break

# Start the interactive session on the module-level environment.
test(env)