Trying TicTacToe with TensorFlow, Part 2
Overview
To play TicTacToe with TensorFlow, I built a TicTacToe environment for OpenAI Gym.
Then I tried training against it with TensorFlow.
Photo
Played 100 games: 4 wins and 4 draws.
Environment
Windows 7 SP1 64-bit
Anaconda3
TensorFlow 1.0
OpenAI Gym 0.5
Approach
No Q-learning, no CNN, no RNN.
Instead, build an estimator that judges board positions.
The environment observations are stored up during each game,
and the brain is trained only when the reward is 1 or -1.
At run time, pick an action at random, form the resulting board,
and ask the brain whether it looks good.
Training runs for 1,000 games.
The TensorFlow graph is input, fc, relu, fc, output.
The input has 9 units, the output 2, and each fc layer 20 (a sketch of this graph follows below).
The optimizer is GradientDescentOptimizer.
There is a training program and a verification program.
Training saves the model with saver.save.
Verification restores it with saver.restore.
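The article does not show the graph definition itself, so here is a minimal sketch consistent with the description above (9 inputs, two fc layers of 20 units with one relu between them, 2 outputs, GradientDescentOptimizer). The weight initialization, learning rate, softmax cross-entropy loss, and argmax predict_op are my assumptions, not taken from the article; the code samples below refer to the X, Y, train_op, loss, and predict_op tensors defined here.

import tensorflow as tf

# Sketch only: layer sizes follow the article; everything else
# (initialization, learning rate, loss) is assumed.
X = tf.placeholder(tf.float32, [None, 9])   # board: 9 cells
Y = tf.placeholder(tf.float32, [None, 2])   # label: [1,0] = bad, [0,1] = good

w1 = tf.Variable(tf.truncated_normal([9, 20], stddev=0.1))
b1 = tf.Variable(tf.zeros([20]))
h1 = tf.nn.relu(tf.matmul(X, w1) + b1)      # input -> fc -> relu

w2 = tf.Variable(tf.truncated_normal([20, 20], stddev=0.1))
b2 = tf.Variable(tf.zeros([20]))
h2 = tf.matmul(h1, w2) + b2                 # fc

w3 = tf.Variable(tf.truncated_normal([20, 2], stddev=0.1))
b3 = tf.Variable(tf.zeros([2]))
output = tf.matmul(h2, w3) + b3             # output logits

loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=Y, logits=output))
train_op = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
predict_op = tf.argmax(output, 1)           # 1 = "this board looks like a win"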
Training sample code
import os
import numpy as np
import tensorflow as tf

# toeEnv (the TicTacToe Gym environment from part 1), sasu (the move
# picker used during training), and the graph tensors X, Y, train_op,
# and loss are defined at module level, outside this function.

def main(_):
    global X, Y, train_op, loss
    env = toeEnv()
    saver = tf.train.Saver()
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        for j in range(1000):
            states = []
            rewards = []
            env.reset()
            isGameOver = False
            pstate = [0, 0, 0, 0, 0, 0, 0, 0, 0]
            while not isGameOver:
                action = sasu(pstate)
                state, reward, gameOver, _ = env.step(action)
                env.render()
                states.append(state)
                if gameOver:
                    isGameOver = gameOver
                else:
                    # action -1 asks the environment to play the opponent's move
                    pstate, reward, gameOver, _ = env.step(-1)
                    isGameOver = gameOver
            # train only on decided games: every stored board is labeled
            # [0, 1] after a win and [1, 0] after a loss
            if reward == 1:
                for i in range(len(states)):
                    rewards.append(np.array([0, 1]))
                op, lss = sess.run([train_op, loss],
                                   feed_dict={X: states, Y: rewards})
                print("Epo: " + str(j) + " loss: " + str(lss) + " reward: 1")
            if reward == -1:
                for i in range(len(states)):
                    rewards.append(np.array([1, 0]))
                op, lss = sess.run([train_op, loss],
                                   feed_dict={X: states, Y: rewards})
                print("Epo: " + str(j) + " loss: " + str(lss) + " reward: -1")
        save_path = saver.save(sess, os.getcwd() + "/toe1.ckpt")
        print("Model saved in file: %s" % save_path)
Verification sample code
import os
import numpy as np
import tensorflow as tf

# toeEnv and the graph tensors X and predict_op are defined at module
# level, as in the training script.

def main(_):
    global X, Y, predict_op, loss
    env = toeEnv()
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, os.getcwd() + "/toe1.ckpt")
        win = 0
        draw = 0
        for j in range(100):
            env.reset()
            isGameOver = False
            pstate = [0, 0, 0, 0, 0, 0, 0, 0, 0]
            while not isGameOver:
                found = False
                while not found:
                    # try a random square and ask the brain whether the
                    # resulting board looks like a win (class 1)
                    ppstate = pstate.copy()
                    i = np.random.randint(0, 9)
                    ppstate[i] = 1.0
                    op = sess.run(predict_op, feed_dict={X: [ppstate]})
                    if op == [1]:
                        found = True
                        action = i
                state, reward, gameOver, _ = env.step(action)
                env.render()
                if gameOver:
                    isGameOver = gameOver
                else:
                    pstate, reward, gameOver, _ = env.step(-1)
                    env.render()
                    isGameOver = gameOver
            if reward == 1:
                win += 1
            if reward == 0:
                draw += 1
            print("Epo: " + str(j) + " reward: " + str(reward) +
                  " win: " + str(win) + " draw: " + str(draw))
Author and Source
About this article (Trying TicTacToe with TensorFlow, Part 2), the original can be found at https://qiita.com/ohisama@github/items/96430b81e551e2e7d721. The author's information is included in the original URL; copyright belongs to the original author.