Merge remote-tracking branch 'origin' into mcts_virtual_loss

This commit is contained in:
mcgrady00h 2017-12-24 15:44:30 +08:00
commit 941284e7b1
23 changed files with 1114 additions and 1162 deletions

2
.gitignore vendored
View File

@ -7,3 +7,5 @@ parameters
checkpoints
checkpoints_origin
*.json
.DS_Store
data

View File

@ -1,211 +0,0 @@
import os
import time
import sys
import numpy as np
import time
import tensorflow as tf
import tensorflow.contrib.layers as layers
import multi_gpu
import time
# os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
def residual_block(input, is_training):
    """One residual block: conv-BN-ReLU, conv-BN, skip connection, final ReLU."""
    bn_params = {'is_training': is_training,
                 'updates_collections': tf.GraphKeys.UPDATE_OPS}
    regularizer = layers.l2_regularizer(1e-4)
    residual = layers.conv2d(input, 256, kernel_size=3, stride=1,
                             activation_fn=tf.nn.relu,
                             normalizer_fn=layers.batch_norm,
                             normalizer_params=bn_params,
                             weights_regularizer=regularizer)
    residual = layers.conv2d(residual, 256, kernel_size=3, stride=1,
                             activation_fn=tf.identity,
                             normalizer_fn=layers.batch_norm,
                             normalizer_params=bn_params,
                             weights_regularizer=regularizer)
    return tf.nn.relu(residual + input)
def policy_heads(input, is_training):
    """Policy head: 1x1 conv to 2 planes, flatten, linear layer to 362 logits (361 moves + pass)."""
    bn_params = {'is_training': is_training,
                 'updates_collections': tf.GraphKeys.UPDATE_OPS}
    regularizer = layers.l2_regularizer(1e-4)
    features = layers.conv2d(input, 2, kernel_size=1, stride=1,
                             activation_fn=tf.nn.relu,
                             normalizer_fn=layers.batch_norm,
                             normalizer_params=bn_params,
                             weights_regularizer=regularizer)
    flat = layers.flatten(features)
    return layers.fully_connected(flat, 362, activation_fn=tf.identity,
                                  weights_regularizer=regularizer)
def value_heads(input, is_training):
    """Value head: 1x1 conv to 2 planes, flatten, 256-unit hidden layer, tanh scalar in [-1, 1]."""
    bn_params = {'is_training': is_training,
                 'updates_collections': tf.GraphKeys.UPDATE_OPS}
    regularizer = layers.l2_regularizer(1e-4)
    features = layers.conv2d(input, 2, kernel_size=1, stride=1,
                             activation_fn=tf.nn.relu,
                             normalizer_fn=layers.batch_norm,
                             normalizer_params=bn_params,
                             weights_regularizer=regularizer)
    hidden = layers.fully_connected(layers.flatten(features), 256,
                                    activation_fn=tf.nn.relu,
                                    weights_regularizer=regularizer)
    return layers.fully_connected(hidden, 1, activation_fn=tf.nn.tanh,
                                  weights_regularizer=regularizer)
class Network(object):
    """AlphaGo-Zero-style policy/value network for 19x19 Go.

    Input is a [N, 19, 19, 17] feature stack; the tower is one initial
    convolution plus 19 residual blocks, feeding a 362-way policy head
    and a scalar value head.
    """

    def __init__(self):
        # Board feature planes; 17 channels per the input placeholder shape.
        # NOTE(review): presumably 8 own + 8 opponent history planes plus a
        # color plane — confirm against the data pipeline.
        self.x = tf.placeholder(tf.float32, shape=[None, 19, 19, 17])
        self.is_training = tf.placeholder(tf.bool, shape=[])
        # z: game outcome target; pi: target policy distribution (361 moves + pass).
        self.z = tf.placeholder(tf.float32, shape=[None, 1])
        self.pi = tf.placeholder(tf.float32, shape=[None, 362])
        self.build_network()

    def build_network(self):
        """Build the residual tower, both heads, losses, train op and saver."""
        h = layers.conv2d(self.x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu,
                          normalizer_fn=layers.batch_norm,
                          normalizer_params={'is_training': self.is_training,
                                             'updates_collections': tf.GraphKeys.UPDATE_OPS},
                          weights_regularizer=layers.l2_regularizer(1e-4))
        for i in range(19):
            h = residual_block(h, self.is_training)
        self.v = value_heads(h, self.is_training)
        self.p = policy_heads(h, self.is_training)
        # Build the softmax op once here. The original created
        # tf.nn.softmax(...) inside the sess.run training loop, which adds a
        # new node to the graph on every iteration (and referenced an
        # undefined name `p`).
        self.prob = tf.nn.softmax(self.p)
        self.value_loss = tf.reduce_mean(tf.square(self.z - self.v))
        self.policy_loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=self.pi, logits=self.p))
        self.reg = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
        self.total_loss = self.value_loss + self.policy_loss + self.reg
        # Batch-norm moving averages live in UPDATE_OPS and must run with the
        # optimizer step.
        self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(self.update_ops):
            self.train_op = tf.train.RMSPropOptimizer(1e-4).minimize(self.total_loss)
        self.var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
        self.saver = tf.train.Saver(max_to_keep=10, var_list=self.var_list)

    def train(self):
        """Train on every .npz shard under data_path, checkpointing every
        20 iterations and resuming from the latest checkpoint if present."""
        data_path = "/home/tongzheng/data/"
        data_name = os.listdir("/home/tongzheng/data/")
        epochs = 100
        batch_size = 128
        result_path = "./checkpoints/"
        with multi_gpu.create_session() as sess:
            sess.run(tf.global_variables_initializer())
            ckpt_file = tf.train.latest_checkpoint(result_path)
            if ckpt_file is not None:
                print('Restoring model from {}...'.format(ckpt_file))
                self.saver.restore(sess, ckpt_file)
            for epoch in range(epochs):
                for name in data_name:
                    data = np.load(data_path + name)
                    boards = data["boards"]
                    wins = data["wins"]
                    ps = data["ps"]
                    print(boards.shape)
                    print(wins.shape)
                    print(ps.shape)
                    batch_num = boards.shape[0] // batch_size
                    index = np.arange(boards.shape[0])
                    np.random.shuffle(index)
                    value_losses = []
                    policy_losses = []
                    regs = []
                    # time_train holds the negative start time, so
                    # time.time() + time_train below is elapsed seconds.
                    time_train = -time.time()
                    for iter in range(batch_num):
                        batch_index = index[iter * batch_size:(iter + 1) * batch_size]
                        # BUG FIX: the original ran tf.nn.softmax(p) with an
                        # undefined name `p`; use the prebuilt self.prob.
                        lv, lp, r, value, prob, _ = sess.run(
                            [self.value_loss, self.policy_loss, self.reg,
                             self.v, self.prob, self.train_op],
                            feed_dict={self.x: boards[batch_index],
                                       self.z: wins[batch_index],
                                       self.pi: ps[batch_index],
                                       self.is_training: True})
                        value_losses.append(lv)
                        policy_losses.append(lp)
                        regs.append(r)
                        # iter % 1 is always 0: log (and reset the running
                        # means) on every iteration, as the original did.
                        if iter % 1 == 0:
                            print(
                                "Epoch: {}, Part {}, Iteration: {}, Time: {}, Value Loss: {}, Policy Loss: {}, Reg: {}".format(
                                    epoch, name, iter, time.time() + time_train,
                                    np.mean(np.array(value_losses)),
                                    np.mean(np.array(policy_losses)),
                                    np.mean(np.array(regs))))
                            time_train = -time.time()
                            value_losses = []
                            policy_losses = []
                            regs = []
                        if iter % 20 == 0:
                            save_path = "Epoch{}.Part{}.Iteration{}.ckpt".format(epoch, name, iter)
                            self.saver.save(sess, result_path + save_path)
                    del data, boards, wins, ps

    def forward(self, checkpoint_path="/home/tongzheng/tianshou/AlphaGo/checkpoints/"):
        """Open a session, restore the latest checkpoint and return the session.

        Raises ValueError when no checkpoint is found. The caller owns the
        returned session (it is not closed here).
        """
        sess = multi_gpu.create_session()
        sess.run(tf.global_variables_initializer())
        ckpt_file = tf.train.latest_checkpoint(checkpoint_path)
        if ckpt_file is not None:
            print('Restoring model from {}...'.format(ckpt_file))
            self.saver.restore(sess, ckpt_file)
            print('Successfully loaded')
        else:
            raise ValueError("No model loaded")
        return sess
if __name__ == '__main__':
    # Inference-speed smoke test on a dummy board.
    # NOTE(review): randint(0, 1, ...) yields all zeros (the high bound is
    # exclusive) — presumably fine for a timing test, but confirm if a random
    # board was intended.
    state = np.random.randint(0, 1, [1, 19, 19, 17])
    net = Network()
    sess = net.forward()
    start = time.time()
    for i in range(100):
        # Run one policy+value forward pass per step and print cumulative time.
        sess.run([tf.nn.softmax(net.p), net.v], feed_dict={net.x: state, net.is_training: False})
        print("Step {}, Cumulative time {}".format(i, time.time() - start))

View File

@ -1,175 +0,0 @@
import os
import time
import gc
import numpy as np
import tensorflow as tf
import tensorflow.contrib.layers as layers
import multi_gpu
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
def residual_block(input, is_training):
    """Residual unit: two 3x3 convs with batch norm, identity skip, ReLU out."""
    norm_args = {'is_training': is_training,
                 'updates_collections': tf.GraphKeys.UPDATE_OPS}
    out = layers.conv2d(input, 256, kernel_size=3, stride=1,
                        activation_fn=tf.nn.relu,
                        normalizer_fn=layers.batch_norm,
                        normalizer_params=norm_args,
                        weights_regularizer=layers.l2_regularizer(1e-4))
    out = layers.conv2d(out, 256, kernel_size=3, stride=1,
                        activation_fn=tf.identity,
                        normalizer_fn=layers.batch_norm,
                        normalizer_params=norm_args,
                        weights_regularizer=layers.l2_regularizer(1e-4))
    out = out + input
    return tf.nn.relu(out)
def policy_heads(input, is_training):
    """Policy head producing 362 move logits (19*19 board points plus pass)."""
    norm_args = {'is_training': is_training,
                 'updates_collections': tf.GraphKeys.UPDATE_OPS}
    out = layers.conv2d(input, 2, kernel_size=1, stride=1,
                        activation_fn=tf.nn.relu,
                        normalizer_fn=layers.batch_norm,
                        normalizer_params=norm_args,
                        weights_regularizer=layers.l2_regularizer(1e-4))
    out = layers.flatten(out)
    out = layers.fully_connected(out, 362, activation_fn=tf.identity,
                                 weights_regularizer=layers.l2_regularizer(1e-4))
    return out
def value_heads(input, is_training):
    """Value head producing a single tanh-squashed position evaluation."""
    norm_args = {'is_training': is_training,
                 'updates_collections': tf.GraphKeys.UPDATE_OPS}
    out = layers.conv2d(input, 2, kernel_size=1, stride=1,
                        activation_fn=tf.nn.relu,
                        normalizer_fn=layers.batch_norm,
                        normalizer_params=norm_args,
                        weights_regularizer=layers.l2_regularizer(1e-4))
    out = layers.flatten(out)
    out = layers.fully_connected(out, 256, activation_fn=tf.nn.relu,
                                 weights_regularizer=layers.l2_regularizer(1e-4))
    out = layers.fully_connected(out, 1, activation_fn=tf.nn.tanh,
                                 weights_regularizer=layers.l2_regularizer(1e-4))
    return out
# Module-level graph construction (script-style TF1 code): placeholders,
# residual tower, heads, losses, train op and saver are all built at import.
x = tf.placeholder(tf.float32, shape=[None, 19, 19, 17])
is_training = tf.placeholder(tf.bool, shape=[])
# z: game outcome target; pi: target policy over 361 moves + pass.
z = tf.placeholder(tf.float32, shape=[None, 1])
pi = tf.placeholder(tf.float32, shape=[None, 362])
h = layers.conv2d(x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, normalizer_fn=layers.batch_norm,
                  normalizer_params={'is_training': is_training, 'updates_collections': tf.GraphKeys.UPDATE_OPS},
                  weights_regularizer=layers.l2_regularizer(1e-4))
# Tower of 19 residual blocks feeding both heads.
for i in range(19):
    h = residual_block(h, is_training)
v = value_heads(h, is_training)
p = policy_heads(h, is_training)
# loss = tf.reduce_mean(tf.square(z-v)) - tf.multiply(pi, tf.log(tf.clip_by_value(tf.nn.softmax(p), 1e-8, tf.reduce_max(tf.nn.softmax(p)))))
value_loss = tf.reduce_mean(tf.square(z - v))
policy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=pi, logits=p))
reg = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
total_loss = value_loss + policy_loss + reg
# train_op = tf.train.MomentumOptimizer(1e-4, momentum=0.9, use_nesterov=True).minimize(total_loss)
# Batch-norm moving-average updates must run together with the optimizer step.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_op = tf.train.RMSPropOptimizer(1e-4).minimize(total_loss)
var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
saver = tf.train.Saver(max_to_keep=10, var_list=var_list)
def train():
    """Train the module-level graph on every .npz shard under data_path.

    Resumes from the latest checkpoint in result_path when one exists,
    logs running losses every iteration, and saves a checkpoint every
    20 iterations of each shard.
    """
    data_path = "/home/tongzheng/data/"
    data_name = os.listdir("/home/tongzheng/data/")
    epochs = 100
    batch_size = 128
    result_path = "./checkpoints/"
    with multi_gpu.create_session() as sess:
        sess.run(tf.global_variables_initializer())
        ckpt_file = tf.train.latest_checkpoint(result_path)
        if ckpt_file is not None:
            print('Restoring model from {}...'.format(ckpt_file))
            saver.restore(sess, ckpt_file)
        for epoch in range(epochs):
            for name in data_name:
                data = np.load(data_path + name)
                boards = data["boards"]
                wins = data["wins"]
                ps = data["ps"]
                print (boards.shape)
                print (wins.shape)
                print (ps.shape)
                # batch_num = 1
                batch_num = boards.shape[0] // batch_size
                # Shuffle sample order once per shard.
                index = np.arange(boards.shape[0])
                np.random.shuffle(index)
                value_losses = []
                policy_losses = []
                regs = []
                # time_train holds the negative start time, so
                # time.time() + time_train below is the elapsed seconds.
                time_train = -time.time()
                for iter in range(batch_num):
                    lv, lp, r, _ = sess.run([value_loss, policy_loss, reg, train_op],
                                            feed_dict={x: boards[
                                                index[iter * batch_size:(iter + 1) * batch_size]],
                                                z: wins[index[
                                                    iter * batch_size:(iter + 1) * batch_size]],
                                                pi: ps[index[
                                                    iter * batch_size:(iter + 1) * batch_size]],
                                                is_training: True})
                    value_losses.append(lv)
                    policy_losses.append(lp)
                    regs.append(r)
                    del lv, lp, r
                    # iter % 1 is always 0: log and reset running means every
                    # iteration (the explicit del frees the lists eagerly).
                    if iter % 1 == 0:
                        print(
                            "Epoch: {}, Part {}, Iteration: {}, Time: {}, Value Loss: {}, Policy Loss: {}, Reg: {}".format(
                                epoch, name, iter, time.time() + time_train, np.mean(np.array(value_losses)),
                                np.mean(np.array(policy_losses)), np.mean(np.array(regs))))
                        del value_losses, policy_losses, regs, time_train
                        time_train = -time.time()
                        value_losses = []
                        policy_losses = []
                        regs = []
                    if iter % 20 == 0:
                        save_path = "Epoch{}.Part{}.Iteration{}.ckpt".format(epoch, name, iter)
                        saver.save(sess, result_path + save_path)
                        del save_path
                # Release shard arrays before loading the next file.
                del data, boards, wins, ps, batch_num, index
                gc.collect()
def forward(board):
    """Run one policy/value inference on *board* using the latest checkpoint.

    When *board* is None a sample position is loaded from a bundled .npz
    file and some ground-truth debug output is printed alongside the
    prediction. Raises ValueError when no checkpoint can be restored.
    Returns [softmax_policy, value] as produced by sess.run.
    """
    result_path = "./checkpoints"
    itflag = False
    res = None
    data = None
    if board is None:
        # data = np.load("/home/tongzheng/meta-data/80b7bf21bce14862806d48c3cd760a1b.npz")
        data = np.load("./data/7f83928932f64a79bc1efdea268698ae.npz")
        board = data["boards"][50].reshape(-1, 19, 19, 17)
    human_board = board[0].transpose(2, 0, 1)
    print("============================")
    print("human board sum : " + str(np.sum(human_board)))
    print("============================")
    print(board[:, :, :, -1])
    with multi_gpu.create_session() as sess:
        sess.run(tf.global_variables_initializer())
        ckpt_file = tf.train.latest_checkpoint(result_path)
        if ckpt_file is not None:
            print('Restoring model from {}...'.format(ckpt_file))
            saver.restore(sess, ckpt_file)
        else:
            raise ValueError("No model loaded")
        res = sess.run([tf.nn.softmax(p), v], feed_dict={x: board, is_training: itflag})
        print(res)
        # BUG FIX: `data` only exists when board was None; the original
        # unconditionally printed data["p"] and crashed with a NameError
        # for caller-supplied boards.
        # NOTE(review): the key here is "p" while train() reads "ps" —
        # confirm which key the .npz files actually contain.
        if data is not None:
            print(data["p"][0])
            print(np.argmax(data["p"][0]))
        print(np.argmax(res[0]))
    return res
if __name__ == '__main__':
    # Run a single debug inference on the bundled sample position.
    # train()
    # if sys.argv[1] == "test":
    forward(None)

View File

@ -167,7 +167,7 @@ class GTPEngine():
move = self._parse_move(args)
if move:
color, vertex = move
res = self._game.do_move(color, vertex)
res = self._game.play_move(color, vertex)
if res:
return None, True
else:
@ -177,17 +177,21 @@ class GTPEngine():
def cmd_genmove(self, args, **kwargs):
color = self._parse_color(args)
if color:
move = self._game.gen_move(color)
move = self._game.think_play_move(color)
return self._vertex_point2string(move), True
else:
return 'unknown player', False
def cmd_get_score(self, args, **kwargs):
return self._game.executor.get_score(), None
return self._game.game_engine.executor_get_score(self._game.board, True), True
def cmd_show_board(self, args, **kwargs):
    # GTP command: expose the raw board list; True marks success.
    return self._game.board, True
def cmd_get_prob(self, args, **kwargs):
    # GTP command: expose the last MCTS move distribution; True marks success.
    return self._game.prob, True
# BUG FIX: the original guard compared __name__ to "main" (never true),
# and passed the Game *class* to GTPEngine instead of the instance it
# had just constructed.
if __name__ == "__main__":
    game = Game()
    engine = GTPEngine(game_obj=game)

View File

@ -9,16 +9,13 @@ import utils
import copy
import tensorflow as tf
import numpy as np
import sys
import sys, os
import go
import network_small
import strategy
import model
from collections import deque
sys.path.append(os.path.join(os.path.dirname(__file__), os.path.pardir))
from tianshou.core.mcts.mcts import MCTS
import Network
#from strategy import strategy
class Game:
'''
Load the real game and trained weights.
@ -26,35 +23,21 @@ class Game:
TODO : Maybe merge with the engine class in future,
currently leave it untouched for interacting with Go UI.
'''
def __init__(self, size=9, komi=6.5, checkpoint_path=None):
def __init__(self, size=9, komi=3.75, checkpoint_path=None):
self.size = size
self.komi = komi
self.board = [utils.EMPTY] * (self.size * self.size)
self.board = [utils.EMPTY] * (self.size ** 2)
self.history = []
self.latest_boards = deque(maxlen=8)
for _ in range(8):
self.latest_boards.append(self.board)
self.executor = go.Go(game=self)
#self.strategy = strategy(checkpoint_path)
self.simulator = strategy.GoEnv(game=self)
self.net = network_small.Network()
self.sess = self.net.forward(checkpoint_path)
self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v],
feed_dict={self.net.x: state, self.net.is_training: False})
def _flatten(self, vertex):
    # Map a 1-based (x, y) vertex to its flat board index (row-major by y).
    x, y = vertex
    return (y - 1) * self.size + (x - 1)
def _deflatten(self, idx):
    # Inverse of _flatten: flat index back to a 1-based (x, y) vertex.
    x = idx % self.size + 1
    y = idx // self.size + 1
    return (x,y)
self.evaluator = model.ResNet(self.size, self.size**2 + 1, history_length=8, checkpoint_path=checkpoint_path)
# self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v],
# feed_dict={self.net.x: state, self.net.is_training: False})
self.game_engine = go.Go(size=self.size, komi=self.komi)
def clear(self):
self.board = [utils.EMPTY] * (self.size * self.size)
self.board = [utils.EMPTY] * (self.size ** 2)
self.history = []
for _ in range(8):
self.latest_boards.append(self.board)
@ -66,42 +49,30 @@ class Game:
def set_komi(self, k):
self.komi = k
def generate_nn_input(self, history, color):
    # Build the [1, size, size, 17] network input: 8 planes of black
    # stones, 8 planes of white stones (from the 8 most recent boards in
    # *history*), and one plane encoding the player to move.
    # NOTE(review): assumes utils.BLACK == 1 and utils.WHITE == -1, since
    # the planes are built by comparing against +/-1 — confirm in utils.
    state = np.zeros([1, self.size, self.size, 17])
    for i in range(8):
        state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(self.size ** 2)).reshape(self.size, self.size)
        state[0, :, :, i + 8] = np.array(np.array(history[i]) == -np.ones(self.size ** 2)).reshape(self.size, self.size)
    # Final plane: all ones when black to move, all zeros when white.
    if color == utils.BLACK:
        state[0, :, :, 16] = np.ones([self.size, self.size])
    if color == utils.WHITE:
        state[0, :, :, 16] = np.zeros([self.size, self.size])
    return state
def strategy_gen_move(self, latest_boards, color):
self.simulator.latest_boards = copy.copy(latest_boards)
self.simulator.board = copy.copy(latest_boards[-1])
nn_input = self.generate_nn_input(self.simulator.latest_boards, color)
mcts = MCTS(self.simulator, self.evaluator, nn_input, self.size ** 2 + 1, inverse=True, max_step=1)
def think(self, latest_boards, color):
mcts = MCTS(self.game_engine, self.evaluator, [latest_boards, color], self.size ** 2 + 1, inverse=True)
mcts.search(max_step=20)
temp = 1
prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp)
choice = np.random.choice(self.size ** 2 + 1, 1, p=prob).tolist()[0]
if choice == self.size ** 2:
move = utils.PASS
else:
move = (choice % self.size + 1, choice / self.size + 1)
move = self.game_engine._deflatten(choice)
return move, prob
def do_move(self, color, vertex):
def play_move(self, color, vertex):
# this function can be called directly to play the opponent's move
if vertex == utils.PASS:
return True
res = self.executor.do_move(color, vertex)
res = self.game_engine.executor_do_move(self.history, self.latest_boards, self.board, color, vertex)
return res
def gen_move(self, color):
# move = self.strategy.gen_move(color)
# return move
move, self.prob = self.strategy_gen_move(self.latest_boards, color)
self.do_move(color, move)
def think_play_move(self, color):
    """Choose a move for *color* via MCTS and play it immediately.

    Returns the chosen move vertex. The visit-count distribution is kept
    on self.prob because it is needed later for neural network training.
    """
    # although we don't need to return self.prob, however it is needed for neural network training
    move, self.prob = self.think(self.latest_boards, color)
    # play the move immediately
    self.play_move(color, move)
    return move
def status2symbol(self, s):
@ -125,8 +96,9 @@ class Game:
sys.stdout.flush()
if __name__ == "__main__":
g = Game()
g = Game(checkpoint_path='./checkpoints/')
g.show_board()
g.think_play_move(1)
#file = open("debug.txt", "a")
#file.write("mcts check\n")
#file.close()

View File

@ -1,7 +1,7 @@
from __future__ import print_function
import utils
import copy
import sys
import numpy as np
from collections import deque
'''
@ -12,84 +12,26 @@ Settings of the Go game.
'''
NEIGHBOR_OFFSET = [[1, 0], [-1, 0], [0, -1], [0, 1]]
CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]]
class Go:
def __init__(self, **kwargs):
self.game = kwargs['game']
self.size = kwargs['size']
self.komi = kwargs['komi']
def _bfs(self, vertex, color, block, status, alive_break):
block.append(vertex)
status[self.game._flatten(vertex)] = True
nei = self._neighbor(vertex)
for n in nei:
if not status[self.game._flatten(n)]:
if self.game.board[self.game._flatten(n)] == color:
self._bfs(n, color, block, status, alive_break)
def _flatten(self, vertex):
    # Map a 1-based (x, y) vertex to a flat board index (row-major by x).
    x, y = vertex
    return (x - 1) * self.size + (y - 1)
def _find_block(self, vertex, alive_break=False):
block = []
status = [False] * (self.game.size * self.game.size)
color = self.game.board[self.game._flatten(vertex)]
self._bfs(vertex, color, block, status, alive_break)
for b in block:
for n in self._neighbor(b):
if self.game.board[self.game._flatten(n)] == utils.EMPTY:
return False, block
return True, block
def _find_boarder(self, vertex):
block = []
status = [False] * (self.game.size * self.game.size)
self._bfs(vertex, utils.EMPTY, block, status, False)
border = []
for b in block:
for n in self._neighbor(b):
if not (n in block):
border.append(n)
return border
def _is_qi(self, color, vertex):
nei = self._neighbor(vertex)
for n in nei:
if self.game.board[self.game._flatten(n)] == utils.EMPTY:
return True
self.game.board[self.game._flatten(vertex)] = color
for n in nei:
if self.game.board[self.game._flatten(n)] == utils.another_color(color):
can_kill, block = self._find_block(n)
if can_kill:
self.game.board[self.game._flatten(vertex)] = utils.EMPTY
return True
### can not suicide
can_kill, block = self._find_block(vertex)
if can_kill:
self.game.board[self.game._flatten(vertex)] = utils.EMPTY
return False
self.game.board[self.game._flatten(vertex)] = utils.EMPTY
return True
def _check_global_isomorphous(self, color, vertex):
##backup
_board = copy.copy(self.game.board)
self.game.board[self.game._flatten(vertex)] = color
self._process_board(color, vertex)
if self.game.board in self.game.history:
res = True
else:
res = False
self.game.board = _board
return res
def _deflatten(self, idx):
    # Inverse of _flatten: flat index back to a 1-based (x, y) vertex.
    x = idx // self.size + 1
    y = idx % self.size + 1
    return (x, y)
def _in_board(self, vertex):
x, y = vertex
if x < 1 or x > self.game.size: return False
if y < 1 or y > self.game.size: return False
if x < 1 or x > self.size: return False
if y < 1 or y > self.size: return False
return True
def _neighbor(self, vertex):
@ -102,96 +44,201 @@ class Go:
nei.append((_x, _y))
return nei
def _process_board(self, color, vertex):
def _corner(self, vertex):
    """Return the diagonal (corner) neighbours of *vertex* that lie on the board."""
    x, y = vertex
    candidates = ((x + dx, y + dy) for dx, dy in CORNER_OFFSET)
    return [c for c in candidates if self._in_board(c)]
def _find_group(self, current_board, vertex):
    """Flood-fill the same-colored chain containing *vertex*.

    Returns (has_liberty, chain) where chain is the set of vertices in the
    group and has_liberty reports whether any adjacent point is empty.
    """
    color = current_board[self._flatten(vertex)]
    group = set()
    stack = [vertex]
    liberty_found = False
    while stack:
        stone = stack.pop()
        group.add(stone)
        for adjacent in self._neighbor(stone):
            adjacent_state = current_board[self._flatten(adjacent)]
            if adjacent_state == color and adjacent not in group:
                stack.append(adjacent)
            if adjacent_state == utils.EMPTY:
                liberty_found = True
    return liberty_found, group
def _is_suicide(self, current_board, color, vertex):
    """Return True if playing *color* at *vertex* would be suicide.

    Temporarily places the stone on current_board, checks liberties, and
    always undoes the placement before returning — the order of the two
    checks matters (a capturing move is never suicide).
    """
    current_board[self._flatten(vertex)] = color # assume that we already take this move
    suicide = False
    has_liberty, group = self._find_group(current_board, vertex)
    if not has_liberty:
        suicide = True # no liberty, suicide
        for n in self._neighbor(vertex):
            if current_board[self._flatten(n)] == utils.another_color(color):
                opponent_liberty, group = self._find_group(current_board, n)
                if not opponent_liberty:
                    suicide = False # this move is able to take opponent's stone, not suicide
    current_board[self._flatten(vertex)] = utils.EMPTY # undo this move
    return suicide
def _process_board(self, current_board, color, vertex):
nei = self._neighbor(vertex)
for n in nei:
if self.game.board[self.game._flatten(n)] == utils.another_color(color):
can_kill, block = self._find_block(n, alive_break=True)
if can_kill:
for b in block:
self.game.board[self.game._flatten(b)] = utils.EMPTY
if current_board[self._flatten(n)] == utils.another_color(color):
has_liberty, group = self._find_group(current_board, n)
if not has_liberty:
for b in group:
current_board[self._flatten(b)] = utils.EMPTY
def is_valid(self, color, vertex):
def _check_global_isomorphous(self, history_boards, current_board, color, vertex):
    """Positional-superko check: True if playing *color* at *vertex* would
    recreate a board position already present in *history_boards*."""
    next_board = copy.copy(current_board)
    next_board[self._flatten(vertex)] = color
    self._process_board(next_board, color, vertex)
    return next_board in history_boards
def _is_eye(self, current_board, color, vertex):
    """Heuristically decide whether *vertex* is an eye for *color*.

    All four orthogonal neighbours must be our color; then either all
    neighbours belong to one group (true eye), or few enough diagonal
    corners are held by the opponent (< 50%) for the eye to count as real.
    """
    nei = self._neighbor(vertex)
    cor = self._corner(vertex)
    # Set of booleans: does each neighbour match our color?
    ncolor = {color == current_board[self._flatten(n)] for n in nei}
    if False in ncolor:
        # print "not all neighbors are in same color with us"
        return False
    _, group = self._find_group(current_board, nei[0])
    if set(nei) < group:
        # print "all neighbors are in same group and same color with us"
        return True
    else:
        # Count opponent stones on the diagonals to judge fake vs real eye.
        opponent_number = [current_board[self._flatten(c)] for c in cor].count(-color)
        opponent_propotion = float(opponent_number) / float(len(cor))
        if opponent_propotion < 0.5:
            # print "few opponents, real eye"
            return True
        else:
            # print "many opponents, fake eye"
            return False
def _knowledge_prunning(self, current_board, color, vertex):
    """Human-knowledge pruning for selfplay: forbid filling one's own eye.

    Returns False when *vertex* is an eye of *color*, True otherwise.
    """
    return not self._is_eye(current_board, color, vertex)
def _is_game_finished(self, current_board, color):
    '''
    for each empty position, if it has both BLACK and WHITE neighbors, the game is still not finished
    :return: return the game is finished
    '''
    # NOTE(review): the code actually tests whether ANY two neighbours of an
    # empty point differ (including empty neighbours), which is stricter than
    # the docstring's BLACK-vs-WHITE wording — confirm intended semantics.
    board = copy.deepcopy(current_board)
    empty_idx = [i for i, x in enumerate(board) if x == utils.EMPTY] # find all empty idx
    for idx in empty_idx:
        # BUG FIX: the original called self.deflatten / self.flatten, which do
        # not exist — the helpers are named _deflatten / _flatten.
        neighbor_idx = self._neighbor(self._deflatten(idx))
        if len(neighbor_idx) > 1:
            first_idx = neighbor_idx[0]
            for other_idx in neighbor_idx[1:]:
                if board[self._flatten(other_idx)] != board[self._flatten(first_idx)]:
                    return False
    return True
def _action2vertex(self, action):
    """Convert a flat action id into a board vertex.

    The id size**2 denotes pass and maps to the sentinel vertex (0, 0);
    every other id is deflattened to its 1-based (x, y) coordinates.
    """
    if action == self.size ** 2:
        return (0, 0)
    return self._deflatten(action)
def _is_valid(self, history_boards, current_board, color, vertex):
### in board
if not self._in_board(vertex):
return False
### already have stone
if not self.game.board[self.game._flatten(vertex)] == utils.EMPTY:
if not current_board[self._flatten(vertex)] == utils.EMPTY:
return False
### check if it is qi
if not self._is_qi(color, vertex):
### check if it is suicide
if self._is_suicide(current_board, color, vertex):
return False
if self._check_global_isomorphous(color, vertex):
### forbid global isomorphous
if self._check_global_isomorphous(history_boards, current_board, color, vertex):
return False
return True
def do_move(self, color, vertex):
if not self.is_valid(color, vertex):
def simulate_is_valid(self, state, action):
history_boards, color = state
vertex = self._action2vertex(action)
current_board = history_boards[-1]
if not self._is_valid(history_boards, current_board, color, vertex):
return False
if not self._knowledge_prunning(current_board, color, vertex):
return False
self.game.board[self.game._flatten(vertex)] = color
self._process_board(color, vertex)
self.game.history.append(copy.copy(self.game.board))
self.game.latest_boards.append(copy.copy(self.game.board))
return True
def _find_empty(self):
idx = [i for i,x in enumerate(self.game.board) if x == utils.EMPTY ][0]
return self.game._deflatten(idx)
def simulate_is_valid_list(self, state, action_set):
    """Return the subset of *action_set* that is invalid in *state*.

    The last entry of action_set is the pass action; it is additionally
    marked invalid whenever at least one board move remains legal.
    """
    # find all the invalid actions
    invalid_action_list = []
    for action_candidate in action_set[:-1]:
        # go through all the actions excluding pass
        if not self.simulate_is_valid(state, action_candidate):
            invalid_action_list.append(action_candidate)
    if len(invalid_action_list) < len(action_set) - 1:
        invalid_action_list.append(action_set[-1])
        # forbid pass, if we have other choices
        # TODO: In fact we should not do this. In some extreme cases, we should permit pass.
    return invalid_action_list
def get_score(self, is_unknown_estimation = False):
'''
is_unknown_estimation: whether use nearby stone to predict the unknown
return score from BLACK perspective.
'''
_board = copy.copy(self.game.board)
while utils.EMPTY in self.game.board:
vertex = self._find_empty()
boarder = self._find_boarder(vertex)
boarder_color = set(map(lambda v: self.game.board[self.game._flatten(v)], boarder))
if boarder_color == {utils.BLACK}:
self.game.board[self.game._flatten(vertex)] = utils.BLACK
elif boarder_color == {utils.WHITE}:
self.game.board[self.game._flatten(vertex)] = utils.WHITE
elif is_unknown_estimation:
self.game.board[self.game._flatten(vertex)] = self._predict_from_nearby(vertex)
else:
self.game.board[self.game._flatten(vertex)] =utils.UNKNOWN
score = 0
for i in self.game.board:
if i == utils.BLACK:
score += 1
elif i == utils.WHITE:
score -= 1
score -= self.game.komi
def _do_move(self, board, color, vertex):
    # Place a stone of *color* at *vertex* on *board* (mutated in place and
    # returned); a pass leaves the board untouched. No capture processing here.
    if vertex == utils.PASS:
        return board
    else:
        id_ = self._flatten(vertex)
        board[id_] = color
        return board
self.game.board = _board
return score
def simulate_step_forward(self, state, action):
    """Advance a simulation state [history_boards, color] by one action.

    Returns ([history_boards, opponent_color], reward=0).
    NOTE(review): this appends to the caller's history_boards list in
    place (only the board itself is copied) — confirm the MCTS caller
    expects the shared history to be mutated.
    """
    # initialize the simulate_board from state
    history_boards, color = state
    vertex = self._action2vertex(action)
    new_board = self._do_move(copy.copy(history_boards[-1]), color, vertex)
    history_boards.append(new_board)
    # Colors are numeric opposites, so negation switches the player to move.
    new_color = -color
    return [history_boards, new_color], 0
def _predict_from_nearby(self, vertex, neighbor_step = 3):
'''
step: the nearby 3 steps is considered
:vertex: position to be estimated
:neighbor_step: how many steps nearby
:return: the nearby positions of the input position
currently the nearby 3*3 grid is returned, altogether 4*8 points involved
'''
for step in range(1, neighbor_step + 1): # check the stones within the steps in range
neighbor_vertex_set = []
self._add_nearby_stones(neighbor_vertex_set, vertex[0] - step, vertex[1], 1, 1, neighbor_step)
self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] + step, 1, -1, neighbor_step)
self._add_nearby_stones(neighbor_vertex_set, vertex[0] + step, vertex[1], -1, -1, neighbor_step)
self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] - step, -1, 1, neighbor_step)
color_estimate = 0
for neighbor_vertex in neighbor_vertex_set:
color_estimate += self.game.board[self.game._flatten(neighbor_vertex)]
if color_estimate > 0:
return utils.BLACK
elif color_estimate < 0:
return utils.WHITE
def executor_do_move(self, history, latest_boards, current_board, color, vertex):
    """Validate and execute a real-game move, mutating the caller's state.

    On success the stone is placed, captures are removed, and a snapshot of
    the resulting board is appended to both history and latest_boards.
    Returns False (without touching anything) when the move is illegal.
    """
    if not self._is_valid(history, current_board, color, vertex):
        return False
    current_board[self._flatten(vertex)] = color
    self._process_board(current_board, color, vertex)
    history.append(copy.copy(current_board))
    latest_boards.append(copy.copy(current_board))
    return True
def _find_empty(self, current_board):
    """Return the vertex of the first empty intersection on *current_board*."""
    empties = [position for position, stone in enumerate(current_board)
               if stone == utils.EMPTY]
    return self._deflatten(empties[0])
def _find_boarder(self, current_board, vertex):
    """List every vertex adjacent to the group containing *vertex* but not in it.

    Vertices bordering the group in several places appear once per adjacency.
    """
    _, group = self._find_group(current_board, vertex)
    return [adjacent
            for member in group
            for adjacent in self._neighbor(member)
            if adjacent not in group]
def _add_nearby_stones(self, neighbor_vertex_set, start_vertex_x, start_vertex_y, x_diff, y_diff, num_step):
'''
@ -210,3 +257,93 @@ class Go:
neighbor_vertex_set.append((start_vertex_x, start_vertex_y))
start_vertex_x += x_diff
start_vertex_y += y_diff
def _predict_from_nearby(self, current_board, vertex, neighbor_step=3):
    '''
    Guess the owner of an empty vertex from the stones around it on a
    given board.

    Scans growing rings around the vertex: first the stones one step away,
    then two, up to ``neighbor_step`` steps.  The first ring whose stone
    values do not cancel out decides the color.

    :current_board: flat board list to read stones from
    :vertex: position to be estimated (1-based (x, y))
    :neighbor_step: how many steps nearby to examine (default 3)
    :return: utils.BLACK or utils.WHITE; implicitly None when every ring
        sums to zero -- callers must tolerate a None result.
    currently the nearby 3*3 grid is returned, altogether 4*8 points involved
    '''
    for step in range(1, neighbor_step + 1):  # check the stones within the steps in range
        neighbor_vertex_set = []
        # Collect the ring at distance `step`.  NOTE(review): the last
        # argument is neighbor_step, not step -- presumably
        # _add_nearby_stones bounds the walk itself; confirm.
        self._add_nearby_stones(neighbor_vertex_set, vertex[0] - step, vertex[1], 1, 1, neighbor_step)
        self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] + step, 1, -1, neighbor_step)
        self._add_nearby_stones(neighbor_vertex_set, vertex[0] + step, vertex[1], -1, -1, neighbor_step)
        self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] - step, -1, 1, neighbor_step)
        color_estimate = 0
        # Sum the stone values over the ring; the sign selects the color.
        for neighbor_vertex in neighbor_vertex_set:
            color_estimate += current_board[self._flatten(neighbor_vertex)]
        if color_estimate > 0:
            return utils.BLACK
        elif color_estimate < 0:
            return utils.WHITE
def executor_get_score(self, current_board, is_unknown_estimation=False):
    '''
    Score a finished game from BLACK's perspective.

    Every empty point is assigned to the color that completely surrounds
    it.  Contested empty points are either estimated from nearby stones
    (``is_unknown_estimation=True``) or marked utils.UNKNOWN and counted
    for neither side.

    :param current_board: flat board list; not modified (a deep copy is used)
    :param is_unknown_estimation: whether use nearby stone to predict the unknown
    :return: (#black - #white - komi), the score from BLACK's perspective
    '''
    _board = copy.deepcopy(current_board)
    while utils.EMPTY in _board:
        vertex = self._find_empty(_board)
        # Colors of all stones bordering the empty region at `vertex`.
        boarder = self._find_boarder(_board, vertex)
        boarder_color = set(map(lambda v: _board[self._flatten(v)], boarder))
        if boarder_color == {utils.BLACK}:
            _board[self._flatten(vertex)] = utils.BLACK
        elif boarder_color == {utils.WHITE}:
            _board[self._flatten(vertex)] = utils.WHITE
        elif is_unknown_estimation:
            # NOTE: _predict_from_nearby may return None for a perfectly
            # balanced neighborhood; such points then count for no side.
            _board[self._flatten(vertex)] = self._predict_from_nearby(_board, vertex)
        else:
            _board[self._flatten(vertex)] = utils.UNKNOWN
    # list.count runs at C speed and replaces the manual +/-1 loop.
    return _board.count(utils.BLACK) - _board.count(utils.WHITE) - self.komi
if __name__ == "__main__":
    ### unit test for the Go class: _is_eye on hand-crafted 9x9 positions
    pure_test = [
        0, 1, 0, 1, 0, 1, 0, 0, 0,
        1, 0, 1, 0, 1, 0, 0, 0, 0,
        0, 1, 0, 1, 0, 0, 1, 0, 0,
        0, 0, 1, 0, 0, 1, 0, 1, 0,
        0, 0, 0, 0, 0, 1, 1, 1, 0,
        1, 1, 1, 0, 0, 0, 0, 0, 0,
        1, 0, 1, 0, 0, 1, 1, 0, 0,
        1, 1, 1, 0, 1, 0, 1, 0, 0,
        0, 0, 0, 0, 1, 1, 1, 0, 0,
    ]
    # Candidate eyes surrounded purely by our own stones (expected: all True).
    pt_qry = [(1, 1), (1, 5), (3, 3), (4, 7), (7, 2), (8, 6)]
    pt_ans = [True, True, True, True, True, True]
    opponent_test = [
        0, 1, 0, 1, 0, 1, 0, -1, 1,
        1, -1, 0, -1, 1, -1, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1,
        1, 1, -1, 0, 1, -1, 1, 0, 0,
        1, 0, 1, 0, 1, 0, 1, 0, 0,
        -1, 1, 1, 0, 1, 1, 1, 0, 0,
        0, 1, -1, 0, -1, -1, -1, 0, 0,
        1, 0, 1, 0, -1, 0, -1, 0, 0,
        0, 1, 0, 0, -1, -1, -1, 0, 0,
    ]
    # Candidate eyes with opponent stones at the corners (mostly fake eyes).
    ot_qry = [(1, 1), (1, 5), (2, 9), (5, 2), (5, 6), (8, 6), (8, 2)]
    ot_ans = [False, False, False, False, False, False, True]
    go = Go(size=9, komi=3.75)
    for query in pt_qry:
        print(go._is_eye(pure_test, utils.BLACK, query))
    print("Test of pure eye\n")
    for query in ot_qry:
        print(go._is_eye(opponent_test, utils.BLACK, query))
    print("Test of eye surrend by opponents\n")

286
AlphaGo/model.py Normal file
View File

@ -0,0 +1,286 @@
import os
import time
import random
import sys
import cPickle
from collections import deque
import numpy as np
import tensorflow as tf
import tensorflow.contrib.layers as layers
import multi_gpu
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
def residual_block(input, is_training):
    """
    one residual block
    :param input: a tensor, input of the residual block
    :param is_training: a placeholder, indicate whether the model is training or not
    :return: a tensor, output of the residual block
    """
    bn_params = {'is_training': is_training,
                 'updates_collections': tf.GraphKeys.UPDATE_OPS}

    def _conv(x, activation):
        # 3x3, 256-channel convolution with batch norm and L2 weight decay.
        return layers.conv2d(x, 256, kernel_size=3, stride=1, activation_fn=activation,
                             normalizer_fn=layers.batch_norm, normalizer_params=bn_params,
                             weights_regularizer=layers.l2_regularizer(1e-4))

    # conv-BN-relu, conv-BN, then the skip connection and a final relu.
    residual = _conv(_conv(input, tf.nn.relu), tf.identity)
    return tf.nn.relu(residual + input)
def policy_head(input, is_training, action_num):
    """
    the head of policy branch
    :param input: a tensor, input of the policy head
    :param is_training: a placeholder, indicate whether the model is training or not
    :param action_num: an integer, number of unique actions at any state
    :return: a tensor, output of the policy head, shape [batch_size, action_num]
    """
    bn_params = {'is_training': is_training,
                 'updates_collections': tf.GraphKeys.UPDATE_OPS}
    # 1x1 conv reduces the trunk to two feature planes before the dense layer.
    features = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu,
                             normalizer_fn=layers.batch_norm, normalizer_params=bn_params,
                             weights_regularizer=layers.l2_regularizer(1e-4))
    flat = layers.flatten(features)
    # Raw (un-softmaxed) logits, one per action.
    return layers.fully_connected(flat, action_num, activation_fn=tf.identity,
                                  weights_regularizer=layers.l2_regularizer(1e-4))
def value_head(input, is_training):
    """
    the head of value branch
    :param input: a tensor, input of the value head
    :param is_training: a placeholder, indicate whether the model is training or not
    :return: a tensor, output of the value head, shape [batch_size, 1]
    """
    bn_params = {'is_training': is_training,
                 'updates_collections': tf.GraphKeys.UPDATE_OPS}
    # 1x1 conv down to two planes, then a 256-unit hidden layer.
    features = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu,
                             normalizer_fn=layers.batch_norm, normalizer_params=bn_params,
                             weights_regularizer=layers.l2_regularizer(1e-4))
    flat = layers.flatten(features)
    hidden = layers.fully_connected(flat, 256, activation_fn=tf.nn.relu,
                                    weights_regularizer=layers.l2_regularizer(1e-4))
    # tanh squashes the value estimate into [-1, 1].
    return layers.fully_connected(hidden, 1, activation_fn=tf.nn.tanh,
                                  weights_regularizer=layers.l2_regularizer(1e-4))
class Data(object):
    """One self-play game's training record."""

    def __init__(self):
        self.boards = []  # board snapshots, one per move
        self.probs = []   # move-probability vectors aligned with boards
        self.winner = 0   # game result, initialized to 0
class ResNet(object):
    def __init__(self, board_size, action_num, history_length=1, residual_block_num=20, checkpoint_path=None):
        """
        the resnet model
        :param board_size: an integer, the board size
        :param action_num: an integer, number of unique actions at any state
        :param history_length: an integer, the history length to use, default is 1
        :param residual_block_num: an integer, the number of residual block, default is 20, at least 1
        :param checkpoint_path: a string, the path to the checkpoint, default is None,
        """
        self.board_size = board_size
        self.action_num = action_num
        self.history_length = history_length
        self.checkpoint_path = checkpoint_path
        # Input: history_length planes per color plus one color-to-play plane.
        self.x = tf.placeholder(tf.float32, shape=[None, self.board_size, self.board_size, 2 * self.history_length + 1])
        self.is_training = tf.placeholder(tf.bool, shape=[])
        # z: game-outcome targets; pi: move-probability targets.
        self.z = tf.placeholder(tf.float32, shape=[None, 1])
        self.pi = tf.placeholder(tf.float32, shape=[None, self.action_num])
        self._build_network(residual_block_num, self.checkpoint_path)

        # training hyper-parameters:
        # replay window size (number of games kept) and checkpoint frequency
        self.window_length = 7000
        self.save_freq = 5000
        # Replay window keyed per game; 'length' holds each game's state count.
        self.training_data = {'states': deque(maxlen=self.window_length), 'probs': deque(maxlen=self.window_length),
                              'winner': deque(maxlen=self.window_length), 'length': deque(maxlen=self.window_length)}

    def _build_network(self, residual_block_num, checkpoint_path):
        """
        build the network
        :param residual_block_num: an integer, the number of residual block
        :param checkpoint_path: a string, the path to the checkpoint, if None, use random initialization parameter
        :return: None
        """
        # Initial 3x3 convolution, followed by residual_block_num - 1 blocks.
        h = layers.conv2d(self.x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu,
                          normalizer_fn=layers.batch_norm,
                          normalizer_params={'is_training': self.is_training,
                                             'updates_collections': tf.GraphKeys.UPDATE_OPS},
                          weights_regularizer=layers.l2_regularizer(1e-4))
        for i in range(residual_block_num - 1):
            h = residual_block(h, self.is_training)
        self.v = value_head(h, self.is_training)
        self.p = policy_head(h, self.is_training, self.action_num)
        # Loss: value MSE + policy cross-entropy + L2 regularization.
        self.value_loss = tf.reduce_mean(tf.square(self.z - self.v))
        self.policy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.pi, logits=self.p))
        self.reg = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
        self.total_loss = self.value_loss + self.policy_loss + self.reg
        # Run the batch-norm statistics updates together with each train step.
        self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(self.update_ops):
            self.train_op = tf.train.AdamOptimizer(1e-4).minimize(self.total_loss)
        self.var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
        # max_to_keep=0 keeps every checkpoint ever written.
        self.saver = tf.train.Saver(max_to_keep=0, var_list=self.var_list)
        self.sess = multi_gpu.create_session()
        self.sess.run(tf.global_variables_initializer())
        if checkpoint_path is not None:
            ckpt_file = tf.train.latest_checkpoint(checkpoint_path)
            if ckpt_file is not None:
                print('Restoring model from {}...'.format(ckpt_file))
                self.saver.restore(self.sess, ckpt_file)
                print('Successfully loaded')
            else:
                raise ValueError("No model in path {}".format(checkpoint_path))

    def __call__(self, state):
        """
        Evaluate a state with the network.

        :param state: a pair (history, color) -- history is a list of flat
            boards whose length must equal self.history_length, color is the
            player to move (+1 or -1)
        :return: a list [policy_logits, value] from one session run
        :raises ValueError: when len(history) != self.history_length
        """
        history, color = state
        if len(history) != self.history_length:
            raise ValueError(
                'The length of history cannot meet the need of the model, given {}, need {}'.format(len(history),
                                                                                                    self.history_length))
        state = self._history2state(history, color)
        return self.sess.run([self.p, self.v], feed_dict={self.x: state, self.is_training: False})

    def _history2state(self, history, color):
        """
        convert the history to the state we need
        :param history: a list, the history of flat boards
        :param color: +1 or -1, the player to move
        :return: a ndarray, the state
        """
        state = np.zeros([1, self.board_size, self.board_size, 2 * self.history_length + 1])
        for i in range(self.history_length):
            # Plane i: the +1 stones of history step i; plane
            # i + history_length: the -1 stones of the same step.
            state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(self.board_size ** 2)).reshape(self.board_size,
                                                                                                        self.board_size)
            state[0, :, :, i + self.history_length] = np.array(
                np.array(history[i]) == -np.ones(self.board_size ** 2)).reshape(self.board_size, self.board_size)
        # TODO: need a config to specify the BLACK and WHITE
        # Last plane encodes the color to move: all ones for +1, zeros for -1.
        if color == +1:
            state[0, :, :, 2 * self.history_length] = np.ones([self.board_size, self.board_size])
        if color == -1:
            state[0, :, :, 2 * self.history_length] = np.zeros([self.board_size, self.board_size])
        return state

    # TODO: design the interface between the environment and training
    def train(self, mode='memory', *args, **kwargs):
        # Dispatch on the data source; 'memory' training is not implemented yet.
        if mode == 'memory':
            pass
        if mode == 'file':
            self._train_with_file(data_path=kwargs['data_path'], batch_size=kwargs['batch_size'],
                                  checkpoint_path=kwargs['checkpoint_path'])

    def _train_with_file(self, data_path, batch_size, checkpoint_path):
        """Continuously train from pickled game files appearing in data_path.

        Loops forever (no exit condition); intended to be stopped externally.
        Training starts only once the replay window is completely full.
        """
        # check if the path is valid
        if not os.path.exists(data_path):
            raise ValueError("{} doesn't exist".format(data_path))
        self.checkpoint_path = checkpoint_path
        if not os.path.exists(self.checkpoint_path):
            os.mkdir(self.checkpoint_path)

        new_file_list = []
        all_file_list = []
        # NOTE: this local dict is the per-iteration minibatch buffer; the
        # replay window lives in self.training_data.
        training_data = {'states': [], 'probs': [], 'winner': []}

        iters = 0
        while True:
            new_file_list = list(set(os.listdir(data_path)).difference(all_file_list))
            while new_file_list:
                all_file_list = os.listdir(data_path)
                # Process newly appeared files oldest-first by mtime.
                new_file_list.sort(
                    key=lambda file: os.path.getmtime(data_path + file) if not os.path.isdir(data_path + file) else 0)
                for file in new_file_list:
                    states, probs, winner = self._file_to_training_data(data_path + file)
                    assert states.shape[0] == probs.shape[0]
                    assert states.shape[0] == winner.shape[0]
                    self.training_data['states'].append(states)
                    self.training_data['probs'].append(probs)
                    self.training_data['winner'].append(winner)
                    self.training_data['length'].append(states.shape[0])
                new_file_list = list(set(os.listdir(data_path)).difference(all_file_list))
            if len(self.training_data['states']) != self.window_length:
                # Replay window not full yet -- keep collecting files.
                continue
            else:
                start_time = time.time()
                # Sample uniformly over games, then over states within a game.
                for i in range(batch_size):
                    game_num = random.randint(0, self.window_length - 1)
                    state_num = random.randint(0, self.training_data['length'][game_num] - 1)
                    training_data['states'].append(np.expand_dims(self.training_data['states'][game_num][state_num], 0))
                    training_data['probs'].append(np.expand_dims(self.training_data['probs'][game_num][state_num], 0))
                    training_data['winner'].append(np.expand_dims(self.training_data['winner'][game_num][state_num], 0))
                value_loss, policy_loss, reg, _ = self.sess.run(
                    [self.value_loss, self.policy_loss, self.reg, self.train_op],
                    feed_dict={self.x: np.concatenate(training_data['states'], axis=0),
                               self.z: np.concatenate(training_data['winner'], axis=0),
                               self.pi: np.concatenate(training_data['probs'], axis=0),
                               self.is_training: True})
                print("Iteration: {}, Time: {}, Value Loss: {}, Policy Loss: {}, Reg: {}".format(iters,
                                                                                                 time.time() - start_time,
                                                                                                 value_loss,
                                                                                                 policy_loss, reg))
                if iters % self.save_freq == 0:
                    save_path = "Iteration{}.ckpt".format(iters)
                    self.saver.save(self.sess, self.checkpoint_path + save_path)
                # Clear the minibatch buffer for the next iteration.
                for key in training_data.keys():
                    training_data[key] = []
                iters += 1

    def _file_to_training_data(self, file_name):
        """Load one pickled Data file and return (states, probs, winner)
        arrays; retries until the file is completely written by its producer."""
        read = False
        with open(file_name, 'rb') as file:
            while not read:
                try:
                    file.seek(0)
                    data = cPickle.load(file)
                    read = True
                    print("{} Loaded!".format(file_name))
                except Exception as e:
                    # The writer may still be flushing the file; retry shortly.
                    print(e)
                    time.sleep(1)

        history = deque(maxlen=self.history_length)
        states = []
        probs = []
        winner = []
        for _ in range(self.history_length):
            # Note that 0 is specified, need a more general way like config
            history.append([0] * self.board_size ** 2)
        # Still, +1 is specified
        color = +1
        for [board, prob] in zip(data.boards, data.probs):
            history.append(board)
            states.append(self._history2state(history, color))
            probs.append(np.array(prob).reshape(1, self.board_size ** 2 + 1))
            winner.append(np.array(data.winner).reshape(1, 1))
            # Alternate the player to move each half-move.
            color *= -1
        states = np.concatenate(states, axis=0)
        probs = np.concatenate(probs, axis=0)
        winner = np.concatenate(winner, axis=0)
        return states, probs, winner
if __name__ == "__main__":
    # Train a 9x9 model (82 actions = 81 points + pass) from self-play
    # records accumulating under ./data/.
    model = ResNet(board_size=9, action_num=82, history_length=8)
    model.train(mode="file", data_path="./data/", batch_size=128, checkpoint_path="./checkpoint/")

View File

@ -1,9 +1,22 @@
import argparse
import subprocess
import sys
import re
import Pyro4
import time
import os
import cPickle
class Data(object):
    """Accumulates one game's boards, move probabilities and winner."""

    def __init__(self):
        self.boards = []
        self.probs = []
        self.winner = 0

    def reset(self):
        """Drop all recorded data, returning to the freshly-built state."""
        self.boards = []
        self.probs = []
        self.winner = 0
if __name__ == '__main__':
"""
@ -11,84 +24,135 @@ if __name__ == '__main__':
Note that, this function requires the installation of the Pyro4 library.
"""
# TODO : we should set the network path in a more configurable way.
black_weight_path = "./checkpoints"
white_weight_path = "./checkpoints_origin"
if (not os.path.exists(black_weight_path)):
print "Can't not find the network weights for black player."
sys.exit()
if (not os.path.exists(white_weight_path)):
print "Can't not find the network weights for white player."
sys.exit()
parser = argparse.ArgumentParser()
parser.add_argument("--result_path", type=str, default="./data/")
parser.add_argument("--black_weight_path", type=str, default=None)
parser.add_argument("--white_weight_path", type=str, default=None)
parser.add_argument("--id", type=int, default=0)
args = parser.parse_args()
if not os.path.exists(args.result_path):
os.mkdir(args.result_path)
# black_weight_path = "./checkpoints"
# white_weight_path = "./checkpoints_origin"
if args.black_weight_path is not None and (not os.path.exists(args.black_weight_path)):
raise ValueError("Can't not find the network weights for black player.")
if args.white_weight_path is not None and (not os.path.exists(args.white_weight_path)):
raise ValueError("Can't not find the network weights for white player.")
# kill the old server
kill_old_server = subprocess.Popen(['killall', 'pyro4-ns'])
print "kill the old pyro4 name server, the return code is : " + str(kill_old_server.wait())
time.sleep(1)
# kill_old_server = subprocess.Popen(['killall', 'pyro4-ns'])
# print "kill the old pyro4 name server, the return code is : " + str(kill_old_server.wait())
# time.sleep(1)
# start a name server to find the remote object
start_new_server = subprocess.Popen(['pyro4-ns', '&'])
print "Start Name Sever : " + str(start_new_server.pid) # + str(start_new_server.wait())
time.sleep(1)
# start_new_server = subprocess.Popen(['pyro4-ns', '&'])
# print "Start Name Sever : " + str(start_new_server.pid) # + str(start_new_server.wait())
# time.sleep(1)
# start a name server if no name server exists
if len(os.popen('ps aux | grep pyro4-ns | grep -v grep').readlines()) == 0:
start_new_server = subprocess.Popen(['pyro4-ns', '&'])
print "Start Name Sever : " + str(start_new_server.pid) # + str(start_new_server.wait())
time.sleep(1)
# start two different player with different network weights.
agent_v0 = subprocess.Popen(['python', '-u', 'player.py', '--role=black'],
stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
agent_v1 = subprocess.Popen(['python', '-u', 'player.py', '--role=white', '--checkpoint_path=./checkpoints_origin/'],
stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
black_role_name = 'black' + str(args.id)
white_role_name = 'white' + str(args.id)
agent_v0 = subprocess.Popen(
['python', '-u', 'player.py', '--role=' + black_role_name, '--checkpoint_path=' + str(args.black_weight_path)],
stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
agent_v1 = subprocess.Popen(
['python', '-u', 'player.py', '--role=' + white_role_name, '--checkpoint_path=' + str(args.white_weight_path)],
stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
server_list = ""
while ("black" not in server_list) or ("white" not in server_list):
while (black_role_name not in server_list) or (white_role_name not in server_list):
server_list = subprocess.check_output(['pyro4-nsc', 'list'])
print "Waining for the server start..."
print "Waiting for the server start..."
time.sleep(1)
print server_list
print "Start black player at : " + str(agent_v0.pid)
print "Start white player at : " + str(agent_v1.pid)
data = Data()
player = [None] * 2
player[0] = Pyro4.Proxy("PYRONAME:black")
player[1] = Pyro4.Proxy("PYRONAME:white")
player[0] = Pyro4.Proxy("PYRONAME:" + black_role_name)
player[1] = Pyro4.Proxy("PYRONAME:" + white_role_name)
role = ["BLACK", "WHITE"]
color = ['b', 'w']
pattern = "[A-Z]{1}[0-9]{1}"
space = re.compile("\s+")
size = 9
show = ['.', 'X', 'O']
evaluate_rounds = 1
game_num = 0
while game_num < evaluate_rounds:
num = 0
pass_flag = [False, False]
print("Start game {}".format(game_num))
# end the game if both palyer chose to pass, or play too much turns
while not (pass_flag[0] and pass_flag[1]) and num < size ** 2 * 2:
turn = num % 2
move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn] + '\n')
print role[turn] + " : " + str(move),
num += 1
match = re.search(pattern, move)
if match is not None:
# print "match : " + str(match.group())
play_or_pass = match.group()
pass_flag[turn] = False
try:
while True:
start_time = time.time()
num = 0
pass_flag = [False, False]
print("Start game {}".format(game_num))
# end the game if both palyer chose to pass, or play too much turns
while not (pass_flag[0] and pass_flag[1]) and num < size ** 2 * 2:
turn = num % 2
board = player[turn].run_cmd(str(num) + ' show_board')
board = eval(board[board.index('['):board.index(']') + 1])
for i in range(size):
for j in range(size):
print show[board[i * size + j]] + " ",
print "\n",
data.boards.append(board)
move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn] + '\n')
print role[turn] + " : " + str(move),
num += 1
match = re.search(pattern, move)
if match is not None:
# print "match : " + str(match.group())
play_or_pass = match.group()
pass_flag[turn] = False
else:
# print "no match"
play_or_pass = ' PASS'
pass_flag[turn] = True
result = player[1 - turn].run_cmd(str(num) + ' play ' + color[turn] + ' ' + play_or_pass + '\n')
prob = player[turn].run_cmd(str(num) + ' get_prob')
prob = space.sub(',', prob[prob.index('['):prob.index(']') + 1])
prob = prob.replace('[,', '[')
prob = prob.replace('],', ']')
prob = eval(prob)
data.probs.append(prob)
score = player[turn].run_cmd(str(num) + ' get_score')
print "Finished : ", score.split(" ")[1]
# TODO: generalize the player
if eval(score.split(" ")[1]) > 0:
data.winner = 1
if eval(score.split(" ")[1]) < 0:
data.winner = -1
player[0].run_cmd(str(num) + ' clear_board')
player[1].run_cmd(str(num) + ' clear_board')
file_list = os.listdir(args.result_path)
if not file_list:
data_num = 0
else:
# print "no match"
play_or_pass = ' PASS'
pass_flag[turn] = True
result = player[1 - turn].run_cmd(str(num) + ' play ' + color[turn] + ' ' + play_or_pass + '\n')
board = player[turn].run_cmd(str(num) + ' show_board')
board = eval(board[board.index('['):board.index(']') + 1])
for i in range(size):
for j in range(size):
print show[board[i * size + j]] + " ",
print "\n",
file_list.sort(key=lambda file: os.path.getmtime(args.result_path + file) if not os.path.isdir(
args.result_path + file) else 0)
data_num = eval(file_list[-1][:-4]) + 1
with open("./data/" + str(data_num) + ".pkl", "wb") as file:
picklestring = cPickle.dump(data, file)
data.reset()
game_num += 1
score = player[turn].run_cmd(str(num) + ' get_score')
print "Finished : ", score.split(" ")[1]
player[0].run_cmd(str(num) + ' clear_board')
player[1].run_cmd(str(num) + ' clear_board')
game_num += 1
except Exception as e:
print(e)
subprocess.call(["kill", "-9", str(agent_v0.pid)])
subprocess.call(["kill", "-9", str(agent_v1.pid)])
print "Kill all player, finish all game."
subprocess.call(["kill", "-9", str(agent_v0.pid)])
subprocess.call(["kill", "-9", str(agent_v1.pid)])

View File

@ -20,12 +20,15 @@ class Player(object):
#return "inside the Player of player.py"
return self.engine.run_cmd(command)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--checkpoint_path", type=str, default="./checkpoints/")
parser.add_argument("--checkpoint_path", type=str, default=None)
parser.add_argument("--role", type=str, default="unknown")
args = parser.parse_args()
if args.checkpoint_path == 'None':
args.checkpoint_path = None
game = Game(checkpoint_path=args.checkpoint_path)
engine = GTPEngine(game_obj=game, name='tianshou', version=0)

264
AlphaGo/reversi.py Normal file
View File

@ -0,0 +1,264 @@
from __future__ import print_function
import numpy as np
'''
Settings of the Reversi game.
(1, 1) is considered as the upper left corner of the board,
(size, 1) is the lower left.
'''
def find_correct_moves(own, enemy):
    """Return legal moves for `own` as a bitboard (set bit = playable square)."""
    left_right_mask = 0x7e7e7e7e7e7e7e7e  # Both most left-right edge are 0, else 1
    top_bottom_mask = 0x00ffffffffffff00  # Both most top-bottom edge are 0, else 1
    mask = left_right_mask & top_bottom_mask  # interior squares, for diagonal scans
    mobility = 0
    # Scan all eight directions: >> directions via search_offset_left,
    # << directions via search_offset_right; OR the reachable blanks together.
    mobility |= search_offset_left(own, enemy, left_right_mask, 1)  # Left
    mobility |= search_offset_left(own, enemy, mask, 9)  # Left Top
    mobility |= search_offset_left(own, enemy, top_bottom_mask, 8)  # Top
    mobility |= search_offset_left(own, enemy, mask, 7)  # Top Right
    mobility |= search_offset_right(own, enemy, left_right_mask, 1)  # Right
    mobility |= search_offset_right(own, enemy, mask, 9)  # Bottom Right
    mobility |= search_offset_right(own, enemy, top_bottom_mask, 8)  # Bottom
    mobility |= search_offset_right(own, enemy, mask, 7)  # Left bottom
    return mobility
def calc_flip(pos, own, enemy):
    """return flip stones of enemy by bitboard when I place stone at pos.

    :param pos: 0~63
    :param own: bitboard (0=top left, 63=bottom right)
    :param enemy: bitboard
    :return: flip stones of enemy when I place stone at pos.
    """
    # Handle the four "increasing-bit" directions directly, then reuse the
    # same routine on the 180-degree-rotated boards for the other four
    # directions and rotate that result back.
    f1 = _calc_flip_half(pos, own, enemy)
    f2 = _calc_flip_half(63 - pos, rotate180(own), rotate180(enemy))
    return f1 | rotate180(f2)
def _calc_flip_half(pos, own, enemy):
    """Flip mask for the four directions whose bit index increases from pos.

    `masks` are rays (vertical, horizontal, and the two diagonals) anchored
    at square 0 and shifted up to `pos`; three of the enemy copies are
    masked with 0x7e... to stop horizontal wrap-around at the board edge.
    """
    el = [enemy, enemy & 0x7e7e7e7e7e7e7e7e, enemy & 0x7e7e7e7e7e7e7e7e, enemy & 0x7e7e7e7e7e7e7e7e]
    masks = [0x0101010101010100, 0x00000000000000fe, 0x0002040810204080, 0x8040201008040200]
    masks = [b64(m << pos) for m in masks]
    flipped = 0
    for e, mask in zip(el, masks):
        # (e | ~mask) + 1 ripples a carry through a consecutive run of enemy
        # stones along the ray; & own keeps it only when the run is capped
        # by one of our stones.
        outflank = mask & ((e | ~mask) + 1) & own
        # outflank - 1 (when non-zero) selects every bit below the cap;
        # (outflank != 0) is bool arithmetic, i.e. 0 or 1.
        flipped |= (outflank - (outflank != 0)) & mask
    return flipped
def search_offset_left(own, enemy, mask, offset):
    """Blank squares playable in one '>>' direction.

    Returns a bitboard of empty squares from which shifting by `offset`
    (toward lower bit indices) crosses a run of enemy stones capped by one
    of our stones.
    """
    e = enemy & mask
    blank = ~(own | enemy)
    # Grow the run of enemy stones adjacent to our stones; at most six
    # stones can be flanked, hence six accumulation steps in total.
    t = e & (own >> offset)
    for _ in range(5):
        t |= e & (t >> offset)
    # Only the blank squares can be started.
    return blank & (t >> offset)
def search_offset_right(own, enemy, mask, offset):
    """Blank squares playable in one '<<' direction.

    Mirror of search_offset_left: shifts toward higher bit indices.
    """
    e = enemy & mask
    blank = ~(own | enemy)
    # Up to six stones can be turned at once, hence six accumulation steps.
    t = e & (own << offset)
    for _ in range(5):
        t |= e & (t << offset)
    # Only the blank squares can be started.
    return blank & (t << offset)
def flip_vertical(x):
    """Mirror the 8x8 bitboard top-to-bottom (reverse the order of the
    eight byte-sized ranks)."""
    k1 = 0x00FF00FF00FF00FF
    k2 = 0x0000FFFF0000FFFF
    # Swap adjacent ranks, then pairs of ranks, then the two board halves.
    x = ((x >> 8) & k1) | ((x & k1) << 8)
    x = ((x >> 16) & k2) | ((x & k2) << 16)
    x = (x >> 32) | b64(x << 32)  # b64 keeps the result within 64 bits
    return x
def b64(x):
    """Reduce x modulo 2**64, emulating an unsigned 64-bit value."""
    return x % (1 << 64)
def bit_count(x):
    """Number of set bits in x (population count)."""
    return format(x, 'b').count('1')
def bit_to_array(x, size):
    """bit_to_array(0b0010, 4) -> array([0, 1, 0, 0])

    Index i of the result holds bit i of x (LSB first); only the low
    `size` bits are read.
    """
    return np.array([(x >> i) & 1 for i in range(size)], dtype=np.uint8)
def flip_diag_a1h8(x):
    """Mirror the 8x8 bitboard across the a1-h8 diagonal (transpose).

    Three delta-swap steps exchange progressively smaller triangular
    regions across the diagonal; b64 truncates the intermediate shifts
    to 64 bits.
    """
    k1 = 0x5500550055005500
    k2 = 0x3333000033330000
    k4 = 0x0f0f0f0f00000000
    t = k4 & (x ^ b64(x << 28))
    x ^= t ^ (t >> 28)
    t = k2 & (x ^ b64(x << 14))
    x ^= t ^ (t >> 14)
    t = k1 & (x ^ b64(x << 7))
    x ^= t ^ (t >> 7)
    return x
def rotate90(x):
    """Rotate the 8x8 bitboard by 90 degrees (vertical mirror, then
    diagonal mirror)."""
    mirrored = flip_vertical(x)
    return flip_diag_a1h8(mirrored)
def rotate180(x):
    """Rotate the 8x8 bitboard by 180 degrees (two 90-degree turns)."""
    once = rotate90(x)
    return rotate90(once)
class Reversi:
    """Bitboard Reversi engine.

    The 8x8 board is kept as two 64-bit masks (`black`, `white`) with
    bit 0 at the top-left and bit 63 at the bottom-right.  List boards use
    1 for black, -1 for white and 0 for blank; vertices are 1-based (x, y)
    and (0, 0) means "pass".
    """

    def __init__(self, black=None, white=None):
        # Defaults are the standard starting position.
        self.black = black or (0b00001000 << 24 | 0b00010000 << 32)
        self.white = white or (0b00010000 << 24 | 0b00001000 << 32)
        self.board = None  # 8 * 8 board with 1 for black, -1 for white and 0 for blank
        self.color = None  # 1 for black and -1 for white
        self.action = None  # number in 0~63, or None for a pass
        self.black_win = None  # final (black - white) stone margin once game ends

    def get_board(self, black=None, white=None):
        """Reset the bitboards (default: starting position) and return the
        equivalent list board."""
        self.black = black or (0b00001000 << 24 | 0b00010000 << 32)
        self.white = white or (0b00010000 << 24 | 0b00001000 << 32)
        self.board = self.bitboard2board()
        return self.board

    def simulate_is_valid(self, board, color):
        """Return the list of legal flat moves (0~63) for `color` on `board`."""
        self.board = board
        self.color = color
        self.board2bitboard()
        own, enemy = self.get_own_and_enemy()
        mobility = find_correct_moves(own, enemy)
        valid_moves = bit_to_array(mobility, 64)
        valid_moves = np.argwhere(valid_moves)
        return list(np.reshape(valid_moves, len(valid_moves)))

    def simulate_step_forward(self, state, vertex):
        """Apply `vertex` to state = [board, color].

        :return: ([new_board, -color], 0) when a stone was placed; implicitly
            None for a pass or a non-flipping move.
        """
        self.board = state[0]
        self.color = state[1]
        self.board2bitboard()
        self.vertex2action(vertex)
        if self.step():
            return [self.bitboard2board(), 0 - self.color], 0

    def executor_do_move(self, board, color, vertex):
        """Play `vertex` for `color`, writing the result back into `board`
        in place."""
        self.board = board
        self.color = color
        self.board2bitboard()
        self.vertex2action(vertex)
        if self.step():
            board[:] = self.bitboard2board()

    def executor_get_score(self, board):
        """Return the final (black - white) stone margin for `board`.

        :raises ValueError: if no score could be computed
        """
        self.board = board
        # BUGFIX: sync the bitboards from the board we were asked to score;
        # _game_over counts self.black/self.white, which could be stale.
        self.board2bitboard()
        self._game_over()
        if self.black_win is None:
            # BUGFIX: the ValueError was previously constructed but never
            # raised, so callers silently received None.
            raise ValueError("Game not finished!")
        return self.black_win

    def board2bitboard(self):
        """Rebuild the black/white bitmasks from the list board.

        :raises ValueError: if no board has been set
        """
        if self.board is None:
            # BUGFIX: previously created but never raised.
            raise ValueError("None board!")
        self.black = 0
        self.white = 0
        bit = 1
        for value in self.board:
            if value == 1:
                self.black |= bit
            elif value == -1:
                self.white |= bit
            bit <<= 1

    def vertex2action(self, vertex):
        """Convert a 1-based (x, y) vertex to a flat action; (0, 0) -> None
        (pass)."""
        x, y = vertex
        if x == 0 and y == 0:
            self.action = None
        else:
            self.action = 8 * (x - 1) + y - 1

    def bitboard2board(self):
        """Expand the bitmasks into a 64-entry list board."""
        black = bit_to_array(self.black, 64)
        white = bit_to_array(self.white, 64)
        board = []
        for b, w in zip(black, white):
            if b:
                board.append(1)
            elif w:
                board.append(-1)
            else:
                board.append(0)
        return board

    def step(self):
        """Apply self.action for self.color on the bitboards.

        :return: True if a stone was placed, False for a pass or a move
            that flips nothing (the latter forfeits the game)
        :raises ValueError: if the action is outside 0~63
        """
        # BUGFIX: check for a pass before the range comparison -- comparing
        # None with < raises TypeError on Python 3.
        if self.action is None:
            return False
        if self.action < 0 or self.action > 63:
            # BUGFIX: previously created but never raised.
            raise ValueError("Wrong action!")
        own, enemy = self.get_own_and_enemy()
        flipped = calc_flip(self.action, own, enemy)
        if bit_count(flipped) == 0:
            # A move that flips no stones is illegal and ends the game.
            self.illegal_move_to_lose(self.action)
            return False
        own ^= flipped
        own |= 1 << self.action
        enemy ^= flipped
        self.set_own_and_enemy(own, enemy)
        return True

    def _game_over(self):
        """Record the final (black - white) margin, once."""
        if self.black_win is None:
            black_num, white_num = self.number_of_black_and_white
            self.black_win = black_num - white_num

    def illegal_move_to_lose(self, action):
        """An illegal move ends the game immediately."""
        self._game_over()

    def get_own_and_enemy(self):
        """Return (own, enemy) bitboards for the side to move."""
        if self.color == 1:
            return self.black, self.white
        elif self.color == -1:
            return self.white, self.black
        return None, None

    def set_own_and_enemy(self, own, enemy):
        """Write back (own, enemy) according to the side to move."""
        if self.color == 1:
            self.black, self.white = own, enemy
        else:
            self.white, self.black = own, enemy

    @property
    def number_of_black_and_white(self):
        """(black stone count, white stone count)."""
        return bit_count(self.black), bit_count(self.white)

View File

@ -79,7 +79,7 @@ while True:
prob.append(np.array(game.prob).reshape(-1, game.size ** 2 + 1))
print("Finished")
print("\n")
score = game.executor.get_score(True)
score = game.game_engine.executor_get_score(game.board, True)
if score > 0:
winner = utils.BLACK
else:

View File

@ -1,227 +0,0 @@
import os, sys
sys.path.append(os.path.join(os.path.dirname(__file__), os.path.pardir))
import numpy as np
import utils
import time
import copy
import network_small
import tensorflow as tf
from collections import deque
from tianshou.core.mcts.mcts import MCTS
# Offsets of the four orthogonal neighbours of a vertex (down, up, left, right).
DELTA = [[1, 0], [-1, 0], [0, -1], [0, 1]]
# Offsets of the four diagonal (corner) neighbours of a vertex.
CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]]
class GoEnv:
def __init__(self, **kwargs):
    """Set up an empty board for the Go game passed as kwargs['game']."""
    self.game = kwargs['game']
    # Flat board, one entry per intersection.
    self.board = [utils.EMPTY] * (self.game.size ** 2)
    # Ring buffer of the eight most recent positions.
    self.latest_boards = deque(maxlen=8)
def _flatten(self, vertex):
x, y = vertex
return (x - 1) * self.game.size + (y - 1)
def _bfs(self, vertex, color, block, status, alive_break):
    """Recursive flood fill: collect into `block` every stone of `color`
    connected to `vertex`, marking visited indices in `status`.

    Mutates `block` and `status` in place; recursion depth is bounded by
    the block size.  `alive_break` is only forwarded, never read here --
    presumably consumed elsewhere; confirm before removing.
    """
    block.append(vertex)
    status[self._flatten(vertex)] = True
    nei = self._neighbor(vertex)
    for n in nei:
        if not status[self._flatten(n)]:
            if self.board[self._flatten(n)] == color:
                self._bfs(n, color, block, status, alive_break)
def _find_block(self, vertex, alive_break=False):
    """Collect the connected block containing `vertex`.

    :return: (capturable, block) where capturable is True when the block
        has no adjacent empty point (no liberty).
    """
    block = []
    visited = [False] * (self.game.size * self.game.size)
    stone_color = self.board[self._flatten(vertex)]
    self._bfs(vertex, stone_color, block, visited, alive_break)
    # The block survives iff any of its stones touches an empty point.
    has_liberty = any(
        self.board[self._flatten(n)] == utils.EMPTY
        for stone in block
        for n in self._neighbor(stone)
    )
    return not has_liberty, block
def _is_qi(self, color, vertex):
    """Check whether playing `color` at `vertex` leaves the stone with a
    liberty ("qi"), i.e. the move is not suicide.

    Temporarily places the stone on self.board and always restores the
    vertex to EMPTY before returning.
    """
    nei = self._neighbor(vertex)
    # An adjacent empty point is an immediate liberty.
    for n in nei:
        if self.board[self._flatten(n)] == utils.EMPTY:
            return True
    # Tentatively place the stone to evaluate captures and suicide.
    self.board[self._flatten(vertex)] = color
    for n in nei:
        # Capturing any adjacent opponent block creates liberties.
        if self.board[self._flatten(n)] == utils.another_color(color):
            can_kill, block = self._find_block(n)
            if can_kill:
                self.board[self._flatten(vertex)] = utils.EMPTY
                return True
    ### avoid suicide
    can_kill, block = self._find_block(vertex)
    if can_kill:
        # Our own block would have no liberties: suicide, not allowed.
        self.board[self._flatten(vertex)] = utils.EMPTY
        return False
    self.board[self._flatten(vertex)] = utils.EMPTY
    return True
def _check_global_isomorphous(self, color, vertex):
    """Superko check: would playing `color` at `vertex` recreate one of
    the recent positions in self.latest_boards?

    self.board is mutated to simulate the move and then fully restored.
    """
    saved = copy.copy(self.board)
    self.board[self._flatten(vertex)] = color
    self._process_board(color, vertex)
    repeated = self.board in self.latest_boards
    self.board = saved
    return repeated
def _in_board(self, vertex):
x, y = vertex
if x < 1 or x > self.game.size: return False
if y < 1 or y > self.game.size: return False
return True
def _neighbor(self, vertex):
    """4-connected neighbours of `vertex` that lie on the board."""
    x, y = vertex
    return [(x + dx, y + dy)
            for dx, dy in DELTA
            if self._in_board((x + dx, y + dy))]
def _corner(self, vertex):
    """Diagonal (corner) neighbours of `vertex` that lie on the board."""
    x, y = vertex
    return [(x + dx, y + dy)
            for dx, dy in CORNER_OFFSET
            if self._in_board((x + dx, y + dy))]
def _process_board(self, color, vertex):
    """Remove opponent blocks left without liberties after playing at
    `vertex`; mutates self.board in place."""
    for n in self._neighbor(vertex):
        if self.board[self._flatten(n)] == utils.another_color(color):
            captured, block = self._find_block(n, alive_break=True)
            if captured:
                for stone in block:
                    self.board[self._flatten(stone)] = utils.EMPTY
def _find_group(self, start):
color = self.board[self._flatten(start)]
# print ("color : ", color)
chain = set()
frontier = [start]
while frontier:
current = frontier.pop()
# print ("current : ", current)
chain.add(current)
for n in self._neighbor(current):
# print n, self._flatten(n), self.board[self._flatten(n)],
if self.board[self._flatten(n)] == color and not n in chain:
frontier.append(n)
return chain
    def _is_eye(self, color, vertex):
        """Heuristically decide whether *vertex* is an eye for *color*.

        An eye requires all orthogonal neighbors to be our color; diagonal
        (corner) points are then used to tell a real eye from a fake one.
        """
        nei = self._neighbor(vertex)
        cor = self._corner(vertex)
        ncolor = {color == self.board[self._flatten(n)] for n in nei}
        if False in ncolor:
            # not all neighbors are in same color with us
            return False
        if set(nei) < self._find_group(nei[0]):
            # all neighbors are in same group and same color with us
            # (proper subset: the group also contains at least one more stone)
            return True
        else:
            # neighbors belong to different groups: count opponent stones on
            # the diagonals to judge whether the eye can be destroyed
            opponent_number = [self.board[self._flatten(c)] for c in cor].count(-color)
            opponent_propotion = float(opponent_number) / float(len(cor))
            if opponent_propotion < 0.5:
                # few opponents, real eye
                return True
            else:
                # many opponents, fake eye
                return False
def knowledge_prunning(self, color, vertex):
### check if it is an eye of yourself
### assumptions : notice that this judgement requires that the state is an endgame
if self._is_eye(color, vertex):
return False
return True
def simulate_is_valid(self, state, action):
# state is the play board, the shape is [1, 9, 9, 17]
if action == self.game.size * self.game.size:
vertex = (0, 0)
else:
vertex = (action / self.game.size + 1, action % self.game.size + 1)
if state[0, 0, 0, -1] == utils.BLACK:
color = utils.BLACK
else:
color = utils.WHITE
self.latest_boards.clear()
for i in range(8):
self.latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist())
self.board = copy.copy(self.latest_boards[-1])
### in board
if not self._in_board(vertex):
return False
### already have stone
if not self.board[self._flatten(vertex)] == utils.EMPTY:
# print(np.array(self.board).reshape(9, 9))
# print(vertex)
return False
### check if it is qi
if not self._is_qi(color, vertex):
return False
### forbid global isomorphous
if self._check_global_isomorphous(color, vertex):
return False
if not self.knowledge_prunning(color, vertex):
return False
return True
def do_move(self, color, vertex):
if vertex == utils.PASS:
return True
id_ = self._flatten(vertex)
if self.board[id_] == utils.EMPTY:
self.board[id_] = color
return True
else:
return False
def step_forward(self, state, action):
if state[0, 0, 0, -1] == 1:
color = utils.BLACK
else:
color = utils.WHITE
if action == self.game.size ** 2:
vertex = utils.PASS
else:
vertex = (action % self.game.size + 1, action / self.game.size + 1)
# print(vertex)
# print(self.board)
self.board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist()
self.do_move(color, vertex)
new_state = np.concatenate(
[state[:, :, :, 1:8], (np.array(self.board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1),
state[:, :, :, 9:16], (np.array(self.board) == utils.WHITE).reshape(1, self.game.size, self.game.size, 1),
np.array(1 - state[:, :, :, -1]).reshape(1, self.game.size, self.game.size, 1)],
axis=3)
return new_state, 0

View File

@ -1,266 +0,0 @@
import numpy as np
import sys
from game import Game
from engine import GTPEngine
import utils
import time
import copy
import network_small
import tensorflow as tf
from collections import deque
from tianshou.core.mcts.mcts import MCTS
DELTA = [[1, 0], [-1, 0], [0, -1], [0, 1]]
CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]]
class GoEnv:
    """Go board environment: move legality checks and state transitions.

    The board is a flat row-major list of length size * size holding the
    utils color constants; vertices are 1-based (x, y) tuples.
    """
    def __init__(self, size=9, komi=6.5):
        self.size = size
        # komi is stored but not used by any method visible in this class
        self.komi = komi
        self.board = [utils.EMPTY] * (self.size * self.size)
        # the most recent board positions, used by the superko check
        self.history = deque(maxlen=8)
    def _set_board(self, board):
        # replace the whole board with a flat list (used by the tests below)
        self.board = board
    def _flatten(self, vertex):
        # 1-based (x, y) -> flat row-major index
        x, y = vertex
        return (x - 1) * self.size + (y - 1)
    def _bfs(self, vertex, color, block, status, alive_break):
        # recursive flood fill: collect into *block* every stone of *color*
        # connected to *vertex*; *status* marks visited flat indices.
        # alive_break is accepted but unused in this body.
        block.append(vertex)
        status[self._flatten(vertex)] = True
        nei = self._neighbor(vertex)
        for n in nei:
            if not status[self._flatten(n)]:
                if self.board[self._flatten(n)] == color:
                    self._bfs(n, color, block, status, alive_break)
    def _find_block(self, vertex, alive_break=False):
        # returns (can_kill, block): block is the chain containing *vertex*,
        # can_kill is True when the chain has no empty neighbor (no liberty)
        block = []
        status = [False] * (self.size * self.size)
        color = self.board[self._flatten(vertex)]
        self._bfs(vertex, color, block, status, alive_break)
        for b in block:
            for n in self._neighbor(b):
                if self.board[self._flatten(n)] == utils.EMPTY:
                    return False, block
        return True, block
    def _is_qi(self, color, vertex):
        # does placing *color* at *vertex* leave it with a liberty (qi)?
        # the stone is placed temporarily and always removed before returning
        nei = self._neighbor(vertex)
        for n in nei:
            if self.board[self._flatten(n)] == utils.EMPTY:
                # an empty neighbor is an immediate liberty
                return True
        self.board[self._flatten(vertex)] = color
        for n in nei:
            if self.board[self._flatten(n)] == utils.another_color(color):
                can_kill, block = self._find_block(n)
                if can_kill:
                    # capturing an adjacent opponent chain frees a liberty
                    self.board[self._flatten(vertex)] = utils.EMPTY
                    return True
        ### avoid suicide
        can_kill, block = self._find_block(vertex)
        if can_kill:
            self.board[self._flatten(vertex)] = utils.EMPTY
            return False
        self.board[self._flatten(vertex)] = utils.EMPTY
        return True
    def _check_global_isomorphous(self, color, vertex):
        # superko: would the resulting position repeat one in self.history?
        ##backup
        _board = copy.copy(self.board)
        self.board[self._flatten(vertex)] = color
        self._process_board(color, vertex)
        if self.board in self.history:
            res = True
        else:
            res = False
        self.board = _board
        return res
    def _in_board(self, vertex):
        # True iff the 1-based (x, y) vertex lies on the board
        x, y = vertex
        if x < 1 or x > self.size: return False
        if y < 1 or y > self.size: return False
        return True
    def _neighbor(self, vertex):
        # on-board orthogonal neighbors, in DELTA order
        x, y = vertex
        nei = []
        for d in DELTA:
            _x = x + d[0]
            _y = y + d[1]
            if self._in_board((_x, _y)):
                nei.append((_x, _y))
        return nei
    def _corner(self, vertex):
        # on-board diagonal neighbors, in CORNER_OFFSET order
        x, y = vertex
        corner = []
        for d in CORNER_OFFSET:
            _x = x + d[0]
            _y = y + d[1]
            if self._in_board((_x, _y)):
                corner.append((_x, _y))
        return corner
    def _process_board(self, color, vertex):
        # capture resolution: remove opponent chains left without liberties
        nei = self._neighbor(vertex)
        for n in nei:
            if self.board[self._flatten(n)] == utils.another_color(color):
                can_kill, block = self._find_block(n, alive_break=True)
                if can_kill:
                    for b in block:
                        self.board[self._flatten(b)] = utils.EMPTY
    def _find_group(self, start):
        # set of vertices connected to *start* that share its color (DFS)
        color = self.board[self._flatten(start)]
        #print ("color : ", color)
        chain = set()
        frontier = [start]
        while frontier:
            current = frontier.pop()
            #print ("current : ", current)
            chain.add(current)
            for n in self._neighbor(current):
                #print n, self._flatten(n), self.board[self._flatten(n)],
                if self.board[self._flatten(n)] == color and not n in chain:
                    frontier.append(n)
        return chain
    def _is_eye(self, color, vertex):
        # heuristic eye detection: all orthogonal neighbors must be ours;
        # diagonals then distinguish a real eye from a fake one
        nei = self._neighbor(vertex)
        cor = self._corner(vertex)
        ncolor = {color == self.board[self._flatten(n)] for n in nei}
        if False in ncolor:
            #print "not all neighbors are in same color with us"
            return False
        if set(nei) < self._find_group(nei[0]):
            #print "all neighbors are in same group and same color with us"
            return True
        else:
            opponent_number = [self.board[self._flatten(c)] for c in cor].count(-color)
            opponent_propotion = float(opponent_number) / float(len(cor))
            if opponent_propotion < 0.5:
                #print "few opponents, real eye"
                return True
            else:
                #print "many opponents, fake eye"
                return False
    # def is_valid(self, color, vertex):
    def is_valid(self, state, action):
        # state is the play board, the shape is [1, 9, 9, 17]
        # returns True when *action* is a legal move in *state*
        if action == self.size * self.size:
            vertex = (0, 0)
        else:
            # NOTE(review): under Python 3, `/` is true division and yields a
            # float coordinate here -- presumably `//` was intended; confirm
            vertex = (action / self.size + 1, action % self.size + 1)
        if state[0, 0, 0, -1] == utils.BLACK:
            color = utils.BLACK
        else:
            color = utils.WHITE
        # rebuild the recent-board history from the feature planes
        self.history.clear()
        for i in range(8):
            self.history.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist())
        self.board = copy.copy(self.history[-1])
        ### in board
        if not self._in_board(vertex):
            return False
        ### already have stone
        if not self.board[self._flatten(vertex)] == utils.EMPTY:
            # print(np.array(self.board).reshape(9, 9))
            # print(vertex)
            return False
        ### check if it is qi
        if not self._is_qi(color, vertex):
            return False
        ### check if it is an eye of yourself
        ### assumptions : notice that this judgement requires that the state is an endgame
        #if self._is_eye(color, vertex):
        #    return False
        if self._check_global_isomorphous(color, vertex):
            return False
        return True
    def do_move(self, color, vertex):
        # place a stone; PASS always succeeds, otherwise the point must be
        # empty.  On success the new position is appended to self.history.
        if vertex == utils.PASS:
            return True
        id_ = self._flatten(vertex)
        if self.board[id_] == utils.EMPTY:
            self.board[id_] = color
            self.history.append(copy.copy(self.board))
            return True
        else:
            return False
    def step_forward(self, state, action):
        # apply *action* to the 17-plane *state*; returns (new_state, 0)
        if state[0, 0, 0, -1] == 1:
            color = 1
        else:
            color = -1
        if action == 81:
            vertex = (0, 0)
        else:
            # NOTE(review): coordinate order is the reverse of is_valid, the
            # board size 9/81 is hard-coded, and `/` is true division under
            # Python 3 -- all three look worth confirming
            vertex = (action % 9 + 1, action / 9 + 1)
        # print(vertex)
        # print(self.board)
        self.board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist()
        self.do_move(color, vertex)
        new_state = np.concatenate(
            [state[:, :, :, 1:8], (np.array(self.board) == 1).reshape(1, 9, 9, 1),
             state[:, :, :, 9:16], (np.array(self.board) == -1).reshape(1, 9, 9, 1),
             np.array(1 - state[:, :, :, -1]).reshape(1, 9, 9, 1)],
            axis=3)
        return new_state, 0
# Fixture boards for the eye-detection heuristic (flat 9x9, 1 = black stone).
pure_test = [
    0, 1, 0, 1, 0, 1, 0, 0, 0,
    1, 0, 1, 0, 1, 0, 0, 0, 0,
    0, 1, 0, 1, 0, 0, 1, 0, 0,
    0, 0, 1, 0, 0, 1, 0, 1, 0,
    0, 0, 0, 0, 0, 1, 1, 1, 0,
    1, 1, 1, 0, 0, 0, 0, 0, 0,
    1, 0, 1, 0, 0, 1, 1, 0, 0,
    1, 1, 1, 0, 1, 0, 1, 0, 0,
    0, 0, 0, 0, 1, 1, 1, 0, 0
]
pt_qry = [(1, 1), (1, 5), (3, 3), (4, 7), (7, 2), (8, 6)]
pt_ans = [True, True, True, True, True, True]
# Same layout with opponent stones (-1) threatening some of the eyes.
opponent_test = [
    0, 1, 0, 1, 0, 1, 0,-1, 1,
    1,-1, 0,-1, 1,-1, 0, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 1,
    1, 1,-1, 0, 1,-1, 1, 0, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 0,
    -1, 1, 1, 0, 1, 1, 1, 0, 0,
    0, 1,-1, 0,-1,-1,-1, 0, 0,
    1, 0, 1, 0,-1, 0,-1, 0, 0,
    0, 1, 0, 0,-1,-1,-1, 0, 0
]
ot_qry = [(1, 1), (1, 5), (2, 9), (5, 2), (5, 6), (8, 2), (8, 6)]
ot_ans = [False, False, False, False, False, True, False]
ge = GoEnv()
ge._set_board(pure_test)
for query in pt_qry:
    print(ge._is_eye(utils.BLACK, query))
ge._set_board(opponent_test)
for query in ot_qry:
    print(ge._is_eye(utils.BLACK, query))

View File

@ -46,6 +46,8 @@ Tianshou(天授) is a reinforcement learning platform. The following image illus
Please follow [google python coding style](https://google.github.io/styleguide/pyguide.html)
There's a more detailed Chinese version [google python coding style in Chinese](http://www.runoob.com/w3cnote/google-python-styleguide.html)
All files/folders should be named with lower case letters and underline (except specified names such as `AlphaGo`).
Try to use full names. Don't use abbreviations for class/function/variable names except common abbreviations (such as `num` for number, `dim` for dimension, `env` for environment, `op` for operation). For now we use `pi` to refer to the policy in examples/ppo_example.py.
@ -73,4 +75,4 @@ HaoshengZou: collaborate mainly on Policy and losses; interfaces and architectur
Note: install openai/gym first to run the Atari environment; note that interfaces between modules may not be finalized; the management of placeholders and `feed_dict` may have to be done manually for the time being;
Without preprocessing and other tricks, this example will not train to any meaningful results. Codes should pass two tests: individual module test and run through this example code.
Without preprocessing and other tricks, this example will not train to any meaningful results. Codes should pass two tests: individual module test and run through this example code.

View File

@ -19,10 +19,10 @@ class rollout_policy(evaluator):
# TODO: prior for rollout policy
total_reward = 0.
action = np.random.randint(0, self.action_num)
state, reward = self.env.step_forward(state, action)
state, reward = self.env.simulate_step_forward(state, action)
total_reward += reward
while state is not None:
action = np.random.randint(0, self.action_num)
state, reward = self.env.step_forward(state, action)
state, reward = self.env.simulate_step_forward(state, action)
total_reward += reward
return np.ones([self.action_num])/self.action_num, total_reward

View File

@ -59,15 +59,10 @@ class UCTNode(MCTSNode):
self.parent.backpropagation(self.children[action].reward)
def valid_mask(self, simulator):
# let all invalid actions be illeagel in mcts
if self.mask is None:
start_time = time.time()
self.mask = []
for act in range(self.action_num - 1):
if not simulator.simulate_is_valid(self.state, act):
self.mask.append(act)
self.ucb[act] = -float("Inf")
else:
self.ucb[self.mask] = -float("Inf")
self.mask = simulator.simulate_is_valid_list(self.state, range(self.action_num))
self.ucb[self.mask] = -float("Inf")
class TSNode(MCTSNode):
@ -104,7 +99,7 @@ class ActionNode(object):
self.next_state = tuple2list(self.next_state)
def selection(self, simulator):
self.next_state, self.reward = simulator.step_forward(self.parent.state, self.action)
self.next_state, self.reward = simulator.simulate_step_forward(self.parent.state, self.action)
self.origin_state = self.next_state
self.state_type = type(self.next_state)
self.type_conversion_to_tuple()
@ -131,8 +126,7 @@ class ActionNode(object):
class MCTS(object):
def __init__(self, simulator, evaluator, root, action_num, method="UCT", inverse=False, max_step=None,
max_time=None):
def __init__(self, simulator, evaluator, root, action_num, method="UCT", inverse=False):
self.simulator = simulator
self.evaluator = evaluator
prior, _ = self.evaluator(root)
@ -140,33 +134,26 @@ class MCTS(object):
if method == "":
self.root = root
if method == "UCT":
self.root = UCTNode(None, None, root, action_num, prior, inverse)
self.root = UCTNode(None, None, root, action_num, prior, inverse=inverse)
if method == "TS":
self.root = TSNode(None, None, root, action_num, prior, inverse=inverse)
self.inverse = inverse
if max_step is not None:
self.step = 0
self.max_step = max_step
# TODO: Optimize the stop criteria
# else:
# self.max_step = 0
if max_time is not None:
self.start_time = time.time()
self.max_time = max_time
def search(self, max_step=None, max_time=None):
step = 0
start_time = time.time()
if max_step is None:
max_step = int("Inf")
if max_time is None:
max_time = float("Inf")
if max_step is None and max_time is None:
raise ValueError("Need a stop criteria!")
# TODO: running mcts should be implemented in another function, e.g. def search(self, max_step, max_time)
self.select_time = []
self.evaluate_time = []
self.bp_time = []
while (max_step is not None and self.step < self.max_step or max_step is None) \
and (max_time is not None and time.time() - self.start_time < self.max_time or max_time is None):
self.expand()
if max_step is not None:
self.step += 1
while step < max_step and time.time() - start_time < max_step:
self._expand()
step += 1
def expand(self):
def _expand(self):
node, new_action = self.root.selection(self.simulator)
value = node.children[new_action].expansion(self.evaluator, self.action_num)
node.children[new_action].backpropagation(value + 0.)

View File

@ -15,7 +15,7 @@ __all__ = [
'QValuePolicy',
]
# TODO: separate actor and critic, we should focus on it once we finish the basic module.
# TODO: a even more "base" class for policy
class QValuePolicy(object):

View File

@ -1,5 +1,16 @@
from tianshou.core.policy.base import QValuePolicy
import tensorflow as tf
import sys
sys.path.append('..')
import value_function.action_value as value_func
class DQN_refactor(object):
    """Policy wrapper that holds a value_function.DQN network as a member
    instead of inheriting from it (composition over inheritance)."""
    def __init__(self, value_tensor, observation_placeholder, action_placeholder):
        network = value_func.DQN(value_tensor, observation_placeholder, action_placeholder)
        self._network = network
class DQN(QValuePolicy):

View File

View File

@ -0,0 +1,53 @@
from base import ValueFunctionBase
import tensorflow as tf
class ActionValue(ValueFunctionBase):
    """Action-value function Q(s, a), evaluated through the default TF session."""
    def __init__(self, value_tensor, observation_placeholder, action_placeholder):
        self._action_placeholder = action_placeholder
        super(ActionValue, self).__init__(
            value_tensor=value_tensor,
            observation_placeholder=observation_placeholder
        )

    def get_value(self, observation, action):
        """Evaluate Q(s, a) for a batch.

        :param observation: numpy array of observations, of shape (batchsize, observation_dim).
        :param action: numpy array of actions, of shape (batchsize, action_dim)
        :return: numpy array of values, of shape (batchsize, )
        """
        sess = tf.get_default_session()
        feed = {
            self._observation_placeholder: observation,
            self._action_placeholder: action,
        }
        values = sess.run(self.get_value_tensor(), feed_dict=feed)
        return values[:, 0]
class DQN(ActionValue):
    """
    class of the very DQN architecture. Instead of feeding s and a to the network to get a value, DQN feeds s to the
    network and the last layer is Q(s, *) for all actions; Q(s, a) is read out per batch row.
    """
    def __init__(self, value_tensor, observation_placeholder, action_placeholder):
        """
        :param value_tensor: of shape (batchsize, num_actions)
        :param observation_placeholder: of shape (batchsize, observation_dim)
        :param action_placeholder: of shape (batchsize, )
        """
        self._value_tensor_all_actions = value_tensor
        # Pick Q(s, a) for each row: pair every batch index with its action and
        # gather.  (The previous value_tensor[action_placeholder] indexed rows,
        # not actions, and was marked wrong.)  expand_dims keeps the trailing
        # dim of 1 that ActionValue.get_value strips with [:, 0].
        batch_size = tf.shape(value_tensor)[0]
        indices = tf.stack([tf.range(batch_size), tf.cast(action_placeholder, tf.int32)], axis=1)
        canonical_value_tensor = tf.expand_dims(tf.gather_nd(value_tensor, indices), axis=1)
        super(DQN, self).__init__(value_tensor=canonical_value_tensor,
                                  observation_placeholder=observation_placeholder,
                                  action_placeholder=action_placeholder)

    def get_value_all_actions(self, observation):
        """Return Q(s, *) for all actions, shape (batchsize, num_actions)."""
        sess = tf.get_default_session()
        return sess.run(self._value_tensor_all_actions, feed_dict={self._observation_placeholder: observation})

    def get_value_tensor_all_actions(self):
        """Return the tensor holding Q(s, *) for all actions."""
        return self._value_tensor_all_actions

View File

@ -0,0 +1,23 @@
# TODO: linear feature baseline also in tf?
class ValueFunctionBase(object):
    """Base class of value functions.

    Children include state values V(s) and action values Q(s, a); they
    implement get_value() on top of the stored value tensor.
    """
    def __init__(self, value_tensor, observation_placeholder):
        self._observation_placeholder = observation_placeholder
        self._value_tensor = value_tensor

    def get_value(self, **kwargs):
        """Return the batch of corresponding values as a numpy array.

        Must be implemented by subclasses.
        """
        raise NotImplementedError()

    def get_value_tensor(self):
        """Return the tensor of the corresponding values."""
        return self._value_tensor

View File

@ -0,0 +1,23 @@
from base import ValueFunctionBase
import tensorflow as tf
class StateValue(ValueFunctionBase):
    """State-value function V(s), evaluated through the default TF session."""
    def __init__(self, value_tensor, observation_placeholder):
        super(StateValue, self).__init__(
            value_tensor=value_tensor,
            observation_placeholder=observation_placeholder
        )

    def get_value(self, observation):
        """Evaluate V(s) for a batch.

        :param observation: numpy array of observations, of shape (batchsize, observation_dim).
        :return: numpy array of state values, of shape (batchsize, )
        """
        sess = tf.get_default_session()
        feed = {self._observation_placeholder: observation}
        return sess.run(self.get_value_tensor(), feed_dict=feed)[:, 0]