diff --git a/.gitignore b/.gitignore index 36d134c..d697b92 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ checkpoints checkpoints_origin *.json .DS_Store +data diff --git a/AlphaGo/Network.py b/AlphaGo/Network.py deleted file mode 100644 index caf7710..0000000 --- a/AlphaGo/Network.py +++ /dev/null @@ -1,211 +0,0 @@ -import os -import time -import sys - -import numpy as np -import time -import tensorflow as tf -import tensorflow.contrib.layers as layers - -import multi_gpu -import time - -# os.environ["CUDA_VISIBLE_DEVICES"] = "1" -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' - - -def residual_block(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.conv2d(h, 256, kernel_size=3, stride=1, activation_fn=tf.identity, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = h + input - return tf.nn.relu(h) - - -def policy_heads(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.flatten(h) - h = layers.fully_connected(h, 362, activation_fn=tf.identity, weights_regularizer=layers.l2_regularizer(1e-4)) - return h - - -def value_heads(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - 
weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.flatten(h) - h = layers.fully_connected(h, 256, activation_fn=tf.nn.relu, weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.fully_connected(h, 1, activation_fn=tf.nn.tanh, weights_regularizer=layers.l2_regularizer(1e-4)) - return h - - -class Network(object): - def __init__(self): - self.x = tf.placeholder(tf.float32, shape=[None, 19, 19, 17]) - self.is_training = tf.placeholder(tf.bool, shape=[]) - self.z = tf.placeholder(tf.float32, shape=[None, 1]) - self.pi = tf.placeholder(tf.float32, shape=[None, 362]) - self.build_network() - - def build_network(self): - h = layers.conv2d(self.x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, normalizer_fn=layers.batch_norm, - normalizer_params={'is_training': self.is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS}, - weights_regularizer=layers.l2_regularizer(1e-4)) - for i in range(19): - h = residual_block(h, self.is_training) - self.v = value_heads(h, self.is_training) - self.p = policy_heads(h, self.is_training) - # loss = tf.reduce_mean(tf.square(z-v)) - tf.multiply(pi, tf.log(tf.clip_by_value(tf.nn.softmax(p), 1e-8, tf.reduce_max(tf.nn.softmax(p))))) - self.value_loss = tf.reduce_mean(tf.square(self.z - self.v)) - self.policy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.pi, logits=self.p)) - - self.reg = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) - self.total_loss = self.value_loss + self.policy_loss + self.reg - # train_op = tf.train.MomentumOptimizer(1e-4, momentum=0.9, use_nesterov=True).minimize(total_loss) - self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) - with tf.control_dependencies(self.update_ops): - self.train_op = tf.train.RMSPropOptimizer(1e-4).minimize(self.total_loss) - self.var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) - self.saver = tf.train.Saver(max_to_keep=10, var_list=self.var_list) - - def train(self): - data_path = 
"/home/tongzheng/data/" - data_name = os.listdir("/home/tongzheng/data/") - epochs = 100 - batch_size = 128 - - result_path = "./checkpoints/" - with multi_gpu.create_session() as sess: - sess.run(tf.global_variables_initializer()) - ckpt_file = tf.train.latest_checkpoint(result_path) - if ckpt_file is not None: - print('Restoring model from {}...'.format(ckpt_file)) - self.saver.restore(sess, ckpt_file) - for epoch in range(epochs): - for name in data_name: - data = np.load(data_path + name) - boards = data["boards"] - wins = data["wins"] - ps = data["ps"] - print (boards.shape) - print (wins.shape) - print (ps.shape) - batch_num = boards.shape[0] // batch_size - index = np.arange(boards.shape[0]) - np.random.shuffle(index) - value_losses = [] - policy_losses = [] - regs = [] - time_train = -time.time() - for iter in range(batch_num): - lv, lp, r, value, prob, _ = sess.run( - [self.value_loss, self.policy_loss, self.reg, self.v, tf.nn.softmax(p), self.train_op], - feed_dict={self.x: boards[ - index[iter * batch_size:(iter + 1) * batch_size]], - self.z: wins[index[ - iter * batch_size:(iter + 1) * batch_size]], - self.pi: ps[index[ - iter * batch_size:(iter + 1) * batch_size]], - self.is_training: True}) - value_losses.append(lv) - policy_losses.append(lp) - regs.append(r) - if iter % 1 == 0: - print( - "Epoch: {}, Part {}, Iteration: {}, Time: {}, Value Loss: {}, Policy Loss: {}, Reg: {}".format( - epoch, name, iter, time.time() + time_train, np.mean(np.array(value_losses)), - np.mean(np.array(policy_losses)), np.mean(np.array(regs)))) - time_train = -time.time() - value_losses = [] - policy_losses = [] - regs = [] - if iter % 20 == 0: - save_path = "Epoch{}.Part{}.Iteration{}.ckpt".format(epoch, name, iter) - self.saver.save(sess, result_path + save_path) - del data, boards, wins, ps - - - # def forward(call_number): - # # checkpoint_path = "/home/yama/rl/tianshou/AlphaGo/checkpoints" - # checkpoint_path = "/home/jialian/stuGo/tianshou/stuGo/checkpoints/" - # 
board_file = np.genfromtxt("/home/jialian/stuGo/tianshou/leela-zero/src/mcts_nn_files/board_" + call_number, - # dtype='str'); - # human_board = np.zeros((17, 19, 19)) - # - # # TODO : is it ok to ignore the last channel? - # for i in range(17): - # human_board[i] = np.array(list(board_file[i])).reshape(19, 19) - # # print("============================") - # # print("human board sum : " + str(np.sum(human_board[-1]))) - # # print("============================") - # # print(human_board) - # # print("============================") - # # rint(human_board) - # feed_board = human_board.transpose(1, 2, 0).reshape(1, 19, 19, 17) - # # print(feed_board[:,:,:,-1]) - # # print(feed_board.shape) - # - # # npz_board = np.load("/home/yama/rl/tianshou/AlphaGo/data/7f83928932f64a79bc1efdea268698ae.npz") - # # print(npz_board["boards"].shape) - # # feed_board = npz_board["boards"][10].reshape(-1, 19, 19, 17) - # ##print(feed_board) - # # show_board = feed_board[0].transpose(2, 0, 1) - # # print("board shape : ", show_board.shape) - # # print(show_board) - # - # itflag = False - # with multi_gpu.create_session() as sess: - # sess.run(tf.global_variables_initializer()) - # ckpt_file = tf.train.latest_checkpoint(checkpoint_path) - # if ckpt_file is not None: - # # print('Restoring model from {}...'.format(ckpt_file)) - # saver.restore(sess, ckpt_file) - # else: - # raise ValueError("No model loaded") - # res = sess.run([tf.nn.softmax(p), v], feed_dict={x: feed_board, is_training: itflag}) - # # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][300].reshape(-1, 19, 19, 17), is_training:False}) - # # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][50].reshape(-1, 19, 19, 17), is_training:True}) - # # print(np.argmax(res[0])) - # np.savetxt(sys.stdout, res[0][0], fmt="%.6f", newline=" ") - # np.savetxt(sys.stdout, res[1][0], fmt="%.6f", newline=" ") - # pv_file = "/home/jialian/stuGotianshou/leela-zero/src/mcts_nn_files/policy_value" - # 
np.savetxt(pv_file, np.concatenate((res[0][0], res[1][0])), fmt="%.6f", newline=" ") - # # np.savetxt(pv_file, res[1][0], fmt="%.6f", newline=" ") - # return res - - def forward(self): - checkpoint_path = "/home/tongzheng/tianshou/AlphaGo/checkpoints/" - sess = multi_gpu.create_session() - sess.run(tf.global_variables_initializer()) - ckpt_file = tf.train.latest_checkpoint(checkpoint_path) - if ckpt_file is not None: - print('Restoring model from {}...'.format(ckpt_file)) - self.saver.restore(sess, ckpt_file) - print('Successfully loaded') - else: - raise ValueError("No model loaded") - # prior, value = sess.run([tf.nn.softmax(p), v], feed_dict={x: state, is_training: False}) - # return prior, value - return sess - - -if __name__ == '__main__': - state = np.random.randint(0, 1, [1, 19, 19, 17]) - net = Network() - sess = net.forward() - start = time.time() - for i in range(100): - sess.run([tf.nn.softmax(net.p), net.v], feed_dict={net.x: state, net.is_training: False}) - print("Step {}, Cumulative time {}".format(i, time.time() - start)) diff --git a/AlphaGo/Network_ori.py b/AlphaGo/Network_ori.py deleted file mode 100644 index 9d33bb9..0000000 --- a/AlphaGo/Network_ori.py +++ /dev/null @@ -1,175 +0,0 @@ -import os -import time -import gc - -import numpy as np -import tensorflow as tf -import tensorflow.contrib.layers as layers - -import multi_gpu - -os.environ["CUDA_VISIBLE_DEVICES"] = "1" - - -def residual_block(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.conv2d(h, 256, kernel_size=3, stride=1, activation_fn=tf.identity, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = h + input - return 
tf.nn.relu(h) - - -def policy_heads(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.flatten(h) - h = layers.fully_connected(h, 362, activation_fn=tf.identity, weights_regularizer=layers.l2_regularizer(1e-4)) - return h - - -def value_heads(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.flatten(h) - h = layers.fully_connected(h, 256, activation_fn=tf.nn.relu, weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.fully_connected(h, 1, activation_fn=tf.nn.tanh, weights_regularizer=layers.l2_regularizer(1e-4)) - return h - - -x = tf.placeholder(tf.float32, shape=[None, 19, 19, 17]) -is_training = tf.placeholder(tf.bool, shape=[]) -z = tf.placeholder(tf.float32, shape=[None, 1]) -pi = tf.placeholder(tf.float32, shape=[None, 362]) - -h = layers.conv2d(x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, normalizer_fn=layers.batch_norm, - normalizer_params={'is_training': is_training, 'updates_collections': tf.GraphKeys.UPDATE_OPS}, - weights_regularizer=layers.l2_regularizer(1e-4)) -for i in range(19): - h = residual_block(h, is_training) -v = value_heads(h, is_training) -p = policy_heads(h, is_training) -# loss = tf.reduce_mean(tf.square(z-v)) - tf.multiply(pi, tf.log(tf.clip_by_value(tf.nn.softmax(p), 1e-8, tf.reduce_max(tf.nn.softmax(p))))) -value_loss = tf.reduce_mean(tf.square(z - v)) -policy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=pi, logits=p)) - 
-reg = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) -total_loss = value_loss + policy_loss + reg -# train_op = tf.train.MomentumOptimizer(1e-4, momentum=0.9, use_nesterov=True).minimize(total_loss) -update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) -with tf.control_dependencies(update_ops): - train_op = tf.train.RMSPropOptimizer(1e-4).minimize(total_loss) -var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) -saver = tf.train.Saver(max_to_keep=10, var_list=var_list) - - -def train(): - data_path = "/home/tongzheng/data/" - data_name = os.listdir("/home/tongzheng/data/") - epochs = 100 - batch_size = 128 - - result_path = "./checkpoints/" - with multi_gpu.create_session() as sess: - sess.run(tf.global_variables_initializer()) - ckpt_file = tf.train.latest_checkpoint(result_path) - if ckpt_file is not None: - print('Restoring model from {}...'.format(ckpt_file)) - saver.restore(sess, ckpt_file) - for epoch in range(epochs): - for name in data_name: - data = np.load(data_path + name) - boards = data["boards"] - wins = data["wins"] - ps = data["ps"] - print (boards.shape) - print (wins.shape) - print (ps.shape) - # batch_num = 1 - batch_num = boards.shape[0] // batch_size - index = np.arange(boards.shape[0]) - np.random.shuffle(index) - value_losses = [] - policy_losses = [] - regs = [] - time_train = -time.time() - for iter in range(batch_num): - lv, lp, r, _ = sess.run([value_loss, policy_loss, reg, train_op], - feed_dict={x: boards[ - index[iter * batch_size:(iter + 1) * batch_size]], - z: wins[index[ - iter * batch_size:(iter + 1) * batch_size]], - pi: ps[index[ - iter * batch_size:(iter + 1) * batch_size]], - is_training: True}) - value_losses.append(lv) - policy_losses.append(lp) - regs.append(r) - del lv, lp, r - if iter % 1 == 0: - print( - "Epoch: {}, Part {}, Iteration: {}, Time: {}, Value Loss: {}, Policy Loss: {}, Reg: {}".format( - epoch, name, iter, time.time() + time_train, np.mean(np.array(value_losses)), - 
np.mean(np.array(policy_losses)), np.mean(np.array(regs)))) - del value_losses, policy_losses, regs, time_train - time_train = -time.time() - value_losses = [] - policy_losses = [] - regs = [] - if iter % 20 == 0: - save_path = "Epoch{}.Part{}.Iteration{}.ckpt".format(epoch, name, iter) - saver.save(sess, result_path + save_path) - del save_path - del data, boards, wins, ps, batch_num, index - gc.collect() - - -def forward(board): - result_path = "./checkpoints" - itflag = False - res = None - if board is None: - # data = np.load("/home/tongzheng/meta-data/80b7bf21bce14862806d48c3cd760a1b.npz") - data = np.load("./data/7f83928932f64a79bc1efdea268698ae.npz") - board = data["boards"][50].reshape(-1, 19, 19, 17) - human_board = board[0].transpose(2, 0, 1) - print("============================") - print("human board sum : " + str(np.sum(human_board))) - print("============================") - print(board[:, :, :, -1]) - itflag = False - with multi_gpu.create_session() as sess: - sess.run(tf.global_variables_initializer()) - ckpt_file = tf.train.latest_checkpoint(result_path) - if ckpt_file is not None: - print('Restoring model from {}...'.format(ckpt_file)) - saver.restore(sess, ckpt_file) - else: - raise ValueError("No model loaded") - res = sess.run([tf.nn.softmax(p), v], feed_dict={x: board, is_training: itflag}) - # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][300].reshape(-1, 19, 19, 17), is_training:False}) - # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][50].reshape(-1, 19, 19, 17), is_training:True}) - # print(np.argmax(res[0])) - print(res) - print(data["p"][0]) - print(np.argmax(res[0])) - print(np.argmax(data["p"][0])) - # print(res[0].tolist()[0]) - # print(np.argmax(res[0])) - return res - - -if __name__ == '__main__': - # train() - # if sys.argv[1] == "test": - forward(None) diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index 1f9af85..8b54470 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ 
-167,7 +167,7 @@ class GTPEngine(): move = self._parse_move(args) if move: color, vertex = move - res = self._game.do_move(color, vertex) + res = self._game.play_move(color, vertex) if res: return None, True else: @@ -177,17 +177,21 @@ class GTPEngine(): def cmd_genmove(self, args, **kwargs): color = self._parse_color(args) if color: - move = self._game.gen_move(color) + move = self._game.think_play_move(color) return self._vertex_point2string(move), True else: return 'unknown player', False def cmd_get_score(self, args, **kwargs): - return self._game.executor.get_score(), None + return self._game.game_engine.executor_get_score(self._game.board, True), True def cmd_show_board(self, args, **kwargs): return self._game.board, True + def cmd_get_prob(self, args, **kwargs): + return self._game.prob, True + + if __name__ == "main": game = Game() engine = GTPEngine(game_obj=Game) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 2a82d8e..8706572 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -9,16 +9,13 @@ import utils import copy import tensorflow as tf import numpy as np -import sys +import sys, os import go -import network_small -import strategy +import model from collections import deque +sys.path.append(os.path.join(os.path.dirname(__file__), os.path.pardir)) from tianshou.core.mcts.mcts import MCTS -import Network -#from strategy import strategy - class Game: ''' Load the real game and trained weights. @@ -26,7 +23,7 @@ class Game: TODO : Maybe merge with the engine class in future, currently leave it untouched for interacting with Go UI. 
''' - def __init__(self, size=9, komi=6.5, checkpoint_path=None): + def __init__(self, size=9, komi=3.75, checkpoint_path=None): self.size = size self.komi = komi self.board = [utils.EMPTY] * (self.size ** 2) @@ -34,24 +31,10 @@ class Game: self.latest_boards = deque(maxlen=8) for _ in range(8): self.latest_boards.append(self.board) - - self.executor = go.Go(game=self) - #self.strategy = strategy(checkpoint_path) - - self.simulator = strategy.GoEnv(game=self) - self.net = network_small.Network() - self.sess = self.net.forward(checkpoint_path) - self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], - feed_dict={self.net.x: state, self.net.is_training: False}) - - def _flatten(self, vertex): - x, y = vertex - return (x - 1) * self.size + (y - 1) - - def _deflatten(self, idx): - x = idx // self.size + 1 - y = idx % self.size + 1 - return (x, y) + self.evaluator = model.ResNet(self.size, self.size**2 + 1, history_length=8) + # self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], + # feed_dict={self.net.x: state, self.net.is_training: False}) + self.game_engine = go.Go(size=self.size, komi=self.komi) def clear(self): self.board = [utils.EMPTY] * (self.size ** 2) @@ -66,42 +49,30 @@ class Game: def set_komi(self, k): self.komi = k - def generate_nn_input(self, latest_boards, color): - state = np.zeros([1, self.size, self.size, 17]) - for i in range(8): - state[0, :, :, i] = np.array(np.array(latest_boards[i]) == np.ones(self.size ** 2)).reshape(self.size, self.size) - state[0, :, :, i + 8] = np.array(np.array(latest_boards[i]) == -np.ones(self.size ** 2)).reshape(self.size, self.size) - if color == utils.BLACK: - state[0, :, :, 16] = np.ones([self.size, self.size]) - if color == utils.WHITE: - state[0, :, :, 16] = np.zeros([self.size, self.size]) - return state - - def strategy_gen_move(self, latest_boards, color): - self.simulator.simulate_latest_boards = copy.copy(latest_boards) - 
self.simulator.simulate_board = copy.copy(latest_boards[-1]) - nn_input = self.generate_nn_input(self.simulator.simulate_latest_boards, color) - mcts = MCTS(self.simulator, self.evaluator, nn_input, self.size ** 2 + 1, inverse=True, max_step=1) + def think(self, latest_boards, color): + mcts = MCTS(self.game_engine, self.evaluator, [latest_boards, color], self.size ** 2 + 1, inverse=True) + mcts.search(max_step=20) temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) choice = np.random.choice(self.size ** 2 + 1, 1, p=prob).tolist()[0] if choice == self.size ** 2: move = utils.PASS else: - move = self._deflatten(choice) + move = self.game_engine._deflatten(choice) return move, prob - def do_move(self, color, vertex): + def play_move(self, color, vertex): + # this function can be called directly to play the opponent's move if vertex == utils.PASS: return True - res = self.executor.do_move(color, vertex) + res = self.game_engine.executor_do_move(self.history, self.latest_boards, self.board, color, vertex) return res - def gen_move(self, color): - # move = self.strategy.gen_move(color) - # return move - move, self.prob = self.strategy_gen_move(self.latest_boards, color) - self.do_move(color, move) + def think_play_move(self, color): + # although we don't need to return self.prob, however it is needed for neural network training + move, self.prob = self.think(self.latest_boards, color) + # play the move immediately + self.play_move(color, move) return move def status2symbol(self, s): @@ -127,6 +98,7 @@ class Game: if __name__ == "__main__": g = Game() g.show_board() + g.think_play_move(1) #file = open("debug.txt", "a") #file.write("mcts check\n") #file.close() diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 7b1d3e7..9b7e21f 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -1,7 +1,7 @@ from __future__ import print_function import utils import copy -import sys +import numpy as np from collections import deque ''' @@ -12,83 +12,26 @@ Settings of the Go game. 
''' NEIGHBOR_OFFSET = [[1, 0], [-1, 0], [0, -1], [0, 1]] +CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] class Go: def __init__(self, **kwargs): - self.game = kwargs['game'] + self.size = kwargs['size'] + self.komi = kwargs['komi'] - def _bfs(self, vertex, color, block, status): - block.append(vertex) - status[self.game._flatten(vertex)] = True - nei = self._neighbor(vertex) - for n in nei: - if not status[self.game._flatten(n)]: - if self.game.board[self.game._flatten(n)] == color: - self._bfs(n, color, block, status) + def _flatten(self, vertex): + x, y = vertex + return (x - 1) * self.size + (y - 1) - def _find_block(self, vertex): - block = [] - status = [False] * (self.game.size ** 2) - color = self.game.board[self.game._flatten(vertex)] - self._bfs(vertex, color, block, status) - - for b in block: - for n in self._neighbor(b): - if self.game.board[self.game._flatten(n)] == utils.EMPTY: - return False, block - return True, block - - def _find_boarder(self, vertex): - block = [] - status = [False] * (self.game.size ** 2) - self._bfs(vertex, utils.EMPTY, block, status) - border = [] - for b in block: - for n in self._neighbor(b): - if not (n in block): - border.append(n) - return border - - def _is_qi(self, color, vertex): - nei = self._neighbor(vertex) - for n in nei: - if self.game.board[self.game._flatten(n)] == utils.EMPTY: - return True - - self.game.board[self.game._flatten(vertex)] = color - for n in nei: - if self.game.board[self.game._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n) - if can_kill: - self.game.board[self.game._flatten(vertex)] = utils.EMPTY - return True - - ### can not suicide - can_kill, block = self._find_block(vertex) - if can_kill: - self.game.board[self.game._flatten(vertex)] = utils.EMPTY - return False - - self.game.board[self.game._flatten(vertex)] = utils.EMPTY - return True - - def _check_global_isomorphous(self, color, vertex): - ##backup - _board = copy.copy(self.game.board) - 
self.game.board[self.game._flatten(vertex)] = color - self._process_board(color, vertex) - if self.game.board in self.game.history: - res = True - else: - res = False - - self.game.board = _board - return res + def _deflatten(self, idx): + x = idx // self.size + 1 + y = idx % self.size + 1 + return (x, y) def _in_board(self, vertex): x, y = vertex - if x < 1 or x > self.game.size: return False - if y < 1 or y > self.game.size: return False + if x < 1 or x > self.size: return False + if y < 1 or y > self.size: return False return True def _neighbor(self, vertex): @@ -101,45 +44,201 @@ class Go: nei.append((_x, _y)) return nei - def _process_board(self, color, vertex): + def _corner(self, vertex): + x, y = vertex + corner = [] + for d in CORNER_OFFSET: + _x = x + d[0] + _y = y + d[1] + if self._in_board((_x, _y)): + corner.append((_x, _y)) + return corner + + def _find_group(self, current_board, vertex): + color = current_board[self._flatten(vertex)] + # print ("color : ", color) + chain = set() + frontier = [vertex] + has_liberty = False + while frontier: + current = frontier.pop() + # print ("current : ", current) + chain.add(current) + for n in self._neighbor(current): + if current_board[self._flatten(n)] == color and not n in chain: + frontier.append(n) + if current_board[self._flatten(n)] == utils.EMPTY: + has_liberty = True + return has_liberty, chain + + def _is_suicide(self, current_board, color, vertex): + current_board[self._flatten(vertex)] = color # assume that we already take this move + suicide = False + + has_liberty, group = self._find_group(current_board, vertex) + if not has_liberty: + suicide = True # no liberty, suicide + for n in self._neighbor(vertex): + if current_board[self._flatten(n)] == utils.another_color(color): + opponent_liberty, group = self._find_group(current_board, n) + if not opponent_liberty: + suicide = False # this move is able to take opponent's stone, not suicide + + current_board[self._flatten(vertex)] = utils.EMPTY # undo 
this move + return suicide + + def _process_board(self, current_board, color, vertex): nei = self._neighbor(vertex) for n in nei: - if self.game.board[self.game._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n) - if can_kill: - for b in block: - self.game.board[self.game._flatten(b)] = utils.EMPTY + if current_board[self._flatten(n)] == utils.another_color(color): + has_liberty, group = self._find_group(current_board, n) + if not has_liberty: + for b in group: + current_board[self._flatten(b)] = utils.EMPTY - def is_valid(self, color, vertex): + def _check_global_isomorphous(self, history_boards, current_board, color, vertex): + repeat = False + next_board = copy.copy(current_board) + next_board[self._flatten(vertex)] = color + self._process_board(next_board, color, vertex) + if next_board in history_boards: + repeat = True + return repeat + + def _is_eye(self, current_board, color, vertex): + nei = self._neighbor(vertex) + cor = self._corner(vertex) + ncolor = {color == current_board[self._flatten(n)] for n in nei} + if False in ncolor: + # print "not all neighbors are in same color with us" + return False + _, group = self._find_group(current_board, nei[0]) + if set(nei) < group: + # print "all neighbors are in same group and same color with us" + return True + else: + opponent_number = [current_board[self._flatten(c)] for c in cor].count(-color) + opponent_propotion = float(opponent_number) / float(len(cor)) + if opponent_propotion < 0.5: + # print "few opponents, real eye" + return True + else: + # print "many opponents, fake eye" + return False + + def _knowledge_prunning(self, current_board, color, vertex): + # forbid some stupid selfplay using human knowledge + if self._is_eye(current_board, color, vertex): + return False + # forbid position on its own eye. 
+ return True + + def _is_game_finished(self, current_board, color): + ''' + for each empty position, if it has both BLACK and WHITE neighbors, the game is still not finished + :return: return the game is finished + ''' + board = copy.deepcopy(current_board) + empty_idx = [i for i, x in enumerate(board) if x == utils.EMPTY] # find all empty idx + for idx in empty_idx: + neighbor_idx = self._neighbor(self.deflatten(idx)) + if len(neighbor_idx) > 1: + first_idx = neighbor_idx[0] + for other_idx in neighbor_idx[1:]: + if board[self.flatten(other_idx)] != board[self.flatten(first_idx)]: + return False + + return True + + def _action2vertex(self, action): + if action == self.size ** 2: + vertex = (0, 0) + else: + vertex = self._deflatten(action) + return vertex + + def _is_valid(self, history_boards, current_board, color, vertex): ### in board if not self._in_board(vertex): return False ### already have stone - if not self.game.board[self.game._flatten(vertex)] == utils.EMPTY: + if not current_board[self._flatten(vertex)] == utils.EMPTY: return False - ### check if it is qi - if not self._is_qi(color, vertex): + ### check if it is suicide + if self._is_suicide(current_board, color, vertex): return False - if self._check_global_isomorphous(color, vertex): + ### forbid global isomorphous + if self._check_global_isomorphous(history_boards, current_board, color, vertex): return False return True - def do_move(self, color, vertex): - if not self.is_valid(color, vertex): + def simulate_is_valid(self, state, action): + history_boards, color = state + vertex = self._action2vertex(action) + current_board = history_boards[-1] + + if not self._is_valid(history_boards, current_board, color, vertex): + return False + + if not self._knowledge_prunning(current_board, color, vertex): return False - self.game.board[self.game._flatten(vertex)] = color - self._process_board(color, vertex) - self.game.history.append(copy.copy(self.game.board)) - 
self.game.latest_boards.append(copy.copy(self.game.board)) return True - def _find_empty(self): - idx = [i for i,x in enumerate(self.game.board) if x == utils.EMPTY ][0] - return self.game._deflatten(idx) + def simulate_is_valid_list(self, state, action_set): + # find all the invalid actions + invalid_action_list = [] + for action_candidate in action_set[:-1]: + # go through all the actions excluding pass + if not self.simulate_is_valid(state, action_candidate): + invalid_action_list.append(action_candidate) + if len(invalid_action_list) < len(action_set) - 1: + invalid_action_list.append(action_set[-1]) + # forbid pass, if we have other choices + # TODO: In fact we should not do this. In some extreme cases, we should permit pass. + return invalid_action_list + + def _do_move(self, board, color, vertex): + if vertex == utils.PASS: + return board + else: + id_ = self._flatten(vertex) + board[id_] = color + return board + + def simulate_step_forward(self, state, action): + # initialize the simulate_board from state + history_boards, color = state + vertex = self._action2vertex(action) + new_board = self._do_move(copy.copy(history_boards[-1]), color, vertex) + history_boards.append(new_board) + new_color = -color + return [history_boards, new_color], 0 + + def executor_do_move(self, history, latest_boards, current_board, color, vertex): + if not self._is_valid(history, current_board, color, vertex): + return False + current_board[self._flatten(vertex)] = color + self._process_board(current_board, color, vertex) + history.append(copy.copy(current_board)) + latest_boards.append(copy.copy(current_board)) + return True + + def _find_empty(self, current_board): + idx = [i for i,x in enumerate(current_board) if x == utils.EMPTY ][0] + return self._deflatten(idx) + + def _find_boarder(self, current_board, vertex): + _, group = self._find_group(current_board, vertex) + border = [] + for b in group: + for n in self._neighbor(b): + if not (n in group): + border.append(n) + 
return border def _add_nearby_stones(self, neighbor_vertex_set, start_vertex_x, start_vertex_y, x_diff, y_diff, num_step): ''' @@ -159,7 +258,7 @@ class Go: start_vertex_x += x_diff start_vertex_y += y_diff - def _predict_from_nearby(self, vertex, neighbor_step = 3): + def _predict_from_nearby(self, current_board, vertex, neighbor_step=3): ''' step: the nearby 3 steps is considered :vertex: position to be estimated @@ -175,38 +274,37 @@ class Go: self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] - step, -1, 1, neighbor_step) color_estimate = 0 for neighbor_vertex in neighbor_vertex_set: - color_estimate += self.game.board[self.game._flatten(neighbor_vertex)] + color_estimate += current_board[self._flatten(neighbor_vertex)] if color_estimate > 0: return utils.BLACK elif color_estimate < 0: return utils.WHITE - def get_score(self, is_unknown_estimation = False): + def executor_get_score(self, current_board, is_unknown_estimation=False): ''' is_unknown_estimation: whether use nearby stone to predict the unknown return score from BLACK perspective. 
''' - _board = copy.copy(self.game.board) - while utils.EMPTY in self.game.board: - vertex = self._find_empty() - boarder = self._find_boarder(vertex) - boarder_color = set(map(lambda v: self.game.board[self.game._flatten(v)], boarder)) + _board = copy.deepcopy(current_board) + while utils.EMPTY in _board: + vertex = self._find_empty(_board) + boarder = self._find_boarder(_board, vertex) + boarder_color = set(map(lambda v: _board[self._flatten(v)], boarder)) if boarder_color == {utils.BLACK}: - self.game.board[self.game._flatten(vertex)] = utils.BLACK + _board[self._flatten(vertex)] = utils.BLACK elif boarder_color == {utils.WHITE}: - self.game.board[self.game._flatten(vertex)] = utils.WHITE + _board[self._flatten(vertex)] = utils.WHITE elif is_unknown_estimation: - self.game.board[self.game._flatten(vertex)] = self._predict_from_nearby(vertex) + _board[self._flatten(vertex)] = self._predict_from_nearby(_board, vertex) else: - self.game.board[self.game._flatten(vertex)] =utils.UNKNOWN + _board[self._flatten(vertex)] =utils.UNKNOWN score = 0 - for i in self.game.board: + for i in _board: if i == utils.BLACK: score += 1 elif i == utils.WHITE: score -= 1 - score -= self.game.komi + score -= self.komi - self.game.board = _board return score diff --git a/AlphaGo/model.py b/AlphaGo/model.py new file mode 100644 index 0000000..41f3a47 --- /dev/null +++ b/AlphaGo/model.py @@ -0,0 +1,270 @@ +import os +import time +import sys +import cPickle +from collections import deque + +import numpy as np +import tensorflow as tf +import tensorflow.contrib.layers as layers + +import multi_gpu + +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + + +def residual_block(input, is_training): + """ + one residual block + + :param input: a tensor, input of the residual block + :param is_training: a placeholder, indicate whether the model is training or not + :return: a tensor, output of the residual block + """ + normalizer_params = {'is_training': is_training, + 'updates_collections': 
tf.GraphKeys.UPDATE_OPS} + h = layers.conv2d(input, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, + normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, + weights_regularizer=layers.l2_regularizer(1e-4)) + h = layers.conv2d(h, 256, kernel_size=3, stride=1, activation_fn=tf.identity, + normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, + weights_regularizer=layers.l2_regularizer(1e-4)) + h = h + input + return tf.nn.relu(h) + + +def policy_head(input, is_training, action_num): + """ + the head of policy branch + + :param input: a tensor, input of the policy head + :param is_training: a placeholder, indicate whether the model is training or not + :param action_num: action_num: an integer, number of unique actions at any state + :return: a tensor: output of the policy head, shape [batch_size, action_num] + """ + normalizer_params = {'is_training': is_training, + 'updates_collections': tf.GraphKeys.UPDATE_OPS} + h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, + normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, + weights_regularizer=layers.l2_regularizer(1e-4)) + h = layers.flatten(h) + h = layers.fully_connected(h, action_num, activation_fn=tf.identity, + weights_regularizer=layers.l2_regularizer(1e-4)) + return h + + +def value_head(input, is_training): + """ + the head of value branch + + :param input: a tensor, input of the value head + :param is_training: a placeholder, indicate whether the model is training or not + :return: a tensor, output of the value head, shape [batch_size, 1] + """ + normalizer_params = {'is_training': is_training, + 'updates_collections': tf.GraphKeys.UPDATE_OPS} + h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, + normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, + weights_regularizer=layers.l2_regularizer(1e-4)) + h = layers.flatten(h) + h = layers.fully_connected(h, 256, 
activation_fn=tf.nn.relu, weights_regularizer=layers.l2_regularizer(1e-4)) + h = layers.fully_connected(h, 1, activation_fn=tf.nn.tanh, weights_regularizer=layers.l2_regularizer(1e-4)) + return h + + +class Data(object): + def __init__(self): + self.boards = [] + self.probs = [] + self.winner = 0 + + +class ResNet(object): + def __init__(self, board_size, action_num, history_length=1, residual_block_num=20, checkpoint_path=None): + """ + the resnet model + + :param board_size: an integer, the board size + :param action_num: an integer, number of unique actions at any state + :param history_length: an integer, the history length to use, default is 1 + :param residual_block_num: an integer, the number of residual block, default is 20, at least 1 + :param checkpoint_path: a string, the path to the checkpoint, default is None, + """ + self.board_size = board_size + self.action_num = action_num + self.history_length = history_length + self.checkpoint_path = checkpoint_path + self.x = tf.placeholder(tf.float32, shape=[None, self.board_size, self.board_size, 2 * self.history_length + 1]) + self.is_training = tf.placeholder(tf.bool, shape=[]) + self.z = tf.placeholder(tf.float32, shape=[None, 1]) + self.pi = tf.placeholder(tf.float32, shape=[None, self.action_num]) + self._build_network(residual_block_num, self.checkpoint_path) + + # training hyper-parameters: + self.window_length = 1000 + self.save_freq = 1000 + self.training_data = {'states': deque(maxlen=self.window_length), 'probs': deque(maxlen=self.window_length), + 'winner': deque(maxlen=self.window_length)} + + def _build_network(self, residual_block_num, checkpoint_path): + """ + build the network + + :param residual_block_num: an integer, the number of residual block + :param checkpoint_path: a string, the path to the checkpoint, if None, use random initialization parameter + :return: None + """ + + h = layers.conv2d(self.x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, + 
normalizer_fn=layers.batch_norm, + normalizer_params={'is_training': self.is_training, + 'updates_collections': tf.GraphKeys.UPDATE_OPS}, + weights_regularizer=layers.l2_regularizer(1e-4)) + for i in range(residual_block_num - 1): + h = residual_block(h, self.is_training) + self.v = value_head(h, self.is_training) + self.p = policy_head(h, self.is_training, self.action_num) + self.value_loss = tf.reduce_mean(tf.square(self.z - self.v)) + self.policy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.pi, logits=self.p)) + + self.reg = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) + self.total_loss = self.value_loss + self.policy_loss + self.reg + self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) + with tf.control_dependencies(self.update_ops): + self.train_op = tf.train.AdamOptimizer(1e-4).minimize(self.total_loss) + self.var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) + self.saver = tf.train.Saver(var_list=self.var_list) + self.sess = multi_gpu.create_session() + self.sess.run(tf.global_variables_initializer()) + if checkpoint_path is not None: + ckpt_file = tf.train.latest_checkpoint(checkpoint_path) + if ckpt_file is not None: + print('Restoring model from {}...'.format(ckpt_file)) + self.saver.restore(self.sess, ckpt_file) + print('Successfully loaded') + else: + raise ValueError("No model in path {}".format(checkpoint_path)) + + def __call__(self, state): + """ + + :param history: a list, the history + :param color: a string, indicate which one to play + :return: a list of tensor, the predicted value and policy given the history and color + """ + history, color = state + if len(history) != self.history_length: + raise ValueError( + 'The length of history cannot meet the need of the model, given {}, need {}'.format(len(history), + self.history_length)) + state = self._history2state(history, color) + return self.sess.run([self.p, self.v], feed_dict={self.x: state, self.is_training: False}) + + def 
_history2state(self, history, color): + """ + convert the history to the state we need + + :param history: a list, the history + :param color: a string, indicate which one to play + :return: a ndarray, the state + """ + state = np.zeros([1, self.board_size, self.board_size, 2 * self.history_length + 1]) + for i in range(self.history_length): + state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(self.board_size ** 2)).reshape(self.board_size, + self.board_size) + state[0, :, :, i + self.history_length] = np.array( + np.array(history[i]) == -np.ones(self.board_size ** 2)).reshape(self.board_size, self.board_size) + # TODO: need a config to specify the BLACK and WHITE + if color == +1: + state[0, :, :, 2 * self.history_length] = np.ones([self.board_size, self.board_size]) + if color == -1: + state[0, :, :, 2 * self.history_length] = np.zeros([self.board_size, self.board_size]) + return state + + # TODO: design the interface between the environment and training + def train(self, mode='memory', *args, **kwargs): + if mode == 'memory': + pass + if mode == 'file': + self._train_with_file(data_path=kwargs['data_path'], batch_size=kwargs['batch_size'], + checkpoint_path=kwargs['checkpoint_path']) + + def _train_with_file(self, data_path, batch_size, checkpoint_path): + # check if the path is valid + if not os.path.exists(data_path): + raise ValueError("{} doesn't exist".format(data_path)) + self.checkpoint_path = checkpoint_path + if not os.path.exists(self.checkpoint_path): + os.mkdir(self.checkpoint_path) + + new_file_list = [] + all_file_list = [] + training_data = {} + iters = 0 + while True: + new_file_list = list(set(os.listdir(data_path)).difference(all_file_list)) + all_file_list = os.listdir(data_path) + new_file_list.sort( + key=lambda file: os.path.getmtime(data_path + file) if not os.path.isdir(data_path + file) else 0) + if new_file_list: + for file in new_file_list: + states, probs, winner = self._file_to_training_data(data_path + file) + assert 
states.shape[0] == probs.shape[0] + assert states.shape[0] == winner.shape[0] + self.training_data['states'].append(states) + self.training_data['probs'].append(probs) + self.training_data['winner'].append(winner) + training_data['states'] = np.concatenate(self.training_data['states'], axis=0) + training_data['probs'] = np.concatenate(self.training_data['probs'], axis=0) + training_data['winner'] = np.concatenate(self.training_data['winner'], axis=0) + + if len(self.training_data['states']) != self.window_length: + continue + else: + data_num = training_data['states'].shape[0] + index = np.arange(data_num) + np.random.shuffle(index) + start_time = time.time() + value_loss, policy_loss, reg, _ = self.sess.run( + [self.value_loss, self.policy_loss, self.reg, self.train_op], + feed_dict={self.x: training_data['states'][index[:batch_size]], + self.z: training_data['winner'][index[:batch_size]], + self.pi: training_data['probs'][index[:batch_size]], + self.is_training: True}) + print("Iteration: {}, Time: {}, Value Loss: {}, Policy Loss: {}, Reg: {}".format(iters, + time.time() - start_time, + value_loss, + policy_loss, reg)) + iters += 1 + if iters % self.save_freq == 0: + save_path = "Iteration{}.ckpt".format(iters) + self.saver.save(self.sess, self.checkpoint_path + save_path) + + def _file_to_training_data(self, file_name): + with open(file_name, 'r') as file: + data = cPickle.load(file) + history = deque(maxlen=self.history_length) + states = [] + probs = [] + winner = [] + for _ in range(self.history_length): + # Note that 0 is specified, need a more general way like config + history.append([0] * self.board_size ** 2) + # Still, +1 is specified + color = +1 + + for [board, prob] in zip(data.boards, data.probs): + history.append(board) + states.append(self._history2state(history, color)) + probs.append(np.array(prob).reshape(1, self.board_size ** 2 + 1)) + winner.append(np.array(data.winner).reshape(1, 1)) + color *= -1 + states = np.concatenate(states, axis=0) + 
probs = np.concatenate(probs, axis=0) + winner = np.concatenate(winner, axis=0) + return states, probs, winner + + +if __name__=="__main__": + model = ResNet(board_size=9, action_num=82) + model.train("file", data_path="./data/", batch_size=128, checkpoint_path="./checkpoint/") \ No newline at end of file diff --git a/AlphaGo/network_small.py b/AlphaGo/network.py similarity index 100% rename from AlphaGo/network_small.py rename to AlphaGo/network.py diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 7367804..a8267a7 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -5,6 +5,18 @@ import re import Pyro4 import time import os +import cPickle + + +class Data(object): + def __init__(self): + self.boards = [] + self.probs = [] + self.winner = 0 + + def reset(self): + self.__init__() + if __name__ == '__main__': """ @@ -13,10 +25,14 @@ if __name__ == '__main__': """ # TODO : we should set the network path in a more configurable way. parser = argparse.ArgumentParser() + parser.add_argument("--result_path", type=str, default="./data/") parser.add_argument("--black_weight_path", type=str, default=None) parser.add_argument("--white_weight_path", type=str, default=None) + parser.add_argument("--id", type=int, default=0) args = parser.parse_args() + if not os.path.exists(args.result_path): + os.mkdir(args.result_path) # black_weight_path = "./checkpoints" # white_weight_path = "./checkpoints_origin" if args.black_weight_path is not None and (not os.path.exists(args.black_weight_path)): @@ -25,24 +41,29 @@ if __name__ == '__main__': raise ValueError("Can't not find the network weights for white player.") # kill the old server - kill_old_server = subprocess.Popen(['killall', 'pyro4-ns']) - print "kill the old pyro4 name server, the return code is : " + str(kill_old_server.wait()) - time.sleep(1) + # kill_old_server = subprocess.Popen(['killall', 'pyro4-ns']) + # print "kill the old pyro4 name server, the return code is : " + str(kill_old_server.wait()) + # time.sleep(1) # 
start a name server to find the remote object - start_new_server = subprocess.Popen(['pyro4-ns', '&']) - print "Start Name Sever : " + str(start_new_server.pid) # + str(start_new_server.wait()) - time.sleep(1) + # start_new_server = subprocess.Popen(['pyro4-ns', '&']) + # print "Start Name Sever : " + str(start_new_server.pid) # + str(start_new_server.wait()) + # time.sleep(1) # start two different player with different network weights. - agent_v0 = subprocess.Popen(['python', '-u', 'player.py', '--role=black', '--checkpoint_path=' + str(args.black_weight_path)], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + black_role_name = 'black' + str(args.id) + white_role_name = 'white' + str(args.id) - agent_v1 = subprocess.Popen(['python', '-u', 'player.py', '--role=white', '--checkpoint_path=' + str(args.white_weight_path)], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + agent_v0 = subprocess.Popen( + ['python', '-u', 'player.py', '--role=' + black_role_name, '--checkpoint_path=' + str(args.black_weight_path)], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + + agent_v1 = subprocess.Popen( + ['python', '-u', 'player.py', '--role=' + white_role_name, '--checkpoint_path=' + str(args.white_weight_path)], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) server_list = "" - while ("black" not in server_list) or ("white" not in server_list): + while (black_role_name not in server_list) or (white_role_name not in server_list): server_list = subprocess.check_output(['pyro4-nsc', 'list']) print "Waiting for the server start..." 
time.sleep(1) @@ -50,51 +71,82 @@ if __name__ == '__main__': print "Start black player at : " + str(agent_v0.pid) print "Start white player at : " + str(agent_v1.pid) + data = Data() player = [None] * 2 - player[0] = Pyro4.Proxy("PYRONAME:black") - player[1] = Pyro4.Proxy("PYRONAME:white") + player[0] = Pyro4.Proxy("PYRONAME:" + black_role_name) + player[1] = Pyro4.Proxy("PYRONAME:" + white_role_name) role = ["BLACK", "WHITE"] color = ['b', 'w'] pattern = "[A-Z]{1}[0-9]{1}" + space = re.compile("\s+") size = 9 show = ['.', 'X', 'O'] evaluate_rounds = 1 game_num = 0 - while game_num < evaluate_rounds: - num = 0 - pass_flag = [False, False] - print("Start game {}".format(game_num)) - # end the game if both palyer chose to pass, or play too much turns - while not (pass_flag[0] and pass_flag[1]) and num < size ** 2 * 2: - turn = num % 2 - move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn] + '\n') - print role[turn] + " : " + str(move), - num += 1 - match = re.search(pattern, move) - if match is not None: - # print "match : " + str(match.group()) - play_or_pass = match.group() - pass_flag[turn] = False + try: + while True: + start_time = time.time() + num = 0 + pass_flag = [False, False] + print("Start game {}".format(game_num)) + # end the game if both palyer chose to pass, or play too much turns + while not (pass_flag[0] and pass_flag[1]) and num < size ** 2 * 2: + turn = num % 2 + board = player[turn].run_cmd(str(num) + ' show_board') + board = eval(board[board.index('['):board.index(']') + 1]) + for i in range(size): + for j in range(size): + print show[board[i * size + j]] + " ", + print "\n", + data.boards.append(board) + move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn] + '\n') + print role[turn] + " : " + str(move), + num += 1 + match = re.search(pattern, move) + if match is not None: + # print "match : " + str(match.group()) + play_or_pass = match.group() + pass_flag[turn] = False + else: + # print "no match" + play_or_pass = ' 
PASS' + pass_flag[turn] = True + result = player[1 - turn].run_cmd(str(num) + ' play ' + color[turn] + ' ' + play_or_pass + '\n') + prob = player[turn].run_cmd(str(num) + ' get_prob') + prob = space.sub(',', prob[prob.index('['):prob.index(']') + 1]) + prob = prob.replace('[,', '[') + prob = prob.replace('],', ']') + prob = eval(prob) + data.probs.append(prob) + score = player[turn].run_cmd(str(num) + ' get_score') + print "Finished : ", score.split(" ")[1] + # TODO: generalize the player + if eval(score.split(" ")[1]) > 0: + data.winner = 1 + if eval(score.split(" ")[1]) < 0: + data.winner = -1 + player[0].run_cmd(str(num) + ' clear_board') + player[1].run_cmd(str(num) + ' clear_board') + file_list = os.listdir(args.result_path) + if not file_list: + data_num = 0 else: - # print "no match" - play_or_pass = ' PASS' - pass_flag[turn] = True - result = player[1 - turn].run_cmd(str(num) + ' play ' + color[turn] + ' ' + play_or_pass + '\n') - board = player[turn].run_cmd(str(num) + ' show_board') - board = eval(board[board.index('['):board.index(']') + 1]) - for i in range(size): - for j in range(size): - print show[board[i * size + j]] + " ", - print "\n", + file_list.sort(key=lambda file: os.path.getmtime(args.result_path + file) if not os.path.isdir( + args.result_path + file) else 0) + data_num = eval(file_list[-1][:-4]) + 1 + with open("./data/" + str(data_num) + ".pkl", "w") as file: + picklestring = cPickle.dump(data, file) + data.reset() + game_num += 1 - score = player[turn].run_cmd(str(num) + ' get_score') - print "Finished : ", score.split(" ")[1] - player[0].run_cmd(str(num) + ' clear_board') - player[1].run_cmd(str(num) + ' clear_board') - game_num += 1 + except Exception as e: + print(e) + subprocess.call(["kill", "-9", str(agent_v0.pid)]) + subprocess.call(["kill", "-9", str(agent_v1.pid)]) + print "Kill all player, finish all game." 
subprocess.call(["kill", "-9", str(agent_v0.pid)]) subprocess.call(["kill", "-9", str(agent_v1.pid)]) diff --git a/AlphaGo/player.py b/AlphaGo/player.py index b468cf3..0e3daff 100644 --- a/AlphaGo/player.py +++ b/AlphaGo/player.py @@ -20,6 +20,7 @@ class Player(object): #return "inside the Player of player.py" return self.engine.run_cmd(command) + if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument("--checkpoint_path", type=str, default=None) diff --git a/AlphaGo/self-play.py b/AlphaGo/self-play.py index 98ccf84..4387b24 100644 --- a/AlphaGo/self-play.py +++ b/AlphaGo/self-play.py @@ -79,7 +79,7 @@ while True: prob.append(np.array(game.prob).reshape(-1, game.size ** 2 + 1)) print("Finished") print("\n") - score = game.executor.get_score(True) + score = game.game_engine.executor_get_score(game.board, True) if score > 0: winner = utils.BLACK else: diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py deleted file mode 100644 index 112f130..0000000 --- a/AlphaGo/strategy.py +++ /dev/null @@ -1,210 +0,0 @@ -import os, sys - -sys.path.append(os.path.join(os.path.dirname(__file__), os.path.pardir)) -import numpy as np -import utils -import time -import copy -import network_small -import tensorflow as tf -from collections import deque -from tianshou.core.mcts.mcts import MCTS - -DELTA = [[1, 0], [-1, 0], [0, -1], [0, 1]] -CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] - -class GoEnv: - def __init__(self, **kwargs): - self.game = kwargs['game'] - self.simulate_board = [utils.EMPTY] * (self.game.size ** 2) - self.simulate_latest_boards = deque(maxlen=8) - - def simulate_flatten(self, vertex): - x, y = vertex - return (x - 1) * self.game.size + (y - 1) - - def simulate_deflatten(self, idx): - x = idx // self.game.size + 1 - y = idx % self.game.size + 1 - return (x, y) - - def _find_group(self, start): - color = self.simulate_board[self.simulate_flatten(start)] - # print ("color : ", color) - chain = set() - frontier = [start] - 
has_liberty = False - while frontier: - current = frontier.pop() - # print ("current : ", current) - chain.add(current) - for n in self._neighbor(current): - # print n, self._flatten(n), self.board[self._flatten(n)], - if self.simulate_board[self.simulate_flatten(n)] == color and not n in chain: - frontier.append(n) - if self.simulate_board[self.simulate_flatten(n)] == utils.EMPTY: - has_liberty = True - return has_liberty, chain - - def _is_suicide(self, color, vertex): - self.simulate_board[self.simulate_flatten(vertex)] = color # assume that we already take this move - suicide = False - - has_liberty, group = self._find_group(vertex) - if not has_liberty: - suicide = True # no liberty, suicide - for n in self._neighbor(vertex): - if self.simulate_board[self.simulate_flatten(n)] == utils.another_color(color): - opponent_liberty, group = self._find_group(n) - if not opponent_liberty: - suicide = False # this move is able to take opponent's stone, not suicide - - self.simulate_board[self.simulate_flatten(vertex)] = utils.EMPTY # undo this move - return suicide - - def _check_global_isomorphous(self, color, vertex): - ##backup - _board = copy.copy(self.simulate_board) - self.simulate_board[self.simulate_flatten(vertex)] = color - self._process_board(color, vertex) - if self.simulate_board in self.game.history: - res = True - else: - res = False - - self.simulate_board = _board - return res - - def _in_board(self, vertex): - x, y = vertex - if x < 1 or x > self.game.size: return False - if y < 1 or y > self.game.size: return False - return True - - def _neighbor(self, vertex): - x, y = vertex - nei = [] - for d in DELTA: - _x = x + d[0] - _y = y + d[1] - if self._in_board((_x, _y)): - nei.append((_x, _y)) - return nei - - def _corner(self, vertex): - x, y = vertex - corner = [] - for d in CORNER_OFFSET: - _x = x + d[0] - _y = y + d[1] - if self._in_board((_x, _y)): - corner.append((_x, _y)) - return corner - - def _process_board(self, color, vertex): - nei = 
self._neighbor(vertex) - for n in nei: - if self.simulate_board[self.simulate_flatten(n)] == utils.another_color(color): - has_liberty, group = self._find_group(n) - if not has_liberty: - for b in group: - self.simulate_board[self.simulate_flatten(b)] = utils.EMPTY - - def _is_eye(self, color, vertex): - nei = self._neighbor(vertex) - cor = self._corner(vertex) - ncolor = {color == self.simulate_board[self.simulate_flatten(n)] for n in nei} - if False in ncolor: - # print "not all neighbors are in same color with us" - return False - _, group = self._find_group(nei[0]) - if set(nei) < group: - # print "all neighbors are in same group and same color with us" - return True - else: - opponent_number = [self.simulate_board[self.simulate_flatten(c)] for c in cor].count(-color) - opponent_propotion = float(opponent_number) / float(len(cor)) - if opponent_propotion < 0.5: - # print "few opponents, real eye" - return True - else: - # print "many opponents, fake eye" - return False - - def knowledge_prunning(self, color, vertex): - ### check if it is an eye of yourself - ### assumptions : notice that this judgement requires that the state is an endgame - if self._is_eye(color, vertex): - return False - return True - - def simulate_is_valid(self, state, action): - # State is the play board, the shape is [1, self.game.size, self.game.size, 17]. 
- # Action is an index - # We need to transfer the (state, action) pair into (color, vertex) pair to simulate the move - if action == self.game.size ** 2: - vertex = (0, 0) - else: - vertex = self.simulate_deflatten(action) - if state[0, 0, 0, -1] == utils.BLACK: - color = utils.BLACK - else: - color = utils.WHITE - self.simulate_latest_boards.clear() - for i in range(8): - self.simulate_latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) - self.simulate_board = copy.copy(self.simulate_latest_boards[-1]) - - ### in board - if not self._in_board(vertex): - return False - - ### already have stone - if not self.simulate_board[self.simulate_flatten(vertex)] == utils.EMPTY: - # print(np.array(self.board).reshape(9, 9)) - # print(vertex) - return False - - ### check if it is suicide - if self._is_suicide(color, vertex): - return False - - ### forbid global isomorphous - if self._check_global_isomorphous(color, vertex): - return False - - if not self.knowledge_prunning(color, vertex): - return False - - return True - - def simulate_do_move(self, color, vertex): - if vertex == utils.PASS: - return True - - id_ = self.simulate_flatten(vertex) - if self.simulate_board[id_] == utils.EMPTY: - self.simulate_board[id_] = color - return True - else: - return False - - def step_forward(self, state, action): - if state[0, 0, 0, -1] == 1: - color = utils.BLACK - else: - color = utils.WHITE - if action == self.game.size ** 2: - vertex = utils.PASS - else: - vertex = self.simulate_deflatten(action) - # print(vertex) - # print(self.board) - self.simulate_board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() - self.simulate_do_move(color, vertex) - new_state = np.concatenate( - [state[:, :, :, 1:8], (np.array(self.simulate_board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), - state[:, :, :, 9:16], (np.array(self.simulate_board) == utils.WHITE).reshape(1, self.game.size, self.game.size, 1), - np.array(1 - state[:, :, :, 
-1]).reshape(1, self.game.size, self.game.size, 1)], - axis=3) - return new_state, 0 diff --git a/tianshou/core/mcts/evaluator.py b/tianshou/core/mcts/evaluator.py index 9c4ee8e..a1f9456 100644 --- a/tianshou/core/mcts/evaluator.py +++ b/tianshou/core/mcts/evaluator.py @@ -19,10 +19,10 @@ class rollout_policy(evaluator): # TODO: prior for rollout policy total_reward = 0. action = np.random.randint(0, self.action_num) - state, reward = self.env.step_forward(state, action) + state, reward = self.env.simulate_step_forward(state, action) total_reward += reward while state is not None: action = np.random.randint(0, self.action_num) - state, reward = self.env.step_forward(state, action) + state, reward = self.env.simulate_step_forward(state, action) total_reward += reward return np.ones([self.action_num])/self.action_num, total_reward diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 979e994..8bb5f06 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -71,15 +71,10 @@ class UCTNode(MCTSNode): self.parent.backpropagation(self.children[action].reward) def valid_mask(self, simulator): + # let all invalid actions be illeagel in mcts if self.mask is None: - start_time = time.time() - self.mask = [] - for act in range(self.action_num - 1): - if not simulator.simulate_is_valid(self.state, act): - self.mask.append(act) - self.ucb[act] = -float("Inf") - else: - self.ucb[self.mask] = -float("Inf") + self.mask = simulator.simulate_is_valid_list(self.state, range(self.action_num)) + self.ucb[self.mask] = -float("Inf") class TSNode(MCTSNode): @@ -116,7 +111,7 @@ class ActionNode(object): self.next_state = tuple2list(self.next_state) def selection(self, simulator): - self.next_state, self.reward = simulator.step_forward(self.parent.state, self.action) + self.next_state, self.reward = simulator.simulate_step_forward(self.parent.state, self.action) self.origin_state = self.next_state self.state_type = type(self.next_state) 
self.type_conversion_to_tuple() @@ -143,8 +138,7 @@ class ActionNode(object): class MCTS(object): - def __init__(self, simulator, evaluator, root, action_num, method="UCT", inverse=False, max_step=None, - max_time=None): + def __init__(self, simulator, evaluator, root, action_num, method="UCT", inverse=False): self.simulator = simulator self.evaluator = evaluator prior, _ = self.evaluator(root) @@ -152,33 +146,26 @@ if method == "": self.root = root if method == "UCT": - self.root = UCTNode(None, None, root, action_num, prior, inverse) + self.root = UCTNode(None, None, root, action_num, prior, inverse=inverse) if method == "TS": self.root = TSNode(None, None, root, action_num, prior, inverse=inverse) self.inverse = inverse - if max_step is not None: - self.step = 0 - self.max_step = max_step - # TODO: Optimize the stop criteria - # else: - # self.max_step = 0 - if max_time is not None: - self.start_time = time.time() - self.max_time = max_time + + def search(self, max_step=None, max_time=None): + step = 0 + start_time = time.time() + if max_step is None: + max_step = float("Inf") + if max_time is None: + max_time = float("Inf") if max_step is None and max_time is None: raise ValueError("Need a stop criteria!") - # TODO: running mcts should be implemented in another function, e.g. def search(self, max_step, max_time) - self.select_time = [] - self.evaluate_time = [] - self.bp_time = [] - while (max_step is not None and self.step < self.max_step or max_step is None) \ - and (max_time is not None and time.time() - self.start_time < self.max_time or max_time is None): - self.expand() - if max_step is not None: - self.step += 1 + while step < max_step and time.time() - start_time < max_time: + self._expand() + step += 1 - def expand(self): + def _expand(self): node, new_action = self.root.selection(self.simulator) value = node.children[new_action].expansion(self.evaluator, self.action_num) node.children[new_action].backpropagation(value + 0.)