diff --git a/.gitignore b/.gitignore index e795259..d697b92 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,5 @@ parameters checkpoints checkpoints_origin *.json +.DS_Store +data diff --git a/AlphaGo/Network.py b/AlphaGo/Network.py deleted file mode 100644 index caf7710..0000000 --- a/AlphaGo/Network.py +++ /dev/null @@ -1,211 +0,0 @@ -import os -import time -import sys - -import numpy as np -import time -import tensorflow as tf -import tensorflow.contrib.layers as layers - -import multi_gpu -import time - -# os.environ["CUDA_VISIBLE_DEVICES"] = "1" -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' - - -def residual_block(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.conv2d(h, 256, kernel_size=3, stride=1, activation_fn=tf.identity, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = h + input - return tf.nn.relu(h) - - -def policy_heads(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.flatten(h) - h = layers.fully_connected(h, 362, activation_fn=tf.identity, weights_regularizer=layers.l2_regularizer(1e-4)) - return h - - -def value_heads(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - 
weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.flatten(h) - h = layers.fully_connected(h, 256, activation_fn=tf.nn.relu, weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.fully_connected(h, 1, activation_fn=tf.nn.tanh, weights_regularizer=layers.l2_regularizer(1e-4)) - return h - - -class Network(object): - def __init__(self): - self.x = tf.placeholder(tf.float32, shape=[None, 19, 19, 17]) - self.is_training = tf.placeholder(tf.bool, shape=[]) - self.z = tf.placeholder(tf.float32, shape=[None, 1]) - self.pi = tf.placeholder(tf.float32, shape=[None, 362]) - self.build_network() - - def build_network(self): - h = layers.conv2d(self.x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, normalizer_fn=layers.batch_norm, - normalizer_params={'is_training': self.is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS}, - weights_regularizer=layers.l2_regularizer(1e-4)) - for i in range(19): - h = residual_block(h, self.is_training) - self.v = value_heads(h, self.is_training) - self.p = policy_heads(h, self.is_training) - # loss = tf.reduce_mean(tf.square(z-v)) - tf.multiply(pi, tf.log(tf.clip_by_value(tf.nn.softmax(p), 1e-8, tf.reduce_max(tf.nn.softmax(p))))) - self.value_loss = tf.reduce_mean(tf.square(self.z - self.v)) - self.policy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.pi, logits=self.p)) - - self.reg = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) - self.total_loss = self.value_loss + self.policy_loss + self.reg - # train_op = tf.train.MomentumOptimizer(1e-4, momentum=0.9, use_nesterov=True).minimize(total_loss) - self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) - with tf.control_dependencies(self.update_ops): - self.train_op = tf.train.RMSPropOptimizer(1e-4).minimize(self.total_loss) - self.var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) - self.saver = tf.train.Saver(max_to_keep=10, var_list=self.var_list) - - def train(self): - data_path = 
"/home/tongzheng/data/" - data_name = os.listdir("/home/tongzheng/data/") - epochs = 100 - batch_size = 128 - - result_path = "./checkpoints/" - with multi_gpu.create_session() as sess: - sess.run(tf.global_variables_initializer()) - ckpt_file = tf.train.latest_checkpoint(result_path) - if ckpt_file is not None: - print('Restoring model from {}...'.format(ckpt_file)) - self.saver.restore(sess, ckpt_file) - for epoch in range(epochs): - for name in data_name: - data = np.load(data_path + name) - boards = data["boards"] - wins = data["wins"] - ps = data["ps"] - print (boards.shape) - print (wins.shape) - print (ps.shape) - batch_num = boards.shape[0] // batch_size - index = np.arange(boards.shape[0]) - np.random.shuffle(index) - value_losses = [] - policy_losses = [] - regs = [] - time_train = -time.time() - for iter in range(batch_num): - lv, lp, r, value, prob, _ = sess.run( - [self.value_loss, self.policy_loss, self.reg, self.v, tf.nn.softmax(p), self.train_op], - feed_dict={self.x: boards[ - index[iter * batch_size:(iter + 1) * batch_size]], - self.z: wins[index[ - iter * batch_size:(iter + 1) * batch_size]], - self.pi: ps[index[ - iter * batch_size:(iter + 1) * batch_size]], - self.is_training: True}) - value_losses.append(lv) - policy_losses.append(lp) - regs.append(r) - if iter % 1 == 0: - print( - "Epoch: {}, Part {}, Iteration: {}, Time: {}, Value Loss: {}, Policy Loss: {}, Reg: {}".format( - epoch, name, iter, time.time() + time_train, np.mean(np.array(value_losses)), - np.mean(np.array(policy_losses)), np.mean(np.array(regs)))) - time_train = -time.time() - value_losses = [] - policy_losses = [] - regs = [] - if iter % 20 == 0: - save_path = "Epoch{}.Part{}.Iteration{}.ckpt".format(epoch, name, iter) - self.saver.save(sess, result_path + save_path) - del data, boards, wins, ps - - - # def forward(call_number): - # # checkpoint_path = "/home/yama/rl/tianshou/AlphaGo/checkpoints" - # checkpoint_path = "/home/jialian/stuGo/tianshou/stuGo/checkpoints/" - # 
board_file = np.genfromtxt("/home/jialian/stuGo/tianshou/leela-zero/src/mcts_nn_files/board_" + call_number, - # dtype='str'); - # human_board = np.zeros((17, 19, 19)) - # - # # TODO : is it ok to ignore the last channel? - # for i in range(17): - # human_board[i] = np.array(list(board_file[i])).reshape(19, 19) - # # print("============================") - # # print("human board sum : " + str(np.sum(human_board[-1]))) - # # print("============================") - # # print(human_board) - # # print("============================") - # # rint(human_board) - # feed_board = human_board.transpose(1, 2, 0).reshape(1, 19, 19, 17) - # # print(feed_board[:,:,:,-1]) - # # print(feed_board.shape) - # - # # npz_board = np.load("/home/yama/rl/tianshou/AlphaGo/data/7f83928932f64a79bc1efdea268698ae.npz") - # # print(npz_board["boards"].shape) - # # feed_board = npz_board["boards"][10].reshape(-1, 19, 19, 17) - # ##print(feed_board) - # # show_board = feed_board[0].transpose(2, 0, 1) - # # print("board shape : ", show_board.shape) - # # print(show_board) - # - # itflag = False - # with multi_gpu.create_session() as sess: - # sess.run(tf.global_variables_initializer()) - # ckpt_file = tf.train.latest_checkpoint(checkpoint_path) - # if ckpt_file is not None: - # # print('Restoring model from {}...'.format(ckpt_file)) - # saver.restore(sess, ckpt_file) - # else: - # raise ValueError("No model loaded") - # res = sess.run([tf.nn.softmax(p), v], feed_dict={x: feed_board, is_training: itflag}) - # # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][300].reshape(-1, 19, 19, 17), is_training:False}) - # # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][50].reshape(-1, 19, 19, 17), is_training:True}) - # # print(np.argmax(res[0])) - # np.savetxt(sys.stdout, res[0][0], fmt="%.6f", newline=" ") - # np.savetxt(sys.stdout, res[1][0], fmt="%.6f", newline=" ") - # pv_file = "/home/jialian/stuGotianshou/leela-zero/src/mcts_nn_files/policy_value" - # 
np.savetxt(pv_file, np.concatenate((res[0][0], res[1][0])), fmt="%.6f", newline=" ") - # # np.savetxt(pv_file, res[1][0], fmt="%.6f", newline=" ") - # return res - - def forward(self): - checkpoint_path = "/home/tongzheng/tianshou/AlphaGo/checkpoints/" - sess = multi_gpu.create_session() - sess.run(tf.global_variables_initializer()) - ckpt_file = tf.train.latest_checkpoint(checkpoint_path) - if ckpt_file is not None: - print('Restoring model from {}...'.format(ckpt_file)) - self.saver.restore(sess, ckpt_file) - print('Successfully loaded') - else: - raise ValueError("No model loaded") - # prior, value = sess.run([tf.nn.softmax(p), v], feed_dict={x: state, is_training: False}) - # return prior, value - return sess - - -if __name__ == '__main__': - state = np.random.randint(0, 1, [1, 19, 19, 17]) - net = Network() - sess = net.forward() - start = time.time() - for i in range(100): - sess.run([tf.nn.softmax(net.p), net.v], feed_dict={net.x: state, net.is_training: False}) - print("Step {}, Cumulative time {}".format(i, time.time() - start)) diff --git a/AlphaGo/Network_ori.py b/AlphaGo/Network_ori.py deleted file mode 100644 index 9d33bb9..0000000 --- a/AlphaGo/Network_ori.py +++ /dev/null @@ -1,175 +0,0 @@ -import os -import time -import gc - -import numpy as np -import tensorflow as tf -import tensorflow.contrib.layers as layers - -import multi_gpu - -os.environ["CUDA_VISIBLE_DEVICES"] = "1" - - -def residual_block(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.conv2d(h, 256, kernel_size=3, stride=1, activation_fn=tf.identity, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = h + input - return 
tf.nn.relu(h) - - -def policy_heads(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.flatten(h) - h = layers.fully_connected(h, 362, activation_fn=tf.identity, weights_regularizer=layers.l2_regularizer(1e-4)) - return h - - -def value_heads(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.flatten(h) - h = layers.fully_connected(h, 256, activation_fn=tf.nn.relu, weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.fully_connected(h, 1, activation_fn=tf.nn.tanh, weights_regularizer=layers.l2_regularizer(1e-4)) - return h - - -x = tf.placeholder(tf.float32, shape=[None, 19, 19, 17]) -is_training = tf.placeholder(tf.bool, shape=[]) -z = tf.placeholder(tf.float32, shape=[None, 1]) -pi = tf.placeholder(tf.float32, shape=[None, 362]) - -h = layers.conv2d(x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, normalizer_fn=layers.batch_norm, - normalizer_params={'is_training': is_training, 'updates_collections': tf.GraphKeys.UPDATE_OPS}, - weights_regularizer=layers.l2_regularizer(1e-4)) -for i in range(19): - h = residual_block(h, is_training) -v = value_heads(h, is_training) -p = policy_heads(h, is_training) -# loss = tf.reduce_mean(tf.square(z-v)) - tf.multiply(pi, tf.log(tf.clip_by_value(tf.nn.softmax(p), 1e-8, tf.reduce_max(tf.nn.softmax(p))))) -value_loss = tf.reduce_mean(tf.square(z - v)) -policy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=pi, logits=p)) - 
-reg = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) -total_loss = value_loss + policy_loss + reg -# train_op = tf.train.MomentumOptimizer(1e-4, momentum=0.9, use_nesterov=True).minimize(total_loss) -update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) -with tf.control_dependencies(update_ops): - train_op = tf.train.RMSPropOptimizer(1e-4).minimize(total_loss) -var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) -saver = tf.train.Saver(max_to_keep=10, var_list=var_list) - - -def train(): - data_path = "/home/tongzheng/data/" - data_name = os.listdir("/home/tongzheng/data/") - epochs = 100 - batch_size = 128 - - result_path = "./checkpoints/" - with multi_gpu.create_session() as sess: - sess.run(tf.global_variables_initializer()) - ckpt_file = tf.train.latest_checkpoint(result_path) - if ckpt_file is not None: - print('Restoring model from {}...'.format(ckpt_file)) - saver.restore(sess, ckpt_file) - for epoch in range(epochs): - for name in data_name: - data = np.load(data_path + name) - boards = data["boards"] - wins = data["wins"] - ps = data["ps"] - print (boards.shape) - print (wins.shape) - print (ps.shape) - # batch_num = 1 - batch_num = boards.shape[0] // batch_size - index = np.arange(boards.shape[0]) - np.random.shuffle(index) - value_losses = [] - policy_losses = [] - regs = [] - time_train = -time.time() - for iter in range(batch_num): - lv, lp, r, _ = sess.run([value_loss, policy_loss, reg, train_op], - feed_dict={x: boards[ - index[iter * batch_size:(iter + 1) * batch_size]], - z: wins[index[ - iter * batch_size:(iter + 1) * batch_size]], - pi: ps[index[ - iter * batch_size:(iter + 1) * batch_size]], - is_training: True}) - value_losses.append(lv) - policy_losses.append(lp) - regs.append(r) - del lv, lp, r - if iter % 1 == 0: - print( - "Epoch: {}, Part {}, Iteration: {}, Time: {}, Value Loss: {}, Policy Loss: {}, Reg: {}".format( - epoch, name, iter, time.time() + time_train, np.mean(np.array(value_losses)), - 
np.mean(np.array(policy_losses)), np.mean(np.array(regs)))) - del value_losses, policy_losses, regs, time_train - time_train = -time.time() - value_losses = [] - policy_losses = [] - regs = [] - if iter % 20 == 0: - save_path = "Epoch{}.Part{}.Iteration{}.ckpt".format(epoch, name, iter) - saver.save(sess, result_path + save_path) - del save_path - del data, boards, wins, ps, batch_num, index - gc.collect() - - -def forward(board): - result_path = "./checkpoints" - itflag = False - res = None - if board is None: - # data = np.load("/home/tongzheng/meta-data/80b7bf21bce14862806d48c3cd760a1b.npz") - data = np.load("./data/7f83928932f64a79bc1efdea268698ae.npz") - board = data["boards"][50].reshape(-1, 19, 19, 17) - human_board = board[0].transpose(2, 0, 1) - print("============================") - print("human board sum : " + str(np.sum(human_board))) - print("============================") - print(board[:, :, :, -1]) - itflag = False - with multi_gpu.create_session() as sess: - sess.run(tf.global_variables_initializer()) - ckpt_file = tf.train.latest_checkpoint(result_path) - if ckpt_file is not None: - print('Restoring model from {}...'.format(ckpt_file)) - saver.restore(sess, ckpt_file) - else: - raise ValueError("No model loaded") - res = sess.run([tf.nn.softmax(p), v], feed_dict={x: board, is_training: itflag}) - # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][300].reshape(-1, 19, 19, 17), is_training:False}) - # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][50].reshape(-1, 19, 19, 17), is_training:True}) - # print(np.argmax(res[0])) - print(res) - print(data["p"][0]) - print(np.argmax(res[0])) - print(np.argmax(data["p"][0])) - # print(res[0].tolist()[0]) - # print(np.argmax(res[0])) - return res - - -if __name__ == '__main__': - # train() - # if sys.argv[1] == "test": - forward(None) diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index 1f9af85..8b54470 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ 
-167,7 +167,7 @@ class GTPEngine(): move = self._parse_move(args) if move: color, vertex = move - res = self._game.do_move(color, vertex) + res = self._game.play_move(color, vertex) if res: return None, True else: @@ -177,17 +177,21 @@ class GTPEngine(): def cmd_genmove(self, args, **kwargs): color = self._parse_color(args) if color: - move = self._game.gen_move(color) + move = self._game.think_play_move(color) return self._vertex_point2string(move), True else: return 'unknown player', False def cmd_get_score(self, args, **kwargs): - return self._game.executor.get_score(), None + return self._game.game_engine.executor_get_score(self._game.board, True), True def cmd_show_board(self, args, **kwargs): return self._game.board, True + def cmd_get_prob(self, args, **kwargs): + return self._game.prob, True + + if __name__ == "main": game = Game() engine = GTPEngine(game_obj=Game) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 02ccb27..df08c0a 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -9,16 +9,13 @@ import utils import copy import tensorflow as tf import numpy as np -import sys +import sys, os import go -import network_small -import strategy +import model from collections import deque +sys.path.append(os.path.join(os.path.dirname(__file__), os.path.pardir)) from tianshou.core.mcts.mcts import MCTS -import Network -#from strategy import strategy - class Game: ''' Load the real game and trained weights. @@ -26,35 +23,21 @@ class Game: TODO : Maybe merge with the engine class in future, currently leave it untouched for interacting with Go UI. 
''' - def __init__(self, size=9, komi=6.5, checkpoint_path=None): + def __init__(self, size=9, komi=3.75, checkpoint_path=None): self.size = size self.komi = komi - self.board = [utils.EMPTY] * (self.size * self.size) + self.board = [utils.EMPTY] * (self.size ** 2) self.history = [] self.latest_boards = deque(maxlen=8) for _ in range(8): self.latest_boards.append(self.board) - - self.executor = go.Go(game=self) - #self.strategy = strategy(checkpoint_path) - - self.simulator = strategy.GoEnv(game=self) - self.net = network_small.Network() - self.sess = self.net.forward(checkpoint_path) - self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], - feed_dict={self.net.x: state, self.net.is_training: False}) - - def _flatten(self, vertex): - x, y = vertex - return (y - 1) * self.size + (x - 1) - - def _deflatten(self, idx): - x = idx % self.size + 1 - y = idx // self.size + 1 - return (x,y) + self.evaluator = model.ResNet(self.size, self.size**2 + 1, history_length=8, checkpoint_path=checkpoint_path) + # self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], + # feed_dict={self.net.x: state, self.net.is_training: False}) + self.game_engine = go.Go(size=self.size, komi=self.komi) def clear(self): - self.board = [utils.EMPTY] * (self.size * self.size) + self.board = [utils.EMPTY] * (self.size ** 2) self.history = [] for _ in range(8): self.latest_boards.append(self.board) @@ -66,42 +49,30 @@ class Game: def set_komi(self, k): self.komi = k - def generate_nn_input(self, history, color): - state = np.zeros([1, self.size, self.size, 17]) - for i in range(8): - state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(self.size ** 2)).reshape(self.size, self.size) - state[0, :, :, i + 8] = np.array(np.array(history[i]) == -np.ones(self.size ** 2)).reshape(self.size, self.size) - if color == utils.BLACK: - state[0, :, :, 16] = np.ones([self.size, self.size]) - if color == utils.WHITE: - state[0, :, :, 16] = 
np.zeros([self.size, self.size]) - return state - - def strategy_gen_move(self, latest_boards, color): - self.simulator.latest_boards = copy.copy(latest_boards) - self.simulator.board = copy.copy(latest_boards[-1]) - nn_input = self.generate_nn_input(self.simulator.latest_boards, color) - mcts = MCTS(self.simulator, self.evaluator, nn_input, self.size ** 2 + 1, inverse=True, max_step=1) + def think(self, latest_boards, color): + mcts = MCTS(self.game_engine, self.evaluator, [latest_boards, color], self.size ** 2 + 1, inverse=True) + mcts.search(max_step=20) temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) choice = np.random.choice(self.size ** 2 + 1, 1, p=prob).tolist()[0] if choice == self.size ** 2: move = utils.PASS else: - move = (choice % self.size + 1, choice / self.size + 1) + move = self.game_engine._deflatten(choice) return move, prob - def do_move(self, color, vertex): + def play_move(self, color, vertex): + # this function can be called directly to play the opponent's move if vertex == utils.PASS: return True - res = self.executor.do_move(color, vertex) + res = self.game_engine.executor_do_move(self.history, self.latest_boards, self.board, color, vertex) return res - def gen_move(self, color): - # move = self.strategy.gen_move(color) - # return move - move, self.prob = self.strategy_gen_move(self.latest_boards, color) - self.do_move(color, move) + def think_play_move(self, color): + # although we don't need to return self.prob, however it is needed for neural network training + move, self.prob = self.think(self.latest_boards, color) + # play the move immediately + self.play_move(color, move) return move def status2symbol(self, s): @@ -125,8 +96,9 @@ class Game: sys.stdout.flush() if __name__ == "__main__": - g = Game() + g = Game(checkpoint_path='./checkpoints/') g.show_board() + g.think_play_move(1) #file = open("debug.txt", "a") #file.write("mcts check\n") #file.close() diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 0afc877..661d918 
100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -1,7 +1,7 @@ from __future__ import print_function import utils import copy -import sys +import numpy as np from collections import deque ''' @@ -12,84 +12,26 @@ Settings of the Go game. ''' NEIGHBOR_OFFSET = [[1, 0], [-1, 0], [0, -1], [0, 1]] - +CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] class Go: def __init__(self, **kwargs): - self.game = kwargs['game'] + self.size = kwargs['size'] + self.komi = kwargs['komi'] - def _bfs(self, vertex, color, block, status, alive_break): - block.append(vertex) - status[self.game._flatten(vertex)] = True - nei = self._neighbor(vertex) - for n in nei: - if not status[self.game._flatten(n)]: - if self.game.board[self.game._flatten(n)] == color: - self._bfs(n, color, block, status, alive_break) + def _flatten(self, vertex): + x, y = vertex + return (x - 1) * self.size + (y - 1) - def _find_block(self, vertex, alive_break=False): - block = [] - status = [False] * (self.game.size * self.game.size) - color = self.game.board[self.game._flatten(vertex)] - self._bfs(vertex, color, block, status, alive_break) - - for b in block: - for n in self._neighbor(b): - if self.game.board[self.game._flatten(n)] == utils.EMPTY: - return False, block - return True, block - - def _find_boarder(self, vertex): - block = [] - status = [False] * (self.game.size * self.game.size) - self._bfs(vertex, utils.EMPTY, block, status, False) - border = [] - for b in block: - for n in self._neighbor(b): - if not (n in block): - border.append(n) - return border - - def _is_qi(self, color, vertex): - nei = self._neighbor(vertex) - for n in nei: - if self.game.board[self.game._flatten(n)] == utils.EMPTY: - return True - - self.game.board[self.game._flatten(vertex)] = color - for n in nei: - if self.game.board[self.game._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n) - if can_kill: - self.game.board[self.game._flatten(vertex)] = utils.EMPTY - return True - - ### can not 
suicide - can_kill, block = self._find_block(vertex) - if can_kill: - self.game.board[self.game._flatten(vertex)] = utils.EMPTY - return False - - self.game.board[self.game._flatten(vertex)] = utils.EMPTY - return True - - def _check_global_isomorphous(self, color, vertex): - ##backup - _board = copy.copy(self.game.board) - self.game.board[self.game._flatten(vertex)] = color - self._process_board(color, vertex) - if self.game.board in self.game.history: - res = True - else: - res = False - - self.game.board = _board - return res + def _deflatten(self, idx): + x = idx // self.size + 1 + y = idx % self.size + 1 + return (x, y) def _in_board(self, vertex): x, y = vertex - if x < 1 or x > self.game.size: return False - if y < 1 or y > self.game.size: return False + if x < 1 or x > self.size: return False + if y < 1 or y > self.size: return False return True def _neighbor(self, vertex): @@ -102,96 +44,201 @@ class Go: nei.append((_x, _y)) return nei - def _process_board(self, color, vertex): + def _corner(self, vertex): + x, y = vertex + corner = [] + for d in CORNER_OFFSET: + _x = x + d[0] + _y = y + d[1] + if self._in_board((_x, _y)): + corner.append((_x, _y)) + return corner + + def _find_group(self, current_board, vertex): + color = current_board[self._flatten(vertex)] + # print ("color : ", color) + chain = set() + frontier = [vertex] + has_liberty = False + while frontier: + current = frontier.pop() + # print ("current : ", current) + chain.add(current) + for n in self._neighbor(current): + if current_board[self._flatten(n)] == color and not n in chain: + frontier.append(n) + if current_board[self._flatten(n)] == utils.EMPTY: + has_liberty = True + return has_liberty, chain + + def _is_suicide(self, current_board, color, vertex): + current_board[self._flatten(vertex)] = color # assume that we already take this move + suicide = False + + has_liberty, group = self._find_group(current_board, vertex) + if not has_liberty: + suicide = True # no liberty, suicide + for n 
in self._neighbor(vertex): + if current_board[self._flatten(n)] == utils.another_color(color): + opponent_liberty, group = self._find_group(current_board, n) + if not opponent_liberty: + suicide = False # this move is able to take opponent's stone, not suicide + + current_board[self._flatten(vertex)] = utils.EMPTY # undo this move + return suicide + + def _process_board(self, current_board, color, vertex): nei = self._neighbor(vertex) for n in nei: - if self.game.board[self.game._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n, alive_break=True) - if can_kill: - for b in block: - self.game.board[self.game._flatten(b)] = utils.EMPTY + if current_board[self._flatten(n)] == utils.another_color(color): + has_liberty, group = self._find_group(current_board, n) + if not has_liberty: + for b in group: + current_board[self._flatten(b)] = utils.EMPTY - def is_valid(self, color, vertex): + def _check_global_isomorphous(self, history_boards, current_board, color, vertex): + repeat = False + next_board = copy.copy(current_board) + next_board[self._flatten(vertex)] = color + self._process_board(next_board, color, vertex) + if next_board in history_boards: + repeat = True + return repeat + + def _is_eye(self, current_board, color, vertex): + nei = self._neighbor(vertex) + cor = self._corner(vertex) + ncolor = {color == current_board[self._flatten(n)] for n in nei} + if False in ncolor: + # print "not all neighbors are in same color with us" + return False + _, group = self._find_group(current_board, nei[0]) + if set(nei) < group: + # print "all neighbors are in same group and same color with us" + return True + else: + opponent_number = [current_board[self._flatten(c)] for c in cor].count(-color) + opponent_propotion = float(opponent_number) / float(len(cor)) + if opponent_propotion < 0.5: + # print "few opponents, real eye" + return True + else: + # print "many opponents, fake eye" + return False + + def _knowledge_prunning(self, current_board, 
color, vertex): + # forbid some stupid selfplay using human knowledge + if self._is_eye(current_board, color, vertex): + return False + # forbid position on its own eye. + return True + + def _is_game_finished(self, current_board, color): + ''' + for each empty position, if it has both BLACK and WHITE neighbors, the game is still not finished + :return: return the game is finished + ''' + board = copy.deepcopy(current_board) + empty_idx = [i for i, x in enumerate(board) if x == utils.EMPTY] # find all empty idx + for idx in empty_idx: + neighbor_idx = self._neighbor(self.deflatten(idx)) + if len(neighbor_idx) > 1: + first_idx = neighbor_idx[0] + for other_idx in neighbor_idx[1:]: + if board[self.flatten(other_idx)] != board[self.flatten(first_idx)]: + return False + + return True + + def _action2vertex(self, action): + if action == self.size ** 2: + vertex = (0, 0) + else: + vertex = self._deflatten(action) + return vertex + + def _is_valid(self, history_boards, current_board, color, vertex): ### in board if not self._in_board(vertex): return False ### already have stone - if not self.game.board[self.game._flatten(vertex)] == utils.EMPTY: + if not current_board[self._flatten(vertex)] == utils.EMPTY: return False - ### check if it is qi - if not self._is_qi(color, vertex): + ### check if it is suicide + if self._is_suicide(current_board, color, vertex): return False - if self._check_global_isomorphous(color, vertex): + ### forbid global isomorphous + if self._check_global_isomorphous(history_boards, current_board, color, vertex): return False return True - def do_move(self, color, vertex): - if not self.is_valid(color, vertex): + def simulate_is_valid(self, state, action): + history_boards, color = state + vertex = self._action2vertex(action) + current_board = history_boards[-1] + + if not self._is_valid(history_boards, current_board, color, vertex): + return False + + if not self._knowledge_prunning(current_board, color, vertex): return False - 
self.game.board[self.game._flatten(vertex)] = color - self._process_board(color, vertex) - self.game.history.append(copy.copy(self.game.board)) - self.game.latest_boards.append(copy.copy(self.game.board)) return True - def _find_empty(self): - idx = [i for i,x in enumerate(self.game.board) if x == utils.EMPTY ][0] - return self.game._deflatten(idx) + def simulate_is_valid_list(self, state, action_set): + # find all the invalid actions + invalid_action_list = [] + for action_candidate in action_set[:-1]: + # go through all the actions excluding pass + if not self.simulate_is_valid(state, action_candidate): + invalid_action_list.append(action_candidate) + if len(invalid_action_list) < len(action_set) - 1: + invalid_action_list.append(action_set[-1]) + # forbid pass, if we have other choices + # TODO: In fact we should not do this. In some extreme cases, we should permit pass. + return invalid_action_list - def get_score(self, is_unknown_estimation = False): - ''' - is_unknown_estimation: whether use nearby stone to predict the unknown - return score from BLACK perspective. 
- ''' - _board = copy.copy(self.game.board) - while utils.EMPTY in self.game.board: - vertex = self._find_empty() - boarder = self._find_boarder(vertex) - boarder_color = set(map(lambda v: self.game.board[self.game._flatten(v)], boarder)) - if boarder_color == {utils.BLACK}: - self.game.board[self.game._flatten(vertex)] = utils.BLACK - elif boarder_color == {utils.WHITE}: - self.game.board[self.game._flatten(vertex)] = utils.WHITE - elif is_unknown_estimation: - self.game.board[self.game._flatten(vertex)] = self._predict_from_nearby(vertex) - else: - self.game.board[self.game._flatten(vertex)] =utils.UNKNOWN - score = 0 - for i in self.game.board: - if i == utils.BLACK: - score += 1 - elif i == utils.WHITE: - score -= 1 - score -= self.game.komi + def _do_move(self, board, color, vertex): + if vertex == utils.PASS: + return board + else: + id_ = self._flatten(vertex) + board[id_] = color + return board - self.game.board = _board - return score + def simulate_step_forward(self, state, action): + # initialize the simulate_board from state + history_boards, color = state + vertex = self._action2vertex(action) + new_board = self._do_move(copy.copy(history_boards[-1]), color, vertex) + history_boards.append(new_board) + new_color = -color + return [history_boards, new_color], 0 - def _predict_from_nearby(self, vertex, neighbor_step = 3): - ''' - step: the nearby 3 steps is considered - :vertex: position to be estimated - :neighbor_step: how many steps nearby - :return: the nearby positions of the input position - currently the nearby 3*3 grid is returned, altogether 4*8 points involved - ''' - for step in range(1, neighbor_step + 1): # check the stones within the steps in range - neighbor_vertex_set = [] - self._add_nearby_stones(neighbor_vertex_set, vertex[0] - step, vertex[1], 1, 1, neighbor_step) - self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] + step, 1, -1, neighbor_step) - self._add_nearby_stones(neighbor_vertex_set, vertex[0] + step, vertex[1], 
-1, -1, neighbor_step) - self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] - step, -1, 1, neighbor_step) - color_estimate = 0 - for neighbor_vertex in neighbor_vertex_set: - color_estimate += self.game.board[self.game._flatten(neighbor_vertex)] - if color_estimate > 0: - return utils.BLACK - elif color_estimate < 0: - return utils.WHITE + def executor_do_move(self, history, latest_boards, current_board, color, vertex): + if not self._is_valid(history, current_board, color, vertex): + return False + current_board[self._flatten(vertex)] = color + self._process_board(current_board, color, vertex) + history.append(copy.copy(current_board)) + latest_boards.append(copy.copy(current_board)) + return True + + def _find_empty(self, current_board): + idx = [i for i,x in enumerate(current_board) if x == utils.EMPTY ][0] + return self._deflatten(idx) + + def _find_boarder(self, current_board, vertex): + _, group = self._find_group(current_board, vertex) + border = [] + for b in group: + for n in self._neighbor(b): + if not (n in group): + border.append(n) + return border def _add_nearby_stones(self, neighbor_vertex_set, start_vertex_x, start_vertex_y, x_diff, y_diff, num_step): ''' @@ -210,3 +257,93 @@ class Go: neighbor_vertex_set.append((start_vertex_x, start_vertex_y)) start_vertex_x += x_diff start_vertex_y += y_diff + + def _predict_from_nearby(self, current_board, vertex, neighbor_step=3): + ''' + step: the nearby 3 steps is considered + :vertex: position to be estimated + :neighbor_step: how many steps nearby + :return: the nearby positions of the input position + currently the nearby 3*3 grid is returned, altogether 4*8 points involved + ''' + for step in range(1, neighbor_step + 1): # check the stones within the steps in range + neighbor_vertex_set = [] + self._add_nearby_stones(neighbor_vertex_set, vertex[0] - step, vertex[1], 1, 1, neighbor_step) + self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] + step, 1, -1, neighbor_step) + 
self._add_nearby_stones(neighbor_vertex_set, vertex[0] + step, vertex[1], -1, -1, neighbor_step) + self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] - step, -1, 1, neighbor_step) + color_estimate = 0 + for neighbor_vertex in neighbor_vertex_set: + color_estimate += current_board[self._flatten(neighbor_vertex)] + if color_estimate > 0: + return utils.BLACK + elif color_estimate < 0: + return utils.WHITE + + def executor_get_score(self, current_board, is_unknown_estimation=False): + ''' + is_unknown_estimation: whether use nearby stone to predict the unknown + return score from BLACK perspective. + ''' + _board = copy.deepcopy(current_board) + while utils.EMPTY in _board: + vertex = self._find_empty(_board) + boarder = self._find_boarder(_board, vertex) + boarder_color = set(map(lambda v: _board[self._flatten(v)], boarder)) + if boarder_color == {utils.BLACK}: + _board[self._flatten(vertex)] = utils.BLACK + elif boarder_color == {utils.WHITE}: + _board[self._flatten(vertex)] = utils.WHITE + elif is_unknown_estimation: + _board[self._flatten(vertex)] = self._predict_from_nearby(_board, vertex) + else: + _board[self._flatten(vertex)] =utils.UNKNOWN + score = 0 + for i in _board: + if i == utils.BLACK: + score += 1 + elif i == utils.WHITE: + score -= 1 + score -= self.komi + + return score + +if __name__ == "__main__": + ### do unit test for Go class + pure_test = [ + 0, 1, 0, 1, 0, 1, 0, 0, 0, + 1, 0, 1, 0, 1, 0, 0, 0, 0, + 0, 1, 0, 1, 0, 0, 1, 0, 0, + 0, 0, 1, 0, 0, 1, 0, 1, 0, + 0, 0, 0, 0, 0, 1, 1, 1, 0, + 1, 1, 1, 0, 0, 0, 0, 0, 0, + 1, 0, 1, 0, 0, 1, 1, 0, 0, + 1, 1, 1, 0, 1, 0, 1, 0, 0, + 0, 0, 0, 0, 1, 1, 1, 0, 0 + ] + + pt_qry = [(1, 1), (1, 5), (3, 3), (4, 7), (7, 2), (8, 6)] + pt_ans = [True, True, True, True, True, True] + + opponent_test = [ + 0, 1, 0, 1, 0, 1, 0,-1, 1, + 1,-1, 0,-1, 1,-1, 0, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, + 1, 1,-1, 0, 1,-1, 1, 0, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 0, + -1,1, 1, 0, 1, 1, 1, 0, 0, + 0, 1,-1, 0,-1,-1,-1, 0, 0, 
+ 1, 0, 1, 0,-1, 0,-1, 0, 0, + 0, 1, 0, 0,-1,-1,-1, 0, 0 + ] + ot_qry = [(1, 1), (1, 5), (2, 9), (5, 2), (5, 6), (8, 6), (8, 2)] + ot_ans = [False, False, False, False, False, False, True] + + go = Go(size=9, komi=3.75) + for i in range(6): + print (go._is_eye(pure_test, utils.BLACK, pt_qry[i])) + print("Test of pure eye\n") + + for i in range(7): + print (go._is_eye(opponent_test, utils.BLACK, ot_qry[i])) + print("Test of eye surrend by opponents\n") diff --git a/AlphaGo/model.py b/AlphaGo/model.py new file mode 100644 index 0000000..22e8626 --- /dev/null +++ b/AlphaGo/model.py @@ -0,0 +1,286 @@ +import os +import time +import random +import sys +import cPickle +from collections import deque + +import numpy as np +import tensorflow as tf +import tensorflow.contrib.layers as layers + +import multi_gpu + +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + + +def residual_block(input, is_training): + """ + one residual block + + :param input: a tensor, input of the residual block + :param is_training: a placeholder, indicate whether the model is training or not + :return: a tensor, output of the residual block + """ + normalizer_params = {'is_training': is_training, + 'updates_collections': tf.GraphKeys.UPDATE_OPS} + h = layers.conv2d(input, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, + normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, + weights_regularizer=layers.l2_regularizer(1e-4)) + h = layers.conv2d(h, 256, kernel_size=3, stride=1, activation_fn=tf.identity, + normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, + weights_regularizer=layers.l2_regularizer(1e-4)) + h = h + input + return tf.nn.relu(h) + + +def policy_head(input, is_training, action_num): + """ + the head of policy branch + + :param input: a tensor, input of the policy head + :param is_training: a placeholder, indicate whether the model is training or not + :param action_num: action_num: an integer, number of unique actions at any state + :return: a 
tensor: output of the policy head, shape [batch_size, action_num] + """ + normalizer_params = {'is_training': is_training, + 'updates_collections': tf.GraphKeys.UPDATE_OPS} + h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, + normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, + weights_regularizer=layers.l2_regularizer(1e-4)) + h = layers.flatten(h) + h = layers.fully_connected(h, action_num, activation_fn=tf.identity, + weights_regularizer=layers.l2_regularizer(1e-4)) + return h + + +def value_head(input, is_training): + """ + the head of value branch + + :param input: a tensor, input of the value head + :param is_training: a placeholder, indicate whether the model is training or not + :return: a tensor, output of the value head, shape [batch_size, 1] + """ + normalizer_params = {'is_training': is_training, + 'updates_collections': tf.GraphKeys.UPDATE_OPS} + h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, + normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, + weights_regularizer=layers.l2_regularizer(1e-4)) + h = layers.flatten(h) + h = layers.fully_connected(h, 256, activation_fn=tf.nn.relu, weights_regularizer=layers.l2_regularizer(1e-4)) + h = layers.fully_connected(h, 1, activation_fn=tf.nn.tanh, weights_regularizer=layers.l2_regularizer(1e-4)) + return h + + +class Data(object): + def __init__(self): + self.boards = [] + self.probs = [] + self.winner = 0 + + +class ResNet(object): + def __init__(self, board_size, action_num, history_length=1, residual_block_num=20, checkpoint_path=None): + """ + the resnet model + + :param board_size: an integer, the board size + :param action_num: an integer, number of unique actions at any state + :param history_length: an integer, the history length to use, default is 1 + :param residual_block_num: an integer, the number of residual block, default is 20, at least 1 + :param checkpoint_path: a string, the path to the 
checkpoint, default is None, + """ + self.board_size = board_size + self.action_num = action_num + self.history_length = history_length + self.checkpoint_path = checkpoint_path + self.x = tf.placeholder(tf.float32, shape=[None, self.board_size, self.board_size, 2 * self.history_length + 1]) + self.is_training = tf.placeholder(tf.bool, shape=[]) + self.z = tf.placeholder(tf.float32, shape=[None, 1]) + self.pi = tf.placeholder(tf.float32, shape=[None, self.action_num]) + self._build_network(residual_block_num, self.checkpoint_path) + + # training hyper-parameters: + self.window_length = 7000 + self.save_freq = 5000 + self.training_data = {'states': deque(maxlen=self.window_length), 'probs': deque(maxlen=self.window_length), + 'winner': deque(maxlen=self.window_length), 'length': deque(maxlen=self.window_length)} + + def _build_network(self, residual_block_num, checkpoint_path): + """ + build the network + + :param residual_block_num: an integer, the number of residual block + :param checkpoint_path: a string, the path to the checkpoint, if None, use random initialization parameter + :return: None + """ + + h = layers.conv2d(self.x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, + normalizer_fn=layers.batch_norm, + normalizer_params={'is_training': self.is_training, + 'updates_collections': tf.GraphKeys.UPDATE_OPS}, + weights_regularizer=layers.l2_regularizer(1e-4)) + for i in range(residual_block_num - 1): + h = residual_block(h, self.is_training) + self.v = value_head(h, self.is_training) + self.p = policy_head(h, self.is_training, self.action_num) + self.value_loss = tf.reduce_mean(tf.square(self.z - self.v)) + self.policy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.pi, logits=self.p)) + + self.reg = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) + self.total_loss = self.value_loss + self.policy_loss + self.reg + self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) + with 
tf.control_dependencies(self.update_ops): + self.train_op = tf.train.AdamOptimizer(1e-4).minimize(self.total_loss) + self.var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) + self.saver = tf.train.Saver(max_to_keep=0, var_list=self.var_list) + self.sess = multi_gpu.create_session() + self.sess.run(tf.global_variables_initializer()) + if checkpoint_path is not None: + ckpt_file = tf.train.latest_checkpoint(checkpoint_path) + if ckpt_file is not None: + print('Restoring model from {}...'.format(ckpt_file)) + self.saver.restore(self.sess, ckpt_file) + print('Successfully loaded') + else: + raise ValueError("No model in path {}".format(checkpoint_path)) + + def __call__(self, state): + """ + + :param history: a list, the history + :param color: a string, indicate which one to play + :return: a list of tensor, the predicted value and policy given the history and color + """ + history, color = state + if len(history) != self.history_length: + raise ValueError( + 'The length of history cannot meet the need of the model, given {}, need {}'.format(len(history), + self.history_length)) + state = self._history2state(history, color) + return self.sess.run([self.p, self.v], feed_dict={self.x: state, self.is_training: False}) + + def _history2state(self, history, color): + """ + convert the history to the state we need + + :param history: a list, the history + :param color: a string, indicate which one to play + :return: a ndarray, the state + """ + state = np.zeros([1, self.board_size, self.board_size, 2 * self.history_length + 1]) + for i in range(self.history_length): + state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(self.board_size ** 2)).reshape(self.board_size, + self.board_size) + state[0, :, :, i + self.history_length] = np.array( + np.array(history[i]) == -np.ones(self.board_size ** 2)).reshape(self.board_size, self.board_size) + # TODO: need a config to specify the BLACK and WHITE + if color == +1: + state[0, :, :, 2 * self.history_length] = 
np.ones([self.board_size, self.board_size]) + if color == -1: + state[0, :, :, 2 * self.history_length] = np.zeros([self.board_size, self.board_size]) + return state + + # TODO: design the interface between the environment and training + def train(self, mode='memory', *args, **kwargs): + if mode == 'memory': + pass + if mode == 'file': + self._train_with_file(data_path=kwargs['data_path'], batch_size=kwargs['batch_size'], + checkpoint_path=kwargs['checkpoint_path']) + + def _train_with_file(self, data_path, batch_size, checkpoint_path): + # check if the path is valid + if not os.path.exists(data_path): + raise ValueError("{} doesn't exist".format(data_path)) + self.checkpoint_path = checkpoint_path + if not os.path.exists(self.checkpoint_path): + os.mkdir(self.checkpoint_path) + + new_file_list = [] + all_file_list = [] + training_data = {'states': [], 'probs': [], 'winner': []} + + iters = 0 + while True: + new_file_list = list(set(os.listdir(data_path)).difference(all_file_list)) + while new_file_list: + all_file_list = os.listdir(data_path) + new_file_list.sort( + key=lambda file: os.path.getmtime(data_path + file) if not os.path.isdir(data_path + file) else 0) + for file in new_file_list: + states, probs, winner = self._file_to_training_data(data_path + file) + assert states.shape[0] == probs.shape[0] + assert states.shape[0] == winner.shape[0] + self.training_data['states'].append(states) + self.training_data['probs'].append(probs) + self.training_data['winner'].append(winner) + self.training_data['length'].append(states.shape[0]) + new_file_list = list(set(os.listdir(data_path)).difference(all_file_list)) + + if len(self.training_data['states']) != self.window_length: + continue + else: + start_time = time.time() + for i in range(batch_size): + game_num = random.randint(0, self.window_length-1) + state_num = random.randint(0, self.training_data['length'][game_num]-1) + 
training_data['states'].append(np.expand_dims(self.training_data['states'][game_num][state_num], 0)) + training_data['probs'].append(np.expand_dims(self.training_data['probs'][game_num][state_num], 0)) + training_data['winner'].append(np.expand_dims(self.training_data['winner'][game_num][state_num], 0)) + value_loss, policy_loss, reg, _ = self.sess.run( + [self.value_loss, self.policy_loss, self.reg, self.train_op], + feed_dict={self.x: np.concatenate(training_data['states'], axis=0), + self.z: np.concatenate(training_data['winner'], axis=0), + self.pi: np.concatenate(training_data['probs'], axis=0), + self.is_training: True}) + + print("Iteration: {}, Time: {}, Value Loss: {}, Policy Loss: {}, Reg: {}".format(iters, + time.time() - start_time, + value_loss, + policy_loss, reg)) + if iters % self.save_freq == 0: + save_path = "Iteration{}.ckpt".format(iters) + self.saver.save(self.sess, self.checkpoint_path + save_path) + for key in training_data.keys(): + training_data[key] = [] + iters += 1 + + def _file_to_training_data(self, file_name): + read = False + with open(file_name, 'rb') as file: + while not read: + try: + file.seek(0) + data = cPickle.load(file) + read = True + print("{} Loaded!".format(file_name)) + except Exception as e: + print(e) + time.sleep(1) + history = deque(maxlen=self.history_length) + states = [] + probs = [] + winner = [] + for _ in range(self.history_length): + # Note that 0 is specified, need a more general way like config + history.append([0] * self.board_size ** 2) + # Still, +1 is specified + color = +1 + + for [board, prob] in zip(data.boards, data.probs): + history.append(board) + states.append(self._history2state(history, color)) + probs.append(np.array(prob).reshape(1, self.board_size ** 2 + 1)) + winner.append(np.array(data.winner).reshape(1, 1)) + color *= -1 + states = np.concatenate(states, axis=0) + probs = np.concatenate(probs, axis=0) + winner = np.concatenate(winner, axis=0) + return states, probs, winner + + +if __name__ 
== "__main__": + model = ResNet(board_size=9, action_num=82, history_length=8) + model.train("file", data_path="./data/", batch_size=128, checkpoint_path="./checkpoint/") diff --git a/AlphaGo/network_small.py b/AlphaGo/network.py similarity index 100% rename from AlphaGo/network_small.py rename to AlphaGo/network.py diff --git a/AlphaGo/play.py b/AlphaGo/play.py index fe6c7ce..3681430 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -1,9 +1,22 @@ +import argparse import subprocess import sys import re import Pyro4 import time import os +import cPickle + + +class Data(object): + def __init__(self): + self.boards = [] + self.probs = [] + self.winner = 0 + + def reset(self): + self.__init__() + if __name__ == '__main__': """ @@ -11,84 +24,135 @@ if __name__ == '__main__': Note that, this function requires the installation of the Pyro4 library. """ # TODO : we should set the network path in a more configurable way. - black_weight_path = "./checkpoints" - white_weight_path = "./checkpoints_origin" - if (not os.path.exists(black_weight_path)): - print "Can't not find the network weights for black player." - sys.exit() - if (not os.path.exists(white_weight_path)): - print "Can't not find the network weights for white player." 
- sys.exit() + parser = argparse.ArgumentParser() + parser.add_argument("--result_path", type=str, default="./data/") + parser.add_argument("--black_weight_path", type=str, default=None) + parser.add_argument("--white_weight_path", type=str, default=None) + parser.add_argument("--id", type=int, default=0) + args = parser.parse_args() + + if not os.path.exists(args.result_path): + os.mkdir(args.result_path) + # black_weight_path = "./checkpoints" + # white_weight_path = "./checkpoints_origin" + if args.black_weight_path is not None and (not os.path.exists(args.black_weight_path)): + raise ValueError("Can't not find the network weights for black player.") + if args.white_weight_path is not None and (not os.path.exists(args.white_weight_path)): + raise ValueError("Can't not find the network weights for white player.") # kill the old server - kill_old_server = subprocess.Popen(['killall', 'pyro4-ns']) - print "kill the old pyro4 name server, the return code is : " + str(kill_old_server.wait()) - time.sleep(1) + # kill_old_server = subprocess.Popen(['killall', 'pyro4-ns']) + # print "kill the old pyro4 name server, the return code is : " + str(kill_old_server.wait()) + # time.sleep(1) # start a name server to find the remote object - start_new_server = subprocess.Popen(['pyro4-ns', '&']) - print "Start Name Sever : " + str(start_new_server.pid) # + str(start_new_server.wait()) - time.sleep(1) + # start_new_server = subprocess.Popen(['pyro4-ns', '&']) + # print "Start Name Sever : " + str(start_new_server.pid) # + str(start_new_server.wait()) + # time.sleep(1) + + # start a name server if no name server exists + if len(os.popen('ps aux | grep pyro4-ns | grep -v grep').readlines()) == 0: + start_new_server = subprocess.Popen(['pyro4-ns', '&']) + print "Start Name Sever : " + str(start_new_server.pid) # + str(start_new_server.wait()) + time.sleep(1) # start two different player with different network weights. 
- agent_v0 = subprocess.Popen(['python', '-u', 'player.py', '--role=black'], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - agent_v1 = subprocess.Popen(['python', '-u', 'player.py', '--role=white', '--checkpoint_path=./checkpoints_origin/'], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + black_role_name = 'black' + str(args.id) + white_role_name = 'white' + str(args.id) + + agent_v0 = subprocess.Popen( + ['python', '-u', 'player.py', '--role=' + black_role_name, '--checkpoint_path=' + str(args.black_weight_path)], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + + agent_v1 = subprocess.Popen( + ['python', '-u', 'player.py', '--role=' + white_role_name, '--checkpoint_path=' + str(args.white_weight_path)], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + server_list = "" - while ("black" not in server_list) or ("white" not in server_list): + while (black_role_name not in server_list) or (white_role_name not in server_list): server_list = subprocess.check_output(['pyro4-nsc', 'list']) - print "Waining for the server start..." + print "Waiting for the server start..." 
time.sleep(1) print server_list print "Start black player at : " + str(agent_v0.pid) print "Start white player at : " + str(agent_v1.pid) + data = Data() player = [None] * 2 - player[0] = Pyro4.Proxy("PYRONAME:black") - player[1] = Pyro4.Proxy("PYRONAME:white") + player[0] = Pyro4.Proxy("PYRONAME:" + black_role_name) + player[1] = Pyro4.Proxy("PYRONAME:" + white_role_name) role = ["BLACK", "WHITE"] color = ['b', 'w'] pattern = "[A-Z]{1}[0-9]{1}" + space = re.compile("\s+") size = 9 show = ['.', 'X', 'O'] evaluate_rounds = 1 game_num = 0 - while game_num < evaluate_rounds: - num = 0 - pass_flag = [False, False] - print("Start game {}".format(game_num)) - # end the game if both palyer chose to pass, or play too much turns - while not (pass_flag[0] and pass_flag[1]) and num < size ** 2 * 2: - turn = num % 2 - move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn] + '\n') - print role[turn] + " : " + str(move), - num += 1 - match = re.search(pattern, move) - if match is not None: - # print "match : " + str(match.group()) - play_or_pass = match.group() - pass_flag[turn] = False + try: + while True: + start_time = time.time() + num = 0 + pass_flag = [False, False] + print("Start game {}".format(game_num)) + # end the game if both palyer chose to pass, or play too much turns + while not (pass_flag[0] and pass_flag[1]) and num < size ** 2 * 2: + turn = num % 2 + board = player[turn].run_cmd(str(num) + ' show_board') + board = eval(board[board.index('['):board.index(']') + 1]) + for i in range(size): + for j in range(size): + print show[board[i * size + j]] + " ", + print "\n", + data.boards.append(board) + move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn] + '\n') + print role[turn] + " : " + str(move), + num += 1 + match = re.search(pattern, move) + if match is not None: + # print "match : " + str(match.group()) + play_or_pass = match.group() + pass_flag[turn] = False + else: + # print "no match" + play_or_pass = ' PASS' + pass_flag[turn] = True + 
result = player[1 - turn].run_cmd(str(num) + ' play ' + color[turn] + ' ' + play_or_pass + '\n') + prob = player[turn].run_cmd(str(num) + ' get_prob') + prob = space.sub(',', prob[prob.index('['):prob.index(']') + 1]) + prob = prob.replace('[,', '[') + prob = prob.replace('],', ']') + prob = eval(prob) + data.probs.append(prob) + score = player[turn].run_cmd(str(num) + ' get_score') + print "Finished : ", score.split(" ")[1] + # TODO: generalize the player + if eval(score.split(" ")[1]) > 0: + data.winner = 1 + if eval(score.split(" ")[1]) < 0: + data.winner = -1 + player[0].run_cmd(str(num) + ' clear_board') + player[1].run_cmd(str(num) + ' clear_board') + file_list = os.listdir(args.result_path) + if not file_list: + data_num = 0 else: - # print "no match" - play_or_pass = ' PASS' - pass_flag[turn] = True - result = player[1 - turn].run_cmd(str(num) + ' play ' + color[turn] + ' ' + play_or_pass + '\n') - board = player[turn].run_cmd(str(num) + ' show_board') - board = eval(board[board.index('['):board.index(']') + 1]) - for i in range(size): - for j in range(size): - print show[board[i * size + j]] + " ", - print "\n", + file_list.sort(key=lambda file: os.path.getmtime(args.result_path + file) if not os.path.isdir( + args.result_path + file) else 0) + data_num = eval(file_list[-1][:-4]) + 1 + with open("./data/" + str(data_num) + ".pkl", "wb") as file: + picklestring = cPickle.dump(data, file) + data.reset() + game_num += 1 - score = player[turn].run_cmd(str(num) + ' get_score') - print "Finished : ", score.split(" ")[1] - player[0].run_cmd(str(num) + ' clear_board') - player[1].run_cmd(str(num) + ' clear_board') - game_num += 1 + except Exception as e: + print(e) + subprocess.call(["kill", "-9", str(agent_v0.pid)]) + subprocess.call(["kill", "-9", str(agent_v1.pid)]) + print "Kill all player, finish all game." 
subprocess.call(["kill", "-9", str(agent_v0.pid)]) subprocess.call(["kill", "-9", str(agent_v1.pid)]) diff --git a/AlphaGo/player.py b/AlphaGo/player.py index 8245c38..0e3daff 100644 --- a/AlphaGo/player.py +++ b/AlphaGo/player.py @@ -20,12 +20,15 @@ class Player(object): #return "inside the Player of player.py" return self.engine.run_cmd(command) + if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument("--checkpoint_path", type=str, default="./checkpoints/") + parser.add_argument("--checkpoint_path", type=str, default=None) parser.add_argument("--role", type=str, default="unknown") args = parser.parse_args() + if args.checkpoint_path == 'None': + args.checkpoint_path = None game = Game(checkpoint_path=args.checkpoint_path) engine = GTPEngine(game_obj=game, name='tianshou', version=0) diff --git a/AlphaGo/reversi.py b/AlphaGo/reversi.py new file mode 100644 index 0000000..cba91d9 --- /dev/null +++ b/AlphaGo/reversi.py @@ -0,0 +1,264 @@ +from __future__ import print_function +import numpy as np + +''' +Settings of the Go game. 
+ +(1, 1) is considered as the upper left corner of the board, +(size, 1) is the lower left +''' + + +def find_correct_moves(own, enemy): + """return legal moves""" + left_right_mask = 0x7e7e7e7e7e7e7e7e # Both most left-right edge are 0, else 1 + top_bottom_mask = 0x00ffffffffffff00 # Both most top-bottom edge are 0, else 1 + mask = left_right_mask & top_bottom_mask + mobility = 0 + mobility |= search_offset_left(own, enemy, left_right_mask, 1) # Left + mobility |= search_offset_left(own, enemy, mask, 9) # Left Top + mobility |= search_offset_left(own, enemy, top_bottom_mask, 8) # Top + mobility |= search_offset_left(own, enemy, mask, 7) # Top Right + mobility |= search_offset_right(own, enemy, left_right_mask, 1) # Right + mobility |= search_offset_right(own, enemy, mask, 9) # Bottom Right + mobility |= search_offset_right(own, enemy, top_bottom_mask, 8) # Bottom + mobility |= search_offset_right(own, enemy, mask, 7) # Left bottom + return mobility + + +def calc_flip(pos, own, enemy): + """return flip stones of enemy by bitboard when I place stone at pos. + + :param pos: 0~63 + :param own: bitboard (0=top left, 63=bottom right) + :param enemy: bitboard + :return: flip stones of enemy when I place stone at pos. 
+ """ + f1 = _calc_flip_half(pos, own, enemy) + f2 = _calc_flip_half(63 - pos, rotate180(own), rotate180(enemy)) + return f1 | rotate180(f2) + + +def _calc_flip_half(pos, own, enemy): + el = [enemy, enemy & 0x7e7e7e7e7e7e7e7e, enemy & 0x7e7e7e7e7e7e7e7e, enemy & 0x7e7e7e7e7e7e7e7e] + masks = [0x0101010101010100, 0x00000000000000fe, 0x0002040810204080, 0x8040201008040200] + masks = [b64(m << pos) for m in masks] + flipped = 0 + for e, mask in zip(el, masks): + outflank = mask & ((e | ~mask) + 1) & own + flipped |= (outflank - (outflank != 0)) & mask + return flipped + + +def search_offset_left(own, enemy, mask, offset): + e = enemy & mask + blank = ~(own | enemy) + t = e & (own >> offset) + t |= e & (t >> offset) + t |= e & (t >> offset) + t |= e & (t >> offset) + t |= e & (t >> offset) + t |= e & (t >> offset) # Up to six stones can be turned at once + return blank & (t >> offset) # Only the blank squares can be started + + +def search_offset_right(own, enemy, mask, offset): + e = enemy & mask + blank = ~(own | enemy) + t = e & (own << offset) + t |= e & (t << offset) + t |= e & (t << offset) + t |= e & (t << offset) + t |= e & (t << offset) + t |= e & (t << offset) # Up to six stones can be turned at once + return blank & (t << offset) # Only the blank squares can be started + + +def flip_vertical(x): + k1 = 0x00FF00FF00FF00FF + k2 = 0x0000FFFF0000FFFF + x = ((x >> 8) & k1) | ((x & k1) << 8) + x = ((x >> 16) & k2) | ((x & k2) << 16) + x = (x >> 32) | b64(x << 32) + return x + + +def b64(x): + return x & 0xFFFFFFFFFFFFFFFF + + +def bit_count(x): + return bin(x).count('1') + + +def bit_to_array(x, size): + """bit_to_array(0b0010, 4) -> array([0, 1, 0, 0])""" + return np.array(list(reversed((("0" * size) + bin(x)[2:])[-size:])), dtype=np.uint8) + + +def flip_diag_a1h8(x): + k1 = 0x5500550055005500 + k2 = 0x3333000033330000 + k4 = 0x0f0f0f0f00000000 + t = k4 & (x ^ b64(x << 28)) + x ^= t ^ (t >> 28) + t = k2 & (x ^ b64(x << 14)) + x ^= t ^ (t >> 14) + t = k1 & (x ^ 
b64(x << 7)) + x ^= t ^ (t >> 7) + return x + + +def rotate90(x): + return flip_diag_a1h8(flip_vertical(x)) + + +def rotate180(x): + return rotate90(rotate90(x)) + + +class Reversi: + def __init__(self, black=None, white=None): + self.black = black or (0b00001000 << 24 | 0b00010000 << 32) + self.white = white or (0b00010000 << 24 | 0b00001000 << 32) + self.board = None # 8 * 8 board with 1 for black, -1 for white and 0 for blank + self.color = None # 1 for black and -1 for white + self.action = None # number in 0~63 + # self.winner = None + self.black_win = None + + def get_board(self, black=None, white=None): + self.black = black or (0b00001000 << 24 | 0b00010000 << 32) + self.white = white or (0b00010000 << 24 | 0b00001000 << 32) + self.board = self.bitboard2board() + return self.board + + def simulate_is_valid(self, board, color): + self.board = board + self.color = color + self.board2bitboard() + own, enemy = self.get_own_and_enemy() + mobility = find_correct_moves(own, enemy) + valid_moves = bit_to_array(mobility, 64) + valid_moves = np.argwhere(valid_moves) + valid_moves = list(np.reshape(valid_moves, len(valid_moves))) + return valid_moves + + def simulate_step_forward(self, state, vertex): + self.board = state[0] + self.color = state[1] + self.board2bitboard() + self.vertex2action(vertex) + step_forward = self.step() + if step_forward: + new_board = self.bitboard2board() + return [new_board, 0 - self.color], 0 + + def executor_do_move(self, board, color, vertex): + self.board = board + self.color = color + self.board2bitboard() + self.vertex2action(vertex) + step_forward = self.step() + if step_forward: + new_board = self.bitboard2board() + for i in range(64): + board[i] = new_board[i] + + def executor_get_score(self, board): + self.board = board + self._game_over() + if self.black_win is not None: + return self.black_win + else: + ValueError("Game not finished!") + + def board2bitboard(self): + count = 1 + if self.board is None: + ValueError("None board!") 
+ self.black = 0 + self.white = 0 + for i in range(64): + if self.board[i] == 1: + self.black |= count + elif self.board[i] == -1: + self.white |= count + count *= 2 + + def vertex2action(self, vertex): + x, y = vertex + if x == 0 and y == 0: + self.action = None + else: + self.action = 8 * (x - 1) + y - 1 + + def bitboard2board(self): + board = [] + black = bit_to_array(self.black, 64) + white = bit_to_array(self.white, 64) + for i in range(64): + if black[i]: + board.append(1) + elif white[i]: + board.append(-1) + else: + board.append(0) + return board + + def step(self): + if self.action < 0 or self.action > 63: + ValueError("Wrong action!") + if self.action is None: + return False + + own, enemy = self.get_own_and_enemy() + + flipped = calc_flip(self.action, own, enemy) + if bit_count(flipped) == 0: + self.illegal_move_to_lose(self.action) + return False + own ^= flipped + own |= 1 << self.action + enemy ^= flipped + + self.set_own_and_enemy(own, enemy) + return True + + def _game_over(self): + # self.done = True + ''' + if self.winner is None: + black_num, white_num = self.number_of_black_and_white + if black_num > white_num: + self.winner = 1 + elif black_num < white_num: + self.winner = -1 + else: + self.winner = 0 + ''' + if self.black_win is None: + black_num, white_num = self.number_of_black_and_white + self.black_win = black_num - white_num + + def illegal_move_to_lose(self, action): + self._game_over() + + def get_own_and_enemy(self): + if self.color == 1: + own, enemy = self.black, self.white + elif self.color == -1: + own, enemy = self.white, self.black + else: + own, enemy = None, None + return own, enemy + + def set_own_and_enemy(self, own, enemy): + if self.color == 1: + self.black, self.white = own, enemy + else: + self.white, self.black = own, enemy + + @property + def number_of_black_and_white(self): + return bit_count(self.black), bit_count(self.white) diff --git a/AlphaGo/self-play.py b/AlphaGo/self-play.py index 98ccf84..4387b24 100644 --- 
a/AlphaGo/self-play.py +++ b/AlphaGo/self-play.py @@ -79,7 +79,7 @@ while True: prob.append(np.array(game.prob).reshape(-1, game.size ** 2 + 1)) print("Finished") print("\n") - score = game.executor.get_score(True) + score = game.game_engine.executor_get_score(game.board, True) if score > 0: winner = utils.BLACK else: diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py deleted file mode 100644 index 0bad998..0000000 --- a/AlphaGo/strategy.py +++ /dev/null @@ -1,227 +0,0 @@ -import os, sys - -sys.path.append(os.path.join(os.path.dirname(__file__), os.path.pardir)) -import numpy as np -import utils -import time -import copy -import network_small -import tensorflow as tf -from collections import deque -from tianshou.core.mcts.mcts import MCTS - -DELTA = [[1, 0], [-1, 0], [0, -1], [0, 1]] -CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] - -class GoEnv: - def __init__(self, **kwargs): - self.game = kwargs['game'] - self.board = [utils.EMPTY] * (self.game.size * self.game.size) - self.latest_boards = deque(maxlen=8) - - def _flatten(self, vertex): - x, y = vertex - return (x - 1) * self.game.size + (y - 1) - - def _bfs(self, vertex, color, block, status, alive_break): - block.append(vertex) - status[self._flatten(vertex)] = True - nei = self._neighbor(vertex) - for n in nei: - if not status[self._flatten(n)]: - if self.board[self._flatten(n)] == color: - self._bfs(n, color, block, status, alive_break) - - def _find_block(self, vertex, alive_break=False): - block = [] - status = [False] * (self.game.size * self.game.size) - color = self.board[self._flatten(vertex)] - self._bfs(vertex, color, block, status, alive_break) - - for b in block: - for n in self._neighbor(b): - if self.board[self._flatten(n)] == utils.EMPTY: - return False, block - return True, block - - def _is_qi(self, color, vertex): - nei = self._neighbor(vertex) - for n in nei: - if self.board[self._flatten(n)] == utils.EMPTY: - return True - - self.board[self._flatten(vertex)] = color - for n in 
nei: - if self.board[self._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n) - if can_kill: - self.board[self._flatten(vertex)] = utils.EMPTY - return True - - ### avoid suicide - can_kill, block = self._find_block(vertex) - if can_kill: - self.board[self._flatten(vertex)] = utils.EMPTY - return False - - self.board[self._flatten(vertex)] = utils.EMPTY - return True - - def _check_global_isomorphous(self, color, vertex): - ##backup - _board = copy.copy(self.board) - self.board[self._flatten(vertex)] = color - self._process_board(color, vertex) - if self.board in self.latest_boards: - res = True - else: - res = False - - self.board = _board - return res - - def _in_board(self, vertex): - x, y = vertex - if x < 1 or x > self.game.size: return False - if y < 1 or y > self.game.size: return False - return True - - def _neighbor(self, vertex): - x, y = vertex - nei = [] - for d in DELTA: - _x = x + d[0] - _y = y + d[1] - if self._in_board((_x, _y)): - nei.append((_x, _y)) - return nei - - def _corner(self, vertex): - x, y = vertex - corner = [] - for d in CORNER_OFFSET: - _x = x + d[0] - _y = y + d[1] - if self._in_board((_x, _y)): - corner.append((_x, _y)) - return corner - - def _process_board(self, color, vertex): - nei = self._neighbor(vertex) - for n in nei: - if self.board[self._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n, alive_break=True) - if can_kill: - for b in block: - self.board[self._flatten(b)] = utils.EMPTY - - def _find_group(self, start): - color = self.board[self._flatten(start)] - # print ("color : ", color) - chain = set() - frontier = [start] - while frontier: - current = frontier.pop() - # print ("current : ", current) - chain.add(current) - for n in self._neighbor(current): - # print n, self._flatten(n), self.board[self._flatten(n)], - if self.board[self._flatten(n)] == color and not n in chain: - frontier.append(n) - return chain - - def _is_eye(self, color, vertex): - nei = 
self._neighbor(vertex) - cor = self._corner(vertex) - ncolor = {color == self.board[self._flatten(n)] for n in nei} - if False in ncolor: - # print "not all neighbors are in same color with us" - return False - if set(nei) < self._find_group(nei[0]): - # print "all neighbors are in same group and same color with us" - return True - else: - opponent_number = [self.board[self._flatten(c)] for c in cor].count(-color) - opponent_propotion = float(opponent_number) / float(len(cor)) - if opponent_propotion < 0.5: - # print "few opponents, real eye" - return True - else: - # print "many opponents, fake eye" - return False - - def knowledge_prunning(self, color, vertex): - ### check if it is an eye of yourself - ### assumptions : notice that this judgement requires that the state is an endgame - if self._is_eye(color, vertex): - return False - return True - - def simulate_is_valid(self, state, action): - # state is the play board, the shape is [1, 9, 9, 17] - if action == self.game.size * self.game.size: - vertex = (0, 0) - else: - vertex = (action / self.game.size + 1, action % self.game.size + 1) - if state[0, 0, 0, -1] == utils.BLACK: - color = utils.BLACK - else: - color = utils.WHITE - self.latest_boards.clear() - for i in range(8): - self.latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) - self.board = copy.copy(self.latest_boards[-1]) - - ### in board - if not self._in_board(vertex): - return False - - ### already have stone - if not self.board[self._flatten(vertex)] == utils.EMPTY: - # print(np.array(self.board).reshape(9, 9)) - # print(vertex) - return False - - ### check if it is qi - if not self._is_qi(color, vertex): - return False - - ### forbid global isomorphous - if self._check_global_isomorphous(color, vertex): - return False - - if not self.knowledge_prunning(color, vertex): - return False - - return True - - def do_move(self, color, vertex): - if vertex == utils.PASS: - return True - - id_ = self._flatten(vertex) - if 
self.board[id_] == utils.EMPTY: - self.board[id_] = color - return True - else: - return False - - def step_forward(self, state, action): - if state[0, 0, 0, -1] == 1: - color = utils.BLACK - else: - color = utils.WHITE - if action == self.game.size ** 2: - vertex = utils.PASS - else: - vertex = (action % self.game.size + 1, action / self.game.size + 1) - # print(vertex) - # print(self.board) - self.board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() - self.do_move(color, vertex) - new_state = np.concatenate( - [state[:, :, :, 1:8], (np.array(self.board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), - state[:, :, :, 9:16], (np.array(self.board) == utils.WHITE).reshape(1, self.game.size, self.game.size, 1), - np.array(1 - state[:, :, :, -1]).reshape(1, self.game.size, self.game.size, 1)], - axis=3) - return new_state, 0 diff --git a/AlphaGo/unit_test.py b/AlphaGo/unit_test.py deleted file mode 100644 index 7a33b8e..0000000 --- a/AlphaGo/unit_test.py +++ /dev/null @@ -1,266 +0,0 @@ -import numpy as np -import sys -from game import Game -from engine import GTPEngine -import utils -import time -import copy -import network_small -import tensorflow as tf -from collections import deque -from tianshou.core.mcts.mcts import MCTS - -DELTA = [[1, 0], [-1, 0], [0, -1], [0, 1]] -CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] - -class GoEnv: - def __init__(self, size=9, komi=6.5): - self.size = size - self.komi = komi - self.board = [utils.EMPTY] * (self.size * self.size) - self.history = deque(maxlen=8) - - def _set_board(self, board): - self.board = board - - def _flatten(self, vertex): - x, y = vertex - return (x - 1) * self.size + (y - 1) - - def _bfs(self, vertex, color, block, status, alive_break): - block.append(vertex) - status[self._flatten(vertex)] = True - nei = self._neighbor(vertex) - for n in nei: - if not status[self._flatten(n)]: - if self.board[self._flatten(n)] == color: - self._bfs(n, color, block, status, alive_break) 
- - def _find_block(self, vertex, alive_break=False): - block = [] - status = [False] * (self.size * self.size) - color = self.board[self._flatten(vertex)] - self._bfs(vertex, color, block, status, alive_break) - - for b in block: - for n in self._neighbor(b): - if self.board[self._flatten(n)] == utils.EMPTY: - return False, block - return True, block - - def _is_qi(self, color, vertex): - nei = self._neighbor(vertex) - for n in nei: - if self.board[self._flatten(n)] == utils.EMPTY: - return True - - self.board[self._flatten(vertex)] = color - for n in nei: - if self.board[self._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n) - if can_kill: - self.board[self._flatten(vertex)] = utils.EMPTY - return True - - ### avoid suicide - can_kill, block = self._find_block(vertex) - if can_kill: - self.board[self._flatten(vertex)] = utils.EMPTY - return False - - self.board[self._flatten(vertex)] = utils.EMPTY - return True - - def _check_global_isomorphous(self, color, vertex): - ##backup - _board = copy.copy(self.board) - self.board[self._flatten(vertex)] = color - self._process_board(color, vertex) - if self.board in self.history: - res = True - else: - res = False - - self.board = _board - return res - - def _in_board(self, vertex): - x, y = vertex - if x < 1 or x > self.size: return False - if y < 1 or y > self.size: return False - return True - - def _neighbor(self, vertex): - x, y = vertex - nei = [] - for d in DELTA: - _x = x + d[0] - _y = y + d[1] - if self._in_board((_x, _y)): - nei.append((_x, _y)) - return nei - - def _corner(self, vertex): - x, y = vertex - corner = [] - for d in CORNER_OFFSET: - _x = x + d[0] - _y = y + d[1] - if self._in_board((_x, _y)): - corner.append((_x, _y)) - return corner - - def _process_board(self, color, vertex): - nei = self._neighbor(vertex) - for n in nei: - if self.board[self._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n, alive_break=True) - if can_kill: - for 
b in block: - self.board[self._flatten(b)] = utils.EMPTY - - def _find_group(self, start): - color = self.board[self._flatten(start)] - #print ("color : ", color) - chain = set() - frontier = [start] - while frontier: - current = frontier.pop() - #print ("current : ", current) - chain.add(current) - for n in self._neighbor(current): - #print n, self._flatten(n), self.board[self._flatten(n)], - if self.board[self._flatten(n)] == color and not n in chain: - frontier.append(n) - return chain - - def _is_eye(self, color, vertex): - nei = self._neighbor(vertex) - cor = self._corner(vertex) - ncolor = {color == self.board[self._flatten(n)] for n in nei} - if False in ncolor: - #print "not all neighbors are in same color with us" - return False - if set(nei) < self._find_group(nei[0]): - #print "all neighbors are in same group and same color with us" - return True - else: - opponent_number = [self.board[self._flatten(c)] for c in cor].count(-color) - opponent_propotion = float(opponent_number) / float(len(cor)) - if opponent_propotion < 0.5: - #print "few opponents, real eye" - return True - else: - #print "many opponents, fake eye" - return False - - # def is_valid(self, color, vertex): - def is_valid(self, state, action): - # state is the play board, the shape is [1, 9, 9, 17] - if action == self.size * self.size: - vertex = (0, 0) - else: - vertex = (action / self.size + 1, action % self.size + 1) - if state[0, 0, 0, -1] == utils.BLACK: - color = utils.BLACK - else: - color = utils.WHITE - self.history.clear() - for i in range(8): - self.history.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) - self.board = copy.copy(self.history[-1]) - ### in board - if not self._in_board(vertex): - return False - - ### already have stone - if not self.board[self._flatten(vertex)] == utils.EMPTY: - # print(np.array(self.board).reshape(9, 9)) - # print(vertex) - return False - - ### check if it is qi - if not self._is_qi(color, vertex): - return False - - ### 
check if it is an eye of yourself - ### assumptions : notice that this judgement requires that the state is an endgame - #if self._is_eye(color, vertex): - # return False - - if self._check_global_isomorphous(color, vertex): - return False - - return True - - def do_move(self, color, vertex): - if vertex == utils.PASS: - return True - - id_ = self._flatten(vertex) - if self.board[id_] == utils.EMPTY: - self.board[id_] = color - self.history.append(copy.copy(self.board)) - return True - else: - return False - - def step_forward(self, state, action): - if state[0, 0, 0, -1] == 1: - color = 1 - else: - color = -1 - if action == 81: - vertex = (0, 0) - else: - vertex = (action % 9 + 1, action / 9 + 1) - # print(vertex) - # print(self.board) - self.board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() - self.do_move(color, vertex) - new_state = np.concatenate( - [state[:, :, :, 1:8], (np.array(self.board) == 1).reshape(1, 9, 9, 1), - state[:, :, :, 9:16], (np.array(self.board) == -1).reshape(1, 9, 9, 1), - np.array(1 - state[:, :, :, -1]).reshape(1, 9, 9, 1)], - axis=3) - return new_state, 0 - - -pure_test = [ - 0, 1, 0, 1, 0, 1, 0, 0, 0, - 1, 0, 1, 0, 1, 0, 0, 0, 0, - 0, 1, 0, 1, 0, 0, 1, 0, 0, - 0, 0, 1, 0, 0, 1, 0, 1, 0, - 0, 0, 0, 0, 0, 1, 1, 1, 0, - 1, 1, 1, 0, 0, 0, 0, 0, 0, - 1, 0, 1, 0, 0, 1, 1, 0, 0, - 1, 1, 1, 0, 1, 0, 1, 0, 0, - 0, 0, 0, 0, 1, 1, 1, 0, 0 -] - -pt_qry = [(1, 1), (1, 5), (3, 3), (4, 7), (7, 2), (8, 6)] -pt_ans = [True, True, True, True, True, True] - -opponent_test = [ - 0, 1, 0, 1, 0, 1, 0,-1, 1, - 1,-1, 0,-1, 1,-1, 0, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 1, - 1, 1,-1, 0, 1,-1, 1, 0, 0, - 1, 0, 1, 0, 1, 0, 1, 0, 0, - -1, 1, 1, 0, 1, 1, 1, 0, 0, - 0, 1,-1, 0,-1,-1,-1, 0, 0, - 1, 0, 1, 0,-1, 0,-1, 0, 0, - 0, 1, 0, 0,-1,-1,-1, 0, 0 -] -ot_qry = [(1, 1), (1, 5), (2, 9), (5, 2), (5, 6), (8, 2), (8, 6)] -ot_ans = [False, False, False, False, False, True, False] - -#print (ge._find_group((6, 1))) -#print ge._is_eye(utils.BLACK, pt_qry[0]) 
-ge = GoEnv() -ge._set_board(pure_test) -for i in range(6): - print (ge._is_eye(utils.BLACK, pt_qry[i])) -ge._set_board(opponent_test) -for i in range(7): - print (ge._is_eye(utils.BLACK, ot_qry[i])) diff --git a/README.md b/README.md index 543d237..9c3af16 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,8 @@ Tianshou(天授) is a reinforcement learning platform. The following image illus Please follow [google python coding style](https://google.github.io/styleguide/pyguide.html) +There's a more detailed Chinese version [google python coding style in Chinese](http://www.runoob.com/w3cnote/google-python-styleguide.html) + All files/folders should be named with lower case letters and underline (except specified names such as `AlphaGo`). Try to use full names. Don't use abbrevations for class/function/variable names except common abbrevations (such as `num` for number, `dim` for dimension, `env` for environment, `op` for operation). For now we use `pi` to refer to the policy in examples/ppo_example.py. @@ -73,4 +75,4 @@ HaoshengZou: collaborate mainly on Policy and losses; interfaces and architectur Note: install openai/gym first to run the Atari environment; note that interfaces between modules may not be finalized; the management of placeholders and `feed_dict` may have to be done manually for the time being; -Without preprocessing and other tricks, this example will not train to any meaningful results. Codes should past two tests: individual module test and run through this example code. \ No newline at end of file +Without preprocessing and other tricks, this example will not train to any meaningful results. Codes should past two tests: individual module test and run through this example code. 
diff --git a/tianshou/core/mcts/evaluator.py b/tianshou/core/mcts/evaluator.py index 9c4ee8e..a1f9456 100644 --- a/tianshou/core/mcts/evaluator.py +++ b/tianshou/core/mcts/evaluator.py @@ -19,10 +19,10 @@ class rollout_policy(evaluator): # TODO: prior for rollout policy total_reward = 0. action = np.random.randint(0, self.action_num) - state, reward = self.env.step_forward(state, action) + state, reward = self.env.simulate_step_forward(state, action) total_reward += reward while state is not None: action = np.random.randint(0, self.action_num) - state, reward = self.env.step_forward(state, action) + state, reward = self.env.simulate_step_forward(state, action) total_reward += reward return np.ones([self.action_num])/self.action_num, total_reward diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 16d13d5..16890d7 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -59,15 +59,10 @@ class UCTNode(MCTSNode): self.parent.backpropagation(self.children[action].reward) def valid_mask(self, simulator): + # let all invalid actions be illeagel in mcts if self.mask is None: - start_time = time.time() - self.mask = [] - for act in range(self.action_num - 1): - if not simulator.simulate_is_valid(self.state, act): - self.mask.append(act) - self.ucb[act] = -float("Inf") - else: - self.ucb[self.mask] = -float("Inf") + self.mask = simulator.simulate_is_valid_list(self.state, range(self.action_num)) + self.ucb[self.mask] = -float("Inf") class TSNode(MCTSNode): @@ -104,7 +99,7 @@ class ActionNode(object): self.next_state = tuple2list(self.next_state) def selection(self, simulator): - self.next_state, self.reward = simulator.step_forward(self.parent.state, self.action) + self.next_state, self.reward = simulator.simulate_step_forward(self.parent.state, self.action) self.origin_state = self.next_state self.state_type = type(self.next_state) self.type_conversion_to_tuple() @@ -131,8 +126,7 @@ class ActionNode(object): class MCTS(object): - 
def __init__(self, simulator, evaluator, root, action_num, method="UCT", inverse=False, max_step=None, - max_time=None): + def __init__(self, simulator, evaluator, root, action_num, method="UCT", inverse=False): self.simulator = simulator self.evaluator = evaluator prior, _ = self.evaluator(root) @@ -140,33 +134,26 @@ class MCTS(object): if method == "": self.root = root if method == "UCT": - self.root = UCTNode(None, None, root, action_num, prior, inverse) + self.root = UCTNode(None, None, root, action_num, prior, inverse=inverse) if method == "TS": self.root = TSNode(None, None, root, action_num, prior, inverse=inverse) self.inverse = inverse - if max_step is not None: - self.step = 0 - self.max_step = max_step - # TODO: Optimize the stop criteria - # else: - # self.max_step = 0 - if max_time is not None: - self.start_time = time.time() - self.max_time = max_time + + def search(self, max_step=None, max_time=None): + step = 0 + start_time = time.time() + # Fall back to an unbounded limit for whichever criterion was not given. + step_limit = float("Inf") if max_step is None else max_step + time_limit = float("Inf") if max_time is None else max_time + # The guard below checks the raw arguments, so it still fires when both are None. if max_step is None and max_time is None: raise ValueError("Need a stop criteria!") - # TODO: running mcts should be implemented in another function, e.g. def search(self, max_step, max_time) - self.select_time = [] - self.evaluate_time = [] - self.bp_time = [] - while (max_step is not None and self.step < self.max_step or max_step is None) \ - and (max_time is not None and time.time() - self.start_time < self.max_time or max_time is None): - self.expand() - if max_step is not None: - self.step += 1 + while step < step_limit and time.time() - start_time < time_limit: + self._expand() + step += 1 - def expand(self): + def _expand(self): node, new_action = self.root.selection(self.simulator) value = node.children[new_action].expansion(self.evaluator, self.action_num) node.children[new_action].backpropagation(value + 0.)
diff --git a/tianshou/core/policy/base.py b/tianshou/core/policy/base.py index eecfc4f..025abd5 100644 --- a/tianshou/core/policy/base.py +++ b/tianshou/core/policy/base.py @@ -15,7 +15,7 @@ __all__ = [ 'QValuePolicy', ] -# TODO: separate actor and critic, we should focus on it once we finish the basic module. +# TODO: a even more "base" class for policy class QValuePolicy(object): diff --git a/tianshou/core/policy/dqn.py b/tianshou/core/policy/dqn.py index 39f6a16..d03dbd4 100644 --- a/tianshou/core/policy/dqn.py +++ b/tianshou/core/policy/dqn.py @@ -1,5 +1,16 @@ from tianshou.core.policy.base import QValuePolicy import tensorflow as tf +import sys +sys.path.append('..') +import value_function.action_value as value_func + + +class DQN_refactor(object): + """ + use DQN from value_function as a member + """ + def __init__(self, value_tensor, observation_placeholder, action_placeholder): + self._network = value_func.DQN(value_tensor, observation_placeholder, action_placeholder) class DQN(QValuePolicy): diff --git a/tianshou/core/value_function/__init__.py b/tianshou/core/value_function/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tianshou/core/value_function/action_value.py b/tianshou/core/value_function/action_value.py new file mode 100644 index 0000000..cb8acc8 --- /dev/null +++ b/tianshou/core/value_function/action_value.py @@ -0,0 +1,53 @@ +from base import ValueFunctionBase +import tensorflow as tf + + +class ActionValue(ValueFunctionBase): + """ + class of action values Q(s, a). + """ + def __init__(self, value_tensor, observation_placeholder, action_placeholder): + self._action_placeholder = action_placeholder + super(ActionValue, self).__init__( + value_tensor=value_tensor, + observation_placeholder=observation_placeholder + ) + + def get_value(self, observation, action): + """ + + :param observation: numpy array of observations, of shape (batchsize, observation_dim). 
+ :param action: numpy array of actions, of shape (batchsize, action_dim) + # TODO: Atari discrete action should have dim 1. Super Mario may should have, say, dim 5, where each can be 0/1 + :return: numpy array of state values, of shape (batchsize, ) + # TODO: dealing with the last dim of 1 in V(s) and Q(s, a) + """ + sess = tf.get_default_session() + return sess.run(self.get_value_tensor(), feed_dict= + {self._observation_placeholder: observation, self._action_placeholder:action})[:, 0] + + +class DQN(ActionValue): + """ + class of the very DQN architecture. Instead of feeding s and a to the network to get a value, DQN feed s to the + network and the last layer is Q(s, *) for all actions + """ + def __init__(self, value_tensor, observation_placeholder, action_placeholder): + """ + :param value_tensor: of shape (batchsize, num_actions) + :param observation_placeholder: of shape (batchsize, observation_dim) + :param action_placeholder: of shape (batchsize, ) + """ + self._value_tensor_all_actions = value_tensor + canonical_value_tensor = value_tensor[action_placeholder] # maybe a tf.map_fn. for now it's wrong + + super(DQN, self).__init__(value_tensor=canonical_value_tensor, + observation_placeholder=observation_placeholder, + action_placeholder=action_placeholder) + + def get_value_all_actions(self, observation): + sess = tf.get_default_session() + return sess.run(self._value_tensor_all_actions, feed_dict={self._observation_placeholder: observation}) + + def get_value_tensor_all_actions(self): + return self._value_tensor_all_actions \ No newline at end of file diff --git a/tianshou/core/value_function/base.py b/tianshou/core/value_function/base.py new file mode 100644 index 0000000..0b27759 --- /dev/null +++ b/tianshou/core/value_function/base.py @@ -0,0 +1,23 @@ + +# TODO: linear feature baseline also in tf? +class ValueFunctionBase(object): + """ + base class of value functions. 
Children include state values V(s) and action values Q(s, a) + """ + def __init__(self, value_tensor, observation_placeholder): + self._observation_placeholder = observation_placeholder + self._value_tensor = value_tensor + + def get_value(self, **kwargs): + """ + + :return: batch of corresponding values in numpy array + """ + raise NotImplementedError() + + def get_value_tensor(self): + """ + + :return: tensor of the corresponding values + """ + return self._value_tensor diff --git a/tianshou/core/value_function/state_value.py b/tianshou/core/value_function/state_value.py new file mode 100644 index 0000000..04fe442 --- /dev/null +++ b/tianshou/core/value_function/state_value.py @@ -0,0 +1,23 @@ +from base import ValueFunctionBase +import tensorflow as tf + + +class StateValue(ValueFunctionBase): + """ + class of state values V(s). + """ + def __init__(self, value_tensor, observation_placeholder): + super(StateValue, self).__init__( + value_tensor=value_tensor, + observation_placeholder=observation_placeholder + ) + + def get_value(self, observation): + """ + + :param observation: numpy array of observations, of shape (batchsize, observation_dim). + :return: numpy array of state values, of shape (batchsize, ) + # TODO: dealing with the last dim of 1 in V(s) and Q(s, a) + """ + sess = tf.get_default_session() + return sess.run(self.get_value_tensor(), feed_dict={self._observation_placeholder: observation})[:, 0] \ No newline at end of file