From 7fca90c61b97704463985f1c1774e90a834c906c Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Wed, 20 Dec 2017 16:43:42 +0800 Subject: [PATCH] modify the mcts, refactor the network --- AlphaGo/Network.py | 211 ----------------------- AlphaGo/Network_ori.py | 175 ------------------- AlphaGo/game.py | 15 +- AlphaGo/go.py | 58 ++----- AlphaGo/model.py | 170 ++++++++++++++++++ AlphaGo/{network_small.py => network.py} | 0 tianshou/core/mcts/mcts.py | 40 ++--- 7 files changed, 212 insertions(+), 457 deletions(-) delete mode 100644 AlphaGo/Network.py delete mode 100644 AlphaGo/Network_ori.py create mode 100644 AlphaGo/model.py rename AlphaGo/{network_small.py => network.py} (100%) diff --git a/AlphaGo/Network.py b/AlphaGo/Network.py deleted file mode 100644 index caf7710..0000000 --- a/AlphaGo/Network.py +++ /dev/null @@ -1,211 +0,0 @@ -import os -import time -import sys - -import numpy as np -import time -import tensorflow as tf -import tensorflow.contrib.layers as layers - -import multi_gpu -import time - -# os.environ["CUDA_VISIBLE_DEVICES"] = "1" -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' - - -def residual_block(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.conv2d(h, 256, kernel_size=3, stride=1, activation_fn=tf.identity, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = h + input - return tf.nn.relu(h) - - -def policy_heads(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.flatten(h) - h = layers.fully_connected(h, 362, activation_fn=tf.identity, weights_regularizer=layers.l2_regularizer(1e-4)) - return h - - -def value_heads(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.flatten(h) - h = layers.fully_connected(h, 256, activation_fn=tf.nn.relu, weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.fully_connected(h, 1, activation_fn=tf.nn.tanh, weights_regularizer=layers.l2_regularizer(1e-4)) - return h - - -class Network(object): - def __init__(self): - self.x = tf.placeholder(tf.float32, shape=[None, 19, 19, 17]) - self.is_training = tf.placeholder(tf.bool, shape=[]) - self.z = tf.placeholder(tf.float32, shape=[None, 1]) - self.pi = tf.placeholder(tf.float32, shape=[None, 362]) - self.build_network() - - def build_network(self): - h = layers.conv2d(self.x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, normalizer_fn=layers.batch_norm, - normalizer_params={'is_training': self.is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS}, - weights_regularizer=layers.l2_regularizer(1e-4)) - for i in range(19): - h = residual_block(h, self.is_training) - self.v = value_heads(h, self.is_training) - self.p = 
policy_heads(h, self.is_training) - # loss = tf.reduce_mean(tf.square(z-v)) - tf.multiply(pi, tf.log(tf.clip_by_value(tf.nn.softmax(p), 1e-8, tf.reduce_max(tf.nn.softmax(p))))) - self.value_loss = tf.reduce_mean(tf.square(self.z - self.v)) - self.policy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.pi, logits=self.p)) - - self.reg = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) - self.total_loss = self.value_loss + self.policy_loss + self.reg - # train_op = tf.train.MomentumOptimizer(1e-4, momentum=0.9, use_nesterov=True).minimize(total_loss) - self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) - with tf.control_dependencies(self.update_ops): - self.train_op = tf.train.RMSPropOptimizer(1e-4).minimize(self.total_loss) - self.var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) - self.saver = tf.train.Saver(max_to_keep=10, var_list=self.var_list) - - def train(self): - data_path = "/home/tongzheng/data/" - data_name = os.listdir("/home/tongzheng/data/") - epochs = 100 - batch_size = 128 - - result_path = "./checkpoints/" - with multi_gpu.create_session() as sess: - sess.run(tf.global_variables_initializer()) - ckpt_file = tf.train.latest_checkpoint(result_path) - if ckpt_file is not None: - print('Restoring model from {}...'.format(ckpt_file)) - self.saver.restore(sess, ckpt_file) - for epoch in range(epochs): - for name in data_name: - data = np.load(data_path + name) - boards = data["boards"] - wins = data["wins"] - ps = data["ps"] - print (boards.shape) - print (wins.shape) - print (ps.shape) - batch_num = boards.shape[0] // batch_size - index = np.arange(boards.shape[0]) - np.random.shuffle(index) - value_losses = [] - policy_losses = [] - regs = [] - time_train = -time.time() - for iter in range(batch_num): - lv, lp, r, value, prob, _ = sess.run( - [self.value_loss, self.policy_loss, self.reg, self.v, tf.nn.softmax(p), self.train_op], - feed_dict={self.x: boards[ - index[iter * batch_size:(iter + 1) * batch_size]], - self.z: wins[index[ - iter * batch_size:(iter + 1) * batch_size]], - self.pi: ps[index[ - iter * batch_size:(iter + 1) * batch_size]], - self.is_training: True}) - value_losses.append(lv) - policy_losses.append(lp) - regs.append(r) - if iter % 1 == 0: - print( - "Epoch: {}, Part {}, Iteration: {}, Time: {}, Value Loss: {}, Policy Loss: {}, Reg: {}".format( - epoch, name, iter, time.time() + time_train, np.mean(np.array(value_losses)), - np.mean(np.array(policy_losses)), np.mean(np.array(regs)))) - time_train = -time.time() - value_losses = [] - policy_losses = [] - regs = [] - if iter % 20 == 0: - save_path = "Epoch{}.Part{}.Iteration{}.ckpt".format(epoch, name, iter) - self.saver.save(sess, result_path + save_path) - del data, boards, wins, ps - - - # def forward(call_number): - # # checkpoint_path = "/home/yama/rl/tianshou/AlphaGo/checkpoints" - # checkpoint_path = "/home/jialian/stuGo/tianshou/stuGo/checkpoints/" - # board_file = np.genfromtxt("/home/jialian/stuGo/tianshou/leela-zero/src/mcts_nn_files/board_" + call_number, - # dtype='str'); - # human_board = np.zeros((17, 19, 19)) - # - # # TODO : is it ok to ignore the last channel? 
- # for i in range(17): - # human_board[i] = np.array(list(board_file[i])).reshape(19, 19) - # # print("============================") - # # print("human board sum : " + str(np.sum(human_board[-1]))) - # # print("============================") - # # print(human_board) - # # print("============================") - # # rint(human_board) - # feed_board = human_board.transpose(1, 2, 0).reshape(1, 19, 19, 17) - # # print(feed_board[:,:,:,-1]) - # # print(feed_board.shape) - # - # # npz_board = np.load("/home/yama/rl/tianshou/AlphaGo/data/7f83928932f64a79bc1efdea268698ae.npz") - # # print(npz_board["boards"].shape) - # # feed_board = npz_board["boards"][10].reshape(-1, 19, 19, 17) - # ##print(feed_board) - # # show_board = feed_board[0].transpose(2, 0, 1) - # # print("board shape : ", show_board.shape) - # # print(show_board) - # - # itflag = False - # with multi_gpu.create_session() as sess: - # sess.run(tf.global_variables_initializer()) - # ckpt_file = tf.train.latest_checkpoint(checkpoint_path) - # if ckpt_file is not None: - # # print('Restoring model from {}...'.format(ckpt_file)) - # saver.restore(sess, ckpt_file) - # else: - # raise ValueError("No model loaded") - # res = sess.run([tf.nn.softmax(p), v], feed_dict={x: feed_board, is_training: itflag}) - # # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][300].reshape(-1, 19, 19, 17), is_training:False}) - # # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][50].reshape(-1, 19, 19, 17), is_training:True}) - # # print(np.argmax(res[0])) - # np.savetxt(sys.stdout, res[0][0], fmt="%.6f", newline=" ") - # np.savetxt(sys.stdout, res[1][0], fmt="%.6f", newline=" ") - # pv_file = "/home/jialian/stuGotianshou/leela-zero/src/mcts_nn_files/policy_value" - # np.savetxt(pv_file, np.concatenate((res[0][0], res[1][0])), fmt="%.6f", newline=" ") - # # np.savetxt(pv_file, res[1][0], fmt="%.6f", newline=" ") - # return res - - def forward(self): - checkpoint_path = "/home/tongzheng/tianshou/AlphaGo/checkpoints/" - sess = multi_gpu.create_session() - sess.run(tf.global_variables_initializer()) - ckpt_file = tf.train.latest_checkpoint(checkpoint_path) - if ckpt_file is not None: - print('Restoring model from {}...'.format(ckpt_file)) - self.saver.restore(sess, ckpt_file) - print('Successfully loaded') - else: - raise ValueError("No model loaded") - # prior, value = sess.run([tf.nn.softmax(p), v], feed_dict={x: state, is_training: False}) - # return prior, value - return sess - - -if __name__ == '__main__': - state = np.random.randint(0, 1, [1, 19, 19, 17]) - net = Network() - sess = net.forward() - start = time.time() - for i in range(100): - sess.run([tf.nn.softmax(net.p), net.v], feed_dict={net.x: state, net.is_training: False}) - print("Step {}, Cumulative time {}".format(i, time.time() - start)) diff --git a/AlphaGo/Network_ori.py b/AlphaGo/Network_ori.py deleted file mode 100644 index 9d33bb9..0000000 --- a/AlphaGo/Network_ori.py +++ /dev/null @@ -1,175 +0,0 @@ -import os -import time -import gc - -import numpy as np -import tensorflow as tf -import tensorflow.contrib.layers as layers - -import multi_gpu - -os.environ["CUDA_VISIBLE_DEVICES"] = "1" - - -def residual_block(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = 
layers.conv2d(h, 256, kernel_size=3, stride=1, activation_fn=tf.identity, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = h + input - return tf.nn.relu(h) - - -def policy_heads(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.flatten(h) - h = layers.fully_connected(h, 362, activation_fn=tf.identity, weights_regularizer=layers.l2_regularizer(1e-4)) - return h - - -def value_heads(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.flatten(h) - h = layers.fully_connected(h, 256, activation_fn=tf.nn.relu, weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.fully_connected(h, 1, activation_fn=tf.nn.tanh, weights_regularizer=layers.l2_regularizer(1e-4)) - return h - - -x = tf.placeholder(tf.float32, shape=[None, 19, 19, 17]) -is_training = tf.placeholder(tf.bool, shape=[]) -z = tf.placeholder(tf.float32, shape=[None, 1]) -pi = tf.placeholder(tf.float32, shape=[None, 362]) - -h = layers.conv2d(x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, normalizer_fn=layers.batch_norm, - normalizer_params={'is_training': is_training, 'updates_collections': tf.GraphKeys.UPDATE_OPS}, - weights_regularizer=layers.l2_regularizer(1e-4)) -for i in range(19): - h = residual_block(h, is_training) -v = value_heads(h, is_training) -p = policy_heads(h, is_training) -# loss = tf.reduce_mean(tf.square(z-v)) - tf.multiply(pi, tf.log(tf.clip_by_value(tf.nn.softmax(p), 1e-8, tf.reduce_max(tf.nn.softmax(p))))) -value_loss = tf.reduce_mean(tf.square(z - v)) -policy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=pi, logits=p)) - -reg = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) -total_loss = value_loss + policy_loss + reg -# train_op = tf.train.MomentumOptimizer(1e-4, momentum=0.9, use_nesterov=True).minimize(total_loss) -update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) -with tf.control_dependencies(update_ops): - train_op = tf.train.RMSPropOptimizer(1e-4).minimize(total_loss) -var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) -saver = tf.train.Saver(max_to_keep=10, var_list=var_list) - - -def train(): - data_path = "/home/tongzheng/data/" - data_name = os.listdir("/home/tongzheng/data/") - epochs = 100 - batch_size = 128 - - result_path = "./checkpoints/" - with multi_gpu.create_session() as sess: - sess.run(tf.global_variables_initializer()) - ckpt_file = tf.train.latest_checkpoint(result_path) - if ckpt_file is not None: - print('Restoring model from {}...'.format(ckpt_file)) - saver.restore(sess, ckpt_file) - for epoch in range(epochs): - for name in data_name: - data = np.load(data_path + name) - boards = data["boards"] - wins = data["wins"] - ps = data["ps"] - print (boards.shape) - print (wins.shape) - print (ps.shape) - # batch_num = 1 - batch_num = boards.shape[0] // batch_size - index = np.arange(boards.shape[0]) - np.random.shuffle(index) - value_losses = [] - policy_losses = [] - regs = [] 
- time_train = -time.time() - for iter in range(batch_num): - lv, lp, r, _ = sess.run([value_loss, policy_loss, reg, train_op], - feed_dict={x: boards[ - index[iter * batch_size:(iter + 1) * batch_size]], - z: wins[index[ - iter * batch_size:(iter + 1) * batch_size]], - pi: ps[index[ - iter * batch_size:(iter + 1) * batch_size]], - is_training: True}) - value_losses.append(lv) - policy_losses.append(lp) - regs.append(r) - del lv, lp, r - if iter % 1 == 0: - print( - "Epoch: {}, Part {}, Iteration: {}, Time: {}, Value Loss: {}, Policy Loss: {}, Reg: {}".format( - epoch, name, iter, time.time() + time_train, np.mean(np.array(value_losses)), - np.mean(np.array(policy_losses)), np.mean(np.array(regs)))) - del value_losses, policy_losses, regs, time_train - time_train = -time.time() - value_losses = [] - policy_losses = [] - regs = [] - if iter % 20 == 0: - save_path = "Epoch{}.Part{}.Iteration{}.ckpt".format(epoch, name, iter) - saver.save(sess, result_path + save_path) - del save_path - del data, boards, wins, ps, batch_num, index - gc.collect() - - -def forward(board): - result_path = "./checkpoints" - itflag = False - res = None - if board is None: - # data = np.load("/home/tongzheng/meta-data/80b7bf21bce14862806d48c3cd760a1b.npz") - data = np.load("./data/7f83928932f64a79bc1efdea268698ae.npz") - board = data["boards"][50].reshape(-1, 19, 19, 17) - human_board = board[0].transpose(2, 0, 1) - print("============================") - print("human board sum : " + str(np.sum(human_board))) - print("============================") - print(board[:, :, :, -1]) - itflag = False - with multi_gpu.create_session() as sess: - sess.run(tf.global_variables_initializer()) - ckpt_file = tf.train.latest_checkpoint(result_path) - if ckpt_file is not None: - print('Restoring model from {}...'.format(ckpt_file)) - saver.restore(sess, ckpt_file) - else: - raise ValueError("No model loaded") - res = sess.run([tf.nn.softmax(p), v], feed_dict={x: board, is_training: itflag}) - # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][300].reshape(-1, 19, 19, 17), is_training:False}) - # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][50].reshape(-1, 19, 19, 17), is_training:True}) - # print(np.argmax(res[0])) - print(res) - print(data["p"][0]) - print(np.argmax(res[0])) - print(np.argmax(data["p"][0])) - # print(res[0].tolist()[0]) - # print(np.argmax(res[0])) - return res - - -if __name__ == '__main__': - # train() - # if sys.argv[1] == "test": - forward(None) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index aee8d3a..37b7878 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -11,7 +11,7 @@ import tensorflow as tf import numpy as np import sys, os import go -import network_small +import model from collections import deque sys.path.append(os.path.join(os.path.dirname(__file__), os.path.pardir)) from tianshou.core.mcts.mcts import MCTS @@ -31,10 +31,9 @@ class Game: self.latest_boards = deque(maxlen=8) for _ in range(8): self.latest_boards.append(self.board) - self.net = network_small.Network() - self.sess = self.net.forward(checkpoint_path) - self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], - feed_dict={self.net.x: state, self.net.is_training: False}) + self.evaluator = model.ResNet(self.size, self.size**2 + 1, history_length=8) + # self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], + # feed_dict={self.net.x: state, self.net.is_training: False}) self.game_engine = go.Go(game=self) def _flatten(self, 
vertex): @@ -75,7 +74,8 @@ class Game: self.game_engine.simulate_latest_boards = copy.copy(latest_boards) self.game_engine.simulate_board = copy.copy(latest_boards[-1]) nn_input = self.generate_nn_input(self.game_engine.simulate_latest_boards, color) - mcts = MCTS(self.game_engine, self.evaluator, nn_input, self.size ** 2 + 1, inverse=True, max_step=1) + mcts = MCTS(self.game_engine, self.evaluator, [self.game_engine.simulate_latest_boards, color], self.size ** 2 + 1, inverse=True) + mcts.search(max_step=1) temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) choice = np.random.choice(self.size ** 2 + 1, 1, p=prob).tolist()[0] @@ -93,7 +93,7 @@ class Game: return res def think_play_move(self, color): - # although we dont need to return self.prob, however it is needed for neural network training + # although we don't need to return self.prob, however it is needed for neural network training move, self.prob = self.think(self.latest_boards, color) # play the move immediately self.play_move(color, move) @@ -122,6 +122,7 @@ class Game: if __name__ == "__main__": g = Game() g.show_board() + g.think_play_move(1) #file = open("debug.txt", "a") #file.write("mcts check\n") #file.close() diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 10ce7e1..335ee39 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -17,8 +17,6 @@ CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] class Go: def __init__(self, **kwargs): self.game = kwargs['game'] - self.simulate_board = [utils.EMPTY] * (self.game.size ** 2) - self.simulate_latest_boards = deque(maxlen=8) def _in_board(self, vertex): x, y = vertex @@ -125,18 +123,12 @@ class Go: return False return True - def _sa2cv(self, state, action): - # State is the play board, the shape is [1, self.game.size, self.game.size, 17], action is an index. 
- # We need to transfer the (state, action) pair into (color, vertex) pair to simulate the move - if state[0, 0, 0, -1] == utils.BLACK: - color = utils.BLACK - else: - color = utils.WHITE + def _action2vertex(self, action): if action == self.game.size ** 2: vertex = (0, 0) else: vertex = self.game._deflatten(action) - return color, vertex + return vertex def _is_valid(self, history_boards, current_board, color, vertex): ### in board @@ -157,14 +149,10 @@ class Go: return True - def simulate_is_valid(self, history_boards, current_board, state, action): - # initialize simulate_latest_boards and simulate_board from state - self.simulate_latest_boards.clear() - for i in range(8): - self.simulate_latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) - self.simulate_board = copy.copy(self.simulate_latest_boards[-1]) - - color, vertex = self._sa2cv(state, action) + def simulate_is_valid(self, state, action): + history_boards, color = state + vertex = self._action2vertex(action) + current_board = history_boards[-1] if not self._is_valid(history_boards, current_board, color, vertex): return False @@ -174,30 +162,22 @@ class Go: return True - def _do_move(self, color, vertex): + def _do_move(self, board, color, vertex): if vertex == utils.PASS: - return True - - id_ = self.game._flatten(vertex) - if self.simulate_board[id_] == utils.EMPTY: - self.simulate_board[id_] = color - return True + return board else: - return False + id_ = self.game._flatten(vertex) + board[id_] = color + return board def simulate_step_forward(self, state, action): # initialize the simulate_board from state - self.simulate_board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() - - color, vertex = self._sa2cv(state, action) - - self._do_move(color, vertex) - new_state = np.concatenate( - [state[:, :, :, 1:8], (np.array(self.simulate_board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), - state[:, :, :, 9:16], (np.array(self.simulate_board) == utils.WHITE).reshape(1, self.game.size, self.game.size, 1), - np.array(1 - state[:, :, :, -1]).reshape(1, self.game.size, self.game.size, 1)], - axis=3) - return new_state, 0 + history_boards, color = state + vertex = self._action2vertex(action) + new_board = self._do_move(copy.copy(history_boards[-1]), color, vertex) + history_boards.append(new_board) + new_color = -color + return [history_boards, new_color], 0 def executor_do_move(self, color, vertex): if not self._is_valid(self.game.history, self.game.board, color, vertex): @@ -239,7 +219,7 @@ class Go: start_vertex_x += x_diff start_vertex_y += y_diff - def _predict_from_nearby(self, vertex, neighbor_step = 3): + def _predict_from_nearby(self, vertex, neighbor_step=3): ''' step: the nearby 3 steps is considered :vertex: position to be estimated @@ -261,7 +241,7 @@ class Go: elif color_estimate < 0: return utils.WHITE - def executor_get_score(self, is_unknown_estimation = False): + def executor_get_score(self, is_unknown_estimation=False): ''' is_unknown_estimation: whether use nearby stone to predict the unknown return score from BLACK perspective. 
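The go.py refactor above changes the simulator's notion of a state: instead of the stacked [1, 19, 19, 17] feature tensor, simulate_is_valid and simulate_step_forward now take a [history_boards, color] pair, with actions still given as flattened indices and board_size ** 2 meaning "pass". The toy sketch below is illustration only, not part of the patch; step_forward and the constants are stand-ins on a 3x3 board, mirroring the convention used in go.py and game.py.

import copy
from collections import deque

BLACK, WHITE, EMPTY = +1, -1, 0

def step_forward(state, action, board_size=3):
    # mimics Go.simulate_step_forward: play on a copy of the latest board,
    # append it to the history, and hand the move to the opponent
    history_boards, color = state
    new_board = copy.copy(history_boards[-1])
    if action != board_size ** 2:            # board_size ** 2 encodes "pass"
        new_board[action] = color
    history_boards.append(new_board)         # a deque(maxlen=8) keeps only the latest 8 boards
    return [history_boards, -color], 0       # reward placeholder, as in the patch

history = deque([[EMPTY] * 9 for _ in range(8)], maxlen=8)
state, _ = step_forward([history, BLACK], 4)     # black plays the centre of the 3x3 toy board
assert state[1] == WHITE and state[0][-1][4] == BLACK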
diff --git a/AlphaGo/model.py b/AlphaGo/model.py
new file mode 100644
index 0000000..725dbd2
--- /dev/null
+++ b/AlphaGo/model.py
@@ -0,0 +1,170 @@
+import os
+import time
+import sys
+
+import numpy as np
+import tensorflow as tf
+import tensorflow.contrib.layers as layers
+
+import multi_gpu
+
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+
+
+def residual_block(input, is_training):
+    """
+    one residual block
+
+    :param input: a tensor, input of the residual block
+    :param is_training: a placeholder, indicating whether the model is in training mode
+    :return: a tensor, output of the residual block
+    """
+    normalizer_params = {'is_training': is_training,
+                         'updates_collections': tf.GraphKeys.UPDATE_OPS}
+    h = layers.conv2d(input, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu,
+                      normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params,
+                      weights_regularizer=layers.l2_regularizer(1e-4))
+    h = layers.conv2d(h, 256, kernel_size=3, stride=1, activation_fn=tf.identity,
+                      normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params,
+                      weights_regularizer=layers.l2_regularizer(1e-4))
+    h = h + input
+    return tf.nn.relu(h)
+
+
+def policy_head(input, is_training, action_num):
+    """
+    the head of the policy branch
+
+    :param input: a tensor, input of the policy head
+    :param is_training: a placeholder, indicating whether the model is in training mode
+    :param action_num: an integer, the number of unique actions at any state
+    :return: a tensor, output of the policy head, shape [batch_size, action_num]
+    """
+    normalizer_params = {'is_training': is_training,
+                         'updates_collections': tf.GraphKeys.UPDATE_OPS}
+    h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu,
+                      normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params,
+                      weights_regularizer=layers.l2_regularizer(1e-4))
+    h = layers.flatten(h)
+    h = layers.fully_connected(h, action_num, activation_fn=tf.identity,
+                               weights_regularizer=layers.l2_regularizer(1e-4))
+    return h
+
+
+def value_head(input, is_training):
+    """
+    the head of the value branch
+
+    :param input: a tensor, input of the value head
+    :param is_training: a placeholder, indicating whether the model is in training mode
+    :return: a tensor, output of the value head, shape [batch_size, 1]
+    """
+    normalizer_params = {'is_training': is_training,
+                         'updates_collections': tf.GraphKeys.UPDATE_OPS}
+    h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu,
+                      normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params,
+                      weights_regularizer=layers.l2_regularizer(1e-4))
+    h = layers.flatten(h)
+    h = layers.fully_connected(h, 256, activation_fn=tf.nn.relu, weights_regularizer=layers.l2_regularizer(1e-4))
+    h = layers.fully_connected(h, 1, activation_fn=tf.nn.tanh, weights_regularizer=layers.l2_regularizer(1e-4))
+    return h
+
+
+class ResNet(object):
+    def __init__(self, board_size, action_num, history_length=1, residual_block_num=20, checkpoint_path=None):
+        """
+        the ResNet model
+
+        :param board_size: an integer, the board size
+        :param action_num: an integer, the number of unique actions at any state
+        :param history_length: an integer, the history length to use, default is 1
+        :param residual_block_num: an integer, the number of residual blocks, default is 20, at least 1
+        :param checkpoint_path: a string, the path to the checkpoint, default is None
+        """
+        self.board_size = board_size
+        self.action_num = action_num
+        self.history_length = history_length
+        self.x = tf.placeholder(tf.float32,
+                                shape=[None, self.board_size, self.board_size, 2 * self.history_length + 1])
+        self.is_training = tf.placeholder(tf.bool, shape=[])
+        self.z = tf.placeholder(tf.float32, shape=[None, 1])
+        self.pi = tf.placeholder(tf.float32, shape=[None, self.action_num])
+        self._build_network(residual_block_num, checkpoint_path)
+
+    def _build_network(self, residual_block_num, checkpoint_path):
+        """
+        build the network
+
+        :param residual_block_num: an integer, the number of residual blocks
+        :param checkpoint_path: a string, the path to the checkpoint; if None, use randomly initialized parameters
+        :return: None
+        """
+
+        h = layers.conv2d(self.x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu,
+                          normalizer_fn=layers.batch_norm,
+                          normalizer_params={'is_training': self.is_training,
+                                             'updates_collections': tf.GraphKeys.UPDATE_OPS},
+                          weights_regularizer=layers.l2_regularizer(1e-4))
+        for i in range(residual_block_num - 1):
+            h = residual_block(h, self.is_training)
+        self.v = value_head(h, self.is_training)
+        self.p = policy_head(h, self.is_training, self.action_num)
+        self.value_loss = tf.reduce_mean(tf.square(self.z - self.v))
+        self.policy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.pi, logits=self.p))
+
+        self.reg = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+        self.total_loss = self.value_loss + self.policy_loss + self.reg
+        self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
+        with tf.control_dependencies(self.update_ops):
+            self.train_op = tf.train.AdamOptimizer(1e-4).minimize(self.total_loss)
+        self.var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
+        self.saver = tf.train.Saver(max_to_keep=10, var_list=self.var_list)
+        self.sess = multi_gpu.create_session()
+        self.sess.run(tf.global_variables_initializer())
+        if checkpoint_path is not None:
+            ckpt_file = tf.train.latest_checkpoint(checkpoint_path)
+            if ckpt_file is not None:
+                print('Restoring model from {}...'.format(ckpt_file))
+                self.saver.restore(self.sess, ckpt_file)
+                print('Successfully loaded')
+            else:
+                raise ValueError("No model in path {}".format(checkpoint_path))
+
+    def __call__(self, state):
+        """
+        evaluate the given state
+
+        :param state: a list of [history, color]; history is a list of boards, color is +1 (BLACK) or -1 (WHITE)
+        :return: a list of numpy arrays, the predicted policy (softmax probabilities) and value for the state
+        """
+        history, color = state
+        if len(history) != self.history_length:
+            raise ValueError(
+                'History length does not match the model: got {}, expected {}'.format(
+                    len(history), self.history_length))
+        state = self._history2state(history, color)
+        return self.sess.run([tf.nn.softmax(self.p), self.v], feed_dict={self.x: state, self.is_training: False})
+
+    def _history2state(self, history, color):
+        """
+        convert the history and color to the state the network expects
+
+        :param history: a list of boards, the most recent history_length boards
+        :param color: an integer, +1 for BLACK and -1 for WHITE
+        :return: an ndarray, the state, shape [1, board_size, board_size, 2 * history_length + 1]
+        """
+        state = np.zeros([1, self.board_size, self.board_size, 2 * self.history_length + 1])
+        for i in range(self.history_length):
+            state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(self.board_size ** 2)).reshape(
+                self.board_size, self.board_size)
+            state[0, :, :, i + self.history_length] = np.array(
+                np.array(history[i]) == -np.ones(self.board_size ** 2)).reshape(self.board_size, self.board_size)
+        # TODO: need a config to specify the BLACK and WHITE
+        if color == +1:
+            state[0, :, :, 2 * self.history_length] = np.ones([self.board_size, self.board_size])
+        if color == -1:
+            state[0, :, :, 2 * self.history_length] = np.zeros([self.board_size, self.board_size])
+        return state
+
+    # TODO: design the interface between the environment and training
+    def train(self, mode='memory', *args, **kwargs):
+        pass
\ No newline at end of file
diff --git a/AlphaGo/network_small.py b/AlphaGo/network.py
similarity index 100%
rename from AlphaGo/network_small.py
rename to AlphaGo/network.py
diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py
index 12fc85d..fac00fb 100644
--- a/tianshou/core/mcts/mcts.py
+++ b/tianshou/core/mcts/mcts.py
@@ -72,11 +72,9 @@ class UCTNode(MCTSNode):
 
     def valid_mask(self, simulator):
         if self.mask is None:
-            start_time = time.time()
             self.mask = []
             for act in range(self.action_num - 1):
-                if not simulator.simulate_is_valid(
-                        simulator.simulate_latest_boards, simulator.simulate_board, self.state, act):
+                if not simulator.simulate_is_valid(self.state, act):
                     self.mask.append(act)
                     self.ucb[act] = -float("Inf")
                 else:
@@ -144,8 +142,7 @@ class ActionNode(object):
 
 
 class MCTS(object):
-    def __init__(self, simulator, evaluator, root, action_num, method="UCT", inverse=False, max_step=None,
-                 max_time=None):
+    def __init__(self, simulator, evaluator, root, action_num, method="UCT", inverse=False):
         self.simulator = simulator
         self.evaluator = evaluator
         prior, _ = self.evaluator(root)
@@ -153,33 +150,26 @@ class MCTS(object):
         if method == "":
             self.root = root
         if method == "UCT":
-            self.root = UCTNode(None, None, root, action_num, prior, inverse)
+            self.root = UCTNode(None, None, root, action_num, prior, inverse=inverse)
         if method == "TS":
             self.root = TSNode(None, None, root, action_num, prior, inverse=inverse)
         self.inverse = inverse
-        if max_step is not None:
-            self.step = 0
-            self.max_step = max_step
-            # TODO: Optimize the stop criteria
-        # else:
-        #     self.max_step = 0
-        if max_time is not None:
-            self.start_time = time.time()
-            self.max_time = max_time
+
+    def search(self, max_step=None, max_time=None):
         if max_step is None and max_time is None:
             raise ValueError("Need a stop criteria!")
-        # TODO: running mcts should be implemented in another function, e.g. def search(self, max_step, max_time)
-        self.select_time = []
-        self.evaluate_time = []
-        self.bp_time = []
-        while (max_step is not None and self.step < self.max_step or max_step is None) \
-                and (max_time is not None and time.time() - self.start_time < self.max_time or max_time is None):
-            self.expand()
-            if max_step is not None:
-                self.step += 1
+        step = 0
+        start_time = time.time()
+        if max_step is None:
+            max_step = float("inf")
+        if max_time is None:
+            max_time = float("inf")
+        while step < max_step and time.time() - start_time < max_time:
+            self._expand()
+            step += 1
 
-    def expand(self):
+    def _expand(self):
         node, new_action = self.root.selection(self.simulator)
         value = node.children[new_action].expansion(self.evaluator, self.action_num)
         node.children[new_action].backpropagation(value + 0.)
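
How the refactored pieces fit together, as a minimal usage sketch (not part of the patch): it assumes AlphaGo/ and tianshou/ are on the import path and mirrors Game.think, except that it picks the move greedily instead of sampling from the visit counts.

import copy
import numpy as np
from game import Game                       # AlphaGo/game.py
from tianshou.core.mcts.mcts import MCTS

game = Game()                               # builds go.Go and the model.ResNet evaluator internally
root = [copy.copy(game.latest_boards), +1]  # [history_boards, color]: a deque of the last 8 boards, black to move
mcts = MCTS(game.game_engine, game.evaluator, root, game.size ** 2 + 1, inverse=True)
mcts.search(max_step=100)                   # the search budget is now passed to search(), not to __init__

prob = mcts.root.N / np.sum(mcts.root.N)    # visit-count policy, as in Game.think
move = int(np.argmax(prob))                 # index game.size ** 2 encodes "pass"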