From 906ced84a3d588143e5b5d1f8177ed5f123e4dbe Mon Sep 17 00:00:00 2001
From: rtz19970824
Date: Fri, 8 Dec 2017 17:05:33 +0800
Subject: [PATCH] self play

---
 AlphaGo/game.py          |   7 ++-
 AlphaGo/network_small.py | 103 +++++++++++++++++------------------
 AlphaGo/self-play.py     | 114 +++++++++++++++++++++++++++++----------
 AlphaGo/strategy.py      |  45 ++++++++--------
 AlphaGo/test.py          |  14 +++++
 bin/activate             |   1 +
 bin/conda                |   1 +
 bin/deactivate           |   1 +
 8 files changed, 182 insertions(+), 104 deletions(-)
 create mode 100644 AlphaGo/test.py
 create mode 120000 bin/activate
 create mode 120000 bin/conda
 create mode 120000 bin/deactivate

diff --git a/AlphaGo/game.py b/AlphaGo/game.py
index 4db1007..941401e 100644
--- a/AlphaGo/game.py
+++ b/AlphaGo/game.py
@@ -190,7 +190,7 @@ class Game:
         self.executor = Executor(game=self)
         self.history = []
         self.past = deque(maxlen=8)
-        for i in range(8):
+        for _ in range(8):
             self.past.append(self.board)
 
     def _flatten(self, vertex):
@@ -205,6 +205,9 @@ class Game:
 
     def clear(self):
         self.board = [utils.EMPTY] * (self.size * self.size)
+        self.history = []
+        for _ in range(8):
+            self.past.append(self.board)
 
     def set_size(self, n):
         self.size = n
@@ -225,7 +228,7 @@ class Game:
     def gen_move(self, color):
         # move = self.strategy.gen_move(color)
         # return move
-        move = self.strategy.gen_move(self.past, color)
+        move, self.prob = self.strategy.gen_move(self.past, color)
         self.do_move(color, move)
         return move
 
diff --git a/AlphaGo/network_small.py b/AlphaGo/network_small.py
index 8dd5140..096aea6 100644
--- a/AlphaGo/network_small.py
+++ b/AlphaGo/network_small.py
@@ -59,11 +59,12 @@ class Network(object):
         self.build_network()
 
     def build_network(self):
-        h = layers.conv2d(self.x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, normalizer_fn=layers.batch_norm,
+        h = layers.conv2d(self.x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu,
+                          normalizer_fn=layers.batch_norm,
                           normalizer_params={'is_training': self.is_training,
                                              'updates_collections': tf.GraphKeys.UPDATE_OPS},
                           weights_regularizer=layers.l2_regularizer(1e-4))
-        for i in range(19):
+        for i in range(4):
            h = residual_block(h, self.is_training)
         self.v = value_heads(h, self.is_training)
         self.p = policy_heads(h, self.is_training)
@@ -115,9 +116,9 @@ class Network(object):
                              feed_dict={self.x: boards[
                                  index[iter * batch_size:(iter + 1) * batch_size]],
                                  self.z: wins[index[
-                                     iter * batch_size:(iter + 1) * batch_size]],
+                                          iter * batch_size:(iter + 1) * batch_size]],
                                  self.pi: ps[index[
-                                     iter * batch_size:(iter + 1) * batch_size]],
+                                          iter * batch_size:(iter + 1) * batch_size]],
                                  self.is_training: True})
             value_losses.append(lv)
             policy_losses.append(lp)
@@ -137,53 +138,53 @@ class Network(object):
 
         del data, boards, wins, ps
 
-    # def forward(call_number):
-    #     # checkpoint_path = "/home/yama/rl/tianshou/AlphaGo/checkpoints"
-    #     checkpoint_path = "/home/jialian/stuGo/tianshou/stuGo/checkpoints/"
-    #     board_file = np.genfromtxt("/home/jialian/stuGo/tianshou/leela-zero/src/mcts_nn_files/board_" + call_number,
-    #                                dtype='str');
-    #     human_board = np.zeros((17, 19, 19))
-    #
-    #     # TODO : is it ok to ignore the last channel?
-    #     for i in range(17):
-    #         human_board[i] = np.array(list(board_file[i])).reshape(19, 19)
-    #     # print("============================")
-    #     # print("human board sum : " + str(np.sum(human_board[-1])))
-    #     # print("============================")
-    #     # print(human_board)
-    #     # print("============================")
-    #     # rint(human_board)
-    #     feed_board = human_board.transpose(1, 2, 0).reshape(1, 19, 19, 17)
-    #     # print(feed_board[:,:,:,-1])
-    #     # print(feed_board.shape)
-    #
-    #     # npz_board = np.load("/home/yama/rl/tianshou/AlphaGo/data/7f83928932f64a79bc1efdea268698ae.npz")
-    #     # print(npz_board["boards"].shape)
-    #     # feed_board = npz_board["boards"][10].reshape(-1, 19, 19, 17)
-    #     ##print(feed_board)
-    #     # show_board = feed_board[0].transpose(2, 0, 1)
-    #     # print("board shape : ", show_board.shape)
-    #     # print(show_board)
-    #
-    #     itflag = False
-    #     with multi_gpu.create_session() as sess:
-    #         sess.run(tf.global_variables_initializer())
-    #         ckpt_file = tf.train.latest_checkpoint(checkpoint_path)
-    #         if ckpt_file is not None:
-    #             # print('Restoring model from {}...'.format(ckpt_file))
-    #             saver.restore(sess, ckpt_file)
-    #         else:
-    #             raise ValueError("No model loaded")
-    #         res = sess.run([tf.nn.softmax(p), v], feed_dict={x: feed_board, is_training: itflag})
-    #         # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][300].reshape(-1, 19, 19, 17), is_training:False})
-    #         # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][50].reshape(-1, 19, 19, 17), is_training:True})
-    #         # print(np.argmax(res[0]))
-    #         np.savetxt(sys.stdout, res[0][0], fmt="%.6f", newline=" ")
-    #         np.savetxt(sys.stdout, res[1][0], fmt="%.6f", newline=" ")
-    #         pv_file = "/home/jialian/stuGotianshou/leela-zero/src/mcts_nn_files/policy_value"
-    #         np.savetxt(pv_file, np.concatenate((res[0][0], res[1][0])), fmt="%.6f", newline=" ")
-    #         # np.savetxt(pv_file, res[1][0], fmt="%.6f", newline=" ")
-    #         return res
+    # def forward(call_number):
+    #     # checkpoint_path = "/home/yama/rl/tianshou/AlphaGo/checkpoints"
+    #     checkpoint_path = "/home/jialian/stuGo/tianshou/stuGo/checkpoints/"
+    #     board_file = np.genfromtxt("/home/jialian/stuGo/tianshou/leela-zero/src/mcts_nn_files/board_" + call_number,
+    #                                dtype='str');
+    #     human_board = np.zeros((17, 19, 19))
+    #
+    #     # TODO : is it ok to ignore the last channel?
+    #     for i in range(17):
+    #         human_board[i] = np.array(list(board_file[i])).reshape(19, 19)
+    #     # print("============================")
+    #     # print("human board sum : " + str(np.sum(human_board[-1])))
+    #     # print("============================")
+    #     # print(human_board)
+    #     # print("============================")
+    #     # rint(human_board)
+    #     feed_board = human_board.transpose(1, 2, 0).reshape(1, 19, 19, 17)
+    #     # print(feed_board[:,:,:,-1])
+    #     # print(feed_board.shape)
+    #
+    #     # npz_board = np.load("/home/yama/rl/tianshou/AlphaGo/data/7f83928932f64a79bc1efdea268698ae.npz")
+    #     # print(npz_board["boards"].shape)
+    #     # feed_board = npz_board["boards"][10].reshape(-1, 19, 19, 17)
+    #     ##print(feed_board)
+    #     # show_board = feed_board[0].transpose(2, 0, 1)
+    #     # print("board shape : ", show_board.shape)
+    #     # print(show_board)
+    #
+    #     itflag = False
+    #     with multi_gpu.create_session() as sess:
+    #         sess.run(tf.global_variables_initializer())
+    #         ckpt_file = tf.train.latest_checkpoint(checkpoint_path)
+    #         if ckpt_file is not None:
+    #             # print('Restoring model from {}...'.format(ckpt_file))
+    #             saver.restore(sess, ckpt_file)
+    #         else:
+    #             raise ValueError("No model loaded")
+    #         res = sess.run([tf.nn.softmax(p), v], feed_dict={x: feed_board, is_training: itflag})
+    #         # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][300].reshape(-1, 19, 19, 17), is_training:False})
+    #         # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][50].reshape(-1, 19, 19, 17), is_training:True})
+    #         # print(np.argmax(res[0]))
+    #         np.savetxt(sys.stdout, res[0][0], fmt="%.6f", newline=" ")
+    #         np.savetxt(sys.stdout, res[1][0], fmt="%.6f", newline=" ")
+    #         pv_file = "/home/jialian/stuGotianshou/leela-zero/src/mcts_nn_files/policy_value"
+    #         np.savetxt(pv_file, np.concatenate((res[0][0], res[1][0])), fmt="%.6f", newline=" ")
+    #         # np.savetxt(pv_file, res[1][0], fmt="%.6f", newline=" ")
+    #         return res
 
     def forward(self):
         # checkpoint_path = "/home/tongzheng/tianshou/AlphaGo/checkpoints/"
diff --git a/AlphaGo/self-play.py b/AlphaGo/self-play.py
index 2336e72..20e51c8 100644
--- a/AlphaGo/self-play.py
+++ b/AlphaGo/self-play.py
@@ -1,40 +1,98 @@
 from game import Game
 from engine import GTPEngine
 import re
+import numpy as np
+from collections import deque
+import utils
+import argparse
 
-g = Game()
+parser = argparse.ArgumentParser()
+parser.add_argument('--result_path', type=str, default='./part1')
+args = parser.parse_args()
+
+game = Game()
+engine = GTPEngine(game_obj=game)
+history = deque(maxlen=8)
+for i in range(8):
+    history.append(game.board)
+state = []
+prob = []
+winner = []
 pattern = "[A-Z]{1}[0-9]{1}"
+game.show_board()
+
+
+def history2state(history, color):
+    state = np.zeros([1, game.size, game.size, 17])
+    for i in range(8):
+        state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(game.size ** 2)).reshape(game.size, game.size)
+        state[0, :, :, i + 8] = np.array(np.array(history[i]) == -np.ones(game.size ** 2)).reshape(game.size, game.size)
+    if color == utils.BLACK:
+        state[0, :, :, 16] = np.ones([game.size, game.size])
+    if color == utils.WHITE:
+        state[0, :, :, 16] = np.zeros([game.size, game.size])
+    return state
 
-g.show_board()
-e = GTPEngine(game_obj=g)
 num = 0
+game_num = 0
 black_pass = False
 white_pass = False
-while not (black_pass and white_pass):
-    if num % 2 == 0:
-        res = e.run_cmd(str(num) + " genmove BLACK")
-        num += 1
-        # print(res)
-        match = re.search(pattern, res)
-        if match is not None:
-            print(match.group())
+while True:
+    while not (black_pass and white_pass) and num < game.size ** 2 * 2:
+        if num % 2 == 0:
+            color = utils.BLACK
+            new_state = history2state(history, color)
+            state.append(new_state)
+            result = engine.run_cmd(str(num) + " genmove BLACK")
+            num += 1
+            match = re.search(pattern, result)
+            if match is not None:
+                print(match.group())
+            else:
+                print("pass")
+            if re.search("pass", result) is not None:
+                black_pass = True
+            else:
+                black_pass = False
         else:
-            print("pass")
-        if re.search("pass", res) is not None:
-            black_pass = True
-        else:
-            black_pass = False
+            color = utils.WHITE
+            new_state = history2state(history, color)
+            state.append(new_state)
+            result = engine.run_cmd(str(num) + " genmove WHITE")
+            num += 1
+            match = re.search(pattern, result)
+            if match is not None:
+                print(match.group())
+            else:
+                print("pass")
+            if re.search("pass", result) is not None:
+                white_pass = True
+            else:
+                white_pass = False
+        game.show_board()
+        prob.append(np.array(game.prob).reshape(-1, game.size ** 2 + 1))
+    print("Finished")
+    score = game.executor.get_score()
+    if score > 0:
+        winner = utils.BLACK
     else:
-        res = e.run_cmd(str(num) + " genmove WHITE")
-        num += 1
-        match = re.search(pattern, res)
-        if match is not None:
-            print(match.group())
-        else:
-            print("pass")
-        if re.search("pass", res) is not None:
-            white_pass = True
-        else:
-            white_pass = False
-    g.show_board()
+        winner = utils.WHITE
+    state = np.concatenate(state, axis=0)
+    prob = np.concatenate(prob, axis=0)
+    winner = np.ones([num, 1]) * winner
+    assert state.shape[0] == prob.shape[0]
+    assert state.shape[0] == winner.shape[0]
+    np.savez(args.result_path + "/game" + str(game_num), state=state, prob=prob, winner=winner)
+    state = []
+    prob = []
+    winner = []
+    num = 0
+    black_pass = False
+    white_pass = False
+    engine.run_cmd(str(num) + " clear_board")
+    history.clear()
+    for _ in range(8):
+        history.append(game.board)
+    game.show_board()
+    game_num += 1
diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py
index dc328d5..590edb3 100644
--- a/AlphaGo/strategy.py
+++ b/AlphaGo/strategy.py
@@ -198,28 +198,27 @@ class GoEnv:
         id_ = self._flatten(vertex)
         if self.board[id_] == utils.EMPTY:
             self.board[id_] = color
-            self.history.append(copy.copy(self.board))
             return True
         else:
             return False
 
     def step_forward(self, state, action):
         if state[0, 0, 0, -1] == 1:
-            color = 1
+            color = utils.BLACK
         else:
-            color = -1
-        if action == 81:
-            vertex = (0, 0)
+            color = utils.WHITE
+        if action == self.size ** 2:
+            vertex = utils.PASS
         else:
-            vertex = (action % 9 + 1, action / 9 + 1)
+            vertex = (action % self.size + 1, action / self.size + 1)
         # print(vertex)
         # print(self.board)
         self.board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist()
         self.do_move(color, vertex)
         new_state = np.concatenate(
-            [state[:, :, :, 1:8], (np.array(self.board) == 1).reshape(1, 9, 9, 1),
-             state[:, :, :, 9:16], (np.array(self.board) == -1).reshape(1, 9, 9, 1),
-             np.array(1 - state[:, :, :, -1]).reshape(1, 9, 9, 1)],
+            [state[:, :, :, 1:8], (np.array(self.board) == utils.BLACK).reshape(1, self.size, self.size, 1),
+             state[:, :, :, 9:16], (np.array(self.board) == utils.WHITE).reshape(1, self.size, self.size, 1),
+             np.array(1 - state[:, :, :, -1]).reshape(1, self.size, self.size, 1)],
             axis=3)
         return new_state, 0
 
@@ -233,26 +232,26 @@ class strategy(object):
                             feed_dict={self.net.x: state, self.net.is_training: False})
 
     def data_process(self, history, color):
-        state = np.zeros([1, 9, 9, 17])
+        state = np.zeros([1, self.simulator.size, self.simulator.size, 17])
         for i in range(8):
-            state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(81)).reshape(9, 9)
-            state[0, :, :, i + 8] = np.array(np.array(history[i]) == -np.ones(81)).reshape(9, 9)
-        if color == 1:
-            state[0, :, :, 16] = np.ones([9, 9])
-        if color == -1:
-            state[0, :, :, 16] = np.zeros([9, 9])
+            state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(self.simulator.size ** 2)).reshape(self.simulator.size, self.simulator.size)
+            state[0, :, :, i + 8] = np.array(np.array(history[i]) == -np.ones(self.simulator.size ** 2)).reshape(self.simulator.size, self.simulator.size)
+        if color == utils.BLACK:
+            state[0, :, :, 16] = np.ones([self.simulator.size, self.simulator.size])
+        if color == utils.WHITE:
+            state[0, :, :, 16] = np.zeros([self.simulator.size, self.simulator.size])
         return state
 
     def gen_move(self, history, color):
         self.simulator.history = copy.copy(history)
         self.simulator.board = copy.copy(history[-1])
         state = self.data_process(self.simulator.history, color)
-        mcts = MCTS(self.simulator, self.evaluator, state, 82, inverse=True, max_step=10)
+        mcts = MCTS(self.simulator, self.evaluator, state, self.simulator.size ** 2 + 1, inverse=True, max_step=100)
         temp = 1
-        p = mcts.root.N ** temp / np.sum(mcts.root.N ** temp)
-        choice = np.random.choice(82, 1, p=p).tolist()[0]
-        if choice == 81:
-            move = (0, 0)
+        prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp)
+        choice = np.random.choice(self.simulator.size ** 2 + 1, 1, p=prob).tolist()[0]
+        if choice == self.simulator.size ** 2:
+            move = utils.PASS
         else:
-            move = (choice % 9 + 1, choice / 9 + 1)
-        return move
+            move = (choice % self.simulator.size + 1, choice / self.simulator.size + 1)
+        return move, prob
diff --git a/AlphaGo/test.py b/AlphaGo/test.py
new file mode 100644
index 0000000..d9a0915
--- /dev/null
+++ b/AlphaGo/test.py
@@ -0,0 +1,14 @@
+import sys
+from game import Game
+from engine import GTPEngine
+# import utils
+
+game = Game()
+engine = GTPEngine(game_obj=game, name='tianshou')
+cmd = raw_input
+
+while not engine.disconnect:
+    command = cmd()
+    result = engine.run_cmd(command)
+    sys.stdout.write(result)
+    sys.stdout.flush()
diff --git a/bin/activate b/bin/activate
new file mode 120000
index 0000000..1515f22
--- /dev/null
+++ b/bin/activate
@@ -0,0 +1 @@
+/home/tongzheng/anaconda2/bin/activate
\ No newline at end of file
diff --git a/bin/conda b/bin/conda
new file mode 120000
index 0000000..79ea700
--- /dev/null
+++ b/bin/conda
@@ -0,0 +1 @@
+/home/tongzheng/anaconda2/bin/conda
\ No newline at end of file
diff --git a/bin/deactivate b/bin/deactivate
new file mode 120000
index 0000000..02b42f9
--- /dev/null
+++ b/bin/deactivate
@@ -0,0 +1 @@
+/home/tongzheng/anaconda2/bin/deactivate
\ No newline at end of file
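The patch leaves the consumer side of the self-play output implicit, so the following is a minimal, illustrative sketch (not part of the patch) of how the per-game archives written by self-play.py could be read back for training. It assumes the default --result_path of './part1' and the state/prob/winner keys passed to np.savez above; the file glob and the concatenation are only a suggested convention.

# Sketch only: load the archives assumed to be written as part1/game<N>.npz.
import glob

import numpy as np

states, probs, winners = [], [], []
for path in sorted(glob.glob("./part1/game*.npz")):
    data = np.load(path)
    states.append(data["state"])    # [moves, size, size, 17] input planes
    probs.append(data["prob"])      # [moves, size ** 2 + 1] MCTS visit distribution
    winners.append(data["winner"])  # [moves, 1] winning colour repeated for each move

if states:
    states = np.concatenate(states, axis=0)
    probs = np.concatenate(probs, axis=0)
    winners = np.concatenate(winners, axis=0)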