diff --git a/AlphaGo/.gitignore b/AlphaGo/.gitignore index e578e5a..ff61326 100644 --- a/AlphaGo/.gitignore +++ b/AlphaGo/.gitignore @@ -2,3 +2,4 @@ data checkpoints checkpoints_origin *.log +*.txt diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index 5624a2f..b662dbd 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -198,5 +198,4 @@ class GTPEngine(): if __name__ == "main": - game = Game() - engine = GTPEngine(game_obj=game) + print ("test engine.py") diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 3a7959c..a962f5c 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -26,15 +26,15 @@ class Game: TODO : Maybe merge with the engine class in future, currently leave it untouched for interacting with Go UI. ''' - def __init__(self, name="reversi", role="unknown", debug=False, checkpoint_path=None): + def __init__(self, name=None, role=None, debug=False, checkpoint_path=None): self.name = name self.role = role self.debug = debug if self.name == "go": self.size = 9 self.komi = 3.75 - self.history = [] self.history_length = 8 + self.history = [] self.game_engine = go.Go(size=self.size, komi=self.komi, role=self.role) self.board = [utils.EMPTY] * (self.size ** 2) elif self.name == "reversi": @@ -119,10 +119,7 @@ class Game: sys.stdout.flush() if __name__ == "__main__": - g = Game("go") - print(g.board) - g.clear() - g.think_play_move(1) + print("test game.py") #file = open("debug.txt", "a") #file.write("mcts check\n") #file.close() diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 2731948..5777982 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -60,13 +60,14 @@ if __name__ == '__main__': black_role_name = 'black' + str(args.id) white_role_name = 'white' + str(args.id) + game_name = 'go' agent_v0 = subprocess.Popen( - ['python', '-u', 'player.py', '--role=' + black_role_name, + ['python', '-u', 'player.py', '--game=' + game_name, '--role=' + black_role_name, '--checkpoint_path=' + str(args.black_weight_path), '--debug=' + str(args.debug)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) agent_v1 = subprocess.Popen( - ['python', '-u', 'player.py', '--role=' + white_role_name, + ['python', '-u', 'player.py', '--game=' + game_name, '--role=' + white_role_name, '--checkpoint_path=' + str(args.black_weight_path), '--debug=' + str(args.debug)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) @@ -102,13 +103,13 @@ if __name__ == '__main__': pass_flag = [False, False] print("Start game {}".format(game_num)) # end the game if both palyer chose to pass, or play too much turns - while not (pass_flag[0] and pass_flag[1]) and num < size["reversi"] ** 2 * 2: + while not (pass_flag[0] and pass_flag[1]) and num < size[game_name] ** 2 * 2: turn = num % 2 board = player[turn].run_cmd(str(num) + ' show_board') board = eval(board[board.index('['):board.index(']') + 1]) - for i in range(size["reversi"]): - for j in range(size["reversi"]): - print show[board[i * size["reversi"] + j]] + " ", + for i in range(size[game_name]): + for j in range(size[game_name]): + print show[board[i * size[game_name] + j]] + " ", print "\n", data.boards.append(board) start_time = time.time() diff --git a/AlphaGo/player.py b/AlphaGo/player.py index 66a487f..a8f61c1 100644 --- a/AlphaGo/player.py +++ b/AlphaGo/player.py @@ -26,6 +26,7 @@ if __name__ == '__main__': parser.add_argument("--checkpoint_path", type=str, default=None) parser.add_argument("--role", type=str, default="unknown") parser.add_argument("--debug", type=str, default=False) + parser.add_argument("--game", type=str, default=False) args = parser.parse_args() if args.checkpoint_path == 'None': @@ -33,7 +34,7 @@ if __name__ == '__main__': debug = False if args.debug == "True": debug = True - game = Game(role=args.role, checkpoint_path=args.checkpoint_path, debug=debug) + game = Game(name=args.game, role=args.role, checkpoint_path=args.checkpoint_path, debug=debug) engine = GTPEngine(game_obj=game, name='tianshou', version=0) daemon = Pyro4.Daemon() # make a Pyro daemon diff --git a/AlphaGo/random_data.py b/AlphaGo/random_data.py deleted file mode 100644 index 5b53bd6..0000000 --- a/AlphaGo/random_data.py +++ /dev/null @@ -1,123 +0,0 @@ -import os -import numpy as np -import time - -size = 9 -path = "/raid/tongzheng/tianshou/AlphaGo/data/part1/" -save_path = "/raid/tongzheng/tianshou/AlphaGo/data/" -name = os.listdir(path) -print(len(name)) -batch_size = 128 -batch_num = 512 - -block_size = batch_size * batch_num -slots_num = 16 - - -class block(object): - def __init__(self, block_size, block_id): - self.boards = [] - self.wins = [] - self.ps = [] - self.block_size = block_size - self.block_id = block_id - - def concat(self, board, p, win): - board = board.reshape(-1, size, size, 17) - win = win.reshape(-1, 1) - p = p.reshape(-1, size ** 2 + 1) - self.boards.append(board) - self.wins.append(win) - self.ps.append(p) - - def isfull(self): - assert len(self.boards) == len(self.wins) - assert len(self.boards) == len(self.ps) - return len(self.boards) == self.block_size - - def save_and_reset(self, block_id): - self.boards = np.concatenate(self.boards, axis=0) - self.wins = np.concatenate(self.wins, axis=0) - self.ps = np.concatenate(self.ps, axis=0) - print ("Block {}, Boards shape {}, Wins Shape {}, Ps Shape {}".format(self.block_id, self.boards.shape[0], - self.wins.shape[0], self.ps.shape[0])) - np.savez(save_path + "block" + str(self.block_id), boards=self.boards, wins=self.wins, ps=self.ps) - self.boards = [] - self.wins = [] - self.ps = [] - self.block_id = block_id - - def store_num(self): - assert len(self.boards) == len(self.wins) - assert len(self.boards) == len(self.ps) - return len(self.boards) - - -def concat(block_list, board, win, p): - global index - seed = np.random.randint(slots_num) - block_list[seed].concat(board, win, p) - if block_list[seed].isfull(): - block_list[seed].save_and_reset(index) - index = index + 1 - - -block_list = [] -for index in range(slots_num): - block_list.append(block(block_size, index)) -index = index + 1 -for n in name: - data = np.load(path + n) - board = data["boards"] - win = data["win"] - p = data["p"] - print("Start {}".format(n)) - print("Shape {}".format(board.shape[0])) - start = -time.time() - for i in range(board.shape[0]): - board_ori = board[i].reshape(-1, size, size, 17) - win_ori = win[i].reshape(-1, 1) - p_ori = p[i].reshape(-1, size ** 2 + 1) - concat(block_list, board_ori, p_ori, win_ori) - - for t in range(1, 4): - board_aug = np.rot90(board_ori, t, (1, 2)) - p_aug = np.concatenate( - [np.rot90(p_ori[:, :-1].reshape(-1, size, size), t, (1, 2)).reshape(-1, size ** 2), p_ori[:, -1].reshape(-1, 1)], - axis=1) - concat(block_list, board_aug, p_aug, win_ori) - - board_aug = board_ori[:, ::-1] - p_aug = np.concatenate( - [p_ori[:, :-1].reshape(-1, size, size)[:, ::-1].reshape(-1, size ** 2), p_ori[:, -1].reshape(-1, 1)], - axis=1) - concat(block_list, board_aug, p_aug, win_ori) - - board_aug = board_ori[:, :, ::-1] - p_aug = np.concatenate( - [p_ori[:, :-1].reshape(-1, size, size)[:, :, ::-1].reshape(-1, size ** 2), p_ori[:, -1].reshape(-1, 1)], - axis=1) - concat(block_list, board_aug, p_aug, win_ori) - - board_aug = np.rot90(board_ori[:, ::-1], 1, (1, 2)) - p_aug = np.concatenate( - [np.rot90(p_ori[:, :-1].reshape(-1, size, size)[:, ::-1], 1, (1, 2)).reshape(-1, size ** 2), - p_ori[:, -1].reshape(-1, 1)], - axis=1) - concat(block_list, board_aug, p_aug, win_ori) - - board_aug = np.rot90(board_ori[:, :, ::-1], 1, (1, 2)) - p_aug = np.concatenate( - [np.rot90(p_ori[:, :-1].reshape(-1, size, size)[:, :, ::-1], 1, (1, 2)).reshape(-1, size ** 2), - p_ori[:, -1].reshape(-1, 1)], - axis=1) - concat(block_list, board_aug, p_aug, win_ori) - print ("Finished {} with time {}".format(n, time.time() + start)) - data_num = 0 - for i in range(slots_num): - print("Block {} ".format(block_list[i].block_id) + "Size {}".format(block_list[i].store_num())) - data_num = data_num + block_list[i].store_num() - print ("Total data {}".format(data_num)) - -for i in range(slots_num): - block_list[i].save_and_reset(block_list[i].block_id) diff --git a/AlphaGo/self-play.py b/AlphaGo/self-play.py deleted file mode 100644 index dd03b13..0000000 --- a/AlphaGo/self-play.py +++ /dev/null @@ -1,103 +0,0 @@ -from game import Game -from engine import GTPEngine -import re -import numpy as np -import os -from collections import deque -import utils -import argparse - -parser = argparse.ArgumentParser() -parser.add_argument('--result_path', type=str, default='./part1') -args = parser.parse_args() - -if not os.path.exists(args.result_path): - os.makedirs(args.result_path) - -game = Game() -engine = GTPEngine(game_obj=game) -history = deque(maxlen=8) -for i in range(8): - history.append(game.board) -state = [] -prob = [] -winner = [] -pattern = "[A-Z]{1}[0-9]{1}" -game.show_board() - - -def history2state(history, color): - state = np.zeros([1, game.size, game.size, 17]) - for i in range(8): - state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(game.size ** 2)).reshape(game.size, game.size) - state[0, :, :, i + 8] = np.array(np.array(history[i]) == -np.ones(game.size ** 2)).reshape(game.size, game.size) - if color == utils.BLACK: - state[0, :, :, 16] = np.ones([game.size, game.size]) - if color == utils.WHITE: - state[0, :, :, 16] = np.zeros([game.size, game.size]) - return state - - -num = 0 -game_num = 0 -black_pass = False -white_pass = False -while True: - print("Start game {}".format(game_num)) - while not (black_pass and white_pass) and num < game.size ** 2 * 2: - if num % 2 == 0: - color = utils.BLACK - new_state = history2state(history, color) - state.append(new_state) - result = engine.run_cmd(str(num) + " genmove BLACK") - num += 1 - match = re.search(pattern, result) - if match is not None: - print(match.group()) - else: - print("pass") - if re.search("pass", result) is not None: - black_pass = True - else: - black_pass = False - else: - color = utils.WHITE - new_state = history2state(history, color) - state.append(new_state) - result = engine.run_cmd(str(num) + " genmove WHITE") - num += 1 - match = re.search(pattern, result) - if match is not None: - print(match.group()) - else: - print("pass") - if re.search("pass", result) is not None: - white_pass = True - else: - white_pass = False - game.show_board() - prob.append(np.array(game.prob).reshape(-1, game.size ** 2 + 1)) - print("Finished") - print("\n") - score = game.game_engine.executor_get_score(game.board) - if score > 0: - winner = utils.BLACK - else: - winner = utils.WHITE - state = np.concatenate(state, axis=0) - prob = np.concatenate(prob, axis=0) - winner = np.ones([num, 1]) * winner - assert state.shape[0] == prob.shape[0] - assert state.shape[0] == winner.shape[0] - np.savez(args.result_path + "/game" + str(game_num), state=state, prob=prob, winner=winner) - state = [] - prob = [] - winner = [] - num = 0 - black_pass = False - white_pass = False - engine.run_cmd(str(num) + " clear_board") - history.clear() - for _ in range(8): - history.append(game.board) - game_num += 1 diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 1ba1145..5c96d38 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -1,10 +1,16 @@ import numpy as np import math import time -import sys,os -from .utils import list2tuple, tuple2list +import sys +import collections +c_puct = 5 +def list2tuple(obj): + if isinstance(obj, collections.Hashable): + return obj + else: + return tuple(list2tuple(sub) for sub in obj) class MCTSNode(object): def __init__(self, parent, action, state, action_num, prior, inverse=False): @@ -25,9 +31,8 @@ class MCTSNode(object): def valid_mask(self, simulator): pass - class UCTNode(MCTSNode): - def __init__(self, parent, action, state, action_num, prior, debug=False, inverse=False, c_puct = 5): + def __init__(self, parent, action, state, action_num, prior, mcts, inverse=False): super(UCTNode, self).__init__(parent, action, state, action_num, prior, inverse) self.Q = np.zeros([action_num]) self.W = np.zeros([action_num]) @@ -35,21 +40,20 @@ class UCTNode(MCTSNode): self.c_puct = c_puct self.ucb = self.Q + self.c_puct * self.prior * math.sqrt(np.sum(self.N)) / (self.N + 1) self.mask = None - self.debug=debug - self.elapse_time = 0 - - def clear_elapse_time(self): self.elapse_time = 0 + self.mcts = mcts def selection(self, simulator): head = time.time() self.valid_mask(simulator) - self.elapse_time += time.time() - head + self.mcts.valid_mask_time += time.time() - head action = np.argmax(self.ucb) if action in self.children.keys(): + self.mcts.state_selection_time += time.time() - head return self.children[action].selection(simulator) else: - self.children[action] = ActionNode(self, action) + self.children[action] = ActionNode(self, action, mcts=self.mcts) + self.mcts.state_selection_time += time.time() - head return self.children[action].selection(simulator) def backpropagation(self, action): @@ -88,7 +92,7 @@ class TSNode(MCTSNode): class ActionNode(object): - def __init__(self, parent, action): + def __init__(self, parent, action, mcts): self.parent = parent self.action = action self.children = {} @@ -96,37 +100,43 @@ class ActionNode(object): self.origin_state = None self.state_type = None self.reward = 0 + self.mcts = mcts def type_conversion_to_tuple(self): + t0 = time.time() if isinstance(self.next_state, np.ndarray): self.next_state = self.next_state.tolist() + t1 = time.time() if isinstance(self.next_state, list): self.next_state = list2tuple(self.next_state) - - def type_conversion_to_origin(self): - if isinstance(self.state_type, np.ndarray): - self.next_state = np.array(self.next_state) - if isinstance(self.state_type, np.ndarray): - self.next_state = tuple2list(self.next_state) + t2 = time.time() + self.mcts.ndarray2list_time += t1 - t0 + self.mcts.list2tuple_time += t2 - t1 + self.mcts.check += sys.getsizeof(object) def selection(self, simulator): + head = time.time() self.next_state, self.reward = simulator.simulate_step_forward(self.parent.state, self.action) + self.mcts.simulate_sf_time += time.time() - head self.origin_state = self.next_state self.state_type = type(self.next_state) self.type_conversion_to_tuple() if self.next_state is not None: if self.next_state in self.children.keys(): + self.mcts.action_selection_time += time.time() - head return self.children[self.next_state].selection(simulator) else: + self.mcts.action_selection_time += time.time() - head return self.parent, self.action else: + self.mcts.action_selection_time += time.time() - head return self.parent, self.action def expansion(self, evaluator, action_num): if self.next_state is not None: prior, value = evaluator(self.next_state) self.children[self.next_state] = UCTNode(self, self.action, self.origin_state, action_num, prior, - self.parent.inverse) + mcts=self.mcts, inverse=self.parent.inverse) return value else: return 0. @@ -148,11 +158,23 @@ class MCTS(object): if method == "": self.root = root if method == "UCT": - self.root = UCTNode(None, None, root, action_num, prior, self.debug, inverse=inverse) + self.root = UCTNode(None, None, root, action_num, prior, mcts=self, inverse=inverse) if method == "TS": self.root = TSNode(None, None, root, action_num, prior, inverse=inverse) self.inverse = inverse + # time spend on each step + self.selection_time = 0 + self.expansion_time = 0 + self.backpropagation_time = 0 + self.action_selection_time = 0 + self.state_selection_time = 0 + self.simulate_sf_time = 0 + self.valid_mask_time = 0 + self.ndarray2list_time = 0 + self.list2tuple_time = 0 + self.check = 0 + def search(self, max_step=None, max_time=None): step = 0 start_time = time.time() @@ -163,23 +185,25 @@ class MCTS(object): if max_step is None and max_time is None: raise ValueError("Need a stop criteria!") - selection_time = 0 - expansion_time = 0 - backprop_time = 0 - self.root.clear_elapse_time() while step < max_step and time.time() - start_time < max_step: sel_time, exp_time, back_time = self._expand() - selection_time += sel_time - expansion_time += exp_time - backprop_time += back_time + self.selection_time += sel_time + self.expansion_time += exp_time + self.backpropagation_time += back_time step += 1 if (self.debug): - file = open("debug.txt", "a") + file = open("mcts_profiling.txt", "a") file.write("[" + str(self.role) + "]" - + " selection : " + str(selection_time) + "\t" - + " validmask : " + str(self.root.elapse_time) + "\t" - + " expansion : " + str(expansion_time) + "\t" - + " backprop : " + str(backprop_time) + "\t" + + " sel " + '%.3f' % self.selection_time + " " + + " sel_sta " + '%.3f' % self.state_selection_time + " " + + " valid " + '%.3f' % self.valid_mask_time + " " + + " sel_act " + '%.3f' % self.action_selection_time + " " + + " array2list " + '%.4f' % self.ndarray2list_time + " " + + " check " + str(self.check) + " " + + " list2tuple " + '%.4f' % self.list2tuple_time + " \t" + + " forward " + '%.3f' % self.simulate_sf_time + " " + + " exp " + '%.3f' % self.expansion_time + " " + + " bak " + '%.3f' % self.backpropagation_time + " " + "\n") file.close()