Merge branch 'master' into mcts_virtual_loss

2017-12-24 21:27:54 +08:00 · 2017-12-24 21:27:54 +08:00 · 2b24f0760e
commit 2b24f0760e
parent 5aa5dcd191 89226b449a
8 changed files with 69 additions and 272 deletions
--- a/AlphaGo/.gitignore
+++ b/AlphaGo/.gitignore
@ -2,3 +2,4 @@ data
 checkpoints
 checkpoints_origin
 *.log
+*.txt
--- a/AlphaGo/engine.py
+++ b/AlphaGo/engine.py
@ -198,5 +198,4 @@ class GTPEngine():


 if __name__ == "main":
-    game = Game()
-    engine = GTPEngine(game_obj=game)
+    print ("test engine.py")
--- a/AlphaGo/game.py
+++ b/AlphaGo/game.py
@ -26,15 +26,15 @@ class Game:
    TODO : Maybe merge with the engine class in future, 
    currently leave it untouched for interacting with Go UI.
    '''
-    def __init__(self, name="reversi", role="unknown", debug=False, checkpoint_path=None):
+    def __init__(self, name=None, role=None, debug=False, checkpoint_path=None):
        self.name = name
        self.role = role
        self.debug = debug
        if self.name == "go":
            self.size = 9
            self.komi = 3.75
-            self.history = []
            self.history_length = 8
+            self.history = []
            self.game_engine = go.Go(size=self.size, komi=self.komi, role=self.role)
            self.board = [utils.EMPTY] * (self.size ** 2)
        elif self.name == "reversi":
@ -119,10 +119,7 @@ class Game:
        sys.stdout.flush()

 if __name__ == "__main__":
-    g = Game("go")
-    print(g.board)
-    g.clear()
-    g.think_play_move(1)
+    print("test game.py")
    #file = open("debug.txt", "a")
    #file.write("mcts check\n")
    #file.close()
--- a/AlphaGo/play.py
+++ b/AlphaGo/play.py
@ -60,13 +60,14 @@ if __name__ == '__main__':
    black_role_name = 'black' + str(args.id)
    white_role_name = 'white' + str(args.id)

+    game_name = 'go'
    agent_v0 = subprocess.Popen(
-        ['python', '-u', 'player.py', '--role=' + black_role_name,
+        ['python', '-u', 'player.py', '--game=' + game_name, '--role=' + black_role_name,
         '--checkpoint_path=' + str(args.black_weight_path), '--debug=' + str(args.debug)],
        stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

    agent_v1 = subprocess.Popen(
-        ['python', '-u', 'player.py', '--role=' + white_role_name,
+        ['python', '-u', 'player.py', '--game=' + game_name, '--role=' + white_role_name,
        '--checkpoint_path=' + str(args.black_weight_path), '--debug=' + str(args.debug)],
        stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

@ -102,13 +103,13 @@ if __name__ == '__main__':
            pass_flag = [False, False]
            print("Start game {}".format(game_num))
            # end the game if both palyer chose to pass, or play too much turns
-            while not (pass_flag[0] and pass_flag[1]) and num < size["reversi"] ** 2 * 2:
+            while not (pass_flag[0] and pass_flag[1]) and num < size[game_name] ** 2 * 2:
                turn = num % 2
                board = player[turn].run_cmd(str(num) + ' show_board')
                board = eval(board[board.index('['):board.index(']') + 1])
-                for i in range(size["reversi"]):
-                    for j in range(size["reversi"]):
-                        print show[board[i * size["reversi"] + j]] + " ",
+                for i in range(size[game_name]):
+                    for j in range(size[game_name]):
+                        print show[board[i * size[game_name] + j]] + " ",
                    print "\n",
                data.boards.append(board)
                start_time = time.time()
--- a/AlphaGo/player.py
+++ b/AlphaGo/player.py
@ -26,6 +26,7 @@ if __name__ == '__main__':
    parser.add_argument("--checkpoint_path", type=str, default=None)
    parser.add_argument("--role", type=str, default="unknown")
    parser.add_argument("--debug", type=str, default=False)
+    parser.add_argument("--game", type=str, default=False)
    args = parser.parse_args()

    if args.checkpoint_path == 'None':
@ -33,7 +34,7 @@ if __name__ == '__main__':
    debug = False
    if args.debug == "True":
        debug = True
-    game = Game(role=args.role, checkpoint_path=args.checkpoint_path, debug=debug)
+    game = Game(name=args.game, role=args.role, checkpoint_path=args.checkpoint_path, debug=debug)
    engine = GTPEngine(game_obj=game, name='tianshou', version=0)

    daemon = Pyro4.Daemon()                # make a Pyro daemon
--- a/AlphaGo/random_data.py
+++ b/AlphaGo/random_data.py
@ -1,123 +0,0 @@
-import os
-import numpy as np
-import time
-
-size = 9
-path = "/raid/tongzheng/tianshou/AlphaGo/data/part1/"
-save_path = "/raid/tongzheng/tianshou/AlphaGo/data/"
-name = os.listdir(path)
-print(len(name))
-batch_size = 128
-batch_num = 512
-
-block_size = batch_size * batch_num
-slots_num = 16
-
-
-class block(object):
-    def __init__(self, block_size, block_id):
-        self.boards = []
-        self.wins = []
-        self.ps = []
-        self.block_size = block_size
-        self.block_id = block_id
-
-    def concat(self, board, p, win):
-        board = board.reshape(-1, size, size, 17)
-        win = win.reshape(-1, 1)
-        p = p.reshape(-1, size ** 2 + 1)
-        self.boards.append(board)
-        self.wins.append(win)
-        self.ps.append(p)
-
-    def isfull(self):
-        assert len(self.boards) == len(self.wins)
-        assert len(self.boards) == len(self.ps)
-        return len(self.boards) == self.block_size
-
-    def save_and_reset(self, block_id):
-        self.boards = np.concatenate(self.boards, axis=0)
-        self.wins = np.concatenate(self.wins, axis=0)
-        self.ps = np.concatenate(self.ps, axis=0)
-        print ("Block {}, Boards shape {}, Wins Shape {}, Ps Shape {}".format(self.block_id, self.boards.shape[0],
-                                                                              self.wins.shape[0], self.ps.shape[0]))
-        np.savez(save_path + "block" + str(self.block_id), boards=self.boards, wins=self.wins, ps=self.ps)
-        self.boards = []
-        self.wins = []
-        self.ps = []
-        self.block_id = block_id
-
-    def store_num(self):
-        assert len(self.boards) == len(self.wins)
-        assert len(self.boards) == len(self.ps)
-        return len(self.boards)
-
-
-def concat(block_list, board, win, p):
-    global index
-    seed = np.random.randint(slots_num)
-    block_list[seed].concat(board, win, p)
-    if block_list[seed].isfull():
-        block_list[seed].save_and_reset(index)
-        index = index + 1
-
-
-block_list = []
-for index in range(slots_num):
-    block_list.append(block(block_size, index))
-index = index + 1
-for n in name:
-    data = np.load(path + n)
-    board = data["boards"]
-    win = data["win"]
-    p = data["p"]
-    print("Start {}".format(n))
-    print("Shape {}".format(board.shape[0]))
-    start = -time.time()
-    for i in range(board.shape[0]):
-        board_ori = board[i].reshape(-1, size, size, 17)
-        win_ori = win[i].reshape(-1, 1)
-        p_ori = p[i].reshape(-1, size ** 2 + 1)
-        concat(block_list, board_ori, p_ori, win_ori)
-
-        for t in range(1, 4):
-            board_aug = np.rot90(board_ori, t, (1, 2))
-            p_aug = np.concatenate(
-                [np.rot90(p_ori[:, :-1].reshape(-1, size, size), t, (1, 2)).reshape(-1, size ** 2), p_ori[:, -1].reshape(-1, 1)],
-                axis=1)
-            concat(block_list, board_aug, p_aug, win_ori)
-
-        board_aug = board_ori[:, ::-1]
-        p_aug = np.concatenate(
-            [p_ori[:, :-1].reshape(-1, size, size)[:, ::-1].reshape(-1, size ** 2), p_ori[:, -1].reshape(-1, 1)],
-            axis=1)
-        concat(block_list, board_aug, p_aug, win_ori)
-
-        board_aug = board_ori[:, :, ::-1]
-        p_aug = np.concatenate(
-            [p_ori[:, :-1].reshape(-1, size, size)[:, :, ::-1].reshape(-1, size ** 2), p_ori[:, -1].reshape(-1, 1)],
-            axis=1)
-        concat(block_list, board_aug, p_aug, win_ori)
-
-        board_aug = np.rot90(board_ori[:, ::-1], 1, (1, 2))
-        p_aug = np.concatenate(
-            [np.rot90(p_ori[:, :-1].reshape(-1, size, size)[:, ::-1], 1, (1, 2)).reshape(-1, size ** 2),
-             p_ori[:, -1].reshape(-1, 1)],
-            axis=1)
-        concat(block_list, board_aug, p_aug, win_ori)
-
-        board_aug = np.rot90(board_ori[:, :, ::-1], 1, (1, 2))
-        p_aug = np.concatenate(
-            [np.rot90(p_ori[:, :-1].reshape(-1, size, size)[:, :, ::-1], 1, (1, 2)).reshape(-1, size ** 2),
-             p_ori[:, -1].reshape(-1, 1)],
-            axis=1)
-        concat(block_list, board_aug, p_aug, win_ori)
-    print ("Finished {} with time {}".format(n, time.time() + start))
-    data_num = 0
-    for i in range(slots_num):
-        print("Block {} ".format(block_list[i].block_id) + "Size {}".format(block_list[i].store_num()))
-        data_num = data_num + block_list[i].store_num()
-    print ("Total data {}".format(data_num))
-
-for i in range(slots_num):
-    block_list[i].save_and_reset(block_list[i].block_id)
--- a/AlphaGo/self-play.py
+++ b/AlphaGo/self-play.py
@ -1,103 +0,0 @@
-from game import Game
-from engine import GTPEngine
-import re
-import numpy as np
-import os
-from collections import deque
-import utils
-import argparse
-
-parser = argparse.ArgumentParser()
-parser.add_argument('--result_path', type=str, default='./part1')
-args = parser.parse_args()
-
-if not os.path.exists(args.result_path):
-    os.makedirs(args.result_path)
-
-game = Game()
-engine = GTPEngine(game_obj=game)
-history = deque(maxlen=8)
-for i in range(8):
-    history.append(game.board)
-state = []
-prob = []
-winner = []
-pattern = "[A-Z]{1}[0-9]{1}"
-game.show_board()
-
-
-def history2state(history, color):
-    state = np.zeros([1, game.size, game.size, 17])
-    for i in range(8):
-        state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(game.size ** 2)).reshape(game.size, game.size)
-        state[0, :, :, i + 8] = np.array(np.array(history[i]) == -np.ones(game.size ** 2)).reshape(game.size, game.size)
-    if color == utils.BLACK:
-        state[0, :, :, 16] = np.ones([game.size, game.size])
-    if color == utils.WHITE:
-        state[0, :, :, 16] = np.zeros([game.size, game.size])
-    return state
-
-
-num = 0
-game_num = 0
-black_pass = False
-white_pass = False
-while True:
-    print("Start game {}".format(game_num))
-    while not (black_pass and white_pass) and num < game.size ** 2 * 2:
-        if num % 2 == 0:
-            color = utils.BLACK
-            new_state = history2state(history, color)
-            state.append(new_state)
-            result = engine.run_cmd(str(num) + " genmove BLACK")
-            num += 1
-            match = re.search(pattern, result)
-            if match is not None:
-                print(match.group())
-            else:
-                print("pass")
-            if re.search("pass", result) is not None:
-                black_pass = True
-            else:
-                black_pass = False
-        else:
-            color = utils.WHITE
-            new_state = history2state(history, color)
-            state.append(new_state)
-            result = engine.run_cmd(str(num) + " genmove WHITE")
-            num += 1
-            match = re.search(pattern, result)
-            if match is not None:
-                print(match.group())
-            else:
-                print("pass")
-            if re.search("pass", result) is not None:
-                white_pass = True
-            else:
-                white_pass = False
-        game.show_board()
-        prob.append(np.array(game.prob).reshape(-1, game.size ** 2 + 1))
-    print("Finished")
-    print("\n")
-    score = game.game_engine.executor_get_score(game.board)
-    if score > 0:
-        winner = utils.BLACK
-    else:
-        winner = utils.WHITE
-    state = np.concatenate(state, axis=0)
-    prob = np.concatenate(prob, axis=0)
-    winner = np.ones([num, 1]) * winner
-    assert state.shape[0] == prob.shape[0]
-    assert state.shape[0] == winner.shape[0]
-    np.savez(args.result_path + "/game" + str(game_num), state=state, prob=prob, winner=winner)
-    state = []
-    prob = []
-    winner = []
-    num = 0
-    black_pass = False
-    white_pass = False
-    engine.run_cmd(str(num) + " clear_board")
-    history.clear()
-    for _ in range(8):
-        history.append(game.board)
-    game_num += 1
--- a/tianshou/core/mcts/mcts.py
+++ b/tianshou/core/mcts/mcts.py
@ -1,10 +1,16 @@
 import numpy as np
 import math
 import time
-import sys,os
-from .utils import list2tuple, tuple2list
+import sys
+import collections

+c_puct = 5

+def list2tuple(obj):
+    if isinstance(obj, collections.Hashable):
+        return obj
+    else:
+        return tuple(list2tuple(sub) for sub in obj)

 class MCTSNode(object):
    def __init__(self, parent, action, state, action_num, prior, inverse=False):
@ -25,9 +31,8 @@ class MCTSNode(object):
    def valid_mask(self, simulator):
        pass

-
 class UCTNode(MCTSNode):
-    def __init__(self, parent, action, state, action_num, prior, debug=False, inverse=False, c_puct = 5):
+    def __init__(self, parent, action, state, action_num, prior, mcts, inverse=False):
        super(UCTNode, self).__init__(parent, action, state, action_num, prior, inverse)
        self.Q = np.zeros([action_num])
        self.W = np.zeros([action_num])
@ -35,21 +40,20 @@ class UCTNode(MCTSNode):
        self.c_puct = c_puct
        self.ucb = self.Q + self.c_puct * self.prior * math.sqrt(np.sum(self.N)) / (self.N + 1)
        self.mask = None
-        self.debug=debug
-        self.elapse_time = 0
-
-    def clear_elapse_time(self):
        self.elapse_time = 0
+        self.mcts = mcts

    def selection(self, simulator):
        head = time.time()
        self.valid_mask(simulator)
-        self.elapse_time += time.time() - head
+        self.mcts.valid_mask_time += time.time() - head
        action = np.argmax(self.ucb)
        if action in self.children.keys():
+            self.mcts.state_selection_time += time.time() - head
            return self.children[action].selection(simulator)
        else:
-            self.children[action] = ActionNode(self, action)
+            self.children[action] = ActionNode(self, action, mcts=self.mcts)
+            self.mcts.state_selection_time += time.time() - head
            return self.children[action].selection(simulator)

    def backpropagation(self, action):
@ -88,7 +92,7 @@ class TSNode(MCTSNode):


 class ActionNode(object):
-    def __init__(self, parent, action):
+    def __init__(self, parent, action, mcts):
        self.parent = parent
        self.action = action
        self.children = {}
@ -96,37 +100,43 @@ class ActionNode(object):
        self.origin_state = None
        self.state_type = None
        self.reward = 0
+        self.mcts = mcts

    def type_conversion_to_tuple(self):
+        t0 = time.time()
        if isinstance(self.next_state, np.ndarray):
            self.next_state = self.next_state.tolist()
+        t1 = time.time()
        if isinstance(self.next_state, list):
            self.next_state = list2tuple(self.next_state)
-
-    def type_conversion_to_origin(self):
-        if isinstance(self.state_type, np.ndarray):
-            self.next_state = np.array(self.next_state)
-        if isinstance(self.state_type, np.ndarray):
-            self.next_state = tuple2list(self.next_state)
+        t2 = time.time()
+        self.mcts.ndarray2list_time += t1 - t0
+        self.mcts.list2tuple_time += t2 - t1
+        self.mcts.check += sys.getsizeof(object)

    def selection(self, simulator):
+        head = time.time()
        self.next_state, self.reward = simulator.simulate_step_forward(self.parent.state, self.action)
+        self.mcts.simulate_sf_time += time.time() - head
        self.origin_state = self.next_state
        self.state_type = type(self.next_state)
        self.type_conversion_to_tuple()
        if self.next_state is not None:
            if self.next_state in self.children.keys():
+                self.mcts.action_selection_time += time.time() - head
                return self.children[self.next_state].selection(simulator)
            else:
+                self.mcts.action_selection_time += time.time() - head
                return self.parent, self.action
        else:
+            self.mcts.action_selection_time += time.time() - head
            return self.parent, self.action

    def expansion(self, evaluator, action_num):
        if self.next_state is not None:
            prior, value = evaluator(self.next_state)
            self.children[self.next_state] = UCTNode(self, self.action, self.origin_state, action_num, prior,
-                                                     self.parent.inverse)
+                                                     mcts=self.mcts, inverse=self.parent.inverse)
            return value
        else:
            return 0.
@ -148,11 +158,23 @@ class MCTS(object):
        if method == "":
            self.root = root
        if method == "UCT":
-            self.root = UCTNode(None, None, root, action_num, prior, self.debug, inverse=inverse)
+            self.root = UCTNode(None, None, root, action_num, prior, mcts=self, inverse=inverse)
        if method == "TS":
            self.root = TSNode(None, None, root, action_num, prior, inverse=inverse)
        self.inverse = inverse

+        # time spend on each step
+        self.selection_time = 0
+        self.expansion_time = 0
+        self.backpropagation_time = 0
+        self.action_selection_time = 0
+        self.state_selection_time = 0
+        self.simulate_sf_time = 0
+        self.valid_mask_time = 0
+        self.ndarray2list_time = 0
+        self.list2tuple_time = 0
+        self.check = 0
+
    def search(self, max_step=None, max_time=None):
        step = 0
        start_time = time.time()
@ -163,23 +185,25 @@ class MCTS(object):
        if max_step is None and max_time is None:
            raise ValueError("Need a stop criteria!")

-        selection_time = 0
-        expansion_time = 0
-        backprop_time = 0
-        self.root.clear_elapse_time()
        while step < max_step and time.time() - start_time < max_step:
            sel_time, exp_time, back_time = self._expand()
-            selection_time += sel_time
-            expansion_time += exp_time
-            backprop_time += back_time
+            self.selection_time += sel_time
+            self.expansion_time += exp_time
+            self.backpropagation_time += back_time
            step += 1
        if (self.debug):
-            file = open("debug.txt", "a")
+            file = open("mcts_profiling.txt", "a")
            file.write("[" + str(self.role) + "]"
-                       + " selection : " + str(selection_time) + "\t"
-                       + " validmask : " + str(self.root.elapse_time) + "\t"
-                       + " expansion : " + str(expansion_time) + "\t"
-                       + " backprop  : " + str(backprop_time) + "\t"
+                       + " sel  " + '%.3f' % self.selection_time + "  "
+                       + " sel_sta  " + '%.3f' % self.state_selection_time + "  "
+                       + " valid  " + '%.3f' % self.valid_mask_time + "  "
+                       + " sel_act  " + '%.3f' % self.action_selection_time + "  "
+                       + " array2list  " + '%.4f' % self.ndarray2list_time + "  "
+                       + " check " + str(self.check) + "  "
+                       + " list2tuple  " + '%.4f' % self.list2tuple_time + " \t"
+                       + " forward  " + '%.3f' % self.simulate_sf_time + "  "
+                       + " exp  " + '%.3f' % self.expansion_time + "  "
+                       + " bak  " + '%.3f' % self.backpropagation_time + "  "
                       + "\n")
            file.close()