Tianshou/tianshou/core/mcts/mcts.py

import numpy as np
import math
import time

# Exploration constant: UCTNode multiplies the prior-weighted exploration term
# of its UCB score by this value.
c_puct = 5

class MCTSNode(object):
    """Common base for the state nodes of the search tree.

    The base class only stores the bookkeeping shared by all node flavours;
    subclasses supply the actual ``selection`` and ``backpropagation`` logic.
    """

    def __init__(self, parent, action, state, action_num, prior, inverse=False):
        """Store the tree linkage and the per-action prior.

        :param parent: node this one hangs from (``None`` is accepted).
        :param action: action associated with this node.
        :param state: state represented by this node.
        :param action_num: number of actions available from ``state``.
        :param prior: array-like prior over actions; it is copied and
            flattened to one dimension.
        :param inverse: flag stored on the node for subclasses to consult.
        """
        # Position in the tree.
        self.parent, self.action = parent, action
        self.children = {}
        # What the node represents.
        self.state, self.action_num = state, action_num
        # np.array() takes a copy, so the caller's prior is never aliased.
        self.prior = np.reshape(np.array(prior), -1)
        self.inverse = inverse

    def selection(self, simulator):
        """Abstract hook; concrete node types must override it."""
        raise NotImplementedError("Need to implement function selection")

    def backpropagation(self, action):
        """Abstract hook; concrete node types must override it."""
        raise NotImplementedError("Need to implement function backpropagation")

    def valid_mask(self, simulator):
        """Optional hook for masking actions; the base version does nothing."""
        return None

class UCTNode(MCTSNode):
    """State node that picks actions by an upper-confidence-bound score.

    For every action the node keeps a visit count ``N``, an accumulated
    reward ``W`` and their ratio ``Q``. The score used for selection is
    ``Q + c_puct * prior * sqrt(sum(N)) / (N + 1)``.
    """

    def __init__(self, parent, action, state, action_num, prior, mcts, inverse=False):
        """Create a node with zeroed statistics.

        :param parent: action node above this state (``None`` for the root).
        :param action: action stored on the node.
        :param state: state represented by this node.
        :param action_num: number of actions; sizes ``Q``, ``W`` and ``N``.
        :param prior: array-like prior over actions, flattened by the base
            class.
        :param mcts: owning search object; its ``*_time`` counters are
            incremented for profiling.
        :param inverse: when true, the reward handed to the parent during
            backpropagation is negated.
        """
        super(UCTNode, self).__init__(parent, action, state, action_num, prior, inverse)
        self.Q = np.zeros([action_num])
        self.W = np.zeros([action_num])
        self.N = np.zeros([action_num])
        self.c_puct = c_puct
        self.ucb = self._compute_ucb()
        self.mask = None
        self.elapse_time = 0
        self.mcts = mcts

    def _compute_ucb(self):
        """Return the per-action score from the current ``Q``, ``N`` and prior."""
        # Always read the node's own constant so that __init__ and
        # backpropagation agree on the formula.
        exploration = self.c_puct * self.prior * math.sqrt(np.sum(self.N)) / (self.N + 1.)
        return self.Q + exploration

    def selection(self, simulator):
        """Descend through the highest-scoring action.

        The child action node is created on first use; the call then
        continues inside that child and returns whatever it returns.

        :param simulator: environment model forwarded down the tree.
        """
        head = time.time()
        self.valid_mask(simulator)
        self.mcts.valid_mask_time += time.time() - head
        action = np.argmax(self.ucb)
        if action not in self.children:
            self.children[action] = ActionNode(self, action, mcts=self.mcts)
        self.mcts.state_selection_time += time.time() - head
        return self.children[action].selection(simulator)

    def backpropagation(self, action):
        """Fold the reward of child ``action`` into the statistics and go up.

        :param action: index of the child action node that was just updated.
        """
        action = int(action)
        reward = self.children[action].reward
        self.N[action] += 1
        self.W[action] += reward
        # Only visited actions have a defined mean; the rest keep their Q.
        visited = self.N != 0
        self.Q[visited] = self.W[visited] / self.N[visited]
        self.ucb = self._compute_ucb()
        if self.parent is not None:
            # The reward is negated for the parent when this node is inverse.
            self.parent.backpropagation(-reward if self.inverse else reward)

    def valid_mask(self, simulator):
        """Force the score of masked actions to ``-inf``.

        Does nothing unless the simulator provides ``simulate_get_mask``.
        The mask is fetched once per node and then reused, because ``ucb``
        is rebuilt from scratch after every backpropagation.
        """
        # let all invalid actions be illegal in mcts
        if not hasattr(simulator, 'simulate_get_mask'):
            return
        if self.mask is None:
            self.mask = simulator.simulate_get_mask(self.state, range(self.action_num))
        self.ucb[self.mask] = -float("Inf")

# Code reserved for Thompson Sampling
class TSNode(MCTSNode):
    """State node holding per-action posterior parameters.

    Only the parameter arrays are allocated here; the node defines no
    selection or backpropagation of its own.
    """

    def __init__(self, parent, action, state, action_num, prior, method="Gaussian", inverse=False):
        """Allocate the parameter arrays for the chosen ``method``.

        ``"Beta"`` creates ``alpha`` and ``beta`` filled with ones,
        ``"Gaussian"`` creates ``mu`` and ``sigma`` filled with zeros; any
        other value allocates nothing.
        """
        super(TSNode, self).__init__(parent, action, state, action_num, prior, inverse)
        shape = (action_num,)
        if method == "Beta":
            self.alpha, self.beta = np.ones(shape), np.ones(shape)
        elif method == "Gaussian":
            self.mu, self.sigma = np.zeros(shape), np.zeros(shape)


class ActionNode(object):
    """Tree node for taking ``action`` in the state held by ``parent``.

    Children are state nodes keyed by the hashable form of the state that
    the simulator produced for this action.
    """

    def __init__(self, parent, action, mcts):
        """
        :param parent: state node this action was taken from.
        :param action: the action this node stands for.
        :param mcts: owning search object; its ``*_time`` counters are
            incremented for profiling.
        """
        self.parent, self.action, self.mcts = parent, action, mcts
        self.children = {}
        # Filled in by selection().
        self.next_state = None
        self.next_state_hashable = None
        self.state_type = None
        self.reward = 0

    def selection(self, simulator):
        """Simulate the action and either stop here or keep descending.

        Returns this node when the simulator reports no next state or when
        the next state has no child yet; otherwise returns the result of
        continuing the selection inside the matching child.
        """
        profiler = self.mcts

        tick = time.time()
        self.next_state, self.reward = simulator.simulate_step_forward(self.parent.state, self.action)
        profiler.simulate_sf_time += time.time() - tick
        if self.next_state is None:
            # No next state: self.parent.state is the terminate state.
            profiler.action_selection_time += time.time() - tick
            return self

        tick = time.time()
        self.next_state_hashable = simulator.simulate_hashable_conversion(self.next_state)
        profiler.hash_time += time.time() - tick

        seen_before = self.next_state_hashable in self.children
        profiler.action_selection_time += time.time() - tick
        if not seen_before:
            # A state never reached before: the caller expands from here.
            return self
        return self.children[self.next_state_hashable].selection(simulator)

    def expansion(self, prior, action_num):
        """Attach a state node for the state found by the last selection.

        Nothing is added when ``next_state`` is ``None``, i.e. when the
        search selected a terminate node.
        """
        if self.next_state is None:
            return
        # next_state and its hashable form were assigned during selection().
        child = UCTNode(self, self.action, self.next_state, action_num, prior,
                        mcts=self.mcts, inverse=self.parent.inverse)
        self.children[self.next_state_hashable] = child

    def backpropagation(self, value):
        """Add ``value`` to the stored reward and notify the parent node."""
        self.reward += value
        self.parent.backpropagation(self.action)

class MCTS(object):
    """Driver that grows a search tree from ``start_state``.

    Each iteration selects a leaf action node from the root, evaluates the
    state found there, expands the tree and backpropagates the value.
    Wall-clock time spent in each phase is accumulated on the instance.
    """

    def __init__(self, simulator, evaluator, start_state, action_num, method="UCT",
                 role="unknown", debug=False, inverse=False):
        """
        :param simulator: environment model handed to the nodes during
            selection.
        :param evaluator: callable taking a state and returning a
            ``(prior, value)`` pair.
        :param start_state: state at the root of the tree.
        :param action_num: number of actions, forwarded to every node.
        :param method: ``"UCT"`` or ``"TS"`` picks the root node type; with
            ``""`` the root is ``start_state`` itself. Any other value leaves
            ``root`` unset.
        :param role: label written in front of each profiling log line.
        :param debug: when true, ``search`` appends a line of timings to
            ``mcts_profiling.log``.
        :param inverse: when true, the evaluated value is negated before it
            is backpropagated; the flag is also passed to the root node.
        """
        self.simulator = simulator
        self.evaluator = evaluator
        self.role = role
        self.debug = debug
        prior, _ = self.evaluator(start_state)
        self.action_num = action_num
        if method == "":
            self.root = start_state
        elif method == "UCT":
            self.root = UCTNode(None, None, start_state, action_num, prior, mcts=self, inverse=inverse)
        elif method == "TS":
            self.root = TSNode(None, None, start_state, action_num, prior, inverse=inverse)
        self.inverse = inverse

        # time spent on each step
        self.selection_time = 0
        self.expansion_time = 0
        self.backpropagation_time = 0
        self.action_selection_time = 0
        self.state_selection_time = 0
        self.simulate_sf_time = 0
        self.valid_mask_time = 0
        self.hash_time = 0

    def search(self, max_step=None, max_time=None):
        """Run search iterations until a budget is exhausted.

        :param max_step: maximum number of iterations, or ``None`` for no
            limit on the count.
        :param max_time: maximum wall-clock seconds, or ``None`` for no
            limit on the duration.
        :raises ValueError: if neither budget is given.
        """
        # Validate before substituting defaults, otherwise the check can
        # never fire.
        if max_step is None and max_time is None:
            raise ValueError("Need a stop criteria!")
        # float("inf") compares correctly against ints; int("Inf") would raise.
        if max_step is None:
            max_step = float("inf")
        if max_time is None:
            max_time = float("inf")

        step = 0
        start_time = time.time()
        while step < max_step and time.time() - start_time < max_time:
            sel_time, exp_time, back_time = self._expand()
            self.selection_time += sel_time
            self.expansion_time += exp_time
            self.backpropagation_time += back_time
            step += 1
        if self.debug:
            self._write_profiling_log()

    def _write_profiling_log(self):
        """Append one line with the accumulated timings to the profiling log."""
        timings = (
            (" sel ", self.selection_time),
            (" sel_sta ", self.state_selection_time),
            (" valid ", self.valid_mask_time),
            (" sel_act ", self.action_selection_time),
            (" hash ", self.hash_time),
            (" step forward ", self.simulate_sf_time),
            (" expansion  ", self.expansion_time),
            (" backprop ", self.backpropagation_time),
        )
        line = ("[" + str(self.role) + "]"
                + "".join(label + '%.3f' % seconds + "  " for label, seconds in timings)
                + "\n")
        # The context manager closes the file even if the write fails.
        with open("mcts_profiling.log", "a") as log_file:
            log_file.write(line)

    def _expand(self):
        """Run one select / evaluate-and-expand / backpropagate iteration.

        :return: seconds spent in selection, expansion and backpropagation.
        """
        t0 = time.time()
        next_action = self.root.selection(self.simulator)
        t1 = time.time()
        # NOTE(review): next_state is None when a terminate node was selected;
        # the evaluator is still called with it - confirm it accepts None.
        prior, value = self.evaluator(next_action.next_state)
        next_action.expansion(prior, self.action_num)
        t2 = time.time()
        if self.inverse:
            next_action.backpropagation(-value + 0.)
        else:
            next_action.backpropagation(value + 0.)
        t3 = time.time()
        return t1 - t0, t2 - t1, t3 - t2

# Placeholder entry point: running this module directly does nothing.
if __name__ == "__main__":
    pass
mcts framework 2017-11-16 13:21:27 +08:00			`import numpy as np`
			`import math`
mcts 2017-11-16 17:05:54 +08:00			`import time`
mcts framework 2017-11-16 13:21:27 +08:00
mcts update 2017-11-21 22:19:52 +08:00			`c_puct = 5`
AlphaGo update 2017-11-26 13:36:52 +08:00
mcts 2017-11-16 17:05:54 +08:00			`class MCTSNode(object):`
AlphaGo update 2017-11-26 13:36:52 +08:00			`def __init__(self, parent, action, state, action_num, prior, inverse=False):`
mcts framework 2017-11-16 13:21:27 +08:00			`self.parent = parent`
			`self.action = action`
			`self.children = {}`
			`self.state = state`
			`self.action_num = action_num`
combine gtp and network 2017-12-05 23:17:20 +08:00			`self.prior = np.array(prior).reshape(-1)`
AlphaGo update 2017-11-26 13:36:52 +08:00			`self.inverse = inverse`
mcts framework 2017-11-16 13:21:27 +08:00
mcts update 2017-11-21 22:19:52 +08:00			`def selection(self, simulator):`
mcts 2017-11-16 17:05:54 +08:00			`raise NotImplementedError("Need to implement function selection")`
mcts framework 2017-11-16 13:21:27 +08:00
mcts update 2017-11-21 22:19:52 +08:00			`def backpropagation(self, action):`
mcts 2017-11-16 17:05:54 +08:00			`raise NotImplementedError("Need to implement function backpropagation")`
mcts framework 2017-11-16 13:21:27 +08:00
combine gtp and network 2017-12-05 23:17:20 +08:00			`def valid_mask(self, simulator):`
			`pass`
mcts framework 2017-11-16 13:21:27 +08:00
			`class UCTNode(MCTSNode):`
fix bug of game config and add profing functions to mcts 2017-12-24 17:43:45 +08:00			`def __init__(self, parent, action, state, action_num, prior, mcts, inverse=False):`
AlphaGo update 2017-11-26 13:36:52 +08:00			`super(UCTNode, self).__init__(parent, action, state, action_num, prior, inverse)`
mcts framework 2017-11-16 13:21:27 +08:00			`self.Q = np.zeros([action_num])`
			`self.W = np.zeros([action_num])`
			`self.N = np.zeros([action_num])`
fix virtual loss bug 2017-12-23 02:48:53 +08:00			`self.c_puct = c_puct`
			`self.ucb = self.Q + self.c_puct * self.prior * math.sqrt(np.sum(self.N)) / (self.N + 1)`
minor fixed 2017-12-08 23:41:31 +08:00			`self.mask = None`
add some code for debug and profiling 2017-12-24 01:07:46 +08:00			`self.elapse_time = 0`
fix bug of game config and add profing functions to mcts 2017-12-24 17:43:45 +08:00			`self.mcts = mcts`
mcts framework 2017-11-16 13:21:27 +08:00
mcts update 2017-11-21 22:19:52 +08:00			`def selection(self, simulator):`
add some code for debug and profiling 2017-12-24 01:07:46 +08:00			`head = time.time()`
combine gtp and network 2017-12-05 23:17:20 +08:00			`self.valid_mask(simulator)`
fix bug of game config and add profing functions to mcts 2017-12-24 17:43:45 +08:00			`self.mcts.valid_mask_time += time.time() - head`
mcts update 2017-11-21 22:19:52 +08:00			`action = np.argmax(self.ucb)`
			`if action in self.children.keys():`
fix bug of game config and add profing functions to mcts 2017-12-24 17:43:45 +08:00			`self.mcts.state_selection_time += time.time() - head`
mcts update 2017-11-21 22:19:52 +08:00			`return self.children[action].selection(simulator)`
mcts framework 2017-11-16 13:21:27 +08:00			`else:`
fix bug of game config and add profing functions to mcts 2017-12-24 17:43:45 +08:00			`self.children[action] = ActionNode(self, action, mcts=self.mcts)`
			`self.mcts.state_selection_time += time.time() - head`
mcts update 2017-11-21 22:19:52 +08:00			`return self.children[action].selection(simulator)`

			`def backpropagation(self, action):`
minor fixed 2017-11-21 22:52:17 +08:00			`action = int(action)`
mcts update 2017-11-21 22:19:52 +08:00			`self.N[action] += 1`
			`self.W[action] += self.children[action].reward`
			`for i in range(self.action_num):`
			`if self.N[i] != 0:`
			`self.Q[i] = (self.W[i] + 0.) / self.N[i]`
			`self.ucb = self.Q + c_puct * self.prior * math.sqrt(np.sum(self.N)) / (self.N + 1.)`
			`if self.parent is not None:`
AlphaGo update 2017-11-26 13:36:52 +08:00			`if self.inverse:`
			`self.parent.backpropagation(-self.children[action].reward)`
			`else:`
			`self.parent.backpropagation(self.children[action].reward)`
mcts 2017-11-16 17:05:54 +08:00
combine gtp and network 2017-12-05 23:17:20 +08:00			`def valid_mask(self, simulator):`
minor fixed for mcts, check finish for go 2017-12-23 15:58:06 +08:00			`# let all invalid actions be illegal in mcts`
			`if not hasattr(simulator, 'simulate_get_mask'):`
			`pass`
			`else:`
			`if self.mask is None:`
			`self.mask = simulator.simulate_get_mask(self.state, range(self.action_num))`
			`self.ucb[self.mask] = -float("Inf")`
combine gtp and network 2017-12-05 23:17:20 +08:00
remove type_conversion function 2017-12-27 14:08:34 +08:00			`# Code reserved for Thompson Sampling`
mcts framework 2017-11-16 13:21:27 +08:00			`class TSNode(MCTSNode):`
AlphaGo update 2017-11-26 13:36:52 +08:00			`def __init__(self, parent, action, state, action_num, prior, method="Gaussian", inverse=False):`
			`super(TSNode, self).__init__(parent, action, state, action_num, prior, inverse)`
mcts framework 2017-11-16 13:21:27 +08:00			`if method == "Beta":`
			`self.alpha = np.ones([action_num])`
			`self.beta = np.ones([action_num])`
			`if method == "Gaussian":`
			`self.mu = np.zeros([action_num])`
			`self.sigma = np.zeros([action_num])`

mcts 2017-11-16 17:05:54 +08:00
combine gtp and network 2017-12-05 23:17:20 +08:00			`class ActionNode(object):`
fix bug of game config and add profing functions to mcts 2017-12-24 17:43:45 +08:00			`def __init__(self, parent, action, mcts):`
mcts framework 2017-11-16 13:21:27 +08:00			`self.parent = parent`
			`self.action = action`
			`self.children = {}`
mcts update 2017-11-21 22:19:52 +08:00			`self.next_state = None`
remove type_conversion function 2017-12-27 14:08:34 +08:00			`self.next_state_hashable = None`
AlphaGo update 2017-11-26 13:36:52 +08:00			`self.state_type = None`
mcts update 2017-11-21 22:19:52 +08:00			`self.reward = 0`
fix bug of game config and add profing functions to mcts 2017-12-24 17:43:45 +08:00			`self.mcts = mcts`
mcts update 2017-11-21 22:19:52 +08:00
			`def selection(self, simulator):`
fix bug of game config and add profing functions to mcts 2017-12-24 17:43:45 +08:00			`head = time.time()`
merge flatten and deflatten, rename variable for clarity 2017-12-19 16:51:50 +08:00			`self.next_state, self.reward = simulator.simulate_step_forward(self.parent.state, self.action)`
fix bug of game config and add profing functions to mcts 2017-12-24 17:43:45 +08:00			`self.mcts.simulate_sf_time += time.time() - head`
rewrite selection function of ActionNode for clarity, add and delete some notes 2017-12-27 11:43:04 +08:00			`if self.next_state is None: # next_state is None means that self.parent.state is the terminate state`
			`self.mcts.action_selection_time += time.time() - head`
rewrite the selection fuction of UCTNode to return the action node instead of return the state node and next action 2017-12-27 21:11:40 +08:00			`return self`
solve the performance bottleneck by only hashing the last board 2017-12-28 01:16:24 +08:00			`head = time.time()`
			`self.next_state_hashable = simulator.simulate_hashable_conversion(self.next_state)`
			`self.mcts.hash_time += time.time() - head`
remove type_conversion function 2017-12-27 14:08:34 +08:00			`if self.next_state_hashable in self.children.keys(): # next state has already visited before`
rewrite selection function of ActionNode for clarity, add and delete some notes 2017-12-27 11:43:04 +08:00			`self.mcts.action_selection_time += time.time() - head`
remove type_conversion function 2017-12-27 14:08:34 +08:00			`return self.children[self.next_state_hashable].selection(simulator)`
rewrite selection function of ActionNode for clarity, add and delete some notes 2017-12-27 11:43:04 +08:00			`else: # next state is a new state never seen before`
fix bug of game config and add profing functions to mcts 2017-12-24 17:43:45 +08:00			`self.mcts.action_selection_time += time.time() - head`
rewrite the selection fuction of UCTNode to return the action node instead of return the state node and next action 2017-12-27 21:11:40 +08:00			`return self`
mcts update 2017-11-21 22:19:52 +08:00
move evaluator from action node to mcts 2017-12-27 20:49:54 +08:00			`def expansion(self, prior, action_num):`
mcts update 2017-11-21 22:19:52 +08:00			`if self.next_state is not None:`
remove type_conversion function 2017-12-27 14:08:34 +08:00			`# note that self.next_state was assigned already at the selection function`
move evaluator from action node to mcts 2017-12-27 20:49:54 +08:00			`# self.next_state is None means MCTS selected a terminate node`
remove type_conversion function 2017-12-27 14:08:34 +08:00			`self.children[self.next_state_hashable] = UCTNode(self, self.action, self.next_state, action_num, prior,`
fix bug of game config and add profing functions to mcts 2017-12-24 17:43:45 +08:00			`mcts=self.mcts, inverse=self.parent.inverse)`
mcts update 2017-11-21 22:19:52 +08:00
			`def backpropagation(self, value):`
			`self.reward += value`
			`self.parent.backpropagation(self.action)`
mcts 2017-11-16 17:05:54 +08:00
combine gtp and network 2017-12-05 23:17:20 +08:00			`class MCTS(object):`
variable rename and delete redundant code 2017-12-26 22:19:10 +08:00			`def __init__(self, simulator, evaluator, start_state, action_num, method="UCT",`
add some code for debug and profiling 2017-12-24 01:07:46 +08:00			`role="unknown", debug=False, inverse=False):`
mcts 2017-11-16 17:05:54 +08:00			`self.simulator = simulator`
			`self.evaluator = evaluator`
add some code for debug and profiling 2017-12-24 01:07:46 +08:00			`self.role = role`
			`self.debug = debug`
variable rename and delete redundant code 2017-12-26 22:19:10 +08:00			`prior, _ = self.evaluator(start_state)`
mcts update 2017-11-21 22:19:52 +08:00			`self.action_num = action_num`
minor fixed 2017-12-03 19:16:21 +08:00			`if method == "":`
variable rename and delete redundant code 2017-12-26 22:19:10 +08:00			`self.root = start_state`
mcts 2017-11-16 17:05:54 +08:00			`if method == "UCT":`
variable rename and delete redundant code 2017-12-26 22:19:10 +08:00			`self.root = UCTNode(None, None, start_state, action_num, prior, mcts=self, inverse=inverse)`
mcts 2017-11-16 17:05:54 +08:00			`if method == "TS":`
variable rename and delete redundant code 2017-12-26 22:19:10 +08:00			`self.root = TSNode(None, None, start_state, action_num, prior, inverse=inverse)`
AlphaGo update 2017-11-26 13:36:52 +08:00			`self.inverse = inverse`
modify the mcts, refactor the network 2017-12-20 16:43:42 +08:00
fix bug of game config and add profing functions to mcts 2017-12-24 17:43:45 +08:00			`# time spend on each step`
			`self.selection_time = 0`
			`self.expansion_time = 0`
			`self.backpropagation_time = 0`
			`self.action_selection_time = 0`
			`self.state_selection_time = 0`
			`self.simulate_sf_time = 0`
			`self.valid_mask_time = 0`
solve the performance bottleneck by only hashing the last board 2017-12-28 01:16:24 +08:00			`self.hash_time = 0`
fix bug of game config and add profing functions to mcts 2017-12-24 17:43:45 +08:00
modify the mcts, refactor the network 2017-12-20 16:43:42 +08:00			`def search(self, max_step=None, max_time=None):`
			`step = 0`
			`start_time = time.time()`
			`if max_step is None:`
			`max_step = int("Inf")`
			`if max_time is None:`
			`max_time = float("Inf")`
mcts 2017-11-16 17:05:54 +08:00			`if max_step is None and max_time is None:`
			`raise ValueError("Need a stop criteria!")`
minor fixed 2017-12-08 23:41:31 +08:00
modify the mcts, refactor the network 2017-12-20 16:43:42 +08:00			`while step < max_step and time.time() - start_time < max_step:`
add some code for debug and profiling 2017-12-24 01:07:46 +08:00			`sel_time, exp_time, back_time = self._expand()`
fix bug of game config and add profing functions to mcts 2017-12-24 17:43:45 +08:00			`self.selection_time += sel_time`
			`self.expansion_time += exp_time`
			`self.backpropagation_time += back_time`
modify the mcts, refactor the network 2017-12-20 16:43:42 +08:00			`step += 1`
variable rename and delete redundant code 2017-12-26 22:19:10 +08:00			`if self.debug:`
solve the performance bottleneck by only hashing the last board 2017-12-28 01:16:24 +08:00			`file = open("mcts_profiling.log", "a")`
add some code for debug and profiling 2017-12-24 01:07:46 +08:00			`file.write("[" + str(self.role) + "]"`
solve the performance bottleneck by only hashing the last board 2017-12-28 01:16:24 +08:00			`+ " sel " + '%.3f' % self.selection_time + " "`
			`+ " sel_sta " + '%.3f' % self.state_selection_time + " "`
			`+ " valid " + '%.3f' % self.valid_mask_time + " "`
			`+ " sel_act " + '%.3f' % self.action_selection_time + " "`
			`+ " hash " + '%.3f' % self.hash_time + " "`
			`+ " step forward " + '%.3f' % self.simulate_sf_time + " "`
			`+ " expansion " + '%.3f' % self.expansion_time + " "`
			`+ " backprop " + '%.3f' % self.backpropagation_time + " "`
add some code for debug and profiling 2017-12-24 01:07:46 +08:00			`+ "\n")`
			`file.close()`
modify the mcts, refactor the network 2017-12-20 16:43:42 +08:00
			`def _expand(self):`
add some code for debug and profiling 2017-12-24 01:07:46 +08:00			`t0 = time.time()`
rewrite the selection fuction of UCTNode to return the action node instead of return the state node and next action 2017-12-27 21:11:40 +08:00			`next_action = self.root.selection(self.simulator)`
add some code for debug and profiling 2017-12-24 01:07:46 +08:00			`t1 = time.time()`
rewrite the selection fuction of UCTNode to return the action node instead of return the state node and next action 2017-12-27 21:11:40 +08:00			`prior, value = self.evaluator(next_action.next_state)`
			`next_action.expansion(prior, self.action_num)`
add some code for debug and profiling 2017-12-24 01:07:46 +08:00			`t2 = time.time()`
Modification for backpropagation process 2017-12-27 18:55:00 +08:00			`if self.inverse:`
rewrite the selection fuction of UCTNode to return the action node instead of return the state node and next action 2017-12-27 21:11:40 +08:00			`next_action.backpropagation(-value + 0.)`
Modification for backpropagation process 2017-12-27 18:55:00 +08:00			`else:`
rewrite the selection fuction of UCTNode to return the action node instead of return the state node and next action 2017-12-27 21:11:40 +08:00			`next_action.backpropagation(value + 0.)`
add some code for debug and profiling 2017-12-24 01:07:46 +08:00			`t3 = time.time()`
			`return t1 - t0, t2 - t1, t3 - t2`

mcts update 2017-11-21 22:19:52 +08:00			`if __name__ == "__main__":`
			`pass`