From 72ae304ab3477242dfad48aac22f4b54a208b4c0 Mon Sep 17 00:00:00 2001 From: Haosheng Zou Date: Wed, 13 Dec 2017 20:47:45 +0800 Subject: [PATCH 01/98] preliminary design of dqn_example, dqn interface. identify the assign of networks --- examples/dqn_example.py | 86 ++++++++++++++++++++++++++++++ examples/ppo_example.py | 6 ++- tianshou/core/README.md | 3 +- tianshou/core/policy/base.py | 33 +++++++++++- tianshou/core/policy/stochastic.py | 1 + 5 files changed, 124 insertions(+), 5 deletions(-) create mode 100644 examples/dqn_example.py diff --git a/examples/dqn_example.py b/examples/dqn_example.py new file mode 100644 index 0000000..0a5c084 --- /dev/null +++ b/examples/dqn_example.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python + +import tensorflow as tf +import numpy as np +import time +import gym + +# our lib imports here! +import sys +sys.path.append('..') +import tianshou.core.losses as losses +from tianshou.data.replay import Replay +import tianshou.data.advantage_estimation as advantage_estimation +import tianshou.core.policy as policy + + +def policy_net(observation, action_dim): + """ + Constructs the policy network. NOT NEEDED IN THE LIBRARY! this is pure tf + + :param observation: Placeholder for the observation. A tensor of shape (bs, x, y, channels) + :param action_dim: int. The number of actions. + :param scope: str. Specifying the scope of the variables. + """ + net = tf.layers.conv2d(observation, 16, 8, 4, 'valid', activation=tf.nn.relu) + net = tf.layers.conv2d(net, 32, 4, 2, 'valid', activation=tf.nn.relu) + net = tf.layers.flatten(net) + net = tf.layers.dense(net, 256, activation=tf.nn.relu) + + q_values = tf.layers.dense(net, action_dim) + + return q_values + + +if __name__ == '__main__': + env = gym.make('PongNoFrameskip-v4') + observation_dim = env.observation_space.shape + action_dim = env.action_space.n + + # 1. 
build network with pure tf + observation = tf.placeholder(tf.float32, shape=(None,) + observation_dim) # network input + + with tf.variable_scope('q_net'): + q_values = policy_net(observation, action_dim) + train_var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) # TODO: better management of TRAINABLE_VARIABLES + with tf.variable_scope('target_net'): + q_values_target = policy_net(observation, action_dim) + + # 2. build losses, optimizers + q_net = policy.DQN(q_values, observation_placeholder=observation) # YongRen: policy.DQN + target_net = policy.DQN(q_values_target, observation_placeholder=observation) + + action = tf.placeholder(dtype=tf.int32, shape=[None]) # batch of integer actions + target = tf.placeholder(dtype=tf.float32, shape=[None]) # target value for DQN + + dqn_loss = losses.dqn_loss(action, target, pi) # TongzhengRen + + total_loss = dqn_loss + optimizer = tf.train.AdamOptimizer(1e-3) + train_op = optimizer.minimize(total_loss, var_list=train_var_list) + + # 3. define data collection + training_data = Replay(env, q_net, advantage_estimation.qlearning_target(target_net)) # + # ShihongSong: Replay(env, pi, advantage_estimation.qlearning_target(target_network)), use your ReplayMemory, interact as follows. Simplify your advantage_estimation.dqn to run before YongRen's DQN + # maybe a dict to manage the elements to be collected + + # 4. start training + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + + minibatch_count = 0 + collection_count = 0 + while True: # until some stopping criterion met... + # collect data + training_data.collect() # ShihongSong + collection_count += 1 + print('Collected {} times.'.format(collection_count)) + + # update network + data = training_data.next_batch(64) # YouQiaoben, ShihongSong + # TODO: auto managing of the placeholders? 
or add this to params of data.Batch + sess.run(train_op, feed_dict={observation: data['observations'], action: data['actions'], target: data['target']}) + minibatch_count += 1 + print('Trained {} minibatches.'.format(minibatch_count)) + + # TODO: assigning pi to pi_old is not implemented yet \ No newline at end of file diff --git a/examples/ppo_example.py b/examples/ppo_example.py index d085273..02ccb52 100755 --- a/examples/ppo_example.py +++ b/examples/ppo_example.py @@ -66,7 +66,7 @@ if __name__ == '__main__': # a clean version with only policy net, no value net # 3. define data collection training_data = Batch(env, pi, advantage_estimation.full_return) # YouQiaoben: finish and polish Batch, advantage_estimation.gae_lambda as in PPO paper - # ShihongSong: Replay(env, pi, advantage_estimation.target_network), use your ReplayMemory, interact as follows. Simplify your advantage_estimation.dqn to run before YongRen's DQN + # ShihongSong: Replay(), see dqn_example.py # maybe a dict to manage the elements to be collected # 4. start training @@ -87,4 +87,6 @@ if __name__ == '__main__': # a clean version with only policy net, no value net # TODO: auto managing of the placeholders? 
or add this to params of data.Batch sess.run(train_op, feed_dict={observation: data['observations'], action: data['actions'], advantage: data['returns']}) minibatch_count += 1 - print('Trained {} minibatches.'.format(minibatch_count)) \ No newline at end of file + print('Trained {} minibatches.'.format(minibatch_count)) + + # TODO: assigning pi to pi_old is not implemented yet \ No newline at end of file diff --git a/tianshou/core/README.md b/tianshou/core/README.md index 16d915e..1e6d7c7 100644 --- a/tianshou/core/README.md +++ b/tianshou/core/README.md @@ -10,8 +10,7 @@ follow OnehotCategorical to write Gaussian, can be in the same file as stochasti not sure how to write, but should at least have act() method to interact with environment -DQN should have an effective argmax_{actions}() method to use as a value network - +referencing QValuePolicy in base.py, should have at least the listed methods. # losses diff --git a/tianshou/core/policy/base.py b/tianshou/core/policy/base.py index b0bf28a..0ae20a1 100644 --- a/tianshou/core/policy/base.py +++ b/tianshou/core/policy/base.py @@ -14,6 +14,33 @@ __all__ = [ 'StochasticPolicy', ] +class QValuePolicy(object): + """ + The policy as in DQN + """ + def __init__(self, value_tensor): + pass + + def act(self, observation, exploration=None): # first implement no exploration + """ + return the action (int) to be executed. + no exploration when exploration=None. + """ + pass + + def values(self, observation): + """ + returns the Q(s, a) values (float) for all actions a at observation s + """ + pass + + def values_tensor(self, observation): + """ + returns the tensor of the values for all actions a at observation s + """ + pass + + class StochasticPolicy(object): """ @@ -194,4 +221,8 @@ class StochasticPolicy(object): """ Private method for subclasses to rewrite the :meth:`prob` method. 
""" - raise NotImplementedError() \ No newline at end of file + raise NotImplementedError() + + +class QValuePolicy(object): + pass \ No newline at end of file diff --git a/tianshou/core/policy/stochastic.py b/tianshou/core/policy/stochastic.py index 37eb1be..3ef463e 100644 --- a/tianshou/core/policy/stochastic.py +++ b/tianshou/core/policy/stochastic.py @@ -70,6 +70,7 @@ class OnehotCategorical(StochasticPolicy): def _act(self, observation): sess = tf.get_default_session() # TODO: this may be ugly. also maybe huge problem when parallel sampled_action = sess.run(tf.multinomial(self.logits, num_samples=1), feed_dict={self._observation_placeholder: observation[None]}) + # observation[None] adds one dimension at the beginning sampled_action = sampled_action[0, 0] From f496725437ae4d80d9284284ce8148922cbab832 Mon Sep 17 00:00:00 2001 From: Haosheng Zou Date: Wed, 13 Dec 2017 22:43:45 +0800 Subject: [PATCH 02/98] add dqn.py to write --- tianshou/core/policy/dqn.py | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 tianshou/core/policy/dqn.py diff --git a/tianshou/core/policy/dqn.py b/tianshou/core/policy/dqn.py new file mode 100644 index 0000000..cfc6abf --- /dev/null +++ b/tianshou/core/policy/dqn.py @@ -0,0 +1,7 @@ + + +from .base import QValuePolicy + + +class DQN(QValuePolicy): + pass \ No newline at end of file From 9ed3e7b09276e072953a997b025a4d55728a5cf4 Mon Sep 17 00:00:00 2001 From: Haosheng Zou Date: Thu, 14 Dec 2017 19:46:38 +0800 Subject: [PATCH 03/98] minor fix --- examples/dqn_example.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/dqn_example.py b/examples/dqn_example.py index 0a5c084..4fbe466 100644 --- a/examples/dqn_example.py +++ b/examples/dqn_example.py @@ -53,7 +53,7 @@ if __name__ == '__main__': action = tf.placeholder(dtype=tf.int32, shape=[None]) # batch of integer actions target = tf.placeholder(dtype=tf.float32, shape=[None]) # target value for DQN - dqn_loss = losses.dqn_loss(action, target, pi) # 
TongzhengRen + dqn_loss = losses.dqn_loss(action, target, q_net) # TongzhengRen total_loss = dqn_loss optimizer = tf.train.AdamOptimizer(1e-3) @@ -61,7 +61,7 @@ if __name__ == '__main__': # 3. define data collection training_data = Replay(env, q_net, advantage_estimation.qlearning_target(target_net)) # - # ShihongSong: Replay(env, pi, advantage_estimation.qlearning_target(target_network)), use your ReplayMemory, interact as follows. Simplify your advantage_estimation.dqn to run before YongRen's DQN + # ShihongSong: Replay(env, q_net, advantage_estimation.qlearning_target(target_network)), use your ReplayMemory, interact as follows. Simplify your advantage_estimation.dqn to run before YongRen's DQN # maybe a dict to manage the elements to be collected # 4. start training From 0874d5342f8bf2a4b32512f701a12affd6093869 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Fri, 15 Dec 2017 14:24:08 +0800 Subject: [PATCH 04/98] implement dqn loss and dpg loss, add TODO for separate actor and critic --- examples/dqn_example.py | 2 +- tianshou/core/README.md | 4 ++++ tianshou/core/losses.py | 27 ++++++++++++++++++++++----- tianshou/core/policy/base.py | 10 ++++------ tianshou/data/README.md | 4 ++++ 5 files changed, 35 insertions(+), 12 deletions(-) diff --git a/examples/dqn_example.py b/examples/dqn_example.py index 0a5c084..6a9e2a6 100644 --- a/examples/dqn_example.py +++ b/examples/dqn_example.py @@ -53,7 +53,7 @@ if __name__ == '__main__': action = tf.placeholder(dtype=tf.int32, shape=[None]) # batch of integer actions target = tf.placeholder(dtype=tf.float32, shape=[None]) # target value for DQN - dqn_loss = losses.dqn_loss(action, target, pi) # TongzhengRen + dqn_loss = losses.dqn_loss(action, target, q_net) # TongzhengRen total_loss = dqn_loss optimizer = tf.train.AdamOptimizer(1e-3) diff --git a/tianshou/core/README.md b/tianshou/core/README.md index 1e6d7c7..3617525 100644 --- a/tianshou/core/README.md +++ b/tianshou/core/README.md @@ -1,3 +1,7 @@ 
+#TODO: + +Separate actor and critic. (Important, we need to focus on that recently) + # policy YongRen diff --git a/tianshou/core/losses.py b/tianshou/core/losses.py index f7d798b..d281df9 100644 --- a/tianshou/core/losses.py +++ b/tianshou/core/losses.py @@ -26,7 +26,7 @@ def vanilla_policy_gradient(sampled_action, reward, pi, baseline="None"): :param sampled_action: placeholder of sampled actions during interaction with the environment :param reward: placeholder of reward the 'sampled_action' get - :param pi: current 'policy' to be optimized + :param pi: current `policy` to be optimized :param baseline: the baseline method used to reduce the variance, default is 'None' :return: """ @@ -35,8 +35,25 @@ def vanilla_policy_gradient(sampled_action, reward, pi, baseline="None"): # TODO: Different baseline methods like REINFORCE, etc. return vanilla_policy_gradient_loss -def temporal_difference_loss(): - pass +def dqn_loss(sampled_action, sampled_target, q_net): + """ + deep q-network -def deterministic_policy_gradient(): - pass \ No newline at end of file + :param sampled_action: placeholder of sampled actions during the interaction with the environment + :param sampled_target: estimated Q(s,a) + :param q_net: current `policy` to be optimized + :return: + """ + action_num = q_net.get_values().shape()[1] + sampled_q = tf.reduce_sum(q_net.get_values() * tf.one_hot(sampled_action, action_num), axis=1) + return tf.reduce_mean(tf.square(sampled_target - sampled_q)) + +def deterministic_policy_gradient(sampled_state, critic): + """ + deterministic policy gradient: + + :param sampled_action: placeholder of sampled actions during the interaction with the environment + :param critic: current `value` function + :return: + """ + return tf.reduce_mean(critic.get_value(sampled_state)) \ No newline at end of file diff --git a/tianshou/core/policy/base.py b/tianshou/core/policy/base.py index 0ae20a1..b6d8d48 100644 --- a/tianshou/core/policy/base.py +++ 
b/tianshou/core/policy/base.py @@ -14,12 +14,14 @@ __all__ = [ 'StochasticPolicy', ] +#TODO: separate actor and critic, we should focus on it once we finish the basic module. + class QValuePolicy(object): """ The policy as in DQN """ - def __init__(self, value_tensor): - pass + def __init__(self, observation_placeholder): + self.observation_placeholder = observation_placeholder def act(self, observation, exploration=None): # first implement no exploration """ @@ -222,7 +224,3 @@ class StochasticPolicy(object): Private method for subclasses to rewrite the :meth:`prob` method. """ raise NotImplementedError() - - -class QValuePolicy(object): - pass \ No newline at end of file diff --git a/tianshou/data/README.md b/tianshou/data/README.md index 241971a..e9e6374 100644 --- a/tianshou/data/README.md +++ b/tianshou/data/README.md @@ -1,3 +1,7 @@ +# TODO: + +Notice that we will separate actor and critic, and batch will collect data for optimizing policy while replay will collect data for optimizing critic. + # Batch YouQiaoben From 00f599bba375c1f9b7614a2467ed031c87c542f7 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Fri, 15 Dec 2017 14:27:04 +0800 Subject: [PATCH 05/98] assign TODO to Haosheng and Tongzheng --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index f1da719..543d237 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,11 @@ Try to use full names. Don't use abbrevations for class/function/variable names The """xxx""" comment should be written right after class/function. Also comment the part that's not intuitive during the code. We must comment, but for now we don't need to polish them. +# High Priority TODO + +For Haosheng and Tongzheng: separate actor and critic, rewrite the interfaces for policy + +Others can still focus on the task below. ## TODO Search based method parallel. 
From 6cb4b02fcad99e2768886878e2234a169b0333cb Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Fri, 15 Dec 2017 22:19:44 +0800 Subject: [PATCH 06/98] merge class strategy with class game. Next, merge Go with GoEnv --- AlphaGo/README.md | 4 + AlphaGo/engine.py | 1 - AlphaGo/game.py | 264 +++-------------- AlphaGo/go.py | 592 ++++++++++++------------------------- AlphaGo/gtp_wrapper.py | 70 ----- AlphaGo/play.py | 3 +- AlphaGo/player.py | 4 + AlphaGo/strategy.py | 35 --- tianshou/core/mcts/mcts.py | 1 + 9 files changed, 244 insertions(+), 730 deletions(-) delete mode 100644 AlphaGo/gtp_wrapper.py diff --git a/AlphaGo/README.md b/AlphaGo/README.md index d21b9bd..720c4d0 100644 --- a/AlphaGo/README.md +++ b/AlphaGo/README.md @@ -10,3 +10,7 @@ Connecting our own policy-value neural network with leela-zero. ## checkpoints: Weights of the policy-value neural network + + +## File Specification + diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index 716d40b..1f9af85 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -188,7 +188,6 @@ class GTPEngine(): def cmd_show_board(self, args, **kwargs): return self._game.board, True - if __name__ == "main": game = Game() engine = GTPEngine(game_obj=Game) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 919a5d5..360921e 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -10,242 +10,49 @@ import copy import tensorflow as tf import numpy as np import sys +import go +import network_small +import strategy from collections import deque +from tianshou.core.mcts.mcts import MCTS import Network -from strategy import strategy - -''' -(1, 1) is considered as the upper left corner of the board, -(size, 1) is the lower left -''' - -DELTA = [[1, 0], [-1, 0], [0, -1], [0, 1]] - - -class Executor: - def __init__(self, **kwargs): - self.game = kwargs['game'] - - def _bfs(self, vertex, color, block, status, alive_break): - block.append(vertex) - status[self.game._flatten(vertex)] = True - nei = self._neighbor(vertex) - for n in nei: - if 
not status[self.game._flatten(n)]: - if self.game.board[self.game._flatten(n)] == color: - self._bfs(n, color, block, status, alive_break) - - def _find_block(self, vertex, alive_break=False): - block = [] - status = [False] * (self.game.size * self.game.size) - color = self.game.board[self.game._flatten(vertex)] - self._bfs(vertex, color, block, status, alive_break) - - for b in block: - for n in self._neighbor(b): - if self.game.board[self.game._flatten(n)] == utils.EMPTY: - return False, block - return True, block - - def _find_boarder(self, vertex): - block = [] - status = [False] * (self.game.size * self.game.size) - self._bfs(vertex, utils.EMPTY, block, status, False) - border = [] - for b in block: - for n in self._neighbor(b): - if not (n in block): - border.append(n) - return border - - def _is_qi(self, color, vertex): - nei = self._neighbor(vertex) - for n in nei: - if self.game.board[self.game._flatten(n)] == utils.EMPTY: - return True - - self.game.board[self.game._flatten(vertex)] = color - for n in nei: - if self.game.board[self.game._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n) - if can_kill: - self.game.board[self.game._flatten(vertex)] = utils.EMPTY - return True - - ### can not suicide - can_kill, block = self._find_block(vertex) - if can_kill: - self.game.board[self.game._flatten(vertex)] = utils.EMPTY - return False - - self.game.board[self.game._flatten(vertex)] = utils.EMPTY - return True - - def _check_global_isomorphous(self, color, vertex): - ##backup - _board = copy.copy(self.game.board) - self.game.board[self.game._flatten(vertex)] = color - self._process_board(color, vertex) - if self.game.board in self.game.history: - res = True - else: - res = False - - self.game.board = _board - return res - - def _in_board(self, vertex): - x, y = vertex - if x < 1 or x > self.game.size: return False - if y < 1 or y > self.game.size: return False - return True - - def _neighbor(self, vertex): - x, y = vertex - nei 
= [] - for d in DELTA: - _x = x + d[0] - _y = y + d[1] - if self._in_board((_x, _y)): - nei.append((_x, _y)) - return nei - - def _process_board(self, color, vertex): - nei = self._neighbor(vertex) - for n in nei: - if self.game.board[self.game._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n, alive_break=True) - if can_kill: - for b in block: - self.game.board[self.game._flatten(b)] = utils.EMPTY - - def is_valid(self, color, vertex): - ### in board - if not self._in_board(vertex): - return False - - ### already have stone - if not self.game.board[self.game._flatten(vertex)] == utils.EMPTY: - return False - - ### check if it is qi - if not self._is_qi(color, vertex): - return False - - if self._check_global_isomorphous(color, vertex): - return False - - return True - - def do_move(self, color, vertex): - if not self.is_valid(color, vertex): - return False - self.game.board[self.game._flatten(vertex)] = color - self._process_board(color, vertex) - self.game.history.append(copy.copy(self.game.board)) - self.game.past.append(copy.copy(self.game.board)) - return True - - def _find_empty(self): - idx = [i for i,x in enumerate(self.game.board) if x == utils.EMPTY ][0] - return self.game._deflatten(idx) - - def get_score(self, is_unknown_estimation = False): - ''' - is_unknown_estimation: whether use nearby stone to predict the unknown - return score from BLACK perspective. 
- ''' - _board = copy.copy(self.game.board) - while utils.EMPTY in self.game.board: - vertex = self._find_empty() - boarder = self._find_boarder(vertex) - boarder_color = set(map(lambda v: self.game.board[self.game._flatten(v)], boarder)) - if boarder_color == {utils.BLACK}: - self.game.board[self.game._flatten(vertex)] = utils.BLACK - elif boarder_color == {utils.WHITE}: - self.game.board[self.game._flatten(vertex)] = utils.WHITE - elif is_unknown_estimation: - self.game.board[self.game._flatten(vertex)] = self._predict_from_nearby(vertex) - else: - self.game.board[self.game._flatten(vertex)] =utils.UNKNOWN - score = 0 - for i in self.game.board: - if i == utils.BLACK: - score += 1 - elif i == utils.WHITE: - score -= 1 - score -= self.game.komi - - self.game.board = _board - return score - - def _predict_from_nearby(self, vertex, neighbor_step = 3): - ''' - step: the nearby 3 steps is considered - :vertex: position to be estimated - :neighbor_step: how many steps nearby - :return: the nearby positions of the input position - currently the nearby 3*3 grid is returned, altogether 4*8 points involved - ''' - for step in range(1, neighbor_step + 1): # check the stones within the steps in range - neighbor_vertex_set = [] - self._add_nearby_stones(neighbor_vertex_set, vertex[0] - step, vertex[1], 1, 1, neighbor_step) - self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] + step, 1, -1, neighbor_step) - self._add_nearby_stones(neighbor_vertex_set, vertex[0] + step, vertex[1], -1, -1, neighbor_step) - self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] - step, -1, 1, neighbor_step) - color_estimate = 0 - for neighbor_vertex in neighbor_vertex_set: - color_estimate += self.game.board[self.game._flatten(neighbor_vertex)] - if color_estimate > 0: - return utils.BLACK - elif color_estimate < 0: - return utils.WHITE - - def _add_nearby_stones(self, neighbor_vertex_set, start_vertex_x, start_vertex_y, x_diff, y_diff, num_step): - ''' - add the nearby 
stones around the input vertex - :param neighbor_vertex_set: input list - :param start_vertex_x: x axis of the input vertex - :param start_vertex_y: y axis of the input vertex - :param x_diff: add x axis - :param y_diff: add y axis - :param num_step: number of steps to be added - :return: - ''' - for step in xrange(num_step): - new_neighbor_vertex = (start_vertex_x, start_vertex_y) - if self._in_board(new_neighbor_vertex): - neighbor_vertex_set.append((start_vertex_x, start_vertex_y)) - start_vertex_x += x_diff - start_vertex_y += y_diff - - - +#from strategy import strategy class Game: + ''' + Load the real game and trained weights. + + TODO : Maybe merge with the engine class in future, + currently leave it untouched for interacting with Go UI. + ''' def __init__(self, size=9, komi=6.5, checkpoint_path=None): self.size = size self.komi = komi self.board = [utils.EMPTY] * (self.size * self.size) - self.strategy = strategy(checkpoint_path) - # self.strategy = None - self.executor = Executor(game=self) self.history = [] self.past = deque(maxlen=8) for _ in range(8): self.past.append(self.board) + self.executor = go.Go(game=self) + #self.strategy = strategy(checkpoint_path) + + self.simulator = strategy.GoEnv() + self.net = network_small.Network() + self.sess = self.net.forward(checkpoint_path) + self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], + feed_dict={self.net.x: state, self.net.is_training: False}) + def _flatten(self, vertex): x, y = vertex return (y - 1) * self.size + (x - 1) def _deflatten(self, idx): x = idx % self.size + 1 - y = idx // self.size + 1 + y = idx // self.size + 1 return (x,y) - def clear(self): self.board = [utils.EMPTY] * (self.size * self.size) self.history = [] @@ -259,8 +66,30 @@ class Game: def set_komi(self, k): self.komi = k - def check_valid(self, color, vertex): - return self.executor.is_valid(color, vertex) + def data_process(self, history, color): + state = np.zeros([1, self.simulator.size, 
self.simulator.size, 17]) + for i in range(8): + state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(self.simulator.size ** 2)).reshape(self.simulator.size, self.simulator.size) + state[0, :, :, i + 8] = np.array(np.array(history[i]) == -np.ones(self.simulator.size ** 2)).reshape(self.simulator.size, self.simulator.size) + if color == utils.BLACK: + state[0, :, :, 16] = np.ones([self.simulator.size, self.simulator.size]) + if color == utils.WHITE: + state[0, :, :, 16] = np.zeros([self.simulator.size, self.simulator.size]) + return state + + def strategy_gen_move(self, history, color): + self.simulator.history = copy.copy(history) + self.simulator.board = copy.copy(history[-1]) + state = self.data_process(self.simulator.history, color) + mcts = MCTS(self.simulator, self.evaluator, state, self.simulator.size ** 2 + 1, inverse=True, max_step=10) + temp = 1 + prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) + choice = np.random.choice(self.simulator.size ** 2 + 1, 1, p=prob).tolist()[0] + if choice == self.simulator.size ** 2: + move = utils.PASS + else: + move = (choice % self.simulator.size + 1, choice / self.simulator.size + 1) + return move, prob def do_move(self, color, vertex): if vertex == utils.PASS: @@ -271,7 +100,7 @@ class Game: def gen_move(self, color): # move = self.strategy.gen_move(color) # return move - move, self.prob = self.strategy.gen_move(self.past, color) + move, self.prob = self.strategy_gen_move(self.past, color) self.do_move(color, move) return move @@ -295,7 +124,6 @@ class Game: print('') sys.stdout.flush() - if __name__ == "__main__": g = Game() g.show_board() diff --git a/AlphaGo/go.py b/AlphaGo/go.py index b83d305..26540e1 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -1,428 +1,212 @@ -''' -A board is a NxN numpy array. -A Coordinate is a tuple index into the board. -A Move is a (Coordinate c | None). 
-A PlayerMove is a (Color, Move) tuple -(0, 0) is considered to be the upper left corner of the board, and (18, 0) is the lower left. -''' -from collections import namedtuple +from __future__ import print_function +import utils import copy -import itertools +import sys +from collections import deque -import numpy as np +''' +Settings of the Go game. -# Represent a board as a numpy array, with 0 empty, 1 is black, -1 is white. -# This means that swapping colors is as simple as multiplying array by -1. -WHITE, EMPTY, BLACK, FILL, KO, UNKNOWN = range(-1, 5) +(1, 1) is considered as the upper left corner of the board, +(size, 1) is the lower left +''' + +NEIGHBOR_OFFSET = [[1, 0], [-1, 0], [0, -1], [0, 1]] -class PlayerMove(namedtuple('PlayerMove', ['color', 'move'])): pass +class Go: + def __init__(self, **kwargs): + self.game = kwargs['game'] + def _bfs(self, vertex, color, block, status, alive_break): + block.append(vertex) + status[self.game._flatten(vertex)] = True + nei = self._neighbor(vertex) + for n in nei: + if not status[self.game._flatten(n)]: + if self.game.board[self.game._flatten(n)] == color: + self._bfs(n, color, block, status, alive_break) -# Represents "group not found" in the LibertyTracker object -MISSING_GROUP_ID = -1 + def _find_block(self, vertex, alive_break=False): + block = [] + status = [False] * (self.game.size * self.game.size) + color = self.game.board[self.game._flatten(vertex)] + self._bfs(vertex, color, block, status, alive_break) + for b in block: + for n in self._neighbor(b): + if self.game.board[self.game._flatten(n)] == utils.EMPTY: + return False, block + return True, block -class IllegalMove(Exception): pass + def _find_boarder(self, vertex): + block = [] + status = [False] * (self.game.size * self.game.size) + self._bfs(vertex, utils.EMPTY, block, status, False) + border = [] + for b in block: + for n in self._neighbor(b): + if not (n in block): + border.append(n) + return border + def _is_qi(self, color, vertex): + nei = 
self._neighbor(vertex) + for n in nei: + if self.game.board[self.game._flatten(n)] == utils.EMPTY: + return True -# these are initialized by set_board_size -N = None -ALL_COORDS = [] -EMPTY_BOARD = None -NEIGHBORS = {} -DIAGONALS = {} + self.game.board[self.game._flatten(vertex)] = color + for n in nei: + if self.game.board[self.game._flatten(n)] == utils.another_color(color): + can_kill, block = self._find_block(n) + if can_kill: + self.game.board[self.game._flatten(vertex)] = utils.EMPTY + return True - -def set_board_size(n): - ''' - Hopefully nobody tries to run both 9x9 and 19x19 game instances at once. - Also, never do "from go import N, W, ALL_COORDS, EMPTY_BOARD". - ''' - global N, ALL_COORDS, EMPTY_BOARD, NEIGHBORS, DIAGONALS - if N == n: return - N = n - ALL_COORDS = [(i, j) for i in range(n) for j in range(n)] - EMPTY_BOARD = np.zeros([n, n], dtype=np.int8) - - def check_bounds(c): - return c[0] % n == c[0] and c[1] % n == c[1] - - NEIGHBORS = {(x, y): list(filter(check_bounds, [(x + 1, y), (x - 1, y), (x, y + 1), (x, y - 1)])) for x, y in - ALL_COORDS} - DIAGONALS = {(x, y): list(filter(check_bounds, [(x + 1, y + 1), (x + 1, y - 1), (x - 1, y + 1), (x - 1, y - 1)])) - for x, y in ALL_COORDS} - - -def place_stones(board, color, stones): - for s in stones: - board[s] = color - - -def find_reached(board, c): - # that can reach from one place - color = board[c] - chain = set([c]) - reached = set() - frontier = [c] - while frontier: - current = frontier.pop() - chain.add(current) - for n in NEIGHBORS[current]: - if board[n] == color and (not n in chain): - frontier.append(n) - elif board[n] != color: - reached.add(n) - return chain, reached - - -def is_koish(board, c): - 'Check if c is surrounded on all sides by 1 color, and return that color' - if board[c] != EMPTY: return None - neighbors = {board[n] for n in NEIGHBORS[c]} - if len(neighbors) == 1 and not EMPTY in neighbors: - return list(neighbors)[0] - else: - return None - - -def is_eyeish(board, c): - 
'Check if c is an eye, for the purpose of restricting MC rollouts.' - color = is_koish(board, c) - if color is None: - return None - diagonal_faults = 0 - diagonals = DIAGONALS[c] - if len(diagonals) < 4: - diagonal_faults += 1 - for d in diagonals: - if not board[d] in (color, EMPTY): - diagonal_faults += 1 - if diagonal_faults > 1: - return None - else: - return color - - -class Group(namedtuple('Group', ['id', 'stones', 'liberties', 'color'])): - ''' - stones: a set of Coordinates belonging to this group - liberties: a set of Coordinates that are empty and adjacent to this group. - color: color of this group - ''' - - def __eq__(self, other): - return self.stones == other.stones and self.liberties == other.liberties and self.color == other.color - - -class LibertyTracker(object): - @staticmethod - def from_board(board): - board = np.copy(board) - curr_group_id = 0 - lib_tracker = LibertyTracker() - for color in (WHITE, BLACK): - while color in board: - curr_group_id += 1 - found_color = np.where(board == color) - coord = found_color[0][0], found_color[1][0] - chain, reached = find_reached(board, coord) - liberties = set(r for r in reached if board[r] == EMPTY) - new_group = Group(curr_group_id, chain, liberties, color) - lib_tracker.groups[curr_group_id] = new_group - for s in chain: - lib_tracker.group_index[s] = curr_group_id - place_stones(board, FILL, chain) - - lib_tracker.max_group_id = curr_group_id - - liberty_counts = np.zeros([N, N], dtype=np.uint8) - for group in lib_tracker.groups.values(): - num_libs = len(group.liberties) - for s in group.stones: - liberty_counts[s] = num_libs - lib_tracker.liberty_cache = liberty_counts - - return lib_tracker - - def __init__(self, group_index=None, groups=None, liberty_cache=None, max_group_id=1): - # group_index: a NxN numpy array of group_ids. 
-1 means no group - # groups: a dict of group_id to groups - # liberty_cache: a NxN numpy array of liberty counts - self.group_index = group_index if group_index is not None else -np.ones([N, N], dtype=np.int32) - self.groups = groups or {} - self.liberty_cache = liberty_cache if liberty_cache is not None else np.zeros([N, N], dtype=np.uint8) - self.max_group_id = max_group_id - - def __deepcopy__(self, memodict={}): - new_group_index = np.copy(self.group_index) - new_lib_cache = np.copy(self.liberty_cache) - new_groups = { - group.id: Group(group.id, set(group.stones), set(group.liberties), group.color) - for group in self.groups.values() - } - return LibertyTracker(new_group_index, new_groups, liberty_cache=new_lib_cache, max_group_id=self.max_group_id) - - def add_stone(self, color, c): - assert self.group_index[c] == MISSING_GROUP_ID - captured_stones = set() - opponent_neighboring_group_ids = set() - friendly_neighboring_group_ids = set() - empty_neighbors = set() - - for n in NEIGHBORS[c]: - neighbor_group_id = self.group_index[n] - if neighbor_group_id != MISSING_GROUP_ID: - neighbor_group = self.groups[neighbor_group_id] - if neighbor_group.color == color: - friendly_neighboring_group_ids.add(neighbor_group_id) - else: - opponent_neighboring_group_ids.add(neighbor_group_id) - else: - empty_neighbors.add(n) - - new_group = self._create_group(color, c, empty_neighbors) - - for group_id in friendly_neighboring_group_ids: - new_group = self._merge_groups(group_id, new_group.id) - - for group_id in opponent_neighboring_group_ids: - neighbor_group = self.groups[group_id] - if len(neighbor_group.liberties) == 1: - captured = self._capture_group(group_id) - captured_stones.update(captured) - else: - self._update_liberties(group_id, remove={c}) - - self._handle_captures(captured_stones) - - # suicide is illegal - if len(new_group.liberties) == 0: - raise IllegalMove("Move at {} would commit suicide!\n".format(c)) - - return captured_stones - - def 
_create_group(self, color, c, liberties): - self.max_group_id += 1 - new_group = Group(self.max_group_id, set([c]), liberties, color) - self.groups[new_group.id] = new_group - self.group_index[c] = new_group.id - self.liberty_cache[c] = len(liberties) - return new_group - - def _merge_groups(self, group1_id, group2_id): - group1 = self.groups[group1_id] - group2 = self.groups[group2_id] - group1.stones.update(group2.stones) - del self.groups[group2_id] - for s in group2.stones: - self.group_index[s] = group1_id - - self._update_liberties(group1_id, add=group2.liberties, remove=(group2.stones | group1.stones)) - - return group1 - - def _capture_group(self, group_id): - dead_group = self.groups[group_id] - del self.groups[group_id] - for s in dead_group.stones: - self.group_index[s] = MISSING_GROUP_ID - self.liberty_cache[s] = 0 - return dead_group.stones - - def _update_liberties(self, group_id, add=None, remove=None): - group = self.groups[group_id] - if add: - group.liberties.update(add) - if remove: - group.liberties.difference_update(remove) - - new_lib_count = len(group.liberties) - for s in group.stones: - self.liberty_cache[s] = new_lib_count - - def _handle_captures(self, captured_stones): - for s in captured_stones: - for n in NEIGHBORS[s]: - group_id = self.group_index[n] - if group_id != MISSING_GROUP_ID: - self._update_liberties(group_id, add={s}) - - -class Position(): - def __init__(self, board=None, n=0, komi=7.5, caps=(0, 0), lib_tracker=None, ko=None, recent=tuple(), - to_play=BLACK): - ''' - board: a numpy array - n: an int representing moves played so far - komi: a float, representing points given to the second player. - caps: a (int, int) tuple of captures for B, W. - lib_tracker: a LibertyTracker object - ko: a Move - recent: a tuple of PlayerMoves, such that recent[-1] is the last move. 
- to_play: BLACK or WHITE - ''' - self.board = board if board is not None else np.copy(EMPTY_BOARD) - self.n = n - self.komi = komi - self.caps = caps - self.lib_tracker = lib_tracker or LibertyTracker.from_board(self.board) - self.ko = ko - self.recent = recent - self.to_play = to_play - - def __deepcopy__(self, memodict={}): - new_board = np.copy(self.board) - new_lib_tracker = copy.deepcopy(self.lib_tracker) - return Position(new_board, self.n, self.komi, self.caps, new_lib_tracker, self.ko, self.recent, self.to_play) - - def __str__(self): - pretty_print_map = { - WHITE: '\x1b[0;31;47mO', - EMPTY: '\x1b[0;31;43m.', - BLACK: '\x1b[0;31;40mX', - FILL: '#', - KO: '*', - } - board = np.copy(self.board) - captures = self.caps - if self.ko is not None: - place_stones(board, KO, [self.ko]) - raw_board_contents = [] - for i in range(N): - row = [] - for j in range(N): - appended = '<' if (self.recent and (i, j) == self.recent[-1].move) else ' ' - row.append(pretty_print_map[board[i, j]] + appended) - row.append('\x1b[0m') - raw_board_contents.append(''.join(row)) - - row_labels = ['%2d ' % i for i in range(N, 0, -1)] - annotated_board_contents = [''.join(r) for r in zip(row_labels, raw_board_contents, row_labels)] - header_footer_rows = [' ' + ' '.join('ABCDEFGHJKLMNOPQRST'[:N]) + ' '] - annotated_board = '\n'.join(itertools.chain(header_footer_rows, annotated_board_contents, header_footer_rows)) - details = "\nMove: {}. 
Captures X: {} O: {}\n".format(self.n, *captures) - return annotated_board + details - - def is_move_suicidal(self, move): - potential_libs = set() - for n in NEIGHBORS[move]: - neighbor_group_id = self.lib_tracker.group_index[n] - if neighbor_group_id == MISSING_GROUP_ID: - # at least one liberty after playing here, so not a suicide - return False - neighbor_group = self.lib_tracker.groups[neighbor_group_id] - if neighbor_group.color == self.to_play: - potential_libs |= neighbor_group.liberties - elif len(neighbor_group.liberties) == 1: - # would capture an opponent group if they only had one lib. - return False - # it's possible to suicide by connecting several friendly groups - # each of which had one liberty. - potential_libs -= set([move]) - return not potential_libs - - def is_move_legal(self, move): - 'Checks that a move is on an empty space, not on ko, and not suicide' - if move is None: - return True - if self.board[move] != EMPTY: + ### can not suicide + can_kill, block = self._find_block(vertex) + if can_kill: + self.game.board[self.game._flatten(vertex)] = utils.EMPTY return False - if move == self.ko: + + self.game.board[self.game._flatten(vertex)] = utils.EMPTY + return True + + def _check_global_isomorphous(self, color, vertex): + ##backup + _board = copy.copy(self.game.board) + self.game.board[self.game._flatten(vertex)] = color + self._process_board(color, vertex) + if self.game.board in self.game.history: + res = True + else: + res = False + + self.game.board = _board + return res + + def _in_board(self, vertex): + x, y = vertex + if x < 1 or x > self.game.size: return False + if y < 1 or y > self.game.size: return False + return True + + def _neighbor(self, vertex): + x, y = vertex + nei = [] + for d in NEIGHBOR_OFFSET: + _x = x + d[0] + _y = y + d[1] + if self._in_board((_x, _y)): + nei.append((_x, _y)) + return nei + + def _process_board(self, color, vertex): + nei = self._neighbor(vertex) + for n in nei: + if 
self.game.board[self.game._flatten(n)] == utils.another_color(color): + can_kill, block = self._find_block(n, alive_break=True) + if can_kill: + for b in block: + self.game.board[self.game._flatten(b)] = utils.EMPTY + + def is_valid(self, color, vertex): + ### in board + if not self._in_board(vertex): return False - if self.is_move_suicidal(move): + + ### already have stone + if not self.game.board[self.game._flatten(vertex)] == utils.EMPTY: + return False + + ### check if it is qi + if not self._is_qi(color, vertex): + return False + + if self._check_global_isomorphous(color, vertex): return False return True - def pass_move(self, mutate=False): - pos = self if mutate else copy.deepcopy(self) - pos.n += 1 - pos.recent += (PlayerMove(pos.to_play, None),) - pos.to_play *= -1 - pos.ko = None - return pos + def do_move(self, color, vertex): + if not self.is_valid(color, vertex): + return False + self.game.board[self.game._flatten(vertex)] = color + self._process_board(color, vertex) + self.game.history.append(copy.copy(self.game.board)) + self.game.past.append(copy.copy(self.game.board)) + return True - def flip_playerturn(self, mutate=False): - pos = self if mutate else copy.deepcopy(self) - pos.ko = None - pos.to_play *= -1 - return pos + def _find_empty(self): + idx = [i for i,x in enumerate(self.game.board) if x == utils.EMPTY ][0] + return self.game._deflatten(idx) - def get_liberties(self): - return self.lib_tracker.liberty_cache - - def play_move(self, c, color=None, mutate=False): - # Obeys CGOS Rules of Play. In short: - # No suicides - # Chinese/area scoring - # Positional superko (this is very crudely approximate at the moment.) 
- if color is None: - color = self.to_play - - pos = self if mutate else copy.deepcopy(self) - - if c is None: - pos = pos.pass_move(mutate=mutate) - return pos - - if not self.is_move_legal(c): - raise IllegalMove("Move at {} is illegal: \n{}".format(c, self)) - - # check must be done before potentially mutating the board - potential_ko = is_koish(self.board, c) - - place_stones(pos.board, color, [c]) - captured_stones = pos.lib_tracker.add_stone(color, c) - place_stones(pos.board, EMPTY, captured_stones) - - opp_color = color * -1 - - if len(captured_stones) == 1 and potential_ko == opp_color: - new_ko = list(captured_stones)[0] - else: - new_ko = None - - if pos.to_play == BLACK: - new_caps = (pos.caps[0] + len(captured_stones), pos.caps[1]) - else: - new_caps = (pos.caps[0], pos.caps[1] + len(captured_stones)) - - pos.n += 1 - pos.caps = new_caps - pos.ko = new_ko - pos.recent += (PlayerMove(color, c),) - pos.to_play *= -1 - return pos - - def score(self): - 'Return score from B perspective. If W is winning, score is negative.' - working_board = np.copy(self.board) - while EMPTY in working_board: - unassigned_spaces = np.where(working_board == EMPTY) - c = unassigned_spaces[0][0], unassigned_spaces[1][0] - territory, borders = find_reached(working_board, c) - border_colors = set(working_board[b] for b in borders) - X_border = BLACK in border_colors - O_border = WHITE in border_colors - if X_border and not O_border: - territory_color = BLACK - elif O_border and not X_border: - territory_color = WHITE + def get_score(self, is_unknown_estimation = False): + ''' + is_unknown_estimation: whether use nearby stone to predict the unknown + return score from BLACK perspective. 
+ ''' + _board = copy.copy(self.game.board) + while utils.EMPTY in self.game.board: + vertex = self._find_empty() + boarder = self._find_boarder(vertex) + boarder_color = set(map(lambda v: self.game.board[self.game._flatten(v)], boarder)) + if boarder_color == {utils.BLACK}: + self.game.board[self.game._flatten(vertex)] = utils.BLACK + elif boarder_color == {utils.WHITE}: + self.game.board[self.game._flatten(vertex)] = utils.WHITE + elif is_unknown_estimation: + self.game.board[self.game._flatten(vertex)] = self._predict_from_nearby(vertex) else: - territory_color = UNKNOWN # dame, or seki - place_stones(working_board, territory_color, territory) + self.game.board[self.game._flatten(vertex)] =utils.UNKNOWN + score = 0 + for i in self.game.board: + if i == utils.BLACK: + score += 1 + elif i == utils.WHITE: + score -= 1 + score -= self.game.komi - return np.count_nonzero(working_board == BLACK) - np.count_nonzero(working_board == WHITE) - self.komi + self.game.board = _board + return score - def result(self): - score = self.score() - if score > 0: - return 'B+' + '%.1f' % score - elif score < 0: - return 'W+' + '%.1f' % abs(score) - else: - return 'DRAW' + def _predict_from_nearby(self, vertex, neighbor_step = 3): + ''' + step: the nearby 3 steps is considered + :vertex: position to be estimated + :neighbor_step: how many steps nearby + :return: the nearby positions of the input position + currently the nearby 3*3 grid is returned, altogether 4*8 points involved + ''' + for step in range(1, neighbor_step + 1): # check the stones within the steps in range + neighbor_vertex_set = [] + self._add_nearby_stones(neighbor_vertex_set, vertex[0] - step, vertex[1], 1, 1, neighbor_step) + self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] + step, 1, -1, neighbor_step) + self._add_nearby_stones(neighbor_vertex_set, vertex[0] + step, vertex[1], -1, -1, neighbor_step) + self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] - step, -1, 1, neighbor_step) + 
color_estimate = 0 + for neighbor_vertex in neighbor_vertex_set: + color_estimate += self.game.board[self.game._flatten(neighbor_vertex)] + if color_estimate > 0: + return utils.BLACK + elif color_estimate < 0: + return utils.WHITE - -set_board_size(19) + def _add_nearby_stones(self, neighbor_vertex_set, start_vertex_x, start_vertex_y, x_diff, y_diff, num_step): + ''' + add the nearby stones around the input vertex + :param neighbor_vertex_set: input list + :param start_vertex_x: x axis of the input vertex + :param start_vertex_y: y axis of the input vertex + :param x_diff: add x axis + :param y_diff: add y axis + :param num_step: number of steps to be added + :return: + ''' + for step in xrange(num_step): + new_neighbor_vertex = (start_vertex_x, start_vertex_y) + if self._in_board(new_neighbor_vertex): + neighbor_vertex_set.append((start_vertex_x, start_vertex_y)) + start_vertex_x += x_diff + start_vertex_y += y_diff diff --git a/AlphaGo/gtp_wrapper.py b/AlphaGo/gtp_wrapper.py deleted file mode 100644 index 1da8f03..0000000 --- a/AlphaGo/gtp_wrapper.py +++ /dev/null @@ -1,70 +0,0 @@ -import gtp -import go -import utils - - -def translate_gtp_colors(gtp_color): - if gtp_color == gtp.BLACK: - return go.BLACK - elif gtp_color == gtp.WHITE: - return go.WHITE - else: - return go.EMPTY - - -class GtpInterface(object): - def __init__(self): - self.size = 9 - self.position = None - self.komi = 6.5 - self.clear() - - def set_size(self, n): - self.size = n - go.set_board_size(n) - self.clear() - - def set_komi(self, komi): - self.komi = komi - self.position.komi = komi - - def clear(self): - self.position = go.Position(komi=self.komi) - - def accomodate_out_of_turn(self, color): - if not translate_gtp_colors(color) == self.position.to_play: - self.position.flip_playerturn(mutate=True) - - def make_move(self, color, vertex): - coords = utils.parse_pygtp_coords(vertex) - self.accomodate_out_of_turn(color) - try: - self.position = self.position.play_move(coords, 
color=translate_gtp_colors(color)) - except go.IllegalMove: - return False - return True - - def get_move(self, color): - self.accomodate_out_of_turn(color) - if self.should_resign(self.position): - return gtp.RESIGN - - if self.should_pass(self.position): - return gtp.PASS - - move = self.suggest_move(self.position) - return utils.unparse_pygtp_coords(move) - - def should_resign(self, position): - if position.caps[0] + 50 < position.caps[1]: - return gtp.RESIGN - - def should_pass(self, position): - # Pass if the opponent passes - return position.n > 100 and position.recent and position.recent[-1].move == None - - def get_score(self): - return self.position.result() - - def suggest_move(self, position): - raise NotImplementedError diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 18ce869..180186a 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -13,12 +13,11 @@ print "Start Name Sever : " + str(start_new_server.pid)# + str(start_new_server. time.sleep(1) agent_v0 = subprocess.Popen(['python', '-u', 'player.py', '--role=black'], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) -time.sleep(3) print "Start Player 0 at : " + str(agent_v0.pid) agent_v1 = subprocess.Popen(['python', '-u', 'player.py', '--role=white', '--checkpoint_path=./checkpoints_origin/'], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) -time.sleep(3) print "Start Player 1 at : " + str(agent_v1.pid) +time.sleep(5) player = [None] * 2 player[0] = Pyro4.Proxy("PYRONAME:black") diff --git a/AlphaGo/player.py b/AlphaGo/player.py index 36965a9..8245c38 100644 --- a/AlphaGo/player.py +++ b/AlphaGo/player.py @@ -8,6 +8,10 @@ from engine import GTPEngine @Pyro4.expose class Player(object): + """ + This is the class which defines the object called by Pyro4 (Python remote object). + It passes the command to our engine, and return the result. 
+ """ def __init__(self, **kwargs): self.role = kwargs['role'] self.engine = kwargs['engine'] diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index 327111d..5a55002 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -13,7 +13,6 @@ from tianshou.core.mcts.mcts import MCTS DELTA = [[1, 0], [-1, 0], [0, -1], [0, 1]] CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] - class GoEnv: def __init__(self, size=9, komi=6.5): self.size = size @@ -221,37 +220,3 @@ class GoEnv: np.array(1 - state[:, :, :, -1]).reshape(1, self.size, self.size, 1)], axis=3) return new_state, 0 - - -class strategy(object): - def __init__(self, checkpoint_path): - self.simulator = GoEnv() - self.net = network_small.Network() - self.sess = self.net.forward(checkpoint_path) - self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], - feed_dict={self.net.x: state, self.net.is_training: False}) - - def data_process(self, history, color): - state = np.zeros([1, self.simulator.size, self.simulator.size, 17]) - for i in range(8): - state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(self.simulator.size ** 2)).reshape(self.simulator.size, self.simulator.size) - state[0, :, :, i + 8] = np.array(np.array(history[i]) == -np.ones(self.simulator.size ** 2)).reshape(self.simulator.size, self.simulator.size) - if color == utils.BLACK: - state[0, :, :, 16] = np.ones([self.simulator.size, self.simulator.size]) - if color == utils.WHITE: - state[0, :, :, 16] = np.zeros([self.simulator.size, self.simulator.size]) - return state - - def gen_move(self, history, color): - self.simulator.history = copy.copy(history) - self.simulator.board = copy.copy(history[-1]) - state = self.data_process(self.simulator.history, color) - mcts = MCTS(self.simulator, self.evaluator, state, self.simulator.size ** 2 + 1, inverse=True, max_step=10) - temp = 1 - prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) - choice = np.random.choice(self.simulator.size ** 2 + 1, 1, 
p=prob).tolist()[0] - if choice == self.simulator.size ** 2: - move = utils.PASS - else: - move = (choice % self.simulator.size + 1, choice / self.simulator.size + 1) - return move, prob diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index e29d919..47b0768 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -168,6 +168,7 @@ class MCTS(object): if max_step is None and max_time is None: raise ValueError("Need a stop criteria!") + # TODO: running mcts should be implemented in another function, e.g. def search(self, max_step, max_time) self.select_time = [] self.evaluate_time = [] self.bp_time = [] From b8bdfea8bd9e01115c570e9e2de6cdb7c3633b46 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Sat, 16 Dec 2017 14:33:31 +0800 Subject: [PATCH 07/98] start the player server in a more robost way. --- AlphaGo/play.py | 110 +++++++++++++++++++++++++----------------------- 1 file changed, 58 insertions(+), 52 deletions(-) diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 180186a..242ba52 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -4,63 +4,69 @@ import re import Pyro4 import time -#start a name server to find the remote object -kill_old_server = subprocess.Popen(['killall', 'pyro4-ns']) -print "kill old server, the return code is : " + str(kill_old_server.wait()) -time.sleep(1) -start_new_server = subprocess.Popen(['pyro4-ns', '&']) -print "Start Name Sever : " + str(start_new_server.pid)# + str(start_new_server.wait()) -time.sleep(1) -agent_v0 = subprocess.Popen(['python', '-u', 'player.py', '--role=black'], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) -print "Start Player 0 at : " + str(agent_v0.pid) -agent_v1 = subprocess.Popen(['python', '-u', 'player.py', '--role=white', '--checkpoint_path=./checkpoints_origin/'], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) -print "Start Player 1 at : " + str(agent_v1.pid) -time.sleep(5) +if __name__ == '__main__': + # start 
a name server to find the remote object + kill_old_server = subprocess.Popen(['killall', 'pyro4-ns']) + print "kill old server, the return code is : " + str(kill_old_server.wait()) + time.sleep(1) + start_new_server = subprocess.Popen(['pyro4-ns', '&']) + print "Start Name Sever : " + str(start_new_server.pid) # + str(start_new_server.wait()) + time.sleep(1) + agent_v0 = subprocess.Popen(['python', '-u', 'player.py', '--role=black'], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + agent_v1 = subprocess.Popen(['python', '-u', 'player.py', '--role=white', '--checkpoint_path=./checkpoints_origin/'], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + server_list = "" + while ("black" not in server_list) or ("white" not in server_list): + server_list = subprocess.check_output(['pyro4-nsc', 'list']) + print "Waining for the server start..." + time.sleep(1) + print server_list + print "Start black player at : " + str(agent_v0.pid) + print "Start white player at : " + str(agent_v1.pid) -player = [None] * 2 -player[0] = Pyro4.Proxy("PYRONAME:black") -player[1] = Pyro4.Proxy("PYRONAME:white") + player = [None] * 2 + player[0] = Pyro4.Proxy("PYRONAME:black") + player[1] = Pyro4.Proxy("PYRONAME:white") -role = ["BLACK", "WHITE"] -color = ['b', 'w'] + role = ["BLACK", "WHITE"] + color = ['b', 'w'] -pattern = "[A-Z]{1}[0-9]{1}" -size = 9 -show = ['.', 'X', 'O'] + pattern = "[A-Z]{1}[0-9]{1}" + size = 9 + show = ['.', 'X', 'O'] -game_num = 0 -while game_num < 1: - num = 0 - pass_flag = [False, False] - print("Start game {}".format(game_num)) - # end the game if both palyer chose to pass, or play too much turns - while not (pass_flag[0] and pass_flag[1]) and num < size ** 2 * 2: - turn = num % 2 - move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn] + '\n') - print role[turn] + " : " + str(move), - num += 1 - match = re.search(pattern, move) - if match is not None: - #print "match : " + str(match.group()) - 
play_or_pass = match.group() - pass_flag[turn] = False - else: - #print "no match" - play_or_pass = ' PASS' - pass_flag[turn] = True - result = player[1 - turn].run_cmd(str(num) + ' play ' + color[turn] + ' ' + play_or_pass + '\n') - board = player[turn].run_cmd(str(num) + ' show_board') - board = eval(board[board.index('['):board.index(']') + 1]) - for i in range(size): - for j in range(size): - print show[board[i * size + j]] + " ", - print "\n", + game_num = 0 + while game_num < 1: + num = 0 + pass_flag = [False, False] + print("Start game {}".format(game_num)) + # end the game if both palyer chose to pass, or play too much turns + while not (pass_flag[0] and pass_flag[1]) and num < size ** 2 * 2: + turn = num % 2 + move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn] + '\n') + print role[turn] + " : " + str(move), + num += 1 + match = re.search(pattern, move) + if match is not None: + # print "match : " + str(match.group()) + play_or_pass = match.group() + pass_flag[turn] = False + else: + # print "no match" + play_or_pass = ' PASS' + pass_flag[turn] = True + result = player[1 - turn].run_cmd(str(num) + ' play ' + color[turn] + ' ' + play_or_pass + '\n') + board = player[turn].run_cmd(str(num) + ' show_board') + board = eval(board[board.index('['):board.index(']') + 1]) + for i in range(size): + for j in range(size): + print show[board[i * size + j]] + " ", + print "\n", - score = player[turn].run_cmd(str(num) + ' get_score') - print "Finished : ", score.split(" ")[1] - player[0].run_cmd(str(num) + ' clear_board') + score = player[turn].run_cmd(str(num) + ' get_score') + print "Finished : ", score.split(" ")[1] + player[0].run_cmd(str(num) + ' clear_board') player[1].run_cmd(str(num) + ' clear_board') game_num += 1 From 431f551ce9ce44015e7563ffc5db06f9caf8fd2e Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Sat, 16 Dec 2017 14:55:19 +0800 Subject: [PATCH 08/98] check if the network weights exists for every player --- AlphaGo/play.py | 32 
++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 242ba52..d6e6138 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -3,15 +3,34 @@ import sys import re import Pyro4 import time +import os if __name__ == '__main__': - # start a name server to find the remote object + """ + Starting two different players which load network weights to evaluate the winning ratio. + Note that, this function requires the installation of the Pyro4 library. + """ + # TODO : we should set the network path in a more configurable way. + black_weight_path = "./checkpoints" + white_weight_path = "./checkpoints_origin" + if (not os.path.exists(black_weight_path)): + print "Can't not find the network weights for black player." + sys.exit() + if (not os.path.exists(white_weight_path)): + print "Can't not find the network weights for white player." + sys.exit() + + # kill the old server kill_old_server = subprocess.Popen(['killall', 'pyro4-ns']) - print "kill old server, the return code is : " + str(kill_old_server.wait()) + print "kill the old pyro4 name server, the return code is : " + str(kill_old_server.wait()) time.sleep(1) + + # start a name server to find the remote object start_new_server = subprocess.Popen(['pyro4-ns', '&']) print "Start Name Sever : " + str(start_new_server.pid) # + str(start_new_server.wait()) time.sleep(1) + + # start two different player with different network weights. 
agent_v0 = subprocess.Popen(['python', '-u', 'player.py', '--role=black'], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) agent_v1 = subprocess.Popen(['python', '-u', 'player.py', '--role=white', '--checkpoint_path=./checkpoints_origin/'], @@ -36,8 +55,9 @@ if __name__ == '__main__': size = 9 show = ['.', 'X', 'O'] + evaluate_rounds = 1 game_num = 0 - while game_num < 1: + while game_num < evaluate_rounds: num = 0 pass_flag = [False, False] print("Start game {}".format(game_num)) @@ -70,6 +90,6 @@ if __name__ == '__main__': player[1].run_cmd(str(num) + ' clear_board') game_num += 1 -subprocess.call(["kill", "-9", str(agent_v0.pid)]) -subprocess.call(["kill", "-9", str(agent_v1.pid)]) -print "Kill all player, finish all game." + subprocess.call(["kill", "-9", str(agent_v0.pid)]) + subprocess.call(["kill", "-9", str(agent_v1.pid)]) + print "Kill all player, finish all game." From e10acf51303bf5fb6c246bf1e9d87ba3aede92bb Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Sat, 16 Dec 2017 23:29:11 +0800 Subject: [PATCH 09/98] 0. 
code refactor, try to merge Go and GoEnv --- AlphaGo/game.py | 41 ++++++++++++++------------- AlphaGo/go.py | 2 +- AlphaGo/play.py | 4 +-- AlphaGo/strategy.py | 57 +++++++++++++++++++++----------------- tianshou/core/mcts/mcts.py | 2 +- 5 files changed, 57 insertions(+), 49 deletions(-) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 360921e..02ccb27 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -31,14 +31,14 @@ class Game: self.komi = komi self.board = [utils.EMPTY] * (self.size * self.size) self.history = [] - self.past = deque(maxlen=8) + self.latest_boards = deque(maxlen=8) for _ in range(8): - self.past.append(self.board) + self.latest_boards.append(self.board) self.executor = go.Go(game=self) #self.strategy = strategy(checkpoint_path) - self.simulator = strategy.GoEnv() + self.simulator = strategy.GoEnv(game=self) self.net = network_small.Network() self.sess = self.net.forward(checkpoint_path) self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], @@ -57,7 +57,7 @@ class Game: self.board = [utils.EMPTY] * (self.size * self.size) self.history = [] for _ in range(8): - self.past.append(self.board) + self.latest_boards.append(self.board) def set_size(self, n): self.size = n @@ -66,29 +66,29 @@ class Game: def set_komi(self, k): self.komi = k - def data_process(self, history, color): - state = np.zeros([1, self.simulator.size, self.simulator.size, 17]) + def generate_nn_input(self, history, color): + state = np.zeros([1, self.size, self.size, 17]) for i in range(8): - state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(self.simulator.size ** 2)).reshape(self.simulator.size, self.simulator.size) - state[0, :, :, i + 8] = np.array(np.array(history[i]) == -np.ones(self.simulator.size ** 2)).reshape(self.simulator.size, self.simulator.size) + state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(self.size ** 2)).reshape(self.size, self.size) + state[0, :, :, i + 8] = np.array(np.array(history[i]) == 
-np.ones(self.size ** 2)).reshape(self.size, self.size) if color == utils.BLACK: - state[0, :, :, 16] = np.ones([self.simulator.size, self.simulator.size]) + state[0, :, :, 16] = np.ones([self.size, self.size]) if color == utils.WHITE: - state[0, :, :, 16] = np.zeros([self.simulator.size, self.simulator.size]) + state[0, :, :, 16] = np.zeros([self.size, self.size]) return state - def strategy_gen_move(self, history, color): - self.simulator.history = copy.copy(history) - self.simulator.board = copy.copy(history[-1]) - state = self.data_process(self.simulator.history, color) - mcts = MCTS(self.simulator, self.evaluator, state, self.simulator.size ** 2 + 1, inverse=True, max_step=10) + def strategy_gen_move(self, latest_boards, color): + self.simulator.latest_boards = copy.copy(latest_boards) + self.simulator.board = copy.copy(latest_boards[-1]) + nn_input = self.generate_nn_input(self.simulator.latest_boards, color) + mcts = MCTS(self.simulator, self.evaluator, nn_input, self.size ** 2 + 1, inverse=True, max_step=1) temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) - choice = np.random.choice(self.simulator.size ** 2 + 1, 1, p=prob).tolist()[0] - if choice == self.simulator.size ** 2: + choice = np.random.choice(self.size ** 2 + 1, 1, p=prob).tolist()[0] + if choice == self.size ** 2: move = utils.PASS else: - move = (choice % self.simulator.size + 1, choice / self.simulator.size + 1) + move = (choice % self.size + 1, choice / self.size + 1) return move, prob def do_move(self, color, vertex): @@ -100,7 +100,7 @@ class Game: def gen_move(self, color): # move = self.strategy.gen_move(color) # return move - move, self.prob = self.strategy_gen_move(self.past, color) + move, self.prob = self.strategy_gen_move(self.latest_boards, color) self.do_move(color, move) return move @@ -127,3 +127,6 @@ class Game: if __name__ == "__main__": g = Game() g.show_board() + #file = open("debug.txt", "a") + #file.write("mcts check\n") + #file.close() diff --git 
a/AlphaGo/go.py b/AlphaGo/go.py index 26540e1..0afc877 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -135,7 +135,7 @@ class Go: self.game.board[self.game._flatten(vertex)] = color self._process_board(color, vertex) self.game.history.append(copy.copy(self.game.board)) - self.game.past.append(copy.copy(self.game.board)) + self.game.latest_boards.append(copy.copy(self.game.board)) return True def _find_empty(self): diff --git a/AlphaGo/play.py b/AlphaGo/play.py index d6e6138..fe6c7ce 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -87,8 +87,8 @@ if __name__ == '__main__': score = player[turn].run_cmd(str(num) + ' get_score') print "Finished : ", score.split(" ")[1] player[0].run_cmd(str(num) + ' clear_board') - player[1].run_cmd(str(num) + ' clear_board') - game_num += 1 + player[1].run_cmd(str(num) + ' clear_board') + game_num += 1 subprocess.call(["kill", "-9", str(agent_v0.pid)]) subprocess.call(["kill", "-9", str(agent_v1.pid)]) diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index 5a55002..0bad998 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -14,15 +14,14 @@ DELTA = [[1, 0], [-1, 0], [0, -1], [0, 1]] CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] class GoEnv: - def __init__(self, size=9, komi=6.5): - self.size = size - self.komi = komi - self.board = [utils.EMPTY] * (self.size * self.size) - self.history = deque(maxlen=8) + def __init__(self, **kwargs): + self.game = kwargs['game'] + self.board = [utils.EMPTY] * (self.game.size * self.game.size) + self.latest_boards = deque(maxlen=8) def _flatten(self, vertex): x, y = vertex - return (x - 1) * self.size + (y - 1) + return (x - 1) * self.game.size + (y - 1) def _bfs(self, vertex, color, block, status, alive_break): block.append(vertex) @@ -35,7 +34,7 @@ class GoEnv: def _find_block(self, vertex, alive_break=False): block = [] - status = [False] * (self.size * self.size) + status = [False] * (self.game.size * self.game.size) color = self.board[self._flatten(vertex)] 
self._bfs(vertex, color, block, status, alive_break) @@ -73,7 +72,7 @@ class GoEnv: _board = copy.copy(self.board) self.board[self._flatten(vertex)] = color self._process_board(color, vertex) - if self.board in self.history: + if self.board in self.latest_boards: res = True else: res = False @@ -83,8 +82,8 @@ class GoEnv: def _in_board(self, vertex): x, y = vertex - if x < 1 or x > self.size: return False - if y < 1 or y > self.size: return False + if x < 1 or x > self.game.size: return False + if y < 1 or y > self.game.size: return False return True def _neighbor(self, vertex): @@ -151,21 +150,28 @@ class GoEnv: # print "many opponents, fake eye" return False - # def is_valid(self, color, vertex): - def is_valid(self, state, action): + def knowledge_prunning(self, color, vertex): + ### check if it is an eye of yourself + ### assumptions : notice that this judgement requires that the state is an endgame + if self._is_eye(color, vertex): + return False + return True + + def simulate_is_valid(self, state, action): # state is the play board, the shape is [1, 9, 9, 17] - if action == self.size * self.size: + if action == self.game.size * self.game.size: vertex = (0, 0) else: - vertex = (action / self.size + 1, action % self.size + 1) + vertex = (action / self.game.size + 1, action % self.game.size + 1) if state[0, 0, 0, -1] == utils.BLACK: color = utils.BLACK else: color = utils.WHITE - self.history.clear() + self.latest_boards.clear() for i in range(8): - self.history.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) - self.board = copy.copy(self.history[-1]) + self.latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) + self.board = copy.copy(self.latest_boards[-1]) + ### in board if not self._in_board(vertex): return False @@ -180,12 +186,11 @@ class GoEnv: if not self._is_qi(color, vertex): return False - ### check if it is an eye of yourself - ### assumptions : notice that this judgement requires that the 
state is an endgame - if self._is_eye(color, vertex): + ### forbid global isomorphous + if self._check_global_isomorphous(color, vertex): return False - if self._check_global_isomorphous(color, vertex): + if not self.knowledge_prunning(color, vertex): return False return True @@ -206,17 +211,17 @@ class GoEnv: color = utils.BLACK else: color = utils.WHITE - if action == self.size ** 2: + if action == self.game.size ** 2: vertex = utils.PASS else: - vertex = (action % self.size + 1, action / self.size + 1) + vertex = (action % self.game.size + 1, action / self.game.size + 1) # print(vertex) # print(self.board) self.board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() self.do_move(color, vertex) new_state = np.concatenate( - [state[:, :, :, 1:8], (np.array(self.board) == utils.BLACK).reshape(1, self.size, self.size, 1), - state[:, :, :, 9:16], (np.array(self.board) == utils.WHITE).reshape(1, self.size, self.size, 1), - np.array(1 - state[:, :, :, -1]).reshape(1, self.size, self.size, 1)], + [state[:, :, :, 1:8], (np.array(self.board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), + state[:, :, :, 9:16], (np.array(self.board) == utils.WHITE).reshape(1, self.game.size, self.game.size, 1), + np.array(1 - state[:, :, :, -1]).reshape(1, self.game.size, self.game.size, 1)], axis=3) return new_state, 0 diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 47b0768..979e994 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -75,7 +75,7 @@ class UCTNode(MCTSNode): start_time = time.time() self.mask = [] for act in range(self.action_num - 1): - if not simulator.is_valid(self.state, act): + if not simulator.simulate_is_valid(self.state, act): self.mask.append(act) self.ucb[act] = -float("Inf") else: From 62e2c6582dcd862e55d7cfb27ffb6c76f18af97f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=AE=8B=E4=B8=96=E8=99=B9?= Date: Sun, 17 Dec 2017 12:52:00 +0800 Subject: [PATCH 10/98] finished very naive dqn: 
changed the interface of replay buffer by adding collect and next_batch, but still need refactoring; added implementation of dqn.py, but still need to consider the interface to make it more extensive; slightly refactored the code style of the codebase; more comments and todos will be in the next commit --- examples/dqn_example.py | 26 +++-- tianshou/core/losses.py | 6 +- tianshou/core/policy/__init__.py | 3 +- tianshou/core/policy/base.py | 13 ++- tianshou/core/policy/dqn.py | 55 ++++++++- tianshou/data/advantage_estimation.py | 37 ++++++- tianshou/data/replay_buffer/buffer.py | 60 ++++++---- tianshou/data/replay_buffer/naive.py | 104 ++++++++++++++---- tianshou/data/replay_buffer/proportional.py | 85 +++++++++++++- tianshou/data/replay_buffer/rank_based.py | 83 ++++++++++++-- .../data/replay_buffer/replay_buffer_test.py | 10 +- tianshou/data/replay_buffer/utils.py | 33 +++--- 12 files changed, 411 insertions(+), 104 deletions(-) diff --git a/examples/dqn_example.py b/examples/dqn_example.py index 4fbe466..7d20731 100644 --- a/examples/dqn_example.py +++ b/examples/dqn_example.py @@ -9,8 +9,7 @@ import gym import sys sys.path.append('..') import tianshou.core.losses as losses -from tianshou.data.replay import Replay -import tianshou.data.advantage_estimation as advantage_estimation +from tianshou.data.replay_buffer.utils import get_replay_buffer import tianshou.core.policy as policy @@ -38,11 +37,10 @@ if __name__ == '__main__': action_dim = env.action_space.n # 1. 
build network with pure tf - observation = tf.placeholder(tf.float32, shape=(None,) + observation_dim) # network input + observation = tf.placeholder(tf.float32, shape=(None,) + observation_dim, name="dqn_observation") # network input with tf.variable_scope('q_net'): q_values = policy_net(observation, action_dim) - train_var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) # TODO: better management of TRAINABLE_VARIABLES with tf.variable_scope('target_net'): q_values_target = policy_net(observation, action_dim) @@ -54,13 +52,15 @@ if __name__ == '__main__': target = tf.placeholder(dtype=tf.float32, shape=[None]) # target value for DQN dqn_loss = losses.dqn_loss(action, target, q_net) # TongzhengRen - + global_step = tf.Variable(0, name='global_step', trainable=False) + train_var_list = tf.get_collection( + tf.GraphKeys.TRAINABLE_VARIABLES) # TODO: better management of TRAINABLE_VARIABLES total_loss = dqn_loss optimizer = tf.train.AdamOptimizer(1e-3) - train_op = optimizer.minimize(total_loss, var_list=train_var_list) - + train_op = optimizer.minimize(total_loss, var_list=train_var_list, global_step=tf.train.get_global_step()) # 3. define data collection - training_data = Replay(env, q_net, advantage_estimation.qlearning_target(target_net)) # + replay_memory = get_replay_buffer('rank_based', env, q_values, q_net, target_net, + {'size': 1000, 'batch_size': 64, 'learn_start': 20}) # ShihongSong: Replay(env, q_net, advantage_estimation.qlearning_target(target_network)), use your ReplayMemory, interact as follows. Simplify your advantage_estimation.dqn to run before YongRen's DQN # maybe a dict to manage the elements to be collected @@ -70,14 +70,16 @@ if __name__ == '__main__': minibatch_count = 0 collection_count = 0 + collect_freq = 100 while True: # until some stopping criterion met... 
# collect data - training_data.collect() # ShihongSong - collection_count += 1 - print('Collected {} times.'.format(collection_count)) + for i in range(0, collect_freq): + replay_memory.collect() # ShihongSong + collection_count += 1 + print('Collected {} times.'.format(collection_count)) # update network - data = training_data.next_batch(64) # YouQiaoben, ShihongSong + data = replay_memory.next_batch(10) # YouQiaoben, ShihongSong # TODO: auto managing of the placeholders? or add this to params of data.Batch sess.run(train_op, feed_dict={observation: data['observations'], action: data['actions'], target: data['target']}) minibatch_count += 1 diff --git a/tianshou/core/losses.py b/tianshou/core/losses.py index d281df9..3461afb 100644 --- a/tianshou/core/losses.py +++ b/tianshou/core/losses.py @@ -32,7 +32,7 @@ def vanilla_policy_gradient(sampled_action, reward, pi, baseline="None"): """ log_pi_act = pi.log_prob(sampled_action) vanilla_policy_gradient_loss = tf.reduce_mean(reward * log_pi_act) - # TODO: Different baseline methods like REINFORCE, etc. + # TODO: Different baseline methods like REINFORCE, etc. 
return vanilla_policy_gradient_loss def dqn_loss(sampled_action, sampled_target, q_net): @@ -44,8 +44,8 @@ def dqn_loss(sampled_action, sampled_target, q_net): :param q_net: current `policy` to be optimized :return: """ - action_num = q_net.get_values().shape()[1] - sampled_q = tf.reduce_sum(q_net.get_values() * tf.one_hot(sampled_action, action_num), axis=1) + action_num = q_net.values_tensor().get_shape()[1] + sampled_q = tf.reduce_sum(q_net.values_tensor() * tf.one_hot(sampled_action, action_num), axis=1) return tf.reduce_mean(tf.square(sampled_target - sampled_q)) def deterministic_policy_gradient(sampled_state, critic): diff --git a/tianshou/core/policy/__init__.py b/tianshou/core/policy/__init__.py index f67b3ba..ccde775 100644 --- a/tianshou/core/policy/__init__.py +++ b/tianshou/core/policy/__init__.py @@ -2,4 +2,5 @@ # -*- coding: utf-8 -*- from .base import * -from .stochastic import * \ No newline at end of file +from .stochastic import * +from .dqn import * \ No newline at end of file diff --git a/tianshou/core/policy/base.py b/tianshou/core/policy/base.py index b6d8d48..eecfc4f 100644 --- a/tianshou/core/policy/base.py +++ b/tianshou/core/policy/base.py @@ -12,23 +12,28 @@ import tensorflow as tf __all__ = [ 'StochasticPolicy', + 'QValuePolicy', ] -#TODO: separate actor and critic, we should focus on it once we finish the basic module. +# TODO: separate actor and critic, we should focus on it once we finish the basic module. + class QValuePolicy(object): """ The policy as in DQN """ def __init__(self, observation_placeholder): - self.observation_placeholder = observation_placeholder + self._observation_placeholder = observation_placeholder def act(self, observation, exploration=None): # first implement no exploration """ return the action (int) to be executed. no exploration when exploration=None. 
""" - pass + self._act(observation, exploration) + + def _act(self, observation, exploration = None): + raise NotImplementedError() def values(self, observation): """ @@ -36,7 +41,7 @@ class QValuePolicy(object): """ pass - def values_tensor(self, observation): + def values_tensor(self): """ returns the tensor of the values for all actions a at observation s """ diff --git a/tianshou/core/policy/dqn.py b/tianshou/core/policy/dqn.py index cfc6abf..81efc9b 100644 --- a/tianshou/core/policy/dqn.py +++ b/tianshou/core/policy/dqn.py @@ -1,7 +1,54 @@ - - -from .base import QValuePolicy +from tianshou.core.policy.base import QValuePolicy +import tensorflow as tf class DQN(QValuePolicy): - pass \ No newline at end of file + """ + The policy as in DQN + """ + + def __init__(self, logits, observation_placeholder, dtype=None, **kwargs): + self._logits = tf.convert_to_tensor(logits) + if dtype is None: + dtype = tf.int32 + self._n_categories = self._logits.get_shape()[-1].value + + super(DQN, self).__init__(observation_placeholder) + + net = tf.layers.conv2d(self._observation_placeholder, 16, 8, 4, 'valid', activation=tf.nn.relu) + net = tf.layers.conv2d(net, 32, 4, 2, 'valid', activation=tf.nn.relu) + net = tf.layers.flatten(net) + net = tf.layers.dense(net, 256, activation=tf.nn.relu, use_bias=True) + self._value = tf.layers.dense(net, self._n_categories) + + def _act(self, observation, exploration=None): # first implement no exploration + """ + return the action (int) to be executed. + no exploration when exploration=None. 
+ """ + sess = tf.get_default_session() + sampled_action = sess.run(tf.multinomial(self.logits, num_samples=1), + feed_dict={self._observation_placeholder: observation[None]}) + return sampled_action + + @property + def logits(self): + return self._logits + + @property + def n_categories(self): + return self._n_categories + + def values(self, observation): + """ + returns the Q(s, a) values (float) for all actions a at observation s + """ + sess = tf.get_default_session() + value = sess.run(self._value, feed_dict={self._observation_placeholder: observation[None]}) + return value + + def values_tensor(self): + """ + returns the tensor of the values for all actions a at observation s + """ + return self._value diff --git a/tianshou/data/advantage_estimation.py b/tianshou/data/advantage_estimation.py index 6f5b8a6..3c2d644 100644 --- a/tianshou/data/advantage_estimation.py +++ b/tianshou/data/advantage_estimation.py @@ -19,7 +19,8 @@ def full_return(raw_data): returns = rewards.copy() episode_start_idx = 0 for i in range(1, num_timesteps): - if episode_start_flags[i] or (i == num_timesteps - 1): # found the start of next episode or the end of all episodes + if episode_start_flags[i] or ( + i == num_timesteps - 1): # found the start of next episode or the end of all episodes if i < rewards.shape[0] - 1: t = i - 1 else: @@ -34,4 +35,36 @@ def full_return(raw_data): data['returns'] = returns - return data \ No newline at end of file + return data + + +class QLearningTarget: + def __init__(self, policy, gamma): + self._policy = policy + self._gamma = gamma + + def __call__(self, raw_data): + data = dict() + observations = list() + actions = list() + rewards = list() + wi = list() + all_data, data_wi, data_index = raw_data + + for i in range(0, all_data.shape[0]): + current_data = all_data[i] + current_wi = data_wi[i] + current_index = data_index[i] + observations.append(current_data['observation']) + actions.append(current_data['action']) + next_max_qvalue = 
np.max(self._policy.values(current_data['observation'])) + current_qvalue = self._policy.values(current_data['previous_observation'])[current_data['previous_action']] + reward = current_data['reward'] + next_max_qvalue - current_qvalue + rewards.append(reward) + wi.append(current_wi) + + data['observations'] = np.array(observations) + data['actions'] = np.array(actions) + data['rewards'] = np.array(rewards) + + return data diff --git a/tianshou/data/replay_buffer/buffer.py b/tianshou/data/replay_buffer/buffer.py index 4b92cfc..6a44170 100644 --- a/tianshou/data/replay_buffer/buffer.py +++ b/tianshou/data/replay_buffer/buffer.py @@ -1,39 +1,51 @@ class ReplayBuffer(object): - def __init__(self, conf): - ''' + def __init__(self, env, policy, qnet, target_qnet, conf): + """ Initialize a replay buffer with parameters in conf. - ''' - pass + """ + pass - def add(self, data, priority): - ''' + def add(self, data, priority): + """ Add a data with priority = priority to replay buffer. - ''' - pass + """ + pass - def update_priority(self, indices, priorities): - ''' + def collect(self): + """ + Collect data from current environment and policy. + """ + pass + + def next_batch(self, batch_size): + """ + get batch of data from the replay buffer. + """ + pass + + def update_priority(self, indices, priorities): + """ Update the data's priority whose indices = indices. For proportional replay buffer, the priority is the priority. For rank based replay buffer, the priorities parameter will be the delta used to update the priority. - ''' - pass + """ + pass - def reset_alpha(self, alpha): - ''' + def reset_alpha(self, alpha): + """ This function only works for proportional replay buffer. This function resets alpha. - ''' - pass + """ + pass - def sample(self, conf): - ''' + def sample(self, conf): + """ Sample from replay buffer with parameters in conf. 
- ''' - pass + """ + pass - def rebalance(self): - ''' + def rebalance(self): + """ This is for rank based priority replay buffer, which is used to rebalance the sum tree of the priority queue. - ''' - pass \ No newline at end of file + """ + pass diff --git a/tianshou/data/replay_buffer/naive.py b/tianshou/data/replay_buffer/naive.py index 9436a39..50ba1c3 100644 --- a/tianshou/data/replay_buffer/naive.py +++ b/tianshou/data/replay_buffer/naive.py @@ -1,29 +1,93 @@ -from buffer import ReplayBuffer import numpy as np +import tensorflow as tf from collections import deque +from math import fabs + +from tianshou.data.replay_buffer.buffer import ReplayBuffer + class NaiveExperience(ReplayBuffer): - def __init__(self, conf): - self.max_size = conf['size'] - self.n_entries = 0 - self.memory = deque(maxlen = self.max_size) + def __init__(self, env, policy, qnet, target_qnet, conf): + self.max_size = conf['size'] + self._env = env + self._policy = policy + self._qnet = qnet + self._target_qnet = target_qnet + self._begin_act() + self.n_entries = 0 + self.memory = deque(maxlen=self.max_size) - def add(self, data, priority = 0): - self.memory.append(data) - if self.n_entries < self.max_size: - self.n_entries += 1 + def add(self, data, priority=0): + self.memory.append(data) + if self.n_entries < self.max_size: + self.n_entries += 1 - def update_priority(self, indices, priorities = 0): - pass + def _begin_act(self): + self.observation = self._env.reset() + self.action = self._env.action_space.sample() + done = False + while not done: + if done: + self.observation = self._env.reset() + self.action = self._env.action_space.sample() + self.observation, _, done, _ = self._env.step(self.action) - def reset_alpha(self, alpha): - pass + def collect(self): + sess = tf.get_default_session() + current_data = dict() + current_data['previous_action'] = self.action + current_data['previous_observation'] = self.observation + self.action = np.argmax(sess.run(self._policy, 
feed_dict={"dqn_observation:0": self.observation.reshape((1,) + self.observation.shape)})) + self.observation, reward, done, _ = self._env.step(self.action) + current_data['action'] = self.action + current_data['observation'] = self.observation + current_data['reward'] = reward + self.add(current_data) + if done: + self._begin_act() - def sample(self, conf): - batch_size = conf['batch_size'] - batch_size = min(len(self.memory), batch_size) - idxs = np.random.choice(len(self.memory), batch_size) - return [self.memory[idx] for idx in idxs], [1] * len(idxs), idxs + def update_priority(self, indices, priorities=0): + pass - def rebalance(self): - pass + def reset_alpha(self, alpha): + pass + + def sample(self, conf): + batch_size = conf['batch_size'] + batch_size = min(len(self.memory), batch_size) + idxs = np.random.choice(len(self.memory), batch_size) + return [self.memory[idx] for idx in idxs], [1] * len(idxs), idxs + + def next_batch(self, batch_size): + data = dict() + observations = list() + actions = list() + rewards = list() + wi = list() + target = list() + + for i in range(0, batch_size): + current_datas, current_wis, current_indexs = self.sample({'batch_size': 1}) + current_data = current_datas[0] + current_wi = current_wis[0] + current_index = current_indexs[0] + observations.append(current_data['observation']) + actions.append(current_data['action']) + next_max_qvalue = np.max(self._target_qnet.values(current_data['observation'])) + current_qvalue = self._qnet.values(current_data['previous_observation'])[0, current_data['previous_action']] + reward = current_data['reward'] + next_max_qvalue - current_qvalue + rewards.append(reward) + target.append(current_data['reward'] + next_max_qvalue) + self.update_priority(current_index, [fabs(reward)]) + wi.append(current_wi) + + data['observations'] = np.array(observations) + data['actions'] = np.array(actions) + data['rewards'] = np.array(rewards) + data['wi'] = np.array(wi) + data['target'] = np.array(target) + + 
return data + + def rebalance(self): + pass diff --git a/tianshou/data/replay_buffer/proportional.py b/tianshou/data/replay_buffer/proportional.py index 72d1457..63aab66 100644 --- a/tianshou/data/replay_buffer/proportional.py +++ b/tianshou/data/replay_buffer/proportional.py @@ -1,7 +1,10 @@ -import numpy +import numpy as np import random -import sum_tree -from buffer import ReplayBuffer +import tensorflow as tf +import math + +from tianshou.data.replay_buffer import sum_tree +from tianshou.data.replay_buffer.buffer import ReplayBuffer class PropotionalExperience(ReplayBuffer): @@ -15,7 +18,7 @@ class PropotionalExperience(ReplayBuffer): """ - def __init__(self, conf): + def __init__(self, env, policy, qnet, target_qnet, conf): """ Prioritized experience replay buffer initialization. Parameters @@ -30,11 +33,26 @@ class PropotionalExperience(ReplayBuffer): """ memory_size = conf['size'] batch_size = conf['batch_size'] - alpha = conf['alpha'] + alpha = conf['alpha'] if 'alpha' in conf else 0.6 self.tree = sum_tree.SumTree(memory_size) self.memory_size = memory_size self.batch_size = batch_size self.alpha = alpha + self._env = env + self._policy = policy + self._qnet = qnet + self._target_qnet = target_qnet + self._begin_act() + + def _begin_act(self): + self.observation = self._env.reset() + self.action = self._env.action_space.sample() + done = False + while not done: + if done: + self.observation = self._env.reset() + self.action = self._env.action_space.sample() + self.observation, _, done, _ = self._env.step(self.action) def add(self, data, priority): """ Add new sample. @@ -48,6 +66,12 @@ class PropotionalExperience(ReplayBuffer): """ self.tree.add(data, priority**self.alpha) + def collect(self): + pass + + def next_batch(self, batch_size): + pass + def sample(self, conf): """ The method return samples randomly. @@ -64,8 +88,9 @@ class PropotionalExperience(ReplayBuffer): indices: list of sample indices The indices indicate sample positions in a sum tree. 
+ :param conf: giving beta """ - beta = conf['beta'] + beta = conf['beta'] if 'beta' in conf else 0.4 if self.tree.filled_size() < self.batch_size: return None, None, None @@ -91,6 +116,54 @@ class PropotionalExperience(ReplayBuffer): return out, weights, indices + def collect(self): + sess = tf.get_default_session() + current_data = dict() + current_data['previous_action'] = self.action + current_data['previous_observation'] = self.observation + # TODO: change the name of the feed_dict + self.action = np.argmax(sess.run(self._policy, feed_dict={"dqn_observation:0": self.observation.reshape((1,) + self.observation.shape)})) + self.observation, reward, done, _ = self._env.step(self.action) + current_data['action'] = self.action + current_data['observation'] = self.observation + current_data['reward'] = reward + priorities = np.array([self.tree.get_val(i) ** -self.alpha for i in range(self.tree.filled_size())]) + priority = np.max(priorities) if len(priorities) > 0 else 1 + self.add(current_data, priority) + if done: + self._begin_act() + + def next_batch(self, batch_size): + data = dict() + observations = list() + actions = list() + rewards = list() + wi = list() + target = list() + + for i in range(0, batch_size): + current_datas, current_wis, current_indexs = self.sample({'batch_size': 1}) + current_data = current_datas[0] + current_wi = current_wis[0] + current_index = current_indexs[0] + observations.append(current_data['observation']) + actions.append(current_data['action']) + next_max_qvalue = np.max(self._target_qnet.values(current_data['observation'])) + current_qvalue = self._qnet.values(current_data['previous_observation'])[0, current_data['previous_action']] + reward = current_data['reward'] + next_max_qvalue - current_qvalue + rewards.append(reward) + target.append(current_data['reward'] + next_max_qvalue) + self.update_priority([current_index], [math.fabs(reward)]) + wi.append(current_wi) + + data['observations'] = np.array(observations) + 
data['actions'] = np.array(actions) + data['rewards'] = np.array(rewards) + data['wi'] = np.array(wi) + data['target'] = np.array(target) + + return data + def update_priority(self, indices, priorities): """ The methods update samples's priority. diff --git a/tianshou/data/replay_buffer/rank_based.py b/tianshou/data/replay_buffer/rank_based.py index eb770af..da56763 100644 --- a/tianshou/data/replay_buffer/rank_based.py +++ b/tianshou/data/replay_buffer/rank_based.py @@ -8,13 +8,15 @@ import sys import math import random import numpy as np +import tensorflow as tf + +from tianshou.data.replay_buffer.binary_heap import BinaryHeap +from tianshou.data.replay_buffer.buffer import ReplayBuffer -from binary_heap import BinaryHeap -from buffer import ReplayBuffer class RankBasedExperience(ReplayBuffer): - def __init__(self, conf): + def __init__(self, env, policy, qnet, target_qnet, conf): self.size = conf['size'] self.replace_flag = conf['replace_old'] if 'replace_old' in conf else True self.priority_size = conf['priority_size'] if 'priority_size' in conf else self.size @@ -25,12 +27,18 @@ class RankBasedExperience(ReplayBuffer): self.learn_start = conf['learn_start'] if 'learn_start' in conf else 1000 self.total_steps = conf['steps'] if 'steps' in conf else 100000 # partition number N, split total size to N part - self.partition_num = conf['partition_num'] if 'partition_num' in conf else 100 + self.partition_num = conf['partition_num'] if 'partition_num' in conf else 10 self.index = 0 self.record_size = 0 self.isFull = False + self._env = env + self._policy = policy + self._qnet = qnet + self._target_qnet = target_qnet + self._begin_act() + self._experience = {} self.priority_queue = BinaryHeap(self.priority_size) self.distributions = self.build_distributions() @@ -98,7 +106,64 @@ class RankBasedExperience(ReplayBuffer): self.index += 1 return self.index - def add(self, data, priority = 0): + def _begin_act(self): + self.observation = self._env.reset() + self.action = 
self._env.action_space.sample() + done = False + while not done: + if done: + self.observation = self._env.reset() + self.action = self._env.action_space.sample() + self.observation, _, done, _ = self._env.step(self.action) + + def collect(self): + sess = tf.get_default_session() + current_data = dict() + current_data['previous_action'] = self.action + current_data['previous_observation'] = self.observation + self.action = np.argmax(sess.run(self._policy, feed_dict={"dqn_observation:0": self.observation.reshape((1,) + self.observation.shape)})) + self.observation, reward, done, _ = self._env.step(self.action) + current_data['action'] = self.action + current_data['observation'] = self.observation + current_data['reward'] = reward + self.add(current_data) + if done: + self._begin_act() + + def next_batch(self, batch_size): + data = dict() + observations = list() + actions = list() + rewards = list() + wi = list() + target = list() + + sess = tf.get_default_session() + current_datas, current_wis, current_indexs = self.sample({'global_step': sess.run(tf.train.get_global_step())}) + + for i in range(0, batch_size): + current_data = current_datas[i] + current_wi = current_wis[i] + current_index = current_indexs[i] + observations.append(current_data['observation']) + actions.append(current_data['action']) + next_max_qvalue = np.max(self._target_qnet.values(current_data['observation'])) + current_qvalue = self._qnet.values(current_data['previous_observation'])[0, current_data['previous_action']] + reward = current_data['reward'] + next_max_qvalue - current_qvalue + rewards.append(reward) + target.append(current_data['reward'] + next_max_qvalue) + self.update_priority([current_index], [math.fabs(reward)]) + wi.append(current_wi) + + data['observations'] = np.array(observations) + data['actions'] = np.array(actions) + data['rewards'] = np.array(rewards) + data['wi'] = np.array(wi) + data['target'] = np.array(target) + + return data + + def add(self, data, priority = 1): """ 
store experience, suggest that experience is a tuple of (s1, a, r, s2, t) so each experience is valid @@ -156,16 +221,16 @@ class RankBasedExperience(ReplayBuffer): sys.stderr.write('Record size less than learn start! Sample failed\n') return False, False, False - dist_index = math.floor(self.record_size / self.size * self.partition_num) + dist_index = math.floor(self.record_size * 1. / self.size * self.partition_num) # issue 1 by @camigord - partition_size = math.floor(self.size / self.partition_num) + partition_size = math.floor(self.size * 1. / self.partition_num) partition_max = dist_index * partition_size distribution = self.distributions[dist_index] rank_list = [] # sample from k segments for n in range(1, self.batch_size + 1): - index = random.randint(distribution['strata_ends'][n] + 1, - distribution['strata_ends'][n + 1]) + index = random.randint(distribution['strata_ends'][n], + distribution['strata_ends'][n + 1]) rank_list.append(index) # beta, increase by global_step, max 1 diff --git a/tianshou/data/replay_buffer/replay_buffer_test.py b/tianshou/data/replay_buffer/replay_buffer_test.py index 9be659b..46b25c8 100644 --- a/tianshou/data/replay_buffer/replay_buffer_test.py +++ b/tianshou/data/replay_buffer/replay_buffer_test.py @@ -1,13 +1,15 @@ -from utils import * from functions import * +from tianshou.data.replay_buffer.utils import get_replay_buffer + + def test_rank_based(): conf = {'size': 50, 'learn_start': 10, 'partition_num': 5, 'total_step': 100, 'batch_size': 4} - experience = getReplayBuffer('rank_based', conf) + experience = get_replay_buffer('rank_based', conf) # insert to experience print 'test insert experience' @@ -52,7 +54,7 @@ def test_proportional(): conf = {'size': 50, 'alpha': 0.7, 'batch_size': 4} - experience = getReplayBuffer('proportional', conf) + experience = get_replay_buffer('proportional', conf) # insert to experience print 'test insert experience' @@ -90,7 +92,7 @@ def test_proportional(): def test_naive(): conf = {'size': 
50} - experience = getReplayBuffer('naive', conf) + experience = get_replay_buffer('naive', conf) # insert to experience print 'test insert experience' diff --git a/tianshou/data/replay_buffer/utils.py b/tianshou/data/replay_buffer/utils.py index 3bb9bfe..4480375 100644 --- a/tianshou/data/replay_buffer/utils.py +++ b/tianshou/data/replay_buffer/utils.py @@ -1,17 +1,20 @@ -from rank_based import * -from proportional import * -from naive import * import sys -def getReplayBuffer(name, conf): - ''' - Get replay buffer according to the given name. - ''' - if (name == 'rank_based'): - return RankBasedExperience(conf) - elif (name == 'proportional'): - return PropotionalExperience(conf) - elif (name == 'naive'): - return NaiveExperience(conf) - else: - sys.stderr.write('no such replay buffer') +from tianshou.data.replay_buffer.naive import NaiveExperience +from tianshou.data.replay_buffer.proportional import PropotionalExperience +from tianshou.data.replay_buffer.rank_based import RankBasedExperience + + +def get_replay_buffer(name, env, policy, qnet, target_qnet, conf): + """ + Get replay buffer according to the given name. 
+ """ + + if name == 'rank_based': + return RankBasedExperience(env, policy, qnet, target_qnet, conf) + elif name == 'proportional': + return PropotionalExperience(env, policy, qnet, target_qnet, conf) + elif name == 'naive': + return NaiveExperience(env, policy, qnet, target_qnet, conf) + else: + sys.stderr.write('no such replay buffer') From 7693c38f44e4f7d8f024e78a225a9efdbac40a83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=AE=8B=E4=B8=96=E8=99=B9?= Date: Sun, 17 Dec 2017 13:28:21 +0800 Subject: [PATCH 11/98] add comments and todos --- examples/dqn_example.py | 5 +++++ tianshou/core/policy/dqn.py | 9 +++++++++ tianshou/data/replay_buffer/naive.py | 15 +++++++++++++++ tianshou/data/replay_buffer/proportional.py | 21 +++++++++++++++------ tianshou/data/replay_buffer/rank_based.py | 15 +++++++++++++++ 5 files changed, 59 insertions(+), 6 deletions(-) diff --git a/examples/dqn_example.py b/examples/dqn_example.py index 7d20731..b676475 100644 --- a/examples/dqn_example.py +++ b/examples/dqn_example.py @@ -37,6 +37,9 @@ if __name__ == '__main__': action_dim = env.action_space.n # 1. build network with pure tf + # TODO: + # pass the observation variable to the replay buffer or find a more reasonable way to help replay buffer + # access this observation variable. observation = tf.placeholder(tf.float32, shape=(None,) + observation_dim, name="dqn_observation") # network input with tf.variable_scope('q_net'): @@ -59,6 +62,7 @@ if __name__ == '__main__': optimizer = tf.train.AdamOptimizer(1e-3) train_op = optimizer.minimize(total_loss, var_list=train_var_list, global_step=tf.train.get_global_step()) # 3. define data collection + # configuration should be given as parameters, different replay buffer has different parameters. 
replay_memory = get_replay_buffer('rank_based', env, q_values, q_net, target_net, {'size': 1000, 'batch_size': 64, 'learn_start': 20}) # ShihongSong: Replay(env, q_net, advantage_estimation.qlearning_target(target_network)), use your ReplayMemory, interact as follows. Simplify your advantage_estimation.dqn to run before YongRen's DQN @@ -70,6 +74,7 @@ if __name__ == '__main__': minibatch_count = 0 collection_count = 0 + # need to first collect some then sample, collect_freq must be larger than batch_size collect_freq = 100 while True: # until some stopping criterion met... # collect data diff --git a/tianshou/core/policy/dqn.py b/tianshou/core/policy/dqn.py index 81efc9b..39f6a16 100644 --- a/tianshou/core/policy/dqn.py +++ b/tianshou/core/policy/dqn.py @@ -8,6 +8,7 @@ class DQN(QValuePolicy): """ def __init__(self, logits, observation_placeholder, dtype=None, **kwargs): + # TODO: this version only support non-continuous action space, extend it to support continuous action space self._logits = tf.convert_to_tensor(logits) if dtype is None: dtype = tf.int32 @@ -15,6 +16,7 @@ class DQN(QValuePolicy): super(DQN, self).__init__(observation_placeholder) + # TODO: put the net definition outside of the class net = tf.layers.conv2d(self._observation_placeholder, 16, 8, 4, 'valid', activation=tf.nn.relu) net = tf.layers.conv2d(net, 32, 4, 2, 'valid', activation=tf.nn.relu) net = tf.layers.flatten(net) @@ -26,6 +28,7 @@ class DQN(QValuePolicy): return the action (int) to be executed. no exploration when exploration=None. 
""" + # TODO: ensure thread safety sess = tf.get_default_session() sampled_action = sess.run(tf.multinomial(self.logits, num_samples=1), feed_dict={self._observation_placeholder: observation[None]}) @@ -33,10 +36,16 @@ class DQN(QValuePolicy): @property def logits(self): + """ + :return: action values + """ return self._logits @property def n_categories(self): + """ + :return: dimension of action space if not continuous + """ return self._n_categories def values(self, observation): diff --git a/tianshou/data/replay_buffer/naive.py b/tianshou/data/replay_buffer/naive.py index 50ba1c3..5eb4dd7 100644 --- a/tianshou/data/replay_buffer/naive.py +++ b/tianshou/data/replay_buffer/naive.py @@ -23,6 +23,10 @@ class NaiveExperience(ReplayBuffer): self.n_entries += 1 def _begin_act(self): + """ + if the previous interaction is ended or the interaction hasn't started + then begin act from the state of env.reset() + """ self.observation = self._env.reset() self.action = self._env.action_space.sample() done = False @@ -33,6 +37,10 @@ class NaiveExperience(ReplayBuffer): self.observation, _, done, _ = self._env.step(self.action) def collect(self): + """ + collect data for replay memory and update the priority according to the given data. + store the previous action, previous observation, reward, action, observation in the replay memory. + """ sess = tf.get_default_session() current_data = dict() current_data['previous_action'] = self.action @@ -59,6 +67,13 @@ class NaiveExperience(ReplayBuffer): return [self.memory[idx] for idx in idxs], [1] * len(idxs), idxs def next_batch(self, batch_size): + """ + collect a batch of data from replay buffer, update the priority and calculate the necessary statistics for + updating q value network. + :param batch_size: int batch size. + :return: a batch of data, with target storing the target q value and wi, rewards storing the coefficient + for gradient of q value network. 
+ """ data = dict() observations = list() actions = list() diff --git a/tianshou/data/replay_buffer/proportional.py b/tianshou/data/replay_buffer/proportional.py index 63aab66..52a231d 100644 --- a/tianshou/data/replay_buffer/proportional.py +++ b/tianshou/data/replay_buffer/proportional.py @@ -45,6 +45,10 @@ class PropotionalExperience(ReplayBuffer): self._begin_act() def _begin_act(self): + """ + if the previous interaction is ended or the interaction hasn't started + then begin act from the state of env.reset() + """ self.observation = self._env.reset() self.action = self._env.action_space.sample() done = False @@ -66,12 +70,6 @@ class PropotionalExperience(ReplayBuffer): """ self.tree.add(data, priority**self.alpha) - def collect(self): - pass - - def next_batch(self, batch_size): - pass - def sample(self, conf): """ The method return samples randomly. @@ -117,6 +115,10 @@ class PropotionalExperience(ReplayBuffer): return out, weights, indices def collect(self): + """ + collect data for replay memory and update the priority according to the given data. + store the previous action, previous observation, reward, action, observation in the replay memory. + """ sess = tf.get_default_session() current_data = dict() current_data['previous_action'] = self.action @@ -134,6 +136,13 @@ class PropotionalExperience(ReplayBuffer): self._begin_act() def next_batch(self, batch_size): + """ + collect a batch of data from replay buffer, update the priority and calculate the necessary statistics for + updating q value network. + :param batch_size: int batch size. + :return: a batch of data, with target storing the target q value and wi, rewards storing the coefficient + for gradient of q value network. 
+ """ data = dict() observations = list() actions = list() diff --git a/tianshou/data/replay_buffer/rank_based.py b/tianshou/data/replay_buffer/rank_based.py index da56763..b71ca68 100644 --- a/tianshou/data/replay_buffer/rank_based.py +++ b/tianshou/data/replay_buffer/rank_based.py @@ -107,6 +107,10 @@ class RankBasedExperience(ReplayBuffer): return self.index def _begin_act(self): + """ + if the previous interaction is ended or the interaction hasn't started + then begin act from the state of env.reset() + """ self.observation = self._env.reset() self.action = self._env.action_space.sample() done = False @@ -117,6 +121,10 @@ class RankBasedExperience(ReplayBuffer): self.observation, _, done, _ = self._env.step(self.action) def collect(self): + """ + collect data for replay memory and update the priority according to the given data. + store the previous action, previous observation, reward, action, observation in the replay memory. + """ sess = tf.get_default_session() current_data = dict() current_data['previous_action'] = self.action @@ -131,6 +139,13 @@ class RankBasedExperience(ReplayBuffer): self._begin_act() def next_batch(self, batch_size): + """ + collect a batch of data from replay buffer, update the priority and calculate the necessary statistics for + updating q value network. + :param batch_size: int batch size. + :return: a batch of data, with target storing the target q value and wi, rewards storing the coefficient + for gradient of q value network. 
+ """ data = dict() observations = list() actions = list() From 75bc2968d27f0e77bd24863d5a887d787bdf4c47 Mon Sep 17 00:00:00 2001 From: Tongzheng Ren Date: Mon, 18 Dec 2017 23:32:41 +0800 Subject: [PATCH 12/98] add a detailed Chinese google coding style for convenience --- .DS_Store | Bin 0 -> 8196 bytes AlphaGo/.DS_Store | Bin 0 -> 6148 bytes README.md | 4 +++- 3 files changed, 3 insertions(+), 1 deletion(-) create mode 100644 .DS_Store create mode 100644 AlphaGo/.DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..99fab83bc04e2a081117249f4d6eb1500ca26cd4 GIT binary patch literal 8196 zcmeHMOHUL*5U%DSvm!FQiF%m0F(H8vh&&F)I4ls{_<*nkA%M&5?yyW8y4RUmo?`am zZ!mf_G4bTVU*OS;SC1b21183^RzHAcRy~S|-J9;OrmL!}`Lw06+ViMb7R4J(a)jrG1vq z`7DPPd{f=P+V?{glmMDw0({K#H^G4!U@_}Af40u;ge5ihhg`|`!f0sd6RE3jXgt?s zBoaoV#aPR1vYkxiMCCB$1S{goE4Vo&FEHjG%T8|5b9b_4=Om+%<54%LGGBe*(E{5^ zu`(@$Vw^cA+C*OwC~Ni}9E^;P4p<{&!-oUb!T9(P%GbvZ4-KQ|+VIWv`qtk5!IQ(M zFT}Ees3#D#%Io9yB}y?;y!^IWCV6+X%+IEIywLN_7g{d1nr-bJZJljhU0vNhUA^s> zE@#b-oRfc44EU5Mna_fhUv{%*YlxJj^Nh{~?lvxXDROtCjJJ=f_L#%GTX=ZVwXj!0 zo6eICG3ZR-x^$%&c!k-GjnIvjs^n!JEgvx7pJSmHd5kWvis+HMm8KgEBk~!{mbfZh zbb}DF#%F2S2?B=}-PGQi@8n#6mZn$~v8^Q!SHWj4LS{-%-lN6jt*?^IMAGh)k_pKp zehrf>P)1!;&dI){eMcz`=*Mqk23FxQRNy(hg17J=_Xf6KN%pSWSrP! zf=rUzBu(a}ciqtqy>b*vB3VI<-O%k!Kz&AwsNr4>Pyhyb2v2|eNe}9(_lf~7?BYJz z!u{f-9JhZKvu}dCkcLT^hWogx>DcJsV%7X+HR|=;sVDUIB=m7XD->Qd)NaU-~&p!-K`?h26a==-g4nLIP3w5 z@8bjD&DfO0b@zl2LY~TgsWWdpesSy=fSK-=FMu(C9*bb-lvTjwzLWzyaV?LCMq}it zFz0WA4+q|{Au13R_}3JWcXy!gY>o|@@9*tJQC=>JazPJS9$vCM`=Q1wVO^rc5?6SR zkF2+3U&Cj8^H^OnGOiMtt`VNM`R;l58FzyfBX4*mtBS{f#Q)mJs!Q(->#(-WGgodN;>uHL zamT9E@BZevVdfb#FM@pP*W!0MvV#7%A#d~9^X~7x<@pDmkLcKnKcZj89G4u^no8a? 
z8)R%JNkf-D-ryW(nBf(1LjKa5qa00(G34m4IG$_H(fh{5C(z;z$g4Ediu_er4bAUM zBflZ%#IJWqcoHL|iS*Xw^~jTXRge+#r|K<#PU1ge+_S}c1B0(%8yeMndY zQ-`%hT{>9l5rEiXvl_4EV?j8H!_;AIk#}guQi+yo+!4cAI@=THmpZI1S~`q7d>GfW zaVHd`y)%BI?l7svSVsk-0&N8z*uP`h|Id$~|J$VaFDeif_^%W&o!Q&jltXfR>%rh; vuT5Cqv51LZZBZ$#+;*%NvK2pJQRBTtE{Lha+9F$M@k2mmh*ebJuPX2p(V7f* literal 0 HcmV?d00001 diff --git a/README.md b/README.md index 543d237..9c3af16 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,8 @@ Tianshou(å¤©ęŽˆ) is a reinforcement learning platform. The following image illus Please follow [google python coding style](https://google.github.io/styleguide/pyguide.html) +There's a more detailed Chinese version [google python coding style in Chinese](http://www.runoob.com/w3cnote/google-python-styleguide.html) + All files/folders should be named with lower case letters and underline (except specified names such as `AlphaGo`). Try to use full names. Don't use abbrevations for class/function/variable names except common abbrevations (such as `num` for number, `dim` for dimension, `env` for environment, `op` for operation). For now we use `pi` to refer to the policy in examples/ppo_example.py. @@ -73,4 +75,4 @@ HaoshengZou: collaborate mainly on Policy and losses; interfaces and architectur Note: install openai/gym first to run the Atari environment; note that interfaces between modules may not be finalized; the management of placeholders and `feed_dict` may have to be done manually for the time being; -Without preprocessing and other tricks, this example will not train to any meaningful results. Codes should past two tests: individual module test and run through this example code. \ No newline at end of file +Without preprocessing and other tricks, this example will not train to any meaningful results. Codes should past two tests: individual module test and run through this example code. 
From 6b6c48f122aad3fc415cfbaecbeae449fc8f632d Mon Sep 17 00:00:00 2001 From: Tongzheng Ren Date: Mon, 18 Dec 2017 23:34:32 +0800 Subject: [PATCH 13/98] update gitignore --- .DS_Store | Bin 8196 -> 0 bytes .gitignore | 1 + AlphaGo/.DS_Store | Bin 6148 -> 0 bytes 3 files changed, 1 insertion(+) delete mode 100644 .DS_Store delete mode 100644 AlphaGo/.DS_Store diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 99fab83bc04e2a081117249f4d6eb1500ca26cd4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8196 zcmeHMOHUL*5U%DSvm!FQiF%m0F(H8vh&&F)I4ls{_<*nkA%M&5?yyW8y4RUmo?`am zZ!mf_G4bTVU*OS;SC1b21183^RzHAcRy~S|-J9;OrmL!}`Lw06+ViMb7R4J(a)jrG1vq z`7DPPd{f=P+V?{glmMDw0({K#H^G4!U@_}Af40u;ge5ihhg`|`!f0sd6RE3jXgt?s zBoaoV#aPR1vYkxiMCCB$1S{goE4Vo&FEHjG%T8|5b9b_4=Om+%<54%LGGBe*(E{5^ zu`(@$Vw^cA+C*OwC~Ni}9E^;P4p<{&!-oUb!T9(P%GbvZ4-KQ|+VIWv`qtk5!IQ(M zFT}Ees3#D#%Io9yB}y?;y!^IWCV6+X%+IEIywLN_7g{d1nr-bJZJljhU0vNhUA^s> zE@#b-oRfc44EU5Mna_fhUv{%*YlxJj^Nh{~?lvxXDROtCjJJ=f_L#%GTX=ZVwXj!0 zo6eICG3ZR-x^$%&c!k-GjnIvjs^n!JEgvx7pJSmHd5kWvis+HMm8KgEBk~!{mbfZh zbb}DF#%F2S2?B=}-PGQi@8n#6mZn$~v8^Q!SHWj4LS{-%-lN6jt*?^IMAGh)k_pKp zehrf>P)1!;&dI){eMcz`=*Mqk23FxQRNy(hg17J=_Xf6KN%pSWSrP! zf=rUzBu(a}ciqtqy>b*vB3VI<-O%k!Kz&AwsNr4>Pyhyb2v2|eNe}9(_lf~7?BYJz z!u{f-9JhZKvu}dCkcLT^hWogx>DcJsV%7X+HR|=;sVDUIB=m7XD->Qd)NaU-~&p!-K`?h26a==-g4nLIP3w5 z@8bjD&DfO0b@zl2LY~TgsWWdpesSy=fSK-=FMu(C9*bb-lvTjwzLWzyaV?LCMq}it zFz0WA4+q|{Au13R_}3JWcXy!gY>o|@@9*tJQC=>JazPJS9$vCM`=Q1wVO^rc5?6SR zkF2+3U&Cj8^H^OnGOiMtt`VNM`R;l58FzyfBX4*mtBS{f#Q)mJs!Q(->#(-WGgodN;>uHL zamT9E@BZevVdfb#FM@pP*W!0MvV#7%A#d~9^X~7x<@pDmkLcKnKcZj89G4u^no8a? 
z8)R%JNkf-D-ryW(nBf(1LjKa5qa00(G34m4IG$_H(fh{5C(z;z$g4Ediu_er4bAUM zBflZ%#IJWqcoHL|iS*Xw^~jTXRge+#r|K<#PU1ge+_S}c1B0(%8yeMndY zQ-`%hT{>9l5rEiXvl_4EV?j8H!_;AIk#}guQi+yo+!4cAI@=THmpZI1S~`q7d>GfW zaVHd`y)%BI?l7svSVsk-0&N8z*uP`h|Id$~|J$VaFDeif_^%W&o!Q&jltXfR>%rh; vuT5Cqv51LZZBZ$#+;*%NvK2pJQRBTtE{Lha+9F$M@k2mmh*ebJuPX2p(V7f* From ea52096713fc42307b3bd5974f7f935edd1c58f5 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Tue, 19 Dec 2017 00:16:21 +0800 Subject: [PATCH 14/98] delete unused parameter of _find_block, and using _find_group to replace _find_block --- AlphaGo/go.py | 13 +++++---- AlphaGo/strategy.py | 66 ++++++++++++++++++++------------------------- 2 files changed, 35 insertions(+), 44 deletions(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 0afc877..752973e 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -13,25 +13,24 @@ Settings of the Go game. NEIGHBOR_OFFSET = [[1, 0], [-1, 0], [0, -1], [0, 1]] - class Go: def __init__(self, **kwargs): self.game = kwargs['game'] - def _bfs(self, vertex, color, block, status, alive_break): + def _bfs(self, vertex, color, block, status): block.append(vertex) status[self.game._flatten(vertex)] = True nei = self._neighbor(vertex) for n in nei: if not status[self.game._flatten(n)]: if self.game.board[self.game._flatten(n)] == color: - self._bfs(n, color, block, status, alive_break) + self._bfs(n, color, block, status) - def _find_block(self, vertex, alive_break=False): + def _find_block(self, vertex): block = [] status = [False] * (self.game.size * self.game.size) color = self.game.board[self.game._flatten(vertex)] - self._bfs(vertex, color, block, status, alive_break) + self._bfs(vertex, color, block, status) for b in block: for n in self._neighbor(b): @@ -42,7 +41,7 @@ class Go: def _find_boarder(self, vertex): block = [] status = [False] * (self.game.size * self.game.size) - self._bfs(vertex, utils.EMPTY, block, status, False) + self._bfs(vertex, utils.EMPTY, block, status) border = [] for b in block: for n in 
self._neighbor(b): @@ -106,7 +105,7 @@ class Go: nei = self._neighbor(vertex) for n in nei: if self.game.board[self.game._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n, alive_break=True) + can_kill, block = self._find_block(n) if can_kill: for b in block: self.game.board[self.game._flatten(b)] = utils.EMPTY diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index 0bad998..8c12c71 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -23,26 +23,32 @@ class GoEnv: x, y = vertex return (x - 1) * self.game.size + (y - 1) - def _bfs(self, vertex, color, block, status, alive_break): + def _find_group(self, start): + color = self.board[self._flatten(start)] + # print ("color : ", color) + chain = set() + frontier = [start] + has_liberty = False + while frontier: + current = frontier.pop() + # print ("current : ", current) + chain.add(current) + for n in self._neighbor(current): + # print n, self._flatten(n), self.board[self._flatten(n)], + if self.board[self._flatten(n)] == color and not n in chain: + frontier.append(n) + if self.board[self._flatten(n)] == utils.EMPTY: + has_liberty = True + return has_liberty, chain + + def _bfs(self, vertex, color, block, status): block.append(vertex) status[self._flatten(vertex)] = True nei = self._neighbor(vertex) for n in nei: if not status[self._flatten(n)]: if self.board[self._flatten(n)] == color: - self._bfs(n, color, block, status, alive_break) - - def _find_block(self, vertex, alive_break=False): - block = [] - status = [False] * (self.game.size * self.game.size) - color = self.board[self._flatten(vertex)] - self._bfs(vertex, color, block, status, alive_break) - - for b in block: - for n in self._neighbor(b): - if self.board[self._flatten(n)] == utils.EMPTY: - return False, block - return True, block + self._bfs(n, color, block, status) def _is_qi(self, color, vertex): nei = self._neighbor(vertex) @@ -53,14 +59,14 @@ class GoEnv: self.board[self._flatten(vertex)] = color for n in 
nei: if self.board[self._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n) - if can_kill: + has_liberty, group = self._find_group(n) + if not has_liberty: self.board[self._flatten(vertex)] = utils.EMPTY return True ### avoid suicide - can_kill, block = self._find_block(vertex) - if can_kill: + has_liberty, group = self._find_group(vertex) + if not has_liberty: self.board[self._flatten(vertex)] = utils.EMPTY return False @@ -110,26 +116,11 @@ class GoEnv: nei = self._neighbor(vertex) for n in nei: if self.board[self._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n, alive_break=True) - if can_kill: - for b in block: + has_liberty, group = self._find_group(n) + if not has_liberty: + for b in group: self.board[self._flatten(b)] = utils.EMPTY - def _find_group(self, start): - color = self.board[self._flatten(start)] - # print ("color : ", color) - chain = set() - frontier = [start] - while frontier: - current = frontier.pop() - # print ("current : ", current) - chain.add(current) - for n in self._neighbor(current): - # print n, self._flatten(n), self.board[self._flatten(n)], - if self.board[self._flatten(n)] == color and not n in chain: - frontier.append(n) - return chain - def _is_eye(self, color, vertex): nei = self._neighbor(vertex) cor = self._corner(vertex) @@ -137,7 +128,8 @@ class GoEnv: if False in ncolor: # print "not all neighbors are in same color with us" return False - if set(nei) < self._find_group(nei[0]): + _, group = self._find_group(nei[0]) + if set(nei) < group: # print "all neighbors are in same group and same color with us" return True else: From 6a410384bbcccd65fd204503c266b09fd1fc8f4b Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Tue, 19 Dec 2017 00:47:21 +0800 Subject: [PATCH 15/98] rewrite _is_qi in a more understandable way --- AlphaGo/strategy.py | 46 ++++++++++++++++++--------------------------- 1 file changed, 18 insertions(+), 28 deletions(-) diff --git a/AlphaGo/strategy.py 
b/AlphaGo/strategy.py index 8c12c71..e00e69d 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -41,37 +41,27 @@ class GoEnv: has_liberty = True return has_liberty, chain - def _bfs(self, vertex, color, block, status): - block.append(vertex) - status[self._flatten(vertex)] = True - nei = self._neighbor(vertex) - for n in nei: - if not status[self._flatten(n)]: - if self.board[self._flatten(n)] == color: - self._bfs(n, color, block, status) - - def _is_qi(self, color, vertex): - nei = self._neighbor(vertex) - for n in nei: - if self.board[self._flatten(n)] == utils.EMPTY: - return True - + def _is_suicide(self, color, vertex): + ### assume that we already take this move self.board[self._flatten(vertex)] = color - for n in nei: - if self.board[self._flatten(n)] == utils.another_color(color): - has_liberty, group = self._find_group(n) - if not has_liberty: - self.board[self._flatten(vertex)] = utils.EMPTY - return True - ### avoid suicide has_liberty, group = self._find_group(vertex) - if not has_liberty: + if has_liberty: + ### this group still has liberty after this move, not suicide self.board[self._flatten(vertex)] = utils.EMPTY return False - - self.board[self._flatten(vertex)] = utils.EMPTY - return True + else: + ### liberty is zero + for n in self._neighbor(vertex): + if self.board[self._flatten(n)] == utils.another_color(color): + opponent_liberty, group = self._find_group(n) + # this move is able to take opponent's stone, not suicide + if not opponent_liberty: + self.board[self._flatten(vertex)] = utils.EMPTY + return False + # not a take, suicide + self.board[self._flatten(vertex)] = utils.EMPTY + return True def _check_global_isomorphous(self, color, vertex): ##backup @@ -174,8 +164,8 @@ class GoEnv: # print(vertex) return False - ### check if it is qi - if not self._is_qi(color, vertex): + ### check if it is suicide + if self._is_suicide(color, vertex): return False ### forbid global isomorphous From 99a617a1f041643c1b0618d9de3b2017ed144b10 Mon 
Sep 17 00:00:00 2001 From: Dong Yan Date: Tue, 19 Dec 2017 11:16:17 +0800 Subject: [PATCH 16/98] rename variable for clarity --- AlphaGo/game.py | 16 ++++----- AlphaGo/go.py | 83 +++++++++++++++++++++++---------------------- AlphaGo/strategy.py | 60 ++++++++++++++++---------------- 3 files changed, 80 insertions(+), 79 deletions(-) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 02ccb27..3b62435 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -29,7 +29,7 @@ class Game: def __init__(self, size=9, komi=6.5, checkpoint_path=None): self.size = size self.komi = komi - self.board = [utils.EMPTY] * (self.size * self.size) + self.board = [utils.EMPTY] * (self.size ** 2) self.history = [] self.latest_boards = deque(maxlen=8) for _ in range(8): @@ -54,7 +54,7 @@ class Game: return (x,y) def clear(self): - self.board = [utils.EMPTY] * (self.size * self.size) + self.board = [utils.EMPTY] * (self.size ** 2) self.history = [] for _ in range(8): self.latest_boards.append(self.board) @@ -66,11 +66,11 @@ class Game: def set_komi(self, k): self.komi = k - def generate_nn_input(self, history, color): + def generate_nn_input(self, latest_boards, color): state = np.zeros([1, self.size, self.size, 17]) for i in range(8): - state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(self.size ** 2)).reshape(self.size, self.size) - state[0, :, :, i + 8] = np.array(np.array(history[i]) == -np.ones(self.size ** 2)).reshape(self.size, self.size) + state[0, :, :, i] = np.array(np.array(latest_boards[i]) == np.ones(self.size ** 2)).reshape(self.size, self.size) + state[0, :, :, i + 8] = np.array(np.array(latest_boards[i]) == -np.ones(self.size ** 2)).reshape(self.size, self.size) if color == utils.BLACK: state[0, :, :, 16] = np.ones([self.size, self.size]) if color == utils.WHITE: @@ -78,9 +78,9 @@ class Game: return state def strategy_gen_move(self, latest_boards, color): - self.simulator.latest_boards = copy.copy(latest_boards) - self.simulator.board = 
copy.copy(latest_boards[-1]) - nn_input = self.generate_nn_input(self.simulator.latest_boards, color) + self.simulator.simulate_latest_boards = copy.copy(latest_boards) + self.simulator.simulate_board = copy.copy(latest_boards[-1]) + nn_input = self.generate_nn_input(self.simulator.simulate_latest_boards, color) mcts = MCTS(self.simulator, self.evaluator, nn_input, self.size ** 2 + 1, inverse=True, max_step=1) temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 752973e..7b1d3e7 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -28,7 +28,7 @@ class Go: def _find_block(self, vertex): block = [] - status = [False] * (self.game.size * self.game.size) + status = [False] * (self.game.size ** 2) color = self.game.board[self.game._flatten(vertex)] self._bfs(vertex, color, block, status) @@ -40,7 +40,7 @@ class Go: def _find_boarder(self, vertex): block = [] - status = [False] * (self.game.size * self.game.size) + status = [False] * (self.game.size ** 2) self._bfs(vertex, utils.EMPTY, block, status) border = [] for b in block: @@ -141,6 +141,46 @@ class Go: idx = [i for i,x in enumerate(self.game.board) if x == utils.EMPTY ][0] return self.game._deflatten(idx) + def _add_nearby_stones(self, neighbor_vertex_set, start_vertex_x, start_vertex_y, x_diff, y_diff, num_step): + ''' + add the nearby stones around the input vertex + :param neighbor_vertex_set: input list + :param start_vertex_x: x axis of the input vertex + :param start_vertex_y: y axis of the input vertex + :param x_diff: add x axis + :param y_diff: add y axis + :param num_step: number of steps to be added + :return: + ''' + for step in xrange(num_step): + new_neighbor_vertex = (start_vertex_x, start_vertex_y) + if self._in_board(new_neighbor_vertex): + neighbor_vertex_set.append((start_vertex_x, start_vertex_y)) + start_vertex_x += x_diff + start_vertex_y += y_diff + + def _predict_from_nearby(self, vertex, neighbor_step = 3): + ''' + step: the nearby 
3 steps is considered + :vertex: position to be estimated + :neighbor_step: how many steps nearby + :return: the nearby positions of the input position + currently the nearby 3*3 grid is returned, altogether 4*8 points involved + ''' + for step in range(1, neighbor_step + 1): # check the stones within the steps in range + neighbor_vertex_set = [] + self._add_nearby_stones(neighbor_vertex_set, vertex[0] - step, vertex[1], 1, 1, neighbor_step) + self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] + step, 1, -1, neighbor_step) + self._add_nearby_stones(neighbor_vertex_set, vertex[0] + step, vertex[1], -1, -1, neighbor_step) + self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] - step, -1, 1, neighbor_step) + color_estimate = 0 + for neighbor_vertex in neighbor_vertex_set: + color_estimate += self.game.board[self.game._flatten(neighbor_vertex)] + if color_estimate > 0: + return utils.BLACK + elif color_estimate < 0: + return utils.WHITE + def get_score(self, is_unknown_estimation = False): ''' is_unknown_estimation: whether use nearby stone to predict the unknown @@ -170,42 +210,3 @@ class Go: self.game.board = _board return score - def _predict_from_nearby(self, vertex, neighbor_step = 3): - ''' - step: the nearby 3 steps is considered - :vertex: position to be estimated - :neighbor_step: how many steps nearby - :return: the nearby positions of the input position - currently the nearby 3*3 grid is returned, altogether 4*8 points involved - ''' - for step in range(1, neighbor_step + 1): # check the stones within the steps in range - neighbor_vertex_set = [] - self._add_nearby_stones(neighbor_vertex_set, vertex[0] - step, vertex[1], 1, 1, neighbor_step) - self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] + step, 1, -1, neighbor_step) - self._add_nearby_stones(neighbor_vertex_set, vertex[0] + step, vertex[1], -1, -1, neighbor_step) - self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] - step, -1, 1, neighbor_step) 
- color_estimate = 0 - for neighbor_vertex in neighbor_vertex_set: - color_estimate += self.game.board[self.game._flatten(neighbor_vertex)] - if color_estimate > 0: - return utils.BLACK - elif color_estimate < 0: - return utils.WHITE - - def _add_nearby_stones(self, neighbor_vertex_set, start_vertex_x, start_vertex_y, x_diff, y_diff, num_step): - ''' - add the nearby stones around the input vertex - :param neighbor_vertex_set: input list - :param start_vertex_x: x axis of the input vertex - :param start_vertex_y: y axis of the input vertex - :param x_diff: add x axis - :param y_diff: add y axis - :param num_step: number of steps to be added - :return: - ''' - for step in xrange(num_step): - new_neighbor_vertex = (start_vertex_x, start_vertex_y) - if self._in_board(new_neighbor_vertex): - neighbor_vertex_set.append((start_vertex_x, start_vertex_y)) - start_vertex_x += x_diff - start_vertex_y += y_diff diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index e00e69d..fe6bcbf 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -16,15 +16,15 @@ CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] class GoEnv: def __init__(self, **kwargs): self.game = kwargs['game'] - self.board = [utils.EMPTY] * (self.game.size * self.game.size) - self.latest_boards = deque(maxlen=8) + self.simulate_board = [utils.EMPTY] * (self.game.size ** 2) + self.simulate_latest_boards = deque(maxlen=8) - def _flatten(self, vertex): + def simulate_flatten(self, vertex): x, y = vertex return (x - 1) * self.game.size + (y - 1) def _find_group(self, start): - color = self.board[self._flatten(start)] + color = self.simulate_board[self.simulate_flatten(start)] # print ("color : ", color) chain = set() frontier = [start] @@ -35,45 +35,45 @@ class GoEnv: chain.add(current) for n in self._neighbor(current): # print n, self._flatten(n), self.board[self._flatten(n)], - if self.board[self._flatten(n)] == color and not n in chain: + if self.simulate_board[self.simulate_flatten(n)] == color and 
not n in chain: frontier.append(n) - if self.board[self._flatten(n)] == utils.EMPTY: + if self.simulate_board[self.simulate_flatten(n)] == utils.EMPTY: has_liberty = True return has_liberty, chain def _is_suicide(self, color, vertex): ### assume that we already take this move - self.board[self._flatten(vertex)] = color + self.simulate_board[self.simulate_flatten(vertex)] = color has_liberty, group = self._find_group(vertex) if has_liberty: ### this group still has liberty after this move, not suicide - self.board[self._flatten(vertex)] = utils.EMPTY + self.simulate_board[self.simulate_flatten(vertex)] = utils.EMPTY return False else: ### liberty is zero for n in self._neighbor(vertex): - if self.board[self._flatten(n)] == utils.another_color(color): + if self.simulate_board[self.simulate_flatten(n)] == utils.another_color(color): opponent_liberty, group = self._find_group(n) # this move is able to take opponent's stone, not suicide if not opponent_liberty: - self.board[self._flatten(vertex)] = utils.EMPTY + self.simulate_board[self.simulate_flatten(vertex)] = utils.EMPTY return False # not a take, suicide - self.board[self._flatten(vertex)] = utils.EMPTY + self.simulate_board[self.simulate_flatten(vertex)] = utils.EMPTY return True def _check_global_isomorphous(self, color, vertex): ##backup - _board = copy.copy(self.board) - self.board[self._flatten(vertex)] = color + _board = copy.copy(self.simulate_board) + self.simulate_board[self.simulate_flatten(vertex)] = color self._process_board(color, vertex) - if self.board in self.latest_boards: + if self.simulate_board in self.simulate_latest_boards: res = True else: res = False - self.board = _board + self.simulate_board = _board return res def _in_board(self, vertex): @@ -105,16 +105,16 @@ class GoEnv: def _process_board(self, color, vertex): nei = self._neighbor(vertex) for n in nei: - if self.board[self._flatten(n)] == utils.another_color(color): + if self.simulate_board[self.simulate_flatten(n)] == 
utils.another_color(color): has_liberty, group = self._find_group(n) if not has_liberty: for b in group: - self.board[self._flatten(b)] = utils.EMPTY + self.simulate_board[self.simulate_flatten(b)] = utils.EMPTY def _is_eye(self, color, vertex): nei = self._neighbor(vertex) cor = self._corner(vertex) - ncolor = {color == self.board[self._flatten(n)] for n in nei} + ncolor = {color == self.simulate_board[self.simulate_flatten(n)] for n in nei} if False in ncolor: # print "not all neighbors are in same color with us" return False @@ -123,7 +123,7 @@ class GoEnv: # print "all neighbors are in same group and same color with us" return True else: - opponent_number = [self.board[self._flatten(c)] for c in cor].count(-color) + opponent_number = [self.simulate_board[self.simulate_flatten(c)] for c in cor].count(-color) opponent_propotion = float(opponent_number) / float(len(cor)) if opponent_propotion < 0.5: # print "few opponents, real eye" @@ -141,7 +141,7 @@ class GoEnv: def simulate_is_valid(self, state, action): # state is the play board, the shape is [1, 9, 9, 17] - if action == self.game.size * self.game.size: + if action == self.game.size ** 2: vertex = (0, 0) else: vertex = (action / self.game.size + 1, action % self.game.size + 1) @@ -149,17 +149,17 @@ class GoEnv: color = utils.BLACK else: color = utils.WHITE - self.latest_boards.clear() + self.simulate_latest_boards.clear() for i in range(8): - self.latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) - self.board = copy.copy(self.latest_boards[-1]) + self.simulate_latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) + self.simulate_board = copy.copy(self.simulate_latest_boards[-1]) ### in board if not self._in_board(vertex): return False ### already have stone - if not self.board[self._flatten(vertex)] == utils.EMPTY: + if not self.simulate_board[self.simulate_flatten(vertex)] == utils.EMPTY: # print(np.array(self.board).reshape(9, 9)) # 
print(vertex) return False @@ -181,9 +181,9 @@ class GoEnv: if vertex == utils.PASS: return True - id_ = self._flatten(vertex) - if self.board[id_] == utils.EMPTY: - self.board[id_] = color + id_ = self.simulate_flatten(vertex) + if self.simulate_board[id_] == utils.EMPTY: + self.simulate_board[id_] = color return True else: return False @@ -199,11 +199,11 @@ class GoEnv: vertex = (action % self.game.size + 1, action / self.game.size + 1) # print(vertex) # print(self.board) - self.board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() + self.simulate_board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() self.do_move(color, vertex) new_state = np.concatenate( - [state[:, :, :, 1:8], (np.array(self.board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), - state[:, :, :, 9:16], (np.array(self.board) == utils.WHITE).reshape(1, self.game.size, self.game.size, 1), + [state[:, :, :, 1:8], (np.array(self.simulate_board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), + state[:, :, :, 9:16], (np.array(self.simulate_board) == utils.WHITE).reshape(1, self.game.size, self.game.size, 1), np.array(1 - state[:, :, :, -1]).reshape(1, self.game.size, self.game.size, 1)], axis=3) return new_state, 0 From 4440294c121d4fb36d62db703ce8e7d779424b42 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Tue, 19 Dec 2017 12:00:17 +0800 Subject: [PATCH 17/98] fix bug in check_global_isomorphous and refactor _is_suicide again --- AlphaGo/strategy.py | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index fe6bcbf..e9457cf 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -42,33 +42,27 @@ class GoEnv: return has_liberty, chain def _is_suicide(self, color, vertex): - ### assume that we already take this move - self.simulate_board[self.simulate_flatten(vertex)] = color + self.simulate_board[self.simulate_flatten(vertex)] = color # assume 
that we already take this move + suicide = False has_liberty, group = self._find_group(vertex) - if has_liberty: - ### this group still has liberty after this move, not suicide - self.simulate_board[self.simulate_flatten(vertex)] = utils.EMPTY - return False - else: - ### liberty is zero + if not has_liberty: + suicide = True # no liberty, suicide for n in self._neighbor(vertex): if self.simulate_board[self.simulate_flatten(n)] == utils.another_color(color): opponent_liberty, group = self._find_group(n) - # this move is able to take opponent's stone, not suicide if not opponent_liberty: - self.simulate_board[self.simulate_flatten(vertex)] = utils.EMPTY - return False - # not a take, suicide - self.simulate_board[self.simulate_flatten(vertex)] = utils.EMPTY - return True + suicide = False # this move is able to take opponent's stone, not suicide + + self.simulate_board[self.simulate_flatten(vertex)] = utils.EMPTY # undo this move + return suicide def _check_global_isomorphous(self, color, vertex): ##backup _board = copy.copy(self.simulate_board) self.simulate_board[self.simulate_flatten(vertex)] = color self._process_board(color, vertex) - if self.simulate_board in self.simulate_latest_boards: + if self.simulate_board in self.game.history: res = True else: res = False @@ -140,7 +134,9 @@ class GoEnv: return True def simulate_is_valid(self, state, action): - # state is the play board, the shape is [1, 9, 9, 17] + # State is the play board, the shape is [1, self.game.size, self.game.size, 17]. 
+ # Action is an index + # We need to transfer the (state, action) pair into (color, vertex) pair to simulate the move if action == self.game.size ** 2: vertex = (0, 0) else: @@ -177,7 +173,7 @@ class GoEnv: return True - def do_move(self, color, vertex): + def simulate_do_move(self, color, vertex): if vertex == utils.PASS: return True @@ -200,7 +196,7 @@ class GoEnv: # print(vertex) # print(self.board) self.simulate_board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() - self.do_move(color, vertex) + self.simulate_do_move(color, vertex) new_state = np.concatenate( [state[:, :, :, 1:8], (np.array(self.simulate_board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), state[:, :, :, 9:16], (np.array(self.simulate_board) == utils.WHITE).reshape(1, self.game.size, self.game.size, 1), From 0991fef527e73617114949a406e9da4632865e2d Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Tue, 19 Dec 2017 15:09:46 +0800 Subject: [PATCH 18/98] deflatten debug --- AlphaGo/game.py | 10 +++++----- AlphaGo/strategy.py | 9 +++++++-- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 3b62435..2a82d8e 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -46,12 +46,12 @@ class Game: def _flatten(self, vertex): x, y = vertex - return (y - 1) * self.size + (x - 1) + return (x - 1) * self.size + (y - 1) def _deflatten(self, idx): - x = idx % self.size + 1 - y = idx // self.size + 1 - return (x,y) + x = idx // self.size + 1 + y = idx % self.size + 1 + return (x, y) def clear(self): self.board = [utils.EMPTY] * (self.size ** 2) @@ -88,7 +88,7 @@ class Game: if choice == self.size ** 2: move = utils.PASS else: - move = (choice % self.size + 1, choice / self.size + 1) + move = self._deflatten(choice) return move, prob def do_move(self, color, vertex): diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index e9457cf..112f130 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -23,6 
+23,11 @@ class GoEnv: x, y = vertex return (x - 1) * self.game.size + (y - 1) + def simulate_deflatten(self, idx): + x = idx // self.game.size + 1 + y = idx % self.game.size + 1 + return (x, y) + def _find_group(self, start): color = self.simulate_board[self.simulate_flatten(start)] # print ("color : ", color) @@ -140,7 +145,7 @@ class GoEnv: if action == self.game.size ** 2: vertex = (0, 0) else: - vertex = (action / self.game.size + 1, action % self.game.size + 1) + vertex = self.simulate_deflatten(action) if state[0, 0, 0, -1] == utils.BLACK: color = utils.BLACK else: @@ -192,7 +197,7 @@ class GoEnv: if action == self.game.size ** 2: vertex = utils.PASS else: - vertex = (action % self.game.size + 1, action / self.game.size + 1) + vertex = self.simulate_deflatten(action) # print(vertex) # print(self.board) self.simulate_board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() From 4a2d8f0003443f6ca60f78370027914a4e4ff9c4 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Tue, 19 Dec 2017 15:39:31 +0800 Subject: [PATCH 19/98] start a random player if checkpoint path is not specified --- AlphaGo/play.py | 32 +++++++++++++++++++------------- AlphaGo/player.py | 4 +++- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/AlphaGo/play.py b/AlphaGo/play.py index fe6c7ce..7367804 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -1,3 +1,4 @@ +import argparse import subprocess import sys import re @@ -11,14 +12,17 @@ if __name__ == '__main__': Note that, this function requires the installation of the Pyro4 library. """ # TODO : we should set the network path in a more configurable way. - black_weight_path = "./checkpoints" - white_weight_path = "./checkpoints_origin" - if (not os.path.exists(black_weight_path)): - print "Can't not find the network weights for black player." - sys.exit() - if (not os.path.exists(white_weight_path)): - print "Can't not find the network weights for white player." 
- sys.exit() + parser = argparse.ArgumentParser() + parser.add_argument("--black_weight_path", type=str, default=None) + parser.add_argument("--white_weight_path", type=str, default=None) + args = parser.parse_args() + + # black_weight_path = "./checkpoints" + # white_weight_path = "./checkpoints_origin" + if args.black_weight_path is not None and (not os.path.exists(args.black_weight_path)): + raise ValueError("Can't not find the network weights for black player.") + if args.white_weight_path is not None and (not os.path.exists(args.white_weight_path)): + raise ValueError("Can't not find the network weights for white player.") # kill the old server kill_old_server = subprocess.Popen(['killall', 'pyro4-ns']) @@ -31,14 +35,16 @@ if __name__ == '__main__': time.sleep(1) # start two different player with different network weights. - agent_v0 = subprocess.Popen(['python', '-u', 'player.py', '--role=black'], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - agent_v1 = subprocess.Popen(['python', '-u', 'player.py', '--role=white', '--checkpoint_path=./checkpoints_origin/'], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + agent_v0 = subprocess.Popen(['python', '-u', 'player.py', '--role=black', '--checkpoint_path=' + str(args.black_weight_path)], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + + agent_v1 = subprocess.Popen(['python', '-u', 'player.py', '--role=white', '--checkpoint_path=' + str(args.white_weight_path)], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + server_list = "" while ("black" not in server_list) or ("white" not in server_list): server_list = subprocess.check_output(['pyro4-nsc', 'list']) - print "Waining for the server start..." + print "Waiting for the server start..." 
time.sleep(1) print server_list print "Start black player at : " + str(agent_v0.pid) diff --git a/AlphaGo/player.py b/AlphaGo/player.py index 8245c38..b468cf3 100644 --- a/AlphaGo/player.py +++ b/AlphaGo/player.py @@ -22,10 +22,12 @@ class Player(object): if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument("--checkpoint_path", type=str, default="./checkpoints/") + parser.add_argument("--checkpoint_path", type=str, default=None) parser.add_argument("--role", type=str, default="unknown") args = parser.parse_args() + if args.checkpoint_path == 'None': + args.checkpoint_path = None game = Game(checkpoint_path=args.checkpoint_path) engine = GTPEngine(game_obj=game, name='tianshou', version=0) From fc8114fe35646673e4b2f4ac00527879878a6ce3 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Tue, 19 Dec 2017 16:51:50 +0800 Subject: [PATCH 20/98] merge flatten and deflatten, rename variable for clarity --- AlphaGo/engine.py | 4 +-- AlphaGo/game.py | 15 ++++++----- AlphaGo/strategy.py | 45 +++++++++++++-------------------- tianshou/core/mcts/evaluator.py | 4 +-- tianshou/core/mcts/mcts.py | 2 +- 5 files changed, 31 insertions(+), 39 deletions(-) diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index 1f9af85..1ee8833 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -167,7 +167,7 @@ class GTPEngine(): move = self._parse_move(args) if move: color, vertex = move - res = self._game.do_move(color, vertex) + res = self._game.play_move(color, vertex) if res: return None, True else: @@ -177,7 +177,7 @@ class GTPEngine(): def cmd_genmove(self, args, **kwargs): color = self._parse_color(args) if color: - move = self._game.gen_move(color) + move = self._game.think_play_move(color) return self._vertex_point2string(move), True else: return 'unknown player', False diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 2a82d8e..d0cb91c 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -77,7 +77,7 @@ class Game: state[0, :, :, 16] = np.zeros([self.size, 
self.size]) return state - def strategy_gen_move(self, latest_boards, color): + def think(self, latest_boards, color): self.simulator.simulate_latest_boards = copy.copy(latest_boards) self.simulator.simulate_board = copy.copy(latest_boards[-1]) nn_input = self.generate_nn_input(self.simulator.simulate_latest_boards, color) @@ -91,17 +91,18 @@ class Game: move = self._deflatten(choice) return move, prob - def do_move(self, color, vertex): + def play_move(self, color, vertex): + # this function can be called directly to play the opponent's move if vertex == utils.PASS: return True res = self.executor.do_move(color, vertex) return res - def gen_move(self, color): - # move = self.strategy.gen_move(color) - # return move - move, self.prob = self.strategy_gen_move(self.latest_boards, color) - self.do_move(color, move) + def think_play_move(self, color): + # although we dont need to return self.prob, however it is needed for neural network training + move, self.prob = self.think(self.latest_boards, color) + # play the move immediately + self.play_move(color, move) return move def status2symbol(self, s): diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index 112f130..af017b1 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -10,7 +10,7 @@ import tensorflow as tf from collections import deque from tianshou.core.mcts.mcts import MCTS -DELTA = [[1, 0], [-1, 0], [0, -1], [0, 1]] +NEIGHBOR_OFFSET = [[1, 0], [-1, 0], [0, -1], [0, 1]] CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] class GoEnv: @@ -19,17 +19,8 @@ class GoEnv: self.simulate_board = [utils.EMPTY] * (self.game.size ** 2) self.simulate_latest_boards = deque(maxlen=8) - def simulate_flatten(self, vertex): - x, y = vertex - return (x - 1) * self.game.size + (y - 1) - - def simulate_deflatten(self, idx): - x = idx // self.game.size + 1 - y = idx % self.game.size + 1 - return (x, y) - def _find_group(self, start): - color = self.simulate_board[self.simulate_flatten(start)] + color = 
self.simulate_board[self.game._flatten(start)] # print ("color : ", color) chain = set() frontier = [start] @@ -40,32 +31,32 @@ class GoEnv: chain.add(current) for n in self._neighbor(current): # print n, self._flatten(n), self.board[self._flatten(n)], - if self.simulate_board[self.simulate_flatten(n)] == color and not n in chain: + if self.simulate_board[self.game._flatten(n)] == color and not n in chain: frontier.append(n) - if self.simulate_board[self.simulate_flatten(n)] == utils.EMPTY: + if self.simulate_board[self.game._flatten(n)] == utils.EMPTY: has_liberty = True return has_liberty, chain def _is_suicide(self, color, vertex): - self.simulate_board[self.simulate_flatten(vertex)] = color # assume that we already take this move + self.simulate_board[self.game._flatten(vertex)] = color # assume that we already take this move suicide = False has_liberty, group = self._find_group(vertex) if not has_liberty: suicide = True # no liberty, suicide for n in self._neighbor(vertex): - if self.simulate_board[self.simulate_flatten(n)] == utils.another_color(color): + if self.simulate_board[self.game._flatten(n)] == utils.another_color(color): opponent_liberty, group = self._find_group(n) if not opponent_liberty: suicide = False # this move is able to take opponent's stone, not suicide - self.simulate_board[self.simulate_flatten(vertex)] = utils.EMPTY # undo this move + self.simulate_board[self.game._flatten(vertex)] = utils.EMPTY # undo this move return suicide def _check_global_isomorphous(self, color, vertex): ##backup _board = copy.copy(self.simulate_board) - self.simulate_board[self.simulate_flatten(vertex)] = color + self.simulate_board[self.game._flatten(vertex)] = color self._process_board(color, vertex) if self.simulate_board in self.game.history: res = True @@ -84,7 +75,7 @@ class GoEnv: def _neighbor(self, vertex): x, y = vertex nei = [] - for d in DELTA: + for d in NEIGHBOR_OFFSET: _x = x + d[0] _y = y + d[1] if self._in_board((_x, _y)): @@ -104,16 +95,16 @@ 
class GoEnv: def _process_board(self, color, vertex): nei = self._neighbor(vertex) for n in nei: - if self.simulate_board[self.simulate_flatten(n)] == utils.another_color(color): + if self.simulate_board[self.game._flatten(n)] == utils.another_color(color): has_liberty, group = self._find_group(n) if not has_liberty: for b in group: - self.simulate_board[self.simulate_flatten(b)] = utils.EMPTY + self.simulate_board[self.game._flatten(b)] = utils.EMPTY def _is_eye(self, color, vertex): nei = self._neighbor(vertex) cor = self._corner(vertex) - ncolor = {color == self.simulate_board[self.simulate_flatten(n)] for n in nei} + ncolor = {color == self.simulate_board[self.game._flatten(n)] for n in nei} if False in ncolor: # print "not all neighbors are in same color with us" return False @@ -122,7 +113,7 @@ class GoEnv: # print "all neighbors are in same group and same color with us" return True else: - opponent_number = [self.simulate_board[self.simulate_flatten(c)] for c in cor].count(-color) + opponent_number = [self.simulate_board[self.game._flatten(c)] for c in cor].count(-color) opponent_propotion = float(opponent_number) / float(len(cor)) if opponent_propotion < 0.5: # print "few opponents, real eye" @@ -145,7 +136,7 @@ class GoEnv: if action == self.game.size ** 2: vertex = (0, 0) else: - vertex = self.simulate_deflatten(action) + vertex = self.game._deflatten(action) if state[0, 0, 0, -1] == utils.BLACK: color = utils.BLACK else: @@ -160,7 +151,7 @@ class GoEnv: return False ### already have stone - if not self.simulate_board[self.simulate_flatten(vertex)] == utils.EMPTY: + if not self.simulate_board[self.game._flatten(vertex)] == utils.EMPTY: # print(np.array(self.board).reshape(9, 9)) # print(vertex) return False @@ -182,14 +173,14 @@ class GoEnv: if vertex == utils.PASS: return True - id_ = self.simulate_flatten(vertex) + id_ = self.game._flatten(vertex) if self.simulate_board[id_] == utils.EMPTY: self.simulate_board[id_] = color return True else: return False 
- def step_forward(self, state, action): + def simulate_step_forward(self, state, action): if state[0, 0, 0, -1] == 1: color = utils.BLACK else: @@ -197,7 +188,7 @@ class GoEnv: if action == self.game.size ** 2: vertex = utils.PASS else: - vertex = self.simulate_deflatten(action) + vertex = self.game._deflatten(action) # print(vertex) # print(self.board) self.simulate_board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() diff --git a/tianshou/core/mcts/evaluator.py b/tianshou/core/mcts/evaluator.py index 9c4ee8e..a1f9456 100644 --- a/tianshou/core/mcts/evaluator.py +++ b/tianshou/core/mcts/evaluator.py @@ -19,10 +19,10 @@ class rollout_policy(evaluator): # TODO: prior for rollout policy total_reward = 0. action = np.random.randint(0, self.action_num) - state, reward = self.env.step_forward(state, action) + state, reward = self.env.simulate_step_forward(state, action) total_reward += reward while state is not None: action = np.random.randint(0, self.action_num) - state, reward = self.env.step_forward(state, action) + state, reward = self.env.simulate_step_forward(state, action) total_reward += reward return np.ones([self.action_num])/self.action_num, total_reward diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 979e994..b58c105 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -116,7 +116,7 @@ class ActionNode(object): self.next_state = tuple2list(self.next_state) def selection(self, simulator): - self.next_state, self.reward = simulator.step_forward(self.parent.state, self.action) + self.next_state, self.reward = simulator.simulate_step_forward(self.parent.state, self.action) self.origin_state = self.next_state self.state_type = type(self.next_state) self.type_conversion_to_tuple() From 1f011a44ef12ca6a8651a6870cc37670a1c96dec Mon Sep 17 00:00:00 2001 From: mcgrady00h <281130306@qq.com> Date: Tue, 19 Dec 2017 17:04:55 +0800 Subject: [PATCH 21/98] add mcts virtual loss version (may have bugs) --- 
tianshou/core/mcts/mcts_test.py | 3 + tianshou/core/mcts/mcts_virtual_loss.py | 263 +++++++++++++++++++ tianshou/core/mcts/mcts_virtual_loss_test.py | 55 ++++ 3 files changed, 321 insertions(+) create mode 100644 tianshou/core/mcts/mcts_virtual_loss.py create mode 100644 tianshou/core/mcts/mcts_virtual_loss_test.py diff --git a/tianshou/core/mcts/mcts_test.py b/tianshou/core/mcts/mcts_test.py index da404ca..49b85be 100644 --- a/tianshou/core/mcts/mcts_test.py +++ b/tianshou/core/mcts/mcts_test.py @@ -12,6 +12,9 @@ class TestEnv: print(self.reward) # print("The best arm is {} with expected reward {}".format(self.best[0],self.best[1])) + def simulate_is_valid(self, state, act): + return True + def step_forward(self, state, action): if action != 0 and action != 1: raise ValueError("Action must be 0 or 1! Your action is {}".format(action)) diff --git a/tianshou/core/mcts/mcts_virtual_loss.py b/tianshou/core/mcts/mcts_virtual_loss.py new file mode 100644 index 0000000..9d20b5a --- /dev/null +++ b/tianshou/core/mcts/mcts_virtual_loss.py @@ -0,0 +1,263 @@ +# -*- coding: utf-8 -*- +# vim:fenc=utf-8 +# $File: mcts_virtual_loss.py +# $Date: Tue Dec 19 17:0444 2017 +0800 +# Original file: mcts.py +# $Author: renyong15 Ā© +# + +""" + This is an implementation of the MCTS with virtual loss. + Due to the limitation of Python design mechanism, we implements the virtual loss in a mini-batch + manner. 
+""" + +import numpy as np +import math +import time + +c_puct = 5 + + +def list2tuple(list): + try: + return tuple(list2tuple(sub) for sub in list) + except TypeError: + return list + + +def tuple2list(tuple): + try: + return list(tuple2list(sub) for sub in tuple) + except TypeError: + return tuple + + +class MCTSNodeVirtualLoss(object): + def __init__(self, parent, action, state, action_num, prior, inverse=False): + self.parent = parent + self.action = action + self.children = {} + self.state = state + self.action_num = action_num + self.prior = np.array(prior).reshape(-1) + self.inverse = inverse + + def selection(self, simulator): + raise NotImplementedError("Need to implement function selection") + + def backpropagation(self, action): + raise NotImplementedError("Need to implement function backpropagation") + + def valid_mask(self, simulator): + pass + +class UCTNodeVirtualLoss(MCTSNodeVirtualLoss): + def __init__(self, parent, action, state, action_num, prior, inverse=False): + super(UCTNodeVirtualLoss, self).__init__(parent, action, state, action_num, prior, inverse) + self.Q = np.zeros([action_num]) + self.W = np.zeros([action_num]) + self.N = np.zeros([action_num]) + self.virtual_loss = np.zeros([action_num]) + #### modified by adding virtual loss + #self.ucb = self.Q + c_puct * self.prior * math.sqrt(np.sum(self.N)) / (self.N + 1) + + self.mask = None + + def selection(self, simulator): + self.valid_mask(simulator) + self.Q = np.zeros([self.action_num]) + N_not_zero = self.N > 0 + self.Q[N_not_zero] = (self.W[N_not_zero] + self.virtual_loss[N_not_zero] + 0.) 
/ self.N[N_not_zero] + self.ucb = self.Q + c_puct * self.prior * math.sqrt(np.sum(self.N + self.virtual_loss)) /\ + (self.N + self.virtual_loss + 1) + action = np.argmax(self.ucb) + self.virtual_loss[action] += 1 + + if action in self.children.keys(): + return self.children[action].selection(simulator) + else: + self.children[action] = ActionNodeVirtualLoss(self, action) + return self.children[action].selection(simulator) + + def remove_virtual_loss(self): + ### if not virtual_loss for every action is zero + if np.sum(self.virtual_loss > 0) > 0: + self.virtual_loss = np.zeros([self.action_num]) + if self.parent: + self.parent.remove_virtual_loss() + + def backpropagation(self, action): + action = int(action) + self.N[action] += 1 + self.W[action] += self.children[action].reward + + ## do not need to compute Q and ucb immediately since it will be modified by virtual loss + #for i in range(self.action_num): + # if self.N[i] != 0: + # self.Q[i] = (self.W[i] + 0.) / self.N[i] + #self.ucb = self.Q + c_puct * self.prior * math.sqrt(np.sum(self.N)) / (self.N + 1.) 
+ + if self.parent is not None: + if self.inverse: + self.parent.backpropagation(-self.children[action].reward) + else: + self.parent.backpropagation(self.children[action].reward) + + def valid_mask(self, simulator): + if self.mask is None: + start_time = time.time() + self.mask = [] + for act in range(self.action_num - 1): + if not simulator.simulate_is_valid(self.state, act): + self.mask.append(act) + self.ucb[act] = -float("Inf") + else: + self.ucb[self.mask] = -float("Inf") + + + +class ActionNodeVirtualLoss(object): + def __init__(self, parent, action): + self.parent = parent + self.action = action + self.children = {} + self.next_state = None + self.origin_state = None + self.state_type = None + self.reward = 0 + + def remove_virtual_loss(self): + self.parent.remove_virtual_loss() + + def type_conversion_to_tuple(self): + if type(self.next_state) is np.ndarray: + self.next_state = self.next_state.tolist() + if type(self.next_state) is list: + self.next_state = list2tuple(self.next_state) + + def type_conversion_to_origin(self): + if self.state_type is np.ndarray: + self.next_state = np.array(self.next_state) + if self.state_type is list: + self.next_state = tuple2list(self.next_state) + + def selection(self, simulator): + self.next_state, self.reward = simulator.step_forward(self.parent.state, self.action) + self.origin_state = self.next_state + self.state_type = type(self.next_state) + self.type_conversion_to_tuple() + if self.next_state is not None: + if self.next_state in self.children.keys(): + return self.children[self.next_state].selection(simulator) + else: + return self.parent, self.action + else: + return self.parent, self.action + + def expansion(self, action, state, action_num, prior, inverse ): + if state is not None: + self.children[state] = UCTNodeVirtualLoss(self, action, state, action_num, prior, inverse) + + + def backpropagation(self, value): + self.reward += value + self.parent.backpropagation(self.action) + + +class 
MCTSVirtualLoss(object): + def __init__(self, simulator, evaluator, root, action_num, batch_size = 1, method = "UCT", inverse = False): + self.simulator = simulator + self.evaluator = evaluator + prior, _ = self.evaluator(root) + self.action_num = action_num + self.batch_size = batch_size + + if method == "": + self.root = root + elif method == "UCT": + self.root = UCTNodeVirtualLoss(None, None, root, action_num, prior, inverse) + elif method == "TS": + self.root = TSNodeVirtualLoss(None, None, root, action_num, prior, inverse=inverse) + else: + raise ValueError("Need a root type") + + self.inverse = inverse + + + def do_search(self, max_step=None, max_time=None): + if max_step is not None: + self.step = 0 + self.max_step = max_step + if max_time is not None: + self.start_time = time.time() + self.max_time = max_time + if max_step is None and max_time is None: + raise ValueError("Need a stop criteria!") + + self.select_time = [] + self.evaluate_time = [] + self.bp_time = [] + while (max_step is not None and self.step < self.max_step or max_step is None) \ + and (max_time is not None and time.time() - self.start_time < self.max_time or max_time is None): + self.expand() + if max_step is not None: + self.step += 1 + + def expand(self): + ## minibatch with virtual loss + nodes = [] + new_actions = [] + next_states = [] + + for i in range(self.batch_size): + node, new_action = self.root.selection(self.simulator) + nodes.append(node) + new_actions.append(new_action) + next_states.append(node.children[new_action].next_state) + + for node in nodes: + node.remove_virtual_loss() + + assert(np.sum(self.root.virtual_loss > 0) == 0) + #### compute value in batch manner unless the evaluator do not support it + try: + priors, values = self.evaluator(next_states) + except: + priors = [] + values = [] + for i in range(self.batch_size): + if next_states[i] is not None: + prior, value = self.evaluator(next_states[i]) + priors.append(prior) + values.append(value) + else: + 
priors.append(0.) + values.append(0.) + + #### for now next_state == origin_state + #### may have problem here. What if we reached the same next_state with same parent and action pair + for i in range(self.batch_size): + nodes[i].children[new_actions[i]].expansion(new_actions[i], + next_states[i], + self.action_num, + priors[i], + nodes[i].inverse) + + for i in range(self.batch_size): + nodes[i].children[new_actions[i]].backpropagation(values[i] + 0.) + + +##### TODO +class TSNodeVirtualLoss(MCTSNodeVirtualLoss): + def __init__(self, parent, action, state, action_num, prior, method="Gaussian", inverse=False): + super(TSNodeVirtualLoss, self).__init__(parent, action, state, action_num, prior, inverse) + if method == "Beta": + self.alpha = np.ones([action_num]) + self.beta = np.ones([action_num]) + if method == "Gaussian": + self.mu = np.zeros([action_num]) + self.sigma = np.zeros([action_num]) + +if __name__ == "__main__": + mcts_virtual_loss = MCTSNodeVirtualLoss(None, None, 10, 1, 'UCT') diff --git a/tianshou/core/mcts/mcts_virtual_loss_test.py b/tianshou/core/mcts/mcts_virtual_loss_test.py new file mode 100644 index 0000000..d2d6c81 --- /dev/null +++ b/tianshou/core/mcts/mcts_virtual_loss_test.py @@ -0,0 +1,55 @@ +# -*- coding: utf-8 -*- +# vim:fenc=utf-8 +# $File: mcts_virtual_loss_test.py +# $Date: Tue Dec 19 16:5459 2017 +0800 +# Original file: mcts_test.py +# $Author: renyong15 Ā© +# + + + +import numpy as np +from mcts_virtual_loss import MCTSVirtualLoss +from evaluator import rollout_policy + + +class TestEnv: + def __init__(self, max_step=5): + self.max_step = max_step + self.reward = {i: np.random.uniform() for i in range(2 ** max_step)} + # self.reward = {0:1, 1:0} + self.best = max(self.reward.items(), key=lambda x: x[1]) + print(self.reward) + # print("The best arm is {} with expected reward {}".format(self.best[0],self.best[1])) + + def simulate_is_valid(self, state, act): + return True + + def step_forward(self, state, action): + if action != 0 and 
action != 1: + raise ValueError("Action must be 0 or 1! Your action is {}".format(action)) + if state[0] >= 2 ** state[1] or state[1] > self.max_step: + raise ValueError("Invalid State! Your state is {}".format(state)) + # print("Operate action {} at state {}, timestep {}".format(action, state[0], state[1])) + if state[1] == self.max_step: + new_state = None + reward = 0 + else: + num = state[0] + 2 ** state[1] * action + step = state[1] + 1 + new_state = [num, step] + if step == self.max_step: + reward = int(np.random.uniform() < self.reward[num]) + else: + reward = 0. + return new_state, reward + + +if __name__ == "__main__": + env = TestEnv(2) + rollout = rollout_policy(env, 2) + evaluator = lambda state: rollout(state) + mcts_virtual_loss = MCTSVirtualLoss(env, evaluator, [0, 0], 2, batch_size = 10) + for i in range(10): + mcts_virtual_loss.do_search(max_step = 100) + From 232204d7970ef261c8f99394f2cc631a674a17a0 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Tue, 19 Dec 2017 22:57:38 +0800 Subject: [PATCH 22/98] fix the copy bug in check_global_isomorphous; refactor code to eliminate side effect --- AlphaGo/go.py | 36 ++++++------- AlphaGo/strategy.py | 104 +++++++++++++++++-------------------- tianshou/core/mcts/mcts.py | 3 +- 3 files changed, 67 insertions(+), 76 deletions(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 7b1d3e7..8e3518d 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -72,18 +72,14 @@ class Go: self.game.board[self.game._flatten(vertex)] = utils.EMPTY return True - def _check_global_isomorphous(self, color, vertex): - ##backup - _board = copy.copy(self.game.board) - self.game.board[self.game._flatten(vertex)] = color - self._process_board(color, vertex) - if self.game.board in self.game.history: - res = True - else: - res = False - - self.game.board = _board - return res + def _check_global_isomorphous(self, history_boards, current_board, color, vertex): + repeat = False + next_board = copy.copy(current_board) + 
next_board[self.game._flatten(vertex)] = color + self._process_board(next_board, color, vertex) + if next_board in history_boards: + repeat = True + return repeat def _in_board(self, vertex): x, y = vertex @@ -101,38 +97,38 @@ class Go: nei.append((_x, _y)) return nei - def _process_board(self, color, vertex): + def _process_board(self, current_board, color, vertex): nei = self._neighbor(vertex) for n in nei: - if self.game.board[self.game._flatten(n)] == utils.another_color(color): + if current_board[self.game._flatten(n)] == utils.another_color(color): can_kill, block = self._find_block(n) if can_kill: for b in block: - self.game.board[self.game._flatten(b)] = utils.EMPTY + current_board[self.game._flatten(b)] = utils.EMPTY - def is_valid(self, color, vertex): + def is_valid(self, history_boards, current_board, color, vertex): ### in board if not self._in_board(vertex): return False ### already have stone - if not self.game.board[self.game._flatten(vertex)] == utils.EMPTY: + if not current_board[self.game._flatten(vertex)] == utils.EMPTY: return False ### check if it is qi if not self._is_qi(color, vertex): return False - if self._check_global_isomorphous(color, vertex): + if self._check_global_isomorphous(history_boards, current_board, color, vertex): return False return True def do_move(self, color, vertex): - if not self.is_valid(color, vertex): + if not self.is_valid(self.game.history, self.game.board, color, vertex): return False self.game.board[self.game._flatten(vertex)] = color - self._process_board(color, vertex) + self._process_board(self.game.board, color, vertex) self.game.history.append(copy.copy(self.game.board)) self.game.latest_boards.append(copy.copy(self.game.board)) return True diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index af017b1..07555e9 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -19,52 +19,47 @@ class GoEnv: self.simulate_board = [utils.EMPTY] * (self.game.size ** 2) self.simulate_latest_boards = 
deque(maxlen=8) - def _find_group(self, start): - color = self.simulate_board[self.game._flatten(start)] + def _find_group(self, current_board, vertex): + color = current_board[self.game._flatten(vertex)] # print ("color : ", color) chain = set() - frontier = [start] + frontier = [vertex] has_liberty = False while frontier: current = frontier.pop() # print ("current : ", current) chain.add(current) for n in self._neighbor(current): - # print n, self._flatten(n), self.board[self._flatten(n)], - if self.simulate_board[self.game._flatten(n)] == color and not n in chain: + if current_board[self.game._flatten(n)] == color and not n in chain: frontier.append(n) - if self.simulate_board[self.game._flatten(n)] == utils.EMPTY: + if current_board[self.game._flatten(n)] == utils.EMPTY: has_liberty = True return has_liberty, chain - def _is_suicide(self, color, vertex): - self.simulate_board[self.game._flatten(vertex)] = color # assume that we already take this move + def _is_suicide(self, current_board, color, vertex): + current_board[self.game._flatten(vertex)] = color # assume that we already take this move suicide = False - has_liberty, group = self._find_group(vertex) + has_liberty, group = self._find_group(current_board, vertex) if not has_liberty: suicide = True # no liberty, suicide for n in self._neighbor(vertex): - if self.simulate_board[self.game._flatten(n)] == utils.another_color(color): - opponent_liberty, group = self._find_group(n) + if current_board[self.game._flatten(n)] == utils.another_color(color): + opponent_liberty, group = self._find_group(current_board, n) if not opponent_liberty: suicide = False # this move is able to take opponent's stone, not suicide - self.simulate_board[self.game._flatten(vertex)] = utils.EMPTY # undo this move + current_board[self.game._flatten(vertex)] = utils.EMPTY # undo this move return suicide - def _check_global_isomorphous(self, color, vertex): - ##backup - _board = copy.copy(self.simulate_board) - 
self.simulate_board[self.game._flatten(vertex)] = color - self._process_board(color, vertex) - if self.simulate_board in self.game.history: - res = True - else: - res = False - - self.simulate_board = _board - return res + def _check_global_isomorphous(self, history_boards, current_board, color, vertex): + repeat = False + next_board = copy.copy(current_board) + next_board[self.game._flatten(vertex)] = color + self._process_board(next_board, color, vertex) + if next_board in history_boards: + repeat = True + return repeat def _in_board(self, vertex): x, y = vertex @@ -92,28 +87,28 @@ class GoEnv: corner.append((_x, _y)) return corner - def _process_board(self, color, vertex): + def _process_board(self, current_board, color, vertex): nei = self._neighbor(vertex) for n in nei: - if self.simulate_board[self.game._flatten(n)] == utils.another_color(color): - has_liberty, group = self._find_group(n) + if current_board[self.game._flatten(n)] == utils.another_color(color): + has_liberty, group = self._find_group(current_board, n) if not has_liberty: for b in group: - self.simulate_board[self.game._flatten(b)] = utils.EMPTY + current_board[self.game._flatten(b)] = utils.EMPTY - def _is_eye(self, color, vertex): + def _is_eye(self, current_board, color, vertex): nei = self._neighbor(vertex) cor = self._corner(vertex) - ncolor = {color == self.simulate_board[self.game._flatten(n)] for n in nei} + ncolor = {color == current_board[self.game._flatten(n)] for n in nei} if False in ncolor: # print "not all neighbors are in same color with us" return False - _, group = self._find_group(nei[0]) + _, group = self._find_group(current_board, nei[0]) if set(nei) < group: # print "all neighbors are in same group and same color with us" return True else: - opponent_number = [self.simulate_board[self.game._flatten(c)] for c in cor].count(-color) + opponent_number = [current_board[self.game._flatten(c)] for c in cor].count(-color) opponent_propotion = float(opponent_number) / 
float(len(cor)) if opponent_propotion < 0.5: # print "few opponents, real eye" @@ -122,49 +117,54 @@ class GoEnv: # print "many opponents, fake eye" return False - def knowledge_prunning(self, color, vertex): + def knowledge_prunning(self, current_board, color, vertex): ### check if it is an eye of yourself ### assumptions : notice that this judgement requires that the state is an endgame - if self._is_eye(color, vertex): + if self._is_eye(current_board, color, vertex): return False return True - def simulate_is_valid(self, state, action): - # State is the play board, the shape is [1, self.game.size, self.game.size, 17]. - # Action is an index + def sa2cv(self, state, action): + # State is the play board, the shape is [1, self.game.size, self.game.size, 17], action is an index. # We need to transfer the (state, action) pair into (color, vertex) pair to simulate the move - if action == self.game.size ** 2: - vertex = (0, 0) - else: - vertex = self.game._deflatten(action) if state[0, 0, 0, -1] == utils.BLACK: color = utils.BLACK else: color = utils.WHITE + if action == self.game.size ** 2: + vertex = (0, 0) + else: + vertex = self.game._deflatten(action) + return color, vertex + + def simulate_is_valid(self, history_boards, current_board, state, action): + # initialize simulate_latest_boards and simulate_board from state self.simulate_latest_boards.clear() for i in range(8): self.simulate_latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) self.simulate_board = copy.copy(self.simulate_latest_boards[-1]) + color, vertex = self.sa2cv(state, action) + ### in board if not self._in_board(vertex): return False ### already have stone - if not self.simulate_board[self.game._flatten(vertex)] == utils.EMPTY: + if not current_board[self.game._flatten(vertex)] == utils.EMPTY: # print(np.array(self.board).reshape(9, 9)) # print(vertex) return False ### check if it is suicide - if self._is_suicide(color, vertex): + if 
self._is_suicide(current_board, color, vertex): return False ### forbid global isomorphous - if self._check_global_isomorphous(color, vertex): + if self._check_global_isomorphous(history_boards, current_board, color, vertex): return False - if not self.knowledge_prunning(color, vertex): + if not self.knowledge_prunning(current_board, color, vertex): return False return True @@ -181,17 +181,11 @@ class GoEnv: return False def simulate_step_forward(self, state, action): - if state[0, 0, 0, -1] == 1: - color = utils.BLACK - else: - color = utils.WHITE - if action == self.game.size ** 2: - vertex = utils.PASS - else: - vertex = self.game._deflatten(action) - # print(vertex) - # print(self.board) + # initialize the simulate_board from state self.simulate_board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() + + color, vertex = self.sa2cv(state, action) + self.simulate_do_move(color, vertex) new_state = np.concatenate( [state[:, :, :, 1:8], (np.array(self.simulate_board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index b58c105..12fc85d 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -75,7 +75,8 @@ class UCTNode(MCTSNode): start_time = time.time() self.mask = [] for act in range(self.action_num - 1): - if not simulator.simulate_is_valid(self.state, act): + if not simulator.simulate_is_valid( + simulator.simulate_latest_boards, simulator.simulate_board, self.state, act): self.mask.append(act) self.ucb[act] = -float("Inf") else: From 2a9d949510f3e2032e868fa64bb0d6efc7624fc3 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Wed, 20 Dec 2017 00:16:24 +0800 Subject: [PATCH 23/98] rearrange the sequence of functions of Go and GoEnv before merging --- AlphaGo/go.py | 125 ++++++++++++++++++++------------------------ AlphaGo/strategy.py | 70 ++++++++++++------------- 2 files changed, 91 insertions(+), 104 deletions(-) diff --git a/AlphaGo/go.py 
b/AlphaGo/go.py index 8e3518d..37d8339 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -17,70 +17,6 @@ class Go: def __init__(self, **kwargs): self.game = kwargs['game'] - def _bfs(self, vertex, color, block, status): - block.append(vertex) - status[self.game._flatten(vertex)] = True - nei = self._neighbor(vertex) - for n in nei: - if not status[self.game._flatten(n)]: - if self.game.board[self.game._flatten(n)] == color: - self._bfs(n, color, block, status) - - def _find_block(self, vertex): - block = [] - status = [False] * (self.game.size ** 2) - color = self.game.board[self.game._flatten(vertex)] - self._bfs(vertex, color, block, status) - - for b in block: - for n in self._neighbor(b): - if self.game.board[self.game._flatten(n)] == utils.EMPTY: - return False, block - return True, block - - def _find_boarder(self, vertex): - block = [] - status = [False] * (self.game.size ** 2) - self._bfs(vertex, utils.EMPTY, block, status) - border = [] - for b in block: - for n in self._neighbor(b): - if not (n in block): - border.append(n) - return border - - def _is_qi(self, color, vertex): - nei = self._neighbor(vertex) - for n in nei: - if self.game.board[self.game._flatten(n)] == utils.EMPTY: - return True - - self.game.board[self.game._flatten(vertex)] = color - for n in nei: - if self.game.board[self.game._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n) - if can_kill: - self.game.board[self.game._flatten(vertex)] = utils.EMPTY - return True - - ### can not suicide - can_kill, block = self._find_block(vertex) - if can_kill: - self.game.board[self.game._flatten(vertex)] = utils.EMPTY - return False - - self.game.board[self.game._flatten(vertex)] = utils.EMPTY - return True - - def _check_global_isomorphous(self, history_boards, current_board, color, vertex): - repeat = False - next_board = copy.copy(current_board) - next_board[self.game._flatten(vertex)] = color - self._process_board(next_board, color, vertex) - if next_board in 
history_boards: - repeat = True - return repeat - def _in_board(self, vertex): x, y = vertex if x < 1 or x > self.game.size: return False @@ -97,15 +33,57 @@ class Go: nei.append((_x, _y)) return nei + def _find_group(self, current_board, vertex): + color = current_board[self.game._flatten(vertex)] + # print ("color : ", color) + chain = set() + frontier = [vertex] + has_liberty = False + while frontier: + current = frontier.pop() + # print ("current : ", current) + chain.add(current) + for n in self._neighbor(current): + if current_board[self.game._flatten(n)] == color and not n in chain: + frontier.append(n) + if current_board[self.game._flatten(n)] == utils.EMPTY: + has_liberty = True + return has_liberty, chain + + def _is_suicide(self, current_board, color, vertex): + current_board[self.game._flatten(vertex)] = color # assume that we already take this move + suicide = False + + has_liberty, group = self._find_group(current_board, vertex) + if not has_liberty: + suicide = True # no liberty, suicide + for n in self._neighbor(vertex): + if current_board[self.game._flatten(n)] == utils.another_color(color): + opponent_liberty, group = self._find_group(current_board, n) + if not opponent_liberty: + suicide = False # this move is able to take opponent's stone, not suicide + + current_board[self.game._flatten(vertex)] = utils.EMPTY # undo this move + return suicide + def _process_board(self, current_board, color, vertex): nei = self._neighbor(vertex) for n in nei: if current_board[self.game._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n) - if can_kill: - for b in block: + has_liberty, group = self._find_group(current_board, n) + if not has_liberty: + for b in group: current_board[self.game._flatten(b)] = utils.EMPTY + def _check_global_isomorphous(self, history_boards, current_board, color, vertex): + repeat = False + next_board = copy.copy(current_board) + next_board[self.game._flatten(vertex)] = color + 
self._process_board(next_board, color, vertex) + if next_board in history_boards: + repeat = True + return repeat + def is_valid(self, history_boards, current_board, color, vertex): ### in board if not self._in_board(vertex): @@ -115,8 +93,8 @@ class Go: if not current_board[self.game._flatten(vertex)] == utils.EMPTY: return False - ### check if it is qi - if not self._is_qi(color, vertex): + ### check if it is suicide + if self._is_suicide(current_board, color, vertex): return False if self._check_global_isomorphous(history_boards, current_board, color, vertex): @@ -137,6 +115,15 @@ class Go: idx = [i for i,x in enumerate(self.game.board) if x == utils.EMPTY ][0] return self.game._deflatten(idx) + def _find_boarder(self, vertex): + _, group = self._find_group(self.game.board, vertex) + border = [] + for b in group: + for n in self._neighbor(b): + if not (n in group): + border.append(n) + return border + def _add_nearby_stones(self, neighbor_vertex_set, start_vertex_x, start_vertex_y, x_diff, y_diff, num_step): ''' add the nearby stones around the input vertex diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index 07555e9..9ebd421 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -19,6 +19,32 @@ class GoEnv: self.simulate_board = [utils.EMPTY] * (self.game.size ** 2) self.simulate_latest_boards = deque(maxlen=8) + def _in_board(self, vertex): + x, y = vertex + if x < 1 or x > self.game.size: return False + if y < 1 or y > self.game.size: return False + return True + + def _neighbor(self, vertex): + x, y = vertex + nei = [] + for d in NEIGHBOR_OFFSET: + _x = x + d[0] + _y = y + d[1] + if self._in_board((_x, _y)): + nei.append((_x, _y)) + return nei + + def _corner(self, vertex): + x, y = vertex + corner = [] + for d in CORNER_OFFSET: + _x = x + d[0] + _y = y + d[1] + if self._in_board((_x, _y)): + corner.append((_x, _y)) + return corner + def _find_group(self, current_board, vertex): color = current_board[self.game._flatten(vertex)] # print ("color 
: ", color) @@ -52,41 +78,6 @@ class GoEnv: current_board[self.game._flatten(vertex)] = utils.EMPTY # undo this move return suicide - def _check_global_isomorphous(self, history_boards, current_board, color, vertex): - repeat = False - next_board = copy.copy(current_board) - next_board[self.game._flatten(vertex)] = color - self._process_board(next_board, color, vertex) - if next_board in history_boards: - repeat = True - return repeat - - def _in_board(self, vertex): - x, y = vertex - if x < 1 or x > self.game.size: return False - if y < 1 or y > self.game.size: return False - return True - - def _neighbor(self, vertex): - x, y = vertex - nei = [] - for d in NEIGHBOR_OFFSET: - _x = x + d[0] - _y = y + d[1] - if self._in_board((_x, _y)): - nei.append((_x, _y)) - return nei - - def _corner(self, vertex): - x, y = vertex - corner = [] - for d in CORNER_OFFSET: - _x = x + d[0] - _y = y + d[1] - if self._in_board((_x, _y)): - corner.append((_x, _y)) - return corner - def _process_board(self, current_board, color, vertex): nei = self._neighbor(vertex) for n in nei: @@ -96,6 +87,15 @@ class GoEnv: for b in group: current_board[self.game._flatten(b)] = utils.EMPTY + def _check_global_isomorphous(self, history_boards, current_board, color, vertex): + repeat = False + next_board = copy.copy(current_board) + next_board[self.game._flatten(vertex)] = color + self._process_board(next_board, color, vertex) + if next_board in history_boards: + repeat = True + return repeat + def _is_eye(self, current_board, color, vertex): nei = self._neighbor(vertex) cor = self._corner(vertex) From d1af137686355b347f7c5b6b7fd117969b9a04cc Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Wed, 20 Dec 2017 00:43:31 +0800 Subject: [PATCH 24/98] final version before merge Go and GoEnv --- AlphaGo/engine.py | 2 +- AlphaGo/game.py | 3 ++- AlphaGo/go.py | 8 ++++---- AlphaGo/self-play.py | 2 +- AlphaGo/strategy.py | 38 +++++++++++++++++++++----------------- 5 files changed, 29 insertions(+), 24 deletions(-) 
diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index 1ee8833..d11635a 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -183,7 +183,7 @@ class GTPEngine(): return 'unknown player', False def cmd_get_score(self, args, **kwargs): - return self._game.executor.get_score(), None + return self._game.executor.executor_get_score(), None def cmd_show_board(self, args, **kwargs): return self._game.board, True diff --git a/AlphaGo/game.py b/AlphaGo/game.py index d0cb91c..af4ef57 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -78,6 +78,7 @@ class Game: return state def think(self, latest_boards, color): + # TODO : using copy is right, or should we change to deepcopy? self.simulator.simulate_latest_boards = copy.copy(latest_boards) self.simulator.simulate_board = copy.copy(latest_boards[-1]) nn_input = self.generate_nn_input(self.simulator.simulate_latest_boards, color) @@ -95,7 +96,7 @@ class Game: # this function can be called directly to play the opponent's move if vertex == utils.PASS: return True - res = self.executor.do_move(color, vertex) + res = self.executor.executor_do_move(color, vertex) return res def think_play_move(self, color): diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 37d8339..108c9bd 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -84,7 +84,7 @@ class Go: repeat = True return repeat - def is_valid(self, history_boards, current_board, color, vertex): + def _is_valid(self, history_boards, current_board, color, vertex): ### in board if not self._in_board(vertex): return False @@ -102,8 +102,8 @@ class Go: return True - def do_move(self, color, vertex): - if not self.is_valid(self.game.history, self.game.board, color, vertex): + def executor_do_move(self, color, vertex): + if not self._is_valid(self.game.history, self.game.board, color, vertex): return False self.game.board[self.game._flatten(vertex)] = color self._process_board(self.game.board, color, vertex) @@ -164,7 +164,7 @@ class Go: elif color_estimate < 0: return utils.WHITE - 
def get_score(self, is_unknown_estimation = False): + def executor_get_score(self, is_unknown_estimation = False): ''' is_unknown_estimation: whether use nearby stone to predict the unknown return score from BLACK perspective. diff --git a/AlphaGo/self-play.py b/AlphaGo/self-play.py index 98ccf84..296112b 100644 --- a/AlphaGo/self-play.py +++ b/AlphaGo/self-play.py @@ -79,7 +79,7 @@ while True: prob.append(np.array(game.prob).reshape(-1, game.size ** 2 + 1)) print("Finished") print("\n") - score = game.executor.get_score(True) + score = game.executor.executor_get_score(True) if score > 0: winner = utils.BLACK else: diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index 9ebd421..1e5fd02 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -117,14 +117,14 @@ class GoEnv: # print "many opponents, fake eye" return False - def knowledge_prunning(self, current_board, color, vertex): + def _knowledge_prunning(self, current_board, color, vertex): ### check if it is an eye of yourself ### assumptions : notice that this judgement requires that the state is an endgame if self._is_eye(current_board, color, vertex): return False return True - def sa2cv(self, state, action): + def _sa2cv(self, state, action): # State is the play board, the shape is [1, self.game.size, self.game.size, 17], action is an index. 
# We need to transfer the (state, action) pair into (color, vertex) pair to simulate the move if state[0, 0, 0, -1] == utils.BLACK: @@ -137,23 +137,13 @@ class GoEnv: vertex = self.game._deflatten(action) return color, vertex - def simulate_is_valid(self, history_boards, current_board, state, action): - # initialize simulate_latest_boards and simulate_board from state - self.simulate_latest_boards.clear() - for i in range(8): - self.simulate_latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) - self.simulate_board = copy.copy(self.simulate_latest_boards[-1]) - - color, vertex = self.sa2cv(state, action) - + def _is_valid(self, history_boards, current_board, color, vertex): ### in board if not self._in_board(vertex): return False ### already have stone if not current_board[self.game._flatten(vertex)] == utils.EMPTY: - # print(np.array(self.board).reshape(9, 9)) - # print(vertex) return False ### check if it is suicide @@ -164,12 +154,26 @@ class GoEnv: if self._check_global_isomorphous(history_boards, current_board, color, vertex): return False - if not self.knowledge_prunning(current_board, color, vertex): + return True + + def simulate_is_valid(self, history_boards, current_board, state, action): + # initialize simulate_latest_boards and simulate_board from state + self.simulate_latest_boards.clear() + for i in range(8): + self.simulate_latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) + self.simulate_board = copy.copy(self.simulate_latest_boards[-1]) + + color, vertex = self._sa2cv(state, action) + + if not self._is_valid(history_boards, current_board, color, vertex): + return False + + if not self._knowledge_prunning(current_board, color, vertex): return False return True - def simulate_do_move(self, color, vertex): + def _do_move(self, color, vertex): if vertex == utils.PASS: return True @@ -184,9 +188,9 @@ class GoEnv: # initialize the simulate_board from state self.simulate_board = 
(state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() - color, vertex = self.sa2cv(state, action) + color, vertex = self._sa2cv(state, action) - self.simulate_do_move(color, vertex) + self._do_move(color, vertex) new_state = np.concatenate( [state[:, :, :, 1:8], (np.array(self.simulate_board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), state[:, :, :, 9:16], (np.array(self.simulate_board) == utils.WHITE).reshape(1, self.game.size, self.game.size, 1), From c2b46c44e7dce0ef4c73e230aaed07c91af32e0c Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Wed, 20 Dec 2017 01:14:05 +0800 Subject: [PATCH 25/98] merge Go and GoEnv finallygit status! --- AlphaGo/engine.py | 2 +- AlphaGo/game.py | 23 ++--- AlphaGo/go.py | 99 ++++++++++++++++++++- AlphaGo/self-play.py | 2 +- AlphaGo/strategy.py | 199 ------------------------------------------- 5 files changed, 108 insertions(+), 217 deletions(-) delete mode 100644 AlphaGo/strategy.py diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index d11635a..9948176 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -183,7 +183,7 @@ class GTPEngine(): return 'unknown player', False def cmd_get_score(self, args, **kwargs): - return self._game.executor.executor_get_score(), None + return self._game.game_engine.executor_get_score(), None def cmd_show_board(self, args, **kwargs): return self._game.board, True diff --git a/AlphaGo/game.py b/AlphaGo/game.py index af4ef57..aee8d3a 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -9,16 +9,13 @@ import utils import copy import tensorflow as tf import numpy as np -import sys +import sys, os import go import network_small -import strategy from collections import deque +sys.path.append(os.path.join(os.path.dirname(__file__), os.path.pardir)) from tianshou.core.mcts.mcts import MCTS -import Network -#from strategy import strategy - class Game: ''' Load the real game and trained weights. 
@@ -34,15 +31,11 @@ class Game: self.latest_boards = deque(maxlen=8) for _ in range(8): self.latest_boards.append(self.board) - - self.executor = go.Go(game=self) - #self.strategy = strategy(checkpoint_path) - - self.simulator = strategy.GoEnv(game=self) self.net = network_small.Network() self.sess = self.net.forward(checkpoint_path) self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], feed_dict={self.net.x: state, self.net.is_training: False}) + self.game_engine = go.Go(game=self) def _flatten(self, vertex): x, y = vertex @@ -79,10 +72,10 @@ class Game: def think(self, latest_boards, color): # TODO : using copy is right, or should we change to deepcopy? - self.simulator.simulate_latest_boards = copy.copy(latest_boards) - self.simulator.simulate_board = copy.copy(latest_boards[-1]) - nn_input = self.generate_nn_input(self.simulator.simulate_latest_boards, color) - mcts = MCTS(self.simulator, self.evaluator, nn_input, self.size ** 2 + 1, inverse=True, max_step=1) + self.game_engine.simulate_latest_boards = copy.copy(latest_boards) + self.game_engine.simulate_board = copy.copy(latest_boards[-1]) + nn_input = self.generate_nn_input(self.game_engine.simulate_latest_boards, color) + mcts = MCTS(self.game_engine, self.evaluator, nn_input, self.size ** 2 + 1, inverse=True, max_step=1) temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) choice = np.random.choice(self.size ** 2 + 1, 1, p=prob).tolist()[0] @@ -96,7 +89,7 @@ class Game: # this function can be called directly to play the opponent's move if vertex == utils.PASS: return True - res = self.executor.executor_do_move(color, vertex) + res = self.game_engine.executor_do_move(color, vertex) return res def think_play_move(self, color): diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 108c9bd..10ce7e1 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -1,7 +1,7 @@ from __future__ import print_function import utils import copy -import sys +import numpy as np from collections 
import deque ''' @@ -12,10 +12,13 @@ Settings of the Go game. ''' NEIGHBOR_OFFSET = [[1, 0], [-1, 0], [0, -1], [0, 1]] +CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] class Go: def __init__(self, **kwargs): self.game = kwargs['game'] + self.simulate_board = [utils.EMPTY] * (self.game.size ** 2) + self.simulate_latest_boards = deque(maxlen=8) def _in_board(self, vertex): x, y = vertex @@ -33,6 +36,16 @@ class Go: nei.append((_x, _y)) return nei + def _corner(self, vertex): + x, y = vertex + corner = [] + for d in CORNER_OFFSET: + _x = x + d[0] + _y = y + d[1] + if self._in_board((_x, _y)): + corner.append((_x, _y)) + return corner + def _find_group(self, current_board, vertex): color = current_board[self.game._flatten(vertex)] # print ("color : ", color) @@ -84,6 +97,47 @@ class Go: repeat = True return repeat + def _is_eye(self, current_board, color, vertex): + nei = self._neighbor(vertex) + cor = self._corner(vertex) + ncolor = {color == current_board[self.game._flatten(n)] for n in nei} + if False in ncolor: + # print "not all neighbors are in same color with us" + return False + _, group = self._find_group(current_board, nei[0]) + if set(nei) < group: + # print "all neighbors are in same group and same color with us" + return True + else: + opponent_number = [current_board[self.game._flatten(c)] for c in cor].count(-color) + opponent_propotion = float(opponent_number) / float(len(cor)) + if opponent_propotion < 0.5: + # print "few opponents, real eye" + return True + else: + # print "many opponents, fake eye" + return False + + def _knowledge_prunning(self, current_board, color, vertex): + ### check if it is an eye of yourself + ### assumptions : notice that this judgement requires that the state is an endgame + if self._is_eye(current_board, color, vertex): + return False + return True + + def _sa2cv(self, state, action): + # State is the play board, the shape is [1, self.game.size, self.game.size, 17], action is an index. 
+ # We need to transfer the (state, action) pair into (color, vertex) pair to simulate the move + if state[0, 0, 0, -1] == utils.BLACK: + color = utils.BLACK + else: + color = utils.WHITE + if action == self.game.size ** 2: + vertex = (0, 0) + else: + vertex = self.game._deflatten(action) + return color, vertex + def _is_valid(self, history_boards, current_board, color, vertex): ### in board if not self._in_board(vertex): @@ -97,11 +151,54 @@ class Go: if self._is_suicide(current_board, color, vertex): return False + ### forbid global isomorphous if self._check_global_isomorphous(history_boards, current_board, color, vertex): return False return True + def simulate_is_valid(self, history_boards, current_board, state, action): + # initialize simulate_latest_boards and simulate_board from state + self.simulate_latest_boards.clear() + for i in range(8): + self.simulate_latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) + self.simulate_board = copy.copy(self.simulate_latest_boards[-1]) + + color, vertex = self._sa2cv(state, action) + + if not self._is_valid(history_boards, current_board, color, vertex): + return False + + if not self._knowledge_prunning(current_board, color, vertex): + return False + + return True + + def _do_move(self, color, vertex): + if vertex == utils.PASS: + return True + + id_ = self.game._flatten(vertex) + if self.simulate_board[id_] == utils.EMPTY: + self.simulate_board[id_] = color + return True + else: + return False + + def simulate_step_forward(self, state, action): + # initialize the simulate_board from state + self.simulate_board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() + + color, vertex = self._sa2cv(state, action) + + self._do_move(color, vertex) + new_state = np.concatenate( + [state[:, :, :, 1:8], (np.array(self.simulate_board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), + state[:, :, :, 9:16], (np.array(self.simulate_board) == utils.WHITE).reshape(1, 
self.game.size, self.game.size, 1), + np.array(1 - state[:, :, :, -1]).reshape(1, self.game.size, self.game.size, 1)], + axis=3) + return new_state, 0 + def executor_do_move(self, color, vertex): if not self._is_valid(self.game.history, self.game.board, color, vertex): return False diff --git a/AlphaGo/self-play.py b/AlphaGo/self-play.py index 296112b..63b7e97 100644 --- a/AlphaGo/self-play.py +++ b/AlphaGo/self-play.py @@ -79,7 +79,7 @@ while True: prob.append(np.array(game.prob).reshape(-1, game.size ** 2 + 1)) print("Finished") print("\n") - score = game.executor.executor_get_score(True) + score = game.game_engine.executor_get_score(True) if score > 0: winner = utils.BLACK else: diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py deleted file mode 100644 index 1e5fd02..0000000 --- a/AlphaGo/strategy.py +++ /dev/null @@ -1,199 +0,0 @@ -import os, sys - -sys.path.append(os.path.join(os.path.dirname(__file__), os.path.pardir)) -import numpy as np -import utils -import time -import copy -import network_small -import tensorflow as tf -from collections import deque -from tianshou.core.mcts.mcts import MCTS - -NEIGHBOR_OFFSET = [[1, 0], [-1, 0], [0, -1], [0, 1]] -CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] - -class GoEnv: - def __init__(self, **kwargs): - self.game = kwargs['game'] - self.simulate_board = [utils.EMPTY] * (self.game.size ** 2) - self.simulate_latest_boards = deque(maxlen=8) - - def _in_board(self, vertex): - x, y = vertex - if x < 1 or x > self.game.size: return False - if y < 1 or y > self.game.size: return False - return True - - def _neighbor(self, vertex): - x, y = vertex - nei = [] - for d in NEIGHBOR_OFFSET: - _x = x + d[0] - _y = y + d[1] - if self._in_board((_x, _y)): - nei.append((_x, _y)) - return nei - - def _corner(self, vertex): - x, y = vertex - corner = [] - for d in CORNER_OFFSET: - _x = x + d[0] - _y = y + d[1] - if self._in_board((_x, _y)): - corner.append((_x, _y)) - return corner - - def _find_group(self, current_board, 
vertex): - color = current_board[self.game._flatten(vertex)] - # print ("color : ", color) - chain = set() - frontier = [vertex] - has_liberty = False - while frontier: - current = frontier.pop() - # print ("current : ", current) - chain.add(current) - for n in self._neighbor(current): - if current_board[self.game._flatten(n)] == color and not n in chain: - frontier.append(n) - if current_board[self.game._flatten(n)] == utils.EMPTY: - has_liberty = True - return has_liberty, chain - - def _is_suicide(self, current_board, color, vertex): - current_board[self.game._flatten(vertex)] = color # assume that we already take this move - suicide = False - - has_liberty, group = self._find_group(current_board, vertex) - if not has_liberty: - suicide = True # no liberty, suicide - for n in self._neighbor(vertex): - if current_board[self.game._flatten(n)] == utils.another_color(color): - opponent_liberty, group = self._find_group(current_board, n) - if not opponent_liberty: - suicide = False # this move is able to take opponent's stone, not suicide - - current_board[self.game._flatten(vertex)] = utils.EMPTY # undo this move - return suicide - - def _process_board(self, current_board, color, vertex): - nei = self._neighbor(vertex) - for n in nei: - if current_board[self.game._flatten(n)] == utils.another_color(color): - has_liberty, group = self._find_group(current_board, n) - if not has_liberty: - for b in group: - current_board[self.game._flatten(b)] = utils.EMPTY - - def _check_global_isomorphous(self, history_boards, current_board, color, vertex): - repeat = False - next_board = copy.copy(current_board) - next_board[self.game._flatten(vertex)] = color - self._process_board(next_board, color, vertex) - if next_board in history_boards: - repeat = True - return repeat - - def _is_eye(self, current_board, color, vertex): - nei = self._neighbor(vertex) - cor = self._corner(vertex) - ncolor = {color == current_board[self.game._flatten(n)] for n in nei} - if False in ncolor: - # 
print "not all neighbors are in same color with us" - return False - _, group = self._find_group(current_board, nei[0]) - if set(nei) < group: - # print "all neighbors are in same group and same color with us" - return True - else: - opponent_number = [current_board[self.game._flatten(c)] for c in cor].count(-color) - opponent_propotion = float(opponent_number) / float(len(cor)) - if opponent_propotion < 0.5: - # print "few opponents, real eye" - return True - else: - # print "many opponents, fake eye" - return False - - def _knowledge_prunning(self, current_board, color, vertex): - ### check if it is an eye of yourself - ### assumptions : notice that this judgement requires that the state is an endgame - if self._is_eye(current_board, color, vertex): - return False - return True - - def _sa2cv(self, state, action): - # State is the play board, the shape is [1, self.game.size, self.game.size, 17], action is an index. - # We need to transfer the (state, action) pair into (color, vertex) pair to simulate the move - if state[0, 0, 0, -1] == utils.BLACK: - color = utils.BLACK - else: - color = utils.WHITE - if action == self.game.size ** 2: - vertex = (0, 0) - else: - vertex = self.game._deflatten(action) - return color, vertex - - def _is_valid(self, history_boards, current_board, color, vertex): - ### in board - if not self._in_board(vertex): - return False - - ### already have stone - if not current_board[self.game._flatten(vertex)] == utils.EMPTY: - return False - - ### check if it is suicide - if self._is_suicide(current_board, color, vertex): - return False - - ### forbid global isomorphous - if self._check_global_isomorphous(history_boards, current_board, color, vertex): - return False - - return True - - def simulate_is_valid(self, history_boards, current_board, state, action): - # initialize simulate_latest_boards and simulate_board from state - self.simulate_latest_boards.clear() - for i in range(8): - self.simulate_latest_boards.append((state[:, :, :, i] - 
state[:, :, :, i + 8]).reshape(-1).tolist()) - self.simulate_board = copy.copy(self.simulate_latest_boards[-1]) - - color, vertex = self._sa2cv(state, action) - - if not self._is_valid(history_boards, current_board, color, vertex): - return False - - if not self._knowledge_prunning(current_board, color, vertex): - return False - - return True - - def _do_move(self, color, vertex): - if vertex == utils.PASS: - return True - - id_ = self.game._flatten(vertex) - if self.simulate_board[id_] == utils.EMPTY: - self.simulate_board[id_] = color - return True - else: - return False - - def simulate_step_forward(self, state, action): - # initialize the simulate_board from state - self.simulate_board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() - - color, vertex = self._sa2cv(state, action) - - self._do_move(color, vertex) - new_state = np.concatenate( - [state[:, :, :, 1:8], (np.array(self.simulate_board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), - state[:, :, :, 9:16], (np.array(self.simulate_board) == utils.WHITE).reshape(1, self.game.size, self.game.size, 1), - np.array(1 - state[:, :, :, -1]).reshape(1, self.game.size, self.game.size, 1)], - axis=3) - return new_state, 0 From 7fca90c61b97704463985f1c1774e90a834c906c Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Wed, 20 Dec 2017 16:43:42 +0800 Subject: [PATCH 26/98] modify the mcts, refactor the network --- AlphaGo/Network.py | 211 ----------------------- AlphaGo/Network_ori.py | 175 ------------------- AlphaGo/game.py | 15 +- AlphaGo/go.py | 58 ++----- AlphaGo/model.py | 170 ++++++++++++++++++ AlphaGo/{network_small.py => network.py} | 0 tianshou/core/mcts/mcts.py | 40 ++--- 7 files changed, 212 insertions(+), 457 deletions(-) delete mode 100644 AlphaGo/Network.py delete mode 100644 AlphaGo/Network_ori.py create mode 100644 AlphaGo/model.py rename AlphaGo/{network_small.py => network.py} (100%) diff --git a/AlphaGo/Network.py b/AlphaGo/Network.py deleted file 
mode 100644 index caf7710..0000000 --- a/AlphaGo/Network.py +++ /dev/null @@ -1,211 +0,0 @@ -import os -import time -import sys - -import numpy as np -import time -import tensorflow as tf -import tensorflow.contrib.layers as layers - -import multi_gpu -import time - -# os.environ["CUDA_VISIBLE_DEVICES"] = "1" -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' - - -def residual_block(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.conv2d(h, 256, kernel_size=3, stride=1, activation_fn=tf.identity, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = h + input - return tf.nn.relu(h) - - -def policy_heads(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.flatten(h) - h = layers.fully_connected(h, 362, activation_fn=tf.identity, weights_regularizer=layers.l2_regularizer(1e-4)) - return h - - -def value_heads(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.flatten(h) - h = layers.fully_connected(h, 256, activation_fn=tf.nn.relu, weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.fully_connected(h, 1, activation_fn=tf.nn.tanh, 
weights_regularizer=layers.l2_regularizer(1e-4)) - return h - - -class Network(object): - def __init__(self): - self.x = tf.placeholder(tf.float32, shape=[None, 19, 19, 17]) - self.is_training = tf.placeholder(tf.bool, shape=[]) - self.z = tf.placeholder(tf.float32, shape=[None, 1]) - self.pi = tf.placeholder(tf.float32, shape=[None, 362]) - self.build_network() - - def build_network(self): - h = layers.conv2d(self.x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, normalizer_fn=layers.batch_norm, - normalizer_params={'is_training': self.is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS}, - weights_regularizer=layers.l2_regularizer(1e-4)) - for i in range(19): - h = residual_block(h, self.is_training) - self.v = value_heads(h, self.is_training) - self.p = policy_heads(h, self.is_training) - # loss = tf.reduce_mean(tf.square(z-v)) - tf.multiply(pi, tf.log(tf.clip_by_value(tf.nn.softmax(p), 1e-8, tf.reduce_max(tf.nn.softmax(p))))) - self.value_loss = tf.reduce_mean(tf.square(self.z - self.v)) - self.policy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.pi, logits=self.p)) - - self.reg = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) - self.total_loss = self.value_loss + self.policy_loss + self.reg - # train_op = tf.train.MomentumOptimizer(1e-4, momentum=0.9, use_nesterov=True).minimize(total_loss) - self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) - with tf.control_dependencies(self.update_ops): - self.train_op = tf.train.RMSPropOptimizer(1e-4).minimize(self.total_loss) - self.var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) - self.saver = tf.train.Saver(max_to_keep=10, var_list=self.var_list) - - def train(self): - data_path = "/home/tongzheng/data/" - data_name = os.listdir("/home/tongzheng/data/") - epochs = 100 - batch_size = 128 - - result_path = "./checkpoints/" - with multi_gpu.create_session() as sess: - sess.run(tf.global_variables_initializer()) - ckpt_file = 
tf.train.latest_checkpoint(result_path) - if ckpt_file is not None: - print('Restoring model from {}...'.format(ckpt_file)) - self.saver.restore(sess, ckpt_file) - for epoch in range(epochs): - for name in data_name: - data = np.load(data_path + name) - boards = data["boards"] - wins = data["wins"] - ps = data["ps"] - print (boards.shape) - print (wins.shape) - print (ps.shape) - batch_num = boards.shape[0] // batch_size - index = np.arange(boards.shape[0]) - np.random.shuffle(index) - value_losses = [] - policy_losses = [] - regs = [] - time_train = -time.time() - for iter in range(batch_num): - lv, lp, r, value, prob, _ = sess.run( - [self.value_loss, self.policy_loss, self.reg, self.v, tf.nn.softmax(p), self.train_op], - feed_dict={self.x: boards[ - index[iter * batch_size:(iter + 1) * batch_size]], - self.z: wins[index[ - iter * batch_size:(iter + 1) * batch_size]], - self.pi: ps[index[ - iter * batch_size:(iter + 1) * batch_size]], - self.is_training: True}) - value_losses.append(lv) - policy_losses.append(lp) - regs.append(r) - if iter % 1 == 0: - print( - "Epoch: {}, Part {}, Iteration: {}, Time: {}, Value Loss: {}, Policy Loss: {}, Reg: {}".format( - epoch, name, iter, time.time() + time_train, np.mean(np.array(value_losses)), - np.mean(np.array(policy_losses)), np.mean(np.array(regs)))) - time_train = -time.time() - value_losses = [] - policy_losses = [] - regs = [] - if iter % 20 == 0: - save_path = "Epoch{}.Part{}.Iteration{}.ckpt".format(epoch, name, iter) - self.saver.save(sess, result_path + save_path) - del data, boards, wins, ps - - - # def forward(call_number): - # # checkpoint_path = "/home/yama/rl/tianshou/AlphaGo/checkpoints" - # checkpoint_path = "/home/jialian/stuGo/tianshou/stuGo/checkpoints/" - # board_file = np.genfromtxt("/home/jialian/stuGo/tianshou/leela-zero/src/mcts_nn_files/board_" + call_number, - # dtype='str'); - # human_board = np.zeros((17, 19, 19)) - # - # # TODO : is it ok to ignore the last channel? 
- # for i in range(17): - # human_board[i] = np.array(list(board_file[i])).reshape(19, 19) - # # print("============================") - # # print("human board sum : " + str(np.sum(human_board[-1]))) - # # print("============================") - # # print(human_board) - # # print("============================") - # # rint(human_board) - # feed_board = human_board.transpose(1, 2, 0).reshape(1, 19, 19, 17) - # # print(feed_board[:,:,:,-1]) - # # print(feed_board.shape) - # - # # npz_board = np.load("/home/yama/rl/tianshou/AlphaGo/data/7f83928932f64a79bc1efdea268698ae.npz") - # # print(npz_board["boards"].shape) - # # feed_board = npz_board["boards"][10].reshape(-1, 19, 19, 17) - # ##print(feed_board) - # # show_board = feed_board[0].transpose(2, 0, 1) - # # print("board shape : ", show_board.shape) - # # print(show_board) - # - # itflag = False - # with multi_gpu.create_session() as sess: - # sess.run(tf.global_variables_initializer()) - # ckpt_file = tf.train.latest_checkpoint(checkpoint_path) - # if ckpt_file is not None: - # # print('Restoring model from {}...'.format(ckpt_file)) - # saver.restore(sess, ckpt_file) - # else: - # raise ValueError("No model loaded") - # res = sess.run([tf.nn.softmax(p), v], feed_dict={x: feed_board, is_training: itflag}) - # # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][300].reshape(-1, 19, 19, 17), is_training:False}) - # # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][50].reshape(-1, 19, 19, 17), is_training:True}) - # # print(np.argmax(res[0])) - # np.savetxt(sys.stdout, res[0][0], fmt="%.6f", newline=" ") - # np.savetxt(sys.stdout, res[1][0], fmt="%.6f", newline=" ") - # pv_file = "/home/jialian/stuGotianshou/leela-zero/src/mcts_nn_files/policy_value" - # np.savetxt(pv_file, np.concatenate((res[0][0], res[1][0])), fmt="%.6f", newline=" ") - # # np.savetxt(pv_file, res[1][0], fmt="%.6f", newline=" ") - # return res - - def forward(self): - checkpoint_path = 
"/home/tongzheng/tianshou/AlphaGo/checkpoints/" - sess = multi_gpu.create_session() - sess.run(tf.global_variables_initializer()) - ckpt_file = tf.train.latest_checkpoint(checkpoint_path) - if ckpt_file is not None: - print('Restoring model from {}...'.format(ckpt_file)) - self.saver.restore(sess, ckpt_file) - print('Successfully loaded') - else: - raise ValueError("No model loaded") - # prior, value = sess.run([tf.nn.softmax(p), v], feed_dict={x: state, is_training: False}) - # return prior, value - return sess - - -if __name__ == '__main__': - state = np.random.randint(0, 1, [1, 19, 19, 17]) - net = Network() - sess = net.forward() - start = time.time() - for i in range(100): - sess.run([tf.nn.softmax(net.p), net.v], feed_dict={net.x: state, net.is_training: False}) - print("Step {}, Cumulative time {}".format(i, time.time() - start)) diff --git a/AlphaGo/Network_ori.py b/AlphaGo/Network_ori.py deleted file mode 100644 index 9d33bb9..0000000 --- a/AlphaGo/Network_ori.py +++ /dev/null @@ -1,175 +0,0 @@ -import os -import time -import gc - -import numpy as np -import tensorflow as tf -import tensorflow.contrib.layers as layers - -import multi_gpu - -os.environ["CUDA_VISIBLE_DEVICES"] = "1" - - -def residual_block(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.conv2d(h, 256, kernel_size=3, stride=1, activation_fn=tf.identity, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = h + input - return tf.nn.relu(h) - - -def policy_heads(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 2, kernel_size=1, 
stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.flatten(h) - h = layers.fully_connected(h, 362, activation_fn=tf.identity, weights_regularizer=layers.l2_regularizer(1e-4)) - return h - - -def value_heads(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.flatten(h) - h = layers.fully_connected(h, 256, activation_fn=tf.nn.relu, weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.fully_connected(h, 1, activation_fn=tf.nn.tanh, weights_regularizer=layers.l2_regularizer(1e-4)) - return h - - -x = tf.placeholder(tf.float32, shape=[None, 19, 19, 17]) -is_training = tf.placeholder(tf.bool, shape=[]) -z = tf.placeholder(tf.float32, shape=[None, 1]) -pi = tf.placeholder(tf.float32, shape=[None, 362]) - -h = layers.conv2d(x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, normalizer_fn=layers.batch_norm, - normalizer_params={'is_training': is_training, 'updates_collections': tf.GraphKeys.UPDATE_OPS}, - weights_regularizer=layers.l2_regularizer(1e-4)) -for i in range(19): - h = residual_block(h, is_training) -v = value_heads(h, is_training) -p = policy_heads(h, is_training) -# loss = tf.reduce_mean(tf.square(z-v)) - tf.multiply(pi, tf.log(tf.clip_by_value(tf.nn.softmax(p), 1e-8, tf.reduce_max(tf.nn.softmax(p))))) -value_loss = tf.reduce_mean(tf.square(z - v)) -policy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=pi, logits=p)) - -reg = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) -total_loss = value_loss + policy_loss + reg -# train_op = tf.train.MomentumOptimizer(1e-4, momentum=0.9, 
use_nesterov=True).minimize(total_loss) -update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) -with tf.control_dependencies(update_ops): - train_op = tf.train.RMSPropOptimizer(1e-4).minimize(total_loss) -var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) -saver = tf.train.Saver(max_to_keep=10, var_list=var_list) - - -def train(): - data_path = "/home/tongzheng/data/" - data_name = os.listdir("/home/tongzheng/data/") - epochs = 100 - batch_size = 128 - - result_path = "./checkpoints/" - with multi_gpu.create_session() as sess: - sess.run(tf.global_variables_initializer()) - ckpt_file = tf.train.latest_checkpoint(result_path) - if ckpt_file is not None: - print('Restoring model from {}...'.format(ckpt_file)) - saver.restore(sess, ckpt_file) - for epoch in range(epochs): - for name in data_name: - data = np.load(data_path + name) - boards = data["boards"] - wins = data["wins"] - ps = data["ps"] - print (boards.shape) - print (wins.shape) - print (ps.shape) - # batch_num = 1 - batch_num = boards.shape[0] // batch_size - index = np.arange(boards.shape[0]) - np.random.shuffle(index) - value_losses = [] - policy_losses = [] - regs = [] - time_train = -time.time() - for iter in range(batch_num): - lv, lp, r, _ = sess.run([value_loss, policy_loss, reg, train_op], - feed_dict={x: boards[ - index[iter * batch_size:(iter + 1) * batch_size]], - z: wins[index[ - iter * batch_size:(iter + 1) * batch_size]], - pi: ps[index[ - iter * batch_size:(iter + 1) * batch_size]], - is_training: True}) - value_losses.append(lv) - policy_losses.append(lp) - regs.append(r) - del lv, lp, r - if iter % 1 == 0: - print( - "Epoch: {}, Part {}, Iteration: {}, Time: {}, Value Loss: {}, Policy Loss: {}, Reg: {}".format( - epoch, name, iter, time.time() + time_train, np.mean(np.array(value_losses)), - np.mean(np.array(policy_losses)), np.mean(np.array(regs)))) - del value_losses, policy_losses, regs, time_train - time_train = -time.time() - value_losses = [] - policy_losses = [] - regs = 
[] - if iter % 20 == 0: - save_path = "Epoch{}.Part{}.Iteration{}.ckpt".format(epoch, name, iter) - saver.save(sess, result_path + save_path) - del save_path - del data, boards, wins, ps, batch_num, index - gc.collect() - - -def forward(board): - result_path = "./checkpoints" - itflag = False - res = None - if board is None: - # data = np.load("/home/tongzheng/meta-data/80b7bf21bce14862806d48c3cd760a1b.npz") - data = np.load("./data/7f83928932f64a79bc1efdea268698ae.npz") - board = data["boards"][50].reshape(-1, 19, 19, 17) - human_board = board[0].transpose(2, 0, 1) - print("============================") - print("human board sum : " + str(np.sum(human_board))) - print("============================") - print(board[:, :, :, -1]) - itflag = False - with multi_gpu.create_session() as sess: - sess.run(tf.global_variables_initializer()) - ckpt_file = tf.train.latest_checkpoint(result_path) - if ckpt_file is not None: - print('Restoring model from {}...'.format(ckpt_file)) - saver.restore(sess, ckpt_file) - else: - raise ValueError("No model loaded") - res = sess.run([tf.nn.softmax(p), v], feed_dict={x: board, is_training: itflag}) - # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][300].reshape(-1, 19, 19, 17), is_training:False}) - # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][50].reshape(-1, 19, 19, 17), is_training:True}) - # print(np.argmax(res[0])) - print(res) - print(data["p"][0]) - print(np.argmax(res[0])) - print(np.argmax(data["p"][0])) - # print(res[0].tolist()[0]) - # print(np.argmax(res[0])) - return res - - -if __name__ == '__main__': - # train() - # if sys.argv[1] == "test": - forward(None) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index aee8d3a..37b7878 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -11,7 +11,7 @@ import tensorflow as tf import numpy as np import sys, os import go -import network_small +import model from collections import deque 
sys.path.append(os.path.join(os.path.dirname(__file__), os.path.pardir)) from tianshou.core.mcts.mcts import MCTS @@ -31,10 +31,9 @@ class Game: self.latest_boards = deque(maxlen=8) for _ in range(8): self.latest_boards.append(self.board) - self.net = network_small.Network() - self.sess = self.net.forward(checkpoint_path) - self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], - feed_dict={self.net.x: state, self.net.is_training: False}) + self.evaluator = model.ResNet(self.size, self.size**2 + 1, history_length=8) + # self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], + # feed_dict={self.net.x: state, self.net.is_training: False}) self.game_engine = go.Go(game=self) def _flatten(self, vertex): @@ -75,7 +74,8 @@ class Game: self.game_engine.simulate_latest_boards = copy.copy(latest_boards) self.game_engine.simulate_board = copy.copy(latest_boards[-1]) nn_input = self.generate_nn_input(self.game_engine.simulate_latest_boards, color) - mcts = MCTS(self.game_engine, self.evaluator, nn_input, self.size ** 2 + 1, inverse=True, max_step=1) + mcts = MCTS(self.game_engine, self.evaluator, [self.game_engine.simulate_latest_boards, color], self.size ** 2 + 1, inverse=True) + mcts.search(max_step=1) temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) choice = np.random.choice(self.size ** 2 + 1, 1, p=prob).tolist()[0] @@ -93,7 +93,7 @@ class Game: return res def think_play_move(self, color): - # although we dont need to return self.prob, however it is needed for neural network training + # although we don't need to return self.prob, however it is needed for neural network training move, self.prob = self.think(self.latest_boards, color) # play the move immediately self.play_move(color, move) @@ -122,6 +122,7 @@ class Game: if __name__ == "__main__": g = Game() g.show_board() + g.think_play_move(1) #file = open("debug.txt", "a") #file.write("mcts check\n") #file.close() diff --git a/AlphaGo/go.py 
b/AlphaGo/go.py index 10ce7e1..335ee39 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -17,8 +17,6 @@ CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] class Go: def __init__(self, **kwargs): self.game = kwargs['game'] - self.simulate_board = [utils.EMPTY] * (self.game.size ** 2) - self.simulate_latest_boards = deque(maxlen=8) def _in_board(self, vertex): x, y = vertex @@ -125,18 +123,12 @@ class Go: return False return True - def _sa2cv(self, state, action): - # State is the play board, the shape is [1, self.game.size, self.game.size, 17], action is an index. - # We need to transfer the (state, action) pair into (color, vertex) pair to simulate the move - if state[0, 0, 0, -1] == utils.BLACK: - color = utils.BLACK - else: - color = utils.WHITE + def _action2vertex(self, action): if action == self.game.size ** 2: vertex = (0, 0) else: vertex = self.game._deflatten(action) - return color, vertex + return vertex def _is_valid(self, history_boards, current_board, color, vertex): ### in board @@ -157,14 +149,10 @@ class Go: return True - def simulate_is_valid(self, history_boards, current_board, state, action): - # initialize simulate_latest_boards and simulate_board from state - self.simulate_latest_boards.clear() - for i in range(8): - self.simulate_latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) - self.simulate_board = copy.copy(self.simulate_latest_boards[-1]) - - color, vertex = self._sa2cv(state, action) + def simulate_is_valid(self, state, action): + history_boards, color = state + vertex = self._action2vertex(action) + current_board = history_boards[-1] if not self._is_valid(history_boards, current_board, color, vertex): return False @@ -174,30 +162,22 @@ class Go: return True - def _do_move(self, color, vertex): + def _do_move(self, board, color, vertex): if vertex == utils.PASS: - return True - - id_ = self.game._flatten(vertex) - if self.simulate_board[id_] == utils.EMPTY: - self.simulate_board[id_] = color - 
return True + return board else: - return False + id_ = self.game._flatten(vertex) + board[id_] = color + return board def simulate_step_forward(self, state, action): # initialize the simulate_board from state - self.simulate_board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() - - color, vertex = self._sa2cv(state, action) - - self._do_move(color, vertex) - new_state = np.concatenate( - [state[:, :, :, 1:8], (np.array(self.simulate_board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), - state[:, :, :, 9:16], (np.array(self.simulate_board) == utils.WHITE).reshape(1, self.game.size, self.game.size, 1), - np.array(1 - state[:, :, :, -1]).reshape(1, self.game.size, self.game.size, 1)], - axis=3) - return new_state, 0 + history_boards, color = state + vertex = self._action2vertex(action) + new_board = self._do_move(copy.copy(history_boards[-1]), color, vertex) + history_boards.append(new_board) + new_color = -color + return [history_boards, new_color], 0 def executor_do_move(self, color, vertex): if not self._is_valid(self.game.history, self.game.board, color, vertex): @@ -239,7 +219,7 @@ class Go: start_vertex_x += x_diff start_vertex_y += y_diff - def _predict_from_nearby(self, vertex, neighbor_step = 3): + def _predict_from_nearby(self, vertex, neighbor_step=3): ''' step: the nearby 3 steps is considered :vertex: position to be estimated @@ -261,7 +241,7 @@ class Go: elif color_estimate < 0: return utils.WHITE - def executor_get_score(self, is_unknown_estimation = False): + def executor_get_score(self, is_unknown_estimation=False): ''' is_unknown_estimation: whether use nearby stone to predict the unknown return score from BLACK perspective. 
diff --git a/AlphaGo/model.py b/AlphaGo/model.py new file mode 100644 index 0000000..725dbd2 --- /dev/null +++ b/AlphaGo/model.py @@ -0,0 +1,170 @@ +import os +import time +import sys + +import numpy as np +import tensorflow as tf +import tensorflow.contrib.layers as layers + +import multi_gpu + +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + + +def residual_block(input, is_training): + """ + one residual block + + :param input: a tensor, input of the residual block + :param is_training: a placeholder, indicate whether the model is training or not + :return: a tensor, output of the residual block + """ + normalizer_params = {'is_training': is_training, + 'updates_collections': tf.GraphKeys.UPDATE_OPS} + h = layers.conv2d(input, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, + normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, + weights_regularizer=layers.l2_regularizer(1e-4)) + h = layers.conv2d(h, 256, kernel_size=3, stride=1, activation_fn=tf.identity, + normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, + weights_regularizer=layers.l2_regularizer(1e-4)) + h = h + input + return tf.nn.relu(h) + + +def policy_head(input, is_training, action_num): + """ + the head of policy branch + + :param input: a tensor, input of the policy head + :param is_training: a placeholder, indicate whether the model is training or not + :param action_num: action_num: an integer, number of unique actions at any state + :return: a tensor: output of the policy head, shape [batch_size, action_num] + """ + normalizer_params = {'is_training': is_training, + 'updates_collections': tf.GraphKeys.UPDATE_OPS} + h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, + normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, + weights_regularizer=layers.l2_regularizer(1e-4)) + h = layers.flatten(h) + h = layers.fully_connected(h, action_num, activation_fn=tf.identity, + weights_regularizer=layers.l2_regularizer(1e-4)) + 
return h + + +def value_head(input, is_training): + """ + the head of value branch + + :param input: a tensor, input of the value head + :param is_training: a placeholder, indicate whether the model is training or not + :return: a tensor, output of the value head, shape [batch_size, 1] + """ + normalizer_params = {'is_training': is_training, + 'updates_collections': tf.GraphKeys.UPDATE_OPS} + h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, + normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, + weights_regularizer=layers.l2_regularizer(1e-4)) + h = layers.flatten(h) + h = layers.fully_connected(h, 256, activation_fn=tf.nn.relu, weights_regularizer=layers.l2_regularizer(1e-4)) + h = layers.fully_connected(h, 1, activation_fn=tf.nn.tanh, weights_regularizer=layers.l2_regularizer(1e-4)) + return h + + +class ResNet(object): + def __init__(self, board_size, action_num, history_length=1, residual_block_num=20, checkpoint_path=None): + """ + the resnet model + + :param board_size: an integer, the board size + :param action_num: an integer, number of unique actions at any state + :param history_length: an integer, the history length to use, default is 1 + :param residual_block_num: an integer, the number of residual block, default is 20, at least 1 + :param checkpoint_path: a string, the path to the checkpoint, default is None, + """ + self.board_size = board_size + self.action_num = action_num + self.history_length = history_length + self.x = tf.placeholder(tf.float32, shape=[None, self.board_size, self.board_size, 2 * self.history_length + 1]) + self.is_training = tf.placeholder(tf.bool, shape=[]) + self.z = tf.placeholder(tf.float32, shape=[None, 1]) + self.pi = tf.placeholder(tf.float32, shape=[None, self.action_num]) + self._build_network(residual_block_num, checkpoint_path) + + def _build_network(self, residual_block_num, checkpoint_path): + """ + build the network + + :param residual_block_num: an integer, the number 
of residual block + :param checkpoint_path: a string, the path to the checkpoint, if None, use random initialization parameter + :return: None + """ + + h = layers.conv2d(self.x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, + normalizer_fn=layers.batch_norm, + normalizer_params={'is_training': self.is_training, + 'updates_collections': tf.GraphKeys.UPDATE_OPS}, + weights_regularizer=layers.l2_regularizer(1e-4)) + for i in range(residual_block_num - 1): + h = residual_block(h, self.is_training) + self.v = value_head(h, self.is_training) + self.p = policy_head(h, self.is_training, self.action_num) + self.value_loss = tf.reduce_mean(tf.square(self.z - self.v)) + self.policy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.pi, logits=self.p)) + + self.reg = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) + self.total_loss = self.value_loss + self.policy_loss + self.reg + self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) + with tf.control_dependencies(self.update_ops): + self.train_op = tf.train.AdamOptimizer(1e-4).minimize(self.total_loss) + self.var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) + self.saver = tf.train.Saver(max_to_keep=10, var_list=self.var_list) + self.sess = multi_gpu.create_session() + self.sess.run(tf.global_variables_initializer()) + if checkpoint_path is not None: + ckpt_file = tf.train.latest_checkpoint(checkpoint_path) + if ckpt_file is not None: + print('Restoring model from {}...'.format(ckpt_file)) + self.saver.restore(self.sess, ckpt_file) + print('Successfully loaded') + else: + raise ValueError("No model in path {}".format(checkpoint_path)) + + def __call__(self, state): + """ + + :param history: a list, the history + :param color: a string, indicate which one to play + :return: a list of tensor, the predicted value and policy given the history and color + """ + history, color = state + if len(history) != self.history_length: + raise ValueError( + 'The length of 
history cannot meet the need of the model, given {}, need {}'.format(len(history), + self.history_length)) + state = self._history2state(history, color) + return self.sess.run([self.p, self.v], feed_dict={self.x: state, self.is_training: False}) + + def _history2state(self, history, color): + """ + convert the history to the state we need + + :param history: a list, the history + :param color: a string, indicate which one to play + :return: a ndarray, the state + """ + state = np.zeros([1, self.board_size, self.board_size, 2 * self.history_length + 1]) + for i in range(self.history_length): + state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(self.board_size ** 2)).reshape(self.board_size, + self.board_size) + state[0, :, :, i + self.history_length] = np.array( + np.array(history[i]) == -np.ones(self.board_size ** 2)).reshape(self.board_size, self.board_size) + # TODO: need a config to specify the BLACK and WHITE + if color == +1: + state[0, :, :, 2 * self.history_length] = np.ones([self.board_size, self.board_size]) + if color == -1: + state[0, :, :, 2 * self.history_length] = np.zeros([self.board_size, self.board_size]) + return state + + #TODO: design the interface between the environment and training + def train(self, mode='memory', *args, **kwargs): + pass \ No newline at end of file diff --git a/AlphaGo/network_small.py b/AlphaGo/network.py similarity index 100% rename from AlphaGo/network_small.py rename to AlphaGo/network.py diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 12fc85d..fac00fb 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -72,11 +72,9 @@ class UCTNode(MCTSNode): def valid_mask(self, simulator): if self.mask is None: - start_time = time.time() self.mask = [] for act in range(self.action_num - 1): - if not simulator.simulate_is_valid( - simulator.simulate_latest_boards, simulator.simulate_board, self.state, act): + if not simulator.simulate_is_valid(self.state, act): 
self.mask.append(act) self.ucb[act] = -float("Inf") else: @@ -144,8 +142,7 @@ class ActionNode(object): class MCTS(object): - def __init__(self, simulator, evaluator, root, action_num, method="UCT", inverse=False, max_step=None, - max_time=None): + def __init__(self, simulator, evaluator, root, action_num, method="UCT", inverse=False): self.simulator = simulator self.evaluator = evaluator prior, _ = self.evaluator(root) @@ -153,33 +150,26 @@ class MCTS(object): if method == "": self.root = root if method == "UCT": - self.root = UCTNode(None, None, root, action_num, prior, inverse) + self.root = UCTNode(None, None, root, action_num, prior, inverse=inverse) if method == "TS": self.root = TSNode(None, None, root, action_num, prior, inverse=inverse) self.inverse = inverse - if max_step is not None: - self.step = 0 - self.max_step = max_step - # TODO: Optimize the stop criteria - # else: - # self.max_step = 0 - if max_time is not None: - self.start_time = time.time() - self.max_time = max_time + + def search(self, max_step=None, max_time=None): + step = 0 + start_time = time.time() + if max_step is None: + max_step = int("Inf") + if max_time is None: + max_time = float("Inf") if max_step is None and max_time is None: raise ValueError("Need a stop criteria!") - # TODO: running mcts should be implemented in another function, e.g. 
def search(self, max_step, max_time) - self.select_time = [] - self.evaluate_time = [] - self.bp_time = [] - while (max_step is not None and self.step < self.max_step or max_step is None) \ - and (max_time is not None and time.time() - self.start_time < self.max_time or max_time is None): - self.expand() - if max_step is not None: - self.step += 1 + while step < max_step and time.time() - start_time < max_step: + self._expand() + step += 1 - def expand(self): + def _expand(self): node, new_action = self.root.selection(self.simulator) value = node.children[new_action].expansion(self.evaluator, self.action_num) node.children[new_action].backpropagation(value + 0.) From 50e306368feabf13a8723412481c6f3103ff3c4e Mon Sep 17 00:00:00 2001 From: Wenbo Hu Date: Wed, 20 Dec 2017 20:12:08 +0800 Subject: [PATCH 27/98] checkpoint --- AlphaGo/go.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 335ee39..7196533 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -117,10 +117,31 @@ class Go: return False def _knowledge_prunning(self, current_board, color, vertex): - ### check if it is an eye of yourself - ### assumptions : notice that this judgement requires that the state is an endgame + # forbid some stupid selfplay using human knowledge if self._is_eye(current_board, color, vertex): return False + # forbid position on its own eye. + if self._is_game_finish(current_board, color) and vertex == utils.PASS + return False + # forbid pass if the game is not finished. 
+ return True + + + def _is_game_finished(self, current_board, color): + ''' + for each empty position, if it has both BLACK and WHITE neighbors, the game is still not finished + :return: return the game is finished + ''' + board = copy.deepcopy(current_board) + empty_idx = [i for i, x in enumerate(board) if x == utils.EMPTY] # find all empty idx + for idx in empty_idx: + neighbor_idx = self._neighbor(self.game.deflatten(idx)) + if len(neighbor_idx) > 1: + first_idx = neighbor_idx[0] + for other_idx in neighbor_idx[1:]: + if self.game.board[self.game.flatten(other_idx)] != self.game.board[self.game.flatten(first_idx)]: + return False + return True def _action2vertex(self, action): From 48e95a21eaeec6495a1bc5985c434d64d7447baf Mon Sep 17 00:00:00 2001 From: Wenbo Hu Date: Wed, 20 Dec 2017 21:35:35 +0800 Subject: [PATCH 28/98] simulator process a valid set, instead of a single action --- AlphaGo/go.py | 18 +++++++++++++++--- tianshou/core/mcts/mcts.py | 9 ++------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 7196533..559b375 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -121,9 +121,9 @@ class Go: if self._is_eye(current_board, color, vertex): return False # forbid position on its own eye. - if self._is_game_finish(current_board, color) and vertex == utils.PASS - return False - # forbid pass if the game is not finished. + #if self._is_game_finish(current_board, color) and vertex == utils.PASS + # return False + # forbid pass if the game is not finished. 
return True @@ -183,6 +183,18 @@ class Go: return True + def simulate_is_valid_list(self, state, action_set): + ## find all the valid actions + ## if no action is valid, then pass + valid_action_set = [] + for action_candidate in action_set: + if self.simulate_is_valid(self, state, action_candidate) + valid_action_set.append(action_candidate) + if not valid_action_set: + valid_action_set.append(utils.PASS) + # if valid_action_set is a empty set, add pass + return valid_action_set + def _do_move(self, board, color, vertex): if vertex == utils.PASS: return board diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index fac00fb..c14496d 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -72,13 +72,8 @@ class UCTNode(MCTSNode): def valid_mask(self, simulator): if self.mask is None: - self.mask = [] - for act in range(self.action_num - 1): - if not simulator.simulate_is_valid(self.state, act): - self.mask.append(act) - self.ucb[act] = -float("Inf") - else: - self.ucb[self.mask] = -float("Inf") + self.mask = simulator.simulate_is_valid_list(self.state, range(self.action_num - 1)) + self.ucb[self.mask] = -float("Inf") class TSNode(MCTSNode): From cabbb219680be465f03527ea90deb568b53f911f Mon Sep 17 00:00:00 2001 From: Wenbo Hu Date: Wed, 20 Dec 2017 21:40:03 +0800 Subject: [PATCH 29/98] minor revision --- AlphaGo/go.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 559b375..009d369 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -186,14 +186,14 @@ class Go: def simulate_is_valid_list(self, state, action_set): ## find all the valid actions ## if no action is valid, then pass - valid_action_set = [] + valid_action_list = [] for action_candidate in action_set: - if self.simulate_is_valid(self, state, action_candidate) - valid_action_set.append(action_candidate) - if not valid_action_set: - valid_action_set.append(utils.PASS) + if self.simulate_is_valid(state, 
action_candidate): + valid_action_list.append(action_candidate) + if not valid_action_list: + valid_action_list.append(utils.PASS) # if valid_action_set is a empty set, add pass - return valid_action_set + return valid_action_list def _do_move(self, board, color, vertex): if vertex == utils.PASS: From e2c6b96e5743341f92278a6437a85a7154bd5ec3 Mon Sep 17 00:00:00 2001 From: Wenbo Hu Date: Wed, 20 Dec 2017 21:52:30 +0800 Subject: [PATCH 30/98] minor revision. --- AlphaGo/go.py | 3 +-- tianshou/core/mcts/mcts.py | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 009d369..cbbe07c 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -180,7 +180,6 @@ class Go: if not self._knowledge_prunning(current_board, color, vertex): return False - return True def simulate_is_valid_list(self, state, action_set): @@ -188,7 +187,7 @@ class Go: ## if no action is valid, then pass valid_action_list = [] for action_candidate in action_set: - if self.simulate_is_valid(state, action_candidate): + if not self.simulate_is_valid(state, action_candidate): valid_action_list.append(action_candidate) if not valid_action_list: valid_action_list.append(utils.PASS) diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index c14496d..5aca06a 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -71,6 +71,7 @@ class UCTNode(MCTSNode): self.parent.backpropagation(self.children[action].reward) def valid_mask(self, simulator): + # let all invalid actions illeagel in mcts if self.mask is None: self.mask = simulator.simulate_is_valid_list(self.state, range(self.action_num - 1)) self.ucb[self.mask] = -float("Inf") From f0d59dab6cef928cd580f301abbdd54b84af23df Mon Sep 17 00:00:00 2001 From: Wenbo Hu Date: Wed, 20 Dec 2017 22:10:47 +0800 Subject: [PATCH 31/98] forbid pass, if we have other choices --- AlphaGo/go.py | 18 +++++++++--------- tianshou/core/mcts/mcts.py | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) 
diff --git a/AlphaGo/go.py b/AlphaGo/go.py index cbbe07c..1dfbb29 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -183,16 +183,16 @@ class Go: return True def simulate_is_valid_list(self, state, action_set): - ## find all the valid actions - ## if no action is valid, then pass - valid_action_list = [] - for action_candidate in action_set: + # find all the invalid actions + invalid_action_list = [] + for action_candidate in action_set[:-1]: + # go through all the actions excluding pass if not self.simulate_is_valid(state, action_candidate): - valid_action_list.append(action_candidate) - if not valid_action_list: - valid_action_list.append(utils.PASS) - # if valid_action_set is a empty set, add pass - return valid_action_list + invalid_action_list.append(action_candidate) + if len(invalid_action_list) < len(action_set) - 1: + invalid_action_list.append(action_set[-1]) + # forbid pass, if we have other choices + return invalid_action_list def _do_move(self, board, color, vertex): if vertex == utils.PASS: diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 5aca06a..7edac97 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -71,7 +71,7 @@ class UCTNode(MCTSNode): self.parent.backpropagation(self.children[action].reward) def valid_mask(self, simulator): - # let all invalid actions illeagel in mcts + # let all invalid actions be illeagel in mcts if self.mask is None: self.mask = simulator.simulate_is_valid_list(self.state, range(self.action_num - 1)) self.ucb[self.mask] = -float("Inf") From 00d2aa86bf668e17d6064b4896797cb79f7cbba7 Mon Sep 17 00:00:00 2001 From: Wenbo Hu Date: Wed, 20 Dec 2017 22:57:58 +0800 Subject: [PATCH 32/98] repair komi. 
add todo for forbid pass: --- AlphaGo/engine.py | 2 +- AlphaGo/game.py | 4 ++-- AlphaGo/go.py | 5 +---- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index 9948176..bf30083 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -183,7 +183,7 @@ class GTPEngine(): return 'unknown player', False def cmd_get_score(self, args, **kwargs): - return self._game.game_engine.executor_get_score(), None + return self._game.game_engine.executor_get_score(True), None def cmd_show_board(self, args, **kwargs): return self._game.board, True diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 37b7878..5f35c74 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -23,7 +23,7 @@ class Game: TODO : Maybe merge with the engine class in future, currently leave it untouched for interacting with Go UI. ''' - def __init__(self, size=9, komi=6.5, checkpoint_path=None): + def __init__(self, size=9, komi=3.75, checkpoint_path=None): self.size = size self.komi = komi self.board = [utils.EMPTY] * (self.size ** 2) @@ -75,7 +75,7 @@ class Game: self.game_engine.simulate_board = copy.copy(latest_boards[-1]) nn_input = self.generate_nn_input(self.game_engine.simulate_latest_boards, color) mcts = MCTS(self.game_engine, self.evaluator, [self.game_engine.simulate_latest_boards, color], self.size ** 2 + 1, inverse=True) - mcts.search(max_step=1) + mcts.search(max_step=5) temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) choice = np.random.choice(self.size ** 2 + 1, 1, p=prob).tolist()[0] diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 1dfbb29..4f1c759 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -121,12 +121,8 @@ class Go: if self._is_eye(current_board, color, vertex): return False # forbid position on its own eye. - #if self._is_game_finish(current_board, color) and vertex == utils.PASS - # return False - # forbid pass if the game is not finished. 
return True - def _is_game_finished(self, current_board, color): ''' for each empty position, if it has both BLACK and WHITE neighbors, the game is still not finished @@ -192,6 +188,7 @@ class Go: if len(invalid_action_list) < len(action_set) - 1: invalid_action_list.append(action_set[-1]) # forbid pass, if we have other choices + # TODO: In fact we should not do this. In some extreme cases, we should permit pass. return invalid_action_list def _do_move(self, board, color, vertex): From ced63af18fcc790c4b1bb1548b5494bd2073f9a2 Mon Sep 17 00:00:00 2001 From: Wenbo Hu Date: Thu, 21 Dec 2017 19:31:51 +0800 Subject: [PATCH 33/98] fixing bug pass parameterg --- tianshou/core/mcts/mcts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 7edac97..8bb5f06 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -73,7 +73,7 @@ class UCTNode(MCTSNode): def valid_mask(self, simulator): # let all invalid actions be illeagel in mcts if self.mask is None: - self.mask = simulator.simulate_is_valid_list(self.state, range(self.action_num - 1)) + self.mask = simulator.simulate_is_valid_list(self.state, range(self.action_num)) self.ucb[self.mask] = -float("Inf") From eda7ed07a1b7b0251745981d71ab9f358f15944e Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Thu, 21 Dec 2017 21:01:25 +0800 Subject: [PATCH 34/98] implement data collection and part of training --- AlphaGo/engine.py | 6 ++- AlphaGo/game.py | 19 +------- AlphaGo/model.py | 18 +++++++- AlphaGo/play.py | 115 ++++++++++++++++++++++++++++++---------------- AlphaGo/player.py | 1 + 5 files changed, 101 insertions(+), 58 deletions(-) diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index bf30083..c9f1a3c 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -183,11 +183,15 @@ class GTPEngine(): return 'unknown player', False def cmd_get_score(self, args, **kwargs): - return 
self._game.game_engine.executor_get_score(True), None + return self._game.game_engine.executor_get_score(True), True def cmd_show_board(self, args, **kwargs): return self._game.board, True + def cmd_get_prob(self, args, **kwargs): + return self._game.prob, True + + if __name__ == "main": game = Game() engine = GTPEngine(game_obj=Game) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 5f35c74..bf0d084 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -58,24 +58,9 @@ class Game: def set_komi(self, k): self.komi = k - def generate_nn_input(self, latest_boards, color): - state = np.zeros([1, self.size, self.size, 17]) - for i in range(8): - state[0, :, :, i] = np.array(np.array(latest_boards[i]) == np.ones(self.size ** 2)).reshape(self.size, self.size) - state[0, :, :, i + 8] = np.array(np.array(latest_boards[i]) == -np.ones(self.size ** 2)).reshape(self.size, self.size) - if color == utils.BLACK: - state[0, :, :, 16] = np.ones([self.size, self.size]) - if color == utils.WHITE: - state[0, :, :, 16] = np.zeros([self.size, self.size]) - return state - def think(self, latest_boards, color): - # TODO : using copy is right, or should we change to deepcopy? 
- self.game_engine.simulate_latest_boards = copy.copy(latest_boards) - self.game_engine.simulate_board = copy.copy(latest_boards[-1]) - nn_input = self.generate_nn_input(self.game_engine.simulate_latest_boards, color) - mcts = MCTS(self.game_engine, self.evaluator, [self.game_engine.simulate_latest_boards, color], self.size ** 2 + 1, inverse=True) - mcts.search(max_step=5) + mcts = MCTS(self.game_engine, self.evaluator, [latest_boards, color], self.size ** 2 + 1, inverse=True) + mcts.search(max_step=1) temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) choice = np.random.choice(self.size ** 2 + 1, 1, p=prob).tolist()[0] diff --git a/AlphaGo/model.py b/AlphaGo/model.py index 725dbd2..fab864e 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -1,6 +1,7 @@ import os import time import sys +import cPickle import numpy as np import tensorflow as tf @@ -167,4 +168,19 @@ class ResNet(object): #TODO: design the interface between the environment and training def train(self, mode='memory', *args, **kwargs): - pass \ No newline at end of file + if mode == 'memory': + pass + if mode == 'file': + self.train_with_file(data_path=kwargs['data_path'], checkpoint_path=kwargs['checkpoint_path']) + + def train_with_file(self, data_path, checkpoint_path): + if not os.path.exists(data_path): + raise ValueError("{} doesn't exist".format(data_path)) + + file_list = os.listdir(data_path) + if file_list <= 50: + time.sleep(1) + else: + file_list.sort(key=lambda file: os.path.getmtime(data_path + file) if not os.path.isdir( + data_path + file) else 0) + diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 7367804..562dd14 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -5,6 +5,18 @@ import re import Pyro4 import time import os +import cPickle + + +class Data(object): + def __init__(self): + self.boards = [] + self.probs = [] + self.winner = 0 + + def reset(self): + self.__init__() + if __name__ == '__main__': """ @@ -13,10 +25,13 @@ if __name__ == '__main__': """ # 
TODO : we should set the network path in a more configurable way. parser = argparse.ArgumentParser() + parser.add_argument("--result_path", type=str, default="./data/") parser.add_argument("--black_weight_path", type=str, default=None) parser.add_argument("--white_weight_path", type=str, default=None) args = parser.parse_args() + if not os.path.exists(args.result_path): + os.mkdir(args.result_path) # black_weight_path = "./checkpoints" # white_weight_path = "./checkpoints_origin" if args.black_weight_path is not None and (not os.path.exists(args.black_weight_path)): @@ -35,11 +50,13 @@ if __name__ == '__main__': time.sleep(1) # start two different player with different network weights. - agent_v0 = subprocess.Popen(['python', '-u', 'player.py', '--role=black', '--checkpoint_path=' + str(args.black_weight_path)], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + agent_v0 = subprocess.Popen( + ['python', '-u', 'player.py', '--role=black', '--checkpoint_path=' + str(args.black_weight_path)], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - agent_v1 = subprocess.Popen(['python', '-u', 'player.py', '--role=white', '--checkpoint_path=' + str(args.white_weight_path)], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + agent_v1 = subprocess.Popen( + ['python', '-u', 'player.py', '--role=white', '--checkpoint_path=' + str(args.white_weight_path)], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) server_list = "" while ("black" not in server_list) or ("white" not in server_list): @@ -50,6 +67,7 @@ if __name__ == '__main__': print "Start black player at : " + str(agent_v0.pid) print "Start white player at : " + str(agent_v1.pid) + data = Data() player = [None] * 2 player[0] = Pyro4.Proxy("PYRONAME:black") player[1] = Pyro4.Proxy("PYRONAME:white") @@ -63,39 +81,58 @@ if __name__ == '__main__': evaluate_rounds = 1 game_num = 0 - while game_num < evaluate_rounds: - num = 0 
- pass_flag = [False, False] - print("Start game {}".format(game_num)) - # end the game if both palyer chose to pass, or play too much turns - while not (pass_flag[0] and pass_flag[1]) and num < size ** 2 * 2: - turn = num % 2 - move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn] + '\n') - print role[turn] + " : " + str(move), - num += 1 - match = re.search(pattern, move) - if match is not None: - # print "match : " + str(match.group()) - play_or_pass = match.group() - pass_flag[turn] = False + try: + while True: + num = 0 + pass_flag = [False, False] + print("Start game {}".format(game_num)) + # end the game if both palyer chose to pass, or play too much turns + while not (pass_flag[0] and pass_flag[1]) and num < size ** 2 * 2: + turn = num % 2 + move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn] + '\n') + print role[turn] + " : " + str(move), + num += 1 + match = re.search(pattern, move) + if match is not None: + # print "match : " + str(match.group()) + play_or_pass = match.group() + pass_flag[turn] = False + else: + # print "no match" + play_or_pass = ' PASS' + pass_flag[turn] = True + result = player[1 - turn].run_cmd(str(num) + ' play ' + color[turn] + ' ' + play_or_pass + '\n') + board = player[turn].run_cmd(str(num) + ' show_board') + board = eval(board[board.index('['):board.index(']') + 1]) + for i in range(size): + for j in range(size): + print show[board[i * size + j]] + " ", + print "\n", + data.boards.append(board) + prob = player[turn].run_cmd(str(num) + ' get_prob') + data.probs.append(prob) + score = player[turn].run_cmd(str(num) + ' get_score') + print "Finished : ", score.split(" ")[1] + # TODO: generalize the player + if score > 0: + data.winner = 1 + if score < 0: + data.winner = -1 + player[0].run_cmd(str(num) + ' clear_board') + player[1].run_cmd(str(num) + ' clear_board') + file_list = os.listdir(args.result_path) + if not file_list: + data_num = 0 else: - # print "no match" - play_or_pass = ' PASS' - 
pass_flag[turn] = True - result = player[1 - turn].run_cmd(str(num) + ' play ' + color[turn] + ' ' + play_or_pass + '\n') - board = player[turn].run_cmd(str(num) + ' show_board') - board = eval(board[board.index('['):board.index(']') + 1]) - for i in range(size): - for j in range(size): - print show[board[i * size + j]] + " ", - print "\n", - - score = player[turn].run_cmd(str(num) + ' get_score') - print "Finished : ", score.split(" ")[1] - player[0].run_cmd(str(num) + ' clear_board') - player[1].run_cmd(str(num) + ' clear_board') - game_num += 1 - - subprocess.call(["kill", "-9", str(agent_v0.pid)]) - subprocess.call(["kill", "-9", str(agent_v1.pid)]) - print "Kill all player, finish all game." + file_list.sort(key=lambda file: os.path.getmtime(args.result_path + file) if not os.path.isdir( + args.result_path + file) else 0) + data_num = eval(file_list[-1][:-4]) + 1 + print(file_list) + with open("./data/" + str(data_num) + ".pkl", "w") as file: + picklestring = cPickle.dump(data, file) + data.reset() + game_num += 1 + except KeyboardInterrupt: + subprocess.call(["kill", "-9", str(agent_v0.pid)]) + subprocess.call(["kill", "-9", str(agent_v1.pid)]) + print "Kill all player, finish all game." 
diff --git a/AlphaGo/player.py b/AlphaGo/player.py index b468cf3..0e3daff 100644 --- a/AlphaGo/player.py +++ b/AlphaGo/player.py @@ -20,6 +20,7 @@ class Player(object): #return "inside the Player of player.py" return self.engine.run_cmd(command) + if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument("--checkpoint_path", type=str, default=None) From 2acb1aab076f5393f79eb853e275de626d4d0247 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Thu, 21 Dec 2017 22:48:53 +0800 Subject: [PATCH 35/98] eliminate all references of Game class in Go class --- AlphaGo/engine.py | 2 +- AlphaGo/game.py | 15 ++----- AlphaGo/go.py | 101 +++++++++++++++++++++++-------------------- AlphaGo/play.py | 4 +- AlphaGo/self-play.py | 2 +- 5 files changed, 63 insertions(+), 61 deletions(-) diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index c9f1a3c..8b54470 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -183,7 +183,7 @@ class GTPEngine(): return 'unknown player', False def cmd_get_score(self, args, **kwargs): - return self._game.game_engine.executor_get_score(True), True + return self._game.game_engine.executor_get_score(self._game.board, True), True def cmd_show_board(self, args, **kwargs): return self._game.board, True diff --git a/AlphaGo/game.py b/AlphaGo/game.py index bf0d084..11ce52b 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -34,16 +34,7 @@ class Game: self.evaluator = model.ResNet(self.size, self.size**2 + 1, history_length=8) # self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], # feed_dict={self.net.x: state, self.net.is_training: False}) - self.game_engine = go.Go(game=self) - - def _flatten(self, vertex): - x, y = vertex - return (x - 1) * self.size + (y - 1) - - def _deflatten(self, idx): - x = idx // self.size + 1 - y = idx % self.size + 1 - return (x, y) + self.game_engine = go.Go(size=self.size, komi=self.komi) def clear(self): self.board = [utils.EMPTY] * (self.size ** 2) @@ -67,14 +58,14 @@ 
class Game: if choice == self.size ** 2: move = utils.PASS else: - move = self._deflatten(choice) + move = self.game_engine._deflatten(choice) return move, prob def play_move(self, color, vertex): # this function can be called directly to play the opponent's move if vertex == utils.PASS: return True - res = self.game_engine.executor_do_move(color, vertex) + res = self.game_engine.executor_do_move(self.history, self.latest_boards, self.board, color, vertex) return res def think_play_move(self, color): diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 4f1c759..9b7e21f 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -16,12 +16,22 @@ CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] class Go: def __init__(self, **kwargs): - self.game = kwargs['game'] + self.size = kwargs['size'] + self.komi = kwargs['komi'] + + def _flatten(self, vertex): + x, y = vertex + return (x - 1) * self.size + (y - 1) + + def _deflatten(self, idx): + x = idx // self.size + 1 + y = idx % self.size + 1 + return (x, y) def _in_board(self, vertex): x, y = vertex - if x < 1 or x > self.game.size: return False - if y < 1 or y > self.game.size: return False + if x < 1 or x > self.size: return False + if y < 1 or y > self.size: return False return True def _neighbor(self, vertex): @@ -45,7 +55,7 @@ class Go: return corner def _find_group(self, current_board, vertex): - color = current_board[self.game._flatten(vertex)] + color = current_board[self._flatten(vertex)] # print ("color : ", color) chain = set() frontier = [vertex] @@ -55,41 +65,41 @@ class Go: # print ("current : ", current) chain.add(current) for n in self._neighbor(current): - if current_board[self.game._flatten(n)] == color and not n in chain: + if current_board[self._flatten(n)] == color and not n in chain: frontier.append(n) - if current_board[self.game._flatten(n)] == utils.EMPTY: + if current_board[self._flatten(n)] == utils.EMPTY: has_liberty = True return has_liberty, chain def _is_suicide(self, current_board, color, vertex): - 
current_board[self.game._flatten(vertex)] = color # assume that we already take this move + current_board[self._flatten(vertex)] = color # assume that we already take this move suicide = False has_liberty, group = self._find_group(current_board, vertex) if not has_liberty: suicide = True # no liberty, suicide for n in self._neighbor(vertex): - if current_board[self.game._flatten(n)] == utils.another_color(color): + if current_board[self._flatten(n)] == utils.another_color(color): opponent_liberty, group = self._find_group(current_board, n) if not opponent_liberty: suicide = False # this move is able to take opponent's stone, not suicide - current_board[self.game._flatten(vertex)] = utils.EMPTY # undo this move + current_board[self._flatten(vertex)] = utils.EMPTY # undo this move return suicide def _process_board(self, current_board, color, vertex): nei = self._neighbor(vertex) for n in nei: - if current_board[self.game._flatten(n)] == utils.another_color(color): + if current_board[self._flatten(n)] == utils.another_color(color): has_liberty, group = self._find_group(current_board, n) if not has_liberty: for b in group: - current_board[self.game._flatten(b)] = utils.EMPTY + current_board[self._flatten(b)] = utils.EMPTY def _check_global_isomorphous(self, history_boards, current_board, color, vertex): repeat = False next_board = copy.copy(current_board) - next_board[self.game._flatten(vertex)] = color + next_board[self._flatten(vertex)] = color self._process_board(next_board, color, vertex) if next_board in history_boards: repeat = True @@ -98,7 +108,7 @@ class Go: def _is_eye(self, current_board, color, vertex): nei = self._neighbor(vertex) cor = self._corner(vertex) - ncolor = {color == current_board[self.game._flatten(n)] for n in nei} + ncolor = {color == current_board[self._flatten(n)] for n in nei} if False in ncolor: # print "not all neighbors are in same color with us" return False @@ -107,7 +117,7 @@ class Go: # print "all neighbors are in same group and 
same color with us" return True else: - opponent_number = [current_board[self.game._flatten(c)] for c in cor].count(-color) + opponent_number = [current_board[self._flatten(c)] for c in cor].count(-color) opponent_propotion = float(opponent_number) / float(len(cor)) if opponent_propotion < 0.5: # print "few opponents, real eye" @@ -131,20 +141,20 @@ class Go: board = copy.deepcopy(current_board) empty_idx = [i for i, x in enumerate(board) if x == utils.EMPTY] # find all empty idx for idx in empty_idx: - neighbor_idx = self._neighbor(self.game.deflatten(idx)) + neighbor_idx = self._neighbor(self.deflatten(idx)) if len(neighbor_idx) > 1: first_idx = neighbor_idx[0] for other_idx in neighbor_idx[1:]: - if self.game.board[self.game.flatten(other_idx)] != self.game.board[self.game.flatten(first_idx)]: + if board[self.flatten(other_idx)] != board[self.flatten(first_idx)]: return False return True def _action2vertex(self, action): - if action == self.game.size ** 2: + if action == self.size ** 2: vertex = (0, 0) else: - vertex = self.game._deflatten(action) + vertex = self._deflatten(action) return vertex def _is_valid(self, history_boards, current_board, color, vertex): @@ -153,7 +163,7 @@ class Go: return False ### already have stone - if not current_board[self.game._flatten(vertex)] == utils.EMPTY: + if not current_board[self._flatten(vertex)] == utils.EMPTY: return False ### check if it is suicide @@ -195,7 +205,7 @@ class Go: if vertex == utils.PASS: return board else: - id_ = self.game._flatten(vertex) + id_ = self._flatten(vertex) board[id_] = color return board @@ -208,21 +218,21 @@ class Go: new_color = -color return [history_boards, new_color], 0 - def executor_do_move(self, color, vertex): - if not self._is_valid(self.game.history, self.game.board, color, vertex): + def executor_do_move(self, history, latest_boards, current_board, color, vertex): + if not self._is_valid(history, current_board, color, vertex): return False - 
self.game.board[self.game._flatten(vertex)] = color - self._process_board(self.game.board, color, vertex) - self.game.history.append(copy.copy(self.game.board)) - self.game.latest_boards.append(copy.copy(self.game.board)) + current_board[self._flatten(vertex)] = color + self._process_board(current_board, color, vertex) + history.append(copy.copy(current_board)) + latest_boards.append(copy.copy(current_board)) return True - def _find_empty(self): - idx = [i for i,x in enumerate(self.game.board) if x == utils.EMPTY ][0] - return self.game._deflatten(idx) + def _find_empty(self, current_board): + idx = [i for i,x in enumerate(current_board) if x == utils.EMPTY ][0] + return self._deflatten(idx) - def _find_boarder(self, vertex): - _, group = self._find_group(self.game.board, vertex) + def _find_boarder(self, current_board, vertex): + _, group = self._find_group(current_board, vertex) border = [] for b in group: for n in self._neighbor(b): @@ -248,7 +258,7 @@ class Go: start_vertex_x += x_diff start_vertex_y += y_diff - def _predict_from_nearby(self, vertex, neighbor_step=3): + def _predict_from_nearby(self, current_board, vertex, neighbor_step=3): ''' step: the nearby 3 steps is considered :vertex: position to be estimated @@ -264,38 +274,37 @@ class Go: self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] - step, -1, 1, neighbor_step) color_estimate = 0 for neighbor_vertex in neighbor_vertex_set: - color_estimate += self.game.board[self.game._flatten(neighbor_vertex)] + color_estimate += current_board[self._flatten(neighbor_vertex)] if color_estimate > 0: return utils.BLACK elif color_estimate < 0: return utils.WHITE - def executor_get_score(self, is_unknown_estimation=False): + def executor_get_score(self, current_board, is_unknown_estimation=False): ''' is_unknown_estimation: whether use nearby stone to predict the unknown return score from BLACK perspective. 
''' - _board = copy.copy(self.game.board) - while utils.EMPTY in self.game.board: - vertex = self._find_empty() - boarder = self._find_boarder(vertex) - boarder_color = set(map(lambda v: self.game.board[self.game._flatten(v)], boarder)) + _board = copy.deepcopy(current_board) + while utils.EMPTY in _board: + vertex = self._find_empty(_board) + boarder = self._find_boarder(_board, vertex) + boarder_color = set(map(lambda v: _board[self._flatten(v)], boarder)) if boarder_color == {utils.BLACK}: - self.game.board[self.game._flatten(vertex)] = utils.BLACK + _board[self._flatten(vertex)] = utils.BLACK elif boarder_color == {utils.WHITE}: - self.game.board[self.game._flatten(vertex)] = utils.WHITE + _board[self._flatten(vertex)] = utils.WHITE elif is_unknown_estimation: - self.game.board[self.game._flatten(vertex)] = self._predict_from_nearby(vertex) + _board[self._flatten(vertex)] = self._predict_from_nearby(_board, vertex) else: - self.game.board[self.game._flatten(vertex)] =utils.UNKNOWN + _board[self._flatten(vertex)] =utils.UNKNOWN score = 0 - for i in self.game.board: + for i in _board: if i == utils.BLACK: score += 1 elif i == utils.WHITE: score -= 1 - score -= self.game.komi + score -= self.komi - self.game.board = _board return score diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 562dd14..e18555f 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -82,7 +82,7 @@ if __name__ == '__main__': evaluate_rounds = 1 game_num = 0 try: - while True: + while game_num < evaluate_rounds: num = 0 pass_flag = [False, False] print("Start game {}".format(game_num)) @@ -132,6 +132,8 @@ if __name__ == '__main__': picklestring = cPickle.dump(data, file) data.reset() game_num += 1 + subprocess.call(["kill", "-9", str(agent_v0.pid)]) + subprocess.call(["kill", "-9", str(agent_v1.pid)]) except KeyboardInterrupt: subprocess.call(["kill", "-9", str(agent_v0.pid)]) subprocess.call(["kill", "-9", str(agent_v1.pid)]) diff --git a/AlphaGo/self-play.py b/AlphaGo/self-play.py index 
63b7e97..4387b24 100644 --- a/AlphaGo/self-play.py +++ b/AlphaGo/self-play.py @@ -79,7 +79,7 @@ while True: prob.append(np.array(game.prob).reshape(-1, game.size ** 2 + 1)) print("Finished") print("\n") - score = game.game_engine.executor_get_score(True) + score = game.game_engine.executor_get_score(game.board, True) if score > 0: winner = utils.BLACK else: From 9ad53de54f0ef28aea0df9de31c9d2c405186d15 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Thu, 21 Dec 2017 23:30:24 +0800 Subject: [PATCH 36/98] implement the training process --- .gitignore | 1 + AlphaGo/game.py | 2 +- AlphaGo/model.py | 106 ++++++++++++++++++++++++++++++++++++++++++----- AlphaGo/play.py | 28 ++++++++----- 4 files changed, 114 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index 36d134c..d697b92 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ checkpoints checkpoints_origin *.json .DS_Store +data diff --git a/AlphaGo/game.py b/AlphaGo/game.py index bf0d084..c342d0c 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -60,7 +60,7 @@ class Game: def think(self, latest_boards, color): mcts = MCTS(self.game_engine, self.evaluator, [latest_boards, color], self.size ** 2 + 1, inverse=True) - mcts.search(max_step=1) + mcts.search(max_step=20) temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) choice = np.random.choice(self.size ** 2 + 1, 1, p=prob).tolist()[0] diff --git a/AlphaGo/model.py b/AlphaGo/model.py index fab864e..41f3a47 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -2,6 +2,7 @@ import os import time import sys import cPickle +from collections import deque import numpy as np import tensorflow as tf @@ -71,6 +72,13 @@ def value_head(input, is_training): return h +class Data(object): + def __init__(self): + self.boards = [] + self.probs = [] + self.winner = 0 + + class ResNet(object): def __init__(self, board_size, action_num, history_length=1, residual_block_num=20, checkpoint_path=None): """ @@ -85,11 +93,18 @@ 
class ResNet(object): self.board_size = board_size self.action_num = action_num self.history_length = history_length + self.checkpoint_path = checkpoint_path self.x = tf.placeholder(tf.float32, shape=[None, self.board_size, self.board_size, 2 * self.history_length + 1]) self.is_training = tf.placeholder(tf.bool, shape=[]) self.z = tf.placeholder(tf.float32, shape=[None, 1]) self.pi = tf.placeholder(tf.float32, shape=[None, self.action_num]) - self._build_network(residual_block_num, checkpoint_path) + self._build_network(residual_block_num, self.checkpoint_path) + + # training hyper-parameters: + self.window_length = 1000 + self.save_freq = 1000 + self.training_data = {'states': deque(maxlen=self.window_length), 'probs': deque(maxlen=self.window_length), + 'winner': deque(maxlen=self.window_length)} def _build_network(self, residual_block_num, checkpoint_path): """ @@ -118,7 +133,7 @@ class ResNet(object): with tf.control_dependencies(self.update_ops): self.train_op = tf.train.AdamOptimizer(1e-4).minimize(self.total_loss) self.var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) - self.saver = tf.train.Saver(max_to_keep=10, var_list=self.var_list) + self.saver = tf.train.Saver(var_list=self.var_list) self.sess = multi_gpu.create_session() self.sess.run(tf.global_variables_initializer()) if checkpoint_path is not None: @@ -166,21 +181,90 @@ class ResNet(object): state[0, :, :, 2 * self.history_length] = np.zeros([self.board_size, self.board_size]) return state - #TODO: design the interface between the environment and training + # TODO: design the interface between the environment and training def train(self, mode='memory', *args, **kwargs): if mode == 'memory': pass if mode == 'file': - self.train_with_file(data_path=kwargs['data_path'], checkpoint_path=kwargs['checkpoint_path']) + self._train_with_file(data_path=kwargs['data_path'], batch_size=kwargs['batch_size'], + checkpoint_path=kwargs['checkpoint_path']) - def train_with_file(self, data_path, 
checkpoint_path): + def _train_with_file(self, data_path, batch_size, checkpoint_path): + # check if the path is valid if not os.path.exists(data_path): raise ValueError("{} doesn't exist".format(data_path)) + self.checkpoint_path = checkpoint_path + if not os.path.exists(self.checkpoint_path): + os.mkdir(self.checkpoint_path) - file_list = os.listdir(data_path) - if file_list <= 50: - time.sleep(1) - else: - file_list.sort(key=lambda file: os.path.getmtime(data_path + file) if not os.path.isdir( - data_path + file) else 0) + new_file_list = [] + all_file_list = [] + training_data = {} + iters = 0 + while True: + new_file_list = list(set(os.listdir(data_path)).difference(all_file_list)) + all_file_list = os.listdir(data_path) + new_file_list.sort( + key=lambda file: os.path.getmtime(data_path + file) if not os.path.isdir(data_path + file) else 0) + if new_file_list: + for file in new_file_list: + states, probs, winner = self._file_to_training_data(data_path + file) + assert states.shape[0] == probs.shape[0] + assert states.shape[0] == winner.shape[0] + self.training_data['states'].append(states) + self.training_data['probs'].append(probs) + self.training_data['winner'].append(winner) + training_data['states'] = np.concatenate(self.training_data['states'], axis=0) + training_data['probs'] = np.concatenate(self.training_data['probs'], axis=0) + training_data['winner'] = np.concatenate(self.training_data['winner'], axis=0) + if len(self.training_data['states']) != self.window_length: + continue + else: + data_num = training_data['states'].shape[0] + index = np.arange(data_num) + np.random.shuffle(index) + start_time = time.time() + value_loss, policy_loss, reg, _ = self.sess.run( + [self.value_loss, self.policy_loss, self.reg, self.train_op], + feed_dict={self.x: training_data['states'][index[:batch_size]], + self.z: training_data['winner'][index[:batch_size]], + self.pi: training_data['probs'][index[:batch_size]], + self.is_training: True}) + print("Iteration: {}, 
Time: {}, Value Loss: {}, Policy Loss: {}, Reg: {}".format(iters, + time.time() - start_time, + value_loss, + policy_loss, reg)) + iters += 1 + if iters % self.save_freq == 0: + save_path = "Iteration{}.ckpt".format(iters) + self.saver.save(self.sess, self.checkpoint_path + save_path) + + def _file_to_training_data(self, file_name): + with open(file_name, 'r') as file: + data = cPickle.load(file) + history = deque(maxlen=self.history_length) + states = [] + probs = [] + winner = [] + for _ in range(self.history_length): + # Note that 0 is specified, need a more general way like config + history.append([0] * self.board_size ** 2) + # Still, +1 is specified + color = +1 + + for [board, prob] in zip(data.boards, data.probs): + history.append(board) + states.append(self._history2state(history, color)) + probs.append(np.array(prob).reshape(1, self.board_size ** 2 + 1)) + winner.append(np.array(data.winner).reshape(1, 1)) + color *= -1 + states = np.concatenate(states, axis=0) + probs = np.concatenate(probs, axis=0) + winner = np.concatenate(winner, axis=0) + return states, probs, winner + + +if __name__=="__main__": + model = ResNet(board_size=9, action_num=82) + model.train("file", data_path="./data/", batch_size=128, checkpoint_path="./checkpoint/") \ No newline at end of file diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 562dd14..bd3776e 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -76,6 +76,7 @@ if __name__ == '__main__': color = ['b', 'w'] pattern = "[A-Z]{1}[0-9]{1}" + space = re.compile("\s+") size = 9 show = ['.', 'X', 'O'] @@ -83,12 +84,20 @@ if __name__ == '__main__': game_num = 0 try: while True: + start_time = time.time() num = 0 pass_flag = [False, False] print("Start game {}".format(game_num)) # end the game if both palyer chose to pass, or play too much turns while not (pass_flag[0] and pass_flag[1]) and num < size ** 2 * 2: turn = num % 2 + board = player[turn].run_cmd(str(num) + ' show_board') + board = 
eval(board[board.index('['):board.index(']') + 1]) + for i in range(size): + for j in range(size): + print show[board[i * size + j]] + " ", + print "\n", + data.boards.append(board) move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn] + '\n') print role[turn] + " : " + str(move), num += 1 @@ -102,21 +111,18 @@ if __name__ == '__main__': play_or_pass = ' PASS' pass_flag[turn] = True result = player[1 - turn].run_cmd(str(num) + ' play ' + color[turn] + ' ' + play_or_pass + '\n') - board = player[turn].run_cmd(str(num) + ' show_board') - board = eval(board[board.index('['):board.index(']') + 1]) - for i in range(size): - for j in range(size): - print show[board[i * size + j]] + " ", - print "\n", - data.boards.append(board) prob = player[turn].run_cmd(str(num) + ' get_prob') + prob = space.sub(',', prob[prob.index('['):prob.index(']') + 1]) + prob = prob.replace('[,', '[') + prob = prob.replace('],', ']') + prob = eval(prob) data.probs.append(prob) score = player[turn].run_cmd(str(num) + ' get_score') print "Finished : ", score.split(" ")[1] # TODO: generalize the player - if score > 0: + if eval(score.split(" ")[1]) > 0: data.winner = 1 - if score < 0: + if eval(score.split(" ")[1]) < 0: data.winner = -1 player[0].run_cmd(str(num) + ' clear_board') player[1].run_cmd(str(num) + ' clear_board') @@ -127,12 +133,12 @@ if __name__ == '__main__': file_list.sort(key=lambda file: os.path.getmtime(args.result_path + file) if not os.path.isdir( args.result_path + file) else 0) data_num = eval(file_list[-1][:-4]) + 1 - print(file_list) with open("./data/" + str(data_num) + ".pkl", "w") as file: picklestring = cPickle.dump(data, file) data.reset() game_num += 1 - except KeyboardInterrupt: + print("Time {}".format(time.time()-start_time)) + except Exception: subprocess.call(["kill", "-9", str(agent_v0.pid)]) subprocess.call(["kill", "-9", str(agent_v1.pid)]) print "Kill all player, finish all game." 
From 43f6527d8e4ebaec6b9c001361db689090127e87 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Thu, 21 Dec 2017 23:55:31 +0800 Subject: [PATCH 37/98] modify for multi instance --- AlphaGo/play.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 35549dd..a9d3d20 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -28,6 +28,7 @@ if __name__ == '__main__': parser.add_argument("--result_path", type=str, default="./data/") parser.add_argument("--black_weight_path", type=str, default=None) parser.add_argument("--white_weight_path", type=str, default=None) + parser.add_argument("--id", type=int, default=0) args = parser.parse_args() if not os.path.exists(args.result_path): @@ -50,12 +51,15 @@ if __name__ == '__main__': time.sleep(1) # start two different player with different network weights. + black_role_name = 'black' + str(args.id) + white_role_name = 'white' + str(args.id) + agent_v0 = subprocess.Popen( - ['python', '-u', 'player.py', '--role=black', '--checkpoint_path=' + str(args.black_weight_path)], + ['python', '-u', 'player.py', '--role=' + black_role_name, '--checkpoint_path=' + str(args.black_weight_path)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) agent_v1 = subprocess.Popen( - ['python', '-u', 'player.py', '--role=white', '--checkpoint_path=' + str(args.white_weight_path)], + ['python', '-u', 'player.py', '--role=' + white_role_name, '--checkpoint_path=' + str(args.white_weight_path)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) server_list = "" @@ -69,8 +73,8 @@ if __name__ == '__main__': data = Data() player = [None] * 2 - player[0] = Pyro4.Proxy("PYRONAME:black") - player[1] = Pyro4.Proxy("PYRONAME:white") + player[0] = Pyro4.Proxy("PYRONAME:" + black_role_name) + player[1] = Pyro4.Proxy("PYRONAME:" + white_role_name) role = ["BLACK", "WHITE"] color = ['b', 'w'] From 6835ec62e14c63703a46a4adb8df677d6a14a0b3 
Mon Sep 17 00:00:00 2001 From: rtz19970824 Date: Fri, 22 Dec 2017 00:04:51 +0800 Subject: [PATCH 38/98] multi-instance support --- AlphaGo/play.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/AlphaGo/play.py b/AlphaGo/play.py index a9d3d20..a8267a7 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -41,14 +41,14 @@ if __name__ == '__main__': raise ValueError("Can't not find the network weights for white player.") # kill the old server - kill_old_server = subprocess.Popen(['killall', 'pyro4-ns']) - print "kill the old pyro4 name server, the return code is : " + str(kill_old_server.wait()) - time.sleep(1) + # kill_old_server = subprocess.Popen(['killall', 'pyro4-ns']) + # print "kill the old pyro4 name server, the return code is : " + str(kill_old_server.wait()) + # time.sleep(1) # start a name server to find the remote object - start_new_server = subprocess.Popen(['pyro4-ns', '&']) - print "Start Name Sever : " + str(start_new_server.pid) # + str(start_new_server.wait()) - time.sleep(1) + # start_new_server = subprocess.Popen(['pyro4-ns', '&']) + # print "Start Name Sever : " + str(start_new_server.pid) # + str(start_new_server.wait()) + # time.sleep(1) # start two different player with different network weights. black_role_name = 'black' + str(args.id) @@ -63,7 +63,7 @@ if __name__ == '__main__': stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) server_list = "" - while ("black" not in server_list) or ("white" not in server_list): + while (black_role_name not in server_list) or (white_role_name not in server_list): server_list = subprocess.check_output(['pyro4-nsc', 'list']) print "Waiting for the server start..." time.sleep(1) @@ -142,11 +142,12 @@ if __name__ == '__main__': data.reset() game_num += 1 - except Exception: + except Exception as e: + print(e) subprocess.call(["kill", "-9", str(agent_v0.pid)]) subprocess.call(["kill", "-9", str(agent_v1.pid)]) print "Kill all player, finish all game." 
subprocess.call(["kill", "-9", str(agent_v0.pid)]) subprocess.call(["kill", "-9", str(agent_v1.pid)]) - print "Kill all player, finish all game." \ No newline at end of file + print "Kill all player, finish all game." From 1cc5063007925ceada46974f21aaf03a2361deee Mon Sep 17 00:00:00 2001 From: Haosheng Zou Date: Fri, 22 Dec 2017 00:22:23 +0800 Subject: [PATCH 39/98] add value_function (critic). value_function and policy not finished yet. --- tianshou/core/policy/base.py | 2 +- tianshou/core/policy/dqn.py | 11 ++++ tianshou/core/value_function/__init__.py | 0 tianshou/core/value_function/action_value.py | 53 ++++++++++++++++++++ tianshou/core/value_function/base.py | 23 +++++++++ tianshou/core/value_function/state_value.py | 23 +++++++++ 6 files changed, 111 insertions(+), 1 deletion(-) create mode 100644 tianshou/core/value_function/__init__.py create mode 100644 tianshou/core/value_function/action_value.py create mode 100644 tianshou/core/value_function/base.py create mode 100644 tianshou/core/value_function/state_value.py diff --git a/tianshou/core/policy/base.py b/tianshou/core/policy/base.py index eecfc4f..025abd5 100644 --- a/tianshou/core/policy/base.py +++ b/tianshou/core/policy/base.py @@ -15,7 +15,7 @@ __all__ = [ 'QValuePolicy', ] -# TODO: separate actor and critic, we should focus on it once we finish the basic module. 
+# TODO: a even more "base" class for policy class QValuePolicy(object): diff --git a/tianshou/core/policy/dqn.py b/tianshou/core/policy/dqn.py index 39f6a16..d03dbd4 100644 --- a/tianshou/core/policy/dqn.py +++ b/tianshou/core/policy/dqn.py @@ -1,5 +1,16 @@ from tianshou.core.policy.base import QValuePolicy import tensorflow as tf +import sys +sys.path.append('..') +import value_function.action_value as value_func + + +class DQN_refactor(object): + """ + use DQN from value_function as a member + """ + def __init__(self, value_tensor, observation_placeholder, action_placeholder): + self._network = value_func.DQN(value_tensor, observation_placeholder, action_placeholder) class DQN(QValuePolicy): diff --git a/tianshou/core/value_function/__init__.py b/tianshou/core/value_function/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tianshou/core/value_function/action_value.py b/tianshou/core/value_function/action_value.py new file mode 100644 index 0000000..cb8acc8 --- /dev/null +++ b/tianshou/core/value_function/action_value.py @@ -0,0 +1,53 @@ +from base import ValueFunctionBase +import tensorflow as tf + + +class ActionValue(ValueFunctionBase): + """ + class of action values Q(s, a). + """ + def __init__(self, value_tensor, observation_placeholder, action_placeholder): + self._action_placeholder = action_placeholder + super(ActionValue, self).__init__( + value_tensor=value_tensor, + observation_placeholder=observation_placeholder + ) + + def get_value(self, observation, action): + """ + + :param observation: numpy array of observations, of shape (batchsize, observation_dim). + :param action: numpy array of actions, of shape (batchsize, action_dim) + # TODO: Atari discrete action should have dim 1. 
Super Mario may should have, say, dim 5, where each can be 0/1 + :return: numpy array of state values, of shape (batchsize, ) + # TODO: dealing with the last dim of 1 in V(s) and Q(s, a) + """ + sess = tf.get_default_session() + return sess.run(self.get_value_tensor(), feed_dict= + {self._observation_placeholder: observation, self._action_placeholder:action})[:, 0] + + +class DQN(ActionValue): + """ + class of the very DQN architecture. Instead of feeding s and a to the network to get a value, DQN feed s to the + network and the last layer is Q(s, *) for all actions + """ + def __init__(self, value_tensor, observation_placeholder, action_placeholder): + """ + :param value_tensor: of shape (batchsize, num_actions) + :param observation_placeholder: of shape (batchsize, observation_dim) + :param action_placeholder: of shape (batchsize, ) + """ + self._value_tensor_all_actions = value_tensor + canonical_value_tensor = value_tensor[action_placeholder] # maybe a tf.map_fn. for now it's wrong + + super(DQN, self).__init__(value_tensor=canonical_value_tensor, + observation_placeholder=observation_placeholder, + action_placeholder=action_placeholder) + + def get_value_all_actions(self, observation): + sess = tf.get_default_session() + return sess.run(self._value_tensor_all_actions, feed_dict={self._observation_placeholder: observation}) + + def get_value_tensor_all_actions(self): + return self._value_tensor_all_actions \ No newline at end of file diff --git a/tianshou/core/value_function/base.py b/tianshou/core/value_function/base.py new file mode 100644 index 0000000..0b27759 --- /dev/null +++ b/tianshou/core/value_function/base.py @@ -0,0 +1,23 @@ + +# TODO: linear feature baseline also in tf? +class ValueFunctionBase(object): + """ + base class of value functions. 
Children include state values V(s) and action values Q(s, a) + """ + def __init__(self, value_tensor, observation_placeholder): + self._observation_placeholder = observation_placeholder + self._value_tensor = value_tensor + + def get_value(self, **kwargs): + """ + + :return: batch of corresponding values in numpy array + """ + raise NotImplementedError() + + def get_value_tensor(self): + """ + + :return: tensor of the corresponding values + """ + return self._value_tensor diff --git a/tianshou/core/value_function/state_value.py b/tianshou/core/value_function/state_value.py new file mode 100644 index 0000000..04fe442 --- /dev/null +++ b/tianshou/core/value_function/state_value.py @@ -0,0 +1,23 @@ +from base import ValueFunctionBase +import tensorflow as tf + + +class StateValue(ValueFunctionBase): + """ + class of state values V(s). + """ + def __init__(self, value_tensor, observation_placeholder): + super(StateValue, self).__init__( + value_tensor=value_tensor, + observation_placeholder=observation_placeholder + ) + + def get_value(self, observation): + """ + + :param observation: numpy array of observations, of shape (batchsize, observation_dim). 
+ :return: numpy array of state values, of shape (batchsize, ) + # TODO: dealing with the last dim of 1 in V(s) and Q(s, a) + """ + sess = tf.get_default_session() + return sess.run(self.get_value_tensor(), feed_dict={self._observation_placeholder: observation})[:, 0] \ No newline at end of file From 5c29dad26367ba76c1fbe4a19213c0bf9ae7391e Mon Sep 17 00:00:00 2001 From: JialianLee Date: Fri, 22 Dec 2017 01:57:48 +0800 Subject: [PATCH 40/98] An initial version for Reversi --- AlphaGo/reversi.py | 252 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 252 insertions(+) create mode 100644 AlphaGo/reversi.py diff --git a/AlphaGo/reversi.py b/AlphaGo/reversi.py new file mode 100644 index 0000000..49d0e9a --- /dev/null +++ b/AlphaGo/reversi.py @@ -0,0 +1,252 @@ +from __future__ import print_function +import numpy as np + +''' +Settings of the Go game. + +(1, 1) is considered as the upper left corner of the board, +(size, 1) is the lower left +''' + + +def find_correct_moves(own, enemy): + """return legal moves""" + left_right_mask = 0x7e7e7e7e7e7e7e7e # Both most left-right edge are 0, else 1 + top_bottom_mask = 0x00ffffffffffff00 # Both most top-bottom edge are 0, else 1 + mask = left_right_mask & top_bottom_mask + mobility = 0 + mobility |= search_offset_left(own, enemy, left_right_mask, 1) # Left + mobility |= search_offset_left(own, enemy, mask, 9) # Left Top + mobility |= search_offset_left(own, enemy, top_bottom_mask, 8) # Top + mobility |= search_offset_left(own, enemy, mask, 7) # Top Right + mobility |= search_offset_right(own, enemy, left_right_mask, 1) # Right + mobility |= search_offset_right(own, enemy, mask, 9) # Bottom Right + mobility |= search_offset_right(own, enemy, top_bottom_mask, 8) # Bottom + mobility |= search_offset_right(own, enemy, mask, 7) # Left bottom + return mobility + + +def calc_flip(pos, own, enemy): + """return flip stones of enemy by bitboard when I place stone at pos. 
+ + :param pos: 0~63 + :param own: bitboard (0=top left, 63=bottom right) + :param enemy: bitboard + :return: flip stones of enemy when I place stone at pos. + """ + assert 0 <= pos <= 63, f"pos={pos}" + f1 = _calc_flip_half(pos, own, enemy) + f2 = _calc_flip_half(63 - pos, rotate180(own), rotate180(enemy)) + return f1 | rotate180(f2) + + +def _calc_flip_half(pos, own, enemy): + el = [enemy, enemy & 0x7e7e7e7e7e7e7e7e, enemy & 0x7e7e7e7e7e7e7e7e, enemy & 0x7e7e7e7e7e7e7e7e] + masks = [0x0101010101010100, 0x00000000000000fe, 0x0002040810204080, 0x8040201008040200] + masks = [b64(m << pos) for m in masks] + flipped = 0 + for e, mask in zip(el, masks): + outflank = mask & ((e | ~mask) + 1) & own + flipped |= (outflank - (outflank != 0)) & mask + return flipped + + +def search_offset_left(own, enemy, mask, offset): + e = enemy & mask + blank = ~(own | enemy) + t = e & (own >> offset) + t |= e & (t >> offset) + t |= e & (t >> offset) + t |= e & (t >> offset) + t |= e & (t >> offset) + t |= e & (t >> offset) # Up to six stones can be turned at once + return blank & (t >> offset) # Only the blank squares can be started + + +def search_offset_right(own, enemy, mask, offset): + e = enemy & mask + blank = ~(own | enemy) + t = e & (own << offset) + t |= e & (t << offset) + t |= e & (t << offset) + t |= e & (t << offset) + t |= e & (t << offset) + t |= e & (t << offset) # Up to six stones can be turned at once + return blank & (t << offset) # Only the blank squares can be started + + +def flip_vertical(x): + k1 = 0x00FF00FF00FF00FF + k2 = 0x0000FFFF0000FFFF + x = ((x >> 8) & k1) | ((x & k1) << 8) + x = ((x >> 16) & k2) | ((x & k2) << 16) + x = (x >> 32) | b64(x << 32) + return x + + +def b64(x): + return x & 0xFFFFFFFFFFFFFFFF + + +def bit_count(x): + return bin(x).count('1') + + +def bit_to_array(x, size): + """bit_to_array(0b0010, 4) -> array([0, 1, 0, 0])""" + return np.array(list(reversed((("0" * size) + bin(x)[2:])[-size:])), dtype=np.uint8) + + +def flip_diag_a1h8(x): + 
k1 = 0x5500550055005500 + k2 = 0x3333000033330000 + k4 = 0x0f0f0f0f00000000 + t = k4 & (x ^ b64(x << 28)) + x ^= t ^ (t >> 28) + t = k2 & (x ^ b64(x << 14)) + x ^= t ^ (t >> 14) + t = k1 & (x ^ b64(x << 7)) + x ^= t ^ (t >> 7) + return x + + +def rotate90(x): + return flip_diag_a1h8(flip_vertical(x)) + + +def rotate180(x): + return rotate90(rotate90(x)) + + +class Reversi: + def __init__(self, black=None, white=None): + self.black = black or (0b00001000 << 24 | 0b00010000 << 32) + self.white = white or (0b00010000 << 24 | 0b00001000 << 32) + self.board = None # 8 * 8 board with 1 for black, -1 for white and 0 for blank + self.color = None # 1 for black and -1 for white + self.action = None # number in 0~63 + self.winner = None + + def simulate_is_valid(self, board, color): + self.board = board + self.color = color + self.board2bitboard() + own, enemy = self.get_own_and_enemy() + mobility = find_correct_moves(own, enemy) + valid_moves = bit_to_array(mobility, 64) + valid_moves = list(np.reshape(valid_moves, len(valid_moves))) + return valid_moves + + def simulate_step_forward(self, board, color, vertex): + self.board = board + self.color = color + self.board2bitboard() + self.vertex2action(vertex) + step_forward = self.step() + if step_forward: + new_board = self.bitboard2board() + return new_board + + def executor_do_move(self, board, color, vertex): + self.board = board + self.color = color + self.board2bitboard() + self.vertex2action(vertex) + step_forward = self.step() + if step_forward: + new_board = self.bitboard2board() + return new_board + + def executor_get_score(self, board): + self.board = board + self._game_over() + if self.winner is not None: + return self.winner, 0 - self.winner + else: + ValueError("Game not finished!") + + def board2bitboard(self): + count = 1 + if self.board is None: + ValueError("None board!") + self.black = 0 + self.white = 0 + for i in range(64): + if self.board[i] == 1: + self.black |= count + elif self.board[i] == -1: + 
self.white |= count + count *= 2 + + def vertex2action(self, vertex): + x, y = vertex + if x == 0 and y == 0: + self.action = None + else: + self.action = 8 * (x - 1) + y - 1 + + def bitboard2board(self): + board = [] + black = bit_to_array(self.black, 64) + white = bit_to_array(self.white, 64) + for i in range(64): + if black[i]: + board.append(1) + elif white[i]: + board.append(-1) + else: + board.append(0) + return board + + def step(self): + if self.action < 0 or self.action > 63: + ValueError("Wrong action!") + if self.action is None: + return False + + own, enemy = self.get_own_and_enemy() + + flipped = calc_flip(self.action, own, enemy) + if bit_count(flipped) == 0: + self.illegal_move_to_lose(self.action) + return False + own ^= flipped + own |= 1 << self.action + enemy ^= flipped + + self.set_own_and_enemy(own, enemy) + return True + + def _game_over(self): + # self.done = True + if self.winner is None: + black_num, white_num = self.number_of_black_and_white + if black_num > white_num: + self.winner = 1 + elif black_num < white_num: + self.winner = -1 + else: + self.winner = 0 + + def illegal_move_to_lose(self, action): + logger.warning(f"Illegal action={action}, No Flipped!") + self._game_over() + + def get_own_and_enemy(self): + if self.color == 1: + own, enemy = self.black, self.white + elif self.color == -1: + own, enemy = self.white, self.black + else: + own, enemy = None, None + return own, enemy + + def set_own_and_enemy(self, own, enemy): + if self.color == 1: + self.black, self.white = own, enemy + else: + self.white, self.black = own, enemy + + @property + def number_of_black_and_white(self): + return bit_count(self.black), bit_count(self.white) From 2b1285143c232bc4006f47eabb498b99baf59785 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Fri, 22 Dec 2017 13:04:02 +0800 Subject: [PATCH 41/98] debug the training process, initialize a nameserver if no nameserver exists --- AlphaGo/model.py | 15 ++++++++++++--- AlphaGo/play.py | 
10 ++++++++-- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/AlphaGo/model.py b/AlphaGo/model.py index 41f3a47..541de81 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -203,7 +203,8 @@ class ResNet(object): iters = 0 while True: new_file_list = list(set(os.listdir(data_path)).difference(all_file_list)) - all_file_list = os.listdir(data_path) + if new_file_list: + all_file_list = os.listdir(data_path) new_file_list.sort( key=lambda file: os.path.getmtime(data_path + file) if not os.path.isdir(data_path + file) else 0) if new_file_list: @@ -241,8 +242,16 @@ class ResNet(object): self.saver.save(self.sess, self.checkpoint_path + save_path) def _file_to_training_data(self, file_name): - with open(file_name, 'r') as file: - data = cPickle.load(file) + read = False + with open(file_name, 'rb') as file: + while not read: + try: + file.seek(0) + data = cPickle.load(file) + read = True + except Exception as e: + print(e) + time.sleep(1) history = deque(maxlen=self.history_length) states = [] probs = [] diff --git a/AlphaGo/play.py b/AlphaGo/play.py index a8267a7..3681430 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -50,6 +50,12 @@ if __name__ == '__main__': # print "Start Name Sever : " + str(start_new_server.pid) # + str(start_new_server.wait()) # time.sleep(1) + # start a name server if no name server exists + if len(os.popen('ps aux | grep pyro4-ns | grep -v grep').readlines()) == 0: + start_new_server = subprocess.Popen(['pyro4-ns', '&']) + print "Start Name Sever : " + str(start_new_server.pid) # + str(start_new_server.wait()) + time.sleep(1) + # start two different player with different network weights. 
black_role_name = 'black' + str(args.id) white_role_name = 'white' + str(args.id) @@ -137,13 +143,13 @@ if __name__ == '__main__': file_list.sort(key=lambda file: os.path.getmtime(args.result_path + file) if not os.path.isdir( args.result_path + file) else 0) data_num = eval(file_list[-1][:-4]) + 1 - with open("./data/" + str(data_num) + ".pkl", "w") as file: + with open("./data/" + str(data_num) + ".pkl", "wb") as file: picklestring = cPickle.dump(data, file) data.reset() game_num += 1 except Exception as e: - print(e) + print(e) subprocess.call(["kill", "-9", str(agent_v0.pid)]) subprocess.call(["kill", "-9", str(agent_v1.pid)]) print "Kill all player, finish all game." From d281ecc6e082027e7f67341a0abf1c18dbacbae8 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Fri, 22 Dec 2017 13:05:01 +0800 Subject: [PATCH 42/98] no restrict on saving checkpoints --- AlphaGo/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AlphaGo/model.py b/AlphaGo/model.py index 541de81..5629128 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -133,7 +133,7 @@ class ResNet(object): with tf.control_dependencies(self.update_ops): self.train_op = tf.train.AdamOptimizer(1e-4).minimize(self.total_loss) self.var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) - self.saver = tf.train.Saver(var_list=self.var_list) + self.saver = tf.train.Saver(max_to_keep=0, var_list=self.var_list) self.sess = multi_gpu.create_session() self.sess.run(tf.global_variables_initializer()) if checkpoint_path is not None: From 6b3efd7fca0f4e2eb7ac4e63524a30976efc4361 Mon Sep 17 00:00:00 2001 From: rtz19970824 Date: Fri, 22 Dec 2017 13:30:48 +0800 Subject: [PATCH 43/98] modify the training config --- AlphaGo/model.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/AlphaGo/model.py b/AlphaGo/model.py index 41f3a47..0d885ef 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -101,8 +101,8 @@ class ResNet(object): 
self._build_network(residual_block_num, self.checkpoint_path) # training hyper-parameters: - self.window_length = 1000 - self.save_freq = 1000 + self.window_length = 7000 + self.save_freq = 5000 self.training_data = {'states': deque(maxlen=self.window_length), 'probs': deque(maxlen=self.window_length), 'winner': deque(maxlen=self.window_length)} @@ -241,6 +241,7 @@ class ResNet(object): self.saver.save(self.sess, self.checkpoint_path + save_path) def _file_to_training_data(self, file_name): + print(file_name) with open(file_name, 'r') as file: data = cPickle.load(file) history = deque(maxlen=self.history_length) @@ -267,4 +268,4 @@ class ResNet(object): if __name__=="__main__": model = ResNet(board_size=9, action_num=82) - model.train("file", data_path="./data/", batch_size=128, checkpoint_path="./checkpoint/") \ No newline at end of file + model.train("file", data_path="./data/", batch_size=128, checkpoint_path="./checkpoint/") From a8509ba2921795002bd88942bf58523aba80de99 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Fri, 22 Dec 2017 13:42:53 +0800 Subject: [PATCH 44/98] faster the loading --- AlphaGo/model.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/AlphaGo/model.py b/AlphaGo/model.py index 5629128..c4338c8 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -215,9 +215,10 @@ class ResNet(object): self.training_data['states'].append(states) self.training_data['probs'].append(probs) self.training_data['winner'].append(winner) - training_data['states'] = np.concatenate(self.training_data['states'], axis=0) - training_data['probs'] = np.concatenate(self.training_data['probs'], axis=0) - training_data['winner'] = np.concatenate(self.training_data['winner'], axis=0) + if len(self.training_data['states']) == self.window_length: + training_data['states'] = np.concatenate(self.training_data['states'], axis=0) + training_data['probs'] = np.concatenate(self.training_data['probs'], axis=0) + training_data['winner'] 
= np.concatenate(self.training_data['winner'], axis=0) if len(self.training_data['states']) != self.window_length: continue From 8328153b86871f36953605ebd89e17c001b3f537 Mon Sep 17 00:00:00 2001 From: rtz19970824 Date: Fri, 22 Dec 2017 13:47:27 +0800 Subject: [PATCH 45/98] print in the loading process --- AlphaGo/model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/AlphaGo/model.py b/AlphaGo/model.py index 15fc3da..e8b5eb9 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -249,6 +249,7 @@ class ResNet(object): file.seek(0) data = cPickle.load(file) read = True + print("{} Loaded".format(file_name)) except Exception as e: print(e) time.sleep(1) From 511f64b3d6ada98d4fe0e04215eea93d690f56a4 Mon Sep 17 00:00:00 2001 From: JialianLee Date: Fri, 22 Dec 2017 15:26:47 +0800 Subject: [PATCH 46/98] Modification for reversi --- AlphaGo/reversi.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/AlphaGo/reversi.py b/AlphaGo/reversi.py index 49d0e9a..cba91d9 100644 --- a/AlphaGo/reversi.py +++ b/AlphaGo/reversi.py @@ -34,7 +34,6 @@ def calc_flip(pos, own, enemy): :param enemy: bitboard :return: flip stones of enemy when I place stone at pos. 
""" - assert 0 <= pos <= 63, f"pos={pos}" f1 = _calc_flip_half(pos, own, enemy) f2 = _calc_flip_half(63 - pos, rotate180(own), rotate180(enemy)) return f1 | rotate180(f2) @@ -125,7 +124,14 @@ class Reversi: self.board = None # 8 * 8 board with 1 for black, -1 for white and 0 for blank self.color = None # 1 for black and -1 for white self.action = None # number in 0~63 - self.winner = None + # self.winner = None + self.black_win = None + + def get_board(self, black=None, white=None): + self.black = black or (0b00001000 << 24 | 0b00010000 << 32) + self.white = white or (0b00010000 << 24 | 0b00001000 << 32) + self.board = self.bitboard2board() + return self.board def simulate_is_valid(self, board, color): self.board = board @@ -134,18 +140,19 @@ class Reversi: own, enemy = self.get_own_and_enemy() mobility = find_correct_moves(own, enemy) valid_moves = bit_to_array(mobility, 64) + valid_moves = np.argwhere(valid_moves) valid_moves = list(np.reshape(valid_moves, len(valid_moves))) return valid_moves - def simulate_step_forward(self, board, color, vertex): - self.board = board - self.color = color + def simulate_step_forward(self, state, vertex): + self.board = state[0] + self.color = state[1] self.board2bitboard() self.vertex2action(vertex) step_forward = self.step() if step_forward: new_board = self.bitboard2board() - return new_board + return [new_board, 0 - self.color], 0 def executor_do_move(self, board, color, vertex): self.board = board @@ -155,13 +162,14 @@ class Reversi: step_forward = self.step() if step_forward: new_board = self.bitboard2board() - return new_board + for i in range(64): + board[i] = new_board[i] def executor_get_score(self, board): self.board = board self._game_over() - if self.winner is not None: - return self.winner, 0 - self.winner + if self.black_win is not None: + return self.black_win else: ValueError("Game not finished!") @@ -219,6 +227,7 @@ class Reversi: def _game_over(self): # self.done = True + ''' if self.winner is None: black_num, 
white_num = self.number_of_black_and_white if black_num > white_num: @@ -227,9 +236,12 @@ class Reversi: self.winner = -1 else: self.winner = 0 + ''' + if self.black_win is None: + black_num, white_num = self.number_of_black_and_white + self.black_win = black_num - white_num def illegal_move_to_lose(self, action): - logger.warning(f"Illegal action={action}, No Flipped!") self._game_over() def get_own_and_enemy(self): From c5e33af84173b4c5165e4a51600232daa1485cff Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Fri, 22 Dec 2017 15:44:44 +0800 Subject: [PATCH 47/98] move the unit test of is_eye into go.py --- AlphaGo/go.py | 39 +++++++ AlphaGo/unit_test.py | 266 ------------------------------------------- 2 files changed, 39 insertions(+), 266 deletions(-) delete mode 100644 AlphaGo/unit_test.py diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 9b7e21f..661d918 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -308,3 +308,42 @@ class Go: return score +if __name__ == "__main__": + ### do unit test for Go class + pure_test = [ + 0, 1, 0, 1, 0, 1, 0, 0, 0, + 1, 0, 1, 0, 1, 0, 0, 0, 0, + 0, 1, 0, 1, 0, 0, 1, 0, 0, + 0, 0, 1, 0, 0, 1, 0, 1, 0, + 0, 0, 0, 0, 0, 1, 1, 1, 0, + 1, 1, 1, 0, 0, 0, 0, 0, 0, + 1, 0, 1, 0, 0, 1, 1, 0, 0, + 1, 1, 1, 0, 1, 0, 1, 0, 0, + 0, 0, 0, 0, 1, 1, 1, 0, 0 + ] + + pt_qry = [(1, 1), (1, 5), (3, 3), (4, 7), (7, 2), (8, 6)] + pt_ans = [True, True, True, True, True, True] + + opponent_test = [ + 0, 1, 0, 1, 0, 1, 0,-1, 1, + 1,-1, 0,-1, 1,-1, 0, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, + 1, 1,-1, 0, 1,-1, 1, 0, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 0, + -1,1, 1, 0, 1, 1, 1, 0, 0, + 0, 1,-1, 0,-1,-1,-1, 0, 0, + 1, 0, 1, 0,-1, 0,-1, 0, 0, + 0, 1, 0, 0,-1,-1,-1, 0, 0 + ] + ot_qry = [(1, 1), (1, 5), (2, 9), (5, 2), (5, 6), (8, 6), (8, 2)] + ot_ans = [False, False, False, False, False, False, True] + + go = Go(size=9, komi=3.75) + for i in range(6): + print (go._is_eye(pure_test, utils.BLACK, pt_qry[i])) + print("Test of pure eye\n") + + for i in range(7): + print 
(go._is_eye(opponent_test, utils.BLACK, ot_qry[i])) + print("Test of eye surrend by opponents\n") diff --git a/AlphaGo/unit_test.py b/AlphaGo/unit_test.py deleted file mode 100644 index 7a33b8e..0000000 --- a/AlphaGo/unit_test.py +++ /dev/null @@ -1,266 +0,0 @@ -import numpy as np -import sys -from game import Game -from engine import GTPEngine -import utils -import time -import copy -import network_small -import tensorflow as tf -from collections import deque -from tianshou.core.mcts.mcts import MCTS - -DELTA = [[1, 0], [-1, 0], [0, -1], [0, 1]] -CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] - -class GoEnv: - def __init__(self, size=9, komi=6.5): - self.size = size - self.komi = komi - self.board = [utils.EMPTY] * (self.size * self.size) - self.history = deque(maxlen=8) - - def _set_board(self, board): - self.board = board - - def _flatten(self, vertex): - x, y = vertex - return (x - 1) * self.size + (y - 1) - - def _bfs(self, vertex, color, block, status, alive_break): - block.append(vertex) - status[self._flatten(vertex)] = True - nei = self._neighbor(vertex) - for n in nei: - if not status[self._flatten(n)]: - if self.board[self._flatten(n)] == color: - self._bfs(n, color, block, status, alive_break) - - def _find_block(self, vertex, alive_break=False): - block = [] - status = [False] * (self.size * self.size) - color = self.board[self._flatten(vertex)] - self._bfs(vertex, color, block, status, alive_break) - - for b in block: - for n in self._neighbor(b): - if self.board[self._flatten(n)] == utils.EMPTY: - return False, block - return True, block - - def _is_qi(self, color, vertex): - nei = self._neighbor(vertex) - for n in nei: - if self.board[self._flatten(n)] == utils.EMPTY: - return True - - self.board[self._flatten(vertex)] = color - for n in nei: - if self.board[self._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n) - if can_kill: - self.board[self._flatten(vertex)] = utils.EMPTY - return True - - ### avoid 
suicide - can_kill, block = self._find_block(vertex) - if can_kill: - self.board[self._flatten(vertex)] = utils.EMPTY - return False - - self.board[self._flatten(vertex)] = utils.EMPTY - return True - - def _check_global_isomorphous(self, color, vertex): - ##backup - _board = copy.copy(self.board) - self.board[self._flatten(vertex)] = color - self._process_board(color, vertex) - if self.board in self.history: - res = True - else: - res = False - - self.board = _board - return res - - def _in_board(self, vertex): - x, y = vertex - if x < 1 or x > self.size: return False - if y < 1 or y > self.size: return False - return True - - def _neighbor(self, vertex): - x, y = vertex - nei = [] - for d in DELTA: - _x = x + d[0] - _y = y + d[1] - if self._in_board((_x, _y)): - nei.append((_x, _y)) - return nei - - def _corner(self, vertex): - x, y = vertex - corner = [] - for d in CORNER_OFFSET: - _x = x + d[0] - _y = y + d[1] - if self._in_board((_x, _y)): - corner.append((_x, _y)) - return corner - - def _process_board(self, color, vertex): - nei = self._neighbor(vertex) - for n in nei: - if self.board[self._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n, alive_break=True) - if can_kill: - for b in block: - self.board[self._flatten(b)] = utils.EMPTY - - def _find_group(self, start): - color = self.board[self._flatten(start)] - #print ("color : ", color) - chain = set() - frontier = [start] - while frontier: - current = frontier.pop() - #print ("current : ", current) - chain.add(current) - for n in self._neighbor(current): - #print n, self._flatten(n), self.board[self._flatten(n)], - if self.board[self._flatten(n)] == color and not n in chain: - frontier.append(n) - return chain - - def _is_eye(self, color, vertex): - nei = self._neighbor(vertex) - cor = self._corner(vertex) - ncolor = {color == self.board[self._flatten(n)] for n in nei} - if False in ncolor: - #print "not all neighbors are in same color with us" - return False - if set(nei) 
< self._find_group(nei[0]): - #print "all neighbors are in same group and same color with us" - return True - else: - opponent_number = [self.board[self._flatten(c)] for c in cor].count(-color) - opponent_propotion = float(opponent_number) / float(len(cor)) - if opponent_propotion < 0.5: - #print "few opponents, real eye" - return True - else: - #print "many opponents, fake eye" - return False - - # def is_valid(self, color, vertex): - def is_valid(self, state, action): - # state is the play board, the shape is [1, 9, 9, 17] - if action == self.size * self.size: - vertex = (0, 0) - else: - vertex = (action / self.size + 1, action % self.size + 1) - if state[0, 0, 0, -1] == utils.BLACK: - color = utils.BLACK - else: - color = utils.WHITE - self.history.clear() - for i in range(8): - self.history.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) - self.board = copy.copy(self.history[-1]) - ### in board - if not self._in_board(vertex): - return False - - ### already have stone - if not self.board[self._flatten(vertex)] == utils.EMPTY: - # print(np.array(self.board).reshape(9, 9)) - # print(vertex) - return False - - ### check if it is qi - if not self._is_qi(color, vertex): - return False - - ### check if it is an eye of yourself - ### assumptions : notice that this judgement requires that the state is an endgame - #if self._is_eye(color, vertex): - # return False - - if self._check_global_isomorphous(color, vertex): - return False - - return True - - def do_move(self, color, vertex): - if vertex == utils.PASS: - return True - - id_ = self._flatten(vertex) - if self.board[id_] == utils.EMPTY: - self.board[id_] = color - self.history.append(copy.copy(self.board)) - return True - else: - return False - - def step_forward(self, state, action): - if state[0, 0, 0, -1] == 1: - color = 1 - else: - color = -1 - if action == 81: - vertex = (0, 0) - else: - vertex = (action % 9 + 1, action / 9 + 1) - # print(vertex) - # print(self.board) - self.board = 
(state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() - self.do_move(color, vertex) - new_state = np.concatenate( - [state[:, :, :, 1:8], (np.array(self.board) == 1).reshape(1, 9, 9, 1), - state[:, :, :, 9:16], (np.array(self.board) == -1).reshape(1, 9, 9, 1), - np.array(1 - state[:, :, :, -1]).reshape(1, 9, 9, 1)], - axis=3) - return new_state, 0 - - -pure_test = [ - 0, 1, 0, 1, 0, 1, 0, 0, 0, - 1, 0, 1, 0, 1, 0, 0, 0, 0, - 0, 1, 0, 1, 0, 0, 1, 0, 0, - 0, 0, 1, 0, 0, 1, 0, 1, 0, - 0, 0, 0, 0, 0, 1, 1, 1, 0, - 1, 1, 1, 0, 0, 0, 0, 0, 0, - 1, 0, 1, 0, 0, 1, 1, 0, 0, - 1, 1, 1, 0, 1, 0, 1, 0, 0, - 0, 0, 0, 0, 1, 1, 1, 0, 0 -] - -pt_qry = [(1, 1), (1, 5), (3, 3), (4, 7), (7, 2), (8, 6)] -pt_ans = [True, True, True, True, True, True] - -opponent_test = [ - 0, 1, 0, 1, 0, 1, 0,-1, 1, - 1,-1, 0,-1, 1,-1, 0, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 1, - 1, 1,-1, 0, 1,-1, 1, 0, 0, - 1, 0, 1, 0, 1, 0, 1, 0, 0, - -1, 1, 1, 0, 1, 1, 1, 0, 0, - 0, 1,-1, 0,-1,-1,-1, 0, 0, - 1, 0, 1, 0,-1, 0,-1, 0, 0, - 0, 1, 0, 0,-1,-1,-1, 0, 0 -] -ot_qry = [(1, 1), (1, 5), (2, 9), (5, 2), (5, 6), (8, 2), (8, 6)] -ot_ans = [False, False, False, False, False, True, False] - -#print (ge._find_group((6, 1))) -#print ge._is_eye(utils.BLACK, pt_qry[0]) -ge = GoEnv() -ge._set_board(pure_test) -for i in range(6): - print (ge._is_eye(utils.BLACK, pt_qry[i])) -ge._set_board(opponent_test) -for i in range(7): - print (ge._is_eye(utils.BLACK, ot_qry[i])) From 67ba76a04d42152c1c7ae6f3554b2e8683fca0d5 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Fri, 22 Dec 2017 17:16:44 +0800 Subject: [PATCH 48/98] implement a stochastic sample training method --- AlphaGo/game.py | 4 ++-- AlphaGo/model.py | 44 +++++++++++++++++++++++++------------------- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 8706572..df08c0a 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -31,7 +31,7 @@ class Game: self.latest_boards = deque(maxlen=8) for _ in 
range(8): self.latest_boards.append(self.board) - self.evaluator = model.ResNet(self.size, self.size**2 + 1, history_length=8) + self.evaluator = model.ResNet(self.size, self.size**2 + 1, history_length=8, checkpoint_path=checkpoint_path) # self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], # feed_dict={self.net.x: state, self.net.is_training: False}) self.game_engine = go.Go(size=self.size, komi=self.komi) @@ -96,7 +96,7 @@ class Game: sys.stdout.flush() if __name__ == "__main__": - g = Game() + g = Game(checkpoint_path='./checkpoints/') g.show_board() g.think_play_move(1) #file = open("debug.txt", "a") diff --git a/AlphaGo/model.py b/AlphaGo/model.py index 764ba5f..22e8626 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -1,5 +1,6 @@ import os import time +import random import sys import cPickle from collections import deque @@ -104,7 +105,7 @@ class ResNet(object): self.window_length = 7000 self.save_freq = 5000 self.training_data = {'states': deque(maxlen=self.window_length), 'probs': deque(maxlen=self.window_length), - 'winner': deque(maxlen=self.window_length)} + 'winner': deque(maxlen=self.window_length), 'length': deque(maxlen=self.window_length)} def _build_network(self, residual_block_num, checkpoint_path): """ @@ -199,15 +200,15 @@ class ResNet(object): new_file_list = [] all_file_list = [] - training_data = {} + training_data = {'states': [], 'probs': [], 'winner': []} + iters = 0 while True: new_file_list = list(set(os.listdir(data_path)).difference(all_file_list)) - if new_file_list: + while new_file_list: all_file_list = os.listdir(data_path) - new_file_list.sort( - key=lambda file: os.path.getmtime(data_path + file) if not os.path.isdir(data_path + file) else 0) - if new_file_list: + new_file_list.sort( + key=lambda file: os.path.getmtime(data_path + file) if not os.path.isdir(data_path + file) else 0) for file in new_file_list: states, probs, winner = self._file_to_training_data(data_path + file) assert 
states.shape[0] == probs.shape[0] @@ -215,32 +216,36 @@ class ResNet(object): self.training_data['states'].append(states) self.training_data['probs'].append(probs) self.training_data['winner'].append(winner) - if len(self.training_data['states']) == self.window_length: - training_data['states'] = np.concatenate(self.training_data['states'], axis=0) - training_data['probs'] = np.concatenate(self.training_data['probs'], axis=0) - training_data['winner'] = np.concatenate(self.training_data['winner'], axis=0) + self.training_data['length'].append(states.shape[0]) + new_file_list = list(set(os.listdir(data_path)).difference(all_file_list)) if len(self.training_data['states']) != self.window_length: continue else: - data_num = training_data['states'].shape[0] - index = np.arange(data_num) - np.random.shuffle(index) start_time = time.time() + for i in range(batch_size): + game_num = random.randint(0, self.window_length-1) + state_num = random.randint(0, self.training_data['length'][game_num]-1) + training_data['states'].append(np.expand_dims(self.training_data['states'][game_num][state_num], 0)) + training_data['probs'].append(np.expand_dims(self.training_data['probs'][game_num][state_num], 0)) + training_data['winner'].append(np.expand_dims(self.training_data['winner'][game_num][state_num], 0)) value_loss, policy_loss, reg, _ = self.sess.run( [self.value_loss, self.policy_loss, self.reg, self.train_op], - feed_dict={self.x: training_data['states'][index[:batch_size]], - self.z: training_data['winner'][index[:batch_size]], - self.pi: training_data['probs'][index[:batch_size]], + feed_dict={self.x: np.concatenate(training_data['states'], axis=0), + self.z: np.concatenate(training_data['winner'], axis=0), + self.pi: np.concatenate(training_data['probs'], axis=0), self.is_training: True}) + print("Iteration: {}, Time: {}, Value Loss: {}, Policy Loss: {}, Reg: {}".format(iters, time.time() - start_time, value_loss, policy_loss, reg)) - iters += 1 if iters % self.save_freq == 
0: save_path = "Iteration{}.ckpt".format(iters) self.saver.save(self.sess, self.checkpoint_path + save_path) + for key in training_data.keys(): + training_data[key] = [] + iters += 1 def _file_to_training_data(self, file_name): read = False @@ -250,6 +255,7 @@ class ResNet(object): file.seek(0) data = cPickle.load(file) read = True + print("{} Loaded!".format(file_name)) except Exception as e: print(e) time.sleep(1) @@ -275,6 +281,6 @@ class ResNet(object): return states, probs, winner -if __name__=="__main__": - model = ResNet(board_size=9, action_num=82) +if __name__ == "__main__": + model = ResNet(board_size=9, action_num=82, history_length=8) model.train("file", data_path="./data/", batch_size=128, checkpoint_path="./checkpoint/") From 3b534064bd6c92c972883d448c7c77fa0884e356 Mon Sep 17 00:00:00 2001 From: mcgrady00h <281130306@qq.com> Date: Sat, 23 Dec 2017 02:48:53 +0800 Subject: [PATCH 49/98] fix virtual loss bug --- tianshou/core/mcts/mcts.py | 22 +++-------- tianshou/core/mcts/mcts_virtual_loss.py | 41 ++++++++++---------- tianshou/core/mcts/mcts_virtual_loss_test.py | 6 +-- tianshou/core/mcts/utils.py | 21 ++++++++++ 4 files changed, 49 insertions(+), 41 deletions(-) create mode 100644 tianshou/core/mcts/utils.py diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 979e994..16d13d5 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -1,22 +1,9 @@ import numpy as np import math import time +import sys,os +from .utils import list2tuple, tuple2list -c_puct = 5 - - -def list2tuple(list): - try: - return tuple(list2tuple(sub) for sub in list) - except TypeError: - return list - - -def tuple2list(tuple): - try: - return list(tuple2list(sub) for sub in tuple) - except TypeError: - return tuple class MCTSNode(object): @@ -39,12 +26,13 @@ class MCTSNode(object): pass class UCTNode(MCTSNode): - def __init__(self, parent, action, state, action_num, prior, inverse=False): + def __init__(self, parent, action, state, 
action_num, prior, inverse=False, c_puct = 5): super(UCTNode, self).__init__(parent, action, state, action_num, prior, inverse) self.Q = np.zeros([action_num]) self.W = np.zeros([action_num]) self.N = np.zeros([action_num]) - self.ucb = self.Q + c_puct * self.prior * math.sqrt(np.sum(self.N)) / (self.N + 1) + self.c_puct = c_puct + self.ucb = self.Q + self.c_puct * self.prior * math.sqrt(np.sum(self.N)) / (self.N + 1) self.mask = None def selection(self, simulator): diff --git a/tianshou/core/mcts/mcts_virtual_loss.py b/tianshou/core/mcts/mcts_virtual_loss.py index 9d20b5a..9335464 100644 --- a/tianshou/core/mcts/mcts_virtual_loss.py +++ b/tianshou/core/mcts/mcts_virtual_loss.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # vim:fenc=utf-8 # $File: mcts_virtual_loss.py -# $Date: Tue Dec 19 17:0444 2017 +0800 +# $Date: Sat Dec 23 02:4850 2017 +0800 # Original file: mcts.py # $Author: renyong15 Ā© # @@ -12,25 +12,13 @@ manner. """ +from __future__ import absolute_import + import numpy as np import math import time - -c_puct = 5 - - -def list2tuple(list): - try: - return tuple(list2tuple(sub) for sub in list) - except TypeError: - return list - - -def tuple2list(tuple): - try: - return list(tuple2list(sub) for sub in tuple) - except TypeError: - return tuple +import sys,os +from .utils import list2tuple, tuple2list class MCTSNodeVirtualLoss(object): @@ -53,12 +41,13 @@ class MCTSNodeVirtualLoss(object): pass class UCTNodeVirtualLoss(MCTSNodeVirtualLoss): - def __init__(self, parent, action, state, action_num, prior, inverse=False): + def __init__(self, parent, action, state, action_num, prior, inverse=False, c_puct = 5): super(UCTNodeVirtualLoss, self).__init__(parent, action, state, action_num, prior, inverse) self.Q = np.zeros([action_num]) self.W = np.zeros([action_num]) self.N = np.zeros([action_num]) self.virtual_loss = np.zeros([action_num]) + self.c_puct = c_puct #### modified by adding virtual loss #self.ucb = self.Q + c_puct * self.prior * 
math.sqrt(np.sum(self.N)) / (self.N + 1) @@ -67,9 +56,9 @@ class UCTNodeVirtualLoss(MCTSNodeVirtualLoss): def selection(self, simulator): self.valid_mask(simulator) self.Q = np.zeros([self.action_num]) - N_not_zero = self.N > 0 - self.Q[N_not_zero] = (self.W[N_not_zero] + self.virtual_loss[N_not_zero] + 0.) / self.N[N_not_zero] - self.ucb = self.Q + c_puct * self.prior * math.sqrt(np.sum(self.N + self.virtual_loss)) /\ + N_not_zero = (self.N + self.virtual_loss) > 0 + self.Q[N_not_zero] = (self.W[N_not_zero] + 0.)/ (self.virtual_loss[N_not_zero] + self.N[N_not_zero]) + self.ucb = self.Q + self.c_puct * self.prior * math.sqrt(np.sum(self.N + self.virtual_loss)) /\ (self.N + self.virtual_loss + 1) action = np.argmax(self.ucb) self.virtual_loss[action] += 1 @@ -93,6 +82,7 @@ class UCTNodeVirtualLoss(MCTSNodeVirtualLoss): self.W[action] += self.children[action].reward ## do not need to compute Q and ucb immediately since it will be modified by virtual loss + ## just comment out and leaving for comparision #for i in range(self.action_num): # if self.N[i] != 0: # self.Q[i] = (self.W[i] + 0.) / self.N[i] @@ -186,6 +176,12 @@ class MCTSVirtualLoss(object): def do_search(self, max_step=None, max_time=None): + """ + Expand the MCTS tree with stop crierion either by max_step or max_time + + :param max_step search maximum minibath rounds. ONE step is ONE minibatch + :param max_time search maximum seconds + """ if max_step is not None: self.step = 0 self.max_step = max_step @@ -205,6 +201,9 @@ class MCTSVirtualLoss(object): self.step += 1 def expand(self): + """ + Core logic method for MCTS tree to expand nodes. 
+ """ ## minibatch with virtual loss nodes = [] new_actions = [] diff --git a/tianshou/core/mcts/mcts_virtual_loss_test.py b/tianshou/core/mcts/mcts_virtual_loss_test.py index d2d6c81..e4666f3 100644 --- a/tianshou/core/mcts/mcts_virtual_loss_test.py +++ b/tianshou/core/mcts/mcts_virtual_loss_test.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # vim:fenc=utf-8 # $File: mcts_virtual_loss_test.py -# $Date: Tue Dec 19 16:5459 2017 +0800 +# $Date: Sat Dec 23 02:2139 2017 +0800 # Original file: mcts_test.py # $Author: renyong15 Ā© # @@ -9,8 +9,8 @@ import numpy as np -from mcts_virtual_loss import MCTSVirtualLoss -from evaluator import rollout_policy +from .mcts_virtual_loss import MCTSVirtualLoss +from .evaluator import rollout_policy class TestEnv: diff --git a/tianshou/core/mcts/utils.py b/tianshou/core/mcts/utils.py new file mode 100644 index 0000000..de518a0 --- /dev/null +++ b/tianshou/core/mcts/utils.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +# vim:fenc=utf-8 +# $File: utils.py +# $Date: Sat Dec 23 02:0854 2017 +0800 +# $Author: renyong15 Ā© +# + +def list2tuple(list): + try: + return tuple(list2tuple(sub) for sub in list) + except TypeError: + return list + + +def tuple2list(tuple): + try: + return list(tuple2list(sub) for sub in tuple) + except TypeError: + return tuple + + From 032ea46b7b729ac09196f34463e2b46523848109 Mon Sep 17 00:00:00 2001 From: JialianLee Date: Sat, 23 Dec 2017 09:47:08 +0800 Subject: [PATCH 50/98] small modification --- AlphaGo/reversi.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/AlphaGo/reversi.py b/AlphaGo/reversi.py index cba91d9..320445e 100644 --- a/AlphaGo/reversi.py +++ b/AlphaGo/reversi.py @@ -171,12 +171,12 @@ class Reversi: if self.black_win is not None: return self.black_win else: - ValueError("Game not finished!") + raise ValueError("Game not finished!") def board2bitboard(self): count = 1 if self.board is None: - ValueError("None board!") + raise ValueError("None board!") self.black = 0 self.white 
= 0 for i in range(64): @@ -208,7 +208,7 @@ class Reversi: def step(self): if self.action < 0 or self.action > 63: - ValueError("Wrong action!") + raise ValueError("Wrong action!") if self.action is None: return False From b2ef770415ade966dcc29073973bfea3a447481b Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Sat, 23 Dec 2017 13:05:25 +0800 Subject: [PATCH 51/98] connect reversi with game --- AlphaGo/engine.py | 4 ++-- AlphaGo/game.py | 44 +++++++++++++++++++++++++------------- AlphaGo/go.py | 28 +++++++++++------------- AlphaGo/play.py | 1 - AlphaGo/reversi.py | 16 +++++++++----- AlphaGo/self-play.py | 2 +- tianshou/core/mcts/mcts.py | 2 +- 7 files changed, 57 insertions(+), 40 deletions(-) diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index 8b54470..98e5e61 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -183,7 +183,7 @@ class GTPEngine(): return 'unknown player', False def cmd_get_score(self, args, **kwargs): - return self._game.game_engine.executor_get_score(self._game.board, True), True + return self._game.game_engine.executor_get_score(self._game.board), True def cmd_show_board(self, args, **kwargs): return self._game.board, True @@ -194,4 +194,4 @@ class GTPEngine(): if __name__ == "main": game = Game() - engine = GTPEngine(game_obj=Game) + engine = GTPEngine(game_obj=game) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index df08c0a..ff1faf5 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -10,12 +10,14 @@ import copy import tensorflow as tf import numpy as np import sys, os -import go import model from collections import deque sys.path.append(os.path.join(os.path.dirname(__file__), os.path.pardir)) from tianshou.core.mcts.mcts import MCTS +import go +import reversi + class Game: ''' Load the real game and trained weights. @@ -23,18 +25,26 @@ class Game: TODO : Maybe merge with the engine class in future, currently leave it untouched for interacting with Go UI. 
''' - def __init__(self, size=9, komi=3.75, checkpoint_path=None): - self.size = size - self.komi = komi - self.board = [utils.EMPTY] * (self.size ** 2) - self.history = [] - self.latest_boards = deque(maxlen=8) - for _ in range(8): - self.latest_boards.append(self.board) - self.evaluator = model.ResNet(self.size, self.size**2 + 1, history_length=8, checkpoint_path=checkpoint_path) - # self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], - # feed_dict={self.net.x: state, self.net.is_training: False}) - self.game_engine = go.Go(size=self.size, komi=self.komi) + def __init__(self, name="go", checkpoint_path=None): + self.name = name + if "go" == name: + self.size = 9 + self.komi = 3.75 + self.board = [utils.EMPTY] * (self.size ** 2) + self.history = [] + self.latest_boards = deque(maxlen=8) + for _ in range(8): + self.latest_boards.append(self.board) + + self.evaluator = model.ResNet(self.size, self.size**2 + 1, history_length=8) + self.game_engine = go.Go(size=self.size, komi=self.komi) + elif "reversi" == name: + self.size = 8 + self.evaluator = model.ResNet(self.size, self.size**2 + 1, history_length=1) + self.game_engine = reversi.Reversi() + self.board = self.game_engine.get_board() + else: + print(name + " is an unknown game...") def clear(self): self.board = [utils.EMPTY] * (self.size ** 2) @@ -65,7 +75,11 @@ class Game: # this function can be called directly to play the opponent's move if vertex == utils.PASS: return True - res = self.game_engine.executor_do_move(self.history, self.latest_boards, self.board, color, vertex) + # TODO this implementation is not very elegant + if "go" == self.name: + res = self.game_engine.executor_do_move(self.history, self.latest_boards, self.board, color, vertex) + elif "revsersi" == self.name: + res = self.game_engine.executor_do_move(self.board, color, vertex) return res def think_play_move(self, color): @@ -96,7 +110,7 @@ class Game: sys.stdout.flush() if __name__ == "__main__": - g = 
Game(checkpoint_path='./checkpoints/') + g = Game() g.show_board() g.think_play_move(1) #file = open("debug.txt", "a") diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 661d918..b819c08 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -157,7 +157,7 @@ class Go: vertex = self._deflatten(action) return vertex - def _is_valid(self, history_boards, current_board, color, vertex): + def _rule_check(self, history_boards, current_board, color, vertex): ### in board if not self._in_board(vertex): return False @@ -176,30 +176,30 @@ class Go: return True - def simulate_is_valid(self, state, action): + def _is_valid(self, state, action): history_boards, color = state vertex = self._action2vertex(action) current_board = history_boards[-1] - if not self._is_valid(history_boards, current_board, color, vertex): + if not self._rule_check(history_boards, current_board, color, vertex): return False if not self._knowledge_prunning(current_board, color, vertex): return False return True - def simulate_is_valid_list(self, state, action_set): + def simulate_get_mask(self, state, action_set): # find all the invalid actions - invalid_action_list = [] + invalid_action_mask = [] for action_candidate in action_set[:-1]: # go through all the actions excluding pass - if not self.simulate_is_valid(state, action_candidate): - invalid_action_list.append(action_candidate) - if len(invalid_action_list) < len(action_set) - 1: - invalid_action_list.append(action_set[-1]) + if not self._is_valid(state, action_candidate): + invalid_action_mask.append(action_candidate) + if len(invalid_action_mask) < len(action_set) - 1: + invalid_action_mask.append(action_set[-1]) # forbid pass, if we have other choices # TODO: In fact we should not do this. In some extreme cases, we should permit pass. 
- return invalid_action_list + return invalid_action_mask def _do_move(self, board, color, vertex): if vertex == utils.PASS: @@ -219,7 +219,7 @@ class Go: return [history_boards, new_color], 0 def executor_do_move(self, history, latest_boards, current_board, color, vertex): - if not self._is_valid(history, current_board, color, vertex): + if not self._rule_check(history, current_board, color, vertex): return False current_board[self._flatten(vertex)] = color self._process_board(current_board, color, vertex) @@ -280,7 +280,7 @@ class Go: elif color_estimate < 0: return utils.WHITE - def executor_get_score(self, current_board, is_unknown_estimation=False): + def executor_get_score(self, current_board): ''' is_unknown_estimation: whether use nearby stone to predict the unknown return score from BLACK perspective. @@ -294,10 +294,8 @@ class Go: _board[self._flatten(vertex)] = utils.BLACK elif boarder_color == {utils.WHITE}: _board[self._flatten(vertex)] = utils.WHITE - elif is_unknown_estimation: - _board[self._flatten(vertex)] = self._predict_from_nearby(_board, vertex) else: - _board[self._flatten(vertex)] =utils.UNKNOWN + _board[self._flatten(vertex)] = self._predict_from_nearby(_board, vertex) score = 0 for i in _board: if i == utils.BLACK: diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 3681430..b601ada 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -7,7 +7,6 @@ import time import os import cPickle - class Data(object): def __init__(self): self.boards = [] diff --git a/AlphaGo/reversi.py b/AlphaGo/reversi.py index cba91d9..d67a882 100644 --- a/AlphaGo/reversi.py +++ b/AlphaGo/reversi.py @@ -25,7 +25,6 @@ def find_correct_moves(own, enemy): mobility |= search_offset_right(own, enemy, mask, 7) # Left bottom return mobility - def calc_flip(pos, own, enemy): """return flip stones of enemy by bitboard when I place stone at pos. 
@@ -133,7 +132,9 @@ class Reversi: self.board = self.bitboard2board() return self.board - def simulate_is_valid(self, board, color): + def simulate_get_mask(self, state, action_set): + history_boards, color = state + board = history_boards[-1] self.board = board self.color = color self.board2bitboard() @@ -142,13 +143,18 @@ class Reversi: valid_moves = bit_to_array(mobility, 64) valid_moves = np.argwhere(valid_moves) valid_moves = list(np.reshape(valid_moves, len(valid_moves))) - return valid_moves + # TODO it seems that the pass move is not considered + invalid_action_mask = [] + for action in action_set: + if action not in valid_moves: + invalid_action_mask.append(action) + return invalid_action_mask - def simulate_step_forward(self, state, vertex): + def simulate_step_forward(self, state, action): self.board = state[0] self.color = state[1] self.board2bitboard() - self.vertex2action(vertex) + self.action = action step_forward = self.step() if step_forward: new_board = self.bitboard2board() diff --git a/AlphaGo/self-play.py b/AlphaGo/self-play.py index 4387b24..dd03b13 100644 --- a/AlphaGo/self-play.py +++ b/AlphaGo/self-play.py @@ -79,7 +79,7 @@ while True: prob.append(np.array(game.prob).reshape(-1, game.size ** 2 + 1)) print("Finished") print("\n") - score = game.game_engine.executor_get_score(game.board, True) + score = game.game_engine.executor_get_score(game.board) if score > 0: winner = utils.BLACK else: diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 8bb5f06..e8f3709 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -73,7 +73,7 @@ class UCTNode(MCTSNode): def valid_mask(self, simulator): # let all invalid actions be illeagel in mcts if self.mask is None: - self.mask = simulator.simulate_is_valid_list(self.state, range(self.action_num)) + self.mask = simulator.simulate_get_mask(self.state, range(self.action_num)) self.ucb[self.mask] = -float("Inf") From b96fa9448bde1c42cd5a696568a30bda7bddf195 Mon Sep 17 
00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Sat, 23 Dec 2017 14:45:07 +0800 Subject: [PATCH 52/98] minor fixes --- .gitignore | 4 ++-- AlphaGo/game.py | 19 ++++++++++--------- AlphaGo/player.py | 2 +- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index d697b92..8ee6691 100644 --- a/.gitignore +++ b/.gitignore @@ -4,8 +4,8 @@ leela-zero parameters *.swp *.sublime* -checkpoints -checkpoints_origin +checkpoint *.json .DS_Store data +.log diff --git a/AlphaGo/game.py b/AlphaGo/game.py index ff1faf5..90d0bf0 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -27,29 +27,30 @@ class Game: ''' def __init__(self, name="go", checkpoint_path=None): self.name = name - if "go" == name: + if self.name == "go": self.size = 9 self.komi = 3.75 self.board = [utils.EMPTY] * (self.size ** 2) self.history = [] + self.history_length = 8 self.latest_boards = deque(maxlen=8) for _ in range(8): self.latest_boards.append(self.board) - - self.evaluator = model.ResNet(self.size, self.size**2 + 1, history_length=8) self.game_engine = go.Go(size=self.size, komi=self.komi) - elif "reversi" == name: + elif self.name == "reversi": self.size = 8 - self.evaluator = model.ResNet(self.size, self.size**2 + 1, history_length=1) + self.history_length = 1 self.game_engine = reversi.Reversi() self.board = self.game_engine.get_board() else: - print(name + " is an unknown game...") + raise ValueError(name + " is an unknown game...") + + self.evaluator = model.ResNet(self.size, self.size ** 2 + 1, history_length=self.history_length) def clear(self): self.board = [utils.EMPTY] * (self.size ** 2) self.history = [] - for _ in range(8): + for _ in range(self.history_length): self.latest_boards.append(self.board) def set_size(self, n): @@ -76,9 +77,9 @@ class Game: if vertex == utils.PASS: return True # TODO this implementation is not very elegant - if "go" == self.name: + if self.name == "go": res = self.game_engine.executor_do_move(self.history, 
self.latest_boards, self.board, color, vertex) - elif "revsersi" == self.name: + elif self.name == "reversi": res = self.game_engine.executor_do_move(self.board, color, vertex) return res diff --git a/AlphaGo/player.py b/AlphaGo/player.py index 0e3daff..e848d2b 100644 --- a/AlphaGo/player.py +++ b/AlphaGo/player.py @@ -34,7 +34,7 @@ if __name__ == '__main__': daemon = Pyro4.Daemon() # make a Pyro daemon ns = Pyro4.locateNS() # find the name server - player = Player(role = args.role, engine = engine) + player = Player(role=args.role, engine=engine) print "Init " + args.role + " player finished" uri = daemon.register(player) # register the greeting maker as a Pyro object print "Start on name " + args.role From 951eed60edeabbcd90ac465fc2df2050584a0238 Mon Sep 17 00:00:00 2001 From: haoshengzou Date: Sat, 23 Dec 2017 15:34:44 +0800 Subject: [PATCH 53/98] fix imports to support both python2 and python3. move contents from __init__.py to leave for work after major development. --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 9c3af16..fc7d494 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,11 @@ Tianshou(å¤©ęŽˆ) is a reinforcement learning platform. The following image illus +## examples + +During development, run examples under `./examples/` directory with, e.g. `python ppo_example.py`. +Running them under this directory with `python examples/ppo_example.py` will not work. + ## About coding style From 04048b78738d1092768c669f37fa63a9e1922d1a Mon Sep 17 00:00:00 2001 From: haoshengzou Date: Sat, 23 Dec 2017 15:36:10 +0800 Subject: [PATCH 54/98] fix imports to support both python2 and python3. move contents from __init__.py to leave for work after major development. 
--- examples/ppo_example.py | 7 +++---- tianshou/core/policy/__init__.py | 6 ------ tianshou/core/policy/base.py | 12 ++++++++++++ tianshou/core/policy/dqn.py | 18 ++++++++++++------ tianshou/core/value_function/action_value.py | 17 +++++++++++++---- tianshou/core/value_function/base.py | 5 ++++- tianshou/core/value_function/state_value.py | 8 +++++--- 7 files changed, 49 insertions(+), 24 deletions(-) diff --git a/examples/ppo_example.py b/examples/ppo_example.py index 02ccb52..985c8f2 100755 --- a/examples/ppo_example.py +++ b/examples/ppo_example.py @@ -1,17 +1,16 @@ #!/usr/bin/env python +from __future__ import absolute_import import tensorflow as tf -import numpy as np -import time import gym # our lib imports here! import sys sys.path.append('..') -import tianshou.core.losses as losses +from tianshou.core import losses from tianshou.data.batch import Batch import tianshou.data.advantage_estimation as advantage_estimation -import tianshou.core.policy as policy +import tianshou.core.policy.stochastic as policy # TODO: fix imports as zhusuan so that only need to import to policy def policy_net(observation, action_dim, scope=None): diff --git a/tianshou/core/policy/__init__.py b/tianshou/core/policy/__init__.py index ccde775..e69de29 100644 --- a/tianshou/core/policy/__init__.py +++ b/tianshou/core/policy/__init__.py @@ -1,6 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -from .base import * -from .stochastic import * -from .dqn import * \ No newline at end of file diff --git a/tianshou/core/policy/base.py b/tianshou/core/policy/base.py index 025abd5..1adeaeb 100644 --- a/tianshou/core/policy/base.py +++ b/tianshou/core/policy/base.py @@ -13,11 +13,23 @@ import tensorflow as tf __all__ = [ 'StochasticPolicy', 'QValuePolicy', + 'PolicyBase' ] # TODO: a even more "base" class for policy +class PolicyBase(object): + """ + base class for policy. 
only provides `act` method with exploration + """ + def __init__(self): + pass + + def act(self, observation, exploration): + raise NotImplementedError() + + class QValuePolicy(object): """ The policy as in DQN diff --git a/tianshou/core/policy/dqn.py b/tianshou/core/policy/dqn.py index d03dbd4..716e4c4 100644 --- a/tianshou/core/policy/dqn.py +++ b/tianshou/core/policy/dqn.py @@ -1,16 +1,22 @@ -from tianshou.core.policy.base import QValuePolicy +from __future__ import absolute_import + +from .base import PolicyBase import tensorflow as tf -import sys -sys.path.append('..') -import value_function.action_value as value_func +from ..value_function.action_value import DQN -class DQN_refactor(object): +class DQNRefactor(PolicyBase): """ use DQN from value_function as a member """ def __init__(self, value_tensor, observation_placeholder, action_placeholder): - self._network = value_func.DQN(value_tensor, observation_placeholder, action_placeholder) + self._network = DQN(value_tensor, observation_placeholder, action_placeholder) + self._argmax_action = tf.argmax(value_tensor, axis=1) + + def act(self, observation, exploration): + sess = tf.get_default_session() + if not exploration: # no exploration + action = sess.run(self._argmax_action, feed_dict={}) class DQN(QValuePolicy): diff --git a/tianshou/core/value_function/action_value.py b/tianshou/core/value_function/action_value.py index cb8acc8..2bda4fa 100644 --- a/tianshou/core/value_function/action_value.py +++ b/tianshou/core/value_function/action_value.py @@ -1,4 +1,6 @@ -from base import ValueFunctionBase +from __future__ import absolute_import + +from .base import ValueFunctionBase import tensorflow as tf @@ -15,7 +17,6 @@ class ActionValue(ValueFunctionBase): def get_value(self, observation, action): """ - :param observation: numpy array of observations, of shape (batchsize, observation_dim). :param action: numpy array of actions, of shape (batchsize, action_dim) # TODO: Atari discrete action should have dim 1. 
Super Mario may should have, say, dim 5, where each can be 0/1 @@ -24,7 +25,7 @@ class ActionValue(ValueFunctionBase): """ sess = tf.get_default_session() return sess.run(self.get_value_tensor(), feed_dict= - {self._observation_placeholder: observation, self._action_placeholder:action})[:, 0] + {self._observation_placeholder: observation, self._action_placeholder: action}) class DQN(ActionValue): @@ -39,13 +40,21 @@ class DQN(ActionValue): :param action_placeholder: of shape (batchsize, ) """ self._value_tensor_all_actions = value_tensor - canonical_value_tensor = value_tensor[action_placeholder] # maybe a tf.map_fn. for now it's wrong + + batch_size = tf.shape(value_tensor)[0] + batch_dim_index = tf.range(batch_size) + indices = tf.stack([batch_dim_index, action_placeholder], axis=1) + canonical_value_tensor = tf.gather_nd(value_tensor, indices) super(DQN, self).__init__(value_tensor=canonical_value_tensor, observation_placeholder=observation_placeholder, action_placeholder=action_placeholder) def get_value_all_actions(self, observation): + """ + :param observation: + :return: numpy array of Q(s, *) given s, of shape (batchsize, num_actions) + """ sess = tf.get_default_session() return sess.run(self._value_tensor_all_actions, feed_dict={self._observation_placeholder: observation}) diff --git a/tianshou/core/value_function/base.py b/tianshou/core/value_function/base.py index 0b27759..b15f1bf 100644 --- a/tianshou/core/value_function/base.py +++ b/tianshou/core/value_function/base.py @@ -1,3 +1,6 @@ +from __future__ import absolute_import + +import tensorflow as tf # TODO: linear feature baseline also in tf? 
class ValueFunctionBase(object): @@ -6,7 +9,7 @@ class ValueFunctionBase(object): """ def __init__(self, value_tensor, observation_placeholder): self._observation_placeholder = observation_placeholder - self._value_tensor = value_tensor + self._value_tensor = tf.squeeze(value_tensor) # canonical values has shape (batchsize, ) def get_value(self, **kwargs): """ diff --git a/tianshou/core/value_function/state_value.py b/tianshou/core/value_function/state_value.py index 04fe442..b7de196 100644 --- a/tianshou/core/value_function/state_value.py +++ b/tianshou/core/value_function/state_value.py @@ -1,4 +1,6 @@ -from base import ValueFunctionBase +from __future__ import absolute_import + +from .base import ValueFunctionBase import tensorflow as tf @@ -17,7 +19,7 @@ class StateValue(ValueFunctionBase): :param observation: numpy array of observations, of shape (batchsize, observation_dim). :return: numpy array of state values, of shape (batchsize, ) - # TODO: dealing with the last dim of 1 in V(s) and Q(s, a) + # TODO: dealing with the last dim of 1 in V(s) and Q(s, a), this should rely on the action shape returned by env """ sess = tf.get_default_session() - return sess.run(self.get_value_tensor(), feed_dict={self._observation_placeholder: observation})[:, 0] \ No newline at end of file + return sess.run(self.get_value_tensor(), feed_dict={self._observation_placeholder: observation}) \ No newline at end of file From 84208a7ac96058f1f7dca9fcb609f4641766ea6a Mon Sep 17 00:00:00 2001 From: JialianLee Date: Sat, 23 Dec 2017 15:43:45 +0800 Subject: [PATCH 55/98] Modification for reversi.py --- AlphaGo/reversi.py | 107 +++++++++++++++++++++++++++++---------------- 1 file changed, 70 insertions(+), 37 deletions(-) diff --git a/AlphaGo/reversi.py b/AlphaGo/reversi.py index c086a2c..ead6f4e 100644 --- a/AlphaGo/reversi.py +++ b/AlphaGo/reversi.py @@ -25,6 +25,7 @@ def find_correct_moves(own, enemy): mobility |= search_offset_right(own, enemy, mask, 7) # Left bottom return mobility 
+ def calc_flip(pos, own, enemy): """return flip stones of enemy by bitboard when I place stone at pos. @@ -123,8 +124,9 @@ class Reversi: self.board = None # 8 * 8 board with 1 for black, -1 for white and 0 for blank self.color = None # 1 for black and -1 for white self.action = None # number in 0~63 - # self.winner = None + self.winner = None self.black_win = None + self.size = 8 def get_board(self, black=None, white=None): self.black = black or (0b00001000 << 24 | 0b00010000 << 32) @@ -132,22 +134,29 @@ class Reversi: self.board = self.bitboard2board() return self.board + def is_valid(self, is_next=False): + self.board2bitboard() + own, enemy = self.get_own_and_enemy(is_next) + mobility = find_correct_moves(own, enemy) + valid_moves = bit_to_array(mobility, 64) + valid_moves = np.argwhere(valid_moves) + valid_moves = list(np.reshape(valid_moves, len(valid_moves))) + return valid_moves + def simulate_get_mask(self, state, action_set): history_boards, color = state board = history_boards[-1] self.board = board self.color = color - self.board2bitboard() - own, enemy = self.get_own_and_enemy() - mobility = find_correct_moves(own, enemy) - valid_moves = bit_to_array(mobility, 64) - valid_moves = np.argwhere(valid_moves) - valid_moves = list(np.reshape(valid_moves, len(valid_moves))) + valid_moves = self.is_valid() # TODO it seems that the pass move is not considered - invalid_action_mask = [] - for action in action_set: - if action not in valid_moves: - invalid_action_mask.append(action) + if not len(valid_moves): + invalid_action_mask = action_set[0:-1] + else: + invalid_action_mask = [] + for action in action_set: + if action not in valid_moves: + invalid_action_mask.append(action) return invalid_action_mask def simulate_step_forward(self, state, action): @@ -155,21 +164,34 @@ class Reversi: self.color = state[1] self.board2bitboard() self.action = action - step_forward = self.step() - if step_forward: - new_board = self.bitboard2board() - return [new_board, 0 - 
self.color], 0 + if self.action == 64: + valid_moves = self.is_valid(is_next=True) + if not len(valid_moves): + self._game_over() + return None, self.winner * self.color + else: + return [self.board, 0 - self.color], 0 + self.step() + new_board = self.bitboard2board() + return [new_board, 0 - self.color], 0 def executor_do_move(self, board, color, vertex): self.board = board self.color = color self.board2bitboard() - self.vertex2action(vertex) - step_forward = self.step() - if step_forward: + self.action = self._flatten(vertex) + if self.action == 64: + valid_moves = self.is_valid(is_next=True) + if not len(valid_moves): + return False + else: + return True + else: + self.step() new_board = self.bitboard2board() - for i in range(64): - board[i] = new_board[i] + for i in range(64): + board[i] = new_board[i] + return True def executor_get_score(self, board): self.board = board @@ -191,13 +213,14 @@ class Reversi: elif self.board[i] == -1: self.white |= count count *= 2 - + ''' def vertex2action(self, vertex): x, y = vertex if x == 0 and y == 0: self.action = None else: self.action = 8 * (x - 1) + y - 1 + ''' def bitboard2board(self): board = [] @@ -214,46 +237,45 @@ class Reversi: def step(self): if self.action < 0 or self.action > 63: - raise ValueError("Wrong action!") + raise ValueError("Action not in the range of [0,63]!") if self.action is None: - return False + raise ValueError("Action is None!") own, enemy = self.get_own_and_enemy() flipped = calc_flip(self.action, own, enemy) if bit_count(flipped) == 0: - self.illegal_move_to_lose(self.action) - return False + # self.illegal_move_to_lose(self.action) + raise ValueError("Illegal action!") own ^= flipped own |= 1 << self.action enemy ^= flipped - self.set_own_and_enemy(own, enemy) - return True def _game_over(self): # self.done = True - ''' + if self.winner is None: black_num, white_num = self.number_of_black_and_white - if black_num > white_num: + self.black_win = black_num - white_num + if self.black_win > 0: 
self.winner = 1 - elif black_num < white_num: + elif self.black_win < 0: self.winner = -1 else: self.winner = 0 - ''' - if self.black_win is None: - black_num, white_num = self.number_of_black_and_white def illegal_move_to_lose(self, action): self._game_over() - def get_own_and_enemy(self): - if self.color == 1: + def get_own_and_enemy(self, is_next=False): + if is_next: + color = 0 - self.color + else: + color = self.color + if color == 1: own, enemy = self.black, self.white - elif self.color == -1: + elif color == -1: own, enemy = self.white, self.black else: own, enemy = None, None @@ -265,6 +287,17 @@ class Reversi: else: self.white, self.black = own, enemy + def _deflatten(self, idx): + x = idx // self.size + 1 + y = idx % self.size + 1 + return (x, y) + + def _flatten(self, vertex): + x, y = vertex + if (x == 0) and (y == 0): + return 64 + return (x - 1) * self.size + (y - 1) + @property def number_of_black_and_white(self): return bit_count(self.black), bit_count(self.white) From 3f238864fbfe20843900de12513aec75b8a59943 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Sat, 23 Dec 2017 15:58:06 +0800 Subject: [PATCH 56/98] minor fixes for mcts, check finish for go --- AlphaGo/go.py | 13 ++++++++----- tianshou/core/mcts/mcts.py | 12 ++++++++---- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index b819c08..fe2ab74 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -212,11 +212,14 @@ class Go: def simulate_step_forward(self, state, action): # initialize the simulate_board from state history_boards, color = state - vertex = self._action2vertex(action) - new_board = self._do_move(copy.copy(history_boards[-1]), color, vertex) - history_boards.append(new_board) - new_color = -color - return [history_boards, new_color], 0 + if history_boards[-1] == history_boards[-2] and action is utils.PASS: + return None, 2 * (float(self.executor_get_score(history_boards[-1]) > 
0)-0.5) * color + else: + vertex = self._action2vertex(action) + new_board = self._do_move(copy.copy(history_boards[-1]), color, vertex) + history_boards.append(new_board) + new_color = -color + return [history_boards, new_color], 0 def executor_do_move(self, history, latest_boards, current_board, color, vertex): if not self._rule_check(history, current_board, color, vertex): diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index e8f3709..e99373c 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -38,6 +38,7 @@ class MCTSNode(object): def valid_mask(self, simulator): pass + class UCTNode(MCTSNode): def __init__(self, parent, action, state, action_num, prior, inverse=False): super(UCTNode, self).__init__(parent, action, state, action_num, prior, inverse) @@ -71,10 +72,13 @@ class UCTNode(MCTSNode): self.parent.backpropagation(self.children[action].reward) def valid_mask(self, simulator): - # let all invalid actions be illeagel in mcts - if self.mask is None: - self.mask = simulator.simulate_get_mask(self.state, range(self.action_num)) - self.ucb[self.mask] = -float("Inf") + # let all invalid actions be illegal in mcts + if not hasattr(simulator, 'simulate_get_mask'): + pass + else: + if self.mask is None: + self.mask = simulator.simulate_get_mask(self.state, range(self.action_num)) + self.ucb[self.mask] = -float("Inf") class TSNode(MCTSNode): From 4589fcf52194eccc219f82e36345573541511674 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Sat, 23 Dec 2017 16:27:09 +0800 Subject: [PATCH 57/98] add random preprocess, modify the uniform sample from training data --- AlphaGo/model.py | 72 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 65 insertions(+), 7 deletions(-) diff --git a/AlphaGo/model.py b/AlphaGo/model.py index 22e8626..68973ac 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -1,7 +1,6 @@ import os import time -import random -import sys +import copy import cPickle from 
collections import deque @@ -224,11 +223,21 @@ class ResNet(object): else: start_time = time.time() for i in range(batch_size): - game_num = random.randint(0, self.window_length-1) - state_num = random.randint(0, self.training_data['length'][game_num]-1) - training_data['states'].append(np.expand_dims(self.training_data['states'][game_num][state_num], 0)) - training_data['probs'].append(np.expand_dims(self.training_data['probs'][game_num][state_num], 0)) - training_data['winner'].append(np.expand_dims(self.training_data['winner'][game_num][state_num], 0)) + priority = self.training_data['length'] / sum(self.training_data['length']) + game_num = np.random.choice(self.window_length, 1, p=priority) + state_num = np.random.randint(self.training_data['length'][game_num]) + rotate_times = np.random.randint(4) + reflect_times = np.random.randint(2) + reflect_orientation = np.random.randint(2) + training_data['states'].append( + self._preprocession(self.training_data['states'][game_num][state_num], reflect_times, + reflect_orientation, rotate_times)) + training_data['probs'].append( + self._preprocession(self.training_data['probs'][game_num][state_num], reflect_times, + reflect_orientation, rotate_times)) + training_data['winner'].append( + self._preprocession(self.training_data['winner'][game_num][state_num], reflect_times, + reflect_orientation, rotate_times)) value_loss, policy_loss, reg, _ = self.sess.run( [self.value_loss, self.policy_loss, self.reg, self.train_op], feed_dict={self.x: np.concatenate(training_data['states'], axis=0), @@ -280,6 +289,55 @@ class ResNet(object): winner = np.concatenate(winner, axis=0) return states, probs, winner + def _preprocession(self, board, reflect_times=0, reflect_orientation=0, rotate_times=0): + """ + preprocessing for augmentation + + :param board: a ndarray, board to process + :param reflect_times: an integer, how many times to reflect + :param reflect_orientation: an integer, which orientation to reflect + :param rotate_times: 
an integer, how many times to rotate + :return: + """ + + new_board = copy.copy(board) + if new_board.ndim == 3: + np.expand_dims(new_board, axis=0) + + new_board = self._board_reflection(new_board, reflect_times, reflect_orientation) + new_board = self._board_rotation(new_board, rotate_times) + + return new_board + + def _board_rotation(self, board, times): + """ + rotate the board for augmentation + note that board's shape should be [batch_size, board_size, board_size, channels] + + :param board: a ndarray, shape [batch_size, board_size, board_size, channels] + :param times: an integer, how many times to rotate + :return: + """ + return np.rot90(board, times, (1, 2)) + + def _board_reflection(self, board, times, orientation): + """ + reflect the board for augmentation + note that board's shape should be [batch_size, board_size, board_size, channels] + + :param board: a ndarray, shape [batch_size, board_size, board_size, channels] + :param times: an integer, how many times to reflect + :param orientation: an integer, which orientation to reflect + :return: + """ + new_board = copy.copy(board) + for _ in range(times): + if orientation == 0: + new_board = new_board[:, ::-1] + if orientation == 1: + new_board = new_board[:, :, ::-1] + return new_board + if __name__ == "__main__": model = ResNet(board_size=9, action_num=82, history_length=8) From b21a55dc88fefe7773b842e87af2d6b3eaab821b Mon Sep 17 00:00:00 2001 From: haoshengzou Date: Sat, 23 Dec 2017 17:25:16 +0800 Subject: [PATCH 58/98] towards policy/value refactor --- examples/dqn_example.py | 11 +++++------ tianshou/core/README.md | 6 +++++- tianshou/core/losses.py | 7 +++---- tianshou/core/policy/base.py | 18 +++++------------- tianshou/core/policy/dqn.py | 17 +++++++++++++---- tianshou/core/policy/stochastic.py | 6 ------ tianshou/core/value_function/action_value.py | 9 +++++---- tianshou/core/value_function/base.py | 5 +++-- tianshou/core/value_function/state_value.py | 4 ++-- 9 files changed, 41 
insertions(+), 42 deletions(-) diff --git a/examples/dqn_example.py b/examples/dqn_example.py index b676475..cf20d66 100644 --- a/examples/dqn_example.py +++ b/examples/dqn_example.py @@ -1,8 +1,6 @@ #!/usr/bin/env python import tensorflow as tf -import numpy as np -import time import gym # our lib imports here! @@ -10,7 +8,7 @@ import sys sys.path.append('..') import tianshou.core.losses as losses from tianshou.data.replay_buffer.utils import get_replay_buffer -import tianshou.core.policy as policy +import tianshou.core.policy.dqn as policy def policy_net(observation, action_dim): @@ -41,6 +39,8 @@ if __name__ == '__main__': # pass the observation variable to the replay buffer or find a more reasonable way to help replay buffer # access this observation variable. observation = tf.placeholder(tf.float32, shape=(None,) + observation_dim, name="dqn_observation") # network input + action = tf.placeholder(dtype=tf.int32, shape=(None,)) # batch of integer actions + with tf.variable_scope('q_net'): q_values = policy_net(observation, action_dim) @@ -48,10 +48,9 @@ if __name__ == '__main__': q_values_target = policy_net(observation, action_dim) # 2. 
build losses, optimizers - q_net = policy.DQN(q_values, observation_placeholder=observation) # YongRen: policy.DQN - target_net = policy.DQN(q_values_target, observation_placeholder=observation) + q_net = policy.DQNRefactor(q_values, observation_placeholder=observation, action_placeholder=action) # YongRen: policy.DQN + target_net = policy.DQNRefactor(q_values_target, observation_placeholder=observation, action_placeholder=action) - action = tf.placeholder(dtype=tf.int32, shape=[None]) # batch of integer actions target = tf.placeholder(dtype=tf.float32, shape=[None]) # target value for DQN dqn_loss = losses.dqn_loss(action, target, q_net) # TongzhengRen diff --git a/tianshou/core/README.md b/tianshou/core/README.md index 3617525..a9cda58 100644 --- a/tianshou/core/README.md +++ b/tianshou/core/README.md @@ -21,4 +21,8 @@ referencing QValuePolicy in base.py, should have at least the listed methods. TongzhengRen -seems to be direct python functions. Though the management of placeholders may require some discussion. also may write it in a functional form. \ No newline at end of file +seems to be direct python functions. Though the management of placeholders may require some discussion. also may write it in a functional form. + +# policy, value_function + +naming should be reconsidered. Perhaps use plural forms for all nouns \ No newline at end of file diff --git a/tianshou/core/losses.py b/tianshou/core/losses.py index 3461afb..5d5d2f3 100644 --- a/tianshou/core/losses.py +++ b/tianshou/core/losses.py @@ -35,17 +35,16 @@ def vanilla_policy_gradient(sampled_action, reward, pi, baseline="None"): # TODO: Different baseline methods like REINFORCE, etc. 
return vanilla_policy_gradient_loss -def dqn_loss(sampled_action, sampled_target, q_net): +def dqn_loss(sampled_action, sampled_target, policy): """ deep q-network :param sampled_action: placeholder of sampled actions during the interaction with the environment :param sampled_target: estimated Q(s,a) - :param q_net: current `policy` to be optimized + :param policy: current `policy` to be optimized :return: """ - action_num = q_net.values_tensor().get_shape()[1] - sampled_q = tf.reduce_sum(q_net.values_tensor() * tf.one_hot(sampled_action, action_num), axis=1) + sampled_q = policy.q_net.value_tensor return tf.reduce_mean(tf.square(sampled_target - sampled_q)) def deterministic_policy_gradient(sampled_state, critic): diff --git a/tianshou/core/policy/base.py b/tianshou/core/policy/base.py index 1adeaeb..1c1e1c5 100644 --- a/tianshou/core/policy/base.py +++ b/tianshou/core/policy/base.py @@ -3,19 +3,12 @@ from __future__ import absolute_import from __future__ import division -import warnings import tensorflow as tf # from zhusuan.utils import add_name_scope -__all__ = [ - 'StochasticPolicy', - 'QValuePolicy', - 'PolicyBase' -] - # TODO: a even more "base" class for policy @@ -23,8 +16,8 @@ class PolicyBase(object): """ base class for policy. only provides `act` method with exploration """ - def __init__(self): - pass + def __init__(self, observation_placeholder): + self._observation_placeholder = observation_placeholder def act(self, observation, exploration): raise NotImplementedError() @@ -37,14 +30,14 @@ class QValuePolicy(object): def __init__(self, observation_placeholder): self._observation_placeholder = observation_placeholder - def act(self, observation, exploration=None): # first implement no exploration + def act(self, observation, exploration=None): # first implement no exploration """ return the action (int) to be executed. no exploration when exploration=None. 
""" self._act(observation, exploration) - def _act(self, observation, exploration = None): + def _act(self, observation, exploration=None): raise NotImplementedError() def values(self, observation): @@ -60,7 +53,6 @@ class QValuePolicy(object): pass - class StochasticPolicy(object): """ The :class:`Distribution` class is the base class for various probabilistic @@ -130,7 +122,7 @@ class StochasticPolicy(object): param_dtype, is_continuous, observation_placeholder, - group_ndims=0, # maybe useful for repeat_action + group_ndims=0, # maybe useful for repeat_action **kwargs): self._act_dtype = act_dtype diff --git a/tianshou/core/policy/dqn.py b/tianshou/core/policy/dqn.py index 716e4c4..8533549 100644 --- a/tianshou/core/policy/dqn.py +++ b/tianshou/core/policy/dqn.py @@ -10,16 +10,25 @@ class DQNRefactor(PolicyBase): use DQN from value_function as a member """ def __init__(self, value_tensor, observation_placeholder, action_placeholder): - self._network = DQN(value_tensor, observation_placeholder, action_placeholder) + self._q_net = DQN(value_tensor, observation_placeholder, action_placeholder) self._argmax_action = tf.argmax(value_tensor, axis=1) - def act(self, observation, exploration): + super(DQNRefactor, self).__init__(observation_placeholder=observation_placeholder) + + def act(self, observation, exploration=None): sess = tf.get_default_session() if not exploration: # no exploration - action = sess.run(self._argmax_action, feed_dict={}) + action = sess.run(self._argmax_action, feed_dict={self._observation_placeholder: observation}) -class DQN(QValuePolicy): + return action + + @property + def q_net(self): + return self._q_net + + +class DQNOld(QValuePolicy): """ The policy as in DQN """ diff --git a/tianshou/core/policy/stochastic.py b/tianshou/core/policy/stochastic.py index 3ef463e..d7a75d7 100644 --- a/tianshou/core/policy/stochastic.py +++ b/tianshou/core/policy/stochastic.py @@ -10,12 +10,6 @@ import tensorflow as tf from .base import StochasticPolicy 
-__all__ = [ - 'OnehotCategorical', - 'OnehotDiscrete', -] - - class OnehotCategorical(StochasticPolicy): """ The class of one-hot Categorical distribution. diff --git a/tianshou/core/value_function/action_value.py b/tianshou/core/value_function/action_value.py index 2bda4fa..c62dae6 100644 --- a/tianshou/core/value_function/action_value.py +++ b/tianshou/core/value_function/action_value.py @@ -15,7 +15,7 @@ class ActionValue(ValueFunctionBase): observation_placeholder=observation_placeholder ) - def get_value(self, observation, action): + def eval_value(self, observation, action): """ :param observation: numpy array of observations, of shape (batchsize, observation_dim). :param action: numpy array of actions, of shape (batchsize, action_dim) @@ -24,7 +24,7 @@ class ActionValue(ValueFunctionBase): # TODO: dealing with the last dim of 1 in V(s) and Q(s, a) """ sess = tf.get_default_session() - return sess.run(self.get_value_tensor(), feed_dict= + return sess.run(self.value_tensor, feed_dict= {self._observation_placeholder: observation, self._action_placeholder: action}) @@ -50,7 +50,7 @@ class DQN(ActionValue): observation_placeholder=observation_placeholder, action_placeholder=action_placeholder) - def get_value_all_actions(self, observation): + def eval_value_all_actions(self, observation): """ :param observation: :return: numpy array of Q(s, *) given s, of shape (batchsize, num_actions) @@ -58,5 +58,6 @@ class DQN(ActionValue): sess = tf.get_default_session() return sess.run(self._value_tensor_all_actions, feed_dict={self._observation_placeholder: observation}) - def get_value_tensor_all_actions(self): + @property + def value_tensor_all_actions(self): return self._value_tensor_all_actions \ No newline at end of file diff --git a/tianshou/core/value_function/base.py b/tianshou/core/value_function/base.py index b15f1bf..8ca9dd0 100644 --- a/tianshou/core/value_function/base.py +++ b/tianshou/core/value_function/base.py @@ -11,14 +11,15 @@ class 
ValueFunctionBase(object): self._observation_placeholder = observation_placeholder self._value_tensor = tf.squeeze(value_tensor) # canonical values has shape (batchsize, ) - def get_value(self, **kwargs): + def eval_value(self, **kwargs): """ :return: batch of corresponding values in numpy array """ raise NotImplementedError() - def get_value_tensor(self): + @property + def value_tensor(self): """ :return: tensor of the corresponding values diff --git a/tianshou/core/value_function/state_value.py b/tianshou/core/value_function/state_value.py index b7de196..02c12fe 100644 --- a/tianshou/core/value_function/state_value.py +++ b/tianshou/core/value_function/state_value.py @@ -14,7 +14,7 @@ class StateValue(ValueFunctionBase): observation_placeholder=observation_placeholder ) - def get_value(self, observation): + def eval_value(self, observation): """ :param observation: numpy array of observations, of shape (batchsize, observation_dim). @@ -22,4 +22,4 @@ class StateValue(ValueFunctionBase): # TODO: dealing with the last dim of 1 in V(s) and Q(s, a), this should rely on the action shape returned by env """ sess = tf.get_default_session() - return sess.run(self.get_value_tensor(), feed_dict={self._observation_placeholder: observation}) \ No newline at end of file + return sess.run(self.value_tensor, feed_dict={self._observation_placeholder: observation}) \ No newline at end of file From 919784e88b011028ff5e8b8e226974a9bbf8d75c Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Sat, 23 Dec 2017 17:43:33 +0800 Subject: [PATCH 59/98] bug fix of model.py --- AlphaGo/model.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/AlphaGo/model.py b/AlphaGo/model.py index 68973ac..2dc1ef0 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -101,7 +101,7 @@ class ResNet(object): self._build_network(residual_block_num, self.checkpoint_path) # training hyper-parameters: - self.window_length = 7000 + self.window_length = 3 self.save_freq = 5000 
self.training_data = {'states': deque(maxlen=self.window_length), 'probs': deque(maxlen=self.window_length), 'winner': deque(maxlen=self.window_length), 'length': deque(maxlen=self.window_length)} @@ -223,8 +223,8 @@ class ResNet(object): else: start_time = time.time() for i in range(batch_size): - priority = self.training_data['length'] / sum(self.training_data['length']) - game_num = np.random.choice(self.window_length, 1, p=priority) + priority = np.array(self.training_data['length']) / (0.0 + np.sum(np.array(self.training_data['length']))) + game_num = np.random.choice(self.window_length, 1, p=priority)[0] state_num = np.random.randint(self.training_data['length'][game_num]) rotate_times = np.random.randint(4) reflect_times = np.random.randint(2) @@ -232,12 +232,10 @@ class ResNet(object): training_data['states'].append( self._preprocession(self.training_data['states'][game_num][state_num], reflect_times, reflect_orientation, rotate_times)) - training_data['probs'].append( - self._preprocession(self.training_data['probs'][game_num][state_num], reflect_times, - reflect_orientation, rotate_times)) - training_data['winner'].append( - self._preprocession(self.training_data['winner'][game_num][state_num], reflect_times, - reflect_orientation, rotate_times)) + training_data['probs'].append(np.concatenate( + [self._preprocession(self.training_data['probs'][game_num][state_num][:-1].reshape(self.board_size, self.board_size, 1), reflect_times, + reflect_orientation, rotate_times).reshape(1, self.board_size**2), self.training_data['probs'][game_num][state_num][-1].reshape(1,1)], axis=1)) + training_data['winner'].append(self.training_data['winner'][game_num][state_num].reshape(1, 1)) value_loss, policy_loss, reg, _ = self.sess.run( [self.value_loss, self.policy_loss, self.reg, self.train_op], feed_dict={self.x: np.concatenate(training_data['states'], axis=0), @@ -302,7 +300,7 @@ class ResNet(object): new_board = copy.copy(board) if new_board.ndim == 3: - 
np.expand_dims(new_board, axis=0) + new_board = np.expand_dims(new_board, axis=0) new_board = self._board_reflection(new_board, reflect_times, reflect_orientation) new_board = self._board_rotation(new_board, rotate_times) From dcf293d63749e0d9febdc8bf9e2ea1795be112ba Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Sat, 23 Dec 2017 22:05:34 +0800 Subject: [PATCH 60/98] count the winning rate for each player --- AlphaGo/.gitignore | 1 + AlphaGo/data_statistic.py | 29 +++++++++++++++++++++++++++++ AlphaGo/game.py | 2 +- 3 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 AlphaGo/data_statistic.py diff --git a/AlphaGo/.gitignore b/AlphaGo/.gitignore index 9c2fe16..e578e5a 100644 --- a/AlphaGo/.gitignore +++ b/AlphaGo/.gitignore @@ -1,3 +1,4 @@ data checkpoints checkpoints_origin +*.log diff --git a/AlphaGo/data_statistic.py b/AlphaGo/data_statistic.py new file mode 100644 index 0000000..6fedf1c --- /dev/null +++ b/AlphaGo/data_statistic.py @@ -0,0 +1,29 @@ +import os +import cPickle + +class Data(object): + def __init__(self): + self.boards = [] + self.probs = [] + self.winner = 0 + +def file_to_training_data(file_name): + with open(file_name, 'rb') as file: + try: + file.seek(0) + data = cPickle.load(file) + return data.winner + except Exception as e: + print(e) + return 0 + +if __name__ == "__main__": + win_count = [0, 0, 0] + file_list = os.listdir("./data") + #print file_list + for file in file_list: + win_count[file_to_training_data("./data/" + file)] += 1 + print "Total play : " + str(len(file_list)) + print "Black wins : " + str(win_count[1]) + print "White wins : " + str(win_count[-1]) + diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 90d0bf0..9fc8fa2 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -62,7 +62,7 @@ class Game: def think(self, latest_boards, color): mcts = MCTS(self.game_engine, self.evaluator, [latest_boards, color], self.size ** 2 + 1, inverse=True) - mcts.search(max_step=20) + mcts.search(max_step=100) temp = 1 prob = 
mcts.root.N ** temp / np.sum(mcts.root.N ** temp) choice = np.random.choice(self.size ** 2 + 1, 1, p=prob).tolist()[0] From 162aa313b6b75f255b8690b9c809f4e2c5f81fd4 Mon Sep 17 00:00:00 2001 From: JialianLee Date: Sun, 24 Dec 2017 00:42:59 +0800 Subject: [PATCH 61/98] A new version of reversi --- AlphaGo/reversi.py | 505 ++++++++++++++++++--------------------------- 1 file changed, 202 insertions(+), 303 deletions(-) diff --git a/AlphaGo/reversi.py b/AlphaGo/reversi.py index ead6f4e..4fa1468 100644 --- a/AlphaGo/reversi.py +++ b/AlphaGo/reversi.py @@ -1,303 +1,202 @@ -from __future__ import print_function -import numpy as np - -''' -Settings of the Go game. - -(1, 1) is considered as the upper left corner of the board, -(size, 1) is the lower left -''' - - -def find_correct_moves(own, enemy): - """return legal moves""" - left_right_mask = 0x7e7e7e7e7e7e7e7e # Both most left-right edge are 0, else 1 - top_bottom_mask = 0x00ffffffffffff00 # Both most top-bottom edge are 0, else 1 - mask = left_right_mask & top_bottom_mask - mobility = 0 - mobility |= search_offset_left(own, enemy, left_right_mask, 1) # Left - mobility |= search_offset_left(own, enemy, mask, 9) # Left Top - mobility |= search_offset_left(own, enemy, top_bottom_mask, 8) # Top - mobility |= search_offset_left(own, enemy, mask, 7) # Top Right - mobility |= search_offset_right(own, enemy, left_right_mask, 1) # Right - mobility |= search_offset_right(own, enemy, mask, 9) # Bottom Right - mobility |= search_offset_right(own, enemy, top_bottom_mask, 8) # Bottom - mobility |= search_offset_right(own, enemy, mask, 7) # Left bottom - return mobility - - -def calc_flip(pos, own, enemy): - """return flip stones of enemy by bitboard when I place stone at pos. - - :param pos: 0~63 - :param own: bitboard (0=top left, 63=bottom right) - :param enemy: bitboard - :return: flip stones of enemy when I place stone at pos. 
- """ - f1 = _calc_flip_half(pos, own, enemy) - f2 = _calc_flip_half(63 - pos, rotate180(own), rotate180(enemy)) - return f1 | rotate180(f2) - - -def _calc_flip_half(pos, own, enemy): - el = [enemy, enemy & 0x7e7e7e7e7e7e7e7e, enemy & 0x7e7e7e7e7e7e7e7e, enemy & 0x7e7e7e7e7e7e7e7e] - masks = [0x0101010101010100, 0x00000000000000fe, 0x0002040810204080, 0x8040201008040200] - masks = [b64(m << pos) for m in masks] - flipped = 0 - for e, mask in zip(el, masks): - outflank = mask & ((e | ~mask) + 1) & own - flipped |= (outflank - (outflank != 0)) & mask - return flipped - - -def search_offset_left(own, enemy, mask, offset): - e = enemy & mask - blank = ~(own | enemy) - t = e & (own >> offset) - t |= e & (t >> offset) - t |= e & (t >> offset) - t |= e & (t >> offset) - t |= e & (t >> offset) - t |= e & (t >> offset) # Up to six stones can be turned at once - return blank & (t >> offset) # Only the blank squares can be started - - -def search_offset_right(own, enemy, mask, offset): - e = enemy & mask - blank = ~(own | enemy) - t = e & (own << offset) - t |= e & (t << offset) - t |= e & (t << offset) - t |= e & (t << offset) - t |= e & (t << offset) - t |= e & (t << offset) # Up to six stones can be turned at once - return blank & (t << offset) # Only the blank squares can be started - - -def flip_vertical(x): - k1 = 0x00FF00FF00FF00FF - k2 = 0x0000FFFF0000FFFF - x = ((x >> 8) & k1) | ((x & k1) << 8) - x = ((x >> 16) & k2) | ((x & k2) << 16) - x = (x >> 32) | b64(x << 32) - return x - - -def b64(x): - return x & 0xFFFFFFFFFFFFFFFF - - -def bit_count(x): - return bin(x).count('1') - - -def bit_to_array(x, size): - """bit_to_array(0b0010, 4) -> array([0, 1, 0, 0])""" - return np.array(list(reversed((("0" * size) + bin(x)[2:])[-size:])), dtype=np.uint8) - - -def flip_diag_a1h8(x): - k1 = 0x5500550055005500 - k2 = 0x3333000033330000 - k4 = 0x0f0f0f0f00000000 - t = k4 & (x ^ b64(x << 28)) - x ^= t ^ (t >> 28) - t = k2 & (x ^ b64(x << 14)) - x ^= t ^ (t >> 14) - t = k1 & (x ^ 
b64(x << 7)) - x ^= t ^ (t >> 7) - return x - - -def rotate90(x): - return flip_diag_a1h8(flip_vertical(x)) - - -def rotate180(x): - return rotate90(rotate90(x)) - - -class Reversi: - def __init__(self, black=None, white=None): - self.black = black or (0b00001000 << 24 | 0b00010000 << 32) - self.white = white or (0b00010000 << 24 | 0b00001000 << 32) - self.board = None # 8 * 8 board with 1 for black, -1 for white and 0 for blank - self.color = None # 1 for black and -1 for white - self.action = None # number in 0~63 - self.winner = None - self.black_win = None - self.size = 8 - - def get_board(self, black=None, white=None): - self.black = black or (0b00001000 << 24 | 0b00010000 << 32) - self.white = white or (0b00010000 << 24 | 0b00001000 << 32) - self.board = self.bitboard2board() - return self.board - - def is_valid(self, is_next=False): - self.board2bitboard() - own, enemy = self.get_own_and_enemy(is_next) - mobility = find_correct_moves(own, enemy) - valid_moves = bit_to_array(mobility, 64) - valid_moves = np.argwhere(valid_moves) - valid_moves = list(np.reshape(valid_moves, len(valid_moves))) - return valid_moves - - def simulate_get_mask(self, state, action_set): - history_boards, color = state - board = history_boards[-1] - self.board = board - self.color = color - valid_moves = self.is_valid() - # TODO it seems that the pass move is not considered - if not len(valid_moves): - invalid_action_mask = action_set[0:-1] - else: - invalid_action_mask = [] - for action in action_set: - if action not in valid_moves: - invalid_action_mask.append(action) - return invalid_action_mask - - def simulate_step_forward(self, state, action): - self.board = state[0] - self.color = state[1] - self.board2bitboard() - self.action = action - if self.action == 64: - valid_moves = self.is_valid(is_next=True) - if not len(valid_moves): - self._game_over() - return None, self.winner * self.color - else: - return [self.board, 0 - self.color], 0 - self.step() - new_board = 
self.bitboard2board() - return [new_board, 0 - self.color], 0 - - def executor_do_move(self, board, color, vertex): - self.board = board - self.color = color - self.board2bitboard() - self.action = self._flatten(vertex) - if self.action == 64: - valid_moves = self.is_valid(is_next=True) - if not len(valid_moves): - return False - else: - return True - else: - self.step() - new_board = self.bitboard2board() - for i in range(64): - board[i] = new_board[i] - return True - - def executor_get_score(self, board): - self.board = board - self._game_over() - if self.black_win is not None: - return self.black_win - else: - raise ValueError("Game not finished!") - - def board2bitboard(self): - count = 1 - if self.board is None: - raise ValueError("None board!") - self.black = 0 - self.white = 0 - for i in range(64): - if self.board[i] == 1: - self.black |= count - elif self.board[i] == -1: - self.white |= count - count *= 2 - ''' - def vertex2action(self, vertex): - x, y = vertex - if x == 0 and y == 0: - self.action = None - else: - self.action = 8 * (x - 1) + y - 1 - ''' - - def bitboard2board(self): - board = [] - black = bit_to_array(self.black, 64) - white = bit_to_array(self.white, 64) - for i in range(64): - if black[i]: - board.append(1) - elif white[i]: - board.append(-1) - else: - board.append(0) - return board - - def step(self): - if self.action < 0 or self.action > 63: - raise ValueError("Action not in the range of [0,63]!") - if self.action is None: - raise ValueError("Action is None!") - - own, enemy = self.get_own_and_enemy() - - flipped = calc_flip(self.action, own, enemy) - if bit_count(flipped) == 0: - # self.illegal_move_to_lose(self.action) - raise ValueError("Illegal action!") - own ^= flipped - own |= 1 << self.action - enemy ^= flipped - self.set_own_and_enemy(own, enemy) - - def _game_over(self): - # self.done = True - - if self.winner is None: - black_num, white_num = self.number_of_black_and_white - self.black_win = black_num - white_num - if 
self.black_win > 0: - self.winner = 1 - elif self.black_win < 0: - self.winner = -1 - else: - self.winner = 0 - - def illegal_move_to_lose(self, action): - self._game_over() - - def get_own_and_enemy(self, is_next=False): - if is_next: - color = 0 - self.color - else: - color = self.color - if color == 1: - own, enemy = self.black, self.white - elif color == -1: - own, enemy = self.white, self.black - else: - own, enemy = None, None - return own, enemy - - def set_own_and_enemy(self, own, enemy): - if self.color == 1: - self.black, self.white = own, enemy - else: - self.white, self.black = own, enemy - - def _deflatten(self, idx): - x = idx // self.size + 1 - y = idx % self.size + 1 - return (x, y) - - def _flatten(self, vertex): - x, y = vertex - if (x == 0) and (y == 0): - return 64 - return (x - 1) * self.size + (y - 1) - - @property - def number_of_black_and_white(self): - return bit_count(self.black), bit_count(self.white) +import numpy as np +''' +Settings of the Reversi game. + +(1, 1) is considered as the upper left corner of the board, +(size, 1) is the lower left +''' + + +class Reversi: + def __init__(self, black=None, white=None): + self.board = None # 8 * 8 board with 1 for black, -1 for white and 0 for blank + self.color = None # 1 for black and -1 for white + self.action = None # number in 0~63 + self.winner = None + self.black_win = None + self.size = 8 + + def _deflatten(self, idx): + x = idx // self.size + 1 + y = idx % self.size + 1 + return (x, y) + + def _flatten(self, vertex): + x, y = vertex + if (x == 0) and (y == 0): + return 64 + return (x - 1) * self.size + (y - 1) + + def get_board(self, board=None): + self.board = board or np.zeros([8,8]) + self.board[3, 3] = -1 + self.board[4, 4] = -1 + self.board[3, 4] = 1 + self.board[4, 3] = 1 + return self.board + + def _find_correct_moves(self, is_next=False): + moves = [] + if is_next: + color = 0 - self.color + else: + color = self.color + for i in range(64): + x, y = self._deflatten(i) + valid 
= self._is_valid(x - 1, y - 1, color) + if valid: + moves.append(i) + return moves + + def _one_direction_valid(self, x, y, color): + if (x >= 0) and (x < self.size): + if (y >= 0) and (y < self.size): + if self.board[x, y] == color: + return True + return False + + def _is_valid(self, x, y, color): + if self.board[x, y]: + return False + for x_direction in [-1, 0, 1]: + for y_direction in [-1, 0, 1]: + new_x = x + new_y = y + flag = 0 + while True: + new_x += x_direction + new_y += y_direction + if self._one_direction_valid(new_x, new_y, 0 - color): + flag = 1 + else: + break + if self._one_direction_valid(new_x, new_y, color) and flag: + return True + return False + + def simulate_get_mask(self, state, action_set): + history_boards, color = state + self.board = np.reshape(history_boards[-1], (self.size, self.size)) + self.color = color + valid_moves = self._find_correct_moves() + print(valid_moves) + if not len(valid_moves): + invalid_action_mask = action_set[0:-1] + else: + invalid_action_mask = [] + for action in action_set: + if action not in valid_moves: + invalid_action_mask.append(action) + return invalid_action_mask + + def simulate_step_forward(self, state, action): + self.board = state[0].copy() + self.board = np.reshape(self.board, (self.size, self.size)) + self.color = state[1] + self.action = action + if self.action == 64: + valid_moves = self._find_correct_moves(is_next=True) + if not len(valid_moves): + self._game_over() + return None, self.winner * self.color + else: + return [self.board, 0 - self.color], 0 + self._step() + return [self.board, 0 - self.color], 0 + + def _game_over(self): + black_num, white_num = self._number_of_black_and_white() + self.black_win = black_num - white_num + if self.black_win > 0: + self.winner = 1 + elif self.black_win < 0: + self.winner = -1 + else: + self.winner = 0 + + def _number_of_black_and_white(self): + black_num = 0 + white_num = 0 + board_list = np.reshape(self.board, self.size ** 2) + for i in 
range(len(board_list)): + if board_list[i] == 1: + black_num += 1 + elif board_list[i] == -1: + white_num += 1 + return black_num, white_num + + def _step(self): + if self.action < 0 or self.action > 63: + raise ValueError("Action not in the range of [0,63]!") + if self.action is None: + raise ValueError("Action is None!") + x, y = self._deflatten(self.action) + valid = self._flip(x -1, y - 1) + if not valid: + raise ValueError("Illegal action!") + + def _flip(self, x, y): + valid = 0 + self.board[x, y] = self.color + for x_direction in [-1, 0, 1]: + for y_direction in [-1, 0, 1]: + new_x = x + new_y = y + flag = 0 + while True: + new_x += x_direction + new_y += y_direction + if self._one_direction_valid(new_x, new_y, 0 - self.color): + flag = 1 + else: + break + if self._one_direction_valid(new_x, new_y, self.color) and flag: + valid = 1 + flip_x = x + flip_y = y + while True: + flip_x += x_direction + flip_y += y_direction + if self._one_direction_valid(flip_x, flip_y, 0 - self.color): + self.board[flip_x, flip_y] = self.color + else: + break + if valid: + return True + else: + return False + + def executor_do_move(self, history, latest_boards, board, color, vertex): + self.board = np.reshape(board, (self.size, self.size)) + self.color = color + self.action = self._flatten(vertex) + if self.action == 64: + valid_moves = self._find_correct_moves(is_next=True) + if not len(valid_moves): + return False + else: + return True + else: + self._step() + return True + + def executor_get_score(self, board): + self.board = board + self._game_over() + if self.black_win is not None: + return self.black_win + else: + raise ValueError("Game not finished!") + + +if __name__ == "__main__": + reversi = Reversi() + # board = reversi.get_board() + # print(board) + # state, value = reversi.simulate_step_forward([board, -1], 20) + # print(state[0]) + # print("board") + # print(board) + # r = reversi.executor_get_score(board) + # print(r) + From 426251e15852e894a0ac200838fd8dec3078f62c 
Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Sun, 24 Dec 2017 01:07:46 +0800 Subject: [PATCH 62/98] add some code for debug and profiling --- AlphaGo/game.py | 10 +++++++--- AlphaGo/go.py | 1 + AlphaGo/model.py | 3 +++ AlphaGo/play.py | 11 ++++++++--- AlphaGo/player.py | 6 +++++- tianshou/core/mcts/mcts.py | 40 ++++++++++++++++++++++++++++++++++---- 6 files changed, 60 insertions(+), 11 deletions(-) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 9fc8fa2..442cb73 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -17,6 +17,7 @@ from tianshou.core.mcts.mcts import MCTS import go import reversi +import time class Game: ''' @@ -25,8 +26,10 @@ class Game: TODO : Maybe merge with the engine class in future, currently leave it untouched for interacting with Go UI. ''' - def __init__(self, name="go", checkpoint_path=None): + def __init__(self, name="go", role="unknown", debug=False, checkpoint_path=None): self.name = name + self.role = role + self.debug = debug if self.name == "go": self.size = 9 self.komi = 3.75 @@ -36,7 +39,7 @@ class Game: self.latest_boards = deque(maxlen=8) for _ in range(8): self.latest_boards.append(self.board) - self.game_engine = go.Go(size=self.size, komi=self.komi) + self.game_engine = go.Go(size=self.size, komi=self.komi, role=self.role) elif self.name == "reversi": self.size = 8 self.history_length = 1 @@ -61,7 +64,8 @@ class Game: self.komi = k def think(self, latest_boards, color): - mcts = MCTS(self.game_engine, self.evaluator, [latest_boards, color], self.size ** 2 + 1, inverse=True) + mcts = MCTS(self.game_engine, self.evaluator, [latest_boards, color], + self.size ** 2 + 1, role=self.role, debug=self.debug, inverse=True) mcts.search(max_step=100) temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index fe2ab74..833b01f 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -18,6 +18,7 @@ class Go: def __init__(self, **kwargs): self.size = kwargs['size'] self.komi = 
kwargs['komi'] + self.role = kwargs['role'] def _flatten(self, vertex): x, y = vertex diff --git a/AlphaGo/model.py b/AlphaGo/model.py index 2dc1ef0..2a620f9 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -152,6 +152,9 @@ class ResNet(object): :param color: a string, indicate which one to play :return: a list of tensor, the predicted value and policy given the history and color """ + # Note : maybe we can use it for isolating test of MCTS + #prob = [1.0 / self.action_num] * self.action_num + #return [prob, np.random.uniform(-1, 1)] history, color = state if len(history) != self.history_length: raise ValueError( diff --git a/AlphaGo/play.py b/AlphaGo/play.py index b601ada..9144a40 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -28,6 +28,7 @@ if __name__ == '__main__': parser.add_argument("--black_weight_path", type=str, default=None) parser.add_argument("--white_weight_path", type=str, default=None) parser.add_argument("--id", type=int, default=0) + parser.add_argument("--debug", type=bool, default=False) args = parser.parse_args() if not os.path.exists(args.result_path): @@ -60,11 +61,13 @@ if __name__ == '__main__': white_role_name = 'white' + str(args.id) agent_v0 = subprocess.Popen( - ['python', '-u', 'player.py', '--role=' + black_role_name, '--checkpoint_path=' + str(args.black_weight_path)], + ['python', '-u', 'player.py', '--role=' + black_role_name, + '--checkpoint_path=' + str(args.black_weight_path), '--debug=' + str(args.debug)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) agent_v1 = subprocess.Popen( - ['python', '-u', 'player.py', '--role=' + white_role_name, '--checkpoint_path=' + str(args.white_weight_path)], + ['python', '-u', 'player.py', '--role=' + white_role_name, + '--checkpoint_path=' + str(args.black_weight_path), '--debug=' + str(args.debug)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) server_list = "" @@ -92,7 +95,8 @@ if __name__ == '__main__': evaluate_rounds = 1 
game_num = 0 try: - while True: + #while True: + while game_num < evaluate_rounds: start_time = time.time() num = 0 pass_flag = [False, False] @@ -107,6 +111,7 @@ if __name__ == '__main__': print show[board[i * size + j]] + " ", print "\n", data.boards.append(board) + start_time = time.time() move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn] + '\n') print role[turn] + " : " + str(move), num += 1 diff --git a/AlphaGo/player.py b/AlphaGo/player.py index e848d2b..66a487f 100644 --- a/AlphaGo/player.py +++ b/AlphaGo/player.py @@ -25,11 +25,15 @@ if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument("--checkpoint_path", type=str, default=None) parser.add_argument("--role", type=str, default="unknown") + parser.add_argument("--debug", type=str, default=False) args = parser.parse_args() if args.checkpoint_path == 'None': args.checkpoint_path = None - game = Game(checkpoint_path=args.checkpoint_path) + debug = False + if args.debug == "True": + debug = True + game = Game(role=args.role, checkpoint_path=args.checkpoint_path, debug=debug) engine = GTPEngine(game_obj=game, name='tianshou', version=0) daemon = Pyro4.Daemon() # make a Pyro daemon diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index e99373c..e565337 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -40,16 +40,23 @@ class MCTSNode(object): class UCTNode(MCTSNode): - def __init__(self, parent, action, state, action_num, prior, inverse=False): + def __init__(self, parent, action, state, action_num, prior, debug=False, inverse=False): super(UCTNode, self).__init__(parent, action, state, action_num, prior, inverse) self.Q = np.zeros([action_num]) self.W = np.zeros([action_num]) self.N = np.zeros([action_num]) self.ucb = self.Q + c_puct * self.prior * math.sqrt(np.sum(self.N)) / (self.N + 1) self.mask = None + self.debug=debug + self.elapse_time = 0 + + def clear_elapse_time(self): + self.elapse_time = 0 def selection(self, 
simulator): + head = time.time() self.valid_mask(simulator) + self.elapse_time += time.time() - head action = np.argmax(self.ucb) if action in self.children.keys(): return self.children[action].selection(simulator) @@ -142,15 +149,18 @@ class ActionNode(object): class MCTS(object): - def __init__(self, simulator, evaluator, root, action_num, method="UCT", inverse=False): + def __init__(self, simulator, evaluator, root, action_num, method="UCT", + role="unknown", debug=False, inverse=False): self.simulator = simulator self.evaluator = evaluator + self.role = role + self.debug = debug prior, _ = self.evaluator(root) self.action_num = action_num if method == "": self.root = root if method == "UCT": - self.root = UCTNode(None, None, root, action_num, prior, inverse=inverse) + self.root = UCTNode(None, None, root, action_num, prior, self.debug, inverse=inverse) if method == "TS": self.root = TSNode(None, None, root, action_num, prior, inverse=inverse) self.inverse = inverse @@ -165,14 +175,36 @@ class MCTS(object): if max_step is None and max_time is None: raise ValueError("Need a stop criteria!") + selection_time = 0 + expansion_time = 0 + backprop_time = 0 + self.root.clear_elapse_time() while step < max_step and time.time() - start_time < max_step: - self._expand() + sel_time, exp_time, back_time = self._expand() + selection_time += sel_time + expansion_time += exp_time + backprop_time += back_time step += 1 + if (self.debug): + file = open("debug.txt", "a") + file.write("[" + str(self.role) + "]" + + " selection : " + str(selection_time) + "\t" + + " validmask : " + str(self.root.elapse_time) + "\t" + + " expansion : " + str(expansion_time) + "\t" + + " backprop : " + str(backprop_time) + "\t" + + "\n") + file.close() def _expand(self): + t0 = time.time() node, new_action = self.root.selection(self.simulator) + t1 = time.time() value = node.children[new_action].expansion(self.evaluator, self.action_num) + t2 = time.time() 
node.children[new_action].backpropagation(value + 0.) + t3 = time.time() + return t1 - t0, t2 - t1, t3 - t2 + if __name__ == "__main__": pass From 001263a683c008d2a130b2468b68dcfdcbe5b82f Mon Sep 17 00:00:00 2001 From: Wenbo Hu Date: Sun, 24 Dec 2017 12:07:56 +0800 Subject: [PATCH 63/98] use a simplified version of get_score --- AlphaGo/go.py | 49 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 6 deletions(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 833b01f..37e8e9f 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -3,7 +3,7 @@ import utils import copy import numpy as np from collections import deque - +import time ''' Settings of the Go game. @@ -214,7 +214,7 @@ class Go: # initialize the simulate_board from state history_boards, color = state if history_boards[-1] == history_boards[-2] and action is utils.PASS: - return None, 2 * (float(self.executor_get_score(history_boards[-1]) > 0)-0.5) * color + return None, 2 * (float(self.simple_executor_get_score(history_boards[-1]) > 0)-0.5) * color else: vertex = self._action2vertex(action) new_board = self._do_move(copy.copy(history_boards[-1]), color, vertex) @@ -285,10 +285,7 @@ class Go: return utils.WHITE def executor_get_score(self, current_board): - ''' - is_unknown_estimation: whether use nearby stone to predict the unknown - return score from BLACK perspective. - ''' + #return score from BLACK perspective. _board = copy.deepcopy(current_board) while utils.EMPTY in _board: vertex = self._find_empty(_board) @@ -310,7 +307,46 @@ class Go: return score + + def simple_executor_get_score(self, current_board): + ''' + can only be used for the empty group only have one single stone + return score from BLACK perspective. 
+ ''' + score = 0 + for idx, color in enumerate(current_board): + if color == utils.EMPTY: + neighbors = self._neighbor(self._deflatten(idx)) + color = current_board[self._flatten(neighbors[0])] + if color == utils.BLACK: + score += 1 + elif color == utils.WHITE: + score -= 1 + score -= self.komi + return score + + if __name__ == "__main__": + go = Go(size=9, komi=3.75, role = utils.BLACK) + endgame = [ + 1, 0, 1, 0, 1, 1, -1, 0, -1, + 1, 1, 1, 1, 1, 1, -1, -1, -1, + 0, 1, 1, 1, 1, -1, 0, -1, 0, + 1, 1, 1, 1, 1, -1, -1, -1, -1, + 1, -1, 1, -1, 1, 1, -1, -1, -1, + -1, -1, -1, -1, -1, 1, -1, 0, -1, + 1, 1, 1, -1, -1, -1, -1, -1, -1, + 1, 0, 1, 1, 1, 1, 1, -1, 0, + 1, 1, 0, 1, -1, -1, -1, -1, -1 + ] + time0 = time.time() + score = go.executor_get_score(endgame) + time1 = time.time() + print(score, time1 - time0) + score = go.new_executor_get_score(endgame) + time2 = time.time() + print(score, time2 - time1) + ''' ### do unit test for Go class pure_test = [ 0, 1, 0, 1, 0, 1, 0, 0, 0, @@ -349,3 +385,4 @@ if __name__ == "__main__": for i in range(7): print (go._is_eye(opponent_test, utils.BLACK, ot_qry[i])) print("Test of eye surrend by opponents\n") + ''' From 74504ceb1dbbb6b28ea9ce2abae7dcd6ae7f761d Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Sun, 24 Dec 2017 14:40:50 +0800 Subject: [PATCH 64/98] debug for go and reversi --- AlphaGo/engine.py | 7 +- AlphaGo/game.py | 29 ++++--- AlphaGo/go.py | 8 +- AlphaGo/model.py | 8 +- AlphaGo/play.py | 10 +-- AlphaGo/reversi.py | 150 ++++++++++++++++++------------------- tianshou/core/mcts/mcts.py | 8 +- 7 files changed, 111 insertions(+), 109 deletions(-) diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index 98e5e61..5624a2f 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -6,6 +6,8 @@ # from game import Game +import copy +import numpy as np import utils @@ -186,7 +188,10 @@ class GTPEngine(): return self._game.game_engine.executor_get_score(self._game.board), True def cmd_show_board(self, 
args, **kwargs): - return self._game.board, True + board = copy.deepcopy(self._game.board) + if isinstance(board, np.ndarray): + board = board.flatten().tolist() + return board, True def cmd_get_prob(self, args, **kwargs): return self._game.prob, True diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 442cb73..3a7959c 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -26,33 +26,37 @@ class Game: TODO : Maybe merge with the engine class in future, currently leave it untouched for interacting with Go UI. ''' - def __init__(self, name="go", role="unknown", debug=False, checkpoint_path=None): + def __init__(self, name="reversi", role="unknown", debug=False, checkpoint_path=None): self.name = name self.role = role self.debug = debug if self.name == "go": self.size = 9 self.komi = 3.75 - self.board = [utils.EMPTY] * (self.size ** 2) self.history = [] self.history_length = 8 - self.latest_boards = deque(maxlen=8) - for _ in range(8): - self.latest_boards.append(self.board) self.game_engine = go.Go(size=self.size, komi=self.komi, role=self.role) + self.board = [utils.EMPTY] * (self.size ** 2) elif self.name == "reversi": self.size = 8 self.history_length = 1 - self.game_engine = reversi.Reversi() + self.history = [] + self.game_engine = reversi.Reversi(size=self.size) self.board = self.game_engine.get_board() else: raise ValueError(name + " is an unknown game...") self.evaluator = model.ResNet(self.size, self.size ** 2 + 1, history_length=self.history_length) + self.latest_boards = deque(maxlen=self.history_length) + for _ in range(self.history_length): + self.latest_boards.append(self.board) def clear(self): - self.board = [utils.EMPTY] * (self.size ** 2) - self.history = [] + if self.name == "go": + self.board = [utils.EMPTY] * (self.size ** 2) + self.history = [] + if self.name == "reversi": + self.board = self.game_engine.get_board() for _ in range(self.history_length): self.latest_boards.append(self.board) @@ -84,7 +88,7 @@ class Game: if self.name == "go": res 
= self.game_engine.executor_do_move(self.history, self.latest_boards, self.board, color, vertex) elif self.name == "reversi": - res = self.game_engine.executor_do_move(self.board, color, vertex) + res = self.game_engine.executor_do_move(self.history, self.latest_boards, self.board, color, vertex) return res def think_play_move(self, color): @@ -110,13 +114,14 @@ class Game: if row[i] < 10: print(' ', end='') for j in range(self.size): - print(self.status2symbol(self.board[self._flatten((j + 1, i + 1))]), end=' ') + print(self.status2symbol(self.board[self.game_engine._flatten((j + 1, i + 1))]), end=' ') print('') sys.stdout.flush() if __name__ == "__main__": - g = Game() - g.show_board() + g = Game("go") + print(g.board) + g.clear() g.think_play_move(1) #file = open("debug.txt", "a") #file.write("mcts check\n") diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 833b01f..aca6632 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -212,12 +212,12 @@ class Go: def simulate_step_forward(self, state, action): # initialize the simulate_board from state - history_boards, color = state + history_boards, color = copy.deepcopy(state) if history_boards[-1] == history_boards[-2] and action is utils.PASS: return None, 2 * (float(self.executor_get_score(history_boards[-1]) > 0)-0.5) * color else: vertex = self._action2vertex(action) - new_board = self._do_move(copy.copy(history_boards[-1]), color, vertex) + new_board = self._do_move(copy.deepcopy(history_boards[-1]), color, vertex) history_boards.append(new_board) new_color = -color return [history_boards, new_color], 0 @@ -227,8 +227,8 @@ class Go: return False current_board[self._flatten(vertex)] = color self._process_board(current_board, color, vertex) - history.append(copy.copy(current_board)) - latest_boards.append(copy.copy(current_board)) + history.append(copy.deepcopy(current_board)) + latest_boards.append(copy.deepcopy(current_board)) return True def _find_empty(self, current_board): diff --git a/AlphaGo/model.py 
b/AlphaGo/model.py index 2a620f9..0549f41 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -173,10 +173,10 @@ class ResNet(object): """ state = np.zeros([1, self.board_size, self.board_size, 2 * self.history_length + 1]) for i in range(self.history_length): - state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(self.board_size ** 2)).reshape(self.board_size, + state[0, :, :, i] = np.array(np.array(history[i]).flatten() == np.ones(self.board_size ** 2)).reshape(self.board_size, self.board_size) state[0, :, :, i + self.history_length] = np.array( - np.array(history[i]) == -np.ones(self.board_size ** 2)).reshape(self.board_size, self.board_size) + np.array(history[i]).flatten() == -np.ones(self.board_size ** 2)).reshape(self.board_size, self.board_size) # TODO: need a config to specify the BLACK and WHITE if color == +1: state[0, :, :, 2 * self.history_length] = np.ones([self.board_size, self.board_size]) @@ -301,7 +301,7 @@ class ResNet(object): :return: """ - new_board = copy.copy(board) + new_board = copy.deepcopy(board) if new_board.ndim == 3: new_board = np.expand_dims(new_board, axis=0) @@ -331,7 +331,7 @@ class ResNet(object): :param orientation: an integer, which orientation to reflect :return: """ - new_board = copy.copy(board) + new_board = copy.deepcopy(board) for _ in range(times): if orientation == 0: new_board = new_board[:, ::-1] diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 9144a40..2731948 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -89,7 +89,7 @@ if __name__ == '__main__': pattern = "[A-Z]{1}[0-9]{1}" space = re.compile("\s+") - size = 9 + size = {"go":9, "reversi":8} show = ['.', 'X', 'O'] evaluate_rounds = 1 @@ -102,13 +102,13 @@ if __name__ == '__main__': pass_flag = [False, False] print("Start game {}".format(game_num)) # end the game if both palyer chose to pass, or play too much turns - while not (pass_flag[0] and pass_flag[1]) and num < size ** 2 * 2: + while not (pass_flag[0] and pass_flag[1]) and num < 
size["reversi"] ** 2 * 2: turn = num % 2 board = player[turn].run_cmd(str(num) + ' show_board') board = eval(board[board.index('['):board.index(']') + 1]) - for i in range(size): - for j in range(size): - print show[board[i * size + j]] + " ", + for i in range(size["reversi"]): + for j in range(size["reversi"]): + print show[board[i * size["reversi"] + j]] + " ", print "\n", data.boards.append(board) start_time = time.time() diff --git a/AlphaGo/reversi.py b/AlphaGo/reversi.py index 4fa1468..c6c8a5b 100644 --- a/AlphaGo/reversi.py +++ b/AlphaGo/reversi.py @@ -1,4 +1,5 @@ import numpy as np +import copy ''' Settings of the Reversi game. @@ -8,13 +9,8 @@ Settings of the Reversi game. class Reversi: - def __init__(self, black=None, white=None): - self.board = None # 8 * 8 board with 1 for black, -1 for white and 0 for blank - self.color = None # 1 for black and -1 for white - self.action = None # number in 0~63 - self.winner = None - self.black_win = None - self.size = 8 + def __init__(self, **kwargs): + self.size = kwargs['size'] def _deflatten(self, idx): x = idx // self.size + 1 @@ -24,39 +20,39 @@ class Reversi: def _flatten(self, vertex): x, y = vertex if (x == 0) and (y == 0): - return 64 + return self.size ** 2 return (x - 1) * self.size + (y - 1) - def get_board(self, board=None): - self.board = board or np.zeros([8,8]) - self.board[3, 3] = -1 - self.board[4, 4] = -1 - self.board[3, 4] = 1 - self.board[4, 3] = 1 - return self.board + def get_board(self): + board = np.zeros([self.size, self.size], dtype=np.int32) + board[self.size / 2 - 1, self.size / 2 - 1] = -1 + board[self.size / 2, self.size / 2] = -1 + board[self.size / 2 - 1, self.size / 2] = 1 + board[self.size / 2, self.size / 2 - 1] = 1 + return board - def _find_correct_moves(self, is_next=False): + def _find_correct_moves(self, board, color, is_next=False): moves = [] if is_next: - color = 0 - self.color + new_color = 0 - color else: - color = self.color - for i in range(64): + new_color = color + 
for i in range(self.size ** 2): x, y = self._deflatten(i) - valid = self._is_valid(x - 1, y - 1, color) + valid = self._is_valid(board, x - 1, y - 1, new_color) if valid: moves.append(i) return moves - def _one_direction_valid(self, x, y, color): + def _one_direction_valid(self, board, x, y, color): if (x >= 0) and (x < self.size): if (y >= 0) and (y < self.size): - if self.board[x, y] == color: + if board[x, y] == color: return True return False - def _is_valid(self, x, y, color): - if self.board[x, y]: + def _is_valid(self, board, x, y, color): + if board[x, y]: return False for x_direction in [-1, 0, 1]: for y_direction in [-1, 0, 1]: @@ -66,20 +62,18 @@ class Reversi: while True: new_x += x_direction new_y += y_direction - if self._one_direction_valid(new_x, new_y, 0 - color): + if self._one_direction_valid(board, new_x, new_y, 0 - color): flag = 1 else: break - if self._one_direction_valid(new_x, new_y, color) and flag: + if self._one_direction_valid(board, new_x, new_y, color) and flag: return True return False def simulate_get_mask(self, state, action_set): - history_boards, color = state - self.board = np.reshape(history_boards[-1], (self.size, self.size)) - self.color = color - valid_moves = self._find_correct_moves() - print(valid_moves) + history_boards, color = copy.deepcopy(state) + board = copy.deepcopy(history_boards[-1]) + valid_moves = self._find_correct_moves(board, color) if not len(valid_moves): invalid_action_mask = action_set[0:-1] else: @@ -90,34 +84,34 @@ class Reversi: return invalid_action_mask def simulate_step_forward(self, state, action): - self.board = state[0].copy() - self.board = np.reshape(self.board, (self.size, self.size)) - self.color = state[1] - self.action = action - if self.action == 64: - valid_moves = self._find_correct_moves(is_next=True) + history_boards, color = copy.deepcopy(state) + board = copy.deepcopy(history_boards[-1]) + if action == self.size ** 2: + valid_moves = self._find_correct_moves(board, color, 
is_next=True) if not len(valid_moves): - self._game_over() - return None, self.winner * self.color + winner = self._get_winner(board) + return None, winner * color else: - return [self.board, 0 - self.color], 0 - self._step() - return [self.board, 0 - self.color], 0 + return [history_boards, 0 - color], 0 + new_board = self._step(board, color, action) + history_boards.append(new_board) + return [history_boards, 0 - color], 0 - def _game_over(self): - black_num, white_num = self._number_of_black_and_white() - self.black_win = black_num - white_num - if self.black_win > 0: - self.winner = 1 - elif self.black_win < 0: - self.winner = -1 + def _get_winner(self, board): + black_num, white_num = self._number_of_black_and_white(board) + black_win = black_num - white_num + if black_win > 0: + winner = 1 + elif black_win < 0: + winner = -1 else: - self.winner = 0 + winner = 0 + return winner - def _number_of_black_and_white(self): + def _number_of_black_and_white(self, board): black_num = 0 white_num = 0 - board_list = np.reshape(self.board, self.size ** 2) + board_list = np.reshape(board, self.size ** 2) for i in range(len(board_list)): if board_list[i] == 1: black_num += 1 @@ -125,19 +119,18 @@ class Reversi: white_num += 1 return black_num, white_num - def _step(self): - if self.action < 0 or self.action > 63: + def _step(self, board, color, action): + if action < 0 or action > self.size ** 2 - 1: raise ValueError("Action not in the range of [0,63]!") - if self.action is None: + if action is None: raise ValueError("Action is None!") - x, y = self._deflatten(self.action) - valid = self._flip(x -1, y - 1) - if not valid: - raise ValueError("Illegal action!") + x, y = self._deflatten(action) + new_board = self._flip(board, x - 1, y - 1, color) + return new_board - def _flip(self, x, y): + def _flip(self, board, x, y, color): valid = 0 - self.board[x, y] = self.color + board[x, y] = color for x_direction in [-1, 0, 1]: for y_direction in [-1, 0, 1]: new_x = x @@ -146,47 
+139,46 @@ class Reversi: while True: new_x += x_direction new_y += y_direction - if self._one_direction_valid(new_x, new_y, 0 - self.color): + if self._one_direction_valid(board, new_x, new_y, 0 - color): flag = 1 else: break - if self._one_direction_valid(new_x, new_y, self.color) and flag: + if self._one_direction_valid(board, new_x, new_y, color) and flag: valid = 1 flip_x = x flip_y = y while True: flip_x += x_direction flip_y += y_direction - if self._one_direction_valid(flip_x, flip_y, 0 - self.color): - self.board[flip_x, flip_y] = self.color + if self._one_direction_valid(board, flip_x, flip_y, 0 - color): + board[flip_x, flip_y] = color else: break if valid: - return True + return board else: - return False + raise ValueError("Invalid action") def executor_do_move(self, history, latest_boards, board, color, vertex): - self.board = np.reshape(board, (self.size, self.size)) - self.color = color - self.action = self._flatten(vertex) - if self.action == 64: - valid_moves = self._find_correct_moves(is_next=True) + board = np.reshape(board, (self.size, self.size)) + color = color + action = self._flatten(vertex) + if action == self.size ** 2: + valid_moves = self._find_correct_moves(board, color, is_next=True) if not len(valid_moves): return False else: return True else: - self._step() + new_board = self._step(board, color, action) + history.append(new_board) + latest_boards.append(new_board) return True def executor_get_score(self, board): - self.board = board - self._game_over() - if self.black_win is not None: - return self.black_win - else: - raise ValueError("Game not finished!") + board = board + winner = self._get_winner(board) + return winner if __name__ == "__main__": diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index e565337..493cf7d 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -110,15 +110,15 @@ class ActionNode(object): self.reward = 0 def type_conversion_to_tuple(self): - if 
type(self.next_state) is np.ndarray: + if isinstance(self.next_state, np.ndarray): self.next_state = self.next_state.tolist() - if type(self.next_state) is list: + if isinstance(self.next_state, list): self.next_state = list2tuple(self.next_state) def type_conversion_to_origin(self): - if self.state_type is np.ndarray: + if isinstance(self.state_type, np.ndarray): self.next_state = np.array(self.next_state) - if self.state_type is list: + if isinstance(self.state_type, np.ndarray): self.next_state = tuple2list(self.next_state) def selection(self, simulator): From 2d9aa32758968829c0351e84887e9277d8c1697d Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Sun, 24 Dec 2017 14:41:40 +0800 Subject: [PATCH 65/98] change all copy to deepcopy --- AlphaGo/go.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 15fc5c6..55f5a4a 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -99,7 +99,7 @@ class Go: def _check_global_isomorphous(self, history_boards, current_board, color, vertex): repeat = False - next_board = copy.copy(current_board) + next_board = copy.deepcopy(current_board) next_board[self._flatten(vertex)] = color self._process_board(next_board, color, vertex) if next_board in history_boards: From cf57144ce994dc57588c1473fc05e85bbac92587 Mon Sep 17 00:00:00 2001 From: mcgrady00h <281130306@qq.com> Date: Sun, 24 Dec 2017 15:47:11 +0800 Subject: [PATCH 66/98] merge master --- AlphaGo/network.py | 225 --------------------------------------------- 1 file changed, 225 deletions(-) delete mode 100644 AlphaGo/network.py diff --git a/AlphaGo/network.py b/AlphaGo/network.py deleted file mode 100644 index cfff6f3..0000000 --- a/AlphaGo/network.py +++ /dev/null @@ -1,225 +0,0 @@ -import os -import time -import sys - -import numpy as np -import time -import tensorflow as tf -import tensorflow.contrib.layers as layers - -import multi_gpu -import time -import copy - -# os.environ["CUDA_VISIBLE_DEVICES"] = "1" 
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' - - -def residual_block(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.conv2d(h, 256, kernel_size=3, stride=1, activation_fn=tf.identity, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = h + input - return tf.nn.relu(h) - - -def policy_heads(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.flatten(h) - h = layers.fully_connected(h, 82, activation_fn=tf.identity, weights_regularizer=layers.l2_regularizer(1e-4)) - return h - - -def value_heads(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.flatten(h) - h = layers.fully_connected(h, 256, activation_fn=tf.nn.relu, weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.fully_connected(h, 1, activation_fn=tf.nn.tanh, weights_regularizer=layers.l2_regularizer(1e-4)) - return h - - -class Network(object): - def __init__(self): - self.x = tf.placeholder(tf.float32, shape=[None, 9, 9, 17]) - self.is_training = tf.placeholder(tf.bool, shape=[]) - self.z = tf.placeholder(tf.float32, shape=[None, 1]) - self.pi = tf.placeholder(tf.float32, 
shape=[None, 82]) - self.build_network() - - def build_network(self): - h = layers.conv2d(self.x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, - normalizer_params={'is_training': self.is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS}, - weights_regularizer=layers.l2_regularizer(1e-4)) - for i in range(4): - h = residual_block(h, self.is_training) - self.v = value_heads(h, self.is_training) - self.p = policy_heads(h, self.is_training) - # loss = tf.reduce_mean(tf.square(z-v)) - tf.multiply(pi, tf.log(tf.clip_by_value(tf.nn.softmax(p), 1e-8, tf.reduce_max(tf.nn.softmax(p))))) - self.value_loss = tf.reduce_mean(tf.square(self.z - self.v)) - self.policy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.pi, logits=self.p)) - - self.reg = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) - self.total_loss = self.value_loss + self.policy_loss + self.reg - # train_op = tf.train.MomentumOptimizer(1e-4, momentum=0.9, use_nesterov=True).minimize(total_loss) - self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) - with tf.control_dependencies(self.update_ops): - self.train_op = tf.train.RMSPropOptimizer(1e-4).minimize(self.total_loss) - self.var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) - self.saver = tf.train.Saver(max_to_keep=10, var_list=self.var_list) - self.sess = multi_gpu.create_session() - - def train(self): - data_path = "./training_data/" - data_name = os.listdir(data_path) - epochs = 100 - batch_size = 128 - - result_path = "./checkpoints_origin/" - with multi_gpu.create_session() as sess: - sess.run(tf.global_variables_initializer()) - ckpt_file = tf.train.latest_checkpoint(result_path) - if ckpt_file is not None: - print('Restoring model from {}...'.format(ckpt_file)) - self.saver.restore(sess, ckpt_file) - for epoch in range(epochs): - for name in data_name: - data = np.load(data_path + name) - boards = data["boards"] - wins = data["wins"] - ps = 
data["ps"] - print (boards.shape) - print (wins.shape) - print (ps.shape) - batch_num = boards.shape[0] // batch_size - index = np.arange(boards.shape[0]) - np.random.shuffle(index) - value_losses = [] - policy_losses = [] - regs = [] - time_train = -time.time() - for iter in range(batch_num): - lv, lp, r, value, prob, _ = sess.run( - [self.value_loss, self.policy_loss, self.reg, self.v, tf.nn.softmax(self.p), self.train_op], - feed_dict={self.x: boards[ - index[iter * batch_size:(iter + 1) * batch_size]], - self.z: wins[index[ - iter * batch_size:(iter + 1) * batch_size]], - self.pi: ps[index[ - iter * batch_size:(iter + 1) * batch_size]], - self.is_training: True}) - value_losses.append(lv) - policy_losses.append(lp) - regs.append(r) - if iter % 1 == 0: - print( - "Epoch: {}, Part {}, Iteration: {}, Time: {}, Value Loss: {}, Policy Loss: {}, Reg: {}".format( - epoch, name, iter, time.time() + time_train, np.mean(np.array(value_losses)), - np.mean(np.array(policy_losses)), np.mean(np.array(regs)))) - time_train = -time.time() - value_losses = [] - policy_losses = [] - regs = [] - if iter % 20 == 0: - save_path = "Epoch{}.Part{}.Iteration{}.ckpt".format(epoch, name, iter) - self.saver.save(sess, result_path + save_path) - del data, boards, wins, ps - - - # def forward(call_number): - # # checkpoint_path = "/home/yama/rl/tianshou/AlphaGo/checkpoints" - # checkpoint_path = "/home/jialian/stuGo/tianshou/stuGo/checkpoints/" - # board_file = np.genfromtxt("/home/jialian/stuGo/tianshou/leela-zero/src/mcts_nn_files/board_" + call_number, - # dtype='str'); - # human_board = np.zeros((17, 19, 19)) - # - # # TODO : is it ok to ignore the last channel? 
- # for i in range(17): - # human_board[i] = np.array(list(board_file[i])).reshape(19, 19) - # # print("============================") - # # print("human board sum : " + str(np.sum(human_board[-1]))) - # # print("============================") - # # print(human_board) - # # print("============================") - # # rint(human_board) - # feed_board = human_board.transpose(1, 2, 0).reshape(1, 19, 19, 17) - # # print(feed_board[:,:,:,-1]) - # # print(feed_board.shape) - # - # # npz_board = np.load("/home/yama/rl/tianshou/AlphaGo/data/7f83928932f64a79bc1efdea268698ae.npz") - # # print(npz_board["boards"].shape) - # # feed_board = npz_board["boards"][10].reshape(-1, 19, 19, 17) - # ##print(feed_board) - # # show_board = feed_board[0].transpose(2, 0, 1) - # # print("board shape : ", show_board.shape) - # # print(show_board) - # - # itflag = False - # with multi_gpu.create_session() as sess: - # sess.run(tf.global_variables_initializer()) - # ckpt_file = tf.train.latest_checkpoint(checkpoint_path) - # if ckpt_file is not None: - # # print('Restoring model from {}...'.format(ckpt_file)) - # saver.restore(sess, ckpt_file) - # else: - # raise ValueError("No model loaded") - # res = sess.run([tf.nn.softmax(p), v], feed_dict={x: feed_board, is_training: itflag}) - # # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][300].reshape(-1, 19, 19, 17), is_training:False}) - # # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][50].reshape(-1, 19, 19, 17), is_training:True}) - # # print(np.argmax(res[0])) - # np.savetxt(sys.stdout, res[0][0], fmt="%.6f", newline=" ") - # np.savetxt(sys.stdout, res[1][0], fmt="%.6f", newline=" ") - # pv_file = "/home/jialian/stuGotianshou/leela-zero/src/mcts_nn_files/policy_value" - # np.savetxt(pv_file, np.concatenate((res[0][0], res[1][0])), fmt="%.6f", newline=" ") - # # np.savetxt(pv_file, res[1][0], fmt="%.6f", newline=" ") - # return res - - def forward(self, checkpoint_path): - # checkpoint_path = 
"/home/tongzheng/tianshou/AlphaGo/checkpoints/" - # sess = multi_gpu.create_session() - # sess.run(tf.global_variables_initializer()) - if checkpoint_path is None: - self.sess.run(tf.global_variables_initializer()) - else: - ckpt_file = tf.train.latest_checkpoint(checkpoint_path) - if ckpt_file is not None: - # print('Restoring model from {}...'.format(ckpt_file)) - self.saver.restore(self.sess, ckpt_file) - # print('Successfully loaded') - else: - raise ValueError("No model loaded") - # prior, value = sess.run([tf.nn.softmax(p), v], feed_dict={x: state, is_training: False}) - # return prior, value - return self.sess - - -if __name__ == '__main__': - # state = np.random.randint(0, 1, [256, 9, 9, 17]) - # net = Network() - # net.train() - # sess = net.forward() - # start_time = time.time() - # for i in range(100): - # sess.run([tf.nn.softmax(net.p), net.v], feed_dict={net.x: state, net.is_training: False}) - # print("Step {}, use time {}".format(i, time.time() - start_time)) - # start_time = time.time() - net0 = Network() - sess0 = net0.forward("./checkpoints/") - print("Loaded") - while True: - pass - From 5aa5dcd191a266aca637574ff8aaab46ee1c58ae Mon Sep 17 00:00:00 2001 From: mcgrady00h <281130306@qq.com> Date: Sun, 24 Dec 2017 16:47:43 +0800 Subject: [PATCH 67/98] add comments for mcts with virtual loss --- tianshou/core/mcts/mcts_virtual_loss.py | 47 +++++++++++++++++++++---- 1 file changed, 41 insertions(+), 6 deletions(-) diff --git a/tianshou/core/mcts/mcts_virtual_loss.py b/tianshou/core/mcts/mcts_virtual_loss.py index 9335464..f27d8a3 100644 --- a/tianshou/core/mcts/mcts_virtual_loss.py +++ b/tianshou/core/mcts/mcts_virtual_loss.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # vim:fenc=utf-8 # $File: mcts_virtual_loss.py -# $Date: Sat Dec 23 02:4850 2017 +0800 +# $Date: Sun Dec 24 16:4740 2017 +0800 # Original file: mcts.py # $Author: renyong15 Ā© # @@ -22,7 +22,17 @@ from .utils import list2tuple, tuple2list class MCTSNodeVirtualLoss(object): - def 
__init__(self, parent, action, state, action_num, prior, inverse=False): + """ + MCTS abstract class with virtual loss. Currently we only support UCT node. + Role of the Parameters can be found in Readme.md. + """ + def __init__(self, + parent, + action, + state, + action_num, + prior, + inverse = False): self.parent = parent self.action = action self.children = {} @@ -41,7 +51,19 @@ class MCTSNodeVirtualLoss(object): pass class UCTNodeVirtualLoss(MCTSNodeVirtualLoss): - def __init__(self, parent, action, state, action_num, prior, inverse=False, c_puct = 5): + """ + UCT node (state node) with virtual loss. + Role of the Parameters can be found in Readme.md. + :param c_puct balance between exploration and exploition, + """ + def __init__(self, + parent, + action, + state, + action_num, + prior, + inverse=False, + c_puct = 5): super(UCTNodeVirtualLoss, self).__init__(parent, action, state, action_num, prior, inverse) self.Q = np.zeros([action_num]) self.W = np.zeros([action_num]) @@ -53,7 +75,8 @@ class UCTNodeVirtualLoss(MCTSNodeVirtualLoss): self.mask = None - def selection(self, simulator): + def selection(self, + simulator): self.valid_mask(simulator) self.Q = np.zeros([self.action_num]) N_not_zero = (self.N + self.virtual_loss) > 0 @@ -108,6 +131,9 @@ class UCTNodeVirtualLoss(MCTSNodeVirtualLoss): class ActionNodeVirtualLoss(object): + """ + Action node with virtual loss. 
+ """ def __init__(self, parent, action): self.parent = parent self.action = action @@ -156,6 +182,9 @@ class ActionNodeVirtualLoss(object): class MCTSVirtualLoss(object): + """ + MCTS class with virtual loss + """ def __init__(self, simulator, evaluator, root, action_num, batch_size = 1, method = "UCT", inverse = False): self.simulator = simulator self.evaluator = evaluator @@ -196,13 +225,19 @@ class MCTSVirtualLoss(object): self.bp_time = [] while (max_step is not None and self.step < self.max_step or max_step is None) \ and (max_time is not None and time.time() - self.start_time < self.max_time or max_time is None): - self.expand() + self._expand() if max_step is not None: self.step += 1 - def expand(self): + def _expand(self): """ Core logic method for MCTS tree to expand nodes. + Steps to expand node: + 1. Select final action node with virtual loss and collect them in to a minibatch. + (i.e. root->action->state->action...->action) + 2. Remove the virtual loss + 3. Evaluate the whole minibatch using evaluator + 4. Expand new nodes and perform back propogation. 
""" ## minibatch with virtual loss nodes = [] From f0074aa7ca0db4736309e708f7332284dc5e9d64 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Sun, 24 Dec 2017 17:43:45 +0800 Subject: [PATCH 68/98] fix bug of game config and add profing functions to mcts --- AlphaGo/engine.py | 3 +- AlphaGo/game.py | 7 +-- AlphaGo/play.py | 13 ++-- AlphaGo/player.py | 3 +- AlphaGo/random_data.py | 123 ------------------------------------- AlphaGo/self-play.py | 103 ------------------------------- tianshou/core/mcts/mcts.py | 68 +++++++++++++------- 7 files changed, 58 insertions(+), 262 deletions(-) delete mode 100644 AlphaGo/random_data.py delete mode 100644 AlphaGo/self-play.py diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index 5624a2f..b662dbd 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -198,5 +198,4 @@ class GTPEngine(): if __name__ == "main": - game = Game() - engine = GTPEngine(game_obj=game) + print ("test engine.py") diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 3a7959c..8ffde93 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -26,7 +26,7 @@ class Game: TODO : Maybe merge with the engine class in future, currently leave it untouched for interacting with Go UI. 
''' - def __init__(self, name="reversi", role="unknown", debug=False, checkpoint_path=None): + def __init__(self, name=None, role=None, debug=False, checkpoint_path=None): self.name = name self.role = role self.debug = debug @@ -119,10 +119,7 @@ class Game: sys.stdout.flush() if __name__ == "__main__": - g = Game("go") - print(g.board) - g.clear() - g.think_play_move(1) + print("test game.py") #file = open("debug.txt", "a") #file.write("mcts check\n") #file.close() diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 2731948..5777982 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -60,13 +60,14 @@ if __name__ == '__main__': black_role_name = 'black' + str(args.id) white_role_name = 'white' + str(args.id) + game_name = 'go' agent_v0 = subprocess.Popen( - ['python', '-u', 'player.py', '--role=' + black_role_name, + ['python', '-u', 'player.py', '--game=' + game_name, '--role=' + black_role_name, '--checkpoint_path=' + str(args.black_weight_path), '--debug=' + str(args.debug)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) agent_v1 = subprocess.Popen( - ['python', '-u', 'player.py', '--role=' + white_role_name, + ['python', '-u', 'player.py', '--game=' + game_name, '--role=' + white_role_name, '--checkpoint_path=' + str(args.black_weight_path), '--debug=' + str(args.debug)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) @@ -102,13 +103,13 @@ if __name__ == '__main__': pass_flag = [False, False] print("Start game {}".format(game_num)) # end the game if both palyer chose to pass, or play too much turns - while not (pass_flag[0] and pass_flag[1]) and num < size["reversi"] ** 2 * 2: + while not (pass_flag[0] and pass_flag[1]) and num < size[game_name] ** 2 * 2: turn = num % 2 board = player[turn].run_cmd(str(num) + ' show_board') board = eval(board[board.index('['):board.index(']') + 1]) - for i in range(size["reversi"]): - for j in range(size["reversi"]): - print show[board[i * size["reversi"] + j]] + " ", + for 
i in range(size[game_name]): + for j in range(size[game_name]): + print show[board[i * size[game_name] + j]] + " ", print "\n", data.boards.append(board) start_time = time.time() diff --git a/AlphaGo/player.py b/AlphaGo/player.py index 66a487f..a8f61c1 100644 --- a/AlphaGo/player.py +++ b/AlphaGo/player.py @@ -26,6 +26,7 @@ if __name__ == '__main__': parser.add_argument("--checkpoint_path", type=str, default=None) parser.add_argument("--role", type=str, default="unknown") parser.add_argument("--debug", type=str, default=False) + parser.add_argument("--game", type=str, default=False) args = parser.parse_args() if args.checkpoint_path == 'None': @@ -33,7 +34,7 @@ if __name__ == '__main__': debug = False if args.debug == "True": debug = True - game = Game(role=args.role, checkpoint_path=args.checkpoint_path, debug=debug) + game = Game(name=args.game, role=args.role, checkpoint_path=args.checkpoint_path, debug=debug) engine = GTPEngine(game_obj=game, name='tianshou', version=0) daemon = Pyro4.Daemon() # make a Pyro daemon diff --git a/AlphaGo/random_data.py b/AlphaGo/random_data.py deleted file mode 100644 index 5b53bd6..0000000 --- a/AlphaGo/random_data.py +++ /dev/null @@ -1,123 +0,0 @@ -import os -import numpy as np -import time - -size = 9 -path = "/raid/tongzheng/tianshou/AlphaGo/data/part1/" -save_path = "/raid/tongzheng/tianshou/AlphaGo/data/" -name = os.listdir(path) -print(len(name)) -batch_size = 128 -batch_num = 512 - -block_size = batch_size * batch_num -slots_num = 16 - - -class block(object): - def __init__(self, block_size, block_id): - self.boards = [] - self.wins = [] - self.ps = [] - self.block_size = block_size - self.block_id = block_id - - def concat(self, board, p, win): - board = board.reshape(-1, size, size, 17) - win = win.reshape(-1, 1) - p = p.reshape(-1, size ** 2 + 1) - self.boards.append(board) - self.wins.append(win) - self.ps.append(p) - - def isfull(self): - assert len(self.boards) == len(self.wins) - assert len(self.boards) == 
len(self.ps) - return len(self.boards) == self.block_size - - def save_and_reset(self, block_id): - self.boards = np.concatenate(self.boards, axis=0) - self.wins = np.concatenate(self.wins, axis=0) - self.ps = np.concatenate(self.ps, axis=0) - print ("Block {}, Boards shape {}, Wins Shape {}, Ps Shape {}".format(self.block_id, self.boards.shape[0], - self.wins.shape[0], self.ps.shape[0])) - np.savez(save_path + "block" + str(self.block_id), boards=self.boards, wins=self.wins, ps=self.ps) - self.boards = [] - self.wins = [] - self.ps = [] - self.block_id = block_id - - def store_num(self): - assert len(self.boards) == len(self.wins) - assert len(self.boards) == len(self.ps) - return len(self.boards) - - -def concat(block_list, board, win, p): - global index - seed = np.random.randint(slots_num) - block_list[seed].concat(board, win, p) - if block_list[seed].isfull(): - block_list[seed].save_and_reset(index) - index = index + 1 - - -block_list = [] -for index in range(slots_num): - block_list.append(block(block_size, index)) -index = index + 1 -for n in name: - data = np.load(path + n) - board = data["boards"] - win = data["win"] - p = data["p"] - print("Start {}".format(n)) - print("Shape {}".format(board.shape[0])) - start = -time.time() - for i in range(board.shape[0]): - board_ori = board[i].reshape(-1, size, size, 17) - win_ori = win[i].reshape(-1, 1) - p_ori = p[i].reshape(-1, size ** 2 + 1) - concat(block_list, board_ori, p_ori, win_ori) - - for t in range(1, 4): - board_aug = np.rot90(board_ori, t, (1, 2)) - p_aug = np.concatenate( - [np.rot90(p_ori[:, :-1].reshape(-1, size, size), t, (1, 2)).reshape(-1, size ** 2), p_ori[:, -1].reshape(-1, 1)], - axis=1) - concat(block_list, board_aug, p_aug, win_ori) - - board_aug = board_ori[:, ::-1] - p_aug = np.concatenate( - [p_ori[:, :-1].reshape(-1, size, size)[:, ::-1].reshape(-1, size ** 2), p_ori[:, -1].reshape(-1, 1)], - axis=1) - concat(block_list, board_aug, p_aug, win_ori) - - board_aug = board_ori[:, :, ::-1] - 
p_aug = np.concatenate( - [p_ori[:, :-1].reshape(-1, size, size)[:, :, ::-1].reshape(-1, size ** 2), p_ori[:, -1].reshape(-1, 1)], - axis=1) - concat(block_list, board_aug, p_aug, win_ori) - - board_aug = np.rot90(board_ori[:, ::-1], 1, (1, 2)) - p_aug = np.concatenate( - [np.rot90(p_ori[:, :-1].reshape(-1, size, size)[:, ::-1], 1, (1, 2)).reshape(-1, size ** 2), - p_ori[:, -1].reshape(-1, 1)], - axis=1) - concat(block_list, board_aug, p_aug, win_ori) - - board_aug = np.rot90(board_ori[:, :, ::-1], 1, (1, 2)) - p_aug = np.concatenate( - [np.rot90(p_ori[:, :-1].reshape(-1, size, size)[:, :, ::-1], 1, (1, 2)).reshape(-1, size ** 2), - p_ori[:, -1].reshape(-1, 1)], - axis=1) - concat(block_list, board_aug, p_aug, win_ori) - print ("Finished {} with time {}".format(n, time.time() + start)) - data_num = 0 - for i in range(slots_num): - print("Block {} ".format(block_list[i].block_id) + "Size {}".format(block_list[i].store_num())) - data_num = data_num + block_list[i].store_num() - print ("Total data {}".format(data_num)) - -for i in range(slots_num): - block_list[i].save_and_reset(block_list[i].block_id) diff --git a/AlphaGo/self-play.py b/AlphaGo/self-play.py deleted file mode 100644 index dd03b13..0000000 --- a/AlphaGo/self-play.py +++ /dev/null @@ -1,103 +0,0 @@ -from game import Game -from engine import GTPEngine -import re -import numpy as np -import os -from collections import deque -import utils -import argparse - -parser = argparse.ArgumentParser() -parser.add_argument('--result_path', type=str, default='./part1') -args = parser.parse_args() - -if not os.path.exists(args.result_path): - os.makedirs(args.result_path) - -game = Game() -engine = GTPEngine(game_obj=game) -history = deque(maxlen=8) -for i in range(8): - history.append(game.board) -state = [] -prob = [] -winner = [] -pattern = "[A-Z]{1}[0-9]{1}" -game.show_board() - - -def history2state(history, color): - state = np.zeros([1, game.size, game.size, 17]) - for i in range(8): - state[0, :, :, i] = 
np.array(np.array(history[i]) == np.ones(game.size ** 2)).reshape(game.size, game.size) - state[0, :, :, i + 8] = np.array(np.array(history[i]) == -np.ones(game.size ** 2)).reshape(game.size, game.size) - if color == utils.BLACK: - state[0, :, :, 16] = np.ones([game.size, game.size]) - if color == utils.WHITE: - state[0, :, :, 16] = np.zeros([game.size, game.size]) - return state - - -num = 0 -game_num = 0 -black_pass = False -white_pass = False -while True: - print("Start game {}".format(game_num)) - while not (black_pass and white_pass) and num < game.size ** 2 * 2: - if num % 2 == 0: - color = utils.BLACK - new_state = history2state(history, color) - state.append(new_state) - result = engine.run_cmd(str(num) + " genmove BLACK") - num += 1 - match = re.search(pattern, result) - if match is not None: - print(match.group()) - else: - print("pass") - if re.search("pass", result) is not None: - black_pass = True - else: - black_pass = False - else: - color = utils.WHITE - new_state = history2state(history, color) - state.append(new_state) - result = engine.run_cmd(str(num) + " genmove WHITE") - num += 1 - match = re.search(pattern, result) - if match is not None: - print(match.group()) - else: - print("pass") - if re.search("pass", result) is not None: - white_pass = True - else: - white_pass = False - game.show_board() - prob.append(np.array(game.prob).reshape(-1, game.size ** 2 + 1)) - print("Finished") - print("\n") - score = game.game_engine.executor_get_score(game.board) - if score > 0: - winner = utils.BLACK - else: - winner = utils.WHITE - state = np.concatenate(state, axis=0) - prob = np.concatenate(prob, axis=0) - winner = np.ones([num, 1]) * winner - assert state.shape[0] == prob.shape[0] - assert state.shape[0] == winner.shape[0] - np.savez(args.result_path + "/game" + str(game_num), state=state, prob=prob, winner=winner) - state = [] - prob = [] - winner = [] - num = 0 - black_pass = False - white_pass = False - engine.run_cmd(str(num) + " clear_board") - 
history.clear() - for _ in range(8): - history.append(game.board) - game_num += 1 diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 493cf7d..1994284 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -40,28 +40,27 @@ class MCTSNode(object): class UCTNode(MCTSNode): - def __init__(self, parent, action, state, action_num, prior, debug=False, inverse=False): + def __init__(self, parent, action, state, action_num, prior, mcts, inverse=False): super(UCTNode, self).__init__(parent, action, state, action_num, prior, inverse) self.Q = np.zeros([action_num]) self.W = np.zeros([action_num]) self.N = np.zeros([action_num]) self.ucb = self.Q + c_puct * self.prior * math.sqrt(np.sum(self.N)) / (self.N + 1) self.mask = None - self.debug=debug - self.elapse_time = 0 - - def clear_elapse_time(self): self.elapse_time = 0 + self.mcts = mcts def selection(self, simulator): head = time.time() self.valid_mask(simulator) - self.elapse_time += time.time() - head + self.mcts.valid_mask_time += time.time() - head action = np.argmax(self.ucb) if action in self.children.keys(): + self.mcts.state_selection_time += time.time() - head return self.children[action].selection(simulator) else: - self.children[action] = ActionNode(self, action) + self.children[action] = ActionNode(self, action, mcts=self.mcts) + self.mcts.state_selection_time += time.time() - head return self.children[action].selection(simulator) def backpropagation(self, action): @@ -100,7 +99,7 @@ class TSNode(MCTSNode): class ActionNode(object): - def __init__(self, parent, action): + def __init__(self, parent, action, mcts): self.parent = parent self.action = action self.children = {} @@ -108,12 +107,18 @@ class ActionNode(object): self.origin_state = None self.state_type = None self.reward = 0 + self.mcts = mcts def type_conversion_to_tuple(self): + t0 = time.time() if isinstance(self.next_state, np.ndarray): self.next_state = self.next_state.tolist() + t1 = time.time() if 
isinstance(self.next_state, list): self.next_state = list2tuple(self.next_state) + t2 = time.time() + self.mcts.ndarray2list_time += t1 - t0 + self.mcts.list2tuple_time += t2 - t1 def type_conversion_to_origin(self): if isinstance(self.state_type, np.ndarray): @@ -122,23 +127,28 @@ class ActionNode(object): self.next_state = tuple2list(self.next_state) def selection(self, simulator): + head = time.time() self.next_state, self.reward = simulator.simulate_step_forward(self.parent.state, self.action) + self.mcts.simulate_sf_time += time.time() - head self.origin_state = self.next_state self.state_type = type(self.next_state) self.type_conversion_to_tuple() if self.next_state is not None: if self.next_state in self.children.keys(): + self.mcts.action_selection_time += time.time() - head return self.children[self.next_state].selection(simulator) else: + self.mcts.action_selection_time += time.time() - head return self.parent, self.action else: + self.mcts.action_selection_time += time.time() - head return self.parent, self.action def expansion(self, evaluator, action_num): if self.next_state is not None: prior, value = evaluator(self.next_state) self.children[self.next_state] = UCTNode(self, self.action, self.origin_state, action_num, prior, - self.parent.inverse) + mcts=self.mcts, inverse=self.parent.inverse) return value else: return 0. 
@@ -160,11 +170,23 @@ class MCTS(object): if method == "": self.root = root if method == "UCT": - self.root = UCTNode(None, None, root, action_num, prior, self.debug, inverse=inverse) + self.root = UCTNode(None, None, root, action_num, prior, mcts=self, inverse=inverse) if method == "TS": self.root = TSNode(None, None, root, action_num, prior, inverse=inverse) self.inverse = inverse + # time spend on each step + self.selection_time = 0 + self.expansion_time = 0 + self.backpropagation_time = 0 + self.action_selection_time = 0 + self.state_selection_time = 0 + self.simulate_sf_time = 0 + self.valid_mask_time = 0 + self.ndarray2list_time = 0 + self.list2tuple_time = 0 + self.check = 0 + def search(self, max_step=None, max_time=None): step = 0 start_time = time.time() @@ -175,23 +197,25 @@ class MCTS(object): if max_step is None and max_time is None: raise ValueError("Need a stop criteria!") - selection_time = 0 - expansion_time = 0 - backprop_time = 0 - self.root.clear_elapse_time() while step < max_step and time.time() - start_time < max_step: sel_time, exp_time, back_time = self._expand() - selection_time += sel_time - expansion_time += exp_time - backprop_time += back_time + self.selection_time += sel_time + self.expansion_time += exp_time + self.backpropagation_time += back_time step += 1 if (self.debug): - file = open("debug.txt", "a") + file = open("mcts_profiling.txt", "a") file.write("[" + str(self.role) + "]" - + " selection : " + str(selection_time) + "\t" - + " validmask : " + str(self.root.elapse_time) + "\t" - + " expansion : " + str(expansion_time) + "\t" - + " backprop : " + str(backprop_time) + "\t" + + " sel " + '%.3f' % self.selection_time + " " + + " sel_sta " + '%.3f' % self.state_selection_time + " " + + " valid " + '%.3f' % self.valid_mask_time + " " + + " sel_act " + '%.3f' % self.action_selection_time + " " + + " array2list " + '%.4f' % self.ndarray2list_time + " " + + " check " + str(self.check) + " " + + " list2tuple " + '%.4f' % 
self.list2tuple_time + " \t" + + " forward " + '%.3f' % self.simulate_sf_time + " " + + " exp " + '%.3f' % self.expansion_time + " " + + " bak " + '%.3f' % self.backpropagation_time + " " + "\n") file.close() From 89226b449a8d0a05ffd852805913fcf05efdca07 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Sun, 24 Dec 2017 20:57:53 +0800 Subject: [PATCH 69/98] replace try catch by isinstance collections.Hashable --- AlphaGo/.gitignore | 1 + AlphaGo/game.py | 2 +- tianshou/core/mcts/mcts.py | 29 ++++++++--------------------- 3 files changed, 10 insertions(+), 22 deletions(-) diff --git a/AlphaGo/.gitignore b/AlphaGo/.gitignore index e578e5a..ff61326 100644 --- a/AlphaGo/.gitignore +++ b/AlphaGo/.gitignore @@ -2,3 +2,4 @@ data checkpoints checkpoints_origin *.log +*.txt diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 8ffde93..a962f5c 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -33,8 +33,8 @@ class Game: if self.name == "go": self.size = 9 self.komi = 3.75 - self.history = [] self.history_length = 8 + self.history = [] self.game_engine = go.Go(size=self.size, komi=self.komi, role=self.role) self.board = [utils.EMPTY] * (self.size ** 2) elif self.name == "reversi": diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 1994284..bd21e09 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -1,23 +1,16 @@ import numpy as np import math import time +import sys +import collections c_puct = 5 - -def list2tuple(list): - try: - return tuple(list2tuple(sub) for sub in list) - except TypeError: - return list - - -def tuple2list(tuple): - try: - return list(tuple2list(sub) for sub in tuple) - except TypeError: - return tuple - +def list2tuple(obj): + if isinstance(obj, collections.Hashable): + return obj + else: + return tuple(list2tuple(sub) for sub in obj) class MCTSNode(object): def __init__(self, parent, action, state, action_num, prior, inverse=False): @@ -38,7 +31,6 @@ class MCTSNode(object): def valid_mask(self, 
simulator): pass - class UCTNode(MCTSNode): def __init__(self, parent, action, state, action_num, prior, mcts, inverse=False): super(UCTNode, self).__init__(parent, action, state, action_num, prior, inverse) @@ -119,12 +111,7 @@ class ActionNode(object): t2 = time.time() self.mcts.ndarray2list_time += t1 - t0 self.mcts.list2tuple_time += t2 - t1 - - def type_conversion_to_origin(self): - if isinstance(self.state_type, np.ndarray): - self.next_state = np.array(self.next_state) - if isinstance(self.state_type, np.ndarray): - self.next_state = tuple2list(self.next_state) + self.mcts.check += sys.getsizeof(object) def selection(self, simulator): head = time.time() From 70824a3612632fa8a81c039774d1efd03cf17881 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Mon, 25 Dec 2017 15:09:26 +0800 Subject: [PATCH 70/98] remove historical file data.py --- AlphaGo/data.py | 84 ------------------------------------------------- 1 file changed, 84 deletions(-) delete mode 100644 AlphaGo/data.py diff --git a/AlphaGo/data.py b/AlphaGo/data.py deleted file mode 100644 index 464ebb9..0000000 --- a/AlphaGo/data.py +++ /dev/null @@ -1,84 +0,0 @@ -import os -import threading -import numpy as np - -size = 9 -path = "/home/yama/leela-zero/data/npz-files/" -name = os.listdir(path) -print(len(name)) -thread_num = 17 -batch_num = len(name) // thread_num - -def integrate(name, index): - boards = np.zeros([0, size, size, 17]) - wins = np.zeros([0, 1]) - ps = np.zeros([0, size**2 + 1]) - for n in name: - data = np.load(path + n) - board = data["state"] - win = data["winner"] - p = data["prob"] - # board = np.zeros([0, size, size, 17]) - # win = np.zeros([0, 1]) - # p = np.zeros([0, size**2 + 1]) - # for i in range(data["boards"].shape[3]): - # board = np.concatenate([board, data["boards"][:,:,:,i].reshape(-1, size, size, 17)], axis=0) - # win = np.concatenate([win, data["win"][:,i].reshape(-1, 1)], axis=0) - # p = np.concatenate([p, data["p"][:,i].reshape(-1, size**2 + 1)], 
axis=0) - boards = np.concatenate([boards, board], axis=0) - wins = np.concatenate([wins, win], axis=0) - ps = np.concatenate([ps, p], axis=0) - # print("Finish " + n) - print ("Integration {} Finished!".format(index)) - board_ori = boards - win_ori = wins - p_ori = ps - for i in range(1, 3): - board = np.rot90(board_ori, i, (1, 2)) - p = np.concatenate( - [np.rot90(p_ori[:, :-1].reshape(-1, size, size), i, (1, 2)).reshape(-1, size**2), p_ori[:, -1].reshape(-1, 1)], - axis=1) - boards = np.concatenate([boards, board], axis=0) - wins = np.concatenate([wins, win_ori], axis=0) - ps = np.concatenate([ps, p], axis=0) - - board = board_ori[:, ::-1] - p = np.concatenate([p_ori[:, :-1].reshape(-1, size, size)[:, ::-1].reshape(-1, size**2), p_ori[:, -1].reshape(-1, 1)], - axis=1) - boards = np.concatenate([boards, board], axis=0) - wins = np.concatenate([wins, win_ori], axis=0) - ps = np.concatenate([ps, p], axis=0) - - board = board_ori[:, :, ::-1] - p = np.concatenate([p_ori[:, :-1].reshape(-1, size, size)[:, :, ::-1].reshape(-1, size**2), p_ori[:, -1].reshape(-1, 1)], - axis=1) - boards = np.concatenate([boards, board], axis=0) - wins = np.concatenate([wins, win_ori], axis=0) - ps = np.concatenate([ps, p], axis=0) - - board = board_ori[:, ::-1] - p = np.concatenate( - [np.rot90(p_ori[:, :-1].reshape(-1, size, size)[:, ::-1], 1, (1, 2)).reshape(-1, size**2), p_ori[:, -1].reshape(-1, 1)], - axis=1) - boards = np.concatenate([boards, np.rot90(board, 1, (1, 2))], axis=0) - wins = np.concatenate([wins, win_ori], axis=0) - ps = np.concatenate([ps, p], axis=0) - - board = board_ori[:, :, ::-1] - p = np.concatenate( - [np.rot90(p_ori[:, :-1].reshape(-1, size, size)[:, :, ::-1], 1, (1, 2)).reshape(-1, size**2), - p_ori[:, -1].reshape(-1, 1)], - axis=1) - boards = np.concatenate([boards, np.rot90(board, 1, (1, 2))], axis=0) - wins = np.concatenate([wins, win_ori], axis=0) - ps = np.concatenate([ps, p], axis=0) - - np.savez("/home/tongzheng/data/data-" + str(index), state=boards, 
winner=wins, prob=ps) - print ("Thread {} has finished.".format(index)) -thread_list = list() -for i in range(thread_num): - thread_list.append(threading.Thread(target=integrate, args=(name[batch_num * i:batch_num * (i + 1)], i,))) -for thread in thread_list: - thread.start() -for thread in thread_list: - thread.join() From 0fdbaef1a19e6de4ae866fd59ea05428dfe12bfa Mon Sep 17 00:00:00 2001 From: mcgrady00h <281130306@qq.com> Date: Mon, 25 Dec 2017 15:33:17 +0800 Subject: [PATCH 71/98] add '()' to support python3 --- AlphaGo/play.py | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 5777982..4e4aa6f 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -5,7 +5,14 @@ import re import Pyro4 import time import os -import cPickle + +python_version = sys.version_info + +if python_version < (3, 0): + import cPickle +else: + import _pickle as cPickle + class Data(object): def __init__(self): @@ -53,7 +60,7 @@ if __name__ == '__main__': # start a name server if no name server exists if len(os.popen('ps aux | grep pyro4-ns | grep -v grep').readlines()) == 0: start_new_server = subprocess.Popen(['pyro4-ns', '&']) - print "Start Name Sever : " + str(start_new_server.pid) # + str(start_new_server.wait()) + print("Start Name Sever : " + str(start_new_server.pid)) # + str(start_new_server.wait()) time.sleep(1) # start two different player with different network weights. @@ -73,12 +80,15 @@ if __name__ == '__main__': server_list = "" while (black_role_name not in server_list) or (white_role_name not in server_list): - server_list = subprocess.check_output(['pyro4-nsc', 'list']) - print "Waiting for the server start..." 
+ if python_version < (3, 0): + server_list = subprocess.check_output(['pyro4-nsc', 'list']) + else: + server_list = subprocess.check_output(['pyro4-nsc', 'list']) + print("Waiting for the server start...") time.sleep(1) - print server_list - print "Start black player at : " + str(agent_v0.pid) - print "Start white player at : " + str(agent_v1.pid) + print(server_list) + print("Start black player at : " + str(agent_v0.pid)) + print("Start white player at : " + str(agent_v1.pid)) data = Data() player = [None] * 2 @@ -109,12 +119,12 @@ if __name__ == '__main__': board = eval(board[board.index('['):board.index(']') + 1]) for i in range(size[game_name]): for j in range(size[game_name]): - print show[board[i * size[game_name] + j]] + " ", - print "\n", + print(show[board[i * size[game_name] + j]] + " ",) + print("\n",) data.boards.append(board) start_time = time.time() move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn] + '\n') - print role[turn] + " : " + str(move), + print(role[turn] + " : " + str(move),) num += 1 match = re.search(pattern, move) if match is not None: @@ -133,7 +143,7 @@ if __name__ == '__main__': prob = eval(prob) data.probs.append(prob) score = player[turn].run_cmd(str(num) + ' get_score') - print "Finished : ", score.split(" ")[1] + print("Finished : ", score.split(" ")[1]) # TODO: generalize the player if eval(score.split(" ")[1]) > 0: data.winner = 1 @@ -157,8 +167,8 @@ if __name__ == '__main__': print(e) subprocess.call(["kill", "-9", str(agent_v0.pid)]) subprocess.call(["kill", "-9", str(agent_v1.pid)]) - print "Kill all player, finish all game." + print("Kill all player, finish all game.") subprocess.call(["kill", "-9", str(agent_v0.pid)]) subprocess.call(["kill", "-9", str(agent_v1.pid)]) - print "Kill all player, finish all game." 
+ print("Kill all player, finish all game.") From 64da200e5d4d4cff8c1642f4def897cefadbb87d Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Mon, 25 Dec 2017 16:26:51 +0800 Subject: [PATCH 72/98] move , from inside of () to outside of () --- AlphaGo/play.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 4e4aa6f..b3cc02a 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -75,7 +75,7 @@ if __name__ == '__main__': agent_v1 = subprocess.Popen( ['python', '-u', 'player.py', '--game=' + game_name, '--role=' + white_role_name, - '--checkpoint_path=' + str(args.black_weight_path), '--debug=' + str(args.debug)], + '--checkpoint_path=' + str(args.white_weight_path), '--debug=' + str(args.debug)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) server_list = "" @@ -119,12 +119,12 @@ if __name__ == '__main__': board = eval(board[board.index('['):board.index(']') + 1]) for i in range(size[game_name]): for j in range(size[game_name]): - print(show[board[i * size[game_name] + j]] + " ",) - print("\n",) + print(show[board[i * size[game_name] + j]] + " "), + print("\n"), data.boards.append(board) start_time = time.time() move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn] + '\n') - print(role[turn] + " : " + str(move),) + print(role[turn] + " : " + str(move)), num += 1 match = re.search(pattern, move) if match is not None: From fcb160dff674f3d587dfd61a79ceffaeacb18ba1 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Mon, 25 Dec 2017 16:35:43 +0800 Subject: [PATCH 73/98] fix python 2,3 print format error --- AlphaGo/play.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AlphaGo/play.py b/AlphaGo/play.py index b3cc02a..e67621a 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -143,7 +143,7 @@ if __name__ == '__main__': prob = eval(prob) data.probs.append(prob) score = player[turn].run_cmd(str(num) + ' get_score') - print("Finished : ", score.split(" ")[1]) + 
print("Finished : {}".format(score.split(" ")[1])) # TODO: generalize the player if eval(score.split(" ")[1]) > 0: data.winner = 1 From 4379f4c0fd87ff8925724a4db67fad99bdff3098 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Mon, 25 Dec 2017 16:40:38 +0800 Subject: [PATCH 74/98] modify play.py for better experience --- AlphaGo/play.py | 59 +++++++++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 21 deletions(-) diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 5777982..6526f13 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -7,6 +7,7 @@ import time import os import cPickle + class Data(object): def __init__(self): self.boards = [] @@ -24,15 +25,16 @@ if __name__ == '__main__': """ # TODO : we should set the network path in a more configurable way. parser = argparse.ArgumentParser() - parser.add_argument("--result_path", type=str, default="./data/") + parser.add_argument("--data_path", type=str, default="./data/") parser.add_argument("--black_weight_path", type=str, default=None) parser.add_argument("--white_weight_path", type=str, default=None) - parser.add_argument("--id", type=int, default=0) + parser.add_argument("--id", type=int, default=-1) parser.add_argument("--debug", type=bool, default=False) + parser.add_argument("--game", type=str, default="go") args = parser.parse_args() - if not os.path.exists(args.result_path): - os.mkdir(args.result_path) + if not os.path.exists(args.data_path): + os.mkdir(args.data_path) # black_weight_path = "./checkpoints" # white_weight_path = "./checkpoints_origin" if args.black_weight_path is not None and (not os.path.exists(args.black_weight_path)): @@ -57,18 +59,34 @@ if __name__ == '__main__': time.sleep(1) # start two different player with different network weights. 
+ server_list = subprocess.check_output(['pyro4-nsc', 'list']) + index = [] + if server_list is not None: + server_list = server_list.split("\n")[3:-2] + for s in server_list: + id = s.split(" ")[0][5:] + index.append(eval(id)) + index.sort() + if args.id == -1: + if index: + args.id = index[-1] + 1 + else: + args.id = 0 + else: + if args.id in index: + raise ValueError("Name exists in name server!") + black_role_name = 'black' + str(args.id) white_role_name = 'white' + str(args.id) - game_name = 'go' agent_v0 = subprocess.Popen( - ['python', '-u', 'player.py', '--game=' + game_name, '--role=' + black_role_name, + ['python', '-u', 'player.py', '--game=' + args.game, '--role=' + black_role_name, '--checkpoint_path=' + str(args.black_weight_path), '--debug=' + str(args.debug)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) agent_v1 = subprocess.Popen( - ['python', '-u', 'player.py', '--game=' + game_name, '--role=' + white_role_name, - '--checkpoint_path=' + str(args.black_weight_path), '--debug=' + str(args.debug)], + ['python', '-u', 'player.py', '--game=' + args.game, '--role=' + white_role_name, + '--checkpoint_path=' + str(args.white_weight_path), '--debug=' + str(args.debug)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) server_list = "" @@ -103,13 +121,13 @@ if __name__ == '__main__': pass_flag = [False, False] print("Start game {}".format(game_num)) # end the game if both palyer chose to pass, or play too much turns - while not (pass_flag[0] and pass_flag[1]) and num < size[game_name] ** 2 * 2: + while not (pass_flag[0] and pass_flag[1]) and num < size[args.game] ** 2 * 2: turn = num % 2 board = player[turn].run_cmd(str(num) + ' show_board') board = eval(board[board.index('['):board.index(']') + 1]) - for i in range(size[game_name]): - for j in range(size[game_name]): - print show[board[i * size[game_name] + j]] + " ", + for i in range(size[args.game]): + for j in range(size[args.game]): + print 
show[board[i * size[args.game] + j]] + " ", print "\n", data.boards.append(board) start_time = time.time() @@ -141,24 +159,23 @@ if __name__ == '__main__': data.winner = -1 player[0].run_cmd(str(num) + ' clear_board') player[1].run_cmd(str(num) + ' clear_board') - file_list = os.listdir(args.result_path) + file_list = os.listdir(args.data_path) if not file_list: data_num = 0 else: - file_list.sort(key=lambda file: os.path.getmtime(args.result_path + file) if not os.path.isdir( - args.result_path + file) else 0) + file_list.sort(key=lambda file: os.path.getmtime(args.data_path + file) if not os.path.isdir( + args.data_path + file) else 0) data_num = eval(file_list[-1][:-4]) + 1 with open("./data/" + str(data_num) + ".pkl", "wb") as file: picklestring = cPickle.dump(data, file) data.reset() game_num += 1 + except KeyboardInterrupt: + pass - except Exception as e: - print(e) - subprocess.call(["kill", "-9", str(agent_v0.pid)]) - subprocess.call(["kill", "-9", str(agent_v1.pid)]) - print "Kill all player, finish all game." - + ns = Pyro4.locateNS() + ns.unregister(black_role_name) + ns.unregister(white_role_name) subprocess.call(["kill", "-9", str(agent_v0.pid)]) subprocess.call(["kill", "-9", str(agent_v1.pid)]) print "Kill all player, finish all game." 
From 76f641a0f1b0583ccd2bee2892f996be970152f9 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Mon, 25 Dec 2017 16:51:44 +0800 Subject: [PATCH 75/98] minor fixed --- AlphaGo/play.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/AlphaGo/play.py b/AlphaGo/play.py index c0bdc5b..6b57b86 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -92,7 +92,7 @@ if __name__ == '__main__': stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) agent_v1 = subprocess.Popen( - ['python', '-u', 'player.py', '--game=' + game_name, '--role=' + white_role_name, + ['python', '-u', 'player.py', '--game=' + args.game, '--role=' + white_role_name, '--checkpoint_path=' + str(args.white_weight_path), '--debug=' + str(args.debug)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) @@ -183,9 +183,6 @@ if __name__ == '__main__': except KeyboardInterrupt: pass - ns = Pyro4.locateNS() - ns.unregister(black_role_name) - ns.unregister(white_role_name) subprocess.call(["kill", "-9", str(agent_v0.pid)]) subprocess.call(["kill", "-9", str(agent_v1.pid)]) print("Kill all player, finish all game.") From 725fc2c04eb7b98350684519dabbd7fdd48b32ea Mon Sep 17 00:00:00 2001 From: rtz19970824 Date: Tue, 26 Dec 2017 13:17:46 +0800 Subject: [PATCH 76/98] pass the checkpoint path to the model --- AlphaGo/game.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index a962f5c..72ae2e0 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -22,8 +22,8 @@ import time class Game: ''' Load the real game and trained weights. - - TODO : Maybe merge with the engine class in future, + + TODO : Maybe merge with the engine class in future, currently leave it untouched for interacting with Go UI. 
''' def __init__(self, name=None, role=None, debug=False, checkpoint_path=None): @@ -46,7 +46,7 @@ class Game: else: raise ValueError(name + " is an unknown game...") - self.evaluator = model.ResNet(self.size, self.size ** 2 + 1, history_length=self.history_length) + self.evaluator = model.ResNet(self.size, self.size ** 2 + 1, history_length=self.history_length, checkpoint_path=checkpoint_path) self.latest_boards = deque(maxlen=self.history_length) for _ in range(self.history_length): self.latest_boards.append(self.board) From aa6b5434c673c8d7c83c290bacd4a92b1ac0832b Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Tue, 26 Dec 2017 14:46:14 +0800 Subject: [PATCH 77/98] add debuf info for mcts and add softmax for the prior --- AlphaGo/game.py | 15 +++++++++++---- AlphaGo/model.py | 4 ++-- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 72ae2e0..ec39f94 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -71,6 +71,13 @@ class Game: mcts = MCTS(self.game_engine, self.evaluator, [latest_boards, color], self.size ** 2 + 1, role=self.role, debug=self.debug, inverse=True) mcts.search(max_step=100) + if self.debug: + file = open("mcts_debug.log", 'ab') + np.savetxt(file, mcts.root.Q, header="\nQ value : ", fmt='%.4f', newline=", ") + np.savetxt(file, mcts.root.W, header="\nW value : ", fmt='%.4f', newline=", ") + np.savetxt(file, mcts.root.N, header="\nN value : ", fmt="%d", newline=", ") + np.savetxt(file, mcts.root.prior, header="\nprior : ", fmt='%.4f', newline=", ") + file.close() temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) choice = np.random.choice(self.size ** 2 + 1, 1, p=prob).tolist()[0] @@ -119,7 +126,7 @@ class Game: sys.stdout.flush() if __name__ == "__main__": - print("test game.py") - #file = open("debug.txt", "a") - #file.write("mcts check\n") - #file.close() + game = Game(name="go", checkpoint_path="./checkpoint") + game.debug = True + game.think_play_move(utils.BLACK) + diff --git 
a/AlphaGo/model.py b/AlphaGo/model.py index 0549f41..704a034 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -80,7 +80,7 @@ class Data(object): class ResNet(object): - def __init__(self, board_size, action_num, history_length=1, residual_block_num=20, checkpoint_path=None): + def __init__(self, board_size, action_num, history_length=1, residual_block_num=10, checkpoint_path=None): """ the resnet model @@ -161,7 +161,7 @@ class ResNet(object): 'The length of history cannot meet the need of the model, given {}, need {}'.format(len(history), self.history_length)) state = self._history2state(history, color) - return self.sess.run([self.p, self.v], feed_dict={self.x: state, self.is_training: False}) + return self.sess.run([tf.nn.softmax(self.p), self.v], feed_dict={self.x: state, self.is_training: False}) def _history2state(self, history, color): """ From 8f508c790b0b8351e1dfab25df7416337dfb8ac0 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Tue, 26 Dec 2017 15:07:15 +0800 Subject: [PATCH 78/98] add role for mcts debug --- AlphaGo/game.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index ec39f94..d123a92 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -73,10 +73,10 @@ class Game: mcts.search(max_step=100) if self.debug: file = open("mcts_debug.log", 'ab') - np.savetxt(file, mcts.root.Q, header="\nQ value : ", fmt='%.4f', newline=", ") - np.savetxt(file, mcts.root.W, header="\nW value : ", fmt='%.4f', newline=", ") - np.savetxt(file, mcts.root.N, header="\nN value : ", fmt="%d", newline=", ") - np.savetxt(file, mcts.root.prior, header="\nprior : ", fmt='%.4f', newline=", ") + np.savetxt(file, mcts.root.Q, header="\n" + self.role + " Q value : ", fmt='%.4f', newline=", ") + np.savetxt(file, mcts.root.W, header="\n" + self.role + " W value : ", fmt='%.4f', newline=", ") + np.savetxt(file, mcts.root.N, header="\n" + self.role + " N value : ", fmt="%d", newline=", ") + np.savetxt(file, 
mcts.root.prior, header="\n" + self.role + " prior : ", fmt='%.4f', newline=", ") file.close() temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) From 029ab199f4a8da3fd15897cd9f3ef830467ad578 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Tue, 26 Dec 2017 16:47:24 +0800 Subject: [PATCH 79/98] add softmax for mcts root node --- AlphaGo/model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/AlphaGo/model.py b/AlphaGo/model.py index 704a034..dbfc5ca 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -124,6 +124,7 @@ class ResNet(object): h = residual_block(h, self.is_training) self.v = value_head(h, self.is_training) self.p = policy_head(h, self.is_training, self.action_num) + self.prob = tf.nn.softmax(self.p) self.value_loss = tf.reduce_mean(tf.square(self.z - self.v)) self.policy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.pi, logits=self.p)) @@ -161,7 +162,7 @@ class ResNet(object): 'The length of history cannot meet the need of the model, given {}, need {}'.format(len(history), self.history_length)) state = self._history2state(history, color) - return self.sess.run([tf.nn.softmax(self.p), self.v], feed_dict={self.x: state, self.is_training: False}) + return self.sess.run([self.prob, self.v], feed_dict={self.x: state, self.is_training: False}) def _history2state(self, history, color): """ From 0c3ff3bf373f8c3c12a9572de9dac568e7cb69eb Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Tue, 26 Dec 2017 19:29:35 +0800 Subject: [PATCH 80/98] delete unused code --- AlphaGo/.gitignore | 2 +- AlphaGo/engine.py | 16 +++++----------- AlphaGo/play.py | 46 +++++++++++++++++++++------------------------- AlphaGo/player.py | 14 ++------------ 4 files changed, 29 insertions(+), 49 deletions(-) diff --git a/AlphaGo/.gitignore b/AlphaGo/.gitignore index ff61326..38ff946 100644 --- a/AlphaGo/.gitignore +++ b/AlphaGo/.gitignore @@ -1,5 +1,5 @@ data checkpoints -checkpoints_origin +random *.log *.txt diff --git 
a/AlphaGo/engine.py b/AlphaGo/engine.py index b662dbd..d298aea 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -13,8 +13,6 @@ import utils class GTPEngine(): def __init__(self, **kwargs): - self.size = 9 - self.komi = 6.5 try: self._game = kwargs['game_obj'] self._game.clear() @@ -143,11 +141,9 @@ class GTPEngine(): self.disconnect = True return None, True - def cmd_boardsize(self, args, **kwargs): - if args.isdigit(): - size = int(args) - self.size = size - self._game.set_size(size) + def cmd_boardsize(self, board_size, **kwargs): + if board_size.isdigit(): + self._game.set_size(int(board_size)) return None, True else: return 'non digit size', False @@ -156,11 +152,9 @@ class GTPEngine(): self._game.clear() return None, True - def cmd_komi(self, args, **kwargs): + def cmd_komi(self, komi, **kwargs): try: - komi = float(args) - self.komi = komi - self._game.set_komi(komi) + self._game.set_komi(float(komi)) return None, True except ValueError: raise ValueError("syntax error") diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 6b57b86..884d2ab 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -5,6 +5,8 @@ import re import Pyro4 import time import os +import utils +from time import gmtime, strftime python_version = sys.version_info @@ -13,8 +15,6 @@ if python_version < (3, 0): else: import _pickle as cPickle - - class Data(object): def __init__(self): self.boards = [] @@ -45,9 +45,9 @@ if __name__ == '__main__': # black_weight_path = "./checkpoints" # white_weight_path = "./checkpoints_origin" if args.black_weight_path is not None and (not os.path.exists(args.black_weight_path)): - raise ValueError("Can't not find the network weights for black player.") + raise ValueError("Can't find the network weights for black player.") if args.white_weight_path is not None and (not os.path.exists(args.white_weight_path)): - raise ValueError("Can't not find the network weights for white player.") + raise ValueError("Can't find the network weights for white player.") 
# kill the old server # kill_old_server = subprocess.Popen(['killall', 'pyro4-ns']) @@ -86,27 +86,29 @@ if __name__ == '__main__': black_role_name = 'black' + str(args.id) white_role_name = 'white' + str(args.id) - agent_v0 = subprocess.Popen( + #TODO : check if we can get the output of player from the stdout, for debug convenience + black_player = subprocess.Popen( ['python', '-u', 'player.py', '--game=' + args.game, '--role=' + black_role_name, '--checkpoint_path=' + str(args.black_weight_path), '--debug=' + str(args.debug)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - agent_v1 = subprocess.Popen( + white_player = subprocess.Popen( ['python', '-u', 'player.py', '--game=' + args.game, '--role=' + white_role_name, - '--checkpoint_path=' + str(args.white_weight_path), '--debug=' + str(args.debug)], + '--checkpoint_path=' + str(args.white_weight_path), '--debug=' + str(args.debug)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) server_list = "" while (black_role_name not in server_list) or (white_role_name not in server_list): if python_version < (3, 0): + # TODO : @renyong what is the difference between those two options? 
server_list = subprocess.check_output(['pyro4-nsc', 'list']) else: server_list = subprocess.check_output(['pyro4-nsc', 'list']) print("Waiting for the server start...") time.sleep(1) print(server_list) - print("Start black player at : " + str(agent_v0.pid)) - print("Start white player at : " + str(agent_v1.pid)) + print("Start black player at : " + str(black_player.pid)) + print("Start white player at : " + str(white_player.pid)) data = Data() player = [None] * 2 @@ -121,7 +123,7 @@ if __name__ == '__main__': size = {"go":9, "reversi":8} show = ['.', 'X', 'O'] - evaluate_rounds = 1 + evaluate_rounds = 100 game_num = 0 try: #while True: @@ -141,8 +143,8 @@ if __name__ == '__main__': print "\n", data.boards.append(board) start_time = time.time() - move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn] + '\n') - print(role[turn] + " : " + str(move)), + move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn])[:-1] + print("\n" + role[turn] + " : " + str(move)), num += 1 match = re.search(pattern, move) if match is not None: @@ -160,29 +162,23 @@ if __name__ == '__main__': prob = prob.replace('],', ']') prob = eval(prob) data.probs.append(prob) - score = player[turn].run_cmd(str(num) + ' get_score') + score = player[0].run_cmd(str(num) + ' get_score') print("Finished : {}".format(score.split(" ")[1])) - # TODO: generalize the player if eval(score.split(" ")[1]) > 0: - data.winner = 1 + data.winner = utils.BLACK if eval(score.split(" ")[1]) < 0: - data.winner = -1 + data.winner = utils.WHITE player[0].run_cmd(str(num) + ' clear_board') player[1].run_cmd(str(num) + ' clear_board') file_list = os.listdir(args.data_path) - if not file_list: - data_num = 0 - else: - file_list.sort(key=lambda file: os.path.getmtime(args.data_path + file) if not os.path.isdir( - args.data_path + file) else 0) - data_num = eval(file_list[-1][:-4]) + 1 - with open("./data/" + str(data_num) + ".pkl", "wb") as file: + current_time = strftime("%Y%m%d_%H%M%S", gmtime()) + with 
open(args.data_path + current_time + ".pkl", "wb") as file: picklestring = cPickle.dump(data, file) data.reset() game_num += 1 except KeyboardInterrupt: pass - subprocess.call(["kill", "-9", str(agent_v0.pid)]) - subprocess.call(["kill", "-9", str(agent_v1.pid)]) + subprocess.call(["kill", "-9", str(black_player.pid)]) + subprocess.call(["kill", "-9", str(white_player.pid)]) print("Kill all player, finish all game.") diff --git a/AlphaGo/player.py b/AlphaGo/player.py index a8f61c1..b93c124 100644 --- a/AlphaGo/player.py +++ b/AlphaGo/player.py @@ -1,8 +1,5 @@ import argparse -import time -import sys import Pyro4 - from game import Game from engine import GTPEngine @@ -17,10 +14,8 @@ class Player(object): self.engine = kwargs['engine'] def run_cmd(self, command): - #return "inside the Player of player.py" return self.engine.run_cmd(command) - if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument("--checkpoint_path", type=str, default=None) @@ -29,12 +24,7 @@ if __name__ == '__main__': parser.add_argument("--game", type=str, default=False) args = parser.parse_args() - if args.checkpoint_path == 'None': - args.checkpoint_path = None - debug = False - if args.debug == "True": - debug = True - game = Game(name=args.game, role=args.role, checkpoint_path=args.checkpoint_path, debug=debug) + game = Game(name=args.game, role=args.role, checkpoint_path=eval(args.checkpoint_path), debug=eval(args.debug)) engine = GTPEngine(game_obj=game, name='tianshou', version=0) daemon = Pyro4.Daemon() # make a Pyro daemon @@ -43,7 +33,7 @@ if __name__ == '__main__': print "Init " + args.role + " player finished" uri = daemon.register(player) # register the greeting maker as a Pyro object print "Start on name " + args.role - ns.register(args.role, uri) # register the object with a name in the name server + ns.register(args.role, uri) # register the object with a name in the name server print "Start Request Loop " + str(uri) daemon.requestLoop() # start the event 
loop of the server to wait for calls From 7f0565a5f65b7784ba7145bcce237a09aff8f632 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Tue, 26 Dec 2017 22:19:10 +0800 Subject: [PATCH 81/98] variable rename and delete redundant code --- AlphaGo/game.py | 9 +++------ tianshou/core/mcts/mcts.py | 13 +++++++------ 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index d123a92..f17c7af 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -46,7 +46,8 @@ class Game: else: raise ValueError(name + " is an unknown game...") - self.evaluator = model.ResNet(self.size, self.size ** 2 + 1, history_length=self.history_length, checkpoint_path=checkpoint_path) + self.evaluator = model.ResNet(self.size, self.size ** 2 + 1, history_length=self.history_length, + checkpoint_path=checkpoint_path) self.latest_boards = deque(maxlen=self.history_length) for _ in range(self.history_length): self.latest_boards.append(self.board) @@ -91,11 +92,7 @@ class Game: # this function can be called directly to play the opponent's move if vertex == utils.PASS: return True - # TODO this implementation is not very elegant - if self.name == "go": - res = self.game_engine.executor_do_move(self.history, self.latest_boards, self.board, color, vertex) - elif self.name == "reversi": - res = self.game_engine.executor_do_move(self.history, self.latest_boards, self.board, color, vertex) + res = self.game_engine.executor_do_move(self.history, self.latest_boards, self.board, color, vertex) return res def think_play_move(self, color): diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 5c96d38..3d547c6 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -129,6 +129,7 @@ class ActionNode(object): self.mcts.action_selection_time += time.time() - head return self.parent, self.action else: + # self.next_state is None means we have reach the terminate state self.mcts.action_selection_time += time.time() - head return self.parent, 
self.action @@ -147,20 +148,20 @@ class ActionNode(object): class MCTS(object): - def __init__(self, simulator, evaluator, root, action_num, method="UCT", + def __init__(self, simulator, evaluator, start_state, action_num, method="UCT", role="unknown", debug=False, inverse=False): self.simulator = simulator self.evaluator = evaluator self.role = role self.debug = debug - prior, _ = self.evaluator(root) + prior, _ = self.evaluator(start_state) self.action_num = action_num if method == "": - self.root = root + self.root = start_state if method == "UCT": - self.root = UCTNode(None, None, root, action_num, prior, mcts=self, inverse=inverse) + self.root = UCTNode(None, None, start_state, action_num, prior, mcts=self, inverse=inverse) if method == "TS": - self.root = TSNode(None, None, root, action_num, prior, inverse=inverse) + self.root = TSNode(None, None, start_state, action_num, prior, inverse=inverse) self.inverse = inverse # time spend on each step @@ -191,7 +192,7 @@ class MCTS(object): self.expansion_time += exp_time self.backpropagation_time += back_time step += 1 - if (self.debug): + if self.debug: file = open("mcts_profiling.txt", "a") file.write("[" + str(self.role) + "]" + " sel " + '%.3f' % self.selection_time + " " From c788b253fbf27706d0cac693f6c02ac806376c5a Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Wed, 27 Dec 2017 01:04:09 +0800 Subject: [PATCH 82/98] show the stdout of player.py for debugging --- AlphaGo/play.py | 17 ++++++++++++----- AlphaGo/player.py | 12 ++++++++---- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 884d2ab..038953f 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -54,11 +54,6 @@ if __name__ == '__main__': # print "kill the old pyro4 name server, the return code is : " + str(kill_old_server.wait()) # time.sleep(1) - # start a name server to find the remote object - # start_new_server = subprocess.Popen(['pyro4-ns', '&']) - # print "Start Name Sever : " + 
str(start_new_server.pid) # + str(start_new_server.wait()) - # time.sleep(1) - # start a name server if no name server exists if len(os.popen('ps aux | grep pyro4-ns | grep -v grep').readlines()) == 0: start_new_server = subprocess.Popen(['pyro4-ns', '&']) @@ -91,11 +86,23 @@ if __name__ == '__main__': ['python', '-u', 'player.py', '--game=' + args.game, '--role=' + black_role_name, '--checkpoint_path=' + str(args.black_weight_path), '--debug=' + str(args.debug)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + bp_output = black_player.stdout.readline() + bp_message = bp_output + while bp_output != '' and "Start requestLoop" not in bp_output: + bp_output = black_player.stdout.readline() + bp_message += bp_output + print("============ " + black_role_name + " message ============" + "\n" + bp_message), white_player = subprocess.Popen( ['python', '-u', 'player.py', '--game=' + args.game, '--role=' + white_role_name, '--checkpoint_path=' + str(args.white_weight_path), '--debug=' + str(args.debug)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + wp_output = white_player.stdout.readline() + wp_message = wp_output + while wp_output != '' and "Start requestLoop" not in wp_output: + wp_output = white_player.stdout.readline() + wp_message += wp_output + print("============ " + white_role_name + " message ============" + "\n" + wp_message), server_list = "" while (black_role_name not in server_list) or (white_role_name not in server_list): diff --git a/AlphaGo/player.py b/AlphaGo/player.py index b93c124..8d46ae5 100644 --- a/AlphaGo/player.py +++ b/AlphaGo/player.py @@ -24,16 +24,20 @@ if __name__ == '__main__': parser.add_argument("--game", type=str, default=False) args = parser.parse_args() - game = Game(name=args.game, role=args.role, checkpoint_path=eval(args.checkpoint_path), debug=eval(args.debug)) + if args.checkpoint_path == 'None': + args.checkpoint_path = None + game = Game(name=args.game, role=args.role, + 
checkpoint_path=args.checkpoint_path, + debug=eval(args.debug)) engine = GTPEngine(game_obj=game, name='tianshou', version=0) daemon = Pyro4.Daemon() # make a Pyro daemon ns = Pyro4.locateNS() # find the name server player = Player(role=args.role, engine=engine) - print "Init " + args.role + " player finished" + print("Init " + args.role + " player finished") uri = daemon.register(player) # register the greeting maker as a Pyro object - print "Start on name " + args.role + print("Start on name " + args.role) ns.register(args.role, uri) # register the object with a name in the name server - print "Start Request Loop " + str(uri) + print("Start requestLoop " + str(uri)) daemon.requestLoop() # start the event loop of the server to wait for calls From a1f6044cba6114bc931fb69e208aa4bd1fa0e61d Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Wed, 27 Dec 2017 11:43:04 +0800 Subject: [PATCH 83/98] rewrite selection function of ActionNode for clarity, add and delete some notes --- AlphaGo/play.py | 3 +-- tianshou/core/mcts/mcts.py | 18 +++++++----------- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 038953f..7c7961c 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -24,7 +24,6 @@ class Data(object): def reset(self): self.__init__() - if __name__ == '__main__': """ Starting two different players which load network weights to evaluate the winning ratio. 
@@ -81,13 +80,13 @@ if __name__ == '__main__': black_role_name = 'black' + str(args.id) white_role_name = 'white' + str(args.id) - #TODO : check if we can get the output of player from the stdout, for debug convenience black_player = subprocess.Popen( ['python', '-u', 'player.py', '--game=' + args.game, '--role=' + black_role_name, '--checkpoint_path=' + str(args.black_weight_path), '--debug=' + str(args.debug)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) bp_output = black_player.stdout.readline() bp_message = bp_output + # '' means player.py failed to start, "Start requestLoop" means player.py start successfully while bp_output != '' and "Start requestLoop" not in bp_output: bp_output = black_player.stdout.readline() bp_message += bp_output diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 3d547c6..f64b5a0 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -97,7 +97,6 @@ class ActionNode(object): self.action = action self.children = {} self.next_state = None - self.origin_state = None self.state_type = None self.reward = 0 self.mcts = mcts @@ -118,18 +117,15 @@ class ActionNode(object): head = time.time() self.next_state, self.reward = simulator.simulate_step_forward(self.parent.state, self.action) self.mcts.simulate_sf_time += time.time() - head + if self.next_state is None: # next_state is None means that self.parent.state is the terminate state + self.mcts.action_selection_time += time.time() - head + return self.parent, self.action self.origin_state = self.next_state - self.state_type = type(self.next_state) self.type_conversion_to_tuple() - if self.next_state is not None: - if self.next_state in self.children.keys(): - self.mcts.action_selection_time += time.time() - head - return self.children[self.next_state].selection(simulator) - else: - self.mcts.action_selection_time += time.time() - head - return self.parent, self.action - else: - # self.next_state is None means we have 
reach the terminate state + if self.next_state in self.children.keys(): # next state has already visited before + self.mcts.action_selection_time += time.time() - head + return self.children[self.next_state].selection(simulator) + else: # next state is a new state never seen before self.mcts.action_selection_time += time.time() - head return self.parent, self.action From 9f6098497336d989c70b7f6fc67ebf2bc4ad6e85 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Wed, 27 Dec 2017 14:08:34 +0800 Subject: [PATCH 84/98] remove type_conversion function --- AlphaGo/model.py | 4 ++-- tianshou/core/mcts/mcts.py | 33 ++++++++++----------------------- 2 files changed, 12 insertions(+), 25 deletions(-) diff --git a/AlphaGo/model.py b/AlphaGo/model.py index dbfc5ca..6fde6e5 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -161,8 +161,8 @@ class ResNet(object): raise ValueError( 'The length of history cannot meet the need of the model, given {}, need {}'.format(len(history), self.history_length)) - state = self._history2state(history, color) - return self.sess.run([self.prob, self.v], feed_dict={self.x: state, self.is_training: False}) + eval_state = self._history2state(history, color) + return self.sess.run([self.prob, self.v], feed_dict={self.x: eval_state, self.is_training: False}) def _history2state(self, history, color): """ diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index f64b5a0..98ab056 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -6,11 +6,11 @@ import collections c_puct = 5 -def list2tuple(obj): +def hashable_conversion(obj): if isinstance(obj, collections.Hashable): return obj else: - return tuple(list2tuple(sub) for sub in obj) + return tuple(hashable_conversion(sub) for sub in obj) class MCTSNode(object): def __init__(self, parent, action, state, action_num, prior, inverse=False): @@ -79,7 +79,7 @@ class UCTNode(MCTSNode): self.mask = simulator.simulate_get_mask(self.state, range(self.action_num)) 
self.ucb[self.mask] = -float("Inf") - +# Code reserved for Thompson Sampling class TSNode(MCTSNode): def __init__(self, parent, action, state, action_num, prior, method="Gaussian", inverse=False): super(TSNode, self).__init__(parent, action, state, action_num, prior, inverse) @@ -97,22 +97,11 @@ class ActionNode(object): self.action = action self.children = {} self.next_state = None + self.next_state_hashable = None self.state_type = None self.reward = 0 self.mcts = mcts - def type_conversion_to_tuple(self): - t0 = time.time() - if isinstance(self.next_state, np.ndarray): - self.next_state = self.next_state.tolist() - t1 = time.time() - if isinstance(self.next_state, list): - self.next_state = list2tuple(self.next_state) - t2 = time.time() - self.mcts.ndarray2list_time += t1 - t0 - self.mcts.list2tuple_time += t2 - t1 - self.mcts.check += sys.getsizeof(object) - def selection(self, simulator): head = time.time() self.next_state, self.reward = simulator.simulate_step_forward(self.parent.state, self.action) @@ -120,29 +109,28 @@ class ActionNode(object): if self.next_state is None: # next_state is None means that self.parent.state is the terminate state self.mcts.action_selection_time += time.time() - head return self.parent, self.action - self.origin_state = self.next_state - self.type_conversion_to_tuple() - if self.next_state in self.children.keys(): # next state has already visited before + self.next_state_hashable = hashable_conversion(self.next_state) + if self.next_state_hashable in self.children.keys(): # next state has already visited before self.mcts.action_selection_time += time.time() - head - return self.children[self.next_state].selection(simulator) + return self.children[self.next_state_hashable].selection(simulator) else: # next state is a new state never seen before self.mcts.action_selection_time += time.time() - head return self.parent, self.action def expansion(self, evaluator, action_num): if self.next_state is not None: + # note that 
self.next_state was assigned already at the selection function prior, value = evaluator(self.next_state) - self.children[self.next_state] = UCTNode(self, self.action, self.origin_state, action_num, prior, + self.children[self.next_state_hashable] = UCTNode(self, self.action, self.next_state, action_num, prior, mcts=self.mcts, inverse=self.parent.inverse) return value - else: + else: # self.next_state is None means MCTS selected a terminate node return 0. def backpropagation(self, value): self.reward += value self.parent.backpropagation(self.action) - class MCTS(object): def __init__(self, simulator, evaluator, start_state, action_num, method="UCT", role="unknown", debug=False, inverse=False): @@ -214,6 +202,5 @@ class MCTS(object): t3 = time.time() return t1 - t0, t2 - t1, t3 - t2 - if __name__ == "__main__": pass From 8d102d249fd05a274f8d4174d061b0fc046181cb Mon Sep 17 00:00:00 2001 From: JialianLee Date: Wed, 27 Dec 2017 18:55:00 +0800 Subject: [PATCH 85/98] Modification for backpropagation process --- tianshou/core/mcts/mcts.py | 5 ++++- tianshou/core/mcts/mcts_virtual_loss.py | 8 ++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 98ab056..f733f83 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -198,7 +198,10 @@ class MCTS(object): t1 = time.time() value = node.children[new_action].expansion(self.evaluator, self.action_num) t2 = time.time() - node.children[new_action].backpropagation(value + 0.) + if self.inverse: + node.children[new_action].backpropagation(-value + 0.) + else: + node.children[new_action].backpropagation(value + 0.) 
t3 = time.time() return t1 - t0, t2 - t1, t3 - t2 diff --git a/tianshou/core/mcts/mcts_virtual_loss.py b/tianshou/core/mcts/mcts_virtual_loss.py index f27d8a3..5826bd5 100644 --- a/tianshou/core/mcts/mcts_virtual_loss.py +++ b/tianshou/core/mcts/mcts_virtual_loss.py @@ -278,8 +278,12 @@ class MCTSVirtualLoss(object): priors[i], nodes[i].inverse) - for i in range(self.batch_size): - nodes[i].children[new_actions[i]].backpropagation(values[i] + 0.) + if self.inverse: + for i in range(self.batch_size): + nodes[i].children[new_actions[i]].backpropagation(-values[i] + 0.) + else: + for i in range(self.batch_size): + nodes[i].children[new_actions[i]].backpropagation(values[i] + 0.) ##### TODO From f2291efc72cd88a55db3719b04997d94770d10bc Mon Sep 17 00:00:00 2001 From: rtz19970824 Date: Wed, 27 Dec 2017 19:54:36 +0800 Subject: [PATCH 86/98] check exists when save data --- AlphaGo/play.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 7c7961c..d1a5301 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -178,6 +178,9 @@ if __name__ == '__main__': player[1].run_cmd(str(num) + ' clear_board') file_list = os.listdir(args.data_path) current_time = strftime("%Y%m%d_%H%M%S", gmtime()) + if os.path.exists(args.data_path + current_time + ".pkl"): + time.sleep(1) + current_time = strftime("%Y%m%d_%H%M%S", gmtime()) with open(args.data_path + current_time + ".pkl", "wb") as file: picklestring = cPickle.dump(data, file) data.reset() From d48982d59ed2ca797d07ef6afee5f211a7e22aed Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Wed, 27 Dec 2017 20:49:54 +0800 Subject: [PATCH 87/98] move evaluator from action node to mcts --- AlphaGo/model.py | 2 ++ tianshou/core/mcts/mcts.py | 10 ++++------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/AlphaGo/model.py b/AlphaGo/model.py index 6fde6e5..c3bb9f0 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -156,6 +156,8 @@ class ResNet(object): # Note : maybe we can use it for 
isolating test of MCTS #prob = [1.0 / self.action_num] * self.action_num #return [prob, np.random.uniform(-1, 1)] + if state is None: + return [[0.0] * self.action_num, 0] history, color = state if len(history) != self.history_length: raise ValueError( diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index f733f83..a1b0b3d 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -117,15 +117,12 @@ class ActionNode(object): self.mcts.action_selection_time += time.time() - head return self.parent, self.action - def expansion(self, evaluator, action_num): + def expansion(self, prior, action_num): if self.next_state is not None: # note that self.next_state was assigned already at the selection function - prior, value = evaluator(self.next_state) + # self.next_state is None means MCTS selected a terminate node self.children[self.next_state_hashable] = UCTNode(self, self.action, self.next_state, action_num, prior, mcts=self.mcts, inverse=self.parent.inverse) - return value - else: # self.next_state is None means MCTS selected a terminate node - return 0. def backpropagation(self, value): self.reward += value @@ -196,7 +193,8 @@ class MCTS(object): t0 = time.time() node, new_action = self.root.selection(self.simulator) t1 = time.time() - value = node.children[new_action].expansion(self.evaluator, self.action_num) + prior, value = self.evaluator(node.children[new_action].next_state) + node.children[new_action].expansion(prior, self.action_num) t2 = time.time() if self.inverse: node.children[new_action].backpropagation(-value + 0.) 
From affd0319e283a26276e44c1359bcd72172751da5 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Wed, 27 Dec 2017 21:11:40 +0800 Subject: [PATCH 88/98] rewrite the selection fuction of UCTNode to return the action node instead of return the state node and next action --- tianshou/core/mcts/mcts.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index a1b0b3d..4c23809 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -108,14 +108,14 @@ class ActionNode(object): self.mcts.simulate_sf_time += time.time() - head if self.next_state is None: # next_state is None means that self.parent.state is the terminate state self.mcts.action_selection_time += time.time() - head - return self.parent, self.action + return self self.next_state_hashable = hashable_conversion(self.next_state) if self.next_state_hashable in self.children.keys(): # next state has already visited before self.mcts.action_selection_time += time.time() - head return self.children[self.next_state_hashable].selection(simulator) else: # next state is a new state never seen before self.mcts.action_selection_time += time.time() - head - return self.parent, self.action + return self def expansion(self, prior, action_num): if self.next_state is not None: @@ -191,15 +191,15 @@ class MCTS(object): def _expand(self): t0 = time.time() - node, new_action = self.root.selection(self.simulator) + next_action = self.root.selection(self.simulator) t1 = time.time() - prior, value = self.evaluator(node.children[new_action].next_state) - node.children[new_action].expansion(prior, self.action_num) + prior, value = self.evaluator(next_action.next_state) + next_action.expansion(prior, self.action_num) t2 = time.time() if self.inverse: - node.children[new_action].backpropagation(-value + 0.) + next_action.backpropagation(-value + 0.) else: - node.children[new_action].backpropagation(value + 0.) 
+ next_action.backpropagation(value + 0.) t3 = time.time() return t1 - t0, t2 - t1, t3 - t2 From 47676993fdd5bcd99b8d484f689245681dcd09db Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Thu, 28 Dec 2017 01:16:24 +0800 Subject: [PATCH 89/98] solve the performance bottleneck by only hashing the last board --- AlphaGo/go.py | 4 ++++ AlphaGo/reversi.py | 4 ++++ tianshou/core/mcts/mcts.py | 36 +++++++++++++----------------------- 3 files changed, 21 insertions(+), 23 deletions(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 55f5a4a..987fe93 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -222,6 +222,10 @@ class Go: new_color = -color return [history_boards, new_color], 0 + def simulate_hashable_conversion(self, state): + # since go is MDP, we only need the last board for hashing + return tuple(state[0][-1]) + def executor_do_move(self, history, latest_boards, current_board, color, vertex): if not self._rule_check(history, current_board, color, vertex): return False diff --git a/AlphaGo/reversi.py b/AlphaGo/reversi.py index c6c8a5b..08a5ec5 100644 --- a/AlphaGo/reversi.py +++ b/AlphaGo/reversi.py @@ -97,6 +97,10 @@ class Reversi: history_boards.append(new_board) return [history_boards, 0 - color], 0 + def simulate_hashable_conversion(self, state): + # since go is MDP, we only need the last board for hashing + return tuple(state[0][-1]) + def _get_winner(self, board): black_num, white_num = self._number_of_black_and_white(board) black_win = black_num - white_num diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 4c23809..9625261 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -1,17 +1,9 @@ import numpy as np import math import time -import sys -import collections c_puct = 5 -def hashable_conversion(obj): - if isinstance(obj, collections.Hashable): - return obj - else: - return tuple(hashable_conversion(sub) for sub in obj) - class MCTSNode(object): def __init__(self, parent, action, state, action_num, prior, 
inverse=False): self.parent = parent @@ -109,7 +101,9 @@ class ActionNode(object): if self.next_state is None: # next_state is None means that self.parent.state is the terminate state self.mcts.action_selection_time += time.time() - head return self - self.next_state_hashable = hashable_conversion(self.next_state) + head = time.time() + self.next_state_hashable = simulator.simulate_hashable_conversion(self.next_state) + self.mcts.hash_time += time.time() - head if self.next_state_hashable in self.children.keys(): # next state has already visited before self.mcts.action_selection_time += time.time() - head return self.children[self.next_state_hashable].selection(simulator) @@ -153,9 +147,7 @@ class MCTS(object): self.state_selection_time = 0 self.simulate_sf_time = 0 self.valid_mask_time = 0 - self.ndarray2list_time = 0 - self.list2tuple_time = 0 - self.check = 0 + self.hash_time = 0 def search(self, max_step=None, max_time=None): step = 0 @@ -174,18 +166,16 @@ class MCTS(object): self.backpropagation_time += back_time step += 1 if self.debug: - file = open("mcts_profiling.txt", "a") + file = open("mcts_profiling.log", "a") file.write("[" + str(self.role) + "]" - + " sel " + '%.3f' % self.selection_time + " " - + " sel_sta " + '%.3f' % self.state_selection_time + " " - + " valid " + '%.3f' % self.valid_mask_time + " " - + " sel_act " + '%.3f' % self.action_selection_time + " " - + " array2list " + '%.4f' % self.ndarray2list_time + " " - + " check " + str(self.check) + " " - + " list2tuple " + '%.4f' % self.list2tuple_time + " \t" - + " forward " + '%.3f' % self.simulate_sf_time + " " - + " exp " + '%.3f' % self.expansion_time + " " - + " bak " + '%.3f' % self.backpropagation_time + " " + + " sel " + '%.3f' % self.selection_time + " " + + " sel_sta " + '%.3f' % self.state_selection_time + " " + + " valid " + '%.3f' % self.valid_mask_time + " " + + " sel_act " + '%.3f' % self.action_selection_time + " " + + " hash " + '%.3f' % self.hash_time + " " + + " step forward " 
+ '%.3f' % self.simulate_sf_time + " " + + " expansion " + '%.3f' % self.expansion_time + " " + + " backprop " + '%.3f' % self.backpropagation_time + " " + "\n") file.close() From 08b6649fead4c9f550c6c8b6c51f44ac605a67a3 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Thu, 28 Dec 2017 15:52:31 +0800 Subject: [PATCH 90/98] test next_action.next_state in MCTS --- AlphaGo/model.py | 2 -- tianshou/core/mcts/mcts.py | 15 ++++++++------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/AlphaGo/model.py b/AlphaGo/model.py index c3bb9f0..6fde6e5 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -156,8 +156,6 @@ class ResNet(object): # Note : maybe we can use it for isolating test of MCTS #prob = [1.0 / self.action_num] * self.action_num #return [prob, np.random.uniform(-1, 1)] - if state is None: - return [[0.0] * self.action_num, 0] history, color = state if len(history) != self.history_length: raise ValueError( diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 9625261..1251d05 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -112,11 +112,8 @@ class ActionNode(object): return self def expansion(self, prior, action_num): - if self.next_state is not None: - # note that self.next_state was assigned already at the selection function - # self.next_state is None means MCTS selected a terminate node - self.children[self.next_state_hashable] = UCTNode(self, self.action, self.next_state, action_num, prior, - mcts=self.mcts, inverse=self.parent.inverse) + self.children[self.next_state_hashable] = UCTNode(self, self.action, self.next_state, action_num, prior, + mcts=self.mcts, inverse=self.parent.inverse) def backpropagation(self, value): self.reward += value @@ -183,8 +180,12 @@ class MCTS(object): t0 = time.time() next_action = self.root.selection(self.simulator) t1 = time.time() - prior, value = self.evaluator(next_action.next_state) - next_action.expansion(prior, self.action_num) + # next_action.next_state is 
None means the parent state node of next_action is a terminate node + if next_action.next_state is not None: + prior, value = self.evaluator(next_action.next_state) + next_action.expansion(prior, self.action_num) + else: + value = 0 t2 = time.time() if self.inverse: next_action.backpropagation(-value + 0.) From b699258e769429f881fdef34ca947e1311983be3 Mon Sep 17 00:00:00 2001 From: rtz19970824 Date: Thu, 28 Dec 2017 15:55:07 +0800 Subject: [PATCH 91/98] debug for reversi --- AlphaGo/game.py | 2 +- AlphaGo/player.py | 7 +++---- AlphaGo/reversi.py | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index f17c7af..82cf254 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -123,7 +123,7 @@ class Game: sys.stdout.flush() if __name__ == "__main__": - game = Game(name="go", checkpoint_path="./checkpoint") + game = Game(name="reversi", checkpoint_path=None) game.debug = True game.think_play_move(utils.BLACK) diff --git a/AlphaGo/player.py b/AlphaGo/player.py index 8d46ae5..bd2a2d1 100644 --- a/AlphaGo/player.py +++ b/AlphaGo/player.py @@ -18,12 +18,11 @@ class Player(object): if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument("--checkpoint_path", type=str, default=None) + parser.add_argument("--checkpoint_path", type=str, default="None") parser.add_argument("--role", type=str, default="unknown") - parser.add_argument("--debug", type=str, default=False) - parser.add_argument("--game", type=str, default=False) + parser.add_argument("--debug", type=str, default="False") + parser.add_argument("--game", type=str, default="go") args = parser.parse_args() - if args.checkpoint_path == 'None': args.checkpoint_path = None game = Game(name=args.game, role=args.role, diff --git a/AlphaGo/reversi.py b/AlphaGo/reversi.py index 08a5ec5..1685b66 100644 --- a/AlphaGo/reversi.py +++ b/AlphaGo/reversi.py @@ -99,7 +99,7 @@ class Reversi: def simulate_hashable_conversion(self, state): # since go is MDP, we 
only need the last board for hashing - return tuple(state[0][-1]) + return tuple(state[0][-1].flatten().tolist()) def _get_winner(self, board): black_num, white_num = self._number_of_black_and_white(board) From 5457e5134e3cb6c0566df17b9b79fc479e773375 Mon Sep 17 00:00:00 2001 From: JialianLee Date: Thu, 28 Dec 2017 16:20:44 +0800 Subject: [PATCH 92/98] add a unit test --- tianshou/core/mcts/unit_test/Evaluator.py | 28 +++ tianshou/core/mcts/unit_test/ZOGame.py | 70 ++++++++ tianshou/core/mcts/unit_test/agent.py | 27 +++ tianshou/core/mcts/unit_test/game.py | 37 ++++ tianshou/core/mcts/unit_test/mcts.py | 198 ++++++++++++++++++++++ 5 files changed, 360 insertions(+) create mode 100644 tianshou/core/mcts/unit_test/Evaluator.py create mode 100644 tianshou/core/mcts/unit_test/ZOGame.py create mode 100644 tianshou/core/mcts/unit_test/agent.py create mode 100644 tianshou/core/mcts/unit_test/game.py create mode 100644 tianshou/core/mcts/unit_test/mcts.py diff --git a/tianshou/core/mcts/unit_test/Evaluator.py b/tianshou/core/mcts/unit_test/Evaluator.py new file mode 100644 index 0000000..a1f9456 --- /dev/null +++ b/tianshou/core/mcts/unit_test/Evaluator.py @@ -0,0 +1,28 @@ +import numpy as np + + +class evaluator(object): + def __init__(self, env, action_num): + self.env = env + self.action_num = action_num + + def __call__(self, state): + raise NotImplementedError("Need to implement the evaluator") + + +class rollout_policy(evaluator): + def __init__(self, env, action_num): + super(rollout_policy, self).__init__(env, action_num) + self.is_terminated = False + + def __call__(self, state): + # TODO: prior for rollout policy + total_reward = 0. 
+ action = np.random.randint(0, self.action_num) + state, reward = self.env.simulate_step_forward(state, action) + total_reward += reward + while state is not None: + action = np.random.randint(0, self.action_num) + state, reward = self.env.simulate_step_forward(state, action) + total_reward += reward + return np.ones([self.action_num])/self.action_num, total_reward diff --git a/tianshou/core/mcts/unit_test/ZOGame.py b/tianshou/core/mcts/unit_test/ZOGame.py new file mode 100644 index 0000000..8a2ed54 --- /dev/null +++ b/tianshou/core/mcts/unit_test/ZOGame.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python +import numpy as np +import copy + + +class ZOTree: + def __init__(self, size): + self.size = size + self.depth = self.size * 2 + + def simulate_step_forward(self, state, action): + seq, color = copy.deepcopy(state) + if len(seq) == self.depth: + winner = self.executor_get_reward(state) + return None, color * winner + else: + seq.append(int(action)) + return [seq, 0 - color], 0 + + def executor_get_reward(self, state): + seq = np.array(state[0], dtype='int16') + length = len(seq) + if length != self.depth: + raise ValueError("The game is not terminated!") + result = np.sum(seq) + if result > 0: + winner = 1 + elif result < 0: + winner = -1 + else: + winner = 0 + return winner + + def executor_do_move(self, state, action): + seq, color = state + if len(seq) == self.depth: + return False + else: + seq.append(int(action)) + if len(seq) == self.depth: + return False + return True + + def v_value(self, state): + seq, color = state + choosen_result = np.sum(np.array(seq, dtype='int16')) + if color == 1: + if choosen_result > 0: + return 1 + elif choosen_result < 0: + return -1 + else: + return 0 + elif color == -1: + if choosen_result > 1: + return 1 + elif choosen_result < 1: + return -1 + else: + return 0 + else: + raise ValueError("Wrong color") + +if __name__ == "__main__": + size = 2 + game = ZOTree(size) + seq = [1, -1, 1, 1] + result = game.executor_do_move([seq, 1], 1) 
+ print(result) + print(seq) \ No newline at end of file diff --git a/tianshou/core/mcts/unit_test/agent.py b/tianshou/core/mcts/unit_test/agent.py new file mode 100644 index 0000000..1bffdd0 --- /dev/null +++ b/tianshou/core/mcts/unit_test/agent.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python +import numpy as np +import ZOGame +import Evaluator +from mcts import MCTS + +temp = 1 + + +class Agent: + def __init__(self, size, color): + self.size = size + self.color = color + self.simulator = ZOGame.ZOTree(self.size) + self.evaluator = Evaluator.rollout_policy(self.simulator, 2) + + def gen_move(self, seq): + if len(seq) >= 2 * self.size: + raise ValueError("Game is terminated.") + mcts = MCTS(self.simulator, self.evaluator, [seq, self.color], 2) + mcts.search(max_step=50) + N = mcts.root.N + N = np.power(N, 1.0 / temp) + prob = N / np.sum(N) + print("prob: {}".format(prob)) + action = int(np.random.binomial(1, prob[1]) * 2 - 1) + return action \ No newline at end of file diff --git a/tianshou/core/mcts/unit_test/game.py b/tianshou/core/mcts/unit_test/game.py new file mode 100644 index 0000000..7ac044c --- /dev/null +++ b/tianshou/core/mcts/unit_test/game.py @@ -0,0 +1,37 @@ +import ZOGame +import agent + + +if __name__ == '__main__': + print("Our game has 2 players.") + print("Player 1 has color 1 and plays first. 
Player 2 has color -1 and plays following player 1.") + print("Both player choose 1 or -1 for an action.") + size = 1 + print("This game has {} iterations".format(size)) + print("If the final sequence has more 1 that -1, player 1 wins.") + print("If the final sequence has less 1 that -1, player 2 wins.") + print("Otherwise, both players get 0.\n") + game = ZOGame.ZOTree(size) + player1 = agent.Agent(size, 1) + player2 = agent.Agent(size, -1) + + seq = [] + print("Sequence is {}\n".format(seq)) + while True: + action1 = player1.gen_move(seq) + print("action1 is {}".format(action1)) + result = game.executor_do_move([seq, 1], action1) + print("Sequence is {}\n".format(seq)) + if not result: + winner = game.executor_get_reward([seq, 1]) + break + action2 = player2.gen_move(seq) + print("action2 is {}".format(action2)) + result = game.executor_do_move([seq, -1], action2) + print("Sequence is {}\n".format(seq)) + if not result: + winner = game.executor_get_reward([seq, 1]) + break + + print("The choice sequence is {}".format(seq)) + print("The game result is {}".format(winner)) \ No newline at end of file diff --git a/tianshou/core/mcts/unit_test/mcts.py b/tianshou/core/mcts/unit_test/mcts.py new file mode 100644 index 0000000..1251d05 --- /dev/null +++ b/tianshou/core/mcts/unit_test/mcts.py @@ -0,0 +1,198 @@ +import numpy as np +import math +import time + +c_puct = 5 + +class MCTSNode(object): + def __init__(self, parent, action, state, action_num, prior, inverse=False): + self.parent = parent + self.action = action + self.children = {} + self.state = state + self.action_num = action_num + self.prior = np.array(prior).reshape(-1) + self.inverse = inverse + + def selection(self, simulator): + raise NotImplementedError("Need to implement function selection") + + def backpropagation(self, action): + raise NotImplementedError("Need to implement function backpropagation") + + def valid_mask(self, simulator): + pass + +class UCTNode(MCTSNode): + def __init__(self, parent, 
action, state, action_num, prior, mcts, inverse=False): + super(UCTNode, self).__init__(parent, action, state, action_num, prior, inverse) + self.Q = np.zeros([action_num]) + self.W = np.zeros([action_num]) + self.N = np.zeros([action_num]) + self.c_puct = c_puct + self.ucb = self.Q + self.c_puct * self.prior * math.sqrt(np.sum(self.N)) / (self.N + 1) + self.mask = None + self.elapse_time = 0 + self.mcts = mcts + + def selection(self, simulator): + head = time.time() + self.valid_mask(simulator) + self.mcts.valid_mask_time += time.time() - head + action = np.argmax(self.ucb) + if action in self.children.keys(): + self.mcts.state_selection_time += time.time() - head + return self.children[action].selection(simulator) + else: + self.children[action] = ActionNode(self, action, mcts=self.mcts) + self.mcts.state_selection_time += time.time() - head + return self.children[action].selection(simulator) + + def backpropagation(self, action): + action = int(action) + self.N[action] += 1 + self.W[action] += self.children[action].reward + for i in range(self.action_num): + if self.N[i] != 0: + self.Q[i] = (self.W[i] + 0.) / self.N[i] + self.ucb = self.Q + c_puct * self.prior * math.sqrt(np.sum(self.N)) / (self.N + 1.) 
+ if self.parent is not None: + if self.inverse: + self.parent.backpropagation(-self.children[action].reward) + else: + self.parent.backpropagation(self.children[action].reward) + + def valid_mask(self, simulator): + # let all invalid actions be illegal in mcts + if not hasattr(simulator, 'simulate_get_mask'): + pass + else: + if self.mask is None: + self.mask = simulator.simulate_get_mask(self.state, range(self.action_num)) + self.ucb[self.mask] = -float("Inf") + +# Code reserved for Thompson Sampling +class TSNode(MCTSNode): + def __init__(self, parent, action, state, action_num, prior, method="Gaussian", inverse=False): + super(TSNode, self).__init__(parent, action, state, action_num, prior, inverse) + if method == "Beta": + self.alpha = np.ones([action_num]) + self.beta = np.ones([action_num]) + if method == "Gaussian": + self.mu = np.zeros([action_num]) + self.sigma = np.zeros([action_num]) + + +class ActionNode(object): + def __init__(self, parent, action, mcts): + self.parent = parent + self.action = action + self.children = {} + self.next_state = None + self.next_state_hashable = None + self.state_type = None + self.reward = 0 + self.mcts = mcts + + def selection(self, simulator): + head = time.time() + self.next_state, self.reward = simulator.simulate_step_forward(self.parent.state, self.action) + self.mcts.simulate_sf_time += time.time() - head + if self.next_state is None: # next_state is None means that self.parent.state is the terminate state + self.mcts.action_selection_time += time.time() - head + return self + head = time.time() + self.next_state_hashable = simulator.simulate_hashable_conversion(self.next_state) + self.mcts.hash_time += time.time() - head + if self.next_state_hashable in self.children.keys(): # next state has already visited before + self.mcts.action_selection_time += time.time() - head + return self.children[self.next_state_hashable].selection(simulator) + else: # next state is a new state never seen before + 
self.mcts.action_selection_time += time.time() - head + return self + + def expansion(self, prior, action_num): + self.children[self.next_state_hashable] = UCTNode(self, self.action, self.next_state, action_num, prior, + mcts=self.mcts, inverse=self.parent.inverse) + + def backpropagation(self, value): + self.reward += value + self.parent.backpropagation(self.action) + +class MCTS(object): + def __init__(self, simulator, evaluator, start_state, action_num, method="UCT", + role="unknown", debug=False, inverse=False): + self.simulator = simulator + self.evaluator = evaluator + self.role = role + self.debug = debug + prior, _ = self.evaluator(start_state) + self.action_num = action_num + if method == "": + self.root = start_state + if method == "UCT": + self.root = UCTNode(None, None, start_state, action_num, prior, mcts=self, inverse=inverse) + if method == "TS": + self.root = TSNode(None, None, start_state, action_num, prior, inverse=inverse) + self.inverse = inverse + + # time spend on each step + self.selection_time = 0 + self.expansion_time = 0 + self.backpropagation_time = 0 + self.action_selection_time = 0 + self.state_selection_time = 0 + self.simulate_sf_time = 0 + self.valid_mask_time = 0 + self.hash_time = 0 + + def search(self, max_step=None, max_time=None): + step = 0 + start_time = time.time() + if max_step is None: + max_step = int("Inf") + if max_time is None: + max_time = float("Inf") + if max_step is None and max_time is None: + raise ValueError("Need a stop criteria!") + + while step < max_step and time.time() - start_time < max_step: + sel_time, exp_time, back_time = self._expand() + self.selection_time += sel_time + self.expansion_time += exp_time + self.backpropagation_time += back_time + step += 1 + if self.debug: + file = open("mcts_profiling.log", "a") + file.write("[" + str(self.role) + "]" + + " sel " + '%.3f' % self.selection_time + " " + + " sel_sta " + '%.3f' % self.state_selection_time + " " + + " valid " + '%.3f' % self.valid_mask_time 
+ " " + + " sel_act " + '%.3f' % self.action_selection_time + " " + + " hash " + '%.3f' % self.hash_time + " " + + " step forward " + '%.3f' % self.simulate_sf_time + " " + + " expansion " + '%.3f' % self.expansion_time + " " + + " backprop " + '%.3f' % self.backpropagation_time + " " + + "\n") + file.close() + + def _expand(self): + t0 = time.time() + next_action = self.root.selection(self.simulator) + t1 = time.time() + # next_action.next_state is None means the parent state node of next_action is a terminate node + if next_action.next_state is not None: + prior, value = self.evaluator(next_action.next_state) + next_action.expansion(prior, self.action_num) + else: + value = 0 + t2 = time.time() + if self.inverse: + next_action.backpropagation(-value + 0.) + else: + next_action.backpropagation(value + 0.) + t3 = time.time() + return t1 - t0, t2 - t1, t3 - t2 + +if __name__ == "__main__": + pass From 0352866b1ab1c6da3ac230479c4ebe8493ed71d0 Mon Sep 17 00:00:00 2001 From: JialianLee Date: Thu, 28 Dec 2017 16:27:28 +0800 Subject: [PATCH 93/98] Modification for game engine --- tianshou/core/mcts/unit_test/ZOGame.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tianshou/core/mcts/unit_test/ZOGame.py b/tianshou/core/mcts/unit_test/ZOGame.py index 8a2ed54..acad284 100644 --- a/tianshou/core/mcts/unit_test/ZOGame.py +++ b/tianshou/core/mcts/unit_test/ZOGame.py @@ -17,6 +17,10 @@ class ZOTree: seq.append(int(action)) return [seq, 0 - color], 0 + def simulate_hashable_conversion(self, state): + # since go is MDP, we only need the last board for hashing + return tuple(state[0]) + def executor_get_reward(self, state): seq = np.array(state[0], dtype='int16') length = len(seq) From 4140d8c9d28fd2164ebb1f1dce902b77e0d9c5b5 Mon Sep 17 00:00:00 2001 From: JialianLee Date: Thu, 28 Dec 2017 17:10:25 +0800 Subject: [PATCH 94/98] Modification on unit test --- tianshou/core/mcts/unit_test/Evaluator.py | 3 +- tianshou/core/mcts/unit_test/ZOGame.py | 36 +++++++++++++++++++++-- 
tianshou/core/mcts/unit_test/agent.py | 2 +- tianshou/core/mcts/unit_test/game.py | 6 ++-- tianshou/core/mcts/unit_test/mcts.py | 2 ++ 5 files changed, 41 insertions(+), 8 deletions(-) diff --git a/tianshou/core/mcts/unit_test/Evaluator.py b/tianshou/core/mcts/unit_test/Evaluator.py index a1f9456..f78da95 100644 --- a/tianshou/core/mcts/unit_test/Evaluator.py +++ b/tianshou/core/mcts/unit_test/Evaluator.py @@ -18,6 +18,7 @@ class rollout_policy(evaluator): def __call__(self, state): # TODO: prior for rollout policy total_reward = 0. + color = state[1] action = np.random.randint(0, self.action_num) state, reward = self.env.simulate_step_forward(state, action) total_reward += reward @@ -25,4 +26,4 @@ class rollout_policy(evaluator): action = np.random.randint(0, self.action_num) state, reward = self.env.simulate_step_forward(state, action) total_reward += reward - return np.ones([self.action_num])/self.action_num, total_reward + return np.ones([self.action_num])/self.action_num, total_reward * color diff --git a/tianshou/core/mcts/unit_test/ZOGame.py b/tianshou/core/mcts/unit_test/ZOGame.py index acad284..b598579 100644 --- a/tianshou/core/mcts/unit_test/ZOGame.py +++ b/tianshou/core/mcts/unit_test/ZOGame.py @@ -9,6 +9,7 @@ class ZOTree: self.depth = self.size * 2 def simulate_step_forward(self, state, action): + self._check_state(state) seq, color = copy.deepcopy(state) if len(seq) == self.depth: winner = self.executor_get_reward(state) @@ -18,15 +19,24 @@ class ZOTree: return [seq, 0 - color], 0 def simulate_hashable_conversion(self, state): + self._check_state(state) # since go is MDP, we only need the last board for hashing return tuple(state[0]) - + def executor_get_reward(self, state): + self._check_state(state) seq = np.array(state[0], dtype='int16') length = len(seq) if length != self.depth: raise ValueError("The game is not terminated!") - result = np.sum(seq) + ones = 0 + zeros = 0 + for i in range(len(seq)): + if seq[i] == 0: + zeros += 1 + if seq[i] == 1: 
+ ones += 1 + result = ones - zeros if result > 0: winner = 1 elif result < 0: @@ -36,6 +46,7 @@ class ZOTree: return winner def executor_do_move(self, state, action): + self._check_state(state) seq, color = state if len(seq) == self.depth: return False @@ -46,8 +57,16 @@ class ZOTree: return True def v_value(self, state): + self._check_state(state) seq, color = state - choosen_result = np.sum(np.array(seq, dtype='int16')) + ones = 0 + zeros = 0 + for i in range(len(seq)): + if seq[i] == 0: + zeros += 1 + if seq[i] == 1: + ones += 1 + choosen_result = ones - zeros if color == 1: if choosen_result > 0: return 1 @@ -65,6 +84,17 @@ class ZOTree: else: raise ValueError("Wrong color") + def _check_state(self, state): + seq, color = state + if color == 1: + if len(seq) % 2: + raise ValueError("Color is 1 but the length of seq is odd!") + elif color == -1: + if not len(seq) % 2: + raise ValueError("Color is -1 but the length of seq is even!") + else: + raise ValueError("Wrong color!") + if __name__ == "__main__": size = 2 game = ZOTree(size) diff --git a/tianshou/core/mcts/unit_test/agent.py b/tianshou/core/mcts/unit_test/agent.py index 1bffdd0..ebe346e 100644 --- a/tianshou/core/mcts/unit_test/agent.py +++ b/tianshou/core/mcts/unit_test/agent.py @@ -23,5 +23,5 @@ class Agent: N = np.power(N, 1.0 / temp) prob = N / np.sum(N) print("prob: {}".format(prob)) - action = int(np.random.binomial(1, prob[1]) * 2 - 1) + action = int(np.random.binomial(1, prob[1])) return action \ No newline at end of file diff --git a/tianshou/core/mcts/unit_test/game.py b/tianshou/core/mcts/unit_test/game.py index 7ac044c..14c2df5 100644 --- a/tianshou/core/mcts/unit_test/game.py +++ b/tianshou/core/mcts/unit_test/game.py @@ -5,11 +5,11 @@ import agent if __name__ == '__main__': print("Our game has 2 players.") print("Player 1 has color 1 and plays first. 
Player 2 has color -1 and plays following player 1.") - print("Both player choose 1 or -1 for an action.") + print("Both player choose 1 or 0 for an action.") size = 1 print("This game has {} iterations".format(size)) - print("If the final sequence has more 1 that -1, player 1 wins.") - print("If the final sequence has less 1 that -1, player 2 wins.") + print("If the final sequence has more 1 that 0, player 1 wins.") + print("If the final sequence has less 1 that 0, player 2 wins.") print("Otherwise, both players get 0.\n") game = ZOGame.ZOTree(size) player1 = agent.Agent(size, 1) diff --git a/tianshou/core/mcts/unit_test/mcts.py b/tianshou/core/mcts/unit_test/mcts.py index 1251d05..dd89f57 100644 --- a/tianshou/core/mcts/unit_test/mcts.py +++ b/tianshou/core/mcts/unit_test/mcts.py @@ -162,6 +162,8 @@ class MCTS(object): self.expansion_time += exp_time self.backpropagation_time += back_time step += 1 + print("Q = {}".format(self.root.Q)) + print("N = {}".format(self.root.N)) if self.debug: file = open("mcts_profiling.log", "a") file.write("[" + str(self.role) + "]" From 2dfab68efe58a047919b16e7c89a83d3c7f13d7f Mon Sep 17 00:00:00 2001 From: rtz19970824 Date: Thu, 28 Dec 2017 19:28:21 +0800 Subject: [PATCH 95/98] debug for unit test --- AlphaGo/game.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 82cf254..8329b1b 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -28,6 +28,8 @@ class Game: ''' def __init__(self, name=None, role=None, debug=False, checkpoint_path=None): self.name = name + if role is None: + raise ValueError("Need a role!") self.role = role self.debug = debug if self.name == "go": @@ -123,7 +125,7 @@ class Game: sys.stdout.flush() if __name__ == "__main__": - game = Game(name="reversi", checkpoint_path=None) + game = Game(name="reversi", role="black", checkpoint_path=None) game.debug = True game.think_play_move(utils.BLACK) From 63a0d32b3445cb1fc2994218d1a3cfd0ed2bbf08 Mon Sep 17 
00:00:00 2001 From: Wenbo Hu Date: Fri, 29 Dec 2017 03:30:09 +0800 Subject: [PATCH 96/98] use hash table for check_global_isomorphous --- AlphaGo/game.py | 8 ++++++-- AlphaGo/go.py | 13 +++++++------ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 82cf254..60e09f0 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -35,6 +35,7 @@ class Game: self.komi = 3.75 self.history_length = 8 self.history = [] + self.history_set = set() self.game_engine = go.Go(size=self.size, komi=self.komi, role=self.role) self.board = [utils.EMPTY] * (self.size ** 2) elif self.name == "reversi": @@ -92,7 +93,10 @@ class Game: # this function can be called directly to play the opponent's move if vertex == utils.PASS: return True - res = self.game_engine.executor_do_move(self.history, self.latest_boards, self.board, color, vertex) + if self.name == "reversi": + res = self.game_engine.executor_do_move(self.history, self.latest_boards, self.board, color, vertex) + if self.name == "go": + res = self.game_engine.executor_do_move(self.history, self.history_set, self.latest_boards, self.board, color, vertex) return res def think_play_move(self, color): @@ -124,6 +128,6 @@ class Game: if __name__ == "__main__": game = Game(name="reversi", checkpoint_path=None) - game.debug = True + game.debug = False game.think_play_move(utils.BLACK) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 987fe93..cf6b7aa 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -97,12 +97,12 @@ class Go: for b in group: current_board[self._flatten(b)] = utils.EMPTY - def _check_global_isomorphous(self, history_boards, current_board, color, vertex): + def _check_global_isomorphous(self, history_boards_set, current_board, color, vertex): repeat = False next_board = copy.deepcopy(current_board) next_board[self._flatten(vertex)] = color self._process_board(next_board, color, vertex) - if next_board in history_boards: + if hash(tuple(next_board)) in history_boards_set: repeat = 
True return repeat @@ -158,7 +158,7 @@ class Go: vertex = self._deflatten(action) return vertex - def _rule_check(self, history_boards, current_board, color, vertex): + def _rule_check(self, history_boards_set, current_board, color, vertex): ### in board if not self._in_board(vertex): return False @@ -172,7 +172,7 @@ class Go: return False ### forbid global isomorphous - if self._check_global_isomorphous(history_boards, current_board, color, vertex): + if self._check_global_isomorphous(history_boards_set, current_board, color, vertex): return False return True @@ -226,13 +226,14 @@ class Go: # since go is MDP, we only need the last board for hashing return tuple(state[0][-1]) - def executor_do_move(self, history, latest_boards, current_board, color, vertex): - if not self._rule_check(history, current_board, color, vertex): + def executor_do_move(self, history, history_set, latest_boards, current_board, color, vertex): + if not self._rule_check(history_set, current_board, color, vertex): return False current_board[self._flatten(vertex)] = color self._process_board(current_board, color, vertex) history.append(copy.deepcopy(current_board)) latest_boards.append(copy.deepcopy(current_board)) + history_set.add(hash(tuple(current_board))) return True def _find_empty(self, current_board): From 01f39f40d3df481703401ebaf2d8305f232074b6 Mon Sep 17 00:00:00 2001 From: rtz19970824 Date: Thu, 28 Dec 2017 19:38:25 +0800 Subject: [PATCH 97/98] debug for unit test --- tianshou/core/mcts/unit_test/ZOGame.py | 17 +++++------------ tianshou/core/mcts/unit_test/agent.py | 4 ++-- tianshou/core/mcts/unit_test/game.py | 4 ++-- tianshou/core/mcts/unit_test/mcts.py | 2 +- 4 files changed, 10 insertions(+), 17 deletions(-) diff --git a/tianshou/core/mcts/unit_test/ZOGame.py b/tianshou/core/mcts/unit_test/ZOGame.py index b598579..0b3d771 100644 --- a/tianshou/core/mcts/unit_test/ZOGame.py +++ b/tianshou/core/mcts/unit_test/ZOGame.py @@ -29,17 +29,10 @@ class ZOTree: length = len(seq) if 
length != self.depth: raise ValueError("The game is not terminated!") - ones = 0 - zeros = 0 - for i in range(len(seq)): - if seq[i] == 0: - zeros += 1 - if seq[i] == 1: - ones += 1 - result = ones - zeros - if result > 0: + result = np.sum(seq) + if result > self.size: winner = 1 - elif result < 0: + elif result < self.size: winner = -1 else: winner = 0 @@ -98,7 +91,7 @@ class ZOTree: if __name__ == "__main__": size = 2 game = ZOTree(size) - seq = [1, -1, 1, 1] + seq = [1, 0, 1, 1] result = game.executor_do_move([seq, 1], 1) print(result) - print(seq) \ No newline at end of file + print(seq) diff --git a/tianshou/core/mcts/unit_test/agent.py b/tianshou/core/mcts/unit_test/agent.py index ebe346e..6dd34aa 100644 --- a/tianshou/core/mcts/unit_test/agent.py +++ b/tianshou/core/mcts/unit_test/agent.py @@ -17,11 +17,11 @@ class Agent: def gen_move(self, seq): if len(seq) >= 2 * self.size: raise ValueError("Game is terminated.") - mcts = MCTS(self.simulator, self.evaluator, [seq, self.color], 2) + mcts = MCTS(self.simulator, self.evaluator, [seq, self.color], 2, inverse=True) mcts.search(max_step=50) N = mcts.root.N N = np.power(N, 1.0 / temp) prob = N / np.sum(N) print("prob: {}".format(prob)) action = int(np.random.binomial(1, prob[1])) - return action \ No newline at end of file + return action diff --git a/tianshou/core/mcts/unit_test/game.py b/tianshou/core/mcts/unit_test/game.py index 14c2df5..6fb504b 100644 --- a/tianshou/core/mcts/unit_test/game.py +++ b/tianshou/core/mcts/unit_test/game.py @@ -6,7 +6,7 @@ if __name__ == '__main__': print("Our game has 2 players.") print("Player 1 has color 1 and plays first. 
Player 2 has color -1 and plays following player 1.") print("Both player choose 1 or 0 for an action.") - size = 1 + size = 2 print("This game has {} iterations".format(size)) print("If the final sequence has more 1 that 0, player 1 wins.") print("If the final sequence has less 1 that 0, player 2 wins.") @@ -34,4 +34,4 @@ if __name__ == '__main__': break print("The choice sequence is {}".format(seq)) - print("The game result is {}".format(winner)) \ No newline at end of file + print("The game result is {}".format(winner)) diff --git a/tianshou/core/mcts/unit_test/mcts.py b/tianshou/core/mcts/unit_test/mcts.py index dd89f57..49c9faf 100644 --- a/tianshou/core/mcts/unit_test/mcts.py +++ b/tianshou/core/mcts/unit_test/mcts.py @@ -187,7 +187,7 @@ class MCTS(object): prior, value = self.evaluator(next_action.next_state) next_action.expansion(prior, self.action_num) else: - value = 0 + value = 0. t2 = time.time() if self.inverse: next_action.backpropagation(-value + 0.) From 5849776c9aa48b7ef040200881eaecf9e71c0967 Mon Sep 17 00:00:00 2001 From: JialianLee Date: Fri, 29 Dec 2017 13:45:53 +0800 Subject: [PATCH 98/98] Modification and doc for unit test --- tianshou/core/mcts/unit_test/README.md | 21 +++++++++++++++++++++ tianshou/core/mcts/unit_test/ZOGame.py | 2 +- tianshou/core/mcts/unit_test/agent.py | 11 ++++++----- tianshou/core/mcts/unit_test/game.py | 8 +++++--- tianshou/core/mcts/unit_test/mcts.py | 2 -- 5 files changed, 33 insertions(+), 11 deletions(-) create mode 100644 tianshou/core/mcts/unit_test/README.md diff --git a/tianshou/core/mcts/unit_test/README.md b/tianshou/core/mcts/unit_test/README.md new file mode 100644 index 0000000..b7d0214 --- /dev/null +++ b/tianshou/core/mcts/unit_test/README.md @@ -0,0 +1,21 @@ +# Unit Test + +This is a two-player zero-sum perfect information extensive game. Player 1 and player 2 iteratively choose actions. At every iteration, player 1 players first and player 2 follows. Both players have choices 0 or 1. 
+ +The number of iterations is given as a fixed number. After one game finished, the game counts the number of 0s and 1s that are choosen. If the number of 1 is more than that of 0, player 1 gets 1 and player 2 gets -1. If the number of 1 is less than that of 0, player 1 gets -1 and player 2 gets 1. Otherwise, they both get 0. + +## Files + ++ game.py: run this file to play the game. ++ agent.py: a class for players. MCTS is used here. ++ ZOgame.py: the game environment. ++ mcts.py: MCTS method. ++ Evaluator: evaluator for MCTS. Rollout policy is also here. + +## Parameters + +Three paramters are given in game.py. + ++ size: the number of iterations ++ searching_step: the number of searching times of MCTS for one step ++ temp: the temporature paramter used to tradeoff exploitation and exploration diff --git a/tianshou/core/mcts/unit_test/ZOGame.py b/tianshou/core/mcts/unit_test/ZOGame.py index 0b3d771..a4ea5e9 100644 --- a/tianshou/core/mcts/unit_test/ZOGame.py +++ b/tianshou/core/mcts/unit_test/ZOGame.py @@ -29,7 +29,7 @@ class ZOTree: length = len(seq) if length != self.depth: raise ValueError("The game is not terminated!") - result = np.sum(seq) + result = np.sum(seq) if result > self.size: winner = 1 elif result < self.size: diff --git a/tianshou/core/mcts/unit_test/agent.py b/tianshou/core/mcts/unit_test/agent.py index 6dd34aa..f2946ce 100644 --- a/tianshou/core/mcts/unit_test/agent.py +++ b/tianshou/core/mcts/unit_test/agent.py @@ -4,13 +4,15 @@ import ZOGame import Evaluator from mcts import MCTS -temp = 1 + class Agent: - def __init__(self, size, color): + def __init__(self, size, color, searching_step, temp): self.size = size self.color = color + self.searching_step = searching_step + self.temp = temp self.simulator = ZOGame.ZOTree(self.size) self.evaluator = Evaluator.rollout_policy(self.simulator, 2) @@ -18,10 +20,9 @@ class Agent: if len(seq) >= 2 * self.size: raise ValueError("Game is terminated.") mcts = MCTS(self.simulator, self.evaluator, [seq, 
self.color], 2, inverse=True) - mcts.search(max_step=50) + mcts.search(max_step=self.searching_step) N = mcts.root.N - N = np.power(N, 1.0 / temp) + N = np.power(N, 1.0 / self.temp) prob = N / np.sum(N) - print("prob: {}".format(prob)) action = int(np.random.binomial(1, prob[1])) return action diff --git a/tianshou/core/mcts/unit_test/game.py b/tianshou/core/mcts/unit_test/game.py index 6fb504b..92fcea8 100644 --- a/tianshou/core/mcts/unit_test/game.py +++ b/tianshou/core/mcts/unit_test/game.py @@ -3,17 +3,19 @@ import agent if __name__ == '__main__': + size = 10 + seaching_step = 100 + temp = 1 print("Our game has 2 players.") print("Player 1 has color 1 and plays first. Player 2 has color -1 and plays following player 1.") print("Both player choose 1 or 0 for an action.") - size = 2 print("This game has {} iterations".format(size)) print("If the final sequence has more 1 that 0, player 1 wins.") print("If the final sequence has less 1 that 0, player 2 wins.") print("Otherwise, both players get 0.\n") game = ZOGame.ZOTree(size) - player1 = agent.Agent(size, 1) - player2 = agent.Agent(size, -1) + player1 = agent.Agent(size, 1, seaching_step, temp) + player2 = agent.Agent(size, -1, seaching_step, temp) seq = [] print("Sequence is {}\n".format(seq)) diff --git a/tianshou/core/mcts/unit_test/mcts.py b/tianshou/core/mcts/unit_test/mcts.py index 49c9faf..ab566f0 100644 --- a/tianshou/core/mcts/unit_test/mcts.py +++ b/tianshou/core/mcts/unit_test/mcts.py @@ -162,8 +162,6 @@ class MCTS(object): self.expansion_time += exp_time self.backpropagation_time += back_time step += 1 - print("Q = {}".format(self.root.Q)) - print("N = {}".format(self.root.N)) if self.debug: file = open("mcts_profiling.log", "a") file.write("[" + str(self.role) + "]"