From 72ae304ab3477242dfad48aac22f4b54a208b4c0 Mon Sep 17 00:00:00 2001 From: Haosheng Zou Date: Wed, 13 Dec 2017 20:47:45 +0800 Subject: [PATCH 01/98] preliminary design of dqn_example, dqn interface. identify the assign of networks --- examples/dqn_example.py | 86 ++++++++++++++++++++++++++++++ examples/ppo_example.py | 6 ++- tianshou/core/README.md | 3 +- tianshou/core/policy/base.py | 33 +++++++++++- tianshou/core/policy/stochastic.py | 1 + 5 files changed, 124 insertions(+), 5 deletions(-) create mode 100644 examples/dqn_example.py diff --git a/examples/dqn_example.py b/examples/dqn_example.py new file mode 100644 index 0000000..0a5c084 --- /dev/null +++ b/examples/dqn_example.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python + +import tensorflow as tf +import numpy as np +import time +import gym + +# our lib imports here! +import sys +sys.path.append('..') +import tianshou.core.losses as losses +from tianshou.data.replay import Replay +import tianshou.data.advantage_estimation as advantage_estimation +import tianshou.core.policy as policy + + +def policy_net(observation, action_dim): + """ + Constructs the policy network. NOT NEEDED IN THE LIBRARY! this is pure tf + + :param observation: Placeholder for the observation. A tensor of shape (bs, x, y, channels) + :param action_dim: int. The number of actions. + :param scope: str. Specifying the scope of the variables. + """ + net = tf.layers.conv2d(observation, 16, 8, 4, 'valid', activation=tf.nn.relu) + net = tf.layers.conv2d(net, 32, 4, 2, 'valid', activation=tf.nn.relu) + net = tf.layers.flatten(net) + net = tf.layers.dense(net, 256, activation=tf.nn.relu) + + q_values = tf.layers.dense(net, action_dim) + + return q_values + + +if __name__ == '__main__': + env = gym.make('PongNoFrameskip-v4') + observation_dim = env.observation_space.shape + action_dim = env.action_space.n + + # 1. 
build network with pure tf + observation = tf.placeholder(tf.float32, shape=(None,) + observation_dim) # network input + + with tf.variable_scope('q_net'): + q_values = policy_net(observation, action_dim) + train_var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) # TODO: better management of TRAINABLE_VARIABLES + with tf.variable_scope('target_net'): + q_values_target = policy_net(observation, action_dim) + + # 2. build losses, optimizers + q_net = policy.DQN(q_values, observation_placeholder=observation) # YongRen: policy.DQN + target_net = policy.DQN(q_values_target, observation_placeholder=observation) + + action = tf.placeholder(dtype=tf.int32, shape=[None]) # batch of integer actions + target = tf.placeholder(dtype=tf.float32, shape=[None]) # target value for DQN + + dqn_loss = losses.dqn_loss(action, target, pi) # TongzhengRen + + total_loss = dqn_loss + optimizer = tf.train.AdamOptimizer(1e-3) + train_op = optimizer.minimize(total_loss, var_list=train_var_list) + + # 3. define data collection + training_data = Replay(env, q_net, advantage_estimation.qlearning_target(target_net)) # + # ShihongSong: Replay(env, pi, advantage_estimation.qlearning_target(target_network)), use your ReplayMemory, interact as follows. Simplify your advantage_estimation.dqn to run before YongRen's DQN + # maybe a dict to manage the elements to be collected + + # 4. start training + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + + minibatch_count = 0 + collection_count = 0 + while True: # until some stopping criterion met... + # collect data + training_data.collect() # ShihongSong + collection_count += 1 + print('Collected {} times.'.format(collection_count)) + + # update network + data = training_data.next_batch(64) # YouQiaoben, ShihongSong + # TODO: auto managing of the placeholders? 
or add this to params of data.Batch + sess.run(train_op, feed_dict={observation: data['observations'], action: data['actions'], target: data['target']}) + minibatch_count += 1 + print('Trained {} minibatches.'.format(minibatch_count)) + + # TODO: assigning pi to pi_old is not implemented yet \ No newline at end of file diff --git a/examples/ppo_example.py b/examples/ppo_example.py index d085273..02ccb52 100755 --- a/examples/ppo_example.py +++ b/examples/ppo_example.py @@ -66,7 +66,7 @@ if __name__ == '__main__': # a clean version with only policy net, no value net # 3. define data collection training_data = Batch(env, pi, advantage_estimation.full_return) # YouQiaoben: finish and polish Batch, advantage_estimation.gae_lambda as in PPO paper - # ShihongSong: Replay(env, pi, advantage_estimation.target_network), use your ReplayMemory, interact as follows. Simplify your advantage_estimation.dqn to run before YongRen's DQN + # ShihongSong: Replay(), see dqn_example.py # maybe a dict to manage the elements to be collected # 4. start training @@ -87,4 +87,6 @@ if __name__ == '__main__': # a clean version with only policy net, no value net # TODO: auto managing of the placeholders? 
or add this to params of data.Batch sess.run(train_op, feed_dict={observation: data['observations'], action: data['actions'], advantage: data['returns']}) minibatch_count += 1 - print('Trained {} minibatches.'.format(minibatch_count)) \ No newline at end of file + print('Trained {} minibatches.'.format(minibatch_count)) + + # TODO: assigning pi to pi_old is not implemented yet \ No newline at end of file diff --git a/tianshou/core/README.md b/tianshou/core/README.md index 16d915e..1e6d7c7 100644 --- a/tianshou/core/README.md +++ b/tianshou/core/README.md @@ -10,8 +10,7 @@ follow OnehotCategorical to write Gaussian, can be in the same file as stochasti not sure how to write, but should at least have act() method to interact with environment -DQN should have an effective argmax_{actions}() method to use as a value network - +referencing QValuePolicy in base.py, should have at least the listed methods. # losses diff --git a/tianshou/core/policy/base.py b/tianshou/core/policy/base.py index b0bf28a..0ae20a1 100644 --- a/tianshou/core/policy/base.py +++ b/tianshou/core/policy/base.py @@ -14,6 +14,33 @@ __all__ = [ 'StochasticPolicy', ] +class QValuePolicy(object): + """ + The policy as in DQN + """ + def __init__(self, value_tensor): + pass + + def act(self, observation, exploration=None): # first implement no exploration + """ + return the action (int) to be executed. + no exploration when exploration=None. + """ + pass + + def values(self, observation): + """ + returns the Q(s, a) values (float) for all actions a at observation s + """ + pass + + def values_tensor(self, observation): + """ + returns the tensor of the values for all actions a at observation s + """ + pass + + class StochasticPolicy(object): """ @@ -194,4 +221,8 @@ class StochasticPolicy(object): """ Private method for subclasses to rewrite the :meth:`prob` method. 
""" - raise NotImplementedError() \ No newline at end of file + raise NotImplementedError() + + +class QValuePolicy(object): + pass \ No newline at end of file diff --git a/tianshou/core/policy/stochastic.py b/tianshou/core/policy/stochastic.py index 37eb1be..3ef463e 100644 --- a/tianshou/core/policy/stochastic.py +++ b/tianshou/core/policy/stochastic.py @@ -70,6 +70,7 @@ class OnehotCategorical(StochasticPolicy): def _act(self, observation): sess = tf.get_default_session() # TODO: this may be ugly. also maybe huge problem when parallel sampled_action = sess.run(tf.multinomial(self.logits, num_samples=1), feed_dict={self._observation_placeholder: observation[None]}) + # observation[None] adds one dimension at the beginning sampled_action = sampled_action[0, 0] From f496725437ae4d80d9284284ce8148922cbab832 Mon Sep 17 00:00:00 2001 From: Haosheng Zou Date: Wed, 13 Dec 2017 22:43:45 +0800 Subject: [PATCH 02/98] add dqn.py to write --- tianshou/core/policy/dqn.py | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 tianshou/core/policy/dqn.py diff --git a/tianshou/core/policy/dqn.py b/tianshou/core/policy/dqn.py new file mode 100644 index 0000000..cfc6abf --- /dev/null +++ b/tianshou/core/policy/dqn.py @@ -0,0 +1,7 @@ + + +from .base import QValuePolicy + + +class DQN(QValuePolicy): + pass \ No newline at end of file From 9ed3e7b09276e072953a997b025a4d55728a5cf4 Mon Sep 17 00:00:00 2001 From: Haosheng Zou Date: Thu, 14 Dec 2017 19:46:38 +0800 Subject: [PATCH 03/98] minor fix --- examples/dqn_example.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/dqn_example.py b/examples/dqn_example.py index 0a5c084..4fbe466 100644 --- a/examples/dqn_example.py +++ b/examples/dqn_example.py @@ -53,7 +53,7 @@ if __name__ == '__main__': action = tf.placeholder(dtype=tf.int32, shape=[None]) # batch of integer actions target = tf.placeholder(dtype=tf.float32, shape=[None]) # target value for DQN - dqn_loss = losses.dqn_loss(action, target, pi) # 
TongzhengRen + dqn_loss = losses.dqn_loss(action, target, q_net) # TongzhengRen total_loss = dqn_loss optimizer = tf.train.AdamOptimizer(1e-3) @@ -61,7 +61,7 @@ if __name__ == '__main__': # 3. define data collection training_data = Replay(env, q_net, advantage_estimation.qlearning_target(target_net)) # - # ShihongSong: Replay(env, pi, advantage_estimation.qlearning_target(target_network)), use your ReplayMemory, interact as follows. Simplify your advantage_estimation.dqn to run before YongRen's DQN + # ShihongSong: Replay(env, q_net, advantage_estimation.qlearning_target(target_network)), use your ReplayMemory, interact as follows. Simplify your advantage_estimation.dqn to run before YongRen's DQN # maybe a dict to manage the elements to be collected # 4. start training From 0874d5342f8bf2a4b32512f701a12affd6093869 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Fri, 15 Dec 2017 14:24:08 +0800 Subject: [PATCH 04/98] implement dqn loss and dpg loss, add TODO for separate actor and critic --- examples/dqn_example.py | 2 +- tianshou/core/README.md | 4 ++++ tianshou/core/losses.py | 27 ++++++++++++++++++++++----- tianshou/core/policy/base.py | 10 ++++------ tianshou/data/README.md | 4 ++++ 5 files changed, 35 insertions(+), 12 deletions(-) diff --git a/examples/dqn_example.py b/examples/dqn_example.py index 0a5c084..6a9e2a6 100644 --- a/examples/dqn_example.py +++ b/examples/dqn_example.py @@ -53,7 +53,7 @@ if __name__ == '__main__': action = tf.placeholder(dtype=tf.int32, shape=[None]) # batch of integer actions target = tf.placeholder(dtype=tf.float32, shape=[None]) # target value for DQN - dqn_loss = losses.dqn_loss(action, target, pi) # TongzhengRen + dqn_loss = losses.dqn_loss(action, target, q_net) # TongzhengRen total_loss = dqn_loss optimizer = tf.train.AdamOptimizer(1e-3) diff --git a/tianshou/core/README.md b/tianshou/core/README.md index 1e6d7c7..3617525 100644 --- a/tianshou/core/README.md +++ b/tianshou/core/README.md @@ -1,3 +1,7 @@ 
+#TODO: + +Separate actor and critic. (Important, we need to focus on that recently) + # policy YongRen diff --git a/tianshou/core/losses.py b/tianshou/core/losses.py index f7d798b..d281df9 100644 --- a/tianshou/core/losses.py +++ b/tianshou/core/losses.py @@ -26,7 +26,7 @@ def vanilla_policy_gradient(sampled_action, reward, pi, baseline="None"): :param sampled_action: placeholder of sampled actions during interaction with the environment :param reward: placeholder of reward the 'sampled_action' get - :param pi: current 'policy' to be optimized + :param pi: current `policy` to be optimized :param baseline: the baseline method used to reduce the variance, default is 'None' :return: """ @@ -35,8 +35,25 @@ def vanilla_policy_gradient(sampled_action, reward, pi, baseline="None"): # TODO: Different baseline methods like REINFORCE, etc. return vanilla_policy_gradient_loss -def temporal_difference_loss(): - pass +def dqn_loss(sampled_action, sampled_target, q_net): + """ + deep q-network -def deterministic_policy_gradient(): - pass \ No newline at end of file + :param sampled_action: placeholder of sampled actions during the interaction with the environment + :param sampled_target: estimated Q(s,a) + :param q_net: current `policy` to be optimized + :return: + """ + action_num = q_net.get_values().shape()[1] + sampled_q = tf.reduce_sum(q_net.get_values() * tf.one_hot(sampled_action, action_num), axis=1) + return tf.reduce_mean(tf.square(sampled_target - sampled_q)) + +def deterministic_policy_gradient(sampled_state, critic): + """ + deterministic policy gradient: + + :param sampled_action: placeholder of sampled actions during the interaction with the environment + :param critic: current `value` function + :return: + """ + return tf.reduce_mean(critic.get_value(sampled_state)) \ No newline at end of file diff --git a/tianshou/core/policy/base.py b/tianshou/core/policy/base.py index 0ae20a1..b6d8d48 100644 --- a/tianshou/core/policy/base.py +++ 
b/tianshou/core/policy/base.py @@ -14,12 +14,14 @@ __all__ = [ 'StochasticPolicy', ] +#TODO: separate actor and critic, we should focus on it once we finish the basic module. + class QValuePolicy(object): """ The policy as in DQN """ - def __init__(self, value_tensor): - pass + def __init__(self, observation_placeholder): + self.observation_placeholder = observation_placeholder def act(self, observation, exploration=None): # first implement no exploration """ @@ -222,7 +224,3 @@ class StochasticPolicy(object): Private method for subclasses to rewrite the :meth:`prob` method. """ raise NotImplementedError() - - -class QValuePolicy(object): - pass \ No newline at end of file diff --git a/tianshou/data/README.md b/tianshou/data/README.md index 241971a..e9e6374 100644 --- a/tianshou/data/README.md +++ b/tianshou/data/README.md @@ -1,3 +1,7 @@ +# TODO: + +Notice that we will separate actor and critic, and batch will collect data for optimizing policy while replay will collect data for optimizing critic. + # Batch YouQiaoben From 00f599bba375c1f9b7614a2467ed031c87c542f7 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Fri, 15 Dec 2017 14:27:04 +0800 Subject: [PATCH 05/98] assign TODO to Haosheng and Tongzheng --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index f1da719..543d237 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,11 @@ Try to use full names. Don't use abbrevations for class/function/variable names The """xxx""" comment should be written right after class/function. Also comment the part that's not intuitive during the code. We must comment, but for now we don't need to polish them. +# High Priority TODO + +For Haosheng and Tongzheng: separate actor and critic, rewrite the interfaces for policy + +Others can still focus on the task below. ## TODO Search based method parallel. 
From 6cb4b02fcad99e2768886878e2234a169b0333cb Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Fri, 15 Dec 2017 22:19:44 +0800 Subject: [PATCH 06/98] merge class strategy with class game. Next, merge Go with GoEnv --- AlphaGo/README.md | 4 + AlphaGo/engine.py | 1 - AlphaGo/game.py | 264 +++-------------- AlphaGo/go.py | 592 ++++++++++++------------------------- AlphaGo/gtp_wrapper.py | 70 ----- AlphaGo/play.py | 3 +- AlphaGo/player.py | 4 + AlphaGo/strategy.py | 35 --- tianshou/core/mcts/mcts.py | 1 + 9 files changed, 244 insertions(+), 730 deletions(-) delete mode 100644 AlphaGo/gtp_wrapper.py diff --git a/AlphaGo/README.md b/AlphaGo/README.md index d21b9bd..720c4d0 100644 --- a/AlphaGo/README.md +++ b/AlphaGo/README.md @@ -10,3 +10,7 @@ Connecting our own policy-value neural network with leela-zero. ## checkpoints: Weights of the policy-value neural network + + +## File Specification + diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index 716d40b..1f9af85 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -188,7 +188,6 @@ class GTPEngine(): def cmd_show_board(self, args, **kwargs): return self._game.board, True - if __name__ == "main": game = Game() engine = GTPEngine(game_obj=Game) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 919a5d5..360921e 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -10,242 +10,49 @@ import copy import tensorflow as tf import numpy as np import sys +import go +import network_small +import strategy from collections import deque +from tianshou.core.mcts.mcts import MCTS import Network -from strategy import strategy - -''' -(1, 1) is considered as the upper left corner of the board, -(size, 1) is the lower left -''' - -DELTA = [[1, 0], [-1, 0], [0, -1], [0, 1]] - - -class Executor: - def __init__(self, **kwargs): - self.game = kwargs['game'] - - def _bfs(self, vertex, color, block, status, alive_break): - block.append(vertex) - status[self.game._flatten(vertex)] = True - nei = self._neighbor(vertex) - for n in nei: - if 
not status[self.game._flatten(n)]: - if self.game.board[self.game._flatten(n)] == color: - self._bfs(n, color, block, status, alive_break) - - def _find_block(self, vertex, alive_break=False): - block = [] - status = [False] * (self.game.size * self.game.size) - color = self.game.board[self.game._flatten(vertex)] - self._bfs(vertex, color, block, status, alive_break) - - for b in block: - for n in self._neighbor(b): - if self.game.board[self.game._flatten(n)] == utils.EMPTY: - return False, block - return True, block - - def _find_boarder(self, vertex): - block = [] - status = [False] * (self.game.size * self.game.size) - self._bfs(vertex, utils.EMPTY, block, status, False) - border = [] - for b in block: - for n in self._neighbor(b): - if not (n in block): - border.append(n) - return border - - def _is_qi(self, color, vertex): - nei = self._neighbor(vertex) - for n in nei: - if self.game.board[self.game._flatten(n)] == utils.EMPTY: - return True - - self.game.board[self.game._flatten(vertex)] = color - for n in nei: - if self.game.board[self.game._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n) - if can_kill: - self.game.board[self.game._flatten(vertex)] = utils.EMPTY - return True - - ### can not suicide - can_kill, block = self._find_block(vertex) - if can_kill: - self.game.board[self.game._flatten(vertex)] = utils.EMPTY - return False - - self.game.board[self.game._flatten(vertex)] = utils.EMPTY - return True - - def _check_global_isomorphous(self, color, vertex): - ##backup - _board = copy.copy(self.game.board) - self.game.board[self.game._flatten(vertex)] = color - self._process_board(color, vertex) - if self.game.board in self.game.history: - res = True - else: - res = False - - self.game.board = _board - return res - - def _in_board(self, vertex): - x, y = vertex - if x < 1 or x > self.game.size: return False - if y < 1 or y > self.game.size: return False - return True - - def _neighbor(self, vertex): - x, y = vertex - nei 
= [] - for d in DELTA: - _x = x + d[0] - _y = y + d[1] - if self._in_board((_x, _y)): - nei.append((_x, _y)) - return nei - - def _process_board(self, color, vertex): - nei = self._neighbor(vertex) - for n in nei: - if self.game.board[self.game._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n, alive_break=True) - if can_kill: - for b in block: - self.game.board[self.game._flatten(b)] = utils.EMPTY - - def is_valid(self, color, vertex): - ### in board - if not self._in_board(vertex): - return False - - ### already have stone - if not self.game.board[self.game._flatten(vertex)] == utils.EMPTY: - return False - - ### check if it is qi - if not self._is_qi(color, vertex): - return False - - if self._check_global_isomorphous(color, vertex): - return False - - return True - - def do_move(self, color, vertex): - if not self.is_valid(color, vertex): - return False - self.game.board[self.game._flatten(vertex)] = color - self._process_board(color, vertex) - self.game.history.append(copy.copy(self.game.board)) - self.game.past.append(copy.copy(self.game.board)) - return True - - def _find_empty(self): - idx = [i for i,x in enumerate(self.game.board) if x == utils.EMPTY ][0] - return self.game._deflatten(idx) - - def get_score(self, is_unknown_estimation = False): - ''' - is_unknown_estimation: whether use nearby stone to predict the unknown - return score from BLACK perspective. 
- ''' - _board = copy.copy(self.game.board) - while utils.EMPTY in self.game.board: - vertex = self._find_empty() - boarder = self._find_boarder(vertex) - boarder_color = set(map(lambda v: self.game.board[self.game._flatten(v)], boarder)) - if boarder_color == {utils.BLACK}: - self.game.board[self.game._flatten(vertex)] = utils.BLACK - elif boarder_color == {utils.WHITE}: - self.game.board[self.game._flatten(vertex)] = utils.WHITE - elif is_unknown_estimation: - self.game.board[self.game._flatten(vertex)] = self._predict_from_nearby(vertex) - else: - self.game.board[self.game._flatten(vertex)] =utils.UNKNOWN - score = 0 - for i in self.game.board: - if i == utils.BLACK: - score += 1 - elif i == utils.WHITE: - score -= 1 - score -= self.game.komi - - self.game.board = _board - return score - - def _predict_from_nearby(self, vertex, neighbor_step = 3): - ''' - step: the nearby 3 steps is considered - :vertex: position to be estimated - :neighbor_step: how many steps nearby - :return: the nearby positions of the input position - currently the nearby 3*3 grid is returned, altogether 4*8 points involved - ''' - for step in range(1, neighbor_step + 1): # check the stones within the steps in range - neighbor_vertex_set = [] - self._add_nearby_stones(neighbor_vertex_set, vertex[0] - step, vertex[1], 1, 1, neighbor_step) - self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] + step, 1, -1, neighbor_step) - self._add_nearby_stones(neighbor_vertex_set, vertex[0] + step, vertex[1], -1, -1, neighbor_step) - self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] - step, -1, 1, neighbor_step) - color_estimate = 0 - for neighbor_vertex in neighbor_vertex_set: - color_estimate += self.game.board[self.game._flatten(neighbor_vertex)] - if color_estimate > 0: - return utils.BLACK - elif color_estimate < 0: - return utils.WHITE - - def _add_nearby_stones(self, neighbor_vertex_set, start_vertex_x, start_vertex_y, x_diff, y_diff, num_step): - ''' - add the nearby 
stones around the input vertex - :param neighbor_vertex_set: input list - :param start_vertex_x: x axis of the input vertex - :param start_vertex_y: y axis of the input vertex - :param x_diff: add x axis - :param y_diff: add y axis - :param num_step: number of steps to be added - :return: - ''' - for step in xrange(num_step): - new_neighbor_vertex = (start_vertex_x, start_vertex_y) - if self._in_board(new_neighbor_vertex): - neighbor_vertex_set.append((start_vertex_x, start_vertex_y)) - start_vertex_x += x_diff - start_vertex_y += y_diff - - - +#from strategy import strategy class Game: + ''' + Load the real game and trained weights. + + TODO : Maybe merge with the engine class in future, + currently leave it untouched for interacting with Go UI. + ''' def __init__(self, size=9, komi=6.5, checkpoint_path=None): self.size = size self.komi = komi self.board = [utils.EMPTY] * (self.size * self.size) - self.strategy = strategy(checkpoint_path) - # self.strategy = None - self.executor = Executor(game=self) self.history = [] self.past = deque(maxlen=8) for _ in range(8): self.past.append(self.board) + self.executor = go.Go(game=self) + #self.strategy = strategy(checkpoint_path) + + self.simulator = strategy.GoEnv() + self.net = network_small.Network() + self.sess = self.net.forward(checkpoint_path) + self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], + feed_dict={self.net.x: state, self.net.is_training: False}) + def _flatten(self, vertex): x, y = vertex return (y - 1) * self.size + (x - 1) def _deflatten(self, idx): x = idx % self.size + 1 - y = idx // self.size + 1 + y = idx // self.size + 1 return (x,y) - def clear(self): self.board = [utils.EMPTY] * (self.size * self.size) self.history = [] @@ -259,8 +66,30 @@ class Game: def set_komi(self, k): self.komi = k - def check_valid(self, color, vertex): - return self.executor.is_valid(color, vertex) + def data_process(self, history, color): + state = np.zeros([1, self.simulator.size, 
self.simulator.size, 17]) + for i in range(8): + state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(self.simulator.size ** 2)).reshape(self.simulator.size, self.simulator.size) + state[0, :, :, i + 8] = np.array(np.array(history[i]) == -np.ones(self.simulator.size ** 2)).reshape(self.simulator.size, self.simulator.size) + if color == utils.BLACK: + state[0, :, :, 16] = np.ones([self.simulator.size, self.simulator.size]) + if color == utils.WHITE: + state[0, :, :, 16] = np.zeros([self.simulator.size, self.simulator.size]) + return state + + def strategy_gen_move(self, history, color): + self.simulator.history = copy.copy(history) + self.simulator.board = copy.copy(history[-1]) + state = self.data_process(self.simulator.history, color) + mcts = MCTS(self.simulator, self.evaluator, state, self.simulator.size ** 2 + 1, inverse=True, max_step=10) + temp = 1 + prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) + choice = np.random.choice(self.simulator.size ** 2 + 1, 1, p=prob).tolist()[0] + if choice == self.simulator.size ** 2: + move = utils.PASS + else: + move = (choice % self.simulator.size + 1, choice / self.simulator.size + 1) + return move, prob def do_move(self, color, vertex): if vertex == utils.PASS: @@ -271,7 +100,7 @@ class Game: def gen_move(self, color): # move = self.strategy.gen_move(color) # return move - move, self.prob = self.strategy.gen_move(self.past, color) + move, self.prob = self.strategy_gen_move(self.past, color) self.do_move(color, move) return move @@ -295,7 +124,6 @@ class Game: print('') sys.stdout.flush() - if __name__ == "__main__": g = Game() g.show_board() diff --git a/AlphaGo/go.py b/AlphaGo/go.py index b83d305..26540e1 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -1,428 +1,212 @@ -''' -A board is a NxN numpy array. -A Coordinate is a tuple index into the board. -A Move is a (Coordinate c | None). 
-A PlayerMove is a (Color, Move) tuple -(0, 0) is considered to be the upper left corner of the board, and (18, 0) is the lower left. -''' -from collections import namedtuple +from __future__ import print_function +import utils import copy -import itertools +import sys +from collections import deque -import numpy as np +''' +Settings of the Go game. -# Represent a board as a numpy array, with 0 empty, 1 is black, -1 is white. -# This means that swapping colors is as simple as multiplying array by -1. -WHITE, EMPTY, BLACK, FILL, KO, UNKNOWN = range(-1, 5) +(1, 1) is considered as the upper left corner of the board, +(size, 1) is the lower left +''' + +NEIGHBOR_OFFSET = [[1, 0], [-1, 0], [0, -1], [0, 1]] -class PlayerMove(namedtuple('PlayerMove', ['color', 'move'])): pass +class Go: + def __init__(self, **kwargs): + self.game = kwargs['game'] + def _bfs(self, vertex, color, block, status, alive_break): + block.append(vertex) + status[self.game._flatten(vertex)] = True + nei = self._neighbor(vertex) + for n in nei: + if not status[self.game._flatten(n)]: + if self.game.board[self.game._flatten(n)] == color: + self._bfs(n, color, block, status, alive_break) -# Represents "group not found" in the LibertyTracker object -MISSING_GROUP_ID = -1 + def _find_block(self, vertex, alive_break=False): + block = [] + status = [False] * (self.game.size * self.game.size) + color = self.game.board[self.game._flatten(vertex)] + self._bfs(vertex, color, block, status, alive_break) + for b in block: + for n in self._neighbor(b): + if self.game.board[self.game._flatten(n)] == utils.EMPTY: + return False, block + return True, block -class IllegalMove(Exception): pass + def _find_boarder(self, vertex): + block = [] + status = [False] * (self.game.size * self.game.size) + self._bfs(vertex, utils.EMPTY, block, status, False) + border = [] + for b in block: + for n in self._neighbor(b): + if not (n in block): + border.append(n) + return border + def _is_qi(self, color, vertex): + nei = 
self._neighbor(vertex) + for n in nei: + if self.game.board[self.game._flatten(n)] == utils.EMPTY: + return True -# these are initialized by set_board_size -N = None -ALL_COORDS = [] -EMPTY_BOARD = None -NEIGHBORS = {} -DIAGONALS = {} + self.game.board[self.game._flatten(vertex)] = color + for n in nei: + if self.game.board[self.game._flatten(n)] == utils.another_color(color): + can_kill, block = self._find_block(n) + if can_kill: + self.game.board[self.game._flatten(vertex)] = utils.EMPTY + return True - -def set_board_size(n): - ''' - Hopefully nobody tries to run both 9x9 and 19x19 game instances at once. - Also, never do "from go import N, W, ALL_COORDS, EMPTY_BOARD". - ''' - global N, ALL_COORDS, EMPTY_BOARD, NEIGHBORS, DIAGONALS - if N == n: return - N = n - ALL_COORDS = [(i, j) for i in range(n) for j in range(n)] - EMPTY_BOARD = np.zeros([n, n], dtype=np.int8) - - def check_bounds(c): - return c[0] % n == c[0] and c[1] % n == c[1] - - NEIGHBORS = {(x, y): list(filter(check_bounds, [(x + 1, y), (x - 1, y), (x, y + 1), (x, y - 1)])) for x, y in - ALL_COORDS} - DIAGONALS = {(x, y): list(filter(check_bounds, [(x + 1, y + 1), (x + 1, y - 1), (x - 1, y + 1), (x - 1, y - 1)])) - for x, y in ALL_COORDS} - - -def place_stones(board, color, stones): - for s in stones: - board[s] = color - - -def find_reached(board, c): - # that can reach from one place - color = board[c] - chain = set([c]) - reached = set() - frontier = [c] - while frontier: - current = frontier.pop() - chain.add(current) - for n in NEIGHBORS[current]: - if board[n] == color and (not n in chain): - frontier.append(n) - elif board[n] != color: - reached.add(n) - return chain, reached - - -def is_koish(board, c): - 'Check if c is surrounded on all sides by 1 color, and return that color' - if board[c] != EMPTY: return None - neighbors = {board[n] for n in NEIGHBORS[c]} - if len(neighbors) == 1 and not EMPTY in neighbors: - return list(neighbors)[0] - else: - return None - - -def is_eyeish(board, c): - 
'Check if c is an eye, for the purpose of restricting MC rollouts.' - color = is_koish(board, c) - if color is None: - return None - diagonal_faults = 0 - diagonals = DIAGONALS[c] - if len(diagonals) < 4: - diagonal_faults += 1 - for d in diagonals: - if not board[d] in (color, EMPTY): - diagonal_faults += 1 - if diagonal_faults > 1: - return None - else: - return color - - -class Group(namedtuple('Group', ['id', 'stones', 'liberties', 'color'])): - ''' - stones: a set of Coordinates belonging to this group - liberties: a set of Coordinates that are empty and adjacent to this group. - color: color of this group - ''' - - def __eq__(self, other): - return self.stones == other.stones and self.liberties == other.liberties and self.color == other.color - - -class LibertyTracker(object): - @staticmethod - def from_board(board): - board = np.copy(board) - curr_group_id = 0 - lib_tracker = LibertyTracker() - for color in (WHITE, BLACK): - while color in board: - curr_group_id += 1 - found_color = np.where(board == color) - coord = found_color[0][0], found_color[1][0] - chain, reached = find_reached(board, coord) - liberties = set(r for r in reached if board[r] == EMPTY) - new_group = Group(curr_group_id, chain, liberties, color) - lib_tracker.groups[curr_group_id] = new_group - for s in chain: - lib_tracker.group_index[s] = curr_group_id - place_stones(board, FILL, chain) - - lib_tracker.max_group_id = curr_group_id - - liberty_counts = np.zeros([N, N], dtype=np.uint8) - for group in lib_tracker.groups.values(): - num_libs = len(group.liberties) - for s in group.stones: - liberty_counts[s] = num_libs - lib_tracker.liberty_cache = liberty_counts - - return lib_tracker - - def __init__(self, group_index=None, groups=None, liberty_cache=None, max_group_id=1): - # group_index: a NxN numpy array of group_ids. 
-1 means no group - # groups: a dict of group_id to groups - # liberty_cache: a NxN numpy array of liberty counts - self.group_index = group_index if group_index is not None else -np.ones([N, N], dtype=np.int32) - self.groups = groups or {} - self.liberty_cache = liberty_cache if liberty_cache is not None else np.zeros([N, N], dtype=np.uint8) - self.max_group_id = max_group_id - - def __deepcopy__(self, memodict={}): - new_group_index = np.copy(self.group_index) - new_lib_cache = np.copy(self.liberty_cache) - new_groups = { - group.id: Group(group.id, set(group.stones), set(group.liberties), group.color) - for group in self.groups.values() - } - return LibertyTracker(new_group_index, new_groups, liberty_cache=new_lib_cache, max_group_id=self.max_group_id) - - def add_stone(self, color, c): - assert self.group_index[c] == MISSING_GROUP_ID - captured_stones = set() - opponent_neighboring_group_ids = set() - friendly_neighboring_group_ids = set() - empty_neighbors = set() - - for n in NEIGHBORS[c]: - neighbor_group_id = self.group_index[n] - if neighbor_group_id != MISSING_GROUP_ID: - neighbor_group = self.groups[neighbor_group_id] - if neighbor_group.color == color: - friendly_neighboring_group_ids.add(neighbor_group_id) - else: - opponent_neighboring_group_ids.add(neighbor_group_id) - else: - empty_neighbors.add(n) - - new_group = self._create_group(color, c, empty_neighbors) - - for group_id in friendly_neighboring_group_ids: - new_group = self._merge_groups(group_id, new_group.id) - - for group_id in opponent_neighboring_group_ids: - neighbor_group = self.groups[group_id] - if len(neighbor_group.liberties) == 1: - captured = self._capture_group(group_id) - captured_stones.update(captured) - else: - self._update_liberties(group_id, remove={c}) - - self._handle_captures(captured_stones) - - # suicide is illegal - if len(new_group.liberties) == 0: - raise IllegalMove("Move at {} would commit suicide!\n".format(c)) - - return captured_stones - - def 
_create_group(self, color, c, liberties): - self.max_group_id += 1 - new_group = Group(self.max_group_id, set([c]), liberties, color) - self.groups[new_group.id] = new_group - self.group_index[c] = new_group.id - self.liberty_cache[c] = len(liberties) - return new_group - - def _merge_groups(self, group1_id, group2_id): - group1 = self.groups[group1_id] - group2 = self.groups[group2_id] - group1.stones.update(group2.stones) - del self.groups[group2_id] - for s in group2.stones: - self.group_index[s] = group1_id - - self._update_liberties(group1_id, add=group2.liberties, remove=(group2.stones | group1.stones)) - - return group1 - - def _capture_group(self, group_id): - dead_group = self.groups[group_id] - del self.groups[group_id] - for s in dead_group.stones: - self.group_index[s] = MISSING_GROUP_ID - self.liberty_cache[s] = 0 - return dead_group.stones - - def _update_liberties(self, group_id, add=None, remove=None): - group = self.groups[group_id] - if add: - group.liberties.update(add) - if remove: - group.liberties.difference_update(remove) - - new_lib_count = len(group.liberties) - for s in group.stones: - self.liberty_cache[s] = new_lib_count - - def _handle_captures(self, captured_stones): - for s in captured_stones: - for n in NEIGHBORS[s]: - group_id = self.group_index[n] - if group_id != MISSING_GROUP_ID: - self._update_liberties(group_id, add={s}) - - -class Position(): - def __init__(self, board=None, n=0, komi=7.5, caps=(0, 0), lib_tracker=None, ko=None, recent=tuple(), - to_play=BLACK): - ''' - board: a numpy array - n: an int representing moves played so far - komi: a float, representing points given to the second player. - caps: a (int, int) tuple of captures for B, W. - lib_tracker: a LibertyTracker object - ko: a Move - recent: a tuple of PlayerMoves, such that recent[-1] is the last move. 
- to_play: BLACK or WHITE - ''' - self.board = board if board is not None else np.copy(EMPTY_BOARD) - self.n = n - self.komi = komi - self.caps = caps - self.lib_tracker = lib_tracker or LibertyTracker.from_board(self.board) - self.ko = ko - self.recent = recent - self.to_play = to_play - - def __deepcopy__(self, memodict={}): - new_board = np.copy(self.board) - new_lib_tracker = copy.deepcopy(self.lib_tracker) - return Position(new_board, self.n, self.komi, self.caps, new_lib_tracker, self.ko, self.recent, self.to_play) - - def __str__(self): - pretty_print_map = { - WHITE: '\x1b[0;31;47mO', - EMPTY: '\x1b[0;31;43m.', - BLACK: '\x1b[0;31;40mX', - FILL: '#', - KO: '*', - } - board = np.copy(self.board) - captures = self.caps - if self.ko is not None: - place_stones(board, KO, [self.ko]) - raw_board_contents = [] - for i in range(N): - row = [] - for j in range(N): - appended = '<' if (self.recent and (i, j) == self.recent[-1].move) else ' ' - row.append(pretty_print_map[board[i, j]] + appended) - row.append('\x1b[0m') - raw_board_contents.append(''.join(row)) - - row_labels = ['%2d ' % i for i in range(N, 0, -1)] - annotated_board_contents = [''.join(r) for r in zip(row_labels, raw_board_contents, row_labels)] - header_footer_rows = [' ' + ' '.join('ABCDEFGHJKLMNOPQRST'[:N]) + ' '] - annotated_board = '\n'.join(itertools.chain(header_footer_rows, annotated_board_contents, header_footer_rows)) - details = "\nMove: {}. 
Captures X: {} O: {}\n".format(self.n, *captures) - return annotated_board + details - - def is_move_suicidal(self, move): - potential_libs = set() - for n in NEIGHBORS[move]: - neighbor_group_id = self.lib_tracker.group_index[n] - if neighbor_group_id == MISSING_GROUP_ID: - # at least one liberty after playing here, so not a suicide - return False - neighbor_group = self.lib_tracker.groups[neighbor_group_id] - if neighbor_group.color == self.to_play: - potential_libs |= neighbor_group.liberties - elif len(neighbor_group.liberties) == 1: - # would capture an opponent group if they only had one lib. - return False - # it's possible to suicide by connecting several friendly groups - # each of which had one liberty. - potential_libs -= set([move]) - return not potential_libs - - def is_move_legal(self, move): - 'Checks that a move is on an empty space, not on ko, and not suicide' - if move is None: - return True - if self.board[move] != EMPTY: + ### can not suicide + can_kill, block = self._find_block(vertex) + if can_kill: + self.game.board[self.game._flatten(vertex)] = utils.EMPTY return False - if move == self.ko: + + self.game.board[self.game._flatten(vertex)] = utils.EMPTY + return True + + def _check_global_isomorphous(self, color, vertex): + ##backup + _board = copy.copy(self.game.board) + self.game.board[self.game._flatten(vertex)] = color + self._process_board(color, vertex) + if self.game.board in self.game.history: + res = True + else: + res = False + + self.game.board = _board + return res + + def _in_board(self, vertex): + x, y = vertex + if x < 1 or x > self.game.size: return False + if y < 1 or y > self.game.size: return False + return True + + def _neighbor(self, vertex): + x, y = vertex + nei = [] + for d in NEIGHBOR_OFFSET: + _x = x + d[0] + _y = y + d[1] + if self._in_board((_x, _y)): + nei.append((_x, _y)) + return nei + + def _process_board(self, color, vertex): + nei = self._neighbor(vertex) + for n in nei: + if 
self.game.board[self.game._flatten(n)] == utils.another_color(color): + can_kill, block = self._find_block(n, alive_break=True) + if can_kill: + for b in block: + self.game.board[self.game._flatten(b)] = utils.EMPTY + + def is_valid(self, color, vertex): + ### in board + if not self._in_board(vertex): return False - if self.is_move_suicidal(move): + + ### already have stone + if not self.game.board[self.game._flatten(vertex)] == utils.EMPTY: + return False + + ### check if it is qi + if not self._is_qi(color, vertex): + return False + + if self._check_global_isomorphous(color, vertex): return False return True - def pass_move(self, mutate=False): - pos = self if mutate else copy.deepcopy(self) - pos.n += 1 - pos.recent += (PlayerMove(pos.to_play, None),) - pos.to_play *= -1 - pos.ko = None - return pos + def do_move(self, color, vertex): + if not self.is_valid(color, vertex): + return False + self.game.board[self.game._flatten(vertex)] = color + self._process_board(color, vertex) + self.game.history.append(copy.copy(self.game.board)) + self.game.past.append(copy.copy(self.game.board)) + return True - def flip_playerturn(self, mutate=False): - pos = self if mutate else copy.deepcopy(self) - pos.ko = None - pos.to_play *= -1 - return pos + def _find_empty(self): + idx = [i for i,x in enumerate(self.game.board) if x == utils.EMPTY ][0] + return self.game._deflatten(idx) - def get_liberties(self): - return self.lib_tracker.liberty_cache - - def play_move(self, c, color=None, mutate=False): - # Obeys CGOS Rules of Play. In short: - # No suicides - # Chinese/area scoring - # Positional superko (this is very crudely approximate at the moment.) 
- if color is None: - color = self.to_play - - pos = self if mutate else copy.deepcopy(self) - - if c is None: - pos = pos.pass_move(mutate=mutate) - return pos - - if not self.is_move_legal(c): - raise IllegalMove("Move at {} is illegal: \n{}".format(c, self)) - - # check must be done before potentially mutating the board - potential_ko = is_koish(self.board, c) - - place_stones(pos.board, color, [c]) - captured_stones = pos.lib_tracker.add_stone(color, c) - place_stones(pos.board, EMPTY, captured_stones) - - opp_color = color * -1 - - if len(captured_stones) == 1 and potential_ko == opp_color: - new_ko = list(captured_stones)[0] - else: - new_ko = None - - if pos.to_play == BLACK: - new_caps = (pos.caps[0] + len(captured_stones), pos.caps[1]) - else: - new_caps = (pos.caps[0], pos.caps[1] + len(captured_stones)) - - pos.n += 1 - pos.caps = new_caps - pos.ko = new_ko - pos.recent += (PlayerMove(color, c),) - pos.to_play *= -1 - return pos - - def score(self): - 'Return score from B perspective. If W is winning, score is negative.' - working_board = np.copy(self.board) - while EMPTY in working_board: - unassigned_spaces = np.where(working_board == EMPTY) - c = unassigned_spaces[0][0], unassigned_spaces[1][0] - territory, borders = find_reached(working_board, c) - border_colors = set(working_board[b] for b in borders) - X_border = BLACK in border_colors - O_border = WHITE in border_colors - if X_border and not O_border: - territory_color = BLACK - elif O_border and not X_border: - territory_color = WHITE + def get_score(self, is_unknown_estimation = False): + ''' + is_unknown_estimation: whether use nearby stone to predict the unknown + return score from BLACK perspective. 
+ ''' + _board = copy.copy(self.game.board) + while utils.EMPTY in self.game.board: + vertex = self._find_empty() + boarder = self._find_boarder(vertex) + boarder_color = set(map(lambda v: self.game.board[self.game._flatten(v)], boarder)) + if boarder_color == {utils.BLACK}: + self.game.board[self.game._flatten(vertex)] = utils.BLACK + elif boarder_color == {utils.WHITE}: + self.game.board[self.game._flatten(vertex)] = utils.WHITE + elif is_unknown_estimation: + self.game.board[self.game._flatten(vertex)] = self._predict_from_nearby(vertex) else: - territory_color = UNKNOWN # dame, or seki - place_stones(working_board, territory_color, territory) + self.game.board[self.game._flatten(vertex)] =utils.UNKNOWN + score = 0 + for i in self.game.board: + if i == utils.BLACK: + score += 1 + elif i == utils.WHITE: + score -= 1 + score -= self.game.komi - return np.count_nonzero(working_board == BLACK) - np.count_nonzero(working_board == WHITE) - self.komi + self.game.board = _board + return score - def result(self): - score = self.score() - if score > 0: - return 'B+' + '%.1f' % score - elif score < 0: - return 'W+' + '%.1f' % abs(score) - else: - return 'DRAW' + def _predict_from_nearby(self, vertex, neighbor_step = 3): + ''' + step: the nearby 3 steps is considered + :vertex: position to be estimated + :neighbor_step: how many steps nearby + :return: the nearby positions of the input position + currently the nearby 3*3 grid is returned, altogether 4*8 points involved + ''' + for step in range(1, neighbor_step + 1): # check the stones within the steps in range + neighbor_vertex_set = [] + self._add_nearby_stones(neighbor_vertex_set, vertex[0] - step, vertex[1], 1, 1, neighbor_step) + self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] + step, 1, -1, neighbor_step) + self._add_nearby_stones(neighbor_vertex_set, vertex[0] + step, vertex[1], -1, -1, neighbor_step) + self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] - step, -1, 1, neighbor_step) + 
color_estimate = 0 + for neighbor_vertex in neighbor_vertex_set: + color_estimate += self.game.board[self.game._flatten(neighbor_vertex)] + if color_estimate > 0: + return utils.BLACK + elif color_estimate < 0: + return utils.WHITE - -set_board_size(19) + def _add_nearby_stones(self, neighbor_vertex_set, start_vertex_x, start_vertex_y, x_diff, y_diff, num_step): + ''' + add the nearby stones around the input vertex + :param neighbor_vertex_set: input list + :param start_vertex_x: x axis of the input vertex + :param start_vertex_y: y axis of the input vertex + :param x_diff: add x axis + :param y_diff: add y axis + :param num_step: number of steps to be added + :return: + ''' + for step in xrange(num_step): + new_neighbor_vertex = (start_vertex_x, start_vertex_y) + if self._in_board(new_neighbor_vertex): + neighbor_vertex_set.append((start_vertex_x, start_vertex_y)) + start_vertex_x += x_diff + start_vertex_y += y_diff diff --git a/AlphaGo/gtp_wrapper.py b/AlphaGo/gtp_wrapper.py deleted file mode 100644 index 1da8f03..0000000 --- a/AlphaGo/gtp_wrapper.py +++ /dev/null @@ -1,70 +0,0 @@ -import gtp -import go -import utils - - -def translate_gtp_colors(gtp_color): - if gtp_color == gtp.BLACK: - return go.BLACK - elif gtp_color == gtp.WHITE: - return go.WHITE - else: - return go.EMPTY - - -class GtpInterface(object): - def __init__(self): - self.size = 9 - self.position = None - self.komi = 6.5 - self.clear() - - def set_size(self, n): - self.size = n - go.set_board_size(n) - self.clear() - - def set_komi(self, komi): - self.komi = komi - self.position.komi = komi - - def clear(self): - self.position = go.Position(komi=self.komi) - - def accomodate_out_of_turn(self, color): - if not translate_gtp_colors(color) == self.position.to_play: - self.position.flip_playerturn(mutate=True) - - def make_move(self, color, vertex): - coords = utils.parse_pygtp_coords(vertex) - self.accomodate_out_of_turn(color) - try: - self.position = self.position.play_move(coords, 
color=translate_gtp_colors(color)) - except go.IllegalMove: - return False - return True - - def get_move(self, color): - self.accomodate_out_of_turn(color) - if self.should_resign(self.position): - return gtp.RESIGN - - if self.should_pass(self.position): - return gtp.PASS - - move = self.suggest_move(self.position) - return utils.unparse_pygtp_coords(move) - - def should_resign(self, position): - if position.caps[0] + 50 < position.caps[1]: - return gtp.RESIGN - - def should_pass(self, position): - # Pass if the opponent passes - return position.n > 100 and position.recent and position.recent[-1].move == None - - def get_score(self): - return self.position.result() - - def suggest_move(self, position): - raise NotImplementedError diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 18ce869..180186a 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -13,12 +13,11 @@ print "Start Name Sever : " + str(start_new_server.pid)# + str(start_new_server. time.sleep(1) agent_v0 = subprocess.Popen(['python', '-u', 'player.py', '--role=black'], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) -time.sleep(3) print "Start Player 0 at : " + str(agent_v0.pid) agent_v1 = subprocess.Popen(['python', '-u', 'player.py', '--role=white', '--checkpoint_path=./checkpoints_origin/'], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) -time.sleep(3) print "Start Player 1 at : " + str(agent_v1.pid) +time.sleep(5) player = [None] * 2 player[0] = Pyro4.Proxy("PYRONAME:black") diff --git a/AlphaGo/player.py b/AlphaGo/player.py index 36965a9..8245c38 100644 --- a/AlphaGo/player.py +++ b/AlphaGo/player.py @@ -8,6 +8,10 @@ from engine import GTPEngine @Pyro4.expose class Player(object): + """ + This is the class which defines the object called by Pyro4 (Python remote object). + It passes the command to our engine, and return the result. 
+ """ def __init__(self, **kwargs): self.role = kwargs['role'] self.engine = kwargs['engine'] diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index 327111d..5a55002 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -13,7 +13,6 @@ from tianshou.core.mcts.mcts import MCTS DELTA = [[1, 0], [-1, 0], [0, -1], [0, 1]] CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] - class GoEnv: def __init__(self, size=9, komi=6.5): self.size = size @@ -221,37 +220,3 @@ class GoEnv: np.array(1 - state[:, :, :, -1]).reshape(1, self.size, self.size, 1)], axis=3) return new_state, 0 - - -class strategy(object): - def __init__(self, checkpoint_path): - self.simulator = GoEnv() - self.net = network_small.Network() - self.sess = self.net.forward(checkpoint_path) - self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], - feed_dict={self.net.x: state, self.net.is_training: False}) - - def data_process(self, history, color): - state = np.zeros([1, self.simulator.size, self.simulator.size, 17]) - for i in range(8): - state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(self.simulator.size ** 2)).reshape(self.simulator.size, self.simulator.size) - state[0, :, :, i + 8] = np.array(np.array(history[i]) == -np.ones(self.simulator.size ** 2)).reshape(self.simulator.size, self.simulator.size) - if color == utils.BLACK: - state[0, :, :, 16] = np.ones([self.simulator.size, self.simulator.size]) - if color == utils.WHITE: - state[0, :, :, 16] = np.zeros([self.simulator.size, self.simulator.size]) - return state - - def gen_move(self, history, color): - self.simulator.history = copy.copy(history) - self.simulator.board = copy.copy(history[-1]) - state = self.data_process(self.simulator.history, color) - mcts = MCTS(self.simulator, self.evaluator, state, self.simulator.size ** 2 + 1, inverse=True, max_step=10) - temp = 1 - prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) - choice = np.random.choice(self.simulator.size ** 2 + 1, 1, 
p=prob).tolist()[0] - if choice == self.simulator.size ** 2: - move = utils.PASS - else: - move = (choice % self.simulator.size + 1, choice / self.simulator.size + 1) - return move, prob diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index e29d919..47b0768 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -168,6 +168,7 @@ class MCTS(object): if max_step is None and max_time is None: raise ValueError("Need a stop criteria!") + # TODO: running mcts should be implemented in another function, e.g. def search(self, max_step, max_time) self.select_time = [] self.evaluate_time = [] self.bp_time = [] From b8bdfea8bd9e01115c570e9e2de6cdb7c3633b46 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Sat, 16 Dec 2017 14:33:31 +0800 Subject: [PATCH 07/98] start the player server in a more robost way. --- AlphaGo/play.py | 110 +++++++++++++++++++++++++----------------------- 1 file changed, 58 insertions(+), 52 deletions(-) diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 180186a..242ba52 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -4,63 +4,69 @@ import re import Pyro4 import time -#start a name server to find the remote object -kill_old_server = subprocess.Popen(['killall', 'pyro4-ns']) -print "kill old server, the return code is : " + str(kill_old_server.wait()) -time.sleep(1) -start_new_server = subprocess.Popen(['pyro4-ns', '&']) -print "Start Name Sever : " + str(start_new_server.pid)# + str(start_new_server.wait()) -time.sleep(1) -agent_v0 = subprocess.Popen(['python', '-u', 'player.py', '--role=black'], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) -print "Start Player 0 at : " + str(agent_v0.pid) -agent_v1 = subprocess.Popen(['python', '-u', 'player.py', '--role=white', '--checkpoint_path=./checkpoints_origin/'], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) -print "Start Player 1 at : " + str(agent_v1.pid) -time.sleep(5) +if __name__ == '__main__': + # start 
a name server to find the remote object + kill_old_server = subprocess.Popen(['killall', 'pyro4-ns']) + print "kill old server, the return code is : " + str(kill_old_server.wait()) + time.sleep(1) + start_new_server = subprocess.Popen(['pyro4-ns', '&']) + print "Start Name Sever : " + str(start_new_server.pid) # + str(start_new_server.wait()) + time.sleep(1) + agent_v0 = subprocess.Popen(['python', '-u', 'player.py', '--role=black'], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + agent_v1 = subprocess.Popen(['python', '-u', 'player.py', '--role=white', '--checkpoint_path=./checkpoints_origin/'], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + server_list = "" + while ("black" not in server_list) or ("white" not in server_list): + server_list = subprocess.check_output(['pyro4-nsc', 'list']) + print "Waining for the server start..." + time.sleep(1) + print server_list + print "Start black player at : " + str(agent_v0.pid) + print "Start white player at : " + str(agent_v1.pid) -player = [None] * 2 -player[0] = Pyro4.Proxy("PYRONAME:black") -player[1] = Pyro4.Proxy("PYRONAME:white") + player = [None] * 2 + player[0] = Pyro4.Proxy("PYRONAME:black") + player[1] = Pyro4.Proxy("PYRONAME:white") -role = ["BLACK", "WHITE"] -color = ['b', 'w'] + role = ["BLACK", "WHITE"] + color = ['b', 'w'] -pattern = "[A-Z]{1}[0-9]{1}" -size = 9 -show = ['.', 'X', 'O'] + pattern = "[A-Z]{1}[0-9]{1}" + size = 9 + show = ['.', 'X', 'O'] -game_num = 0 -while game_num < 1: - num = 0 - pass_flag = [False, False] - print("Start game {}".format(game_num)) - # end the game if both palyer chose to pass, or play too much turns - while not (pass_flag[0] and pass_flag[1]) and num < size ** 2 * 2: - turn = num % 2 - move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn] + '\n') - print role[turn] + " : " + str(move), - num += 1 - match = re.search(pattern, move) - if match is not None: - #print "match : " + str(match.group()) - 
play_or_pass = match.group() - pass_flag[turn] = False - else: - #print "no match" - play_or_pass = ' PASS' - pass_flag[turn] = True - result = player[1 - turn].run_cmd(str(num) + ' play ' + color[turn] + ' ' + play_or_pass + '\n') - board = player[turn].run_cmd(str(num) + ' show_board') - board = eval(board[board.index('['):board.index(']') + 1]) - for i in range(size): - for j in range(size): - print show[board[i * size + j]] + " ", - print "\n", + game_num = 0 + while game_num < 1: + num = 0 + pass_flag = [False, False] + print("Start game {}".format(game_num)) + # end the game if both palyer chose to pass, or play too much turns + while not (pass_flag[0] and pass_flag[1]) and num < size ** 2 * 2: + turn = num % 2 + move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn] + '\n') + print role[turn] + " : " + str(move), + num += 1 + match = re.search(pattern, move) + if match is not None: + # print "match : " + str(match.group()) + play_or_pass = match.group() + pass_flag[turn] = False + else: + # print "no match" + play_or_pass = ' PASS' + pass_flag[turn] = True + result = player[1 - turn].run_cmd(str(num) + ' play ' + color[turn] + ' ' + play_or_pass + '\n') + board = player[turn].run_cmd(str(num) + ' show_board') + board = eval(board[board.index('['):board.index(']') + 1]) + for i in range(size): + for j in range(size): + print show[board[i * size + j]] + " ", + print "\n", - score = player[turn].run_cmd(str(num) + ' get_score') - print "Finished : ", score.split(" ")[1] - player[0].run_cmd(str(num) + ' clear_board') + score = player[turn].run_cmd(str(num) + ' get_score') + print "Finished : ", score.split(" ")[1] + player[0].run_cmd(str(num) + ' clear_board') player[1].run_cmd(str(num) + ' clear_board') game_num += 1 From 431f551ce9ce44015e7563ffc5db06f9caf8fd2e Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Sat, 16 Dec 2017 14:55:19 +0800 Subject: [PATCH 08/98] check if the network weights exists for every player --- AlphaGo/play.py | 32 
++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 242ba52..d6e6138 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -3,15 +3,34 @@ import sys import re import Pyro4 import time +import os if __name__ == '__main__': - # start a name server to find the remote object + """ + Starting two different players which load network weights to evaluate the winning ratio. + Note that, this function requires the installation of the Pyro4 library. + """ + # TODO : we should set the network path in a more configurable way. + black_weight_path = "./checkpoints" + white_weight_path = "./checkpoints_origin" + if (not os.path.exists(black_weight_path)): + print "Can't not find the network weights for black player." + sys.exit() + if (not os.path.exists(white_weight_path)): + print "Can't not find the network weights for white player." + sys.exit() + + # kill the old server kill_old_server = subprocess.Popen(['killall', 'pyro4-ns']) - print "kill old server, the return code is : " + str(kill_old_server.wait()) + print "kill the old pyro4 name server, the return code is : " + str(kill_old_server.wait()) time.sleep(1) + + # start a name server to find the remote object start_new_server = subprocess.Popen(['pyro4-ns', '&']) print "Start Name Sever : " + str(start_new_server.pid) # + str(start_new_server.wait()) time.sleep(1) + + # start two different player with different network weights. 
agent_v0 = subprocess.Popen(['python', '-u', 'player.py', '--role=black'], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) agent_v1 = subprocess.Popen(['python', '-u', 'player.py', '--role=white', '--checkpoint_path=./checkpoints_origin/'], @@ -36,8 +55,9 @@ if __name__ == '__main__': size = 9 show = ['.', 'X', 'O'] + evaluate_rounds = 1 game_num = 0 - while game_num < 1: + while game_num < evaluate_rounds: num = 0 pass_flag = [False, False] print("Start game {}".format(game_num)) @@ -70,6 +90,6 @@ if __name__ == '__main__': player[1].run_cmd(str(num) + ' clear_board') game_num += 1 -subprocess.call(["kill", "-9", str(agent_v0.pid)]) -subprocess.call(["kill", "-9", str(agent_v1.pid)]) -print "Kill all player, finish all game." + subprocess.call(["kill", "-9", str(agent_v0.pid)]) + subprocess.call(["kill", "-9", str(agent_v1.pid)]) + print "Kill all player, finish all game." From e10acf51303bf5fb6c246bf1e9d87ba3aede92bb Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Sat, 16 Dec 2017 23:29:11 +0800 Subject: [PATCH 09/98] 0. 
code refactor, try to merge Go and GoEnv --- AlphaGo/game.py | 41 ++++++++++++++------------- AlphaGo/go.py | 2 +- AlphaGo/play.py | 4 +-- AlphaGo/strategy.py | 57 +++++++++++++++++++++----------------- tianshou/core/mcts/mcts.py | 2 +- 5 files changed, 57 insertions(+), 49 deletions(-) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 360921e..02ccb27 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -31,14 +31,14 @@ class Game: self.komi = komi self.board = [utils.EMPTY] * (self.size * self.size) self.history = [] - self.past = deque(maxlen=8) + self.latest_boards = deque(maxlen=8) for _ in range(8): - self.past.append(self.board) + self.latest_boards.append(self.board) self.executor = go.Go(game=self) #self.strategy = strategy(checkpoint_path) - self.simulator = strategy.GoEnv() + self.simulator = strategy.GoEnv(game=self) self.net = network_small.Network() self.sess = self.net.forward(checkpoint_path) self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], @@ -57,7 +57,7 @@ class Game: self.board = [utils.EMPTY] * (self.size * self.size) self.history = [] for _ in range(8): - self.past.append(self.board) + self.latest_boards.append(self.board) def set_size(self, n): self.size = n @@ -66,29 +66,29 @@ class Game: def set_komi(self, k): self.komi = k - def data_process(self, history, color): - state = np.zeros([1, self.simulator.size, self.simulator.size, 17]) + def generate_nn_input(self, history, color): + state = np.zeros([1, self.size, self.size, 17]) for i in range(8): - state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(self.simulator.size ** 2)).reshape(self.simulator.size, self.simulator.size) - state[0, :, :, i + 8] = np.array(np.array(history[i]) == -np.ones(self.simulator.size ** 2)).reshape(self.simulator.size, self.simulator.size) + state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(self.size ** 2)).reshape(self.size, self.size) + state[0, :, :, i + 8] = np.array(np.array(history[i]) == 
-np.ones(self.size ** 2)).reshape(self.size, self.size) if color == utils.BLACK: - state[0, :, :, 16] = np.ones([self.simulator.size, self.simulator.size]) + state[0, :, :, 16] = np.ones([self.size, self.size]) if color == utils.WHITE: - state[0, :, :, 16] = np.zeros([self.simulator.size, self.simulator.size]) + state[0, :, :, 16] = np.zeros([self.size, self.size]) return state - def strategy_gen_move(self, history, color): - self.simulator.history = copy.copy(history) - self.simulator.board = copy.copy(history[-1]) - state = self.data_process(self.simulator.history, color) - mcts = MCTS(self.simulator, self.evaluator, state, self.simulator.size ** 2 + 1, inverse=True, max_step=10) + def strategy_gen_move(self, latest_boards, color): + self.simulator.latest_boards = copy.copy(latest_boards) + self.simulator.board = copy.copy(latest_boards[-1]) + nn_input = self.generate_nn_input(self.simulator.latest_boards, color) + mcts = MCTS(self.simulator, self.evaluator, nn_input, self.size ** 2 + 1, inverse=True, max_step=1) temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) - choice = np.random.choice(self.simulator.size ** 2 + 1, 1, p=prob).tolist()[0] - if choice == self.simulator.size ** 2: + choice = np.random.choice(self.size ** 2 + 1, 1, p=prob).tolist()[0] + if choice == self.size ** 2: move = utils.PASS else: - move = (choice % self.simulator.size + 1, choice / self.simulator.size + 1) + move = (choice % self.size + 1, choice / self.size + 1) return move, prob def do_move(self, color, vertex): @@ -100,7 +100,7 @@ class Game: def gen_move(self, color): # move = self.strategy.gen_move(color) # return move - move, self.prob = self.strategy_gen_move(self.past, color) + move, self.prob = self.strategy_gen_move(self.latest_boards, color) self.do_move(color, move) return move @@ -127,3 +127,6 @@ class Game: if __name__ == "__main__": g = Game() g.show_board() + #file = open("debug.txt", "a") + #file.write("mcts check\n") + #file.close() diff --git 
a/AlphaGo/go.py b/AlphaGo/go.py index 26540e1..0afc877 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -135,7 +135,7 @@ class Go: self.game.board[self.game._flatten(vertex)] = color self._process_board(color, vertex) self.game.history.append(copy.copy(self.game.board)) - self.game.past.append(copy.copy(self.game.board)) + self.game.latest_boards.append(copy.copy(self.game.board)) return True def _find_empty(self): diff --git a/AlphaGo/play.py b/AlphaGo/play.py index d6e6138..fe6c7ce 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -87,8 +87,8 @@ if __name__ == '__main__': score = player[turn].run_cmd(str(num) + ' get_score') print "Finished : ", score.split(" ")[1] player[0].run_cmd(str(num) + ' clear_board') - player[1].run_cmd(str(num) + ' clear_board') - game_num += 1 + player[1].run_cmd(str(num) + ' clear_board') + game_num += 1 subprocess.call(["kill", "-9", str(agent_v0.pid)]) subprocess.call(["kill", "-9", str(agent_v1.pid)]) diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index 5a55002..0bad998 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -14,15 +14,14 @@ DELTA = [[1, 0], [-1, 0], [0, -1], [0, 1]] CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] class GoEnv: - def __init__(self, size=9, komi=6.5): - self.size = size - self.komi = komi - self.board = [utils.EMPTY] * (self.size * self.size) - self.history = deque(maxlen=8) + def __init__(self, **kwargs): + self.game = kwargs['game'] + self.board = [utils.EMPTY] * (self.game.size * self.game.size) + self.latest_boards = deque(maxlen=8) def _flatten(self, vertex): x, y = vertex - return (x - 1) * self.size + (y - 1) + return (x - 1) * self.game.size + (y - 1) def _bfs(self, vertex, color, block, status, alive_break): block.append(vertex) @@ -35,7 +34,7 @@ class GoEnv: def _find_block(self, vertex, alive_break=False): block = [] - status = [False] * (self.size * self.size) + status = [False] * (self.game.size * self.game.size) color = self.board[self._flatten(vertex)] 
self._bfs(vertex, color, block, status, alive_break) @@ -73,7 +72,7 @@ class GoEnv: _board = copy.copy(self.board) self.board[self._flatten(vertex)] = color self._process_board(color, vertex) - if self.board in self.history: + if self.board in self.latest_boards: res = True else: res = False @@ -83,8 +82,8 @@ class GoEnv: def _in_board(self, vertex): x, y = vertex - if x < 1 or x > self.size: return False - if y < 1 or y > self.size: return False + if x < 1 or x > self.game.size: return False + if y < 1 or y > self.game.size: return False return True def _neighbor(self, vertex): @@ -151,21 +150,28 @@ class GoEnv: # print "many opponents, fake eye" return False - # def is_valid(self, color, vertex): - def is_valid(self, state, action): + def knowledge_prunning(self, color, vertex): + ### check if it is an eye of yourself + ### assumptions : notice that this judgement requires that the state is an endgame + if self._is_eye(color, vertex): + return False + return True + + def simulate_is_valid(self, state, action): # state is the play board, the shape is [1, 9, 9, 17] - if action == self.size * self.size: + if action == self.game.size * self.game.size: vertex = (0, 0) else: - vertex = (action / self.size + 1, action % self.size + 1) + vertex = (action / self.game.size + 1, action % self.game.size + 1) if state[0, 0, 0, -1] == utils.BLACK: color = utils.BLACK else: color = utils.WHITE - self.history.clear() + self.latest_boards.clear() for i in range(8): - self.history.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) - self.board = copy.copy(self.history[-1]) + self.latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) + self.board = copy.copy(self.latest_boards[-1]) + ### in board if not self._in_board(vertex): return False @@ -180,12 +186,11 @@ class GoEnv: if not self._is_qi(color, vertex): return False - ### check if it is an eye of yourself - ### assumptions : notice that this judgement requires that the 
state is an endgame - if self._is_eye(color, vertex): + ### forbid global isomorphous + if self._check_global_isomorphous(color, vertex): return False - if self._check_global_isomorphous(color, vertex): + if not self.knowledge_prunning(color, vertex): return False return True @@ -206,17 +211,17 @@ class GoEnv: color = utils.BLACK else: color = utils.WHITE - if action == self.size ** 2: + if action == self.game.size ** 2: vertex = utils.PASS else: - vertex = (action % self.size + 1, action / self.size + 1) + vertex = (action % self.game.size + 1, action / self.game.size + 1) # print(vertex) # print(self.board) self.board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() self.do_move(color, vertex) new_state = np.concatenate( - [state[:, :, :, 1:8], (np.array(self.board) == utils.BLACK).reshape(1, self.size, self.size, 1), - state[:, :, :, 9:16], (np.array(self.board) == utils.WHITE).reshape(1, self.size, self.size, 1), - np.array(1 - state[:, :, :, -1]).reshape(1, self.size, self.size, 1)], + [state[:, :, :, 1:8], (np.array(self.board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), + state[:, :, :, 9:16], (np.array(self.board) == utils.WHITE).reshape(1, self.game.size, self.game.size, 1), + np.array(1 - state[:, :, :, -1]).reshape(1, self.game.size, self.game.size, 1)], axis=3) return new_state, 0 diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 47b0768..979e994 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -75,7 +75,7 @@ class UCTNode(MCTSNode): start_time = time.time() self.mask = [] for act in range(self.action_num - 1): - if not simulator.is_valid(self.state, act): + if not simulator.simulate_is_valid(self.state, act): self.mask.append(act) self.ucb[act] = -float("Inf") else: From 62e2c6582dcd862e55d7cfb27ffb6c76f18af97f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=AE=8B=E4=B8=96=E8=99=B9?= Date: Sun, 17 Dec 2017 12:52:00 +0800 Subject: [PATCH 10/98] finished very naive dqn: 
changed the interface of replay buffer by adding collect and next_batch, but still need refactoring; added implementation of dqn.py, but still need to consider the interface to make it more extensive; slightly refactored the code style of the codebase; more comments and todos will be in the next commit --- examples/dqn_example.py | 26 +++-- tianshou/core/losses.py | 6 +- tianshou/core/policy/__init__.py | 3 +- tianshou/core/policy/base.py | 13 ++- tianshou/core/policy/dqn.py | 55 ++++++++- tianshou/data/advantage_estimation.py | 37 ++++++- tianshou/data/replay_buffer/buffer.py | 60 ++++++---- tianshou/data/replay_buffer/naive.py | 104 ++++++++++++++---- tianshou/data/replay_buffer/proportional.py | 85 +++++++++++++- tianshou/data/replay_buffer/rank_based.py | 83 ++++++++++++-- .../data/replay_buffer/replay_buffer_test.py | 10 +- tianshou/data/replay_buffer/utils.py | 33 +++--- 12 files changed, 411 insertions(+), 104 deletions(-) diff --git a/examples/dqn_example.py b/examples/dqn_example.py index 4fbe466..7d20731 100644 --- a/examples/dqn_example.py +++ b/examples/dqn_example.py @@ -9,8 +9,7 @@ import gym import sys sys.path.append('..') import tianshou.core.losses as losses -from tianshou.data.replay import Replay -import tianshou.data.advantage_estimation as advantage_estimation +from tianshou.data.replay_buffer.utils import get_replay_buffer import tianshou.core.policy as policy @@ -38,11 +37,10 @@ if __name__ == '__main__': action_dim = env.action_space.n # 1. 
build network with pure tf - observation = tf.placeholder(tf.float32, shape=(None,) + observation_dim) # network input + observation = tf.placeholder(tf.float32, shape=(None,) + observation_dim, name="dqn_observation") # network input with tf.variable_scope('q_net'): q_values = policy_net(observation, action_dim) - train_var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) # TODO: better management of TRAINABLE_VARIABLES with tf.variable_scope('target_net'): q_values_target = policy_net(observation, action_dim) @@ -54,13 +52,15 @@ if __name__ == '__main__': target = tf.placeholder(dtype=tf.float32, shape=[None]) # target value for DQN dqn_loss = losses.dqn_loss(action, target, q_net) # TongzhengRen - + global_step = tf.Variable(0, name='global_step', trainable=False) + train_var_list = tf.get_collection( + tf.GraphKeys.TRAINABLE_VARIABLES) # TODO: better management of TRAINABLE_VARIABLES total_loss = dqn_loss optimizer = tf.train.AdamOptimizer(1e-3) - train_op = optimizer.minimize(total_loss, var_list=train_var_list) - + train_op = optimizer.minimize(total_loss, var_list=train_var_list, global_step=tf.train.get_global_step()) # 3. define data collection - training_data = Replay(env, q_net, advantage_estimation.qlearning_target(target_net)) # + replay_memory = get_replay_buffer('rank_based', env, q_values, q_net, target_net, + {'size': 1000, 'batch_size': 64, 'learn_start': 20}) # ShihongSong: Replay(env, q_net, advantage_estimation.qlearning_target(target_network)), use your ReplayMemory, interact as follows. Simplify your advantage_estimation.dqn to run before YongRen's DQN # maybe a dict to manage the elements to be collected @@ -70,14 +70,16 @@ if __name__ == '__main__': minibatch_count = 0 collection_count = 0 + collect_freq = 100 while True: # until some stopping criterion met... 
# collect data - training_data.collect() # ShihongSong - collection_count += 1 - print('Collected {} times.'.format(collection_count)) + for i in range(0, collect_freq): + replay_memory.collect() # ShihongSong + collection_count += 1 + print('Collected {} times.'.format(collection_count)) # update network - data = training_data.next_batch(64) # YouQiaoben, ShihongSong + data = replay_memory.next_batch(10) # YouQiaoben, ShihongSong # TODO: auto managing of the placeholders? or add this to params of data.Batch sess.run(train_op, feed_dict={observation: data['observations'], action: data['actions'], target: data['target']}) minibatch_count += 1 diff --git a/tianshou/core/losses.py b/tianshou/core/losses.py index d281df9..3461afb 100644 --- a/tianshou/core/losses.py +++ b/tianshou/core/losses.py @@ -32,7 +32,7 @@ def vanilla_policy_gradient(sampled_action, reward, pi, baseline="None"): """ log_pi_act = pi.log_prob(sampled_action) vanilla_policy_gradient_loss = tf.reduce_mean(reward * log_pi_act) - # TODO: Different baseline methods like REINFORCE, etc. + # TODO: Different baseline methods like REINFORCE, etc. 
return vanilla_policy_gradient_loss def dqn_loss(sampled_action, sampled_target, q_net): @@ -44,8 +44,8 @@ def dqn_loss(sampled_action, sampled_target, q_net): :param q_net: current `policy` to be optimized :return: """ - action_num = q_net.get_values().shape()[1] - sampled_q = tf.reduce_sum(q_net.get_values() * tf.one_hot(sampled_action, action_num), axis=1) + action_num = q_net.values_tensor().get_shape()[1] + sampled_q = tf.reduce_sum(q_net.values_tensor() * tf.one_hot(sampled_action, action_num), axis=1) return tf.reduce_mean(tf.square(sampled_target - sampled_q)) def deterministic_policy_gradient(sampled_state, critic): diff --git a/tianshou/core/policy/__init__.py b/tianshou/core/policy/__init__.py index f67b3ba..ccde775 100644 --- a/tianshou/core/policy/__init__.py +++ b/tianshou/core/policy/__init__.py @@ -2,4 +2,5 @@ # -*- coding: utf-8 -*- from .base import * -from .stochastic import * \ No newline at end of file +from .stochastic import * +from .dqn import * \ No newline at end of file diff --git a/tianshou/core/policy/base.py b/tianshou/core/policy/base.py index b6d8d48..eecfc4f 100644 --- a/tianshou/core/policy/base.py +++ b/tianshou/core/policy/base.py @@ -12,23 +12,28 @@ import tensorflow as tf __all__ = [ 'StochasticPolicy', + 'QValuePolicy', ] -#TODO: separate actor and critic, we should focus on it once we finish the basic module. +# TODO: separate actor and critic, we should focus on it once we finish the basic module. + class QValuePolicy(object): """ The policy as in DQN """ def __init__(self, observation_placeholder): - self.observation_placeholder = observation_placeholder + self._observation_placeholder = observation_placeholder def act(self, observation, exploration=None): # first implement no exploration """ return the action (int) to be executed. no exploration when exploration=None. 
""" - pass + self._act(observation, exploration) + + def _act(self, observation, exploration = None): + raise NotImplementedError() def values(self, observation): """ @@ -36,7 +41,7 @@ class QValuePolicy(object): """ pass - def values_tensor(self, observation): + def values_tensor(self): """ returns the tensor of the values for all actions a at observation s """ diff --git a/tianshou/core/policy/dqn.py b/tianshou/core/policy/dqn.py index cfc6abf..81efc9b 100644 --- a/tianshou/core/policy/dqn.py +++ b/tianshou/core/policy/dqn.py @@ -1,7 +1,54 @@ - - -from .base import QValuePolicy +from tianshou.core.policy.base import QValuePolicy +import tensorflow as tf class DQN(QValuePolicy): - pass \ No newline at end of file + """ + The policy as in DQN + """ + + def __init__(self, logits, observation_placeholder, dtype=None, **kwargs): + self._logits = tf.convert_to_tensor(logits) + if dtype is None: + dtype = tf.int32 + self._n_categories = self._logits.get_shape()[-1].value + + super(DQN, self).__init__(observation_placeholder) + + net = tf.layers.conv2d(self._observation_placeholder, 16, 8, 4, 'valid', activation=tf.nn.relu) + net = tf.layers.conv2d(net, 32, 4, 2, 'valid', activation=tf.nn.relu) + net = tf.layers.flatten(net) + net = tf.layers.dense(net, 256, activation=tf.nn.relu, use_bias=True) + self._value = tf.layers.dense(net, self._n_categories) + + def _act(self, observation, exploration=None): # first implement no exploration + """ + return the action (int) to be executed. + no exploration when exploration=None. 
+ """ + sess = tf.get_default_session() + sampled_action = sess.run(tf.multinomial(self.logits, num_samples=1), + feed_dict={self._observation_placeholder: observation[None]}) + return sampled_action + + @property + def logits(self): + return self._logits + + @property + def n_categories(self): + return self._n_categories + + def values(self, observation): + """ + returns the Q(s, a) values (float) for all actions a at observation s + """ + sess = tf.get_default_session() + value = sess.run(self._value, feed_dict={self._observation_placeholder: observation[None]}) + return value + + def values_tensor(self): + """ + returns the tensor of the values for all actions a at observation s + """ + return self._value diff --git a/tianshou/data/advantage_estimation.py b/tianshou/data/advantage_estimation.py index 6f5b8a6..3c2d644 100644 --- a/tianshou/data/advantage_estimation.py +++ b/tianshou/data/advantage_estimation.py @@ -19,7 +19,8 @@ def full_return(raw_data): returns = rewards.copy() episode_start_idx = 0 for i in range(1, num_timesteps): - if episode_start_flags[i] or (i == num_timesteps - 1): # found the start of next episode or the end of all episodes + if episode_start_flags[i] or ( + i == num_timesteps - 1): # found the start of next episode or the end of all episodes if i < rewards.shape[0] - 1: t = i - 1 else: @@ -34,4 +35,36 @@ def full_return(raw_data): data['returns'] = returns - return data \ No newline at end of file + return data + + +class QLearningTarget: + def __init__(self, policy, gamma): + self._policy = policy + self._gamma = gamma + + def __call__(self, raw_data): + data = dict() + observations = list() + actions = list() + rewards = list() + wi = list() + all_data, data_wi, data_index = raw_data + + for i in range(0, all_data.shape[0]): + current_data = all_data[i] + current_wi = data_wi[i] + current_index = data_index[i] + observations.append(current_data['observation']) + actions.append(current_data['action']) + next_max_qvalue = 
np.max(self._policy.values(current_data['observation'])) + current_qvalue = self._policy.values(current_data['previous_observation'])[current_data['previous_action']] + reward = current_data['reward'] + next_max_qvalue - current_qvalue + rewards.append(reward) + wi.append(current_wi) + + data['observations'] = np.array(observations) + data['actions'] = np.array(actions) + data['rewards'] = np.array(rewards) + + return data diff --git a/tianshou/data/replay_buffer/buffer.py b/tianshou/data/replay_buffer/buffer.py index 4b92cfc..6a44170 100644 --- a/tianshou/data/replay_buffer/buffer.py +++ b/tianshou/data/replay_buffer/buffer.py @@ -1,39 +1,51 @@ class ReplayBuffer(object): - def __init__(self, conf): - ''' + def __init__(self, env, policy, qnet, target_qnet, conf): + """ Initialize a replay buffer with parameters in conf. - ''' - pass + """ + pass - def add(self, data, priority): - ''' + def add(self, data, priority): + """ Add a data with priority = priority to replay buffer. - ''' - pass + """ + pass - def update_priority(self, indices, priorities): - ''' + def collect(self): + """ + Collect data from current environment and policy. + """ + pass + + def next_batch(self, batch_size): + """ + get batch of data from the replay buffer. + """ + pass + + def update_priority(self, indices, priorities): + """ Update the data's priority whose indices = indices. For proportional replay buffer, the priority is the priority. For rank based replay buffer, the priorities parameter will be the delta used to update the priority. - ''' - pass + """ + pass - def reset_alpha(self, alpha): - ''' + def reset_alpha(self, alpha): + """ This function only works for proportional replay buffer. This function resets alpha. - ''' - pass + """ + pass - def sample(self, conf): - ''' + def sample(self, conf): + """ Sample from replay buffer with parameters in conf. 
- ''' - pass + """ + pass - def rebalance(self): - ''' + def rebalance(self): + """ This is for rank based priority replay buffer, which is used to rebalance the sum tree of the priority queue. - ''' - pass \ No newline at end of file + """ + pass diff --git a/tianshou/data/replay_buffer/naive.py b/tianshou/data/replay_buffer/naive.py index 9436a39..50ba1c3 100644 --- a/tianshou/data/replay_buffer/naive.py +++ b/tianshou/data/replay_buffer/naive.py @@ -1,29 +1,93 @@ -from buffer import ReplayBuffer import numpy as np +import tensorflow as tf from collections import deque +from math import fabs + +from tianshou.data.replay_buffer.buffer import ReplayBuffer + class NaiveExperience(ReplayBuffer): - def __init__(self, conf): - self.max_size = conf['size'] - self.n_entries = 0 - self.memory = deque(maxlen = self.max_size) + def __init__(self, env, policy, qnet, target_qnet, conf): + self.max_size = conf['size'] + self._env = env + self._policy = policy + self._qnet = qnet + self._target_qnet = target_qnet + self._begin_act() + self.n_entries = 0 + self.memory = deque(maxlen=self.max_size) - def add(self, data, priority = 0): - self.memory.append(data) - if self.n_entries < self.max_size: - self.n_entries += 1 + def add(self, data, priority=0): + self.memory.append(data) + if self.n_entries < self.max_size: + self.n_entries += 1 - def update_priority(self, indices, priorities = 0): - pass + def _begin_act(self): + self.observation = self._env.reset() + self.action = self._env.action_space.sample() + done = False + while not done: + if done: + self.observation = self._env.reset() + self.action = self._env.action_space.sample() + self.observation, _, done, _ = self._env.step(self.action) - def reset_alpha(self, alpha): - pass + def collect(self): + sess = tf.get_default_session() + current_data = dict() + current_data['previous_action'] = self.action + current_data['previous_observation'] = self.observation + self.action = np.argmax(sess.run(self._policy, 
feed_dict={"dqn_observation:0": self.observation.reshape((1,) + self.observation.shape)})) + self.observation, reward, done, _ = self._env.step(self.action) + current_data['action'] = self.action + current_data['observation'] = self.observation + current_data['reward'] = reward + self.add(current_data) + if done: + self._begin_act() - def sample(self, conf): - batch_size = conf['batch_size'] - batch_size = min(len(self.memory), batch_size) - idxs = np.random.choice(len(self.memory), batch_size) - return [self.memory[idx] for idx in idxs], [1] * len(idxs), idxs + def update_priority(self, indices, priorities=0): + pass - def rebalance(self): - pass + def reset_alpha(self, alpha): + pass + + def sample(self, conf): + batch_size = conf['batch_size'] + batch_size = min(len(self.memory), batch_size) + idxs = np.random.choice(len(self.memory), batch_size) + return [self.memory[idx] for idx in idxs], [1] * len(idxs), idxs + + def next_batch(self, batch_size): + data = dict() + observations = list() + actions = list() + rewards = list() + wi = list() + target = list() + + for i in range(0, batch_size): + current_datas, current_wis, current_indexs = self.sample({'batch_size': 1}) + current_data = current_datas[0] + current_wi = current_wis[0] + current_index = current_indexs[0] + observations.append(current_data['observation']) + actions.append(current_data['action']) + next_max_qvalue = np.max(self._target_qnet.values(current_data['observation'])) + current_qvalue = self._qnet.values(current_data['previous_observation'])[0, current_data['previous_action']] + reward = current_data['reward'] + next_max_qvalue - current_qvalue + rewards.append(reward) + target.append(current_data['reward'] + next_max_qvalue) + self.update_priority(current_index, [fabs(reward)]) + wi.append(current_wi) + + data['observations'] = np.array(observations) + data['actions'] = np.array(actions) + data['rewards'] = np.array(rewards) + data['wi'] = np.array(wi) + data['target'] = np.array(target) + + 
return data + + def rebalance(self): + pass diff --git a/tianshou/data/replay_buffer/proportional.py b/tianshou/data/replay_buffer/proportional.py index 72d1457..63aab66 100644 --- a/tianshou/data/replay_buffer/proportional.py +++ b/tianshou/data/replay_buffer/proportional.py @@ -1,7 +1,10 @@ -import numpy +import numpy as np import random -import sum_tree -from buffer import ReplayBuffer +import tensorflow as tf +import math + +from tianshou.data.replay_buffer import sum_tree +from tianshou.data.replay_buffer.buffer import ReplayBuffer class PropotionalExperience(ReplayBuffer): @@ -15,7 +18,7 @@ class PropotionalExperience(ReplayBuffer): """ - def __init__(self, conf): + def __init__(self, env, policy, qnet, target_qnet, conf): """ Prioritized experience replay buffer initialization. Parameters @@ -30,11 +33,26 @@ class PropotionalExperience(ReplayBuffer): """ memory_size = conf['size'] batch_size = conf['batch_size'] - alpha = conf['alpha'] + alpha = conf['alpha'] if 'alpha' in conf else 0.6 self.tree = sum_tree.SumTree(memory_size) self.memory_size = memory_size self.batch_size = batch_size self.alpha = alpha + self._env = env + self._policy = policy + self._qnet = qnet + self._target_qnet = target_qnet + self._begin_act() + + def _begin_act(self): + self.observation = self._env.reset() + self.action = self._env.action_space.sample() + done = False + while not done: + if done: + self.observation = self._env.reset() + self.action = self._env.action_space.sample() + self.observation, _, done, _ = self._env.step(self.action) def add(self, data, priority): """ Add new sample. @@ -48,6 +66,12 @@ class PropotionalExperience(ReplayBuffer): """ self.tree.add(data, priority**self.alpha) + def collect(self): + pass + + def next_batch(self, batch_size): + pass + def sample(self, conf): """ The method return samples randomly. @@ -64,8 +88,9 @@ class PropotionalExperience(ReplayBuffer): indices: list of sample indices The indices indicate sample positions in a sum tree. 
+ :param conf: giving beta """ - beta = conf['beta'] + beta = conf['beta'] if 'beta' in conf else 0.4 if self.tree.filled_size() < self.batch_size: return None, None, None @@ -91,6 +116,54 @@ class PropotionalExperience(ReplayBuffer): return out, weights, indices + def collect(self): + sess = tf.get_default_session() + current_data = dict() + current_data['previous_action'] = self.action + current_data['previous_observation'] = self.observation + # TODO: change the name of the feed_dict + self.action = np.argmax(sess.run(self._policy, feed_dict={"dqn_observation:0": self.observation.reshape((1,) + self.observation.shape)})) + self.observation, reward, done, _ = self._env.step(self.action) + current_data['action'] = self.action + current_data['observation'] = self.observation + current_data['reward'] = reward + priorities = np.array([self.tree.get_val(i) ** -self.alpha for i in range(self.tree.filled_size())]) + priority = np.max(priorities) if len(priorities) > 0 else 1 + self.add(current_data, priority) + if done: + self._begin_act() + + def next_batch(self, batch_size): + data = dict() + observations = list() + actions = list() + rewards = list() + wi = list() + target = list() + + for i in range(0, batch_size): + current_datas, current_wis, current_indexs = self.sample({'batch_size': 1}) + current_data = current_datas[0] + current_wi = current_wis[0] + current_index = current_indexs[0] + observations.append(current_data['observation']) + actions.append(current_data['action']) + next_max_qvalue = np.max(self._target_qnet.values(current_data['observation'])) + current_qvalue = self._qnet.values(current_data['previous_observation'])[0, current_data['previous_action']] + reward = current_data['reward'] + next_max_qvalue - current_qvalue + rewards.append(reward) + target.append(current_data['reward'] + next_max_qvalue) + self.update_priority([current_index], [math.fabs(reward)]) + wi.append(current_wi) + + data['observations'] = np.array(observations) + 
data['actions'] = np.array(actions) + data['rewards'] = np.array(rewards) + data['wi'] = np.array(wi) + data['target'] = np.array(target) + + return data + def update_priority(self, indices, priorities): """ The methods update samples's priority. diff --git a/tianshou/data/replay_buffer/rank_based.py b/tianshou/data/replay_buffer/rank_based.py index eb770af..da56763 100644 --- a/tianshou/data/replay_buffer/rank_based.py +++ b/tianshou/data/replay_buffer/rank_based.py @@ -8,13 +8,15 @@ import sys import math import random import numpy as np +import tensorflow as tf + +from tianshou.data.replay_buffer.binary_heap import BinaryHeap +from tianshou.data.replay_buffer.buffer import ReplayBuffer -from binary_heap import BinaryHeap -from buffer import ReplayBuffer class RankBasedExperience(ReplayBuffer): - def __init__(self, conf): + def __init__(self, env, policy, qnet, target_qnet, conf): self.size = conf['size'] self.replace_flag = conf['replace_old'] if 'replace_old' in conf else True self.priority_size = conf['priority_size'] if 'priority_size' in conf else self.size @@ -25,12 +27,18 @@ class RankBasedExperience(ReplayBuffer): self.learn_start = conf['learn_start'] if 'learn_start' in conf else 1000 self.total_steps = conf['steps'] if 'steps' in conf else 100000 # partition number N, split total size to N part - self.partition_num = conf['partition_num'] if 'partition_num' in conf else 100 + self.partition_num = conf['partition_num'] if 'partition_num' in conf else 10 self.index = 0 self.record_size = 0 self.isFull = False + self._env = env + self._policy = policy + self._qnet = qnet + self._target_qnet = target_qnet + self._begin_act() + self._experience = {} self.priority_queue = BinaryHeap(self.priority_size) self.distributions = self.build_distributions() @@ -98,7 +106,64 @@ class RankBasedExperience(ReplayBuffer): self.index += 1 return self.index - def add(self, data, priority = 0): + def _begin_act(self): + self.observation = self._env.reset() + self.action = 
self._env.action_space.sample() + done = False + while not done: + if done: + self.observation = self._env.reset() + self.action = self._env.action_space.sample() + self.observation, _, done, _ = self._env.step(self.action) + + def collect(self): + sess = tf.get_default_session() + current_data = dict() + current_data['previous_action'] = self.action + current_data['previous_observation'] = self.observation + self.action = np.argmax(sess.run(self._policy, feed_dict={"dqn_observation:0": self.observation.reshape((1,) + self.observation.shape)})) + self.observation, reward, done, _ = self._env.step(self.action) + current_data['action'] = self.action + current_data['observation'] = self.observation + current_data['reward'] = reward + self.add(current_data) + if done: + self._begin_act() + + def next_batch(self, batch_size): + data = dict() + observations = list() + actions = list() + rewards = list() + wi = list() + target = list() + + sess = tf.get_default_session() + current_datas, current_wis, current_indexs = self.sample({'global_step': sess.run(tf.train.get_global_step())}) + + for i in range(0, batch_size): + current_data = current_datas[i] + current_wi = current_wis[i] + current_index = current_indexs[i] + observations.append(current_data['observation']) + actions.append(current_data['action']) + next_max_qvalue = np.max(self._target_qnet.values(current_data['observation'])) + current_qvalue = self._qnet.values(current_data['previous_observation'])[0, current_data['previous_action']] + reward = current_data['reward'] + next_max_qvalue - current_qvalue + rewards.append(reward) + target.append(current_data['reward'] + next_max_qvalue) + self.update_priority([current_index], [math.fabs(reward)]) + wi.append(current_wi) + + data['observations'] = np.array(observations) + data['actions'] = np.array(actions) + data['rewards'] = np.array(rewards) + data['wi'] = np.array(wi) + data['target'] = np.array(target) + + return data + + def add(self, data, priority = 1): """ 
store experience, suggest that experience is a tuple of (s1, a, r, s2, t) so each experience is valid @@ -156,16 +221,16 @@ class RankBasedExperience(ReplayBuffer): sys.stderr.write('Record size less than learn start! Sample failed\n') return False, False, False - dist_index = math.floor(self.record_size / self.size * self.partition_num) + dist_index = math.floor(self.record_size * 1. / self.size * self.partition_num) # issue 1 by @camigord - partition_size = math.floor(self.size / self.partition_num) + partition_size = math.floor(self.size * 1. / self.partition_num) partition_max = dist_index * partition_size distribution = self.distributions[dist_index] rank_list = [] # sample from k segments for n in range(1, self.batch_size + 1): - index = random.randint(distribution['strata_ends'][n] + 1, - distribution['strata_ends'][n + 1]) + index = random.randint(distribution['strata_ends'][n], + distribution['strata_ends'][n + 1]) rank_list.append(index) # beta, increase by global_step, max 1 diff --git a/tianshou/data/replay_buffer/replay_buffer_test.py b/tianshou/data/replay_buffer/replay_buffer_test.py index 9be659b..46b25c8 100644 --- a/tianshou/data/replay_buffer/replay_buffer_test.py +++ b/tianshou/data/replay_buffer/replay_buffer_test.py @@ -1,13 +1,15 @@ -from utils import * from functions import * +from tianshou.data.replay_buffer.utils import get_replay_buffer + + def test_rank_based(): conf = {'size': 50, 'learn_start': 10, 'partition_num': 5, 'total_step': 100, 'batch_size': 4} - experience = getReplayBuffer('rank_based', conf) + experience = get_replay_buffer('rank_based', conf) # insert to experience print 'test insert experience' @@ -52,7 +54,7 @@ def test_proportional(): conf = {'size': 50, 'alpha': 0.7, 'batch_size': 4} - experience = getReplayBuffer('proportional', conf) + experience = get_replay_buffer('proportional', conf) # insert to experience print 'test insert experience' @@ -90,7 +92,7 @@ def test_proportional(): def test_naive(): conf = {'size': 
50} - experience = getReplayBuffer('naive', conf) + experience = get_replay_buffer('naive', conf) # insert to experience print 'test insert experience' diff --git a/tianshou/data/replay_buffer/utils.py b/tianshou/data/replay_buffer/utils.py index 3bb9bfe..4480375 100644 --- a/tianshou/data/replay_buffer/utils.py +++ b/tianshou/data/replay_buffer/utils.py @@ -1,17 +1,20 @@ -from rank_based import * -from proportional import * -from naive import * import sys -def getReplayBuffer(name, conf): - ''' - Get replay buffer according to the given name. - ''' - if (name == 'rank_based'): - return RankBasedExperience(conf) - elif (name == 'proportional'): - return PropotionalExperience(conf) - elif (name == 'naive'): - return NaiveExperience(conf) - else: - sys.stderr.write('no such replay buffer') +from tianshou.data.replay_buffer.naive import NaiveExperience +from tianshou.data.replay_buffer.proportional import PropotionalExperience +from tianshou.data.replay_buffer.rank_based import RankBasedExperience + + +def get_replay_buffer(name, env, policy, qnet, target_qnet, conf): + """ + Get replay buffer according to the given name. 
+ """ + + if name == 'rank_based': + return RankBasedExperience(env, policy, qnet, target_qnet, conf) + elif name == 'proportional': + return PropotionalExperience(env, policy, qnet, target_qnet, conf) + elif name == 'naive': + return NaiveExperience(env, policy, qnet, target_qnet, conf) + else: + sys.stderr.write('no such replay buffer') From 7693c38f44e4f7d8f024e78a225a9efdbac40a83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=AE=8B=E4=B8=96=E8=99=B9?= Date: Sun, 17 Dec 2017 13:28:21 +0800 Subject: [PATCH 11/98] add comments and todos --- examples/dqn_example.py | 5 +++++ tianshou/core/policy/dqn.py | 9 +++++++++ tianshou/data/replay_buffer/naive.py | 15 +++++++++++++++ tianshou/data/replay_buffer/proportional.py | 21 +++++++++++++++------ tianshou/data/replay_buffer/rank_based.py | 15 +++++++++++++++ 5 files changed, 59 insertions(+), 6 deletions(-) diff --git a/examples/dqn_example.py b/examples/dqn_example.py index 7d20731..b676475 100644 --- a/examples/dqn_example.py +++ b/examples/dqn_example.py @@ -37,6 +37,9 @@ if __name__ == '__main__': action_dim = env.action_space.n # 1. build network with pure tf + # TODO: + # pass the observation variable to the replay buffer or find a more reasonable way to help replay buffer + # access this observation variable. observation = tf.placeholder(tf.float32, shape=(None,) + observation_dim, name="dqn_observation") # network input with tf.variable_scope('q_net'): @@ -59,6 +62,7 @@ if __name__ == '__main__': optimizer = tf.train.AdamOptimizer(1e-3) train_op = optimizer.minimize(total_loss, var_list=train_var_list, global_step=tf.train.get_global_step()) # 3. define data collection + # configuration should be given as parameters, different replay buffer has different parameters. 
replay_memory = get_replay_buffer('rank_based', env, q_values, q_net, target_net, {'size': 1000, 'batch_size': 64, 'learn_start': 20}) # ShihongSong: Replay(env, q_net, advantage_estimation.qlearning_target(target_network)), use your ReplayMemory, interact as follows. Simplify your advantage_estimation.dqn to run before YongRen's DQN @@ -70,6 +74,7 @@ if __name__ == '__main__': minibatch_count = 0 collection_count = 0 + # need to first collect some then sample, collect_freq must be larger than batch_size collect_freq = 100 while True: # until some stopping criterion met... # collect data diff --git a/tianshou/core/policy/dqn.py b/tianshou/core/policy/dqn.py index 81efc9b..39f6a16 100644 --- a/tianshou/core/policy/dqn.py +++ b/tianshou/core/policy/dqn.py @@ -8,6 +8,7 @@ class DQN(QValuePolicy): """ def __init__(self, logits, observation_placeholder, dtype=None, **kwargs): + # TODO: this version only support non-continuous action space, extend it to support continuous action space self._logits = tf.convert_to_tensor(logits) if dtype is None: dtype = tf.int32 @@ -15,6 +16,7 @@ class DQN(QValuePolicy): super(DQN, self).__init__(observation_placeholder) + # TODO: put the net definition outside of the class net = tf.layers.conv2d(self._observation_placeholder, 16, 8, 4, 'valid', activation=tf.nn.relu) net = tf.layers.conv2d(net, 32, 4, 2, 'valid', activation=tf.nn.relu) net = tf.layers.flatten(net) @@ -26,6 +28,7 @@ class DQN(QValuePolicy): return the action (int) to be executed. no exploration when exploration=None. 
""" + # TODO: ensure thread safety sess = tf.get_default_session() sampled_action = sess.run(tf.multinomial(self.logits, num_samples=1), feed_dict={self._observation_placeholder: observation[None]}) @@ -33,10 +36,16 @@ class DQN(QValuePolicy): @property def logits(self): + """ + :return: action values + """ return self._logits @property def n_categories(self): + """ + :return: dimension of action space if not continuous + """ return self._n_categories def values(self, observation): diff --git a/tianshou/data/replay_buffer/naive.py b/tianshou/data/replay_buffer/naive.py index 50ba1c3..5eb4dd7 100644 --- a/tianshou/data/replay_buffer/naive.py +++ b/tianshou/data/replay_buffer/naive.py @@ -23,6 +23,10 @@ class NaiveExperience(ReplayBuffer): self.n_entries += 1 def _begin_act(self): + """ + if the previous interaction is ended or the interaction hasn't started + then begin act from the state of env.reset() + """ self.observation = self._env.reset() self.action = self._env.action_space.sample() done = False @@ -33,6 +37,10 @@ class NaiveExperience(ReplayBuffer): self.observation, _, done, _ = self._env.step(self.action) def collect(self): + """ + collect data for replay memory and update the priority according to the given data. + store the previous action, previous observation, reward, action, observation in the replay memory. + """ sess = tf.get_default_session() current_data = dict() current_data['previous_action'] = self.action @@ -59,6 +67,13 @@ class NaiveExperience(ReplayBuffer): return [self.memory[idx] for idx in idxs], [1] * len(idxs), idxs def next_batch(self, batch_size): + """ + collect a batch of data from replay buffer, update the priority and calculate the necessary statistics for + updating q value network. + :param batch_size: int batch size. + :return: a batch of data, with target storing the target q value and wi, rewards storing the coefficient + for gradient of q value network. 
+ """ data = dict() observations = list() actions = list() diff --git a/tianshou/data/replay_buffer/proportional.py b/tianshou/data/replay_buffer/proportional.py index 63aab66..52a231d 100644 --- a/tianshou/data/replay_buffer/proportional.py +++ b/tianshou/data/replay_buffer/proportional.py @@ -45,6 +45,10 @@ class PropotionalExperience(ReplayBuffer): self._begin_act() def _begin_act(self): + """ + if the previous interaction is ended or the interaction hasn't started + then begin act from the state of env.reset() + """ self.observation = self._env.reset() self.action = self._env.action_space.sample() done = False @@ -66,12 +70,6 @@ class PropotionalExperience(ReplayBuffer): """ self.tree.add(data, priority**self.alpha) - def collect(self): - pass - - def next_batch(self, batch_size): - pass - def sample(self, conf): """ The method return samples randomly. @@ -117,6 +115,10 @@ class PropotionalExperience(ReplayBuffer): return out, weights, indices def collect(self): + """ + collect data for replay memory and update the priority according to the given data. + store the previous action, previous observation, reward, action, observation in the replay memory. + """ sess = tf.get_default_session() current_data = dict() current_data['previous_action'] = self.action @@ -134,6 +136,13 @@ class PropotionalExperience(ReplayBuffer): self._begin_act() def next_batch(self, batch_size): + """ + collect a batch of data from replay buffer, update the priority and calculate the necessary statistics for + updating q value network. + :param batch_size: int batch size. + :return: a batch of data, with target storing the target q value and wi, rewards storing the coefficient + for gradient of q value network. 
+ """ data = dict() observations = list() actions = list() diff --git a/tianshou/data/replay_buffer/rank_based.py b/tianshou/data/replay_buffer/rank_based.py index da56763..b71ca68 100644 --- a/tianshou/data/replay_buffer/rank_based.py +++ b/tianshou/data/replay_buffer/rank_based.py @@ -107,6 +107,10 @@ class RankBasedExperience(ReplayBuffer): return self.index def _begin_act(self): + """ + if the previous interaction is ended or the interaction hasn't started + then begin act from the state of env.reset() + """ self.observation = self._env.reset() self.action = self._env.action_space.sample() done = False @@ -117,6 +121,10 @@ class RankBasedExperience(ReplayBuffer): self.observation, _, done, _ = self._env.step(self.action) def collect(self): + """ + collect data for replay memory and update the priority according to the given data. + store the previous action, previous observation, reward, action, observation in the replay memory. + """ sess = tf.get_default_session() current_data = dict() current_data['previous_action'] = self.action @@ -131,6 +139,13 @@ class RankBasedExperience(ReplayBuffer): self._begin_act() def next_batch(self, batch_size): + """ + collect a batch of data from replay buffer, update the priority and calculate the necessary statistics for + updating q value network. + :param batch_size: int batch size. + :return: a batch of data, with target storing the target q value and wi, rewards storing the coefficient + for gradient of q value network. 
+ """ data = dict() observations = list() actions = list() From 75bc2968d27f0e77bd24863d5a887d787bdf4c47 Mon Sep 17 00:00:00 2001 From: Tongzheng Ren Date: Mon, 18 Dec 2017 23:32:41 +0800 Subject: [PATCH 12/98] add a detailed Chinese google coding style for convenience --- .DS_Store | Bin 0 -> 8196 bytes AlphaGo/.DS_Store | Bin 0 -> 6148 bytes README.md | 4 +++- 3 files changed, 3 insertions(+), 1 deletion(-) create mode 100644 .DS_Store create mode 100644 AlphaGo/.DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..99fab83bc04e2a081117249f4d6eb1500ca26cd4 GIT binary patch literal 8196 zcmeHMOHUL*5U%DSvm!FQiF%m0F(H8vh&&F)I4ls{_<*nkA%M&5?yyW8y4RUmo?`am zZ!mf_G4bTVU*OS;SC1b21183^RzHAcRy~S|-J9;OrmL!}`Lw06+ViMb7R4J(a)jrG1vq z`7DPPd{f=P+V?{glmMDw0({K#H^G4!U@_}Af40u;ge5ihhg`|`!f0sd6RE3jXgt?s zBoaoV#aPR1vYkxiMCCB$1S{goE4Vo&FEHjG%T8|5b9b_4=Om+%<54%LGGBe*(E{5^ zu`(@$Vw^cA+C*OwC~Ni}9E^;P4p<{&!-oUb!T9(P%GbvZ4-KQ|+VIWv`qtk5!IQ(M zFT}Ees3#D#%Io9yB}y?;y!^IWCV6+X%+IEIywLN_7g{d1nr-bJZJljhU0vNhUA^s> zE@#b-oRfc44EU5Mna_fhUv{%*YlxJj^Nh{~?lvxXDROtCjJJ=f_L#%GTX=ZVwXj!0 zo6eICG3ZR-x^$%&c!k-GjnIvjs^n!JEgvx7pJSmHd5kWvis+HMm8KgEBk~!{mbfZh zbb}DF#%F2S2?B=}-PGQi@8n#6mZn$~v8^Q!SHWj4LS{-%-lN6jt*?^IMAGh)k_pKp zehrf>P)1!;&dI){eMcz`=*Mqk23FxQRNy(hg17J=_Xf6KN%pSWSrP! zf=rUzBu(a}ciqtqy>b*vB3VI<-O%k!Kz&AwsNr4>Pyhyb2v2|eNe}9(_lf~7?BYJz z!u{f-9JhZKvu}dCkcLT^hWogx>DcJsV%7X+HR|=;sVDUIB=m7XD->Qd)NaU-~&p!-K`?h26a==-g4nLIP3w5 z@8bjD&DfO0b@zl2LY~TgsWWdpesSy=fSK-=FMu(C9*bb-lvTjwzLWzyaV?LCMq}it zFz0WA4+q|{Au13R_}3JWcXy!gY>o|@@9*tJQC=>JazPJS9$vCM`=Q1wVO^rc5?6SR zkF2+3U&Cj8^H^OnGOiMtt`VNM`R;l58FzyfBX4*mtBS{f#Q)mJs!Q(->#(-WGgodN;>uHL zamT9E@BZevVdfb#FM@pP*W!0MvV#7%A#d~9^X~7x<@pDmkLcKnKcZj89G4u^no8a? 
z8)R%JNkf-D-ryW(nBf(1LjKa5qa00(G34m4IG$_H(fh{5C(z;z$g4Ediu_er4bAUM zBflZ%#IJWqcoHL|iS*Xw^~jTXRge+#r|K<#PU1ge+_S}c1B0(%8yeMndY zQ-`%hT{>9l5rEiXvl_4EV?j8H!_;AIk#}guQi+yo+!4cAI@=THmpZI1S~`q7d>GfW zaVHd`y)%BI?l7svSVsk-0&N8z*uP`h|Id$~|J$VaFDeif_^%W&o!Q&jltXfR>%rh; vuT5Cqv51LZZBZ$#+;*%NvK2pJQRBTtE{Lha+9F$M@k2mmh*ebJuPX2p(V7f* literal 0 HcmV?d00001 diff --git a/README.md b/README.md index 543d237..9c3af16 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,8 @@ Tianshou(å¤©ęŽˆ) is a reinforcement learning platform. The following image illus Please follow [google python coding style](https://google.github.io/styleguide/pyguide.html) +There's a more detailed Chinese version [google python coding style in Chinese](http://www.runoob.com/w3cnote/google-python-styleguide.html) + All files/folders should be named with lower case letters and underline (except specified names such as `AlphaGo`). Try to use full names. Don't use abbrevations for class/function/variable names except common abbrevations (such as `num` for number, `dim` for dimension, `env` for environment, `op` for operation). For now we use `pi` to refer to the policy in examples/ppo_example.py. @@ -73,4 +75,4 @@ HaoshengZou: collaborate mainly on Policy and losses; interfaces and architectur Note: install openai/gym first to run the Atari environment; note that interfaces between modules may not be finalized; the management of placeholders and `feed_dict` may have to be done manually for the time being; -Without preprocessing and other tricks, this example will not train to any meaningful results. Codes should past two tests: individual module test and run through this example code. \ No newline at end of file +Without preprocessing and other tricks, this example will not train to any meaningful results. Codes should past two tests: individual module test and run through this example code. 
From 6b6c48f122aad3fc415cfbaecbeae449fc8f632d Mon Sep 17 00:00:00 2001 From: Tongzheng Ren Date: Mon, 18 Dec 2017 23:34:32 +0800 Subject: [PATCH 13/98] update gitignore --- .DS_Store | Bin 8196 -> 0 bytes .gitignore | 1 + AlphaGo/.DS_Store | Bin 6148 -> 0 bytes 3 files changed, 1 insertion(+) delete mode 100644 .DS_Store delete mode 100644 AlphaGo/.DS_Store diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 99fab83bc04e2a081117249f4d6eb1500ca26cd4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8196 zcmeHMOHUL*5U%DSvm!FQiF%m0F(H8vh&&F)I4ls{_<*nkA%M&5?yyW8y4RUmo?`am zZ!mf_G4bTVU*OS;SC1b21183^RzHAcRy~S|-J9;OrmL!}`Lw06+ViMb7R4J(a)jrG1vq z`7DPPd{f=P+V?{glmMDw0({K#H^G4!U@_}Af40u;ge5ihhg`|`!f0sd6RE3jXgt?s zBoaoV#aPR1vYkxiMCCB$1S{goE4Vo&FEHjG%T8|5b9b_4=Om+%<54%LGGBe*(E{5^ zu`(@$Vw^cA+C*OwC~Ni}9E^;P4p<{&!-oUb!T9(P%GbvZ4-KQ|+VIWv`qtk5!IQ(M zFT}Ees3#D#%Io9yB}y?;y!^IWCV6+X%+IEIywLN_7g{d1nr-bJZJljhU0vNhUA^s> zE@#b-oRfc44EU5Mna_fhUv{%*YlxJj^Nh{~?lvxXDROtCjJJ=f_L#%GTX=ZVwXj!0 zo6eICG3ZR-x^$%&c!k-GjnIvjs^n!JEgvx7pJSmHd5kWvis+HMm8KgEBk~!{mbfZh zbb}DF#%F2S2?B=}-PGQi@8n#6mZn$~v8^Q!SHWj4LS{-%-lN6jt*?^IMAGh)k_pKp zehrf>P)1!;&dI){eMcz`=*Mqk23FxQRNy(hg17J=_Xf6KN%pSWSrP! zf=rUzBu(a}ciqtqy>b*vB3VI<-O%k!Kz&AwsNr4>Pyhyb2v2|eNe}9(_lf~7?BYJz z!u{f-9JhZKvu}dCkcLT^hWogx>DcJsV%7X+HR|=;sVDUIB=m7XD->Qd)NaU-~&p!-K`?h26a==-g4nLIP3w5 z@8bjD&DfO0b@zl2LY~TgsWWdpesSy=fSK-=FMu(C9*bb-lvTjwzLWzyaV?LCMq}it zFz0WA4+q|{Au13R_}3JWcXy!gY>o|@@9*tJQC=>JazPJS9$vCM`=Q1wVO^rc5?6SR zkF2+3U&Cj8^H^OnGOiMtt`VNM`R;l58FzyfBX4*mtBS{f#Q)mJs!Q(->#(-WGgodN;>uHL zamT9E@BZevVdfb#FM@pP*W!0MvV#7%A#d~9^X~7x<@pDmkLcKnKcZj89G4u^no8a? 
z8)R%JNkf-D-ryW(nBf(1LjKa5qa00(G34m4IG$_H(fh{5C(z;z$g4Ediu_er4bAUM zBflZ%#IJWqcoHL|iS*Xw^~jTXRge+#r|K<#PU1ge+_S}c1B0(%8yeMndY zQ-`%hT{>9l5rEiXvl_4EV?j8H!_;AIk#}guQi+yo+!4cAI@=THmpZI1S~`q7d>GfW zaVHd`y)%BI?l7svSVsk-0&N8z*uP`h|Id$~|J$VaFDeif_^%W&o!Q&jltXfR>%rh; vuT5Cqv51LZZBZ$#+;*%NvK2pJQRBTtE{Lha+9F$M@k2mmh*ebJuPX2p(V7f* From ea52096713fc42307b3bd5974f7f935edd1c58f5 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Tue, 19 Dec 2017 00:16:21 +0800 Subject: [PATCH 14/98] delete unused parameter of _find_block, and using _find_group to replace _find_block --- AlphaGo/go.py | 13 +++++---- AlphaGo/strategy.py | 66 ++++++++++++++++++++------------------------- 2 files changed, 35 insertions(+), 44 deletions(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 0afc877..752973e 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -13,25 +13,24 @@ Settings of the Go game. NEIGHBOR_OFFSET = [[1, 0], [-1, 0], [0, -1], [0, 1]] - class Go: def __init__(self, **kwargs): self.game = kwargs['game'] - def _bfs(self, vertex, color, block, status, alive_break): + def _bfs(self, vertex, color, block, status): block.append(vertex) status[self.game._flatten(vertex)] = True nei = self._neighbor(vertex) for n in nei: if not status[self.game._flatten(n)]: if self.game.board[self.game._flatten(n)] == color: - self._bfs(n, color, block, status, alive_break) + self._bfs(n, color, block, status) - def _find_block(self, vertex, alive_break=False): + def _find_block(self, vertex): block = [] status = [False] * (self.game.size * self.game.size) color = self.game.board[self.game._flatten(vertex)] - self._bfs(vertex, color, block, status, alive_break) + self._bfs(vertex, color, block, status) for b in block: for n in self._neighbor(b): @@ -42,7 +41,7 @@ class Go: def _find_boarder(self, vertex): block = [] status = [False] * (self.game.size * self.game.size) - self._bfs(vertex, utils.EMPTY, block, status, False) + self._bfs(vertex, utils.EMPTY, block, status) border = [] for b in block: for n in 
self._neighbor(b): @@ -106,7 +105,7 @@ class Go: nei = self._neighbor(vertex) for n in nei: if self.game.board[self.game._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n, alive_break=True) + can_kill, block = self._find_block(n) if can_kill: for b in block: self.game.board[self.game._flatten(b)] = utils.EMPTY diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index 0bad998..8c12c71 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -23,26 +23,32 @@ class GoEnv: x, y = vertex return (x - 1) * self.game.size + (y - 1) - def _bfs(self, vertex, color, block, status, alive_break): + def _find_group(self, start): + color = self.board[self._flatten(start)] + # print ("color : ", color) + chain = set() + frontier = [start] + has_liberty = False + while frontier: + current = frontier.pop() + # print ("current : ", current) + chain.add(current) + for n in self._neighbor(current): + # print n, self._flatten(n), self.board[self._flatten(n)], + if self.board[self._flatten(n)] == color and not n in chain: + frontier.append(n) + if self.board[self._flatten(n)] == utils.EMPTY: + has_liberty = True + return has_liberty, chain + + def _bfs(self, vertex, color, block, status): block.append(vertex) status[self._flatten(vertex)] = True nei = self._neighbor(vertex) for n in nei: if not status[self._flatten(n)]: if self.board[self._flatten(n)] == color: - self._bfs(n, color, block, status, alive_break) - - def _find_block(self, vertex, alive_break=False): - block = [] - status = [False] * (self.game.size * self.game.size) - color = self.board[self._flatten(vertex)] - self._bfs(vertex, color, block, status, alive_break) - - for b in block: - for n in self._neighbor(b): - if self.board[self._flatten(n)] == utils.EMPTY: - return False, block - return True, block + self._bfs(n, color, block, status) def _is_qi(self, color, vertex): nei = self._neighbor(vertex) @@ -53,14 +59,14 @@ class GoEnv: self.board[self._flatten(vertex)] = color for n in 
nei: if self.board[self._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n) - if can_kill: + has_liberty, group = self._find_group(n) + if not has_liberty: self.board[self._flatten(vertex)] = utils.EMPTY return True ### avoid suicide - can_kill, block = self._find_block(vertex) - if can_kill: + has_liberty, group = self._find_group(vertex) + if not has_liberty: self.board[self._flatten(vertex)] = utils.EMPTY return False @@ -110,26 +116,11 @@ class GoEnv: nei = self._neighbor(vertex) for n in nei: if self.board[self._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n, alive_break=True) - if can_kill: - for b in block: + has_liberty, group = self._find_group(n) + if not has_liberty: + for b in group: self.board[self._flatten(b)] = utils.EMPTY - def _find_group(self, start): - color = self.board[self._flatten(start)] - # print ("color : ", color) - chain = set() - frontier = [start] - while frontier: - current = frontier.pop() - # print ("current : ", current) - chain.add(current) - for n in self._neighbor(current): - # print n, self._flatten(n), self.board[self._flatten(n)], - if self.board[self._flatten(n)] == color and not n in chain: - frontier.append(n) - return chain - def _is_eye(self, color, vertex): nei = self._neighbor(vertex) cor = self._corner(vertex) @@ -137,7 +128,8 @@ class GoEnv: if False in ncolor: # print "not all neighbors are in same color with us" return False - if set(nei) < self._find_group(nei[0]): + _, group = self._find_group(nei[0]) + if set(nei) < group: # print "all neighbors are in same group and same color with us" return True else: From 6a410384bbcccd65fd204503c266b09fd1fc8f4b Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Tue, 19 Dec 2017 00:47:21 +0800 Subject: [PATCH 15/98] rewrite _is_qi in a more understandable way --- AlphaGo/strategy.py | 46 ++++++++++++++++++--------------------------- 1 file changed, 18 insertions(+), 28 deletions(-) diff --git a/AlphaGo/strategy.py 
b/AlphaGo/strategy.py index 8c12c71..e00e69d 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -41,37 +41,27 @@ class GoEnv: has_liberty = True return has_liberty, chain - def _bfs(self, vertex, color, block, status): - block.append(vertex) - status[self._flatten(vertex)] = True - nei = self._neighbor(vertex) - for n in nei: - if not status[self._flatten(n)]: - if self.board[self._flatten(n)] == color: - self._bfs(n, color, block, status) - - def _is_qi(self, color, vertex): - nei = self._neighbor(vertex) - for n in nei: - if self.board[self._flatten(n)] == utils.EMPTY: - return True - + def _is_suicide(self, color, vertex): + ### assume that we already take this move self.board[self._flatten(vertex)] = color - for n in nei: - if self.board[self._flatten(n)] == utils.another_color(color): - has_liberty, group = self._find_group(n) - if not has_liberty: - self.board[self._flatten(vertex)] = utils.EMPTY - return True - ### avoid suicide has_liberty, group = self._find_group(vertex) - if not has_liberty: + if has_liberty: + ### this group still has liberty after this move, not suicide self.board[self._flatten(vertex)] = utils.EMPTY return False - - self.board[self._flatten(vertex)] = utils.EMPTY - return True + else: + ### liberty is zero + for n in self._neighbor(vertex): + if self.board[self._flatten(n)] == utils.another_color(color): + opponent_liberty, group = self._find_group(n) + # this move is able to take opponent's stone, not suicide + if not opponent_liberty: + self.board[self._flatten(vertex)] = utils.EMPTY + return False + # not a take, suicide + self.board[self._flatten(vertex)] = utils.EMPTY + return True def _check_global_isomorphous(self, color, vertex): ##backup @@ -174,8 +164,8 @@ class GoEnv: # print(vertex) return False - ### check if it is qi - if not self._is_qi(color, vertex): + ### check if it is suicide + if self._is_suicide(color, vertex): return False ### forbid global isomorphous From 99a617a1f041643c1b0618d9de3b2017ed144b10 Mon 
Sep 17 00:00:00 2001 From: Dong Yan Date: Tue, 19 Dec 2017 11:16:17 +0800 Subject: [PATCH 16/98] rename variable for clarity --- AlphaGo/game.py | 16 ++++----- AlphaGo/go.py | 83 +++++++++++++++++++++++---------------------- AlphaGo/strategy.py | 60 ++++++++++++++++---------------- 3 files changed, 80 insertions(+), 79 deletions(-) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 02ccb27..3b62435 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -29,7 +29,7 @@ class Game: def __init__(self, size=9, komi=6.5, checkpoint_path=None): self.size = size self.komi = komi - self.board = [utils.EMPTY] * (self.size * self.size) + self.board = [utils.EMPTY] * (self.size ** 2) self.history = [] self.latest_boards = deque(maxlen=8) for _ in range(8): @@ -54,7 +54,7 @@ class Game: return (x,y) def clear(self): - self.board = [utils.EMPTY] * (self.size * self.size) + self.board = [utils.EMPTY] * (self.size ** 2) self.history = [] for _ in range(8): self.latest_boards.append(self.board) @@ -66,11 +66,11 @@ class Game: def set_komi(self, k): self.komi = k - def generate_nn_input(self, history, color): + def generate_nn_input(self, latest_boards, color): state = np.zeros([1, self.size, self.size, 17]) for i in range(8): - state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(self.size ** 2)).reshape(self.size, self.size) - state[0, :, :, i + 8] = np.array(np.array(history[i]) == -np.ones(self.size ** 2)).reshape(self.size, self.size) + state[0, :, :, i] = np.array(np.array(latest_boards[i]) == np.ones(self.size ** 2)).reshape(self.size, self.size) + state[0, :, :, i + 8] = np.array(np.array(latest_boards[i]) == -np.ones(self.size ** 2)).reshape(self.size, self.size) if color == utils.BLACK: state[0, :, :, 16] = np.ones([self.size, self.size]) if color == utils.WHITE: @@ -78,9 +78,9 @@ class Game: return state def strategy_gen_move(self, latest_boards, color): - self.simulator.latest_boards = copy.copy(latest_boards) - self.simulator.board = 
copy.copy(latest_boards[-1]) - nn_input = self.generate_nn_input(self.simulator.latest_boards, color) + self.simulator.simulate_latest_boards = copy.copy(latest_boards) + self.simulator.simulate_board = copy.copy(latest_boards[-1]) + nn_input = self.generate_nn_input(self.simulator.simulate_latest_boards, color) mcts = MCTS(self.simulator, self.evaluator, nn_input, self.size ** 2 + 1, inverse=True, max_step=1) temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 752973e..7b1d3e7 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -28,7 +28,7 @@ class Go: def _find_block(self, vertex): block = [] - status = [False] * (self.game.size * self.game.size) + status = [False] * (self.game.size ** 2) color = self.game.board[self.game._flatten(vertex)] self._bfs(vertex, color, block, status) @@ -40,7 +40,7 @@ class Go: def _find_boarder(self, vertex): block = [] - status = [False] * (self.game.size * self.game.size) + status = [False] * (self.game.size ** 2) self._bfs(vertex, utils.EMPTY, block, status) border = [] for b in block: @@ -141,6 +141,46 @@ class Go: idx = [i for i,x in enumerate(self.game.board) if x == utils.EMPTY ][0] return self.game._deflatten(idx) + def _add_nearby_stones(self, neighbor_vertex_set, start_vertex_x, start_vertex_y, x_diff, y_diff, num_step): + ''' + add the nearby stones around the input vertex + :param neighbor_vertex_set: input list + :param start_vertex_x: x axis of the input vertex + :param start_vertex_y: y axis of the input vertex + :param x_diff: add x axis + :param y_diff: add y axis + :param num_step: number of steps to be added + :return: + ''' + for step in xrange(num_step): + new_neighbor_vertex = (start_vertex_x, start_vertex_y) + if self._in_board(new_neighbor_vertex): + neighbor_vertex_set.append((start_vertex_x, start_vertex_y)) + start_vertex_x += x_diff + start_vertex_y += y_diff + + def _predict_from_nearby(self, vertex, neighbor_step = 3): + ''' + step: the nearby 
3 steps is considered + :vertex: position to be estimated + :neighbor_step: how many steps nearby + :return: the nearby positions of the input position + currently the nearby 3*3 grid is returned, altogether 4*8 points involved + ''' + for step in range(1, neighbor_step + 1): # check the stones within the steps in range + neighbor_vertex_set = [] + self._add_nearby_stones(neighbor_vertex_set, vertex[0] - step, vertex[1], 1, 1, neighbor_step) + self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] + step, 1, -1, neighbor_step) + self._add_nearby_stones(neighbor_vertex_set, vertex[0] + step, vertex[1], -1, -1, neighbor_step) + self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] - step, -1, 1, neighbor_step) + color_estimate = 0 + for neighbor_vertex in neighbor_vertex_set: + color_estimate += self.game.board[self.game._flatten(neighbor_vertex)] + if color_estimate > 0: + return utils.BLACK + elif color_estimate < 0: + return utils.WHITE + def get_score(self, is_unknown_estimation = False): ''' is_unknown_estimation: whether use nearby stone to predict the unknown @@ -170,42 +210,3 @@ class Go: self.game.board = _board return score - def _predict_from_nearby(self, vertex, neighbor_step = 3): - ''' - step: the nearby 3 steps is considered - :vertex: position to be estimated - :neighbor_step: how many steps nearby - :return: the nearby positions of the input position - currently the nearby 3*3 grid is returned, altogether 4*8 points involved - ''' - for step in range(1, neighbor_step + 1): # check the stones within the steps in range - neighbor_vertex_set = [] - self._add_nearby_stones(neighbor_vertex_set, vertex[0] - step, vertex[1], 1, 1, neighbor_step) - self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] + step, 1, -1, neighbor_step) - self._add_nearby_stones(neighbor_vertex_set, vertex[0] + step, vertex[1], -1, -1, neighbor_step) - self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] - step, -1, 1, neighbor_step) 
- color_estimate = 0 - for neighbor_vertex in neighbor_vertex_set: - color_estimate += self.game.board[self.game._flatten(neighbor_vertex)] - if color_estimate > 0: - return utils.BLACK - elif color_estimate < 0: - return utils.WHITE - - def _add_nearby_stones(self, neighbor_vertex_set, start_vertex_x, start_vertex_y, x_diff, y_diff, num_step): - ''' - add the nearby stones around the input vertex - :param neighbor_vertex_set: input list - :param start_vertex_x: x axis of the input vertex - :param start_vertex_y: y axis of the input vertex - :param x_diff: add x axis - :param y_diff: add y axis - :param num_step: number of steps to be added - :return: - ''' - for step in xrange(num_step): - new_neighbor_vertex = (start_vertex_x, start_vertex_y) - if self._in_board(new_neighbor_vertex): - neighbor_vertex_set.append((start_vertex_x, start_vertex_y)) - start_vertex_x += x_diff - start_vertex_y += y_diff diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index e00e69d..fe6bcbf 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -16,15 +16,15 @@ CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] class GoEnv: def __init__(self, **kwargs): self.game = kwargs['game'] - self.board = [utils.EMPTY] * (self.game.size * self.game.size) - self.latest_boards = deque(maxlen=8) + self.simulate_board = [utils.EMPTY] * (self.game.size ** 2) + self.simulate_latest_boards = deque(maxlen=8) - def _flatten(self, vertex): + def simulate_flatten(self, vertex): x, y = vertex return (x - 1) * self.game.size + (y - 1) def _find_group(self, start): - color = self.board[self._flatten(start)] + color = self.simulate_board[self.simulate_flatten(start)] # print ("color : ", color) chain = set() frontier = [start] @@ -35,45 +35,45 @@ class GoEnv: chain.add(current) for n in self._neighbor(current): # print n, self._flatten(n), self.board[self._flatten(n)], - if self.board[self._flatten(n)] == color and not n in chain: + if self.simulate_board[self.simulate_flatten(n)] == color and 
not n in chain: frontier.append(n) - if self.board[self._flatten(n)] == utils.EMPTY: + if self.simulate_board[self.simulate_flatten(n)] == utils.EMPTY: has_liberty = True return has_liberty, chain def _is_suicide(self, color, vertex): ### assume that we already take this move - self.board[self._flatten(vertex)] = color + self.simulate_board[self.simulate_flatten(vertex)] = color has_liberty, group = self._find_group(vertex) if has_liberty: ### this group still has liberty after this move, not suicide - self.board[self._flatten(vertex)] = utils.EMPTY + self.simulate_board[self.simulate_flatten(vertex)] = utils.EMPTY return False else: ### liberty is zero for n in self._neighbor(vertex): - if self.board[self._flatten(n)] == utils.another_color(color): + if self.simulate_board[self.simulate_flatten(n)] == utils.another_color(color): opponent_liberty, group = self._find_group(n) # this move is able to take opponent's stone, not suicide if not opponent_liberty: - self.board[self._flatten(vertex)] = utils.EMPTY + self.simulate_board[self.simulate_flatten(vertex)] = utils.EMPTY return False # not a take, suicide - self.board[self._flatten(vertex)] = utils.EMPTY + self.simulate_board[self.simulate_flatten(vertex)] = utils.EMPTY return True def _check_global_isomorphous(self, color, vertex): ##backup - _board = copy.copy(self.board) - self.board[self._flatten(vertex)] = color + _board = copy.copy(self.simulate_board) + self.simulate_board[self.simulate_flatten(vertex)] = color self._process_board(color, vertex) - if self.board in self.latest_boards: + if self.simulate_board in self.simulate_latest_boards: res = True else: res = False - self.board = _board + self.simulate_board = _board return res def _in_board(self, vertex): @@ -105,16 +105,16 @@ class GoEnv: def _process_board(self, color, vertex): nei = self._neighbor(vertex) for n in nei: - if self.board[self._flatten(n)] == utils.another_color(color): + if self.simulate_board[self.simulate_flatten(n)] == 
utils.another_color(color): has_liberty, group = self._find_group(n) if not has_liberty: for b in group: - self.board[self._flatten(b)] = utils.EMPTY + self.simulate_board[self.simulate_flatten(b)] = utils.EMPTY def _is_eye(self, color, vertex): nei = self._neighbor(vertex) cor = self._corner(vertex) - ncolor = {color == self.board[self._flatten(n)] for n in nei} + ncolor = {color == self.simulate_board[self.simulate_flatten(n)] for n in nei} if False in ncolor: # print "not all neighbors are in same color with us" return False @@ -123,7 +123,7 @@ class GoEnv: # print "all neighbors are in same group and same color with us" return True else: - opponent_number = [self.board[self._flatten(c)] for c in cor].count(-color) + opponent_number = [self.simulate_board[self.simulate_flatten(c)] for c in cor].count(-color) opponent_propotion = float(opponent_number) / float(len(cor)) if opponent_propotion < 0.5: # print "few opponents, real eye" @@ -141,7 +141,7 @@ class GoEnv: def simulate_is_valid(self, state, action): # state is the play board, the shape is [1, 9, 9, 17] - if action == self.game.size * self.game.size: + if action == self.game.size ** 2: vertex = (0, 0) else: vertex = (action / self.game.size + 1, action % self.game.size + 1) @@ -149,17 +149,17 @@ class GoEnv: color = utils.BLACK else: color = utils.WHITE - self.latest_boards.clear() + self.simulate_latest_boards.clear() for i in range(8): - self.latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) - self.board = copy.copy(self.latest_boards[-1]) + self.simulate_latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) + self.simulate_board = copy.copy(self.simulate_latest_boards[-1]) ### in board if not self._in_board(vertex): return False ### already have stone - if not self.board[self._flatten(vertex)] == utils.EMPTY: + if not self.simulate_board[self.simulate_flatten(vertex)] == utils.EMPTY: # print(np.array(self.board).reshape(9, 9)) # 
print(vertex) return False @@ -181,9 +181,9 @@ class GoEnv: if vertex == utils.PASS: return True - id_ = self._flatten(vertex) - if self.board[id_] == utils.EMPTY: - self.board[id_] = color + id_ = self.simulate_flatten(vertex) + if self.simulate_board[id_] == utils.EMPTY: + self.simulate_board[id_] = color return True else: return False @@ -199,11 +199,11 @@ class GoEnv: vertex = (action % self.game.size + 1, action / self.game.size + 1) # print(vertex) # print(self.board) - self.board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() + self.simulate_board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() self.do_move(color, vertex) new_state = np.concatenate( - [state[:, :, :, 1:8], (np.array(self.board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), - state[:, :, :, 9:16], (np.array(self.board) == utils.WHITE).reshape(1, self.game.size, self.game.size, 1), + [state[:, :, :, 1:8], (np.array(self.simulate_board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), + state[:, :, :, 9:16], (np.array(self.simulate_board) == utils.WHITE).reshape(1, self.game.size, self.game.size, 1), np.array(1 - state[:, :, :, -1]).reshape(1, self.game.size, self.game.size, 1)], axis=3) return new_state, 0 From 4440294c121d4fb36d62db703ce8e7d779424b42 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Tue, 19 Dec 2017 12:00:17 +0800 Subject: [PATCH 17/98] fix bug in check_global_isomorphous and refactor _is_suicide again --- AlphaGo/strategy.py | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index fe6bcbf..e9457cf 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -42,33 +42,27 @@ class GoEnv: return has_liberty, chain def _is_suicide(self, color, vertex): - ### assume that we already take this move - self.simulate_board[self.simulate_flatten(vertex)] = color + self.simulate_board[self.simulate_flatten(vertex)] = color # assume 
that we already take this move + suicide = False has_liberty, group = self._find_group(vertex) - if has_liberty: - ### this group still has liberty after this move, not suicide - self.simulate_board[self.simulate_flatten(vertex)] = utils.EMPTY - return False - else: - ### liberty is zero + if not has_liberty: + suicide = True # no liberty, suicide for n in self._neighbor(vertex): if self.simulate_board[self.simulate_flatten(n)] == utils.another_color(color): opponent_liberty, group = self._find_group(n) - # this move is able to take opponent's stone, not suicide if not opponent_liberty: - self.simulate_board[self.simulate_flatten(vertex)] = utils.EMPTY - return False - # not a take, suicide - self.simulate_board[self.simulate_flatten(vertex)] = utils.EMPTY - return True + suicide = False # this move is able to take opponent's stone, not suicide + + self.simulate_board[self.simulate_flatten(vertex)] = utils.EMPTY # undo this move + return suicide def _check_global_isomorphous(self, color, vertex): ##backup _board = copy.copy(self.simulate_board) self.simulate_board[self.simulate_flatten(vertex)] = color self._process_board(color, vertex) - if self.simulate_board in self.simulate_latest_boards: + if self.simulate_board in self.game.history: res = True else: res = False @@ -140,7 +134,9 @@ class GoEnv: return True def simulate_is_valid(self, state, action): - # state is the play board, the shape is [1, 9, 9, 17] + # State is the play board, the shape is [1, self.game.size, self.game.size, 17]. 
+ # Action is an index + # We need to transfer the (state, action) pair into (color, vertex) pair to simulate the move if action == self.game.size ** 2: vertex = (0, 0) else: @@ -177,7 +173,7 @@ class GoEnv: return True - def do_move(self, color, vertex): + def simulate_do_move(self, color, vertex): if vertex == utils.PASS: return True @@ -200,7 +196,7 @@ class GoEnv: # print(vertex) # print(self.board) self.simulate_board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() - self.do_move(color, vertex) + self.simulate_do_move(color, vertex) new_state = np.concatenate( [state[:, :, :, 1:8], (np.array(self.simulate_board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), state[:, :, :, 9:16], (np.array(self.simulate_board) == utils.WHITE).reshape(1, self.game.size, self.game.size, 1), From 0991fef527e73617114949a406e9da4632865e2d Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Tue, 19 Dec 2017 15:09:46 +0800 Subject: [PATCH 18/98] deflatten debug --- AlphaGo/game.py | 10 +++++----- AlphaGo/strategy.py | 9 +++++++-- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 3b62435..2a82d8e 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -46,12 +46,12 @@ class Game: def _flatten(self, vertex): x, y = vertex - return (y - 1) * self.size + (x - 1) + return (x - 1) * self.size + (y - 1) def _deflatten(self, idx): - x = idx % self.size + 1 - y = idx // self.size + 1 - return (x,y) + x = idx // self.size + 1 + y = idx % self.size + 1 + return (x, y) def clear(self): self.board = [utils.EMPTY] * (self.size ** 2) @@ -88,7 +88,7 @@ class Game: if choice == self.size ** 2: move = utils.PASS else: - move = (choice % self.size + 1, choice / self.size + 1) + move = self._deflatten(choice) return move, prob def do_move(self, color, vertex): diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index e9457cf..112f130 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -23,6 
+23,11 @@ class GoEnv: x, y = vertex return (x - 1) * self.game.size + (y - 1) + def simulate_deflatten(self, idx): + x = idx // self.game.size + 1 + y = idx % self.game.size + 1 + return (x, y) + def _find_group(self, start): color = self.simulate_board[self.simulate_flatten(start)] # print ("color : ", color) @@ -140,7 +145,7 @@ class GoEnv: if action == self.game.size ** 2: vertex = (0, 0) else: - vertex = (action / self.game.size + 1, action % self.game.size + 1) + vertex = self.simulate_deflatten(action) if state[0, 0, 0, -1] == utils.BLACK: color = utils.BLACK else: @@ -192,7 +197,7 @@ class GoEnv: if action == self.game.size ** 2: vertex = utils.PASS else: - vertex = (action % self.game.size + 1, action / self.game.size + 1) + vertex = self.simulate_deflatten(action) # print(vertex) # print(self.board) self.simulate_board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() From 4a2d8f0003443f6ca60f78370027914a4e4ff9c4 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Tue, 19 Dec 2017 15:39:31 +0800 Subject: [PATCH 19/98] start a random player if checkpoint path is not specified --- AlphaGo/play.py | 32 +++++++++++++++++++------------- AlphaGo/player.py | 4 +++- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/AlphaGo/play.py b/AlphaGo/play.py index fe6c7ce..7367804 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -1,3 +1,4 @@ +import argparse import subprocess import sys import re @@ -11,14 +12,17 @@ if __name__ == '__main__': Note that, this function requires the installation of the Pyro4 library. """ # TODO : we should set the network path in a more configurable way. - black_weight_path = "./checkpoints" - white_weight_path = "./checkpoints_origin" - if (not os.path.exists(black_weight_path)): - print "Can't not find the network weights for black player." - sys.exit() - if (not os.path.exists(white_weight_path)): - print "Can't not find the network weights for white player." 
- sys.exit() + parser = argparse.ArgumentParser() + parser.add_argument("--black_weight_path", type=str, default=None) + parser.add_argument("--white_weight_path", type=str, default=None) + args = parser.parse_args() + + # black_weight_path = "./checkpoints" + # white_weight_path = "./checkpoints_origin" + if args.black_weight_path is not None and (not os.path.exists(args.black_weight_path)): + raise ValueError("Can't not find the network weights for black player.") + if args.white_weight_path is not None and (not os.path.exists(args.white_weight_path)): + raise ValueError("Can't not find the network weights for white player.") # kill the old server kill_old_server = subprocess.Popen(['killall', 'pyro4-ns']) @@ -31,14 +35,16 @@ if __name__ == '__main__': time.sleep(1) # start two different player with different network weights. - agent_v0 = subprocess.Popen(['python', '-u', 'player.py', '--role=black'], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - agent_v1 = subprocess.Popen(['python', '-u', 'player.py', '--role=white', '--checkpoint_path=./checkpoints_origin/'], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + agent_v0 = subprocess.Popen(['python', '-u', 'player.py', '--role=black', '--checkpoint_path=' + str(args.black_weight_path)], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + + agent_v1 = subprocess.Popen(['python', '-u', 'player.py', '--role=white', '--checkpoint_path=' + str(args.white_weight_path)], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + server_list = "" while ("black" not in server_list) or ("white" not in server_list): server_list = subprocess.check_output(['pyro4-nsc', 'list']) - print "Waining for the server start..." + print "Waiting for the server start..." 
time.sleep(1) print server_list print "Start black player at : " + str(agent_v0.pid) diff --git a/AlphaGo/player.py b/AlphaGo/player.py index 8245c38..b468cf3 100644 --- a/AlphaGo/player.py +++ b/AlphaGo/player.py @@ -22,10 +22,12 @@ class Player(object): if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument("--checkpoint_path", type=str, default="./checkpoints/") + parser.add_argument("--checkpoint_path", type=str, default=None) parser.add_argument("--role", type=str, default="unknown") args = parser.parse_args() + if args.checkpoint_path == 'None': + args.checkpoint_path = None game = Game(checkpoint_path=args.checkpoint_path) engine = GTPEngine(game_obj=game, name='tianshou', version=0) From fc8114fe35646673e4b2f4ac00527879878a6ce3 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Tue, 19 Dec 2017 16:51:50 +0800 Subject: [PATCH 20/98] merge flatten and deflatten, rename variable for clarity --- AlphaGo/engine.py | 4 +-- AlphaGo/game.py | 15 ++++++----- AlphaGo/strategy.py | 45 +++++++++++++-------------------- tianshou/core/mcts/evaluator.py | 4 +-- tianshou/core/mcts/mcts.py | 2 +- 5 files changed, 31 insertions(+), 39 deletions(-) diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index 1f9af85..1ee8833 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -167,7 +167,7 @@ class GTPEngine(): move = self._parse_move(args) if move: color, vertex = move - res = self._game.do_move(color, vertex) + res = self._game.play_move(color, vertex) if res: return None, True else: @@ -177,7 +177,7 @@ class GTPEngine(): def cmd_genmove(self, args, **kwargs): color = self._parse_color(args) if color: - move = self._game.gen_move(color) + move = self._game.think_play_move(color) return self._vertex_point2string(move), True else: return 'unknown player', False diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 2a82d8e..d0cb91c 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -77,7 +77,7 @@ class Game: state[0, :, :, 16] = np.zeros([self.size, 
self.size]) return state - def strategy_gen_move(self, latest_boards, color): + def think(self, latest_boards, color): self.simulator.simulate_latest_boards = copy.copy(latest_boards) self.simulator.simulate_board = copy.copy(latest_boards[-1]) nn_input = self.generate_nn_input(self.simulator.simulate_latest_boards, color) @@ -91,17 +91,18 @@ class Game: move = self._deflatten(choice) return move, prob - def do_move(self, color, vertex): + def play_move(self, color, vertex): + # this function can be called directly to play the opponent's move if vertex == utils.PASS: return True res = self.executor.do_move(color, vertex) return res - def gen_move(self, color): - # move = self.strategy.gen_move(color) - # return move - move, self.prob = self.strategy_gen_move(self.latest_boards, color) - self.do_move(color, move) + def think_play_move(self, color): + # although we dont need to return self.prob, however it is needed for neural network training + move, self.prob = self.think(self.latest_boards, color) + # play the move immediately + self.play_move(color, move) return move def status2symbol(self, s): diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index 112f130..af017b1 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -10,7 +10,7 @@ import tensorflow as tf from collections import deque from tianshou.core.mcts.mcts import MCTS -DELTA = [[1, 0], [-1, 0], [0, -1], [0, 1]] +NEIGHBOR_OFFSET = [[1, 0], [-1, 0], [0, -1], [0, 1]] CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] class GoEnv: @@ -19,17 +19,8 @@ class GoEnv: self.simulate_board = [utils.EMPTY] * (self.game.size ** 2) self.simulate_latest_boards = deque(maxlen=8) - def simulate_flatten(self, vertex): - x, y = vertex - return (x - 1) * self.game.size + (y - 1) - - def simulate_deflatten(self, idx): - x = idx // self.game.size + 1 - y = idx % self.game.size + 1 - return (x, y) - def _find_group(self, start): - color = self.simulate_board[self.simulate_flatten(start)] + color = 
self.simulate_board[self.game._flatten(start)] # print ("color : ", color) chain = set() frontier = [start] @@ -40,32 +31,32 @@ class GoEnv: chain.add(current) for n in self._neighbor(current): # print n, self._flatten(n), self.board[self._flatten(n)], - if self.simulate_board[self.simulate_flatten(n)] == color and not n in chain: + if self.simulate_board[self.game._flatten(n)] == color and not n in chain: frontier.append(n) - if self.simulate_board[self.simulate_flatten(n)] == utils.EMPTY: + if self.simulate_board[self.game._flatten(n)] == utils.EMPTY: has_liberty = True return has_liberty, chain def _is_suicide(self, color, vertex): - self.simulate_board[self.simulate_flatten(vertex)] = color # assume that we already take this move + self.simulate_board[self.game._flatten(vertex)] = color # assume that we already take this move suicide = False has_liberty, group = self._find_group(vertex) if not has_liberty: suicide = True # no liberty, suicide for n in self._neighbor(vertex): - if self.simulate_board[self.simulate_flatten(n)] == utils.another_color(color): + if self.simulate_board[self.game._flatten(n)] == utils.another_color(color): opponent_liberty, group = self._find_group(n) if not opponent_liberty: suicide = False # this move is able to take opponent's stone, not suicide - self.simulate_board[self.simulate_flatten(vertex)] = utils.EMPTY # undo this move + self.simulate_board[self.game._flatten(vertex)] = utils.EMPTY # undo this move return suicide def _check_global_isomorphous(self, color, vertex): ##backup _board = copy.copy(self.simulate_board) - self.simulate_board[self.simulate_flatten(vertex)] = color + self.simulate_board[self.game._flatten(vertex)] = color self._process_board(color, vertex) if self.simulate_board in self.game.history: res = True @@ -84,7 +75,7 @@ class GoEnv: def _neighbor(self, vertex): x, y = vertex nei = [] - for d in DELTA: + for d in NEIGHBOR_OFFSET: _x = x + d[0] _y = y + d[1] if self._in_board((_x, _y)): @@ -104,16 +95,16 @@ 
class GoEnv: def _process_board(self, color, vertex): nei = self._neighbor(vertex) for n in nei: - if self.simulate_board[self.simulate_flatten(n)] == utils.another_color(color): + if self.simulate_board[self.game._flatten(n)] == utils.another_color(color): has_liberty, group = self._find_group(n) if not has_liberty: for b in group: - self.simulate_board[self.simulate_flatten(b)] = utils.EMPTY + self.simulate_board[self.game._flatten(b)] = utils.EMPTY def _is_eye(self, color, vertex): nei = self._neighbor(vertex) cor = self._corner(vertex) - ncolor = {color == self.simulate_board[self.simulate_flatten(n)] for n in nei} + ncolor = {color == self.simulate_board[self.game._flatten(n)] for n in nei} if False in ncolor: # print "not all neighbors are in same color with us" return False @@ -122,7 +113,7 @@ class GoEnv: # print "all neighbors are in same group and same color with us" return True else: - opponent_number = [self.simulate_board[self.simulate_flatten(c)] for c in cor].count(-color) + opponent_number = [self.simulate_board[self.game._flatten(c)] for c in cor].count(-color) opponent_propotion = float(opponent_number) / float(len(cor)) if opponent_propotion < 0.5: # print "few opponents, real eye" @@ -145,7 +136,7 @@ class GoEnv: if action == self.game.size ** 2: vertex = (0, 0) else: - vertex = self.simulate_deflatten(action) + vertex = self.game._deflatten(action) if state[0, 0, 0, -1] == utils.BLACK: color = utils.BLACK else: @@ -160,7 +151,7 @@ class GoEnv: return False ### already have stone - if not self.simulate_board[self.simulate_flatten(vertex)] == utils.EMPTY: + if not self.simulate_board[self.game._flatten(vertex)] == utils.EMPTY: # print(np.array(self.board).reshape(9, 9)) # print(vertex) return False @@ -182,14 +173,14 @@ class GoEnv: if vertex == utils.PASS: return True - id_ = self.simulate_flatten(vertex) + id_ = self.game._flatten(vertex) if self.simulate_board[id_] == utils.EMPTY: self.simulate_board[id_] = color return True else: return False 
- def step_forward(self, state, action): + def simulate_step_forward(self, state, action): if state[0, 0, 0, -1] == 1: color = utils.BLACK else: @@ -197,7 +188,7 @@ class GoEnv: if action == self.game.size ** 2: vertex = utils.PASS else: - vertex = self.simulate_deflatten(action) + vertex = self.game._deflatten(action) # print(vertex) # print(self.board) self.simulate_board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() diff --git a/tianshou/core/mcts/evaluator.py b/tianshou/core/mcts/evaluator.py index 9c4ee8e..a1f9456 100644 --- a/tianshou/core/mcts/evaluator.py +++ b/tianshou/core/mcts/evaluator.py @@ -19,10 +19,10 @@ class rollout_policy(evaluator): # TODO: prior for rollout policy total_reward = 0. action = np.random.randint(0, self.action_num) - state, reward = self.env.step_forward(state, action) + state, reward = self.env.simulate_step_forward(state, action) total_reward += reward while state is not None: action = np.random.randint(0, self.action_num) - state, reward = self.env.step_forward(state, action) + state, reward = self.env.simulate_step_forward(state, action) total_reward += reward return np.ones([self.action_num])/self.action_num, total_reward diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 979e994..b58c105 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -116,7 +116,7 @@ class ActionNode(object): self.next_state = tuple2list(self.next_state) def selection(self, simulator): - self.next_state, self.reward = simulator.step_forward(self.parent.state, self.action) + self.next_state, self.reward = simulator.simulate_step_forward(self.parent.state, self.action) self.origin_state = self.next_state self.state_type = type(self.next_state) self.type_conversion_to_tuple() From 1f011a44ef12ca6a8651a6870cc37670a1c96dec Mon Sep 17 00:00:00 2001 From: mcgrady00h <281130306@qq.com> Date: Tue, 19 Dec 2017 17:04:55 +0800 Subject: [PATCH 21/98] add mcts virtual loss version (may have bugs) --- 
tianshou/core/mcts/mcts_test.py | 3 + tianshou/core/mcts/mcts_virtual_loss.py | 263 +++++++++++++++++++ tianshou/core/mcts/mcts_virtual_loss_test.py | 55 ++++ 3 files changed, 321 insertions(+) create mode 100644 tianshou/core/mcts/mcts_virtual_loss.py create mode 100644 tianshou/core/mcts/mcts_virtual_loss_test.py diff --git a/tianshou/core/mcts/mcts_test.py b/tianshou/core/mcts/mcts_test.py index da404ca..49b85be 100644 --- a/tianshou/core/mcts/mcts_test.py +++ b/tianshou/core/mcts/mcts_test.py @@ -12,6 +12,9 @@ class TestEnv: print(self.reward) # print("The best arm is {} with expected reward {}".format(self.best[0],self.best[1])) + def simulate_is_valid(self, state, act): + return True + def step_forward(self, state, action): if action != 0 and action != 1: raise ValueError("Action must be 0 or 1! Your action is {}".format(action)) diff --git a/tianshou/core/mcts/mcts_virtual_loss.py b/tianshou/core/mcts/mcts_virtual_loss.py new file mode 100644 index 0000000..9d20b5a --- /dev/null +++ b/tianshou/core/mcts/mcts_virtual_loss.py @@ -0,0 +1,263 @@ +# -*- coding: utf-8 -*- +# vim:fenc=utf-8 +# $File: mcts_virtual_loss.py +# $Date: Tue Dec 19 17:0444 2017 +0800 +# Original file: mcts.py +# $Author: renyong15 Ā© +# + +""" + This is an implementation of the MCTS with virtual loss. + Due to the limitation of Python design mechanism, we implements the virtual loss in a mini-batch + manner. 
+""" + +import numpy as np +import math +import time + +c_puct = 5 + + +def list2tuple(list): + try: + return tuple(list2tuple(sub) for sub in list) + except TypeError: + return list + + +def tuple2list(tuple): + try: + return list(tuple2list(sub) for sub in tuple) + except TypeError: + return tuple + + +class MCTSNodeVirtualLoss(object): + def __init__(self, parent, action, state, action_num, prior, inverse=False): + self.parent = parent + self.action = action + self.children = {} + self.state = state + self.action_num = action_num + self.prior = np.array(prior).reshape(-1) + self.inverse = inverse + + def selection(self, simulator): + raise NotImplementedError("Need to implement function selection") + + def backpropagation(self, action): + raise NotImplementedError("Need to implement function backpropagation") + + def valid_mask(self, simulator): + pass + +class UCTNodeVirtualLoss(MCTSNodeVirtualLoss): + def __init__(self, parent, action, state, action_num, prior, inverse=False): + super(UCTNodeVirtualLoss, self).__init__(parent, action, state, action_num, prior, inverse) + self.Q = np.zeros([action_num]) + self.W = np.zeros([action_num]) + self.N = np.zeros([action_num]) + self.virtual_loss = np.zeros([action_num]) + #### modified by adding virtual loss + #self.ucb = self.Q + c_puct * self.prior * math.sqrt(np.sum(self.N)) / (self.N + 1) + + self.mask = None + + def selection(self, simulator): + self.valid_mask(simulator) + self.Q = np.zeros([self.action_num]) + N_not_zero = self.N > 0 + self.Q[N_not_zero] = (self.W[N_not_zero] + self.virtual_loss[N_not_zero] + 0.) 
/ self.N[N_not_zero] + self.ucb = self.Q + c_puct * self.prior * math.sqrt(np.sum(self.N + self.virtual_loss)) /\ + (self.N + self.virtual_loss + 1) + action = np.argmax(self.ucb) + self.virtual_loss[action] += 1 + + if action in self.children.keys(): + return self.children[action].selection(simulator) + else: + self.children[action] = ActionNodeVirtualLoss(self, action) + return self.children[action].selection(simulator) + + def remove_virtual_loss(self): + ### if not virtual_loss for every action is zero + if np.sum(self.virtual_loss > 0) > 0: + self.virtual_loss = np.zeros([self.action_num]) + if self.parent: + self.parent.remove_virtual_loss() + + def backpropagation(self, action): + action = int(action) + self.N[action] += 1 + self.W[action] += self.children[action].reward + + ## do not need to compute Q and ucb immediately since it will be modified by virtual loss + #for i in range(self.action_num): + # if self.N[i] != 0: + # self.Q[i] = (self.W[i] + 0.) / self.N[i] + #self.ucb = self.Q + c_puct * self.prior * math.sqrt(np.sum(self.N)) / (self.N + 1.) 
+ + if self.parent is not None: + if self.inverse: + self.parent.backpropagation(-self.children[action].reward) + else: + self.parent.backpropagation(self.children[action].reward) + + def valid_mask(self, simulator): + if self.mask is None: + start_time = time.time() + self.mask = [] + for act in range(self.action_num - 1): + if not simulator.simulate_is_valid(self.state, act): + self.mask.append(act) + self.ucb[act] = -float("Inf") + else: + self.ucb[self.mask] = -float("Inf") + + + +class ActionNodeVirtualLoss(object): + def __init__(self, parent, action): + self.parent = parent + self.action = action + self.children = {} + self.next_state = None + self.origin_state = None + self.state_type = None + self.reward = 0 + + def remove_virtual_loss(self): + self.parent.remove_virtual_loss() + + def type_conversion_to_tuple(self): + if type(self.next_state) is np.ndarray: + self.next_state = self.next_state.tolist() + if type(self.next_state) is list: + self.next_state = list2tuple(self.next_state) + + def type_conversion_to_origin(self): + if self.state_type is np.ndarray: + self.next_state = np.array(self.next_state) + if self.state_type is list: + self.next_state = tuple2list(self.next_state) + + def selection(self, simulator): + self.next_state, self.reward = simulator.step_forward(self.parent.state, self.action) + self.origin_state = self.next_state + self.state_type = type(self.next_state) + self.type_conversion_to_tuple() + if self.next_state is not None: + if self.next_state in self.children.keys(): + return self.children[self.next_state].selection(simulator) + else: + return self.parent, self.action + else: + return self.parent, self.action + + def expansion(self, action, state, action_num, prior, inverse ): + if state is not None: + self.children[state] = UCTNodeVirtualLoss(self, action, state, action_num, prior, inverse) + + + def backpropagation(self, value): + self.reward += value + self.parent.backpropagation(self.action) + + +class 
MCTSVirtualLoss(object): + def __init__(self, simulator, evaluator, root, action_num, batch_size = 1, method = "UCT", inverse = False): + self.simulator = simulator + self.evaluator = evaluator + prior, _ = self.evaluator(root) + self.action_num = action_num + self.batch_size = batch_size + + if method == "": + self.root = root + elif method == "UCT": + self.root = UCTNodeVirtualLoss(None, None, root, action_num, prior, inverse) + elif method == "TS": + self.root = TSNodeVirtualLoss(None, None, root, action_num, prior, inverse=inverse) + else: + raise ValueError("Need a root type") + + self.inverse = inverse + + + def do_search(self, max_step=None, max_time=None): + if max_step is not None: + self.step = 0 + self.max_step = max_step + if max_time is not None: + self.start_time = time.time() + self.max_time = max_time + if max_step is None and max_time is None: + raise ValueError("Need a stop criteria!") + + self.select_time = [] + self.evaluate_time = [] + self.bp_time = [] + while (max_step is not None and self.step < self.max_step or max_step is None) \ + and (max_time is not None and time.time() - self.start_time < self.max_time or max_time is None): + self.expand() + if max_step is not None: + self.step += 1 + + def expand(self): + ## minibatch with virtual loss + nodes = [] + new_actions = [] + next_states = [] + + for i in range(self.batch_size): + node, new_action = self.root.selection(self.simulator) + nodes.append(node) + new_actions.append(new_action) + next_states.append(node.children[new_action].next_state) + + for node in nodes: + node.remove_virtual_loss() + + assert(np.sum(self.root.virtual_loss > 0) == 0) + #### compute value in batch manner unless the evaluator do not support it + try: + priors, values = self.evaluator(next_states) + except: + priors = [] + values = [] + for i in range(self.batch_size): + if next_states[i] is not None: + prior, value = self.evaluator(next_states[i]) + priors.append(prior) + values.append(value) + else: + 
priors.append(0.) + values.append(0.) + + #### for now next_state == origin_state + #### may have problem here. What if we reached the same next_state with same parent and action pair + for i in range(self.batch_size): + nodes[i].children[new_actions[i]].expansion(new_actions[i], + next_states[i], + self.action_num, + priors[i], + nodes[i].inverse) + + for i in range(self.batch_size): + nodes[i].children[new_actions[i]].backpropagation(values[i] + 0.) + + +##### TODO +class TSNodeVirtualLoss(MCTSNodeVirtualLoss): + def __init__(self, parent, action, state, action_num, prior, method="Gaussian", inverse=False): + super(TSNodeVirtualLoss, self).__init__(parent, action, state, action_num, prior, inverse) + if method == "Beta": + self.alpha = np.ones([action_num]) + self.beta = np.ones([action_num]) + if method == "Gaussian": + self.mu = np.zeros([action_num]) + self.sigma = np.zeros([action_num]) + +if __name__ == "__main__": + mcts_virtual_loss = MCTSNodeVirtualLoss(None, None, 10, 1, 'UCT') diff --git a/tianshou/core/mcts/mcts_virtual_loss_test.py b/tianshou/core/mcts/mcts_virtual_loss_test.py new file mode 100644 index 0000000..d2d6c81 --- /dev/null +++ b/tianshou/core/mcts/mcts_virtual_loss_test.py @@ -0,0 +1,55 @@ +# -*- coding: utf-8 -*- +# vim:fenc=utf-8 +# $File: mcts_virtual_loss_test.py +# $Date: Tue Dec 19 16:5459 2017 +0800 +# Original file: mcts_test.py +# $Author: renyong15 Ā© +# + + + +import numpy as np +from mcts_virtual_loss import MCTSVirtualLoss +from evaluator import rollout_policy + + +class TestEnv: + def __init__(self, max_step=5): + self.max_step = max_step + self.reward = {i: np.random.uniform() for i in range(2 ** max_step)} + # self.reward = {0:1, 1:0} + self.best = max(self.reward.items(), key=lambda x: x[1]) + print(self.reward) + # print("The best arm is {} with expected reward {}".format(self.best[0],self.best[1])) + + def simulate_is_valid(self, state, act): + return True + + def step_forward(self, state, action): + if action != 0 and 
action != 1: + raise ValueError("Action must be 0 or 1! Your action is {}".format(action)) + if state[0] >= 2 ** state[1] or state[1] > self.max_step: + raise ValueError("Invalid State! Your state is {}".format(state)) + # print("Operate action {} at state {}, timestep {}".format(action, state[0], state[1])) + if state[1] == self.max_step: + new_state = None + reward = 0 + else: + num = state[0] + 2 ** state[1] * action + step = state[1] + 1 + new_state = [num, step] + if step == self.max_step: + reward = int(np.random.uniform() < self.reward[num]) + else: + reward = 0. + return new_state, reward + + +if __name__ == "__main__": + env = TestEnv(2) + rollout = rollout_policy(env, 2) + evaluator = lambda state: rollout(state) + mcts_virtual_loss = MCTSVirtualLoss(env, evaluator, [0, 0], 2, batch_size = 10) + for i in range(10): + mcts_virtual_loss.do_search(max_step = 100) + From 232204d7970ef261c8f99394f2cc631a674a17a0 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Tue, 19 Dec 2017 22:57:38 +0800 Subject: [PATCH 22/98] fix the copy bug in check_global_isomorphous; refactor code to eliminate side effect --- AlphaGo/go.py | 36 ++++++------- AlphaGo/strategy.py | 104 +++++++++++++++++-------------------- tianshou/core/mcts/mcts.py | 3 +- 3 files changed, 67 insertions(+), 76 deletions(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 7b1d3e7..8e3518d 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -72,18 +72,14 @@ class Go: self.game.board[self.game._flatten(vertex)] = utils.EMPTY return True - def _check_global_isomorphous(self, color, vertex): - ##backup - _board = copy.copy(self.game.board) - self.game.board[self.game._flatten(vertex)] = color - self._process_board(color, vertex) - if self.game.board in self.game.history: - res = True - else: - res = False - - self.game.board = _board - return res + def _check_global_isomorphous(self, history_boards, current_board, color, vertex): + repeat = False + next_board = copy.copy(current_board) + 
next_board[self.game._flatten(vertex)] = color + self._process_board(next_board, color, vertex) + if next_board in history_boards: + repeat = True + return repeat def _in_board(self, vertex): x, y = vertex @@ -101,38 +97,38 @@ class Go: nei.append((_x, _y)) return nei - def _process_board(self, color, vertex): + def _process_board(self, current_board, color, vertex): nei = self._neighbor(vertex) for n in nei: - if self.game.board[self.game._flatten(n)] == utils.another_color(color): + if current_board[self.game._flatten(n)] == utils.another_color(color): can_kill, block = self._find_block(n) if can_kill: for b in block: - self.game.board[self.game._flatten(b)] = utils.EMPTY + current_board[self.game._flatten(b)] = utils.EMPTY - def is_valid(self, color, vertex): + def is_valid(self, history_boards, current_board, color, vertex): ### in board if not self._in_board(vertex): return False ### already have stone - if not self.game.board[self.game._flatten(vertex)] == utils.EMPTY: + if not current_board[self.game._flatten(vertex)] == utils.EMPTY: return False ### check if it is qi if not self._is_qi(color, vertex): return False - if self._check_global_isomorphous(color, vertex): + if self._check_global_isomorphous(history_boards, current_board, color, vertex): return False return True def do_move(self, color, vertex): - if not self.is_valid(color, vertex): + if not self.is_valid(self.game.history, self.game.board, color, vertex): return False self.game.board[self.game._flatten(vertex)] = color - self._process_board(color, vertex) + self._process_board(self.game.board, color, vertex) self.game.history.append(copy.copy(self.game.board)) self.game.latest_boards.append(copy.copy(self.game.board)) return True diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index af017b1..07555e9 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -19,52 +19,47 @@ class GoEnv: self.simulate_board = [utils.EMPTY] * (self.game.size ** 2) self.simulate_latest_boards = 
deque(maxlen=8) - def _find_group(self, start): - color = self.simulate_board[self.game._flatten(start)] + def _find_group(self, current_board, vertex): + color = current_board[self.game._flatten(vertex)] # print ("color : ", color) chain = set() - frontier = [start] + frontier = [vertex] has_liberty = False while frontier: current = frontier.pop() # print ("current : ", current) chain.add(current) for n in self._neighbor(current): - # print n, self._flatten(n), self.board[self._flatten(n)], - if self.simulate_board[self.game._flatten(n)] == color and not n in chain: + if current_board[self.game._flatten(n)] == color and not n in chain: frontier.append(n) - if self.simulate_board[self.game._flatten(n)] == utils.EMPTY: + if current_board[self.game._flatten(n)] == utils.EMPTY: has_liberty = True return has_liberty, chain - def _is_suicide(self, color, vertex): - self.simulate_board[self.game._flatten(vertex)] = color # assume that we already take this move + def _is_suicide(self, current_board, color, vertex): + current_board[self.game._flatten(vertex)] = color # assume that we already take this move suicide = False - has_liberty, group = self._find_group(vertex) + has_liberty, group = self._find_group(current_board, vertex) if not has_liberty: suicide = True # no liberty, suicide for n in self._neighbor(vertex): - if self.simulate_board[self.game._flatten(n)] == utils.another_color(color): - opponent_liberty, group = self._find_group(n) + if current_board[self.game._flatten(n)] == utils.another_color(color): + opponent_liberty, group = self._find_group(current_board, n) if not opponent_liberty: suicide = False # this move is able to take opponent's stone, not suicide - self.simulate_board[self.game._flatten(vertex)] = utils.EMPTY # undo this move + current_board[self.game._flatten(vertex)] = utils.EMPTY # undo this move return suicide - def _check_global_isomorphous(self, color, vertex): - ##backup - _board = copy.copy(self.simulate_board) - 
self.simulate_board[self.game._flatten(vertex)] = color - self._process_board(color, vertex) - if self.simulate_board in self.game.history: - res = True - else: - res = False - - self.simulate_board = _board - return res + def _check_global_isomorphous(self, history_boards, current_board, color, vertex): + repeat = False + next_board = copy.copy(current_board) + next_board[self.game._flatten(vertex)] = color + self._process_board(next_board, color, vertex) + if next_board in history_boards: + repeat = True + return repeat def _in_board(self, vertex): x, y = vertex @@ -92,28 +87,28 @@ class GoEnv: corner.append((_x, _y)) return corner - def _process_board(self, color, vertex): + def _process_board(self, current_board, color, vertex): nei = self._neighbor(vertex) for n in nei: - if self.simulate_board[self.game._flatten(n)] == utils.another_color(color): - has_liberty, group = self._find_group(n) + if current_board[self.game._flatten(n)] == utils.another_color(color): + has_liberty, group = self._find_group(current_board, n) if not has_liberty: for b in group: - self.simulate_board[self.game._flatten(b)] = utils.EMPTY + current_board[self.game._flatten(b)] = utils.EMPTY - def _is_eye(self, color, vertex): + def _is_eye(self, current_board, color, vertex): nei = self._neighbor(vertex) cor = self._corner(vertex) - ncolor = {color == self.simulate_board[self.game._flatten(n)] for n in nei} + ncolor = {color == current_board[self.game._flatten(n)] for n in nei} if False in ncolor: # print "not all neighbors are in same color with us" return False - _, group = self._find_group(nei[0]) + _, group = self._find_group(current_board, nei[0]) if set(nei) < group: # print "all neighbors are in same group and same color with us" return True else: - opponent_number = [self.simulate_board[self.game._flatten(c)] for c in cor].count(-color) + opponent_number = [current_board[self.game._flatten(c)] for c in cor].count(-color) opponent_propotion = float(opponent_number) / 
float(len(cor)) if opponent_propotion < 0.5: # print "few opponents, real eye" @@ -122,49 +117,54 @@ class GoEnv: # print "many opponents, fake eye" return False - def knowledge_prunning(self, color, vertex): + def knowledge_prunning(self, current_board, color, vertex): ### check if it is an eye of yourself ### assumptions : notice that this judgement requires that the state is an endgame - if self._is_eye(color, vertex): + if self._is_eye(current_board, color, vertex): return False return True - def simulate_is_valid(self, state, action): - # State is the play board, the shape is [1, self.game.size, self.game.size, 17]. - # Action is an index + def sa2cv(self, state, action): + # State is the play board, the shape is [1, self.game.size, self.game.size, 17], action is an index. # We need to transfer the (state, action) pair into (color, vertex) pair to simulate the move - if action == self.game.size ** 2: - vertex = (0, 0) - else: - vertex = self.game._deflatten(action) if state[0, 0, 0, -1] == utils.BLACK: color = utils.BLACK else: color = utils.WHITE + if action == self.game.size ** 2: + vertex = (0, 0) + else: + vertex = self.game._deflatten(action) + return color, vertex + + def simulate_is_valid(self, history_boards, current_board, state, action): + # initialize simulate_latest_boards and simulate_board from state self.simulate_latest_boards.clear() for i in range(8): self.simulate_latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) self.simulate_board = copy.copy(self.simulate_latest_boards[-1]) + color, vertex = self.sa2cv(state, action) + ### in board if not self._in_board(vertex): return False ### already have stone - if not self.simulate_board[self.game._flatten(vertex)] == utils.EMPTY: + if not current_board[self.game._flatten(vertex)] == utils.EMPTY: # print(np.array(self.board).reshape(9, 9)) # print(vertex) return False ### check if it is suicide - if self._is_suicide(color, vertex): + if 
self._is_suicide(current_board, color, vertex): return False ### forbid global isomorphous - if self._check_global_isomorphous(color, vertex): + if self._check_global_isomorphous(history_boards, current_board, color, vertex): return False - if not self.knowledge_prunning(color, vertex): + if not self.knowledge_prunning(current_board, color, vertex): return False return True @@ -181,17 +181,11 @@ class GoEnv: return False def simulate_step_forward(self, state, action): - if state[0, 0, 0, -1] == 1: - color = utils.BLACK - else: - color = utils.WHITE - if action == self.game.size ** 2: - vertex = utils.PASS - else: - vertex = self.game._deflatten(action) - # print(vertex) - # print(self.board) + # initialize the simulate_board from state self.simulate_board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() + + color, vertex = self.sa2cv(state, action) + self.simulate_do_move(color, vertex) new_state = np.concatenate( [state[:, :, :, 1:8], (np.array(self.simulate_board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index b58c105..12fc85d 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -75,7 +75,8 @@ class UCTNode(MCTSNode): start_time = time.time() self.mask = [] for act in range(self.action_num - 1): - if not simulator.simulate_is_valid(self.state, act): + if not simulator.simulate_is_valid( + simulator.simulate_latest_boards, simulator.simulate_board, self.state, act): self.mask.append(act) self.ucb[act] = -float("Inf") else: From 2a9d949510f3e2032e868fa64bb0d6efc7624fc3 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Wed, 20 Dec 2017 00:16:24 +0800 Subject: [PATCH 23/98] rearrange the sequence of functions of Go and GoEnv before merging --- AlphaGo/go.py | 125 ++++++++++++++++++++------------------------ AlphaGo/strategy.py | 70 ++++++++++++------------- 2 files changed, 91 insertions(+), 104 deletions(-) diff --git a/AlphaGo/go.py 
b/AlphaGo/go.py index 8e3518d..37d8339 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -17,70 +17,6 @@ class Go: def __init__(self, **kwargs): self.game = kwargs['game'] - def _bfs(self, vertex, color, block, status): - block.append(vertex) - status[self.game._flatten(vertex)] = True - nei = self._neighbor(vertex) - for n in nei: - if not status[self.game._flatten(n)]: - if self.game.board[self.game._flatten(n)] == color: - self._bfs(n, color, block, status) - - def _find_block(self, vertex): - block = [] - status = [False] * (self.game.size ** 2) - color = self.game.board[self.game._flatten(vertex)] - self._bfs(vertex, color, block, status) - - for b in block: - for n in self._neighbor(b): - if self.game.board[self.game._flatten(n)] == utils.EMPTY: - return False, block - return True, block - - def _find_boarder(self, vertex): - block = [] - status = [False] * (self.game.size ** 2) - self._bfs(vertex, utils.EMPTY, block, status) - border = [] - for b in block: - for n in self._neighbor(b): - if not (n in block): - border.append(n) - return border - - def _is_qi(self, color, vertex): - nei = self._neighbor(vertex) - for n in nei: - if self.game.board[self.game._flatten(n)] == utils.EMPTY: - return True - - self.game.board[self.game._flatten(vertex)] = color - for n in nei: - if self.game.board[self.game._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n) - if can_kill: - self.game.board[self.game._flatten(vertex)] = utils.EMPTY - return True - - ### can not suicide - can_kill, block = self._find_block(vertex) - if can_kill: - self.game.board[self.game._flatten(vertex)] = utils.EMPTY - return False - - self.game.board[self.game._flatten(vertex)] = utils.EMPTY - return True - - def _check_global_isomorphous(self, history_boards, current_board, color, vertex): - repeat = False - next_board = copy.copy(current_board) - next_board[self.game._flatten(vertex)] = color - self._process_board(next_board, color, vertex) - if next_board in 
history_boards: - repeat = True - return repeat - def _in_board(self, vertex): x, y = vertex if x < 1 or x > self.game.size: return False @@ -97,15 +33,57 @@ class Go: nei.append((_x, _y)) return nei + def _find_group(self, current_board, vertex): + color = current_board[self.game._flatten(vertex)] + # print ("color : ", color) + chain = set() + frontier = [vertex] + has_liberty = False + while frontier: + current = frontier.pop() + # print ("current : ", current) + chain.add(current) + for n in self._neighbor(current): + if current_board[self.game._flatten(n)] == color and not n in chain: + frontier.append(n) + if current_board[self.game._flatten(n)] == utils.EMPTY: + has_liberty = True + return has_liberty, chain + + def _is_suicide(self, current_board, color, vertex): + current_board[self.game._flatten(vertex)] = color # assume that we already take this move + suicide = False + + has_liberty, group = self._find_group(current_board, vertex) + if not has_liberty: + suicide = True # no liberty, suicide + for n in self._neighbor(vertex): + if current_board[self.game._flatten(n)] == utils.another_color(color): + opponent_liberty, group = self._find_group(current_board, n) + if not opponent_liberty: + suicide = False # this move is able to take opponent's stone, not suicide + + current_board[self.game._flatten(vertex)] = utils.EMPTY # undo this move + return suicide + def _process_board(self, current_board, color, vertex): nei = self._neighbor(vertex) for n in nei: if current_board[self.game._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n) - if can_kill: - for b in block: + has_liberty, group = self._find_group(current_board, n) + if not has_liberty: + for b in group: current_board[self.game._flatten(b)] = utils.EMPTY + def _check_global_isomorphous(self, history_boards, current_board, color, vertex): + repeat = False + next_board = copy.copy(current_board) + next_board[self.game._flatten(vertex)] = color + 
self._process_board(next_board, color, vertex) + if next_board in history_boards: + repeat = True + return repeat + def is_valid(self, history_boards, current_board, color, vertex): ### in board if not self._in_board(vertex): @@ -115,8 +93,8 @@ class Go: if not current_board[self.game._flatten(vertex)] == utils.EMPTY: return False - ### check if it is qi - if not self._is_qi(color, vertex): + ### check if it is suicide + if self._is_suicide(current_board, color, vertex): return False if self._check_global_isomorphous(history_boards, current_board, color, vertex): @@ -137,6 +115,15 @@ class Go: idx = [i for i,x in enumerate(self.game.board) if x == utils.EMPTY ][0] return self.game._deflatten(idx) + def _find_boarder(self, vertex): + _, group = self._find_group(self.game.board, vertex) + border = [] + for b in group: + for n in self._neighbor(b): + if not (n in group): + border.append(n) + return border + def _add_nearby_stones(self, neighbor_vertex_set, start_vertex_x, start_vertex_y, x_diff, y_diff, num_step): ''' add the nearby stones around the input vertex diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index 07555e9..9ebd421 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -19,6 +19,32 @@ class GoEnv: self.simulate_board = [utils.EMPTY] * (self.game.size ** 2) self.simulate_latest_boards = deque(maxlen=8) + def _in_board(self, vertex): + x, y = vertex + if x < 1 or x > self.game.size: return False + if y < 1 or y > self.game.size: return False + return True + + def _neighbor(self, vertex): + x, y = vertex + nei = [] + for d in NEIGHBOR_OFFSET: + _x = x + d[0] + _y = y + d[1] + if self._in_board((_x, _y)): + nei.append((_x, _y)) + return nei + + def _corner(self, vertex): + x, y = vertex + corner = [] + for d in CORNER_OFFSET: + _x = x + d[0] + _y = y + d[1] + if self._in_board((_x, _y)): + corner.append((_x, _y)) + return corner + def _find_group(self, current_board, vertex): color = current_board[self.game._flatten(vertex)] # print ("color 
: ", color) @@ -52,41 +78,6 @@ class GoEnv: current_board[self.game._flatten(vertex)] = utils.EMPTY # undo this move return suicide - def _check_global_isomorphous(self, history_boards, current_board, color, vertex): - repeat = False - next_board = copy.copy(current_board) - next_board[self.game._flatten(vertex)] = color - self._process_board(next_board, color, vertex) - if next_board in history_boards: - repeat = True - return repeat - - def _in_board(self, vertex): - x, y = vertex - if x < 1 or x > self.game.size: return False - if y < 1 or y > self.game.size: return False - return True - - def _neighbor(self, vertex): - x, y = vertex - nei = [] - for d in NEIGHBOR_OFFSET: - _x = x + d[0] - _y = y + d[1] - if self._in_board((_x, _y)): - nei.append((_x, _y)) - return nei - - def _corner(self, vertex): - x, y = vertex - corner = [] - for d in CORNER_OFFSET: - _x = x + d[0] - _y = y + d[1] - if self._in_board((_x, _y)): - corner.append((_x, _y)) - return corner - def _process_board(self, current_board, color, vertex): nei = self._neighbor(vertex) for n in nei: @@ -96,6 +87,15 @@ class GoEnv: for b in group: current_board[self.game._flatten(b)] = utils.EMPTY + def _check_global_isomorphous(self, history_boards, current_board, color, vertex): + repeat = False + next_board = copy.copy(current_board) + next_board[self.game._flatten(vertex)] = color + self._process_board(next_board, color, vertex) + if next_board in history_boards: + repeat = True + return repeat + def _is_eye(self, current_board, color, vertex): nei = self._neighbor(vertex) cor = self._corner(vertex) From d1af137686355b347f7c5b6b7fd117969b9a04cc Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Wed, 20 Dec 2017 00:43:31 +0800 Subject: [PATCH 24/98] final version before merge Go and GoEnv --- AlphaGo/engine.py | 2 +- AlphaGo/game.py | 3 ++- AlphaGo/go.py | 8 ++++---- AlphaGo/self-play.py | 2 +- AlphaGo/strategy.py | 38 +++++++++++++++++++++----------------- 5 files changed, 29 insertions(+), 24 deletions(-) 
diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index 1ee8833..d11635a 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -183,7 +183,7 @@ class GTPEngine(): return 'unknown player', False def cmd_get_score(self, args, **kwargs): - return self._game.executor.get_score(), None + return self._game.executor.executor_get_score(), None def cmd_show_board(self, args, **kwargs): return self._game.board, True diff --git a/AlphaGo/game.py b/AlphaGo/game.py index d0cb91c..af4ef57 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -78,6 +78,7 @@ class Game: return state def think(self, latest_boards, color): + # TODO : using copy is right, or should we change to deepcopy? self.simulator.simulate_latest_boards = copy.copy(latest_boards) self.simulator.simulate_board = copy.copy(latest_boards[-1]) nn_input = self.generate_nn_input(self.simulator.simulate_latest_boards, color) @@ -95,7 +96,7 @@ class Game: # this function can be called directly to play the opponent's move if vertex == utils.PASS: return True - res = self.executor.do_move(color, vertex) + res = self.executor.executor_do_move(color, vertex) return res def think_play_move(self, color): diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 37d8339..108c9bd 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -84,7 +84,7 @@ class Go: repeat = True return repeat - def is_valid(self, history_boards, current_board, color, vertex): + def _is_valid(self, history_boards, current_board, color, vertex): ### in board if not self._in_board(vertex): return False @@ -102,8 +102,8 @@ class Go: return True - def do_move(self, color, vertex): - if not self.is_valid(self.game.history, self.game.board, color, vertex): + def executor_do_move(self, color, vertex): + if not self._is_valid(self.game.history, self.game.board, color, vertex): return False self.game.board[self.game._flatten(vertex)] = color self._process_board(self.game.board, color, vertex) @@ -164,7 +164,7 @@ class Go: elif color_estimate < 0: return utils.WHITE - 
def get_score(self, is_unknown_estimation = False): + def executor_get_score(self, is_unknown_estimation = False): ''' is_unknown_estimation: whether use nearby stone to predict the unknown return score from BLACK perspective. diff --git a/AlphaGo/self-play.py b/AlphaGo/self-play.py index 98ccf84..296112b 100644 --- a/AlphaGo/self-play.py +++ b/AlphaGo/self-play.py @@ -79,7 +79,7 @@ while True: prob.append(np.array(game.prob).reshape(-1, game.size ** 2 + 1)) print("Finished") print("\n") - score = game.executor.get_score(True) + score = game.executor.executor_get_score(True) if score > 0: winner = utils.BLACK else: diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index 9ebd421..1e5fd02 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -117,14 +117,14 @@ class GoEnv: # print "many opponents, fake eye" return False - def knowledge_prunning(self, current_board, color, vertex): + def _knowledge_prunning(self, current_board, color, vertex): ### check if it is an eye of yourself ### assumptions : notice that this judgement requires that the state is an endgame if self._is_eye(current_board, color, vertex): return False return True - def sa2cv(self, state, action): + def _sa2cv(self, state, action): # State is the play board, the shape is [1, self.game.size, self.game.size, 17], action is an index. 
# We need to transfer the (state, action) pair into (color, vertex) pair to simulate the move if state[0, 0, 0, -1] == utils.BLACK: @@ -137,23 +137,13 @@ class GoEnv: vertex = self.game._deflatten(action) return color, vertex - def simulate_is_valid(self, history_boards, current_board, state, action): - # initialize simulate_latest_boards and simulate_board from state - self.simulate_latest_boards.clear() - for i in range(8): - self.simulate_latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) - self.simulate_board = copy.copy(self.simulate_latest_boards[-1]) - - color, vertex = self.sa2cv(state, action) - + def _is_valid(self, history_boards, current_board, color, vertex): ### in board if not self._in_board(vertex): return False ### already have stone if not current_board[self.game._flatten(vertex)] == utils.EMPTY: - # print(np.array(self.board).reshape(9, 9)) - # print(vertex) return False ### check if it is suicide @@ -164,12 +154,26 @@ class GoEnv: if self._check_global_isomorphous(history_boards, current_board, color, vertex): return False - if not self.knowledge_prunning(current_board, color, vertex): + return True + + def simulate_is_valid(self, history_boards, current_board, state, action): + # initialize simulate_latest_boards and simulate_board from state + self.simulate_latest_boards.clear() + for i in range(8): + self.simulate_latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) + self.simulate_board = copy.copy(self.simulate_latest_boards[-1]) + + color, vertex = self._sa2cv(state, action) + + if not self._is_valid(history_boards, current_board, color, vertex): + return False + + if not self._knowledge_prunning(current_board, color, vertex): return False return True - def simulate_do_move(self, color, vertex): + def _do_move(self, color, vertex): if vertex == utils.PASS: return True @@ -184,9 +188,9 @@ class GoEnv: # initialize the simulate_board from state self.simulate_board = 
(state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() - color, vertex = self.sa2cv(state, action) + color, vertex = self._sa2cv(state, action) - self.simulate_do_move(color, vertex) + self._do_move(color, vertex) new_state = np.concatenate( [state[:, :, :, 1:8], (np.array(self.simulate_board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), state[:, :, :, 9:16], (np.array(self.simulate_board) == utils.WHITE).reshape(1, self.game.size, self.game.size, 1), From c2b46c44e7dce0ef4c73e230aaed07c91af32e0c Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Wed, 20 Dec 2017 01:14:05 +0800 Subject: [PATCH 25/98] merge Go and GoEnv finallygit status! --- AlphaGo/engine.py | 2 +- AlphaGo/game.py | 23 ++--- AlphaGo/go.py | 99 ++++++++++++++++++++- AlphaGo/self-play.py | 2 +- AlphaGo/strategy.py | 199 ------------------------------------------- 5 files changed, 108 insertions(+), 217 deletions(-) delete mode 100644 AlphaGo/strategy.py diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index d11635a..9948176 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -183,7 +183,7 @@ class GTPEngine(): return 'unknown player', False def cmd_get_score(self, args, **kwargs): - return self._game.executor.executor_get_score(), None + return self._game.game_engine.executor_get_score(), None def cmd_show_board(self, args, **kwargs): return self._game.board, True diff --git a/AlphaGo/game.py b/AlphaGo/game.py index af4ef57..aee8d3a 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -9,16 +9,13 @@ import utils import copy import tensorflow as tf import numpy as np -import sys +import sys, os import go import network_small -import strategy from collections import deque +sys.path.append(os.path.join(os.path.dirname(__file__), os.path.pardir)) from tianshou.core.mcts.mcts import MCTS -import Network -#from strategy import strategy - class Game: ''' Load the real game and trained weights. 
@@ -34,15 +31,11 @@ class Game: self.latest_boards = deque(maxlen=8) for _ in range(8): self.latest_boards.append(self.board) - - self.executor = go.Go(game=self) - #self.strategy = strategy(checkpoint_path) - - self.simulator = strategy.GoEnv(game=self) self.net = network_small.Network() self.sess = self.net.forward(checkpoint_path) self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], feed_dict={self.net.x: state, self.net.is_training: False}) + self.game_engine = go.Go(game=self) def _flatten(self, vertex): x, y = vertex @@ -79,10 +72,10 @@ class Game: def think(self, latest_boards, color): # TODO : using copy is right, or should we change to deepcopy? - self.simulator.simulate_latest_boards = copy.copy(latest_boards) - self.simulator.simulate_board = copy.copy(latest_boards[-1]) - nn_input = self.generate_nn_input(self.simulator.simulate_latest_boards, color) - mcts = MCTS(self.simulator, self.evaluator, nn_input, self.size ** 2 + 1, inverse=True, max_step=1) + self.game_engine.simulate_latest_boards = copy.copy(latest_boards) + self.game_engine.simulate_board = copy.copy(latest_boards[-1]) + nn_input = self.generate_nn_input(self.game_engine.simulate_latest_boards, color) + mcts = MCTS(self.game_engine, self.evaluator, nn_input, self.size ** 2 + 1, inverse=True, max_step=1) temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) choice = np.random.choice(self.size ** 2 + 1, 1, p=prob).tolist()[0] @@ -96,7 +89,7 @@ class Game: # this function can be called directly to play the opponent's move if vertex == utils.PASS: return True - res = self.executor.executor_do_move(color, vertex) + res = self.game_engine.executor_do_move(color, vertex) return res def think_play_move(self, color): diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 108c9bd..10ce7e1 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -1,7 +1,7 @@ from __future__ import print_function import utils import copy -import sys +import numpy as np from collections 
import deque ''' @@ -12,10 +12,13 @@ Settings of the Go game. ''' NEIGHBOR_OFFSET = [[1, 0], [-1, 0], [0, -1], [0, 1]] +CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] class Go: def __init__(self, **kwargs): self.game = kwargs['game'] + self.simulate_board = [utils.EMPTY] * (self.game.size ** 2) + self.simulate_latest_boards = deque(maxlen=8) def _in_board(self, vertex): x, y = vertex @@ -33,6 +36,16 @@ class Go: nei.append((_x, _y)) return nei + def _corner(self, vertex): + x, y = vertex + corner = [] + for d in CORNER_OFFSET: + _x = x + d[0] + _y = y + d[1] + if self._in_board((_x, _y)): + corner.append((_x, _y)) + return corner + def _find_group(self, current_board, vertex): color = current_board[self.game._flatten(vertex)] # print ("color : ", color) @@ -84,6 +97,47 @@ class Go: repeat = True return repeat + def _is_eye(self, current_board, color, vertex): + nei = self._neighbor(vertex) + cor = self._corner(vertex) + ncolor = {color == current_board[self.game._flatten(n)] for n in nei} + if False in ncolor: + # print "not all neighbors are in same color with us" + return False + _, group = self._find_group(current_board, nei[0]) + if set(nei) < group: + # print "all neighbors are in same group and same color with us" + return True + else: + opponent_number = [current_board[self.game._flatten(c)] for c in cor].count(-color) + opponent_propotion = float(opponent_number) / float(len(cor)) + if opponent_propotion < 0.5: + # print "few opponents, real eye" + return True + else: + # print "many opponents, fake eye" + return False + + def _knowledge_prunning(self, current_board, color, vertex): + ### check if it is an eye of yourself + ### assumptions : notice that this judgement requires that the state is an endgame + if self._is_eye(current_board, color, vertex): + return False + return True + + def _sa2cv(self, state, action): + # State is the play board, the shape is [1, self.game.size, self.game.size, 17], action is an index. 
+ # We need to transfer the (state, action) pair into (color, vertex) pair to simulate the move + if state[0, 0, 0, -1] == utils.BLACK: + color = utils.BLACK + else: + color = utils.WHITE + if action == self.game.size ** 2: + vertex = (0, 0) + else: + vertex = self.game._deflatten(action) + return color, vertex + def _is_valid(self, history_boards, current_board, color, vertex): ### in board if not self._in_board(vertex): @@ -97,11 +151,54 @@ class Go: if self._is_suicide(current_board, color, vertex): return False + ### forbid global isomorphous if self._check_global_isomorphous(history_boards, current_board, color, vertex): return False return True + def simulate_is_valid(self, history_boards, current_board, state, action): + # initialize simulate_latest_boards and simulate_board from state + self.simulate_latest_boards.clear() + for i in range(8): + self.simulate_latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) + self.simulate_board = copy.copy(self.simulate_latest_boards[-1]) + + color, vertex = self._sa2cv(state, action) + + if not self._is_valid(history_boards, current_board, color, vertex): + return False + + if not self._knowledge_prunning(current_board, color, vertex): + return False + + return True + + def _do_move(self, color, vertex): + if vertex == utils.PASS: + return True + + id_ = self.game._flatten(vertex) + if self.simulate_board[id_] == utils.EMPTY: + self.simulate_board[id_] = color + return True + else: + return False + + def simulate_step_forward(self, state, action): + # initialize the simulate_board from state + self.simulate_board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() + + color, vertex = self._sa2cv(state, action) + + self._do_move(color, vertex) + new_state = np.concatenate( + [state[:, :, :, 1:8], (np.array(self.simulate_board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), + state[:, :, :, 9:16], (np.array(self.simulate_board) == utils.WHITE).reshape(1, 
self.game.size, self.game.size, 1), + np.array(1 - state[:, :, :, -1]).reshape(1, self.game.size, self.game.size, 1)], + axis=3) + return new_state, 0 + def executor_do_move(self, color, vertex): if not self._is_valid(self.game.history, self.game.board, color, vertex): return False diff --git a/AlphaGo/self-play.py b/AlphaGo/self-play.py index 296112b..63b7e97 100644 --- a/AlphaGo/self-play.py +++ b/AlphaGo/self-play.py @@ -79,7 +79,7 @@ while True: prob.append(np.array(game.prob).reshape(-1, game.size ** 2 + 1)) print("Finished") print("\n") - score = game.executor.executor_get_score(True) + score = game.game_engine.executor_get_score(True) if score > 0: winner = utils.BLACK else: diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py deleted file mode 100644 index 1e5fd02..0000000 --- a/AlphaGo/strategy.py +++ /dev/null @@ -1,199 +0,0 @@ -import os, sys - -sys.path.append(os.path.join(os.path.dirname(__file__), os.path.pardir)) -import numpy as np -import utils -import time -import copy -import network_small -import tensorflow as tf -from collections import deque -from tianshou.core.mcts.mcts import MCTS - -NEIGHBOR_OFFSET = [[1, 0], [-1, 0], [0, -1], [0, 1]] -CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] - -class GoEnv: - def __init__(self, **kwargs): - self.game = kwargs['game'] - self.simulate_board = [utils.EMPTY] * (self.game.size ** 2) - self.simulate_latest_boards = deque(maxlen=8) - - def _in_board(self, vertex): - x, y = vertex - if x < 1 or x > self.game.size: return False - if y < 1 or y > self.game.size: return False - return True - - def _neighbor(self, vertex): - x, y = vertex - nei = [] - for d in NEIGHBOR_OFFSET: - _x = x + d[0] - _y = y + d[1] - if self._in_board((_x, _y)): - nei.append((_x, _y)) - return nei - - def _corner(self, vertex): - x, y = vertex - corner = [] - for d in CORNER_OFFSET: - _x = x + d[0] - _y = y + d[1] - if self._in_board((_x, _y)): - corner.append((_x, _y)) - return corner - - def _find_group(self, current_board, 
vertex): - color = current_board[self.game._flatten(vertex)] - # print ("color : ", color) - chain = set() - frontier = [vertex] - has_liberty = False - while frontier: - current = frontier.pop() - # print ("current : ", current) - chain.add(current) - for n in self._neighbor(current): - if current_board[self.game._flatten(n)] == color and not n in chain: - frontier.append(n) - if current_board[self.game._flatten(n)] == utils.EMPTY: - has_liberty = True - return has_liberty, chain - - def _is_suicide(self, current_board, color, vertex): - current_board[self.game._flatten(vertex)] = color # assume that we already take this move - suicide = False - - has_liberty, group = self._find_group(current_board, vertex) - if not has_liberty: - suicide = True # no liberty, suicide - for n in self._neighbor(vertex): - if current_board[self.game._flatten(n)] == utils.another_color(color): - opponent_liberty, group = self._find_group(current_board, n) - if not opponent_liberty: - suicide = False # this move is able to take opponent's stone, not suicide - - current_board[self.game._flatten(vertex)] = utils.EMPTY # undo this move - return suicide - - def _process_board(self, current_board, color, vertex): - nei = self._neighbor(vertex) - for n in nei: - if current_board[self.game._flatten(n)] == utils.another_color(color): - has_liberty, group = self._find_group(current_board, n) - if not has_liberty: - for b in group: - current_board[self.game._flatten(b)] = utils.EMPTY - - def _check_global_isomorphous(self, history_boards, current_board, color, vertex): - repeat = False - next_board = copy.copy(current_board) - next_board[self.game._flatten(vertex)] = color - self._process_board(next_board, color, vertex) - if next_board in history_boards: - repeat = True - return repeat - - def _is_eye(self, current_board, color, vertex): - nei = self._neighbor(vertex) - cor = self._corner(vertex) - ncolor = {color == current_board[self.game._flatten(n)] for n in nei} - if False in ncolor: - # 
print "not all neighbors are in same color with us" - return False - _, group = self._find_group(current_board, nei[0]) - if set(nei) < group: - # print "all neighbors are in same group and same color with us" - return True - else: - opponent_number = [current_board[self.game._flatten(c)] for c in cor].count(-color) - opponent_propotion = float(opponent_number) / float(len(cor)) - if opponent_propotion < 0.5: - # print "few opponents, real eye" - return True - else: - # print "many opponents, fake eye" - return False - - def _knowledge_prunning(self, current_board, color, vertex): - ### check if it is an eye of yourself - ### assumptions : notice that this judgement requires that the state is an endgame - if self._is_eye(current_board, color, vertex): - return False - return True - - def _sa2cv(self, state, action): - # State is the play board, the shape is [1, self.game.size, self.game.size, 17], action is an index. - # We need to transfer the (state, action) pair into (color, vertex) pair to simulate the move - if state[0, 0, 0, -1] == utils.BLACK: - color = utils.BLACK - else: - color = utils.WHITE - if action == self.game.size ** 2: - vertex = (0, 0) - else: - vertex = self.game._deflatten(action) - return color, vertex - - def _is_valid(self, history_boards, current_board, color, vertex): - ### in board - if not self._in_board(vertex): - return False - - ### already have stone - if not current_board[self.game._flatten(vertex)] == utils.EMPTY: - return False - - ### check if it is suicide - if self._is_suicide(current_board, color, vertex): - return False - - ### forbid global isomorphous - if self._check_global_isomorphous(history_boards, current_board, color, vertex): - return False - - return True - - def simulate_is_valid(self, history_boards, current_board, state, action): - # initialize simulate_latest_boards and simulate_board from state - self.simulate_latest_boards.clear() - for i in range(8): - self.simulate_latest_boards.append((state[:, :, :, i] - 
state[:, :, :, i + 8]).reshape(-1).tolist()) - self.simulate_board = copy.copy(self.simulate_latest_boards[-1]) - - color, vertex = self._sa2cv(state, action) - - if not self._is_valid(history_boards, current_board, color, vertex): - return False - - if not self._knowledge_prunning(current_board, color, vertex): - return False - - return True - - def _do_move(self, color, vertex): - if vertex == utils.PASS: - return True - - id_ = self.game._flatten(vertex) - if self.simulate_board[id_] == utils.EMPTY: - self.simulate_board[id_] = color - return True - else: - return False - - def simulate_step_forward(self, state, action): - # initialize the simulate_board from state - self.simulate_board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() - - color, vertex = self._sa2cv(state, action) - - self._do_move(color, vertex) - new_state = np.concatenate( - [state[:, :, :, 1:8], (np.array(self.simulate_board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), - state[:, :, :, 9:16], (np.array(self.simulate_board) == utils.WHITE).reshape(1, self.game.size, self.game.size, 1), - np.array(1 - state[:, :, :, -1]).reshape(1, self.game.size, self.game.size, 1)], - axis=3) - return new_state, 0 From 7fca90c61b97704463985f1c1774e90a834c906c Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Wed, 20 Dec 2017 16:43:42 +0800 Subject: [PATCH 26/98] modify the mcts, refactor the network --- AlphaGo/Network.py | 211 ----------------------- AlphaGo/Network_ori.py | 175 ------------------- AlphaGo/game.py | 15 +- AlphaGo/go.py | 58 ++----- AlphaGo/model.py | 170 ++++++++++++++++++ AlphaGo/{network_small.py => network.py} | 0 tianshou/core/mcts/mcts.py | 40 ++--- 7 files changed, 212 insertions(+), 457 deletions(-) delete mode 100644 AlphaGo/Network.py delete mode 100644 AlphaGo/Network_ori.py create mode 100644 AlphaGo/model.py rename AlphaGo/{network_small.py => network.py} (100%) diff --git a/AlphaGo/Network.py b/AlphaGo/Network.py deleted file 
mode 100644 index caf7710..0000000 --- a/AlphaGo/Network.py +++ /dev/null @@ -1,211 +0,0 @@ -import os -import time -import sys - -import numpy as np -import time -import tensorflow as tf -import tensorflow.contrib.layers as layers - -import multi_gpu -import time - -# os.environ["CUDA_VISIBLE_DEVICES"] = "1" -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' - - -def residual_block(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.conv2d(h, 256, kernel_size=3, stride=1, activation_fn=tf.identity, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = h + input - return tf.nn.relu(h) - - -def policy_heads(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.flatten(h) - h = layers.fully_connected(h, 362, activation_fn=tf.identity, weights_regularizer=layers.l2_regularizer(1e-4)) - return h - - -def value_heads(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.flatten(h) - h = layers.fully_connected(h, 256, activation_fn=tf.nn.relu, weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.fully_connected(h, 1, activation_fn=tf.nn.tanh, 
weights_regularizer=layers.l2_regularizer(1e-4)) - return h - - -class Network(object): - def __init__(self): - self.x = tf.placeholder(tf.float32, shape=[None, 19, 19, 17]) - self.is_training = tf.placeholder(tf.bool, shape=[]) - self.z = tf.placeholder(tf.float32, shape=[None, 1]) - self.pi = tf.placeholder(tf.float32, shape=[None, 362]) - self.build_network() - - def build_network(self): - h = layers.conv2d(self.x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, normalizer_fn=layers.batch_norm, - normalizer_params={'is_training': self.is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS}, - weights_regularizer=layers.l2_regularizer(1e-4)) - for i in range(19): - h = residual_block(h, self.is_training) - self.v = value_heads(h, self.is_training) - self.p = policy_heads(h, self.is_training) - # loss = tf.reduce_mean(tf.square(z-v)) - tf.multiply(pi, tf.log(tf.clip_by_value(tf.nn.softmax(p), 1e-8, tf.reduce_max(tf.nn.softmax(p))))) - self.value_loss = tf.reduce_mean(tf.square(self.z - self.v)) - self.policy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.pi, logits=self.p)) - - self.reg = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) - self.total_loss = self.value_loss + self.policy_loss + self.reg - # train_op = tf.train.MomentumOptimizer(1e-4, momentum=0.9, use_nesterov=True).minimize(total_loss) - self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) - with tf.control_dependencies(self.update_ops): - self.train_op = tf.train.RMSPropOptimizer(1e-4).minimize(self.total_loss) - self.var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) - self.saver = tf.train.Saver(max_to_keep=10, var_list=self.var_list) - - def train(self): - data_path = "/home/tongzheng/data/" - data_name = os.listdir("/home/tongzheng/data/") - epochs = 100 - batch_size = 128 - - result_path = "./checkpoints/" - with multi_gpu.create_session() as sess: - sess.run(tf.global_variables_initializer()) - ckpt_file = 
tf.train.latest_checkpoint(result_path) - if ckpt_file is not None: - print('Restoring model from {}...'.format(ckpt_file)) - self.saver.restore(sess, ckpt_file) - for epoch in range(epochs): - for name in data_name: - data = np.load(data_path + name) - boards = data["boards"] - wins = data["wins"] - ps = data["ps"] - print (boards.shape) - print (wins.shape) - print (ps.shape) - batch_num = boards.shape[0] // batch_size - index = np.arange(boards.shape[0]) - np.random.shuffle(index) - value_losses = [] - policy_losses = [] - regs = [] - time_train = -time.time() - for iter in range(batch_num): - lv, lp, r, value, prob, _ = sess.run( - [self.value_loss, self.policy_loss, self.reg, self.v, tf.nn.softmax(p), self.train_op], - feed_dict={self.x: boards[ - index[iter * batch_size:(iter + 1) * batch_size]], - self.z: wins[index[ - iter * batch_size:(iter + 1) * batch_size]], - self.pi: ps[index[ - iter * batch_size:(iter + 1) * batch_size]], - self.is_training: True}) - value_losses.append(lv) - policy_losses.append(lp) - regs.append(r) - if iter % 1 == 0: - print( - "Epoch: {}, Part {}, Iteration: {}, Time: {}, Value Loss: {}, Policy Loss: {}, Reg: {}".format( - epoch, name, iter, time.time() + time_train, np.mean(np.array(value_losses)), - np.mean(np.array(policy_losses)), np.mean(np.array(regs)))) - time_train = -time.time() - value_losses = [] - policy_losses = [] - regs = [] - if iter % 20 == 0: - save_path = "Epoch{}.Part{}.Iteration{}.ckpt".format(epoch, name, iter) - self.saver.save(sess, result_path + save_path) - del data, boards, wins, ps - - - # def forward(call_number): - # # checkpoint_path = "/home/yama/rl/tianshou/AlphaGo/checkpoints" - # checkpoint_path = "/home/jialian/stuGo/tianshou/stuGo/checkpoints/" - # board_file = np.genfromtxt("/home/jialian/stuGo/tianshou/leela-zero/src/mcts_nn_files/board_" + call_number, - # dtype='str'); - # human_board = np.zeros((17, 19, 19)) - # - # # TODO : is it ok to ignore the last channel? 
- # for i in range(17): - # human_board[i] = np.array(list(board_file[i])).reshape(19, 19) - # # print("============================") - # # print("human board sum : " + str(np.sum(human_board[-1]))) - # # print("============================") - # # print(human_board) - # # print("============================") - # # rint(human_board) - # feed_board = human_board.transpose(1, 2, 0).reshape(1, 19, 19, 17) - # # print(feed_board[:,:,:,-1]) - # # print(feed_board.shape) - # - # # npz_board = np.load("/home/yama/rl/tianshou/AlphaGo/data/7f83928932f64a79bc1efdea268698ae.npz") - # # print(npz_board["boards"].shape) - # # feed_board = npz_board["boards"][10].reshape(-1, 19, 19, 17) - # ##print(feed_board) - # # show_board = feed_board[0].transpose(2, 0, 1) - # # print("board shape : ", show_board.shape) - # # print(show_board) - # - # itflag = False - # with multi_gpu.create_session() as sess: - # sess.run(tf.global_variables_initializer()) - # ckpt_file = tf.train.latest_checkpoint(checkpoint_path) - # if ckpt_file is not None: - # # print('Restoring model from {}...'.format(ckpt_file)) - # saver.restore(sess, ckpt_file) - # else: - # raise ValueError("No model loaded") - # res = sess.run([tf.nn.softmax(p), v], feed_dict={x: feed_board, is_training: itflag}) - # # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][300].reshape(-1, 19, 19, 17), is_training:False}) - # # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][50].reshape(-1, 19, 19, 17), is_training:True}) - # # print(np.argmax(res[0])) - # np.savetxt(sys.stdout, res[0][0], fmt="%.6f", newline=" ") - # np.savetxt(sys.stdout, res[1][0], fmt="%.6f", newline=" ") - # pv_file = "/home/jialian/stuGotianshou/leela-zero/src/mcts_nn_files/policy_value" - # np.savetxt(pv_file, np.concatenate((res[0][0], res[1][0])), fmt="%.6f", newline=" ") - # # np.savetxt(pv_file, res[1][0], fmt="%.6f", newline=" ") - # return res - - def forward(self): - checkpoint_path = 
"/home/tongzheng/tianshou/AlphaGo/checkpoints/" - sess = multi_gpu.create_session() - sess.run(tf.global_variables_initializer()) - ckpt_file = tf.train.latest_checkpoint(checkpoint_path) - if ckpt_file is not None: - print('Restoring model from {}...'.format(ckpt_file)) - self.saver.restore(sess, ckpt_file) - print('Successfully loaded') - else: - raise ValueError("No model loaded") - # prior, value = sess.run([tf.nn.softmax(p), v], feed_dict={x: state, is_training: False}) - # return prior, value - return sess - - -if __name__ == '__main__': - state = np.random.randint(0, 1, [1, 19, 19, 17]) - net = Network() - sess = net.forward() - start = time.time() - for i in range(100): - sess.run([tf.nn.softmax(net.p), net.v], feed_dict={net.x: state, net.is_training: False}) - print("Step {}, Cumulative time {}".format(i, time.time() - start)) diff --git a/AlphaGo/Network_ori.py b/AlphaGo/Network_ori.py deleted file mode 100644 index 9d33bb9..0000000 --- a/AlphaGo/Network_ori.py +++ /dev/null @@ -1,175 +0,0 @@ -import os -import time -import gc - -import numpy as np -import tensorflow as tf -import tensorflow.contrib.layers as layers - -import multi_gpu - -os.environ["CUDA_VISIBLE_DEVICES"] = "1" - - -def residual_block(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.conv2d(h, 256, kernel_size=3, stride=1, activation_fn=tf.identity, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = h + input - return tf.nn.relu(h) - - -def policy_heads(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 2, kernel_size=1, 
stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.flatten(h) - h = layers.fully_connected(h, 362, activation_fn=tf.identity, weights_regularizer=layers.l2_regularizer(1e-4)) - return h - - -def value_heads(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.flatten(h) - h = layers.fully_connected(h, 256, activation_fn=tf.nn.relu, weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.fully_connected(h, 1, activation_fn=tf.nn.tanh, weights_regularizer=layers.l2_regularizer(1e-4)) - return h - - -x = tf.placeholder(tf.float32, shape=[None, 19, 19, 17]) -is_training = tf.placeholder(tf.bool, shape=[]) -z = tf.placeholder(tf.float32, shape=[None, 1]) -pi = tf.placeholder(tf.float32, shape=[None, 362]) - -h = layers.conv2d(x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, normalizer_fn=layers.batch_norm, - normalizer_params={'is_training': is_training, 'updates_collections': tf.GraphKeys.UPDATE_OPS}, - weights_regularizer=layers.l2_regularizer(1e-4)) -for i in range(19): - h = residual_block(h, is_training) -v = value_heads(h, is_training) -p = policy_heads(h, is_training) -# loss = tf.reduce_mean(tf.square(z-v)) - tf.multiply(pi, tf.log(tf.clip_by_value(tf.nn.softmax(p), 1e-8, tf.reduce_max(tf.nn.softmax(p))))) -value_loss = tf.reduce_mean(tf.square(z - v)) -policy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=pi, logits=p)) - -reg = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) -total_loss = value_loss + policy_loss + reg -# train_op = tf.train.MomentumOptimizer(1e-4, momentum=0.9, 
use_nesterov=True).minimize(total_loss) -update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) -with tf.control_dependencies(update_ops): - train_op = tf.train.RMSPropOptimizer(1e-4).minimize(total_loss) -var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) -saver = tf.train.Saver(max_to_keep=10, var_list=var_list) - - -def train(): - data_path = "/home/tongzheng/data/" - data_name = os.listdir("/home/tongzheng/data/") - epochs = 100 - batch_size = 128 - - result_path = "./checkpoints/" - with multi_gpu.create_session() as sess: - sess.run(tf.global_variables_initializer()) - ckpt_file = tf.train.latest_checkpoint(result_path) - if ckpt_file is not None: - print('Restoring model from {}...'.format(ckpt_file)) - saver.restore(sess, ckpt_file) - for epoch in range(epochs): - for name in data_name: - data = np.load(data_path + name) - boards = data["boards"] - wins = data["wins"] - ps = data["ps"] - print (boards.shape) - print (wins.shape) - print (ps.shape) - # batch_num = 1 - batch_num = boards.shape[0] // batch_size - index = np.arange(boards.shape[0]) - np.random.shuffle(index) - value_losses = [] - policy_losses = [] - regs = [] - time_train = -time.time() - for iter in range(batch_num): - lv, lp, r, _ = sess.run([value_loss, policy_loss, reg, train_op], - feed_dict={x: boards[ - index[iter * batch_size:(iter + 1) * batch_size]], - z: wins[index[ - iter * batch_size:(iter + 1) * batch_size]], - pi: ps[index[ - iter * batch_size:(iter + 1) * batch_size]], - is_training: True}) - value_losses.append(lv) - policy_losses.append(lp) - regs.append(r) - del lv, lp, r - if iter % 1 == 0: - print( - "Epoch: {}, Part {}, Iteration: {}, Time: {}, Value Loss: {}, Policy Loss: {}, Reg: {}".format( - epoch, name, iter, time.time() + time_train, np.mean(np.array(value_losses)), - np.mean(np.array(policy_losses)), np.mean(np.array(regs)))) - del value_losses, policy_losses, regs, time_train - time_train = -time.time() - value_losses = [] - policy_losses = [] - regs = 
[] - if iter % 20 == 0: - save_path = "Epoch{}.Part{}.Iteration{}.ckpt".format(epoch, name, iter) - saver.save(sess, result_path + save_path) - del save_path - del data, boards, wins, ps, batch_num, index - gc.collect() - - -def forward(board): - result_path = "./checkpoints" - itflag = False - res = None - if board is None: - # data = np.load("/home/tongzheng/meta-data/80b7bf21bce14862806d48c3cd760a1b.npz") - data = np.load("./data/7f83928932f64a79bc1efdea268698ae.npz") - board = data["boards"][50].reshape(-1, 19, 19, 17) - human_board = board[0].transpose(2, 0, 1) - print("============================") - print("human board sum : " + str(np.sum(human_board))) - print("============================") - print(board[:, :, :, -1]) - itflag = False - with multi_gpu.create_session() as sess: - sess.run(tf.global_variables_initializer()) - ckpt_file = tf.train.latest_checkpoint(result_path) - if ckpt_file is not None: - print('Restoring model from {}...'.format(ckpt_file)) - saver.restore(sess, ckpt_file) - else: - raise ValueError("No model loaded") - res = sess.run([tf.nn.softmax(p), v], feed_dict={x: board, is_training: itflag}) - # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][300].reshape(-1, 19, 19, 17), is_training:False}) - # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][50].reshape(-1, 19, 19, 17), is_training:True}) - # print(np.argmax(res[0])) - print(res) - print(data["p"][0]) - print(np.argmax(res[0])) - print(np.argmax(data["p"][0])) - # print(res[0].tolist()[0]) - # print(np.argmax(res[0])) - return res - - -if __name__ == '__main__': - # train() - # if sys.argv[1] == "test": - forward(None) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index aee8d3a..37b7878 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -11,7 +11,7 @@ import tensorflow as tf import numpy as np import sys, os import go -import network_small +import model from collections import deque 
sys.path.append(os.path.join(os.path.dirname(__file__), os.path.pardir)) from tianshou.core.mcts.mcts import MCTS @@ -31,10 +31,9 @@ class Game: self.latest_boards = deque(maxlen=8) for _ in range(8): self.latest_boards.append(self.board) - self.net = network_small.Network() - self.sess = self.net.forward(checkpoint_path) - self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], - feed_dict={self.net.x: state, self.net.is_training: False}) + self.evaluator = model.ResNet(self.size, self.size**2 + 1, history_length=8) + # self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], + # feed_dict={self.net.x: state, self.net.is_training: False}) self.game_engine = go.Go(game=self) def _flatten(self, vertex): @@ -75,7 +74,8 @@ class Game: self.game_engine.simulate_latest_boards = copy.copy(latest_boards) self.game_engine.simulate_board = copy.copy(latest_boards[-1]) nn_input = self.generate_nn_input(self.game_engine.simulate_latest_boards, color) - mcts = MCTS(self.game_engine, self.evaluator, nn_input, self.size ** 2 + 1, inverse=True, max_step=1) + mcts = MCTS(self.game_engine, self.evaluator, [self.game_engine.simulate_latest_boards, color], self.size ** 2 + 1, inverse=True) + mcts.search(max_step=1) temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) choice = np.random.choice(self.size ** 2 + 1, 1, p=prob).tolist()[0] @@ -93,7 +93,7 @@ class Game: return res def think_play_move(self, color): - # although we dont need to return self.prob, however it is needed for neural network training + # although we don't need to return self.prob, however it is needed for neural network training move, self.prob = self.think(self.latest_boards, color) # play the move immediately self.play_move(color, move) @@ -122,6 +122,7 @@ class Game: if __name__ == "__main__": g = Game() g.show_board() + g.think_play_move(1) #file = open("debug.txt", "a") #file.write("mcts check\n") #file.close() diff --git a/AlphaGo/go.py 
b/AlphaGo/go.py index 10ce7e1..335ee39 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -17,8 +17,6 @@ CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] class Go: def __init__(self, **kwargs): self.game = kwargs['game'] - self.simulate_board = [utils.EMPTY] * (self.game.size ** 2) - self.simulate_latest_boards = deque(maxlen=8) def _in_board(self, vertex): x, y = vertex @@ -125,18 +123,12 @@ class Go: return False return True - def _sa2cv(self, state, action): - # State is the play board, the shape is [1, self.game.size, self.game.size, 17], action is an index. - # We need to transfer the (state, action) pair into (color, vertex) pair to simulate the move - if state[0, 0, 0, -1] == utils.BLACK: - color = utils.BLACK - else: - color = utils.WHITE + def _action2vertex(self, action): if action == self.game.size ** 2: vertex = (0, 0) else: vertex = self.game._deflatten(action) - return color, vertex + return vertex def _is_valid(self, history_boards, current_board, color, vertex): ### in board @@ -157,14 +149,10 @@ class Go: return True - def simulate_is_valid(self, history_boards, current_board, state, action): - # initialize simulate_latest_boards and simulate_board from state - self.simulate_latest_boards.clear() - for i in range(8): - self.simulate_latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) - self.simulate_board = copy.copy(self.simulate_latest_boards[-1]) - - color, vertex = self._sa2cv(state, action) + def simulate_is_valid(self, state, action): + history_boards, color = state + vertex = self._action2vertex(action) + current_board = history_boards[-1] if not self._is_valid(history_boards, current_board, color, vertex): return False @@ -174,30 +162,22 @@ class Go: return True - def _do_move(self, color, vertex): + def _do_move(self, board, color, vertex): if vertex == utils.PASS: - return True - - id_ = self.game._flatten(vertex) - if self.simulate_board[id_] == utils.EMPTY: - self.simulate_board[id_] = color - 
return True + return board else: - return False + id_ = self.game._flatten(vertex) + board[id_] = color + return board def simulate_step_forward(self, state, action): # initialize the simulate_board from state - self.simulate_board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() - - color, vertex = self._sa2cv(state, action) - - self._do_move(color, vertex) - new_state = np.concatenate( - [state[:, :, :, 1:8], (np.array(self.simulate_board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), - state[:, :, :, 9:16], (np.array(self.simulate_board) == utils.WHITE).reshape(1, self.game.size, self.game.size, 1), - np.array(1 - state[:, :, :, -1]).reshape(1, self.game.size, self.game.size, 1)], - axis=3) - return new_state, 0 + history_boards, color = state + vertex = self._action2vertex(action) + new_board = self._do_move(copy.copy(history_boards[-1]), color, vertex) + history_boards.append(new_board) + new_color = -color + return [history_boards, new_color], 0 def executor_do_move(self, color, vertex): if not self._is_valid(self.game.history, self.game.board, color, vertex): @@ -239,7 +219,7 @@ class Go: start_vertex_x += x_diff start_vertex_y += y_diff - def _predict_from_nearby(self, vertex, neighbor_step = 3): + def _predict_from_nearby(self, vertex, neighbor_step=3): ''' step: the nearby 3 steps is considered :vertex: position to be estimated @@ -261,7 +241,7 @@ class Go: elif color_estimate < 0: return utils.WHITE - def executor_get_score(self, is_unknown_estimation = False): + def executor_get_score(self, is_unknown_estimation=False): ''' is_unknown_estimation: whether use nearby stone to predict the unknown return score from BLACK perspective. 
diff --git a/AlphaGo/model.py b/AlphaGo/model.py new file mode 100644 index 0000000..725dbd2 --- /dev/null +++ b/AlphaGo/model.py @@ -0,0 +1,170 @@ +import os +import time +import sys + +import numpy as np +import tensorflow as tf +import tensorflow.contrib.layers as layers + +import multi_gpu + +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + + +def residual_block(input, is_training): + """ + one residual block + + :param input: a tensor, input of the residual block + :param is_training: a placeholder, indicate whether the model is training or not + :return: a tensor, output of the residual block + """ + normalizer_params = {'is_training': is_training, + 'updates_collections': tf.GraphKeys.UPDATE_OPS} + h = layers.conv2d(input, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, + normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, + weights_regularizer=layers.l2_regularizer(1e-4)) + h = layers.conv2d(h, 256, kernel_size=3, stride=1, activation_fn=tf.identity, + normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, + weights_regularizer=layers.l2_regularizer(1e-4)) + h = h + input + return tf.nn.relu(h) + + +def policy_head(input, is_training, action_num): + """ + the head of policy branch + + :param input: a tensor, input of the policy head + :param is_training: a placeholder, indicate whether the model is training or not + :param action_num: action_num: an integer, number of unique actions at any state + :return: a tensor: output of the policy head, shape [batch_size, action_num] + """ + normalizer_params = {'is_training': is_training, + 'updates_collections': tf.GraphKeys.UPDATE_OPS} + h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, + normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, + weights_regularizer=layers.l2_regularizer(1e-4)) + h = layers.flatten(h) + h = layers.fully_connected(h, action_num, activation_fn=tf.identity, + weights_regularizer=layers.l2_regularizer(1e-4)) + 
return h + + +def value_head(input, is_training): + """ + the head of value branch + + :param input: a tensor, input of the value head + :param is_training: a placeholder, indicate whether the model is training or not + :return: a tensor, output of the value head, shape [batch_size, 1] + """ + normalizer_params = {'is_training': is_training, + 'updates_collections': tf.GraphKeys.UPDATE_OPS} + h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, + normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, + weights_regularizer=layers.l2_regularizer(1e-4)) + h = layers.flatten(h) + h = layers.fully_connected(h, 256, activation_fn=tf.nn.relu, weights_regularizer=layers.l2_regularizer(1e-4)) + h = layers.fully_connected(h, 1, activation_fn=tf.nn.tanh, weights_regularizer=layers.l2_regularizer(1e-4)) + return h + + +class ResNet(object): + def __init__(self, board_size, action_num, history_length=1, residual_block_num=20, checkpoint_path=None): + """ + the resnet model + + :param board_size: an integer, the board size + :param action_num: an integer, number of unique actions at any state + :param history_length: an integer, the history length to use, default is 1 + :param residual_block_num: an integer, the number of residual block, default is 20, at least 1 + :param checkpoint_path: a string, the path to the checkpoint, default is None, + """ + self.board_size = board_size + self.action_num = action_num + self.history_length = history_length + self.x = tf.placeholder(tf.float32, shape=[None, self.board_size, self.board_size, 2 * self.history_length + 1]) + self.is_training = tf.placeholder(tf.bool, shape=[]) + self.z = tf.placeholder(tf.float32, shape=[None, 1]) + self.pi = tf.placeholder(tf.float32, shape=[None, self.action_num]) + self._build_network(residual_block_num, checkpoint_path) + + def _build_network(self, residual_block_num, checkpoint_path): + """ + build the network + + :param residual_block_num: an integer, the number 
of residual block + :param checkpoint_path: a string, the path to the checkpoint, if None, use random initialization parameter + :return: None + """ + + h = layers.conv2d(self.x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, + normalizer_fn=layers.batch_norm, + normalizer_params={'is_training': self.is_training, + 'updates_collections': tf.GraphKeys.UPDATE_OPS}, + weights_regularizer=layers.l2_regularizer(1e-4)) + for i in range(residual_block_num - 1): + h = residual_block(h, self.is_training) + self.v = value_head(h, self.is_training) + self.p = policy_head(h, self.is_training, self.action_num) + self.value_loss = tf.reduce_mean(tf.square(self.z - self.v)) + self.policy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.pi, logits=self.p)) + + self.reg = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) + self.total_loss = self.value_loss + self.policy_loss + self.reg + self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) + with tf.control_dependencies(self.update_ops): + self.train_op = tf.train.AdamOptimizer(1e-4).minimize(self.total_loss) + self.var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) + self.saver = tf.train.Saver(max_to_keep=10, var_list=self.var_list) + self.sess = multi_gpu.create_session() + self.sess.run(tf.global_variables_initializer()) + if checkpoint_path is not None: + ckpt_file = tf.train.latest_checkpoint(checkpoint_path) + if ckpt_file is not None: + print('Restoring model from {}...'.format(ckpt_file)) + self.saver.restore(self.sess, ckpt_file) + print('Successfully loaded') + else: + raise ValueError("No model in path {}".format(checkpoint_path)) + + def __call__(self, state): + """ + + :param history: a list, the history + :param color: a string, indicate which one to play + :return: a list of tensor, the predicted value and policy given the history and color + """ + history, color = state + if len(history) != self.history_length: + raise ValueError( + 'The length of 
history cannot meet the need of the model, given {}, need {}'.format(len(history), + self.history_length)) + state = self._history2state(history, color) + return self.sess.run([self.p, self.v], feed_dict={self.x: state, self.is_training: False}) + + def _history2state(self, history, color): + """ + convert the history to the state we need + + :param history: a list, the history + :param color: a string, indicate which one to play + :return: a ndarray, the state + """ + state = np.zeros([1, self.board_size, self.board_size, 2 * self.history_length + 1]) + for i in range(self.history_length): + state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(self.board_size ** 2)).reshape(self.board_size, + self.board_size) + state[0, :, :, i + self.history_length] = np.array( + np.array(history[i]) == -np.ones(self.board_size ** 2)).reshape(self.board_size, self.board_size) + # TODO: need a config to specify the BLACK and WHITE + if color == +1: + state[0, :, :, 2 * self.history_length] = np.ones([self.board_size, self.board_size]) + if color == -1: + state[0, :, :, 2 * self.history_length] = np.zeros([self.board_size, self.board_size]) + return state + + #TODO: design the interface between the environment and training + def train(self, mode='memory', *args, **kwargs): + pass \ No newline at end of file diff --git a/AlphaGo/network_small.py b/AlphaGo/network.py similarity index 100% rename from AlphaGo/network_small.py rename to AlphaGo/network.py diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 12fc85d..fac00fb 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -72,11 +72,9 @@ class UCTNode(MCTSNode): def valid_mask(self, simulator): if self.mask is None: - start_time = time.time() self.mask = [] for act in range(self.action_num - 1): - if not simulator.simulate_is_valid( - simulator.simulate_latest_boards, simulator.simulate_board, self.state, act): + if not simulator.simulate_is_valid(self.state, act): 
self.mask.append(act) self.ucb[act] = -float("Inf") else: @@ -144,8 +142,7 @@ class ActionNode(object): class MCTS(object): - def __init__(self, simulator, evaluator, root, action_num, method="UCT", inverse=False, max_step=None, - max_time=None): + def __init__(self, simulator, evaluator, root, action_num, method="UCT", inverse=False): self.simulator = simulator self.evaluator = evaluator prior, _ = self.evaluator(root) @@ -153,33 +150,26 @@ class MCTS(object): if method == "": self.root = root if method == "UCT": - self.root = UCTNode(None, None, root, action_num, prior, inverse) + self.root = UCTNode(None, None, root, action_num, prior, inverse=inverse) if method == "TS": self.root = TSNode(None, None, root, action_num, prior, inverse=inverse) self.inverse = inverse - if max_step is not None: - self.step = 0 - self.max_step = max_step - # TODO: Optimize the stop criteria - # else: - # self.max_step = 0 - if max_time is not None: - self.start_time = time.time() - self.max_time = max_time + + def search(self, max_step=None, max_time=None): + step = 0 + start_time = time.time() + if max_step is None: + max_step = int("Inf") + if max_time is None: + max_time = float("Inf") if max_step is None and max_time is None: raise ValueError("Need a stop criteria!") - # TODO: running mcts should be implemented in another function, e.g. 
def search(self, max_step, max_time) - self.select_time = [] - self.evaluate_time = [] - self.bp_time = [] - while (max_step is not None and self.step < self.max_step or max_step is None) \ - and (max_time is not None and time.time() - self.start_time < self.max_time or max_time is None): - self.expand() - if max_step is not None: - self.step += 1 + while step < max_step and time.time() - start_time < max_step: + self._expand() + step += 1 - def expand(self): + def _expand(self): node, new_action = self.root.selection(self.simulator) value = node.children[new_action].expansion(self.evaluator, self.action_num) node.children[new_action].backpropagation(value + 0.) From 50e306368feabf13a8723412481c6f3103ff3c4e Mon Sep 17 00:00:00 2001 From: Wenbo Hu Date: Wed, 20 Dec 2017 20:12:08 +0800 Subject: [PATCH 27/98] checkpoint --- AlphaGo/go.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 335ee39..7196533 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -117,10 +117,31 @@ class Go: return False def _knowledge_prunning(self, current_board, color, vertex): - ### check if it is an eye of yourself - ### assumptions : notice that this judgement requires that the state is an endgame + # forbid some stupid selfplay using human knowledge if self._is_eye(current_board, color, vertex): return False + # forbid position on its own eye. + if self._is_game_finish(current_board, color) and vertex == utils.PASS + return False + # forbid pass if the game is not finished. 
+ return True + + + def _is_game_finished(self, current_board, color): + ''' + for each empty position, if it has both BLACK and WHITE neighbors, the game is still not finished + :return: return the game is finished + ''' + board = copy.deepcopy(current_board) + empty_idx = [i for i, x in enumerate(board) if x == utils.EMPTY] # find all empty idx + for idx in empty_idx: + neighbor_idx = self._neighbor(self.game.deflatten(idx)) + if len(neighbor_idx) > 1: + first_idx = neighbor_idx[0] + for other_idx in neighbor_idx[1:]: + if self.game.board[self.game.flatten(other_idx)] != self.game.board[self.game.flatten(first_idx)]: + return False + return True def _action2vertex(self, action): From 48e95a21eaeec6495a1bc5985c434d64d7447baf Mon Sep 17 00:00:00 2001 From: Wenbo Hu Date: Wed, 20 Dec 2017 21:35:35 +0800 Subject: [PATCH 28/98] simulator process a valid set, instead of a single action --- AlphaGo/go.py | 18 +++++++++++++++--- tianshou/core/mcts/mcts.py | 9 ++------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 7196533..559b375 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -121,9 +121,9 @@ class Go: if self._is_eye(current_board, color, vertex): return False # forbid position on its own eye. - if self._is_game_finish(current_board, color) and vertex == utils.PASS - return False - # forbid pass if the game is not finished. + #if self._is_game_finish(current_board, color) and vertex == utils.PASS + # return False + # forbid pass if the game is not finished. 
return True @@ -183,6 +183,18 @@ class Go: return True + def simulate_is_valid_list(self, state, action_set): + ## find all the valid actions + ## if no action is valid, then pass + valid_action_set = [] + for action_candidate in action_set: + if self.simulate_is_valid(self, state, action_candidate) + valid_action_set.append(action_candidate) + if not valid_action_set: + valid_action_set.append(utils.PASS) + # if valid_action_set is a empty set, add pass + return valid_action_set + def _do_move(self, board, color, vertex): if vertex == utils.PASS: return board diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index fac00fb..c14496d 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -72,13 +72,8 @@ class UCTNode(MCTSNode): def valid_mask(self, simulator): if self.mask is None: - self.mask = [] - for act in range(self.action_num - 1): - if not simulator.simulate_is_valid(self.state, act): - self.mask.append(act) - self.ucb[act] = -float("Inf") - else: - self.ucb[self.mask] = -float("Inf") + self.mask = simulator.simulate_is_valid_list(self.state, range(self.action_num - 1)) + self.ucb[self.mask] = -float("Inf") class TSNode(MCTSNode): From cabbb219680be465f03527ea90deb568b53f911f Mon Sep 17 00:00:00 2001 From: Wenbo Hu Date: Wed, 20 Dec 2017 21:40:03 +0800 Subject: [PATCH 29/98] minor revision --- AlphaGo/go.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 559b375..009d369 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -186,14 +186,14 @@ class Go: def simulate_is_valid_list(self, state, action_set): ## find all the valid actions ## if no action is valid, then pass - valid_action_set = [] + valid_action_list = [] for action_candidate in action_set: - if self.simulate_is_valid(self, state, action_candidate) - valid_action_set.append(action_candidate) - if not valid_action_set: - valid_action_set.append(utils.PASS) + if self.simulate_is_valid(state, 
action_candidate): + valid_action_list.append(action_candidate) + if not valid_action_list: + valid_action_list.append(utils.PASS) # if valid_action_set is a empty set, add pass - return valid_action_set + return valid_action_list def _do_move(self, board, color, vertex): if vertex == utils.PASS: From e2c6b96e5743341f92278a6437a85a7154bd5ec3 Mon Sep 17 00:00:00 2001 From: Wenbo Hu Date: Wed, 20 Dec 2017 21:52:30 +0800 Subject: [PATCH 30/98] minor revision. --- AlphaGo/go.py | 3 +-- tianshou/core/mcts/mcts.py | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 009d369..cbbe07c 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -180,7 +180,6 @@ class Go: if not self._knowledge_prunning(current_board, color, vertex): return False - return True def simulate_is_valid_list(self, state, action_set): @@ -188,7 +187,7 @@ class Go: ## if no action is valid, then pass valid_action_list = [] for action_candidate in action_set: - if self.simulate_is_valid(state, action_candidate): + if not self.simulate_is_valid(state, action_candidate): valid_action_list.append(action_candidate) if not valid_action_list: valid_action_list.append(utils.PASS) diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index c14496d..5aca06a 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -71,6 +71,7 @@ class UCTNode(MCTSNode): self.parent.backpropagation(self.children[action].reward) def valid_mask(self, simulator): + # let all invalid actions illeagel in mcts if self.mask is None: self.mask = simulator.simulate_is_valid_list(self.state, range(self.action_num - 1)) self.ucb[self.mask] = -float("Inf") From f0d59dab6cef928cd580f301abbdd54b84af23df Mon Sep 17 00:00:00 2001 From: Wenbo Hu Date: Wed, 20 Dec 2017 22:10:47 +0800 Subject: [PATCH 31/98] forbid pass, if we have other choices --- AlphaGo/go.py | 18 +++++++++--------- tianshou/core/mcts/mcts.py | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) 
diff --git a/AlphaGo/go.py b/AlphaGo/go.py index cbbe07c..1dfbb29 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -183,16 +183,16 @@ class Go: return True def simulate_is_valid_list(self, state, action_set): - ## find all the valid actions - ## if no action is valid, then pass - valid_action_list = [] - for action_candidate in action_set: + # find all the invalid actions + invalid_action_list = [] + for action_candidate in action_set[:-1]: + # go through all the actions excluding pass if not self.simulate_is_valid(state, action_candidate): - valid_action_list.append(action_candidate) - if not valid_action_list: - valid_action_list.append(utils.PASS) - # if valid_action_set is a empty set, add pass - return valid_action_list + invalid_action_list.append(action_candidate) + if len(invalid_action_list) < len(action_set) - 1: + invalid_action_list.append(action_set[-1]) + # forbid pass, if we have other choices + return invalid_action_list def _do_move(self, board, color, vertex): if vertex == utils.PASS: diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 5aca06a..7edac97 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -71,7 +71,7 @@ class UCTNode(MCTSNode): self.parent.backpropagation(self.children[action].reward) def valid_mask(self, simulator): - # let all invalid actions illeagel in mcts + # let all invalid actions be illeagel in mcts if self.mask is None: self.mask = simulator.simulate_is_valid_list(self.state, range(self.action_num - 1)) self.ucb[self.mask] = -float("Inf") From 00d2aa86bf668e17d6064b4896797cb79f7cbba7 Mon Sep 17 00:00:00 2001 From: Wenbo Hu Date: Wed, 20 Dec 2017 22:57:58 +0800 Subject: [PATCH 32/98] repair komi. 
add todo for forbid pass: --- AlphaGo/engine.py | 2 +- AlphaGo/game.py | 4 ++-- AlphaGo/go.py | 5 +---- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index 9948176..bf30083 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -183,7 +183,7 @@ class GTPEngine(): return 'unknown player', False def cmd_get_score(self, args, **kwargs): - return self._game.game_engine.executor_get_score(), None + return self._game.game_engine.executor_get_score(True), None def cmd_show_board(self, args, **kwargs): return self._game.board, True diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 37b7878..5f35c74 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -23,7 +23,7 @@ class Game: TODO : Maybe merge with the engine class in future, currently leave it untouched for interacting with Go UI. ''' - def __init__(self, size=9, komi=6.5, checkpoint_path=None): + def __init__(self, size=9, komi=3.75, checkpoint_path=None): self.size = size self.komi = komi self.board = [utils.EMPTY] * (self.size ** 2) @@ -75,7 +75,7 @@ class Game: self.game_engine.simulate_board = copy.copy(latest_boards[-1]) nn_input = self.generate_nn_input(self.game_engine.simulate_latest_boards, color) mcts = MCTS(self.game_engine, self.evaluator, [self.game_engine.simulate_latest_boards, color], self.size ** 2 + 1, inverse=True) - mcts.search(max_step=1) + mcts.search(max_step=5) temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) choice = np.random.choice(self.size ** 2 + 1, 1, p=prob).tolist()[0] diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 1dfbb29..4f1c759 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -121,12 +121,8 @@ class Go: if self._is_eye(current_board, color, vertex): return False # forbid position on its own eye. - #if self._is_game_finish(current_board, color) and vertex == utils.PASS - # return False - # forbid pass if the game is not finished. 
return True - def _is_game_finished(self, current_board, color): ''' for each empty position, if it has both BLACK and WHITE neighbors, the game is still not finished @@ -192,6 +188,7 @@ class Go: if len(invalid_action_list) < len(action_set) - 1: invalid_action_list.append(action_set[-1]) # forbid pass, if we have other choices + # TODO: In fact we should not do this. In some extreme cases, we should permit pass. return invalid_action_list def _do_move(self, board, color, vertex): From ced63af18fcc790c4b1bb1548b5494bd2073f9a2 Mon Sep 17 00:00:00 2001 From: Wenbo Hu Date: Thu, 21 Dec 2017 19:31:51 +0800 Subject: [PATCH 33/98] fixing bug pass parameterg --- tianshou/core/mcts/mcts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 7edac97..8bb5f06 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -73,7 +73,7 @@ class UCTNode(MCTSNode): def valid_mask(self, simulator): # let all invalid actions be illeagel in mcts if self.mask is None: - self.mask = simulator.simulate_is_valid_list(self.state, range(self.action_num - 1)) + self.mask = simulator.simulate_is_valid_list(self.state, range(self.action_num)) self.ucb[self.mask] = -float("Inf") From eda7ed07a1b7b0251745981d71ab9f358f15944e Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Thu, 21 Dec 2017 21:01:25 +0800 Subject: [PATCH 34/98] implement data collection and part of training --- AlphaGo/engine.py | 6 ++- AlphaGo/game.py | 19 +------- AlphaGo/model.py | 18 +++++++- AlphaGo/play.py | 115 ++++++++++++++++++++++++++++++---------------- AlphaGo/player.py | 1 + 5 files changed, 101 insertions(+), 58 deletions(-) diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index bf30083..c9f1a3c 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -183,11 +183,15 @@ class GTPEngine(): return 'unknown player', False def cmd_get_score(self, args, **kwargs): - return 
self._game.game_engine.executor_get_score(True), None + return self._game.game_engine.executor_get_score(True), True def cmd_show_board(self, args, **kwargs): return self._game.board, True + def cmd_get_prob(self, args, **kwargs): + return self._game.prob, True + + if __name__ == "main": game = Game() engine = GTPEngine(game_obj=Game) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 5f35c74..bf0d084 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -58,24 +58,9 @@ class Game: def set_komi(self, k): self.komi = k - def generate_nn_input(self, latest_boards, color): - state = np.zeros([1, self.size, self.size, 17]) - for i in range(8): - state[0, :, :, i] = np.array(np.array(latest_boards[i]) == np.ones(self.size ** 2)).reshape(self.size, self.size) - state[0, :, :, i + 8] = np.array(np.array(latest_boards[i]) == -np.ones(self.size ** 2)).reshape(self.size, self.size) - if color == utils.BLACK: - state[0, :, :, 16] = np.ones([self.size, self.size]) - if color == utils.WHITE: - state[0, :, :, 16] = np.zeros([self.size, self.size]) - return state - def think(self, latest_boards, color): - # TODO : using copy is right, or should we change to deepcopy? 
- self.game_engine.simulate_latest_boards = copy.copy(latest_boards) - self.game_engine.simulate_board = copy.copy(latest_boards[-1]) - nn_input = self.generate_nn_input(self.game_engine.simulate_latest_boards, color) - mcts = MCTS(self.game_engine, self.evaluator, [self.game_engine.simulate_latest_boards, color], self.size ** 2 + 1, inverse=True) - mcts.search(max_step=5) + mcts = MCTS(self.game_engine, self.evaluator, [latest_boards, color], self.size ** 2 + 1, inverse=True) + mcts.search(max_step=1) temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) choice = np.random.choice(self.size ** 2 + 1, 1, p=prob).tolist()[0] diff --git a/AlphaGo/model.py b/AlphaGo/model.py index 725dbd2..fab864e 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -1,6 +1,7 @@ import os import time import sys +import cPickle import numpy as np import tensorflow as tf @@ -167,4 +168,19 @@ class ResNet(object): #TODO: design the interface between the environment and training def train(self, mode='memory', *args, **kwargs): - pass \ No newline at end of file + if mode == 'memory': + pass + if mode == 'file': + self.train_with_file(data_path=kwargs['data_path'], checkpoint_path=kwargs['checkpoint_path']) + + def train_with_file(self, data_path, checkpoint_path): + if not os.path.exists(data_path): + raise ValueError("{} doesn't exist".format(data_path)) + + file_list = os.listdir(data_path) + if file_list <= 50: + time.sleep(1) + else: + file_list.sort(key=lambda file: os.path.getmtime(data_path + file) if not os.path.isdir( + data_path + file) else 0) + diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 7367804..562dd14 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -5,6 +5,18 @@ import re import Pyro4 import time import os +import cPickle + + +class Data(object): + def __init__(self): + self.boards = [] + self.probs = [] + self.winner = 0 + + def reset(self): + self.__init__() + if __name__ == '__main__': """ @@ -13,10 +25,13 @@ if __name__ == '__main__': """ # 
TODO : we should set the network path in a more configurable way. parser = argparse.ArgumentParser() + parser.add_argument("--result_path", type=str, default="./data/") parser.add_argument("--black_weight_path", type=str, default=None) parser.add_argument("--white_weight_path", type=str, default=None) args = parser.parse_args() + if not os.path.exists(args.result_path): + os.mkdir(args.result_path) # black_weight_path = "./checkpoints" # white_weight_path = "./checkpoints_origin" if args.black_weight_path is not None and (not os.path.exists(args.black_weight_path)): @@ -35,11 +50,13 @@ if __name__ == '__main__': time.sleep(1) # start two different player with different network weights. - agent_v0 = subprocess.Popen(['python', '-u', 'player.py', '--role=black', '--checkpoint_path=' + str(args.black_weight_path)], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + agent_v0 = subprocess.Popen( + ['python', '-u', 'player.py', '--role=black', '--checkpoint_path=' + str(args.black_weight_path)], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - agent_v1 = subprocess.Popen(['python', '-u', 'player.py', '--role=white', '--checkpoint_path=' + str(args.white_weight_path)], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + agent_v1 = subprocess.Popen( + ['python', '-u', 'player.py', '--role=white', '--checkpoint_path=' + str(args.white_weight_path)], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) server_list = "" while ("black" not in server_list) or ("white" not in server_list): @@ -50,6 +67,7 @@ if __name__ == '__main__': print "Start black player at : " + str(agent_v0.pid) print "Start white player at : " + str(agent_v1.pid) + data = Data() player = [None] * 2 player[0] = Pyro4.Proxy("PYRONAME:black") player[1] = Pyro4.Proxy("PYRONAME:white") @@ -63,39 +81,58 @@ if __name__ == '__main__': evaluate_rounds = 1 game_num = 0 - while game_num < evaluate_rounds: - num = 0 
- pass_flag = [False, False] - print("Start game {}".format(game_num)) - # end the game if both palyer chose to pass, or play too much turns - while not (pass_flag[0] and pass_flag[1]) and num < size ** 2 * 2: - turn = num % 2 - move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn] + '\n') - print role[turn] + " : " + str(move), - num += 1 - match = re.search(pattern, move) - if match is not None: - # print "match : " + str(match.group()) - play_or_pass = match.group() - pass_flag[turn] = False + try: + while True: + num = 0 + pass_flag = [False, False] + print("Start game {}".format(game_num)) + # end the game if both palyer chose to pass, or play too much turns + while not (pass_flag[0] and pass_flag[1]) and num < size ** 2 * 2: + turn = num % 2 + move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn] + '\n') + print role[turn] + " : " + str(move), + num += 1 + match = re.search(pattern, move) + if match is not None: + # print "match : " + str(match.group()) + play_or_pass = match.group() + pass_flag[turn] = False + else: + # print "no match" + play_or_pass = ' PASS' + pass_flag[turn] = True + result = player[1 - turn].run_cmd(str(num) + ' play ' + color[turn] + ' ' + play_or_pass + '\n') + board = player[turn].run_cmd(str(num) + ' show_board') + board = eval(board[board.index('['):board.index(']') + 1]) + for i in range(size): + for j in range(size): + print show[board[i * size + j]] + " ", + print "\n", + data.boards.append(board) + prob = player[turn].run_cmd(str(num) + ' get_prob') + data.probs.append(prob) + score = player[turn].run_cmd(str(num) + ' get_score') + print "Finished : ", score.split(" ")[1] + # TODO: generalize the player + if score > 0: + data.winner = 1 + if score < 0: + data.winner = -1 + player[0].run_cmd(str(num) + ' clear_board') + player[1].run_cmd(str(num) + ' clear_board') + file_list = os.listdir(args.result_path) + if not file_list: + data_num = 0 else: - # print "no match" - play_or_pass = ' PASS' - 
pass_flag[turn] = True - result = player[1 - turn].run_cmd(str(num) + ' play ' + color[turn] + ' ' + play_or_pass + '\n') - board = player[turn].run_cmd(str(num) + ' show_board') - board = eval(board[board.index('['):board.index(']') + 1]) - for i in range(size): - for j in range(size): - print show[board[i * size + j]] + " ", - print "\n", - - score = player[turn].run_cmd(str(num) + ' get_score') - print "Finished : ", score.split(" ")[1] - player[0].run_cmd(str(num) + ' clear_board') - player[1].run_cmd(str(num) + ' clear_board') - game_num += 1 - - subprocess.call(["kill", "-9", str(agent_v0.pid)]) - subprocess.call(["kill", "-9", str(agent_v1.pid)]) - print "Kill all player, finish all game." + file_list.sort(key=lambda file: os.path.getmtime(args.result_path + file) if not os.path.isdir( + args.result_path + file) else 0) + data_num = eval(file_list[-1][:-4]) + 1 + print(file_list) + with open("./data/" + str(data_num) + ".pkl", "w") as file: + picklestring = cPickle.dump(data, file) + data.reset() + game_num += 1 + except KeyboardInterrupt: + subprocess.call(["kill", "-9", str(agent_v0.pid)]) + subprocess.call(["kill", "-9", str(agent_v1.pid)]) + print "Kill all player, finish all game." 
diff --git a/AlphaGo/player.py b/AlphaGo/player.py index b468cf3..0e3daff 100644 --- a/AlphaGo/player.py +++ b/AlphaGo/player.py @@ -20,6 +20,7 @@ class Player(object): #return "inside the Player of player.py" return self.engine.run_cmd(command) + if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument("--checkpoint_path", type=str, default=None) From 2acb1aab076f5393f79eb853e275de626d4d0247 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Thu, 21 Dec 2017 22:48:53 +0800 Subject: [PATCH 35/98] eliminate all references of Game class in Go class --- AlphaGo/engine.py | 2 +- AlphaGo/game.py | 15 ++----- AlphaGo/go.py | 101 +++++++++++++++++++++++-------------------- AlphaGo/play.py | 4 +- AlphaGo/self-play.py | 2 +- 5 files changed, 63 insertions(+), 61 deletions(-) diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index c9f1a3c..8b54470 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -183,7 +183,7 @@ class GTPEngine(): return 'unknown player', False def cmd_get_score(self, args, **kwargs): - return self._game.game_engine.executor_get_score(True), True + return self._game.game_engine.executor_get_score(self._game.board, True), True def cmd_show_board(self, args, **kwargs): return self._game.board, True diff --git a/AlphaGo/game.py b/AlphaGo/game.py index bf0d084..11ce52b 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -34,16 +34,7 @@ class Game: self.evaluator = model.ResNet(self.size, self.size**2 + 1, history_length=8) # self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], # feed_dict={self.net.x: state, self.net.is_training: False}) - self.game_engine = go.Go(game=self) - - def _flatten(self, vertex): - x, y = vertex - return (x - 1) * self.size + (y - 1) - - def _deflatten(self, idx): - x = idx // self.size + 1 - y = idx % self.size + 1 - return (x, y) + self.game_engine = go.Go(size=self.size, komi=self.komi) def clear(self): self.board = [utils.EMPTY] * (self.size ** 2) @@ -67,14 +58,14 @@ 
class Game: if choice == self.size ** 2: move = utils.PASS else: - move = self._deflatten(choice) + move = self.game_engine._deflatten(choice) return move, prob def play_move(self, color, vertex): # this function can be called directly to play the opponent's move if vertex == utils.PASS: return True - res = self.game_engine.executor_do_move(color, vertex) + res = self.game_engine.executor_do_move(self.history, self.latest_boards, self.board, color, vertex) return res def think_play_move(self, color): diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 4f1c759..9b7e21f 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -16,12 +16,22 @@ CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] class Go: def __init__(self, **kwargs): - self.game = kwargs['game'] + self.size = kwargs['size'] + self.komi = kwargs['komi'] + + def _flatten(self, vertex): + x, y = vertex + return (x - 1) * self.size + (y - 1) + + def _deflatten(self, idx): + x = idx // self.size + 1 + y = idx % self.size + 1 + return (x, y) def _in_board(self, vertex): x, y = vertex - if x < 1 or x > self.game.size: return False - if y < 1 or y > self.game.size: return False + if x < 1 or x > self.size: return False + if y < 1 or y > self.size: return False return True def _neighbor(self, vertex): @@ -45,7 +55,7 @@ class Go: return corner def _find_group(self, current_board, vertex): - color = current_board[self.game._flatten(vertex)] + color = current_board[self._flatten(vertex)] # print ("color : ", color) chain = set() frontier = [vertex] @@ -55,41 +65,41 @@ class Go: # print ("current : ", current) chain.add(current) for n in self._neighbor(current): - if current_board[self.game._flatten(n)] == color and not n in chain: + if current_board[self._flatten(n)] == color and not n in chain: frontier.append(n) - if current_board[self.game._flatten(n)] == utils.EMPTY: + if current_board[self._flatten(n)] == utils.EMPTY: has_liberty = True return has_liberty, chain def _is_suicide(self, current_board, color, vertex): - 
current_board[self.game._flatten(vertex)] = color # assume that we already take this move + current_board[self._flatten(vertex)] = color # assume that we already take this move suicide = False has_liberty, group = self._find_group(current_board, vertex) if not has_liberty: suicide = True # no liberty, suicide for n in self._neighbor(vertex): - if current_board[self.game._flatten(n)] == utils.another_color(color): + if current_board[self._flatten(n)] == utils.another_color(color): opponent_liberty, group = self._find_group(current_board, n) if not opponent_liberty: suicide = False # this move is able to take opponent's stone, not suicide - current_board[self.game._flatten(vertex)] = utils.EMPTY # undo this move + current_board[self._flatten(vertex)] = utils.EMPTY # undo this move return suicide def _process_board(self, current_board, color, vertex): nei = self._neighbor(vertex) for n in nei: - if current_board[self.game._flatten(n)] == utils.another_color(color): + if current_board[self._flatten(n)] == utils.another_color(color): has_liberty, group = self._find_group(current_board, n) if not has_liberty: for b in group: - current_board[self.game._flatten(b)] = utils.EMPTY + current_board[self._flatten(b)] = utils.EMPTY def _check_global_isomorphous(self, history_boards, current_board, color, vertex): repeat = False next_board = copy.copy(current_board) - next_board[self.game._flatten(vertex)] = color + next_board[self._flatten(vertex)] = color self._process_board(next_board, color, vertex) if next_board in history_boards: repeat = True @@ -98,7 +108,7 @@ class Go: def _is_eye(self, current_board, color, vertex): nei = self._neighbor(vertex) cor = self._corner(vertex) - ncolor = {color == current_board[self.game._flatten(n)] for n in nei} + ncolor = {color == current_board[self._flatten(n)] for n in nei} if False in ncolor: # print "not all neighbors are in same color with us" return False @@ -107,7 +117,7 @@ class Go: # print "all neighbors are in same group and 
same color with us" return True else: - opponent_number = [current_board[self.game._flatten(c)] for c in cor].count(-color) + opponent_number = [current_board[self._flatten(c)] for c in cor].count(-color) opponent_propotion = float(opponent_number) / float(len(cor)) if opponent_propotion < 0.5: # print "few opponents, real eye" @@ -131,20 +141,20 @@ class Go: board = copy.deepcopy(current_board) empty_idx = [i for i, x in enumerate(board) if x == utils.EMPTY] # find all empty idx for idx in empty_idx: - neighbor_idx = self._neighbor(self.game.deflatten(idx)) + neighbor_idx = self._neighbor(self.deflatten(idx)) if len(neighbor_idx) > 1: first_idx = neighbor_idx[0] for other_idx in neighbor_idx[1:]: - if self.game.board[self.game.flatten(other_idx)] != self.game.board[self.game.flatten(first_idx)]: + if board[self.flatten(other_idx)] != board[self.flatten(first_idx)]: return False return True def _action2vertex(self, action): - if action == self.game.size ** 2: + if action == self.size ** 2: vertex = (0, 0) else: - vertex = self.game._deflatten(action) + vertex = self._deflatten(action) return vertex def _is_valid(self, history_boards, current_board, color, vertex): @@ -153,7 +163,7 @@ class Go: return False ### already have stone - if not current_board[self.game._flatten(vertex)] == utils.EMPTY: + if not current_board[self._flatten(vertex)] == utils.EMPTY: return False ### check if it is suicide @@ -195,7 +205,7 @@ class Go: if vertex == utils.PASS: return board else: - id_ = self.game._flatten(vertex) + id_ = self._flatten(vertex) board[id_] = color return board @@ -208,21 +218,21 @@ class Go: new_color = -color return [history_boards, new_color], 0 - def executor_do_move(self, color, vertex): - if not self._is_valid(self.game.history, self.game.board, color, vertex): + def executor_do_move(self, history, latest_boards, current_board, color, vertex): + if not self._is_valid(history, current_board, color, vertex): return False - 
self.game.board[self.game._flatten(vertex)] = color - self._process_board(self.game.board, color, vertex) - self.game.history.append(copy.copy(self.game.board)) - self.game.latest_boards.append(copy.copy(self.game.board)) + current_board[self._flatten(vertex)] = color + self._process_board(current_board, color, vertex) + history.append(copy.copy(current_board)) + latest_boards.append(copy.copy(current_board)) return True - def _find_empty(self): - idx = [i for i,x in enumerate(self.game.board) if x == utils.EMPTY ][0] - return self.game._deflatten(idx) + def _find_empty(self, current_board): + idx = [i for i,x in enumerate(current_board) if x == utils.EMPTY ][0] + return self._deflatten(idx) - def _find_boarder(self, vertex): - _, group = self._find_group(self.game.board, vertex) + def _find_boarder(self, current_board, vertex): + _, group = self._find_group(current_board, vertex) border = [] for b in group: for n in self._neighbor(b): @@ -248,7 +258,7 @@ class Go: start_vertex_x += x_diff start_vertex_y += y_diff - def _predict_from_nearby(self, vertex, neighbor_step=3): + def _predict_from_nearby(self, current_board, vertex, neighbor_step=3): ''' step: the nearby 3 steps is considered :vertex: position to be estimated @@ -264,38 +274,37 @@ class Go: self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] - step, -1, 1, neighbor_step) color_estimate = 0 for neighbor_vertex in neighbor_vertex_set: - color_estimate += self.game.board[self.game._flatten(neighbor_vertex)] + color_estimate += current_board[self._flatten(neighbor_vertex)] if color_estimate > 0: return utils.BLACK elif color_estimate < 0: return utils.WHITE - def executor_get_score(self, is_unknown_estimation=False): + def executor_get_score(self, current_board, is_unknown_estimation=False): ''' is_unknown_estimation: whether use nearby stone to predict the unknown return score from BLACK perspective. 
''' - _board = copy.copy(self.game.board) - while utils.EMPTY in self.game.board: - vertex = self._find_empty() - boarder = self._find_boarder(vertex) - boarder_color = set(map(lambda v: self.game.board[self.game._flatten(v)], boarder)) + _board = copy.deepcopy(current_board) + while utils.EMPTY in _board: + vertex = self._find_empty(_board) + boarder = self._find_boarder(_board, vertex) + boarder_color = set(map(lambda v: _board[self._flatten(v)], boarder)) if boarder_color == {utils.BLACK}: - self.game.board[self.game._flatten(vertex)] = utils.BLACK + _board[self._flatten(vertex)] = utils.BLACK elif boarder_color == {utils.WHITE}: - self.game.board[self.game._flatten(vertex)] = utils.WHITE + _board[self._flatten(vertex)] = utils.WHITE elif is_unknown_estimation: - self.game.board[self.game._flatten(vertex)] = self._predict_from_nearby(vertex) + _board[self._flatten(vertex)] = self._predict_from_nearby(_board, vertex) else: - self.game.board[self.game._flatten(vertex)] =utils.UNKNOWN + _board[self._flatten(vertex)] =utils.UNKNOWN score = 0 - for i in self.game.board: + for i in _board: if i == utils.BLACK: score += 1 elif i == utils.WHITE: score -= 1 - score -= self.game.komi + score -= self.komi - self.game.board = _board return score diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 562dd14..e18555f 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -82,7 +82,7 @@ if __name__ == '__main__': evaluate_rounds = 1 game_num = 0 try: - while True: + while game_num < evaluate_rounds: num = 0 pass_flag = [False, False] print("Start game {}".format(game_num)) @@ -132,6 +132,8 @@ if __name__ == '__main__': picklestring = cPickle.dump(data, file) data.reset() game_num += 1 + subprocess.call(["kill", "-9", str(agent_v0.pid)]) + subprocess.call(["kill", "-9", str(agent_v1.pid)]) except KeyboardInterrupt: subprocess.call(["kill", "-9", str(agent_v0.pid)]) subprocess.call(["kill", "-9", str(agent_v1.pid)]) diff --git a/AlphaGo/self-play.py b/AlphaGo/self-play.py index 
63b7e97..4387b24 100644 --- a/AlphaGo/self-play.py +++ b/AlphaGo/self-play.py @@ -79,7 +79,7 @@ while True: prob.append(np.array(game.prob).reshape(-1, game.size ** 2 + 1)) print("Finished") print("\n") - score = game.game_engine.executor_get_score(True) + score = game.game_engine.executor_get_score(game.board, True) if score > 0: winner = utils.BLACK else: From 9ad53de54f0ef28aea0df9de31c9d2c405186d15 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Thu, 21 Dec 2017 23:30:24 +0800 Subject: [PATCH 36/98] implement the training process --- .gitignore | 1 + AlphaGo/game.py | 2 +- AlphaGo/model.py | 106 ++++++++++++++++++++++++++++++++++++++++++----- AlphaGo/play.py | 28 ++++++++----- 4 files changed, 114 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index 36d134c..d697b92 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ checkpoints checkpoints_origin *.json .DS_Store +data diff --git a/AlphaGo/game.py b/AlphaGo/game.py index bf0d084..c342d0c 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -60,7 +60,7 @@ class Game: def think(self, latest_boards, color): mcts = MCTS(self.game_engine, self.evaluator, [latest_boards, color], self.size ** 2 + 1, inverse=True) - mcts.search(max_step=1) + mcts.search(max_step=20) temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) choice = np.random.choice(self.size ** 2 + 1, 1, p=prob).tolist()[0] diff --git a/AlphaGo/model.py b/AlphaGo/model.py index fab864e..41f3a47 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -2,6 +2,7 @@ import os import time import sys import cPickle +from collections import deque import numpy as np import tensorflow as tf @@ -71,6 +72,13 @@ def value_head(input, is_training): return h +class Data(object): + def __init__(self): + self.boards = [] + self.probs = [] + self.winner = 0 + + class ResNet(object): def __init__(self, board_size, action_num, history_length=1, residual_block_num=20, checkpoint_path=None): """ @@ -85,11 +93,18 @@ 
class ResNet(object): self.board_size = board_size self.action_num = action_num self.history_length = history_length + self.checkpoint_path = checkpoint_path self.x = tf.placeholder(tf.float32, shape=[None, self.board_size, self.board_size, 2 * self.history_length + 1]) self.is_training = tf.placeholder(tf.bool, shape=[]) self.z = tf.placeholder(tf.float32, shape=[None, 1]) self.pi = tf.placeholder(tf.float32, shape=[None, self.action_num]) - self._build_network(residual_block_num, checkpoint_path) + self._build_network(residual_block_num, self.checkpoint_path) + + # training hyper-parameters: + self.window_length = 1000 + self.save_freq = 1000 + self.training_data = {'states': deque(maxlen=self.window_length), 'probs': deque(maxlen=self.window_length), + 'winner': deque(maxlen=self.window_length)} def _build_network(self, residual_block_num, checkpoint_path): """ @@ -118,7 +133,7 @@ class ResNet(object): with tf.control_dependencies(self.update_ops): self.train_op = tf.train.AdamOptimizer(1e-4).minimize(self.total_loss) self.var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) - self.saver = tf.train.Saver(max_to_keep=10, var_list=self.var_list) + self.saver = tf.train.Saver(var_list=self.var_list) self.sess = multi_gpu.create_session() self.sess.run(tf.global_variables_initializer()) if checkpoint_path is not None: @@ -166,21 +181,90 @@ class ResNet(object): state[0, :, :, 2 * self.history_length] = np.zeros([self.board_size, self.board_size]) return state - #TODO: design the interface between the environment and training + # TODO: design the interface between the environment and training def train(self, mode='memory', *args, **kwargs): if mode == 'memory': pass if mode == 'file': - self.train_with_file(data_path=kwargs['data_path'], checkpoint_path=kwargs['checkpoint_path']) + self._train_with_file(data_path=kwargs['data_path'], batch_size=kwargs['batch_size'], + checkpoint_path=kwargs['checkpoint_path']) - def train_with_file(self, data_path, 
checkpoint_path): + def _train_with_file(self, data_path, batch_size, checkpoint_path): + # check if the path is valid if not os.path.exists(data_path): raise ValueError("{} doesn't exist".format(data_path)) + self.checkpoint_path = checkpoint_path + if not os.path.exists(self.checkpoint_path): + os.mkdir(self.checkpoint_path) - file_list = os.listdir(data_path) - if file_list <= 50: - time.sleep(1) - else: - file_list.sort(key=lambda file: os.path.getmtime(data_path + file) if not os.path.isdir( - data_path + file) else 0) + new_file_list = [] + all_file_list = [] + training_data = {} + iters = 0 + while True: + new_file_list = list(set(os.listdir(data_path)).difference(all_file_list)) + all_file_list = os.listdir(data_path) + new_file_list.sort( + key=lambda file: os.path.getmtime(data_path + file) if not os.path.isdir(data_path + file) else 0) + if new_file_list: + for file in new_file_list: + states, probs, winner = self._file_to_training_data(data_path + file) + assert states.shape[0] == probs.shape[0] + assert states.shape[0] == winner.shape[0] + self.training_data['states'].append(states) + self.training_data['probs'].append(probs) + self.training_data['winner'].append(winner) + training_data['states'] = np.concatenate(self.training_data['states'], axis=0) + training_data['probs'] = np.concatenate(self.training_data['probs'], axis=0) + training_data['winner'] = np.concatenate(self.training_data['winner'], axis=0) + if len(self.training_data['states']) != self.window_length: + continue + else: + data_num = training_data['states'].shape[0] + index = np.arange(data_num) + np.random.shuffle(index) + start_time = time.time() + value_loss, policy_loss, reg, _ = self.sess.run( + [self.value_loss, self.policy_loss, self.reg, self.train_op], + feed_dict={self.x: training_data['states'][index[:batch_size]], + self.z: training_data['winner'][index[:batch_size]], + self.pi: training_data['probs'][index[:batch_size]], + self.is_training: True}) + print("Iteration: {}, 
Time: {}, Value Loss: {}, Policy Loss: {}, Reg: {}".format(iters, + time.time() - start_time, + value_loss, + policy_loss, reg)) + iters += 1 + if iters % self.save_freq == 0: + save_path = "Iteration{}.ckpt".format(iters) + self.saver.save(self.sess, self.checkpoint_path + save_path) + + def _file_to_training_data(self, file_name): + with open(file_name, 'r') as file: + data = cPickle.load(file) + history = deque(maxlen=self.history_length) + states = [] + probs = [] + winner = [] + for _ in range(self.history_length): + # Note that 0 is specified, need a more general way like config + history.append([0] * self.board_size ** 2) + # Still, +1 is specified + color = +1 + + for [board, prob] in zip(data.boards, data.probs): + history.append(board) + states.append(self._history2state(history, color)) + probs.append(np.array(prob).reshape(1, self.board_size ** 2 + 1)) + winner.append(np.array(data.winner).reshape(1, 1)) + color *= -1 + states = np.concatenate(states, axis=0) + probs = np.concatenate(probs, axis=0) + winner = np.concatenate(winner, axis=0) + return states, probs, winner + + +if __name__=="__main__": + model = ResNet(board_size=9, action_num=82) + model.train("file", data_path="./data/", batch_size=128, checkpoint_path="./checkpoint/") \ No newline at end of file diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 562dd14..bd3776e 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -76,6 +76,7 @@ if __name__ == '__main__': color = ['b', 'w'] pattern = "[A-Z]{1}[0-9]{1}" + space = re.compile("\s+") size = 9 show = ['.', 'X', 'O'] @@ -83,12 +84,20 @@ if __name__ == '__main__': game_num = 0 try: while True: + start_time = time.time() num = 0 pass_flag = [False, False] print("Start game {}".format(game_num)) # end the game if both palyer chose to pass, or play too much turns while not (pass_flag[0] and pass_flag[1]) and num < size ** 2 * 2: turn = num % 2 + board = player[turn].run_cmd(str(num) + ' show_board') + board = 
eval(board[board.index('['):board.index(']') + 1]) + for i in range(size): + for j in range(size): + print show[board[i * size + j]] + " ", + print "\n", + data.boards.append(board) move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn] + '\n') print role[turn] + " : " + str(move), num += 1 @@ -102,21 +111,18 @@ if __name__ == '__main__': play_or_pass = ' PASS' pass_flag[turn] = True result = player[1 - turn].run_cmd(str(num) + ' play ' + color[turn] + ' ' + play_or_pass + '\n') - board = player[turn].run_cmd(str(num) + ' show_board') - board = eval(board[board.index('['):board.index(']') + 1]) - for i in range(size): - for j in range(size): - print show[board[i * size + j]] + " ", - print "\n", - data.boards.append(board) prob = player[turn].run_cmd(str(num) + ' get_prob') + prob = space.sub(',', prob[prob.index('['):prob.index(']') + 1]) + prob = prob.replace('[,', '[') + prob = prob.replace('],', ']') + prob = eval(prob) data.probs.append(prob) score = player[turn].run_cmd(str(num) + ' get_score') print "Finished : ", score.split(" ")[1] # TODO: generalize the player - if score > 0: + if eval(score.split(" ")[1]) > 0: data.winner = 1 - if score < 0: + if eval(score.split(" ")[1]) < 0: data.winner = -1 player[0].run_cmd(str(num) + ' clear_board') player[1].run_cmd(str(num) + ' clear_board') @@ -127,12 +133,12 @@ if __name__ == '__main__': file_list.sort(key=lambda file: os.path.getmtime(args.result_path + file) if not os.path.isdir( args.result_path + file) else 0) data_num = eval(file_list[-1][:-4]) + 1 - print(file_list) with open("./data/" + str(data_num) + ".pkl", "w") as file: picklestring = cPickle.dump(data, file) data.reset() game_num += 1 - except KeyboardInterrupt: + print("Time {}".format(time.time()-start_time)) + except Exception: subprocess.call(["kill", "-9", str(agent_v0.pid)]) subprocess.call(["kill", "-9", str(agent_v1.pid)]) print "Kill all player, finish all game." 
From 43f6527d8e4ebaec6b9c001361db689090127e87 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Thu, 21 Dec 2017 23:55:31 +0800 Subject: [PATCH 37/98] modify for multi instance --- AlphaGo/play.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 35549dd..a9d3d20 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -28,6 +28,7 @@ if __name__ == '__main__': parser.add_argument("--result_path", type=str, default="./data/") parser.add_argument("--black_weight_path", type=str, default=None) parser.add_argument("--white_weight_path", type=str, default=None) + parser.add_argument("--id", type=int, default=0) args = parser.parse_args() if not os.path.exists(args.result_path): @@ -50,12 +51,15 @@ if __name__ == '__main__': time.sleep(1) # start two different player with different network weights. + black_role_name = 'black' + str(args.id) + white_role_name = 'white' + str(args.id) + agent_v0 = subprocess.Popen( - ['python', '-u', 'player.py', '--role=black', '--checkpoint_path=' + str(args.black_weight_path)], + ['python', '-u', 'player.py', '--role=' + black_role_name, '--checkpoint_path=' + str(args.black_weight_path)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) agent_v1 = subprocess.Popen( - ['python', '-u', 'player.py', '--role=white', '--checkpoint_path=' + str(args.white_weight_path)], + ['python', '-u', 'player.py', '--role=' + white_role_name, '--checkpoint_path=' + str(args.white_weight_path)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) server_list = "" @@ -69,8 +73,8 @@ if __name__ == '__main__': data = Data() player = [None] * 2 - player[0] = Pyro4.Proxy("PYRONAME:black") - player[1] = Pyro4.Proxy("PYRONAME:white") + player[0] = Pyro4.Proxy("PYRONAME:" + black_role_name) + player[1] = Pyro4.Proxy("PYRONAME:" + white_role_name) role = ["BLACK", "WHITE"] color = ['b', 'w'] From 6835ec62e14c63703a46a4adb8df677d6a14a0b3 
Mon Sep 17 00:00:00 2001 From: rtz19970824 Date: Fri, 22 Dec 2017 00:04:51 +0800 Subject: [PATCH 38/98] multi-instance support --- AlphaGo/play.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/AlphaGo/play.py b/AlphaGo/play.py index a9d3d20..a8267a7 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -41,14 +41,14 @@ if __name__ == '__main__': raise ValueError("Can't not find the network weights for white player.") # kill the old server - kill_old_server = subprocess.Popen(['killall', 'pyro4-ns']) - print "kill the old pyro4 name server, the return code is : " + str(kill_old_server.wait()) - time.sleep(1) + # kill_old_server = subprocess.Popen(['killall', 'pyro4-ns']) + # print "kill the old pyro4 name server, the return code is : " + str(kill_old_server.wait()) + # time.sleep(1) # start a name server to find the remote object - start_new_server = subprocess.Popen(['pyro4-ns', '&']) - print "Start Name Sever : " + str(start_new_server.pid) # + str(start_new_server.wait()) - time.sleep(1) + # start_new_server = subprocess.Popen(['pyro4-ns', '&']) + # print "Start Name Sever : " + str(start_new_server.pid) # + str(start_new_server.wait()) + # time.sleep(1) # start two different player with different network weights. black_role_name = 'black' + str(args.id) @@ -63,7 +63,7 @@ if __name__ == '__main__': stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) server_list = "" - while ("black" not in server_list) or ("white" not in server_list): + while (black_role_name not in server_list) or (white_role_name not in server_list): server_list = subprocess.check_output(['pyro4-nsc', 'list']) print "Waiting for the server start..." time.sleep(1) @@ -142,11 +142,12 @@ if __name__ == '__main__': data.reset() game_num += 1 - except Exception: + except Exception as e: + print(e) subprocess.call(["kill", "-9", str(agent_v0.pid)]) subprocess.call(["kill", "-9", str(agent_v1.pid)]) print "Kill all player, finish all game." 
subprocess.call(["kill", "-9", str(agent_v0.pid)]) subprocess.call(["kill", "-9", str(agent_v1.pid)]) - print "Kill all player, finish all game." \ No newline at end of file + print "Kill all player, finish all game." From 1cc5063007925ceada46974f21aaf03a2361deee Mon Sep 17 00:00:00 2001 From: Haosheng Zou Date: Fri, 22 Dec 2017 00:22:23 +0800 Subject: [PATCH 39/98] add value_function (critic). value_function and policy not finished yet. --- tianshou/core/policy/base.py | 2 +- tianshou/core/policy/dqn.py | 11 ++++ tianshou/core/value_function/__init__.py | 0 tianshou/core/value_function/action_value.py | 53 ++++++++++++++++++++ tianshou/core/value_function/base.py | 23 +++++++++ tianshou/core/value_function/state_value.py | 23 +++++++++ 6 files changed, 111 insertions(+), 1 deletion(-) create mode 100644 tianshou/core/value_function/__init__.py create mode 100644 tianshou/core/value_function/action_value.py create mode 100644 tianshou/core/value_function/base.py create mode 100644 tianshou/core/value_function/state_value.py diff --git a/tianshou/core/policy/base.py b/tianshou/core/policy/base.py index eecfc4f..025abd5 100644 --- a/tianshou/core/policy/base.py +++ b/tianshou/core/policy/base.py @@ -15,7 +15,7 @@ __all__ = [ 'QValuePolicy', ] -# TODO: separate actor and critic, we should focus on it once we finish the basic module. 
+# TODO: a even more "base" class for policy class QValuePolicy(object): diff --git a/tianshou/core/policy/dqn.py b/tianshou/core/policy/dqn.py index 39f6a16..d03dbd4 100644 --- a/tianshou/core/policy/dqn.py +++ b/tianshou/core/policy/dqn.py @@ -1,5 +1,16 @@ from tianshou.core.policy.base import QValuePolicy import tensorflow as tf +import sys +sys.path.append('..') +import value_function.action_value as value_func + + +class DQN_refactor(object): + """ + use DQN from value_function as a member + """ + def __init__(self, value_tensor, observation_placeholder, action_placeholder): + self._network = value_func.DQN(value_tensor, observation_placeholder, action_placeholder) class DQN(QValuePolicy): diff --git a/tianshou/core/value_function/__init__.py b/tianshou/core/value_function/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tianshou/core/value_function/action_value.py b/tianshou/core/value_function/action_value.py new file mode 100644 index 0000000..cb8acc8 --- /dev/null +++ b/tianshou/core/value_function/action_value.py @@ -0,0 +1,53 @@ +from base import ValueFunctionBase +import tensorflow as tf + + +class ActionValue(ValueFunctionBase): + """ + class of action values Q(s, a). + """ + def __init__(self, value_tensor, observation_placeholder, action_placeholder): + self._action_placeholder = action_placeholder + super(ActionValue, self).__init__( + value_tensor=value_tensor, + observation_placeholder=observation_placeholder + ) + + def get_value(self, observation, action): + """ + + :param observation: numpy array of observations, of shape (batchsize, observation_dim). + :param action: numpy array of actions, of shape (batchsize, action_dim) + # TODO: Atari discrete action should have dim 1. 
Super Mario may should have, say, dim 5, where each can be 0/1 + :return: numpy array of state values, of shape (batchsize, ) + # TODO: dealing with the last dim of 1 in V(s) and Q(s, a) + """ + sess = tf.get_default_session() + return sess.run(self.get_value_tensor(), feed_dict= + {self._observation_placeholder: observation, self._action_placeholder:action})[:, 0] + + +class DQN(ActionValue): + """ + class of the very DQN architecture. Instead of feeding s and a to the network to get a value, DQN feed s to the + network and the last layer is Q(s, *) for all actions + """ + def __init__(self, value_tensor, observation_placeholder, action_placeholder): + """ + :param value_tensor: of shape (batchsize, num_actions) + :param observation_placeholder: of shape (batchsize, observation_dim) + :param action_placeholder: of shape (batchsize, ) + """ + self._value_tensor_all_actions = value_tensor + canonical_value_tensor = value_tensor[action_placeholder] # maybe a tf.map_fn. for now it's wrong + + super(DQN, self).__init__(value_tensor=canonical_value_tensor, + observation_placeholder=observation_placeholder, + action_placeholder=action_placeholder) + + def get_value_all_actions(self, observation): + sess = tf.get_default_session() + return sess.run(self._value_tensor_all_actions, feed_dict={self._observation_placeholder: observation}) + + def get_value_tensor_all_actions(self): + return self._value_tensor_all_actions \ No newline at end of file diff --git a/tianshou/core/value_function/base.py b/tianshou/core/value_function/base.py new file mode 100644 index 0000000..0b27759 --- /dev/null +++ b/tianshou/core/value_function/base.py @@ -0,0 +1,23 @@ + +# TODO: linear feature baseline also in tf? +class ValueFunctionBase(object): + """ + base class of value functions. 
Children include state values V(s) and action values Q(s, a) + """ + def __init__(self, value_tensor, observation_placeholder): + self._observation_placeholder = observation_placeholder + self._value_tensor = value_tensor + + def get_value(self, **kwargs): + """ + + :return: batch of corresponding values in numpy array + """ + raise NotImplementedError() + + def get_value_tensor(self): + """ + + :return: tensor of the corresponding values + """ + return self._value_tensor diff --git a/tianshou/core/value_function/state_value.py b/tianshou/core/value_function/state_value.py new file mode 100644 index 0000000..04fe442 --- /dev/null +++ b/tianshou/core/value_function/state_value.py @@ -0,0 +1,23 @@ +from base import ValueFunctionBase +import tensorflow as tf + + +class StateValue(ValueFunctionBase): + """ + class of state values V(s). + """ + def __init__(self, value_tensor, observation_placeholder): + super(StateValue, self).__init__( + value_tensor=value_tensor, + observation_placeholder=observation_placeholder + ) + + def get_value(self, observation): + """ + + :param observation: numpy array of observations, of shape (batchsize, observation_dim). 
+ :return: numpy array of state values, of shape (batchsize, ) + # TODO: dealing with the last dim of 1 in V(s) and Q(s, a) + """ + sess = tf.get_default_session() + return sess.run(self.get_value_tensor(), feed_dict={self._observation_placeholder: observation})[:, 0] \ No newline at end of file From 5c29dad26367ba76c1fbe4a19213c0bf9ae7391e Mon Sep 17 00:00:00 2001 From: JialianLee Date: Fri, 22 Dec 2017 01:57:48 +0800 Subject: [PATCH 40/98] An initial version for Reversi --- AlphaGo/reversi.py | 252 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 252 insertions(+) create mode 100644 AlphaGo/reversi.py diff --git a/AlphaGo/reversi.py b/AlphaGo/reversi.py new file mode 100644 index 0000000..49d0e9a --- /dev/null +++ b/AlphaGo/reversi.py @@ -0,0 +1,252 @@ +from __future__ import print_function +import numpy as np + +''' +Settings of the Go game. + +(1, 1) is considered as the upper left corner of the board, +(size, 1) is the lower left +''' + + +def find_correct_moves(own, enemy): + """return legal moves""" + left_right_mask = 0x7e7e7e7e7e7e7e7e # Both most left-right edge are 0, else 1 + top_bottom_mask = 0x00ffffffffffff00 # Both most top-bottom edge are 0, else 1 + mask = left_right_mask & top_bottom_mask + mobility = 0 + mobility |= search_offset_left(own, enemy, left_right_mask, 1) # Left + mobility |= search_offset_left(own, enemy, mask, 9) # Left Top + mobility |= search_offset_left(own, enemy, top_bottom_mask, 8) # Top + mobility |= search_offset_left(own, enemy, mask, 7) # Top Right + mobility |= search_offset_right(own, enemy, left_right_mask, 1) # Right + mobility |= search_offset_right(own, enemy, mask, 9) # Bottom Right + mobility |= search_offset_right(own, enemy, top_bottom_mask, 8) # Bottom + mobility |= search_offset_right(own, enemy, mask, 7) # Left bottom + return mobility + + +def calc_flip(pos, own, enemy): + """return flip stones of enemy by bitboard when I place stone at pos. 
+ + :param pos: 0~63 + :param own: bitboard (0=top left, 63=bottom right) + :param enemy: bitboard + :return: flip stones of enemy when I place stone at pos. + """ + assert 0 <= pos <= 63, f"pos={pos}" + f1 = _calc_flip_half(pos, own, enemy) + f2 = _calc_flip_half(63 - pos, rotate180(own), rotate180(enemy)) + return f1 | rotate180(f2) + + +def _calc_flip_half(pos, own, enemy): + el = [enemy, enemy & 0x7e7e7e7e7e7e7e7e, enemy & 0x7e7e7e7e7e7e7e7e, enemy & 0x7e7e7e7e7e7e7e7e] + masks = [0x0101010101010100, 0x00000000000000fe, 0x0002040810204080, 0x8040201008040200] + masks = [b64(m << pos) for m in masks] + flipped = 0 + for e, mask in zip(el, masks): + outflank = mask & ((e | ~mask) + 1) & own + flipped |= (outflank - (outflank != 0)) & mask + return flipped + + +def search_offset_left(own, enemy, mask, offset): + e = enemy & mask + blank = ~(own | enemy) + t = e & (own >> offset) + t |= e & (t >> offset) + t |= e & (t >> offset) + t |= e & (t >> offset) + t |= e & (t >> offset) + t |= e & (t >> offset) # Up to six stones can be turned at once + return blank & (t >> offset) # Only the blank squares can be started + + +def search_offset_right(own, enemy, mask, offset): + e = enemy & mask + blank = ~(own | enemy) + t = e & (own << offset) + t |= e & (t << offset) + t |= e & (t << offset) + t |= e & (t << offset) + t |= e & (t << offset) + t |= e & (t << offset) # Up to six stones can be turned at once + return blank & (t << offset) # Only the blank squares can be started + + +def flip_vertical(x): + k1 = 0x00FF00FF00FF00FF + k2 = 0x0000FFFF0000FFFF + x = ((x >> 8) & k1) | ((x & k1) << 8) + x = ((x >> 16) & k2) | ((x & k2) << 16) + x = (x >> 32) | b64(x << 32) + return x + + +def b64(x): + return x & 0xFFFFFFFFFFFFFFFF + + +def bit_count(x): + return bin(x).count('1') + + +def bit_to_array(x, size): + """bit_to_array(0b0010, 4) -> array([0, 1, 0, 0])""" + return np.array(list(reversed((("0" * size) + bin(x)[2:])[-size:])), dtype=np.uint8) + + +def flip_diag_a1h8(x): + 
k1 = 0x5500550055005500 + k2 = 0x3333000033330000 + k4 = 0x0f0f0f0f00000000 + t = k4 & (x ^ b64(x << 28)) + x ^= t ^ (t >> 28) + t = k2 & (x ^ b64(x << 14)) + x ^= t ^ (t >> 14) + t = k1 & (x ^ b64(x << 7)) + x ^= t ^ (t >> 7) + return x + + +def rotate90(x): + return flip_diag_a1h8(flip_vertical(x)) + + +def rotate180(x): + return rotate90(rotate90(x)) + + +class Reversi: + def __init__(self, black=None, white=None): + self.black = black or (0b00001000 << 24 | 0b00010000 << 32) + self.white = white or (0b00010000 << 24 | 0b00001000 << 32) + self.board = None # 8 * 8 board with 1 for black, -1 for white and 0 for blank + self.color = None # 1 for black and -1 for white + self.action = None # number in 0~63 + self.winner = None + + def simulate_is_valid(self, board, color): + self.board = board + self.color = color + self.board2bitboard() + own, enemy = self.get_own_and_enemy() + mobility = find_correct_moves(own, enemy) + valid_moves = bit_to_array(mobility, 64) + valid_moves = list(np.reshape(valid_moves, len(valid_moves))) + return valid_moves + + def simulate_step_forward(self, board, color, vertex): + self.board = board + self.color = color + self.board2bitboard() + self.vertex2action(vertex) + step_forward = self.step() + if step_forward: + new_board = self.bitboard2board() + return new_board + + def executor_do_move(self, board, color, vertex): + self.board = board + self.color = color + self.board2bitboard() + self.vertex2action(vertex) + step_forward = self.step() + if step_forward: + new_board = self.bitboard2board() + return new_board + + def executor_get_score(self, board): + self.board = board + self._game_over() + if self.winner is not None: + return self.winner, 0 - self.winner + else: + ValueError("Game not finished!") + + def board2bitboard(self): + count = 1 + if self.board is None: + ValueError("None board!") + self.black = 0 + self.white = 0 + for i in range(64): + if self.board[i] == 1: + self.black |= count + elif self.board[i] == -1: + 
self.white |= count + count *= 2 + + def vertex2action(self, vertex): + x, y = vertex + if x == 0 and y == 0: + self.action = None + else: + self.action = 8 * (x - 1) + y - 1 + + def bitboard2board(self): + board = [] + black = bit_to_array(self.black, 64) + white = bit_to_array(self.white, 64) + for i in range(64): + if black[i]: + board.append(1) + elif white[i]: + board.append(-1) + else: + board.append(0) + return board + + def step(self): + if self.action < 0 or self.action > 63: + ValueError("Wrong action!") + if self.action is None: + return False + + own, enemy = self.get_own_and_enemy() + + flipped = calc_flip(self.action, own, enemy) + if bit_count(flipped) == 0: + self.illegal_move_to_lose(self.action) + return False + own ^= flipped + own |= 1 << self.action + enemy ^= flipped + + self.set_own_and_enemy(own, enemy) + return True + + def _game_over(self): + # self.done = True + if self.winner is None: + black_num, white_num = self.number_of_black_and_white + if black_num > white_num: + self.winner = 1 + elif black_num < white_num: + self.winner = -1 + else: + self.winner = 0 + + def illegal_move_to_lose(self, action): + logger.warning(f"Illegal action={action}, No Flipped!") + self._game_over() + + def get_own_and_enemy(self): + if self.color == 1: + own, enemy = self.black, self.white + elif self.color == -1: + own, enemy = self.white, self.black + else: + own, enemy = None, None + return own, enemy + + def set_own_and_enemy(self, own, enemy): + if self.color == 1: + self.black, self.white = own, enemy + else: + self.white, self.black = own, enemy + + @property + def number_of_black_and_white(self): + return bit_count(self.black), bit_count(self.white) From 2b1285143c232bc4006f47eabb498b99baf59785 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Fri, 22 Dec 2017 13:04:02 +0800 Subject: [PATCH 41/98] debug the training process, initialize a nameserver if no nameserver exists --- AlphaGo/model.py | 15 ++++++++++++--- AlphaGo/play.py | 
10 ++++++++-- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/AlphaGo/model.py b/AlphaGo/model.py index 41f3a47..541de81 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -203,7 +203,8 @@ class ResNet(object): iters = 0 while True: new_file_list = list(set(os.listdir(data_path)).difference(all_file_list)) - all_file_list = os.listdir(data_path) + if new_file_list: + all_file_list = os.listdir(data_path) new_file_list.sort( key=lambda file: os.path.getmtime(data_path + file) if not os.path.isdir(data_path + file) else 0) if new_file_list: @@ -241,8 +242,16 @@ class ResNet(object): self.saver.save(self.sess, self.checkpoint_path + save_path) def _file_to_training_data(self, file_name): - with open(file_name, 'r') as file: - data = cPickle.load(file) + read = False + with open(file_name, 'rb') as file: + while not read: + try: + file.seek(0) + data = cPickle.load(file) + read = True + except Exception as e: + print(e) + time.sleep(1) history = deque(maxlen=self.history_length) states = [] probs = [] diff --git a/AlphaGo/play.py b/AlphaGo/play.py index a8267a7..3681430 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -50,6 +50,12 @@ if __name__ == '__main__': # print "Start Name Sever : " + str(start_new_server.pid) # + str(start_new_server.wait()) # time.sleep(1) + # start a name server if no name server exists + if len(os.popen('ps aux | grep pyro4-ns | grep -v grep').readlines()) == 0: + start_new_server = subprocess.Popen(['pyro4-ns', '&']) + print "Start Name Sever : " + str(start_new_server.pid) # + str(start_new_server.wait()) + time.sleep(1) + # start two different player with different network weights. 
black_role_name = 'black' + str(args.id) white_role_name = 'white' + str(args.id) @@ -137,13 +143,13 @@ if __name__ == '__main__': file_list.sort(key=lambda file: os.path.getmtime(args.result_path + file) if not os.path.isdir( args.result_path + file) else 0) data_num = eval(file_list[-1][:-4]) + 1 - with open("./data/" + str(data_num) + ".pkl", "w") as file: + with open("./data/" + str(data_num) + ".pkl", "wb") as file: picklestring = cPickle.dump(data, file) data.reset() game_num += 1 except Exception as e: - print(e) + print(e) subprocess.call(["kill", "-9", str(agent_v0.pid)]) subprocess.call(["kill", "-9", str(agent_v1.pid)]) print "Kill all player, finish all game." From d281ecc6e082027e7f67341a0abf1c18dbacbae8 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Fri, 22 Dec 2017 13:05:01 +0800 Subject: [PATCH 42/98] no restrict on saving checkpoints --- AlphaGo/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AlphaGo/model.py b/AlphaGo/model.py index 541de81..5629128 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -133,7 +133,7 @@ class ResNet(object): with tf.control_dependencies(self.update_ops): self.train_op = tf.train.AdamOptimizer(1e-4).minimize(self.total_loss) self.var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) - self.saver = tf.train.Saver(var_list=self.var_list) + self.saver = tf.train.Saver(max_to_keep=0, var_list=self.var_list) self.sess = multi_gpu.create_session() self.sess.run(tf.global_variables_initializer()) if checkpoint_path is not None: From 6b3efd7fca0f4e2eb7ac4e63524a30976efc4361 Mon Sep 17 00:00:00 2001 From: rtz19970824 Date: Fri, 22 Dec 2017 13:30:48 +0800 Subject: [PATCH 43/98] modify the training config --- AlphaGo/model.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/AlphaGo/model.py b/AlphaGo/model.py index 41f3a47..0d885ef 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -101,8 +101,8 @@ class ResNet(object): 
self._build_network(residual_block_num, self.checkpoint_path) # training hyper-parameters: - self.window_length = 1000 - self.save_freq = 1000 + self.window_length = 7000 + self.save_freq = 5000 self.training_data = {'states': deque(maxlen=self.window_length), 'probs': deque(maxlen=self.window_length), 'winner': deque(maxlen=self.window_length)} @@ -241,6 +241,7 @@ class ResNet(object): self.saver.save(self.sess, self.checkpoint_path + save_path) def _file_to_training_data(self, file_name): + print(file_name) with open(file_name, 'r') as file: data = cPickle.load(file) history = deque(maxlen=self.history_length) @@ -267,4 +268,4 @@ class ResNet(object): if __name__=="__main__": model = ResNet(board_size=9, action_num=82) - model.train("file", data_path="./data/", batch_size=128, checkpoint_path="./checkpoint/") \ No newline at end of file + model.train("file", data_path="./data/", batch_size=128, checkpoint_path="./checkpoint/") From a8509ba2921795002bd88942bf58523aba80de99 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Fri, 22 Dec 2017 13:42:53 +0800 Subject: [PATCH 44/98] faster the loading --- AlphaGo/model.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/AlphaGo/model.py b/AlphaGo/model.py index 5629128..c4338c8 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -215,9 +215,10 @@ class ResNet(object): self.training_data['states'].append(states) self.training_data['probs'].append(probs) self.training_data['winner'].append(winner) - training_data['states'] = np.concatenate(self.training_data['states'], axis=0) - training_data['probs'] = np.concatenate(self.training_data['probs'], axis=0) - training_data['winner'] = np.concatenate(self.training_data['winner'], axis=0) + if len(self.training_data['states']) == self.window_length: + training_data['states'] = np.concatenate(self.training_data['states'], axis=0) + training_data['probs'] = np.concatenate(self.training_data['probs'], axis=0) + training_data['winner'] 
= np.concatenate(self.training_data['winner'], axis=0) if len(self.training_data['states']) != self.window_length: continue From 8328153b86871f36953605ebd89e17c001b3f537 Mon Sep 17 00:00:00 2001 From: rtz19970824 Date: Fri, 22 Dec 2017 13:47:27 +0800 Subject: [PATCH 45/98] print in the loading process --- AlphaGo/model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/AlphaGo/model.py b/AlphaGo/model.py index 15fc3da..e8b5eb9 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -249,6 +249,7 @@ class ResNet(object): file.seek(0) data = cPickle.load(file) read = True + print("{} Loaded".format(file_name)) except Exception as e: print(e) time.sleep(1) From 511f64b3d6ada98d4fe0e04215eea93d690f56a4 Mon Sep 17 00:00:00 2001 From: JialianLee Date: Fri, 22 Dec 2017 15:26:47 +0800 Subject: [PATCH 46/98] Modification for reversi --- AlphaGo/reversi.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/AlphaGo/reversi.py b/AlphaGo/reversi.py index 49d0e9a..cba91d9 100644 --- a/AlphaGo/reversi.py +++ b/AlphaGo/reversi.py @@ -34,7 +34,6 @@ def calc_flip(pos, own, enemy): :param enemy: bitboard :return: flip stones of enemy when I place stone at pos. 
""" - assert 0 <= pos <= 63, f"pos={pos}" f1 = _calc_flip_half(pos, own, enemy) f2 = _calc_flip_half(63 - pos, rotate180(own), rotate180(enemy)) return f1 | rotate180(f2) @@ -125,7 +124,14 @@ class Reversi: self.board = None # 8 * 8 board with 1 for black, -1 for white and 0 for blank self.color = None # 1 for black and -1 for white self.action = None # number in 0~63 - self.winner = None + # self.winner = None + self.black_win = None + + def get_board(self, black=None, white=None): + self.black = black or (0b00001000 << 24 | 0b00010000 << 32) + self.white = white or (0b00010000 << 24 | 0b00001000 << 32) + self.board = self.bitboard2board() + return self.board def simulate_is_valid(self, board, color): self.board = board @@ -134,18 +140,19 @@ class Reversi: own, enemy = self.get_own_and_enemy() mobility = find_correct_moves(own, enemy) valid_moves = bit_to_array(mobility, 64) + valid_moves = np.argwhere(valid_moves) valid_moves = list(np.reshape(valid_moves, len(valid_moves))) return valid_moves - def simulate_step_forward(self, board, color, vertex): - self.board = board - self.color = color + def simulate_step_forward(self, state, vertex): + self.board = state[0] + self.color = state[1] self.board2bitboard() self.vertex2action(vertex) step_forward = self.step() if step_forward: new_board = self.bitboard2board() - return new_board + return [new_board, 0 - self.color], 0 def executor_do_move(self, board, color, vertex): self.board = board @@ -155,13 +162,14 @@ class Reversi: step_forward = self.step() if step_forward: new_board = self.bitboard2board() - return new_board + for i in range(64): + board[i] = new_board[i] def executor_get_score(self, board): self.board = board self._game_over() - if self.winner is not None: - return self.winner, 0 - self.winner + if self.black_win is not None: + return self.black_win else: ValueError("Game not finished!") @@ -219,6 +227,7 @@ class Reversi: def _game_over(self): # self.done = True + ''' if self.winner is None: black_num, 
white_num = self.number_of_black_and_white if black_num > white_num: @@ -227,9 +236,12 @@ class Reversi: self.winner = -1 else: self.winner = 0 + ''' + if self.black_win is None: + black_num, white_num = self.number_of_black_and_white + self.black_win = black_num - white_num def illegal_move_to_lose(self, action): - logger.warning(f"Illegal action={action}, No Flipped!") self._game_over() def get_own_and_enemy(self): From c5e33af84173b4c5165e4a51600232daa1485cff Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Fri, 22 Dec 2017 15:44:44 +0800 Subject: [PATCH 47/98] move the unit test of is_eye into go.py --- AlphaGo/go.py | 39 +++++++ AlphaGo/unit_test.py | 266 ------------------------------------------- 2 files changed, 39 insertions(+), 266 deletions(-) delete mode 100644 AlphaGo/unit_test.py diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 9b7e21f..661d918 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -308,3 +308,42 @@ class Go: return score +if __name__ == "__main__": + ### do unit test for Go class + pure_test = [ + 0, 1, 0, 1, 0, 1, 0, 0, 0, + 1, 0, 1, 0, 1, 0, 0, 0, 0, + 0, 1, 0, 1, 0, 0, 1, 0, 0, + 0, 0, 1, 0, 0, 1, 0, 1, 0, + 0, 0, 0, 0, 0, 1, 1, 1, 0, + 1, 1, 1, 0, 0, 0, 0, 0, 0, + 1, 0, 1, 0, 0, 1, 1, 0, 0, + 1, 1, 1, 0, 1, 0, 1, 0, 0, + 0, 0, 0, 0, 1, 1, 1, 0, 0 + ] + + pt_qry = [(1, 1), (1, 5), (3, 3), (4, 7), (7, 2), (8, 6)] + pt_ans = [True, True, True, True, True, True] + + opponent_test = [ + 0, 1, 0, 1, 0, 1, 0,-1, 1, + 1,-1, 0,-1, 1,-1, 0, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, + 1, 1,-1, 0, 1,-1, 1, 0, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 0, + -1,1, 1, 0, 1, 1, 1, 0, 0, + 0, 1,-1, 0,-1,-1,-1, 0, 0, + 1, 0, 1, 0,-1, 0,-1, 0, 0, + 0, 1, 0, 0,-1,-1,-1, 0, 0 + ] + ot_qry = [(1, 1), (1, 5), (2, 9), (5, 2), (5, 6), (8, 6), (8, 2)] + ot_ans = [False, False, False, False, False, False, True] + + go = Go(size=9, komi=3.75) + for i in range(6): + print (go._is_eye(pure_test, utils.BLACK, pt_qry[i])) + print("Test of pure eye\n") + + for i in range(7): + print 
(go._is_eye(opponent_test, utils.BLACK, ot_qry[i])) + print("Test of eye surrend by opponents\n") diff --git a/AlphaGo/unit_test.py b/AlphaGo/unit_test.py deleted file mode 100644 index 7a33b8e..0000000 --- a/AlphaGo/unit_test.py +++ /dev/null @@ -1,266 +0,0 @@ -import numpy as np -import sys -from game import Game -from engine import GTPEngine -import utils -import time -import copy -import network_small -import tensorflow as tf -from collections import deque -from tianshou.core.mcts.mcts import MCTS - -DELTA = [[1, 0], [-1, 0], [0, -1], [0, 1]] -CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] - -class GoEnv: - def __init__(self, size=9, komi=6.5): - self.size = size - self.komi = komi - self.board = [utils.EMPTY] * (self.size * self.size) - self.history = deque(maxlen=8) - - def _set_board(self, board): - self.board = board - - def _flatten(self, vertex): - x, y = vertex - return (x - 1) * self.size + (y - 1) - - def _bfs(self, vertex, color, block, status, alive_break): - block.append(vertex) - status[self._flatten(vertex)] = True - nei = self._neighbor(vertex) - for n in nei: - if not status[self._flatten(n)]: - if self.board[self._flatten(n)] == color: - self._bfs(n, color, block, status, alive_break) - - def _find_block(self, vertex, alive_break=False): - block = [] - status = [False] * (self.size * self.size) - color = self.board[self._flatten(vertex)] - self._bfs(vertex, color, block, status, alive_break) - - for b in block: - for n in self._neighbor(b): - if self.board[self._flatten(n)] == utils.EMPTY: - return False, block - return True, block - - def _is_qi(self, color, vertex): - nei = self._neighbor(vertex) - for n in nei: - if self.board[self._flatten(n)] == utils.EMPTY: - return True - - self.board[self._flatten(vertex)] = color - for n in nei: - if self.board[self._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n) - if can_kill: - self.board[self._flatten(vertex)] = utils.EMPTY - return True - - ### avoid 
suicide - can_kill, block = self._find_block(vertex) - if can_kill: - self.board[self._flatten(vertex)] = utils.EMPTY - return False - - self.board[self._flatten(vertex)] = utils.EMPTY - return True - - def _check_global_isomorphous(self, color, vertex): - ##backup - _board = copy.copy(self.board) - self.board[self._flatten(vertex)] = color - self._process_board(color, vertex) - if self.board in self.history: - res = True - else: - res = False - - self.board = _board - return res - - def _in_board(self, vertex): - x, y = vertex - if x < 1 or x > self.size: return False - if y < 1 or y > self.size: return False - return True - - def _neighbor(self, vertex): - x, y = vertex - nei = [] - for d in DELTA: - _x = x + d[0] - _y = y + d[1] - if self._in_board((_x, _y)): - nei.append((_x, _y)) - return nei - - def _corner(self, vertex): - x, y = vertex - corner = [] - for d in CORNER_OFFSET: - _x = x + d[0] - _y = y + d[1] - if self._in_board((_x, _y)): - corner.append((_x, _y)) - return corner - - def _process_board(self, color, vertex): - nei = self._neighbor(vertex) - for n in nei: - if self.board[self._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n, alive_break=True) - if can_kill: - for b in block: - self.board[self._flatten(b)] = utils.EMPTY - - def _find_group(self, start): - color = self.board[self._flatten(start)] - #print ("color : ", color) - chain = set() - frontier = [start] - while frontier: - current = frontier.pop() - #print ("current : ", current) - chain.add(current) - for n in self._neighbor(current): - #print n, self._flatten(n), self.board[self._flatten(n)], - if self.board[self._flatten(n)] == color and not n in chain: - frontier.append(n) - return chain - - def _is_eye(self, color, vertex): - nei = self._neighbor(vertex) - cor = self._corner(vertex) - ncolor = {color == self.board[self._flatten(n)] for n in nei} - if False in ncolor: - #print "not all neighbors are in same color with us" - return False - if set(nei) 
< self._find_group(nei[0]): - #print "all neighbors are in same group and same color with us" - return True - else: - opponent_number = [self.board[self._flatten(c)] for c in cor].count(-color) - opponent_propotion = float(opponent_number) / float(len(cor)) - if opponent_propotion < 0.5: - #print "few opponents, real eye" - return True - else: - #print "many opponents, fake eye" - return False - - # def is_valid(self, color, vertex): - def is_valid(self, state, action): - # state is the play board, the shape is [1, 9, 9, 17] - if action == self.size * self.size: - vertex = (0, 0) - else: - vertex = (action / self.size + 1, action % self.size + 1) - if state[0, 0, 0, -1] == utils.BLACK: - color = utils.BLACK - else: - color = utils.WHITE - self.history.clear() - for i in range(8): - self.history.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) - self.board = copy.copy(self.history[-1]) - ### in board - if not self._in_board(vertex): - return False - - ### already have stone - if not self.board[self._flatten(vertex)] == utils.EMPTY: - # print(np.array(self.board).reshape(9, 9)) - # print(vertex) - return False - - ### check if it is qi - if not self._is_qi(color, vertex): - return False - - ### check if it is an eye of yourself - ### assumptions : notice that this judgement requires that the state is an endgame - #if self._is_eye(color, vertex): - # return False - - if self._check_global_isomorphous(color, vertex): - return False - - return True - - def do_move(self, color, vertex): - if vertex == utils.PASS: - return True - - id_ = self._flatten(vertex) - if self.board[id_] == utils.EMPTY: - self.board[id_] = color - self.history.append(copy.copy(self.board)) - return True - else: - return False - - def step_forward(self, state, action): - if state[0, 0, 0, -1] == 1: - color = 1 - else: - color = -1 - if action == 81: - vertex = (0, 0) - else: - vertex = (action % 9 + 1, action / 9 + 1) - # print(vertex) - # print(self.board) - self.board = 
(state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() - self.do_move(color, vertex) - new_state = np.concatenate( - [state[:, :, :, 1:8], (np.array(self.board) == 1).reshape(1, 9, 9, 1), - state[:, :, :, 9:16], (np.array(self.board) == -1).reshape(1, 9, 9, 1), - np.array(1 - state[:, :, :, -1]).reshape(1, 9, 9, 1)], - axis=3) - return new_state, 0 - - -pure_test = [ - 0, 1, 0, 1, 0, 1, 0, 0, 0, - 1, 0, 1, 0, 1, 0, 0, 0, 0, - 0, 1, 0, 1, 0, 0, 1, 0, 0, - 0, 0, 1, 0, 0, 1, 0, 1, 0, - 0, 0, 0, 0, 0, 1, 1, 1, 0, - 1, 1, 1, 0, 0, 0, 0, 0, 0, - 1, 0, 1, 0, 0, 1, 1, 0, 0, - 1, 1, 1, 0, 1, 0, 1, 0, 0, - 0, 0, 0, 0, 1, 1, 1, 0, 0 -] - -pt_qry = [(1, 1), (1, 5), (3, 3), (4, 7), (7, 2), (8, 6)] -pt_ans = [True, True, True, True, True, True] - -opponent_test = [ - 0, 1, 0, 1, 0, 1, 0,-1, 1, - 1,-1, 0,-1, 1,-1, 0, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 1, - 1, 1,-1, 0, 1,-1, 1, 0, 0, - 1, 0, 1, 0, 1, 0, 1, 0, 0, - -1, 1, 1, 0, 1, 1, 1, 0, 0, - 0, 1,-1, 0,-1,-1,-1, 0, 0, - 1, 0, 1, 0,-1, 0,-1, 0, 0, - 0, 1, 0, 0,-1,-1,-1, 0, 0 -] -ot_qry = [(1, 1), (1, 5), (2, 9), (5, 2), (5, 6), (8, 2), (8, 6)] -ot_ans = [False, False, False, False, False, True, False] - -#print (ge._find_group((6, 1))) -#print ge._is_eye(utils.BLACK, pt_qry[0]) -ge = GoEnv() -ge._set_board(pure_test) -for i in range(6): - print (ge._is_eye(utils.BLACK, pt_qry[i])) -ge._set_board(opponent_test) -for i in range(7): - print (ge._is_eye(utils.BLACK, ot_qry[i])) From 67ba76a04d42152c1c7ae6f3554b2e8683fca0d5 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Fri, 22 Dec 2017 17:16:44 +0800 Subject: [PATCH 48/98] implement a stochastic sample training method --- AlphaGo/game.py | 4 ++-- AlphaGo/model.py | 44 +++++++++++++++++++++++++------------------- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 8706572..df08c0a 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -31,7 +31,7 @@ class Game: self.latest_boards = deque(maxlen=8) for _ in 
range(8): self.latest_boards.append(self.board) - self.evaluator = model.ResNet(self.size, self.size**2 + 1, history_length=8) + self.evaluator = model.ResNet(self.size, self.size**2 + 1, history_length=8, checkpoint_path=checkpoint_path) # self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], # feed_dict={self.net.x: state, self.net.is_training: False}) self.game_engine = go.Go(size=self.size, komi=self.komi) @@ -96,7 +96,7 @@ class Game: sys.stdout.flush() if __name__ == "__main__": - g = Game() + g = Game(checkpoint_path='./checkpoints/') g.show_board() g.think_play_move(1) #file = open("debug.txt", "a") diff --git a/AlphaGo/model.py b/AlphaGo/model.py index 764ba5f..22e8626 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -1,5 +1,6 @@ import os import time +import random import sys import cPickle from collections import deque @@ -104,7 +105,7 @@ class ResNet(object): self.window_length = 7000 self.save_freq = 5000 self.training_data = {'states': deque(maxlen=self.window_length), 'probs': deque(maxlen=self.window_length), - 'winner': deque(maxlen=self.window_length)} + 'winner': deque(maxlen=self.window_length), 'length': deque(maxlen=self.window_length)} def _build_network(self, residual_block_num, checkpoint_path): """ @@ -199,15 +200,15 @@ class ResNet(object): new_file_list = [] all_file_list = [] - training_data = {} + training_data = {'states': [], 'probs': [], 'winner': []} + iters = 0 while True: new_file_list = list(set(os.listdir(data_path)).difference(all_file_list)) - if new_file_list: + while new_file_list: all_file_list = os.listdir(data_path) - new_file_list.sort( - key=lambda file: os.path.getmtime(data_path + file) if not os.path.isdir(data_path + file) else 0) - if new_file_list: + new_file_list.sort( + key=lambda file: os.path.getmtime(data_path + file) if not os.path.isdir(data_path + file) else 0) for file in new_file_list: states, probs, winner = self._file_to_training_data(data_path + file) assert 
states.shape[0] == probs.shape[0] @@ -215,32 +216,36 @@ class ResNet(object): self.training_data['states'].append(states) self.training_data['probs'].append(probs) self.training_data['winner'].append(winner) - if len(self.training_data['states']) == self.window_length: - training_data['states'] = np.concatenate(self.training_data['states'], axis=0) - training_data['probs'] = np.concatenate(self.training_data['probs'], axis=0) - training_data['winner'] = np.concatenate(self.training_data['winner'], axis=0) + self.training_data['length'].append(states.shape[0]) + new_file_list = list(set(os.listdir(data_path)).difference(all_file_list)) if len(self.training_data['states']) != self.window_length: continue else: - data_num = training_data['states'].shape[0] - index = np.arange(data_num) - np.random.shuffle(index) start_time = time.time() + for i in range(batch_size): + game_num = random.randint(0, self.window_length-1) + state_num = random.randint(0, self.training_data['length'][game_num]-1) + training_data['states'].append(np.expand_dims(self.training_data['states'][game_num][state_num], 0)) + training_data['probs'].append(np.expand_dims(self.training_data['probs'][game_num][state_num], 0)) + training_data['winner'].append(np.expand_dims(self.training_data['winner'][game_num][state_num], 0)) value_loss, policy_loss, reg, _ = self.sess.run( [self.value_loss, self.policy_loss, self.reg, self.train_op], - feed_dict={self.x: training_data['states'][index[:batch_size]], - self.z: training_data['winner'][index[:batch_size]], - self.pi: training_data['probs'][index[:batch_size]], + feed_dict={self.x: np.concatenate(training_data['states'], axis=0), + self.z: np.concatenate(training_data['winner'], axis=0), + self.pi: np.concatenate(training_data['probs'], axis=0), self.is_training: True}) + print("Iteration: {}, Time: {}, Value Loss: {}, Policy Loss: {}, Reg: {}".format(iters, time.time() - start_time, value_loss, policy_loss, reg)) - iters += 1 if iters % self.save_freq == 
0: save_path = "Iteration{}.ckpt".format(iters) self.saver.save(self.sess, self.checkpoint_path + save_path) + for key in training_data.keys(): + training_data[key] = [] + iters += 1 def _file_to_training_data(self, file_name): read = False @@ -250,6 +255,7 @@ class ResNet(object): file.seek(0) data = cPickle.load(file) read = True + print("{} Loaded!".format(file_name)) except Exception as e: print(e) time.sleep(1) @@ -275,6 +281,6 @@ class ResNet(object): return states, probs, winner -if __name__=="__main__": - model = ResNet(board_size=9, action_num=82) +if __name__ == "__main__": + model = ResNet(board_size=9, action_num=82, history_length=8) model.train("file", data_path="./data/", batch_size=128, checkpoint_path="./checkpoint/") From 3b534064bd6c92c972883d448c7c77fa0884e356 Mon Sep 17 00:00:00 2001 From: mcgrady00h <281130306@qq.com> Date: Sat, 23 Dec 2017 02:48:53 +0800 Subject: [PATCH 49/98] fix virtual loss bug --- tianshou/core/mcts/mcts.py | 22 +++-------- tianshou/core/mcts/mcts_virtual_loss.py | 41 ++++++++++---------- tianshou/core/mcts/mcts_virtual_loss_test.py | 6 +-- tianshou/core/mcts/utils.py | 21 ++++++++++ 4 files changed, 49 insertions(+), 41 deletions(-) create mode 100644 tianshou/core/mcts/utils.py diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 979e994..16d13d5 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -1,22 +1,9 @@ import numpy as np import math import time +import sys,os +from .utils import list2tuple, tuple2list -c_puct = 5 - - -def list2tuple(list): - try: - return tuple(list2tuple(sub) for sub in list) - except TypeError: - return list - - -def tuple2list(tuple): - try: - return list(tuple2list(sub) for sub in tuple) - except TypeError: - return tuple class MCTSNode(object): @@ -39,12 +26,13 @@ class MCTSNode(object): pass class UCTNode(MCTSNode): - def __init__(self, parent, action, state, action_num, prior, inverse=False): + def __init__(self, parent, action, state, 
action_num, prior, inverse=False, c_puct = 5): super(UCTNode, self).__init__(parent, action, state, action_num, prior, inverse) self.Q = np.zeros([action_num]) self.W = np.zeros([action_num]) self.N = np.zeros([action_num]) - self.ucb = self.Q + c_puct * self.prior * math.sqrt(np.sum(self.N)) / (self.N + 1) + self.c_puct = c_puct + self.ucb = self.Q + self.c_puct * self.prior * math.sqrt(np.sum(self.N)) / (self.N + 1) self.mask = None def selection(self, simulator): diff --git a/tianshou/core/mcts/mcts_virtual_loss.py b/tianshou/core/mcts/mcts_virtual_loss.py index 9d20b5a..9335464 100644 --- a/tianshou/core/mcts/mcts_virtual_loss.py +++ b/tianshou/core/mcts/mcts_virtual_loss.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # vim:fenc=utf-8 # $File: mcts_virtual_loss.py -# $Date: Tue Dec 19 17:0444 2017 +0800 +# $Date: Sat Dec 23 02:4850 2017 +0800 # Original file: mcts.py # $Author: renyong15 Ā© # @@ -12,25 +12,13 @@ manner. """ +from __future__ import absolute_import + import numpy as np import math import time - -c_puct = 5 - - -def list2tuple(list): - try: - return tuple(list2tuple(sub) for sub in list) - except TypeError: - return list - - -def tuple2list(tuple): - try: - return list(tuple2list(sub) for sub in tuple) - except TypeError: - return tuple +import sys,os +from .utils import list2tuple, tuple2list class MCTSNodeVirtualLoss(object): @@ -53,12 +41,13 @@ class MCTSNodeVirtualLoss(object): pass class UCTNodeVirtualLoss(MCTSNodeVirtualLoss): - def __init__(self, parent, action, state, action_num, prior, inverse=False): + def __init__(self, parent, action, state, action_num, prior, inverse=False, c_puct = 5): super(UCTNodeVirtualLoss, self).__init__(parent, action, state, action_num, prior, inverse) self.Q = np.zeros([action_num]) self.W = np.zeros([action_num]) self.N = np.zeros([action_num]) self.virtual_loss = np.zeros([action_num]) + self.c_puct = c_puct #### modified by adding virtual loss #self.ucb = self.Q + c_puct * self.prior * 
math.sqrt(np.sum(self.N)) / (self.N + 1) @@ -67,9 +56,9 @@ class UCTNodeVirtualLoss(MCTSNodeVirtualLoss): def selection(self, simulator): self.valid_mask(simulator) self.Q = np.zeros([self.action_num]) - N_not_zero = self.N > 0 - self.Q[N_not_zero] = (self.W[N_not_zero] + self.virtual_loss[N_not_zero] + 0.) / self.N[N_not_zero] - self.ucb = self.Q + c_puct * self.prior * math.sqrt(np.sum(self.N + self.virtual_loss)) /\ + N_not_zero = (self.N + self.virtual_loss) > 0 + self.Q[N_not_zero] = (self.W[N_not_zero] + 0.)/ (self.virtual_loss[N_not_zero] + self.N[N_not_zero]) + self.ucb = self.Q + self.c_puct * self.prior * math.sqrt(np.sum(self.N + self.virtual_loss)) /\ (self.N + self.virtual_loss + 1) action = np.argmax(self.ucb) self.virtual_loss[action] += 1 @@ -93,6 +82,7 @@ class UCTNodeVirtualLoss(MCTSNodeVirtualLoss): self.W[action] += self.children[action].reward ## do not need to compute Q and ucb immediately since it will be modified by virtual loss + ## just comment out and leaving for comparision #for i in range(self.action_num): # if self.N[i] != 0: # self.Q[i] = (self.W[i] + 0.) / self.N[i] @@ -186,6 +176,12 @@ class MCTSVirtualLoss(object): def do_search(self, max_step=None, max_time=None): + """ + Expand the MCTS tree with stop crierion either by max_step or max_time + + :param max_step search maximum minibath rounds. ONE step is ONE minibatch + :param max_time search maximum seconds + """ if max_step is not None: self.step = 0 self.max_step = max_step @@ -205,6 +201,9 @@ class MCTSVirtualLoss(object): self.step += 1 def expand(self): + """ + Core logic method for MCTS tree to expand nodes. 
+ """ ## minibatch with virtual loss nodes = [] new_actions = [] diff --git a/tianshou/core/mcts/mcts_virtual_loss_test.py b/tianshou/core/mcts/mcts_virtual_loss_test.py index d2d6c81..e4666f3 100644 --- a/tianshou/core/mcts/mcts_virtual_loss_test.py +++ b/tianshou/core/mcts/mcts_virtual_loss_test.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # vim:fenc=utf-8 # $File: mcts_virtual_loss_test.py -# $Date: Tue Dec 19 16:5459 2017 +0800 +# $Date: Sat Dec 23 02:2139 2017 +0800 # Original file: mcts_test.py # $Author: renyong15 Ā© # @@ -9,8 +9,8 @@ import numpy as np -from mcts_virtual_loss import MCTSVirtualLoss -from evaluator import rollout_policy +from .mcts_virtual_loss import MCTSVirtualLoss +from .evaluator import rollout_policy class TestEnv: diff --git a/tianshou/core/mcts/utils.py b/tianshou/core/mcts/utils.py new file mode 100644 index 0000000..de518a0 --- /dev/null +++ b/tianshou/core/mcts/utils.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +# vim:fenc=utf-8 +# $File: utils.py +# $Date: Sat Dec 23 02:0854 2017 +0800 +# $Author: renyong15 Ā© +# + +def list2tuple(list): + try: + return tuple(list2tuple(sub) for sub in list) + except TypeError: + return list + + +def tuple2list(tuple): + try: + return list(tuple2list(sub) for sub in tuple) + except TypeError: + return tuple + + From 032ea46b7b729ac09196f34463e2b46523848109 Mon Sep 17 00:00:00 2001 From: JialianLee Date: Sat, 23 Dec 2017 09:47:08 +0800 Subject: [PATCH 50/98] small modification --- AlphaGo/reversi.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/AlphaGo/reversi.py b/AlphaGo/reversi.py index cba91d9..320445e 100644 --- a/AlphaGo/reversi.py +++ b/AlphaGo/reversi.py @@ -171,12 +171,12 @@ class Reversi: if self.black_win is not None: return self.black_win else: - ValueError("Game not finished!") + raise ValueError("Game not finished!") def board2bitboard(self): count = 1 if self.board is None: - ValueError("None board!") + raise ValueError("None board!") self.black = 0 self.white 
= 0 for i in range(64): @@ -208,7 +208,7 @@ class Reversi: def step(self): if self.action < 0 or self.action > 63: - ValueError("Wrong action!") + raise ValueError("Wrong action!") if self.action is None: return False From b2ef770415ade966dcc29073973bfea3a447481b Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Sat, 23 Dec 2017 13:05:25 +0800 Subject: [PATCH 51/98] connect reversi with game --- AlphaGo/engine.py | 4 ++-- AlphaGo/game.py | 44 +++++++++++++++++++++++++------------- AlphaGo/go.py | 28 +++++++++++------------- AlphaGo/play.py | 1 - AlphaGo/reversi.py | 16 +++++++++----- AlphaGo/self-play.py | 2 +- tianshou/core/mcts/mcts.py | 2 +- 7 files changed, 57 insertions(+), 40 deletions(-) diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index 8b54470..98e5e61 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -183,7 +183,7 @@ class GTPEngine(): return 'unknown player', False def cmd_get_score(self, args, **kwargs): - return self._game.game_engine.executor_get_score(self._game.board, True), True + return self._game.game_engine.executor_get_score(self._game.board), True def cmd_show_board(self, args, **kwargs): return self._game.board, True @@ -194,4 +194,4 @@ class GTPEngine(): if __name__ == "main": game = Game() - engine = GTPEngine(game_obj=Game) + engine = GTPEngine(game_obj=game) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index df08c0a..ff1faf5 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -10,12 +10,14 @@ import copy import tensorflow as tf import numpy as np import sys, os -import go import model from collections import deque sys.path.append(os.path.join(os.path.dirname(__file__), os.path.pardir)) from tianshou.core.mcts.mcts import MCTS +import go +import reversi + class Game: ''' Load the real game and trained weights. @@ -23,18 +25,26 @@ class Game: TODO : Maybe merge with the engine class in future, currently leave it untouched for interacting with Go UI. 
''' - def __init__(self, size=9, komi=3.75, checkpoint_path=None): - self.size = size - self.komi = komi - self.board = [utils.EMPTY] * (self.size ** 2) - self.history = [] - self.latest_boards = deque(maxlen=8) - for _ in range(8): - self.latest_boards.append(self.board) - self.evaluator = model.ResNet(self.size, self.size**2 + 1, history_length=8, checkpoint_path=checkpoint_path) - # self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], - # feed_dict={self.net.x: state, self.net.is_training: False}) - self.game_engine = go.Go(size=self.size, komi=self.komi) + def __init__(self, name="go", checkpoint_path=None): + self.name = name + if "go" == name: + self.size = 9 + self.komi = 3.75 + self.board = [utils.EMPTY] * (self.size ** 2) + self.history = [] + self.latest_boards = deque(maxlen=8) + for _ in range(8): + self.latest_boards.append(self.board) + + self.evaluator = model.ResNet(self.size, self.size**2 + 1, history_length=8) + self.game_engine = go.Go(size=self.size, komi=self.komi) + elif "reversi" == name: + self.size = 8 + self.evaluator = model.ResNet(self.size, self.size**2 + 1, history_length=1) + self.game_engine = reversi.Reversi() + self.board = self.game_engine.get_board() + else: + print(name + " is an unknown game...") def clear(self): self.board = [utils.EMPTY] * (self.size ** 2) @@ -65,7 +75,11 @@ class Game: # this function can be called directly to play the opponent's move if vertex == utils.PASS: return True - res = self.game_engine.executor_do_move(self.history, self.latest_boards, self.board, color, vertex) + # TODO this implementation is not very elegant + if "go" == self.name: + res = self.game_engine.executor_do_move(self.history, self.latest_boards, self.board, color, vertex) + elif "revsersi" == self.name: + res = self.game_engine.executor_do_move(self.board, color, vertex) return res def think_play_move(self, color): @@ -96,7 +110,7 @@ class Game: sys.stdout.flush() if __name__ == "__main__": - g = 
Game(checkpoint_path='./checkpoints/') + g = Game() g.show_board() g.think_play_move(1) #file = open("debug.txt", "a") diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 661d918..b819c08 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -157,7 +157,7 @@ class Go: vertex = self._deflatten(action) return vertex - def _is_valid(self, history_boards, current_board, color, vertex): + def _rule_check(self, history_boards, current_board, color, vertex): ### in board if not self._in_board(vertex): return False @@ -176,30 +176,30 @@ class Go: return True - def simulate_is_valid(self, state, action): + def _is_valid(self, state, action): history_boards, color = state vertex = self._action2vertex(action) current_board = history_boards[-1] - if not self._is_valid(history_boards, current_board, color, vertex): + if not self._rule_check(history_boards, current_board, color, vertex): return False if not self._knowledge_prunning(current_board, color, vertex): return False return True - def simulate_is_valid_list(self, state, action_set): + def simulate_get_mask(self, state, action_set): # find all the invalid actions - invalid_action_list = [] + invalid_action_mask = [] for action_candidate in action_set[:-1]: # go through all the actions excluding pass - if not self.simulate_is_valid(state, action_candidate): - invalid_action_list.append(action_candidate) - if len(invalid_action_list) < len(action_set) - 1: - invalid_action_list.append(action_set[-1]) + if not self._is_valid(state, action_candidate): + invalid_action_mask.append(action_candidate) + if len(invalid_action_mask) < len(action_set) - 1: + invalid_action_mask.append(action_set[-1]) # forbid pass, if we have other choices # TODO: In fact we should not do this. In some extreme cases, we should permit pass. 
- return invalid_action_list + return invalid_action_mask def _do_move(self, board, color, vertex): if vertex == utils.PASS: @@ -219,7 +219,7 @@ class Go: return [history_boards, new_color], 0 def executor_do_move(self, history, latest_boards, current_board, color, vertex): - if not self._is_valid(history, current_board, color, vertex): + if not self._rule_check(history, current_board, color, vertex): return False current_board[self._flatten(vertex)] = color self._process_board(current_board, color, vertex) @@ -280,7 +280,7 @@ class Go: elif color_estimate < 0: return utils.WHITE - def executor_get_score(self, current_board, is_unknown_estimation=False): + def executor_get_score(self, current_board): ''' is_unknown_estimation: whether use nearby stone to predict the unknown return score from BLACK perspective. @@ -294,10 +294,8 @@ class Go: _board[self._flatten(vertex)] = utils.BLACK elif boarder_color == {utils.WHITE}: _board[self._flatten(vertex)] = utils.WHITE - elif is_unknown_estimation: - _board[self._flatten(vertex)] = self._predict_from_nearby(_board, vertex) else: - _board[self._flatten(vertex)] =utils.UNKNOWN + _board[self._flatten(vertex)] = self._predict_from_nearby(_board, vertex) score = 0 for i in _board: if i == utils.BLACK: diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 3681430..b601ada 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -7,7 +7,6 @@ import time import os import cPickle - class Data(object): def __init__(self): self.boards = [] diff --git a/AlphaGo/reversi.py b/AlphaGo/reversi.py index cba91d9..d67a882 100644 --- a/AlphaGo/reversi.py +++ b/AlphaGo/reversi.py @@ -25,7 +25,6 @@ def find_correct_moves(own, enemy): mobility |= search_offset_right(own, enemy, mask, 7) # Left bottom return mobility - def calc_flip(pos, own, enemy): """return flip stones of enemy by bitboard when I place stone at pos. 
@@ -133,7 +132,9 @@ class Reversi: self.board = self.bitboard2board() return self.board - def simulate_is_valid(self, board, color): + def simulate_get_mask(self, state, action_set): + history_boards, color = state + board = history_boards[-1] self.board = board self.color = color self.board2bitboard() @@ -142,13 +143,18 @@ class Reversi: valid_moves = bit_to_array(mobility, 64) valid_moves = np.argwhere(valid_moves) valid_moves = list(np.reshape(valid_moves, len(valid_moves))) - return valid_moves + # TODO it seems that the pass move is not considered + invalid_action_mask = [] + for action in action_set: + if action not in valid_moves: + invalid_action_mask.append(action) + return invalid_action_mask - def simulate_step_forward(self, state, vertex): + def simulate_step_forward(self, state, action): self.board = state[0] self.color = state[1] self.board2bitboard() - self.vertex2action(vertex) + self.action = action step_forward = self.step() if step_forward: new_board = self.bitboard2board() diff --git a/AlphaGo/self-play.py b/AlphaGo/self-play.py index 4387b24..dd03b13 100644 --- a/AlphaGo/self-play.py +++ b/AlphaGo/self-play.py @@ -79,7 +79,7 @@ while True: prob.append(np.array(game.prob).reshape(-1, game.size ** 2 + 1)) print("Finished") print("\n") - score = game.game_engine.executor_get_score(game.board, True) + score = game.game_engine.executor_get_score(game.board) if score > 0: winner = utils.BLACK else: diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 8bb5f06..e8f3709 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -73,7 +73,7 @@ class UCTNode(MCTSNode): def valid_mask(self, simulator): # let all invalid actions be illeagel in mcts if self.mask is None: - self.mask = simulator.simulate_is_valid_list(self.state, range(self.action_num)) + self.mask = simulator.simulate_get_mask(self.state, range(self.action_num)) self.ucb[self.mask] = -float("Inf") From b96fa9448bde1c42cd5a696568a30bda7bddf195 Mon Sep 17 
00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Sat, 23 Dec 2017 14:45:07 +0800 Subject: [PATCH 52/98] minor fixes --- .gitignore | 4 ++-- AlphaGo/game.py | 19 ++++++++++--------- AlphaGo/player.py | 2 +- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index d697b92..8ee6691 100644 --- a/.gitignore +++ b/.gitignore @@ -4,8 +4,8 @@ leela-zero parameters *.swp *.sublime* -checkpoints -checkpoints_origin +checkpoint *.json .DS_Store data +.log diff --git a/AlphaGo/game.py b/AlphaGo/game.py index ff1faf5..90d0bf0 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -27,29 +27,30 @@ class Game: ''' def __init__(self, name="go", checkpoint_path=None): self.name = name - if "go" == name: + if self.name == "go": self.size = 9 self.komi = 3.75 self.board = [utils.EMPTY] * (self.size ** 2) self.history = [] + self.history_length = 8 self.latest_boards = deque(maxlen=8) for _ in range(8): self.latest_boards.append(self.board) - - self.evaluator = model.ResNet(self.size, self.size**2 + 1, history_length=8) self.game_engine = go.Go(size=self.size, komi=self.komi) - elif "reversi" == name: + elif self.name == "reversi": self.size = 8 - self.evaluator = model.ResNet(self.size, self.size**2 + 1, history_length=1) + self.history_length = 1 self.game_engine = reversi.Reversi() self.board = self.game_engine.get_board() else: - print(name + " is an unknown game...") + raise ValueError(name + " is an unknown game...") + + self.evaluator = model.ResNet(self.size, self.size ** 2 + 1, history_length=self.history_length) def clear(self): self.board = [utils.EMPTY] * (self.size ** 2) self.history = [] - for _ in range(8): + for _ in range(self.history_length): self.latest_boards.append(self.board) def set_size(self, n): @@ -76,9 +77,9 @@ class Game: if vertex == utils.PASS: return True # TODO this implementation is not very elegant - if "go" == self.name: + if self.name == "go": res = self.game_engine.executor_do_move(self.history, 
self.latest_boards, self.board, color, vertex) - elif "revsersi" == self.name: + elif self.name == "reversi": res = self.game_engine.executor_do_move(self.board, color, vertex) return res diff --git a/AlphaGo/player.py b/AlphaGo/player.py index 0e3daff..e848d2b 100644 --- a/AlphaGo/player.py +++ b/AlphaGo/player.py @@ -34,7 +34,7 @@ if __name__ == '__main__': daemon = Pyro4.Daemon() # make a Pyro daemon ns = Pyro4.locateNS() # find the name server - player = Player(role = args.role, engine = engine) + player = Player(role=args.role, engine=engine) print "Init " + args.role + " player finished" uri = daemon.register(player) # register the greeting maker as a Pyro object print "Start on name " + args.role From 951eed60edeabbcd90ac465fc2df2050584a0238 Mon Sep 17 00:00:00 2001 From: haoshengzou Date: Sat, 23 Dec 2017 15:34:44 +0800 Subject: [PATCH 53/98] fix imports to support both python2 and python3. move contents from __init__.py to leave for work after major development. --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 9c3af16..fc7d494 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,11 @@ Tianshou(å¤©ęŽˆ) is a reinforcement learning platform. The following image illus +## examples + +During development, run examples under `./examples/` directory with, e.g. `python ppo_example.py`. +Running them under this directory with `python examples/ppo_example.py` will not work. + ## About coding style From 04048b78738d1092768c669f37fa63a9e1922d1a Mon Sep 17 00:00:00 2001 From: haoshengzou Date: Sat, 23 Dec 2017 15:36:10 +0800 Subject: [PATCH 54/98] fix imports to support both python2 and python3. move contents from __init__.py to leave for work after major development. 
--- examples/ppo_example.py | 7 +++---- tianshou/core/policy/__init__.py | 6 ------ tianshou/core/policy/base.py | 12 ++++++++++++ tianshou/core/policy/dqn.py | 18 ++++++++++++------ tianshou/core/value_function/action_value.py | 17 +++++++++++++---- tianshou/core/value_function/base.py | 5 ++++- tianshou/core/value_function/state_value.py | 8 +++++--- 7 files changed, 49 insertions(+), 24 deletions(-) diff --git a/examples/ppo_example.py b/examples/ppo_example.py index 02ccb52..985c8f2 100755 --- a/examples/ppo_example.py +++ b/examples/ppo_example.py @@ -1,17 +1,16 @@ #!/usr/bin/env python +from __future__ import absolute_import import tensorflow as tf -import numpy as np -import time import gym # our lib imports here! import sys sys.path.append('..') -import tianshou.core.losses as losses +from tianshou.core import losses from tianshou.data.batch import Batch import tianshou.data.advantage_estimation as advantage_estimation -import tianshou.core.policy as policy +import tianshou.core.policy.stochastic as policy # TODO: fix imports as zhusuan so that only need to import to policy def policy_net(observation, action_dim, scope=None): diff --git a/tianshou/core/policy/__init__.py b/tianshou/core/policy/__init__.py index ccde775..e69de29 100644 --- a/tianshou/core/policy/__init__.py +++ b/tianshou/core/policy/__init__.py @@ -1,6 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -from .base import * -from .stochastic import * -from .dqn import * \ No newline at end of file diff --git a/tianshou/core/policy/base.py b/tianshou/core/policy/base.py index 025abd5..1adeaeb 100644 --- a/tianshou/core/policy/base.py +++ b/tianshou/core/policy/base.py @@ -13,11 +13,23 @@ import tensorflow as tf __all__ = [ 'StochasticPolicy', 'QValuePolicy', + 'PolicyBase' ] # TODO: a even more "base" class for policy +class PolicyBase(object): + """ + base class for policy. 
only provides `act` method with exploration + """ + def __init__(self): + pass + + def act(self, observation, exploration): + raise NotImplementedError() + + class QValuePolicy(object): """ The policy as in DQN diff --git a/tianshou/core/policy/dqn.py b/tianshou/core/policy/dqn.py index d03dbd4..716e4c4 100644 --- a/tianshou/core/policy/dqn.py +++ b/tianshou/core/policy/dqn.py @@ -1,16 +1,22 @@ -from tianshou.core.policy.base import QValuePolicy +from __future__ import absolute_import + +from .base import PolicyBase import tensorflow as tf -import sys -sys.path.append('..') -import value_function.action_value as value_func +from ..value_function.action_value import DQN -class DQN_refactor(object): +class DQNRefactor(PolicyBase): """ use DQN from value_function as a member """ def __init__(self, value_tensor, observation_placeholder, action_placeholder): - self._network = value_func.DQN(value_tensor, observation_placeholder, action_placeholder) + self._network = DQN(value_tensor, observation_placeholder, action_placeholder) + self._argmax_action = tf.argmax(value_tensor, axis=1) + + def act(self, observation, exploration): + sess = tf.get_default_session() + if not exploration: # no exploration + action = sess.run(self._argmax_action, feed_dict={}) class DQN(QValuePolicy): diff --git a/tianshou/core/value_function/action_value.py b/tianshou/core/value_function/action_value.py index cb8acc8..2bda4fa 100644 --- a/tianshou/core/value_function/action_value.py +++ b/tianshou/core/value_function/action_value.py @@ -1,4 +1,6 @@ -from base import ValueFunctionBase +from __future__ import absolute_import + +from .base import ValueFunctionBase import tensorflow as tf @@ -15,7 +17,6 @@ class ActionValue(ValueFunctionBase): def get_value(self, observation, action): """ - :param observation: numpy array of observations, of shape (batchsize, observation_dim). :param action: numpy array of actions, of shape (batchsize, action_dim) # TODO: Atari discrete action should have dim 1. 
Super Mario may should have, say, dim 5, where each can be 0/1 @@ -24,7 +25,7 @@ class ActionValue(ValueFunctionBase): """ sess = tf.get_default_session() return sess.run(self.get_value_tensor(), feed_dict= - {self._observation_placeholder: observation, self._action_placeholder:action})[:, 0] + {self._observation_placeholder: observation, self._action_placeholder: action}) class DQN(ActionValue): @@ -39,13 +40,21 @@ class DQN(ActionValue): :param action_placeholder: of shape (batchsize, ) """ self._value_tensor_all_actions = value_tensor - canonical_value_tensor = value_tensor[action_placeholder] # maybe a tf.map_fn. for now it's wrong + + batch_size = tf.shape(value_tensor)[0] + batch_dim_index = tf.range(batch_size) + indices = tf.stack([batch_dim_index, action_placeholder], axis=1) + canonical_value_tensor = tf.gather_nd(value_tensor, indices) super(DQN, self).__init__(value_tensor=canonical_value_tensor, observation_placeholder=observation_placeholder, action_placeholder=action_placeholder) def get_value_all_actions(self, observation): + """ + :param observation: + :return: numpy array of Q(s, *) given s, of shape (batchsize, num_actions) + """ sess = tf.get_default_session() return sess.run(self._value_tensor_all_actions, feed_dict={self._observation_placeholder: observation}) diff --git a/tianshou/core/value_function/base.py b/tianshou/core/value_function/base.py index 0b27759..b15f1bf 100644 --- a/tianshou/core/value_function/base.py +++ b/tianshou/core/value_function/base.py @@ -1,3 +1,6 @@ +from __future__ import absolute_import + +import tensorflow as tf # TODO: linear feature baseline also in tf? 
class ValueFunctionBase(object): @@ -6,7 +9,7 @@ class ValueFunctionBase(object): """ def __init__(self, value_tensor, observation_placeholder): self._observation_placeholder = observation_placeholder - self._value_tensor = value_tensor + self._value_tensor = tf.squeeze(value_tensor) # canonical values has shape (batchsize, ) def get_value(self, **kwargs): """ diff --git a/tianshou/core/value_function/state_value.py b/tianshou/core/value_function/state_value.py index 04fe442..b7de196 100644 --- a/tianshou/core/value_function/state_value.py +++ b/tianshou/core/value_function/state_value.py @@ -1,4 +1,6 @@ -from base import ValueFunctionBase +from __future__ import absolute_import + +from .base import ValueFunctionBase import tensorflow as tf @@ -17,7 +19,7 @@ class StateValue(ValueFunctionBase): :param observation: numpy array of observations, of shape (batchsize, observation_dim). :return: numpy array of state values, of shape (batchsize, ) - # TODO: dealing with the last dim of 1 in V(s) and Q(s, a) + # TODO: dealing with the last dim of 1 in V(s) and Q(s, a), this should rely on the action shape returned by env """ sess = tf.get_default_session() - return sess.run(self.get_value_tensor(), feed_dict={self._observation_placeholder: observation})[:, 0] \ No newline at end of file + return sess.run(self.get_value_tensor(), feed_dict={self._observation_placeholder: observation}) \ No newline at end of file From 84208a7ac96058f1f7dca9fcb609f4641766ea6a Mon Sep 17 00:00:00 2001 From: JialianLee Date: Sat, 23 Dec 2017 15:43:45 +0800 Subject: [PATCH 55/98] Modification for reversi.py --- AlphaGo/reversi.py | 107 +++++++++++++++++++++++++++++---------------- 1 file changed, 70 insertions(+), 37 deletions(-) diff --git a/AlphaGo/reversi.py b/AlphaGo/reversi.py index c086a2c..ead6f4e 100644 --- a/AlphaGo/reversi.py +++ b/AlphaGo/reversi.py @@ -25,6 +25,7 @@ def find_correct_moves(own, enemy): mobility |= search_offset_right(own, enemy, mask, 7) # Left bottom return mobility 
+ def calc_flip(pos, own, enemy): """return flip stones of enemy by bitboard when I place stone at pos. @@ -123,8 +124,9 @@ class Reversi: self.board = None # 8 * 8 board with 1 for black, -1 for white and 0 for blank self.color = None # 1 for black and -1 for white self.action = None # number in 0~63 - # self.winner = None + self.winner = None self.black_win = None + self.size = 8 def get_board(self, black=None, white=None): self.black = black or (0b00001000 << 24 | 0b00010000 << 32) @@ -132,22 +134,29 @@ class Reversi: self.board = self.bitboard2board() return self.board + def is_valid(self, is_next=False): + self.board2bitboard() + own, enemy = self.get_own_and_enemy(is_next) + mobility = find_correct_moves(own, enemy) + valid_moves = bit_to_array(mobility, 64) + valid_moves = np.argwhere(valid_moves) + valid_moves = list(np.reshape(valid_moves, len(valid_moves))) + return valid_moves + def simulate_get_mask(self, state, action_set): history_boards, color = state board = history_boards[-1] self.board = board self.color = color - self.board2bitboard() - own, enemy = self.get_own_and_enemy() - mobility = find_correct_moves(own, enemy) - valid_moves = bit_to_array(mobility, 64) - valid_moves = np.argwhere(valid_moves) - valid_moves = list(np.reshape(valid_moves, len(valid_moves))) + valid_moves = self.is_valid() # TODO it seems that the pass move is not considered - invalid_action_mask = [] - for action in action_set: - if action not in valid_moves: - invalid_action_mask.append(action) + if not len(valid_moves): + invalid_action_mask = action_set[0:-1] + else: + invalid_action_mask = [] + for action in action_set: + if action not in valid_moves: + invalid_action_mask.append(action) return invalid_action_mask def simulate_step_forward(self, state, action): @@ -155,21 +164,34 @@ class Reversi: self.color = state[1] self.board2bitboard() self.action = action - step_forward = self.step() - if step_forward: - new_board = self.bitboard2board() - return [new_board, 0 - 
self.color], 0 + if self.action == 64: + valid_moves = self.is_valid(is_next=True) + if not len(valid_moves): + self._game_over() + return None, self.winner * self.color + else: + return [self.board, 0 - self.color], 0 + self.step() + new_board = self.bitboard2board() + return [new_board, 0 - self.color], 0 def executor_do_move(self, board, color, vertex): self.board = board self.color = color self.board2bitboard() - self.vertex2action(vertex) - step_forward = self.step() - if step_forward: + self.action = self._flatten(vertex) + if self.action == 64: + valid_moves = self.is_valid(is_next=True) + if not len(valid_moves): + return False + else: + return True + else: + self.step() new_board = self.bitboard2board() - for i in range(64): - board[i] = new_board[i] + for i in range(64): + board[i] = new_board[i] + return True def executor_get_score(self, board): self.board = board @@ -191,13 +213,14 @@ class Reversi: elif self.board[i] == -1: self.white |= count count *= 2 - + ''' def vertex2action(self, vertex): x, y = vertex if x == 0 and y == 0: self.action = None else: self.action = 8 * (x - 1) + y - 1 + ''' def bitboard2board(self): board = [] @@ -214,46 +237,45 @@ class Reversi: def step(self): if self.action < 0 or self.action > 63: - raise ValueError("Wrong action!") + raise ValueError("Action not in the range of [0,63]!") if self.action is None: - return False + raise ValueError("Action is None!") own, enemy = self.get_own_and_enemy() flipped = calc_flip(self.action, own, enemy) if bit_count(flipped) == 0: - self.illegal_move_to_lose(self.action) - return False + # self.illegal_move_to_lose(self.action) + raise ValueError("Illegal action!") own ^= flipped own |= 1 << self.action enemy ^= flipped - self.set_own_and_enemy(own, enemy) - return True def _game_over(self): # self.done = True - ''' + if self.winner is None: black_num, white_num = self.number_of_black_and_white - if black_num > white_num: + self.black_win = black_num - white_num + if self.black_win > 0: 
self.winner = 1 - elif black_num < white_num: + elif self.black_win < 0: self.winner = -1 else: self.winner = 0 - ''' - if self.black_win is None: - black_num, white_num = self.number_of_black_and_white def illegal_move_to_lose(self, action): self._game_over() - def get_own_and_enemy(self): - if self.color == 1: + def get_own_and_enemy(self, is_next=False): + if is_next: + color = 0 - self.color + else: + color = self.color + if color == 1: own, enemy = self.black, self.white - elif self.color == -1: + elif color == -1: own, enemy = self.white, self.black else: own, enemy = None, None @@ -265,6 +287,17 @@ class Reversi: else: self.white, self.black = own, enemy + def _deflatten(self, idx): + x = idx // self.size + 1 + y = idx % self.size + 1 + return (x, y) + + def _flatten(self, vertex): + x, y = vertex + if (x == 0) and (y == 0): + return 64 + return (x - 1) * self.size + (y - 1) + @property def number_of_black_and_white(self): return bit_count(self.black), bit_count(self.white) From 3f238864fbfe20843900de12513aec75b8a59943 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Sat, 23 Dec 2017 15:58:06 +0800 Subject: [PATCH 56/98] minor fixes for mcts, check finish for go --- AlphaGo/go.py | 13 ++++++++----- tianshou/core/mcts/mcts.py | 12 ++++++++---- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index b819c08..fe2ab74 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -212,11 +212,14 @@ class Go: def simulate_step_forward(self, state, action): # initialize the simulate_board from state history_boards, color = state - vertex = self._action2vertex(action) - new_board = self._do_move(copy.copy(history_boards[-1]), color, vertex) - history_boards.append(new_board) - new_color = -color - return [history_boards, new_color], 0 + if history_boards[-1] == history_boards[-2] and action is utils.PASS: + return None, 2 * (float(self.executor_get_score(history_boards[-1]) > 
0)-0.5) * color + else: + vertex = self._action2vertex(action) + new_board = self._do_move(copy.copy(history_boards[-1]), color, vertex) + history_boards.append(new_board) + new_color = -color + return [history_boards, new_color], 0 def executor_do_move(self, history, latest_boards, current_board, color, vertex): if not self._rule_check(history, current_board, color, vertex): diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index e8f3709..e99373c 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -38,6 +38,7 @@ class MCTSNode(object): def valid_mask(self, simulator): pass + class UCTNode(MCTSNode): def __init__(self, parent, action, state, action_num, prior, inverse=False): super(UCTNode, self).__init__(parent, action, state, action_num, prior, inverse) @@ -71,10 +72,13 @@ class UCTNode(MCTSNode): self.parent.backpropagation(self.children[action].reward) def valid_mask(self, simulator): - # let all invalid actions be illeagel in mcts - if self.mask is None: - self.mask = simulator.simulate_get_mask(self.state, range(self.action_num)) - self.ucb[self.mask] = -float("Inf") + # let all invalid actions be illegal in mcts + if not hasattr(simulator, 'simulate_get_mask'): + pass + else: + if self.mask is None: + self.mask = simulator.simulate_get_mask(self.state, range(self.action_num)) + self.ucb[self.mask] = -float("Inf") class TSNode(MCTSNode): From 4589fcf52194eccc219f82e36345573541511674 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Sat, 23 Dec 2017 16:27:09 +0800 Subject: [PATCH 57/98] add random preprocess, modify the uniform sample from training data --- AlphaGo/model.py | 72 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 65 insertions(+), 7 deletions(-) diff --git a/AlphaGo/model.py b/AlphaGo/model.py index 22e8626..68973ac 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -1,7 +1,6 @@ import os import time -import random -import sys +import copy import cPickle from 
collections import deque @@ -224,11 +223,21 @@ class ResNet(object): else: start_time = time.time() for i in range(batch_size): - game_num = random.randint(0, self.window_length-1) - state_num = random.randint(0, self.training_data['length'][game_num]-1) - training_data['states'].append(np.expand_dims(self.training_data['states'][game_num][state_num], 0)) - training_data['probs'].append(np.expand_dims(self.training_data['probs'][game_num][state_num], 0)) - training_data['winner'].append(np.expand_dims(self.training_data['winner'][game_num][state_num], 0)) + priority = self.training_data['length'] / sum(self.training_data['length']) + game_num = np.random.choice(self.window_length, 1, p=priority) + state_num = np.random.randint(self.training_data['length'][game_num]) + rotate_times = np.random.randint(4) + reflect_times = np.random.randint(2) + reflect_orientation = np.random.randint(2) + training_data['states'].append( + self._preprocession(self.training_data['states'][game_num][state_num], reflect_times, + reflect_orientation, rotate_times)) + training_data['probs'].append( + self._preprocession(self.training_data['probs'][game_num][state_num], reflect_times, + reflect_orientation, rotate_times)) + training_data['winner'].append( + self._preprocession(self.training_data['winner'][game_num][state_num], reflect_times, + reflect_orientation, rotate_times)) value_loss, policy_loss, reg, _ = self.sess.run( [self.value_loss, self.policy_loss, self.reg, self.train_op], feed_dict={self.x: np.concatenate(training_data['states'], axis=0), @@ -280,6 +289,55 @@ class ResNet(object): winner = np.concatenate(winner, axis=0) return states, probs, winner + def _preprocession(self, board, reflect_times=0, reflect_orientation=0, rotate_times=0): + """ + preprocessing for augmentation + + :param board: a ndarray, board to process + :param reflect_times: an integer, how many times to reflect + :param reflect_orientation: an integer, which orientation to reflect + :param rotate_times: 
an integer, how many times to rotate + :return: + """ + + new_board = copy.copy(board) + if new_board.ndim == 3: + np.expand_dims(new_board, axis=0) + + new_board = self._board_reflection(new_board, reflect_times, reflect_orientation) + new_board = self._board_rotation(new_board, rotate_times) + + return new_board + + def _board_rotation(self, board, times): + """ + rotate the board for augmentation + note that board's shape should be [batch_size, board_size, board_size, channels] + + :param board: a ndarray, shape [batch_size, board_size, board_size, channels] + :param times: an integer, how many times to rotate + :return: + """ + return np.rot90(board, times, (1, 2)) + + def _board_reflection(self, board, times, orientation): + """ + reflect the board for augmentation + note that board's shape should be [batch_size, board_size, board_size, channels] + + :param board: a ndarray, shape [batch_size, board_size, board_size, channels] + :param times: an integer, how many times to reflect + :param orientation: an integer, which orientation to reflect + :return: + """ + new_board = copy.copy(board) + for _ in range(times): + if orientation == 0: + new_board = new_board[:, ::-1] + if orientation == 1: + new_board = new_board[:, :, ::-1] + return new_board + if __name__ == "__main__": model = ResNet(board_size=9, action_num=82, history_length=8) From b21a55dc88fefe7773b842e87af2d6b3eaab821b Mon Sep 17 00:00:00 2001 From: haoshengzou Date: Sat, 23 Dec 2017 17:25:16 +0800 Subject: [PATCH 58/98] towards policy/value refactor --- examples/dqn_example.py | 11 +++++------ tianshou/core/README.md | 6 +++++- tianshou/core/losses.py | 7 +++---- tianshou/core/policy/base.py | 18 +++++------------- tianshou/core/policy/dqn.py | 17 +++++++++++++---- tianshou/core/policy/stochastic.py | 6 ------ tianshou/core/value_function/action_value.py | 9 +++++---- tianshou/core/value_function/base.py | 5 +++-- tianshou/core/value_function/state_value.py | 4 ++-- 9 files changed, 41 
insertions(+), 42 deletions(-) diff --git a/examples/dqn_example.py b/examples/dqn_example.py index b676475..cf20d66 100644 --- a/examples/dqn_example.py +++ b/examples/dqn_example.py @@ -1,8 +1,6 @@ #!/usr/bin/env python import tensorflow as tf -import numpy as np -import time import gym # our lib imports here! @@ -10,7 +8,7 @@ import sys sys.path.append('..') import tianshou.core.losses as losses from tianshou.data.replay_buffer.utils import get_replay_buffer -import tianshou.core.policy as policy +import tianshou.core.policy.dqn as policy def policy_net(observation, action_dim): @@ -41,6 +39,8 @@ if __name__ == '__main__': # pass the observation variable to the replay buffer or find a more reasonable way to help replay buffer # access this observation variable. observation = tf.placeholder(tf.float32, shape=(None,) + observation_dim, name="dqn_observation") # network input + action = tf.placeholder(dtype=tf.int32, shape=(None,)) # batch of integer actions + with tf.variable_scope('q_net'): q_values = policy_net(observation, action_dim) @@ -48,10 +48,9 @@ if __name__ == '__main__': q_values_target = policy_net(observation, action_dim) # 2. 
build losses, optimizers - q_net = policy.DQN(q_values, observation_placeholder=observation) # YongRen: policy.DQN - target_net = policy.DQN(q_values_target, observation_placeholder=observation) + q_net = policy.DQNRefactor(q_values, observation_placeholder=observation, action_placeholder=action) # YongRen: policy.DQN + target_net = policy.DQNRefactor(q_values_target, observation_placeholder=observation, action_placeholder=action) - action = tf.placeholder(dtype=tf.int32, shape=[None]) # batch of integer actions target = tf.placeholder(dtype=tf.float32, shape=[None]) # target value for DQN dqn_loss = losses.dqn_loss(action, target, q_net) # TongzhengRen diff --git a/tianshou/core/README.md b/tianshou/core/README.md index 3617525..a9cda58 100644 --- a/tianshou/core/README.md +++ b/tianshou/core/README.md @@ -21,4 +21,8 @@ referencing QValuePolicy in base.py, should have at least the listed methods. TongzhengRen -seems to be direct python functions. Though the management of placeholders may require some discussion. also may write it in a functional form. \ No newline at end of file +seems to be direct python functions. Though the management of placeholders may require some discussion. also may write it in a functional form. + +# policy, value_function + +naming should be reconsidered. Perhaps use plural forms for all nouns \ No newline at end of file diff --git a/tianshou/core/losses.py b/tianshou/core/losses.py index 3461afb..5d5d2f3 100644 --- a/tianshou/core/losses.py +++ b/tianshou/core/losses.py @@ -35,17 +35,16 @@ def vanilla_policy_gradient(sampled_action, reward, pi, baseline="None"): # TODO: Different baseline methods like REINFORCE, etc. 
return vanilla_policy_gradient_loss -def dqn_loss(sampled_action, sampled_target, q_net): +def dqn_loss(sampled_action, sampled_target, policy): """ deep q-network :param sampled_action: placeholder of sampled actions during the interaction with the environment :param sampled_target: estimated Q(s,a) - :param q_net: current `policy` to be optimized + :param policy: current `policy` to be optimized :return: """ - action_num = q_net.values_tensor().get_shape()[1] - sampled_q = tf.reduce_sum(q_net.values_tensor() * tf.one_hot(sampled_action, action_num), axis=1) + sampled_q = policy.q_net.value_tensor return tf.reduce_mean(tf.square(sampled_target - sampled_q)) def deterministic_policy_gradient(sampled_state, critic): diff --git a/tianshou/core/policy/base.py b/tianshou/core/policy/base.py index 1adeaeb..1c1e1c5 100644 --- a/tianshou/core/policy/base.py +++ b/tianshou/core/policy/base.py @@ -3,19 +3,12 @@ from __future__ import absolute_import from __future__ import division -import warnings import tensorflow as tf # from zhusuan.utils import add_name_scope -__all__ = [ - 'StochasticPolicy', - 'QValuePolicy', - 'PolicyBase' -] - # TODO: a even more "base" class for policy @@ -23,8 +16,8 @@ class PolicyBase(object): """ base class for policy. only provides `act` method with exploration """ - def __init__(self): - pass + def __init__(self, observation_placeholder): + self._observation_placeholder = observation_placeholder def act(self, observation, exploration): raise NotImplementedError() @@ -37,14 +30,14 @@ class QValuePolicy(object): def __init__(self, observation_placeholder): self._observation_placeholder = observation_placeholder - def act(self, observation, exploration=None): # first implement no exploration + def act(self, observation, exploration=None): # first implement no exploration """ return the action (int) to be executed. no exploration when exploration=None. 
""" self._act(observation, exploration) - def _act(self, observation, exploration = None): + def _act(self, observation, exploration=None): raise NotImplementedError() def values(self, observation): @@ -60,7 +53,6 @@ class QValuePolicy(object): pass - class StochasticPolicy(object): """ The :class:`Distribution` class is the base class for various probabilistic @@ -130,7 +122,7 @@ class StochasticPolicy(object): param_dtype, is_continuous, observation_placeholder, - group_ndims=0, # maybe useful for repeat_action + group_ndims=0, # maybe useful for repeat_action **kwargs): self._act_dtype = act_dtype diff --git a/tianshou/core/policy/dqn.py b/tianshou/core/policy/dqn.py index 716e4c4..8533549 100644 --- a/tianshou/core/policy/dqn.py +++ b/tianshou/core/policy/dqn.py @@ -10,16 +10,25 @@ class DQNRefactor(PolicyBase): use DQN from value_function as a member """ def __init__(self, value_tensor, observation_placeholder, action_placeholder): - self._network = DQN(value_tensor, observation_placeholder, action_placeholder) + self._q_net = DQN(value_tensor, observation_placeholder, action_placeholder) self._argmax_action = tf.argmax(value_tensor, axis=1) - def act(self, observation, exploration): + super(DQNRefactor, self).__init__(observation_placeholder=observation_placeholder) + + def act(self, observation, exploration=None): sess = tf.get_default_session() if not exploration: # no exploration - action = sess.run(self._argmax_action, feed_dict={}) + action = sess.run(self._argmax_action, feed_dict={self._observation_placeholder: observation}) -class DQN(QValuePolicy): + return action + + @property + def q_net(self): + return self._q_net + + +class DQNOld(QValuePolicy): """ The policy as in DQN """ diff --git a/tianshou/core/policy/stochastic.py b/tianshou/core/policy/stochastic.py index 3ef463e..d7a75d7 100644 --- a/tianshou/core/policy/stochastic.py +++ b/tianshou/core/policy/stochastic.py @@ -10,12 +10,6 @@ import tensorflow as tf from .base import StochasticPolicy 
-__all__ = [ - 'OnehotCategorical', - 'OnehotDiscrete', -] - - class OnehotCategorical(StochasticPolicy): """ The class of one-hot Categorical distribution. diff --git a/tianshou/core/value_function/action_value.py b/tianshou/core/value_function/action_value.py index 2bda4fa..c62dae6 100644 --- a/tianshou/core/value_function/action_value.py +++ b/tianshou/core/value_function/action_value.py @@ -15,7 +15,7 @@ class ActionValue(ValueFunctionBase): observation_placeholder=observation_placeholder ) - def get_value(self, observation, action): + def eval_value(self, observation, action): """ :param observation: numpy array of observations, of shape (batchsize, observation_dim). :param action: numpy array of actions, of shape (batchsize, action_dim) @@ -24,7 +24,7 @@ class ActionValue(ValueFunctionBase): # TODO: dealing with the last dim of 1 in V(s) and Q(s, a) """ sess = tf.get_default_session() - return sess.run(self.get_value_tensor(), feed_dict= + return sess.run(self.value_tensor, feed_dict= {self._observation_placeholder: observation, self._action_placeholder: action}) @@ -50,7 +50,7 @@ class DQN(ActionValue): observation_placeholder=observation_placeholder, action_placeholder=action_placeholder) - def get_value_all_actions(self, observation): + def eval_value_all_actions(self, observation): """ :param observation: :return: numpy array of Q(s, *) given s, of shape (batchsize, num_actions) @@ -58,5 +58,6 @@ class DQN(ActionValue): sess = tf.get_default_session() return sess.run(self._value_tensor_all_actions, feed_dict={self._observation_placeholder: observation}) - def get_value_tensor_all_actions(self): + @property + def value_tensor_all_actions(self): return self._value_tensor_all_actions \ No newline at end of file diff --git a/tianshou/core/value_function/base.py b/tianshou/core/value_function/base.py index b15f1bf..8ca9dd0 100644 --- a/tianshou/core/value_function/base.py +++ b/tianshou/core/value_function/base.py @@ -11,14 +11,15 @@ class 
ValueFunctionBase(object): self._observation_placeholder = observation_placeholder self._value_tensor = tf.squeeze(value_tensor) # canonical values has shape (batchsize, ) - def get_value(self, **kwargs): + def eval_value(self, **kwargs): """ :return: batch of corresponding values in numpy array """ raise NotImplementedError() - def get_value_tensor(self): + @property + def value_tensor(self): """ :return: tensor of the corresponding values diff --git a/tianshou/core/value_function/state_value.py b/tianshou/core/value_function/state_value.py index b7de196..02c12fe 100644 --- a/tianshou/core/value_function/state_value.py +++ b/tianshou/core/value_function/state_value.py @@ -14,7 +14,7 @@ class StateValue(ValueFunctionBase): observation_placeholder=observation_placeholder ) - def get_value(self, observation): + def eval_value(self, observation): """ :param observation: numpy array of observations, of shape (batchsize, observation_dim). @@ -22,4 +22,4 @@ class StateValue(ValueFunctionBase): # TODO: dealing with the last dim of 1 in V(s) and Q(s, a), this should rely on the action shape returned by env """ sess = tf.get_default_session() - return sess.run(self.get_value_tensor(), feed_dict={self._observation_placeholder: observation}) \ No newline at end of file + return sess.run(self.value_tensor, feed_dict={self._observation_placeholder: observation}) \ No newline at end of file From 919784e88b011028ff5e8b8e226974a9bbf8d75c Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Sat, 23 Dec 2017 17:43:33 +0800 Subject: [PATCH 59/98] bug fix of model.py --- AlphaGo/model.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/AlphaGo/model.py b/AlphaGo/model.py index 68973ac..2dc1ef0 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -101,7 +101,7 @@ class ResNet(object): self._build_network(residual_block_num, self.checkpoint_path) # training hyper-parameters: - self.window_length = 7000 + self.window_length = 3 self.save_freq = 5000 
self.training_data = {'states': deque(maxlen=self.window_length), 'probs': deque(maxlen=self.window_length), 'winner': deque(maxlen=self.window_length), 'length': deque(maxlen=self.window_length)} @@ -223,8 +223,8 @@ class ResNet(object): else: start_time = time.time() for i in range(batch_size): - priority = self.training_data['length'] / sum(self.training_data['length']) - game_num = np.random.choice(self.window_length, 1, p=priority) + priority = np.array(self.training_data['length']) / (0.0 + np.sum(np.array(self.training_data['length']))) + game_num = np.random.choice(self.window_length, 1, p=priority)[0] state_num = np.random.randint(self.training_data['length'][game_num]) rotate_times = np.random.randint(4) reflect_times = np.random.randint(2) @@ -232,12 +232,10 @@ class ResNet(object): training_data['states'].append( self._preprocession(self.training_data['states'][game_num][state_num], reflect_times, reflect_orientation, rotate_times)) - training_data['probs'].append( - self._preprocession(self.training_data['probs'][game_num][state_num], reflect_times, - reflect_orientation, rotate_times)) - training_data['winner'].append( - self._preprocession(self.training_data['winner'][game_num][state_num], reflect_times, - reflect_orientation, rotate_times)) + training_data['probs'].append(np.concatenate( + [self._preprocession(self.training_data['probs'][game_num][state_num][:-1].reshape(self.board_size, self.board_size, 1), reflect_times, + reflect_orientation, rotate_times).reshape(1, self.board_size**2), self.training_data['probs'][game_num][state_num][-1].reshape(1,1)], axis=1)) + training_data['winner'].append(self.training_data['winner'][game_num][state_num].reshape(1, 1)) value_loss, policy_loss, reg, _ = self.sess.run( [self.value_loss, self.policy_loss, self.reg, self.train_op], feed_dict={self.x: np.concatenate(training_data['states'], axis=0), @@ -302,7 +300,7 @@ class ResNet(object): new_board = copy.copy(board) if new_board.ndim == 3: - 
np.expand_dims(new_board, axis=0) + new_board = np.expand_dims(new_board, axis=0) new_board = self._board_reflection(new_board, reflect_times, reflect_orientation) new_board = self._board_rotation(new_board, rotate_times) From dcf293d63749e0d9febdc8bf9e2ea1795be112ba Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Sat, 23 Dec 2017 22:05:34 +0800 Subject: [PATCH 60/98] count the winning rate for each player --- AlphaGo/.gitignore | 1 + AlphaGo/data_statistic.py | 29 +++++++++++++++++++++++++++++ AlphaGo/game.py | 2 +- 3 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 AlphaGo/data_statistic.py diff --git a/AlphaGo/.gitignore b/AlphaGo/.gitignore index 9c2fe16..e578e5a 100644 --- a/AlphaGo/.gitignore +++ b/AlphaGo/.gitignore @@ -1,3 +1,4 @@ data checkpoints checkpoints_origin +*.log diff --git a/AlphaGo/data_statistic.py b/AlphaGo/data_statistic.py new file mode 100644 index 0000000..6fedf1c --- /dev/null +++ b/AlphaGo/data_statistic.py @@ -0,0 +1,29 @@ +import os +import cPickle + +class Data(object): + def __init__(self): + self.boards = [] + self.probs = [] + self.winner = 0 + +def file_to_training_data(file_name): + with open(file_name, 'rb') as file: + try: + file.seek(0) + data = cPickle.load(file) + return data.winner + except Exception as e: + print(e) + return 0 + +if __name__ == "__main__": + win_count = [0, 0, 0] + file_list = os.listdir("./data") + #print file_list + for file in file_list: + win_count[file_to_training_data("./data/" + file)] += 1 + print "Total play : " + str(len(file_list)) + print "Black wins : " + str(win_count[1]) + print "White wins : " + str(win_count[-1]) + diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 90d0bf0..9fc8fa2 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -62,7 +62,7 @@ class Game: def think(self, latest_boards, color): mcts = MCTS(self.game_engine, self.evaluator, [latest_boards, color], self.size ** 2 + 1, inverse=True) - mcts.search(max_step=20) + mcts.search(max_step=100) temp = 1 prob = 
mcts.root.N ** temp / np.sum(mcts.root.N ** temp) choice = np.random.choice(self.size ** 2 + 1, 1, p=prob).tolist()[0] From 162aa313b6b75f255b8690b9c809f4e2c5f81fd4 Mon Sep 17 00:00:00 2001 From: JialianLee Date: Sun, 24 Dec 2017 00:42:59 +0800 Subject: [PATCH 61/98] A new version of reversi --- AlphaGo/reversi.py | 505 ++++++++++++++++++--------------------------- 1 file changed, 202 insertions(+), 303 deletions(-) diff --git a/AlphaGo/reversi.py b/AlphaGo/reversi.py index ead6f4e..4fa1468 100644 --- a/AlphaGo/reversi.py +++ b/AlphaGo/reversi.py @@ -1,303 +1,202 @@ -from __future__ import print_function -import numpy as np - -''' -Settings of the Go game. - -(1, 1) is considered as the upper left corner of the board, -(size, 1) is the lower left -''' - - -def find_correct_moves(own, enemy): - """return legal moves""" - left_right_mask = 0x7e7e7e7e7e7e7e7e # Both most left-right edge are 0, else 1 - top_bottom_mask = 0x00ffffffffffff00 # Both most top-bottom edge are 0, else 1 - mask = left_right_mask & top_bottom_mask - mobility = 0 - mobility |= search_offset_left(own, enemy, left_right_mask, 1) # Left - mobility |= search_offset_left(own, enemy, mask, 9) # Left Top - mobility |= search_offset_left(own, enemy, top_bottom_mask, 8) # Top - mobility |= search_offset_left(own, enemy, mask, 7) # Top Right - mobility |= search_offset_right(own, enemy, left_right_mask, 1) # Right - mobility |= search_offset_right(own, enemy, mask, 9) # Bottom Right - mobility |= search_offset_right(own, enemy, top_bottom_mask, 8) # Bottom - mobility |= search_offset_right(own, enemy, mask, 7) # Left bottom - return mobility - - -def calc_flip(pos, own, enemy): - """return flip stones of enemy by bitboard when I place stone at pos. - - :param pos: 0~63 - :param own: bitboard (0=top left, 63=bottom right) - :param enemy: bitboard - :return: flip stones of enemy when I place stone at pos. 
- """ - f1 = _calc_flip_half(pos, own, enemy) - f2 = _calc_flip_half(63 - pos, rotate180(own), rotate180(enemy)) - return f1 | rotate180(f2) - - -def _calc_flip_half(pos, own, enemy): - el = [enemy, enemy & 0x7e7e7e7e7e7e7e7e, enemy & 0x7e7e7e7e7e7e7e7e, enemy & 0x7e7e7e7e7e7e7e7e] - masks = [0x0101010101010100, 0x00000000000000fe, 0x0002040810204080, 0x8040201008040200] - masks = [b64(m << pos) for m in masks] - flipped = 0 - for e, mask in zip(el, masks): - outflank = mask & ((e | ~mask) + 1) & own - flipped |= (outflank - (outflank != 0)) & mask - return flipped - - -def search_offset_left(own, enemy, mask, offset): - e = enemy & mask - blank = ~(own | enemy) - t = e & (own >> offset) - t |= e & (t >> offset) - t |= e & (t >> offset) - t |= e & (t >> offset) - t |= e & (t >> offset) - t |= e & (t >> offset) # Up to six stones can be turned at once - return blank & (t >> offset) # Only the blank squares can be started - - -def search_offset_right(own, enemy, mask, offset): - e = enemy & mask - blank = ~(own | enemy) - t = e & (own << offset) - t |= e & (t << offset) - t |= e & (t << offset) - t |= e & (t << offset) - t |= e & (t << offset) - t |= e & (t << offset) # Up to six stones can be turned at once - return blank & (t << offset) # Only the blank squares can be started - - -def flip_vertical(x): - k1 = 0x00FF00FF00FF00FF - k2 = 0x0000FFFF0000FFFF - x = ((x >> 8) & k1) | ((x & k1) << 8) - x = ((x >> 16) & k2) | ((x & k2) << 16) - x = (x >> 32) | b64(x << 32) - return x - - -def b64(x): - return x & 0xFFFFFFFFFFFFFFFF - - -def bit_count(x): - return bin(x).count('1') - - -def bit_to_array(x, size): - """bit_to_array(0b0010, 4) -> array([0, 1, 0, 0])""" - return np.array(list(reversed((("0" * size) + bin(x)[2:])[-size:])), dtype=np.uint8) - - -def flip_diag_a1h8(x): - k1 = 0x5500550055005500 - k2 = 0x3333000033330000 - k4 = 0x0f0f0f0f00000000 - t = k4 & (x ^ b64(x << 28)) - x ^= t ^ (t >> 28) - t = k2 & (x ^ b64(x << 14)) - x ^= t ^ (t >> 14) - t = k1 & (x ^ 
b64(x << 7)) - x ^= t ^ (t >> 7) - return x - - -def rotate90(x): - return flip_diag_a1h8(flip_vertical(x)) - - -def rotate180(x): - return rotate90(rotate90(x)) - - -class Reversi: - def __init__(self, black=None, white=None): - self.black = black or (0b00001000 << 24 | 0b00010000 << 32) - self.white = white or (0b00010000 << 24 | 0b00001000 << 32) - self.board = None # 8 * 8 board with 1 for black, -1 for white and 0 for blank - self.color = None # 1 for black and -1 for white - self.action = None # number in 0~63 - self.winner = None - self.black_win = None - self.size = 8 - - def get_board(self, black=None, white=None): - self.black = black or (0b00001000 << 24 | 0b00010000 << 32) - self.white = white or (0b00010000 << 24 | 0b00001000 << 32) - self.board = self.bitboard2board() - return self.board - - def is_valid(self, is_next=False): - self.board2bitboard() - own, enemy = self.get_own_and_enemy(is_next) - mobility = find_correct_moves(own, enemy) - valid_moves = bit_to_array(mobility, 64) - valid_moves = np.argwhere(valid_moves) - valid_moves = list(np.reshape(valid_moves, len(valid_moves))) - return valid_moves - - def simulate_get_mask(self, state, action_set): - history_boards, color = state - board = history_boards[-1] - self.board = board - self.color = color - valid_moves = self.is_valid() - # TODO it seems that the pass move is not considered - if not len(valid_moves): - invalid_action_mask = action_set[0:-1] - else: - invalid_action_mask = [] - for action in action_set: - if action not in valid_moves: - invalid_action_mask.append(action) - return invalid_action_mask - - def simulate_step_forward(self, state, action): - self.board = state[0] - self.color = state[1] - self.board2bitboard() - self.action = action - if self.action == 64: - valid_moves = self.is_valid(is_next=True) - if not len(valid_moves): - self._game_over() - return None, self.winner * self.color - else: - return [self.board, 0 - self.color], 0 - self.step() - new_board = 
self.bitboard2board() - return [new_board, 0 - self.color], 0 - - def executor_do_move(self, board, color, vertex): - self.board = board - self.color = color - self.board2bitboard() - self.action = self._flatten(vertex) - if self.action == 64: - valid_moves = self.is_valid(is_next=True) - if not len(valid_moves): - return False - else: - return True - else: - self.step() - new_board = self.bitboard2board() - for i in range(64): - board[i] = new_board[i] - return True - - def executor_get_score(self, board): - self.board = board - self._game_over() - if self.black_win is not None: - return self.black_win - else: - raise ValueError("Game not finished!") - - def board2bitboard(self): - count = 1 - if self.board is None: - raise ValueError("None board!") - self.black = 0 - self.white = 0 - for i in range(64): - if self.board[i] == 1: - self.black |= count - elif self.board[i] == -1: - self.white |= count - count *= 2 - ''' - def vertex2action(self, vertex): - x, y = vertex - if x == 0 and y == 0: - self.action = None - else: - self.action = 8 * (x - 1) + y - 1 - ''' - - def bitboard2board(self): - board = [] - black = bit_to_array(self.black, 64) - white = bit_to_array(self.white, 64) - for i in range(64): - if black[i]: - board.append(1) - elif white[i]: - board.append(-1) - else: - board.append(0) - return board - - def step(self): - if self.action < 0 or self.action > 63: - raise ValueError("Action not in the range of [0,63]!") - if self.action is None: - raise ValueError("Action is None!") - - own, enemy = self.get_own_and_enemy() - - flipped = calc_flip(self.action, own, enemy) - if bit_count(flipped) == 0: - # self.illegal_move_to_lose(self.action) - raise ValueError("Illegal action!") - own ^= flipped - own |= 1 << self.action - enemy ^= flipped - self.set_own_and_enemy(own, enemy) - - def _game_over(self): - # self.done = True - - if self.winner is None: - black_num, white_num = self.number_of_black_and_white - self.black_win = black_num - white_num - if 
self.black_win > 0: - self.winner = 1 - elif self.black_win < 0: - self.winner = -1 - else: - self.winner = 0 - - def illegal_move_to_lose(self, action): - self._game_over() - - def get_own_and_enemy(self, is_next=False): - if is_next: - color = 0 - self.color - else: - color = self.color - if color == 1: - own, enemy = self.black, self.white - elif color == -1: - own, enemy = self.white, self.black - else: - own, enemy = None, None - return own, enemy - - def set_own_and_enemy(self, own, enemy): - if self.color == 1: - self.black, self.white = own, enemy - else: - self.white, self.black = own, enemy - - def _deflatten(self, idx): - x = idx // self.size + 1 - y = idx % self.size + 1 - return (x, y) - - def _flatten(self, vertex): - x, y = vertex - if (x == 0) and (y == 0): - return 64 - return (x - 1) * self.size + (y - 1) - - @property - def number_of_black_and_white(self): - return bit_count(self.black), bit_count(self.white) +import numpy as np +''' +Settings of the Reversi game. + +(1, 1) is considered as the upper left corner of the board, +(size, 1) is the lower left +''' + + +class Reversi: + def __init__(self, black=None, white=None): + self.board = None # 8 * 8 board with 1 for black, -1 for white and 0 for blank + self.color = None # 1 for black and -1 for white + self.action = None # number in 0~63 + self.winner = None + self.black_win = None + self.size = 8 + + def _deflatten(self, idx): + x = idx // self.size + 1 + y = idx % self.size + 1 + return (x, y) + + def _flatten(self, vertex): + x, y = vertex + if (x == 0) and (y == 0): + return 64 + return (x - 1) * self.size + (y - 1) + + def get_board(self, board=None): + self.board = board or np.zeros([8,8]) + self.board[3, 3] = -1 + self.board[4, 4] = -1 + self.board[3, 4] = 1 + self.board[4, 3] = 1 + return self.board + + def _find_correct_moves(self, is_next=False): + moves = [] + if is_next: + color = 0 - self.color + else: + color = self.color + for i in range(64): + x, y = self._deflatten(i) + valid 
= self._is_valid(x - 1, y - 1, color) + if valid: + moves.append(i) + return moves + + def _one_direction_valid(self, x, y, color): + if (x >= 0) and (x < self.size): + if (y >= 0) and (y < self.size): + if self.board[x, y] == color: + return True + return False + + def _is_valid(self, x, y, color): + if self.board[x, y]: + return False + for x_direction in [-1, 0, 1]: + for y_direction in [-1, 0, 1]: + new_x = x + new_y = y + flag = 0 + while True: + new_x += x_direction + new_y += y_direction + if self._one_direction_valid(new_x, new_y, 0 - color): + flag = 1 + else: + break + if self._one_direction_valid(new_x, new_y, color) and flag: + return True + return False + + def simulate_get_mask(self, state, action_set): + history_boards, color = state + self.board = np.reshape(history_boards[-1], (self.size, self.size)) + self.color = color + valid_moves = self._find_correct_moves() + print(valid_moves) + if not len(valid_moves): + invalid_action_mask = action_set[0:-1] + else: + invalid_action_mask = [] + for action in action_set: + if action not in valid_moves: + invalid_action_mask.append(action) + return invalid_action_mask + + def simulate_step_forward(self, state, action): + self.board = state[0].copy() + self.board = np.reshape(self.board, (self.size, self.size)) + self.color = state[1] + self.action = action + if self.action == 64: + valid_moves = self._find_correct_moves(is_next=True) + if not len(valid_moves): + self._game_over() + return None, self.winner * self.color + else: + return [self.board, 0 - self.color], 0 + self._step() + return [self.board, 0 - self.color], 0 + + def _game_over(self): + black_num, white_num = self._number_of_black_and_white() + self.black_win = black_num - white_num + if self.black_win > 0: + self.winner = 1 + elif self.black_win < 0: + self.winner = -1 + else: + self.winner = 0 + + def _number_of_black_and_white(self): + black_num = 0 + white_num = 0 + board_list = np.reshape(self.board, self.size ** 2) + for i in 
range(len(board_list)): + if board_list[i] == 1: + black_num += 1 + elif board_list[i] == -1: + white_num += 1 + return black_num, white_num + + def _step(self): + if self.action < 0 or self.action > 63: + raise ValueError("Action not in the range of [0,63]!") + if self.action is None: + raise ValueError("Action is None!") + x, y = self._deflatten(self.action) + valid = self._flip(x -1, y - 1) + if not valid: + raise ValueError("Illegal action!") + + def _flip(self, x, y): + valid = 0 + self.board[x, y] = self.color + for x_direction in [-1, 0, 1]: + for y_direction in [-1, 0, 1]: + new_x = x + new_y = y + flag = 0 + while True: + new_x += x_direction + new_y += y_direction + if self._one_direction_valid(new_x, new_y, 0 - self.color): + flag = 1 + else: + break + if self._one_direction_valid(new_x, new_y, self.color) and flag: + valid = 1 + flip_x = x + flip_y = y + while True: + flip_x += x_direction + flip_y += y_direction + if self._one_direction_valid(flip_x, flip_y, 0 - self.color): + self.board[flip_x, flip_y] = self.color + else: + break + if valid: + return True + else: + return False + + def executor_do_move(self, history, latest_boards, board, color, vertex): + self.board = np.reshape(board, (self.size, self.size)) + self.color = color + self.action = self._flatten(vertex) + if self.action == 64: + valid_moves = self._find_correct_moves(is_next=True) + if not len(valid_moves): + return False + else: + return True + else: + self._step() + return True + + def executor_get_score(self, board): + self.board = board + self._game_over() + if self.black_win is not None: + return self.black_win + else: + raise ValueError("Game not finished!") + + +if __name__ == "__main__": + reversi = Reversi() + # board = reversi.get_board() + # print(board) + # state, value = reversi.simulate_step_forward([board, -1], 20) + # print(state[0]) + # print("board") + # print(board) + # r = reversi.executor_get_score(board) + # print(r) + From 426251e15852e894a0ac200838fd8dec3078f62c 
Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Sun, 24 Dec 2017 01:07:46 +0800 Subject: [PATCH 62/98] add some code for debug and profiling --- AlphaGo/game.py | 10 +++++++--- AlphaGo/go.py | 1 + AlphaGo/model.py | 3 +++ AlphaGo/play.py | 11 ++++++++--- AlphaGo/player.py | 6 +++++- tianshou/core/mcts/mcts.py | 40 ++++++++++++++++++++++++++++++++++---- 6 files changed, 60 insertions(+), 11 deletions(-) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 9fc8fa2..442cb73 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -17,6 +17,7 @@ from tianshou.core.mcts.mcts import MCTS import go import reversi +import time class Game: ''' @@ -25,8 +26,10 @@ class Game: TODO : Maybe merge with the engine class in future, currently leave it untouched for interacting with Go UI. ''' - def __init__(self, name="go", checkpoint_path=None): + def __init__(self, name="go", role="unknown", debug=False, checkpoint_path=None): self.name = name + self.role = role + self.debug = debug if self.name == "go": self.size = 9 self.komi = 3.75 @@ -36,7 +39,7 @@ class Game: self.latest_boards = deque(maxlen=8) for _ in range(8): self.latest_boards.append(self.board) - self.game_engine = go.Go(size=self.size, komi=self.komi) + self.game_engine = go.Go(size=self.size, komi=self.komi, role=self.role) elif self.name == "reversi": self.size = 8 self.history_length = 1 @@ -61,7 +64,8 @@ class Game: self.komi = k def think(self, latest_boards, color): - mcts = MCTS(self.game_engine, self.evaluator, [latest_boards, color], self.size ** 2 + 1, inverse=True) + mcts = MCTS(self.game_engine, self.evaluator, [latest_boards, color], + self.size ** 2 + 1, role=self.role, debug=self.debug, inverse=True) mcts.search(max_step=100) temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index fe2ab74..833b01f 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -18,6 +18,7 @@ class Go: def __init__(self, **kwargs): self.size = kwargs['size'] self.komi = 
kwargs['komi'] + self.role = kwargs['role'] def _flatten(self, vertex): x, y = vertex diff --git a/AlphaGo/model.py b/AlphaGo/model.py index 2dc1ef0..2a620f9 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -152,6 +152,9 @@ class ResNet(object): :param color: a string, indicate which one to play :return: a list of tensor, the predicted value and policy given the history and color """ + # Note : maybe we can use it for isolating test of MCTS + #prob = [1.0 / self.action_num] * self.action_num + #return [prob, np.random.uniform(-1, 1)] history, color = state if len(history) != self.history_length: raise ValueError( diff --git a/AlphaGo/play.py b/AlphaGo/play.py index b601ada..9144a40 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -28,6 +28,7 @@ if __name__ == '__main__': parser.add_argument("--black_weight_path", type=str, default=None) parser.add_argument("--white_weight_path", type=str, default=None) parser.add_argument("--id", type=int, default=0) + parser.add_argument("--debug", type=bool, default=False) args = parser.parse_args() if not os.path.exists(args.result_path): @@ -60,11 +61,13 @@ if __name__ == '__main__': white_role_name = 'white' + str(args.id) agent_v0 = subprocess.Popen( - ['python', '-u', 'player.py', '--role=' + black_role_name, '--checkpoint_path=' + str(args.black_weight_path)], + ['python', '-u', 'player.py', '--role=' + black_role_name, + '--checkpoint_path=' + str(args.black_weight_path), '--debug=' + str(args.debug)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) agent_v1 = subprocess.Popen( - ['python', '-u', 'player.py', '--role=' + white_role_name, '--checkpoint_path=' + str(args.white_weight_path)], + ['python', '-u', 'player.py', '--role=' + white_role_name, + '--checkpoint_path=' + str(args.black_weight_path), '--debug=' + str(args.debug)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) server_list = "" @@ -92,7 +95,8 @@ if __name__ == '__main__': evaluate_rounds = 1 
game_num = 0 try: - while True: + #while True: + while game_num < evaluate_rounds: start_time = time.time() num = 0 pass_flag = [False, False] @@ -107,6 +111,7 @@ if __name__ == '__main__': print show[board[i * size + j]] + " ", print "\n", data.boards.append(board) + start_time = time.time() move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn] + '\n') print role[turn] + " : " + str(move), num += 1 diff --git a/AlphaGo/player.py b/AlphaGo/player.py index e848d2b..66a487f 100644 --- a/AlphaGo/player.py +++ b/AlphaGo/player.py @@ -25,11 +25,15 @@ if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument("--checkpoint_path", type=str, default=None) parser.add_argument("--role", type=str, default="unknown") + parser.add_argument("--debug", type=str, default=False) args = parser.parse_args() if args.checkpoint_path == 'None': args.checkpoint_path = None - game = Game(checkpoint_path=args.checkpoint_path) + debug = False + if args.debug == "True": + debug = True + game = Game(role=args.role, checkpoint_path=args.checkpoint_path, debug=debug) engine = GTPEngine(game_obj=game, name='tianshou', version=0) daemon = Pyro4.Daemon() # make a Pyro daemon diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index e99373c..e565337 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -40,16 +40,23 @@ class MCTSNode(object): class UCTNode(MCTSNode): - def __init__(self, parent, action, state, action_num, prior, inverse=False): + def __init__(self, parent, action, state, action_num, prior, debug=False, inverse=False): super(UCTNode, self).__init__(parent, action, state, action_num, prior, inverse) self.Q = np.zeros([action_num]) self.W = np.zeros([action_num]) self.N = np.zeros([action_num]) self.ucb = self.Q + c_puct * self.prior * math.sqrt(np.sum(self.N)) / (self.N + 1) self.mask = None + self.debug=debug + self.elapse_time = 0 + + def clear_elapse_time(self): + self.elapse_time = 0 def selection(self, 
simulator): + head = time.time() self.valid_mask(simulator) + self.elapse_time += time.time() - head action = np.argmax(self.ucb) if action in self.children.keys(): return self.children[action].selection(simulator) @@ -142,15 +149,18 @@ class ActionNode(object): class MCTS(object): - def __init__(self, simulator, evaluator, root, action_num, method="UCT", inverse=False): + def __init__(self, simulator, evaluator, root, action_num, method="UCT", + role="unknown", debug=False, inverse=False): self.simulator = simulator self.evaluator = evaluator + self.role = role + self.debug = debug prior, _ = self.evaluator(root) self.action_num = action_num if method == "": self.root = root if method == "UCT": - self.root = UCTNode(None, None, root, action_num, prior, inverse=inverse) + self.root = UCTNode(None, None, root, action_num, prior, self.debug, inverse=inverse) if method == "TS": self.root = TSNode(None, None, root, action_num, prior, inverse=inverse) self.inverse = inverse @@ -165,14 +175,36 @@ class MCTS(object): if max_step is None and max_time is None: raise ValueError("Need a stop criteria!") + selection_time = 0 + expansion_time = 0 + backprop_time = 0 + self.root.clear_elapse_time() while step < max_step and time.time() - start_time < max_step: - self._expand() + sel_time, exp_time, back_time = self._expand() + selection_time += sel_time + expansion_time += exp_time + backprop_time += back_time step += 1 + if (self.debug): + file = open("debug.txt", "a") + file.write("[" + str(self.role) + "]" + + " selection : " + str(selection_time) + "\t" + + " validmask : " + str(self.root.elapse_time) + "\t" + + " expansion : " + str(expansion_time) + "\t" + + " backprop : " + str(backprop_time) + "\t" + + "\n") + file.close() def _expand(self): + t0 = time.time() node, new_action = self.root.selection(self.simulator) + t1 = time.time() value = node.children[new_action].expansion(self.evaluator, self.action_num) + t2 = time.time() 
node.children[new_action].backpropagation(value + 0.) + t3 = time.time() + return t1 - t0, t2 - t1, t3 - t2 + if __name__ == "__main__": pass From 001263a683c008d2a130b2468b68dcfdcbe5b82f Mon Sep 17 00:00:00 2001 From: Wenbo Hu Date: Sun, 24 Dec 2017 12:07:56 +0800 Subject: [PATCH 63/98] use a simplified version of get_score --- AlphaGo/go.py | 49 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 6 deletions(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 833b01f..37e8e9f 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -3,7 +3,7 @@ import utils import copy import numpy as np from collections import deque - +import time ''' Settings of the Go game. @@ -214,7 +214,7 @@ class Go: # initialize the simulate_board from state history_boards, color = state if history_boards[-1] == history_boards[-2] and action is utils.PASS: - return None, 2 * (float(self.executor_get_score(history_boards[-1]) > 0)-0.5) * color + return None, 2 * (float(self.simple_executor_get_score(history_boards[-1]) > 0)-0.5) * color else: vertex = self._action2vertex(action) new_board = self._do_move(copy.copy(history_boards[-1]), color, vertex) @@ -285,10 +285,7 @@ class Go: return utils.WHITE def executor_get_score(self, current_board): - ''' - is_unknown_estimation: whether use nearby stone to predict the unknown - return score from BLACK perspective. - ''' + #return score from BLACK perspective. _board = copy.deepcopy(current_board) while utils.EMPTY in _board: vertex = self._find_empty(_board) @@ -310,7 +307,46 @@ class Go: return score + + def simple_executor_get_score(self, current_board): + ''' + can only be used for the empty group only have one single stone + return score from BLACK perspective. 
+ ''' + score = 0 + for idx, color in enumerate(current_board): + if color == utils.EMPTY: + neighbors = self._neighbor(self._deflatten(idx)) + color = current_board[self._flatten(neighbors[0])] + if color == utils.BLACK: + score += 1 + elif color == utils.WHITE: + score -= 1 + score -= self.komi + return score + + if __name__ == "__main__": + go = Go(size=9, komi=3.75, role = utils.BLACK) + endgame = [ + 1, 0, 1, 0, 1, 1, -1, 0, -1, + 1, 1, 1, 1, 1, 1, -1, -1, -1, + 0, 1, 1, 1, 1, -1, 0, -1, 0, + 1, 1, 1, 1, 1, -1, -1, -1, -1, + 1, -1, 1, -1, 1, 1, -1, -1, -1, + -1, -1, -1, -1, -1, 1, -1, 0, -1, + 1, 1, 1, -1, -1, -1, -1, -1, -1, + 1, 0, 1, 1, 1, 1, 1, -1, 0, + 1, 1, 0, 1, -1, -1, -1, -1, -1 + ] + time0 = time.time() + score = go.executor_get_score(endgame) + time1 = time.time() + print(score, time1 - time0) + score = go.new_executor_get_score(endgame) + time2 = time.time() + print(score, time2 - time1) + ''' ### do unit test for Go class pure_test = [ 0, 1, 0, 1, 0, 1, 0, 0, 0, @@ -349,3 +385,4 @@ if __name__ == "__main__": for i in range(7): print (go._is_eye(opponent_test, utils.BLACK, ot_qry[i])) print("Test of eye surrend by opponents\n") + ''' From 74504ceb1dbbb6b28ea9ce2abae7dcd6ae7f761d Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Sun, 24 Dec 2017 14:40:50 +0800 Subject: [PATCH 64/98] debug for go and reversi --- AlphaGo/engine.py | 7 +- AlphaGo/game.py | 29 ++++--- AlphaGo/go.py | 8 +- AlphaGo/model.py | 8 +- AlphaGo/play.py | 10 +-- AlphaGo/reversi.py | 150 ++++++++++++++++++------------------- tianshou/core/mcts/mcts.py | 8 +- 7 files changed, 111 insertions(+), 109 deletions(-) diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index 98e5e61..5624a2f 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -6,6 +6,8 @@ # from game import Game +import copy +import numpy as np import utils @@ -186,7 +188,10 @@ class GTPEngine(): return self._game.game_engine.executor_get_score(self._game.board), True def cmd_show_board(self, 
args, **kwargs): - return self._game.board, True + board = copy.deepcopy(self._game.board) + if isinstance(board, np.ndarray): + board = board.flatten().tolist() + return board, True def cmd_get_prob(self, args, **kwargs): return self._game.prob, True diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 442cb73..3a7959c 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -26,33 +26,37 @@ class Game: TODO : Maybe merge with the engine class in future, currently leave it untouched for interacting with Go UI. ''' - def __init__(self, name="go", role="unknown", debug=False, checkpoint_path=None): + def __init__(self, name="reversi", role="unknown", debug=False, checkpoint_path=None): self.name = name self.role = role self.debug = debug if self.name == "go": self.size = 9 self.komi = 3.75 - self.board = [utils.EMPTY] * (self.size ** 2) self.history = [] self.history_length = 8 - self.latest_boards = deque(maxlen=8) - for _ in range(8): - self.latest_boards.append(self.board) self.game_engine = go.Go(size=self.size, komi=self.komi, role=self.role) + self.board = [utils.EMPTY] * (self.size ** 2) elif self.name == "reversi": self.size = 8 self.history_length = 1 - self.game_engine = reversi.Reversi() + self.history = [] + self.game_engine = reversi.Reversi(size=self.size) self.board = self.game_engine.get_board() else: raise ValueError(name + " is an unknown game...") self.evaluator = model.ResNet(self.size, self.size ** 2 + 1, history_length=self.history_length) + self.latest_boards = deque(maxlen=self.history_length) + for _ in range(self.history_length): + self.latest_boards.append(self.board) def clear(self): - self.board = [utils.EMPTY] * (self.size ** 2) - self.history = [] + if self.name == "go": + self.board = [utils.EMPTY] * (self.size ** 2) + self.history = [] + if self.name == "reversi": + self.board = self.game_engine.get_board() for _ in range(self.history_length): self.latest_boards.append(self.board) @@ -84,7 +88,7 @@ class Game: if self.name == "go": res 
= self.game_engine.executor_do_move(self.history, self.latest_boards, self.board, color, vertex) elif self.name == "reversi": - res = self.game_engine.executor_do_move(self.board, color, vertex) + res = self.game_engine.executor_do_move(self.history, self.latest_boards, self.board, color, vertex) return res def think_play_move(self, color): @@ -110,13 +114,14 @@ class Game: if row[i] < 10: print(' ', end='') for j in range(self.size): - print(self.status2symbol(self.board[self._flatten((j + 1, i + 1))]), end=' ') + print(self.status2symbol(self.board[self.game_engine._flatten((j + 1, i + 1))]), end=' ') print('') sys.stdout.flush() if __name__ == "__main__": - g = Game() - g.show_board() + g = Game("go") + print(g.board) + g.clear() g.think_play_move(1) #file = open("debug.txt", "a") #file.write("mcts check\n") diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 833b01f..aca6632 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -212,12 +212,12 @@ class Go: def simulate_step_forward(self, state, action): # initialize the simulate_board from state - history_boards, color = state + history_boards, color = copy.deepcopy(state) if history_boards[-1] == history_boards[-2] and action is utils.PASS: return None, 2 * (float(self.executor_get_score(history_boards[-1]) > 0)-0.5) * color else: vertex = self._action2vertex(action) - new_board = self._do_move(copy.copy(history_boards[-1]), color, vertex) + new_board = self._do_move(copy.deepcopy(history_boards[-1]), color, vertex) history_boards.append(new_board) new_color = -color return [history_boards, new_color], 0 @@ -227,8 +227,8 @@ class Go: return False current_board[self._flatten(vertex)] = color self._process_board(current_board, color, vertex) - history.append(copy.copy(current_board)) - latest_boards.append(copy.copy(current_board)) + history.append(copy.deepcopy(current_board)) + latest_boards.append(copy.deepcopy(current_board)) return True def _find_empty(self, current_board): diff --git a/AlphaGo/model.py 
b/AlphaGo/model.py index 2a620f9..0549f41 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -173,10 +173,10 @@ class ResNet(object): """ state = np.zeros([1, self.board_size, self.board_size, 2 * self.history_length + 1]) for i in range(self.history_length): - state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(self.board_size ** 2)).reshape(self.board_size, + state[0, :, :, i] = np.array(np.array(history[i]).flatten() == np.ones(self.board_size ** 2)).reshape(self.board_size, self.board_size) state[0, :, :, i + self.history_length] = np.array( - np.array(history[i]) == -np.ones(self.board_size ** 2)).reshape(self.board_size, self.board_size) + np.array(history[i]).flatten() == -np.ones(self.board_size ** 2)).reshape(self.board_size, self.board_size) # TODO: need a config to specify the BLACK and WHITE if color == +1: state[0, :, :, 2 * self.history_length] = np.ones([self.board_size, self.board_size]) @@ -301,7 +301,7 @@ class ResNet(object): :return: """ - new_board = copy.copy(board) + new_board = copy.deepcopy(board) if new_board.ndim == 3: new_board = np.expand_dims(new_board, axis=0) @@ -331,7 +331,7 @@ class ResNet(object): :param orientation: an integer, which orientation to reflect :return: """ - new_board = copy.copy(board) + new_board = copy.deepcopy(board) for _ in range(times): if orientation == 0: new_board = new_board[:, ::-1] diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 9144a40..2731948 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -89,7 +89,7 @@ if __name__ == '__main__': pattern = "[A-Z]{1}[0-9]{1}" space = re.compile("\s+") - size = 9 + size = {"go":9, "reversi":8} show = ['.', 'X', 'O'] evaluate_rounds = 1 @@ -102,13 +102,13 @@ if __name__ == '__main__': pass_flag = [False, False] print("Start game {}".format(game_num)) # end the game if both palyer chose to pass, or play too much turns - while not (pass_flag[0] and pass_flag[1]) and num < size ** 2 * 2: + while not (pass_flag[0] and pass_flag[1]) and num < 
size["reversi"] ** 2 * 2: turn = num % 2 board = player[turn].run_cmd(str(num) + ' show_board') board = eval(board[board.index('['):board.index(']') + 1]) - for i in range(size): - for j in range(size): - print show[board[i * size + j]] + " ", + for i in range(size["reversi"]): + for j in range(size["reversi"]): + print show[board[i * size["reversi"] + j]] + " ", print "\n", data.boards.append(board) start_time = time.time() diff --git a/AlphaGo/reversi.py b/AlphaGo/reversi.py index 4fa1468..c6c8a5b 100644 --- a/AlphaGo/reversi.py +++ b/AlphaGo/reversi.py @@ -1,4 +1,5 @@ import numpy as np +import copy ''' Settings of the Reversi game. @@ -8,13 +9,8 @@ Settings of the Reversi game. class Reversi: - def __init__(self, black=None, white=None): - self.board = None # 8 * 8 board with 1 for black, -1 for white and 0 for blank - self.color = None # 1 for black and -1 for white - self.action = None # number in 0~63 - self.winner = None - self.black_win = None - self.size = 8 + def __init__(self, **kwargs): + self.size = kwargs['size'] def _deflatten(self, idx): x = idx // self.size + 1 @@ -24,39 +20,39 @@ class Reversi: def _flatten(self, vertex): x, y = vertex if (x == 0) and (y == 0): - return 64 + return self.size ** 2 return (x - 1) * self.size + (y - 1) - def get_board(self, board=None): - self.board = board or np.zeros([8,8]) - self.board[3, 3] = -1 - self.board[4, 4] = -1 - self.board[3, 4] = 1 - self.board[4, 3] = 1 - return self.board + def get_board(self): + board = np.zeros([self.size, self.size], dtype=np.int32) + board[self.size / 2 - 1, self.size / 2 - 1] = -1 + board[self.size / 2, self.size / 2] = -1 + board[self.size / 2 - 1, self.size / 2] = 1 + board[self.size / 2, self.size / 2 - 1] = 1 + return board - def _find_correct_moves(self, is_next=False): + def _find_correct_moves(self, board, color, is_next=False): moves = [] if is_next: - color = 0 - self.color + new_color = 0 - color else: - color = self.color - for i in range(64): + new_color = color + 
for i in range(self.size ** 2): x, y = self._deflatten(i) - valid = self._is_valid(x - 1, y - 1, color) + valid = self._is_valid(board, x - 1, y - 1, new_color) if valid: moves.append(i) return moves - def _one_direction_valid(self, x, y, color): + def _one_direction_valid(self, board, x, y, color): if (x >= 0) and (x < self.size): if (y >= 0) and (y < self.size): - if self.board[x, y] == color: + if board[x, y] == color: return True return False - def _is_valid(self, x, y, color): - if self.board[x, y]: + def _is_valid(self, board, x, y, color): + if board[x, y]: return False for x_direction in [-1, 0, 1]: for y_direction in [-1, 0, 1]: @@ -66,20 +62,18 @@ class Reversi: while True: new_x += x_direction new_y += y_direction - if self._one_direction_valid(new_x, new_y, 0 - color): + if self._one_direction_valid(board, new_x, new_y, 0 - color): flag = 1 else: break - if self._one_direction_valid(new_x, new_y, color) and flag: + if self._one_direction_valid(board, new_x, new_y, color) and flag: return True return False def simulate_get_mask(self, state, action_set): - history_boards, color = state - self.board = np.reshape(history_boards[-1], (self.size, self.size)) - self.color = color - valid_moves = self._find_correct_moves() - print(valid_moves) + history_boards, color = copy.deepcopy(state) + board = copy.deepcopy(history_boards[-1]) + valid_moves = self._find_correct_moves(board, color) if not len(valid_moves): invalid_action_mask = action_set[0:-1] else: @@ -90,34 +84,34 @@ class Reversi: return invalid_action_mask def simulate_step_forward(self, state, action): - self.board = state[0].copy() - self.board = np.reshape(self.board, (self.size, self.size)) - self.color = state[1] - self.action = action - if self.action == 64: - valid_moves = self._find_correct_moves(is_next=True) + history_boards, color = copy.deepcopy(state) + board = copy.deepcopy(history_boards[-1]) + if action == self.size ** 2: + valid_moves = self._find_correct_moves(board, color, 
is_next=True) if not len(valid_moves): - self._game_over() - return None, self.winner * self.color + winner = self._get_winner(board) + return None, winner * color else: - return [self.board, 0 - self.color], 0 - self._step() - return [self.board, 0 - self.color], 0 + return [history_boards, 0 - color], 0 + new_board = self._step(board, color, action) + history_boards.append(new_board) + return [history_boards, 0 - color], 0 - def _game_over(self): - black_num, white_num = self._number_of_black_and_white() - self.black_win = black_num - white_num - if self.black_win > 0: - self.winner = 1 - elif self.black_win < 0: - self.winner = -1 + def _get_winner(self, board): + black_num, white_num = self._number_of_black_and_white(board) + black_win = black_num - white_num + if black_win > 0: + winner = 1 + elif black_win < 0: + winner = -1 else: - self.winner = 0 + winner = 0 + return winner - def _number_of_black_and_white(self): + def _number_of_black_and_white(self, board): black_num = 0 white_num = 0 - board_list = np.reshape(self.board, self.size ** 2) + board_list = np.reshape(board, self.size ** 2) for i in range(len(board_list)): if board_list[i] == 1: black_num += 1 @@ -125,19 +119,18 @@ class Reversi: white_num += 1 return black_num, white_num - def _step(self): - if self.action < 0 or self.action > 63: + def _step(self, board, color, action): + if action < 0 or action > self.size ** 2 - 1: raise ValueError("Action not in the range of [0,63]!") - if self.action is None: + if action is None: raise ValueError("Action is None!") - x, y = self._deflatten(self.action) - valid = self._flip(x -1, y - 1) - if not valid: - raise ValueError("Illegal action!") + x, y = self._deflatten(action) + new_board = self._flip(board, x - 1, y - 1, color) + return new_board - def _flip(self, x, y): + def _flip(self, board, x, y, color): valid = 0 - self.board[x, y] = self.color + board[x, y] = color for x_direction in [-1, 0, 1]: for y_direction in [-1, 0, 1]: new_x = x @@ -146,47 
+139,46 @@ class Reversi: while True: new_x += x_direction new_y += y_direction - if self._one_direction_valid(new_x, new_y, 0 - self.color): + if self._one_direction_valid(board, new_x, new_y, 0 - color): flag = 1 else: break - if self._one_direction_valid(new_x, new_y, self.color) and flag: + if self._one_direction_valid(board, new_x, new_y, color) and flag: valid = 1 flip_x = x flip_y = y while True: flip_x += x_direction flip_y += y_direction - if self._one_direction_valid(flip_x, flip_y, 0 - self.color): - self.board[flip_x, flip_y] = self.color + if self._one_direction_valid(board, flip_x, flip_y, 0 - color): + board[flip_x, flip_y] = color else: break if valid: - return True + return board else: - return False + raise ValueError("Invalid action") def executor_do_move(self, history, latest_boards, board, color, vertex): - self.board = np.reshape(board, (self.size, self.size)) - self.color = color - self.action = self._flatten(vertex) - if self.action == 64: - valid_moves = self._find_correct_moves(is_next=True) + board = np.reshape(board, (self.size, self.size)) + color = color + action = self._flatten(vertex) + if action == self.size ** 2: + valid_moves = self._find_correct_moves(board, color, is_next=True) if not len(valid_moves): return False else: return True else: - self._step() + new_board = self._step(board, color, action) + history.append(new_board) + latest_boards.append(new_board) return True def executor_get_score(self, board): - self.board = board - self._game_over() - if self.black_win is not None: - return self.black_win - else: - raise ValueError("Game not finished!") + board = board + winner = self._get_winner(board) + return winner if __name__ == "__main__": diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index e565337..493cf7d 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -110,15 +110,15 @@ class ActionNode(object): self.reward = 0 def type_conversion_to_tuple(self): - if 
type(self.next_state) is np.ndarray: + if isinstance(self.next_state, np.ndarray): self.next_state = self.next_state.tolist() - if type(self.next_state) is list: + if isinstance(self.next_state, list): self.next_state = list2tuple(self.next_state) def type_conversion_to_origin(self): - if self.state_type is np.ndarray: + if isinstance(self.state_type, np.ndarray): self.next_state = np.array(self.next_state) - if self.state_type is list: + if isinstance(self.state_type, np.ndarray): self.next_state = tuple2list(self.next_state) def selection(self, simulator): From 2d9aa32758968829c0351e84887e9277d8c1697d Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Sun, 24 Dec 2017 14:41:40 +0800 Subject: [PATCH 65/98] change all copy to deepcopy --- AlphaGo/go.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 15fc5c6..55f5a4a 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -99,7 +99,7 @@ class Go: def _check_global_isomorphous(self, history_boards, current_board, color, vertex): repeat = False - next_board = copy.copy(current_board) + next_board = copy.deepcopy(current_board) next_board[self._flatten(vertex)] = color self._process_board(next_board, color, vertex) if next_board in history_boards: From cf57144ce994dc57588c1473fc05e85bbac92587 Mon Sep 17 00:00:00 2001 From: mcgrady00h <281130306@qq.com> Date: Sun, 24 Dec 2017 15:47:11 +0800 Subject: [PATCH 66/98] merge master --- AlphaGo/network.py | 225 --------------------------------------------- 1 file changed, 225 deletions(-) delete mode 100644 AlphaGo/network.py diff --git a/AlphaGo/network.py b/AlphaGo/network.py deleted file mode 100644 index cfff6f3..0000000 --- a/AlphaGo/network.py +++ /dev/null @@ -1,225 +0,0 @@ -import os -import time -import sys - -import numpy as np -import time -import tensorflow as tf -import tensorflow.contrib.layers as layers - -import multi_gpu -import time -import copy - -# os.environ["CUDA_VISIBLE_DEVICES"] = "1" 
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' - - -def residual_block(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.conv2d(h, 256, kernel_size=3, stride=1, activation_fn=tf.identity, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = h + input - return tf.nn.relu(h) - - -def policy_heads(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.flatten(h) - h = layers.fully_connected(h, 82, activation_fn=tf.identity, weights_regularizer=layers.l2_regularizer(1e-4)) - return h - - -def value_heads(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.flatten(h) - h = layers.fully_connected(h, 256, activation_fn=tf.nn.relu, weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.fully_connected(h, 1, activation_fn=tf.nn.tanh, weights_regularizer=layers.l2_regularizer(1e-4)) - return h - - -class Network(object): - def __init__(self): - self.x = tf.placeholder(tf.float32, shape=[None, 9, 9, 17]) - self.is_training = tf.placeholder(tf.bool, shape=[]) - self.z = tf.placeholder(tf.float32, shape=[None, 1]) - self.pi = tf.placeholder(tf.float32, 
shape=[None, 82]) - self.build_network() - - def build_network(self): - h = layers.conv2d(self.x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, - normalizer_params={'is_training': self.is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS}, - weights_regularizer=layers.l2_regularizer(1e-4)) - for i in range(4): - h = residual_block(h, self.is_training) - self.v = value_heads(h, self.is_training) - self.p = policy_heads(h, self.is_training) - # loss = tf.reduce_mean(tf.square(z-v)) - tf.multiply(pi, tf.log(tf.clip_by_value(tf.nn.softmax(p), 1e-8, tf.reduce_max(tf.nn.softmax(p))))) - self.value_loss = tf.reduce_mean(tf.square(self.z - self.v)) - self.policy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.pi, logits=self.p)) - - self.reg = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) - self.total_loss = self.value_loss + self.policy_loss + self.reg - # train_op = tf.train.MomentumOptimizer(1e-4, momentum=0.9, use_nesterov=True).minimize(total_loss) - self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) - with tf.control_dependencies(self.update_ops): - self.train_op = tf.train.RMSPropOptimizer(1e-4).minimize(self.total_loss) - self.var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) - self.saver = tf.train.Saver(max_to_keep=10, var_list=self.var_list) - self.sess = multi_gpu.create_session() - - def train(self): - data_path = "./training_data/" - data_name = os.listdir(data_path) - epochs = 100 - batch_size = 128 - - result_path = "./checkpoints_origin/" - with multi_gpu.create_session() as sess: - sess.run(tf.global_variables_initializer()) - ckpt_file = tf.train.latest_checkpoint(result_path) - if ckpt_file is not None: - print('Restoring model from {}...'.format(ckpt_file)) - self.saver.restore(sess, ckpt_file) - for epoch in range(epochs): - for name in data_name: - data = np.load(data_path + name) - boards = data["boards"] - wins = data["wins"] - ps = 
data["ps"] - print (boards.shape) - print (wins.shape) - print (ps.shape) - batch_num = boards.shape[0] // batch_size - index = np.arange(boards.shape[0]) - np.random.shuffle(index) - value_losses = [] - policy_losses = [] - regs = [] - time_train = -time.time() - for iter in range(batch_num): - lv, lp, r, value, prob, _ = sess.run( - [self.value_loss, self.policy_loss, self.reg, self.v, tf.nn.softmax(self.p), self.train_op], - feed_dict={self.x: boards[ - index[iter * batch_size:(iter + 1) * batch_size]], - self.z: wins[index[ - iter * batch_size:(iter + 1) * batch_size]], - self.pi: ps[index[ - iter * batch_size:(iter + 1) * batch_size]], - self.is_training: True}) - value_losses.append(lv) - policy_losses.append(lp) - regs.append(r) - if iter % 1 == 0: - print( - "Epoch: {}, Part {}, Iteration: {}, Time: {}, Value Loss: {}, Policy Loss: {}, Reg: {}".format( - epoch, name, iter, time.time() + time_train, np.mean(np.array(value_losses)), - np.mean(np.array(policy_losses)), np.mean(np.array(regs)))) - time_train = -time.time() - value_losses = [] - policy_losses = [] - regs = [] - if iter % 20 == 0: - save_path = "Epoch{}.Part{}.Iteration{}.ckpt".format(epoch, name, iter) - self.saver.save(sess, result_path + save_path) - del data, boards, wins, ps - - - # def forward(call_number): - # # checkpoint_path = "/home/yama/rl/tianshou/AlphaGo/checkpoints" - # checkpoint_path = "/home/jialian/stuGo/tianshou/stuGo/checkpoints/" - # board_file = np.genfromtxt("/home/jialian/stuGo/tianshou/leela-zero/src/mcts_nn_files/board_" + call_number, - # dtype='str'); - # human_board = np.zeros((17, 19, 19)) - # - # # TODO : is it ok to ignore the last channel? 
- # for i in range(17): - # human_board[i] = np.array(list(board_file[i])).reshape(19, 19) - # # print("============================") - # # print("human board sum : " + str(np.sum(human_board[-1]))) - # # print("============================") - # # print(human_board) - # # print("============================") - # # rint(human_board) - # feed_board = human_board.transpose(1, 2, 0).reshape(1, 19, 19, 17) - # # print(feed_board[:,:,:,-1]) - # # print(feed_board.shape) - # - # # npz_board = np.load("/home/yama/rl/tianshou/AlphaGo/data/7f83928932f64a79bc1efdea268698ae.npz") - # # print(npz_board["boards"].shape) - # # feed_board = npz_board["boards"][10].reshape(-1, 19, 19, 17) - # ##print(feed_board) - # # show_board = feed_board[0].transpose(2, 0, 1) - # # print("board shape : ", show_board.shape) - # # print(show_board) - # - # itflag = False - # with multi_gpu.create_session() as sess: - # sess.run(tf.global_variables_initializer()) - # ckpt_file = tf.train.latest_checkpoint(checkpoint_path) - # if ckpt_file is not None: - # # print('Restoring model from {}...'.format(ckpt_file)) - # saver.restore(sess, ckpt_file) - # else: - # raise ValueError("No model loaded") - # res = sess.run([tf.nn.softmax(p), v], feed_dict={x: feed_board, is_training: itflag}) - # # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][300].reshape(-1, 19, 19, 17), is_training:False}) - # # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][50].reshape(-1, 19, 19, 17), is_training:True}) - # # print(np.argmax(res[0])) - # np.savetxt(sys.stdout, res[0][0], fmt="%.6f", newline=" ") - # np.savetxt(sys.stdout, res[1][0], fmt="%.6f", newline=" ") - # pv_file = "/home/jialian/stuGotianshou/leela-zero/src/mcts_nn_files/policy_value" - # np.savetxt(pv_file, np.concatenate((res[0][0], res[1][0])), fmt="%.6f", newline=" ") - # # np.savetxt(pv_file, res[1][0], fmt="%.6f", newline=" ") - # return res - - def forward(self, checkpoint_path): - # checkpoint_path = 
"/home/tongzheng/tianshou/AlphaGo/checkpoints/" - # sess = multi_gpu.create_session() - # sess.run(tf.global_variables_initializer()) - if checkpoint_path is None: - self.sess.run(tf.global_variables_initializer()) - else: - ckpt_file = tf.train.latest_checkpoint(checkpoint_path) - if ckpt_file is not None: - # print('Restoring model from {}...'.format(ckpt_file)) - self.saver.restore(self.sess, ckpt_file) - # print('Successfully loaded') - else: - raise ValueError("No model loaded") - # prior, value = sess.run([tf.nn.softmax(p), v], feed_dict={x: state, is_training: False}) - # return prior, value - return self.sess - - -if __name__ == '__main__': - # state = np.random.randint(0, 1, [256, 9, 9, 17]) - # net = Network() - # net.train() - # sess = net.forward() - # start_time = time.time() - # for i in range(100): - # sess.run([tf.nn.softmax(net.p), net.v], feed_dict={net.x: state, net.is_training: False}) - # print("Step {}, use time {}".format(i, time.time() - start_time)) - # start_time = time.time() - net0 = Network() - sess0 = net0.forward("./checkpoints/") - print("Loaded") - while True: - pass - From 5aa5dcd191a266aca637574ff8aaab46ee1c58ae Mon Sep 17 00:00:00 2001 From: mcgrady00h <281130306@qq.com> Date: Sun, 24 Dec 2017 16:47:43 +0800 Subject: [PATCH 67/98] add comments for mcts with virtual loss --- tianshou/core/mcts/mcts_virtual_loss.py | 47 +++++++++++++++++++++---- 1 file changed, 41 insertions(+), 6 deletions(-) diff --git a/tianshou/core/mcts/mcts_virtual_loss.py b/tianshou/core/mcts/mcts_virtual_loss.py index 9335464..f27d8a3 100644 --- a/tianshou/core/mcts/mcts_virtual_loss.py +++ b/tianshou/core/mcts/mcts_virtual_loss.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # vim:fenc=utf-8 # $File: mcts_virtual_loss.py -# $Date: Sat Dec 23 02:4850 2017 +0800 +# $Date: Sun Dec 24 16:4740 2017 +0800 # Original file: mcts.py # $Author: renyong15 Ā© # @@ -22,7 +22,17 @@ from .utils import list2tuple, tuple2list class MCTSNodeVirtualLoss(object): - def 
__init__(self, parent, action, state, action_num, prior, inverse=False): + """ + MCTS abstract class with virtual loss. Currently we only support UCT node. + Role of the Parameters can be found in Readme.md. + """ + def __init__(self, + parent, + action, + state, + action_num, + prior, + inverse = False): self.parent = parent self.action = action self.children = {} @@ -41,7 +51,19 @@ class MCTSNodeVirtualLoss(object): pass class UCTNodeVirtualLoss(MCTSNodeVirtualLoss): - def __init__(self, parent, action, state, action_num, prior, inverse=False, c_puct = 5): + """ + UCT node (state node) with virtual loss. + Role of the Parameters can be found in Readme.md. + :param c_puct balance between exploration and exploition, + """ + def __init__(self, + parent, + action, + state, + action_num, + prior, + inverse=False, + c_puct = 5): super(UCTNodeVirtualLoss, self).__init__(parent, action, state, action_num, prior, inverse) self.Q = np.zeros([action_num]) self.W = np.zeros([action_num]) @@ -53,7 +75,8 @@ class UCTNodeVirtualLoss(MCTSNodeVirtualLoss): self.mask = None - def selection(self, simulator): + def selection(self, + simulator): self.valid_mask(simulator) self.Q = np.zeros([self.action_num]) N_not_zero = (self.N + self.virtual_loss) > 0 @@ -108,6 +131,9 @@ class UCTNodeVirtualLoss(MCTSNodeVirtualLoss): class ActionNodeVirtualLoss(object): + """ + Action node with virtual loss. 
+ """ def __init__(self, parent, action): self.parent = parent self.action = action @@ -156,6 +182,9 @@ class ActionNodeVirtualLoss(object): class MCTSVirtualLoss(object): + """ + MCTS class with virtual loss + """ def __init__(self, simulator, evaluator, root, action_num, batch_size = 1, method = "UCT", inverse = False): self.simulator = simulator self.evaluator = evaluator @@ -196,13 +225,19 @@ class MCTSVirtualLoss(object): self.bp_time = [] while (max_step is not None and self.step < self.max_step or max_step is None) \ and (max_time is not None and time.time() - self.start_time < self.max_time or max_time is None): - self.expand() + self._expand() if max_step is not None: self.step += 1 - def expand(self): + def _expand(self): """ Core logic method for MCTS tree to expand nodes. + Steps to expand node: + 1. Select final action node with virtual loss and collect them in to a minibatch. + (i.e. root->action->state->action...->action) + 2. Remove the virtual loss + 3. Evaluate the whole minibatch using evaluator + 4. Expand new nodes and perform back propogation. 
""" ## minibatch with virtual loss nodes = [] From f0074aa7ca0db4736309e708f7332284dc5e9d64 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Sun, 24 Dec 2017 17:43:45 +0800 Subject: [PATCH 68/98] fix bug of game config and add profing functions to mcts --- AlphaGo/engine.py | 3 +- AlphaGo/game.py | 7 +-- AlphaGo/play.py | 13 ++-- AlphaGo/player.py | 3 +- AlphaGo/random_data.py | 123 ------------------------------------- AlphaGo/self-play.py | 103 ------------------------------- tianshou/core/mcts/mcts.py | 68 +++++++++++++------- 7 files changed, 58 insertions(+), 262 deletions(-) delete mode 100644 AlphaGo/random_data.py delete mode 100644 AlphaGo/self-play.py diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index 5624a2f..b662dbd 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -198,5 +198,4 @@ class GTPEngine(): if __name__ == "main": - game = Game() - engine = GTPEngine(game_obj=game) + print ("test engine.py") diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 3a7959c..8ffde93 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -26,7 +26,7 @@ class Game: TODO : Maybe merge with the engine class in future, currently leave it untouched for interacting with Go UI. 
''' - def __init__(self, name="reversi", role="unknown", debug=False, checkpoint_path=None): + def __init__(self, name=None, role=None, debug=False, checkpoint_path=None): self.name = name self.role = role self.debug = debug @@ -119,10 +119,7 @@ class Game: sys.stdout.flush() if __name__ == "__main__": - g = Game("go") - print(g.board) - g.clear() - g.think_play_move(1) + print("test game.py") #file = open("debug.txt", "a") #file.write("mcts check\n") #file.close() diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 2731948..5777982 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -60,13 +60,14 @@ if __name__ == '__main__': black_role_name = 'black' + str(args.id) white_role_name = 'white' + str(args.id) + game_name = 'go' agent_v0 = subprocess.Popen( - ['python', '-u', 'player.py', '--role=' + black_role_name, + ['python', '-u', 'player.py', '--game=' + game_name, '--role=' + black_role_name, '--checkpoint_path=' + str(args.black_weight_path), '--debug=' + str(args.debug)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) agent_v1 = subprocess.Popen( - ['python', '-u', 'player.py', '--role=' + white_role_name, + ['python', '-u', 'player.py', '--game=' + game_name, '--role=' + white_role_name, '--checkpoint_path=' + str(args.black_weight_path), '--debug=' + str(args.debug)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) @@ -102,13 +103,13 @@ if __name__ == '__main__': pass_flag = [False, False] print("Start game {}".format(game_num)) # end the game if both palyer chose to pass, or play too much turns - while not (pass_flag[0] and pass_flag[1]) and num < size["reversi"] ** 2 * 2: + while not (pass_flag[0] and pass_flag[1]) and num < size[game_name] ** 2 * 2: turn = num % 2 board = player[turn].run_cmd(str(num) + ' show_board') board = eval(board[board.index('['):board.index(']') + 1]) - for i in range(size["reversi"]): - for j in range(size["reversi"]): - print show[board[i * size["reversi"] + j]] + " ", + for 
i in range(size[game_name]): + for j in range(size[game_name]): + print show[board[i * size[game_name] + j]] + " ", print "\n", data.boards.append(board) start_time = time.time() diff --git a/AlphaGo/player.py b/AlphaGo/player.py index 66a487f..a8f61c1 100644 --- a/AlphaGo/player.py +++ b/AlphaGo/player.py @@ -26,6 +26,7 @@ if __name__ == '__main__': parser.add_argument("--checkpoint_path", type=str, default=None) parser.add_argument("--role", type=str, default="unknown") parser.add_argument("--debug", type=str, default=False) + parser.add_argument("--game", type=str, default=False) args = parser.parse_args() if args.checkpoint_path == 'None': @@ -33,7 +34,7 @@ if __name__ == '__main__': debug = False if args.debug == "True": debug = True - game = Game(role=args.role, checkpoint_path=args.checkpoint_path, debug=debug) + game = Game(name=args.game, role=args.role, checkpoint_path=args.checkpoint_path, debug=debug) engine = GTPEngine(game_obj=game, name='tianshou', version=0) daemon = Pyro4.Daemon() # make a Pyro daemon diff --git a/AlphaGo/random_data.py b/AlphaGo/random_data.py deleted file mode 100644 index 5b53bd6..0000000 --- a/AlphaGo/random_data.py +++ /dev/null @@ -1,123 +0,0 @@ -import os -import numpy as np -import time - -size = 9 -path = "/raid/tongzheng/tianshou/AlphaGo/data/part1/" -save_path = "/raid/tongzheng/tianshou/AlphaGo/data/" -name = os.listdir(path) -print(len(name)) -batch_size = 128 -batch_num = 512 - -block_size = batch_size * batch_num -slots_num = 16 - - -class block(object): - def __init__(self, block_size, block_id): - self.boards = [] - self.wins = [] - self.ps = [] - self.block_size = block_size - self.block_id = block_id - - def concat(self, board, p, win): - board = board.reshape(-1, size, size, 17) - win = win.reshape(-1, 1) - p = p.reshape(-1, size ** 2 + 1) - self.boards.append(board) - self.wins.append(win) - self.ps.append(p) - - def isfull(self): - assert len(self.boards) == len(self.wins) - assert len(self.boards) == 
len(self.ps) - return len(self.boards) == self.block_size - - def save_and_reset(self, block_id): - self.boards = np.concatenate(self.boards, axis=0) - self.wins = np.concatenate(self.wins, axis=0) - self.ps = np.concatenate(self.ps, axis=0) - print ("Block {}, Boards shape {}, Wins Shape {}, Ps Shape {}".format(self.block_id, self.boards.shape[0], - self.wins.shape[0], self.ps.shape[0])) - np.savez(save_path + "block" + str(self.block_id), boards=self.boards, wins=self.wins, ps=self.ps) - self.boards = [] - self.wins = [] - self.ps = [] - self.block_id = block_id - - def store_num(self): - assert len(self.boards) == len(self.wins) - assert len(self.boards) == len(self.ps) - return len(self.boards) - - -def concat(block_list, board, win, p): - global index - seed = np.random.randint(slots_num) - block_list[seed].concat(board, win, p) - if block_list[seed].isfull(): - block_list[seed].save_and_reset(index) - index = index + 1 - - -block_list = [] -for index in range(slots_num): - block_list.append(block(block_size, index)) -index = index + 1 -for n in name: - data = np.load(path + n) - board = data["boards"] - win = data["win"] - p = data["p"] - print("Start {}".format(n)) - print("Shape {}".format(board.shape[0])) - start = -time.time() - for i in range(board.shape[0]): - board_ori = board[i].reshape(-1, size, size, 17) - win_ori = win[i].reshape(-1, 1) - p_ori = p[i].reshape(-1, size ** 2 + 1) - concat(block_list, board_ori, p_ori, win_ori) - - for t in range(1, 4): - board_aug = np.rot90(board_ori, t, (1, 2)) - p_aug = np.concatenate( - [np.rot90(p_ori[:, :-1].reshape(-1, size, size), t, (1, 2)).reshape(-1, size ** 2), p_ori[:, -1].reshape(-1, 1)], - axis=1) - concat(block_list, board_aug, p_aug, win_ori) - - board_aug = board_ori[:, ::-1] - p_aug = np.concatenate( - [p_ori[:, :-1].reshape(-1, size, size)[:, ::-1].reshape(-1, size ** 2), p_ori[:, -1].reshape(-1, 1)], - axis=1) - concat(block_list, board_aug, p_aug, win_ori) - - board_aug = board_ori[:, :, ::-1] - 
p_aug = np.concatenate( - [p_ori[:, :-1].reshape(-1, size, size)[:, :, ::-1].reshape(-1, size ** 2), p_ori[:, -1].reshape(-1, 1)], - axis=1) - concat(block_list, board_aug, p_aug, win_ori) - - board_aug = np.rot90(board_ori[:, ::-1], 1, (1, 2)) - p_aug = np.concatenate( - [np.rot90(p_ori[:, :-1].reshape(-1, size, size)[:, ::-1], 1, (1, 2)).reshape(-1, size ** 2), - p_ori[:, -1].reshape(-1, 1)], - axis=1) - concat(block_list, board_aug, p_aug, win_ori) - - board_aug = np.rot90(board_ori[:, :, ::-1], 1, (1, 2)) - p_aug = np.concatenate( - [np.rot90(p_ori[:, :-1].reshape(-1, size, size)[:, :, ::-1], 1, (1, 2)).reshape(-1, size ** 2), - p_ori[:, -1].reshape(-1, 1)], - axis=1) - concat(block_list, board_aug, p_aug, win_ori) - print ("Finished {} with time {}".format(n, time.time() + start)) - data_num = 0 - for i in range(slots_num): - print("Block {} ".format(block_list[i].block_id) + "Size {}".format(block_list[i].store_num())) - data_num = data_num + block_list[i].store_num() - print ("Total data {}".format(data_num)) - -for i in range(slots_num): - block_list[i].save_and_reset(block_list[i].block_id) diff --git a/AlphaGo/self-play.py b/AlphaGo/self-play.py deleted file mode 100644 index dd03b13..0000000 --- a/AlphaGo/self-play.py +++ /dev/null @@ -1,103 +0,0 @@ -from game import Game -from engine import GTPEngine -import re -import numpy as np -import os -from collections import deque -import utils -import argparse - -parser = argparse.ArgumentParser() -parser.add_argument('--result_path', type=str, default='./part1') -args = parser.parse_args() - -if not os.path.exists(args.result_path): - os.makedirs(args.result_path) - -game = Game() -engine = GTPEngine(game_obj=game) -history = deque(maxlen=8) -for i in range(8): - history.append(game.board) -state = [] -prob = [] -winner = [] -pattern = "[A-Z]{1}[0-9]{1}" -game.show_board() - - -def history2state(history, color): - state = np.zeros([1, game.size, game.size, 17]) - for i in range(8): - state[0, :, :, i] = 
np.array(np.array(history[i]) == np.ones(game.size ** 2)).reshape(game.size, game.size) - state[0, :, :, i + 8] = np.array(np.array(history[i]) == -np.ones(game.size ** 2)).reshape(game.size, game.size) - if color == utils.BLACK: - state[0, :, :, 16] = np.ones([game.size, game.size]) - if color == utils.WHITE: - state[0, :, :, 16] = np.zeros([game.size, game.size]) - return state - - -num = 0 -game_num = 0 -black_pass = False -white_pass = False -while True: - print("Start game {}".format(game_num)) - while not (black_pass and white_pass) and num < game.size ** 2 * 2: - if num % 2 == 0: - color = utils.BLACK - new_state = history2state(history, color) - state.append(new_state) - result = engine.run_cmd(str(num) + " genmove BLACK") - num += 1 - match = re.search(pattern, result) - if match is not None: - print(match.group()) - else: - print("pass") - if re.search("pass", result) is not None: - black_pass = True - else: - black_pass = False - else: - color = utils.WHITE - new_state = history2state(history, color) - state.append(new_state) - result = engine.run_cmd(str(num) + " genmove WHITE") - num += 1 - match = re.search(pattern, result) - if match is not None: - print(match.group()) - else: - print("pass") - if re.search("pass", result) is not None: - white_pass = True - else: - white_pass = False - game.show_board() - prob.append(np.array(game.prob).reshape(-1, game.size ** 2 + 1)) - print("Finished") - print("\n") - score = game.game_engine.executor_get_score(game.board) - if score > 0: - winner = utils.BLACK - else: - winner = utils.WHITE - state = np.concatenate(state, axis=0) - prob = np.concatenate(prob, axis=0) - winner = np.ones([num, 1]) * winner - assert state.shape[0] == prob.shape[0] - assert state.shape[0] == winner.shape[0] - np.savez(args.result_path + "/game" + str(game_num), state=state, prob=prob, winner=winner) - state = [] - prob = [] - winner = [] - num = 0 - black_pass = False - white_pass = False - engine.run_cmd(str(num) + " clear_board") - 
history.clear() - for _ in range(8): - history.append(game.board) - game_num += 1 diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 493cf7d..1994284 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -40,28 +40,27 @@ class MCTSNode(object): class UCTNode(MCTSNode): - def __init__(self, parent, action, state, action_num, prior, debug=False, inverse=False): + def __init__(self, parent, action, state, action_num, prior, mcts, inverse=False): super(UCTNode, self).__init__(parent, action, state, action_num, prior, inverse) self.Q = np.zeros([action_num]) self.W = np.zeros([action_num]) self.N = np.zeros([action_num]) self.ucb = self.Q + c_puct * self.prior * math.sqrt(np.sum(self.N)) / (self.N + 1) self.mask = None - self.debug=debug - self.elapse_time = 0 - - def clear_elapse_time(self): self.elapse_time = 0 + self.mcts = mcts def selection(self, simulator): head = time.time() self.valid_mask(simulator) - self.elapse_time += time.time() - head + self.mcts.valid_mask_time += time.time() - head action = np.argmax(self.ucb) if action in self.children.keys(): + self.mcts.state_selection_time += time.time() - head return self.children[action].selection(simulator) else: - self.children[action] = ActionNode(self, action) + self.children[action] = ActionNode(self, action, mcts=self.mcts) + self.mcts.state_selection_time += time.time() - head return self.children[action].selection(simulator) def backpropagation(self, action): @@ -100,7 +99,7 @@ class TSNode(MCTSNode): class ActionNode(object): - def __init__(self, parent, action): + def __init__(self, parent, action, mcts): self.parent = parent self.action = action self.children = {} @@ -108,12 +107,18 @@ class ActionNode(object): self.origin_state = None self.state_type = None self.reward = 0 + self.mcts = mcts def type_conversion_to_tuple(self): + t0 = time.time() if isinstance(self.next_state, np.ndarray): self.next_state = self.next_state.tolist() + t1 = time.time() if 
isinstance(self.next_state, list): self.next_state = list2tuple(self.next_state) + t2 = time.time() + self.mcts.ndarray2list_time += t1 - t0 + self.mcts.list2tuple_time += t2 - t1 def type_conversion_to_origin(self): if isinstance(self.state_type, np.ndarray): @@ -122,23 +127,28 @@ class ActionNode(object): self.next_state = tuple2list(self.next_state) def selection(self, simulator): + head = time.time() self.next_state, self.reward = simulator.simulate_step_forward(self.parent.state, self.action) + self.mcts.simulate_sf_time += time.time() - head self.origin_state = self.next_state self.state_type = type(self.next_state) self.type_conversion_to_tuple() if self.next_state is not None: if self.next_state in self.children.keys(): + self.mcts.action_selection_time += time.time() - head return self.children[self.next_state].selection(simulator) else: + self.mcts.action_selection_time += time.time() - head return self.parent, self.action else: + self.mcts.action_selection_time += time.time() - head return self.parent, self.action def expansion(self, evaluator, action_num): if self.next_state is not None: prior, value = evaluator(self.next_state) self.children[self.next_state] = UCTNode(self, self.action, self.origin_state, action_num, prior, - self.parent.inverse) + mcts=self.mcts, inverse=self.parent.inverse) return value else: return 0. 
@@ -160,11 +170,23 @@ class MCTS(object): if method == "": self.root = root if method == "UCT": - self.root = UCTNode(None, None, root, action_num, prior, self.debug, inverse=inverse) + self.root = UCTNode(None, None, root, action_num, prior, mcts=self, inverse=inverse) if method == "TS": self.root = TSNode(None, None, root, action_num, prior, inverse=inverse) self.inverse = inverse + # time spend on each step + self.selection_time = 0 + self.expansion_time = 0 + self.backpropagation_time = 0 + self.action_selection_time = 0 + self.state_selection_time = 0 + self.simulate_sf_time = 0 + self.valid_mask_time = 0 + self.ndarray2list_time = 0 + self.list2tuple_time = 0 + self.check = 0 + def search(self, max_step=None, max_time=None): step = 0 start_time = time.time() @@ -175,23 +197,25 @@ class MCTS(object): if max_step is None and max_time is None: raise ValueError("Need a stop criteria!") - selection_time = 0 - expansion_time = 0 - backprop_time = 0 - self.root.clear_elapse_time() while step < max_step and time.time() - start_time < max_step: sel_time, exp_time, back_time = self._expand() - selection_time += sel_time - expansion_time += exp_time - backprop_time += back_time + self.selection_time += sel_time + self.expansion_time += exp_time + self.backpropagation_time += back_time step += 1 if (self.debug): - file = open("debug.txt", "a") + file = open("mcts_profiling.txt", "a") file.write("[" + str(self.role) + "]" - + " selection : " + str(selection_time) + "\t" - + " validmask : " + str(self.root.elapse_time) + "\t" - + " expansion : " + str(expansion_time) + "\t" - + " backprop : " + str(backprop_time) + "\t" + + " sel " + '%.3f' % self.selection_time + " " + + " sel_sta " + '%.3f' % self.state_selection_time + " " + + " valid " + '%.3f' % self.valid_mask_time + " " + + " sel_act " + '%.3f' % self.action_selection_time + " " + + " array2list " + '%.4f' % self.ndarray2list_time + " " + + " check " + str(self.check) + " " + + " list2tuple " + '%.4f' % 
self.list2tuple_time + " \t" + + " forward " + '%.3f' % self.simulate_sf_time + " " + + " exp " + '%.3f' % self.expansion_time + " " + + " bak " + '%.3f' % self.backpropagation_time + " " + "\n") file.close() From 89226b449a8d0a05ffd852805913fcf05efdca07 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Sun, 24 Dec 2017 20:57:53 +0800 Subject: [PATCH 69/98] replace try catch by isinstance collections.Hashable --- AlphaGo/.gitignore | 1 + AlphaGo/game.py | 2 +- tianshou/core/mcts/mcts.py | 29 ++++++++--------------------- 3 files changed, 10 insertions(+), 22 deletions(-) diff --git a/AlphaGo/.gitignore b/AlphaGo/.gitignore index e578e5a..ff61326 100644 --- a/AlphaGo/.gitignore +++ b/AlphaGo/.gitignore @@ -2,3 +2,4 @@ data checkpoints checkpoints_origin *.log +*.txt diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 8ffde93..a962f5c 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -33,8 +33,8 @@ class Game: if self.name == "go": self.size = 9 self.komi = 3.75 - self.history = [] self.history_length = 8 + self.history = [] self.game_engine = go.Go(size=self.size, komi=self.komi, role=self.role) self.board = [utils.EMPTY] * (self.size ** 2) elif self.name == "reversi": diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 1994284..bd21e09 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -1,23 +1,16 @@ import numpy as np import math import time +import sys +import collections c_puct = 5 - -def list2tuple(list): - try: - return tuple(list2tuple(sub) for sub in list) - except TypeError: - return list - - -def tuple2list(tuple): - try: - return list(tuple2list(sub) for sub in tuple) - except TypeError: - return tuple - +def list2tuple(obj): + if isinstance(obj, collections.Hashable): + return obj + else: + return tuple(list2tuple(sub) for sub in obj) class MCTSNode(object): def __init__(self, parent, action, state, action_num, prior, inverse=False): @@ -38,7 +31,6 @@ class MCTSNode(object): def valid_mask(self, 
simulator): pass - class UCTNode(MCTSNode): def __init__(self, parent, action, state, action_num, prior, mcts, inverse=False): super(UCTNode, self).__init__(parent, action, state, action_num, prior, inverse) @@ -119,12 +111,7 @@ class ActionNode(object): t2 = time.time() self.mcts.ndarray2list_time += t1 - t0 self.mcts.list2tuple_time += t2 - t1 - - def type_conversion_to_origin(self): - if isinstance(self.state_type, np.ndarray): - self.next_state = np.array(self.next_state) - if isinstance(self.state_type, np.ndarray): - self.next_state = tuple2list(self.next_state) + self.mcts.check += sys.getsizeof(object) def selection(self, simulator): head = time.time() From 70824a3612632fa8a81c039774d1efd03cf17881 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Mon, 25 Dec 2017 15:09:26 +0800 Subject: [PATCH 70/98] remove historical file data.py --- AlphaGo/data.py | 84 ------------------------------------------------- 1 file changed, 84 deletions(-) delete mode 100644 AlphaGo/data.py diff --git a/AlphaGo/data.py b/AlphaGo/data.py deleted file mode 100644 index 464ebb9..0000000 --- a/AlphaGo/data.py +++ /dev/null @@ -1,84 +0,0 @@ -import os -import threading -import numpy as np - -size = 9 -path = "/home/yama/leela-zero/data/npz-files/" -name = os.listdir(path) -print(len(name)) -thread_num = 17 -batch_num = len(name) // thread_num - -def integrate(name, index): - boards = np.zeros([0, size, size, 17]) - wins = np.zeros([0, 1]) - ps = np.zeros([0, size**2 + 1]) - for n in name: - data = np.load(path + n) - board = data["state"] - win = data["winner"] - p = data["prob"] - # board = np.zeros([0, size, size, 17]) - # win = np.zeros([0, 1]) - # p = np.zeros([0, size**2 + 1]) - # for i in range(data["boards"].shape[3]): - # board = np.concatenate([board, data["boards"][:,:,:,i].reshape(-1, size, size, 17)], axis=0) - # win = np.concatenate([win, data["win"][:,i].reshape(-1, 1)], axis=0) - # p = np.concatenate([p, data["p"][:,i].reshape(-1, size**2 + 1)], 
axis=0) - boards = np.concatenate([boards, board], axis=0) - wins = np.concatenate([wins, win], axis=0) - ps = np.concatenate([ps, p], axis=0) - # print("Finish " + n) - print ("Integration {} Finished!".format(index)) - board_ori = boards - win_ori = wins - p_ori = ps - for i in range(1, 3): - board = np.rot90(board_ori, i, (1, 2)) - p = np.concatenate( - [np.rot90(p_ori[:, :-1].reshape(-1, size, size), i, (1, 2)).reshape(-1, size**2), p_ori[:, -1].reshape(-1, 1)], - axis=1) - boards = np.concatenate([boards, board], axis=0) - wins = np.concatenate([wins, win_ori], axis=0) - ps = np.concatenate([ps, p], axis=0) - - board = board_ori[:, ::-1] - p = np.concatenate([p_ori[:, :-1].reshape(-1, size, size)[:, ::-1].reshape(-1, size**2), p_ori[:, -1].reshape(-1, 1)], - axis=1) - boards = np.concatenate([boards, board], axis=0) - wins = np.concatenate([wins, win_ori], axis=0) - ps = np.concatenate([ps, p], axis=0) - - board = board_ori[:, :, ::-1] - p = np.concatenate([p_ori[:, :-1].reshape(-1, size, size)[:, :, ::-1].reshape(-1, size**2), p_ori[:, -1].reshape(-1, 1)], - axis=1) - boards = np.concatenate([boards, board], axis=0) - wins = np.concatenate([wins, win_ori], axis=0) - ps = np.concatenate([ps, p], axis=0) - - board = board_ori[:, ::-1] - p = np.concatenate( - [np.rot90(p_ori[:, :-1].reshape(-1, size, size)[:, ::-1], 1, (1, 2)).reshape(-1, size**2), p_ori[:, -1].reshape(-1, 1)], - axis=1) - boards = np.concatenate([boards, np.rot90(board, 1, (1, 2))], axis=0) - wins = np.concatenate([wins, win_ori], axis=0) - ps = np.concatenate([ps, p], axis=0) - - board = board_ori[:, :, ::-1] - p = np.concatenate( - [np.rot90(p_ori[:, :-1].reshape(-1, size, size)[:, :, ::-1], 1, (1, 2)).reshape(-1, size**2), - p_ori[:, -1].reshape(-1, 1)], - axis=1) - boards = np.concatenate([boards, np.rot90(board, 1, (1, 2))], axis=0) - wins = np.concatenate([wins, win_ori], axis=0) - ps = np.concatenate([ps, p], axis=0) - - np.savez("/home/tongzheng/data/data-" + str(index), state=boards, 
winner=wins, prob=ps) - print ("Thread {} has finished.".format(index)) -thread_list = list() -for i in range(thread_num): - thread_list.append(threading.Thread(target=integrate, args=(name[batch_num * i:batch_num * (i + 1)], i,))) -for thread in thread_list: - thread.start() -for thread in thread_list: - thread.join() From 0fdbaef1a19e6de4ae866fd59ea05428dfe12bfa Mon Sep 17 00:00:00 2001 From: mcgrady00h <281130306@qq.com> Date: Mon, 25 Dec 2017 15:33:17 +0800 Subject: [PATCH 71/98] add '()' to support python3 --- AlphaGo/play.py | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 5777982..4e4aa6f 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -5,7 +5,14 @@ import re import Pyro4 import time import os -import cPickle + +python_version = sys.version_info + +if python_version < (3, 0): + import cPickle +else: + import _pickle as cPickle + class Data(object): def __init__(self): @@ -53,7 +60,7 @@ if __name__ == '__main__': # start a name server if no name server exists if len(os.popen('ps aux | grep pyro4-ns | grep -v grep').readlines()) == 0: start_new_server = subprocess.Popen(['pyro4-ns', '&']) - print "Start Name Sever : " + str(start_new_server.pid) # + str(start_new_server.wait()) + print("Start Name Sever : " + str(start_new_server.pid)) # + str(start_new_server.wait()) time.sleep(1) # start two different player with different network weights. @@ -73,12 +80,15 @@ if __name__ == '__main__': server_list = "" while (black_role_name not in server_list) or (white_role_name not in server_list): - server_list = subprocess.check_output(['pyro4-nsc', 'list']) - print "Waiting for the server start..." 
+ if python_version < (3, 0): + server_list = subprocess.check_output(['pyro4-nsc', 'list']) + else: + server_list = subprocess.check_output(['pyro4-nsc', 'list']) + print("Waiting for the server start...") time.sleep(1) - print server_list - print "Start black player at : " + str(agent_v0.pid) - print "Start white player at : " + str(agent_v1.pid) + print(server_list) + print("Start black player at : " + str(agent_v0.pid)) + print("Start white player at : " + str(agent_v1.pid)) data = Data() player = [None] * 2 @@ -109,12 +119,12 @@ if __name__ == '__main__': board = eval(board[board.index('['):board.index(']') + 1]) for i in range(size[game_name]): for j in range(size[game_name]): - print show[board[i * size[game_name] + j]] + " ", - print "\n", + print(show[board[i * size[game_name] + j]] + " ",) + print("\n",) data.boards.append(board) start_time = time.time() move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn] + '\n') - print role[turn] + " : " + str(move), + print(role[turn] + " : " + str(move),) num += 1 match = re.search(pattern, move) if match is not None: @@ -133,7 +143,7 @@ if __name__ == '__main__': prob = eval(prob) data.probs.append(prob) score = player[turn].run_cmd(str(num) + ' get_score') - print "Finished : ", score.split(" ")[1] + print("Finished : ", score.split(" ")[1]) # TODO: generalize the player if eval(score.split(" ")[1]) > 0: data.winner = 1 @@ -157,8 +167,8 @@ if __name__ == '__main__': print(e) subprocess.call(["kill", "-9", str(agent_v0.pid)]) subprocess.call(["kill", "-9", str(agent_v1.pid)]) - print "Kill all player, finish all game." + print("Kill all player, finish all game.") subprocess.call(["kill", "-9", str(agent_v0.pid)]) subprocess.call(["kill", "-9", str(agent_v1.pid)]) - print "Kill all player, finish all game." 
+ print("Kill all player, finish all game.") From 64da200e5d4d4cff8c1642f4def897cefadbb87d Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Mon, 25 Dec 2017 16:26:51 +0800 Subject: [PATCH 72/98] move , from inside of () to outside of () --- AlphaGo/play.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 4e4aa6f..b3cc02a 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -75,7 +75,7 @@ if __name__ == '__main__': agent_v1 = subprocess.Popen( ['python', '-u', 'player.py', '--game=' + game_name, '--role=' + white_role_name, - '--checkpoint_path=' + str(args.black_weight_path), '--debug=' + str(args.debug)], + '--checkpoint_path=' + str(args.white_weight_path), '--debug=' + str(args.debug)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) server_list = "" @@ -119,12 +119,12 @@ if __name__ == '__main__': board = eval(board[board.index('['):board.index(']') + 1]) for i in range(size[game_name]): for j in range(size[game_name]): - print(show[board[i * size[game_name] + j]] + " ",) - print("\n",) + print(show[board[i * size[game_name] + j]] + " "), + print("\n"), data.boards.append(board) start_time = time.time() move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn] + '\n') - print(role[turn] + " : " + str(move),) + print(role[turn] + " : " + str(move)), num += 1 match = re.search(pattern, move) if match is not None: From fcb160dff674f3d587dfd61a79ceffaeacb18ba1 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Mon, 25 Dec 2017 16:35:43 +0800 Subject: [PATCH 73/98] fix python 2,3 print format error --- AlphaGo/play.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AlphaGo/play.py b/AlphaGo/play.py index b3cc02a..e67621a 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -143,7 +143,7 @@ if __name__ == '__main__': prob = eval(prob) data.probs.append(prob) score = player[turn].run_cmd(str(num) + ' get_score') - print("Finished : ", score.split(" ")[1]) + 
print("Finished : {}".format(score.split(" ")[1])) # TODO: generalize the player if eval(score.split(" ")[1]) > 0: data.winner = 1 From 4379f4c0fd87ff8925724a4db67fad99bdff3098 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Mon, 25 Dec 2017 16:40:38 +0800 Subject: [PATCH 74/98] modify play.py for better experience --- AlphaGo/play.py | 59 +++++++++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 21 deletions(-) diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 5777982..6526f13 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -7,6 +7,7 @@ import time import os import cPickle + class Data(object): def __init__(self): self.boards = [] @@ -24,15 +25,16 @@ if __name__ == '__main__': """ # TODO : we should set the network path in a more configurable way. parser = argparse.ArgumentParser() - parser.add_argument("--result_path", type=str, default="./data/") + parser.add_argument("--data_path", type=str, default="./data/") parser.add_argument("--black_weight_path", type=str, default=None) parser.add_argument("--white_weight_path", type=str, default=None) - parser.add_argument("--id", type=int, default=0) + parser.add_argument("--id", type=int, default=-1) parser.add_argument("--debug", type=bool, default=False) + parser.add_argument("--game", type=str, default="go") args = parser.parse_args() - if not os.path.exists(args.result_path): - os.mkdir(args.result_path) + if not os.path.exists(args.data_path): + os.mkdir(args.data_path) # black_weight_path = "./checkpoints" # white_weight_path = "./checkpoints_origin" if args.black_weight_path is not None and (not os.path.exists(args.black_weight_path)): @@ -57,18 +59,34 @@ if __name__ == '__main__': time.sleep(1) # start two different player with different network weights. 
+ server_list = subprocess.check_output(['pyro4-nsc', 'list']) + index = [] + if server_list is not None: + server_list = server_list.split("\n")[3:-2] + for s in server_list: + id = s.split(" ")[0][5:] + index.append(eval(id)) + index.sort() + if args.id == -1: + if index: + args.id = index[-1] + 1 + else: + args.id = 0 + else: + if args.id in index: + raise ValueError("Name exists in name server!") + black_role_name = 'black' + str(args.id) white_role_name = 'white' + str(args.id) - game_name = 'go' agent_v0 = subprocess.Popen( - ['python', '-u', 'player.py', '--game=' + game_name, '--role=' + black_role_name, + ['python', '-u', 'player.py', '--game=' + args.game, '--role=' + black_role_name, '--checkpoint_path=' + str(args.black_weight_path), '--debug=' + str(args.debug)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) agent_v1 = subprocess.Popen( - ['python', '-u', 'player.py', '--game=' + game_name, '--role=' + white_role_name, - '--checkpoint_path=' + str(args.black_weight_path), '--debug=' + str(args.debug)], + ['python', '-u', 'player.py', '--game=' + args.game, '--role=' + white_role_name, + '--checkpoint_path=' + str(args.white_weight_path), '--debug=' + str(args.debug)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) server_list = "" @@ -103,13 +121,13 @@ if __name__ == '__main__': pass_flag = [False, False] print("Start game {}".format(game_num)) # end the game if both palyer chose to pass, or play too much turns - while not (pass_flag[0] and pass_flag[1]) and num < size[game_name] ** 2 * 2: + while not (pass_flag[0] and pass_flag[1]) and num < size[args.game] ** 2 * 2: turn = num % 2 board = player[turn].run_cmd(str(num) + ' show_board') board = eval(board[board.index('['):board.index(']') + 1]) - for i in range(size[game_name]): - for j in range(size[game_name]): - print show[board[i * size[game_name] + j]] + " ", + for i in range(size[args.game]): + for j in range(size[args.game]): + print 
show[board[i * size[args.game] + j]] + " ", print "\n", data.boards.append(board) start_time = time.time() @@ -141,24 +159,23 @@ if __name__ == '__main__': data.winner = -1 player[0].run_cmd(str(num) + ' clear_board') player[1].run_cmd(str(num) + ' clear_board') - file_list = os.listdir(args.result_path) + file_list = os.listdir(args.data_path) if not file_list: data_num = 0 else: - file_list.sort(key=lambda file: os.path.getmtime(args.result_path + file) if not os.path.isdir( - args.result_path + file) else 0) + file_list.sort(key=lambda file: os.path.getmtime(args.data_path + file) if not os.path.isdir( + args.data_path + file) else 0) data_num = eval(file_list[-1][:-4]) + 1 with open("./data/" + str(data_num) + ".pkl", "wb") as file: picklestring = cPickle.dump(data, file) data.reset() game_num += 1 + except KeyboardInterrupt: + pass - except Exception as e: - print(e) - subprocess.call(["kill", "-9", str(agent_v0.pid)]) - subprocess.call(["kill", "-9", str(agent_v1.pid)]) - print "Kill all player, finish all game." - + ns = Pyro4.locateNS() + ns.unregister(black_role_name) + ns.unregister(white_role_name) subprocess.call(["kill", "-9", str(agent_v0.pid)]) subprocess.call(["kill", "-9", str(agent_v1.pid)]) print "Kill all player, finish all game." 
From 76f641a0f1b0583ccd2bee2892f996be970152f9 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Mon, 25 Dec 2017 16:51:44 +0800 Subject: [PATCH 75/98] minor fixed --- AlphaGo/play.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/AlphaGo/play.py b/AlphaGo/play.py index c0bdc5b..6b57b86 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -92,7 +92,7 @@ if __name__ == '__main__': stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) agent_v1 = subprocess.Popen( - ['python', '-u', 'player.py', '--game=' + game_name, '--role=' + white_role_name, + ['python', '-u', 'player.py', '--game=' + args.game, '--role=' + white_role_name, '--checkpoint_path=' + str(args.white_weight_path), '--debug=' + str(args.debug)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) @@ -183,9 +183,6 @@ if __name__ == '__main__': except KeyboardInterrupt: pass - ns = Pyro4.locateNS() - ns.unregister(black_role_name) - ns.unregister(white_role_name) subprocess.call(["kill", "-9", str(agent_v0.pid)]) subprocess.call(["kill", "-9", str(agent_v1.pid)]) print("Kill all player, finish all game.") From 725fc2c04eb7b98350684519dabbd7fdd48b32ea Mon Sep 17 00:00:00 2001 From: rtz19970824 Date: Tue, 26 Dec 2017 13:17:46 +0800 Subject: [PATCH 76/98] pass the checkpoint path to the model --- AlphaGo/game.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index a962f5c..72ae2e0 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -22,8 +22,8 @@ import time class Game: ''' Load the real game and trained weights. - - TODO : Maybe merge with the engine class in future, + + TODO : Maybe merge with the engine class in future, currently leave it untouched for interacting with Go UI. 
''' def __init__(self, name=None, role=None, debug=False, checkpoint_path=None): @@ -46,7 +46,7 @@ class Game: else: raise ValueError(name + " is an unknown game...") - self.evaluator = model.ResNet(self.size, self.size ** 2 + 1, history_length=self.history_length) + self.evaluator = model.ResNet(self.size, self.size ** 2 + 1, history_length=self.history_length, checkpoint_path=checkpoint_path) self.latest_boards = deque(maxlen=self.history_length) for _ in range(self.history_length): self.latest_boards.append(self.board) From aa6b5434c673c8d7c83c290bacd4a92b1ac0832b Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Tue, 26 Dec 2017 14:46:14 +0800 Subject: [PATCH 77/98] add debuf info for mcts and add softmax for the prior --- AlphaGo/game.py | 15 +++++++++++---- AlphaGo/model.py | 4 ++-- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 72ae2e0..ec39f94 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -71,6 +71,13 @@ class Game: mcts = MCTS(self.game_engine, self.evaluator, [latest_boards, color], self.size ** 2 + 1, role=self.role, debug=self.debug, inverse=True) mcts.search(max_step=100) + if self.debug: + file = open("mcts_debug.log", 'ab') + np.savetxt(file, mcts.root.Q, header="\nQ value : ", fmt='%.4f', newline=", ") + np.savetxt(file, mcts.root.W, header="\nW value : ", fmt='%.4f', newline=", ") + np.savetxt(file, mcts.root.N, header="\nN value : ", fmt="%d", newline=", ") + np.savetxt(file, mcts.root.prior, header="\nprior : ", fmt='%.4f', newline=", ") + file.close() temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) choice = np.random.choice(self.size ** 2 + 1, 1, p=prob).tolist()[0] @@ -119,7 +126,7 @@ class Game: sys.stdout.flush() if __name__ == "__main__": - print("test game.py") - #file = open("debug.txt", "a") - #file.write("mcts check\n") - #file.close() + game = Game(name="go", checkpoint_path="./checkpoint") + game.debug = True + game.think_play_move(utils.BLACK) + diff --git 
a/AlphaGo/model.py b/AlphaGo/model.py index 0549f41..704a034 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -80,7 +80,7 @@ class Data(object): class ResNet(object): - def __init__(self, board_size, action_num, history_length=1, residual_block_num=20, checkpoint_path=None): + def __init__(self, board_size, action_num, history_length=1, residual_block_num=10, checkpoint_path=None): """ the resnet model @@ -161,7 +161,7 @@ class ResNet(object): 'The length of history cannot meet the need of the model, given {}, need {}'.format(len(history), self.history_length)) state = self._history2state(history, color) - return self.sess.run([self.p, self.v], feed_dict={self.x: state, self.is_training: False}) + return self.sess.run([tf.nn.softmax(self.p), self.v], feed_dict={self.x: state, self.is_training: False}) def _history2state(self, history, color): """ From 8f508c790b0b8351e1dfab25df7416337dfb8ac0 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Tue, 26 Dec 2017 15:07:15 +0800 Subject: [PATCH 78/98] add role for mcts debug --- AlphaGo/game.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index ec39f94..d123a92 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -73,10 +73,10 @@ class Game: mcts.search(max_step=100) if self.debug: file = open("mcts_debug.log", 'ab') - np.savetxt(file, mcts.root.Q, header="\nQ value : ", fmt='%.4f', newline=", ") - np.savetxt(file, mcts.root.W, header="\nW value : ", fmt='%.4f', newline=", ") - np.savetxt(file, mcts.root.N, header="\nN value : ", fmt="%d", newline=", ") - np.savetxt(file, mcts.root.prior, header="\nprior : ", fmt='%.4f', newline=", ") + np.savetxt(file, mcts.root.Q, header="\n" + self.role + " Q value : ", fmt='%.4f', newline=", ") + np.savetxt(file, mcts.root.W, header="\n" + self.role + " W value : ", fmt='%.4f', newline=", ") + np.savetxt(file, mcts.root.N, header="\n" + self.role + " N value : ", fmt="%d", newline=", ") + np.savetxt(file, 
mcts.root.prior, header="\n" + self.role + " prior : ", fmt='%.4f', newline=", ") file.close() temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) From 029ab199f4a8da3fd15897cd9f3ef830467ad578 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Tue, 26 Dec 2017 16:47:24 +0800 Subject: [PATCH 79/98] add softmax for mcts root node --- AlphaGo/model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/AlphaGo/model.py b/AlphaGo/model.py index 704a034..dbfc5ca 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -124,6 +124,7 @@ class ResNet(object): h = residual_block(h, self.is_training) self.v = value_head(h, self.is_training) self.p = policy_head(h, self.is_training, self.action_num) + self.prob = tf.nn.softmax(self.p) self.value_loss = tf.reduce_mean(tf.square(self.z - self.v)) self.policy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.pi, logits=self.p)) @@ -161,7 +162,7 @@ class ResNet(object): 'The length of history cannot meet the need of the model, given {}, need {}'.format(len(history), self.history_length)) state = self._history2state(history, color) - return self.sess.run([tf.nn.softmax(self.p), self.v], feed_dict={self.x: state, self.is_training: False}) + return self.sess.run([self.prob, self.v], feed_dict={self.x: state, self.is_training: False}) def _history2state(self, history, color): """ From 0c3ff3bf373f8c3c12a9572de9dac568e7cb69eb Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Tue, 26 Dec 2017 19:29:35 +0800 Subject: [PATCH 80/98] delete unused code --- AlphaGo/.gitignore | 2 +- AlphaGo/engine.py | 16 +++++----------- AlphaGo/play.py | 46 +++++++++++++++++++++------------------------- AlphaGo/player.py | 14 ++------------ 4 files changed, 29 insertions(+), 49 deletions(-) diff --git a/AlphaGo/.gitignore b/AlphaGo/.gitignore index ff61326..38ff946 100644 --- a/AlphaGo/.gitignore +++ b/AlphaGo/.gitignore @@ -1,5 +1,5 @@ data checkpoints -checkpoints_origin +random *.log *.txt diff --git 
a/AlphaGo/engine.py b/AlphaGo/engine.py index b662dbd..d298aea 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -13,8 +13,6 @@ import utils class GTPEngine(): def __init__(self, **kwargs): - self.size = 9 - self.komi = 6.5 try: self._game = kwargs['game_obj'] self._game.clear() @@ -143,11 +141,9 @@ class GTPEngine(): self.disconnect = True return None, True - def cmd_boardsize(self, args, **kwargs): - if args.isdigit(): - size = int(args) - self.size = size - self._game.set_size(size) + def cmd_boardsize(self, board_size, **kwargs): + if board_size.isdigit(): + self._game.set_size(int(board_size)) return None, True else: return 'non digit size', False @@ -156,11 +152,9 @@ class GTPEngine(): self._game.clear() return None, True - def cmd_komi(self, args, **kwargs): + def cmd_komi(self, komi, **kwargs): try: - komi = float(args) - self.komi = komi - self._game.set_komi(komi) + self._game.set_komi(float(komi)) return None, True except ValueError: raise ValueError("syntax error") diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 6b57b86..884d2ab 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -5,6 +5,8 @@ import re import Pyro4 import time import os +import utils +from time import gmtime, strftime python_version = sys.version_info @@ -13,8 +15,6 @@ if python_version < (3, 0): else: import _pickle as cPickle - - class Data(object): def __init__(self): self.boards = [] @@ -45,9 +45,9 @@ if __name__ == '__main__': # black_weight_path = "./checkpoints" # white_weight_path = "./checkpoints_origin" if args.black_weight_path is not None and (not os.path.exists(args.black_weight_path)): - raise ValueError("Can't not find the network weights for black player.") + raise ValueError("Can't find the network weights for black player.") if args.white_weight_path is not None and (not os.path.exists(args.white_weight_path)): - raise ValueError("Can't not find the network weights for white player.") + raise ValueError("Can't find the network weights for white player.") 
# kill the old server # kill_old_server = subprocess.Popen(['killall', 'pyro4-ns']) @@ -86,27 +86,29 @@ if __name__ == '__main__': black_role_name = 'black' + str(args.id) white_role_name = 'white' + str(args.id) - agent_v0 = subprocess.Popen( + #TODO : check if we can get the output of player from the stdout, for debug convenience + black_player = subprocess.Popen( ['python', '-u', 'player.py', '--game=' + args.game, '--role=' + black_role_name, '--checkpoint_path=' + str(args.black_weight_path), '--debug=' + str(args.debug)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - agent_v1 = subprocess.Popen( + white_player = subprocess.Popen( ['python', '-u', 'player.py', '--game=' + args.game, '--role=' + white_role_name, - '--checkpoint_path=' + str(args.white_weight_path), '--debug=' + str(args.debug)], + '--checkpoint_path=' + str(args.white_weight_path), '--debug=' + str(args.debug)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) server_list = "" while (black_role_name not in server_list) or (white_role_name not in server_list): if python_version < (3, 0): + # TODO : @renyong what is the difference between those two options? 
server_list = subprocess.check_output(['pyro4-nsc', 'list']) else: server_list = subprocess.check_output(['pyro4-nsc', 'list']) print("Waiting for the server start...") time.sleep(1) print(server_list) - print("Start black player at : " + str(agent_v0.pid)) - print("Start white player at : " + str(agent_v1.pid)) + print("Start black player at : " + str(black_player.pid)) + print("Start white player at : " + str(white_player.pid)) data = Data() player = [None] * 2 @@ -121,7 +123,7 @@ if __name__ == '__main__': size = {"go":9, "reversi":8} show = ['.', 'X', 'O'] - evaluate_rounds = 1 + evaluate_rounds = 100 game_num = 0 try: #while True: @@ -141,8 +143,8 @@ if __name__ == '__main__': print "\n", data.boards.append(board) start_time = time.time() - move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn] + '\n') - print(role[turn] + " : " + str(move)), + move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn])[:-1] + print("\n" + role[turn] + " : " + str(move)), num += 1 match = re.search(pattern, move) if match is not None: @@ -160,29 +162,23 @@ if __name__ == '__main__': prob = prob.replace('],', ']') prob = eval(prob) data.probs.append(prob) - score = player[turn].run_cmd(str(num) + ' get_score') + score = player[0].run_cmd(str(num) + ' get_score') print("Finished : {}".format(score.split(" ")[1])) - # TODO: generalize the player if eval(score.split(" ")[1]) > 0: - data.winner = 1 + data.winner = utils.BLACK if eval(score.split(" ")[1]) < 0: - data.winner = -1 + data.winner = utils.WHITE player[0].run_cmd(str(num) + ' clear_board') player[1].run_cmd(str(num) + ' clear_board') file_list = os.listdir(args.data_path) - if not file_list: - data_num = 0 - else: - file_list.sort(key=lambda file: os.path.getmtime(args.data_path + file) if not os.path.isdir( - args.data_path + file) else 0) - data_num = eval(file_list[-1][:-4]) + 1 - with open("./data/" + str(data_num) + ".pkl", "wb") as file: + current_time = strftime("%Y%m%d_%H%M%S", gmtime()) + with 
open(args.data_path + current_time + ".pkl", "wb") as file: picklestring = cPickle.dump(data, file) data.reset() game_num += 1 except KeyboardInterrupt: pass - subprocess.call(["kill", "-9", str(agent_v0.pid)]) - subprocess.call(["kill", "-9", str(agent_v1.pid)]) + subprocess.call(["kill", "-9", str(black_player.pid)]) + subprocess.call(["kill", "-9", str(white_player.pid)]) print("Kill all player, finish all game.") diff --git a/AlphaGo/player.py b/AlphaGo/player.py index a8f61c1..b93c124 100644 --- a/AlphaGo/player.py +++ b/AlphaGo/player.py @@ -1,8 +1,5 @@ import argparse -import time -import sys import Pyro4 - from game import Game from engine import GTPEngine @@ -17,10 +14,8 @@ class Player(object): self.engine = kwargs['engine'] def run_cmd(self, command): - #return "inside the Player of player.py" return self.engine.run_cmd(command) - if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument("--checkpoint_path", type=str, default=None) @@ -29,12 +24,7 @@ if __name__ == '__main__': parser.add_argument("--game", type=str, default=False) args = parser.parse_args() - if args.checkpoint_path == 'None': - args.checkpoint_path = None - debug = False - if args.debug == "True": - debug = True - game = Game(name=args.game, role=args.role, checkpoint_path=args.checkpoint_path, debug=debug) + game = Game(name=args.game, role=args.role, checkpoint_path=eval(args.checkpoint_path), debug=eval(args.debug)) engine = GTPEngine(game_obj=game, name='tianshou', version=0) daemon = Pyro4.Daemon() # make a Pyro daemon @@ -43,7 +33,7 @@ if __name__ == '__main__': print "Init " + args.role + " player finished" uri = daemon.register(player) # register the greeting maker as a Pyro object print "Start on name " + args.role - ns.register(args.role, uri) # register the object with a name in the name server + ns.register(args.role, uri) # register the object with a name in the name server print "Start Request Loop " + str(uri) daemon.requestLoop() # start the event 
loop of the server to wait for calls From 7f0565a5f65b7784ba7145bcce237a09aff8f632 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Tue, 26 Dec 2017 22:19:10 +0800 Subject: [PATCH 81/98] variable rename and delete redundant code --- AlphaGo/game.py | 9 +++------ tianshou/core/mcts/mcts.py | 13 +++++++------ 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index d123a92..f17c7af 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -46,7 +46,8 @@ class Game: else: raise ValueError(name + " is an unknown game...") - self.evaluator = model.ResNet(self.size, self.size ** 2 + 1, history_length=self.history_length, checkpoint_path=checkpoint_path) + self.evaluator = model.ResNet(self.size, self.size ** 2 + 1, history_length=self.history_length, + checkpoint_path=checkpoint_path) self.latest_boards = deque(maxlen=self.history_length) for _ in range(self.history_length): self.latest_boards.append(self.board) @@ -91,11 +92,7 @@ class Game: # this function can be called directly to play the opponent's move if vertex == utils.PASS: return True - # TODO this implementation is not very elegant - if self.name == "go": - res = self.game_engine.executor_do_move(self.history, self.latest_boards, self.board, color, vertex) - elif self.name == "reversi": - res = self.game_engine.executor_do_move(self.history, self.latest_boards, self.board, color, vertex) + res = self.game_engine.executor_do_move(self.history, self.latest_boards, self.board, color, vertex) return res def think_play_move(self, color): diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 5c96d38..3d547c6 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -129,6 +129,7 @@ class ActionNode(object): self.mcts.action_selection_time += time.time() - head return self.parent, self.action else: + # self.next_state is None means we have reach the terminate state self.mcts.action_selection_time += time.time() - head return self.parent, 
self.action @@ -147,20 +148,20 @@ class ActionNode(object): class MCTS(object): - def __init__(self, simulator, evaluator, root, action_num, method="UCT", + def __init__(self, simulator, evaluator, start_state, action_num, method="UCT", role="unknown", debug=False, inverse=False): self.simulator = simulator self.evaluator = evaluator self.role = role self.debug = debug - prior, _ = self.evaluator(root) + prior, _ = self.evaluator(start_state) self.action_num = action_num if method == "": - self.root = root + self.root = start_state if method == "UCT": - self.root = UCTNode(None, None, root, action_num, prior, mcts=self, inverse=inverse) + self.root = UCTNode(None, None, start_state, action_num, prior, mcts=self, inverse=inverse) if method == "TS": - self.root = TSNode(None, None, root, action_num, prior, inverse=inverse) + self.root = TSNode(None, None, start_state, action_num, prior, inverse=inverse) self.inverse = inverse # time spend on each step @@ -191,7 +192,7 @@ class MCTS(object): self.expansion_time += exp_time self.backpropagation_time += back_time step += 1 - if (self.debug): + if self.debug: file = open("mcts_profiling.txt", "a") file.write("[" + str(self.role) + "]" + " sel " + '%.3f' % self.selection_time + " " From c788b253fbf27706d0cac693f6c02ac806376c5a Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Wed, 27 Dec 2017 01:04:09 +0800 Subject: [PATCH 82/98] show the stdout of player.py for debugging --- AlphaGo/play.py | 17 ++++++++++++----- AlphaGo/player.py | 12 ++++++++---- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 884d2ab..038953f 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -54,11 +54,6 @@ if __name__ == '__main__': # print "kill the old pyro4 name server, the return code is : " + str(kill_old_server.wait()) # time.sleep(1) - # start a name server to find the remote object - # start_new_server = subprocess.Popen(['pyro4-ns', '&']) - # print "Start Name Sever : " + 
str(start_new_server.pid) # + str(start_new_server.wait()) - # time.sleep(1) - # start a name server if no name server exists if len(os.popen('ps aux | grep pyro4-ns | grep -v grep').readlines()) == 0: start_new_server = subprocess.Popen(['pyro4-ns', '&']) @@ -91,11 +86,23 @@ if __name__ == '__main__': ['python', '-u', 'player.py', '--game=' + args.game, '--role=' + black_role_name, '--checkpoint_path=' + str(args.black_weight_path), '--debug=' + str(args.debug)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + bp_output = black_player.stdout.readline() + bp_message = bp_output + while bp_output != '' and "Start requestLoop" not in bp_output: + bp_output = black_player.stdout.readline() + bp_message += bp_output + print("============ " + black_role_name + " message ============" + "\n" + bp_message), white_player = subprocess.Popen( ['python', '-u', 'player.py', '--game=' + args.game, '--role=' + white_role_name, '--checkpoint_path=' + str(args.white_weight_path), '--debug=' + str(args.debug)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + wp_output = white_player.stdout.readline() + wp_message = wp_output + while wp_output != '' and "Start requestLoop" not in wp_output: + wp_output = white_player.stdout.readline() + wp_message += wp_output + print("============ " + white_role_name + " message ============" + "\n" + wp_message), server_list = "" while (black_role_name not in server_list) or (white_role_name not in server_list): diff --git a/AlphaGo/player.py b/AlphaGo/player.py index b93c124..8d46ae5 100644 --- a/AlphaGo/player.py +++ b/AlphaGo/player.py @@ -24,16 +24,20 @@ if __name__ == '__main__': parser.add_argument("--game", type=str, default=False) args = parser.parse_args() - game = Game(name=args.game, role=args.role, checkpoint_path=eval(args.checkpoint_path), debug=eval(args.debug)) + if args.checkpoint_path == 'None': + args.checkpoint_path = None + game = Game(name=args.game, role=args.role, + 
checkpoint_path=args.checkpoint_path, + debug=eval(args.debug)) engine = GTPEngine(game_obj=game, name='tianshou', version=0) daemon = Pyro4.Daemon() # make a Pyro daemon ns = Pyro4.locateNS() # find the name server player = Player(role=args.role, engine=engine) - print "Init " + args.role + " player finished" + print("Init " + args.role + " player finished") uri = daemon.register(player) # register the greeting maker as a Pyro object - print "Start on name " + args.role + print("Start on name " + args.role) ns.register(args.role, uri) # register the object with a name in the name server - print "Start Request Loop " + str(uri) + print("Start requestLoop " + str(uri)) daemon.requestLoop() # start the event loop of the server to wait for calls From a1f6044cba6114bc931fb69e208aa4bd1fa0e61d Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Wed, 27 Dec 2017 11:43:04 +0800 Subject: [PATCH 83/98] rewrite selection function of ActionNode for clarity, add and delete some notes --- AlphaGo/play.py | 3 +-- tianshou/core/mcts/mcts.py | 18 +++++++----------- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 038953f..7c7961c 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -24,7 +24,6 @@ class Data(object): def reset(self): self.__init__() - if __name__ == '__main__': """ Starting two different players which load network weights to evaluate the winning ratio. 
@@ -81,13 +80,13 @@ if __name__ == '__main__': black_role_name = 'black' + str(args.id) white_role_name = 'white' + str(args.id) - #TODO : check if we can get the output of player from the stdout, for debug convenience black_player = subprocess.Popen( ['python', '-u', 'player.py', '--game=' + args.game, '--role=' + black_role_name, '--checkpoint_path=' + str(args.black_weight_path), '--debug=' + str(args.debug)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) bp_output = black_player.stdout.readline() bp_message = bp_output + # '' means player.py failed to start, "Start requestLoop" means player.py start successfully while bp_output != '' and "Start requestLoop" not in bp_output: bp_output = black_player.stdout.readline() bp_message += bp_output diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 3d547c6..f64b5a0 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -97,7 +97,6 @@ class ActionNode(object): self.action = action self.children = {} self.next_state = None - self.origin_state = None self.state_type = None self.reward = 0 self.mcts = mcts @@ -118,18 +117,15 @@ class ActionNode(object): head = time.time() self.next_state, self.reward = simulator.simulate_step_forward(self.parent.state, self.action) self.mcts.simulate_sf_time += time.time() - head + if self.next_state is None: # next_state is None means that self.parent.state is the terminate state + self.mcts.action_selection_time += time.time() - head + return self.parent, self.action self.origin_state = self.next_state - self.state_type = type(self.next_state) self.type_conversion_to_tuple() - if self.next_state is not None: - if self.next_state in self.children.keys(): - self.mcts.action_selection_time += time.time() - head - return self.children[self.next_state].selection(simulator) - else: - self.mcts.action_selection_time += time.time() - head - return self.parent, self.action - else: - # self.next_state is None means we have 
reach the terminate state + if self.next_state in self.children.keys(): # next state has already visited before + self.mcts.action_selection_time += time.time() - head + return self.children[self.next_state].selection(simulator) + else: # next state is a new state never seen before self.mcts.action_selection_time += time.time() - head return self.parent, self.action From 9f6098497336d989c70b7f6fc67ebf2bc4ad6e85 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Wed, 27 Dec 2017 14:08:34 +0800 Subject: [PATCH 84/98] remove type_conversion function --- AlphaGo/model.py | 4 ++-- tianshou/core/mcts/mcts.py | 33 ++++++++++----------------------- 2 files changed, 12 insertions(+), 25 deletions(-) diff --git a/AlphaGo/model.py b/AlphaGo/model.py index dbfc5ca..6fde6e5 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -161,8 +161,8 @@ class ResNet(object): raise ValueError( 'The length of history cannot meet the need of the model, given {}, need {}'.format(len(history), self.history_length)) - state = self._history2state(history, color) - return self.sess.run([self.prob, self.v], feed_dict={self.x: state, self.is_training: False}) + eval_state = self._history2state(history, color) + return self.sess.run([self.prob, self.v], feed_dict={self.x: eval_state, self.is_training: False}) def _history2state(self, history, color): """ diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index f64b5a0..98ab056 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -6,11 +6,11 @@ import collections c_puct = 5 -def list2tuple(obj): +def hashable_conversion(obj): if isinstance(obj, collections.Hashable): return obj else: - return tuple(list2tuple(sub) for sub in obj) + return tuple(hashable_conversion(sub) for sub in obj) class MCTSNode(object): def __init__(self, parent, action, state, action_num, prior, inverse=False): @@ -79,7 +79,7 @@ class UCTNode(MCTSNode): self.mask = simulator.simulate_get_mask(self.state, range(self.action_num)) 
self.ucb[self.mask] = -float("Inf") - +# Code reserved for Thompson Sampling class TSNode(MCTSNode): def __init__(self, parent, action, state, action_num, prior, method="Gaussian", inverse=False): super(TSNode, self).__init__(parent, action, state, action_num, prior, inverse) @@ -97,22 +97,11 @@ class ActionNode(object): self.action = action self.children = {} self.next_state = None + self.next_state_hashable = None self.state_type = None self.reward = 0 self.mcts = mcts - def type_conversion_to_tuple(self): - t0 = time.time() - if isinstance(self.next_state, np.ndarray): - self.next_state = self.next_state.tolist() - t1 = time.time() - if isinstance(self.next_state, list): - self.next_state = list2tuple(self.next_state) - t2 = time.time() - self.mcts.ndarray2list_time += t1 - t0 - self.mcts.list2tuple_time += t2 - t1 - self.mcts.check += sys.getsizeof(object) - def selection(self, simulator): head = time.time() self.next_state, self.reward = simulator.simulate_step_forward(self.parent.state, self.action) @@ -120,29 +109,28 @@ class ActionNode(object): if self.next_state is None: # next_state is None means that self.parent.state is the terminate state self.mcts.action_selection_time += time.time() - head return self.parent, self.action - self.origin_state = self.next_state - self.type_conversion_to_tuple() - if self.next_state in self.children.keys(): # next state has already visited before + self.next_state_hashable = hashable_conversion(self.next_state) + if self.next_state_hashable in self.children.keys(): # next state has already visited before self.mcts.action_selection_time += time.time() - head - return self.children[self.next_state].selection(simulator) + return self.children[self.next_state_hashable].selection(simulator) else: # next state is a new state never seen before self.mcts.action_selection_time += time.time() - head return self.parent, self.action def expansion(self, evaluator, action_num): if self.next_state is not None: + # note that 
self.next_state was assigned already at the selection function prior, value = evaluator(self.next_state) - self.children[self.next_state] = UCTNode(self, self.action, self.origin_state, action_num, prior, + self.children[self.next_state_hashable] = UCTNode(self, self.action, self.next_state, action_num, prior, mcts=self.mcts, inverse=self.parent.inverse) return value - else: + else: # self.next_state is None means MCTS selected a terminate node return 0. def backpropagation(self, value): self.reward += value self.parent.backpropagation(self.action) - class MCTS(object): def __init__(self, simulator, evaluator, start_state, action_num, method="UCT", role="unknown", debug=False, inverse=False): @@ -214,6 +202,5 @@ class MCTS(object): t3 = time.time() return t1 - t0, t2 - t1, t3 - t2 - if __name__ == "__main__": pass From 8d102d249fd05a274f8d4174d061b0fc046181cb Mon Sep 17 00:00:00 2001 From: JialianLee Date: Wed, 27 Dec 2017 18:55:00 +0800 Subject: [PATCH 85/98] Modification for backpropagation process --- tianshou/core/mcts/mcts.py | 5 ++++- tianshou/core/mcts/mcts_virtual_loss.py | 8 ++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 98ab056..f733f83 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -198,7 +198,10 @@ class MCTS(object): t1 = time.time() value = node.children[new_action].expansion(self.evaluator, self.action_num) t2 = time.time() - node.children[new_action].backpropagation(value + 0.) + if self.inverse: + node.children[new_action].backpropagation(-value + 0.) + else: + node.children[new_action].backpropagation(value + 0.) 
t3 = time.time() return t1 - t0, t2 - t1, t3 - t2 diff --git a/tianshou/core/mcts/mcts_virtual_loss.py b/tianshou/core/mcts/mcts_virtual_loss.py index f27d8a3..5826bd5 100644 --- a/tianshou/core/mcts/mcts_virtual_loss.py +++ b/tianshou/core/mcts/mcts_virtual_loss.py @@ -278,8 +278,12 @@ class MCTSVirtualLoss(object): priors[i], nodes[i].inverse) - for i in range(self.batch_size): - nodes[i].children[new_actions[i]].backpropagation(values[i] + 0.) + if self.inverse: + for i in range(self.batch_size): + nodes[i].children[new_actions[i]].backpropagation(-values[i] + 0.) + else: + for i in range(self.batch_size): + nodes[i].children[new_actions[i]].backpropagation(values[i] + 0.) ##### TODO From f2291efc72cd88a55db3719b04997d94770d10bc Mon Sep 17 00:00:00 2001 From: rtz19970824 Date: Wed, 27 Dec 2017 19:54:36 +0800 Subject: [PATCH 86/98] check exists when save data --- AlphaGo/play.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 7c7961c..d1a5301 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -178,6 +178,9 @@ if __name__ == '__main__': player[1].run_cmd(str(num) + ' clear_board') file_list = os.listdir(args.data_path) current_time = strftime("%Y%m%d_%H%M%S", gmtime()) + if os.path.exists(args.data_path + current_time + ".pkl"): + time.sleep(1) + current_time = strftime("%Y%m%d_%H%M%S", gmtime()) with open(args.data_path + current_time + ".pkl", "wb") as file: picklestring = cPickle.dump(data, file) data.reset() From d48982d59ed2ca797d07ef6afee5f211a7e22aed Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Wed, 27 Dec 2017 20:49:54 +0800 Subject: [PATCH 87/98] move evaluator from action node to mcts --- AlphaGo/model.py | 2 ++ tianshou/core/mcts/mcts.py | 10 ++++------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/AlphaGo/model.py b/AlphaGo/model.py index 6fde6e5..c3bb9f0 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -156,6 +156,8 @@ class ResNet(object): # Note : maybe we can use it for 
isolating test of MCTS #prob = [1.0 / self.action_num] * self.action_num #return [prob, np.random.uniform(-1, 1)] + if state is None: + return [[0.0] * self.action_num, 0] history, color = state if len(history) != self.history_length: raise ValueError( diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index f733f83..a1b0b3d 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -117,15 +117,12 @@ class ActionNode(object): self.mcts.action_selection_time += time.time() - head return self.parent, self.action - def expansion(self, evaluator, action_num): + def expansion(self, prior, action_num): if self.next_state is not None: # note that self.next_state was assigned already at the selection function - prior, value = evaluator(self.next_state) + # self.next_state is None means MCTS selected a terminate node self.children[self.next_state_hashable] = UCTNode(self, self.action, self.next_state, action_num, prior, mcts=self.mcts, inverse=self.parent.inverse) - return value - else: # self.next_state is None means MCTS selected a terminate node - return 0. def backpropagation(self, value): self.reward += value @@ -196,7 +193,8 @@ class MCTS(object): t0 = time.time() node, new_action = self.root.selection(self.simulator) t1 = time.time() - value = node.children[new_action].expansion(self.evaluator, self.action_num) + prior, value = self.evaluator(node.children[new_action].next_state) + node.children[new_action].expansion(prior, self.action_num) t2 = time.time() if self.inverse: node.children[new_action].backpropagation(-value + 0.) 
From affd0319e283a26276e44c1359bcd72172751da5 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Wed, 27 Dec 2017 21:11:40 +0800 Subject: [PATCH 88/98] rewrite the selection fuction of UCTNode to return the action node instead of return the state node and next action --- tianshou/core/mcts/mcts.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index a1b0b3d..4c23809 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -108,14 +108,14 @@ class ActionNode(object): self.mcts.simulate_sf_time += time.time() - head if self.next_state is None: # next_state is None means that self.parent.state is the terminate state self.mcts.action_selection_time += time.time() - head - return self.parent, self.action + return self self.next_state_hashable = hashable_conversion(self.next_state) if self.next_state_hashable in self.children.keys(): # next state has already visited before self.mcts.action_selection_time += time.time() - head return self.children[self.next_state_hashable].selection(simulator) else: # next state is a new state never seen before self.mcts.action_selection_time += time.time() - head - return self.parent, self.action + return self def expansion(self, prior, action_num): if self.next_state is not None: @@ -191,15 +191,15 @@ class MCTS(object): def _expand(self): t0 = time.time() - node, new_action = self.root.selection(self.simulator) + next_action = self.root.selection(self.simulator) t1 = time.time() - prior, value = self.evaluator(node.children[new_action].next_state) - node.children[new_action].expansion(prior, self.action_num) + prior, value = self.evaluator(next_action.next_state) + next_action.expansion(prior, self.action_num) t2 = time.time() if self.inverse: - node.children[new_action].backpropagation(-value + 0.) + next_action.backpropagation(-value + 0.) else: - node.children[new_action].backpropagation(value + 0.) 
+ next_action.backpropagation(value + 0.) t3 = time.time() return t1 - t0, t2 - t1, t3 - t2 From 47676993fdd5bcd99b8d484f689245681dcd09db Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Thu, 28 Dec 2017 01:16:24 +0800 Subject: [PATCH 89/98] solve the performance bottleneck by only hashing the last board --- AlphaGo/go.py | 4 ++++ AlphaGo/reversi.py | 4 ++++ tianshou/core/mcts/mcts.py | 36 +++++++++++++----------------------- 3 files changed, 21 insertions(+), 23 deletions(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 55f5a4a..987fe93 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -222,6 +222,10 @@ class Go: new_color = -color return [history_boards, new_color], 0 + def simulate_hashable_conversion(self, state): + # since go is MDP, we only need the last board for hashing + return tuple(state[0][-1]) + def executor_do_move(self, history, latest_boards, current_board, color, vertex): if not self._rule_check(history, current_board, color, vertex): return False diff --git a/AlphaGo/reversi.py b/AlphaGo/reversi.py index c6c8a5b..08a5ec5 100644 --- a/AlphaGo/reversi.py +++ b/AlphaGo/reversi.py @@ -97,6 +97,10 @@ class Reversi: history_boards.append(new_board) return [history_boards, 0 - color], 0 + def simulate_hashable_conversion(self, state): + # since go is MDP, we only need the last board for hashing + return tuple(state[0][-1]) + def _get_winner(self, board): black_num, white_num = self._number_of_black_and_white(board) black_win = black_num - white_num diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 4c23809..9625261 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -1,17 +1,9 @@ import numpy as np import math import time -import sys -import collections c_puct = 5 -def hashable_conversion(obj): - if isinstance(obj, collections.Hashable): - return obj - else: - return tuple(hashable_conversion(sub) for sub in obj) - class MCTSNode(object): def __init__(self, parent, action, state, action_num, prior, 
inverse=False): self.parent = parent @@ -109,7 +101,9 @@ class ActionNode(object): if self.next_state is None: # next_state is None means that self.parent.state is the terminate state self.mcts.action_selection_time += time.time() - head return self - self.next_state_hashable = hashable_conversion(self.next_state) + head = time.time() + self.next_state_hashable = simulator.simulate_hashable_conversion(self.next_state) + self.mcts.hash_time += time.time() - head if self.next_state_hashable in self.children.keys(): # next state has already visited before self.mcts.action_selection_time += time.time() - head return self.children[self.next_state_hashable].selection(simulator) @@ -153,9 +147,7 @@ class MCTS(object): self.state_selection_time = 0 self.simulate_sf_time = 0 self.valid_mask_time = 0 - self.ndarray2list_time = 0 - self.list2tuple_time = 0 - self.check = 0 + self.hash_time = 0 def search(self, max_step=None, max_time=None): step = 0 @@ -174,18 +166,16 @@ class MCTS(object): self.backpropagation_time += back_time step += 1 if self.debug: - file = open("mcts_profiling.txt", "a") + file = open("mcts_profiling.log", "a") file.write("[" + str(self.role) + "]" - + " sel " + '%.3f' % self.selection_time + " " - + " sel_sta " + '%.3f' % self.state_selection_time + " " - + " valid " + '%.3f' % self.valid_mask_time + " " - + " sel_act " + '%.3f' % self.action_selection_time + " " - + " array2list " + '%.4f' % self.ndarray2list_time + " " - + " check " + str(self.check) + " " - + " list2tuple " + '%.4f' % self.list2tuple_time + " \t" - + " forward " + '%.3f' % self.simulate_sf_time + " " - + " exp " + '%.3f' % self.expansion_time + " " - + " bak " + '%.3f' % self.backpropagation_time + " " + + " sel " + '%.3f' % self.selection_time + " " + + " sel_sta " + '%.3f' % self.state_selection_time + " " + + " valid " + '%.3f' % self.valid_mask_time + " " + + " sel_act " + '%.3f' % self.action_selection_time + " " + + " hash " + '%.3f' % self.hash_time + " " + + " step forward " 
+ '%.3f' % self.simulate_sf_time + " " + + " expansion " + '%.3f' % self.expansion_time + " " + + " backprop " + '%.3f' % self.backpropagation_time + " " + "\n") file.close() From 08b6649fead4c9f550c6c8b6c51f44ac605a67a3 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Thu, 28 Dec 2017 15:52:31 +0800 Subject: [PATCH 90/98] test next_action.next_state in MCTS --- AlphaGo/model.py | 2 -- tianshou/core/mcts/mcts.py | 15 ++++++++------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/AlphaGo/model.py b/AlphaGo/model.py index c3bb9f0..6fde6e5 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -156,8 +156,6 @@ class ResNet(object): # Note : maybe we can use it for isolating test of MCTS #prob = [1.0 / self.action_num] * self.action_num #return [prob, np.random.uniform(-1, 1)] - if state is None: - return [[0.0] * self.action_num, 0] history, color = state if len(history) != self.history_length: raise ValueError( diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 9625261..1251d05 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -112,11 +112,8 @@ class ActionNode(object): return self def expansion(self, prior, action_num): - if self.next_state is not None: - # note that self.next_state was assigned already at the selection function - # self.next_state is None means MCTS selected a terminate node - self.children[self.next_state_hashable] = UCTNode(self, self.action, self.next_state, action_num, prior, - mcts=self.mcts, inverse=self.parent.inverse) + self.children[self.next_state_hashable] = UCTNode(self, self.action, self.next_state, action_num, prior, + mcts=self.mcts, inverse=self.parent.inverse) def backpropagation(self, value): self.reward += value @@ -183,8 +180,12 @@ class MCTS(object): t0 = time.time() next_action = self.root.selection(self.simulator) t1 = time.time() - prior, value = self.evaluator(next_action.next_state) - next_action.expansion(prior, self.action_num) + # next_action.next_state is 
None means the parent state node of next_action is a terminate node + if next_action.next_state is not None: + prior, value = self.evaluator(next_action.next_state) + next_action.expansion(prior, self.action_num) + else: + value = 0 t2 = time.time() if self.inverse: next_action.backpropagation(-value + 0.) From b699258e769429f881fdef34ca947e1311983be3 Mon Sep 17 00:00:00 2001 From: rtz19970824 Date: Thu, 28 Dec 2017 15:55:07 +0800 Subject: [PATCH 91/98] debug for reversi --- AlphaGo/game.py | 2 +- AlphaGo/player.py | 7 +++---- AlphaGo/reversi.py | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index f17c7af..82cf254 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -123,7 +123,7 @@ class Game: sys.stdout.flush() if __name__ == "__main__": - game = Game(name="go", checkpoint_path="./checkpoint") + game = Game(name="reversi", checkpoint_path=None) game.debug = True game.think_play_move(utils.BLACK) diff --git a/AlphaGo/player.py b/AlphaGo/player.py index 8d46ae5..bd2a2d1 100644 --- a/AlphaGo/player.py +++ b/AlphaGo/player.py @@ -18,12 +18,11 @@ class Player(object): if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument("--checkpoint_path", type=str, default=None) + parser.add_argument("--checkpoint_path", type=str, default="None") parser.add_argument("--role", type=str, default="unknown") - parser.add_argument("--debug", type=str, default=False) - parser.add_argument("--game", type=str, default=False) + parser.add_argument("--debug", type=str, default="False") + parser.add_argument("--game", type=str, default="go") args = parser.parse_args() - if args.checkpoint_path == 'None': args.checkpoint_path = None game = Game(name=args.game, role=args.role, diff --git a/AlphaGo/reversi.py b/AlphaGo/reversi.py index 08a5ec5..1685b66 100644 --- a/AlphaGo/reversi.py +++ b/AlphaGo/reversi.py @@ -99,7 +99,7 @@ class Reversi: def simulate_hashable_conversion(self, state): # since go is MDP, we 
only need the last board for hashing - return tuple(state[0][-1]) + return tuple(state[0][-1].flatten().tolist()) def _get_winner(self, board): black_num, white_num = self._number_of_black_and_white(board) From 5457e5134e3cb6c0566df17b9b79fc479e773375 Mon Sep 17 00:00:00 2001 From: JialianLee Date: Thu, 28 Dec 2017 16:20:44 +0800 Subject: [PATCH 92/98] add a unit test --- tianshou/core/mcts/unit_test/Evaluator.py | 28 +++ tianshou/core/mcts/unit_test/ZOGame.py | 70 ++++++++ tianshou/core/mcts/unit_test/agent.py | 27 +++ tianshou/core/mcts/unit_test/game.py | 37 ++++ tianshou/core/mcts/unit_test/mcts.py | 198 ++++++++++++++++++++++ 5 files changed, 360 insertions(+) create mode 100644 tianshou/core/mcts/unit_test/Evaluator.py create mode 100644 tianshou/core/mcts/unit_test/ZOGame.py create mode 100644 tianshou/core/mcts/unit_test/agent.py create mode 100644 tianshou/core/mcts/unit_test/game.py create mode 100644 tianshou/core/mcts/unit_test/mcts.py diff --git a/tianshou/core/mcts/unit_test/Evaluator.py b/tianshou/core/mcts/unit_test/Evaluator.py new file mode 100644 index 0000000..a1f9456 --- /dev/null +++ b/tianshou/core/mcts/unit_test/Evaluator.py @@ -0,0 +1,28 @@ +import numpy as np + + +class evaluator(object): + def __init__(self, env, action_num): + self.env = env + self.action_num = action_num + + def __call__(self, state): + raise NotImplementedError("Need to implement the evaluator") + + +class rollout_policy(evaluator): + def __init__(self, env, action_num): + super(rollout_policy, self).__init__(env, action_num) + self.is_terminated = False + + def __call__(self, state): + # TODO: prior for rollout policy + total_reward = 0. 
+ action = np.random.randint(0, self.action_num) + state, reward = self.env.simulate_step_forward(state, action) + total_reward += reward + while state is not None: + action = np.random.randint(0, self.action_num) + state, reward = self.env.simulate_step_forward(state, action) + total_reward += reward + return np.ones([self.action_num])/self.action_num, total_reward diff --git a/tianshou/core/mcts/unit_test/ZOGame.py b/tianshou/core/mcts/unit_test/ZOGame.py new file mode 100644 index 0000000..8a2ed54 --- /dev/null +++ b/tianshou/core/mcts/unit_test/ZOGame.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python +import numpy as np +import copy + + +class ZOTree: + def __init__(self, size): + self.size = size + self.depth = self.size * 2 + + def simulate_step_forward(self, state, action): + seq, color = copy.deepcopy(state) + if len(seq) == self.depth: + winner = self.executor_get_reward(state) + return None, color * winner + else: + seq.append(int(action)) + return [seq, 0 - color], 0 + + def executor_get_reward(self, state): + seq = np.array(state[0], dtype='int16') + length = len(seq) + if length != self.depth: + raise ValueError("The game is not terminated!") + result = np.sum(seq) + if result > 0: + winner = 1 + elif result < 0: + winner = -1 + else: + winner = 0 + return winner + + def executor_do_move(self, state, action): + seq, color = state + if len(seq) == self.depth: + return False + else: + seq.append(int(action)) + if len(seq) == self.depth: + return False + return True + + def v_value(self, state): + seq, color = state + choosen_result = np.sum(np.array(seq, dtype='int16')) + if color == 1: + if choosen_result > 0: + return 1 + elif choosen_result < 0: + return -1 + else: + return 0 + elif color == -1: + if choosen_result > 1: + return 1 + elif choosen_result < 1: + return -1 + else: + return 0 + else: + raise ValueError("Wrong color") + +if __name__ == "__main__": + size = 2 + game = ZOTree(size) + seq = [1, -1, 1, 1] + result = game.executor_do_move([seq, 1], 1) 
+ print(result) + print(seq) \ No newline at end of file diff --git a/tianshou/core/mcts/unit_test/agent.py b/tianshou/core/mcts/unit_test/agent.py new file mode 100644 index 0000000..1bffdd0 --- /dev/null +++ b/tianshou/core/mcts/unit_test/agent.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python +import numpy as np +import ZOGame +import Evaluator +from mcts import MCTS + +temp = 1 + + +class Agent: + def __init__(self, size, color): + self.size = size + self.color = color + self.simulator = ZOGame.ZOTree(self.size) + self.evaluator = Evaluator.rollout_policy(self.simulator, 2) + + def gen_move(self, seq): + if len(seq) >= 2 * self.size: + raise ValueError("Game is terminated.") + mcts = MCTS(self.simulator, self.evaluator, [seq, self.color], 2) + mcts.search(max_step=50) + N = mcts.root.N + N = np.power(N, 1.0 / temp) + prob = N / np.sum(N) + print("prob: {}".format(prob)) + action = int(np.random.binomial(1, prob[1]) * 2 - 1) + return action \ No newline at end of file diff --git a/tianshou/core/mcts/unit_test/game.py b/tianshou/core/mcts/unit_test/game.py new file mode 100644 index 0000000..7ac044c --- /dev/null +++ b/tianshou/core/mcts/unit_test/game.py @@ -0,0 +1,37 @@ +import ZOGame +import agent + + +if __name__ == '__main__': + print("Our game has 2 players.") + print("Player 1 has color 1 and plays first. 
Player 2 has color -1 and plays following player 1.") + print("Both player choose 1 or -1 for an action.") + size = 1 + print("This game has {} iterations".format(size)) + print("If the final sequence has more 1 that -1, player 1 wins.") + print("If the final sequence has less 1 that -1, player 2 wins.") + print("Otherwise, both players get 0.\n") + game = ZOGame.ZOTree(size) + player1 = agent.Agent(size, 1) + player2 = agent.Agent(size, -1) + + seq = [] + print("Sequence is {}\n".format(seq)) + while True: + action1 = player1.gen_move(seq) + print("action1 is {}".format(action1)) + result = game.executor_do_move([seq, 1], action1) + print("Sequence is {}\n".format(seq)) + if not result: + winner = game.executor_get_reward([seq, 1]) + break + action2 = player2.gen_move(seq) + print("action2 is {}".format(action2)) + result = game.executor_do_move([seq, -1], action2) + print("Sequence is {}\n".format(seq)) + if not result: + winner = game.executor_get_reward([seq, 1]) + break + + print("The choice sequence is {}".format(seq)) + print("The game result is {}".format(winner)) \ No newline at end of file diff --git a/tianshou/core/mcts/unit_test/mcts.py b/tianshou/core/mcts/unit_test/mcts.py new file mode 100644 index 0000000..1251d05 --- /dev/null +++ b/tianshou/core/mcts/unit_test/mcts.py @@ -0,0 +1,198 @@ +import numpy as np +import math +import time + +c_puct = 5 + +class MCTSNode(object): + def __init__(self, parent, action, state, action_num, prior, inverse=False): + self.parent = parent + self.action = action + self.children = {} + self.state = state + self.action_num = action_num + self.prior = np.array(prior).reshape(-1) + self.inverse = inverse + + def selection(self, simulator): + raise NotImplementedError("Need to implement function selection") + + def backpropagation(self, action): + raise NotImplementedError("Need to implement function backpropagation") + + def valid_mask(self, simulator): + pass + +class UCTNode(MCTSNode): + def __init__(self, parent, 
action, state, action_num, prior, mcts, inverse=False): + super(UCTNode, self).__init__(parent, action, state, action_num, prior, inverse) + self.Q = np.zeros([action_num]) + self.W = np.zeros([action_num]) + self.N = np.zeros([action_num]) + self.c_puct = c_puct + self.ucb = self.Q + self.c_puct * self.prior * math.sqrt(np.sum(self.N)) / (self.N + 1) + self.mask = None + self.elapse_time = 0 + self.mcts = mcts + + def selection(self, simulator): + head = time.time() + self.valid_mask(simulator) + self.mcts.valid_mask_time += time.time() - head + action = np.argmax(self.ucb) + if action in self.children.keys(): + self.mcts.state_selection_time += time.time() - head + return self.children[action].selection(simulator) + else: + self.children[action] = ActionNode(self, action, mcts=self.mcts) + self.mcts.state_selection_time += time.time() - head + return self.children[action].selection(simulator) + + def backpropagation(self, action): + action = int(action) + self.N[action] += 1 + self.W[action] += self.children[action].reward + for i in range(self.action_num): + if self.N[i] != 0: + self.Q[i] = (self.W[i] + 0.) / self.N[i] + self.ucb = self.Q + c_puct * self.prior * math.sqrt(np.sum(self.N)) / (self.N + 1.) 
+ if self.parent is not None: + if self.inverse: + self.parent.backpropagation(-self.children[action].reward) + else: + self.parent.backpropagation(self.children[action].reward) + + def valid_mask(self, simulator): + # let all invalid actions be illegal in mcts + if not hasattr(simulator, 'simulate_get_mask'): + pass + else: + if self.mask is None: + self.mask = simulator.simulate_get_mask(self.state, range(self.action_num)) + self.ucb[self.mask] = -float("Inf") + +# Code reserved for Thompson Sampling +class TSNode(MCTSNode): + def __init__(self, parent, action, state, action_num, prior, method="Gaussian", inverse=False): + super(TSNode, self).__init__(parent, action, state, action_num, prior, inverse) + if method == "Beta": + self.alpha = np.ones([action_num]) + self.beta = np.ones([action_num]) + if method == "Gaussian": + self.mu = np.zeros([action_num]) + self.sigma = np.zeros([action_num]) + + +class ActionNode(object): + def __init__(self, parent, action, mcts): + self.parent = parent + self.action = action + self.children = {} + self.next_state = None + self.next_state_hashable = None + self.state_type = None + self.reward = 0 + self.mcts = mcts + + def selection(self, simulator): + head = time.time() + self.next_state, self.reward = simulator.simulate_step_forward(self.parent.state, self.action) + self.mcts.simulate_sf_time += time.time() - head + if self.next_state is None: # next_state is None means that self.parent.state is the terminate state + self.mcts.action_selection_time += time.time() - head + return self + head = time.time() + self.next_state_hashable = simulator.simulate_hashable_conversion(self.next_state) + self.mcts.hash_time += time.time() - head + if self.next_state_hashable in self.children.keys(): # next state has already visited before + self.mcts.action_selection_time += time.time() - head + return self.children[self.next_state_hashable].selection(simulator) + else: # next state is a new state never seen before + 
self.mcts.action_selection_time += time.time() - head + return self + + def expansion(self, prior, action_num): + self.children[self.next_state_hashable] = UCTNode(self, self.action, self.next_state, action_num, prior, + mcts=self.mcts, inverse=self.parent.inverse) + + def backpropagation(self, value): + self.reward += value + self.parent.backpropagation(self.action) + +class MCTS(object): + def __init__(self, simulator, evaluator, start_state, action_num, method="UCT", + role="unknown", debug=False, inverse=False): + self.simulator = simulator + self.evaluator = evaluator + self.role = role + self.debug = debug + prior, _ = self.evaluator(start_state) + self.action_num = action_num + if method == "": + self.root = start_state + if method == "UCT": + self.root = UCTNode(None, None, start_state, action_num, prior, mcts=self, inverse=inverse) + if method == "TS": + self.root = TSNode(None, None, start_state, action_num, prior, inverse=inverse) + self.inverse = inverse + + # time spend on each step + self.selection_time = 0 + self.expansion_time = 0 + self.backpropagation_time = 0 + self.action_selection_time = 0 + self.state_selection_time = 0 + self.simulate_sf_time = 0 + self.valid_mask_time = 0 + self.hash_time = 0 + + def search(self, max_step=None, max_time=None): + step = 0 + start_time = time.time() + if max_step is None: + max_step = int("Inf") + if max_time is None: + max_time = float("Inf") + if max_step is None and max_time is None: + raise ValueError("Need a stop criteria!") + + while step < max_step and time.time() - start_time < max_step: + sel_time, exp_time, back_time = self._expand() + self.selection_time += sel_time + self.expansion_time += exp_time + self.backpropagation_time += back_time + step += 1 + if self.debug: + file = open("mcts_profiling.log", "a") + file.write("[" + str(self.role) + "]" + + " sel " + '%.3f' % self.selection_time + " " + + " sel_sta " + '%.3f' % self.state_selection_time + " " + + " valid " + '%.3f' % self.valid_mask_time 
+ " " + + " sel_act " + '%.3f' % self.action_selection_time + " " + + " hash " + '%.3f' % self.hash_time + " " + + " step forward " + '%.3f' % self.simulate_sf_time + " " + + " expansion " + '%.3f' % self.expansion_time + " " + + " backprop " + '%.3f' % self.backpropagation_time + " " + + "\n") + file.close() + + def _expand(self): + t0 = time.time() + next_action = self.root.selection(self.simulator) + t1 = time.time() + # next_action.next_state is None means the parent state node of next_action is a terminate node + if next_action.next_state is not None: + prior, value = self.evaluator(next_action.next_state) + next_action.expansion(prior, self.action_num) + else: + value = 0 + t2 = time.time() + if self.inverse: + next_action.backpropagation(-value + 0.) + else: + next_action.backpropagation(value + 0.) + t3 = time.time() + return t1 - t0, t2 - t1, t3 - t2 + +if __name__ == "__main__": + pass From 0352866b1ab1c6da3ac230479c4ebe8493ed71d0 Mon Sep 17 00:00:00 2001 From: JialianLee Date: Thu, 28 Dec 2017 16:27:28 +0800 Subject: [PATCH 93/98] Modification for game engine --- tianshou/core/mcts/unit_test/ZOGame.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tianshou/core/mcts/unit_test/ZOGame.py b/tianshou/core/mcts/unit_test/ZOGame.py index 8a2ed54..acad284 100644 --- a/tianshou/core/mcts/unit_test/ZOGame.py +++ b/tianshou/core/mcts/unit_test/ZOGame.py @@ -17,6 +17,10 @@ class ZOTree: seq.append(int(action)) return [seq, 0 - color], 0 + def simulate_hashable_conversion(self, state): + # since go is MDP, we only need the last board for hashing + return tuple(state[0]) + def executor_get_reward(self, state): seq = np.array(state[0], dtype='int16') length = len(seq) From 4140d8c9d28fd2164ebb1f1dce902b77e0d9c5b5 Mon Sep 17 00:00:00 2001 From: JialianLee Date: Thu, 28 Dec 2017 17:10:25 +0800 Subject: [PATCH 94/98] Modification on unit test --- tianshou/core/mcts/unit_test/Evaluator.py | 3 +- tianshou/core/mcts/unit_test/ZOGame.py | 36 +++++++++++++++++++++-- 
tianshou/core/mcts/unit_test/agent.py | 2 +- tianshou/core/mcts/unit_test/game.py | 6 ++-- tianshou/core/mcts/unit_test/mcts.py | 2 ++ 5 files changed, 41 insertions(+), 8 deletions(-) diff --git a/tianshou/core/mcts/unit_test/Evaluator.py b/tianshou/core/mcts/unit_test/Evaluator.py index a1f9456..f78da95 100644 --- a/tianshou/core/mcts/unit_test/Evaluator.py +++ b/tianshou/core/mcts/unit_test/Evaluator.py @@ -18,6 +18,7 @@ class rollout_policy(evaluator): def __call__(self, state): # TODO: prior for rollout policy total_reward = 0. + color = state[1] action = np.random.randint(0, self.action_num) state, reward = self.env.simulate_step_forward(state, action) total_reward += reward @@ -25,4 +26,4 @@ class rollout_policy(evaluator): action = np.random.randint(0, self.action_num) state, reward = self.env.simulate_step_forward(state, action) total_reward += reward - return np.ones([self.action_num])/self.action_num, total_reward + return np.ones([self.action_num])/self.action_num, total_reward * color diff --git a/tianshou/core/mcts/unit_test/ZOGame.py b/tianshou/core/mcts/unit_test/ZOGame.py index acad284..b598579 100644 --- a/tianshou/core/mcts/unit_test/ZOGame.py +++ b/tianshou/core/mcts/unit_test/ZOGame.py @@ -9,6 +9,7 @@ class ZOTree: self.depth = self.size * 2 def simulate_step_forward(self, state, action): + self._check_state(state) seq, color = copy.deepcopy(state) if len(seq) == self.depth: winner = self.executor_get_reward(state) @@ -18,15 +19,24 @@ class ZOTree: return [seq, 0 - color], 0 def simulate_hashable_conversion(self, state): + self._check_state(state) # since go is MDP, we only need the last board for hashing return tuple(state[0]) - + def executor_get_reward(self, state): + self._check_state(state) seq = np.array(state[0], dtype='int16') length = len(seq) if length != self.depth: raise ValueError("The game is not terminated!") - result = np.sum(seq) + ones = 0 + zeros = 0 + for i in range(len(seq)): + if seq[i] == 0: + zeros += 1 + if seq[i] == 1: 
+ ones += 1 + result = ones - zeros if result > 0: winner = 1 elif result < 0: @@ -36,6 +46,7 @@ class ZOTree: return winner def executor_do_move(self, state, action): + self._check_state(state) seq, color = state if len(seq) == self.depth: return False @@ -46,8 +57,16 @@ class ZOTree: return True def v_value(self, state): + self._check_state(state) seq, color = state - choosen_result = np.sum(np.array(seq, dtype='int16')) + ones = 0 + zeros = 0 + for i in range(len(seq)): + if seq[i] == 0: + zeros += 1 + if seq[i] == 1: + ones += 1 + choosen_result = ones - zeros if color == 1: if choosen_result > 0: return 1 @@ -65,6 +84,17 @@ class ZOTree: else: raise ValueError("Wrong color") + def _check_state(self, state): + seq, color = state + if color == 1: + if len(seq) % 2: + raise ValueError("Color is 1 but the length of seq is odd!") + elif color == -1: + if not len(seq) % 2: + raise ValueError("Color is -1 but the length of seq is even!") + else: + raise ValueError("Wrong color!") + if __name__ == "__main__": size = 2 game = ZOTree(size) diff --git a/tianshou/core/mcts/unit_test/agent.py b/tianshou/core/mcts/unit_test/agent.py index 1bffdd0..ebe346e 100644 --- a/tianshou/core/mcts/unit_test/agent.py +++ b/tianshou/core/mcts/unit_test/agent.py @@ -23,5 +23,5 @@ class Agent: N = np.power(N, 1.0 / temp) prob = N / np.sum(N) print("prob: {}".format(prob)) - action = int(np.random.binomial(1, prob[1]) * 2 - 1) + action = int(np.random.binomial(1, prob[1])) return action \ No newline at end of file diff --git a/tianshou/core/mcts/unit_test/game.py b/tianshou/core/mcts/unit_test/game.py index 7ac044c..14c2df5 100644 --- a/tianshou/core/mcts/unit_test/game.py +++ b/tianshou/core/mcts/unit_test/game.py @@ -5,11 +5,11 @@ import agent if __name__ == '__main__': print("Our game has 2 players.") print("Player 1 has color 1 and plays first. 
Player 2 has color -1 and plays following player 1.") - print("Both player choose 1 or -1 for an action.") + print("Both player choose 1 or 0 for an action.") size = 1 print("This game has {} iterations".format(size)) - print("If the final sequence has more 1 that -1, player 1 wins.") - print("If the final sequence has less 1 that -1, player 2 wins.") + print("If the final sequence has more 1 that 0, player 1 wins.") + print("If the final sequence has less 1 that 0, player 2 wins.") print("Otherwise, both players get 0.\n") game = ZOGame.ZOTree(size) player1 = agent.Agent(size, 1) diff --git a/tianshou/core/mcts/unit_test/mcts.py b/tianshou/core/mcts/unit_test/mcts.py index 1251d05..dd89f57 100644 --- a/tianshou/core/mcts/unit_test/mcts.py +++ b/tianshou/core/mcts/unit_test/mcts.py @@ -162,6 +162,8 @@ class MCTS(object): self.expansion_time += exp_time self.backpropagation_time += back_time step += 1 + print("Q = {}".format(self.root.Q)) + print("N = {}".format(self.root.N)) if self.debug: file = open("mcts_profiling.log", "a") file.write("[" + str(self.role) + "]" From 2dfab68efe58a047919b16e7c89a83d3c7f13d7f Mon Sep 17 00:00:00 2001 From: rtz19970824 Date: Thu, 28 Dec 2017 19:28:21 +0800 Subject: [PATCH 95/98] debug for unit test --- AlphaGo/game.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 82cf254..8329b1b 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -28,6 +28,8 @@ class Game: ''' def __init__(self, name=None, role=None, debug=False, checkpoint_path=None): self.name = name + if role is None: + raise ValueError("Need a role!") self.role = role self.debug = debug if self.name == "go": @@ -123,7 +125,7 @@ class Game: sys.stdout.flush() if __name__ == "__main__": - game = Game(name="reversi", checkpoint_path=None) + game = Game(name="reversi", role="black", checkpoint_path=None) game.debug = True game.think_play_move(utils.BLACK) From 63a0d32b3445cb1fc2994218d1a3cfd0ed2bbf08 Mon Sep 17 
00:00:00 2001 From: Wenbo Hu Date: Fri, 29 Dec 2017 03:30:09 +0800 Subject: [PATCH 96/98] use hash table for check_global_isomorphous --- AlphaGo/game.py | 8 ++++++-- AlphaGo/go.py | 13 +++++++------ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 82cf254..60e09f0 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -35,6 +35,7 @@ class Game: self.komi = 3.75 self.history_length = 8 self.history = [] + self.history_set = set() self.game_engine = go.Go(size=self.size, komi=self.komi, role=self.role) self.board = [utils.EMPTY] * (self.size ** 2) elif self.name == "reversi": @@ -92,7 +93,10 @@ class Game: # this function can be called directly to play the opponent's move if vertex == utils.PASS: return True - res = self.game_engine.executor_do_move(self.history, self.latest_boards, self.board, color, vertex) + if self.name == "reversi": + res = self.game_engine.executor_do_move(self.history, self.latest_boards, self.board, color, vertex) + if self.name == "go": + res = self.game_engine.executor_do_move(self.history, self.history_set, self.latest_boards, self.board, color, vertex) return res def think_play_move(self, color): @@ -124,6 +128,6 @@ class Game: if __name__ == "__main__": game = Game(name="reversi", checkpoint_path=None) - game.debug = True + game.debug = False game.think_play_move(utils.BLACK) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 987fe93..cf6b7aa 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -97,12 +97,12 @@ class Go: for b in group: current_board[self._flatten(b)] = utils.EMPTY - def _check_global_isomorphous(self, history_boards, current_board, color, vertex): + def _check_global_isomorphous(self, history_boards_set, current_board, color, vertex): repeat = False next_board = copy.deepcopy(current_board) next_board[self._flatten(vertex)] = color self._process_board(next_board, color, vertex) - if next_board in history_boards: + if hash(tuple(next_board)) in history_boards_set: repeat = 
True return repeat @@ -158,7 +158,7 @@ class Go: vertex = self._deflatten(action) return vertex - def _rule_check(self, history_boards, current_board, color, vertex): + def _rule_check(self, history_boards_set, current_board, color, vertex): ### in board if not self._in_board(vertex): return False @@ -172,7 +172,7 @@ class Go: return False ### forbid global isomorphous - if self._check_global_isomorphous(history_boards, current_board, color, vertex): + if self._check_global_isomorphous(history_boards_set, current_board, color, vertex): return False return True @@ -226,13 +226,14 @@ class Go: # since go is MDP, we only need the last board for hashing return tuple(state[0][-1]) - def executor_do_move(self, history, latest_boards, current_board, color, vertex): - if not self._rule_check(history, current_board, color, vertex): + def executor_do_move(self, history, history_set, latest_boards, current_board, color, vertex): + if not self._rule_check(history_set, current_board, color, vertex): return False current_board[self._flatten(vertex)] = color self._process_board(current_board, color, vertex) history.append(copy.deepcopy(current_board)) latest_boards.append(copy.deepcopy(current_board)) + history_set.add(hash(tuple(current_board))) return True def _find_empty(self, current_board): From 01f39f40d3df481703401ebaf2d8305f232074b6 Mon Sep 17 00:00:00 2001 From: rtz19970824 Date: Thu, 28 Dec 2017 19:38:25 +0800 Subject: [PATCH 97/98] debug for unit test --- tianshou/core/mcts/unit_test/ZOGame.py | 17 +++++------------ tianshou/core/mcts/unit_test/agent.py | 4 ++-- tianshou/core/mcts/unit_test/game.py | 4 ++-- tianshou/core/mcts/unit_test/mcts.py | 2 +- 4 files changed, 10 insertions(+), 17 deletions(-) diff --git a/tianshou/core/mcts/unit_test/ZOGame.py b/tianshou/core/mcts/unit_test/ZOGame.py index b598579..0b3d771 100644 --- a/tianshou/core/mcts/unit_test/ZOGame.py +++ b/tianshou/core/mcts/unit_test/ZOGame.py @@ -29,17 +29,10 @@ class ZOTree: length = len(seq) if 
length != self.depth: raise ValueError("The game is not terminated!") - ones = 0 - zeros = 0 - for i in range(len(seq)): - if seq[i] == 0: - zeros += 1 - if seq[i] == 1: - ones += 1 - result = ones - zeros - if result > 0: + result = np.sum(seq) + if result > self.size: winner = 1 - elif result < 0: + elif result < self.size: winner = -1 else: winner = 0 @@ -98,7 +91,7 @@ class ZOTree: if __name__ == "__main__": size = 2 game = ZOTree(size) - seq = [1, -1, 1, 1] + seq = [1, 0, 1, 1] result = game.executor_do_move([seq, 1], 1) print(result) - print(seq) \ No newline at end of file + print(seq) diff --git a/tianshou/core/mcts/unit_test/agent.py b/tianshou/core/mcts/unit_test/agent.py index ebe346e..6dd34aa 100644 --- a/tianshou/core/mcts/unit_test/agent.py +++ b/tianshou/core/mcts/unit_test/agent.py @@ -17,11 +17,11 @@ class Agent: def gen_move(self, seq): if len(seq) >= 2 * self.size: raise ValueError("Game is terminated.") - mcts = MCTS(self.simulator, self.evaluator, [seq, self.color], 2) + mcts = MCTS(self.simulator, self.evaluator, [seq, self.color], 2, inverse=True) mcts.search(max_step=50) N = mcts.root.N N = np.power(N, 1.0 / temp) prob = N / np.sum(N) print("prob: {}".format(prob)) action = int(np.random.binomial(1, prob[1])) - return action \ No newline at end of file + return action diff --git a/tianshou/core/mcts/unit_test/game.py b/tianshou/core/mcts/unit_test/game.py index 14c2df5..6fb504b 100644 --- a/tianshou/core/mcts/unit_test/game.py +++ b/tianshou/core/mcts/unit_test/game.py @@ -6,7 +6,7 @@ if __name__ == '__main__': print("Our game has 2 players.") print("Player 1 has color 1 and plays first. 
Player 2 has color -1 and plays following player 1.") print("Both player choose 1 or 0 for an action.") - size = 1 + size = 2 print("This game has {} iterations".format(size)) print("If the final sequence has more 1 that 0, player 1 wins.") print("If the final sequence has less 1 that 0, player 2 wins.") @@ -34,4 +34,4 @@ if __name__ == '__main__': break print("The choice sequence is {}".format(seq)) - print("The game result is {}".format(winner)) \ No newline at end of file + print("The game result is {}".format(winner)) diff --git a/tianshou/core/mcts/unit_test/mcts.py b/tianshou/core/mcts/unit_test/mcts.py index dd89f57..49c9faf 100644 --- a/tianshou/core/mcts/unit_test/mcts.py +++ b/tianshou/core/mcts/unit_test/mcts.py @@ -187,7 +187,7 @@ class MCTS(object): prior, value = self.evaluator(next_action.next_state) next_action.expansion(prior, self.action_num) else: - value = 0 + value = 0. t2 = time.time() if self.inverse: next_action.backpropagation(-value + 0.) From 5849776c9aa48b7ef040200881eaecf9e71c0967 Mon Sep 17 00:00:00 2001 From: JialianLee Date: Fri, 29 Dec 2017 13:45:53 +0800 Subject: [PATCH 98/98] Modification and doc for unit test --- tianshou/core/mcts/unit_test/README.md | 21 +++++++++++++++++++++ tianshou/core/mcts/unit_test/ZOGame.py | 2 +- tianshou/core/mcts/unit_test/agent.py | 11 ++++++----- tianshou/core/mcts/unit_test/game.py | 8 +++++--- tianshou/core/mcts/unit_test/mcts.py | 2 -- 5 files changed, 33 insertions(+), 11 deletions(-) create mode 100644 tianshou/core/mcts/unit_test/README.md diff --git a/tianshou/core/mcts/unit_test/README.md b/tianshou/core/mcts/unit_test/README.md new file mode 100644 index 0000000..b7d0214 --- /dev/null +++ b/tianshou/core/mcts/unit_test/README.md @@ -0,0 +1,21 @@ +# Unit Test + +This is a two-player zero-sum perfect information extensive game. Player 1 and player 2 iteratively choose actions. At every iteration, player 1 players first and player 2 follows. Both players have choices 0 or 1. 
+ +The number of iterations is given as a fixed number. After one game finished, the game counts the number of 0s and 1s that are choosen. If the number of 1 is more than that of 0, player 1 gets 1 and player 2 gets -1. If the number of 1 is less than that of 0, player 1 gets -1 and player 2 gets 1. Otherwise, they both get 0. + +## Files + ++ game.py: run this file to play the game. ++ agent.py: a class for players. MCTS is used here. ++ ZOgame.py: the game environment. ++ mcts.py: MCTS method. ++ Evaluator: evaluator for MCTS. Rollout policy is also here. + +## Parameters + +Three paramters are given in game.py. + ++ size: the number of iterations ++ searching_step: the number of searching times of MCTS for one step ++ temp: the temporature paramter used to tradeoff exploitation and exploration diff --git a/tianshou/core/mcts/unit_test/ZOGame.py b/tianshou/core/mcts/unit_test/ZOGame.py index 0b3d771..a4ea5e9 100644 --- a/tianshou/core/mcts/unit_test/ZOGame.py +++ b/tianshou/core/mcts/unit_test/ZOGame.py @@ -29,7 +29,7 @@ class ZOTree: length = len(seq) if length != self.depth: raise ValueError("The game is not terminated!") - result = np.sum(seq) + result = np.sum(seq) if result > self.size: winner = 1 elif result < self.size: diff --git a/tianshou/core/mcts/unit_test/agent.py b/tianshou/core/mcts/unit_test/agent.py index 6dd34aa..f2946ce 100644 --- a/tianshou/core/mcts/unit_test/agent.py +++ b/tianshou/core/mcts/unit_test/agent.py @@ -4,13 +4,15 @@ import ZOGame import Evaluator from mcts import MCTS -temp = 1 + class Agent: - def __init__(self, size, color): + def __init__(self, size, color, searching_step, temp): self.size = size self.color = color + self.searching_step = searching_step + self.temp = temp self.simulator = ZOGame.ZOTree(self.size) self.evaluator = Evaluator.rollout_policy(self.simulator, 2) @@ -18,10 +20,9 @@ class Agent: if len(seq) >= 2 * self.size: raise ValueError("Game is terminated.") mcts = MCTS(self.simulator, self.evaluator, [seq, 
self.color], 2, inverse=True) - mcts.search(max_step=50) + mcts.search(max_step=self.searching_step) N = mcts.root.N - N = np.power(N, 1.0 / temp) + N = np.power(N, 1.0 / self.temp) prob = N / np.sum(N) - print("prob: {}".format(prob)) action = int(np.random.binomial(1, prob[1])) return action diff --git a/tianshou/core/mcts/unit_test/game.py b/tianshou/core/mcts/unit_test/game.py index 6fb504b..92fcea8 100644 --- a/tianshou/core/mcts/unit_test/game.py +++ b/tianshou/core/mcts/unit_test/game.py @@ -3,17 +3,19 @@ import agent if __name__ == '__main__': + size = 10 + seaching_step = 100 + temp = 1 print("Our game has 2 players.") print("Player 1 has color 1 and plays first. Player 2 has color -1 and plays following player 1.") print("Both player choose 1 or 0 for an action.") - size = 2 print("This game has {} iterations".format(size)) print("If the final sequence has more 1 that 0, player 1 wins.") print("If the final sequence has less 1 that 0, player 2 wins.") print("Otherwise, both players get 0.\n") game = ZOGame.ZOTree(size) - player1 = agent.Agent(size, 1) - player2 = agent.Agent(size, -1) + player1 = agent.Agent(size, 1, seaching_step, temp) + player2 = agent.Agent(size, -1, seaching_step, temp) seq = [] print("Sequence is {}\n".format(seq)) diff --git a/tianshou/core/mcts/unit_test/mcts.py b/tianshou/core/mcts/unit_test/mcts.py index 49c9faf..ab566f0 100644 --- a/tianshou/core/mcts/unit_test/mcts.py +++ b/tianshou/core/mcts/unit_test/mcts.py @@ -162,8 +162,6 @@ class MCTS(object): self.expansion_time += exp_time self.backpropagation_time += back_time step += 1 - print("Q = {}".format(self.root.Q)) - print("N = {}".format(self.root.N)) if self.debug: file = open("mcts_profiling.log", "a") file.write("[" + str(self.role) + "]"