diff --git a/AlphaGo/go.py b/AlphaGo/go.py
index b819c08..fe2ab74 100644
--- a/AlphaGo/go.py
+++ b/AlphaGo/go.py
@@ -212,11 +212,14 @@ class Go:
     def simulate_step_forward(self, state, action):
         # initialize the simulate_board from state
         history_boards, color = state
-        vertex = self._action2vertex(action)
-        new_board = self._do_move(copy.copy(history_boards[-1]), color, vertex)
-        history_boards.append(new_board)
-        new_color = -color
-        return [history_boards, new_color], 0
+        if history_boards[-1] == history_boards[-2] and action is utils.PASS:
+            return None, 2 * (float(self.executor_get_score(history_boards[-1]) > 0) - 0.5) * color
+        else:
+            vertex = self._action2vertex(action)
+            new_board = self._do_move(copy.copy(history_boards[-1]), color, vertex)
+            history_boards.append(new_board)
+            new_color = -color
+            return [history_boards, new_color], 0
 
     def executor_do_move(self, history, latest_boards, current_board, color, vertex):
         if not self._rule_check(history, current_board, color, vertex):
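Note on the `simulate_step_forward` change above: the game ends when a pass follows a pass, which the code detects by two identical boards at the top of the history. A minimal standalone sketch of that rule, where `PASS` and `get_score` are hypothetical stand-ins for `utils.PASS` and `Go.executor_get_score` (assumed to return a black-positive score margin):

```python
PASS = 81  # hypothetical pass index for a 9x9 board (81 points + pass)

def terminal_reward(history_boards, color, action, get_score):
    """Return (done, reward-for-current-player) under double-pass scoring."""
    # Two identical boards in a row mean the previous move was a pass,
    # so a pass now is the second consecutive pass: the game is over.
    if action == PASS and history_boards[-1] == history_boards[-2]:
        black_margin = get_score(history_boards[-1])
        # 2 * (float(margin > 0) - 0.5) maps "black wins" to +1 and
        # "white wins" to -1, then flips sign for the white player.
        return True, (1.0 if black_margin > 0 else -1.0) * color
    return False, 0.0

# e.g. terminal_reward([board, board], 1, PASS, lambda b: 3.5) -> (True, 1.0)
```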
diff --git a/AlphaGo/model.py b/AlphaGo/model.py
index 22e8626..68973ac 100644
--- a/AlphaGo/model.py
+++ b/AlphaGo/model.py
@@ -1,7 +1,6 @@
 import os
 import time
-import random
-import sys
+import copy
 import cPickle
 from collections import deque
 
@@ -224,11 +223,21 @@ class ResNet(object):
         else:
             start_time = time.time()
             for i in range(batch_size):
-                game_num = random.randint(0, self.window_length-1)
-                state_num = random.randint(0, self.training_data['length'][game_num]-1)
-                training_data['states'].append(np.expand_dims(self.training_data['states'][game_num][state_num], 0))
-                training_data['probs'].append(np.expand_dims(self.training_data['probs'][game_num][state_num], 0))
-                training_data['winner'].append(np.expand_dims(self.training_data['winner'][game_num][state_num], 0))
+                priority = self.training_data['length'] / sum(self.training_data['length'])
+                game_num = np.random.choice(self.window_length, p=priority)
+                state_num = np.random.randint(self.training_data['length'][game_num])
+                rotate_times = np.random.randint(4)
+                reflect_times = np.random.randint(2)
+                reflect_orientation = np.random.randint(2)
+                training_data['states'].append(
+                    self._preprocession(self.training_data['states'][game_num][state_num], reflect_times,
+                                        reflect_orientation, rotate_times))
+                training_data['probs'].append(
+                    self._preprocession(self.training_data['probs'][game_num][state_num], reflect_times,
+                                        reflect_orientation, rotate_times))
+                training_data['winner'].append(
+                    self._preprocession(self.training_data['winner'][game_num][state_num], reflect_times,
+                                        reflect_orientation, rotate_times))
             value_loss, policy_loss, reg, _ = self.sess.run(
                 [self.value_loss, self.policy_loss, self.reg, self.train_op],
                 feed_dict={self.x: np.concatenate(training_data['states'], axis=0),
@@ -280,6 +289,55 @@ class ResNet(object):
         winner = np.concatenate(winner, axis=0)
         return states, probs, winner
 
+    def _preprocession(self, board, reflect_times=0, reflect_orientation=0, rotate_times=0):
+        """
+        preprocessing for augmentation
+
+        :param board: a ndarray, board to process
+        :param reflect_times: an integer, how many times to reflect
+        :param reflect_orientation: an integer, which orientation to reflect
+        :param rotate_times: an integer, how many times to rotate
+        :return: the augmented board
+        """
+
+        new_board = copy.copy(board)
+        if new_board.ndim == 3:
+            new_board = np.expand_dims(new_board, axis=0)
+
+        new_board = self._board_reflection(new_board, reflect_times, reflect_orientation)
+        new_board = self._board_rotation(new_board, rotate_times)
+
+        return new_board
+
+    def _board_rotation(self, board, times):
+        """
+        rotate the board for augmentation
+        note that board's shape should be [batch_size, board_size, board_size, channels]
+
+        :param board: a ndarray, shape [batch_size, board_size, board_size, channels]
+        :param times: an integer, how many times to rotate
+        :return: the rotated board
+        """
+        return np.rot90(board, times, (1, 2))
+
+    def _board_reflection(self, board, times, orientation):
+        """
+        reflect the board for augmentation
+        note that board's shape should be [batch_size, board_size, board_size, channels]
+
+        :param board: a ndarray, shape [batch_size, board_size, board_size, channels]
+        :param times: an integer, how many times to reflect
+        :param orientation: an integer, which orientation to reflect (0 for rows, 1 for columns)
+        :return: the reflected board
+        """
+        new_board = copy.copy(board)
+        for _ in range(times):
+            if orientation == 0:
+                new_board = new_board[:, ::-1]
+            if orientation == 1:
+                new_board = new_board[:, :, ::-1]
+        return new_board
+
 if __name__ == "__main__":
     model = ResNet(board_size=9, action_num=82, history_length=8)
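The new `_preprocession` helper samples one of the eight dihedral symmetries of the board. A minimal sketch of the same transform on an `[batch, board_size, board_size, channels]` tensor, assuming only numpy; `augment` is a hypothetical name, not part of the model:

```python
import numpy as np

def augment(batch, rotate_times=0, reflect=False, reflect_axis=1):
    # Rotate in the (height, width) plane only; batch and channel axes untouched.
    out = np.rot90(batch, rotate_times, axes=(1, 2))
    if reflect:
        out = np.flip(out, axis=reflect_axis)  # axis 1 or 2: the two reflections
    return out

boards = np.random.rand(16, 9, 9, 17)  # [batch, size, size, channels]
assert augment(boards, rotate_times=3, reflect=True).shape == boards.shape
```

The winner label is invariant under these symmetries; a flat policy vector, by contrast, would need to be reshaped onto the board (with any pass entry held aside) before the same transform applies.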
diff --git a/AlphaGo/reversi.py b/AlphaGo/reversi.py
index c086a2c..ead6f4e 100644
--- a/AlphaGo/reversi.py
+++ b/AlphaGo/reversi.py
@@ -25,6 +25,7 @@ def find_correct_moves(own, enemy):
     mobility |= search_offset_right(own, enemy, mask, 7)  # Left bottom
     return mobility
 
+
 def calc_flip(pos, own, enemy):
     """return flip stones of enemy by bitboard when I place stone at pos.
 
@@ -123,8 +124,9 @@ class Reversi:
         self.board = None  # 8 * 8 board with 1 for black, -1 for white and 0 for blank
         self.color = None  # 1 for black and -1 for white
         self.action = None  # number in 0~63
-        # self.winner = None
+        self.winner = None
         self.black_win = None
+        self.size = 8
 
     def get_board(self, black=None, white=None):
         self.black = black or (0b00001000 << 24 | 0b00010000 << 32)
@@ -132,22 +134,29 @@ class Reversi:
         self.board = self.bitboard2board()
         return self.board
 
+    def is_valid(self, is_next=False):
+        self.board2bitboard()
+        own, enemy = self.get_own_and_enemy(is_next)
+        mobility = find_correct_moves(own, enemy)
+        valid_moves = bit_to_array(mobility, 64)
+        valid_moves = np.argwhere(valid_moves)
+        valid_moves = list(np.reshape(valid_moves, len(valid_moves)))
+        return valid_moves
+
     def simulate_get_mask(self, state, action_set):
         history_boards, color = state
         board = history_boards[-1]
         self.board = board
         self.color = color
-        self.board2bitboard()
-        own, enemy = self.get_own_and_enemy()
-        mobility = find_correct_moves(own, enemy)
-        valid_moves = bit_to_array(mobility, 64)
-        valid_moves = np.argwhere(valid_moves)
-        valid_moves = list(np.reshape(valid_moves, len(valid_moves)))
+        valid_moves = self.is_valid()
         # TODO it seems that the pass move is not considered
-        invalid_action_mask = []
-        for action in action_set:
-            if action not in valid_moves:
-                invalid_action_mask.append(action)
+        if not len(valid_moves):
+            invalid_action_mask = action_set[0:-1]
+        else:
+            invalid_action_mask = []
+            for action in action_set:
+                if action not in valid_moves:
+                    invalid_action_mask.append(action)
         return invalid_action_mask
 
     def simulate_step_forward(self, state, action):
@@ -155,21 +164,34 @@ class Reversi:
         self.color = state[1]
         self.board2bitboard()
         self.action = action
-        step_forward = self.step()
-        if step_forward:
-            new_board = self.bitboard2board()
-            return [new_board, 0 - self.color], 0
+        if self.action == 64:
+            valid_moves = self.is_valid(is_next=True)
+            if not len(valid_moves):
+                self._game_over()
+                return None, self.winner * self.color
+            else:
+                return [self.board, 0 - self.color], 0
+        self.step()
+        new_board = self.bitboard2board()
+        return [new_board, 0 - self.color], 0
 
     def executor_do_move(self, board, color, vertex):
         self.board = board
         self.color = color
         self.board2bitboard()
-        self.vertex2action(vertex)
-        step_forward = self.step()
-        if step_forward:
+        self.action = self._flatten(vertex)
+        if self.action == 64:
+            valid_moves = self.is_valid(is_next=True)
+            if not len(valid_moves):
+                return False
+            else:
+                return True
+        else:
+            self.step()
             new_board = self.bitboard2board()
-        for i in range(64):
-            board[i] = new_board[i]
+            for i in range(64):
+                board[i] = new_board[i]
+            return True
 
     def executor_get_score(self, board):
         self.board = board
@@ -191,13 +213,14 @@ class Reversi:
             elif self.board[i] == -1:
                 self.white |= count
             count *= 2
-
+    '''
     def vertex2action(self, vertex):
         x, y = vertex
         if x == 0 and y == 0:
            self.action = None
         else:
             self.action = 8 * (x - 1) + y - 1
+    '''
 
     def bitboard2board(self):
         board = []
@@ -214,46 +237,45 @@ class Reversi:
     def step(self):
-        if self.action < 0 or self.action > 63:
-            raise ValueError("Wrong action!")
-        if self.action is None:
-            return False
+        if self.action is None:
+            raise ValueError("Action is None!")
+        if self.action < 0 or self.action > 63:
+            raise ValueError("Action not in the range of [0, 63]!")
         own, enemy = self.get_own_and_enemy()
         flipped = calc_flip(self.action, own, enemy)
         if bit_count(flipped) == 0:
-            self.illegal_move_to_lose(self.action)
-            return False
+            # self.illegal_move_to_lose(self.action)
+            raise ValueError("Illegal action!")
         own ^= flipped
         own |= 1 << self.action
         enemy ^= flipped
-
         self.set_own_and_enemy(own, enemy)
-        return True
 
     def _game_over(self):
         # self.done = True
-        '''
+        if self.winner is None:
             black_num, white_num = self.number_of_black_and_white
-            if black_num > white_num:
+            self.black_win = black_num - white_num
+            if self.black_win > 0:
                 self.winner = 1
-            elif black_num < white_num:
+            elif self.black_win < 0:
                 self.winner = -1
             else:
                 self.winner = 0
-        '''
-        if self.black_win is None:
-            black_num, white_num = self.number_of_black_and_white
-            self.black_win = black_num - white_num
 
     def illegal_move_to_lose(self, action):
         self._game_over()
 
-    def get_own_and_enemy(self):
-        if self.color == 1:
+    def get_own_and_enemy(self, is_next=False):
+        if is_next:
+            color = 0 - self.color
+        else:
+            color = self.color
+        if color == 1:
             own, enemy = self.black, self.white
-        elif self.color == -1:
+        elif color == -1:
             own, enemy = self.white, self.black
         else:
             own, enemy = None, None
@@ -265,6 +287,17 @@ class Reversi:
         else:
             self.white, self.black = own, enemy
 
+    def _deflatten(self, idx):
+        x = idx // self.size + 1
+        y = idx % self.size + 1
+        return (x, y)
+
+    def _flatten(self, vertex):
+        x, y = vertex
+        if (x == 0) and (y == 0):
+            return 64
+        return (x - 1) * self.size + (y - 1)
+
     @property
     def number_of_black_and_white(self):
         return bit_count(self.black), bit_count(self.white)
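The `_flatten`/`_deflatten` pair above fixes the coordinate convention: vertices are 1-based `(x, y)` pairs, `(0, 0)` means pass, and pass flattens to action 64. A standalone round-trip check of the same arithmetic, with `SIZE` standing in for `self.size`:

```python
SIZE = 8
PASS_ACTION = 64  # one past the last board index

def flatten(vertex):
    x, y = vertex
    if (x, y) == (0, 0):        # (0, 0) is the pass vertex
        return PASS_ACTION
    return (x - 1) * SIZE + (y - 1)

def deflatten(idx):
    # inverse of flatten for on-board actions 0..63
    return idx // SIZE + 1, idx % SIZE + 1

assert flatten((1, 1)) == 0 and flatten((8, 8)) == 63
assert all(flatten(deflatten(i)) == i for i in range(64))
```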
diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py
index e8f3709..e99373c 100644
--- a/tianshou/core/mcts/mcts.py
+++ b/tianshou/core/mcts/mcts.py
@@ -38,6 +38,7 @@ class MCTSNode(object):
     def valid_mask(self, simulator):
         pass
 
+
 class UCTNode(MCTSNode):
     def __init__(self, parent, action, state, action_num, prior, inverse=False):
         super(UCTNode, self).__init__(parent, action, state, action_num, prior, inverse)
@@ -71,10 +72,11 @@ class UCTNode(MCTSNode):
         self.parent.backpropagation(self.children[action].reward)
 
     def valid_mask(self, simulator):
-        # let all invalid actions be illeagel in mcts
-        if self.mask is None:
-            self.mask = simulator.simulate_get_mask(self.state, range(self.action_num))
-        self.ucb[self.mask] = -float("Inf")
+        # let all invalid actions be illegal in mcts
+        if hasattr(simulator, 'simulate_get_mask'):
+            if self.mask is None:
+                self.mask = simulator.simulate_get_mask(self.state, range(self.action_num))
+            self.ucb[self.mask] = -float("Inf")
 
 
 class TSNode(MCTSNode):
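How the guarded mask behaves in isolation, as a sketch: simulators without `simulate_get_mask` are left alone, the mask is computed once per node, and the `-inf` UCB entries are re-pinned on each call. `MaskedNode` and `DummySim` are simplified stand-ins for `UCTNode` and a real simulator, not the repo's API:

```python
import numpy as np

class MaskedNode(object):
    """Simplified stand-in for UCTNode, showing only the masking path."""
    def __init__(self, state, action_num):
        self.state = state
        self.action_num = action_num
        self.mask = None                 # cached list of invalid actions
        self.ucb = np.zeros(action_num)

    def valid_mask(self, simulator):
        if not hasattr(simulator, 'simulate_get_mask'):
            return                       # simulator imposes no action mask
        if self.mask is None:            # compute the mask once per node
            self.mask = simulator.simulate_get_mask(self.state, range(self.action_num))
        self.ucb[self.mask] = -float('inf')  # re-pin after every UCB refresh

class DummySim(object):
    def simulate_get_mask(self, state, action_set):
        return [a for a in action_set if a % 2]  # say odd actions are invalid

node = MaskedNode(state=None, action_num=4)
node.valid_mask(DummySim())
assert node.ucb[1] == -float('inf') and node.ucb[0] == 0
```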