From 83f9e19fa5fdd427ede9d37c0cbf431fac3b6b27 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Tue, 19 Dec 2017 16:51:50 +0800 Subject: [PATCH 01/18] merge flatten and deflatten, rename variable for clarity --- AlphaGo/engine.py | 4 +-- AlphaGo/game.py | 15 ++++++----- AlphaGo/strategy.py | 45 +++++++++++++-------------------- tianshou/core/mcts/evaluator.py | 4 +-- tianshou/core/mcts/mcts.py | 2 +- 5 files changed, 31 insertions(+), 39 deletions(-) diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index 1f9af85..1ee8833 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -167,7 +167,7 @@ class GTPEngine(): move = self._parse_move(args) if move: color, vertex = move - res = self._game.do_move(color, vertex) + res = self._game.play_move(color, vertex) if res: return None, True else: @@ -177,7 +177,7 @@ class GTPEngine(): def cmd_genmove(self, args, **kwargs): color = self._parse_color(args) if color: - move = self._game.gen_move(color) + move = self._game.think_play_move(color) return self._vertex_point2string(move), True else: return 'unknown player', False diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 2a82d8e..d0cb91c 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -77,7 +77,7 @@ class Game: state[0, :, :, 16] = np.zeros([self.size, self.size]) return state - def strategy_gen_move(self, latest_boards, color): + def think(self, latest_boards, color): self.simulator.simulate_latest_boards = copy.copy(latest_boards) self.simulator.simulate_board = copy.copy(latest_boards[-1]) nn_input = self.generate_nn_input(self.simulator.simulate_latest_boards, color) @@ -91,17 +91,18 @@ class Game: move = self._deflatten(choice) return move, prob - def do_move(self, color, vertex): + def play_move(self, color, vertex): + # this function can be called directly to play the opponent's move if vertex == utils.PASS: return True res = self.executor.do_move(color, vertex) return res - def gen_move(self, color): - # move = self.strategy.gen_move(color) - # return move - 
move, self.prob = self.strategy_gen_move(self.latest_boards, color) - self.do_move(color, move) + def think_play_move(self, color): + # although we dont need to return self.prob, however it is needed for neural network training + move, self.prob = self.think(self.latest_boards, color) + # play the move immediately + self.play_move(color, move) return move def status2symbol(self, s): diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index 112f130..af017b1 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -10,7 +10,7 @@ import tensorflow as tf from collections import deque from tianshou.core.mcts.mcts import MCTS -DELTA = [[1, 0], [-1, 0], [0, -1], [0, 1]] +NEIGHBOR_OFFSET = [[1, 0], [-1, 0], [0, -1], [0, 1]] CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] class GoEnv: @@ -19,17 +19,8 @@ class GoEnv: self.simulate_board = [utils.EMPTY] * (self.game.size ** 2) self.simulate_latest_boards = deque(maxlen=8) - def simulate_flatten(self, vertex): - x, y = vertex - return (x - 1) * self.game.size + (y - 1) - - def simulate_deflatten(self, idx): - x = idx // self.game.size + 1 - y = idx % self.game.size + 1 - return (x, y) - def _find_group(self, start): - color = self.simulate_board[self.simulate_flatten(start)] + color = self.simulate_board[self.game._flatten(start)] # print ("color : ", color) chain = set() frontier = [start] @@ -40,32 +31,32 @@ class GoEnv: chain.add(current) for n in self._neighbor(current): # print n, self._flatten(n), self.board[self._flatten(n)], - if self.simulate_board[self.simulate_flatten(n)] == color and not n in chain: + if self.simulate_board[self.game._flatten(n)] == color and not n in chain: frontier.append(n) - if self.simulate_board[self.simulate_flatten(n)] == utils.EMPTY: + if self.simulate_board[self.game._flatten(n)] == utils.EMPTY: has_liberty = True return has_liberty, chain def _is_suicide(self, color, vertex): - self.simulate_board[self.simulate_flatten(vertex)] = color # assume that we already take this move 
+ self.simulate_board[self.game._flatten(vertex)] = color # assume that we already take this move suicide = False has_liberty, group = self._find_group(vertex) if not has_liberty: suicide = True # no liberty, suicide for n in self._neighbor(vertex): - if self.simulate_board[self.simulate_flatten(n)] == utils.another_color(color): + if self.simulate_board[self.game._flatten(n)] == utils.another_color(color): opponent_liberty, group = self._find_group(n) if not opponent_liberty: suicide = False # this move is able to take opponent's stone, not suicide - self.simulate_board[self.simulate_flatten(vertex)] = utils.EMPTY # undo this move + self.simulate_board[self.game._flatten(vertex)] = utils.EMPTY # undo this move return suicide def _check_global_isomorphous(self, color, vertex): ##backup _board = copy.copy(self.simulate_board) - self.simulate_board[self.simulate_flatten(vertex)] = color + self.simulate_board[self.game._flatten(vertex)] = color self._process_board(color, vertex) if self.simulate_board in self.game.history: res = True @@ -84,7 +75,7 @@ class GoEnv: def _neighbor(self, vertex): x, y = vertex nei = [] - for d in DELTA: + for d in NEIGHBOR_OFFSET: _x = x + d[0] _y = y + d[1] if self._in_board((_x, _y)): @@ -104,16 +95,16 @@ class GoEnv: def _process_board(self, color, vertex): nei = self._neighbor(vertex) for n in nei: - if self.simulate_board[self.simulate_flatten(n)] == utils.another_color(color): + if self.simulate_board[self.game._flatten(n)] == utils.another_color(color): has_liberty, group = self._find_group(n) if not has_liberty: for b in group: - self.simulate_board[self.simulate_flatten(b)] = utils.EMPTY + self.simulate_board[self.game._flatten(b)] = utils.EMPTY def _is_eye(self, color, vertex): nei = self._neighbor(vertex) cor = self._corner(vertex) - ncolor = {color == self.simulate_board[self.simulate_flatten(n)] for n in nei} + ncolor = {color == self.simulate_board[self.game._flatten(n)] for n in nei} if False in ncolor: # print "not all 
neighbors are in same color with us" return False @@ -122,7 +113,7 @@ class GoEnv: # print "all neighbors are in same group and same color with us" return True else: - opponent_number = [self.simulate_board[self.simulate_flatten(c)] for c in cor].count(-color) + opponent_number = [self.simulate_board[self.game._flatten(c)] for c in cor].count(-color) opponent_propotion = float(opponent_number) / float(len(cor)) if opponent_propotion < 0.5: # print "few opponents, real eye" @@ -145,7 +136,7 @@ class GoEnv: if action == self.game.size ** 2: vertex = (0, 0) else: - vertex = self.simulate_deflatten(action) + vertex = self.game._deflatten(action) if state[0, 0, 0, -1] == utils.BLACK: color = utils.BLACK else: @@ -160,7 +151,7 @@ class GoEnv: return False ### already have stone - if not self.simulate_board[self.simulate_flatten(vertex)] == utils.EMPTY: + if not self.simulate_board[self.game._flatten(vertex)] == utils.EMPTY: # print(np.array(self.board).reshape(9, 9)) # print(vertex) return False @@ -182,14 +173,14 @@ class GoEnv: if vertex == utils.PASS: return True - id_ = self.simulate_flatten(vertex) + id_ = self.game._flatten(vertex) if self.simulate_board[id_] == utils.EMPTY: self.simulate_board[id_] = color return True else: return False - def step_forward(self, state, action): + def simulate_step_forward(self, state, action): if state[0, 0, 0, -1] == 1: color = utils.BLACK else: @@ -197,7 +188,7 @@ class GoEnv: if action == self.game.size ** 2: vertex = utils.PASS else: - vertex = self.simulate_deflatten(action) + vertex = self.game._deflatten(action) # print(vertex) # print(self.board) self.simulate_board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() diff --git a/tianshou/core/mcts/evaluator.py b/tianshou/core/mcts/evaluator.py index 9c4ee8e..a1f9456 100644 --- a/tianshou/core/mcts/evaluator.py +++ b/tianshou/core/mcts/evaluator.py @@ -19,10 +19,10 @@ class rollout_policy(evaluator): # TODO: prior for rollout policy total_reward = 0. 
action = np.random.randint(0, self.action_num) - state, reward = self.env.step_forward(state, action) + state, reward = self.env.simulate_step_forward(state, action) total_reward += reward while state is not None: action = np.random.randint(0, self.action_num) - state, reward = self.env.step_forward(state, action) + state, reward = self.env.simulate_step_forward(state, action) total_reward += reward return np.ones([self.action_num])/self.action_num, total_reward diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 979e994..b58c105 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -116,7 +116,7 @@ class ActionNode(object): self.next_state = tuple2list(self.next_state) def selection(self, simulator): - self.next_state, self.reward = simulator.step_forward(self.parent.state, self.action) + self.next_state, self.reward = simulator.simulate_step_forward(self.parent.state, self.action) self.origin_state = self.next_state self.state_type = type(self.next_state) self.type_conversion_to_tuple() From f8a70183b6c391491b1ae0f5e0f9ef1563e1285c Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Tue, 19 Dec 2017 22:57:38 +0800 Subject: [PATCH 02/18] fix the copy bug in check_global_isomorphous; refactor code to eliminate side effect --- AlphaGo/go.py | 36 ++++++------- AlphaGo/strategy.py | 104 +++++++++++++++++-------------------- tianshou/core/mcts/mcts.py | 3 +- 3 files changed, 67 insertions(+), 76 deletions(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 7b1d3e7..8e3518d 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -72,18 +72,14 @@ class Go: self.game.board[self.game._flatten(vertex)] = utils.EMPTY return True - def _check_global_isomorphous(self, color, vertex): - ##backup - _board = copy.copy(self.game.board) - self.game.board[self.game._flatten(vertex)] = color - self._process_board(color, vertex) - if self.game.board in self.game.history: - res = True - else: - res = False - - self.game.board = _board - return res + def 
_check_global_isomorphous(self, history_boards, current_board, color, vertex): + repeat = False + next_board = copy.copy(current_board) + next_board[self.game._flatten(vertex)] = color + self._process_board(next_board, color, vertex) + if next_board in history_boards: + repeat = True + return repeat def _in_board(self, vertex): x, y = vertex @@ -101,38 +97,38 @@ class Go: nei.append((_x, _y)) return nei - def _process_board(self, color, vertex): + def _process_board(self, current_board, color, vertex): nei = self._neighbor(vertex) for n in nei: - if self.game.board[self.game._flatten(n)] == utils.another_color(color): + if current_board[self.game._flatten(n)] == utils.another_color(color): can_kill, block = self._find_block(n) if can_kill: for b in block: - self.game.board[self.game._flatten(b)] = utils.EMPTY + current_board[self.game._flatten(b)] = utils.EMPTY - def is_valid(self, color, vertex): + def is_valid(self, history_boards, current_board, color, vertex): ### in board if not self._in_board(vertex): return False ### already have stone - if not self.game.board[self.game._flatten(vertex)] == utils.EMPTY: + if not current_board[self.game._flatten(vertex)] == utils.EMPTY: return False ### check if it is qi if not self._is_qi(color, vertex): return False - if self._check_global_isomorphous(color, vertex): + if self._check_global_isomorphous(history_boards, current_board, color, vertex): return False return True def do_move(self, color, vertex): - if not self.is_valid(color, vertex): + if not self.is_valid(self.game.history, self.game.board, color, vertex): return False self.game.board[self.game._flatten(vertex)] = color - self._process_board(color, vertex) + self._process_board(self.game.board, color, vertex) self.game.history.append(copy.copy(self.game.board)) self.game.latest_boards.append(copy.copy(self.game.board)) return True diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index af017b1..07555e9 100644 --- a/AlphaGo/strategy.py +++ 
b/AlphaGo/strategy.py @@ -19,52 +19,47 @@ class GoEnv: self.simulate_board = [utils.EMPTY] * (self.game.size ** 2) self.simulate_latest_boards = deque(maxlen=8) - def _find_group(self, start): - color = self.simulate_board[self.game._flatten(start)] + def _find_group(self, current_board, vertex): + color = current_board[self.game._flatten(vertex)] # print ("color : ", color) chain = set() - frontier = [start] + frontier = [vertex] has_liberty = False while frontier: current = frontier.pop() # print ("current : ", current) chain.add(current) for n in self._neighbor(current): - # print n, self._flatten(n), self.board[self._flatten(n)], - if self.simulate_board[self.game._flatten(n)] == color and not n in chain: + if current_board[self.game._flatten(n)] == color and not n in chain: frontier.append(n) - if self.simulate_board[self.game._flatten(n)] == utils.EMPTY: + if current_board[self.game._flatten(n)] == utils.EMPTY: has_liberty = True return has_liberty, chain - def _is_suicide(self, color, vertex): - self.simulate_board[self.game._flatten(vertex)] = color # assume that we already take this move + def _is_suicide(self, current_board, color, vertex): + current_board[self.game._flatten(vertex)] = color # assume that we already take this move suicide = False - has_liberty, group = self._find_group(vertex) + has_liberty, group = self._find_group(current_board, vertex) if not has_liberty: suicide = True # no liberty, suicide for n in self._neighbor(vertex): - if self.simulate_board[self.game._flatten(n)] == utils.another_color(color): - opponent_liberty, group = self._find_group(n) + if current_board[self.game._flatten(n)] == utils.another_color(color): + opponent_liberty, group = self._find_group(current_board, n) if not opponent_liberty: suicide = False # this move is able to take opponent's stone, not suicide - self.simulate_board[self.game._flatten(vertex)] = utils.EMPTY # undo this move + current_board[self.game._flatten(vertex)] = utils.EMPTY # undo this move 
return suicide - def _check_global_isomorphous(self, color, vertex): - ##backup - _board = copy.copy(self.simulate_board) - self.simulate_board[self.game._flatten(vertex)] = color - self._process_board(color, vertex) - if self.simulate_board in self.game.history: - res = True - else: - res = False - - self.simulate_board = _board - return res + def _check_global_isomorphous(self, history_boards, current_board, color, vertex): + repeat = False + next_board = copy.copy(current_board) + next_board[self.game._flatten(vertex)] = color + self._process_board(next_board, color, vertex) + if next_board in history_boards: + repeat = True + return repeat def _in_board(self, vertex): x, y = vertex @@ -92,28 +87,28 @@ class GoEnv: corner.append((_x, _y)) return corner - def _process_board(self, color, vertex): + def _process_board(self, current_board, color, vertex): nei = self._neighbor(vertex) for n in nei: - if self.simulate_board[self.game._flatten(n)] == utils.another_color(color): - has_liberty, group = self._find_group(n) + if current_board[self.game._flatten(n)] == utils.another_color(color): + has_liberty, group = self._find_group(current_board, n) if not has_liberty: for b in group: - self.simulate_board[self.game._flatten(b)] = utils.EMPTY + current_board[self.game._flatten(b)] = utils.EMPTY - def _is_eye(self, color, vertex): + def _is_eye(self, current_board, color, vertex): nei = self._neighbor(vertex) cor = self._corner(vertex) - ncolor = {color == self.simulate_board[self.game._flatten(n)] for n in nei} + ncolor = {color == current_board[self.game._flatten(n)] for n in nei} if False in ncolor: # print "not all neighbors are in same color with us" return False - _, group = self._find_group(nei[0]) + _, group = self._find_group(current_board, nei[0]) if set(nei) < group: # print "all neighbors are in same group and same color with us" return True else: - opponent_number = [self.simulate_board[self.game._flatten(c)] for c in cor].count(-color) + opponent_number = 
[current_board[self.game._flatten(c)] for c in cor].count(-color) opponent_propotion = float(opponent_number) / float(len(cor)) if opponent_propotion < 0.5: # print "few opponents, real eye" @@ -122,49 +117,54 @@ class GoEnv: # print "many opponents, fake eye" return False - def knowledge_prunning(self, color, vertex): + def knowledge_prunning(self, current_board, color, vertex): ### check if it is an eye of yourself ### assumptions : notice that this judgement requires that the state is an endgame - if self._is_eye(color, vertex): + if self._is_eye(current_board, color, vertex): return False return True - def simulate_is_valid(self, state, action): - # State is the play board, the shape is [1, self.game.size, self.game.size, 17]. - # Action is an index + def sa2cv(self, state, action): + # State is the play board, the shape is [1, self.game.size, self.game.size, 17], action is an index. # We need to transfer the (state, action) pair into (color, vertex) pair to simulate the move - if action == self.game.size ** 2: - vertex = (0, 0) - else: - vertex = self.game._deflatten(action) if state[0, 0, 0, -1] == utils.BLACK: color = utils.BLACK else: color = utils.WHITE + if action == self.game.size ** 2: + vertex = (0, 0) + else: + vertex = self.game._deflatten(action) + return color, vertex + + def simulate_is_valid(self, history_boards, current_board, state, action): + # initialize simulate_latest_boards and simulate_board from state self.simulate_latest_boards.clear() for i in range(8): self.simulate_latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) self.simulate_board = copy.copy(self.simulate_latest_boards[-1]) + color, vertex = self.sa2cv(state, action) + ### in board if not self._in_board(vertex): return False ### already have stone - if not self.simulate_board[self.game._flatten(vertex)] == utils.EMPTY: + if not current_board[self.game._flatten(vertex)] == utils.EMPTY: # print(np.array(self.board).reshape(9, 9)) # print(vertex) 
return False ### check if it is suicide - if self._is_suicide(color, vertex): + if self._is_suicide(current_board, color, vertex): return False ### forbid global isomorphous - if self._check_global_isomorphous(color, vertex): + if self._check_global_isomorphous(history_boards, current_board, color, vertex): return False - if not self.knowledge_prunning(color, vertex): + if not self.knowledge_prunning(current_board, color, vertex): return False return True @@ -181,17 +181,11 @@ class GoEnv: return False def simulate_step_forward(self, state, action): - if state[0, 0, 0, -1] == 1: - color = utils.BLACK - else: - color = utils.WHITE - if action == self.game.size ** 2: - vertex = utils.PASS - else: - vertex = self.game._deflatten(action) - # print(vertex) - # print(self.board) + # initialize the simulate_board from state self.simulate_board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() + + color, vertex = self.sa2cv(state, action) + self.simulate_do_move(color, vertex) new_state = np.concatenate( [state[:, :, :, 1:8], (np.array(self.simulate_board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index b58c105..12fc85d 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -75,7 +75,8 @@ class UCTNode(MCTSNode): start_time = time.time() self.mask = [] for act in range(self.action_num - 1): - if not simulator.simulate_is_valid(self.state, act): + if not simulator.simulate_is_valid( + simulator.simulate_latest_boards, simulator.simulate_board, self.state, act): self.mask.append(act) self.ucb[act] = -float("Inf") else: From afc5dbac5a2c4bf6a9a22769dfb4d20df8f87be4 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Wed, 20 Dec 2017 00:16:24 +0800 Subject: [PATCH 03/18] rearrange the sequence of functions of Go and GoEnv before merging --- AlphaGo/go.py | 125 ++++++++++++++++++++------------------------ AlphaGo/strategy.py | 70 ++++++++++++------------- 2 
files changed, 91 insertions(+), 104 deletions(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 8e3518d..37d8339 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -17,70 +17,6 @@ class Go: def __init__(self, **kwargs): self.game = kwargs['game'] - def _bfs(self, vertex, color, block, status): - block.append(vertex) - status[self.game._flatten(vertex)] = True - nei = self._neighbor(vertex) - for n in nei: - if not status[self.game._flatten(n)]: - if self.game.board[self.game._flatten(n)] == color: - self._bfs(n, color, block, status) - - def _find_block(self, vertex): - block = [] - status = [False] * (self.game.size ** 2) - color = self.game.board[self.game._flatten(vertex)] - self._bfs(vertex, color, block, status) - - for b in block: - for n in self._neighbor(b): - if self.game.board[self.game._flatten(n)] == utils.EMPTY: - return False, block - return True, block - - def _find_boarder(self, vertex): - block = [] - status = [False] * (self.game.size ** 2) - self._bfs(vertex, utils.EMPTY, block, status) - border = [] - for b in block: - for n in self._neighbor(b): - if not (n in block): - border.append(n) - return border - - def _is_qi(self, color, vertex): - nei = self._neighbor(vertex) - for n in nei: - if self.game.board[self.game._flatten(n)] == utils.EMPTY: - return True - - self.game.board[self.game._flatten(vertex)] = color - for n in nei: - if self.game.board[self.game._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n) - if can_kill: - self.game.board[self.game._flatten(vertex)] = utils.EMPTY - return True - - ### can not suicide - can_kill, block = self._find_block(vertex) - if can_kill: - self.game.board[self.game._flatten(vertex)] = utils.EMPTY - return False - - self.game.board[self.game._flatten(vertex)] = utils.EMPTY - return True - - def _check_global_isomorphous(self, history_boards, current_board, color, vertex): - repeat = False - next_board = copy.copy(current_board) - next_board[self.game._flatten(vertex)] = 
color - self._process_board(next_board, color, vertex) - if next_board in history_boards: - repeat = True - return repeat - def _in_board(self, vertex): x, y = vertex if x < 1 or x > self.game.size: return False @@ -97,15 +33,57 @@ class Go: nei.append((_x, _y)) return nei + def _find_group(self, current_board, vertex): + color = current_board[self.game._flatten(vertex)] + # print ("color : ", color) + chain = set() + frontier = [vertex] + has_liberty = False + while frontier: + current = frontier.pop() + # print ("current : ", current) + chain.add(current) + for n in self._neighbor(current): + if current_board[self.game._flatten(n)] == color and not n in chain: + frontier.append(n) + if current_board[self.game._flatten(n)] == utils.EMPTY: + has_liberty = True + return has_liberty, chain + + def _is_suicide(self, current_board, color, vertex): + current_board[self.game._flatten(vertex)] = color # assume that we already take this move + suicide = False + + has_liberty, group = self._find_group(current_board, vertex) + if not has_liberty: + suicide = True # no liberty, suicide + for n in self._neighbor(vertex): + if current_board[self.game._flatten(n)] == utils.another_color(color): + opponent_liberty, group = self._find_group(current_board, n) + if not opponent_liberty: + suicide = False # this move is able to take opponent's stone, not suicide + + current_board[self.game._flatten(vertex)] = utils.EMPTY # undo this move + return suicide + def _process_board(self, current_board, color, vertex): nei = self._neighbor(vertex) for n in nei: if current_board[self.game._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n) - if can_kill: - for b in block: + has_liberty, group = self._find_group(current_board, n) + if not has_liberty: + for b in group: current_board[self.game._flatten(b)] = utils.EMPTY + def _check_global_isomorphous(self, history_boards, current_board, color, vertex): + repeat = False + next_board = copy.copy(current_board) + 
next_board[self.game._flatten(vertex)] = color + self._process_board(next_board, color, vertex) + if next_board in history_boards: + repeat = True + return repeat + def is_valid(self, history_boards, current_board, color, vertex): ### in board if not self._in_board(vertex): @@ -115,8 +93,8 @@ class Go: if not current_board[self.game._flatten(vertex)] == utils.EMPTY: return False - ### check if it is qi - if not self._is_qi(color, vertex): + ### check if it is suicide + if self._is_suicide(current_board, color, vertex): return False if self._check_global_isomorphous(history_boards, current_board, color, vertex): @@ -137,6 +115,15 @@ class Go: idx = [i for i,x in enumerate(self.game.board) if x == utils.EMPTY ][0] return self.game._deflatten(idx) + def _find_boarder(self, vertex): + _, group = self._find_group(self.game.board, vertex) + border = [] + for b in group: + for n in self._neighbor(b): + if not (n in group): + border.append(n) + return border + def _add_nearby_stones(self, neighbor_vertex_set, start_vertex_x, start_vertex_y, x_diff, y_diff, num_step): ''' add the nearby stones around the input vertex diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index 07555e9..9ebd421 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -19,6 +19,32 @@ class GoEnv: self.simulate_board = [utils.EMPTY] * (self.game.size ** 2) self.simulate_latest_boards = deque(maxlen=8) + def _in_board(self, vertex): + x, y = vertex + if x < 1 or x > self.game.size: return False + if y < 1 or y > self.game.size: return False + return True + + def _neighbor(self, vertex): + x, y = vertex + nei = [] + for d in NEIGHBOR_OFFSET: + _x = x + d[0] + _y = y + d[1] + if self._in_board((_x, _y)): + nei.append((_x, _y)) + return nei + + def _corner(self, vertex): + x, y = vertex + corner = [] + for d in CORNER_OFFSET: + _x = x + d[0] + _y = y + d[1] + if self._in_board((_x, _y)): + corner.append((_x, _y)) + return corner + def _find_group(self, current_board, vertex): color = 
current_board[self.game._flatten(vertex)] # print ("color : ", color) @@ -52,41 +78,6 @@ class GoEnv: current_board[self.game._flatten(vertex)] = utils.EMPTY # undo this move return suicide - def _check_global_isomorphous(self, history_boards, current_board, color, vertex): - repeat = False - next_board = copy.copy(current_board) - next_board[self.game._flatten(vertex)] = color - self._process_board(next_board, color, vertex) - if next_board in history_boards: - repeat = True - return repeat - - def _in_board(self, vertex): - x, y = vertex - if x < 1 or x > self.game.size: return False - if y < 1 or y > self.game.size: return False - return True - - def _neighbor(self, vertex): - x, y = vertex - nei = [] - for d in NEIGHBOR_OFFSET: - _x = x + d[0] - _y = y + d[1] - if self._in_board((_x, _y)): - nei.append((_x, _y)) - return nei - - def _corner(self, vertex): - x, y = vertex - corner = [] - for d in CORNER_OFFSET: - _x = x + d[0] - _y = y + d[1] - if self._in_board((_x, _y)): - corner.append((_x, _y)) - return corner - def _process_board(self, current_board, color, vertex): nei = self._neighbor(vertex) for n in nei: @@ -96,6 +87,15 @@ class GoEnv: for b in group: current_board[self.game._flatten(b)] = utils.EMPTY + def _check_global_isomorphous(self, history_boards, current_board, color, vertex): + repeat = False + next_board = copy.copy(current_board) + next_board[self.game._flatten(vertex)] = color + self._process_board(next_board, color, vertex) + if next_board in history_boards: + repeat = True + return repeat + def _is_eye(self, current_board, color, vertex): nei = self._neighbor(vertex) cor = self._corner(vertex) From 0456e0c15ee894bb4fe6ba043f87859da643bfae Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Wed, 20 Dec 2017 00:43:31 +0800 Subject: [PATCH 04/18] final version before merge Go and GoEnv --- AlphaGo/engine.py | 2 +- AlphaGo/game.py | 3 ++- AlphaGo/go.py | 8 ++++---- AlphaGo/self-play.py | 2 +- AlphaGo/strategy.py | 38 
+++++++++++++++++++++----------------- 5 files changed, 29 insertions(+), 24 deletions(-) diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index 1ee8833..d11635a 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -183,7 +183,7 @@ class GTPEngine(): return 'unknown player', False def cmd_get_score(self, args, **kwargs): - return self._game.executor.get_score(), None + return self._game.executor.executor_get_score(), None def cmd_show_board(self, args, **kwargs): return self._game.board, True diff --git a/AlphaGo/game.py b/AlphaGo/game.py index d0cb91c..af4ef57 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -78,6 +78,7 @@ class Game: return state def think(self, latest_boards, color): + # TODO : using copy is right, or should we change to deepcopy? self.simulator.simulate_latest_boards = copy.copy(latest_boards) self.simulator.simulate_board = copy.copy(latest_boards[-1]) nn_input = self.generate_nn_input(self.simulator.simulate_latest_boards, color) @@ -95,7 +96,7 @@ class Game: # this function can be called directly to play the opponent's move if vertex == utils.PASS: return True - res = self.executor.do_move(color, vertex) + res = self.executor.executor_do_move(color, vertex) return res def think_play_move(self, color): diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 37d8339..108c9bd 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -84,7 +84,7 @@ class Go: repeat = True return repeat - def is_valid(self, history_boards, current_board, color, vertex): + def _is_valid(self, history_boards, current_board, color, vertex): ### in board if not self._in_board(vertex): return False @@ -102,8 +102,8 @@ class Go: return True - def do_move(self, color, vertex): - if not self.is_valid(self.game.history, self.game.board, color, vertex): + def executor_do_move(self, color, vertex): + if not self._is_valid(self.game.history, self.game.board, color, vertex): return False self.game.board[self.game._flatten(vertex)] = color self._process_board(self.game.board, 
color, vertex) @@ -164,7 +164,7 @@ class Go: elif color_estimate < 0: return utils.WHITE - def get_score(self, is_unknown_estimation = False): + def executor_get_score(self, is_unknown_estimation = False): ''' is_unknown_estimation: whether use nearby stone to predict the unknown return score from BLACK perspective. diff --git a/AlphaGo/self-play.py b/AlphaGo/self-play.py index 98ccf84..296112b 100644 --- a/AlphaGo/self-play.py +++ b/AlphaGo/self-play.py @@ -79,7 +79,7 @@ while True: prob.append(np.array(game.prob).reshape(-1, game.size ** 2 + 1)) print("Finished") print("\n") - score = game.executor.get_score(True) + score = game.executor.executor_get_score(True) if score > 0: winner = utils.BLACK else: diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index 9ebd421..1e5fd02 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -117,14 +117,14 @@ class GoEnv: # print "many opponents, fake eye" return False - def knowledge_prunning(self, current_board, color, vertex): + def _knowledge_prunning(self, current_board, color, vertex): ### check if it is an eye of yourself ### assumptions : notice that this judgement requires that the state is an endgame if self._is_eye(current_board, color, vertex): return False return True - def sa2cv(self, state, action): + def _sa2cv(self, state, action): # State is the play board, the shape is [1, self.game.size, self.game.size, 17], action is an index. 
# We need to transfer the (state, action) pair into (color, vertex) pair to simulate the move if state[0, 0, 0, -1] == utils.BLACK: @@ -137,23 +137,13 @@ class GoEnv: vertex = self.game._deflatten(action) return color, vertex - def simulate_is_valid(self, history_boards, current_board, state, action): - # initialize simulate_latest_boards and simulate_board from state - self.simulate_latest_boards.clear() - for i in range(8): - self.simulate_latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) - self.simulate_board = copy.copy(self.simulate_latest_boards[-1]) - - color, vertex = self.sa2cv(state, action) - + def _is_valid(self, history_boards, current_board, color, vertex): ### in board if not self._in_board(vertex): return False ### already have stone if not current_board[self.game._flatten(vertex)] == utils.EMPTY: - # print(np.array(self.board).reshape(9, 9)) - # print(vertex) return False ### check if it is suicide @@ -164,12 +154,26 @@ class GoEnv: if self._check_global_isomorphous(history_boards, current_board, color, vertex): return False - if not self.knowledge_prunning(current_board, color, vertex): + return True + + def simulate_is_valid(self, history_boards, current_board, state, action): + # initialize simulate_latest_boards and simulate_board from state + self.simulate_latest_boards.clear() + for i in range(8): + self.simulate_latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) + self.simulate_board = copy.copy(self.simulate_latest_boards[-1]) + + color, vertex = self._sa2cv(state, action) + + if not self._is_valid(history_boards, current_board, color, vertex): + return False + + if not self._knowledge_prunning(current_board, color, vertex): return False return True - def simulate_do_move(self, color, vertex): + def _do_move(self, color, vertex): if vertex == utils.PASS: return True @@ -184,9 +188,9 @@ class GoEnv: # initialize the simulate_board from state self.simulate_board = 
(state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() - color, vertex = self.sa2cv(state, action) + color, vertex = self._sa2cv(state, action) - self.simulate_do_move(color, vertex) + self._do_move(color, vertex) new_state = np.concatenate( [state[:, :, :, 1:8], (np.array(self.simulate_board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), state[:, :, :, 9:16], (np.array(self.simulate_board) == utils.WHITE).reshape(1, self.game.size, self.game.size, 1), From db40994e1145aed511b2b80e503334178ade3c14 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Wed, 20 Dec 2017 01:14:05 +0800 Subject: [PATCH 05/18] merge Go and GoEnv finallygit status! --- AlphaGo/engine.py | 2 +- AlphaGo/game.py | 23 ++--- AlphaGo/go.py | 99 ++++++++++++++++++++- AlphaGo/self-play.py | 2 +- AlphaGo/strategy.py | 199 ------------------------------------------- 5 files changed, 108 insertions(+), 217 deletions(-) delete mode 100644 AlphaGo/strategy.py diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index d11635a..9948176 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -183,7 +183,7 @@ class GTPEngine(): return 'unknown player', False def cmd_get_score(self, args, **kwargs): - return self._game.executor.executor_get_score(), None + return self._game.game_engine.executor_get_score(), None def cmd_show_board(self, args, **kwargs): return self._game.board, True diff --git a/AlphaGo/game.py b/AlphaGo/game.py index af4ef57..aee8d3a 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -9,16 +9,13 @@ import utils import copy import tensorflow as tf import numpy as np -import sys +import sys, os import go import network_small -import strategy from collections import deque +sys.path.append(os.path.join(os.path.dirname(__file__), os.path.pardir)) from tianshou.core.mcts.mcts import MCTS -import Network -#from strategy import strategy - class Game: ''' Load the real game and trained weights. 
@@ -34,15 +31,11 @@ class Game: self.latest_boards = deque(maxlen=8) for _ in range(8): self.latest_boards.append(self.board) - - self.executor = go.Go(game=self) - #self.strategy = strategy(checkpoint_path) - - self.simulator = strategy.GoEnv(game=self) self.net = network_small.Network() self.sess = self.net.forward(checkpoint_path) self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], feed_dict={self.net.x: state, self.net.is_training: False}) + self.game_engine = go.Go(game=self) def _flatten(self, vertex): x, y = vertex @@ -79,10 +72,10 @@ class Game: def think(self, latest_boards, color): # TODO : using copy is right, or should we change to deepcopy? - self.simulator.simulate_latest_boards = copy.copy(latest_boards) - self.simulator.simulate_board = copy.copy(latest_boards[-1]) - nn_input = self.generate_nn_input(self.simulator.simulate_latest_boards, color) - mcts = MCTS(self.simulator, self.evaluator, nn_input, self.size ** 2 + 1, inverse=True, max_step=1) + self.game_engine.simulate_latest_boards = copy.copy(latest_boards) + self.game_engine.simulate_board = copy.copy(latest_boards[-1]) + nn_input = self.generate_nn_input(self.game_engine.simulate_latest_boards, color) + mcts = MCTS(self.game_engine, self.evaluator, nn_input, self.size ** 2 + 1, inverse=True, max_step=1) temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) choice = np.random.choice(self.size ** 2 + 1, 1, p=prob).tolist()[0] @@ -96,7 +89,7 @@ class Game: # this function can be called directly to play the opponent's move if vertex == utils.PASS: return True - res = self.executor.executor_do_move(color, vertex) + res = self.game_engine.executor_do_move(color, vertex) return res def think_play_move(self, color): diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 108c9bd..10ce7e1 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -1,7 +1,7 @@ from __future__ import print_function import utils import copy -import sys +import numpy as np from collections 
import deque ''' @@ -12,10 +12,13 @@ Settings of the Go game. ''' NEIGHBOR_OFFSET = [[1, 0], [-1, 0], [0, -1], [0, 1]] +CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] class Go: def __init__(self, **kwargs): self.game = kwargs['game'] + self.simulate_board = [utils.EMPTY] * (self.game.size ** 2) + self.simulate_latest_boards = deque(maxlen=8) def _in_board(self, vertex): x, y = vertex @@ -33,6 +36,16 @@ class Go: nei.append((_x, _y)) return nei + def _corner(self, vertex): + x, y = vertex + corner = [] + for d in CORNER_OFFSET: + _x = x + d[0] + _y = y + d[1] + if self._in_board((_x, _y)): + corner.append((_x, _y)) + return corner + def _find_group(self, current_board, vertex): color = current_board[self.game._flatten(vertex)] # print ("color : ", color) @@ -84,6 +97,47 @@ class Go: repeat = True return repeat + def _is_eye(self, current_board, color, vertex): + nei = self._neighbor(vertex) + cor = self._corner(vertex) + ncolor = {color == current_board[self.game._flatten(n)] for n in nei} + if False in ncolor: + # print "not all neighbors are in same color with us" + return False + _, group = self._find_group(current_board, nei[0]) + if set(nei) < group: + # print "all neighbors are in same group and same color with us" + return True + else: + opponent_number = [current_board[self.game._flatten(c)] for c in cor].count(-color) + opponent_propotion = float(opponent_number) / float(len(cor)) + if opponent_propotion < 0.5: + # print "few opponents, real eye" + return True + else: + # print "many opponents, fake eye" + return False + + def _knowledge_prunning(self, current_board, color, vertex): + ### check if it is an eye of yourself + ### assumptions : notice that this judgement requires that the state is an endgame + if self._is_eye(current_board, color, vertex): + return False + return True + + def _sa2cv(self, state, action): + # State is the play board, the shape is [1, self.game.size, self.game.size, 17], action is an index. 
+ # We need to transfer the (state, action) pair into (color, vertex) pair to simulate the move + if state[0, 0, 0, -1] == utils.BLACK: + color = utils.BLACK + else: + color = utils.WHITE + if action == self.game.size ** 2: + vertex = (0, 0) + else: + vertex = self.game._deflatten(action) + return color, vertex + def _is_valid(self, history_boards, current_board, color, vertex): ### in board if not self._in_board(vertex): @@ -97,11 +151,54 @@ class Go: if self._is_suicide(current_board, color, vertex): return False + ### forbid global isomorphous if self._check_global_isomorphous(history_boards, current_board, color, vertex): return False return True + def simulate_is_valid(self, history_boards, current_board, state, action): + # initialize simulate_latest_boards and simulate_board from state + self.simulate_latest_boards.clear() + for i in range(8): + self.simulate_latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) + self.simulate_board = copy.copy(self.simulate_latest_boards[-1]) + + color, vertex = self._sa2cv(state, action) + + if not self._is_valid(history_boards, current_board, color, vertex): + return False + + if not self._knowledge_prunning(current_board, color, vertex): + return False + + return True + + def _do_move(self, color, vertex): + if vertex == utils.PASS: + return True + + id_ = self.game._flatten(vertex) + if self.simulate_board[id_] == utils.EMPTY: + self.simulate_board[id_] = color + return True + else: + return False + + def simulate_step_forward(self, state, action): + # initialize the simulate_board from state + self.simulate_board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() + + color, vertex = self._sa2cv(state, action) + + self._do_move(color, vertex) + new_state = np.concatenate( + [state[:, :, :, 1:8], (np.array(self.simulate_board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), + state[:, :, :, 9:16], (np.array(self.simulate_board) == utils.WHITE).reshape(1, 
self.game.size, self.game.size, 1), + np.array(1 - state[:, :, :, -1]).reshape(1, self.game.size, self.game.size, 1)], + axis=3) + return new_state, 0 + def executor_do_move(self, color, vertex): if not self._is_valid(self.game.history, self.game.board, color, vertex): return False diff --git a/AlphaGo/self-play.py b/AlphaGo/self-play.py index 296112b..63b7e97 100644 --- a/AlphaGo/self-play.py +++ b/AlphaGo/self-play.py @@ -79,7 +79,7 @@ while True: prob.append(np.array(game.prob).reshape(-1, game.size ** 2 + 1)) print("Finished") print("\n") - score = game.executor.executor_get_score(True) + score = game.game_engine.executor_get_score(True) if score > 0: winner = utils.BLACK else: diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py deleted file mode 100644 index 1e5fd02..0000000 --- a/AlphaGo/strategy.py +++ /dev/null @@ -1,199 +0,0 @@ -import os, sys - -sys.path.append(os.path.join(os.path.dirname(__file__), os.path.pardir)) -import numpy as np -import utils -import time -import copy -import network_small -import tensorflow as tf -from collections import deque -from tianshou.core.mcts.mcts import MCTS - -NEIGHBOR_OFFSET = [[1, 0], [-1, 0], [0, -1], [0, 1]] -CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] - -class GoEnv: - def __init__(self, **kwargs): - self.game = kwargs['game'] - self.simulate_board = [utils.EMPTY] * (self.game.size ** 2) - self.simulate_latest_boards = deque(maxlen=8) - - def _in_board(self, vertex): - x, y = vertex - if x < 1 or x > self.game.size: return False - if y < 1 or y > self.game.size: return False - return True - - def _neighbor(self, vertex): - x, y = vertex - nei = [] - for d in NEIGHBOR_OFFSET: - _x = x + d[0] - _y = y + d[1] - if self._in_board((_x, _y)): - nei.append((_x, _y)) - return nei - - def _corner(self, vertex): - x, y = vertex - corner = [] - for d in CORNER_OFFSET: - _x = x + d[0] - _y = y + d[1] - if self._in_board((_x, _y)): - corner.append((_x, _y)) - return corner - - def _find_group(self, current_board, 
vertex): - color = current_board[self.game._flatten(vertex)] - # print ("color : ", color) - chain = set() - frontier = [vertex] - has_liberty = False - while frontier: - current = frontier.pop() - # print ("current : ", current) - chain.add(current) - for n in self._neighbor(current): - if current_board[self.game._flatten(n)] == color and not n in chain: - frontier.append(n) - if current_board[self.game._flatten(n)] == utils.EMPTY: - has_liberty = True - return has_liberty, chain - - def _is_suicide(self, current_board, color, vertex): - current_board[self.game._flatten(vertex)] = color # assume that we already take this move - suicide = False - - has_liberty, group = self._find_group(current_board, vertex) - if not has_liberty: - suicide = True # no liberty, suicide - for n in self._neighbor(vertex): - if current_board[self.game._flatten(n)] == utils.another_color(color): - opponent_liberty, group = self._find_group(current_board, n) - if not opponent_liberty: - suicide = False # this move is able to take opponent's stone, not suicide - - current_board[self.game._flatten(vertex)] = utils.EMPTY # undo this move - return suicide - - def _process_board(self, current_board, color, vertex): - nei = self._neighbor(vertex) - for n in nei: - if current_board[self.game._flatten(n)] == utils.another_color(color): - has_liberty, group = self._find_group(current_board, n) - if not has_liberty: - for b in group: - current_board[self.game._flatten(b)] = utils.EMPTY - - def _check_global_isomorphous(self, history_boards, current_board, color, vertex): - repeat = False - next_board = copy.copy(current_board) - next_board[self.game._flatten(vertex)] = color - self._process_board(next_board, color, vertex) - if next_board in history_boards: - repeat = True - return repeat - - def _is_eye(self, current_board, color, vertex): - nei = self._neighbor(vertex) - cor = self._corner(vertex) - ncolor = {color == current_board[self.game._flatten(n)] for n in nei} - if False in ncolor: - # 
print "not all neighbors are in same color with us" - return False - _, group = self._find_group(current_board, nei[0]) - if set(nei) < group: - # print "all neighbors are in same group and same color with us" - return True - else: - opponent_number = [current_board[self.game._flatten(c)] for c in cor].count(-color) - opponent_propotion = float(opponent_number) / float(len(cor)) - if opponent_propotion < 0.5: - # print "few opponents, real eye" - return True - else: - # print "many opponents, fake eye" - return False - - def _knowledge_prunning(self, current_board, color, vertex): - ### check if it is an eye of yourself - ### assumptions : notice that this judgement requires that the state is an endgame - if self._is_eye(current_board, color, vertex): - return False - return True - - def _sa2cv(self, state, action): - # State is the play board, the shape is [1, self.game.size, self.game.size, 17], action is an index. - # We need to transfer the (state, action) pair into (color, vertex) pair to simulate the move - if state[0, 0, 0, -1] == utils.BLACK: - color = utils.BLACK - else: - color = utils.WHITE - if action == self.game.size ** 2: - vertex = (0, 0) - else: - vertex = self.game._deflatten(action) - return color, vertex - - def _is_valid(self, history_boards, current_board, color, vertex): - ### in board - if not self._in_board(vertex): - return False - - ### already have stone - if not current_board[self.game._flatten(vertex)] == utils.EMPTY: - return False - - ### check if it is suicide - if self._is_suicide(current_board, color, vertex): - return False - - ### forbid global isomorphous - if self._check_global_isomorphous(history_boards, current_board, color, vertex): - return False - - return True - - def simulate_is_valid(self, history_boards, current_board, state, action): - # initialize simulate_latest_boards and simulate_board from state - self.simulate_latest_boards.clear() - for i in range(8): - self.simulate_latest_boards.append((state[:, :, :, i] - 
state[:, :, :, i + 8]).reshape(-1).tolist()) - self.simulate_board = copy.copy(self.simulate_latest_boards[-1]) - - color, vertex = self._sa2cv(state, action) - - if not self._is_valid(history_boards, current_board, color, vertex): - return False - - if not self._knowledge_prunning(current_board, color, vertex): - return False - - return True - - def _do_move(self, color, vertex): - if vertex == utils.PASS: - return True - - id_ = self.game._flatten(vertex) - if self.simulate_board[id_] == utils.EMPTY: - self.simulate_board[id_] = color - return True - else: - return False - - def simulate_step_forward(self, state, action): - # initialize the simulate_board from state - self.simulate_board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() - - color, vertex = self._sa2cv(state, action) - - self._do_move(color, vertex) - new_state = np.concatenate( - [state[:, :, :, 1:8], (np.array(self.simulate_board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), - state[:, :, :, 9:16], (np.array(self.simulate_board) == utils.WHITE).reshape(1, self.game.size, self.game.size, 1), - np.array(1 - state[:, :, :, -1]).reshape(1, self.game.size, self.game.size, 1)], - axis=3) - return new_state, 0 From 112fd07b130841f8cb699b7507487ffec405e449 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Wed, 20 Dec 2017 16:43:42 +0800 Subject: [PATCH 06/18] modify the mcts, refactor the network --- AlphaGo/Network.py | 211 ----------------------- AlphaGo/Network_ori.py | 175 ------------------- AlphaGo/game.py | 15 +- AlphaGo/go.py | 58 ++----- AlphaGo/model.py | 170 ++++++++++++++++++ AlphaGo/{network_small.py => network.py} | 0 tianshou/core/mcts/mcts.py | 40 ++--- 7 files changed, 212 insertions(+), 457 deletions(-) delete mode 100644 AlphaGo/Network.py delete mode 100644 AlphaGo/Network_ori.py create mode 100644 AlphaGo/model.py rename AlphaGo/{network_small.py => network.py} (100%) diff --git a/AlphaGo/Network.py b/AlphaGo/Network.py deleted file 
mode 100644 index caf7710..0000000 --- a/AlphaGo/Network.py +++ /dev/null @@ -1,211 +0,0 @@ -import os -import time -import sys - -import numpy as np -import time -import tensorflow as tf -import tensorflow.contrib.layers as layers - -import multi_gpu -import time - -# os.environ["CUDA_VISIBLE_DEVICES"] = "1" -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' - - -def residual_block(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.conv2d(h, 256, kernel_size=3, stride=1, activation_fn=tf.identity, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = h + input - return tf.nn.relu(h) - - -def policy_heads(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.flatten(h) - h = layers.fully_connected(h, 362, activation_fn=tf.identity, weights_regularizer=layers.l2_regularizer(1e-4)) - return h - - -def value_heads(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.flatten(h) - h = layers.fully_connected(h, 256, activation_fn=tf.nn.relu, weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.fully_connected(h, 1, activation_fn=tf.nn.tanh, 
weights_regularizer=layers.l2_regularizer(1e-4)) - return h - - -class Network(object): - def __init__(self): - self.x = tf.placeholder(tf.float32, shape=[None, 19, 19, 17]) - self.is_training = tf.placeholder(tf.bool, shape=[]) - self.z = tf.placeholder(tf.float32, shape=[None, 1]) - self.pi = tf.placeholder(tf.float32, shape=[None, 362]) - self.build_network() - - def build_network(self): - h = layers.conv2d(self.x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, normalizer_fn=layers.batch_norm, - normalizer_params={'is_training': self.is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS}, - weights_regularizer=layers.l2_regularizer(1e-4)) - for i in range(19): - h = residual_block(h, self.is_training) - self.v = value_heads(h, self.is_training) - self.p = policy_heads(h, self.is_training) - # loss = tf.reduce_mean(tf.square(z-v)) - tf.multiply(pi, tf.log(tf.clip_by_value(tf.nn.softmax(p), 1e-8, tf.reduce_max(tf.nn.softmax(p))))) - self.value_loss = tf.reduce_mean(tf.square(self.z - self.v)) - self.policy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.pi, logits=self.p)) - - self.reg = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) - self.total_loss = self.value_loss + self.policy_loss + self.reg - # train_op = tf.train.MomentumOptimizer(1e-4, momentum=0.9, use_nesterov=True).minimize(total_loss) - self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) - with tf.control_dependencies(self.update_ops): - self.train_op = tf.train.RMSPropOptimizer(1e-4).minimize(self.total_loss) - self.var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) - self.saver = tf.train.Saver(max_to_keep=10, var_list=self.var_list) - - def train(self): - data_path = "/home/tongzheng/data/" - data_name = os.listdir("/home/tongzheng/data/") - epochs = 100 - batch_size = 128 - - result_path = "./checkpoints/" - with multi_gpu.create_session() as sess: - sess.run(tf.global_variables_initializer()) - ckpt_file = 
tf.train.latest_checkpoint(result_path) - if ckpt_file is not None: - print('Restoring model from {}...'.format(ckpt_file)) - self.saver.restore(sess, ckpt_file) - for epoch in range(epochs): - for name in data_name: - data = np.load(data_path + name) - boards = data["boards"] - wins = data["wins"] - ps = data["ps"] - print (boards.shape) - print (wins.shape) - print (ps.shape) - batch_num = boards.shape[0] // batch_size - index = np.arange(boards.shape[0]) - np.random.shuffle(index) - value_losses = [] - policy_losses = [] - regs = [] - time_train = -time.time() - for iter in range(batch_num): - lv, lp, r, value, prob, _ = sess.run( - [self.value_loss, self.policy_loss, self.reg, self.v, tf.nn.softmax(p), self.train_op], - feed_dict={self.x: boards[ - index[iter * batch_size:(iter + 1) * batch_size]], - self.z: wins[index[ - iter * batch_size:(iter + 1) * batch_size]], - self.pi: ps[index[ - iter * batch_size:(iter + 1) * batch_size]], - self.is_training: True}) - value_losses.append(lv) - policy_losses.append(lp) - regs.append(r) - if iter % 1 == 0: - print( - "Epoch: {}, Part {}, Iteration: {}, Time: {}, Value Loss: {}, Policy Loss: {}, Reg: {}".format( - epoch, name, iter, time.time() + time_train, np.mean(np.array(value_losses)), - np.mean(np.array(policy_losses)), np.mean(np.array(regs)))) - time_train = -time.time() - value_losses = [] - policy_losses = [] - regs = [] - if iter % 20 == 0: - save_path = "Epoch{}.Part{}.Iteration{}.ckpt".format(epoch, name, iter) - self.saver.save(sess, result_path + save_path) - del data, boards, wins, ps - - - # def forward(call_number): - # # checkpoint_path = "/home/yama/rl/tianshou/AlphaGo/checkpoints" - # checkpoint_path = "/home/jialian/stuGo/tianshou/stuGo/checkpoints/" - # board_file = np.genfromtxt("/home/jialian/stuGo/tianshou/leela-zero/src/mcts_nn_files/board_" + call_number, - # dtype='str'); - # human_board = np.zeros((17, 19, 19)) - # - # # TODO : is it ok to ignore the last channel? 
- # for i in range(17): - # human_board[i] = np.array(list(board_file[i])).reshape(19, 19) - # # print("============================") - # # print("human board sum : " + str(np.sum(human_board[-1]))) - # # print("============================") - # # print(human_board) - # # print("============================") - # # rint(human_board) - # feed_board = human_board.transpose(1, 2, 0).reshape(1, 19, 19, 17) - # # print(feed_board[:,:,:,-1]) - # # print(feed_board.shape) - # - # # npz_board = np.load("/home/yama/rl/tianshou/AlphaGo/data/7f83928932f64a79bc1efdea268698ae.npz") - # # print(npz_board["boards"].shape) - # # feed_board = npz_board["boards"][10].reshape(-1, 19, 19, 17) - # ##print(feed_board) - # # show_board = feed_board[0].transpose(2, 0, 1) - # # print("board shape : ", show_board.shape) - # # print(show_board) - # - # itflag = False - # with multi_gpu.create_session() as sess: - # sess.run(tf.global_variables_initializer()) - # ckpt_file = tf.train.latest_checkpoint(checkpoint_path) - # if ckpt_file is not None: - # # print('Restoring model from {}...'.format(ckpt_file)) - # saver.restore(sess, ckpt_file) - # else: - # raise ValueError("No model loaded") - # res = sess.run([tf.nn.softmax(p), v], feed_dict={x: feed_board, is_training: itflag}) - # # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][300].reshape(-1, 19, 19, 17), is_training:False}) - # # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][50].reshape(-1, 19, 19, 17), is_training:True}) - # # print(np.argmax(res[0])) - # np.savetxt(sys.stdout, res[0][0], fmt="%.6f", newline=" ") - # np.savetxt(sys.stdout, res[1][0], fmt="%.6f", newline=" ") - # pv_file = "/home/jialian/stuGotianshou/leela-zero/src/mcts_nn_files/policy_value" - # np.savetxt(pv_file, np.concatenate((res[0][0], res[1][0])), fmt="%.6f", newline=" ") - # # np.savetxt(pv_file, res[1][0], fmt="%.6f", newline=" ") - # return res - - def forward(self): - checkpoint_path = 
"/home/tongzheng/tianshou/AlphaGo/checkpoints/" - sess = multi_gpu.create_session() - sess.run(tf.global_variables_initializer()) - ckpt_file = tf.train.latest_checkpoint(checkpoint_path) - if ckpt_file is not None: - print('Restoring model from {}...'.format(ckpt_file)) - self.saver.restore(sess, ckpt_file) - print('Successfully loaded') - else: - raise ValueError("No model loaded") - # prior, value = sess.run([tf.nn.softmax(p), v], feed_dict={x: state, is_training: False}) - # return prior, value - return sess - - -if __name__ == '__main__': - state = np.random.randint(0, 1, [1, 19, 19, 17]) - net = Network() - sess = net.forward() - start = time.time() - for i in range(100): - sess.run([tf.nn.softmax(net.p), net.v], feed_dict={net.x: state, net.is_training: False}) - print("Step {}, Cumulative time {}".format(i, time.time() - start)) diff --git a/AlphaGo/Network_ori.py b/AlphaGo/Network_ori.py deleted file mode 100644 index 9d33bb9..0000000 --- a/AlphaGo/Network_ori.py +++ /dev/null @@ -1,175 +0,0 @@ -import os -import time -import gc - -import numpy as np -import tensorflow as tf -import tensorflow.contrib.layers as layers - -import multi_gpu - -os.environ["CUDA_VISIBLE_DEVICES"] = "1" - - -def residual_block(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.conv2d(h, 256, kernel_size=3, stride=1, activation_fn=tf.identity, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = h + input - return tf.nn.relu(h) - - -def policy_heads(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 2, kernel_size=1, 
stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.flatten(h) - h = layers.fully_connected(h, 362, activation_fn=tf.identity, weights_regularizer=layers.l2_regularizer(1e-4)) - return h - - -def value_heads(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.flatten(h) - h = layers.fully_connected(h, 256, activation_fn=tf.nn.relu, weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.fully_connected(h, 1, activation_fn=tf.nn.tanh, weights_regularizer=layers.l2_regularizer(1e-4)) - return h - - -x = tf.placeholder(tf.float32, shape=[None, 19, 19, 17]) -is_training = tf.placeholder(tf.bool, shape=[]) -z = tf.placeholder(tf.float32, shape=[None, 1]) -pi = tf.placeholder(tf.float32, shape=[None, 362]) - -h = layers.conv2d(x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, normalizer_fn=layers.batch_norm, - normalizer_params={'is_training': is_training, 'updates_collections': tf.GraphKeys.UPDATE_OPS}, - weights_regularizer=layers.l2_regularizer(1e-4)) -for i in range(19): - h = residual_block(h, is_training) -v = value_heads(h, is_training) -p = policy_heads(h, is_training) -# loss = tf.reduce_mean(tf.square(z-v)) - tf.multiply(pi, tf.log(tf.clip_by_value(tf.nn.softmax(p), 1e-8, tf.reduce_max(tf.nn.softmax(p))))) -value_loss = tf.reduce_mean(tf.square(z - v)) -policy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=pi, logits=p)) - -reg = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) -total_loss = value_loss + policy_loss + reg -# train_op = tf.train.MomentumOptimizer(1e-4, momentum=0.9, 
use_nesterov=True).minimize(total_loss) -update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) -with tf.control_dependencies(update_ops): - train_op = tf.train.RMSPropOptimizer(1e-4).minimize(total_loss) -var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) -saver = tf.train.Saver(max_to_keep=10, var_list=var_list) - - -def train(): - data_path = "/home/tongzheng/data/" - data_name = os.listdir("/home/tongzheng/data/") - epochs = 100 - batch_size = 128 - - result_path = "./checkpoints/" - with multi_gpu.create_session() as sess: - sess.run(tf.global_variables_initializer()) - ckpt_file = tf.train.latest_checkpoint(result_path) - if ckpt_file is not None: - print('Restoring model from {}...'.format(ckpt_file)) - saver.restore(sess, ckpt_file) - for epoch in range(epochs): - for name in data_name: - data = np.load(data_path + name) - boards = data["boards"] - wins = data["wins"] - ps = data["ps"] - print (boards.shape) - print (wins.shape) - print (ps.shape) - # batch_num = 1 - batch_num = boards.shape[0] // batch_size - index = np.arange(boards.shape[0]) - np.random.shuffle(index) - value_losses = [] - policy_losses = [] - regs = [] - time_train = -time.time() - for iter in range(batch_num): - lv, lp, r, _ = sess.run([value_loss, policy_loss, reg, train_op], - feed_dict={x: boards[ - index[iter * batch_size:(iter + 1) * batch_size]], - z: wins[index[ - iter * batch_size:(iter + 1) * batch_size]], - pi: ps[index[ - iter * batch_size:(iter + 1) * batch_size]], - is_training: True}) - value_losses.append(lv) - policy_losses.append(lp) - regs.append(r) - del lv, lp, r - if iter % 1 == 0: - print( - "Epoch: {}, Part {}, Iteration: {}, Time: {}, Value Loss: {}, Policy Loss: {}, Reg: {}".format( - epoch, name, iter, time.time() + time_train, np.mean(np.array(value_losses)), - np.mean(np.array(policy_losses)), np.mean(np.array(regs)))) - del value_losses, policy_losses, regs, time_train - time_train = -time.time() - value_losses = [] - policy_losses = [] - regs = 
[] - if iter % 20 == 0: - save_path = "Epoch{}.Part{}.Iteration{}.ckpt".format(epoch, name, iter) - saver.save(sess, result_path + save_path) - del save_path - del data, boards, wins, ps, batch_num, index - gc.collect() - - -def forward(board): - result_path = "./checkpoints" - itflag = False - res = None - if board is None: - # data = np.load("/home/tongzheng/meta-data/80b7bf21bce14862806d48c3cd760a1b.npz") - data = np.load("./data/7f83928932f64a79bc1efdea268698ae.npz") - board = data["boards"][50].reshape(-1, 19, 19, 17) - human_board = board[0].transpose(2, 0, 1) - print("============================") - print("human board sum : " + str(np.sum(human_board))) - print("============================") - print(board[:, :, :, -1]) - itflag = False - with multi_gpu.create_session() as sess: - sess.run(tf.global_variables_initializer()) - ckpt_file = tf.train.latest_checkpoint(result_path) - if ckpt_file is not None: - print('Restoring model from {}...'.format(ckpt_file)) - saver.restore(sess, ckpt_file) - else: - raise ValueError("No model loaded") - res = sess.run([tf.nn.softmax(p), v], feed_dict={x: board, is_training: itflag}) - # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][300].reshape(-1, 19, 19, 17), is_training:False}) - # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][50].reshape(-1, 19, 19, 17), is_training:True}) - # print(np.argmax(res[0])) - print(res) - print(data["p"][0]) - print(np.argmax(res[0])) - print(np.argmax(data["p"][0])) - # print(res[0].tolist()[0]) - # print(np.argmax(res[0])) - return res - - -if __name__ == '__main__': - # train() - # if sys.argv[1] == "test": - forward(None) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index aee8d3a..37b7878 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -11,7 +11,7 @@ import tensorflow as tf import numpy as np import sys, os import go -import network_small +import model from collections import deque 
sys.path.append(os.path.join(os.path.dirname(__file__), os.path.pardir)) from tianshou.core.mcts.mcts import MCTS @@ -31,10 +31,9 @@ class Game: self.latest_boards = deque(maxlen=8) for _ in range(8): self.latest_boards.append(self.board) - self.net = network_small.Network() - self.sess = self.net.forward(checkpoint_path) - self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], - feed_dict={self.net.x: state, self.net.is_training: False}) + self.evaluator = model.ResNet(self.size, self.size**2 + 1, history_length=8) + # self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], + # feed_dict={self.net.x: state, self.net.is_training: False}) self.game_engine = go.Go(game=self) def _flatten(self, vertex): @@ -75,7 +74,8 @@ class Game: self.game_engine.simulate_latest_boards = copy.copy(latest_boards) self.game_engine.simulate_board = copy.copy(latest_boards[-1]) nn_input = self.generate_nn_input(self.game_engine.simulate_latest_boards, color) - mcts = MCTS(self.game_engine, self.evaluator, nn_input, self.size ** 2 + 1, inverse=True, max_step=1) + mcts = MCTS(self.game_engine, self.evaluator, [self.game_engine.simulate_latest_boards, color], self.size ** 2 + 1, inverse=True) + mcts.search(max_step=1) temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) choice = np.random.choice(self.size ** 2 + 1, 1, p=prob).tolist()[0] @@ -93,7 +93,7 @@ class Game: return res def think_play_move(self, color): - # although we dont need to return self.prob, however it is needed for neural network training + # although we don't need to return self.prob, however it is needed for neural network training move, self.prob = self.think(self.latest_boards, color) # play the move immediately self.play_move(color, move) @@ -122,6 +122,7 @@ class Game: if __name__ == "__main__": g = Game() g.show_board() + g.think_play_move(1) #file = open("debug.txt", "a") #file.write("mcts check\n") #file.close() diff --git a/AlphaGo/go.py 
b/AlphaGo/go.py index 10ce7e1..335ee39 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -17,8 +17,6 @@ CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] class Go: def __init__(self, **kwargs): self.game = kwargs['game'] - self.simulate_board = [utils.EMPTY] * (self.game.size ** 2) - self.simulate_latest_boards = deque(maxlen=8) def _in_board(self, vertex): x, y = vertex @@ -125,18 +123,12 @@ class Go: return False return True - def _sa2cv(self, state, action): - # State is the play board, the shape is [1, self.game.size, self.game.size, 17], action is an index. - # We need to transfer the (state, action) pair into (color, vertex) pair to simulate the move - if state[0, 0, 0, -1] == utils.BLACK: - color = utils.BLACK - else: - color = utils.WHITE + def _action2vertex(self, action): if action == self.game.size ** 2: vertex = (0, 0) else: vertex = self.game._deflatten(action) - return color, vertex + return vertex def _is_valid(self, history_boards, current_board, color, vertex): ### in board @@ -157,14 +149,10 @@ class Go: return True - def simulate_is_valid(self, history_boards, current_board, state, action): - # initialize simulate_latest_boards and simulate_board from state - self.simulate_latest_boards.clear() - for i in range(8): - self.simulate_latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) - self.simulate_board = copy.copy(self.simulate_latest_boards[-1]) - - color, vertex = self._sa2cv(state, action) + def simulate_is_valid(self, state, action): + history_boards, color = state + vertex = self._action2vertex(action) + current_board = history_boards[-1] if not self._is_valid(history_boards, current_board, color, vertex): return False @@ -174,30 +162,22 @@ class Go: return True - def _do_move(self, color, vertex): + def _do_move(self, board, color, vertex): if vertex == utils.PASS: - return True - - id_ = self.game._flatten(vertex) - if self.simulate_board[id_] == utils.EMPTY: - self.simulate_board[id_] = color - 
return True + return board else: - return False + id_ = self.game._flatten(vertex) + board[id_] = color + return board def simulate_step_forward(self, state, action): # initialize the simulate_board from state - self.simulate_board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() - - color, vertex = self._sa2cv(state, action) - - self._do_move(color, vertex) - new_state = np.concatenate( - [state[:, :, :, 1:8], (np.array(self.simulate_board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), - state[:, :, :, 9:16], (np.array(self.simulate_board) == utils.WHITE).reshape(1, self.game.size, self.game.size, 1), - np.array(1 - state[:, :, :, -1]).reshape(1, self.game.size, self.game.size, 1)], - axis=3) - return new_state, 0 + history_boards, color = state + vertex = self._action2vertex(action) + new_board = self._do_move(copy.copy(history_boards[-1]), color, vertex) + history_boards.append(new_board) + new_color = -color + return [history_boards, new_color], 0 def executor_do_move(self, color, vertex): if not self._is_valid(self.game.history, self.game.board, color, vertex): @@ -239,7 +219,7 @@ class Go: start_vertex_x += x_diff start_vertex_y += y_diff - def _predict_from_nearby(self, vertex, neighbor_step = 3): + def _predict_from_nearby(self, vertex, neighbor_step=3): ''' step: the nearby 3 steps is considered :vertex: position to be estimated @@ -261,7 +241,7 @@ class Go: elif color_estimate < 0: return utils.WHITE - def executor_get_score(self, is_unknown_estimation = False): + def executor_get_score(self, is_unknown_estimation=False): ''' is_unknown_estimation: whether use nearby stone to predict the unknown return score from BLACK perspective. 
diff --git a/AlphaGo/model.py b/AlphaGo/model.py new file mode 100644 index 0000000..725dbd2 --- /dev/null +++ b/AlphaGo/model.py @@ -0,0 +1,170 @@ +import os +import time +import sys + +import numpy as np +import tensorflow as tf +import tensorflow.contrib.layers as layers + +import multi_gpu + +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + + +def residual_block(input, is_training): + """ + one residual block + + :param input: a tensor, input of the residual block + :param is_training: a placeholder, indicate whether the model is training or not + :return: a tensor, output of the residual block + """ + normalizer_params = {'is_training': is_training, + 'updates_collections': tf.GraphKeys.UPDATE_OPS} + h = layers.conv2d(input, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, + normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, + weights_regularizer=layers.l2_regularizer(1e-4)) + h = layers.conv2d(h, 256, kernel_size=3, stride=1, activation_fn=tf.identity, + normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, + weights_regularizer=layers.l2_regularizer(1e-4)) + h = h + input + return tf.nn.relu(h) + + +def policy_head(input, is_training, action_num): + """ + the head of policy branch + + :param input: a tensor, input of the policy head + :param is_training: a placeholder, indicate whether the model is training or not + :param action_num: action_num: an integer, number of unique actions at any state + :return: a tensor: output of the policy head, shape [batch_size, action_num] + """ + normalizer_params = {'is_training': is_training, + 'updates_collections': tf.GraphKeys.UPDATE_OPS} + h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, + normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, + weights_regularizer=layers.l2_regularizer(1e-4)) + h = layers.flatten(h) + h = layers.fully_connected(h, action_num, activation_fn=tf.identity, + weights_regularizer=layers.l2_regularizer(1e-4)) + 
return h + + +def value_head(input, is_training): + """ + the head of value branch + + :param input: a tensor, input of the value head + :param is_training: a placeholder, indicate whether the model is training or not + :return: a tensor, output of the value head, shape [batch_size, 1] + """ + normalizer_params = {'is_training': is_training, + 'updates_collections': tf.GraphKeys.UPDATE_OPS} + h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, + normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, + weights_regularizer=layers.l2_regularizer(1e-4)) + h = layers.flatten(h) + h = layers.fully_connected(h, 256, activation_fn=tf.nn.relu, weights_regularizer=layers.l2_regularizer(1e-4)) + h = layers.fully_connected(h, 1, activation_fn=tf.nn.tanh, weights_regularizer=layers.l2_regularizer(1e-4)) + return h + + +class ResNet(object): + def __init__(self, board_size, action_num, history_length=1, residual_block_num=20, checkpoint_path=None): + """ + the resnet model + + :param board_size: an integer, the board size + :param action_num: an integer, number of unique actions at any state + :param history_length: an integer, the history length to use, default is 1 + :param residual_block_num: an integer, the number of residual block, default is 20, at least 1 + :param checkpoint_path: a string, the path to the checkpoint, default is None, + """ + self.board_size = board_size + self.action_num = action_num + self.history_length = history_length + self.x = tf.placeholder(tf.float32, shape=[None, self.board_size, self.board_size, 2 * self.history_length + 1]) + self.is_training = tf.placeholder(tf.bool, shape=[]) + self.z = tf.placeholder(tf.float32, shape=[None, 1]) + self.pi = tf.placeholder(tf.float32, shape=[None, self.action_num]) + self._build_network(residual_block_num, checkpoint_path) + + def _build_network(self, residual_block_num, checkpoint_path): + """ + build the network + + :param residual_block_num: an integer, the number 
of residual block + :param checkpoint_path: a string, the path to the checkpoint, if None, use random initialization parameter + :return: None + """ + + h = layers.conv2d(self.x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, + normalizer_fn=layers.batch_norm, + normalizer_params={'is_training': self.is_training, + 'updates_collections': tf.GraphKeys.UPDATE_OPS}, + weights_regularizer=layers.l2_regularizer(1e-4)) + for i in range(residual_block_num - 1): + h = residual_block(h, self.is_training) + self.v = value_head(h, self.is_training) + self.p = policy_head(h, self.is_training, self.action_num) + self.value_loss = tf.reduce_mean(tf.square(self.z - self.v)) + self.policy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.pi, logits=self.p)) + + self.reg = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) + self.total_loss = self.value_loss + self.policy_loss + self.reg + self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) + with tf.control_dependencies(self.update_ops): + self.train_op = tf.train.AdamOptimizer(1e-4).minimize(self.total_loss) + self.var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) + self.saver = tf.train.Saver(max_to_keep=10, var_list=self.var_list) + self.sess = multi_gpu.create_session() + self.sess.run(tf.global_variables_initializer()) + if checkpoint_path is not None: + ckpt_file = tf.train.latest_checkpoint(checkpoint_path) + if ckpt_file is not None: + print('Restoring model from {}...'.format(ckpt_file)) + self.saver.restore(self.sess, ckpt_file) + print('Successfully loaded') + else: + raise ValueError("No model in path {}".format(checkpoint_path)) + + def __call__(self, state): + """ + + :param history: a list, the history + :param color: a string, indicate which one to play + :return: a list of tensor, the predicted value and policy given the history and color + """ + history, color = state + if len(history) != self.history_length: + raise ValueError( + 'The length of 
history cannot meet the need of the model, given {}, need {}'.format(len(history), + self.history_length)) + state = self._history2state(history, color) + return self.sess.run([self.p, self.v], feed_dict={self.x: state, self.is_training: False}) + + def _history2state(self, history, color): + """ + convert the history to the state we need + + :param history: a list, the history + :param color: a string, indicate which one to play + :return: a ndarray, the state + """ + state = np.zeros([1, self.board_size, self.board_size, 2 * self.history_length + 1]) + for i in range(self.history_length): + state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(self.board_size ** 2)).reshape(self.board_size, + self.board_size) + state[0, :, :, i + self.history_length] = np.array( + np.array(history[i]) == -np.ones(self.board_size ** 2)).reshape(self.board_size, self.board_size) + # TODO: need a config to specify the BLACK and WHITE + if color == +1: + state[0, :, :, 2 * self.history_length] = np.ones([self.board_size, self.board_size]) + if color == -1: + state[0, :, :, 2 * self.history_length] = np.zeros([self.board_size, self.board_size]) + return state + + #TODO: design the interface between the environment and training + def train(self, mode='memory', *args, **kwargs): + pass \ No newline at end of file diff --git a/AlphaGo/network_small.py b/AlphaGo/network.py similarity index 100% rename from AlphaGo/network_small.py rename to AlphaGo/network.py diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 12fc85d..fac00fb 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -72,11 +72,9 @@ class UCTNode(MCTSNode): def valid_mask(self, simulator): if self.mask is None: - start_time = time.time() self.mask = [] for act in range(self.action_num - 1): - if not simulator.simulate_is_valid( - simulator.simulate_latest_boards, simulator.simulate_board, self.state, act): + if not simulator.simulate_is_valid(self.state, act): 
self.mask.append(act) self.ucb[act] = -float("Inf") else: @@ -144,8 +142,7 @@ class ActionNode(object): class MCTS(object): - def __init__(self, simulator, evaluator, root, action_num, method="UCT", inverse=False, max_step=None, - max_time=None): + def __init__(self, simulator, evaluator, root, action_num, method="UCT", inverse=False): self.simulator = simulator self.evaluator = evaluator prior, _ = self.evaluator(root) @@ -153,33 +150,26 @@ class MCTS(object): if method == "": self.root = root if method == "UCT": - self.root = UCTNode(None, None, root, action_num, prior, inverse) + self.root = UCTNode(None, None, root, action_num, prior, inverse=inverse) if method == "TS": self.root = TSNode(None, None, root, action_num, prior, inverse=inverse) self.inverse = inverse - if max_step is not None: - self.step = 0 - self.max_step = max_step - # TODO: Optimize the stop criteria - # else: - # self.max_step = 0 - if max_time is not None: - self.start_time = time.time() - self.max_time = max_time + + def search(self, max_step=None, max_time=None): + step = 0 + start_time = time.time() + if max_step is None: + max_step = int("Inf") + if max_time is None: + max_time = float("Inf") if max_step is None and max_time is None: raise ValueError("Need a stop criteria!") - # TODO: running mcts should be implemented in another function, e.g. 
def search(self, max_step, max_time) - self.select_time = [] - self.evaluate_time = [] - self.bp_time = [] - while (max_step is not None and self.step < self.max_step or max_step is None) \ - and (max_time is not None and time.time() - self.start_time < self.max_time or max_time is None): - self.expand() - if max_step is not None: - self.step += 1 + while step < max_step and time.time() - start_time < max_step: + self._expand() + step += 1 - def expand(self): + def _expand(self): node, new_action = self.root.selection(self.simulator) value = node.children[new_action].expansion(self.evaluator, self.action_num) node.children[new_action].backpropagation(value + 0.) From 12f45d9dc65121a66e31d122fddd79c491190eac Mon Sep 17 00:00:00 2001 From: Wenbo Hu Date: Wed, 20 Dec 2017 20:12:08 +0800 Subject: [PATCH 07/18] checkpoint --- AlphaGo/go.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 335ee39..7196533 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -117,10 +117,31 @@ class Go: return False def _knowledge_prunning(self, current_board, color, vertex): - ### check if it is an eye of yourself - ### assumptions : notice that this judgement requires that the state is an endgame + # forbid some stupid selfplay using human knowledge if self._is_eye(current_board, color, vertex): return False + # forbid position on its own eye. + if self._is_game_finish(current_board, color) and vertex == utils.PASS + return False + # forbid pass if the game is not finished. 
+ return True + + + def _is_game_finished(self, current_board, color): + ''' + for each empty position, if it has both BLACK and WHITE neighbors, the game is still not finished + :return: return the game is finished + ''' + board = copy.deepcopy(current_board) + empty_idx = [i for i, x in enumerate(board) if x == utils.EMPTY] # find all empty idx + for idx in empty_idx: + neighbor_idx = self._neighbor(self.game.deflatten(idx)) + if len(neighbor_idx) > 1: + first_idx = neighbor_idx[0] + for other_idx in neighbor_idx[1:]: + if self.game.board[self.game.flatten(other_idx)] != self.game.board[self.game.flatten(first_idx)]: + return False + return True def _action2vertex(self, action): From 818da800e2da106a540cf70ee8fe8a3956061e43 Mon Sep 17 00:00:00 2001 From: Wenbo Hu Date: Wed, 20 Dec 2017 21:35:35 +0800 Subject: [PATCH 08/18] simulator process a valid set, instead of a single action --- AlphaGo/go.py | 18 +++++++++++++++--- tianshou/core/mcts/mcts.py | 9 ++------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 7196533..559b375 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -121,9 +121,9 @@ class Go: if self._is_eye(current_board, color, vertex): return False # forbid position on its own eye. - if self._is_game_finish(current_board, color) and vertex == utils.PASS - return False - # forbid pass if the game is not finished. + #if self._is_game_finish(current_board, color) and vertex == utils.PASS + # return False + # forbid pass if the game is not finished. 
return True @@ -183,6 +183,18 @@ class Go: return True + def simulate_is_valid_list(self, state, action_set): + ## find all the valid actions + ## if no action is valid, then pass + valid_action_set = [] + for action_candidate in action_set: + if self.simulate_is_valid(self, state, action_candidate) + valid_action_set.append(action_candidate) + if not valid_action_set: + valid_action_set.append(utils.PASS) + # if valid_action_set is a empty set, add pass + return valid_action_set + def _do_move(self, board, color, vertex): if vertex == utils.PASS: return board diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index fac00fb..c14496d 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -72,13 +72,8 @@ class UCTNode(MCTSNode): def valid_mask(self, simulator): if self.mask is None: - self.mask = [] - for act in range(self.action_num - 1): - if not simulator.simulate_is_valid(self.state, act): - self.mask.append(act) - self.ucb[act] = -float("Inf") - else: - self.ucb[self.mask] = -float("Inf") + self.mask = simulator.simulate_is_valid_list(self.state, range(self.action_num - 1)) + self.ucb[self.mask] = -float("Inf") class TSNode(MCTSNode): From 8875ad1bf7762371fb44a5ea5406ae5512704c62 Mon Sep 17 00:00:00 2001 From: Wenbo Hu Date: Wed, 20 Dec 2017 21:40:03 +0800 Subject: [PATCH 09/18] minor revision --- AlphaGo/go.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 559b375..009d369 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -186,14 +186,14 @@ class Go: def simulate_is_valid_list(self, state, action_set): ## find all the valid actions ## if no action is valid, then pass - valid_action_set = [] + valid_action_list = [] for action_candidate in action_set: - if self.simulate_is_valid(self, state, action_candidate) - valid_action_set.append(action_candidate) - if not valid_action_set: - valid_action_set.append(utils.PASS) + if self.simulate_is_valid(state, 
action_candidate): + valid_action_list.append(action_candidate) + if not valid_action_list: + valid_action_list.append(utils.PASS) # if valid_action_set is a empty set, add pass - return valid_action_set + return valid_action_list def _do_move(self, board, color, vertex): if vertex == utils.PASS: From 0ab38743aa054083410ca8e6d6f80f9018a8f1d4 Mon Sep 17 00:00:00 2001 From: Wenbo Hu Date: Wed, 20 Dec 2017 21:52:30 +0800 Subject: [PATCH 10/18] minor revision. --- AlphaGo/go.py | 3 +-- tianshou/core/mcts/mcts.py | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 009d369..cbbe07c 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -180,7 +180,6 @@ class Go: if not self._knowledge_prunning(current_board, color, vertex): return False - return True def simulate_is_valid_list(self, state, action_set): @@ -188,7 +187,7 @@ class Go: ## if no action is valid, then pass valid_action_list = [] for action_candidate in action_set: - if self.simulate_is_valid(state, action_candidate): + if not self.simulate_is_valid(state, action_candidate): valid_action_list.append(action_candidate) if not valid_action_list: valid_action_list.append(utils.PASS) diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index c14496d..5aca06a 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -71,6 +71,7 @@ class UCTNode(MCTSNode): self.parent.backpropagation(self.children[action].reward) def valid_mask(self, simulator): + # let all invalid actions illeagel in mcts if self.mask is None: self.mask = simulator.simulate_is_valid_list(self.state, range(self.action_num - 1)) self.ucb[self.mask] = -float("Inf") From 40909fa994c7fccc8391123a8463807968219d26 Mon Sep 17 00:00:00 2001 From: Wenbo Hu Date: Wed, 20 Dec 2017 22:10:47 +0800 Subject: [PATCH 11/18] forbid pass, if we have other choices --- AlphaGo/go.py | 18 +++++++++--------- tianshou/core/mcts/mcts.py | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) 
diff --git a/AlphaGo/go.py b/AlphaGo/go.py index cbbe07c..1dfbb29 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -183,16 +183,16 @@ class Go: return True def simulate_is_valid_list(self, state, action_set): - ## find all the valid actions - ## if no action is valid, then pass - valid_action_list = [] - for action_candidate in action_set: + # find all the invalid actions + invalid_action_list = [] + for action_candidate in action_set[:-1]: + # go through all the actions excluding pass if not self.simulate_is_valid(state, action_candidate): - valid_action_list.append(action_candidate) - if not valid_action_list: - valid_action_list.append(utils.PASS) - # if valid_action_set is a empty set, add pass - return valid_action_list + invalid_action_list.append(action_candidate) + if len(invalid_action_list) < len(action_set) - 1: + invalid_action_list.append(action_set[-1]) + # forbid pass, if we have other choices + return invalid_action_list def _do_move(self, board, color, vertex): if vertex == utils.PASS: diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 5aca06a..7edac97 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -71,7 +71,7 @@ class UCTNode(MCTSNode): self.parent.backpropagation(self.children[action].reward) def valid_mask(self, simulator): - # let all invalid actions illeagel in mcts + # let all invalid actions be illeagel in mcts if self.mask is None: self.mask = simulator.simulate_is_valid_list(self.state, range(self.action_num - 1)) self.ucb[self.mask] = -float("Inf") From 336cede197ae2ad3d8ae489065c3780496fcf469 Mon Sep 17 00:00:00 2001 From: Wenbo Hu Date: Wed, 20 Dec 2017 22:57:58 +0800 Subject: [PATCH 12/18] repair komi. 
add todo for forbid pass: --- AlphaGo/engine.py | 2 +- AlphaGo/game.py | 4 ++-- AlphaGo/go.py | 5 +---- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index 9948176..bf30083 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -183,7 +183,7 @@ class GTPEngine(): return 'unknown player', False def cmd_get_score(self, args, **kwargs): - return self._game.game_engine.executor_get_score(), None + return self._game.game_engine.executor_get_score(True), None def cmd_show_board(self, args, **kwargs): return self._game.board, True diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 37b7878..5f35c74 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -23,7 +23,7 @@ class Game: TODO : Maybe merge with the engine class in future, currently leave it untouched for interacting with Go UI. ''' - def __init__(self, size=9, komi=6.5, checkpoint_path=None): + def __init__(self, size=9, komi=3.75, checkpoint_path=None): self.size = size self.komi = komi self.board = [utils.EMPTY] * (self.size ** 2) @@ -75,7 +75,7 @@ class Game: self.game_engine.simulate_board = copy.copy(latest_boards[-1]) nn_input = self.generate_nn_input(self.game_engine.simulate_latest_boards, color) mcts = MCTS(self.game_engine, self.evaluator, [self.game_engine.simulate_latest_boards, color], self.size ** 2 + 1, inverse=True) - mcts.search(max_step=1) + mcts.search(max_step=5) temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) choice = np.random.choice(self.size ** 2 + 1, 1, p=prob).tolist()[0] diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 1dfbb29..4f1c759 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -121,12 +121,8 @@ class Go: if self._is_eye(current_board, color, vertex): return False # forbid position on its own eye. - #if self._is_game_finish(current_board, color) and vertex == utils.PASS - # return False - # forbid pass if the game is not finished. 
return True - def _is_game_finished(self, current_board, color): ''' for each empty position, if it has both BLACK and WHITE neighbors, the game is still not finished @@ -192,6 +188,7 @@ class Go: if len(invalid_action_list) < len(action_set) - 1: invalid_action_list.append(action_set[-1]) # forbid pass, if we have other choices + # TODO: In fact we should not do this. In some extreme cases, we should permit pass. return invalid_action_list def _do_move(self, board, color, vertex): From 1e2567c17452a68bc23802157f92440bfce61032 Mon Sep 17 00:00:00 2001 From: Wenbo Hu Date: Thu, 21 Dec 2017 19:31:51 +0800 Subject: [PATCH 13/18] fixing bug pass parameterg --- tianshou/core/mcts/mcts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 7edac97..8bb5f06 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -73,7 +73,7 @@ class UCTNode(MCTSNode): def valid_mask(self, simulator): # let all invalid actions be illeagel in mcts if self.mask is None: - self.mask = simulator.simulate_is_valid_list(self.state, range(self.action_num - 1)) + self.mask = simulator.simulate_is_valid_list(self.state, range(self.action_num)) self.ucb[self.mask] = -float("Inf") From 2dad8e40200856587e95747cd3f1b196ef708e06 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Thu, 21 Dec 2017 21:01:25 +0800 Subject: [PATCH 14/18] implement data collection and part of training --- AlphaGo/engine.py | 6 ++- AlphaGo/game.py | 19 +------- AlphaGo/model.py | 18 +++++++- AlphaGo/play.py | 115 ++++++++++++++++++++++++++++++---------------- AlphaGo/player.py | 1 + 5 files changed, 101 insertions(+), 58 deletions(-) diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index bf30083..c9f1a3c 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -183,11 +183,15 @@ class GTPEngine(): return 'unknown player', False def cmd_get_score(self, args, **kwargs): - return 
self._game.game_engine.executor_get_score(True), None + return self._game.game_engine.executor_get_score(True), True def cmd_show_board(self, args, **kwargs): return self._game.board, True + def cmd_get_prob(self, args, **kwargs): + return self._game.prob, True + + if __name__ == "main": game = Game() engine = GTPEngine(game_obj=Game) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 5f35c74..bf0d084 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -58,24 +58,9 @@ class Game: def set_komi(self, k): self.komi = k - def generate_nn_input(self, latest_boards, color): - state = np.zeros([1, self.size, self.size, 17]) - for i in range(8): - state[0, :, :, i] = np.array(np.array(latest_boards[i]) == np.ones(self.size ** 2)).reshape(self.size, self.size) - state[0, :, :, i + 8] = np.array(np.array(latest_boards[i]) == -np.ones(self.size ** 2)).reshape(self.size, self.size) - if color == utils.BLACK: - state[0, :, :, 16] = np.ones([self.size, self.size]) - if color == utils.WHITE: - state[0, :, :, 16] = np.zeros([self.size, self.size]) - return state - def think(self, latest_boards, color): - # TODO : using copy is right, or should we change to deepcopy? 
- self.game_engine.simulate_latest_boards = copy.copy(latest_boards) - self.game_engine.simulate_board = copy.copy(latest_boards[-1]) - nn_input = self.generate_nn_input(self.game_engine.simulate_latest_boards, color) - mcts = MCTS(self.game_engine, self.evaluator, [self.game_engine.simulate_latest_boards, color], self.size ** 2 + 1, inverse=True) - mcts.search(max_step=5) + mcts = MCTS(self.game_engine, self.evaluator, [latest_boards, color], self.size ** 2 + 1, inverse=True) + mcts.search(max_step=1) temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) choice = np.random.choice(self.size ** 2 + 1, 1, p=prob).tolist()[0] diff --git a/AlphaGo/model.py b/AlphaGo/model.py index 725dbd2..fab864e 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -1,6 +1,7 @@ import os import time import sys +import cPickle import numpy as np import tensorflow as tf @@ -167,4 +168,19 @@ class ResNet(object): #TODO: design the interface between the environment and training def train(self, mode='memory', *args, **kwargs): - pass \ No newline at end of file + if mode == 'memory': + pass + if mode == 'file': + self.train_with_file(data_path=kwargs['data_path'], checkpoint_path=kwargs['checkpoint_path']) + + def train_with_file(self, data_path, checkpoint_path): + if not os.path.exists(data_path): + raise ValueError("{} doesn't exist".format(data_path)) + + file_list = os.listdir(data_path) + if file_list <= 50: + time.sleep(1) + else: + file_list.sort(key=lambda file: os.path.getmtime(data_path + file) if not os.path.isdir( + data_path + file) else 0) + diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 7367804..562dd14 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -5,6 +5,18 @@ import re import Pyro4 import time import os +import cPickle + + +class Data(object): + def __init__(self): + self.boards = [] + self.probs = [] + self.winner = 0 + + def reset(self): + self.__init__() + if __name__ == '__main__': """ @@ -13,10 +25,13 @@ if __name__ == '__main__': """ # 
TODO : we should set the network path in a more configurable way. parser = argparse.ArgumentParser() + parser.add_argument("--result_path", type=str, default="./data/") parser.add_argument("--black_weight_path", type=str, default=None) parser.add_argument("--white_weight_path", type=str, default=None) args = parser.parse_args() + if not os.path.exists(args.result_path): + os.mkdir(args.result_path) # black_weight_path = "./checkpoints" # white_weight_path = "./checkpoints_origin" if args.black_weight_path is not None and (not os.path.exists(args.black_weight_path)): @@ -35,11 +50,13 @@ if __name__ == '__main__': time.sleep(1) # start two different player with different network weights. - agent_v0 = subprocess.Popen(['python', '-u', 'player.py', '--role=black', '--checkpoint_path=' + str(args.black_weight_path)], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + agent_v0 = subprocess.Popen( + ['python', '-u', 'player.py', '--role=black', '--checkpoint_path=' + str(args.black_weight_path)], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - agent_v1 = subprocess.Popen(['python', '-u', 'player.py', '--role=white', '--checkpoint_path=' + str(args.white_weight_path)], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + agent_v1 = subprocess.Popen( + ['python', '-u', 'player.py', '--role=white', '--checkpoint_path=' + str(args.white_weight_path)], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) server_list = "" while ("black" not in server_list) or ("white" not in server_list): @@ -50,6 +67,7 @@ if __name__ == '__main__': print "Start black player at : " + str(agent_v0.pid) print "Start white player at : " + str(agent_v1.pid) + data = Data() player = [None] * 2 player[0] = Pyro4.Proxy("PYRONAME:black") player[1] = Pyro4.Proxy("PYRONAME:white") @@ -63,39 +81,58 @@ if __name__ == '__main__': evaluate_rounds = 1 game_num = 0 - while game_num < evaluate_rounds: - num = 0 
- pass_flag = [False, False] - print("Start game {}".format(game_num)) - # end the game if both palyer chose to pass, or play too much turns - while not (pass_flag[0] and pass_flag[1]) and num < size ** 2 * 2: - turn = num % 2 - move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn] + '\n') - print role[turn] + " : " + str(move), - num += 1 - match = re.search(pattern, move) - if match is not None: - # print "match : " + str(match.group()) - play_or_pass = match.group() - pass_flag[turn] = False + try: + while True: + num = 0 + pass_flag = [False, False] + print("Start game {}".format(game_num)) + # end the game if both palyer chose to pass, or play too much turns + while not (pass_flag[0] and pass_flag[1]) and num < size ** 2 * 2: + turn = num % 2 + move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn] + '\n') + print role[turn] + " : " + str(move), + num += 1 + match = re.search(pattern, move) + if match is not None: + # print "match : " + str(match.group()) + play_or_pass = match.group() + pass_flag[turn] = False + else: + # print "no match" + play_or_pass = ' PASS' + pass_flag[turn] = True + result = player[1 - turn].run_cmd(str(num) + ' play ' + color[turn] + ' ' + play_or_pass + '\n') + board = player[turn].run_cmd(str(num) + ' show_board') + board = eval(board[board.index('['):board.index(']') + 1]) + for i in range(size): + for j in range(size): + print show[board[i * size + j]] + " ", + print "\n", + data.boards.append(board) + prob = player[turn].run_cmd(str(num) + ' get_prob') + data.probs.append(prob) + score = player[turn].run_cmd(str(num) + ' get_score') + print "Finished : ", score.split(" ")[1] + # TODO: generalize the player + if score > 0: + data.winner = 1 + if score < 0: + data.winner = -1 + player[0].run_cmd(str(num) + ' clear_board') + player[1].run_cmd(str(num) + ' clear_board') + file_list = os.listdir(args.result_path) + if not file_list: + data_num = 0 else: - # print "no match" - play_or_pass = ' PASS' - 
pass_flag[turn] = True - result = player[1 - turn].run_cmd(str(num) + ' play ' + color[turn] + ' ' + play_or_pass + '\n') - board = player[turn].run_cmd(str(num) + ' show_board') - board = eval(board[board.index('['):board.index(']') + 1]) - for i in range(size): - for j in range(size): - print show[board[i * size + j]] + " ", - print "\n", - - score = player[turn].run_cmd(str(num) + ' get_score') - print "Finished : ", score.split(" ")[1] - player[0].run_cmd(str(num) + ' clear_board') - player[1].run_cmd(str(num) + ' clear_board') - game_num += 1 - - subprocess.call(["kill", "-9", str(agent_v0.pid)]) - subprocess.call(["kill", "-9", str(agent_v1.pid)]) - print "Kill all player, finish all game." + file_list.sort(key=lambda file: os.path.getmtime(args.result_path + file) if not os.path.isdir( + args.result_path + file) else 0) + data_num = eval(file_list[-1][:-4]) + 1 + print(file_list) + with open("./data/" + str(data_num) + ".pkl", "w") as file: + picklestring = cPickle.dump(data, file) + data.reset() + game_num += 1 + except KeyboardInterrupt: + subprocess.call(["kill", "-9", str(agent_v0.pid)]) + subprocess.call(["kill", "-9", str(agent_v1.pid)]) + print "Kill all player, finish all game." 
diff --git a/AlphaGo/player.py b/AlphaGo/player.py index b468cf3..0e3daff 100644 --- a/AlphaGo/player.py +++ b/AlphaGo/player.py @@ -20,6 +20,7 @@ class Player(object): #return "inside the Player of player.py" return self.engine.run_cmd(command) + if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument("--checkpoint_path", type=str, default=None) From c3e9e55b243eb390456e6892636825b0a42332f0 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Thu, 21 Dec 2017 22:48:53 +0800 Subject: [PATCH 15/18] eliminate all references of Game class in Go class --- AlphaGo/engine.py | 2 +- AlphaGo/game.py | 15 ++----- AlphaGo/go.py | 101 +++++++++++++++++++++++-------------------- AlphaGo/play.py | 4 +- AlphaGo/self-play.py | 2 +- 5 files changed, 63 insertions(+), 61 deletions(-) diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index c9f1a3c..8b54470 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -183,7 +183,7 @@ class GTPEngine(): return 'unknown player', False def cmd_get_score(self, args, **kwargs): - return self._game.game_engine.executor_get_score(True), True + return self._game.game_engine.executor_get_score(self._game.board, True), True def cmd_show_board(self, args, **kwargs): return self._game.board, True diff --git a/AlphaGo/game.py b/AlphaGo/game.py index bf0d084..11ce52b 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -34,16 +34,7 @@ class Game: self.evaluator = model.ResNet(self.size, self.size**2 + 1, history_length=8) # self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], # feed_dict={self.net.x: state, self.net.is_training: False}) - self.game_engine = go.Go(game=self) - - def _flatten(self, vertex): - x, y = vertex - return (x - 1) * self.size + (y - 1) - - def _deflatten(self, idx): - x = idx // self.size + 1 - y = idx % self.size + 1 - return (x, y) + self.game_engine = go.Go(size=self.size, komi=self.komi) def clear(self): self.board = [utils.EMPTY] * (self.size ** 2) @@ -67,14 +58,14 @@ 
class Game: if choice == self.size ** 2: move = utils.PASS else: - move = self._deflatten(choice) + move = self.game_engine._deflatten(choice) return move, prob def play_move(self, color, vertex): # this function can be called directly to play the opponent's move if vertex == utils.PASS: return True - res = self.game_engine.executor_do_move(color, vertex) + res = self.game_engine.executor_do_move(self.history, self.latest_boards, self.board, color, vertex) return res def think_play_move(self, color): diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 4f1c759..9b7e21f 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -16,12 +16,22 @@ CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] class Go: def __init__(self, **kwargs): - self.game = kwargs['game'] + self.size = kwargs['size'] + self.komi = kwargs['komi'] + + def _flatten(self, vertex): + x, y = vertex + return (x - 1) * self.size + (y - 1) + + def _deflatten(self, idx): + x = idx // self.size + 1 + y = idx % self.size + 1 + return (x, y) def _in_board(self, vertex): x, y = vertex - if x < 1 or x > self.game.size: return False - if y < 1 or y > self.game.size: return False + if x < 1 or x > self.size: return False + if y < 1 or y > self.size: return False return True def _neighbor(self, vertex): @@ -45,7 +55,7 @@ class Go: return corner def _find_group(self, current_board, vertex): - color = current_board[self.game._flatten(vertex)] + color = current_board[self._flatten(vertex)] # print ("color : ", color) chain = set() frontier = [vertex] @@ -55,41 +65,41 @@ class Go: # print ("current : ", current) chain.add(current) for n in self._neighbor(current): - if current_board[self.game._flatten(n)] == color and not n in chain: + if current_board[self._flatten(n)] == color and not n in chain: frontier.append(n) - if current_board[self.game._flatten(n)] == utils.EMPTY: + if current_board[self._flatten(n)] == utils.EMPTY: has_liberty = True return has_liberty, chain def _is_suicide(self, current_board, color, vertex): - 
current_board[self.game._flatten(vertex)] = color # assume that we already take this move + current_board[self._flatten(vertex)] = color # assume that we already take this move suicide = False has_liberty, group = self._find_group(current_board, vertex) if not has_liberty: suicide = True # no liberty, suicide for n in self._neighbor(vertex): - if current_board[self.game._flatten(n)] == utils.another_color(color): + if current_board[self._flatten(n)] == utils.another_color(color): opponent_liberty, group = self._find_group(current_board, n) if not opponent_liberty: suicide = False # this move is able to take opponent's stone, not suicide - current_board[self.game._flatten(vertex)] = utils.EMPTY # undo this move + current_board[self._flatten(vertex)] = utils.EMPTY # undo this move return suicide def _process_board(self, current_board, color, vertex): nei = self._neighbor(vertex) for n in nei: - if current_board[self.game._flatten(n)] == utils.another_color(color): + if current_board[self._flatten(n)] == utils.another_color(color): has_liberty, group = self._find_group(current_board, n) if not has_liberty: for b in group: - current_board[self.game._flatten(b)] = utils.EMPTY + current_board[self._flatten(b)] = utils.EMPTY def _check_global_isomorphous(self, history_boards, current_board, color, vertex): repeat = False next_board = copy.copy(current_board) - next_board[self.game._flatten(vertex)] = color + next_board[self._flatten(vertex)] = color self._process_board(next_board, color, vertex) if next_board in history_boards: repeat = True @@ -98,7 +108,7 @@ class Go: def _is_eye(self, current_board, color, vertex): nei = self._neighbor(vertex) cor = self._corner(vertex) - ncolor = {color == current_board[self.game._flatten(n)] for n in nei} + ncolor = {color == current_board[self._flatten(n)] for n in nei} if False in ncolor: # print "not all neighbors are in same color with us" return False @@ -107,7 +117,7 @@ class Go: # print "all neighbors are in same group and 
same color with us" return True else: -        opponent_number = [current_board[self.game._flatten(c)] for c in cor].count(-color) +        opponent_number = [current_board[self._flatten(c)] for c in cor].count(-color) opponent_propotion = float(opponent_number) / float(len(cor)) if opponent_propotion < 0.5: # print "few opponents, real eye" @@ -131,20 +141,20 @@ class Go: board = copy.deepcopy(current_board) empty_idx = [i for i, x in enumerate(board) if x == utils.EMPTY] # find all empty idx for idx in empty_idx: -            neighbor_idx = self._neighbor(self.game.deflatten(idx)) +            neighbor_idx = self._neighbor(self._deflatten(idx)) if len(neighbor_idx) > 1: first_idx = neighbor_idx[0] for other_idx in neighbor_idx[1:]: -                if self.game.board[self.game.flatten(other_idx)] != self.game.board[self.game.flatten(first_idx)]: +                if board[self._flatten(other_idx)] != board[self._flatten(first_idx)]: return False return True def _action2vertex(self, action): -        if action == self.game.size ** 2: +        if action == self.size ** 2: vertex = (0, 0) else: -            vertex = self.game._deflatten(action) +            vertex = self._deflatten(action) return vertex def _is_valid(self, history_boards, current_board, color, vertex): @@ -153,7 +163,7 @@ class Go: return False ### already have stone -        if not current_board[self.game._flatten(vertex)] == utils.EMPTY: +        if not current_board[self._flatten(vertex)] == utils.EMPTY: return False ### check if it is suicide @@ -195,7 +205,7 @@ class Go: if vertex == utils.PASS: return board else: -            id_ = self.game._flatten(vertex) +            id_ = self._flatten(vertex) board[id_] = color return board @@ -208,21 +218,21 @@ class Go: new_color = -color return [history_boards, new_color], 0 -    def executor_do_move(self, color, vertex): -        if not self._is_valid(self.game.history, self.game.board, color, vertex): +    def executor_do_move(self, history, latest_boards, current_board, color, vertex): +        if not self._is_valid(history, current_board, color, vertex): return False - 
self.game.board[self.game._flatten(vertex)] = color - self._process_board(self.game.board, color, vertex) - self.game.history.append(copy.copy(self.game.board)) - self.game.latest_boards.append(copy.copy(self.game.board)) + current_board[self._flatten(vertex)] = color + self._process_board(current_board, color, vertex) + history.append(copy.copy(current_board)) + latest_boards.append(copy.copy(current_board)) return True - def _find_empty(self): - idx = [i for i,x in enumerate(self.game.board) if x == utils.EMPTY ][0] - return self.game._deflatten(idx) + def _find_empty(self, current_board): + idx = [i for i,x in enumerate(current_board) if x == utils.EMPTY ][0] + return self._deflatten(idx) - def _find_boarder(self, vertex): - _, group = self._find_group(self.game.board, vertex) + def _find_boarder(self, current_board, vertex): + _, group = self._find_group(current_board, vertex) border = [] for b in group: for n in self._neighbor(b): @@ -248,7 +258,7 @@ class Go: start_vertex_x += x_diff start_vertex_y += y_diff - def _predict_from_nearby(self, vertex, neighbor_step=3): + def _predict_from_nearby(self, current_board, vertex, neighbor_step=3): ''' step: the nearby 3 steps is considered :vertex: position to be estimated @@ -264,38 +274,37 @@ class Go: self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] - step, -1, 1, neighbor_step) color_estimate = 0 for neighbor_vertex in neighbor_vertex_set: - color_estimate += self.game.board[self.game._flatten(neighbor_vertex)] + color_estimate += current_board[self._flatten(neighbor_vertex)] if color_estimate > 0: return utils.BLACK elif color_estimate < 0: return utils.WHITE - def executor_get_score(self, is_unknown_estimation=False): + def executor_get_score(self, current_board, is_unknown_estimation=False): ''' is_unknown_estimation: whether use nearby stone to predict the unknown return score from BLACK perspective. 
''' - _board = copy.copy(self.game.board) - while utils.EMPTY in self.game.board: - vertex = self._find_empty() - boarder = self._find_boarder(vertex) - boarder_color = set(map(lambda v: self.game.board[self.game._flatten(v)], boarder)) + _board = copy.deepcopy(current_board) + while utils.EMPTY in _board: + vertex = self._find_empty(_board) + boarder = self._find_boarder(_board, vertex) + boarder_color = set(map(lambda v: _board[self._flatten(v)], boarder)) if boarder_color == {utils.BLACK}: - self.game.board[self.game._flatten(vertex)] = utils.BLACK + _board[self._flatten(vertex)] = utils.BLACK elif boarder_color == {utils.WHITE}: - self.game.board[self.game._flatten(vertex)] = utils.WHITE + _board[self._flatten(vertex)] = utils.WHITE elif is_unknown_estimation: - self.game.board[self.game._flatten(vertex)] = self._predict_from_nearby(vertex) + _board[self._flatten(vertex)] = self._predict_from_nearby(_board, vertex) else: - self.game.board[self.game._flatten(vertex)] =utils.UNKNOWN + _board[self._flatten(vertex)] =utils.UNKNOWN score = 0 - for i in self.game.board: + for i in _board: if i == utils.BLACK: score += 1 elif i == utils.WHITE: score -= 1 - score -= self.game.komi + score -= self.komi - self.game.board = _board return score diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 562dd14..e18555f 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -82,7 +82,7 @@ if __name__ == '__main__': evaluate_rounds = 1 game_num = 0 try: - while True: + while game_num < evaluate_rounds: num = 0 pass_flag = [False, False] print("Start game {}".format(game_num)) @@ -132,6 +132,8 @@ if __name__ == '__main__': picklestring = cPickle.dump(data, file) data.reset() game_num += 1 + subprocess.call(["kill", "-9", str(agent_v0.pid)]) + subprocess.call(["kill", "-9", str(agent_v1.pid)]) except KeyboardInterrupt: subprocess.call(["kill", "-9", str(agent_v0.pid)]) subprocess.call(["kill", "-9", str(agent_v1.pid)]) diff --git a/AlphaGo/self-play.py b/AlphaGo/self-play.py index 
63b7e97..4387b24 100644 --- a/AlphaGo/self-play.py +++ b/AlphaGo/self-play.py @@ -79,7 +79,7 @@ while True: prob.append(np.array(game.prob).reshape(-1, game.size ** 2 + 1)) print("Finished") print("\n") - score = game.game_engine.executor_get_score(True) + score = game.game_engine.executor_get_score(game.board, True) if score > 0: winner = utils.BLACK else: From c11eccbc908d552aca5c08e240eefa73cd0ca35b Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Thu, 21 Dec 2017 23:30:24 +0800 Subject: [PATCH 16/18] implement the training process --- .gitignore | 1 + AlphaGo/game.py | 2 +- AlphaGo/model.py | 106 ++++++++++++++++++++++++++++++++++++++++++----- AlphaGo/play.py | 28 ++++++++----- 4 files changed, 114 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index 36d134c..d697b92 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ checkpoints checkpoints_origin *.json .DS_Store +data diff --git a/AlphaGo/game.py b/AlphaGo/game.py index bf0d084..c342d0c 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -60,7 +60,7 @@ class Game: def think(self, latest_boards, color): mcts = MCTS(self.game_engine, self.evaluator, [latest_boards, color], self.size ** 2 + 1, inverse=True) - mcts.search(max_step=1) + mcts.search(max_step=20) temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) choice = np.random.choice(self.size ** 2 + 1, 1, p=prob).tolist()[0] diff --git a/AlphaGo/model.py b/AlphaGo/model.py index fab864e..41f3a47 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -2,6 +2,7 @@ import os import time import sys import cPickle +from collections import deque import numpy as np import tensorflow as tf @@ -71,6 +72,13 @@ def value_head(input, is_training): return h +class Data(object): + def __init__(self): + self.boards = [] + self.probs = [] + self.winner = 0 + + class ResNet(object): def __init__(self, board_size, action_num, history_length=1, residual_block_num=20, checkpoint_path=None): """ @@ -85,11 +93,18 @@ 
class ResNet(object): self.board_size = board_size self.action_num = action_num self.history_length = history_length + self.checkpoint_path = checkpoint_path self.x = tf.placeholder(tf.float32, shape=[None, self.board_size, self.board_size, 2 * self.history_length + 1]) self.is_training = tf.placeholder(tf.bool, shape=[]) self.z = tf.placeholder(tf.float32, shape=[None, 1]) self.pi = tf.placeholder(tf.float32, shape=[None, self.action_num]) - self._build_network(residual_block_num, checkpoint_path) + self._build_network(residual_block_num, self.checkpoint_path) + + # training hyper-parameters: + self.window_length = 1000 + self.save_freq = 1000 + self.training_data = {'states': deque(maxlen=self.window_length), 'probs': deque(maxlen=self.window_length), + 'winner': deque(maxlen=self.window_length)} def _build_network(self, residual_block_num, checkpoint_path): """ @@ -118,7 +133,7 @@ class ResNet(object): with tf.control_dependencies(self.update_ops): self.train_op = tf.train.AdamOptimizer(1e-4).minimize(self.total_loss) self.var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) - self.saver = tf.train.Saver(max_to_keep=10, var_list=self.var_list) + self.saver = tf.train.Saver(var_list=self.var_list) self.sess = multi_gpu.create_session() self.sess.run(tf.global_variables_initializer()) if checkpoint_path is not None: @@ -166,21 +181,90 @@ class ResNet(object): state[0, :, :, 2 * self.history_length] = np.zeros([self.board_size, self.board_size]) return state - #TODO: design the interface between the environment and training + # TODO: design the interface between the environment and training def train(self, mode='memory', *args, **kwargs): if mode == 'memory': pass if mode == 'file': - self.train_with_file(data_path=kwargs['data_path'], checkpoint_path=kwargs['checkpoint_path']) + self._train_with_file(data_path=kwargs['data_path'], batch_size=kwargs['batch_size'], + checkpoint_path=kwargs['checkpoint_path']) - def train_with_file(self, data_path, 
checkpoint_path): + def _train_with_file(self, data_path, batch_size, checkpoint_path): + # check if the path is valid if not os.path.exists(data_path): raise ValueError("{} doesn't exist".format(data_path)) + self.checkpoint_path = checkpoint_path + if not os.path.exists(self.checkpoint_path): + os.mkdir(self.checkpoint_path) - file_list = os.listdir(data_path) - if file_list <= 50: - time.sleep(1) - else: - file_list.sort(key=lambda file: os.path.getmtime(data_path + file) if not os.path.isdir( - data_path + file) else 0) + new_file_list = [] + all_file_list = [] + training_data = {} + iters = 0 + while True: + new_file_list = list(set(os.listdir(data_path)).difference(all_file_list)) + all_file_list = os.listdir(data_path) + new_file_list.sort( + key=lambda file: os.path.getmtime(data_path + file) if not os.path.isdir(data_path + file) else 0) + if new_file_list: + for file in new_file_list: + states, probs, winner = self._file_to_training_data(data_path + file) + assert states.shape[0] == probs.shape[0] + assert states.shape[0] == winner.shape[0] + self.training_data['states'].append(states) + self.training_data['probs'].append(probs) + self.training_data['winner'].append(winner) + training_data['states'] = np.concatenate(self.training_data['states'], axis=0) + training_data['probs'] = np.concatenate(self.training_data['probs'], axis=0) + training_data['winner'] = np.concatenate(self.training_data['winner'], axis=0) + if len(self.training_data['states']) != self.window_length: + continue + else: + data_num = training_data['states'].shape[0] + index = np.arange(data_num) + np.random.shuffle(index) + start_time = time.time() + value_loss, policy_loss, reg, _ = self.sess.run( + [self.value_loss, self.policy_loss, self.reg, self.train_op], + feed_dict={self.x: training_data['states'][index[:batch_size]], + self.z: training_data['winner'][index[:batch_size]], + self.pi: training_data['probs'][index[:batch_size]], + self.is_training: True}) + print("Iteration: {}, 
Time: {}, Value Loss: {}, Policy Loss: {}, Reg: {}".format(iters, + time.time() - start_time, + value_loss, + policy_loss, reg)) + iters += 1 + if iters % self.save_freq == 0: + save_path = "Iteration{}.ckpt".format(iters) + self.saver.save(self.sess, self.checkpoint_path + save_path) + + def _file_to_training_data(self, file_name): + with open(file_name, 'r') as file: + data = cPickle.load(file) + history = deque(maxlen=self.history_length) + states = [] + probs = [] + winner = [] + for _ in range(self.history_length): + # Note that 0 is specified, need a more general way like config + history.append([0] * self.board_size ** 2) + # Still, +1 is specified + color = +1 + + for [board, prob] in zip(data.boards, data.probs): + history.append(board) + states.append(self._history2state(history, color)) + probs.append(np.array(prob).reshape(1, self.board_size ** 2 + 1)) + winner.append(np.array(data.winner).reshape(1, 1)) + color *= -1 + states = np.concatenate(states, axis=0) + probs = np.concatenate(probs, axis=0) + winner = np.concatenate(winner, axis=0) + return states, probs, winner + + +if __name__=="__main__": + model = ResNet(board_size=9, action_num=82) + model.train("file", data_path="./data/", batch_size=128, checkpoint_path="./checkpoint/") \ No newline at end of file diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 562dd14..bd3776e 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -76,6 +76,7 @@ if __name__ == '__main__': color = ['b', 'w'] pattern = "[A-Z]{1}[0-9]{1}" + space = re.compile("\s+") size = 9 show = ['.', 'X', 'O'] @@ -83,12 +84,20 @@ if __name__ == '__main__': game_num = 0 try: while True: + start_time = time.time() num = 0 pass_flag = [False, False] print("Start game {}".format(game_num)) # end the game if both palyer chose to pass, or play too much turns while not (pass_flag[0] and pass_flag[1]) and num < size ** 2 * 2: turn = num % 2 + board = player[turn].run_cmd(str(num) + ' show_board') + board = 
eval(board[board.index('['):board.index(']') + 1]) + for i in range(size): + for j in range(size): + print show[board[i * size + j]] + " ", + print "\n", + data.boards.append(board) move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn] + '\n') print role[turn] + " : " + str(move), num += 1 @@ -102,21 +111,18 @@ if __name__ == '__main__': play_or_pass = ' PASS' pass_flag[turn] = True result = player[1 - turn].run_cmd(str(num) + ' play ' + color[turn] + ' ' + play_or_pass + '\n') - board = player[turn].run_cmd(str(num) + ' show_board') - board = eval(board[board.index('['):board.index(']') + 1]) - for i in range(size): - for j in range(size): - print show[board[i * size + j]] + " ", - print "\n", - data.boards.append(board) prob = player[turn].run_cmd(str(num) + ' get_prob') + prob = space.sub(',', prob[prob.index('['):prob.index(']') + 1]) + prob = prob.replace('[,', '[') + prob = prob.replace('],', ']') + prob = eval(prob) data.probs.append(prob) score = player[turn].run_cmd(str(num) + ' get_score') print "Finished : ", score.split(" ")[1] # TODO: generalize the player - if score > 0: + if eval(score.split(" ")[1]) > 0: data.winner = 1 - if score < 0: + if eval(score.split(" ")[1]) < 0: data.winner = -1 player[0].run_cmd(str(num) + ' clear_board') player[1].run_cmd(str(num) + ' clear_board') @@ -127,12 +133,12 @@ if __name__ == '__main__': file_list.sort(key=lambda file: os.path.getmtime(args.result_path + file) if not os.path.isdir( args.result_path + file) else 0) data_num = eval(file_list[-1][:-4]) + 1 - print(file_list) with open("./data/" + str(data_num) + ".pkl", "w") as file: picklestring = cPickle.dump(data, file) data.reset() game_num += 1 - except KeyboardInterrupt: + print("Time {}".format(time.time()-start_time)) + except Exception: subprocess.call(["kill", "-9", str(agent_v0.pid)]) subprocess.call(["kill", "-9", str(agent_v1.pid)]) print "Kill all player, finish all game." 
From a20255249cc0a83c8d6a320f07c8fe2f5b109de4 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Thu, 21 Dec 2017 23:55:31 +0800 Subject: [PATCH 17/18] modify for multi instance --- AlphaGo/play.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 35549dd..a9d3d20 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -28,6 +28,7 @@ if __name__ == '__main__': parser.add_argument("--result_path", type=str, default="./data/") parser.add_argument("--black_weight_path", type=str, default=None) parser.add_argument("--white_weight_path", type=str, default=None) + parser.add_argument("--id", type=int, default=0) args = parser.parse_args() if not os.path.exists(args.result_path): @@ -50,12 +51,15 @@ if __name__ == '__main__': time.sleep(1) # start two different player with different network weights. + black_role_name = 'black' + str(args.id) + white_role_name = 'white' + str(args.id) + agent_v0 = subprocess.Popen( - ['python', '-u', 'player.py', '--role=black', '--checkpoint_path=' + str(args.black_weight_path)], + ['python', '-u', 'player.py', '--role=' + black_role_name, '--checkpoint_path=' + str(args.black_weight_path)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) agent_v1 = subprocess.Popen( - ['python', '-u', 'player.py', '--role=white', '--checkpoint_path=' + str(args.white_weight_path)], + ['python', '-u', 'player.py', '--role=' + white_role_name, '--checkpoint_path=' + str(args.white_weight_path)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) server_list = "" @@ -69,8 +73,8 @@ if __name__ == '__main__': data = Data() player = [None] * 2 - player[0] = Pyro4.Proxy("PYRONAME:black") - player[1] = Pyro4.Proxy("PYRONAME:white") + player[0] = Pyro4.Proxy("PYRONAME:" + black_role_name) + player[1] = Pyro4.Proxy("PYRONAME:" + white_role_name) role = ["BLACK", "WHITE"] color = ['b', 'w'] From a61c1f136a1f2a26578ba7ce72ac36eb7e1a760d 
Mon Sep 17 00:00:00 2001 From: rtz19970824 Date: Fri, 22 Dec 2017 00:04:51 +0800 Subject: [PATCH 18/18] multi-instance support --- AlphaGo/play.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/AlphaGo/play.py b/AlphaGo/play.py index a9d3d20..a8267a7 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -41,14 +41,14 @@ if __name__ == '__main__': raise ValueError("Can't not find the network weights for white player.") # kill the old server - kill_old_server = subprocess.Popen(['killall', 'pyro4-ns']) - print "kill the old pyro4 name server, the return code is : " + str(kill_old_server.wait()) - time.sleep(1) + # kill_old_server = subprocess.Popen(['killall', 'pyro4-ns']) + # print "kill the old pyro4 name server, the return code is : " + str(kill_old_server.wait()) + # time.sleep(1) # start a name server to find the remote object - start_new_server = subprocess.Popen(['pyro4-ns', '&']) - print "Start Name Sever : " + str(start_new_server.pid) # + str(start_new_server.wait()) - time.sleep(1) + # start_new_server = subprocess.Popen(['pyro4-ns', '&']) + # print "Start Name Sever : " + str(start_new_server.pid) # + str(start_new_server.wait()) + # time.sleep(1) # start two different player with different network weights. black_role_name = 'black' + str(args.id) @@ -63,7 +63,7 @@ if __name__ == '__main__': stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) server_list = "" - while ("black" not in server_list) or ("white" not in server_list): + while (black_role_name not in server_list) or (white_role_name not in server_list): server_list = subprocess.check_output(['pyro4-nsc', 'list']) print "Waiting for the server start..." time.sleep(1) @@ -142,11 +142,12 @@ if __name__ == '__main__': data.reset() game_num += 1 - except Exception: + except Exception as e: + print(e) subprocess.call(["kill", "-9", str(agent_v0.pid)]) subprocess.call(["kill", "-9", str(agent_v1.pid)]) print "Kill all player, finish all game." 
subprocess.call(["kill", "-9", str(agent_v0.pid)]) subprocess.call(["kill", "-9", str(agent_v1.pid)]) - print "Kill all player, finish all game." \ No newline at end of file + print "Kill all player, finish all game."