From 75bc2968d27f0e77bd24863d5a887d787bdf4c47 Mon Sep 17 00:00:00 2001 From: Tongzheng Ren Date: Mon, 18 Dec 2017 23:32:41 +0800 Subject: [PATCH 01/36] add a detailed Chinese google coding style for convenience --- .DS_Store | Bin 0 -> 8196 bytes AlphaGo/.DS_Store | Bin 0 -> 6148 bytes README.md | 4 +++- 3 files changed, 3 insertions(+), 1 deletion(-) create mode 100644 .DS_Store create mode 100644 AlphaGo/.DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..99fab83bc04e2a081117249f4d6eb1500ca26cd4 GIT binary patch literal 8196 zcmeHMOHUL*5U%DSvm!FQiF%m0F(H8vh&&F)I4ls{_<*nkA%M&5?yyW8y4RUmo?`am zZ!mf_G4bTVU*OS;SC1b21183^RzHAcRy~S|-J9;OrmL!}`Lw06+ViMb7R4J(a)jrG1vq z`7DPPd{f=P+V?{glmMDw0({K#H^G4!U@_}Af40u;ge5ihhg`|`!f0sd6RE3jXgt?s zBoaoV#aPR1vYkxiMCCB$1S{goE4Vo&FEHjG%T8|5b9b_4=Om+%<54%LGGBe*(E{5^ zu`(@$Vw^cA+C*OwC~Ni}9E^;P4p<{&!-oUb!T9(P%GbvZ4-KQ|+VIWv`qtk5!IQ(M zFT}Ees3#D#%Io9yB}y?;y!^IWCV6+X%+IEIywLN_7g{d1nr-bJZJljhU0vNhUA^s> zE@#b-oRfc44EU5Mna_fhUv{%*YlxJj^Nh{~?lvxXDROtCjJJ=f_L#%GTX=ZVwXj!0 zo6eICG3ZR-x^$%&c!k-GjnIvjs^n!JEgvx7pJSmHd5kWvis+HMm8KgEBk~!{mbfZh zbb}DF#%F2S2?B=}-PGQi@8n#6mZn$~v8^Q!SHWj4LS{-%-lN6jt*?^IMAGh)k_pKp zehrf>P)1!;&dI){eMcz`=*Mqk23FxQRNy(hg17J=_Xf6KN%pSWSrP! zf=rUzBu(a}ciqtqy>b*vB3VI<-O%k!Kz&AwsNr4>Pyhyb2v2|eNe}9(_lf~7?BYJz z!u{f-9JhZKvu}dCkcLT^hWogx>DcJsV%7X+HR|=;sVDUIB=m7XD->Qd)NaU-~&p!-K`?h26a==-g4nLIP3w5 z@8bjD&DfO0b@zl2LY~TgsWWdpesSy=fSK-=FMu(C9*bb-lvTjwzLWzyaV?LCMq}it zFz0WA4+q|{Au13R_}3JWcXy!gY>o|@@9*tJQC=>JazPJS9$vCM`=Q1wVO^rc5?6SR zkF2+3U&Cj8^H^OnGOiMtt`VNM`R;l58FzyfBX4*mtBS{f#Q)mJs!Q(->#(-WGgodN;>uHL zamT9E@BZevVdfb#FM@pP*W!0MvV#7%A#d~9^X~7x<@pDmkLcKnKcZj89G4u^no8a? z8)R%JNkf-D-ryW(nBf(1LjKa5qa00(G34m4IG$_H(fh{5C(z;z$g4Ediu_er4bAUM zBflZ%#IJWqcoHL|iS*Xw^~jTXRge+#r|K<#PU1ge+_S}c1B0(%8yeMndY zQ-`%hT{>9l5rEiXvl_4EV?j8H!_;AIk#}guQi+yo+!4cAI@=THmpZI1S~`q7d>GfW zaVHd`y)%BI?l7svSVsk-0&N8z*uP`h|Id$~|J$VaFDeif_^%W&o!Q&jltXfR>%rh; vuT5Cqv51LZZBZ$#+;*%NvK2pJQRBTtE{Lha+9F$M@k2mmh*ebJuPX2p(V7f* literal 0 HcmV?d00001 diff --git a/README.md b/README.md index 543d237..9c3af16 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,8 @@ Tianshou(天授) is a reinforcement learning platform. The following image illus Please follow [google python coding style](https://google.github.io/styleguide/pyguide.html) +There's a more detailed Chinese version [google python coding style in Chinese](http://www.runoob.com/w3cnote/google-python-styleguide.html) + All files/folders should be named with lower case letters and underline (except specified names such as `AlphaGo`). Try to use full names. Don't use abbrevations for class/function/variable names except common abbrevations (such as `num` for number, `dim` for dimension, `env` for environment, `op` for operation). For now we use `pi` to refer to the policy in examples/ppo_example.py. @@ -73,4 +75,4 @@ HaoshengZou: collaborate mainly on Policy and losses; interfaces and architectur Note: install openai/gym first to run the Atari environment; note that interfaces between modules may not be finalized; the management of placeholders and `feed_dict` may have to be done manually for the time being; -Without preprocessing and other tricks, this example will not train to any meaningful results. Codes should past two tests: individual module test and run through this example code. \ No newline at end of file +Without preprocessing and other tricks, this example will not train to any meaningful results. Codes should past two tests: individual module test and run through this example code. From 6b6c48f122aad3fc415cfbaecbeae449fc8f632d Mon Sep 17 00:00:00 2001 From: Tongzheng Ren Date: Mon, 18 Dec 2017 23:34:32 +0800 Subject: [PATCH 02/36] update gitignore --- .DS_Store | Bin 8196 -> 0 bytes .gitignore | 1 + AlphaGo/.DS_Store | Bin 6148 -> 0 bytes 3 files changed, 1 insertion(+) delete mode 100644 .DS_Store delete mode 100644 AlphaGo/.DS_Store diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 99fab83bc04e2a081117249f4d6eb1500ca26cd4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8196 zcmeHMOHUL*5U%DSvm!FQiF%m0F(H8vh&&F)I4ls{_<*nkA%M&5?yyW8y4RUmo?`am zZ!mf_G4bTVU*OS;SC1b21183^RzHAcRy~S|-J9;OrmL!}`Lw06+ViMb7R4J(a)jrG1vq z`7DPPd{f=P+V?{glmMDw0({K#H^G4!U@_}Af40u;ge5ihhg`|`!f0sd6RE3jXgt?s zBoaoV#aPR1vYkxiMCCB$1S{goE4Vo&FEHjG%T8|5b9b_4=Om+%<54%LGGBe*(E{5^ zu`(@$Vw^cA+C*OwC~Ni}9E^;P4p<{&!-oUb!T9(P%GbvZ4-KQ|+VIWv`qtk5!IQ(M zFT}Ees3#D#%Io9yB}y?;y!^IWCV6+X%+IEIywLN_7g{d1nr-bJZJljhU0vNhUA^s> zE@#b-oRfc44EU5Mna_fhUv{%*YlxJj^Nh{~?lvxXDROtCjJJ=f_L#%GTX=ZVwXj!0 zo6eICG3ZR-x^$%&c!k-GjnIvjs^n!JEgvx7pJSmHd5kWvis+HMm8KgEBk~!{mbfZh zbb}DF#%F2S2?B=}-PGQi@8n#6mZn$~v8^Q!SHWj4LS{-%-lN6jt*?^IMAGh)k_pKp zehrf>P)1!;&dI){eMcz`=*Mqk23FxQRNy(hg17J=_Xf6KN%pSWSrP! zf=rUzBu(a}ciqtqy>b*vB3VI<-O%k!Kz&AwsNr4>Pyhyb2v2|eNe}9(_lf~7?BYJz z!u{f-9JhZKvu}dCkcLT^hWogx>DcJsV%7X+HR|=;sVDUIB=m7XD->Qd)NaU-~&p!-K`?h26a==-g4nLIP3w5 z@8bjD&DfO0b@zl2LY~TgsWWdpesSy=fSK-=FMu(C9*bb-lvTjwzLWzyaV?LCMq}it zFz0WA4+q|{Au13R_}3JWcXy!gY>o|@@9*tJQC=>JazPJS9$vCM`=Q1wVO^rc5?6SR zkF2+3U&Cj8^H^OnGOiMtt`VNM`R;l58FzyfBX4*mtBS{f#Q)mJs!Q(->#(-WGgodN;>uHL zamT9E@BZevVdfb#FM@pP*W!0MvV#7%A#d~9^X~7x<@pDmkLcKnKcZj89G4u^no8a? z8)R%JNkf-D-ryW(nBf(1LjKa5qa00(G34m4IG$_H(fh{5C(z;z$g4Ediu_er4bAUM zBflZ%#IJWqcoHL|iS*Xw^~jTXRge+#r|K<#PU1ge+_S}c1B0(%8yeMndY zQ-`%hT{>9l5rEiXvl_4EV?j8H!_;AIk#}guQi+yo+!4cAI@=THmpZI1S~`q7d>GfW zaVHd`y)%BI?l7svSVsk-0&N8z*uP`h|Id$~|J$VaFDeif_^%W&o!Q&jltXfR>%rh; vuT5Cqv51LZZBZ$#+;*%NvK2pJQRBTtE{Lha+9F$M@k2mmh*ebJuPX2p(V7f* From ea52096713fc42307b3bd5974f7f935edd1c58f5 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Tue, 19 Dec 2017 00:16:21 +0800 Subject: [PATCH 03/36] delete unused parameter of _find_block, and using _find_group to replace _find_block --- AlphaGo/go.py | 13 +++++---- AlphaGo/strategy.py | 66 ++++++++++++++++++++------------------------- 2 files changed, 35 insertions(+), 44 deletions(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 0afc877..752973e 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -13,25 +13,24 @@ Settings of the Go game. NEIGHBOR_OFFSET = [[1, 0], [-1, 0], [0, -1], [0, 1]] - class Go: def __init__(self, **kwargs): self.game = kwargs['game'] - def _bfs(self, vertex, color, block, status, alive_break): + def _bfs(self, vertex, color, block, status): block.append(vertex) status[self.game._flatten(vertex)] = True nei = self._neighbor(vertex) for n in nei: if not status[self.game._flatten(n)]: if self.game.board[self.game._flatten(n)] == color: - self._bfs(n, color, block, status, alive_break) + self._bfs(n, color, block, status) - def _find_block(self, vertex, alive_break=False): + def _find_block(self, vertex): block = [] status = [False] * (self.game.size * self.game.size) color = self.game.board[self.game._flatten(vertex)] - self._bfs(vertex, color, block, status, alive_break) + self._bfs(vertex, color, block, status) for b in block: for n in self._neighbor(b): @@ -42,7 +41,7 @@ class Go: def _find_boarder(self, vertex): block = [] status = [False] * (self.game.size * self.game.size) - self._bfs(vertex, utils.EMPTY, block, status, False) + self._bfs(vertex, utils.EMPTY, block, status) border = [] for b in block: for n in self._neighbor(b): @@ -106,7 +105,7 @@ class Go: nei = self._neighbor(vertex) for n in nei: if self.game.board[self.game._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n, alive_break=True) + can_kill, block = self._find_block(n) if can_kill: for b in block: self.game.board[self.game._flatten(b)] = utils.EMPTY diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index 0bad998..8c12c71 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -23,26 +23,32 @@ class GoEnv: x, y = vertex return (x - 1) * self.game.size + (y - 1) - def _bfs(self, vertex, color, block, status, alive_break): + def _find_group(self, start): + color = self.board[self._flatten(start)] + # print ("color : ", color) + chain = set() + frontier = [start] + has_liberty = False + while frontier: + current = frontier.pop() + # print ("current : ", current) + chain.add(current) + for n in self._neighbor(current): + # print n, self._flatten(n), self.board[self._flatten(n)], + if self.board[self._flatten(n)] == color and not n in chain: + frontier.append(n) + if self.board[self._flatten(n)] == utils.EMPTY: + has_liberty = True + return has_liberty, chain + + def _bfs(self, vertex, color, block, status): block.append(vertex) status[self._flatten(vertex)] = True nei = self._neighbor(vertex) for n in nei: if not status[self._flatten(n)]: if self.board[self._flatten(n)] == color: - self._bfs(n, color, block, status, alive_break) - - def _find_block(self, vertex, alive_break=False): - block = [] - status = [False] * (self.game.size * self.game.size) - color = self.board[self._flatten(vertex)] - self._bfs(vertex, color, block, status, alive_break) - - for b in block: - for n in self._neighbor(b): - if self.board[self._flatten(n)] == utils.EMPTY: - return False, block - return True, block + self._bfs(n, color, block, status) def _is_qi(self, color, vertex): nei = self._neighbor(vertex) @@ -53,14 +59,14 @@ class GoEnv: self.board[self._flatten(vertex)] = color for n in nei: if self.board[self._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n) - if can_kill: + has_liberty, group = self._find_group(n) + if not has_liberty: self.board[self._flatten(vertex)] = utils.EMPTY return True ### avoid suicide - can_kill, block = self._find_block(vertex) - if can_kill: + has_liberty, group = self._find_group(vertex) + if not has_liberty: self.board[self._flatten(vertex)] = utils.EMPTY return False @@ -110,26 +116,11 @@ class GoEnv: nei = self._neighbor(vertex) for n in nei: if self.board[self._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n, alive_break=True) - if can_kill: - for b in block: + has_liberty, group = self._find_group(n) + if not has_liberty: + for b in group: self.board[self._flatten(b)] = utils.EMPTY - def _find_group(self, start): - color = self.board[self._flatten(start)] - # print ("color : ", color) - chain = set() - frontier = [start] - while frontier: - current = frontier.pop() - # print ("current : ", current) - chain.add(current) - for n in self._neighbor(current): - # print n, self._flatten(n), self.board[self._flatten(n)], - if self.board[self._flatten(n)] == color and not n in chain: - frontier.append(n) - return chain - def _is_eye(self, color, vertex): nei = self._neighbor(vertex) cor = self._corner(vertex) @@ -137,7 +128,8 @@ class GoEnv: if False in ncolor: # print "not all neighbors are in same color with us" return False - if set(nei) < self._find_group(nei[0]): + _, group = self._find_group(nei[0]) + if set(nei) < group: # print "all neighbors are in same group and same color with us" return True else: From 6a410384bbcccd65fd204503c266b09fd1fc8f4b Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Tue, 19 Dec 2017 00:47:21 +0800 Subject: [PATCH 04/36] rewrite _is_qi in a more understandable way --- AlphaGo/strategy.py | 46 ++++++++++++++++++--------------------------- 1 file changed, 18 insertions(+), 28 deletions(-) diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index 8c12c71..e00e69d 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -41,37 +41,27 @@ class GoEnv: has_liberty = True return has_liberty, chain - def _bfs(self, vertex, color, block, status): - block.append(vertex) - status[self._flatten(vertex)] = True - nei = self._neighbor(vertex) - for n in nei: - if not status[self._flatten(n)]: - if self.board[self._flatten(n)] == color: - self._bfs(n, color, block, status) - - def _is_qi(self, color, vertex): - nei = self._neighbor(vertex) - for n in nei: - if self.board[self._flatten(n)] == utils.EMPTY: - return True - + def _is_suicide(self, color, vertex): + ### assume that we already take this move self.board[self._flatten(vertex)] = color - for n in nei: - if self.board[self._flatten(n)] == utils.another_color(color): - has_liberty, group = self._find_group(n) - if not has_liberty: - self.board[self._flatten(vertex)] = utils.EMPTY - return True - ### avoid suicide has_liberty, group = self._find_group(vertex) - if not has_liberty: + if has_liberty: + ### this group still has liberty after this move, not suicide self.board[self._flatten(vertex)] = utils.EMPTY return False - - self.board[self._flatten(vertex)] = utils.EMPTY - return True + else: + ### liberty is zero + for n in self._neighbor(vertex): + if self.board[self._flatten(n)] == utils.another_color(color): + opponent_liberty, group = self._find_group(n) + # this move is able to take opponent's stone, not suicide + if not opponent_liberty: + self.board[self._flatten(vertex)] = utils.EMPTY + return False + # not a take, suicide + self.board[self._flatten(vertex)] = utils.EMPTY + return True def _check_global_isomorphous(self, color, vertex): ##backup @@ -174,8 +164,8 @@ class GoEnv: # print(vertex) return False - ### check if it is qi - if not self._is_qi(color, vertex): + ### check if it is suicide + if self._is_suicide(color, vertex): return False ### forbid global isomorphous From 99a617a1f041643c1b0618d9de3b2017ed144b10 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Tue, 19 Dec 2017 11:16:17 +0800 Subject: [PATCH 05/36] rename variable for clarity --- AlphaGo/game.py | 16 ++++----- AlphaGo/go.py | 83 +++++++++++++++++++++++---------------------- AlphaGo/strategy.py | 60 ++++++++++++++++---------------- 3 files changed, 80 insertions(+), 79 deletions(-) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 02ccb27..3b62435 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -29,7 +29,7 @@ class Game: def __init__(self, size=9, komi=6.5, checkpoint_path=None): self.size = size self.komi = komi - self.board = [utils.EMPTY] * (self.size * self.size) + self.board = [utils.EMPTY] * (self.size ** 2) self.history = [] self.latest_boards = deque(maxlen=8) for _ in range(8): @@ -54,7 +54,7 @@ class Game: return (x,y) def clear(self): - self.board = [utils.EMPTY] * (self.size * self.size) + self.board = [utils.EMPTY] * (self.size ** 2) self.history = [] for _ in range(8): self.latest_boards.append(self.board) @@ -66,11 +66,11 @@ class Game: def set_komi(self, k): self.komi = k - def generate_nn_input(self, history, color): + def generate_nn_input(self, latest_boards, color): state = np.zeros([1, self.size, self.size, 17]) for i in range(8): - state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(self.size ** 2)).reshape(self.size, self.size) - state[0, :, :, i + 8] = np.array(np.array(history[i]) == -np.ones(self.size ** 2)).reshape(self.size, self.size) + state[0, :, :, i] = np.array(np.array(latest_boards[i]) == np.ones(self.size ** 2)).reshape(self.size, self.size) + state[0, :, :, i + 8] = np.array(np.array(latest_boards[i]) == -np.ones(self.size ** 2)).reshape(self.size, self.size) if color == utils.BLACK: state[0, :, :, 16] = np.ones([self.size, self.size]) if color == utils.WHITE: @@ -78,9 +78,9 @@ class Game: return state def strategy_gen_move(self, latest_boards, color): - self.simulator.latest_boards = copy.copy(latest_boards) - self.simulator.board = copy.copy(latest_boards[-1]) - nn_input = self.generate_nn_input(self.simulator.latest_boards, color) + self.simulator.simulate_latest_boards = copy.copy(latest_boards) + self.simulator.simulate_board = copy.copy(latest_boards[-1]) + nn_input = self.generate_nn_input(self.simulator.simulate_latest_boards, color) mcts = MCTS(self.simulator, self.evaluator, nn_input, self.size ** 2 + 1, inverse=True, max_step=1) temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 752973e..7b1d3e7 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -28,7 +28,7 @@ class Go: def _find_block(self, vertex): block = [] - status = [False] * (self.game.size * self.game.size) + status = [False] * (self.game.size ** 2) color = self.game.board[self.game._flatten(vertex)] self._bfs(vertex, color, block, status) @@ -40,7 +40,7 @@ class Go: def _find_boarder(self, vertex): block = [] - status = [False] * (self.game.size * self.game.size) + status = [False] * (self.game.size ** 2) self._bfs(vertex, utils.EMPTY, block, status) border = [] for b in block: @@ -141,6 +141,46 @@ class Go: idx = [i for i,x in enumerate(self.game.board) if x == utils.EMPTY ][0] return self.game._deflatten(idx) + def _add_nearby_stones(self, neighbor_vertex_set, start_vertex_x, start_vertex_y, x_diff, y_diff, num_step): + ''' + add the nearby stones around the input vertex + :param neighbor_vertex_set: input list + :param start_vertex_x: x axis of the input vertex + :param start_vertex_y: y axis of the input vertex + :param x_diff: add x axis + :param y_diff: add y axis + :param num_step: number of steps to be added + :return: + ''' + for step in xrange(num_step): + new_neighbor_vertex = (start_vertex_x, start_vertex_y) + if self._in_board(new_neighbor_vertex): + neighbor_vertex_set.append((start_vertex_x, start_vertex_y)) + start_vertex_x += x_diff + start_vertex_y += y_diff + + def _predict_from_nearby(self, vertex, neighbor_step = 3): + ''' + step: the nearby 3 steps is considered + :vertex: position to be estimated + :neighbor_step: how many steps nearby + :return: the nearby positions of the input position + currently the nearby 3*3 grid is returned, altogether 4*8 points involved + ''' + for step in range(1, neighbor_step + 1): # check the stones within the steps in range + neighbor_vertex_set = [] + self._add_nearby_stones(neighbor_vertex_set, vertex[0] - step, vertex[1], 1, 1, neighbor_step) + self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] + step, 1, -1, neighbor_step) + self._add_nearby_stones(neighbor_vertex_set, vertex[0] + step, vertex[1], -1, -1, neighbor_step) + self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] - step, -1, 1, neighbor_step) + color_estimate = 0 + for neighbor_vertex in neighbor_vertex_set: + color_estimate += self.game.board[self.game._flatten(neighbor_vertex)] + if color_estimate > 0: + return utils.BLACK + elif color_estimate < 0: + return utils.WHITE + def get_score(self, is_unknown_estimation = False): ''' is_unknown_estimation: whether use nearby stone to predict the unknown @@ -170,42 +210,3 @@ class Go: self.game.board = _board return score - def _predict_from_nearby(self, vertex, neighbor_step = 3): - ''' - step: the nearby 3 steps is considered - :vertex: position to be estimated - :neighbor_step: how many steps nearby - :return: the nearby positions of the input position - currently the nearby 3*3 grid is returned, altogether 4*8 points involved - ''' - for step in range(1, neighbor_step + 1): # check the stones within the steps in range - neighbor_vertex_set = [] - self._add_nearby_stones(neighbor_vertex_set, vertex[0] - step, vertex[1], 1, 1, neighbor_step) - self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] + step, 1, -1, neighbor_step) - self._add_nearby_stones(neighbor_vertex_set, vertex[0] + step, vertex[1], -1, -1, neighbor_step) - self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] - step, -1, 1, neighbor_step) - color_estimate = 0 - for neighbor_vertex in neighbor_vertex_set: - color_estimate += self.game.board[self.game._flatten(neighbor_vertex)] - if color_estimate > 0: - return utils.BLACK - elif color_estimate < 0: - return utils.WHITE - - def _add_nearby_stones(self, neighbor_vertex_set, start_vertex_x, start_vertex_y, x_diff, y_diff, num_step): - ''' - add the nearby stones around the input vertex - :param neighbor_vertex_set: input list - :param start_vertex_x: x axis of the input vertex - :param start_vertex_y: y axis of the input vertex - :param x_diff: add x axis - :param y_diff: add y axis - :param num_step: number of steps to be added - :return: - ''' - for step in xrange(num_step): - new_neighbor_vertex = (start_vertex_x, start_vertex_y) - if self._in_board(new_neighbor_vertex): - neighbor_vertex_set.append((start_vertex_x, start_vertex_y)) - start_vertex_x += x_diff - start_vertex_y += y_diff diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index e00e69d..fe6bcbf 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -16,15 +16,15 @@ CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] class GoEnv: def __init__(self, **kwargs): self.game = kwargs['game'] - self.board = [utils.EMPTY] * (self.game.size * self.game.size) - self.latest_boards = deque(maxlen=8) + self.simulate_board = [utils.EMPTY] * (self.game.size ** 2) + self.simulate_latest_boards = deque(maxlen=8) - def _flatten(self, vertex): + def simulate_flatten(self, vertex): x, y = vertex return (x - 1) * self.game.size + (y - 1) def _find_group(self, start): - color = self.board[self._flatten(start)] + color = self.simulate_board[self.simulate_flatten(start)] # print ("color : ", color) chain = set() frontier = [start] @@ -35,45 +35,45 @@ class GoEnv: chain.add(current) for n in self._neighbor(current): # print n, self._flatten(n), self.board[self._flatten(n)], - if self.board[self._flatten(n)] == color and not n in chain: + if self.simulate_board[self.simulate_flatten(n)] == color and not n in chain: frontier.append(n) - if self.board[self._flatten(n)] == utils.EMPTY: + if self.simulate_board[self.simulate_flatten(n)] == utils.EMPTY: has_liberty = True return has_liberty, chain def _is_suicide(self, color, vertex): ### assume that we already take this move - self.board[self._flatten(vertex)] = color + self.simulate_board[self.simulate_flatten(vertex)] = color has_liberty, group = self._find_group(vertex) if has_liberty: ### this group still has liberty after this move, not suicide - self.board[self._flatten(vertex)] = utils.EMPTY + self.simulate_board[self.simulate_flatten(vertex)] = utils.EMPTY return False else: ### liberty is zero for n in self._neighbor(vertex): - if self.board[self._flatten(n)] == utils.another_color(color): + if self.simulate_board[self.simulate_flatten(n)] == utils.another_color(color): opponent_liberty, group = self._find_group(n) # this move is able to take opponent's stone, not suicide if not opponent_liberty: - self.board[self._flatten(vertex)] = utils.EMPTY + self.simulate_board[self.simulate_flatten(vertex)] = utils.EMPTY return False # not a take, suicide - self.board[self._flatten(vertex)] = utils.EMPTY + self.simulate_board[self.simulate_flatten(vertex)] = utils.EMPTY return True def _check_global_isomorphous(self, color, vertex): ##backup - _board = copy.copy(self.board) - self.board[self._flatten(vertex)] = color + _board = copy.copy(self.simulate_board) + self.simulate_board[self.simulate_flatten(vertex)] = color self._process_board(color, vertex) - if self.board in self.latest_boards: + if self.simulate_board in self.simulate_latest_boards: res = True else: res = False - self.board = _board + self.simulate_board = _board return res def _in_board(self, vertex): @@ -105,16 +105,16 @@ class GoEnv: def _process_board(self, color, vertex): nei = self._neighbor(vertex) for n in nei: - if self.board[self._flatten(n)] == utils.another_color(color): + if self.simulate_board[self.simulate_flatten(n)] == utils.another_color(color): has_liberty, group = self._find_group(n) if not has_liberty: for b in group: - self.board[self._flatten(b)] = utils.EMPTY + self.simulate_board[self.simulate_flatten(b)] = utils.EMPTY def _is_eye(self, color, vertex): nei = self._neighbor(vertex) cor = self._corner(vertex) - ncolor = {color == self.board[self._flatten(n)] for n in nei} + ncolor = {color == self.simulate_board[self.simulate_flatten(n)] for n in nei} if False in ncolor: # print "not all neighbors are in same color with us" return False @@ -123,7 +123,7 @@ class GoEnv: # print "all neighbors are in same group and same color with us" return True else: - opponent_number = [self.board[self._flatten(c)] for c in cor].count(-color) + opponent_number = [self.simulate_board[self.simulate_flatten(c)] for c in cor].count(-color) opponent_propotion = float(opponent_number) / float(len(cor)) if opponent_propotion < 0.5: # print "few opponents, real eye" @@ -141,7 +141,7 @@ class GoEnv: def simulate_is_valid(self, state, action): # state is the play board, the shape is [1, 9, 9, 17] - if action == self.game.size * self.game.size: + if action == self.game.size ** 2: vertex = (0, 0) else: vertex = (action / self.game.size + 1, action % self.game.size + 1) @@ -149,17 +149,17 @@ class GoEnv: color = utils.BLACK else: color = utils.WHITE - self.latest_boards.clear() + self.simulate_latest_boards.clear() for i in range(8): - self.latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) - self.board = copy.copy(self.latest_boards[-1]) + self.simulate_latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) + self.simulate_board = copy.copy(self.simulate_latest_boards[-1]) ### in board if not self._in_board(vertex): return False ### already have stone - if not self.board[self._flatten(vertex)] == utils.EMPTY: + if not self.simulate_board[self.simulate_flatten(vertex)] == utils.EMPTY: # print(np.array(self.board).reshape(9, 9)) # print(vertex) return False @@ -181,9 +181,9 @@ class GoEnv: if vertex == utils.PASS: return True - id_ = self._flatten(vertex) - if self.board[id_] == utils.EMPTY: - self.board[id_] = color + id_ = self.simulate_flatten(vertex) + if self.simulate_board[id_] == utils.EMPTY: + self.simulate_board[id_] = color return True else: return False @@ -199,11 +199,11 @@ class GoEnv: vertex = (action % self.game.size + 1, action / self.game.size + 1) # print(vertex) # print(self.board) - self.board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() + self.simulate_board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() self.do_move(color, vertex) new_state = np.concatenate( - [state[:, :, :, 1:8], (np.array(self.board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), - state[:, :, :, 9:16], (np.array(self.board) == utils.WHITE).reshape(1, self.game.size, self.game.size, 1), + [state[:, :, :, 1:8], (np.array(self.simulate_board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), + state[:, :, :, 9:16], (np.array(self.simulate_board) == utils.WHITE).reshape(1, self.game.size, self.game.size, 1), np.array(1 - state[:, :, :, -1]).reshape(1, self.game.size, self.game.size, 1)], axis=3) return new_state, 0 From 4440294c121d4fb36d62db703ce8e7d779424b42 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Tue, 19 Dec 2017 12:00:17 +0800 Subject: [PATCH 06/36] fix bug in check_global_isomorphous and refactor _is_suicide again --- AlphaGo/strategy.py | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index fe6bcbf..e9457cf 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -42,33 +42,27 @@ class GoEnv: return has_liberty, chain def _is_suicide(self, color, vertex): - ### assume that we already take this move - self.simulate_board[self.simulate_flatten(vertex)] = color + self.simulate_board[self.simulate_flatten(vertex)] = color # assume that we already take this move + suicide = False has_liberty, group = self._find_group(vertex) - if has_liberty: - ### this group still has liberty after this move, not suicide - self.simulate_board[self.simulate_flatten(vertex)] = utils.EMPTY - return False - else: - ### liberty is zero + if not has_liberty: + suicide = True # no liberty, suicide for n in self._neighbor(vertex): if self.simulate_board[self.simulate_flatten(n)] == utils.another_color(color): opponent_liberty, group = self._find_group(n) - # this move is able to take opponent's stone, not suicide if not opponent_liberty: - self.simulate_board[self.simulate_flatten(vertex)] = utils.EMPTY - return False - # not a take, suicide - self.simulate_board[self.simulate_flatten(vertex)] = utils.EMPTY - return True + suicide = False # this move is able to take opponent's stone, not suicide + + self.simulate_board[self.simulate_flatten(vertex)] = utils.EMPTY # undo this move + return suicide def _check_global_isomorphous(self, color, vertex): ##backup _board = copy.copy(self.simulate_board) self.simulate_board[self.simulate_flatten(vertex)] = color self._process_board(color, vertex) - if self.simulate_board in self.simulate_latest_boards: + if self.simulate_board in self.game.history: res = True else: res = False @@ -140,7 +134,9 @@ class GoEnv: return True def simulate_is_valid(self, state, action): - # state is the play board, the shape is [1, 9, 9, 17] + # State is the play board, the shape is [1, self.game.size, self.game.size, 17]. + # Action is an index + # We need to transfer the (state, action) pair into (color, vertex) pair to simulate the move if action == self.game.size ** 2: vertex = (0, 0) else: @@ -177,7 +173,7 @@ class GoEnv: return True - def do_move(self, color, vertex): + def simulate_do_move(self, color, vertex): if vertex == utils.PASS: return True @@ -200,7 +196,7 @@ class GoEnv: # print(vertex) # print(self.board) self.simulate_board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() - self.do_move(color, vertex) + self.simulate_do_move(color, vertex) new_state = np.concatenate( [state[:, :, :, 1:8], (np.array(self.simulate_board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), state[:, :, :, 9:16], (np.array(self.simulate_board) == utils.WHITE).reshape(1, self.game.size, self.game.size, 1), From 0991fef527e73617114949a406e9da4632865e2d Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Tue, 19 Dec 2017 15:09:46 +0800 Subject: [PATCH 07/36] deflatten debug --- AlphaGo/game.py | 10 +++++----- AlphaGo/strategy.py | 9 +++++++-- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 3b62435..2a82d8e 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -46,12 +46,12 @@ class Game: def _flatten(self, vertex): x, y = vertex - return (y - 1) * self.size + (x - 1) + return (x - 1) * self.size + (y - 1) def _deflatten(self, idx): - x = idx % self.size + 1 - y = idx // self.size + 1 - return (x,y) + x = idx // self.size + 1 + y = idx % self.size + 1 + return (x, y) def clear(self): self.board = [utils.EMPTY] * (self.size ** 2) @@ -88,7 +88,7 @@ class Game: if choice == self.size ** 2: move = utils.PASS else: - move = (choice % self.size + 1, choice / self.size + 1) + move = self._deflatten(choice) return move, prob def do_move(self, color, vertex): diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index e9457cf..112f130 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -23,6 +23,11 @@ class GoEnv: x, y = vertex return (x - 1) * self.game.size + (y - 1) + def simulate_deflatten(self, idx): + x = idx // self.game.size + 1 + y = idx % self.game.size + 1 + return (x, y) + def _find_group(self, start): color = self.simulate_board[self.simulate_flatten(start)] # print ("color : ", color) @@ -140,7 +145,7 @@ class GoEnv: if action == self.game.size ** 2: vertex = (0, 0) else: - vertex = (action / self.game.size + 1, action % self.game.size + 1) + vertex = self.simulate_deflatten(action) if state[0, 0, 0, -1] == utils.BLACK: color = utils.BLACK else: @@ -192,7 +197,7 @@ class GoEnv: if action == self.game.size ** 2: vertex = utils.PASS else: - vertex = (action % self.game.size + 1, action / self.game.size + 1) + vertex = self.simulate_deflatten(action) # print(vertex) # print(self.board) self.simulate_board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() From 4a2d8f0003443f6ca60f78370027914a4e4ff9c4 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Tue, 19 Dec 2017 15:39:31 +0800 Subject: [PATCH 08/36] start a random player if checkpoint path is not specified --- AlphaGo/play.py | 32 +++++++++++++++++++------------- AlphaGo/player.py | 4 +++- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/AlphaGo/play.py b/AlphaGo/play.py index fe6c7ce..7367804 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -1,3 +1,4 @@ +import argparse import subprocess import sys import re @@ -11,14 +12,17 @@ if __name__ == '__main__': Note that, this function requires the installation of the Pyro4 library. """ # TODO : we should set the network path in a more configurable way. - black_weight_path = "./checkpoints" - white_weight_path = "./checkpoints_origin" - if (not os.path.exists(black_weight_path)): - print "Can't not find the network weights for black player." - sys.exit() - if (not os.path.exists(white_weight_path)): - print "Can't not find the network weights for white player." - sys.exit() + parser = argparse.ArgumentParser() + parser.add_argument("--black_weight_path", type=str, default=None) + parser.add_argument("--white_weight_path", type=str, default=None) + args = parser.parse_args() + + # black_weight_path = "./checkpoints" + # white_weight_path = "./checkpoints_origin" + if args.black_weight_path is not None and (not os.path.exists(args.black_weight_path)): + raise ValueError("Can't not find the network weights for black player.") + if args.white_weight_path is not None and (not os.path.exists(args.white_weight_path)): + raise ValueError("Can't not find the network weights for white player.") # kill the old server kill_old_server = subprocess.Popen(['killall', 'pyro4-ns']) @@ -31,14 +35,16 @@ if __name__ == '__main__': time.sleep(1) # start two different player with different network weights. - agent_v0 = subprocess.Popen(['python', '-u', 'player.py', '--role=black'], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - agent_v1 = subprocess.Popen(['python', '-u', 'player.py', '--role=white', '--checkpoint_path=./checkpoints_origin/'], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + agent_v0 = subprocess.Popen(['python', '-u', 'player.py', '--role=black', '--checkpoint_path=' + str(args.black_weight_path)], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + + agent_v1 = subprocess.Popen(['python', '-u', 'player.py', '--role=white', '--checkpoint_path=' + str(args.white_weight_path)], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + server_list = "" while ("black" not in server_list) or ("white" not in server_list): server_list = subprocess.check_output(['pyro4-nsc', 'list']) - print "Waining for the server start..." + print "Waiting for the server start..." time.sleep(1) print server_list print "Start black player at : " + str(agent_v0.pid) diff --git a/AlphaGo/player.py b/AlphaGo/player.py index 8245c38..b468cf3 100644 --- a/AlphaGo/player.py +++ b/AlphaGo/player.py @@ -22,10 +22,12 @@ class Player(object): if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument("--checkpoint_path", type=str, default="./checkpoints/") + parser.add_argument("--checkpoint_path", type=str, default=None) parser.add_argument("--role", type=str, default="unknown") args = parser.parse_args() + if args.checkpoint_path == 'None': + args.checkpoint_path = None game = Game(checkpoint_path=args.checkpoint_path) engine = GTPEngine(game_obj=game, name='tianshou', version=0) From fc8114fe35646673e4b2f4ac00527879878a6ce3 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Tue, 19 Dec 2017 16:51:50 +0800 Subject: [PATCH 09/36] merge flatten and deflatten, rename variable for clarity --- AlphaGo/engine.py | 4 +-- AlphaGo/game.py | 15 ++++++----- AlphaGo/strategy.py | 45 +++++++++++++-------------------- tianshou/core/mcts/evaluator.py | 4 +-- tianshou/core/mcts/mcts.py | 2 +- 5 files changed, 31 insertions(+), 39 deletions(-) diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index 1f9af85..1ee8833 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -167,7 +167,7 @@ class GTPEngine(): move = self._parse_move(args) if move: color, vertex = move - res = self._game.do_move(color, vertex) + res = self._game.play_move(color, vertex) if res: return None, True else: @@ -177,7 +177,7 @@ class GTPEngine(): def cmd_genmove(self, args, **kwargs): color = self._parse_color(args) if color: - move = self._game.gen_move(color) + move = self._game.think_play_move(color) return self._vertex_point2string(move), True else: return 'unknown player', False diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 2a82d8e..d0cb91c 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -77,7 +77,7 @@ class Game: state[0, :, :, 16] = np.zeros([self.size, self.size]) return state - def strategy_gen_move(self, latest_boards, color): + def think(self, latest_boards, color): self.simulator.simulate_latest_boards = copy.copy(latest_boards) self.simulator.simulate_board = copy.copy(latest_boards[-1]) nn_input = self.generate_nn_input(self.simulator.simulate_latest_boards, color) @@ -91,17 +91,18 @@ class Game: move = self._deflatten(choice) return move, prob - def do_move(self, color, vertex): + def play_move(self, color, vertex): + # this function can be called directly to play the opponent's move if vertex == utils.PASS: return True res = self.executor.do_move(color, vertex) return res - def gen_move(self, color): - # move = self.strategy.gen_move(color) - # return move - move, self.prob = self.strategy_gen_move(self.latest_boards, color) - self.do_move(color, move) + def think_play_move(self, color): + # although we dont need to return self.prob, however it is needed for neural network training + move, self.prob = self.think(self.latest_boards, color) + # play the move immediately + self.play_move(color, move) return move def status2symbol(self, s): diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index 112f130..af017b1 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -10,7 +10,7 @@ import tensorflow as tf from collections import deque from tianshou.core.mcts.mcts import MCTS -DELTA = [[1, 0], [-1, 0], [0, -1], [0, 1]] +NEIGHBOR_OFFSET = [[1, 0], [-1, 0], [0, -1], [0, 1]] CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] class GoEnv: @@ -19,17 +19,8 @@ class GoEnv: self.simulate_board = [utils.EMPTY] * (self.game.size ** 2) self.simulate_latest_boards = deque(maxlen=8) - def simulate_flatten(self, vertex): - x, y = vertex - return (x - 1) * self.game.size + (y - 1) - - def simulate_deflatten(self, idx): - x = idx // self.game.size + 1 - y = idx % self.game.size + 1 - return (x, y) - def _find_group(self, start): - color = self.simulate_board[self.simulate_flatten(start)] + color = self.simulate_board[self.game._flatten(start)] # print ("color : ", color) chain = set() frontier = [start] @@ -40,32 +31,32 @@ class GoEnv: chain.add(current) for n in self._neighbor(current): # print n, self._flatten(n), self.board[self._flatten(n)], - if self.simulate_board[self.simulate_flatten(n)] == color and not n in chain: + if self.simulate_board[self.game._flatten(n)] == color and not n in chain: frontier.append(n) - if self.simulate_board[self.simulate_flatten(n)] == utils.EMPTY: + if self.simulate_board[self.game._flatten(n)] == utils.EMPTY: has_liberty = True return has_liberty, chain def _is_suicide(self, color, vertex): - self.simulate_board[self.simulate_flatten(vertex)] = color # assume that we already take this move + self.simulate_board[self.game._flatten(vertex)] = color # assume that we already take this move suicide = False has_liberty, group = self._find_group(vertex) if not has_liberty: suicide = True # no liberty, suicide for n in self._neighbor(vertex): - if self.simulate_board[self.simulate_flatten(n)] == utils.another_color(color): + if self.simulate_board[self.game._flatten(n)] == utils.another_color(color): opponent_liberty, group = self._find_group(n) if not opponent_liberty: suicide = False # this move is able to take opponent's stone, not suicide - self.simulate_board[self.simulate_flatten(vertex)] = utils.EMPTY # undo this move + self.simulate_board[self.game._flatten(vertex)] = utils.EMPTY # undo this move return suicide def _check_global_isomorphous(self, color, vertex): ##backup _board = copy.copy(self.simulate_board) - self.simulate_board[self.simulate_flatten(vertex)] = color + self.simulate_board[self.game._flatten(vertex)] = color self._process_board(color, vertex) if self.simulate_board in self.game.history: res = True @@ -84,7 +75,7 @@ class GoEnv: def _neighbor(self, vertex): x, y = vertex nei = [] - for d in DELTA: + for d in NEIGHBOR_OFFSET: _x = x + d[0] _y = y + d[1] if self._in_board((_x, _y)): @@ -104,16 +95,16 @@ class GoEnv: def _process_board(self, color, vertex): nei = self._neighbor(vertex) for n in nei: - if self.simulate_board[self.simulate_flatten(n)] == utils.another_color(color): + if self.simulate_board[self.game._flatten(n)] == utils.another_color(color): has_liberty, group = self._find_group(n) if not has_liberty: for b in group: - self.simulate_board[self.simulate_flatten(b)] = utils.EMPTY + self.simulate_board[self.game._flatten(b)] = utils.EMPTY def _is_eye(self, color, vertex): nei = self._neighbor(vertex) cor = self._corner(vertex) - ncolor = {color == self.simulate_board[self.simulate_flatten(n)] for n in nei} + ncolor = {color == self.simulate_board[self.game._flatten(n)] for n in nei} if False in ncolor: # print "not all neighbors are in same color with us" return False @@ -122,7 +113,7 @@ class GoEnv: # print "all neighbors are in same group and same color with us" return True else: - opponent_number = [self.simulate_board[self.simulate_flatten(c)] for c in cor].count(-color) + opponent_number = [self.simulate_board[self.game._flatten(c)] for c in cor].count(-color) opponent_propotion = float(opponent_number) / float(len(cor)) if opponent_propotion < 0.5: # print "few opponents, real eye" @@ -145,7 +136,7 @@ class GoEnv: if action == self.game.size ** 2: vertex = (0, 0) else: - vertex = self.simulate_deflatten(action) + vertex = self.game._deflatten(action) if state[0, 0, 0, -1] == utils.BLACK: color = utils.BLACK else: @@ -160,7 +151,7 @@ class GoEnv: return False ### already have stone - if not self.simulate_board[self.simulate_flatten(vertex)] == utils.EMPTY: + if not self.simulate_board[self.game._flatten(vertex)] == utils.EMPTY: # print(np.array(self.board).reshape(9, 9)) # print(vertex) return False @@ -182,14 +173,14 @@ class GoEnv: if vertex == utils.PASS: return True - id_ = self.simulate_flatten(vertex) + id_ = self.game._flatten(vertex) if self.simulate_board[id_] == utils.EMPTY: self.simulate_board[id_] = color return True else: return False - def step_forward(self, state, action): + def simulate_step_forward(self, state, action): if state[0, 0, 0, -1] == 1: color = utils.BLACK else: @@ -197,7 +188,7 @@ class GoEnv: if action == self.game.size ** 2: vertex = utils.PASS else: - vertex = self.simulate_deflatten(action) + vertex = self.game._deflatten(action) # print(vertex) # print(self.board) self.simulate_board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() diff --git a/tianshou/core/mcts/evaluator.py b/tianshou/core/mcts/evaluator.py index 9c4ee8e..a1f9456 100644 --- a/tianshou/core/mcts/evaluator.py +++ b/tianshou/core/mcts/evaluator.py @@ -19,10 +19,10 @@ class rollout_policy(evaluator): # TODO: prior for rollout policy total_reward = 0. action = np.random.randint(0, self.action_num) - state, reward = self.env.step_forward(state, action) + state, reward = self.env.simulate_step_forward(state, action) total_reward += reward while state is not None: action = np.random.randint(0, self.action_num) - state, reward = self.env.step_forward(state, action) + state, reward = self.env.simulate_step_forward(state, action) total_reward += reward return np.ones([self.action_num])/self.action_num, total_reward diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 979e994..b58c105 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -116,7 +116,7 @@ class ActionNode(object): self.next_state = tuple2list(self.next_state) def selection(self, simulator): - self.next_state, self.reward = simulator.step_forward(self.parent.state, self.action) + self.next_state, self.reward = simulator.simulate_step_forward(self.parent.state, self.action) self.origin_state = self.next_state self.state_type = type(self.next_state) self.type_conversion_to_tuple() From 232204d7970ef261c8f99394f2cc631a674a17a0 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Tue, 19 Dec 2017 22:57:38 +0800 Subject: [PATCH 10/36] fix the copy bug in check_global_isomorphous; refactor code to eliminate side effect --- AlphaGo/go.py | 36 ++++++------- AlphaGo/strategy.py | 104 +++++++++++++++++-------------------- tianshou/core/mcts/mcts.py | 3 +- 3 files changed, 67 insertions(+), 76 deletions(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 7b1d3e7..8e3518d 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -72,18 +72,14 @@ class Go: self.game.board[self.game._flatten(vertex)] = utils.EMPTY return True - def _check_global_isomorphous(self, color, vertex): - ##backup - _board = copy.copy(self.game.board) - self.game.board[self.game._flatten(vertex)] = color - self._process_board(color, vertex) - if self.game.board in self.game.history: - res = True - else: - res = False - - self.game.board = _board - return res + def _check_global_isomorphous(self, history_boards, current_board, color, vertex): + repeat = False + next_board = copy.copy(current_board) + next_board[self.game._flatten(vertex)] = color + self._process_board(next_board, color, vertex) + if next_board in history_boards: + repeat = True + return repeat def _in_board(self, vertex): x, y = vertex @@ -101,38 +97,38 @@ class Go: nei.append((_x, _y)) return nei - def _process_board(self, color, vertex): + def _process_board(self, current_board, color, vertex): nei = self._neighbor(vertex) for n in nei: - if self.game.board[self.game._flatten(n)] == utils.another_color(color): + if current_board[self.game._flatten(n)] == utils.another_color(color): can_kill, block = self._find_block(n) if can_kill: for b in block: - self.game.board[self.game._flatten(b)] = utils.EMPTY + current_board[self.game._flatten(b)] = utils.EMPTY - def is_valid(self, color, vertex): + def is_valid(self, history_boards, current_board, color, vertex): ### in board if not self._in_board(vertex): return False ### already have stone - if not self.game.board[self.game._flatten(vertex)] == utils.EMPTY: + if not current_board[self.game._flatten(vertex)] == utils.EMPTY: return False ### check if it is qi if not self._is_qi(color, vertex): return False - if self._check_global_isomorphous(color, vertex): + if self._check_global_isomorphous(history_boards, current_board, color, vertex): return False return True def do_move(self, color, vertex): - if not self.is_valid(color, vertex): + if not self.is_valid(self.game.history, self.game.board, color, vertex): return False self.game.board[self.game._flatten(vertex)] = color - self._process_board(color, vertex) + self._process_board(self.game.board, color, vertex) self.game.history.append(copy.copy(self.game.board)) self.game.latest_boards.append(copy.copy(self.game.board)) return True diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index af017b1..07555e9 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -19,52 +19,47 @@ class GoEnv: self.simulate_board = [utils.EMPTY] * (self.game.size ** 2) self.simulate_latest_boards = deque(maxlen=8) - def _find_group(self, start): - color = self.simulate_board[self.game._flatten(start)] + def _find_group(self, current_board, vertex): + color = current_board[self.game._flatten(vertex)] # print ("color : ", color) chain = set() - frontier = [start] + frontier = [vertex] has_liberty = False while frontier: current = frontier.pop() # print ("current : ", current) chain.add(current) for n in self._neighbor(current): - # print n, self._flatten(n), self.board[self._flatten(n)], - if self.simulate_board[self.game._flatten(n)] == color and not n in chain: + if current_board[self.game._flatten(n)] == color and not n in chain: frontier.append(n) - if self.simulate_board[self.game._flatten(n)] == utils.EMPTY: + if current_board[self.game._flatten(n)] == utils.EMPTY: has_liberty = True return has_liberty, chain - def _is_suicide(self, color, vertex): - self.simulate_board[self.game._flatten(vertex)] = color # assume that we already take this move + def _is_suicide(self, current_board, color, vertex): + current_board[self.game._flatten(vertex)] = color # assume that we already take this move suicide = False - has_liberty, group = self._find_group(vertex) + has_liberty, group = self._find_group(current_board, vertex) if not has_liberty: suicide = True # no liberty, suicide for n in self._neighbor(vertex): - if self.simulate_board[self.game._flatten(n)] == utils.another_color(color): - opponent_liberty, group = self._find_group(n) + if current_board[self.game._flatten(n)] == utils.another_color(color): + opponent_liberty, group = self._find_group(current_board, n) if not opponent_liberty: suicide = False # this move is able to take opponent's stone, not suicide - self.simulate_board[self.game._flatten(vertex)] = utils.EMPTY # undo this move + current_board[self.game._flatten(vertex)] = utils.EMPTY # undo this move return suicide - def _check_global_isomorphous(self, color, vertex): - ##backup - _board = copy.copy(self.simulate_board) - self.simulate_board[self.game._flatten(vertex)] = color - self._process_board(color, vertex) - if self.simulate_board in self.game.history: - res = True - else: - res = False - - self.simulate_board = _board - return res + def _check_global_isomorphous(self, history_boards, current_board, color, vertex): + repeat = False + next_board = copy.copy(current_board) + next_board[self.game._flatten(vertex)] = color + self._process_board(next_board, color, vertex) + if next_board in history_boards: + repeat = True + return repeat def _in_board(self, vertex): x, y = vertex @@ -92,28 +87,28 @@ class GoEnv: corner.append((_x, _y)) return corner - def _process_board(self, color, vertex): + def _process_board(self, current_board, color, vertex): nei = self._neighbor(vertex) for n in nei: - if self.simulate_board[self.game._flatten(n)] == utils.another_color(color): - has_liberty, group = self._find_group(n) + if current_board[self.game._flatten(n)] == utils.another_color(color): + has_liberty, group = self._find_group(current_board, n) if not has_liberty: for b in group: - self.simulate_board[self.game._flatten(b)] = utils.EMPTY + current_board[self.game._flatten(b)] = utils.EMPTY - def _is_eye(self, color, vertex): + def _is_eye(self, current_board, color, vertex): nei = self._neighbor(vertex) cor = self._corner(vertex) - ncolor = {color == self.simulate_board[self.game._flatten(n)] for n in nei} + ncolor = {color == current_board[self.game._flatten(n)] for n in nei} if False in ncolor: # print "not all neighbors are in same color with us" return False - _, group = self._find_group(nei[0]) + _, group = self._find_group(current_board, nei[0]) if set(nei) < group: # print "all neighbors are in same group and same color with us" return True else: - opponent_number = [self.simulate_board[self.game._flatten(c)] for c in cor].count(-color) + opponent_number = [current_board[self.game._flatten(c)] for c in cor].count(-color) opponent_propotion = float(opponent_number) / float(len(cor)) if opponent_propotion < 0.5: # print "few opponents, real eye" @@ -122,49 +117,54 @@ class GoEnv: # print "many opponents, fake eye" return False - def knowledge_prunning(self, color, vertex): + def knowledge_prunning(self, current_board, color, vertex): ### check if it is an eye of yourself ### assumptions : notice that this judgement requires that the state is an endgame - if self._is_eye(color, vertex): + if self._is_eye(current_board, color, vertex): return False return True - def simulate_is_valid(self, state, action): - # State is the play board, the shape is [1, self.game.size, self.game.size, 17]. - # Action is an index + def sa2cv(self, state, action): + # State is the play board, the shape is [1, self.game.size, self.game.size, 17], action is an index. # We need to transfer the (state, action) pair into (color, vertex) pair to simulate the move - if action == self.game.size ** 2: - vertex = (0, 0) - else: - vertex = self.game._deflatten(action) if state[0, 0, 0, -1] == utils.BLACK: color = utils.BLACK else: color = utils.WHITE + if action == self.game.size ** 2: + vertex = (0, 0) + else: + vertex = self.game._deflatten(action) + return color, vertex + + def simulate_is_valid(self, history_boards, current_board, state, action): + # initialize simulate_latest_boards and simulate_board from state self.simulate_latest_boards.clear() for i in range(8): self.simulate_latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) self.simulate_board = copy.copy(self.simulate_latest_boards[-1]) + color, vertex = self.sa2cv(state, action) + ### in board if not self._in_board(vertex): return False ### already have stone - if not self.simulate_board[self.game._flatten(vertex)] == utils.EMPTY: + if not current_board[self.game._flatten(vertex)] == utils.EMPTY: # print(np.array(self.board).reshape(9, 9)) # print(vertex) return False ### check if it is suicide - if self._is_suicide(color, vertex): + if self._is_suicide(current_board, color, vertex): return False ### forbid global isomorphous - if self._check_global_isomorphous(color, vertex): + if self._check_global_isomorphous(history_boards, current_board, color, vertex): return False - if not self.knowledge_prunning(color, vertex): + if not self.knowledge_prunning(current_board, color, vertex): return False return True @@ -181,17 +181,11 @@ class GoEnv: return False def simulate_step_forward(self, state, action): - if state[0, 0, 0, -1] == 1: - color = utils.BLACK - else: - color = utils.WHITE - if action == self.game.size ** 2: - vertex = utils.PASS - else: - vertex = self.game._deflatten(action) - # print(vertex) - # print(self.board) + # initialize the simulate_board from state self.simulate_board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() + + color, vertex = self.sa2cv(state, action) + self.simulate_do_move(color, vertex) new_state = np.concatenate( [state[:, :, :, 1:8], (np.array(self.simulate_board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index b58c105..12fc85d 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -75,7 +75,8 @@ class UCTNode(MCTSNode): start_time = time.time() self.mask = [] for act in range(self.action_num - 1): - if not simulator.simulate_is_valid(self.state, act): + if not simulator.simulate_is_valid( + simulator.simulate_latest_boards, simulator.simulate_board, self.state, act): self.mask.append(act) self.ucb[act] = -float("Inf") else: From 2a9d949510f3e2032e868fa64bb0d6efc7624fc3 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Wed, 20 Dec 2017 00:16:24 +0800 Subject: [PATCH 11/36] rearrange the sequence of functions of Go and GoEnv before merging --- AlphaGo/go.py | 125 ++++++++++++++++++++------------------------ AlphaGo/strategy.py | 70 ++++++++++++------------- 2 files changed, 91 insertions(+), 104 deletions(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 8e3518d..37d8339 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -17,70 +17,6 @@ class Go: def __init__(self, **kwargs): self.game = kwargs['game'] - def _bfs(self, vertex, color, block, status): - block.append(vertex) - status[self.game._flatten(vertex)] = True - nei = self._neighbor(vertex) - for n in nei: - if not status[self.game._flatten(n)]: - if self.game.board[self.game._flatten(n)] == color: - self._bfs(n, color, block, status) - - def _find_block(self, vertex): - block = [] - status = [False] * (self.game.size ** 2) - color = self.game.board[self.game._flatten(vertex)] - self._bfs(vertex, color, block, status) - - for b in block: - for n in self._neighbor(b): - if self.game.board[self.game._flatten(n)] == utils.EMPTY: - return False, block - return True, block - - def _find_boarder(self, vertex): - block = [] - status = [False] * (self.game.size ** 2) - self._bfs(vertex, utils.EMPTY, block, status) - border = [] - for b in block: - for n in self._neighbor(b): - if not (n in block): - border.append(n) - return border - - def _is_qi(self, color, vertex): - nei = self._neighbor(vertex) - for n in nei: - if self.game.board[self.game._flatten(n)] == utils.EMPTY: - return True - - self.game.board[self.game._flatten(vertex)] = color - for n in nei: - if self.game.board[self.game._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n) - if can_kill: - self.game.board[self.game._flatten(vertex)] = utils.EMPTY - return True - - ### can not suicide - can_kill, block = self._find_block(vertex) - if can_kill: - self.game.board[self.game._flatten(vertex)] = utils.EMPTY - return False - - self.game.board[self.game._flatten(vertex)] = utils.EMPTY - return True - - def _check_global_isomorphous(self, history_boards, current_board, color, vertex): - repeat = False - next_board = copy.copy(current_board) - next_board[self.game._flatten(vertex)] = color - self._process_board(next_board, color, vertex) - if next_board in history_boards: - repeat = True - return repeat - def _in_board(self, vertex): x, y = vertex if x < 1 or x > self.game.size: return False @@ -97,15 +33,57 @@ class Go: nei.append((_x, _y)) return nei + def _find_group(self, current_board, vertex): + color = current_board[self.game._flatten(vertex)] + # print ("color : ", color) + chain = set() + frontier = [vertex] + has_liberty = False + while frontier: + current = frontier.pop() + # print ("current : ", current) + chain.add(current) + for n in self._neighbor(current): + if current_board[self.game._flatten(n)] == color and not n in chain: + frontier.append(n) + if current_board[self.game._flatten(n)] == utils.EMPTY: + has_liberty = True + return has_liberty, chain + + def _is_suicide(self, current_board, color, vertex): + current_board[self.game._flatten(vertex)] = color # assume that we already take this move + suicide = False + + has_liberty, group = self._find_group(current_board, vertex) + if not has_liberty: + suicide = True # no liberty, suicide + for n in self._neighbor(vertex): + if current_board[self.game._flatten(n)] == utils.another_color(color): + opponent_liberty, group = self._find_group(current_board, n) + if not opponent_liberty: + suicide = False # this move is able to take opponent's stone, not suicide + + current_board[self.game._flatten(vertex)] = utils.EMPTY # undo this move + return suicide + def _process_board(self, current_board, color, vertex): nei = self._neighbor(vertex) for n in nei: if current_board[self.game._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n) - if can_kill: - for b in block: + has_liberty, group = self._find_group(current_board, n) + if not has_liberty: + for b in group: current_board[self.game._flatten(b)] = utils.EMPTY + def _check_global_isomorphous(self, history_boards, current_board, color, vertex): + repeat = False + next_board = copy.copy(current_board) + next_board[self.game._flatten(vertex)] = color + self._process_board(next_board, color, vertex) + if next_board in history_boards: + repeat = True + return repeat + def is_valid(self, history_boards, current_board, color, vertex): ### in board if not self._in_board(vertex): @@ -115,8 +93,8 @@ class Go: if not current_board[self.game._flatten(vertex)] == utils.EMPTY: return False - ### check if it is qi - if not self._is_qi(color, vertex): + ### check if it is suicide + if self._is_suicide(current_board, color, vertex): return False if self._check_global_isomorphous(history_boards, current_board, color, vertex): @@ -137,6 +115,15 @@ class Go: idx = [i for i,x in enumerate(self.game.board) if x == utils.EMPTY ][0] return self.game._deflatten(idx) + def _find_boarder(self, vertex): + _, group = self._find_group(self.game.board, vertex) + border = [] + for b in group: + for n in self._neighbor(b): + if not (n in group): + border.append(n) + return border + def _add_nearby_stones(self, neighbor_vertex_set, start_vertex_x, start_vertex_y, x_diff, y_diff, num_step): ''' add the nearby stones around the input vertex diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index 07555e9..9ebd421 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -19,6 +19,32 @@ class GoEnv: self.simulate_board = [utils.EMPTY] * (self.game.size ** 2) self.simulate_latest_boards = deque(maxlen=8) + def _in_board(self, vertex): + x, y = vertex + if x < 1 or x > self.game.size: return False + if y < 1 or y > self.game.size: return False + return True + + def _neighbor(self, vertex): + x, y = vertex + nei = [] + for d in NEIGHBOR_OFFSET: + _x = x + d[0] + _y = y + d[1] + if self._in_board((_x, _y)): + nei.append((_x, _y)) + return nei + + def _corner(self, vertex): + x, y = vertex + corner = [] + for d in CORNER_OFFSET: + _x = x + d[0] + _y = y + d[1] + if self._in_board((_x, _y)): + corner.append((_x, _y)) + return corner + def _find_group(self, current_board, vertex): color = current_board[self.game._flatten(vertex)] # print ("color : ", color) @@ -52,41 +78,6 @@ class GoEnv: current_board[self.game._flatten(vertex)] = utils.EMPTY # undo this move return suicide - def _check_global_isomorphous(self, history_boards, current_board, color, vertex): - repeat = False - next_board = copy.copy(current_board) - next_board[self.game._flatten(vertex)] = color - self._process_board(next_board, color, vertex) - if next_board in history_boards: - repeat = True - return repeat - - def _in_board(self, vertex): - x, y = vertex - if x < 1 or x > self.game.size: return False - if y < 1 or y > self.game.size: return False - return True - - def _neighbor(self, vertex): - x, y = vertex - nei = [] - for d in NEIGHBOR_OFFSET: - _x = x + d[0] - _y = y + d[1] - if self._in_board((_x, _y)): - nei.append((_x, _y)) - return nei - - def _corner(self, vertex): - x, y = vertex - corner = [] - for d in CORNER_OFFSET: - _x = x + d[0] - _y = y + d[1] - if self._in_board((_x, _y)): - corner.append((_x, _y)) - return corner - def _process_board(self, current_board, color, vertex): nei = self._neighbor(vertex) for n in nei: @@ -96,6 +87,15 @@ class GoEnv: for b in group: current_board[self.game._flatten(b)] = utils.EMPTY + def _check_global_isomorphous(self, history_boards, current_board, color, vertex): + repeat = False + next_board = copy.copy(current_board) + next_board[self.game._flatten(vertex)] = color + self._process_board(next_board, color, vertex) + if next_board in history_boards: + repeat = True + return repeat + def _is_eye(self, current_board, color, vertex): nei = self._neighbor(vertex) cor = self._corner(vertex) From d1af137686355b347f7c5b6b7fd117969b9a04cc Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Wed, 20 Dec 2017 00:43:31 +0800 Subject: [PATCH 12/36] final version before merge Go and GoEnv --- AlphaGo/engine.py | 2 +- AlphaGo/game.py | 3 ++- AlphaGo/go.py | 8 ++++---- AlphaGo/self-play.py | 2 +- AlphaGo/strategy.py | 38 +++++++++++++++++++++----------------- 5 files changed, 29 insertions(+), 24 deletions(-) diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index 1ee8833..d11635a 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -183,7 +183,7 @@ class GTPEngine(): return 'unknown player', False def cmd_get_score(self, args, **kwargs): - return self._game.executor.get_score(), None + return self._game.executor.executor_get_score(), None def cmd_show_board(self, args, **kwargs): return self._game.board, True diff --git a/AlphaGo/game.py b/AlphaGo/game.py index d0cb91c..af4ef57 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -78,6 +78,7 @@ class Game: return state def think(self, latest_boards, color): + # TODO : using copy is right, or should we change to deepcopy? self.simulator.simulate_latest_boards = copy.copy(latest_boards) self.simulator.simulate_board = copy.copy(latest_boards[-1]) nn_input = self.generate_nn_input(self.simulator.simulate_latest_boards, color) @@ -95,7 +96,7 @@ class Game: # this function can be called directly to play the opponent's move if vertex == utils.PASS: return True - res = self.executor.do_move(color, vertex) + res = self.executor.executor_do_move(color, vertex) return res def think_play_move(self, color): diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 37d8339..108c9bd 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -84,7 +84,7 @@ class Go: repeat = True return repeat - def is_valid(self, history_boards, current_board, color, vertex): + def _is_valid(self, history_boards, current_board, color, vertex): ### in board if not self._in_board(vertex): return False @@ -102,8 +102,8 @@ class Go: return True - def do_move(self, color, vertex): - if not self.is_valid(self.game.history, self.game.board, color, vertex): + def executor_do_move(self, color, vertex): + if not self._is_valid(self.game.history, self.game.board, color, vertex): return False self.game.board[self.game._flatten(vertex)] = color self._process_board(self.game.board, color, vertex) @@ -164,7 +164,7 @@ class Go: elif color_estimate < 0: return utils.WHITE - def get_score(self, is_unknown_estimation = False): + def executor_get_score(self, is_unknown_estimation = False): ''' is_unknown_estimation: whether use nearby stone to predict the unknown return score from BLACK perspective. diff --git a/AlphaGo/self-play.py b/AlphaGo/self-play.py index 98ccf84..296112b 100644 --- a/AlphaGo/self-play.py +++ b/AlphaGo/self-play.py @@ -79,7 +79,7 @@ while True: prob.append(np.array(game.prob).reshape(-1, game.size ** 2 + 1)) print("Finished") print("\n") - score = game.executor.get_score(True) + score = game.executor.executor_get_score(True) if score > 0: winner = utils.BLACK else: diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py index 9ebd421..1e5fd02 100644 --- a/AlphaGo/strategy.py +++ b/AlphaGo/strategy.py @@ -117,14 +117,14 @@ class GoEnv: # print "many opponents, fake eye" return False - def knowledge_prunning(self, current_board, color, vertex): + def _knowledge_prunning(self, current_board, color, vertex): ### check if it is an eye of yourself ### assumptions : notice that this judgement requires that the state is an endgame if self._is_eye(current_board, color, vertex): return False return True - def sa2cv(self, state, action): + def _sa2cv(self, state, action): # State is the play board, the shape is [1, self.game.size, self.game.size, 17], action is an index. # We need to transfer the (state, action) pair into (color, vertex) pair to simulate the move if state[0, 0, 0, -1] == utils.BLACK: @@ -137,23 +137,13 @@ class GoEnv: vertex = self.game._deflatten(action) return color, vertex - def simulate_is_valid(self, history_boards, current_board, state, action): - # initialize simulate_latest_boards and simulate_board from state - self.simulate_latest_boards.clear() - for i in range(8): - self.simulate_latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) - self.simulate_board = copy.copy(self.simulate_latest_boards[-1]) - - color, vertex = self.sa2cv(state, action) - + def _is_valid(self, history_boards, current_board, color, vertex): ### in board if not self._in_board(vertex): return False ### already have stone if not current_board[self.game._flatten(vertex)] == utils.EMPTY: - # print(np.array(self.board).reshape(9, 9)) - # print(vertex) return False ### check if it is suicide @@ -164,12 +154,26 @@ class GoEnv: if self._check_global_isomorphous(history_boards, current_board, color, vertex): return False - if not self.knowledge_prunning(current_board, color, vertex): + return True + + def simulate_is_valid(self, history_boards, current_board, state, action): + # initialize simulate_latest_boards and simulate_board from state + self.simulate_latest_boards.clear() + for i in range(8): + self.simulate_latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) + self.simulate_board = copy.copy(self.simulate_latest_boards[-1]) + + color, vertex = self._sa2cv(state, action) + + if not self._is_valid(history_boards, current_board, color, vertex): + return False + + if not self._knowledge_prunning(current_board, color, vertex): return False return True - def simulate_do_move(self, color, vertex): + def _do_move(self, color, vertex): if vertex == utils.PASS: return True @@ -184,9 +188,9 @@ class GoEnv: # initialize the simulate_board from state self.simulate_board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() - color, vertex = self.sa2cv(state, action) + color, vertex = self._sa2cv(state, action) - self.simulate_do_move(color, vertex) + self._do_move(color, vertex) new_state = np.concatenate( [state[:, :, :, 1:8], (np.array(self.simulate_board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), state[:, :, :, 9:16], (np.array(self.simulate_board) == utils.WHITE).reshape(1, self.game.size, self.game.size, 1), From c2b46c44e7dce0ef4c73e230aaed07c91af32e0c Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Wed, 20 Dec 2017 01:14:05 +0800 Subject: [PATCH 13/36] merge Go and GoEnv finallygit status! --- AlphaGo/engine.py | 2 +- AlphaGo/game.py | 23 ++--- AlphaGo/go.py | 99 ++++++++++++++++++++- AlphaGo/self-play.py | 2 +- AlphaGo/strategy.py | 199 ------------------------------------------- 5 files changed, 108 insertions(+), 217 deletions(-) delete mode 100644 AlphaGo/strategy.py diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index d11635a..9948176 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -183,7 +183,7 @@ class GTPEngine(): return 'unknown player', False def cmd_get_score(self, args, **kwargs): - return self._game.executor.executor_get_score(), None + return self._game.game_engine.executor_get_score(), None def cmd_show_board(self, args, **kwargs): return self._game.board, True diff --git a/AlphaGo/game.py b/AlphaGo/game.py index af4ef57..aee8d3a 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -9,16 +9,13 @@ import utils import copy import tensorflow as tf import numpy as np -import sys +import sys, os import go import network_small -import strategy from collections import deque +sys.path.append(os.path.join(os.path.dirname(__file__), os.path.pardir)) from tianshou.core.mcts.mcts import MCTS -import Network -#from strategy import strategy - class Game: ''' Load the real game and trained weights. @@ -34,15 +31,11 @@ class Game: self.latest_boards = deque(maxlen=8) for _ in range(8): self.latest_boards.append(self.board) - - self.executor = go.Go(game=self) - #self.strategy = strategy(checkpoint_path) - - self.simulator = strategy.GoEnv(game=self) self.net = network_small.Network() self.sess = self.net.forward(checkpoint_path) self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], feed_dict={self.net.x: state, self.net.is_training: False}) + self.game_engine = go.Go(game=self) def _flatten(self, vertex): x, y = vertex @@ -79,10 +72,10 @@ class Game: def think(self, latest_boards, color): # TODO : using copy is right, or should we change to deepcopy? - self.simulator.simulate_latest_boards = copy.copy(latest_boards) - self.simulator.simulate_board = copy.copy(latest_boards[-1]) - nn_input = self.generate_nn_input(self.simulator.simulate_latest_boards, color) - mcts = MCTS(self.simulator, self.evaluator, nn_input, self.size ** 2 + 1, inverse=True, max_step=1) + self.game_engine.simulate_latest_boards = copy.copy(latest_boards) + self.game_engine.simulate_board = copy.copy(latest_boards[-1]) + nn_input = self.generate_nn_input(self.game_engine.simulate_latest_boards, color) + mcts = MCTS(self.game_engine, self.evaluator, nn_input, self.size ** 2 + 1, inverse=True, max_step=1) temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) choice = np.random.choice(self.size ** 2 + 1, 1, p=prob).tolist()[0] @@ -96,7 +89,7 @@ class Game: # this function can be called directly to play the opponent's move if vertex == utils.PASS: return True - res = self.executor.executor_do_move(color, vertex) + res = self.game_engine.executor_do_move(color, vertex) return res def think_play_move(self, color): diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 108c9bd..10ce7e1 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -1,7 +1,7 @@ from __future__ import print_function import utils import copy -import sys +import numpy as np from collections import deque ''' @@ -12,10 +12,13 @@ Settings of the Go game. ''' NEIGHBOR_OFFSET = [[1, 0], [-1, 0], [0, -1], [0, 1]] +CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] class Go: def __init__(self, **kwargs): self.game = kwargs['game'] + self.simulate_board = [utils.EMPTY] * (self.game.size ** 2) + self.simulate_latest_boards = deque(maxlen=8) def _in_board(self, vertex): x, y = vertex @@ -33,6 +36,16 @@ class Go: nei.append((_x, _y)) return nei + def _corner(self, vertex): + x, y = vertex + corner = [] + for d in CORNER_OFFSET: + _x = x + d[0] + _y = y + d[1] + if self._in_board((_x, _y)): + corner.append((_x, _y)) + return corner + def _find_group(self, current_board, vertex): color = current_board[self.game._flatten(vertex)] # print ("color : ", color) @@ -84,6 +97,47 @@ class Go: repeat = True return repeat + def _is_eye(self, current_board, color, vertex): + nei = self._neighbor(vertex) + cor = self._corner(vertex) + ncolor = {color == current_board[self.game._flatten(n)] for n in nei} + if False in ncolor: + # print "not all neighbors are in same color with us" + return False + _, group = self._find_group(current_board, nei[0]) + if set(nei) < group: + # print "all neighbors are in same group and same color with us" + return True + else: + opponent_number = [current_board[self.game._flatten(c)] for c in cor].count(-color) + opponent_propotion = float(opponent_number) / float(len(cor)) + if opponent_propotion < 0.5: + # print "few opponents, real eye" + return True + else: + # print "many opponents, fake eye" + return False + + def _knowledge_prunning(self, current_board, color, vertex): + ### check if it is an eye of yourself + ### assumptions : notice that this judgement requires that the state is an endgame + if self._is_eye(current_board, color, vertex): + return False + return True + + def _sa2cv(self, state, action): + # State is the play board, the shape is [1, self.game.size, self.game.size, 17], action is an index. + # We need to transfer the (state, action) pair into (color, vertex) pair to simulate the move + if state[0, 0, 0, -1] == utils.BLACK: + color = utils.BLACK + else: + color = utils.WHITE + if action == self.game.size ** 2: + vertex = (0, 0) + else: + vertex = self.game._deflatten(action) + return color, vertex + def _is_valid(self, history_boards, current_board, color, vertex): ### in board if not self._in_board(vertex): @@ -97,11 +151,54 @@ class Go: if self._is_suicide(current_board, color, vertex): return False + ### forbid global isomorphous if self._check_global_isomorphous(history_boards, current_board, color, vertex): return False return True + def simulate_is_valid(self, history_boards, current_board, state, action): + # initialize simulate_latest_boards and simulate_board from state + self.simulate_latest_boards.clear() + for i in range(8): + self.simulate_latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) + self.simulate_board = copy.copy(self.simulate_latest_boards[-1]) + + color, vertex = self._sa2cv(state, action) + + if not self._is_valid(history_boards, current_board, color, vertex): + return False + + if not self._knowledge_prunning(current_board, color, vertex): + return False + + return True + + def _do_move(self, color, vertex): + if vertex == utils.PASS: + return True + + id_ = self.game._flatten(vertex) + if self.simulate_board[id_] == utils.EMPTY: + self.simulate_board[id_] = color + return True + else: + return False + + def simulate_step_forward(self, state, action): + # initialize the simulate_board from state + self.simulate_board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() + + color, vertex = self._sa2cv(state, action) + + self._do_move(color, vertex) + new_state = np.concatenate( + [state[:, :, :, 1:8], (np.array(self.simulate_board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), + state[:, :, :, 9:16], (np.array(self.simulate_board) == utils.WHITE).reshape(1, self.game.size, self.game.size, 1), + np.array(1 - state[:, :, :, -1]).reshape(1, self.game.size, self.game.size, 1)], + axis=3) + return new_state, 0 + def executor_do_move(self, color, vertex): if not self._is_valid(self.game.history, self.game.board, color, vertex): return False diff --git a/AlphaGo/self-play.py b/AlphaGo/self-play.py index 296112b..63b7e97 100644 --- a/AlphaGo/self-play.py +++ b/AlphaGo/self-play.py @@ -79,7 +79,7 @@ while True: prob.append(np.array(game.prob).reshape(-1, game.size ** 2 + 1)) print("Finished") print("\n") - score = game.executor.executor_get_score(True) + score = game.game_engine.executor_get_score(True) if score > 0: winner = utils.BLACK else: diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py deleted file mode 100644 index 1e5fd02..0000000 --- a/AlphaGo/strategy.py +++ /dev/null @@ -1,199 +0,0 @@ -import os, sys - -sys.path.append(os.path.join(os.path.dirname(__file__), os.path.pardir)) -import numpy as np -import utils -import time -import copy -import network_small -import tensorflow as tf -from collections import deque -from tianshou.core.mcts.mcts import MCTS - -NEIGHBOR_OFFSET = [[1, 0], [-1, 0], [0, -1], [0, 1]] -CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] - -class GoEnv: - def __init__(self, **kwargs): - self.game = kwargs['game'] - self.simulate_board = [utils.EMPTY] * (self.game.size ** 2) - self.simulate_latest_boards = deque(maxlen=8) - - def _in_board(self, vertex): - x, y = vertex - if x < 1 or x > self.game.size: return False - if y < 1 or y > self.game.size: return False - return True - - def _neighbor(self, vertex): - x, y = vertex - nei = [] - for d in NEIGHBOR_OFFSET: - _x = x + d[0] - _y = y + d[1] - if self._in_board((_x, _y)): - nei.append((_x, _y)) - return nei - - def _corner(self, vertex): - x, y = vertex - corner = [] - for d in CORNER_OFFSET: - _x = x + d[0] - _y = y + d[1] - if self._in_board((_x, _y)): - corner.append((_x, _y)) - return corner - - def _find_group(self, current_board, vertex): - color = current_board[self.game._flatten(vertex)] - # print ("color : ", color) - chain = set() - frontier = [vertex] - has_liberty = False - while frontier: - current = frontier.pop() - # print ("current : ", current) - chain.add(current) - for n in self._neighbor(current): - if current_board[self.game._flatten(n)] == color and not n in chain: - frontier.append(n) - if current_board[self.game._flatten(n)] == utils.EMPTY: - has_liberty = True - return has_liberty, chain - - def _is_suicide(self, current_board, color, vertex): - current_board[self.game._flatten(vertex)] = color # assume that we already take this move - suicide = False - - has_liberty, group = self._find_group(current_board, vertex) - if not has_liberty: - suicide = True # no liberty, suicide - for n in self._neighbor(vertex): - if current_board[self.game._flatten(n)] == utils.another_color(color): - opponent_liberty, group = self._find_group(current_board, n) - if not opponent_liberty: - suicide = False # this move is able to take opponent's stone, not suicide - - current_board[self.game._flatten(vertex)] = utils.EMPTY # undo this move - return suicide - - def _process_board(self, current_board, color, vertex): - nei = self._neighbor(vertex) - for n in nei: - if current_board[self.game._flatten(n)] == utils.another_color(color): - has_liberty, group = self._find_group(current_board, n) - if not has_liberty: - for b in group: - current_board[self.game._flatten(b)] = utils.EMPTY - - def _check_global_isomorphous(self, history_boards, current_board, color, vertex): - repeat = False - next_board = copy.copy(current_board) - next_board[self.game._flatten(vertex)] = color - self._process_board(next_board, color, vertex) - if next_board in history_boards: - repeat = True - return repeat - - def _is_eye(self, current_board, color, vertex): - nei = self._neighbor(vertex) - cor = self._corner(vertex) - ncolor = {color == current_board[self.game._flatten(n)] for n in nei} - if False in ncolor: - # print "not all neighbors are in same color with us" - return False - _, group = self._find_group(current_board, nei[0]) - if set(nei) < group: - # print "all neighbors are in same group and same color with us" - return True - else: - opponent_number = [current_board[self.game._flatten(c)] for c in cor].count(-color) - opponent_propotion = float(opponent_number) / float(len(cor)) - if opponent_propotion < 0.5: - # print "few opponents, real eye" - return True - else: - # print "many opponents, fake eye" - return False - - def _knowledge_prunning(self, current_board, color, vertex): - ### check if it is an eye of yourself - ### assumptions : notice that this judgement requires that the state is an endgame - if self._is_eye(current_board, color, vertex): - return False - return True - - def _sa2cv(self, state, action): - # State is the play board, the shape is [1, self.game.size, self.game.size, 17], action is an index. - # We need to transfer the (state, action) pair into (color, vertex) pair to simulate the move - if state[0, 0, 0, -1] == utils.BLACK: - color = utils.BLACK - else: - color = utils.WHITE - if action == self.game.size ** 2: - vertex = (0, 0) - else: - vertex = self.game._deflatten(action) - return color, vertex - - def _is_valid(self, history_boards, current_board, color, vertex): - ### in board - if not self._in_board(vertex): - return False - - ### already have stone - if not current_board[self.game._flatten(vertex)] == utils.EMPTY: - return False - - ### check if it is suicide - if self._is_suicide(current_board, color, vertex): - return False - - ### forbid global isomorphous - if self._check_global_isomorphous(history_boards, current_board, color, vertex): - return False - - return True - - def simulate_is_valid(self, history_boards, current_board, state, action): - # initialize simulate_latest_boards and simulate_board from state - self.simulate_latest_boards.clear() - for i in range(8): - self.simulate_latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) - self.simulate_board = copy.copy(self.simulate_latest_boards[-1]) - - color, vertex = self._sa2cv(state, action) - - if not self._is_valid(history_boards, current_board, color, vertex): - return False - - if not self._knowledge_prunning(current_board, color, vertex): - return False - - return True - - def _do_move(self, color, vertex): - if vertex == utils.PASS: - return True - - id_ = self.game._flatten(vertex) - if self.simulate_board[id_] == utils.EMPTY: - self.simulate_board[id_] = color - return True - else: - return False - - def simulate_step_forward(self, state, action): - # initialize the simulate_board from state - self.simulate_board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() - - color, vertex = self._sa2cv(state, action) - - self._do_move(color, vertex) - new_state = np.concatenate( - [state[:, :, :, 1:8], (np.array(self.simulate_board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), - state[:, :, :, 9:16], (np.array(self.simulate_board) == utils.WHITE).reshape(1, self.game.size, self.game.size, 1), - np.array(1 - state[:, :, :, -1]).reshape(1, self.game.size, self.game.size, 1)], - axis=3) - return new_state, 0 From 7fca90c61b97704463985f1c1774e90a834c906c Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Wed, 20 Dec 2017 16:43:42 +0800 Subject: [PATCH 14/36] modify the mcts, refactor the network --- AlphaGo/Network.py | 211 ----------------------- AlphaGo/Network_ori.py | 175 ------------------- AlphaGo/game.py | 15 +- AlphaGo/go.py | 58 ++----- AlphaGo/model.py | 170 ++++++++++++++++++ AlphaGo/{network_small.py => network.py} | 0 tianshou/core/mcts/mcts.py | 40 ++--- 7 files changed, 212 insertions(+), 457 deletions(-) delete mode 100644 AlphaGo/Network.py delete mode 100644 AlphaGo/Network_ori.py create mode 100644 AlphaGo/model.py rename AlphaGo/{network_small.py => network.py} (100%) diff --git a/AlphaGo/Network.py b/AlphaGo/Network.py deleted file mode 100644 index caf7710..0000000 --- a/AlphaGo/Network.py +++ /dev/null @@ -1,211 +0,0 @@ -import os -import time -import sys - -import numpy as np -import time -import tensorflow as tf -import tensorflow.contrib.layers as layers - -import multi_gpu -import time - -# os.environ["CUDA_VISIBLE_DEVICES"] = "1" -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' - - -def residual_block(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.conv2d(h, 256, kernel_size=3, stride=1, activation_fn=tf.identity, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = h + input - return tf.nn.relu(h) - - -def policy_heads(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.flatten(h) - h = layers.fully_connected(h, 362, activation_fn=tf.identity, weights_regularizer=layers.l2_regularizer(1e-4)) - return h - - -def value_heads(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.flatten(h) - h = layers.fully_connected(h, 256, activation_fn=tf.nn.relu, weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.fully_connected(h, 1, activation_fn=tf.nn.tanh, weights_regularizer=layers.l2_regularizer(1e-4)) - return h - - -class Network(object): - def __init__(self): - self.x = tf.placeholder(tf.float32, shape=[None, 19, 19, 17]) - self.is_training = tf.placeholder(tf.bool, shape=[]) - self.z = tf.placeholder(tf.float32, shape=[None, 1]) - self.pi = tf.placeholder(tf.float32, shape=[None, 362]) - self.build_network() - - def build_network(self): - h = layers.conv2d(self.x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, normalizer_fn=layers.batch_norm, - normalizer_params={'is_training': self.is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS}, - weights_regularizer=layers.l2_regularizer(1e-4)) - for i in range(19): - h = residual_block(h, self.is_training) - self.v = value_heads(h, self.is_training) - self.p = policy_heads(h, self.is_training) - # loss = tf.reduce_mean(tf.square(z-v)) - tf.multiply(pi, tf.log(tf.clip_by_value(tf.nn.softmax(p), 1e-8, tf.reduce_max(tf.nn.softmax(p))))) - self.value_loss = tf.reduce_mean(tf.square(self.z - self.v)) - self.policy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.pi, logits=self.p)) - - self.reg = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) - self.total_loss = self.value_loss + self.policy_loss + self.reg - # train_op = tf.train.MomentumOptimizer(1e-4, momentum=0.9, use_nesterov=True).minimize(total_loss) - self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) - with tf.control_dependencies(self.update_ops): - self.train_op = tf.train.RMSPropOptimizer(1e-4).minimize(self.total_loss) - self.var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) - self.saver = tf.train.Saver(max_to_keep=10, var_list=self.var_list) - - def train(self): - data_path = "/home/tongzheng/data/" - data_name = os.listdir("/home/tongzheng/data/") - epochs = 100 - batch_size = 128 - - result_path = "./checkpoints/" - with multi_gpu.create_session() as sess: - sess.run(tf.global_variables_initializer()) - ckpt_file = tf.train.latest_checkpoint(result_path) - if ckpt_file is not None: - print('Restoring model from {}...'.format(ckpt_file)) - self.saver.restore(sess, ckpt_file) - for epoch in range(epochs): - for name in data_name: - data = np.load(data_path + name) - boards = data["boards"] - wins = data["wins"] - ps = data["ps"] - print (boards.shape) - print (wins.shape) - print (ps.shape) - batch_num = boards.shape[0] // batch_size - index = np.arange(boards.shape[0]) - np.random.shuffle(index) - value_losses = [] - policy_losses = [] - regs = [] - time_train = -time.time() - for iter in range(batch_num): - lv, lp, r, value, prob, _ = sess.run( - [self.value_loss, self.policy_loss, self.reg, self.v, tf.nn.softmax(p), self.train_op], - feed_dict={self.x: boards[ - index[iter * batch_size:(iter + 1) * batch_size]], - self.z: wins[index[ - iter * batch_size:(iter + 1) * batch_size]], - self.pi: ps[index[ - iter * batch_size:(iter + 1) * batch_size]], - self.is_training: True}) - value_losses.append(lv) - policy_losses.append(lp) - regs.append(r) - if iter % 1 == 0: - print( - "Epoch: {}, Part {}, Iteration: {}, Time: {}, Value Loss: {}, Policy Loss: {}, Reg: {}".format( - epoch, name, iter, time.time() + time_train, np.mean(np.array(value_losses)), - np.mean(np.array(policy_losses)), np.mean(np.array(regs)))) - time_train = -time.time() - value_losses = [] - policy_losses = [] - regs = [] - if iter % 20 == 0: - save_path = "Epoch{}.Part{}.Iteration{}.ckpt".format(epoch, name, iter) - self.saver.save(sess, result_path + save_path) - del data, boards, wins, ps - - - # def forward(call_number): - # # checkpoint_path = "/home/yama/rl/tianshou/AlphaGo/checkpoints" - # checkpoint_path = "/home/jialian/stuGo/tianshou/stuGo/checkpoints/" - # board_file = np.genfromtxt("/home/jialian/stuGo/tianshou/leela-zero/src/mcts_nn_files/board_" + call_number, - # dtype='str'); - # human_board = np.zeros((17, 19, 19)) - # - # # TODO : is it ok to ignore the last channel? - # for i in range(17): - # human_board[i] = np.array(list(board_file[i])).reshape(19, 19) - # # print("============================") - # # print("human board sum : " + str(np.sum(human_board[-1]))) - # # print("============================") - # # print(human_board) - # # print("============================") - # # rint(human_board) - # feed_board = human_board.transpose(1, 2, 0).reshape(1, 19, 19, 17) - # # print(feed_board[:,:,:,-1]) - # # print(feed_board.shape) - # - # # npz_board = np.load("/home/yama/rl/tianshou/AlphaGo/data/7f83928932f64a79bc1efdea268698ae.npz") - # # print(npz_board["boards"].shape) - # # feed_board = npz_board["boards"][10].reshape(-1, 19, 19, 17) - # ##print(feed_board) - # # show_board = feed_board[0].transpose(2, 0, 1) - # # print("board shape : ", show_board.shape) - # # print(show_board) - # - # itflag = False - # with multi_gpu.create_session() as sess: - # sess.run(tf.global_variables_initializer()) - # ckpt_file = tf.train.latest_checkpoint(checkpoint_path) - # if ckpt_file is not None: - # # print('Restoring model from {}...'.format(ckpt_file)) - # saver.restore(sess, ckpt_file) - # else: - # raise ValueError("No model loaded") - # res = sess.run([tf.nn.softmax(p), v], feed_dict={x: feed_board, is_training: itflag}) - # # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][300].reshape(-1, 19, 19, 17), is_training:False}) - # # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][50].reshape(-1, 19, 19, 17), is_training:True}) - # # print(np.argmax(res[0])) - # np.savetxt(sys.stdout, res[0][0], fmt="%.6f", newline=" ") - # np.savetxt(sys.stdout, res[1][0], fmt="%.6f", newline=" ") - # pv_file = "/home/jialian/stuGotianshou/leela-zero/src/mcts_nn_files/policy_value" - # np.savetxt(pv_file, np.concatenate((res[0][0], res[1][0])), fmt="%.6f", newline=" ") - # # np.savetxt(pv_file, res[1][0], fmt="%.6f", newline=" ") - # return res - - def forward(self): - checkpoint_path = "/home/tongzheng/tianshou/AlphaGo/checkpoints/" - sess = multi_gpu.create_session() - sess.run(tf.global_variables_initializer()) - ckpt_file = tf.train.latest_checkpoint(checkpoint_path) - if ckpt_file is not None: - print('Restoring model from {}...'.format(ckpt_file)) - self.saver.restore(sess, ckpt_file) - print('Successfully loaded') - else: - raise ValueError("No model loaded") - # prior, value = sess.run([tf.nn.softmax(p), v], feed_dict={x: state, is_training: False}) - # return prior, value - return sess - - -if __name__ == '__main__': - state = np.random.randint(0, 1, [1, 19, 19, 17]) - net = Network() - sess = net.forward() - start = time.time() - for i in range(100): - sess.run([tf.nn.softmax(net.p), net.v], feed_dict={net.x: state, net.is_training: False}) - print("Step {}, Cumulative time {}".format(i, time.time() - start)) diff --git a/AlphaGo/Network_ori.py b/AlphaGo/Network_ori.py deleted file mode 100644 index 9d33bb9..0000000 --- a/AlphaGo/Network_ori.py +++ /dev/null @@ -1,175 +0,0 @@ -import os -import time -import gc - -import numpy as np -import tensorflow as tf -import tensorflow.contrib.layers as layers - -import multi_gpu - -os.environ["CUDA_VISIBLE_DEVICES"] = "1" - - -def residual_block(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.conv2d(h, 256, kernel_size=3, stride=1, activation_fn=tf.identity, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = h + input - return tf.nn.relu(h) - - -def policy_heads(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.flatten(h) - h = layers.fully_connected(h, 362, activation_fn=tf.identity, weights_regularizer=layers.l2_regularizer(1e-4)) - return h - - -def value_heads(input, is_training): - normalizer_params = {'is_training': is_training, - 'updates_collections': tf.GraphKeys.UPDATE_OPS} - h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, - normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, - weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.flatten(h) - h = layers.fully_connected(h, 256, activation_fn=tf.nn.relu, weights_regularizer=layers.l2_regularizer(1e-4)) - h = layers.fully_connected(h, 1, activation_fn=tf.nn.tanh, weights_regularizer=layers.l2_regularizer(1e-4)) - return h - - -x = tf.placeholder(tf.float32, shape=[None, 19, 19, 17]) -is_training = tf.placeholder(tf.bool, shape=[]) -z = tf.placeholder(tf.float32, shape=[None, 1]) -pi = tf.placeholder(tf.float32, shape=[None, 362]) - -h = layers.conv2d(x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, normalizer_fn=layers.batch_norm, - normalizer_params={'is_training': is_training, 'updates_collections': tf.GraphKeys.UPDATE_OPS}, - weights_regularizer=layers.l2_regularizer(1e-4)) -for i in range(19): - h = residual_block(h, is_training) -v = value_heads(h, is_training) -p = policy_heads(h, is_training) -# loss = tf.reduce_mean(tf.square(z-v)) - tf.multiply(pi, tf.log(tf.clip_by_value(tf.nn.softmax(p), 1e-8, tf.reduce_max(tf.nn.softmax(p))))) -value_loss = tf.reduce_mean(tf.square(z - v)) -policy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=pi, logits=p)) - -reg = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) -total_loss = value_loss + policy_loss + reg -# train_op = tf.train.MomentumOptimizer(1e-4, momentum=0.9, use_nesterov=True).minimize(total_loss) -update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) -with tf.control_dependencies(update_ops): - train_op = tf.train.RMSPropOptimizer(1e-4).minimize(total_loss) -var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) -saver = tf.train.Saver(max_to_keep=10, var_list=var_list) - - -def train(): - data_path = "/home/tongzheng/data/" - data_name = os.listdir("/home/tongzheng/data/") - epochs = 100 - batch_size = 128 - - result_path = "./checkpoints/" - with multi_gpu.create_session() as sess: - sess.run(tf.global_variables_initializer()) - ckpt_file = tf.train.latest_checkpoint(result_path) - if ckpt_file is not None: - print('Restoring model from {}...'.format(ckpt_file)) - saver.restore(sess, ckpt_file) - for epoch in range(epochs): - for name in data_name: - data = np.load(data_path + name) - boards = data["boards"] - wins = data["wins"] - ps = data["ps"] - print (boards.shape) - print (wins.shape) - print (ps.shape) - # batch_num = 1 - batch_num = boards.shape[0] // batch_size - index = np.arange(boards.shape[0]) - np.random.shuffle(index) - value_losses = [] - policy_losses = [] - regs = [] - time_train = -time.time() - for iter in range(batch_num): - lv, lp, r, _ = sess.run([value_loss, policy_loss, reg, train_op], - feed_dict={x: boards[ - index[iter * batch_size:(iter + 1) * batch_size]], - z: wins[index[ - iter * batch_size:(iter + 1) * batch_size]], - pi: ps[index[ - iter * batch_size:(iter + 1) * batch_size]], - is_training: True}) - value_losses.append(lv) - policy_losses.append(lp) - regs.append(r) - del lv, lp, r - if iter % 1 == 0: - print( - "Epoch: {}, Part {}, Iteration: {}, Time: {}, Value Loss: {}, Policy Loss: {}, Reg: {}".format( - epoch, name, iter, time.time() + time_train, np.mean(np.array(value_losses)), - np.mean(np.array(policy_losses)), np.mean(np.array(regs)))) - del value_losses, policy_losses, regs, time_train - time_train = -time.time() - value_losses = [] - policy_losses = [] - regs = [] - if iter % 20 == 0: - save_path = "Epoch{}.Part{}.Iteration{}.ckpt".format(epoch, name, iter) - saver.save(sess, result_path + save_path) - del save_path - del data, boards, wins, ps, batch_num, index - gc.collect() - - -def forward(board): - result_path = "./checkpoints" - itflag = False - res = None - if board is None: - # data = np.load("/home/tongzheng/meta-data/80b7bf21bce14862806d48c3cd760a1b.npz") - data = np.load("./data/7f83928932f64a79bc1efdea268698ae.npz") - board = data["boards"][50].reshape(-1, 19, 19, 17) - human_board = board[0].transpose(2, 0, 1) - print("============================") - print("human board sum : " + str(np.sum(human_board))) - print("============================") - print(board[:, :, :, -1]) - itflag = False - with multi_gpu.create_session() as sess: - sess.run(tf.global_variables_initializer()) - ckpt_file = tf.train.latest_checkpoint(result_path) - if ckpt_file is not None: - print('Restoring model from {}...'.format(ckpt_file)) - saver.restore(sess, ckpt_file) - else: - raise ValueError("No model loaded") - res = sess.run([tf.nn.softmax(p), v], feed_dict={x: board, is_training: itflag}) - # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][300].reshape(-1, 19, 19, 17), is_training:False}) - # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][50].reshape(-1, 19, 19, 17), is_training:True}) - # print(np.argmax(res[0])) - print(res) - print(data["p"][0]) - print(np.argmax(res[0])) - print(np.argmax(data["p"][0])) - # print(res[0].tolist()[0]) - # print(np.argmax(res[0])) - return res - - -if __name__ == '__main__': - # train() - # if sys.argv[1] == "test": - forward(None) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index aee8d3a..37b7878 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -11,7 +11,7 @@ import tensorflow as tf import numpy as np import sys, os import go -import network_small +import model from collections import deque sys.path.append(os.path.join(os.path.dirname(__file__), os.path.pardir)) from tianshou.core.mcts.mcts import MCTS @@ -31,10 +31,9 @@ class Game: self.latest_boards = deque(maxlen=8) for _ in range(8): self.latest_boards.append(self.board) - self.net = network_small.Network() - self.sess = self.net.forward(checkpoint_path) - self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], - feed_dict={self.net.x: state, self.net.is_training: False}) + self.evaluator = model.ResNet(self.size, self.size**2 + 1, history_length=8) + # self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], + # feed_dict={self.net.x: state, self.net.is_training: False}) self.game_engine = go.Go(game=self) def _flatten(self, vertex): @@ -75,7 +74,8 @@ class Game: self.game_engine.simulate_latest_boards = copy.copy(latest_boards) self.game_engine.simulate_board = copy.copy(latest_boards[-1]) nn_input = self.generate_nn_input(self.game_engine.simulate_latest_boards, color) - mcts = MCTS(self.game_engine, self.evaluator, nn_input, self.size ** 2 + 1, inverse=True, max_step=1) + mcts = MCTS(self.game_engine, self.evaluator, [self.game_engine.simulate_latest_boards, color], self.size ** 2 + 1, inverse=True) + mcts.search(max_step=1) temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) choice = np.random.choice(self.size ** 2 + 1, 1, p=prob).tolist()[0] @@ -93,7 +93,7 @@ class Game: return res def think_play_move(self, color): - # although we dont need to return self.prob, however it is needed for neural network training + # although we don't need to return self.prob, however it is needed for neural network training move, self.prob = self.think(self.latest_boards, color) # play the move immediately self.play_move(color, move) @@ -122,6 +122,7 @@ class Game: if __name__ == "__main__": g = Game() g.show_board() + g.think_play_move(1) #file = open("debug.txt", "a") #file.write("mcts check\n") #file.close() diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 10ce7e1..335ee39 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -17,8 +17,6 @@ CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] class Go: def __init__(self, **kwargs): self.game = kwargs['game'] - self.simulate_board = [utils.EMPTY] * (self.game.size ** 2) - self.simulate_latest_boards = deque(maxlen=8) def _in_board(self, vertex): x, y = vertex @@ -125,18 +123,12 @@ class Go: return False return True - def _sa2cv(self, state, action): - # State is the play board, the shape is [1, self.game.size, self.game.size, 17], action is an index. - # We need to transfer the (state, action) pair into (color, vertex) pair to simulate the move - if state[0, 0, 0, -1] == utils.BLACK: - color = utils.BLACK - else: - color = utils.WHITE + def _action2vertex(self, action): if action == self.game.size ** 2: vertex = (0, 0) else: vertex = self.game._deflatten(action) - return color, vertex + return vertex def _is_valid(self, history_boards, current_board, color, vertex): ### in board @@ -157,14 +149,10 @@ class Go: return True - def simulate_is_valid(self, history_boards, current_board, state, action): - # initialize simulate_latest_boards and simulate_board from state - self.simulate_latest_boards.clear() - for i in range(8): - self.simulate_latest_boards.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) - self.simulate_board = copy.copy(self.simulate_latest_boards[-1]) - - color, vertex = self._sa2cv(state, action) + def simulate_is_valid(self, state, action): + history_boards, color = state + vertex = self._action2vertex(action) + current_board = history_boards[-1] if not self._is_valid(history_boards, current_board, color, vertex): return False @@ -174,30 +162,22 @@ class Go: return True - def _do_move(self, color, vertex): + def _do_move(self, board, color, vertex): if vertex == utils.PASS: - return True - - id_ = self.game._flatten(vertex) - if self.simulate_board[id_] == utils.EMPTY: - self.simulate_board[id_] = color - return True + return board else: - return False + id_ = self.game._flatten(vertex) + board[id_] = color + return board def simulate_step_forward(self, state, action): # initialize the simulate_board from state - self.simulate_board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() - - color, vertex = self._sa2cv(state, action) - - self._do_move(color, vertex) - new_state = np.concatenate( - [state[:, :, :, 1:8], (np.array(self.simulate_board) == utils.BLACK).reshape(1, self.game.size, self.game.size, 1), - state[:, :, :, 9:16], (np.array(self.simulate_board) == utils.WHITE).reshape(1, self.game.size, self.game.size, 1), - np.array(1 - state[:, :, :, -1]).reshape(1, self.game.size, self.game.size, 1)], - axis=3) - return new_state, 0 + history_boards, color = state + vertex = self._action2vertex(action) + new_board = self._do_move(copy.copy(history_boards[-1]), color, vertex) + history_boards.append(new_board) + new_color = -color + return [history_boards, new_color], 0 def executor_do_move(self, color, vertex): if not self._is_valid(self.game.history, self.game.board, color, vertex): @@ -239,7 +219,7 @@ class Go: start_vertex_x += x_diff start_vertex_y += y_diff - def _predict_from_nearby(self, vertex, neighbor_step = 3): + def _predict_from_nearby(self, vertex, neighbor_step=3): ''' step: the nearby 3 steps is considered :vertex: position to be estimated @@ -261,7 +241,7 @@ class Go: elif color_estimate < 0: return utils.WHITE - def executor_get_score(self, is_unknown_estimation = False): + def executor_get_score(self, is_unknown_estimation=False): ''' is_unknown_estimation: whether use nearby stone to predict the unknown return score from BLACK perspective. diff --git a/AlphaGo/model.py b/AlphaGo/model.py new file mode 100644 index 0000000..725dbd2 --- /dev/null +++ b/AlphaGo/model.py @@ -0,0 +1,170 @@ +import os +import time +import sys + +import numpy as np +import tensorflow as tf +import tensorflow.contrib.layers as layers + +import multi_gpu + +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + + +def residual_block(input, is_training): + """ + one residual block + + :param input: a tensor, input of the residual block + :param is_training: a placeholder, indicate whether the model is training or not + :return: a tensor, output of the residual block + """ + normalizer_params = {'is_training': is_training, + 'updates_collections': tf.GraphKeys.UPDATE_OPS} + h = layers.conv2d(input, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, + normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, + weights_regularizer=layers.l2_regularizer(1e-4)) + h = layers.conv2d(h, 256, kernel_size=3, stride=1, activation_fn=tf.identity, + normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, + weights_regularizer=layers.l2_regularizer(1e-4)) + h = h + input + return tf.nn.relu(h) + + +def policy_head(input, is_training, action_num): + """ + the head of policy branch + + :param input: a tensor, input of the policy head + :param is_training: a placeholder, indicate whether the model is training or not + :param action_num: action_num: an integer, number of unique actions at any state + :return: a tensor: output of the policy head, shape [batch_size, action_num] + """ + normalizer_params = {'is_training': is_training, + 'updates_collections': tf.GraphKeys.UPDATE_OPS} + h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, + normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, + weights_regularizer=layers.l2_regularizer(1e-4)) + h = layers.flatten(h) + h = layers.fully_connected(h, action_num, activation_fn=tf.identity, + weights_regularizer=layers.l2_regularizer(1e-4)) + return h + + +def value_head(input, is_training): + """ + the head of value branch + + :param input: a tensor, input of the value head + :param is_training: a placeholder, indicate whether the model is training or not + :return: a tensor, output of the value head, shape [batch_size, 1] + """ + normalizer_params = {'is_training': is_training, + 'updates_collections': tf.GraphKeys.UPDATE_OPS} + h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu, + normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params, + weights_regularizer=layers.l2_regularizer(1e-4)) + h = layers.flatten(h) + h = layers.fully_connected(h, 256, activation_fn=tf.nn.relu, weights_regularizer=layers.l2_regularizer(1e-4)) + h = layers.fully_connected(h, 1, activation_fn=tf.nn.tanh, weights_regularizer=layers.l2_regularizer(1e-4)) + return h + + +class ResNet(object): + def __init__(self, board_size, action_num, history_length=1, residual_block_num=20, checkpoint_path=None): + """ + the resnet model + + :param board_size: an integer, the board size + :param action_num: an integer, number of unique actions at any state + :param history_length: an integer, the history length to use, default is 1 + :param residual_block_num: an integer, the number of residual block, default is 20, at least 1 + :param checkpoint_path: a string, the path to the checkpoint, default is None, + """ + self.board_size = board_size + self.action_num = action_num + self.history_length = history_length + self.x = tf.placeholder(tf.float32, shape=[None, self.board_size, self.board_size, 2 * self.history_length + 1]) + self.is_training = tf.placeholder(tf.bool, shape=[]) + self.z = tf.placeholder(tf.float32, shape=[None, 1]) + self.pi = tf.placeholder(tf.float32, shape=[None, self.action_num]) + self._build_network(residual_block_num, checkpoint_path) + + def _build_network(self, residual_block_num, checkpoint_path): + """ + build the network + + :param residual_block_num: an integer, the number of residual block + :param checkpoint_path: a string, the path to the checkpoint, if None, use random initialization parameter + :return: None + """ + + h = layers.conv2d(self.x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, + normalizer_fn=layers.batch_norm, + normalizer_params={'is_training': self.is_training, + 'updates_collections': tf.GraphKeys.UPDATE_OPS}, + weights_regularizer=layers.l2_regularizer(1e-4)) + for i in range(residual_block_num - 1): + h = residual_block(h, self.is_training) + self.v = value_head(h, self.is_training) + self.p = policy_head(h, self.is_training, self.action_num) + self.value_loss = tf.reduce_mean(tf.square(self.z - self.v)) + self.policy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.pi, logits=self.p)) + + self.reg = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) + self.total_loss = self.value_loss + self.policy_loss + self.reg + self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) + with tf.control_dependencies(self.update_ops): + self.train_op = tf.train.AdamOptimizer(1e-4).minimize(self.total_loss) + self.var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) + self.saver = tf.train.Saver(max_to_keep=10, var_list=self.var_list) + self.sess = multi_gpu.create_session() + self.sess.run(tf.global_variables_initializer()) + if checkpoint_path is not None: + ckpt_file = tf.train.latest_checkpoint(checkpoint_path) + if ckpt_file is not None: + print('Restoring model from {}...'.format(ckpt_file)) + self.saver.restore(self.sess, ckpt_file) + print('Successfully loaded') + else: + raise ValueError("No model in path {}".format(checkpoint_path)) + + def __call__(self, state): + """ + + :param history: a list, the history + :param color: a string, indicate which one to play + :return: a list of tensor, the predicted value and policy given the history and color + """ + history, color = state + if len(history) != self.history_length: + raise ValueError( + 'The length of history cannot meet the need of the model, given {}, need {}'.format(len(history), + self.history_length)) + state = self._history2state(history, color) + return self.sess.run([self.p, self.v], feed_dict={self.x: state, self.is_training: False}) + + def _history2state(self, history, color): + """ + convert the history to the state we need + + :param history: a list, the history + :param color: a string, indicate which one to play + :return: a ndarray, the state + """ + state = np.zeros([1, self.board_size, self.board_size, 2 * self.history_length + 1]) + for i in range(self.history_length): + state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(self.board_size ** 2)).reshape(self.board_size, + self.board_size) + state[0, :, :, i + self.history_length] = np.array( + np.array(history[i]) == -np.ones(self.board_size ** 2)).reshape(self.board_size, self.board_size) + # TODO: need a config to specify the BLACK and WHITE + if color == +1: + state[0, :, :, 2 * self.history_length] = np.ones([self.board_size, self.board_size]) + if color == -1: + state[0, :, :, 2 * self.history_length] = np.zeros([self.board_size, self.board_size]) + return state + + #TODO: design the interface between the environment and training + def train(self, mode='memory', *args, **kwargs): + pass \ No newline at end of file diff --git a/AlphaGo/network_small.py b/AlphaGo/network.py similarity index 100% rename from AlphaGo/network_small.py rename to AlphaGo/network.py diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 12fc85d..fac00fb 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -72,11 +72,9 @@ class UCTNode(MCTSNode): def valid_mask(self, simulator): if self.mask is None: - start_time = time.time() self.mask = [] for act in range(self.action_num - 1): - if not simulator.simulate_is_valid( - simulator.simulate_latest_boards, simulator.simulate_board, self.state, act): + if not simulator.simulate_is_valid(self.state, act): self.mask.append(act) self.ucb[act] = -float("Inf") else: @@ -144,8 +142,7 @@ class ActionNode(object): class MCTS(object): - def __init__(self, simulator, evaluator, root, action_num, method="UCT", inverse=False, max_step=None, - max_time=None): + def __init__(self, simulator, evaluator, root, action_num, method="UCT", inverse=False): self.simulator = simulator self.evaluator = evaluator prior, _ = self.evaluator(root) @@ -153,33 +150,26 @@ class MCTS(object): if method == "": self.root = root if method == "UCT": - self.root = UCTNode(None, None, root, action_num, prior, inverse) + self.root = UCTNode(None, None, root, action_num, prior, inverse=inverse) if method == "TS": self.root = TSNode(None, None, root, action_num, prior, inverse=inverse) self.inverse = inverse - if max_step is not None: - self.step = 0 - self.max_step = max_step - # TODO: Optimize the stop criteria - # else: - # self.max_step = 0 - if max_time is not None: - self.start_time = time.time() - self.max_time = max_time + + def search(self, max_step=None, max_time=None): + step = 0 + start_time = time.time() + if max_step is None: + max_step = int("Inf") + if max_time is None: + max_time = float("Inf") if max_step is None and max_time is None: raise ValueError("Need a stop criteria!") - # TODO: running mcts should be implemented in another function, e.g. def search(self, max_step, max_time) - self.select_time = [] - self.evaluate_time = [] - self.bp_time = [] - while (max_step is not None and self.step < self.max_step or max_step is None) \ - and (max_time is not None and time.time() - self.start_time < self.max_time or max_time is None): - self.expand() - if max_step is not None: - self.step += 1 + while step < max_step and time.time() - start_time < max_step: + self._expand() + step += 1 - def expand(self): + def _expand(self): node, new_action = self.root.selection(self.simulator) value = node.children[new_action].expansion(self.evaluator, self.action_num) node.children[new_action].backpropagation(value + 0.) From 50e306368feabf13a8723412481c6f3103ff3c4e Mon Sep 17 00:00:00 2001 From: Wenbo Hu Date: Wed, 20 Dec 2017 20:12:08 +0800 Subject: [PATCH 15/36] checkpoint --- AlphaGo/go.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 335ee39..7196533 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -117,10 +117,31 @@ class Go: return False def _knowledge_prunning(self, current_board, color, vertex): - ### check if it is an eye of yourself - ### assumptions : notice that this judgement requires that the state is an endgame + # forbid some stupid selfplay using human knowledge if self._is_eye(current_board, color, vertex): return False + # forbid position on its own eye. + if self._is_game_finish(current_board, color) and vertex == utils.PASS + return False + # forbid pass if the game is not finished. + return True + + + def _is_game_finished(self, current_board, color): + ''' + for each empty position, if it has both BLACK and WHITE neighbors, the game is still not finished + :return: return the game is finished + ''' + board = copy.deepcopy(current_board) + empty_idx = [i for i, x in enumerate(board) if x == utils.EMPTY] # find all empty idx + for idx in empty_idx: + neighbor_idx = self._neighbor(self.game.deflatten(idx)) + if len(neighbor_idx) > 1: + first_idx = neighbor_idx[0] + for other_idx in neighbor_idx[1:]: + if self.game.board[self.game.flatten(other_idx)] != self.game.board[self.game.flatten(first_idx)]: + return False + return True def _action2vertex(self, action): From 48e95a21eaeec6495a1bc5985c434d64d7447baf Mon Sep 17 00:00:00 2001 From: Wenbo Hu Date: Wed, 20 Dec 2017 21:35:35 +0800 Subject: [PATCH 16/36] simulator process a valid set, instead of a single action --- AlphaGo/go.py | 18 +++++++++++++++--- tianshou/core/mcts/mcts.py | 9 ++------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 7196533..559b375 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -121,9 +121,9 @@ class Go: if self._is_eye(current_board, color, vertex): return False # forbid position on its own eye. - if self._is_game_finish(current_board, color) and vertex == utils.PASS - return False - # forbid pass if the game is not finished. + #if self._is_game_finish(current_board, color) and vertex == utils.PASS + # return False + # forbid pass if the game is not finished. return True @@ -183,6 +183,18 @@ class Go: return True + def simulate_is_valid_list(self, state, action_set): + ## find all the valid actions + ## if no action is valid, then pass + valid_action_set = [] + for action_candidate in action_set: + if self.simulate_is_valid(self, state, action_candidate) + valid_action_set.append(action_candidate) + if not valid_action_set: + valid_action_set.append(utils.PASS) + # if valid_action_set is a empty set, add pass + return valid_action_set + def _do_move(self, board, color, vertex): if vertex == utils.PASS: return board diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index fac00fb..c14496d 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -72,13 +72,8 @@ class UCTNode(MCTSNode): def valid_mask(self, simulator): if self.mask is None: - self.mask = [] - for act in range(self.action_num - 1): - if not simulator.simulate_is_valid(self.state, act): - self.mask.append(act) - self.ucb[act] = -float("Inf") - else: - self.ucb[self.mask] = -float("Inf") + self.mask = simulator.simulate_is_valid_list(self.state, range(self.action_num - 1)) + self.ucb[self.mask] = -float("Inf") class TSNode(MCTSNode): From cabbb219680be465f03527ea90deb568b53f911f Mon Sep 17 00:00:00 2001 From: Wenbo Hu Date: Wed, 20 Dec 2017 21:40:03 +0800 Subject: [PATCH 17/36] minor revision --- AlphaGo/go.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 559b375..009d369 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -186,14 +186,14 @@ class Go: def simulate_is_valid_list(self, state, action_set): ## find all the valid actions ## if no action is valid, then pass - valid_action_set = [] + valid_action_list = [] for action_candidate in action_set: - if self.simulate_is_valid(self, state, action_candidate) - valid_action_set.append(action_candidate) - if not valid_action_set: - valid_action_set.append(utils.PASS) + if self.simulate_is_valid(state, action_candidate): + valid_action_list.append(action_candidate) + if not valid_action_list: + valid_action_list.append(utils.PASS) # if valid_action_set is a empty set, add pass - return valid_action_set + return valid_action_list def _do_move(self, board, color, vertex): if vertex == utils.PASS: From e2c6b96e5743341f92278a6437a85a7154bd5ec3 Mon Sep 17 00:00:00 2001 From: Wenbo Hu Date: Wed, 20 Dec 2017 21:52:30 +0800 Subject: [PATCH 18/36] minor revision. --- AlphaGo/go.py | 3 +-- tianshou/core/mcts/mcts.py | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 009d369..cbbe07c 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -180,7 +180,6 @@ class Go: if not self._knowledge_prunning(current_board, color, vertex): return False - return True def simulate_is_valid_list(self, state, action_set): @@ -188,7 +187,7 @@ class Go: ## if no action is valid, then pass valid_action_list = [] for action_candidate in action_set: - if self.simulate_is_valid(state, action_candidate): + if not self.simulate_is_valid(state, action_candidate): valid_action_list.append(action_candidate) if not valid_action_list: valid_action_list.append(utils.PASS) diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index c14496d..5aca06a 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -71,6 +71,7 @@ class UCTNode(MCTSNode): self.parent.backpropagation(self.children[action].reward) def valid_mask(self, simulator): + # let all invalid actions illeagel in mcts if self.mask is None: self.mask = simulator.simulate_is_valid_list(self.state, range(self.action_num - 1)) self.ucb[self.mask] = -float("Inf") From f0d59dab6cef928cd580f301abbdd54b84af23df Mon Sep 17 00:00:00 2001 From: Wenbo Hu Date: Wed, 20 Dec 2017 22:10:47 +0800 Subject: [PATCH 19/36] forbid pass, if we have other choices --- AlphaGo/go.py | 18 +++++++++--------- tianshou/core/mcts/mcts.py | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/AlphaGo/go.py b/AlphaGo/go.py index cbbe07c..1dfbb29 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -183,16 +183,16 @@ class Go: return True def simulate_is_valid_list(self, state, action_set): - ## find all the valid actions - ## if no action is valid, then pass - valid_action_list = [] - for action_candidate in action_set: + # find all the invalid actions + invalid_action_list = [] + for action_candidate in action_set[:-1]: + # go through all the actions excluding pass if not self.simulate_is_valid(state, action_candidate): - valid_action_list.append(action_candidate) - if not valid_action_list: - valid_action_list.append(utils.PASS) - # if valid_action_set is a empty set, add pass - return valid_action_list + invalid_action_list.append(action_candidate) + if len(invalid_action_list) < len(action_set) - 1: + invalid_action_list.append(action_set[-1]) + # forbid pass, if we have other choices + return invalid_action_list def _do_move(self, board, color, vertex): if vertex == utils.PASS: diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 5aca06a..7edac97 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -71,7 +71,7 @@ class UCTNode(MCTSNode): self.parent.backpropagation(self.children[action].reward) def valid_mask(self, simulator): - # let all invalid actions illeagel in mcts + # let all invalid actions be illeagel in mcts if self.mask is None: self.mask = simulator.simulate_is_valid_list(self.state, range(self.action_num - 1)) self.ucb[self.mask] = -float("Inf") From 00d2aa86bf668e17d6064b4896797cb79f7cbba7 Mon Sep 17 00:00:00 2001 From: Wenbo Hu Date: Wed, 20 Dec 2017 22:57:58 +0800 Subject: [PATCH 20/36] repair komi. add todo for forbid pass: --- AlphaGo/engine.py | 2 +- AlphaGo/game.py | 4 ++-- AlphaGo/go.py | 5 +---- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index 9948176..bf30083 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -183,7 +183,7 @@ class GTPEngine(): return 'unknown player', False def cmd_get_score(self, args, **kwargs): - return self._game.game_engine.executor_get_score(), None + return self._game.game_engine.executor_get_score(True), None def cmd_show_board(self, args, **kwargs): return self._game.board, True diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 37b7878..5f35c74 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -23,7 +23,7 @@ class Game: TODO : Maybe merge with the engine class in future, currently leave it untouched for interacting with Go UI. ''' - def __init__(self, size=9, komi=6.5, checkpoint_path=None): + def __init__(self, size=9, komi=3.75, checkpoint_path=None): self.size = size self.komi = komi self.board = [utils.EMPTY] * (self.size ** 2) @@ -75,7 +75,7 @@ class Game: self.game_engine.simulate_board = copy.copy(latest_boards[-1]) nn_input = self.generate_nn_input(self.game_engine.simulate_latest_boards, color) mcts = MCTS(self.game_engine, self.evaluator, [self.game_engine.simulate_latest_boards, color], self.size ** 2 + 1, inverse=True) - mcts.search(max_step=1) + mcts.search(max_step=5) temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) choice = np.random.choice(self.size ** 2 + 1, 1, p=prob).tolist()[0] diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 1dfbb29..4f1c759 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -121,12 +121,8 @@ class Go: if self._is_eye(current_board, color, vertex): return False # forbid position on its own eye. - #if self._is_game_finish(current_board, color) and vertex == utils.PASS - # return False - # forbid pass if the game is not finished. return True - def _is_game_finished(self, current_board, color): ''' for each empty position, if it has both BLACK and WHITE neighbors, the game is still not finished @@ -192,6 +188,7 @@ class Go: if len(invalid_action_list) < len(action_set) - 1: invalid_action_list.append(action_set[-1]) # forbid pass, if we have other choices + # TODO: In fact we should not do this. In some extreme cases, we should permit pass. return invalid_action_list def _do_move(self, board, color, vertex): From ced63af18fcc790c4b1bb1548b5494bd2073f9a2 Mon Sep 17 00:00:00 2001 From: Wenbo Hu Date: Thu, 21 Dec 2017 19:31:51 +0800 Subject: [PATCH 21/36] fixing bug pass parameterg --- tianshou/core/mcts/mcts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tianshou/core/mcts/mcts.py b/tianshou/core/mcts/mcts.py index 7edac97..8bb5f06 100644 --- a/tianshou/core/mcts/mcts.py +++ b/tianshou/core/mcts/mcts.py @@ -73,7 +73,7 @@ class UCTNode(MCTSNode): def valid_mask(self, simulator): # let all invalid actions be illeagel in mcts if self.mask is None: - self.mask = simulator.simulate_is_valid_list(self.state, range(self.action_num - 1)) + self.mask = simulator.simulate_is_valid_list(self.state, range(self.action_num)) self.ucb[self.mask] = -float("Inf") From eda7ed07a1b7b0251745981d71ab9f358f15944e Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Thu, 21 Dec 2017 21:01:25 +0800 Subject: [PATCH 22/36] implement data collection and part of training --- AlphaGo/engine.py | 6 ++- AlphaGo/game.py | 19 +------- AlphaGo/model.py | 18 +++++++- AlphaGo/play.py | 115 ++++++++++++++++++++++++++++++---------------- AlphaGo/player.py | 1 + 5 files changed, 101 insertions(+), 58 deletions(-) diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index bf30083..c9f1a3c 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -183,11 +183,15 @@ class GTPEngine(): return 'unknown player', False def cmd_get_score(self, args, **kwargs): - return self._game.game_engine.executor_get_score(True), None + return self._game.game_engine.executor_get_score(True), True def cmd_show_board(self, args, **kwargs): return self._game.board, True + def cmd_get_prob(self, args, **kwargs): + return self._game.prob, True + + if __name__ == "main": game = Game() engine = GTPEngine(game_obj=Game) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 5f35c74..bf0d084 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -58,24 +58,9 @@ class Game: def set_komi(self, k): self.komi = k - def generate_nn_input(self, latest_boards, color): - state = np.zeros([1, self.size, self.size, 17]) - for i in range(8): - state[0, :, :, i] = np.array(np.array(latest_boards[i]) == np.ones(self.size ** 2)).reshape(self.size, self.size) - state[0, :, :, i + 8] = np.array(np.array(latest_boards[i]) == -np.ones(self.size ** 2)).reshape(self.size, self.size) - if color == utils.BLACK: - state[0, :, :, 16] = np.ones([self.size, self.size]) - if color == utils.WHITE: - state[0, :, :, 16] = np.zeros([self.size, self.size]) - return state - def think(self, latest_boards, color): - # TODO : using copy is right, or should we change to deepcopy? - self.game_engine.simulate_latest_boards = copy.copy(latest_boards) - self.game_engine.simulate_board = copy.copy(latest_boards[-1]) - nn_input = self.generate_nn_input(self.game_engine.simulate_latest_boards, color) - mcts = MCTS(self.game_engine, self.evaluator, [self.game_engine.simulate_latest_boards, color], self.size ** 2 + 1, inverse=True) - mcts.search(max_step=5) + mcts = MCTS(self.game_engine, self.evaluator, [latest_boards, color], self.size ** 2 + 1, inverse=True) + mcts.search(max_step=1) temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) choice = np.random.choice(self.size ** 2 + 1, 1, p=prob).tolist()[0] diff --git a/AlphaGo/model.py b/AlphaGo/model.py index 725dbd2..fab864e 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -1,6 +1,7 @@ import os import time import sys +import cPickle import numpy as np import tensorflow as tf @@ -167,4 +168,19 @@ class ResNet(object): #TODO: design the interface between the environment and training def train(self, mode='memory', *args, **kwargs): - pass \ No newline at end of file + if mode == 'memory': + pass + if mode == 'file': + self.train_with_file(data_path=kwargs['data_path'], checkpoint_path=kwargs['checkpoint_path']) + + def train_with_file(self, data_path, checkpoint_path): + if not os.path.exists(data_path): + raise ValueError("{} doesn't exist".format(data_path)) + + file_list = os.listdir(data_path) + if file_list <= 50: + time.sleep(1) + else: + file_list.sort(key=lambda file: os.path.getmtime(data_path + file) if not os.path.isdir( + data_path + file) else 0) + diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 7367804..562dd14 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -5,6 +5,18 @@ import re import Pyro4 import time import os +import cPickle + + +class Data(object): + def __init__(self): + self.boards = [] + self.probs = [] + self.winner = 0 + + def reset(self): + self.__init__() + if __name__ == '__main__': """ @@ -13,10 +25,13 @@ if __name__ == '__main__': """ # TODO : we should set the network path in a more configurable way. parser = argparse.ArgumentParser() + parser.add_argument("--result_path", type=str, default="./data/") parser.add_argument("--black_weight_path", type=str, default=None) parser.add_argument("--white_weight_path", type=str, default=None) args = parser.parse_args() + if not os.path.exists(args.result_path): + os.mkdir(args.result_path) # black_weight_path = "./checkpoints" # white_weight_path = "./checkpoints_origin" if args.black_weight_path is not None and (not os.path.exists(args.black_weight_path)): @@ -35,11 +50,13 @@ if __name__ == '__main__': time.sleep(1) # start two different player with different network weights. - agent_v0 = subprocess.Popen(['python', '-u', 'player.py', '--role=black', '--checkpoint_path=' + str(args.black_weight_path)], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + agent_v0 = subprocess.Popen( + ['python', '-u', 'player.py', '--role=black', '--checkpoint_path=' + str(args.black_weight_path)], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - agent_v1 = subprocess.Popen(['python', '-u', 'player.py', '--role=white', '--checkpoint_path=' + str(args.white_weight_path)], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + agent_v1 = subprocess.Popen( + ['python', '-u', 'player.py', '--role=white', '--checkpoint_path=' + str(args.white_weight_path)], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) server_list = "" while ("black" not in server_list) or ("white" not in server_list): @@ -50,6 +67,7 @@ if __name__ == '__main__': print "Start black player at : " + str(agent_v0.pid) print "Start white player at : " + str(agent_v1.pid) + data = Data() player = [None] * 2 player[0] = Pyro4.Proxy("PYRONAME:black") player[1] = Pyro4.Proxy("PYRONAME:white") @@ -63,39 +81,58 @@ if __name__ == '__main__': evaluate_rounds = 1 game_num = 0 - while game_num < evaluate_rounds: - num = 0 - pass_flag = [False, False] - print("Start game {}".format(game_num)) - # end the game if both palyer chose to pass, or play too much turns - while not (pass_flag[0] and pass_flag[1]) and num < size ** 2 * 2: - turn = num % 2 - move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn] + '\n') - print role[turn] + " : " + str(move), - num += 1 - match = re.search(pattern, move) - if match is not None: - # print "match : " + str(match.group()) - play_or_pass = match.group() - pass_flag[turn] = False + try: + while True: + num = 0 + pass_flag = [False, False] + print("Start game {}".format(game_num)) + # end the game if both palyer chose to pass, or play too much turns + while not (pass_flag[0] and pass_flag[1]) and num < size ** 2 * 2: + turn = num % 2 + move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn] + '\n') + print role[turn] + " : " + str(move), + num += 1 + match = re.search(pattern, move) + if match is not None: + # print "match : " + str(match.group()) + play_or_pass = match.group() + pass_flag[turn] = False + else: + # print "no match" + play_or_pass = ' PASS' + pass_flag[turn] = True + result = player[1 - turn].run_cmd(str(num) + ' play ' + color[turn] + ' ' + play_or_pass + '\n') + board = player[turn].run_cmd(str(num) + ' show_board') + board = eval(board[board.index('['):board.index(']') + 1]) + for i in range(size): + for j in range(size): + print show[board[i * size + j]] + " ", + print "\n", + data.boards.append(board) + prob = player[turn].run_cmd(str(num) + ' get_prob') + data.probs.append(prob) + score = player[turn].run_cmd(str(num) + ' get_score') + print "Finished : ", score.split(" ")[1] + # TODO: generalize the player + if score > 0: + data.winner = 1 + if score < 0: + data.winner = -1 + player[0].run_cmd(str(num) + ' clear_board') + player[1].run_cmd(str(num) + ' clear_board') + file_list = os.listdir(args.result_path) + if not file_list: + data_num = 0 else: - # print "no match" - play_or_pass = ' PASS' - pass_flag[turn] = True - result = player[1 - turn].run_cmd(str(num) + ' play ' + color[turn] + ' ' + play_or_pass + '\n') - board = player[turn].run_cmd(str(num) + ' show_board') - board = eval(board[board.index('['):board.index(']') + 1]) - for i in range(size): - for j in range(size): - print show[board[i * size + j]] + " ", - print "\n", - - score = player[turn].run_cmd(str(num) + ' get_score') - print "Finished : ", score.split(" ")[1] - player[0].run_cmd(str(num) + ' clear_board') - player[1].run_cmd(str(num) + ' clear_board') - game_num += 1 - - subprocess.call(["kill", "-9", str(agent_v0.pid)]) - subprocess.call(["kill", "-9", str(agent_v1.pid)]) - print "Kill all player, finish all game." + file_list.sort(key=lambda file: os.path.getmtime(args.result_path + file) if not os.path.isdir( + args.result_path + file) else 0) + data_num = eval(file_list[-1][:-4]) + 1 + print(file_list) + with open("./data/" + str(data_num) + ".pkl", "w") as file: + picklestring = cPickle.dump(data, file) + data.reset() + game_num += 1 + except KeyboardInterrupt: + subprocess.call(["kill", "-9", str(agent_v0.pid)]) + subprocess.call(["kill", "-9", str(agent_v1.pid)]) + print "Kill all player, finish all game." diff --git a/AlphaGo/player.py b/AlphaGo/player.py index b468cf3..0e3daff 100644 --- a/AlphaGo/player.py +++ b/AlphaGo/player.py @@ -20,6 +20,7 @@ class Player(object): #return "inside the Player of player.py" return self.engine.run_cmd(command) + if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument("--checkpoint_path", type=str, default=None) From 2acb1aab076f5393f79eb853e275de626d4d0247 Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Thu, 21 Dec 2017 22:48:53 +0800 Subject: [PATCH 23/36] eliminate all references of Game class in Go class --- AlphaGo/engine.py | 2 +- AlphaGo/game.py | 15 ++----- AlphaGo/go.py | 101 +++++++++++++++++++++++-------------------- AlphaGo/play.py | 4 +- AlphaGo/self-play.py | 2 +- 5 files changed, 63 insertions(+), 61 deletions(-) diff --git a/AlphaGo/engine.py b/AlphaGo/engine.py index c9f1a3c..8b54470 100644 --- a/AlphaGo/engine.py +++ b/AlphaGo/engine.py @@ -183,7 +183,7 @@ class GTPEngine(): return 'unknown player', False def cmd_get_score(self, args, **kwargs): - return self._game.game_engine.executor_get_score(True), True + return self._game.game_engine.executor_get_score(self._game.board, True), True def cmd_show_board(self, args, **kwargs): return self._game.board, True diff --git a/AlphaGo/game.py b/AlphaGo/game.py index bf0d084..11ce52b 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -34,16 +34,7 @@ class Game: self.evaluator = model.ResNet(self.size, self.size**2 + 1, history_length=8) # self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], # feed_dict={self.net.x: state, self.net.is_training: False}) - self.game_engine = go.Go(game=self) - - def _flatten(self, vertex): - x, y = vertex - return (x - 1) * self.size + (y - 1) - - def _deflatten(self, idx): - x = idx // self.size + 1 - y = idx % self.size + 1 - return (x, y) + self.game_engine = go.Go(size=self.size, komi=self.komi) def clear(self): self.board = [utils.EMPTY] * (self.size ** 2) @@ -67,14 +58,14 @@ class Game: if choice == self.size ** 2: move = utils.PASS else: - move = self._deflatten(choice) + move = self.game_engine._deflatten(choice) return move, prob def play_move(self, color, vertex): # this function can be called directly to play the opponent's move if vertex == utils.PASS: return True - res = self.game_engine.executor_do_move(color, vertex) + res = self.game_engine.executor_do_move(self.history, self.latest_boards, self.board, color, vertex) return res def think_play_move(self, color): diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 4f1c759..9b7e21f 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -16,12 +16,22 @@ CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] class Go: def __init__(self, **kwargs): - self.game = kwargs['game'] + self.size = kwargs['size'] + self.komi = kwargs['komi'] + + def _flatten(self, vertex): + x, y = vertex + return (x - 1) * self.size + (y - 1) + + def _deflatten(self, idx): + x = idx // self.size + 1 + y = idx % self.size + 1 + return (x, y) def _in_board(self, vertex): x, y = vertex - if x < 1 or x > self.game.size: return False - if y < 1 or y > self.game.size: return False + if x < 1 or x > self.size: return False + if y < 1 or y > self.size: return False return True def _neighbor(self, vertex): @@ -45,7 +55,7 @@ class Go: return corner def _find_group(self, current_board, vertex): - color = current_board[self.game._flatten(vertex)] + color = current_board[self._flatten(vertex)] # print ("color : ", color) chain = set() frontier = [vertex] @@ -55,41 +65,41 @@ class Go: # print ("current : ", current) chain.add(current) for n in self._neighbor(current): - if current_board[self.game._flatten(n)] == color and not n in chain: + if current_board[self._flatten(n)] == color and not n in chain: frontier.append(n) - if current_board[self.game._flatten(n)] == utils.EMPTY: + if current_board[self._flatten(n)] == utils.EMPTY: has_liberty = True return has_liberty, chain def _is_suicide(self, current_board, color, vertex): - current_board[self.game._flatten(vertex)] = color # assume that we already take this move + current_board[self._flatten(vertex)] = color # assume that we already take this move suicide = False has_liberty, group = self._find_group(current_board, vertex) if not has_liberty: suicide = True # no liberty, suicide for n in self._neighbor(vertex): - if current_board[self.game._flatten(n)] == utils.another_color(color): + if current_board[self._flatten(n)] == utils.another_color(color): opponent_liberty, group = self._find_group(current_board, n) if not opponent_liberty: suicide = False # this move is able to take opponent's stone, not suicide - current_board[self.game._flatten(vertex)] = utils.EMPTY # undo this move + current_board[self._flatten(vertex)] = utils.EMPTY # undo this move return suicide def _process_board(self, current_board, color, vertex): nei = self._neighbor(vertex) for n in nei: - if current_board[self.game._flatten(n)] == utils.another_color(color): + if current_board[self._flatten(n)] == utils.another_color(color): has_liberty, group = self._find_group(current_board, n) if not has_liberty: for b in group: - current_board[self.game._flatten(b)] = utils.EMPTY + current_board[self._flatten(b)] = utils.EMPTY def _check_global_isomorphous(self, history_boards, current_board, color, vertex): repeat = False next_board = copy.copy(current_board) - next_board[self.game._flatten(vertex)] = color + next_board[self._flatten(vertex)] = color self._process_board(next_board, color, vertex) if next_board in history_boards: repeat = True @@ -98,7 +108,7 @@ class Go: def _is_eye(self, current_board, color, vertex): nei = self._neighbor(vertex) cor = self._corner(vertex) - ncolor = {color == current_board[self.game._flatten(n)] for n in nei} + ncolor = {color == current_board[self._flatten(n)] for n in nei} if False in ncolor: # print "not all neighbors are in same color with us" return False @@ -107,7 +117,7 @@ class Go: # print "all neighbors are in same group and same color with us" return True else: - opponent_number = [current_board[self.game._flatten(c)] for c in cor].count(-color) + opponent_number = [current_board[self._flatten(c)] for c in cor].count(-color) opponent_propotion = float(opponent_number) / float(len(cor)) if opponent_propotion < 0.5: # print "few opponents, real eye" @@ -131,20 +141,20 @@ class Go: board = copy.deepcopy(current_board) empty_idx = [i for i, x in enumerate(board) if x == utils.EMPTY] # find all empty idx for idx in empty_idx: - neighbor_idx = self._neighbor(self.game.deflatten(idx)) + neighbor_idx = self._neighbor(self.deflatten(idx)) if len(neighbor_idx) > 1: first_idx = neighbor_idx[0] for other_idx in neighbor_idx[1:]: - if self.game.board[self.game.flatten(other_idx)] != self.game.board[self.game.flatten(first_idx)]: + if board[self.flatten(other_idx)] != board[self.flatten(first_idx)]: return False return True def _action2vertex(self, action): - if action == self.game.size ** 2: + if action == self.size ** 2: vertex = (0, 0) else: - vertex = self.game._deflatten(action) + vertex = self._deflatten(action) return vertex def _is_valid(self, history_boards, current_board, color, vertex): @@ -153,7 +163,7 @@ class Go: return False ### already have stone - if not current_board[self.game._flatten(vertex)] == utils.EMPTY: + if not current_board[self._flatten(vertex)] == utils.EMPTY: return False ### check if it is suicide @@ -195,7 +205,7 @@ class Go: if vertex == utils.PASS: return board else: - id_ = self.game._flatten(vertex) + id_ = self._flatten(vertex) board[id_] = color return board @@ -208,21 +218,21 @@ class Go: new_color = -color return [history_boards, new_color], 0 - def executor_do_move(self, color, vertex): - if not self._is_valid(self.game.history, self.game.board, color, vertex): + def executor_do_move(self, history, latest_boards, current_board, color, vertex): + if not self._is_valid(history, current_board, color, vertex): return False - self.game.board[self.game._flatten(vertex)] = color - self._process_board(self.game.board, color, vertex) - self.game.history.append(copy.copy(self.game.board)) - self.game.latest_boards.append(copy.copy(self.game.board)) + current_board[self._flatten(vertex)] = color + self._process_board(current_board, color, vertex) + history.append(copy.copy(current_board)) + latest_boards.append(copy.copy(current_board)) return True - def _find_empty(self): - idx = [i for i,x in enumerate(self.game.board) if x == utils.EMPTY ][0] - return self.game._deflatten(idx) + def _find_empty(self, current_board): + idx = [i for i,x in enumerate(current_board) if x == utils.EMPTY ][0] + return self._deflatten(idx) - def _find_boarder(self, vertex): - _, group = self._find_group(self.game.board, vertex) + def _find_boarder(self, current_board, vertex): + _, group = self._find_group(current_board, vertex) border = [] for b in group: for n in self._neighbor(b): @@ -248,7 +258,7 @@ class Go: start_vertex_x += x_diff start_vertex_y += y_diff - def _predict_from_nearby(self, vertex, neighbor_step=3): + def _predict_from_nearby(self, current_board, vertex, neighbor_step=3): ''' step: the nearby 3 steps is considered :vertex: position to be estimated @@ -264,38 +274,37 @@ class Go: self._add_nearby_stones(neighbor_vertex_set, vertex[0], vertex[1] - step, -1, 1, neighbor_step) color_estimate = 0 for neighbor_vertex in neighbor_vertex_set: - color_estimate += self.game.board[self.game._flatten(neighbor_vertex)] + color_estimate += current_board[self._flatten(neighbor_vertex)] if color_estimate > 0: return utils.BLACK elif color_estimate < 0: return utils.WHITE - def executor_get_score(self, is_unknown_estimation=False): + def executor_get_score(self, current_board, is_unknown_estimation=False): ''' is_unknown_estimation: whether use nearby stone to predict the unknown return score from BLACK perspective. ''' - _board = copy.copy(self.game.board) - while utils.EMPTY in self.game.board: - vertex = self._find_empty() - boarder = self._find_boarder(vertex) - boarder_color = set(map(lambda v: self.game.board[self.game._flatten(v)], boarder)) + _board = copy.deepcopy(current_board) + while utils.EMPTY in _board: + vertex = self._find_empty(_board) + boarder = self._find_boarder(_board, vertex) + boarder_color = set(map(lambda v: _board[self._flatten(v)], boarder)) if boarder_color == {utils.BLACK}: - self.game.board[self.game._flatten(vertex)] = utils.BLACK + _board[self._flatten(vertex)] = utils.BLACK elif boarder_color == {utils.WHITE}: - self.game.board[self.game._flatten(vertex)] = utils.WHITE + _board[self._flatten(vertex)] = utils.WHITE elif is_unknown_estimation: - self.game.board[self.game._flatten(vertex)] = self._predict_from_nearby(vertex) + _board[self._flatten(vertex)] = self._predict_from_nearby(_board, vertex) else: - self.game.board[self.game._flatten(vertex)] =utils.UNKNOWN + _board[self._flatten(vertex)] =utils.UNKNOWN score = 0 - for i in self.game.board: + for i in _board: if i == utils.BLACK: score += 1 elif i == utils.WHITE: score -= 1 - score -= self.game.komi + score -= self.komi - self.game.board = _board return score diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 562dd14..e18555f 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -82,7 +82,7 @@ if __name__ == '__main__': evaluate_rounds = 1 game_num = 0 try: - while True: + while game_num < evaluate_rounds: num = 0 pass_flag = [False, False] print("Start game {}".format(game_num)) @@ -132,6 +132,8 @@ if __name__ == '__main__': picklestring = cPickle.dump(data, file) data.reset() game_num += 1 + subprocess.call(["kill", "-9", str(agent_v0.pid)]) + subprocess.call(["kill", "-9", str(agent_v1.pid)]) except KeyboardInterrupt: subprocess.call(["kill", "-9", str(agent_v0.pid)]) subprocess.call(["kill", "-9", str(agent_v1.pid)]) diff --git a/AlphaGo/self-play.py b/AlphaGo/self-play.py index 63b7e97..4387b24 100644 --- a/AlphaGo/self-play.py +++ b/AlphaGo/self-play.py @@ -79,7 +79,7 @@ while True: prob.append(np.array(game.prob).reshape(-1, game.size ** 2 + 1)) print("Finished") print("\n") - score = game.game_engine.executor_get_score(True) + score = game.game_engine.executor_get_score(game.board, True) if score > 0: winner = utils.BLACK else: From 9ad53de54f0ef28aea0df9de31c9d2c405186d15 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Thu, 21 Dec 2017 23:30:24 +0800 Subject: [PATCH 24/36] implement the training process --- .gitignore | 1 + AlphaGo/game.py | 2 +- AlphaGo/model.py | 106 ++++++++++++++++++++++++++++++++++++++++++----- AlphaGo/play.py | 28 ++++++++----- 4 files changed, 114 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index 36d134c..d697b92 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ checkpoints checkpoints_origin *.json .DS_Store +data diff --git a/AlphaGo/game.py b/AlphaGo/game.py index bf0d084..c342d0c 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -60,7 +60,7 @@ class Game: def think(self, latest_boards, color): mcts = MCTS(self.game_engine, self.evaluator, [latest_boards, color], self.size ** 2 + 1, inverse=True) - mcts.search(max_step=1) + mcts.search(max_step=20) temp = 1 prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp) choice = np.random.choice(self.size ** 2 + 1, 1, p=prob).tolist()[0] diff --git a/AlphaGo/model.py b/AlphaGo/model.py index fab864e..41f3a47 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -2,6 +2,7 @@ import os import time import sys import cPickle +from collections import deque import numpy as np import tensorflow as tf @@ -71,6 +72,13 @@ def value_head(input, is_training): return h +class Data(object): + def __init__(self): + self.boards = [] + self.probs = [] + self.winner = 0 + + class ResNet(object): def __init__(self, board_size, action_num, history_length=1, residual_block_num=20, checkpoint_path=None): """ @@ -85,11 +93,18 @@ class ResNet(object): self.board_size = board_size self.action_num = action_num self.history_length = history_length + self.checkpoint_path = checkpoint_path self.x = tf.placeholder(tf.float32, shape=[None, self.board_size, self.board_size, 2 * self.history_length + 1]) self.is_training = tf.placeholder(tf.bool, shape=[]) self.z = tf.placeholder(tf.float32, shape=[None, 1]) self.pi = tf.placeholder(tf.float32, shape=[None, self.action_num]) - self._build_network(residual_block_num, checkpoint_path) + self._build_network(residual_block_num, self.checkpoint_path) + + # training hyper-parameters: + self.window_length = 1000 + self.save_freq = 1000 + self.training_data = {'states': deque(maxlen=self.window_length), 'probs': deque(maxlen=self.window_length), + 'winner': deque(maxlen=self.window_length)} def _build_network(self, residual_block_num, checkpoint_path): """ @@ -118,7 +133,7 @@ class ResNet(object): with tf.control_dependencies(self.update_ops): self.train_op = tf.train.AdamOptimizer(1e-4).minimize(self.total_loss) self.var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) - self.saver = tf.train.Saver(max_to_keep=10, var_list=self.var_list) + self.saver = tf.train.Saver(var_list=self.var_list) self.sess = multi_gpu.create_session() self.sess.run(tf.global_variables_initializer()) if checkpoint_path is not None: @@ -166,21 +181,90 @@ class ResNet(object): state[0, :, :, 2 * self.history_length] = np.zeros([self.board_size, self.board_size]) return state - #TODO: design the interface between the environment and training + # TODO: design the interface between the environment and training def train(self, mode='memory', *args, **kwargs): if mode == 'memory': pass if mode == 'file': - self.train_with_file(data_path=kwargs['data_path'], checkpoint_path=kwargs['checkpoint_path']) + self._train_with_file(data_path=kwargs['data_path'], batch_size=kwargs['batch_size'], + checkpoint_path=kwargs['checkpoint_path']) - def train_with_file(self, data_path, checkpoint_path): + def _train_with_file(self, data_path, batch_size, checkpoint_path): + # check if the path is valid if not os.path.exists(data_path): raise ValueError("{} doesn't exist".format(data_path)) + self.checkpoint_path = checkpoint_path + if not os.path.exists(self.checkpoint_path): + os.mkdir(self.checkpoint_path) - file_list = os.listdir(data_path) - if file_list <= 50: - time.sleep(1) - else: - file_list.sort(key=lambda file: os.path.getmtime(data_path + file) if not os.path.isdir( - data_path + file) else 0) + new_file_list = [] + all_file_list = [] + training_data = {} + iters = 0 + while True: + new_file_list = list(set(os.listdir(data_path)).difference(all_file_list)) + all_file_list = os.listdir(data_path) + new_file_list.sort( + key=lambda file: os.path.getmtime(data_path + file) if not os.path.isdir(data_path + file) else 0) + if new_file_list: + for file in new_file_list: + states, probs, winner = self._file_to_training_data(data_path + file) + assert states.shape[0] == probs.shape[0] + assert states.shape[0] == winner.shape[0] + self.training_data['states'].append(states) + self.training_data['probs'].append(probs) + self.training_data['winner'].append(winner) + training_data['states'] = np.concatenate(self.training_data['states'], axis=0) + training_data['probs'] = np.concatenate(self.training_data['probs'], axis=0) + training_data['winner'] = np.concatenate(self.training_data['winner'], axis=0) + if len(self.training_data['states']) != self.window_length: + continue + else: + data_num = training_data['states'].shape[0] + index = np.arange(data_num) + np.random.shuffle(index) + start_time = time.time() + value_loss, policy_loss, reg, _ = self.sess.run( + [self.value_loss, self.policy_loss, self.reg, self.train_op], + feed_dict={self.x: training_data['states'][index[:batch_size]], + self.z: training_data['winner'][index[:batch_size]], + self.pi: training_data['probs'][index[:batch_size]], + self.is_training: True}) + print("Iteration: {}, Time: {}, Value Loss: {}, Policy Loss: {}, Reg: {}".format(iters, + time.time() - start_time, + value_loss, + policy_loss, reg)) + iters += 1 + if iters % self.save_freq == 0: + save_path = "Iteration{}.ckpt".format(iters) + self.saver.save(self.sess, self.checkpoint_path + save_path) + + def _file_to_training_data(self, file_name): + with open(file_name, 'r') as file: + data = cPickle.load(file) + history = deque(maxlen=self.history_length) + states = [] + probs = [] + winner = [] + for _ in range(self.history_length): + # Note that 0 is specified, need a more general way like config + history.append([0] * self.board_size ** 2) + # Still, +1 is specified + color = +1 + + for [board, prob] in zip(data.boards, data.probs): + history.append(board) + states.append(self._history2state(history, color)) + probs.append(np.array(prob).reshape(1, self.board_size ** 2 + 1)) + winner.append(np.array(data.winner).reshape(1, 1)) + color *= -1 + states = np.concatenate(states, axis=0) + probs = np.concatenate(probs, axis=0) + winner = np.concatenate(winner, axis=0) + return states, probs, winner + + +if __name__=="__main__": + model = ResNet(board_size=9, action_num=82) + model.train("file", data_path="./data/", batch_size=128, checkpoint_path="./checkpoint/") \ No newline at end of file diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 562dd14..bd3776e 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -76,6 +76,7 @@ if __name__ == '__main__': color = ['b', 'w'] pattern = "[A-Z]{1}[0-9]{1}" + space = re.compile("\s+") size = 9 show = ['.', 'X', 'O'] @@ -83,12 +84,20 @@ if __name__ == '__main__': game_num = 0 try: while True: + start_time = time.time() num = 0 pass_flag = [False, False] print("Start game {}".format(game_num)) # end the game if both palyer chose to pass, or play too much turns while not (pass_flag[0] and pass_flag[1]) and num < size ** 2 * 2: turn = num % 2 + board = player[turn].run_cmd(str(num) + ' show_board') + board = eval(board[board.index('['):board.index(']') + 1]) + for i in range(size): + for j in range(size): + print show[board[i * size + j]] + " ", + print "\n", + data.boards.append(board) move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn] + '\n') print role[turn] + " : " + str(move), num += 1 @@ -102,21 +111,18 @@ if __name__ == '__main__': play_or_pass = ' PASS' pass_flag[turn] = True result = player[1 - turn].run_cmd(str(num) + ' play ' + color[turn] + ' ' + play_or_pass + '\n') - board = player[turn].run_cmd(str(num) + ' show_board') - board = eval(board[board.index('['):board.index(']') + 1]) - for i in range(size): - for j in range(size): - print show[board[i * size + j]] + " ", - print "\n", - data.boards.append(board) prob = player[turn].run_cmd(str(num) + ' get_prob') + prob = space.sub(',', prob[prob.index('['):prob.index(']') + 1]) + prob = prob.replace('[,', '[') + prob = prob.replace('],', ']') + prob = eval(prob) data.probs.append(prob) score = player[turn].run_cmd(str(num) + ' get_score') print "Finished : ", score.split(" ")[1] # TODO: generalize the player - if score > 0: + if eval(score.split(" ")[1]) > 0: data.winner = 1 - if score < 0: + if eval(score.split(" ")[1]) < 0: data.winner = -1 player[0].run_cmd(str(num) + ' clear_board') player[1].run_cmd(str(num) + ' clear_board') @@ -127,12 +133,12 @@ if __name__ == '__main__': file_list.sort(key=lambda file: os.path.getmtime(args.result_path + file) if not os.path.isdir( args.result_path + file) else 0) data_num = eval(file_list[-1][:-4]) + 1 - print(file_list) with open("./data/" + str(data_num) + ".pkl", "w") as file: picklestring = cPickle.dump(data, file) data.reset() game_num += 1 - except KeyboardInterrupt: + print("Time {}".format(time.time()-start_time)) + except Exception: subprocess.call(["kill", "-9", str(agent_v0.pid)]) subprocess.call(["kill", "-9", str(agent_v1.pid)]) print "Kill all player, finish all game." From 43f6527d8e4ebaec6b9c001361db689090127e87 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Thu, 21 Dec 2017 23:55:31 +0800 Subject: [PATCH 25/36] modify for multi instance --- AlphaGo/play.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/AlphaGo/play.py b/AlphaGo/play.py index 35549dd..a9d3d20 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -28,6 +28,7 @@ if __name__ == '__main__': parser.add_argument("--result_path", type=str, default="./data/") parser.add_argument("--black_weight_path", type=str, default=None) parser.add_argument("--white_weight_path", type=str, default=None) + parser.add_argument("--id", type=int, default=0) args = parser.parse_args() if not os.path.exists(args.result_path): @@ -50,12 +51,15 @@ if __name__ == '__main__': time.sleep(1) # start two different player with different network weights. + black_role_name = 'black' + str(args.id) + white_role_name = 'white' + str(args.id) + agent_v0 = subprocess.Popen( - ['python', '-u', 'player.py', '--role=black', '--checkpoint_path=' + str(args.black_weight_path)], + ['python', '-u', 'player.py', '--role=' + black_role_name, '--checkpoint_path=' + str(args.black_weight_path)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) agent_v1 = subprocess.Popen( - ['python', '-u', 'player.py', '--role=white', '--checkpoint_path=' + str(args.white_weight_path)], + ['python', '-u', 'player.py', '--role=' + white_role_name, '--checkpoint_path=' + str(args.white_weight_path)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) server_list = "" @@ -69,8 +73,8 @@ if __name__ == '__main__': data = Data() player = [None] * 2 - player[0] = Pyro4.Proxy("PYRONAME:black") - player[1] = Pyro4.Proxy("PYRONAME:white") + player[0] = Pyro4.Proxy("PYRONAME:" + black_role_name) + player[1] = Pyro4.Proxy("PYRONAME:" + white_role_name) role = ["BLACK", "WHITE"] color = ['b', 'w'] From 6835ec62e14c63703a46a4adb8df677d6a14a0b3 Mon Sep 17 00:00:00 2001 From: rtz19970824 Date: Fri, 22 Dec 2017 00:04:51 +0800 Subject: [PATCH 26/36] multi-instance support --- AlphaGo/play.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/AlphaGo/play.py b/AlphaGo/play.py index a9d3d20..a8267a7 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -41,14 +41,14 @@ if __name__ == '__main__': raise ValueError("Can't not find the network weights for white player.") # kill the old server - kill_old_server = subprocess.Popen(['killall', 'pyro4-ns']) - print "kill the old pyro4 name server, the return code is : " + str(kill_old_server.wait()) - time.sleep(1) + # kill_old_server = subprocess.Popen(['killall', 'pyro4-ns']) + # print "kill the old pyro4 name server, the return code is : " + str(kill_old_server.wait()) + # time.sleep(1) # start a name server to find the remote object - start_new_server = subprocess.Popen(['pyro4-ns', '&']) - print "Start Name Sever : " + str(start_new_server.pid) # + str(start_new_server.wait()) - time.sleep(1) + # start_new_server = subprocess.Popen(['pyro4-ns', '&']) + # print "Start Name Sever : " + str(start_new_server.pid) # + str(start_new_server.wait()) + # time.sleep(1) # start two different player with different network weights. black_role_name = 'black' + str(args.id) @@ -63,7 +63,7 @@ if __name__ == '__main__': stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) server_list = "" - while ("black" not in server_list) or ("white" not in server_list): + while (black_role_name not in server_list) or (white_role_name not in server_list): server_list = subprocess.check_output(['pyro4-nsc', 'list']) print "Waiting for the server start..." time.sleep(1) @@ -142,11 +142,12 @@ if __name__ == '__main__': data.reset() game_num += 1 - except Exception: + except Exception as e: + print(e) subprocess.call(["kill", "-9", str(agent_v0.pid)]) subprocess.call(["kill", "-9", str(agent_v1.pid)]) print "Kill all player, finish all game." subprocess.call(["kill", "-9", str(agent_v0.pid)]) subprocess.call(["kill", "-9", str(agent_v1.pid)]) - print "Kill all player, finish all game." \ No newline at end of file + print "Kill all player, finish all game." From 1cc5063007925ceada46974f21aaf03a2361deee Mon Sep 17 00:00:00 2001 From: Haosheng Zou Date: Fri, 22 Dec 2017 00:22:23 +0800 Subject: [PATCH 27/36] add value_function (critic). value_function and policy not finished yet. --- tianshou/core/policy/base.py | 2 +- tianshou/core/policy/dqn.py | 11 ++++ tianshou/core/value_function/__init__.py | 0 tianshou/core/value_function/action_value.py | 53 ++++++++++++++++++++ tianshou/core/value_function/base.py | 23 +++++++++ tianshou/core/value_function/state_value.py | 23 +++++++++ 6 files changed, 111 insertions(+), 1 deletion(-) create mode 100644 tianshou/core/value_function/__init__.py create mode 100644 tianshou/core/value_function/action_value.py create mode 100644 tianshou/core/value_function/base.py create mode 100644 tianshou/core/value_function/state_value.py diff --git a/tianshou/core/policy/base.py b/tianshou/core/policy/base.py index eecfc4f..025abd5 100644 --- a/tianshou/core/policy/base.py +++ b/tianshou/core/policy/base.py @@ -15,7 +15,7 @@ __all__ = [ 'QValuePolicy', ] -# TODO: separate actor and critic, we should focus on it once we finish the basic module. +# TODO: a even more "base" class for policy class QValuePolicy(object): diff --git a/tianshou/core/policy/dqn.py b/tianshou/core/policy/dqn.py index 39f6a16..d03dbd4 100644 --- a/tianshou/core/policy/dqn.py +++ b/tianshou/core/policy/dqn.py @@ -1,5 +1,16 @@ from tianshou.core.policy.base import QValuePolicy import tensorflow as tf +import sys +sys.path.append('..') +import value_function.action_value as value_func + + +class DQN_refactor(object): + """ + use DQN from value_function as a member + """ + def __init__(self, value_tensor, observation_placeholder, action_placeholder): + self._network = value_func.DQN(value_tensor, observation_placeholder, action_placeholder) class DQN(QValuePolicy): diff --git a/tianshou/core/value_function/__init__.py b/tianshou/core/value_function/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tianshou/core/value_function/action_value.py b/tianshou/core/value_function/action_value.py new file mode 100644 index 0000000..cb8acc8 --- /dev/null +++ b/tianshou/core/value_function/action_value.py @@ -0,0 +1,53 @@ +from base import ValueFunctionBase +import tensorflow as tf + + +class ActionValue(ValueFunctionBase): + """ + class of action values Q(s, a). + """ + def __init__(self, value_tensor, observation_placeholder, action_placeholder): + self._action_placeholder = action_placeholder + super(ActionValue, self).__init__( + value_tensor=value_tensor, + observation_placeholder=observation_placeholder + ) + + def get_value(self, observation, action): + """ + + :param observation: numpy array of observations, of shape (batchsize, observation_dim). + :param action: numpy array of actions, of shape (batchsize, action_dim) + # TODO: Atari discrete action should have dim 1. Super Mario may should have, say, dim 5, where each can be 0/1 + :return: numpy array of state values, of shape (batchsize, ) + # TODO: dealing with the last dim of 1 in V(s) and Q(s, a) + """ + sess = tf.get_default_session() + return sess.run(self.get_value_tensor(), feed_dict= + {self._observation_placeholder: observation, self._action_placeholder:action})[:, 0] + + +class DQN(ActionValue): + """ + class of the very DQN architecture. Instead of feeding s and a to the network to get a value, DQN feed s to the + network and the last layer is Q(s, *) for all actions + """ + def __init__(self, value_tensor, observation_placeholder, action_placeholder): + """ + :param value_tensor: of shape (batchsize, num_actions) + :param observation_placeholder: of shape (batchsize, observation_dim) + :param action_placeholder: of shape (batchsize, ) + """ + self._value_tensor_all_actions = value_tensor + canonical_value_tensor = value_tensor[action_placeholder] # maybe a tf.map_fn. for now it's wrong + + super(DQN, self).__init__(value_tensor=canonical_value_tensor, + observation_placeholder=observation_placeholder, + action_placeholder=action_placeholder) + + def get_value_all_actions(self, observation): + sess = tf.get_default_session() + return sess.run(self._value_tensor_all_actions, feed_dict={self._observation_placeholder: observation}) + + def get_value_tensor_all_actions(self): + return self._value_tensor_all_actions \ No newline at end of file diff --git a/tianshou/core/value_function/base.py b/tianshou/core/value_function/base.py new file mode 100644 index 0000000..0b27759 --- /dev/null +++ b/tianshou/core/value_function/base.py @@ -0,0 +1,23 @@ + +# TODO: linear feature baseline also in tf? +class ValueFunctionBase(object): + """ + base class of value functions. Children include state values V(s) and action values Q(s, a) + """ + def __init__(self, value_tensor, observation_placeholder): + self._observation_placeholder = observation_placeholder + self._value_tensor = value_tensor + + def get_value(self, **kwargs): + """ + + :return: batch of corresponding values in numpy array + """ + raise NotImplementedError() + + def get_value_tensor(self): + """ + + :return: tensor of the corresponding values + """ + return self._value_tensor diff --git a/tianshou/core/value_function/state_value.py b/tianshou/core/value_function/state_value.py new file mode 100644 index 0000000..04fe442 --- /dev/null +++ b/tianshou/core/value_function/state_value.py @@ -0,0 +1,23 @@ +from base import ValueFunctionBase +import tensorflow as tf + + +class StateValue(ValueFunctionBase): + """ + class of state values V(s). + """ + def __init__(self, value_tensor, observation_placeholder): + super(StateValue, self).__init__( + value_tensor=value_tensor, + observation_placeholder=observation_placeholder + ) + + def get_value(self, observation): + """ + + :param observation: numpy array of observations, of shape (batchsize, observation_dim). + :return: numpy array of state values, of shape (batchsize, ) + # TODO: dealing with the last dim of 1 in V(s) and Q(s, a) + """ + sess = tf.get_default_session() + return sess.run(self.get_value_tensor(), feed_dict={self._observation_placeholder: observation})[:, 0] \ No newline at end of file From 5c29dad26367ba76c1fbe4a19213c0bf9ae7391e Mon Sep 17 00:00:00 2001 From: JialianLee Date: Fri, 22 Dec 2017 01:57:48 +0800 Subject: [PATCH 28/36] An initial version for Reversi --- AlphaGo/reversi.py | 252 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 252 insertions(+) create mode 100644 AlphaGo/reversi.py diff --git a/AlphaGo/reversi.py b/AlphaGo/reversi.py new file mode 100644 index 0000000..49d0e9a --- /dev/null +++ b/AlphaGo/reversi.py @@ -0,0 +1,252 @@ +from __future__ import print_function +import numpy as np + +''' +Settings of the Go game. + +(1, 1) is considered as the upper left corner of the board, +(size, 1) is the lower left +''' + + +def find_correct_moves(own, enemy): + """return legal moves""" + left_right_mask = 0x7e7e7e7e7e7e7e7e # Both most left-right edge are 0, else 1 + top_bottom_mask = 0x00ffffffffffff00 # Both most top-bottom edge are 0, else 1 + mask = left_right_mask & top_bottom_mask + mobility = 0 + mobility |= search_offset_left(own, enemy, left_right_mask, 1) # Left + mobility |= search_offset_left(own, enemy, mask, 9) # Left Top + mobility |= search_offset_left(own, enemy, top_bottom_mask, 8) # Top + mobility |= search_offset_left(own, enemy, mask, 7) # Top Right + mobility |= search_offset_right(own, enemy, left_right_mask, 1) # Right + mobility |= search_offset_right(own, enemy, mask, 9) # Bottom Right + mobility |= search_offset_right(own, enemy, top_bottom_mask, 8) # Bottom + mobility |= search_offset_right(own, enemy, mask, 7) # Left bottom + return mobility + + +def calc_flip(pos, own, enemy): + """return flip stones of enemy by bitboard when I place stone at pos. + + :param pos: 0~63 + :param own: bitboard (0=top left, 63=bottom right) + :param enemy: bitboard + :return: flip stones of enemy when I place stone at pos. + """ + assert 0 <= pos <= 63, f"pos={pos}" + f1 = _calc_flip_half(pos, own, enemy) + f2 = _calc_flip_half(63 - pos, rotate180(own), rotate180(enemy)) + return f1 | rotate180(f2) + + +def _calc_flip_half(pos, own, enemy): + el = [enemy, enemy & 0x7e7e7e7e7e7e7e7e, enemy & 0x7e7e7e7e7e7e7e7e, enemy & 0x7e7e7e7e7e7e7e7e] + masks = [0x0101010101010100, 0x00000000000000fe, 0x0002040810204080, 0x8040201008040200] + masks = [b64(m << pos) for m in masks] + flipped = 0 + for e, mask in zip(el, masks): + outflank = mask & ((e | ~mask) + 1) & own + flipped |= (outflank - (outflank != 0)) & mask + return flipped + + +def search_offset_left(own, enemy, mask, offset): + e = enemy & mask + blank = ~(own | enemy) + t = e & (own >> offset) + t |= e & (t >> offset) + t |= e & (t >> offset) + t |= e & (t >> offset) + t |= e & (t >> offset) + t |= e & (t >> offset) # Up to six stones can be turned at once + return blank & (t >> offset) # Only the blank squares can be started + + +def search_offset_right(own, enemy, mask, offset): + e = enemy & mask + blank = ~(own | enemy) + t = e & (own << offset) + t |= e & (t << offset) + t |= e & (t << offset) + t |= e & (t << offset) + t |= e & (t << offset) + t |= e & (t << offset) # Up to six stones can be turned at once + return blank & (t << offset) # Only the blank squares can be started + + +def flip_vertical(x): + k1 = 0x00FF00FF00FF00FF + k2 = 0x0000FFFF0000FFFF + x = ((x >> 8) & k1) | ((x & k1) << 8) + x = ((x >> 16) & k2) | ((x & k2) << 16) + x = (x >> 32) | b64(x << 32) + return x + + +def b64(x): + return x & 0xFFFFFFFFFFFFFFFF + + +def bit_count(x): + return bin(x).count('1') + + +def bit_to_array(x, size): + """bit_to_array(0b0010, 4) -> array([0, 1, 0, 0])""" + return np.array(list(reversed((("0" * size) + bin(x)[2:])[-size:])), dtype=np.uint8) + + +def flip_diag_a1h8(x): + k1 = 0x5500550055005500 + k2 = 0x3333000033330000 + k4 = 0x0f0f0f0f00000000 + t = k4 & (x ^ b64(x << 28)) + x ^= t ^ (t >> 28) + t = k2 & (x ^ b64(x << 14)) + x ^= t ^ (t >> 14) + t = k1 & (x ^ b64(x << 7)) + x ^= t ^ (t >> 7) + return x + + +def rotate90(x): + return flip_diag_a1h8(flip_vertical(x)) + + +def rotate180(x): + return rotate90(rotate90(x)) + + +class Reversi: + def __init__(self, black=None, white=None): + self.black = black or (0b00001000 << 24 | 0b00010000 << 32) + self.white = white or (0b00010000 << 24 | 0b00001000 << 32) + self.board = None # 8 * 8 board with 1 for black, -1 for white and 0 for blank + self.color = None # 1 for black and -1 for white + self.action = None # number in 0~63 + self.winner = None + + def simulate_is_valid(self, board, color): + self.board = board + self.color = color + self.board2bitboard() + own, enemy = self.get_own_and_enemy() + mobility = find_correct_moves(own, enemy) + valid_moves = bit_to_array(mobility, 64) + valid_moves = list(np.reshape(valid_moves, len(valid_moves))) + return valid_moves + + def simulate_step_forward(self, board, color, vertex): + self.board = board + self.color = color + self.board2bitboard() + self.vertex2action(vertex) + step_forward = self.step() + if step_forward: + new_board = self.bitboard2board() + return new_board + + def executor_do_move(self, board, color, vertex): + self.board = board + self.color = color + self.board2bitboard() + self.vertex2action(vertex) + step_forward = self.step() + if step_forward: + new_board = self.bitboard2board() + return new_board + + def executor_get_score(self, board): + self.board = board + self._game_over() + if self.winner is not None: + return self.winner, 0 - self.winner + else: + ValueError("Game not finished!") + + def board2bitboard(self): + count = 1 + if self.board is None: + ValueError("None board!") + self.black = 0 + self.white = 0 + for i in range(64): + if self.board[i] == 1: + self.black |= count + elif self.board[i] == -1: + self.white |= count + count *= 2 + + def vertex2action(self, vertex): + x, y = vertex + if x == 0 and y == 0: + self.action = None + else: + self.action = 8 * (x - 1) + y - 1 + + def bitboard2board(self): + board = [] + black = bit_to_array(self.black, 64) + white = bit_to_array(self.white, 64) + for i in range(64): + if black[i]: + board.append(1) + elif white[i]: + board.append(-1) + else: + board.append(0) + return board + + def step(self): + if self.action < 0 or self.action > 63: + ValueError("Wrong action!") + if self.action is None: + return False + + own, enemy = self.get_own_and_enemy() + + flipped = calc_flip(self.action, own, enemy) + if bit_count(flipped) == 0: + self.illegal_move_to_lose(self.action) + return False + own ^= flipped + own |= 1 << self.action + enemy ^= flipped + + self.set_own_and_enemy(own, enemy) + return True + + def _game_over(self): + # self.done = True + if self.winner is None: + black_num, white_num = self.number_of_black_and_white + if black_num > white_num: + self.winner = 1 + elif black_num < white_num: + self.winner = -1 + else: + self.winner = 0 + + def illegal_move_to_lose(self, action): + logger.warning(f"Illegal action={action}, No Flipped!") + self._game_over() + + def get_own_and_enemy(self): + if self.color == 1: + own, enemy = self.black, self.white + elif self.color == -1: + own, enemy = self.white, self.black + else: + own, enemy = None, None + return own, enemy + + def set_own_and_enemy(self, own, enemy): + if self.color == 1: + self.black, self.white = own, enemy + else: + self.white, self.black = own, enemy + + @property + def number_of_black_and_white(self): + return bit_count(self.black), bit_count(self.white) From 2b1285143c232bc4006f47eabb498b99baf59785 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Fri, 22 Dec 2017 13:04:02 +0800 Subject: [PATCH 29/36] debug the training process, initialize a nameserver if no nameserver exists --- AlphaGo/model.py | 15 ++++++++++++--- AlphaGo/play.py | 10 ++++++++-- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/AlphaGo/model.py b/AlphaGo/model.py index 41f3a47..541de81 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -203,7 +203,8 @@ class ResNet(object): iters = 0 while True: new_file_list = list(set(os.listdir(data_path)).difference(all_file_list)) - all_file_list = os.listdir(data_path) + if new_file_list: + all_file_list = os.listdir(data_path) new_file_list.sort( key=lambda file: os.path.getmtime(data_path + file) if not os.path.isdir(data_path + file) else 0) if new_file_list: @@ -241,8 +242,16 @@ class ResNet(object): self.saver.save(self.sess, self.checkpoint_path + save_path) def _file_to_training_data(self, file_name): - with open(file_name, 'r') as file: - data = cPickle.load(file) + read = False + with open(file_name, 'rb') as file: + while not read: + try: + file.seek(0) + data = cPickle.load(file) + read = True + except Exception as e: + print(e) + time.sleep(1) history = deque(maxlen=self.history_length) states = [] probs = [] diff --git a/AlphaGo/play.py b/AlphaGo/play.py index a8267a7..3681430 100644 --- a/AlphaGo/play.py +++ b/AlphaGo/play.py @@ -50,6 +50,12 @@ if __name__ == '__main__': # print "Start Name Sever : " + str(start_new_server.pid) # + str(start_new_server.wait()) # time.sleep(1) + # start a name server if no name server exists + if len(os.popen('ps aux | grep pyro4-ns | grep -v grep').readlines()) == 0: + start_new_server = subprocess.Popen(['pyro4-ns', '&']) + print "Start Name Sever : " + str(start_new_server.pid) # + str(start_new_server.wait()) + time.sleep(1) + # start two different player with different network weights. black_role_name = 'black' + str(args.id) white_role_name = 'white' + str(args.id) @@ -137,13 +143,13 @@ if __name__ == '__main__': file_list.sort(key=lambda file: os.path.getmtime(args.result_path + file) if not os.path.isdir( args.result_path + file) else 0) data_num = eval(file_list[-1][:-4]) + 1 - with open("./data/" + str(data_num) + ".pkl", "w") as file: + with open("./data/" + str(data_num) + ".pkl", "wb") as file: picklestring = cPickle.dump(data, file) data.reset() game_num += 1 except Exception as e: - print(e) + print(e) subprocess.call(["kill", "-9", str(agent_v0.pid)]) subprocess.call(["kill", "-9", str(agent_v1.pid)]) print "Kill all player, finish all game." From d281ecc6e082027e7f67341a0abf1c18dbacbae8 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Fri, 22 Dec 2017 13:05:01 +0800 Subject: [PATCH 30/36] no restrict on saving checkpoints --- AlphaGo/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AlphaGo/model.py b/AlphaGo/model.py index 541de81..5629128 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -133,7 +133,7 @@ class ResNet(object): with tf.control_dependencies(self.update_ops): self.train_op = tf.train.AdamOptimizer(1e-4).minimize(self.total_loss) self.var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) - self.saver = tf.train.Saver(var_list=self.var_list) + self.saver = tf.train.Saver(max_to_keep=0, var_list=self.var_list) self.sess = multi_gpu.create_session() self.sess.run(tf.global_variables_initializer()) if checkpoint_path is not None: From 6b3efd7fca0f4e2eb7ac4e63524a30976efc4361 Mon Sep 17 00:00:00 2001 From: rtz19970824 Date: Fri, 22 Dec 2017 13:30:48 +0800 Subject: [PATCH 31/36] modify the training config --- AlphaGo/model.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/AlphaGo/model.py b/AlphaGo/model.py index 41f3a47..0d885ef 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -101,8 +101,8 @@ class ResNet(object): self._build_network(residual_block_num, self.checkpoint_path) # training hyper-parameters: - self.window_length = 1000 - self.save_freq = 1000 + self.window_length = 7000 + self.save_freq = 5000 self.training_data = {'states': deque(maxlen=self.window_length), 'probs': deque(maxlen=self.window_length), 'winner': deque(maxlen=self.window_length)} @@ -241,6 +241,7 @@ class ResNet(object): self.saver.save(self.sess, self.checkpoint_path + save_path) def _file_to_training_data(self, file_name): + print(file_name) with open(file_name, 'r') as file: data = cPickle.load(file) history = deque(maxlen=self.history_length) @@ -267,4 +268,4 @@ class ResNet(object): if __name__=="__main__": model = ResNet(board_size=9, action_num=82) - model.train("file", data_path="./data/", batch_size=128, checkpoint_path="./checkpoint/") \ No newline at end of file + model.train("file", data_path="./data/", batch_size=128, checkpoint_path="./checkpoint/") From a8509ba2921795002bd88942bf58523aba80de99 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Fri, 22 Dec 2017 13:42:53 +0800 Subject: [PATCH 32/36] faster the loading --- AlphaGo/model.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/AlphaGo/model.py b/AlphaGo/model.py index 5629128..c4338c8 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -215,9 +215,10 @@ class ResNet(object): self.training_data['states'].append(states) self.training_data['probs'].append(probs) self.training_data['winner'].append(winner) - training_data['states'] = np.concatenate(self.training_data['states'], axis=0) - training_data['probs'] = np.concatenate(self.training_data['probs'], axis=0) - training_data['winner'] = np.concatenate(self.training_data['winner'], axis=0) + if len(self.training_data['states']) == self.window_length: + training_data['states'] = np.concatenate(self.training_data['states'], axis=0) + training_data['probs'] = np.concatenate(self.training_data['probs'], axis=0) + training_data['winner'] = np.concatenate(self.training_data['winner'], axis=0) if len(self.training_data['states']) != self.window_length: continue From 8328153b86871f36953605ebd89e17c001b3f537 Mon Sep 17 00:00:00 2001 From: rtz19970824 Date: Fri, 22 Dec 2017 13:47:27 +0800 Subject: [PATCH 33/36] print in the loading process --- AlphaGo/model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/AlphaGo/model.py b/AlphaGo/model.py index 15fc3da..e8b5eb9 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -249,6 +249,7 @@ class ResNet(object): file.seek(0) data = cPickle.load(file) read = True + print("{} Loaded".format(file_name)) except Exception as e: print(e) time.sleep(1) From 511f64b3d6ada98d4fe0e04215eea93d690f56a4 Mon Sep 17 00:00:00 2001 From: JialianLee Date: Fri, 22 Dec 2017 15:26:47 +0800 Subject: [PATCH 34/36] Modification for reversi --- AlphaGo/reversi.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/AlphaGo/reversi.py b/AlphaGo/reversi.py index 49d0e9a..cba91d9 100644 --- a/AlphaGo/reversi.py +++ b/AlphaGo/reversi.py @@ -34,7 +34,6 @@ def calc_flip(pos, own, enemy): :param enemy: bitboard :return: flip stones of enemy when I place stone at pos. """ - assert 0 <= pos <= 63, f"pos={pos}" f1 = _calc_flip_half(pos, own, enemy) f2 = _calc_flip_half(63 - pos, rotate180(own), rotate180(enemy)) return f1 | rotate180(f2) @@ -125,7 +124,14 @@ class Reversi: self.board = None # 8 * 8 board with 1 for black, -1 for white and 0 for blank self.color = None # 1 for black and -1 for white self.action = None # number in 0~63 - self.winner = None + # self.winner = None + self.black_win = None + + def get_board(self, black=None, white=None): + self.black = black or (0b00001000 << 24 | 0b00010000 << 32) + self.white = white or (0b00010000 << 24 | 0b00001000 << 32) + self.board = self.bitboard2board() + return self.board def simulate_is_valid(self, board, color): self.board = board @@ -134,18 +140,19 @@ class Reversi: own, enemy = self.get_own_and_enemy() mobility = find_correct_moves(own, enemy) valid_moves = bit_to_array(mobility, 64) + valid_moves = np.argwhere(valid_moves) valid_moves = list(np.reshape(valid_moves, len(valid_moves))) return valid_moves - def simulate_step_forward(self, board, color, vertex): - self.board = board - self.color = color + def simulate_step_forward(self, state, vertex): + self.board = state[0] + self.color = state[1] self.board2bitboard() self.vertex2action(vertex) step_forward = self.step() if step_forward: new_board = self.bitboard2board() - return new_board + return [new_board, 0 - self.color], 0 def executor_do_move(self, board, color, vertex): self.board = board @@ -155,13 +162,14 @@ class Reversi: step_forward = self.step() if step_forward: new_board = self.bitboard2board() - return new_board + for i in range(64): + board[i] = new_board[i] def executor_get_score(self, board): self.board = board self._game_over() - if self.winner is not None: - return self.winner, 0 - self.winner + if self.black_win is not None: + return self.black_win else: ValueError("Game not finished!") @@ -219,6 +227,7 @@ class Reversi: def _game_over(self): # self.done = True + ''' if self.winner is None: black_num, white_num = self.number_of_black_and_white if black_num > white_num: @@ -227,9 +236,12 @@ class Reversi: self.winner = -1 else: self.winner = 0 + ''' + if self.black_win is None: + black_num, white_num = self.number_of_black_and_white + self.black_win = black_num - white_num def illegal_move_to_lose(self, action): - logger.warning(f"Illegal action={action}, No Flipped!") self._game_over() def get_own_and_enemy(self): From c5e33af84173b4c5165e4a51600232daa1485cff Mon Sep 17 00:00:00 2001 From: Dong Yan Date: Fri, 22 Dec 2017 15:44:44 +0800 Subject: [PATCH 35/36] move the unit test of is_eye into go.py --- AlphaGo/go.py | 39 +++++++ AlphaGo/unit_test.py | 266 ------------------------------------------- 2 files changed, 39 insertions(+), 266 deletions(-) delete mode 100644 AlphaGo/unit_test.py diff --git a/AlphaGo/go.py b/AlphaGo/go.py index 9b7e21f..661d918 100644 --- a/AlphaGo/go.py +++ b/AlphaGo/go.py @@ -308,3 +308,42 @@ class Go: return score +if __name__ == "__main__": + ### do unit test for Go class + pure_test = [ + 0, 1, 0, 1, 0, 1, 0, 0, 0, + 1, 0, 1, 0, 1, 0, 0, 0, 0, + 0, 1, 0, 1, 0, 0, 1, 0, 0, + 0, 0, 1, 0, 0, 1, 0, 1, 0, + 0, 0, 0, 0, 0, 1, 1, 1, 0, + 1, 1, 1, 0, 0, 0, 0, 0, 0, + 1, 0, 1, 0, 0, 1, 1, 0, 0, + 1, 1, 1, 0, 1, 0, 1, 0, 0, + 0, 0, 0, 0, 1, 1, 1, 0, 0 + ] + + pt_qry = [(1, 1), (1, 5), (3, 3), (4, 7), (7, 2), (8, 6)] + pt_ans = [True, True, True, True, True, True] + + opponent_test = [ + 0, 1, 0, 1, 0, 1, 0,-1, 1, + 1,-1, 0,-1, 1,-1, 0, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, + 1, 1,-1, 0, 1,-1, 1, 0, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 0, + -1,1, 1, 0, 1, 1, 1, 0, 0, + 0, 1,-1, 0,-1,-1,-1, 0, 0, + 1, 0, 1, 0,-1, 0,-1, 0, 0, + 0, 1, 0, 0,-1,-1,-1, 0, 0 + ] + ot_qry = [(1, 1), (1, 5), (2, 9), (5, 2), (5, 6), (8, 6), (8, 2)] + ot_ans = [False, False, False, False, False, False, True] + + go = Go(size=9, komi=3.75) + for i in range(6): + print (go._is_eye(pure_test, utils.BLACK, pt_qry[i])) + print("Test of pure eye\n") + + for i in range(7): + print (go._is_eye(opponent_test, utils.BLACK, ot_qry[i])) + print("Test of eye surrend by opponents\n") diff --git a/AlphaGo/unit_test.py b/AlphaGo/unit_test.py deleted file mode 100644 index 7a33b8e..0000000 --- a/AlphaGo/unit_test.py +++ /dev/null @@ -1,266 +0,0 @@ -import numpy as np -import sys -from game import Game -from engine import GTPEngine -import utils -import time -import copy -import network_small -import tensorflow as tf -from collections import deque -from tianshou.core.mcts.mcts import MCTS - -DELTA = [[1, 0], [-1, 0], [0, -1], [0, 1]] -CORNER_OFFSET = [[-1, -1], [-1, 1], [1, 1], [1, -1]] - -class GoEnv: - def __init__(self, size=9, komi=6.5): - self.size = size - self.komi = komi - self.board = [utils.EMPTY] * (self.size * self.size) - self.history = deque(maxlen=8) - - def _set_board(self, board): - self.board = board - - def _flatten(self, vertex): - x, y = vertex - return (x - 1) * self.size + (y - 1) - - def _bfs(self, vertex, color, block, status, alive_break): - block.append(vertex) - status[self._flatten(vertex)] = True - nei = self._neighbor(vertex) - for n in nei: - if not status[self._flatten(n)]: - if self.board[self._flatten(n)] == color: - self._bfs(n, color, block, status, alive_break) - - def _find_block(self, vertex, alive_break=False): - block = [] - status = [False] * (self.size * self.size) - color = self.board[self._flatten(vertex)] - self._bfs(vertex, color, block, status, alive_break) - - for b in block: - for n in self._neighbor(b): - if self.board[self._flatten(n)] == utils.EMPTY: - return False, block - return True, block - - def _is_qi(self, color, vertex): - nei = self._neighbor(vertex) - for n in nei: - if self.board[self._flatten(n)] == utils.EMPTY: - return True - - self.board[self._flatten(vertex)] = color - for n in nei: - if self.board[self._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n) - if can_kill: - self.board[self._flatten(vertex)] = utils.EMPTY - return True - - ### avoid suicide - can_kill, block = self._find_block(vertex) - if can_kill: - self.board[self._flatten(vertex)] = utils.EMPTY - return False - - self.board[self._flatten(vertex)] = utils.EMPTY - return True - - def _check_global_isomorphous(self, color, vertex): - ##backup - _board = copy.copy(self.board) - self.board[self._flatten(vertex)] = color - self._process_board(color, vertex) - if self.board in self.history: - res = True - else: - res = False - - self.board = _board - return res - - def _in_board(self, vertex): - x, y = vertex - if x < 1 or x > self.size: return False - if y < 1 or y > self.size: return False - return True - - def _neighbor(self, vertex): - x, y = vertex - nei = [] - for d in DELTA: - _x = x + d[0] - _y = y + d[1] - if self._in_board((_x, _y)): - nei.append((_x, _y)) - return nei - - def _corner(self, vertex): - x, y = vertex - corner = [] - for d in CORNER_OFFSET: - _x = x + d[0] - _y = y + d[1] - if self._in_board((_x, _y)): - corner.append((_x, _y)) - return corner - - def _process_board(self, color, vertex): - nei = self._neighbor(vertex) - for n in nei: - if self.board[self._flatten(n)] == utils.another_color(color): - can_kill, block = self._find_block(n, alive_break=True) - if can_kill: - for b in block: - self.board[self._flatten(b)] = utils.EMPTY - - def _find_group(self, start): - color = self.board[self._flatten(start)] - #print ("color : ", color) - chain = set() - frontier = [start] - while frontier: - current = frontier.pop() - #print ("current : ", current) - chain.add(current) - for n in self._neighbor(current): - #print n, self._flatten(n), self.board[self._flatten(n)], - if self.board[self._flatten(n)] == color and not n in chain: - frontier.append(n) - return chain - - def _is_eye(self, color, vertex): - nei = self._neighbor(vertex) - cor = self._corner(vertex) - ncolor = {color == self.board[self._flatten(n)] for n in nei} - if False in ncolor: - #print "not all neighbors are in same color with us" - return False - if set(nei) < self._find_group(nei[0]): - #print "all neighbors are in same group and same color with us" - return True - else: - opponent_number = [self.board[self._flatten(c)] for c in cor].count(-color) - opponent_propotion = float(opponent_number) / float(len(cor)) - if opponent_propotion < 0.5: - #print "few opponents, real eye" - return True - else: - #print "many opponents, fake eye" - return False - - # def is_valid(self, color, vertex): - def is_valid(self, state, action): - # state is the play board, the shape is [1, 9, 9, 17] - if action == self.size * self.size: - vertex = (0, 0) - else: - vertex = (action / self.size + 1, action % self.size + 1) - if state[0, 0, 0, -1] == utils.BLACK: - color = utils.BLACK - else: - color = utils.WHITE - self.history.clear() - for i in range(8): - self.history.append((state[:, :, :, i] - state[:, :, :, i + 8]).reshape(-1).tolist()) - self.board = copy.copy(self.history[-1]) - ### in board - if not self._in_board(vertex): - return False - - ### already have stone - if not self.board[self._flatten(vertex)] == utils.EMPTY: - # print(np.array(self.board).reshape(9, 9)) - # print(vertex) - return False - - ### check if it is qi - if not self._is_qi(color, vertex): - return False - - ### check if it is an eye of yourself - ### assumptions : notice that this judgement requires that the state is an endgame - #if self._is_eye(color, vertex): - # return False - - if self._check_global_isomorphous(color, vertex): - return False - - return True - - def do_move(self, color, vertex): - if vertex == utils.PASS: - return True - - id_ = self._flatten(vertex) - if self.board[id_] == utils.EMPTY: - self.board[id_] = color - self.history.append(copy.copy(self.board)) - return True - else: - return False - - def step_forward(self, state, action): - if state[0, 0, 0, -1] == 1: - color = 1 - else: - color = -1 - if action == 81: - vertex = (0, 0) - else: - vertex = (action % 9 + 1, action / 9 + 1) - # print(vertex) - # print(self.board) - self.board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist() - self.do_move(color, vertex) - new_state = np.concatenate( - [state[:, :, :, 1:8], (np.array(self.board) == 1).reshape(1, 9, 9, 1), - state[:, :, :, 9:16], (np.array(self.board) == -1).reshape(1, 9, 9, 1), - np.array(1 - state[:, :, :, -1]).reshape(1, 9, 9, 1)], - axis=3) - return new_state, 0 - - -pure_test = [ - 0, 1, 0, 1, 0, 1, 0, 0, 0, - 1, 0, 1, 0, 1, 0, 0, 0, 0, - 0, 1, 0, 1, 0, 0, 1, 0, 0, - 0, 0, 1, 0, 0, 1, 0, 1, 0, - 0, 0, 0, 0, 0, 1, 1, 1, 0, - 1, 1, 1, 0, 0, 0, 0, 0, 0, - 1, 0, 1, 0, 0, 1, 1, 0, 0, - 1, 1, 1, 0, 1, 0, 1, 0, 0, - 0, 0, 0, 0, 1, 1, 1, 0, 0 -] - -pt_qry = [(1, 1), (1, 5), (3, 3), (4, 7), (7, 2), (8, 6)] -pt_ans = [True, True, True, True, True, True] - -opponent_test = [ - 0, 1, 0, 1, 0, 1, 0,-1, 1, - 1,-1, 0,-1, 1,-1, 0, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 1, - 1, 1,-1, 0, 1,-1, 1, 0, 0, - 1, 0, 1, 0, 1, 0, 1, 0, 0, - -1, 1, 1, 0, 1, 1, 1, 0, 0, - 0, 1,-1, 0,-1,-1,-1, 0, 0, - 1, 0, 1, 0,-1, 0,-1, 0, 0, - 0, 1, 0, 0,-1,-1,-1, 0, 0 -] -ot_qry = [(1, 1), (1, 5), (2, 9), (5, 2), (5, 6), (8, 2), (8, 6)] -ot_ans = [False, False, False, False, False, True, False] - -#print (ge._find_group((6, 1))) -#print ge._is_eye(utils.BLACK, pt_qry[0]) -ge = GoEnv() -ge._set_board(pure_test) -for i in range(6): - print (ge._is_eye(utils.BLACK, pt_qry[i])) -ge._set_board(opponent_test) -for i in range(7): - print (ge._is_eye(utils.BLACK, ot_qry[i])) From 67ba76a04d42152c1c7ae6f3554b2e8683fca0d5 Mon Sep 17 00:00:00 2001 From: rtz19970824 <1289226405@qq.com> Date: Fri, 22 Dec 2017 17:16:44 +0800 Subject: [PATCH 36/36] implement a stochastic sample training method --- AlphaGo/game.py | 4 ++-- AlphaGo/model.py | 44 +++++++++++++++++++++++++------------------- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/AlphaGo/game.py b/AlphaGo/game.py index 8706572..df08c0a 100644 --- a/AlphaGo/game.py +++ b/AlphaGo/game.py @@ -31,7 +31,7 @@ class Game: self.latest_boards = deque(maxlen=8) for _ in range(8): self.latest_boards.append(self.board) - self.evaluator = model.ResNet(self.size, self.size**2 + 1, history_length=8) + self.evaluator = model.ResNet(self.size, self.size**2 + 1, history_length=8, checkpoint_path=checkpoint_path) # self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v], # feed_dict={self.net.x: state, self.net.is_training: False}) self.game_engine = go.Go(size=self.size, komi=self.komi) @@ -96,7 +96,7 @@ class Game: sys.stdout.flush() if __name__ == "__main__": - g = Game() + g = Game(checkpoint_path='./checkpoints/') g.show_board() g.think_play_move(1) #file = open("debug.txt", "a") diff --git a/AlphaGo/model.py b/AlphaGo/model.py index 764ba5f..22e8626 100644 --- a/AlphaGo/model.py +++ b/AlphaGo/model.py @@ -1,5 +1,6 @@ import os import time +import random import sys import cPickle from collections import deque @@ -104,7 +105,7 @@ class ResNet(object): self.window_length = 7000 self.save_freq = 5000 self.training_data = {'states': deque(maxlen=self.window_length), 'probs': deque(maxlen=self.window_length), - 'winner': deque(maxlen=self.window_length)} + 'winner': deque(maxlen=self.window_length), 'length': deque(maxlen=self.window_length)} def _build_network(self, residual_block_num, checkpoint_path): """ @@ -199,15 +200,15 @@ class ResNet(object): new_file_list = [] all_file_list = [] - training_data = {} + training_data = {'states': [], 'probs': [], 'winner': []} + iters = 0 while True: new_file_list = list(set(os.listdir(data_path)).difference(all_file_list)) - if new_file_list: + while new_file_list: all_file_list = os.listdir(data_path) - new_file_list.sort( - key=lambda file: os.path.getmtime(data_path + file) if not os.path.isdir(data_path + file) else 0) - if new_file_list: + new_file_list.sort( + key=lambda file: os.path.getmtime(data_path + file) if not os.path.isdir(data_path + file) else 0) for file in new_file_list: states, probs, winner = self._file_to_training_data(data_path + file) assert states.shape[0] == probs.shape[0] @@ -215,32 +216,36 @@ class ResNet(object): self.training_data['states'].append(states) self.training_data['probs'].append(probs) self.training_data['winner'].append(winner) - if len(self.training_data['states']) == self.window_length: - training_data['states'] = np.concatenate(self.training_data['states'], axis=0) - training_data['probs'] = np.concatenate(self.training_data['probs'], axis=0) - training_data['winner'] = np.concatenate(self.training_data['winner'], axis=0) + self.training_data['length'].append(states.shape[0]) + new_file_list = list(set(os.listdir(data_path)).difference(all_file_list)) if len(self.training_data['states']) != self.window_length: continue else: - data_num = training_data['states'].shape[0] - index = np.arange(data_num) - np.random.shuffle(index) start_time = time.time() + for i in range(batch_size): + game_num = random.randint(0, self.window_length-1) + state_num = random.randint(0, self.training_data['length'][game_num]-1) + training_data['states'].append(np.expand_dims(self.training_data['states'][game_num][state_num], 0)) + training_data['probs'].append(np.expand_dims(self.training_data['probs'][game_num][state_num], 0)) + training_data['winner'].append(np.expand_dims(self.training_data['winner'][game_num][state_num], 0)) value_loss, policy_loss, reg, _ = self.sess.run( [self.value_loss, self.policy_loss, self.reg, self.train_op], - feed_dict={self.x: training_data['states'][index[:batch_size]], - self.z: training_data['winner'][index[:batch_size]], - self.pi: training_data['probs'][index[:batch_size]], + feed_dict={self.x: np.concatenate(training_data['states'], axis=0), + self.z: np.concatenate(training_data['winner'], axis=0), + self.pi: np.concatenate(training_data['probs'], axis=0), self.is_training: True}) + print("Iteration: {}, Time: {}, Value Loss: {}, Policy Loss: {}, Reg: {}".format(iters, time.time() - start_time, value_loss, policy_loss, reg)) - iters += 1 if iters % self.save_freq == 0: save_path = "Iteration{}.ckpt".format(iters) self.saver.save(self.sess, self.checkpoint_path + save_path) + for key in training_data.keys(): + training_data[key] = [] + iters += 1 def _file_to_training_data(self, file_name): read = False @@ -250,6 +255,7 @@ class ResNet(object): file.seek(0) data = cPickle.load(file) read = True + print("{} Loaded!".format(file_name)) except Exception as e: print(e) time.sleep(1) @@ -275,6 +281,6 @@ class ResNet(object): return states, probs, winner -if __name__=="__main__": - model = ResNet(board_size=9, action_num=82) +if __name__ == "__main__": + model = ResNet(board_size=9, action_num=82, history_length=8) model.train("file", data_path="./data/", batch_size=128, checkpoint_path="./checkpoint/")