self play

rtz19970824 2017-12-08 17:05:33 +08:00
parent b687241a7d
commit 906ced84a3
8 changed files with 182 additions and 104 deletions

View File

@@ -190,7 +190,7 @@ class Game:
         self.executor = Executor(game=self)
         self.history = []
         self.past = deque(maxlen=8)
-        for i in range(8):
+        for _ in range(8):
             self.past.append(self.board)
 
     def _flatten(self, vertex):
@@ -205,6 +205,9 @@ class Game:
 
     def clear(self):
         self.board = [utils.EMPTY] * (self.size * self.size)
+        self.history = []
+        for _ in range(8):
+            self.past.append(self.board)
 
     def set_size(self, n):
         self.size = n
@@ -225,7 +228,7 @@ class Game:
 
     def gen_move(self, color):
         # move = self.strategy.gen_move(color)
         # return move
-        move = self.strategy.gen_move(self.past, color)
+        move, self.prob = self.strategy.gen_move(self.past, color)
        self.do_move(color, move)
         return move
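
With this change the strategy returns both the chosen move and the MCTS visit distribution, and Game keeps the latter on self.prob. A minimal usage sketch, not part of the commit, mirroring how the new self-play script below consumes it (it assumes Game() wires up its strategy exactly as that script relies on):

    import numpy as np
    import utils
    from game import Game

    game = Game()
    move = game.gen_move(utils.BLACK)                          # plays the move, returns a vertex or pass
    pi = np.array(game.prob).reshape(-1, game.size ** 2 + 1)   # visit distribution over all points plus pass
    print(move, pi.shape)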

View File

@@ -59,11 +59,12 @@ class Network(object):
         self.build_network()
 
     def build_network(self):
-        h = layers.conv2d(self.x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, normalizer_fn=layers.batch_norm,
+        h = layers.conv2d(self.x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu,
+                          normalizer_fn=layers.batch_norm,
                           normalizer_params={'is_training': self.is_training,
                                              'updates_collections': tf.GraphKeys.UPDATE_OPS},
                           weights_regularizer=layers.l2_regularizer(1e-4))
-        for i in range(19):
+        for i in range(4):
             h = residual_block(h, self.is_training)
         self.v = value_heads(h, self.is_training)
         self.p = policy_heads(h, self.is_training)
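
residual_block (like value_heads and policy_heads) is defined elsewhere in this file and is not part of the diff. For orientation only, here is a rough sketch of an AlphaGo Zero-style block built with the same contrib-layers calls; the repo's actual definition may differ:

    import tensorflow as tf
    from tensorflow.contrib import layers

    # Sketch, not the repo's code: two 3x3 conv + batch-norm layers with an
    # identity skip connection, 256 channels, matching the trunk conv above.
    def residual_block(h, is_training):
        residual = h
        h = layers.conv2d(h, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu,
                          normalizer_fn=layers.batch_norm,
                          normalizer_params={'is_training': is_training,
                                             'updates_collections': tf.GraphKeys.UPDATE_OPS},
                          weights_regularizer=layers.l2_regularizer(1e-4))
        h = layers.conv2d(h, 256, kernel_size=3, stride=1, activation_fn=None,
                          normalizer_fn=layers.batch_norm,
                          normalizer_params={'is_training': is_training,
                                             'updates_collections': tf.GraphKeys.UPDATE_OPS},
                          weights_regularizer=layers.l2_regularizer(1e-4))
        return tf.nn.relu(h + residual)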

View File

@@ -1,40 +1,98 @@
 from game import Game
 from engine import GTPEngine
 import re
+import numpy as np
+from collections import deque
+import utils
+import argparse
 
-g = Game()
+parser = argparse.ArgumentParser()
+parser.add_argument('--result_path', type=str, default='./part1')
+args = parser.parse_args()
+
+game = Game()
+engine = GTPEngine(game_obj=game)
+history = deque(maxlen=8)
+for i in range(8):
+    history.append(game.board)
+
+state = []
+prob = []
+winner = []
 pattern = "[A-Z]{1}[0-9]{1}"
-g.show_board()
-e = GTPEngine(game_obj=g)
+game.show_board()
+
+
+def history2state(history, color):
+    state = np.zeros([1, game.size, game.size, 17])
+    for i in range(8):
+        state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(game.size ** 2)).reshape(game.size, game.size)
+        state[0, :, :, i + 8] = np.array(np.array(history[i]) == -np.ones(game.size ** 2)).reshape(game.size, game.size)
+    if color == utils.BLACK:
+        state[0, :, :, 16] = np.ones([game.size, game.size])
+    if color == utils.WHITE:
+        state[0, :, :, 16] = np.zeros([game.size, game.size])
+    return state
+
+
 num = 0
+game_num = 0
 black_pass = False
 white_pass = False
-while not (black_pass and white_pass):
-    if num % 2 == 0:
-        res = e.run_cmd(str(num) + " genmove BLACK")
-        num += 1
-        # print(res)
-        match = re.search(pattern, res)
-        if match is not None:
-            print(match.group())
-        else:
-            print("pass")
-        if re.search("pass", res) is not None:
-            black_pass = True
-        else:
-            black_pass = False
-    else:
-        res = e.run_cmd(str(num) + " genmove WHITE")
-        num += 1
-        match = re.search(pattern, res)
-        if match is not None:
-            print(match.group())
-        else:
-            print("pass")
-        if re.search("pass", res) is not None:
-            white_pass = True
-        else:
-            white_pass = False
-g.show_board()
+while True:
+    while not (black_pass and white_pass) and num < game.size ** 2 * 2:
+        if num % 2 == 0:
+            color = utils.BLACK
+            new_state = history2state(history, color)
+            state.append(new_state)
+            result = engine.run_cmd(str(num) + " genmove BLACK")
+            num += 1
+            match = re.search(pattern, result)
+            if match is not None:
+                print(match.group())
+            else:
+                print("pass")
+            if re.search("pass", result) is not None:
+                black_pass = True
+            else:
+                black_pass = False
+        else:
+            color = utils.WHITE
+            new_state = history2state(history, color)
+            state.append(new_state)
+            result = engine.run_cmd(str(num) + " genmove WHITE")
+            num += 1
+            match = re.search(pattern, result)
+            if match is not None:
+                print(match.group())
+            else:
+                print("pass")
+            if re.search("pass", result) is not None:
+                white_pass = True
+            else:
+                white_pass = False
+        game.show_board()
+        prob.append(np.array(game.prob).reshape(-1, game.size ** 2 + 1))
+
+    print("Finished")
+    score = game.executor.get_score()
+    if score > 0:
+        winner = utils.BLACK
+    else:
+        winner = utils.WHITE
+    state = np.concatenate(state, axis=0)
+    prob = np.concatenate(prob, axis=0)
+    winner = np.ones([num, 1]) * winner
+    assert state.shape[0] == prob.shape[0]
+    assert state.shape[0] == winner.shape[0]
+    np.savez(args.result_path + "/game" + game_num, state=state, prob=prob, winner=winner)
+    state = []
+    prob = []
+    winner = []
+    num = 0
+    black_pass = False
+    white_pass = False
+    engine.run_cmd(str(num) + " clear_board")
+    history.clear()
+    for _ in range(8):
+        history.append(game.board)
+    game.show_board()
+    game_num += 1
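
Each saved file bundles one game's training triples. A minimal loading sketch, not part of the commit; it assumes the file name was meant to be built with str(game_num) (the script concatenates the integer directly, which would raise a TypeError) and that np.savez appends the .npz suffix:

    import numpy as np

    data = np.load('./part1/game0.npz')   # './part1' is the default --result_path
    states = data['state']    # [N, size, size, 17]: 8 black planes, 8 white planes, colour-to-move plane
    probs = data['prob']      # [N, size * size + 1]: MCTS visit distribution including pass
    winners = data['winner']  # [N, 1]: the numeric value of utils.BLACK or utils.WHITE
    print(states.shape, probs.shape, winners.shape)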

View File

@@ -198,28 +198,27 @@ class GoEnv:
         id_ = self._flatten(vertex)
         if self.board[id_] == utils.EMPTY:
             self.board[id_] = color
-            self.history.append(copy.copy(self.board))
             return True
         else:
             return False
 
     def step_forward(self, state, action):
         if state[0, 0, 0, -1] == 1:
-            color = 1
+            color = utils.BLACK
         else:
-            color = -1
-        if action == 81:
-            vertex = (0, 0)
+            color = utils.WHITE
+        if action == self.size ** 2:
+            vertex = utils.PASS
         else:
-            vertex = (action % 9 + 1, action / 9 + 1)
+            vertex = (action % self.size + 1, action / self.size + 1)
         # print(vertex)
         # print(self.board)
         self.board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist()
         self.do_move(color, vertex)
         new_state = np.concatenate(
-            [state[:, :, :, 1:8], (np.array(self.board) == 1).reshape(1, 9, 9, 1),
-             state[:, :, :, 9:16], (np.array(self.board) == -1).reshape(1, 9, 9, 1),
-             np.array(1 - state[:, :, :, -1]).reshape(1, 9, 9, 1)],
+            [state[:, :, :, 1:8], (np.array(self.board) == utils.BLACK).reshape(1, self.size, self.size, 1),
+             state[:, :, :, 9:16], (np.array(self.board) == utils.WHITE).reshape(1, self.size, self.size, 1),
+             np.array(1 - state[:, :, :, -1]).reshape(1, self.size, self.size, 1)],
            axis=3)
         return new_state, 0
@@ -233,26 +232,26 @@ class strategy(object):
                                      feed_dict={self.net.x: state, self.net.is_training: False})
 
     def data_process(self, history, color):
-        state = np.zeros([1, 9, 9, 17])
+        state = np.zeros([1, self.simulator.size, self.simulator.size, 17])
         for i in range(8):
-            state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(81)).reshape(9, 9)
-            state[0, :, :, i + 8] = np.array(np.array(history[i]) == -np.ones(81)).reshape(9, 9)
-        if color == 1:
-            state[0, :, :, 16] = np.ones([9, 9])
-        if color == -1:
-            state[0, :, :, 16] = np.zeros([9, 9])
+            state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(self.simulator.size ** 2)).reshape(self.simulator.size, self.simulator.size)
+            state[0, :, :, i + 8] = np.array(np.array(history[i]) == -np.ones(self.simulator.size ** 2)).reshape(self.simulator.size, self.simulator.size)
+        if color == utils.BLACK:
+            state[0, :, :, 16] = np.ones([self.simulator.size, self.simulator.size])
+        if color == utils.WHITE:
+            state[0, :, :, 16] = np.zeros([self.simulator.size, self.simulator.size])
         return state
 
     def gen_move(self, history, color):
         self.simulator.history = copy.copy(history)
         self.simulator.board = copy.copy(history[-1])
         state = self.data_process(self.simulator.history, color)
-        mcts = MCTS(self.simulator, self.evaluator, state, 82, inverse=True, max_step=10)
+        mcts = MCTS(self.simulator, self.evaluator, state, self.simulator.size ** 2 + 1, inverse=True, max_step=100)
         temp = 1
-        p = mcts.root.N ** temp / np.sum(mcts.root.N ** temp)
-        choice = np.random.choice(82, 1, p=p).tolist()[0]
-        if choice == 81:
-            move = (0, 0)
+        prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp)
+        choice = np.random.choice(self.simulator.size ** 2 + 1, 1, p=prob).tolist()[0]
+        if choice == self.simulator.size ** 2:
+            move = utils.PASS
         else:
-            move = (choice % 9 + 1, choice / 9 + 1)
-        return move
+            move = (choice % self.simulator.size + 1, choice / self.simulator.size + 1)
+        return move, prob
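
Both step_forward and gen_move share the same flat-action convention: indices 0 .. size*size-1 are board points, index size*size is pass. A standalone illustration, not repo code (action_to_vertex is a made-up name), of that mapping on the 9x9 board the old hard-coded constants assumed; note the repo's `/` relies on Python 2 integer division, written as `//` below:

    size = 9

    def action_to_vertex(action):
        if action == size ** 2:
            return (0, 0)                      # pass; the pre-change code wrote utils.PASS as (0, 0)
        return (action % size + 1, action // size + 1)

    assert action_to_vertex(0) == (1, 1)       # first board point
    assert action_to_vertex(10) == (2, 2)
    assert action_to_vertex(80) == (9, 9)      # last board point
    assert action_to_vertex(81) == (0, 0)      # pass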

14  AlphaGo/test.py  Normal file
View File

@@ -0,0 +1,14 @@
+import sys
+from game import Game
+from engine import GTPEngine
+# import utils
+
+game = Game()
+engine = GTPEngine(game_obj=game, name='tianshou')
+cmd = raw_input
+
+while not engine.disconnect:
+    command = cmd()
+    result = engine.run_cmd(command)
+    sys.stdout.write(result)
+    sys.stdout.flush()
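
test.py just pipes stdin commands to GTPEngine.run_cmd under a Python 2 interpreter (it uses raw_input). A hedged sketch of driving the engine programmatically with the same command strings the self-play script sends; the exact reply format depends on engine.py, which is not in this diff:

    from game import Game
    from engine import GTPEngine

    game = Game()
    engine = GTPEngine(game_obj=game, name='tianshou')
    print(engine.run_cmd("0 genmove BLACK"))   # reply contains a coordinate such as C3, or "pass"
    print(engine.run_cmd("1 genmove WHITE"))
    print(engine.run_cmd("2 clear_board"))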

1  bin/activate  Symbolic link
View File

@@ -0,0 +1 @@
+/home/tongzheng/anaconda2/bin/activate

1  bin/conda  Symbolic link
View File

@@ -0,0 +1 @@
+/home/tongzheng/anaconda2/bin/conda

1  bin/deactivate  Symbolic link
View File

@@ -0,0 +1 @@
+/home/tongzheng/anaconda2/bin/deactivate