From 906ced84a3d588143e5b5d1f8177ed5f123e4dbe Mon Sep 17 00:00:00 2001
From: rtz19970824
Date: Fri, 8 Dec 2017 17:05:33 +0800
Subject: [PATCH] self play

---
 AlphaGo/game.py          |   7 ++-
 AlphaGo/network_small.py | 103 +++++++++++++++++------------------
 AlphaGo/self-play.py     | 114 +++++++++++++++++++++++++++++----------
 AlphaGo/strategy.py      |  45 ++++++++--------
 AlphaGo/test.py          |  14 +++++
 bin/activate             |   1 +
 bin/conda                |   1 +
 bin/deactivate           |   1 +
 8 files changed, 182 insertions(+), 104 deletions(-)
 create mode 100644 AlphaGo/test.py
 create mode 120000 bin/activate
 create mode 120000 bin/conda
 create mode 120000 bin/deactivate

diff --git a/AlphaGo/game.py b/AlphaGo/game.py
index 4db1007..941401e 100644
--- a/AlphaGo/game.py
+++ b/AlphaGo/game.py
@@ -190,7 +190,7 @@ class Game:
         self.executor = Executor(game=self)
         self.history = []
         self.past = deque(maxlen=8)
-        for i in range(8):
+        for _ in range(8):
             self.past.append(self.board)
 
     def _flatten(self, vertex):
@@ -205,6 +205,9 @@ class Game:
 
     def clear(self):
         self.board = [utils.EMPTY] * (self.size * self.size)
+        self.history = []
+        for _ in range(8):
+            self.past.append(self.board)
 
     def set_size(self, n):
         self.size = n
@@ -225,7 +228,7 @@ class Game:
     def gen_move(self, color):
         # move = self.strategy.gen_move(color)
         # return move
-        move = self.strategy.gen_move(self.past, color)
+        move, self.prob = self.strategy.gen_move(self.past, color)
         self.do_move(color, move)
         return move
 
diff --git a/AlphaGo/network_small.py b/AlphaGo/network_small.py
index 8dd5140..096aea6 100644
--- a/AlphaGo/network_small.py
+++ b/AlphaGo/network_small.py
@@ -59,11 +59,12 @@ class Network(object):
         self.build_network()
 
     def build_network(self):
-        h = layers.conv2d(self.x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, normalizer_fn=layers.batch_norm,
+        h = layers.conv2d(self.x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu,
+                          normalizer_fn=layers.batch_norm,
                           normalizer_params={'is_training': self.is_training,
                                              'updates_collections': tf.GraphKeys.UPDATE_OPS},
                           weights_regularizer=layers.l2_regularizer(1e-4))
-        for i in range(19):
+        for i in range(4):
            h = residual_block(h, self.is_training)
         self.v = value_heads(h, self.is_training)
         self.p = policy_heads(h, self.is_training)
@@ -115,9 +116,9 @@ class Network(object):
                              feed_dict={self.x: boards[
                                  index[iter * batch_size:(iter + 1) * batch_size]],
                                  self.z: wins[index[
-                                     iter * batch_size:(iter + 1) * batch_size]],
+                                          iter * batch_size:(iter + 1) * batch_size]],
                                  self.pi: ps[index[
-                                     iter * batch_size:(iter + 1) * batch_size]],
+                                          iter * batch_size:(iter + 1) * batch_size]],
                                  self.is_training: True})
             value_losses.append(lv)
             policy_losses.append(lp)
@@ -137,53 +138,53 @@ class Network(object):
 
         del data, boards, wins, ps
 
-    # def forward(call_number):
-    #     # checkpoint_path = "/home/yama/rl/tianshou/AlphaGo/checkpoints"
-    #     checkpoint_path = "/home/jialian/stuGo/tianshou/stuGo/checkpoints/"
-    #     board_file = np.genfromtxt("/home/jialian/stuGo/tianshou/leela-zero/src/mcts_nn_files/board_" + call_number,
-    #                                dtype='str');
-    #     human_board = np.zeros((17, 19, 19))
-    #
-    #     # TODO : is it ok to ignore the last channel?
-    #     for i in range(17):
-    #         human_board[i] = np.array(list(board_file[i])).reshape(19, 19)
-    #     # print("============================")
-    #     # print("human board sum : " + str(np.sum(human_board[-1])))
-    #     # print("============================")
-    #     # print(human_board)
-    #     # print("============================")
-    #     # rint(human_board)
-    #     feed_board = human_board.transpose(1, 2, 0).reshape(1, 19, 19, 17)
-    #     # print(feed_board[:,:,:,-1])
-    #     # print(feed_board.shape)
-    #
-    #     # npz_board = np.load("/home/yama/rl/tianshou/AlphaGo/data/7f83928932f64a79bc1efdea268698ae.npz")
-    #     # print(npz_board["boards"].shape)
-    #     # feed_board = npz_board["boards"][10].reshape(-1, 19, 19, 17)
-    #     ##print(feed_board)
-    #     # show_board = feed_board[0].transpose(2, 0, 1)
-    #     # print("board shape : ", show_board.shape)
-    #     # print(show_board)
-    #
-    #     itflag = False
-    #     with multi_gpu.create_session() as sess:
-    #         sess.run(tf.global_variables_initializer())
-    #         ckpt_file = tf.train.latest_checkpoint(checkpoint_path)
-    #         if ckpt_file is not None:
-    #             # print('Restoring model from {}...'.format(ckpt_file))
-    #             saver.restore(sess, ckpt_file)
-    #         else:
-    #             raise ValueError("No model loaded")
-    #         res = sess.run([tf.nn.softmax(p), v], feed_dict={x: feed_board, is_training: itflag})
-    #         # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][300].reshape(-1, 19, 19, 17), is_training:False})
-    #         # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][50].reshape(-1, 19, 19, 17), is_training:True})
-    #         # print(np.argmax(res[0]))
-    #         np.savetxt(sys.stdout, res[0][0], fmt="%.6f", newline=" ")
-    #         np.savetxt(sys.stdout, res[1][0], fmt="%.6f", newline=" ")
-    #         pv_file = "/home/jialian/stuGotianshou/leela-zero/src/mcts_nn_files/policy_value"
-    #         np.savetxt(pv_file, np.concatenate((res[0][0], res[1][0])), fmt="%.6f", newline=" ")
-    #         # np.savetxt(pv_file, res[1][0], fmt="%.6f", newline=" ")
-    #         return res
+    # def forward(call_number):
+    #     # checkpoint_path = "/home/yama/rl/tianshou/AlphaGo/checkpoints"
+    #     checkpoint_path = "/home/jialian/stuGo/tianshou/stuGo/checkpoints/"
+    #     board_file = np.genfromtxt("/home/jialian/stuGo/tianshou/leela-zero/src/mcts_nn_files/board_" + call_number,
+    #                                dtype='str');
+    #     human_board = np.zeros((17, 19, 19))
+    #
+    #     # TODO : is it ok to ignore the last channel?
+    #     for i in range(17):
+    #         human_board[i] = np.array(list(board_file[i])).reshape(19, 19)
+    #     # print("============================")
+    #     # print("human board sum : " + str(np.sum(human_board[-1])))
+    #     # print("============================")
+    #     # print(human_board)
+    #     # print("============================")
+    #     # rint(human_board)
+    #     feed_board = human_board.transpose(1, 2, 0).reshape(1, 19, 19, 17)
+    #     # print(feed_board[:,:,:,-1])
+    #     # print(feed_board.shape)
+    #
+    #     # npz_board = np.load("/home/yama/rl/tianshou/AlphaGo/data/7f83928932f64a79bc1efdea268698ae.npz")
+    #     # print(npz_board["boards"].shape)
+    #     # feed_board = npz_board["boards"][10].reshape(-1, 19, 19, 17)
+    #     ##print(feed_board)
+    #     # show_board = feed_board[0].transpose(2, 0, 1)
+    #     # print("board shape : ", show_board.shape)
+    #     # print(show_board)
+    #
+    #     itflag = False
+    #     with multi_gpu.create_session() as sess:
+    #         sess.run(tf.global_variables_initializer())
+    #         ckpt_file = tf.train.latest_checkpoint(checkpoint_path)
+    #         if ckpt_file is not None:
+    #             # print('Restoring model from {}...'.format(ckpt_file))
+    #             saver.restore(sess, ckpt_file)
+    #         else:
+    #             raise ValueError("No model loaded")
+    #         res = sess.run([tf.nn.softmax(p), v], feed_dict={x: feed_board, is_training: itflag})
+    #         # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][300].reshape(-1, 19, 19, 17), is_training:False})
+    #         # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][50].reshape(-1, 19, 19, 17), is_training:True})
+    #         # print(np.argmax(res[0]))
+    #         np.savetxt(sys.stdout, res[0][0], fmt="%.6f", newline=" ")
+    #         np.savetxt(sys.stdout, res[1][0], fmt="%.6f", newline=" ")
+    #         pv_file = "/home/jialian/stuGotianshou/leela-zero/src/mcts_nn_files/policy_value"
+    #         np.savetxt(pv_file, np.concatenate((res[0][0], res[1][0])), fmt="%.6f", newline=" ")
+    #         # np.savetxt(pv_file, res[1][0], fmt="%.6f", newline=" ")
+    #         return res
 
     def forward(self):
         # checkpoint_path = "/home/tongzheng/tianshou/AlphaGo/checkpoints/"
diff --git a/AlphaGo/self-play.py b/AlphaGo/self-play.py
index 2336e72..20e51c8 100644
--- a/AlphaGo/self-play.py
+++ b/AlphaGo/self-play.py
@@ -1,40 +1,98 @@
 from game import Game
 from engine import GTPEngine
 import re
+import numpy as np
+from collections import deque
+import utils
+import argparse
 
-g = Game()
+parser = argparse.ArgumentParser()
+parser.add_argument('--result_path', type=str, default='./part1')
+args = parser.parse_args()
+
+game = Game()
+engine = GTPEngine(game_obj=game)
+history = deque(maxlen=8)
+for i in range(8):
+    history.append(game.board)
+state = []
+prob = []
+winner = []
 pattern = "[A-Z]{1}[0-9]{1}"
+game.show_board()
+
+
+def history2state(history, color):
+    state = np.zeros([1, game.size, game.size, 17])
+    for i in range(8):
+        state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(game.size ** 2)).reshape(game.size, game.size)
+        state[0, :, :, i + 8] = np.array(np.array(history[i]) == -np.ones(game.size ** 2)).reshape(game.size, game.size)
+    if color == utils.BLACK:
+        state[0, :, :, 16] = np.ones([game.size, game.size])
+    if color == utils.WHITE:
+        state[0, :, :, 16] = np.zeros([game.size, game.size])
+    return state
 
-g.show_board()
-e = GTPEngine(game_obj=g)
 num = 0
+game_num = 0
 black_pass = False
 white_pass = False
-while not (black_pass and white_pass):
-    if num % 2 == 0:
-        res = e.run_cmd(str(num) + " genmove BLACK")
-        num += 1
-        # print(res)
-        match = re.search(pattern, res)
-        if match is not None:
-            print(match.group())
+while True:
+    while not (black_pass and white_pass) and num < game.size ** 2 * 2:
+        if num % 2 == 0:
+            color = utils.BLACK
+            new_state = history2state(history, color)
+            state.append(new_state)
+            result = engine.run_cmd(str(num) + " genmove BLACK")
+            num += 1
+            match = re.search(pattern, result)
+            if match is not None:
+                print(match.group())
+            else:
+                print("pass")
+            if re.search("pass", result) is not None:
+                black_pass = True
+            else:
+                black_pass = False
         else:
-            print("pass")
-        if re.search("pass", res) is not None:
-            black_pass = True
-        else:
-            black_pass = False
+            color = utils.WHITE
+            new_state = history2state(history, color)
+            state.append(new_state)
+            result = engine.run_cmd(str(num) + " genmove WHITE")
+            num += 1
+            match = re.search(pattern, result)
+            if match is not None:
+                print(match.group())
+            else:
+                print("pass")
+            if re.search("pass", result) is not None:
+                white_pass = True
+            else:
+                white_pass = False
+        game.show_board()
+        prob.append(np.array(game.prob).reshape(-1, game.size ** 2 + 1))
+    print("Finished")
+    score = game.executor.get_score()
+    if score > 0:
+        winner = utils.BLACK
     else:
-        res = e.run_cmd(str(num) + " genmove WHITE")
-        num += 1
-        match = re.search(pattern, res)
-        if match is not None:
-            print(match.group())
-        else:
-            print("pass")
-        if re.search("pass", res) is not None:
-            white_pass = True
-        else:
-            white_pass = False
-    g.show_board()
+        winner = utils.WHITE
+    state = np.concatenate(state, axis=0)
+    prob = np.concatenate(prob, axis=0)
+    winner = np.ones([num, 1]) * winner
+    assert state.shape[0] == prob.shape[0]
+    assert state.shape[0] == winner.shape[0]
+    np.savez(args.result_path + "/game" + str(game_num), state=state, prob=prob, winner=winner)
+    state = []
+    prob = []
+    winner = []
+    num = 0
+    black_pass = False
+    white_pass = False
+    engine.run_cmd(str(num) + " clear_board")
+    history.clear()
+    for _ in range(8):
+        history.append(game.board)
+    game.show_board()
+    game_num += 1
diff --git a/AlphaGo/strategy.py b/AlphaGo/strategy.py
index dc328d5..590edb3 100644
--- a/AlphaGo/strategy.py
+++ b/AlphaGo/strategy.py
@@ -198,28 +198,27 @@ class GoEnv:
         id_ = self._flatten(vertex)
         if self.board[id_] == utils.EMPTY:
             self.board[id_] = color
-            self.history.append(copy.copy(self.board))
             return True
         else:
             return False
 
     def step_forward(self, state, action):
         if state[0, 0, 0, -1] == 1:
-            color = 1
+            color = utils.BLACK
         else:
-            color = -1
-        if action == 81:
-            vertex = (0, 0)
+            color = utils.WHITE
+        if action == self.size ** 2:
+            vertex = utils.PASS
         else:
-            vertex = (action % 9 + 1, action / 9 + 1)
+            vertex = (action % self.size + 1, action / self.size + 1)
         # print(vertex)
         # print(self.board)
         self.board = (state[:, :, :, 7] - state[:, :, :, 15]).reshape(-1).tolist()
         self.do_move(color, vertex)
         new_state = np.concatenate(
-            [state[:, :, :, 1:8], (np.array(self.board) == 1).reshape(1, 9, 9, 1),
-             state[:, :, :, 9:16], (np.array(self.board) == -1).reshape(1, 9, 9, 1),
-             np.array(1 - state[:, :, :, -1]).reshape(1, 9, 9, 1)],
+            [state[:, :, :, 1:8], (np.array(self.board) == utils.BLACK).reshape(1, self.size, self.size, 1),
+             state[:, :, :, 9:16], (np.array(self.board) == utils.WHITE).reshape(1, self.size, self.size, 1),
+             np.array(1 - state[:, :, :, -1]).reshape(1, self.size, self.size, 1)],
             axis=3)
         return new_state, 0
 
@@ -233,26 +232,26 @@ class strategy(object):
                             feed_dict={self.net.x: state, self.net.is_training: False})
 
     def data_process(self, history, color):
-        state = np.zeros([1, 9, 9, 17])
+        state = np.zeros([1, self.simulator.size, self.simulator.size, 17])
         for i in range(8):
-            state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(81)).reshape(9, 9)
-            state[0, :, :, i + 8] = np.array(np.array(history[i]) == -np.ones(81)).reshape(9, 9)
-        if color == 1:
-            state[0, :, :, 16] = np.ones([9, 9])
-        if color == -1:
-            state[0, :, :, 16] = np.zeros([9, 9])
+            state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(self.simulator.size ** 2)).reshape(self.simulator.size, self.simulator.size)
+            state[0, :, :, i + 8] = np.array(np.array(history[i]) == -np.ones(self.simulator.size ** 2)).reshape(self.simulator.size, self.simulator.size)
+        if color == utils.BLACK:
+            state[0, :, :, 16] = np.ones([self.simulator.size, self.simulator.size])
+        if color == utils.WHITE:
+            state[0, :, :, 16] = np.zeros([self.simulator.size, self.simulator.size])
         return state
 
     def gen_move(self, history, color):
         self.simulator.history = copy.copy(history)
         self.simulator.board = copy.copy(history[-1])
         state = self.data_process(self.simulator.history, color)
-        mcts = MCTS(self.simulator, self.evaluator, state, 82, inverse=True, max_step=10)
+        mcts = MCTS(self.simulator, self.evaluator, state, self.simulator.size ** 2 + 1, inverse=True, max_step=100)
         temp = 1
-        p = mcts.root.N ** temp / np.sum(mcts.root.N ** temp)
-        choice = np.random.choice(82, 1, p=p).tolist()[0]
-        if choice == 81:
-            move = (0, 0)
+        prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp)
+        choice = np.random.choice(self.simulator.size ** 2 + 1, 1, p=prob).tolist()[0]
+        if choice == self.simulator.size ** 2:
+            move = utils.PASS
         else:
-            move = (choice % 9 + 1, choice / 9 + 1)
-        return move
+            move = (choice % self.simulator.size + 1, choice / self.simulator.size + 1)
+        return move, prob
diff --git a/AlphaGo/test.py b/AlphaGo/test.py
new file mode 100644
index 0000000..d9a0915
--- /dev/null
+++ b/AlphaGo/test.py
@@ -0,0 +1,14 @@
+import sys
+from game import Game
+from engine import GTPEngine
+# import utils
+
+game = Game()
+engine = GTPEngine(game_obj=game, name='tianshou')
+cmd = raw_input
+
+while not engine.disconnect:
+    command = cmd()
+    result = engine.run_cmd(command)
+    sys.stdout.write(result)
+    sys.stdout.flush()
diff --git a/bin/activate b/bin/activate
new file mode 120000
index 0000000..1515f22
--- /dev/null
+++ b/bin/activate
@@ -0,0 +1 @@
+/home/tongzheng/anaconda2/bin/activate
\ No newline at end of file
diff --git a/bin/conda b/bin/conda
new file mode 120000
index 0000000..79ea700
--- /dev/null
+++ b/bin/conda
@@ -0,0 +1 @@
+/home/tongzheng/anaconda2/bin/conda
\ No newline at end of file
diff --git a/bin/deactivate b/bin/deactivate
new file mode 120000
index 0000000..02b42f9
--- /dev/null
+++ b/bin/deactivate
@@ -0,0 +1 @@
+/home/tongzheng/anaconda2/bin/deactivate
\ No newline at end of file
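The patch leaves the consumer side of the self-play output implicit, so the following is a minimal, illustrative sketch (not part of the patch) of how the per-game archives written by self-play.py could be read back for training. It assumes the default --result_path of './part1' and the state/prob/winner keys passed to np.savez above; the file glob and the concatenation are only a suggested convention.

# Sketch only: load the archives assumed to be written as part1/game<N>.npz.
import glob

import numpy as np

states, probs, winners = [], [], []
for path in sorted(glob.glob("./part1/game*.npz")):
    data = np.load(path)
    states.append(data["state"])    # [moves, size, size, 17] input planes
    probs.append(data["prob"])      # [moves, size ** 2 + 1] MCTS visit distribution
    winners.append(data["winner"])  # [moves, 1] winning colour repeated for each move

if states:
    states = np.concatenate(states, axis=0)
    probs = np.concatenate(probs, axis=0)
    winners = np.concatenate(winners, axis=0)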