AlphaGo update

rtz19970824 2017-11-26 13:36:52 +08:00
parent e727ce4d9b
commit ca0021083f
21 changed files with 625 additions and 422 deletions


@@ -9,43 +9,44 @@ import tensorflow.contrib.layers as layers
import multi_gpu
import time
#os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
def residual_block(input, is_training):
normalizer_params = {'is_training': is_training,
'updates_collections': tf.GraphKeys.UPDATE_OPS}
h = layers.conv2d(input, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu,
normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params,
weights_regularizer=layers.l2_regularizer(1e-4))
h = layers.conv2d(h, 256, kernel_size=3, stride=1, activation_fn=tf.identity,
normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params,
weights_regularizer=layers.l2_regularizer(1e-4))
h = h + input
return tf.nn.relu(h)
def policy_heads(input, is_training):
normalizer_params = {'is_training': is_training,
'updates_collections': tf.GraphKeys.UPDATE_OPS}
h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu,
normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params,
weights_regularizer=layers.l2_regularizer(1e-4))
h = layers.flatten(h)
h = layers.fully_connected(h, 362, activation_fn=tf.identity, weights_regularizer=layers.l2_regularizer(1e-4))
return h
def value_heads(input, is_training):
normalizer_params = {'is_training': is_training,
'updates_collections': tf.GraphKeys.UPDATE_OPS}
h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu,
normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params,
weights_regularizer=layers.l2_regularizer(1e-4))
h = layers.flatten(h)
h = layers.fully_connected(h, 256, activation_fn=tf.nn.relu, weights_regularizer=layers.l2_regularizer(1e-4))
h = layers.fully_connected(h, 1, activation_fn=tf.nn.tanh, weights_regularizer=layers.l2_regularizer(1e-4))
return h
x = tf.placeholder(tf.float32, shape=[None, 19, 19, 17])
@@ -54,10 +55,10 @@ z = tf.placeholder(tf.float32, shape=[None, 1])
pi = tf.placeholder(tf.float32, shape=[None, 362])
h = layers.conv2d(x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, normalizer_fn=layers.batch_norm,
normalizer_params={'is_training': is_training, 'updates_collections': tf.GraphKeys.UPDATE_OPS},
weights_regularizer=layers.l2_regularizer(1e-4))
for i in range(19):
h = residual_block(h, is_training)
v = value_heads(h, is_training)
p = policy_heads(h, is_training)
# loss = tf.reduce_mean(tf.square(z-v)) - tf.multiply(pi, tf.log(tf.clip_by_value(tf.nn.softmax(p), 1e-8, tf.reduce_max(tf.nn.softmax(p)))))
@@ -69,114 +70,130 @@ total_loss = value_loss + policy_loss + reg
# train_op = tf.train.MomentumOptimizer(1e-4, momentum=0.9, use_nesterov=True).minimize(total_loss)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
train_op = tf.train.RMSPropOptimizer(1e-4).minimize(total_loss)
var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
saver = tf.train.Saver(max_to_keep=10, var_list=var_list)
def train():
data_path = "/home/tongzheng/data/"
data_name = os.listdir("/home/tongzheng/data/")
epochs = 100
batch_size = 128
result_path = "./checkpoints/"
with multi_gpu.create_session() as sess:
sess.run(tf.global_variables_initializer())
ckpt_file = tf.train.latest_checkpoint(result_path)
if ckpt_file is not None:
print('Restoring model from {}...'.format(ckpt_file))
saver.restore(sess, ckpt_file)
for epoch in range(epochs):
for name in data_name:
data = np.load(data_path + name)
boards = data["boards"]
wins = data["wins"]
ps = data["ps"]
print (boards.shape)
print (wins.shape)
print (ps.shape)
batch_num = boards.shape[0] // batch_size
index = np.arange(boards.shape[0])
np.random.shuffle(index)
value_losses = []
policy_losses = []
regs = []
time_train = -time.time()
for iter in range(batch_num):
lv, lp, r, value, prob, _ = sess.run([value_loss, policy_loss, reg, v, tf.nn.softmax(p), train_op],
feed_dict={x: boards[
index[iter * batch_size:(iter + 1) * batch_size]],
z: wins[index[
iter * batch_size:(iter + 1) * batch_size]],
pi: ps[index[
iter * batch_size:(iter + 1) * batch_size]],
is_training: True})
value_losses.append(lv)
policy_losses.append(lp)
regs.append(r)
if iter % 1 == 0:
print(
"Epoch: {}, Part {}, Iteration: {}, Time: {}, Value Loss: {}, Policy Loss: {}, Reg: {}".format(
epoch, name, iter, time.time() + time_train, np.mean(np.array(value_losses)),
np.mean(np.array(policy_losses)), np.mean(np.array(regs))))
time_train = -time.time()
value_losses = []
policy_losses = []
regs = []
if iter % 20 == 0:
save_path = "Epoch{}.Part{}.Iteration{}.ckpt".format(epoch, name, iter)
saver.save(sess, result_path + save_path)
del data, boards, wins, ps
def forward(call_number):
#checkpoint_path = "/home/yama/rl/tianshou/AlphaGo/checkpoints"
checkpoint_path = "/home/jialian/stuGo/tianshou/stuGo/checkpoints/"
board_file = np.genfromtxt("/home/jialian/stuGo/tianshou/leela-zero/src/mcts_nn_files/board_" + call_number, dtype='str');
human_board = np.zeros((17, 19, 19))
#TODO : is it ok to ignore the last channel?
for i in range(17):
human_board[i] = np.array(list(board_file[i])).reshape(19, 19)
#print("============================")
#print("human board sum : " + str(np.sum(human_board[-1])))
#print("============================")
#print(human_board)
#print("============================")
#rint(human_board)
feed_board = human_board.transpose(1, 2, 0).reshape(1, 19, 19, 17)
#print(feed_board[:,:,:,-1])
#print(feed_board.shape)
#npz_board = np.load("/home/yama/rl/tianshou/AlphaGo/data/7f83928932f64a79bc1efdea268698ae.npz")
#print(npz_board["boards"].shape)
#feed_board = npz_board["boards"][10].reshape(-1, 19, 19, 17)
##print(feed_board)
#show_board = feed_board[0].transpose(2, 0, 1)
#print("board shape : ", show_board.shape)
#print(show_board)
itflag = False
result_path = "./checkpoints/"
with multi_gpu.create_session() as sess:
sess.run(tf.global_variables_initializer())
ckpt_file = tf.train.latest_checkpoint(checkpoint_path)
if ckpt_file is not None:
#print('Restoring model from {}...'.format(ckpt_file))
saver.restore(sess, ckpt_file)
else:
raise ValueError("No model loaded")
res = sess.run([tf.nn.softmax(p),v], feed_dict={x:feed_board, is_training:itflag})
#res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][300].reshape(-1, 19, 19, 17), is_training:False})
#res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][50].reshape(-1, 19, 19, 17), is_training:True})
#print(np.argmax(res[0]))
np.savetxt(sys.stdout, res[0][0], fmt="%.6f", newline=" ")
np.savetxt(sys.stdout, res[1][0], fmt="%.6f", newline=" ")
pv_file = "/home/jialian/stuGotianshou/leela-zero/src/mcts_nn_files/policy_value"
np.savetxt(pv_file, np.concatenate((res[0][0], res[1][0])), fmt="%.6f", newline=" ")
#np.savetxt(pv_file, res[1][0], fmt="%.6f", newline=" ")
return res
if __name__=='__main__':
np.set_printoptions(threshold='nan')
#time.sleep(2)
forward(sys.argv[1])
# def forward(call_number):
# # checkpoint_path = "/home/yama/rl/tianshou/AlphaGo/checkpoints"
# checkpoint_path = "/home/jialian/stuGo/tianshou/stuGo/checkpoints/"
# board_file = np.genfromtxt("/home/jialian/stuGo/tianshou/leela-zero/src/mcts_nn_files/board_" + call_number,
# dtype='str');
# human_board = np.zeros((17, 19, 19))
#
# # TODO : is it ok to ignore the last channel?
# for i in range(17):
# human_board[i] = np.array(list(board_file[i])).reshape(19, 19)
# # print("============================")
# # print("human board sum : " + str(np.sum(human_board[-1])))
# # print("============================")
# # print(human_board)
# # print("============================")
# # rint(human_board)
# feed_board = human_board.transpose(1, 2, 0).reshape(1, 19, 19, 17)
# # print(feed_board[:,:,:,-1])
# # print(feed_board.shape)
#
# # npz_board = np.load("/home/yama/rl/tianshou/AlphaGo/data/7f83928932f64a79bc1efdea268698ae.npz")
# # print(npz_board["boards"].shape)
# # feed_board = npz_board["boards"][10].reshape(-1, 19, 19, 17)
# ##print(feed_board)
# # show_board = feed_board[0].transpose(2, 0, 1)
# # print("board shape : ", show_board.shape)
# # print(show_board)
#
# itflag = False
# with multi_gpu.create_session() as sess:
# sess.run(tf.global_variables_initializer())
# ckpt_file = tf.train.latest_checkpoint(checkpoint_path)
# if ckpt_file is not None:
# # print('Restoring model from {}...'.format(ckpt_file))
# saver.restore(sess, ckpt_file)
# else:
# raise ValueError("No model loaded")
# res = sess.run([tf.nn.softmax(p), v], feed_dict={x: feed_board, is_training: itflag})
# # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][300].reshape(-1, 19, 19, 17), is_training:False})
# # res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][50].reshape(-1, 19, 19, 17), is_training:True})
# # print(np.argmax(res[0]))
# np.savetxt(sys.stdout, res[0][0], fmt="%.6f", newline=" ")
# np.savetxt(sys.stdout, res[1][0], fmt="%.6f", newline=" ")
# pv_file = "/home/jialian/stuGotianshou/leela-zero/src/mcts_nn_files/policy_value"
# np.savetxt(pv_file, np.concatenate((res[0][0], res[1][0])), fmt="%.6f", newline=" ")
# # np.savetxt(pv_file, res[1][0], fmt="%.6f", newline=" ")
# return res
def forward(state):
checkpoint_path = "/home/tongzheng/tianshou/AlphaGo/checkpoints/"
with multi_gpu.create_session() as sess:
sess.run(tf.global_variables_initializer())
ckpt_file = tf.train.latest_checkpoint(checkpoint_path)
if ckpt_file is not None:
print('Restoring model from {}...'.format(ckpt_file))
saver.restore(sess, ckpt_file)
else:
raise ValueError("No model loaded")
prior, value = sess.run([tf.nn.softmax(p), v], feed_dict={x: state, is_training: False})
return prior, value
if __name__ == '__main__':
np.set_printoptions(threshold='nan')
# time.sleep(2)
forward(sys.argv[1])
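The new forward(state) entry point replaces the file-based forward(call_number): instead of reading a board dump written by leela-zero and writing the policy/value to a text file, it takes an in-memory [1, 19, 19, 17] feature tensor and returns the softmaxed move prior and the value head output. A minimal sketch of calling it (not part of the commit; it assumes a TF1 environment with the multi_gpu helper importable and a checkpoint under the hard-coded checkpoint_path, otherwise forward raises ValueError("No model loaded")):

import numpy as np
import Network  # importing the module also builds the graph, losses and saver

# dummy input for illustration: 8 black-stone planes, 8 white-stone planes, 1 colour plane
state = np.zeros((1, 19, 19, 17), dtype=np.float32)
state[0, :, :, 16] = 1.0  # all-ones colour plane means black to move (see strategy.data_process)
prior, value = Network.forward(state)
print(prior.shape)   # (1, 362): 361 board points plus pass
print(value.shape)   # (1, 1): tanh output in [-1, 1]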


@@ -12,39 +12,39 @@ os.environ["CUDA_VISIBLE_DEVICES"] = "1"
def residual_block(input, is_training):
normalizer_params = {'is_training': is_training,
'updates_collections': tf.GraphKeys.UPDATE_OPS}
h = layers.conv2d(input, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu,
normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params,
weights_regularizer=layers.l2_regularizer(1e-4))
h = layers.conv2d(h, 256, kernel_size=3, stride=1, activation_fn=tf.identity,
normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params,
weights_regularizer=layers.l2_regularizer(1e-4))
h = h + input
return tf.nn.relu(h)
def policy_heads(input, is_training):
normalizer_params = {'is_training': is_training,
'updates_collections': tf.GraphKeys.UPDATE_OPS}
h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu,
normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params,
weights_regularizer=layers.l2_regularizer(1e-4))
h = layers.flatten(h)
h = layers.fully_connected(h, 362, activation_fn=tf.identity, weights_regularizer=layers.l2_regularizer(1e-4))
return h
def value_heads(input, is_training):
normalizer_params = {'is_training': is_training,
'updates_collections': tf.GraphKeys.UPDATE_OPS}
h = layers.conv2d(input, 2, kernel_size=1, stride=1, activation_fn=tf.nn.relu,
normalizer_fn=layers.batch_norm, normalizer_params=normalizer_params,
weights_regularizer=layers.l2_regularizer(1e-4))
h = layers.flatten(h)
h = layers.fully_connected(h, 256, activation_fn=tf.nn.relu, weights_regularizer=layers.l2_regularizer(1e-4))
h = layers.fully_connected(h, 1, activation_fn=tf.nn.tanh, weights_regularizer=layers.l2_regularizer(1e-4))
return h
x = tf.placeholder(tf.float32, shape=[None, 19, 19, 17])
@@ -53,10 +53,10 @@ z = tf.placeholder(tf.float32, shape=[None, 1])
pi = tf.placeholder(tf.float32, shape=[None, 362])
h = layers.conv2d(x, 256, kernel_size=3, stride=1, activation_fn=tf.nn.relu, normalizer_fn=layers.batch_norm,
normalizer_params={'is_training': is_training, 'updates_collections': tf.GraphKeys.UPDATE_OPS},
weights_regularizer=layers.l2_regularizer(1e-4))
for i in range(19):
h = residual_block(h, is_training)
v = value_heads(h, is_training)
p = policy_heads(h, is_training)
# loss = tf.reduce_mean(tf.square(z-v)) - tf.multiply(pi, tf.log(tf.clip_by_value(tf.nn.softmax(p), 1e-8, tf.reduce_max(tf.nn.softmax(p)))))
@@ -68,106 +68,108 @@ total_loss = value_loss + policy_loss + reg
# train_op = tf.train.MomentumOptimizer(1e-4, momentum=0.9, use_nesterov=True).minimize(total_loss)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
train_op = tf.train.RMSPropOptimizer(1e-4).minimize(total_loss)
var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
saver = tf.train.Saver(max_to_keep=10, var_list=var_list)
def train():
data_path = "/home/tongzheng/data/"
data_name = os.listdir("/home/tongzheng/data/")
epochs = 100
batch_size = 128
result_path = "./checkpoints/"
with multi_gpu.create_session() as sess:
sess.run(tf.global_variables_initializer())
ckpt_file = tf.train.latest_checkpoint(result_path)
if ckpt_file is not None:
print('Restoring model from {}...'.format(ckpt_file))
saver.restore(sess, ckpt_file)
for epoch in range(epochs):
for name in data_name:
data = np.load(data_path + name)
boards = data["boards"]
wins = data["wins"]
ps = data["ps"]
print (boards.shape)
print (wins.shape)
print (ps.shape)
# batch_num = 1
batch_num = boards.shape[0] // batch_size
index = np.arange(boards.shape[0])
np.random.shuffle(index)
value_losses = []
policy_losses = []
regs = []
time_train = -time.time()
for iter in range(batch_num):
lv, lp, r, _ = sess.run([value_loss, policy_loss, reg, train_op],
feed_dict={x: boards[
index[iter * batch_size:(iter + 1) * batch_size]],
z: wins[index[
iter * batch_size:(iter + 1) * batch_size]],
pi: ps[index[
iter * batch_size:(iter + 1) * batch_size]],
is_training: True})
value_losses.append(lv)
policy_losses.append(lp)
regs.append(r)
del lv, lp, r
if iter % 1 == 0:
print(
"Epoch: {}, Part {}, Iteration: {}, Time: {}, Value Loss: {}, Policy Loss: {}, Reg: {}".format(
epoch, name, iter, time.time() + time_train, np.mean(np.array(value_losses)),
np.mean(np.array(policy_losses)), np.mean(np.array(regs))))
del value_losses, policy_losses, regs, time_train
time_train = -time.time()
value_losses = []
policy_losses = []
regs = []
if iter % 20 == 0:
save_path = "Epoch{}.Part{}.Iteration{}.ckpt".format(epoch, name, iter)
saver.save(sess, result_path + save_path)
del save_path
del data, boards, wins, ps, batch_num, index
gc.collect()
def forward(board):
result_path = "./checkpoints"
itflag = False
res = None
if board is None:
# data = np.load("/home/tongzheng/meta-data/80b7bf21bce14862806d48c3cd760a1b.npz")
data = np.load("./data/7f83928932f64a79bc1efdea268698ae.npz")
board = data["boards"][50].reshape(-1, 19, 19, 17)
human_board = board[0].transpose(2, 0, 1)
print("============================")
print("human board sum : " + str(np.sum(human_board)))
print("============================")
print(board[:,:,:,-1])
itflag = False
with multi_gpu.create_session() as sess:
sess.run(tf.global_variables_initializer())
ckpt_file = tf.train.latest_checkpoint(result_path)
if ckpt_file is not None:
print('Restoring model from {}...'.format(ckpt_file))
saver.restore(sess, ckpt_file)
else:
raise ValueError("No model loaded")
res = sess.run([tf.nn.softmax(p), v], feed_dict={x: board, is_training: itflag})
# res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][300].reshape(-1, 19, 19, 17), is_training:False})
# res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][50].reshape(-1, 19, 19, 17), is_training:True})
# print(np.argmax(res[0]))
print(res)
print(data["p"][0])
print(np.argmax(res[0]))
print(np.argmax(data["p"][0]))
# print(res[0].tolist()[0])
# print(np.argmax(res[0]))
return res
result_path = "./checkpoints"
itflag = False
res = None
if board is None:
# data = np.load("/home/tongzheng/meta-data/80b7bf21bce14862806d48c3cd760a1b.npz")
data = np.load("./data/7f83928932f64a79bc1efdea268698ae.npz")
board = data["boards"][50].reshape(-1, 19, 19, 17)
human_board = board[0].transpose(2, 0, 1)
print("============================")
print("human board sum : " + str(np.sum(human_board)))
print("============================")
print(board[:, :, :, -1])
itflag = False
with multi_gpu.create_session() as sess:
sess.run(tf.global_variables_initializer())
ckpt_file = tf.train.latest_checkpoint(result_path)
if ckpt_file is not None:
print('Restoring model from {}...'.format(ckpt_file))
saver.restore(sess, ckpt_file)
else:
raise ValueError("No model loaded")
res = sess.run([tf.nn.softmax(p), v], feed_dict={x: board, is_training: itflag})
# res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][300].reshape(-1, 19, 19, 17), is_training:False})
# res = sess.run([tf.nn.softmax(p),v], feed_dict={x:fix_board["boards"][50].reshape(-1, 19, 19, 17), is_training:True})
# print(np.argmax(res[0]))
print(res)
print(data["p"][0])
print(np.argmax(res[0]))
print(np.argmax(data["p"][0]))
# print(res[0].tolist()[0])
# print(np.argmax(res[0]))
return res
if __name__ == '__main__':
# train()
# if sys.argv[1] == "test":
forward(None)


@@ -6,7 +6,8 @@
#
from game import Game
import utils
class GTPEngine():
def __init__(self, **kwargs):
@@ -27,7 +28,6 @@ class GTPEngine():
except:
self._version = 2
self.disconnect = False
self.known_commands = [
@@ -42,9 +42,6 @@ class GTPEngine():
x, y = vertex
return "{}{}".format("ABCDEFGHJKLMNOPQRSTYVWYZ"[x - 1], y)
def _vertex_string2point(self, s):
if s is None:
return False
@@ -62,7 +59,6 @@ class GTPEngine():
return False
return (x, y)
def _parse_color(self, color):
if color.lower() in ["b", "black"]:
color = utils.BLACK
@@ -72,21 +68,18 @@ class GTPEngine():
color = None
return color
def _parse_move(self, move_string):
color, move = move_string.split(" ",1)
color, move = move_string.split(" ", 1)
color = self._parse_color(color)
point = self._vertex_string2point(move)
if point and color:
return color,point
return color, point
else:
return False
def _parse_res(self, res, id_ = None, success = True):
def _parse_res(self, res, id_=None, success=True):
if success:
if id_:
return '={} {}\n\n'.format(id_, res)
@@ -98,7 +91,6 @@ class GTPEngine():
else:
return '? {}\n\n'.format(res)
def _parse_cmd(self, message):
try:
m = message.strip().split(" ", 1)
@@ -119,19 +111,17 @@ class GTPEngine():
return self._parse_res("invaild message", id_, False)
if cmd in self.known_commands:
#dispatch
#try:
# dispatch
# try:
if True:
res, flag = getattr(self, "cmd_" + cmd)(args)
return self._parse_res(res, id_, flag)
#except Exception as e:
# print(e)
# return self._parse_res("command excution failed", id_, False)
# except Exception as e:
# print(e)
# return self._parse_res("command excution failed", id_, False)
else:
return self._parse_res("unknown command", id_, False)
def cmd_protocol_version(self, args, **kwargs):
return 2, True
@@ -148,50 +138,45 @@ class GTPEngine():
return self.known_commands, True
def cmd_quit(self, args, **kwargs):
return None,True
return None, True
def cmd_boardsize(self, args, **kwargs):
if args.isdigit():
size = int(args)
self.size = size
self._game.set_size(size)
return None,True
return None, True
else:
return 'non digit size',False
return 'non digit size', False
def cmd_clear_board(self, args, **kwargs):
self._game.clear()
return None,True
return None, True
def cmd_komi(self, args, **kwargs):
try:
komi = float(args)
self.komi = komi
self._game.set_komi(komi)
return None,True
return None, True
except ValueError:
raise ValueError("syntax error")
def cmd_play(self, args, **kwargs):
move = self._parse_move(args)
if move:
color, vertex = move
res = self._game.do_move(color, vertex)
if res:
return None,True
return None, True
else:
return None,False
return None,True
return None, False
return None, True
def cmd_genmove(self, args, **kwargs):
color = self._parse_color(args)
if color:
move = self._game.gen_move(color)
return self._vertex_point2string(move),True
return self._vertex_point2string(move), True
else:
return 'unknown player',False
return 'unknown player', False
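For context, the GTP engine dispatches each incoming line to a cmd_<name> method and frames the reply in GTP style via _parse_res ('=<id> ...' on success, '? ...' on failure). A short hedged session sketch, mirroring AlphaGo/test.py; the exact reply strings depend on run_cmd, which is not shown in this hunk:

from game import Game
from engine import GTPEngine

engine = GTPEngine(game_obj=Game())
print(engine.run_cmd('1 protocol_version'))   # success reply framed as '=1 ...'
print(engine.run_cmd('2 boardsize 19'))       # cmd_boardsize returns (None, True) for digit sizes
print(engine.run_cmd('3 play BLACK D4'))      # cmd_play -> (None, True) if Game.do_move accepts the vertex
print(engine.run_cmd('4 genmove WHITE'))      # cmd_genmove returns the vertex string from Game.gen_move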

AlphaGo/game.py Normal file

@@ -0,0 +1,69 @@
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
# $File: game.py
# $Date: Fri Nov 17 15:07:45 2017 +0800
# $Author: renyong15 © <mails.tsinghua.edu.cn>
#
import numpy as np
import utils
import Network
from strategy import strategy
from collections import deque
class Game:
def __init__(self, size=19, komi=6.5):
self.size = size
self.komi = 6.5
self.board = [utils.EMPTY] * (self.size * self.size)
self.strategy = strategy(Network.forward)
self.history = deque(maxlen=8)
for i in range(8):
self.history.append(self.board)
def _flatten(self, vertex):
x, y = vertex
return (x - 1) * self.size + (y - 1)
def clear(self):
self.board = [utils.EMPTY] * (self.size * self.size)
def set_size(self, n):
self.size = n
self.clear()
def set_komi(self, k):
self.komi = k
def do_move(self, color, vertex):
if vertex == utils.PASS:
return True
id_ = self._flatten(vertex)
if self.board[id_] == utils.EMPTY:
self.board[id_] = color
self.history.append(self.board)
return True
else:
return False
def step_forward(self, state, action):
if state[0, 0, 0, -1] == 1:
color = 1
else:
color = -1
if action == 361:
vertex = (0, 0)
else:
vertex = (action / 19 + 1, action % 19)
self.do_move(color, vertex)
new_state = np.concatenate([state[:, :, :, 1:8], self.board == 1, state[:, :, :, 9:16], 1 - state[:, :, :, -1]],
axis=3)
return new_state, 0
def gen_move(self, color):
move = self.strategy.gen_move(self.history, color)
return move
# return utils.PASS
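Game keeps an 8-deep history of flat boards and, together with strategy.data_process, encodes them into the 17-plane network input: planes 0-7 are the black stones over the last 8 positions, planes 8-15 the white stones, and plane 16 is all ones when black is to move. A hedged sketch of that encoding (the helper name is mine; note that data_process compares the raw Python list with ==, so in practice the board needs to be wrapped in np.array as GoEnv.step_forward in strategy.py does):

import numpy as np
from collections import deque

def planes_from_history(history, color, size=19):
    state = np.zeros([1, size, size, 17])
    for i in range(8):
        board = np.array(history[i]).reshape(size, size)
        state[0, :, :, i] = board == 1        # black stones at step i
        state[0, :, :, i + 8] = board == -1   # white stones at step i
    if color == 1:                            # colour-to-move plane
        state[0, :, :, 16] = np.ones([size, size])
    return state

history = deque([[0] * 361] * 8, maxlen=8)    # eight empty 19x19 boards
print(planes_from_history(history, color=1).shape)  # (1, 19, 19, 17)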


@@ -3,7 +3,6 @@ import go
import utils
def translate_gtp_colors(gtp_color):
if gtp_color == gtp.BLACK:
return go.BLACK
@@ -12,6 +11,7 @@ def translate_gtp_colors(gtp_color):
else:
return go.EMPTY
class GtpInterface(object):
def __init__(self):
self.size = 9
@@ -68,19 +68,3 @@ class GtpInterface(object):
def suggest_move(self, position):
raise NotImplementedError
def make_gtp_instance(strategy_name, read_file):
n = PolicyNetwork(use_cpu=True)
n.initialize_variables(read_file)
if strategy_name == 'random':
instance = RandomPlayer()
elif strategy_name == 'policy':
instance = GreedyPolicyPlayer(n)
elif strategy_name == 'randompolicy':
instance = RandomPolicyPlayer(n)
elif strategy_name == 'mcts':
instance = MCTSPlayer(n)
else:
return None
gtp_engine = gtp.Engine(instance)
return gtp_engine


@@ -39,7 +39,7 @@ class block(object):
self.wins = np.concatenate(self.wins, axis=0)
self.ps = np.concatenate(self.ps, axis=0)
print ("Block {}, Boards shape {}, Wins Shape {}, Ps Shape {}".format(self.block_id, self.boards.shape[0],
self.wins.shape[0], self.ps.shape[0]))
np.savez(save_path + "block" + str(self.block_id), boards=self.boards, wins=self.wins, ps=self.ps)
self.boards = []
self.wins = []
@@ -111,7 +111,7 @@ for n in name:
p_ori[:, -1].reshape(-1, 1)],
axis=1)
concat(block_list, board_aug, p_aug, win_ori)
print ("Finished {} with time {}".format(n, time.time()+start))
print ("Finished {} with time {}".format(n, time.time() + start))
data_num = 0
for i in range(slots_num):
print("Block {} ".format(block_list[i].block_id) + "Size {}".format(block_list[i].store_num()))

AlphaGo/strategy.py Normal file

@@ -0,0 +1,78 @@
import numpy as np
import utils
from collections import deque
from tianshou.core.mcts.mcts import MCTS
class GoEnv:
def __init__(self, size=19, komi=6.5):
self.size = size
self.komi = 6.5
self.board = [utils.EMPTY] * (self.size * self.size)
self.history = deque(maxlen=8)
def _flatten(self, vertex):
x, y = vertex
return (x - 1) * self.size + (y - 1)
def do_move(self, color, vertex):
if vertex == utils.PASS:
return True
id_ = self._flatten(vertex)
if self.board[id_] == utils.EMPTY:
self.board[id_] = color
self.history.append(self.board)
return True
else:
return False
def step_forward(self, state, action):
# print(state)
if state[0, 0, 0, -1] == 1:
color = 1
else:
color = -1
if action == 361:
vertex = (0, 0)
else:
vertex = (action / 19 + 1, action % 19)
self.do_move(color, vertex)
new_state = np.concatenate(
[state[:, :, :, 1:8], (np.array(self.board) == 1).reshape(1, 19, 19, 1),
state[:, :, :, 9:16], (np.array(self.board) == -1).reshape(1, 19, 19, 1),
np.array(1 - state[:, :, :, -1]).reshape(1, 19, 19, 1)],
axis=3)
return new_state, 0
class strategy(object):
def __init__(self, evaluator):
self.simulator = GoEnv()
self.evaluator = evaluator
def data_process(self, history, color):
state = np.zeros([1, 19, 19, 17])
for i in range(8):
state[0, :, :, i] = history[i] == 1
state[0, :, :, i + 8] = history[i] == -1
if color == 1:
state[0, :, :, 16] = np.ones([19, 19])
if color == -1:
state[0, :, :, 16] = np.zeros([19, 19])
return state
def gen_move(self, history, color):
self.simulator.history = history
self.simulator.board = history[-1]
state = self.data_process(history, color)
prior = self.evaluator(state)[0]
mcts = MCTS(self.simulator, self.evaluator, state, 362, prior, inverse=True, max_step=20)
temp = 1
p = mcts.root.N ** temp / np.sum(mcts.root.N ** temp)
choice = np.random.choice(362, 1, p=p).tolist()[0]
if choice == 361:
move = (0, 0)
else:
move = (choice / 19 + 1, choice % 19 + 1)
return move
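gen_move runs a short MCTS (max_step=20) seeded with the network prior and then samples the move from the root visit counts; index 361 is a pass and everything else maps back to a 1-based board vertex. A hedged sketch of that selection step (the commit exponentiates by temp directly, which coincides with the usual N**(1/tau) rule at temp = 1, and uses '/' for the vertex arithmetic, which only floors under Python 2):

import numpy as np

def sample_move(visit_counts, temp=1.0):
    # visit_counts plays the role of mcts.root.N; assumes at least one visit was made
    weights = visit_counts.astype(float) ** temp
    probs = weights / np.sum(weights)
    choice = np.random.choice(len(visit_counts), p=probs)
    if choice == 361:                               # pass
        return (0, 0)
    return (choice // 19 + 1, choice % 19 + 1)      # 1-based (row, column) vertex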


@@ -8,10 +8,8 @@
from game import Game
from engine import GTPEngine
g = Game()
e = GTPEngine(game_obj = g)
e = GTPEngine(game_obj=g)
res = e.run_cmd('1 protocol_version')
print(e.known_commands)
print(res)
@@ -37,4 +35,5 @@ print(res)
res = e.run_cmd('8 genmove BLACK')
print(res)
res = e.run_cmd('9 genmove WHITE')
print(res)

AlphaGo/utils.py Normal file

@@ -0,0 +1,119 @@
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
# $File: utils.py
# $Date: Fri Nov 17 10:24:07 2017 +0800
# $Author: renyong15 © <mails.tsinghua.edu.cn>
#
WHITE = -1
BLACK = +1
EMPTY = 0
PASS = (0, 0)
RESIGN = "resign"
from collections import defaultdict
import functools
import itertools
import operator
import random
import re
import time
import gtp
import go
KGS_COLUMNS = 'ABCDEFGHJKLMNOPQRST'
SGF_COLUMNS = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
def parse_sgf_to_flat(sgf):
return flatten_coords(parse_sgf_coords(sgf))
def flatten_coords(c):
return go.N * c[0] + c[1]
def unflatten_coords(f):
return divmod(f, go.N)
def parse_sgf_coords(s):
'Interprets coords. aa is top left corner; sa is top right corner'
if s is None or s == '':
return None
return SGF_COLUMNS.index(s[1]), SGF_COLUMNS.index(s[0])
def unparse_sgf_coords(c):
if c is None:
return ''
return SGF_COLUMNS[c[1]] + SGF_COLUMNS[c[0]]
def parse_kgs_coords(s):
'Interprets coords. A1 is bottom left; A9 is top left.'
if s == 'pass':
return None
s = s.upper()
col = KGS_COLUMNS.index(s[0])
row_from_bottom = int(s[1:]) - 1
return go.N - row_from_bottom - 1, col
def parse_pygtp_coords(vertex):
'Interprets coords. (1, 1) is bottom left; (1, 9) is top left.'
if vertex in (gtp.PASS, gtp.RESIGN):
return None
return go.N - vertex[1], vertex[0] - 1
def unparse_pygtp_coords(c):
if c is None:
return gtp.PASS
return c[1] + 1, go.N - c[0]
def parse_game_result(result):
if re.match(r'[bB]\+', result):
return go.BLACK
elif re.match(r'[wW]\+', result):
return go.WHITE
else:
return None
def product(numbers):
return functools.reduce(operator.mul, numbers)
def take_n(n, iterable):
return list(itertools.islice(iterable, n))
def iter_chunks(chunk_size, iterator):
while True:
next_chunk = take_n(chunk_size, iterator)
# If len(iterable) % chunk_size == 0, don't return an empty chunk.
if next_chunk:
yield next_chunk
else:
break
def shuffler(iterator, pool_size=10**5, refill_threshold=0.9):
yields_between_refills = round(pool_size * (1 - refill_threshold))
# initialize pool; this step may or may not exhaust the iterator.
pool = take_n(pool_size, iterator)
while True:
random.shuffle(pool)
for i in range(yields_between_refills):
yield pool.pop()
next_batch = take_n(yields_between_refills, iterator)
if not next_batch:
break
pool.extend(next_batch)
# finish consuming whatever's left - no need for further randomization.
yield from pool
class timer(object):
all_times = defaultdict(float)
def __init__(self, label):
self.label = label
def __enter__(self):
self.tick = time.time()
def __exit__(self, type, value, traceback):
self.tock = time.time()
self.all_times[self.label] += self.tock - self.tick
@classmethod
def print_times(cls):
for k, v in cls.all_times.items():
print("%s: %.3f" % (k, v))
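A hedged usage sketch for the helpers added in utils.py (it assumes the gtp and go modules imported at the top are importable and that go.N is 19):

import utils

coord = utils.parse_kgs_coords('D4')        # KGS-style vertex -> (row, col), A1 at the bottom left
sgf = utils.unparse_sgf_coords(coord)       # back to an SGF letter pair
print(utils.product([1, 2, 3, 4]))          # 24

with utils.timer('demo'):
    utils.take_n(3, iter(range(10)))        # [0, 1, 2]
utils.timer.print_times()                   # prints 'demo: <elapsed seconds>'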

Binary file not shown.

Binary file not shown.

Binary file not shown.


@@ -1,7 +0,0 @@
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
# $File: __init__.py
# $Date: Thu Nov 16 14:10:06 2017 +0800
# $Author: renyong15 © <mails.tsinghua.edu.cn>
#


@@ -1,50 +0,0 @@
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
# $File: game.py
# $Date: Fri Nov 17 15:07:45 2017 +0800
# $Author: renyong15 © <mails.tsinghua.edu.cn>
#
import utils
class Game:
def __init__(self, size=19, komi=6.5):
self.size = size
self.komi = 6.5
self.board = [utils.EMPTY] * (self.size * self.size)
self.strategy = None
def _flatten(self, vertex):
x,y = vertex
return (x-1) * self.size + (y-1)
def clear(self):
self.board = [utils.EMPTY] * (self.size * self.size)
def set_size(self, n):
self.size = n
self.clear()
def set_komi(self, k):
self.komi = k
def do_move(self, color, vertex):
if vertex == utils.PASS:
return True
id_ = self._flatten(vertex)
if self.board[id_] == utils.EMPTY:
self.board[id_] = color
return True
else:
return False
def gen_move(self, color):
move = self.strategy.gen_move(color)
return move
#return utils.PASS


@@ -1,16 +0,0 @@
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
# $File: utils.py
# $Date: Fri Nov 17 10:24:07 2017 +0800
# $Author: renyong15 © <mails.tsinghua.edu.cn>
#
WHITE = -1
BLACK = +1
EMPTY = 0
PASS = (0,0)
RESIGN = "resign"


@@ -25,4 +25,4 @@ class rollout_policy(evaluator):
action = np.random.randint(0, self.action_num)
state, reward = self.env.step_forward(state, action)
total_reward += reward
return total_reward
return np.ones([self.action_num])/self.action_num, total_reward
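With this change every evaluator returns a (prior, value) pair rather than a bare rollout return, which is what ActionNode.expansion now consumes. A minimal stand-in with the same contract (illustrative only, not part of the commit):

import numpy as np

def uniform_evaluator(state, action_num=362):
    # same signature as rollout_policy(...) / Network.forward: prior over actions plus a scalar value
    return np.ones([action_num]) / action_num, 0.0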


@@ -5,14 +5,29 @@ import time
c_puct = 5
def list2tuple(list):
try:
return tuple(list2tuple(sub) for sub in list)
except TypeError:
return list
def tuple2list(tuple):
try:
return list(tuple2list(sub) for sub in tuple)
except TypeError:
return tuple
class MCTSNode(object):
def __init__(self, parent, action, state, action_num, prior):
def __init__(self, parent, action, state, action_num, prior, inverse=False):
self.parent = parent
self.action = action
self.children = {}
self.state = state
self.action_num = action_num
self.prior = prior
self.inverse = inverse
def selection(self, simulator):
raise NotImplementedError("Need to implement function selection")
@@ -20,13 +35,10 @@ class MCTSNode(object):
def backpropagation(self, action):
raise NotImplementedError("Need to implement function backpropagation")
def simulation(self, state, evaluator):
raise NotImplementedError("Need to implement function simulation")
class UCTNode(MCTSNode):
def __init__(self, parent, action, state, action_num, prior):
super(UCTNode, self).__init__(parent, action, state, action_num, prior)
def __init__(self, parent, action, state, action_num, prior, inverse=False):
super(UCTNode, self).__init__(parent, action, state, action_num, prior, inverse)
self.Q = np.zeros([action_num])
self.W = np.zeros([action_num])
self.N = np.zeros([action_num])
@@ -49,16 +61,15 @@ class UCTNode(MCTSNode):
self.Q[i] = (self.W[i] + 0.) / self.N[i]
self.ucb = self.Q + c_puct * self.prior * math.sqrt(np.sum(self.N)) / (self.N + 1.)
if self.parent is not None:
self.parent.backpropagation(self.children[action].reward)
def simulation(self, evaluator, state):
value = evaluator(state)
return value
if self.inverse:
self.parent.backpropagation(-self.children[action].reward)
else:
self.parent.backpropagation(self.children[action].reward)
class TSNode(MCTSNode):
def __init__(self, parent, action, state, action_num, prior, method="Gaussian"):
super(TSNode, self).__init__(parent, action, state, action_num, prior)
def __init__(self, parent, action, state, action_num, prior, method="Gaussian", inverse=False):
super(TSNode, self).__init__(parent, action, state, action_num, prior, inverse)
if method == "Beta":
self.alpha = np.ones([action_num])
self.beta = np.ones([action_num])
@@ -73,10 +84,27 @@ class ActionNode:
self.action = action
self.children = {}
self.next_state = None
self.origin_state = None
self.state_type = None
self.reward = 0
def type_conversion_to_tuple(self):
if type(self.next_state) is np.ndarray:
self.next_state = self.next_state.tolist()
if type(self.next_state) is list:
self.next_state = list2tuple(self.next_state)
def type_conversion_to_origin(self):
if self.state_type is np.ndarray:
self.next_state = np.array(self.next_state)
if self.state_type is list:
self.next_state = tuple2list(self.next_state)
def selection(self, simulator):
self.next_state, self.reward = simulator.step_forward(self.parent.state, self.action)
self.origin_state = self.next_state
self.state_type = type(self.next_state)
self.type_conversion_to_tuple()
if self.next_state is not None:
if self.next_state in self.children.keys():
return self.children[self.next_state].selection(simulator)
@@ -85,14 +113,15 @@ class ActionNode:
else:
return self.parent, self.action
def expansion(self, action_num):
def expansion(self, evaluator, action_num):
# TODO: Let users/evaluator give the prior
if self.next_state is not None:
prior = np.ones([action_num]) / action_num
self.children[self.next_state] = UCTNode(self, self.action, self.next_state, action_num, prior)
return True
prior, value = evaluator(self.next_state)
self.children[self.next_state] = UCTNode(self, self.action, self.origin_state, action_num, prior,
self.parent.inverse)
return value
else:
return False
return 0
def backpropagation(self, value):
self.reward += value
@@ -100,14 +129,16 @@ class MCTS:
class MCTS:
def __init__(self, simulator, evaluator, root, action_num, prior, method="UCT", max_step=None, max_time=None):
def __init__(self, simulator, evaluator, root, action_num, prior, method="UCT", inverse=False, max_step=None,
max_time=None):
self.simulator = simulator
self.evaluator = evaluator
self.action_num = action_num
if method == "UCT":
self.root = UCTNode(None, None, root, action_num, prior)
self.root = UCTNode(None, None, root, action_num, prior, inverse)
if method == "TS":
self.root = TSNode(None, None, root, action_num, prior)
self.root = TSNode(None, None, root, action_num, prior, inverse=inverse)
self.inverse = inverse
if max_step is not None:
self.step = 0
self.max_step = max_step
@@ -118,23 +149,15 @@ class MCTS:
raise ValueError("Need a stop criteria!")
while (max_step is not None and self.step < self.max_step or max_step is None) \
and (max_time is not None and time.time() - self.start_time < self.max_time or max_time is None):
print("Q={}".format(self.root.Q))
print("N={}".format(self.root.N))
print("W={}".format(self.root.W))
print("UCB={}".format(self.root.ucb))
print("\n")
self.expand()
if max_step is not None:
self.step += 1
def expand(self):
node, new_action = self.root.selection(self.simulator)
success = node.children[new_action].expansion(self.action_num)
if success:
value = node.simulation(self.evaluator, node.children[new_action].next_state)
node.children[new_action].backpropagation(value + 0.)
else:
node.children[new_action].backpropagation(0.)
value = node.children[new_action].expansion(self.evaluator, self.action_num)
print("Value:{}".format(value))
node.children[new_action].backpropagation(value + 0.)
if __name__ == "__main__":
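The new inverse flag makes backpropagation negate the child's reward before handing it to the parent, so in a two-player zero-sum game each node values positions from its own player's point of view. A simplified illustration of that sign-flipping backup (hypothetical node fields, not the commit's exact classes):

def backpropagate(node, value, inverse=True):
    # walk towards the root, flipping the sign at every ply when inverse is set
    while node is not None:
        node.visits += 1
        node.total_value += value
        value = -value if inverse else value
        node = node.parent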


@@ -24,7 +24,7 @@ class TestEnv:
else:
num = state[0] + 2 ** state[1] * action
step = state[1] + 1
new_state = (num, step)
new_state = [num, step]
if step == self.max_step:
reward = int(np.random.uniform() < self.reward[num])
else:

Binary file not shown.

Binary file not shown.