Merge remote-tracking branch 'origin' into mcts_virtual_loss

This commit is contained in:
mcgrady00h 2017-12-24 15:49:45 +08:00
commit 8c6f44a015
24 changed files with 572 additions and 412 deletions

4
.gitignore vendored
View File

@ -4,8 +4,8 @@ leela-zero
parameters
*.swp
*.sublime*
checkpoints
checkpoints_origin
checkpoint
*.json
.DS_Store
data
.log

1
AlphaGo/.gitignore vendored
View File

@ -1,3 +1,4 @@
data
checkpoints
checkpoints_origin
*.log

29
AlphaGo/data_statistic.py Normal file
View File

@ -0,0 +1,29 @@
import os
import cPickle
class Data(object):
    """One self-play game record as pickled to disk.

    Attributes:
        boards: sequence of board states observed during the game.
        probs: per-move probability distributions.
        winner: game result (1 = black, -1 = white per the statistics
            printed in __main__); 0 until the outcome is known.
    """

    def __init__(self):
        # Start every record empty; fields are filled in as a game is played.
        self.winner = 0
        self.boards = []
        self.probs = []
def file_to_training_data(file_name):
    """Load one pickled game record and return its winner field.

    :param file_name: path to a cPickle file holding a Data-like object
    :return: the stored winner (1 = black, -1 = white); 0 when the file
        exists but cannot be unpickled (best-effort: the error is printed)
    """
    # FIX: renamed the local from `file` (shadowed the builtin) and dropped
    # the redundant seek(0) on a freshly opened file handle.
    with open(file_name, 'rb') as f:
        try:
            data = cPickle.load(f)
            return data.winner
        except Exception as e:
            # Deliberate best-effort: unreadable records count as winner 0.
            print(e)
            return 0
if __name__ == "__main__":
    # Tally outcomes of all pickled self-play records under ./data.
    # Slot 1 counts black wins; winner -1 (white) wraps to the last slot,
    # so win_count[-1] is the white-win counter and slot 0 holds unknowns.
    win_count = [0, 0, 0]
    file_list = os.listdir("./data")
    #print file_list
    # NOTE(review): the loop variable `file` shadows the Python 2 builtin.
    for file in file_list:
        win_count[file_to_training_data("./data/" + file)] += 1
    # Python 2 print statements (this module also imports cPickle).
    print "Total play : " + str(len(file_list))
    print "Black wins : " + str(win_count[1])
    print "White wins : " + str(win_count[-1])

View File

@ -6,6 +6,8 @@
#
from game import Game
import copy
import numpy as np
import utils
@ -183,10 +185,13 @@ class GTPEngine():
return 'unknown player', False
def cmd_get_score(self, args, **kwargs):
return self._game.game_engine.executor_get_score(self._game.board, True), True
return self._game.game_engine.executor_get_score(self._game.board), True
def cmd_show_board(self, args, **kwargs):
return self._game.board, True
board = copy.deepcopy(self._game.board)
if isinstance(board, np.ndarray):
board = board.flatten().tolist()
return board, True
def cmd_get_prob(self, args, **kwargs):
return self._game.prob, True
@ -194,4 +199,4 @@ class GTPEngine():
if __name__ == "main":
game = Game()
engine = GTPEngine(game_obj=Game)
engine = GTPEngine(game_obj=game)

View File

@ -10,12 +10,15 @@ import copy
import tensorflow as tf
import numpy as np
import sys, os
import go
import model
from collections import deque
sys.path.append(os.path.join(os.path.dirname(__file__), os.path.pardir))
from tianshou.core.mcts.mcts import MCTS
import go
import reversi
import time
class Game:
'''
Load the real game and trained weights.
@ -23,23 +26,38 @@ class Game:
TODO : Maybe merge with the engine class in future,
currently leave it untouched for interacting with Go UI.
'''
def __init__(self, size=9, komi=3.75, checkpoint_path=None):
self.size = size
self.komi = komi
self.board = [utils.EMPTY] * (self.size ** 2)
self.history = []
self.latest_boards = deque(maxlen=8)
for _ in range(8):
def __init__(self, name="reversi", role="unknown", debug=False, checkpoint_path=None):
self.name = name
self.role = role
self.debug = debug
if self.name == "go":
self.size = 9
self.komi = 3.75
self.history = []
self.history_length = 8
self.game_engine = go.Go(size=self.size, komi=self.komi, role=self.role)
self.board = [utils.EMPTY] * (self.size ** 2)
elif self.name == "reversi":
self.size = 8
self.history_length = 1
self.history = []
self.game_engine = reversi.Reversi(size=self.size)
self.board = self.game_engine.get_board()
else:
raise ValueError(name + " is an unknown game...")
self.evaluator = model.ResNet(self.size, self.size ** 2 + 1, history_length=self.history_length)
self.latest_boards = deque(maxlen=self.history_length)
for _ in range(self.history_length):
self.latest_boards.append(self.board)
self.evaluator = model.ResNet(self.size, self.size**2 + 1, history_length=8, checkpoint_path=checkpoint_path)
# self.evaluator = lambda state: self.sess.run([tf.nn.softmax(self.net.p), self.net.v],
# feed_dict={self.net.x: state, self.net.is_training: False})
self.game_engine = go.Go(size=self.size, komi=self.komi)
def clear(self):
self.board = [utils.EMPTY] * (self.size ** 2)
self.history = []
for _ in range(8):
if self.name == "go":
self.board = [utils.EMPTY] * (self.size ** 2)
self.history = []
if self.name == "reversi":
self.board = self.game_engine.get_board()
for _ in range(self.history_length):
self.latest_boards.append(self.board)
def set_size(self, n):
@ -50,8 +68,9 @@ class Game:
self.komi = k
def think(self, latest_boards, color):
mcts = MCTS(self.game_engine, self.evaluator, [latest_boards, color], self.size ** 2 + 1, inverse=True)
mcts.search(max_step=20)
mcts = MCTS(self.game_engine, self.evaluator, [latest_boards, color],
self.size ** 2 + 1, role=self.role, debug=self.debug, inverse=True)
mcts.search(max_step=100)
temp = 1
prob = mcts.root.N ** temp / np.sum(mcts.root.N ** temp)
choice = np.random.choice(self.size ** 2 + 1, 1, p=prob).tolist()[0]
@ -65,7 +84,11 @@ class Game:
# this function can be called directly to play the opponent's move
if vertex == utils.PASS:
return True
res = self.game_engine.executor_do_move(self.history, self.latest_boards, self.board, color, vertex)
# TODO this implementation is not very elegant
if self.name == "go":
res = self.game_engine.executor_do_move(self.history, self.latest_boards, self.board, color, vertex)
elif self.name == "reversi":
res = self.game_engine.executor_do_move(self.history, self.latest_boards, self.board, color, vertex)
return res
def think_play_move(self, color):
@ -91,13 +114,14 @@ class Game:
if row[i] < 10:
print(' ', end='')
for j in range(self.size):
print(self.status2symbol(self.board[self._flatten((j + 1, i + 1))]), end=' ')
print(self.status2symbol(self.board[self.game_engine._flatten((j + 1, i + 1))]), end=' ')
print('')
sys.stdout.flush()
if __name__ == "__main__":
g = Game(checkpoint_path='./checkpoints/')
g.show_board()
g = Game("go")
print(g.board)
g.clear()
g.think_play_move(1)
#file = open("debug.txt", "a")
#file.write("mcts check\n")

View File

@ -3,7 +3,7 @@ import utils
import copy
import numpy as np
from collections import deque
import time
'''
Settings of the Go game.
@ -18,6 +18,7 @@ class Go:
def __init__(self, **kwargs):
self.size = kwargs['size']
self.komi = kwargs['komi']
self.role = kwargs['role']
def _flatten(self, vertex):
x, y = vertex
@ -98,7 +99,7 @@ class Go:
def _check_global_isomorphous(self, history_boards, current_board, color, vertex):
repeat = False
next_board = copy.copy(current_board)
next_board = copy.deepcopy(current_board)
next_board[self._flatten(vertex)] = color
self._process_board(next_board, color, vertex)
if next_board in history_boards:
@ -157,7 +158,7 @@ class Go:
vertex = self._deflatten(action)
return vertex
def _is_valid(self, history_boards, current_board, color, vertex):
def _rule_check(self, history_boards, current_board, color, vertex):
### in board
if not self._in_board(vertex):
return False
@ -176,30 +177,30 @@ class Go:
return True
def simulate_is_valid(self, state, action):
def _is_valid(self, state, action):
history_boards, color = state
vertex = self._action2vertex(action)
current_board = history_boards[-1]
if not self._is_valid(history_boards, current_board, color, vertex):
if not self._rule_check(history_boards, current_board, color, vertex):
return False
if not self._knowledge_prunning(current_board, color, vertex):
return False
return True
def simulate_is_valid_list(self, state, action_set):
def simulate_get_mask(self, state, action_set):
# find all the invalid actions
invalid_action_list = []
invalid_action_mask = []
for action_candidate in action_set[:-1]:
# go through all the actions excluding pass
if not self.simulate_is_valid(state, action_candidate):
invalid_action_list.append(action_candidate)
if len(invalid_action_list) < len(action_set) - 1:
invalid_action_list.append(action_set[-1])
if not self._is_valid(state, action_candidate):
invalid_action_mask.append(action_candidate)
if len(invalid_action_mask) < len(action_set) - 1:
invalid_action_mask.append(action_set[-1])
# forbid pass, if we have other choices
# TODO: In fact we should not do this. In some extreme cases, we should permit pass.
return invalid_action_list
return invalid_action_mask
def _do_move(self, board, color, vertex):
if vertex == utils.PASS:
@ -211,20 +212,23 @@ class Go:
def simulate_step_forward(self, state, action):
# initialize the simulate_board from state
history_boards, color = state
vertex = self._action2vertex(action)
new_board = self._do_move(copy.copy(history_boards[-1]), color, vertex)
history_boards.append(new_board)
new_color = -color
return [history_boards, new_color], 0
history_boards, color = copy.deepcopy(state)
if history_boards[-1] == history_boards[-2] and action is utils.PASS:
return None, 2 * (float(self.simple_executor_get_score(history_boards[-1]) > 0)-0.5) * color
else:
vertex = self._action2vertex(action)
new_board = self._do_move(copy.deepcopy(history_boards[-1]), color, vertex)
history_boards.append(new_board)
new_color = -color
return [history_boards, new_color], 0
def executor_do_move(self, history, latest_boards, current_board, color, vertex):
if not self._is_valid(history, current_board, color, vertex):
if not self._rule_check(history, current_board, color, vertex):
return False
current_board[self._flatten(vertex)] = color
self._process_board(current_board, color, vertex)
history.append(copy.copy(current_board))
latest_boards.append(copy.copy(current_board))
history.append(copy.deepcopy(current_board))
latest_boards.append(copy.deepcopy(current_board))
return True
def _find_empty(self, current_board):
@ -280,11 +284,8 @@ class Go:
elif color_estimate < 0:
return utils.WHITE
def executor_get_score(self, current_board, is_unknown_estimation=False):
'''
is_unknown_estimation: whether use nearby stone to predict the unknown
return score from BLACK perspective.
'''
def executor_get_score(self, current_board):
#return score from BLACK perspective.
_board = copy.deepcopy(current_board)
while utils.EMPTY in _board:
vertex = self._find_empty(_board)
@ -294,10 +295,8 @@ class Go:
_board[self._flatten(vertex)] = utils.BLACK
elif boarder_color == {utils.WHITE}:
_board[self._flatten(vertex)] = utils.WHITE
elif is_unknown_estimation:
_board[self._flatten(vertex)] = self._predict_from_nearby(_board, vertex)
else:
_board[self._flatten(vertex)] =utils.UNKNOWN
_board[self._flatten(vertex)] = self._predict_from_nearby(_board, vertex)
score = 0
for i in _board:
if i == utils.BLACK:
@ -308,7 +307,46 @@ class Go:
return score
def simple_executor_get_score(self, current_board):
'''
can only be used for the empty group only have one single stone
return score from BLACK perspective.
'''
score = 0
for idx, color in enumerate(current_board):
if color == utils.EMPTY:
neighbors = self._neighbor(self._deflatten(idx))
color = current_board[self._flatten(neighbors[0])]
if color == utils.BLACK:
score += 1
elif color == utils.WHITE:
score -= 1
score -= self.komi
return score
if __name__ == "__main__":
    go = Go(size=9, komi=3.75, role=utils.BLACK)
    # A finished 9x9 position used to benchmark the two scoring routines.
    endgame = [
        1, 0, 1, 0, 1, 1, -1, 0, -1,
        1, 1, 1, 1, 1, 1, -1, -1, -1,
        0, 1, 1, 1, 1, -1, 0, -1, 0,
        1, 1, 1, 1, 1, -1, -1, -1, -1,
        1, -1, 1, -1, 1, 1, -1, -1, -1,
        -1, -1, -1, -1, -1, 1, -1, 0, -1,
        1, 1, 1, -1, -1, -1, -1, -1, -1,
        1, 0, 1, 1, 1, 1, 1, -1, 0,
        1, 1, 0, 1, -1, -1, -1, -1, -1
    ]
    # Time the full scorer (fills every empty point via _predict_from_nearby)
    # against the single-stone shortcut.
    time0 = time.time()
    score = go.executor_get_score(endgame)
    time1 = time.time()
    print(score, time1 - time0)
    # FIX: was go.new_executor_get_score, which is not defined anywhere in
    # this module; the other scorer being benchmarked here is the simple one.
    score = go.simple_executor_get_score(endgame)
    time2 = time.time()
    print(score, time2 - time1)
'''
### do unit test for Go class
pure_test = [
0, 1, 0, 1, 0, 1, 0, 0, 0,
@ -347,3 +385,4 @@ if __name__ == "__main__":
for i in range(7):
print (go._is_eye(opponent_test, utils.BLACK, ot_qry[i]))
print("Test of eye surrend by opponents\n")
'''

View File

@ -1,7 +1,6 @@
import os
import time
import random
import sys
import copy
import cPickle
from collections import deque
@ -102,7 +101,7 @@ class ResNet(object):
self._build_network(residual_block_num, self.checkpoint_path)
# training hyper-parameters:
self.window_length = 7000
self.window_length = 3
self.save_freq = 5000
self.training_data = {'states': deque(maxlen=self.window_length), 'probs': deque(maxlen=self.window_length),
'winner': deque(maxlen=self.window_length), 'length': deque(maxlen=self.window_length)}
@ -153,6 +152,9 @@ class ResNet(object):
:param color: a string, indicate which one to play
:return: a list of tensor, the predicted value and policy given the history and color
"""
# NOTE: uncommenting the two lines below returns a uniform prior and a
# random value, which lets MCTS be tested in isolation from the network.
#prob = [1.0 / self.action_num] * self.action_num
#return [prob, np.random.uniform(-1, 1)]
history, color = state
if len(history) != self.history_length:
raise ValueError(
@ -171,10 +173,10 @@ class ResNet(object):
"""
state = np.zeros([1, self.board_size, self.board_size, 2 * self.history_length + 1])
for i in range(self.history_length):
state[0, :, :, i] = np.array(np.array(history[i]) == np.ones(self.board_size ** 2)).reshape(self.board_size,
state[0, :, :, i] = np.array(np.array(history[i]).flatten() == np.ones(self.board_size ** 2)).reshape(self.board_size,
self.board_size)
state[0, :, :, i + self.history_length] = np.array(
np.array(history[i]) == -np.ones(self.board_size ** 2)).reshape(self.board_size, self.board_size)
np.array(history[i]).flatten() == -np.ones(self.board_size ** 2)).reshape(self.board_size, self.board_size)
# TODO: need a config to specify the BLACK and WHITE
if color == +1:
state[0, :, :, 2 * self.history_length] = np.ones([self.board_size, self.board_size])
@ -224,11 +226,19 @@ class ResNet(object):
else:
start_time = time.time()
for i in range(batch_size):
game_num = random.randint(0, self.window_length-1)
state_num = random.randint(0, self.training_data['length'][game_num]-1)
training_data['states'].append(np.expand_dims(self.training_data['states'][game_num][state_num], 0))
training_data['probs'].append(np.expand_dims(self.training_data['probs'][game_num][state_num], 0))
training_data['winner'].append(np.expand_dims(self.training_data['winner'][game_num][state_num], 0))
priority = np.array(self.training_data['length']) / (0.0 + np.sum(np.array(self.training_data['length'])))
game_num = np.random.choice(self.window_length, 1, p=priority)[0]
state_num = np.random.randint(self.training_data['length'][game_num])
rotate_times = np.random.randint(4)
reflect_times = np.random.randint(2)
reflect_orientation = np.random.randint(2)
training_data['states'].append(
self._preprocession(self.training_data['states'][game_num][state_num], reflect_times,
reflect_orientation, rotate_times))
training_data['probs'].append(np.concatenate(
[self._preprocession(self.training_data['probs'][game_num][state_num][:-1].reshape(self.board_size, self.board_size, 1), reflect_times,
reflect_orientation, rotate_times).reshape(1, self.board_size**2), self.training_data['probs'][game_num][state_num][-1].reshape(1,1)], axis=1))
training_data['winner'].append(self.training_data['winner'][game_num][state_num].reshape(1, 1))
value_loss, policy_loss, reg, _ = self.sess.run(
[self.value_loss, self.policy_loss, self.reg, self.train_op],
feed_dict={self.x: np.concatenate(training_data['states'], axis=0),
@ -280,6 +290,55 @@ class ResNet(object):
winner = np.concatenate(winner, axis=0)
return states, probs, winner
def _preprocession(self, board, reflect_times=0, reflect_orientation=0, rotate_times=0):
"""
preprocessing for augmentation
:param board: a ndarray, board to process
:param reflect_times: an integer, how many times to reflect
:param reflect_orientation: an integer, which orientation to reflect
:param rotate_times: an integer, how many times to rotate
:return:
"""
new_board = copy.deepcopy(board)
if new_board.ndim == 3:
new_board = np.expand_dims(new_board, axis=0)
new_board = self._board_reflection(new_board, reflect_times, reflect_orientation)
new_board = self._board_rotation(new_board, rotate_times)
return new_board
def _board_rotation(self, board, times):
"""
rotate the board for augmentation
note that board's shape should be [batch_size, board_size, board_size, channels]
:param board: a ndarray, shape [batch_size, board_size, board_size, channels]
:param times: an integer, how many times to rotate
:return:
"""
return np.rot90(board, times, (1, 2))
def _board_reflection(self, board, times, orientation):
"""
reflect the board for augmentation
note that board's shape should be [batch_size, board_size, board_size, channels]
:param board: a ndarray, shape [batch_size, board_size, board_size, channels]
:param times: an integer, how many times to reflect
:param orientation: an integer, which orientation to reflect
:return:
"""
new_board = copy.deepcopy(board)
for _ in range(times):
if orientation == 0:
new_board = new_board[:, ::-1]
if orientation == 1:
new_board = new_board[:, :, ::-1]
return new_board
if __name__ == "__main__":
model = ResNet(board_size=9, action_num=82, history_length=8)

View File

@ -7,7 +7,6 @@ import time
import os
import cPickle
class Data(object):
def __init__(self):
self.boards = []
@ -29,6 +28,7 @@ if __name__ == '__main__':
parser.add_argument("--black_weight_path", type=str, default=None)
parser.add_argument("--white_weight_path", type=str, default=None)
parser.add_argument("--id", type=int, default=0)
parser.add_argument("--debug", type=bool, default=False)
args = parser.parse_args()
if not os.path.exists(args.result_path):
@ -61,11 +61,13 @@ if __name__ == '__main__':
white_role_name = 'white' + str(args.id)
agent_v0 = subprocess.Popen(
['python', '-u', 'player.py', '--role=' + black_role_name, '--checkpoint_path=' + str(args.black_weight_path)],
['python', '-u', 'player.py', '--role=' + black_role_name,
'--checkpoint_path=' + str(args.black_weight_path), '--debug=' + str(args.debug)],
stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
agent_v1 = subprocess.Popen(
['python', '-u', 'player.py', '--role=' + white_role_name, '--checkpoint_path=' + str(args.white_weight_path)],
['python', '-u', 'player.py', '--role=' + white_role_name,
'--checkpoint_path=' + str(args.black_weight_path), '--debug=' + str(args.debug)],
stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
server_list = ""
@ -87,27 +89,29 @@ if __name__ == '__main__':
pattern = "[A-Z]{1}[0-9]{1}"
space = re.compile("\s+")
size = 9
size = {"go":9, "reversi":8}
show = ['.', 'X', 'O']
evaluate_rounds = 1
game_num = 0
try:
while True:
#while True:
while game_num < evaluate_rounds:
start_time = time.time()
num = 0
pass_flag = [False, False]
print("Start game {}".format(game_num))
# end the game if both players chose to pass, or too many turns were played
while not (pass_flag[0] and pass_flag[1]) and num < size ** 2 * 2:
while not (pass_flag[0] and pass_flag[1]) and num < size["reversi"] ** 2 * 2:
turn = num % 2
board = player[turn].run_cmd(str(num) + ' show_board')
board = eval(board[board.index('['):board.index(']') + 1])
for i in range(size):
for j in range(size):
print show[board[i * size + j]] + " ",
for i in range(size["reversi"]):
for j in range(size["reversi"]):
print show[board[i * size["reversi"] + j]] + " ",
print "\n",
data.boards.append(board)
start_time = time.time()
move = player[turn].run_cmd(str(num) + ' genmove ' + color[turn] + '\n')
print role[turn] + " : " + str(move),
num += 1

View File

@ -25,16 +25,20 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--checkpoint_path", type=str, default=None)
parser.add_argument("--role", type=str, default="unknown")
parser.add_argument("--debug", type=str, default=False)
args = parser.parse_args()
if args.checkpoint_path == 'None':
args.checkpoint_path = None
game = Game(checkpoint_path=args.checkpoint_path)
debug = False
if args.debug == "True":
debug = True
game = Game(role=args.role, checkpoint_path=args.checkpoint_path, debug=debug)
engine = GTPEngine(game_obj=game, name='tianshou', version=0)
daemon = Pyro4.Daemon() # make a Pyro daemon
ns = Pyro4.locateNS() # find the name server
player = Player(role = args.role, engine = engine)
player = Player(role=args.role, engine=engine)
print "Init " + args.role + " player finished"
uri = daemon.register(player) # register the greeting maker as a Pyro object
print "Start on name " + args.role

View File

@ -1,264 +1,194 @@
from __future__ import print_function
import numpy as np
'''
Settings of the Go game.
(1, 1) is considered as the upper left corner of the board,
(size, 1) is the lower left
'''
def find_correct_moves(own, enemy):
    """return legal moves
    Bitboard move generation: bit 0 is the top-left square, bit 63 the
    bottom-right; a set bit in the result marks a legal square for `own`.
    """
    left_right_mask = 0x7e7e7e7e7e7e7e7e  # Both most left-right edge are 0, else 1
    top_bottom_mask = 0x00ffffffffffff00  # Both most top-bottom edge are 0, else 1
    mask = left_right_mask & top_bottom_mask
    mobility = 0
    # Probe all eight directions; the masks stop shifts wrapping the edges.
    mobility |= search_offset_left(own, enemy, left_right_mask, 1)  # Left
    mobility |= search_offset_left(own, enemy, mask, 9)  # Left Top
    mobility |= search_offset_left(own, enemy, top_bottom_mask, 8)  # Top
    mobility |= search_offset_left(own, enemy, mask, 7)  # Top Right
    mobility |= search_offset_right(own, enemy, left_right_mask, 1)  # Right
    mobility |= search_offset_right(own, enemy, mask, 9)  # Bottom Right
    mobility |= search_offset_right(own, enemy, top_bottom_mask, 8)  # Bottom
    mobility |= search_offset_right(own, enemy, mask, 7)  # Left bottom
    return mobility


def calc_flip(pos, own, enemy):
    """return flip stones of enemy by bitboard when I place stone at pos.

    :param pos: 0~63
    :param own: bitboard (0=top left, 63=bottom right)
    :param enemy: bitboard
    :return: flip stones of enemy when I place stone at pos.
    """
    # The half-computation covers four directions; rotating the board 180
    # degrees and repeating covers the remaining four.
    f1 = _calc_flip_half(pos, own, enemy)
    f2 = _calc_flip_half(63 - pos, rotate180(own), rotate180(enemy))
    return f1 | rotate180(f2)


def _calc_flip_half(pos, own, enemy):
    # One ray mask per direction (S, E, NE-diag, SE-diag), anchored at pos;
    # edge columns are cleared from `enemy` for the horizontal/diagonal rays.
    el = [enemy, enemy & 0x7e7e7e7e7e7e7e7e, enemy & 0x7e7e7e7e7e7e7e7e, enemy & 0x7e7e7e7e7e7e7e7e]
    masks = [0x0101010101010100, 0x00000000000000fe, 0x0002040810204080, 0x8040201008040200]
    masks = [b64(m << pos) for m in masks]
    flipped = 0
    for e, mask in zip(el, masks):
        # Carry-propagation trick: the +1 ripples through the run of enemy
        # stones; intersecting with `own` finds the bracketing stone.
        outflank = mask & ((e | ~mask) + 1) & own
        flipped |= (outflank - (outflank != 0)) & mask
    return flipped


def search_offset_left(own, enemy, mask, offset):
    # Legal-move probe in one "left-shift" direction (toward lower bits).
    e = enemy & mask
    blank = ~(own | enemy)
    t = e & (own >> offset)
    t |= e & (t >> offset)
    t |= e & (t >> offset)
    t |= e & (t >> offset)
    t |= e & (t >> offset)
    t |= e & (t >> offset)  # Up to six stones can be turned at once
    return blank & (t >> offset)  # Only the blank squares can be started


def search_offset_right(own, enemy, mask, offset):
    # Mirror of search_offset_left, probing toward higher bits.
    e = enemy & mask
    blank = ~(own | enemy)
    t = e & (own << offset)
    t |= e & (t << offset)
    t |= e & (t << offset)
    t |= e & (t << offset)
    t |= e & (t << offset)
    t |= e & (t << offset)  # Up to six stones can be turned at once
    return blank & (t << offset)  # Only the blank squares can be started
def flip_vertical(x):
    """Mirror a 64-bit bitboard top-to-bottom (classic byte-swap ladder)."""
    k1 = 0x00FF00FF00FF00FF
    k2 = 0x0000FFFF0000FFFF
    # Swap adjacent ranks, then rank pairs, then board halves.
    x = ((x >> 8) & k1) | ((x & k1) << 8)
    x = ((x >> 16) & k2) | ((x & k2) << 16)
    x = (x >> 32) | b64(x << 32)
    return x
def b64(x):
    """Truncate x to an unsigned 64-bit value (bitboard width)."""
    return x % (1 << 64)
def bit_count(x):
    """Population count: number of set bits in x."""
    return sum(1 for digit in bin(x) if digit == '1')
def bit_to_array(x, size):
    """bit_to_array(0b0010, 4) -> array([0, 1, 0, 0])

    Expand the low `size` bits of x into a uint8 array, LSB first.
    """
    bits = [(x >> i) & 1 for i in range(size)]
    return np.array(bits, dtype=np.uint8)
def flip_diag_a1h8(x):
    """Mirror a 64-bit bitboard across the a1-h8 diagonal (delta-swap ladder)."""
    k1 = 0x5500550055005500
    k2 = 0x3333000033330000
    k4 = 0x0f0f0f0f00000000
    # Three delta swaps with shifts of 28, 14 and 7 transpose the 8x8 grid.
    t = k4 & (x ^ b64(x << 28))
    x ^= t ^ (t >> 28)
    t = k2 & (x ^ b64(x << 14))
    x ^= t ^ (t >> 14)
    t = k1 & (x ^ b64(x << 7))
    x ^= t ^ (t >> 7)
    return x


def rotate90(x):
    """Rotate a bitboard a quarter turn (diagonal mirror + vertical flip)."""
    return flip_diag_a1h8(flip_vertical(x))


def rotate180(x):
    """Rotate a bitboard a half turn (two quarter turns)."""
    return rotate90(rotate90(x))
class Reversi:
    """Bitboard-based Reversi engine (8x8).

    State lives in two 64-bit masks, `black` and `white` (bit 0 = top left,
    bit 63 = bottom right); `board` is the equivalent flat list with
    1 for black, -1 for white, 0 for blank.
    """

    def __init__(self, black=None, white=None):
        # Defaults encode the standard four-stone starting position.
        self.black = black or (0b00001000 << 24 | 0b00010000 << 32)
        self.white = white or (0b00010000 << 24 | 0b00001000 << 32)
        self.board = None  # 8 * 8 board with 1 for black, -1 for white and 0 for blank
        self.color = None  # 1 for black and -1 for white
        self.action = None  # number in 0~63
        # self.winner = None
        self.black_win = None  # black stone count minus white, set by _game_over

    def get_board(self, black=None, white=None):
        # Reset the bitboards (to the start position unless masks are given)
        # and return the flat-list view.
        self.black = black or (0b00001000 << 24 | 0b00010000 << 32)
        self.white = white or (0b00010000 << 24 | 0b00001000 << 32)
        self.board = self.bitboard2board()
        return self.board

    def simulate_is_valid(self, board, color):
        """Return the list of legal flat indices for `color` on `board`."""
        self.board = board
        self.color = color
        self.board2bitboard()
        own, enemy = self.get_own_and_enemy()
        mobility = find_correct_moves(own, enemy)
        valid_moves = bit_to_array(mobility, 64)
        valid_moves = np.argwhere(valid_moves)
        valid_moves = list(np.reshape(valid_moves, len(valid_moves)))
        return valid_moves

    def simulate_step_forward(self, state, vertex):
        """Apply `vertex` for state = [board, color].

        NOTE(review): implicitly returns None when step() fails (pass or an
        illegal move) — callers must cope with a None result.
        """
        self.board = state[0]
        self.color = state[1]
        self.board2bitboard()
        self.vertex2action(vertex)
        step_forward = self.step()
        if step_forward:
            new_board = self.bitboard2board()
            return [new_board, 0 - self.color], 0

    def executor_do_move(self, board, color, vertex):
        """Play `vertex` for `color`, copying the new position into `board`
        in place so the caller's list reflects the move."""
        self.board = board
        self.color = color
        self.board2bitboard()
        self.vertex2action(vertex)
        step_forward = self.step()
        if step_forward:
            new_board = self.bitboard2board()
            for i in range(64):
                board[i] = new_board[i]

    def executor_get_score(self, board):
        """Return black_count - white_count for `board`."""
        self.board = board
        self._game_over()
        if self.black_win is not None:
            return self.black_win
        else:
            # NOTE(review): missing `raise` — this constructs the exception
            # and discards it, then the method falls through returning None.
            ValueError("Game not finished!")

    def board2bitboard(self):
        """Rebuild the black/white bitboards from the flat-list `board`."""
        count = 1
        if self.board is None:
            # NOTE(review): missing `raise` here as well.
            ValueError("None board!")
        self.black = 0
        self.white = 0
        for i in range(64):
            if self.board[i] == 1:
                self.black |= count
            elif self.board[i] == -1:
                self.white |= count
            count *= 2

    def vertex2action(self, vertex):
        """Map a 1-indexed (x, y) vertex to a 0~63 action; (0, 0) = pass."""
        x, y = vertex
        if x == 0 and y == 0:
            self.action = None
        else:
            self.action = 8 * (x - 1) + y - 1

    def bitboard2board(self):
        """Expand the bitboards into the flat 64-element list form."""
        board = []
        black = bit_to_array(self.black, 64)
        white = bit_to_array(self.white, 64)
        for i in range(64):
            if black[i]:
                board.append(1)
            elif white[i]:
                board.append(-1)
            else:
                board.append(0)
        return board

    def step(self):
        """Apply self.action for self.color; True on success, False on pass
        or when nothing would be flipped (treated as losing move)."""
        # NOTE(review): when self.action is None this comparison precedes the
        # None check — under Python 3 it raises TypeError, and even under
        # Python 2 the `ValueError` below lacks a `raise`.
        if self.action < 0 or self.action > 63:
            ValueError("Wrong action!")
        if self.action is None:
            return False
        own, enemy = self.get_own_and_enemy()
        flipped = calc_flip(self.action, own, enemy)
        if bit_count(flipped) == 0:
            self.illegal_move_to_lose(self.action)
            return False
        own ^= flipped
        own |= 1 << self.action
        enemy ^= flipped
        self.set_own_and_enemy(own, enemy)
        return True

    def _game_over(self):
        """Record the final margin (black minus white) once, lazily."""
        # self.done = True
        '''
        if self.winner is None:
            black_num, white_num = self.number_of_black_and_white
            if black_num > white_num:
                self.winner = 1
            elif black_num < white_num:
                self.winner = -1
            else:
                self.winner = 0
        '''
        if self.black_win is None:
            black_num, white_num = self.number_of_black_and_white
            self.black_win = black_num - white_num

    def illegal_move_to_lose(self, action):
        # An illegal move simply ends the game with the current count.
        self._game_over()

    def get_own_and_enemy(self):
        """Return (own, enemy) bitboards for the player to move."""
        if self.color == 1:
            own, enemy = self.black, self.white
        elif self.color == -1:
            own, enemy = self.white, self.black
        else:
            own, enemy = None, None
        return own, enemy

    def set_own_and_enemy(self, own, enemy):
        """Write back the (own, enemy) bitboards for the player to move."""
        if self.color == 1:
            self.black, self.white = own, enemy
        else:
            self.white, self.black = own, enemy

    @property
    def number_of_black_and_white(self):
        # (black stone count, white stone count)
        return bit_count(self.black), bit_count(self.white)
import numpy as np
import copy
'''
Settings of the Reversi game.
(1, 1) is considered as the upper left corner of the board,
(size, 1) is the lower left
'''
class Reversi:
    """Reversi (Othello) engine on a size x size array board.

    Board encoding: 1 = black, -1 = white, 0 = empty.  Vertices are
    1-indexed (x, y) pairs with (1, 1) the upper-left corner and (0, 0)
    meaning "pass".  Flat actions are row-major indices in
    [0, size**2 - 1], with size**2 encoding "pass".
    """

    def __init__(self, **kwargs):
        # Board edge length; standard Reversi uses size=8.
        self.size = kwargs['size']

    def _deflatten(self, idx):
        """Convert a flat action index to a 1-indexed (x, y) vertex."""
        x = idx // self.size + 1
        y = idx % self.size + 1
        return (x, y)

    def _flatten(self, vertex):
        """Convert a 1-indexed (x, y) vertex to a flat action index.

        (0, 0) encodes "pass" and maps to size ** 2.
        """
        x, y = vertex
        if (x == 0) and (y == 0):
            return self.size ** 2
        return (x - 1) * self.size + (y - 1)

    def get_board(self):
        """Return a fresh starting board with the four center stones."""
        board = np.zeros([self.size, self.size], dtype=np.int32)
        # FIX: use floor division — under Python 3, `self.size / 2` is a
        # float and is invalid as an array index (same result on Python 2).
        half = self.size // 2
        board[half - 1, half - 1] = -1
        board[half, half] = -1
        board[half - 1, half] = 1
        board[half, half - 1] = 1
        return board

    def _find_correct_moves(self, board, color, is_next=False):
        """Flat indices of every legal move for `color` on `board`
        (for the opponent instead when is_next=True)."""
        mover = 0 - color if is_next else color
        moves = []
        for idx in range(self.size ** 2):
            x, y = self._deflatten(idx)
            if self._is_valid(board, x - 1, y - 1, mover):
                moves.append(idx)
        return moves

    def _one_direction_valid(self, board, x, y, color):
        """True iff 0-indexed (x, y) is on the board and holds `color`."""
        return (0 <= x < self.size) and (0 <= y < self.size) and board[x, y] == color

    def _is_valid(self, board, x, y, color):
        """True iff `color` may play at 0-indexed (x, y): the square is empty
        and some direction brackets a run of opponent stones with an own stone."""
        if board[x, y]:
            return False
        for x_direction in [-1, 0, 1]:
            for y_direction in [-1, 0, 1]:
                new_x = x
                new_y = y
                flag = 0
                # Walk over the contiguous run of opponent stones.
                while True:
                    new_x += x_direction
                    new_y += y_direction
                    if self._one_direction_valid(board, new_x, new_y, 0 - color):
                        flag = 1
                    else:
                        break
                # The run must end on an own stone to capture.
                if self._one_direction_valid(board, new_x, new_y, color) and flag:
                    return True
        return False

    def simulate_get_mask(self, state, action_set):
        """Return the invalid subset of `action_set` for state = [history, color].

        When no board move is legal, everything except the trailing "pass"
        action is masked; otherwise pass and all illegal moves are masked.
        """
        history_boards, color = copy.deepcopy(state)
        board = copy.deepcopy(history_boards[-1])
        valid_moves = self._find_correct_moves(board, color)
        if not valid_moves:
            return list(action_set[0:-1])
        return [action for action in action_set if action not in valid_moves]

    def simulate_step_forward(self, state, action):
        """Simulate `action` from state = [history_boards, color] without
        mutating the input.

        :return: ([new_history, -color], 0) while the game continues, or
            (None, winner * color) when both sides must pass (game over).
        """
        history_boards, color = copy.deepcopy(state)
        board = copy.deepcopy(history_boards[-1])
        if action == self.size ** 2:
            # The mover passes; if the opponent has no reply either, the
            # game ends with the reward seen from the mover's side.
            if not self._find_correct_moves(board, color, is_next=True):
                return None, self._get_winner(board) * color
            return [history_boards, 0 - color], 0
        new_board = self._step(board, color, action)
        history_boards.append(new_board)
        return [history_boards, 0 - color], 0

    def _get_winner(self, board):
        """1 if black holds more stones, -1 if white does, else 0."""
        black_num, white_num = self._number_of_black_and_white(board)
        margin = black_num - white_num
        if margin > 0:
            return 1
        elif margin < 0:
            return -1
        return 0

    def _number_of_black_and_white(self, board):
        """Return (black stone count, white stone count) for `board`."""
        flat = np.reshape(board, self.size ** 2)
        black_num = int(np.sum(flat == 1))
        white_num = int(np.sum(flat == -1))
        return black_num, white_num

    def _step(self, board, color, action):
        """Place a stone for `color` at flat index `action`, flipping the
        captured runs in place, and return the mutated board.

        :raises ValueError: if action is None, out of range, or flips nothing.
        """
        # FIX: check None first — the original compared `action` with ints
        # before the None test, which raises TypeError under Python 3 and
        # mislabels the error under Python 2 (None < 0 is True there).
        if action is None:
            raise ValueError("Action is None!")
        if action < 0 or action > self.size ** 2 - 1:
            raise ValueError("Action not in the range of [0,63]!")
        x, y = self._deflatten(action)
        return self._flip(board, x - 1, y - 1, color)

    def _flip(self, board, x, y, color):
        """Place `color` at 0-indexed (x, y) and flip every bracketed run of
        opponent stones, mutating and returning `board`.

        :raises ValueError: when the move captures nothing (illegal move).
        """
        valid = 0
        board[x, y] = color
        for x_direction in [-1, 0, 1]:
            for y_direction in [-1, 0, 1]:
                new_x = x
                new_y = y
                flag = 0
                # Walk over the run of opponent stones in this direction.
                while True:
                    new_x += x_direction
                    new_y += y_direction
                    if self._one_direction_valid(board, new_x, new_y, 0 - color):
                        flag = 1
                    else:
                        break
                # Bracketed by an own stone: flip the whole run.
                if self._one_direction_valid(board, new_x, new_y, color) and flag:
                    valid = 1
                    flip_x = x
                    flip_y = y
                    while True:
                        flip_x += x_direction
                        flip_y += y_direction
                        if self._one_direction_valid(board, flip_x, flip_y, 0 - color):
                            board[flip_x, flip_y] = color
                        else:
                            break
        if valid:
            return board
        raise ValueError("Invalid action")

    def executor_do_move(self, history, latest_boards, board, color, vertex):
        """Play `vertex` for `color`, appending the resulting position to
        `history` and `latest_boards`.

        NOTE(review): for a pass, is_next=True checks the OPPONENT's moves —
        this returns False when the opponent also cannot move (game over)
        and True otherwise; confirm that is the intended pass semantics.
        """
        board = np.reshape(board, (self.size, self.size))
        action = self._flatten(vertex)
        if action == self.size ** 2:
            valid_moves = self._find_correct_moves(board, color, is_next=True)
            return len(valid_moves) > 0
        new_board = self._step(board, color, action)
        history.append(new_board)
        latest_boards.append(new_board)
        return True

    def executor_get_score(self, board):
        """Game result from BLACK's perspective: 1 / -1 / 0 by stone count."""
        return self._get_winner(board)
if __name__ == "__main__":
    # FIX: __init__ reads kwargs['size'], so the original bare Reversi()
    # raised KeyError the moment this smoke test was run.
    reversi = Reversi(size=8)
    # board = reversi.get_board()
    # print(board)
    # state, value = reversi.simulate_step_forward([board, -1], 20)
    # print(state[0])
    # print("board")
    # print(board)
    # r = reversi.executor_get_score(board)
    # print(r)

View File

@ -79,7 +79,7 @@ while True:
prob.append(np.array(game.prob).reshape(-1, game.size ** 2 + 1))
print("Finished")
print("\n")
score = game.game_engine.executor_get_score(game.board, True)
score = game.game_engine.executor_get_score(game.board)
if score > 0:
winner = utils.BLACK
else:

View File

@ -41,6 +41,11 @@ Tianshou(天授) is a reinforcement learning platform. The following image illus
<img src="https://github.com/sproblvem/tianshou/blob/master/docs/figures/go.png" height="150"/> <img src="https://github.com/sproblvem/tianshou/blob/master/docs/figures/reversi.jpg" height="150"/> <img src="https://github.com/sproblvem/tianshou/blob/master/docs/figures/warzone.jpg" height="150"/>
## examples
During development, run examples under `./examples/` directory with, e.g. `python ppo_example.py`.
Running them under this directory with `python examples/ppo_example.py` will not work.
## About coding style

View File

@ -1,8 +1,6 @@
#!/usr/bin/env python
import tensorflow as tf
import numpy as np
import time
import gym
# our lib imports here!
@ -10,7 +8,7 @@ import sys
sys.path.append('..')
import tianshou.core.losses as losses
from tianshou.data.replay_buffer.utils import get_replay_buffer
import tianshou.core.policy as policy
import tianshou.core.policy.dqn as policy
def policy_net(observation, action_dim):
@ -41,6 +39,8 @@ if __name__ == '__main__':
# pass the observation variable to the replay buffer or find a more reasonable way to help replay buffer
# access this observation variable.
observation = tf.placeholder(tf.float32, shape=(None,) + observation_dim, name="dqn_observation") # network input
action = tf.placeholder(dtype=tf.int32, shape=(None,)) # batch of integer actions
with tf.variable_scope('q_net'):
q_values = policy_net(observation, action_dim)
@ -48,10 +48,9 @@ if __name__ == '__main__':
q_values_target = policy_net(observation, action_dim)
# 2. build losses, optimizers
q_net = policy.DQN(q_values, observation_placeholder=observation) # YongRen: policy.DQN
target_net = policy.DQN(q_values_target, observation_placeholder=observation)
q_net = policy.DQNRefactor(q_values, observation_placeholder=observation, action_placeholder=action) # YongRen: policy.DQN
target_net = policy.DQNRefactor(q_values_target, observation_placeholder=observation, action_placeholder=action)
action = tf.placeholder(dtype=tf.int32, shape=[None]) # batch of integer actions
target = tf.placeholder(dtype=tf.float32, shape=[None]) # target value for DQN
dqn_loss = losses.dqn_loss(action, target, q_net) # TongzhengRen

View File

@ -1,17 +1,16 @@
#!/usr/bin/env python
from __future__ import absolute_import
import tensorflow as tf
import numpy as np
import time
import gym
# our lib imports here!
import sys
sys.path.append('..')
import tianshou.core.losses as losses
from tianshou.core import losses
from tianshou.data.batch import Batch
import tianshou.data.advantage_estimation as advantage_estimation
import tianshou.core.policy as policy
import tianshou.core.policy.stochastic as policy # TODO: fix imports as zhusuan so that only need to import to policy
def policy_net(observation, action_dim, scope=None):

View File

@ -21,4 +21,8 @@ referencing QValuePolicy in base.py, should have at least the listed methods.
TongzhengRen
seems to be direct python functions. Though the management of placeholders may require some discussion. also may write it in a functional form.
seems to be direct python functions. Though the management of placeholders may require some discussion. also may write it in a functional form.
# policy, value_function
naming should be reconsidered. Perhaps use plural forms for all nouns

View File

@ -35,17 +35,16 @@ def vanilla_policy_gradient(sampled_action, reward, pi, baseline="None"):
# TODO: Different baseline methods like REINFORCE, etc.
return vanilla_policy_gradient_loss
def dqn_loss(sampled_action, sampled_target, q_net):
def dqn_loss(sampled_action, sampled_target, policy):
"""
deep q-network
:param sampled_action: placeholder of sampled actions during the interaction with the environment
:param sampled_target: estimated Q(s,a)
:param q_net: current `policy` to be optimized
:param policy: current `policy` to be optimized
:return:
"""
action_num = q_net.values_tensor().get_shape()[1]
sampled_q = tf.reduce_sum(q_net.values_tensor() * tf.one_hot(sampled_action, action_num), axis=1)
sampled_q = policy.q_net.value_tensor
return tf.reduce_mean(tf.square(sampled_target - sampled_q))
def deterministic_policy_gradient(sampled_state, critic):

View File

@ -25,8 +25,9 @@ class MCTSNode(object):
def valid_mask(self, simulator):
pass
class UCTNode(MCTSNode):
def __init__(self, parent, action, state, action_num, prior, inverse=False, c_puct = 5):
def __init__(self, parent, action, state, action_num, prior, debug=False, inverse=False, c_puct = 5):
super(UCTNode, self).__init__(parent, action, state, action_num, prior, inverse)
self.Q = np.zeros([action_num])
self.W = np.zeros([action_num])
@ -34,9 +35,16 @@ class UCTNode(MCTSNode):
self.c_puct = c_puct
self.ucb = self.Q + self.c_puct * self.prior * math.sqrt(np.sum(self.N)) / (self.N + 1)
self.mask = None
self.debug=debug
self.elapse_time = 0
def clear_elapse_time(self):
self.elapse_time = 0
def selection(self, simulator):
head = time.time()
self.valid_mask(simulator)
self.elapse_time += time.time() - head
action = np.argmax(self.ucb)
if action in self.children.keys():
return self.children[action].selection(simulator)
@ -59,10 +67,13 @@ class UCTNode(MCTSNode):
self.parent.backpropagation(self.children[action].reward)
def valid_mask(self, simulator):
# let all invalid actions be illeagel in mcts
if self.mask is None:
self.mask = simulator.simulate_is_valid_list(self.state, range(self.action_num))
self.ucb[self.mask] = -float("Inf")
# let all invalid actions be illegal in mcts
if not hasattr(simulator, 'simulate_get_mask'):
pass
else:
if self.mask is None:
self.mask = simulator.simulate_get_mask(self.state, range(self.action_num))
self.ucb[self.mask] = -float("Inf")
class TSNode(MCTSNode):
@ -87,15 +98,15 @@ class ActionNode(object):
self.reward = 0
def type_conversion_to_tuple(self):
if type(self.next_state) is np.ndarray:
if isinstance(self.next_state, np.ndarray):
self.next_state = self.next_state.tolist()
if type(self.next_state) is list:
if isinstance(self.next_state, list):
self.next_state = list2tuple(self.next_state)
def type_conversion_to_origin(self):
if self.state_type is np.ndarray:
if isinstance(self.state_type, np.ndarray):
self.next_state = np.array(self.next_state)
if self.state_type is list:
if isinstance(self.state_type, np.ndarray):
self.next_state = tuple2list(self.next_state)
def selection(self, simulator):
@ -126,15 +137,18 @@ class ActionNode(object):
class MCTS(object):
def __init__(self, simulator, evaluator, root, action_num, method="UCT", inverse=False):
def __init__(self, simulator, evaluator, root, action_num, method="UCT",
role="unknown", debug=False, inverse=False):
self.simulator = simulator
self.evaluator = evaluator
self.role = role
self.debug = debug
prior, _ = self.evaluator(root)
self.action_num = action_num
if method == "":
self.root = root
if method == "UCT":
self.root = UCTNode(None, None, root, action_num, prior, inverse=inverse)
self.root = UCTNode(None, None, root, action_num, prior, self.debug, inverse=inverse)
if method == "TS":
self.root = TSNode(None, None, root, action_num, prior, inverse=inverse)
self.inverse = inverse
@ -149,14 +163,36 @@ class MCTS(object):
if max_step is None and max_time is None:
raise ValueError("Need a stop criteria!")
selection_time = 0
expansion_time = 0
backprop_time = 0
self.root.clear_elapse_time()
while step < max_step and time.time() - start_time < max_step:
self._expand()
sel_time, exp_time, back_time = self._expand()
selection_time += sel_time
expansion_time += exp_time
backprop_time += back_time
step += 1
if (self.debug):
file = open("debug.txt", "a")
file.write("[" + str(self.role) + "]"
+ " selection : " + str(selection_time) + "\t"
+ " validmask : " + str(self.root.elapse_time) + "\t"
+ " expansion : " + str(expansion_time) + "\t"
+ " backprop : " + str(backprop_time) + "\t"
+ "\n")
file.close()
def _expand(self):
t0 = time.time()
node, new_action = self.root.selection(self.simulator)
t1 = time.time()
value = node.children[new_action].expansion(self.evaluator, self.action_num)
t2 = time.time()
node.children[new_action].backpropagation(value + 0.)
t3 = time.time()
return t1 - t0, t2 - t1, t3 - t2
if __name__ == "__main__":
pass

View File

@ -1,6 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from .base import *
from .stochastic import *
from .dqn import *

View File

@ -3,21 +3,26 @@
from __future__ import absolute_import
from __future__ import division
import warnings
import tensorflow as tf
# from zhusuan.utils import add_name_scope
__all__ = [
'StochasticPolicy',
'QValuePolicy',
]
# TODO: a even more "base" class for policy
class PolicyBase(object):
"""
base class for policy. only provides `act` method with exploration
"""
def __init__(self, observation_placeholder):
self._observation_placeholder = observation_placeholder
def act(self, observation, exploration):
raise NotImplementedError()
class QValuePolicy(object):
"""
The policy as in DQN
@ -25,14 +30,14 @@ class QValuePolicy(object):
def __init__(self, observation_placeholder):
self._observation_placeholder = observation_placeholder
def act(self, observation, exploration=None): # first implement no exploration
def act(self, observation, exploration=None): # first implement no exploration
"""
return the action (int) to be executed.
no exploration when exploration=None.
"""
self._act(observation, exploration)
def _act(self, observation, exploration = None):
def _act(self, observation, exploration=None):
raise NotImplementedError()
def values(self, observation):
@ -48,7 +53,6 @@ class QValuePolicy(object):
pass
class StochasticPolicy(object):
"""
The :class:`Distribution` class is the base class for various probabilistic
@ -118,7 +122,7 @@ class StochasticPolicy(object):
param_dtype,
is_continuous,
observation_placeholder,
group_ndims=0, # maybe useful for repeat_action
group_ndims=0, # maybe useful for repeat_action
**kwargs):
self._act_dtype = act_dtype

View File

@ -1,19 +1,34 @@
from tianshou.core.policy.base import QValuePolicy
from __future__ import absolute_import
from .base import PolicyBase
import tensorflow as tf
import sys
sys.path.append('..')
import value_function.action_value as value_func
from ..value_function.action_value import DQN
class DQN_refactor(object):
class DQNRefactor(PolicyBase):
"""
use DQN from value_function as a member
"""
def __init__(self, value_tensor, observation_placeholder, action_placeholder):
self._network = value_func.DQN(value_tensor, observation_placeholder, action_placeholder)
self._q_net = DQN(value_tensor, observation_placeholder, action_placeholder)
self._argmax_action = tf.argmax(value_tensor, axis=1)
super(DQNRefactor, self).__init__(observation_placeholder=observation_placeholder)
def act(self, observation, exploration=None):
sess = tf.get_default_session()
if not exploration: # no exploration
action = sess.run(self._argmax_action, feed_dict={self._observation_placeholder: observation})
class DQN(QValuePolicy):
return action
@property
def q_net(self):
return self._q_net
class DQNOld(QValuePolicy):
"""
The policy as in DQN
"""

View File

@ -10,12 +10,6 @@ import tensorflow as tf
from .base import StochasticPolicy
__all__ = [
'OnehotCategorical',
'OnehotDiscrete',
]
class OnehotCategorical(StochasticPolicy):
"""
The class of one-hot Categorical distribution.

View File

@ -1,4 +1,6 @@
from base import ValueFunctionBase
from __future__ import absolute_import
from .base import ValueFunctionBase
import tensorflow as tf
@ -13,9 +15,8 @@ class ActionValue(ValueFunctionBase):
observation_placeholder=observation_placeholder
)
def get_value(self, observation, action):
def eval_value(self, observation, action):
"""
:param observation: numpy array of observations, of shape (batchsize, observation_dim).
:param action: numpy array of actions, of shape (batchsize, action_dim)
# TODO: Atari discrete action should have dim 1. Super Mario may should have, say, dim 5, where each can be 0/1
@ -23,8 +24,8 @@ class ActionValue(ValueFunctionBase):
# TODO: dealing with the last dim of 1 in V(s) and Q(s, a)
"""
sess = tf.get_default_session()
return sess.run(self.get_value_tensor(), feed_dict=
{self._observation_placeholder: observation, self._action_placeholder:action})[:, 0]
return sess.run(self.value_tensor, feed_dict=
{self._observation_placeholder: observation, self._action_placeholder: action})
class DQN(ActionValue):
@ -39,15 +40,24 @@ class DQN(ActionValue):
:param action_placeholder: of shape (batchsize, )
"""
self._value_tensor_all_actions = value_tensor
canonical_value_tensor = value_tensor[action_placeholder] # maybe a tf.map_fn. for now it's wrong
batch_size = tf.shape(value_tensor)[0]
batch_dim_index = tf.range(batch_size)
indices = tf.stack([batch_dim_index, action_placeholder], axis=1)
canonical_value_tensor = tf.gather_nd(value_tensor, indices)
super(DQN, self).__init__(value_tensor=canonical_value_tensor,
observation_placeholder=observation_placeholder,
action_placeholder=action_placeholder)
def get_value_all_actions(self, observation):
def eval_value_all_actions(self, observation):
"""
:param observation:
:return: numpy array of Q(s, *) given s, of shape (batchsize, num_actions)
"""
sess = tf.get_default_session()
return sess.run(self._value_tensor_all_actions, feed_dict={self._observation_placeholder: observation})
def get_value_tensor_all_actions(self):
@property
def value_tensor_all_actions(self):
return self._value_tensor_all_actions

View File

@ -1,3 +1,6 @@
from __future__ import absolute_import
import tensorflow as tf
# TODO: linear feature baseline also in tf?
class ValueFunctionBase(object):
@ -6,16 +9,17 @@ class ValueFunctionBase(object):
"""
def __init__(self, value_tensor, observation_placeholder):
self._observation_placeholder = observation_placeholder
self._value_tensor = value_tensor
self._value_tensor = tf.squeeze(value_tensor) # canonical values has shape (batchsize, )
def get_value(self, **kwargs):
def eval_value(self, **kwargs):
"""
:return: batch of corresponding values in numpy array
"""
raise NotImplementedError()
def get_value_tensor(self):
@property
def value_tensor(self):
"""
:return: tensor of the corresponding values

View File

@ -1,4 +1,6 @@
from base import ValueFunctionBase
from __future__ import absolute_import
from .base import ValueFunctionBase
import tensorflow as tf
@ -12,12 +14,12 @@ class StateValue(ValueFunctionBase):
observation_placeholder=observation_placeholder
)
def get_value(self, observation):
def eval_value(self, observation):
"""
:param observation: numpy array of observations, of shape (batchsize, observation_dim).
:return: numpy array of state values, of shape (batchsize, )
# TODO: dealing with the last dim of 1 in V(s) and Q(s, a)
# TODO: dealing with the last dim of 1 in V(s) and Q(s, a), this should rely on the action shape returned by env
"""
sess = tf.get_default_session()
return sess.run(self.get_value_tensor(), feed_dict={self._observation_placeholder: observation})[:, 0]
return sess.run(self.value_tensor, feed_dict={self._observation_placeholder: observation})