2020-07-21 14:59:49 +08:00
|
|
|
import argparse
|
2021-09-03 05:05:04 +08:00
|
|
|
import os
|
2020-07-21 14:59:49 +08:00
|
|
|
from copy import deepcopy
|
|
|
|
from typing import Optional, Tuple
|
2021-09-03 05:05:04 +08:00
|
|
|
|
|
|
|
import numpy as np
|
|
|
|
import torch
|
|
|
|
from tic_tac_toe_env import TicTacToeEnv
|
2020-07-21 14:59:49 +08:00
|
|
|
from torch.utils.tensorboard import SummaryWriter
|
|
|
|
|
2021-09-03 05:05:04 +08:00
|
|
|
from tianshou.data import Collector, VectorReplayBuffer
|
2020-08-19 15:00:24 +08:00
|
|
|
from tianshou.env import DummyVectorEnv
|
2021-09-03 05:05:04 +08:00
|
|
|
from tianshou.policy import (
|
|
|
|
BasePolicy,
|
|
|
|
DQNPolicy,
|
|
|
|
MultiAgentPolicyManager,
|
|
|
|
RandomPolicy,
|
|
|
|
)
|
2020-07-21 14:59:49 +08:00
|
|
|
from tianshou.trainer import offpolicy_trainer
|
2021-09-03 05:05:04 +08:00
|
|
|
from tianshou.utils import TensorboardLogger
|
|
|
|
from tianshou.utils.net.common import Net
|
2020-07-21 14:59:49 +08:00
|
|
|
|
|
|
|
|
|
|
|
def get_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the tic-tac-toe DQN script.

    Returns:
        An ``argparse.ArgumentParser`` declaring every option read by the
        functions in this file (optimisation settings, network sizes, board
        geometry, logging/checkpoint paths and the compute device).
    """
    parser = argparse.ArgumentParser()

    # Reproducibility and exploration.
    parser.add_argument('--seed', type=int, default=1626)
    parser.add_argument('--eps-test', type=float, default=0.05)
    parser.add_argument('--eps-train', type=float, default=0.1)

    # Replay buffer and optimisation.
    parser.add_argument('--buffer-size', type=int, default=20000)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument(
        '--gamma', type=float, default=0.9, help='a smaller gamma favors earlier win'
    )
    parser.add_argument('--n-step', type=int, default=3)
    parser.add_argument('--target-update-freq', type=int, default=320)

    # Training schedule.
    parser.add_argument('--epoch', type=int, default=20)
    parser.add_argument('--step-per-epoch', type=int, default=5000)
    parser.add_argument('--step-per-collect', type=int, default=10)
    parser.add_argument('--update-per-step', type=float, default=0.1)
    parser.add_argument('--batch-size', type=int, default=64)
    parser.add_argument(
        '--hidden-sizes', type=int, nargs='*', default=[128, 128, 128, 128]
    )
    parser.add_argument('--training-num', type=int, default=10)
    parser.add_argument('--test-num', type=int, default=100)

    # Logging and rendering.
    parser.add_argument('--logdir', type=str, default='log')
    parser.add_argument('--render', type=float, default=0.1)

    # Board geometry and stopping criterion.
    parser.add_argument('--board-size', type=int, default=6)
    parser.add_argument('--win-size', type=int, default=4)
    parser.add_argument(
        '--win-rate', type=float, default=0.9, help='the expected winning rate'
    )

    parser.add_argument(
        '--watch',
        default=False,
        action='store_true',
        help='no training, '
        'watch the play of pre-trained models'
    )
    parser.add_argument(
        '--agent-id',
        type=int,
        default=2,
        # The rest of the file indexes ``policy.policies[agent_id - 1]`` on a
        # two-element list, so reject anything other than 1 or 2 here instead
        # of failing later with an IndexError.
        choices=(1, 2),
        help='the learned agent plays as the'
        ' agent_id-th player. Choices are 1 and 2.'
    )
    parser.add_argument(
        '--resume-path',
        type=str,
        default='',
        help='the path of agent pth file '
        'for resuming from a pre-trained agent'
    )
    parser.add_argument(
        '--opponent-path',
        type=str,
        default='',
        help='the path of opponent agent pth file '
        'for resuming from a pre-trained agent'
    )
    parser.add_argument(
        '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu'
    )
    return parser
|
|
|
|
|
|
|
|
|
|
|
|
def get_args() -> argparse.Namespace:
    """Parse the process command line with the parser from ``get_parser``.

    Only the first element of ``parse_known_args()`` is returned, so options
    the parser does not declare are ignored instead of raising an error.
    """
    known_args, _unrecognised = get_parser().parse_known_args()
    return known_args
|
2020-07-21 14:59:49 +08:00
|
|
|
|
|
|
|
|
2020-09-26 16:35:37 +08:00
|
|
|
def get_agents(
    args: Optional[argparse.Namespace] = None,
    agent_learn: Optional[BasePolicy] = None,
    agent_opponent: Optional[BasePolicy] = None,
    optim: Optional[torch.optim.Optimizer] = None,
) -> Tuple[BasePolicy, Optional[torch.optim.Optimizer]]:
    """Assemble the two-player policy manager used for training and watching.

    Args:
        args: Parsed options. When omitted, ``get_args()`` is called at call
            time. ``state_shape`` and ``action_shape`` are written onto it.
        agent_learn: The learning policy. When omitted, a ``DQNPolicy`` over a
            freshly built ``Net`` is created and, if ``args.resume_path`` is
            set, its weights are loaded from that file.
        agent_opponent: The opposing policy. When omitted, it is a copy of
            ``agent_learn`` loaded from ``args.opponent_path`` if that path is
            set, otherwise a ``RandomPolicy``.
        optim: Optimizer handed to the new ``DQNPolicy``. When omitted and a
            new learner is built, an Adam optimizer over the new network is
            created.

    Returns:
        ``(policy, optim)`` where ``policy`` is a ``MultiAgentPolicyManager``
        holding both agents and ``optim`` is the optimizer in use; it is still
        ``None`` if the caller supplied ``agent_learn`` without an optimizer.
    """
    if args is None:
        # Resolve the default lazily: a ``get_args()`` default in the signature
        # would parse sys.argv once at import time and share one Namespace,
        # which this function mutates, across every call.
        args = get_args()

    env = TicTacToeEnv(args.board_size, args.win_size)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n

    if agent_learn is None:
        # model
        net = Net(
            args.state_shape,
            args.action_shape,
            hidden_sizes=args.hidden_sizes,
            device=args.device
        ).to(args.device)
        if optim is None:
            optim = torch.optim.Adam(net.parameters(), lr=args.lr)
        agent_learn = DQNPolicy(
            net,
            optim,
            args.gamma,
            args.n_step,
            target_update_freq=args.target_update_freq
        )
        if args.resume_path:
            # map_location lets a checkpoint saved on another device (e.g.
            # CUDA) be restored on the device selected for this run.
            agent_learn.load_state_dict(
                torch.load(args.resume_path, map_location=args.device)
            )

    if agent_opponent is None:
        if args.opponent_path:
            # The opponent shares the learner's architecture; only its weights
            # come from the opponent checkpoint.
            agent_opponent = deepcopy(agent_learn)
            agent_opponent.load_state_dict(
                torch.load(args.opponent_path, map_location=args.device)
            )
        else:
            agent_opponent = RandomPolicy()

    # The learner occupies slot ``agent_id - 1``; the opponent takes the other.
    if args.agent_id == 1:
        agents = [agent_learn, agent_opponent]
    else:
        agents = [agent_opponent, agent_learn]
    policy = MultiAgentPolicyManager(agents)
    return policy, optim
|
|
|
|
|
|
|
|
|
2020-09-26 16:35:37 +08:00
|
|
|
def train_agent(
    args: Optional[argparse.Namespace] = None,
    agent_learn: Optional[BasePolicy] = None,
    agent_opponent: Optional[BasePolicy] = None,
    optim: Optional[torch.optim.Optimizer] = None,
) -> Tuple[dict, BasePolicy]:
    """Train the learning agent with the off-policy trainer.

    Args:
        args: Parsed options. When omitted, ``get_args()`` is called at call
            time.
        agent_learn: Optional pre-built learning policy, forwarded to
            ``get_agents``.
        agent_opponent: Optional pre-built opponent policy, forwarded to
            ``get_agents``.
        optim: Optional optimizer, forwarded to ``get_agents``.

    Returns:
        ``(result, learner)``: the value returned by ``offpolicy_trainer`` and
        the policy stored at index ``args.agent_id - 1`` of the manager.
    """
    if args is None:
        # Resolve the default lazily: a ``get_args()`` default in the signature
        # would parse sys.argv once at import time and share one Namespace
        # (later mutated by ``get_agents``) across every call.
        args = get_args()

    def env_func():
        return TicTacToeEnv(args.board_size, args.win_size)

    train_envs = DummyVectorEnv([env_func for _ in range(args.training_num)])
    test_envs = DummyVectorEnv([env_func for _ in range(args.test_num)])

    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)

    policy, optim = get_agents(
        args, agent_learn=agent_learn, agent_opponent=agent_opponent, optim=optim
    )

    # collector
    train_collector = Collector(
        policy,
        train_envs,
        VectorReplayBuffer(args.buffer_size, len(train_envs)),
        exploration_noise=True
    )
    test_collector = Collector(policy, test_envs, exploration_noise=True)
    # Collect an initial batch of transitions before the trainer starts.
    train_collector.collect(n_step=args.batch_size * args.training_num)

    # log
    log_path = os.path.join(args.logdir, 'tic_tac_toe', 'dqn')
    writer = SummaryWriter(log_path)
    writer.add_text("args", str(args))
    logger = TensorboardLogger(writer)

    def save_fn(policy):
        # An explicit ``args.model_save_path`` wins; otherwise the checkpoint
        # is written next to the TensorBoard logs.
        if hasattr(args, 'model_save_path'):
            model_save_path = args.model_save_path
        else:
            model_save_path = os.path.join(log_path, 'policy.pth')
        # Only the learning agent's weights are saved, not the whole manager.
        torch.save(policy.policies[args.agent_id - 1].state_dict(), model_save_path)

    def stop_fn(mean_rewards):
        return mean_rewards >= args.win_rate

    def train_fn(epoch, env_step):
        policy.policies[args.agent_id - 1].set_eps(args.eps_train)

    def test_fn(epoch, env_step):
        policy.policies[args.agent_id - 1].set_eps(args.eps_test)

    def reward_metric(rews):
        # Keep only the learning agent's column of the reward array.
        return rews[:, args.agent_id - 1]

    # trainer
    result = offpolicy_trainer(
        policy,
        train_collector,
        test_collector,
        args.epoch,
        args.step_per_epoch,
        args.step_per_collect,
        args.test_num,
        args.batch_size,
        train_fn=train_fn,
        test_fn=test_fn,
        stop_fn=stop_fn,
        save_fn=save_fn,
        update_per_step=args.update_per_step,
        logger=logger,
        test_in_train=False,
        reward_metric=reward_metric
    )

    return result, policy.policies[args.agent_id - 1]
|
|
|
|
|
|
|
|
|
2020-09-26 16:35:37 +08:00
|
|
|
def watch(
    args: Optional[argparse.Namespace] = None,
    agent_learn: Optional[BasePolicy] = None,
    agent_opponent: Optional[BasePolicy] = None,
) -> None:
    """Play one rendered episode and print the learner's reward and length.

    Args:
        args: Parsed options. When omitted, ``get_args()`` is called at call
            time.
        agent_learn: Optional pre-built learning policy, forwarded to
            ``get_agents``.
        agent_opponent: Optional pre-built opponent policy, forwarded to
            ``get_agents``.
    """
    if args is None:
        # Resolve the default lazily: a ``get_args()`` default in the signature
        # would parse sys.argv once at import time and share one Namespace
        # (later mutated by ``get_agents``) across every call.
        args = get_args()

    env = TicTacToeEnv(args.board_size, args.win_size)
    # The optimizer returned by get_agents is not needed for evaluation.
    policy, _ = get_agents(
        args, agent_learn=agent_learn, agent_opponent=agent_opponent
    )
    policy.eval()
    policy.policies[args.agent_id - 1].set_eps(args.eps_test)
    collector = Collector(policy, env, exploration_noise=True)
    result = collector.collect(n_episode=1, render=args.render)
    rews, lens = result["rews"], result["lens"]
    print(f"Final reward: {rews[:, args.agent_id - 1].mean()}, length: {lens.mean()}")
|