add examples, fix some bugs (#5)
* update atari.py
* fix setup.py so pytest passes
* add a "render" argument
* change the tensorboard writer
* change device, render, and tensorboard log location
* remove some wrong local files
* fix some tab mistakes and the env names in continuous/test_xx.py
* add examples and a point-robot maze environment
* fix some bugs found while testing the examples
* add a DQN network and fix some arguments
* change the tensorboard writer's frequency back so that PPO and A2C can log normally
* add a warning to the collector
* remove some unrelated files
* reformat
* fix a bug in test_dqn due to wrong model selection
parent acb93502cf
commit 77068af526
docs/_static/images/Ant-v2.png (new binary file, 183 KiB; binary content not shown)
examples/ant_v2_ddpg.py (new file, 105 lines)

import gym
import torch
import pprint
import argparse
import numpy as np
from torch.utils.tensorboard import SummaryWriter

from tianshou.policy import DDPGPolicy
from tianshou.trainer import offpolicy_trainer
from tianshou.data import Collector, ReplayBuffer
from tianshou.env import VectorEnv, SubprocVectorEnv

if __name__ == '__main__':
    from continuous_net import Actor, Critic
else:  # pytest
    from test.continuous.net import Actor, Critic


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=str, default='Ant-v2')
    parser.add_argument('--seed', type=int, default=1626)
    parser.add_argument('--buffer-size', type=int, default=20000)
    parser.add_argument('--actor-lr', type=float, default=1e-4)
    parser.add_argument('--critic-lr', type=float, default=1e-3)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--tau', type=float, default=0.005)
    parser.add_argument('--exploration-noise', type=float, default=0.1)
    parser.add_argument('--epoch', type=int, default=100)
    parser.add_argument('--step-per-epoch', type=int, default=2400)
    parser.add_argument('--collect-per-step', type=int, default=4)
    parser.add_argument('--batch-size', type=int, default=128)
    parser.add_argument('--layer-num', type=int, default=1)
    parser.add_argument('--training-num', type=int, default=8)
    parser.add_argument('--test-num', type=int, default=100)
    parser.add_argument('--logdir', type=str, default='log')
    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument(
        '--device', type=str,
        default='cuda' if torch.cuda.is_available() else 'cpu')
    args = parser.parse_known_args()[0]
    return args


def test_ddpg(args=get_args()):
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # train_envs = gym.make(args.task)
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    actor = Actor(
        args.layer_num, args.state_shape, args.action_shape,
        args.max_action, args.device
    ).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic = Critic(
        args.layer_num, args.state_shape, args.action_shape, args.device
    ).to(args.device)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    policy = DDPGPolicy(
        actor, actor_optim, critic, critic_optim,
        args.tau, args.gamma, args.exploration_noise,
        [env.action_space.low[0], env.action_space.high[0]],
        reward_normalization=True, ignore_done=True)
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    writer = SummaryWriter(args.logdir + '/' + 'ddpg')

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()


if __name__ == '__main__':
    test_ddpg()
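The DDPG example trains the policy and then only renders one episode; it never saves the trained weights. A minimal sketch of persisting them, meant to be appended to the script above after `offpolicy_trainer` returns (it assumes the tianshou policy is a `torch.nn.Module`, which holds for `DDPGPolicy`; the file name is only illustrative):

```python
# Sketch (not part of this commit): persist and restore the trained DDPG policy.
import torch

torch.save(policy.state_dict(), 'ant_v2_ddpg.pth')  # illustrative file name

# Later, rebuild actor/critic/policy exactly as in test_ddpg(), then:
policy.load_state_dict(torch.load('ant_v2_ddpg.pth', map_location=args.device))
policy.eval()
```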
examples/ant_v2_sac.py (new file, 110 lines)

import gym
import torch
import pprint
import argparse
import numpy as np
from torch.utils.tensorboard import SummaryWriter

from tianshou.policy import SACPolicy
from tianshou.trainer import offpolicy_trainer
from tianshou.data import Collector, ReplayBuffer
from tianshou.env import VectorEnv, SubprocVectorEnv

if __name__ == '__main__':
    from continuous_net import ActorProb, Critic
else:  # pytest
    from test.continuous.net import ActorProb, Critic


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=str, default='Ant-v2')
    parser.add_argument('--seed', type=int, default=1626)
    parser.add_argument('--buffer-size', type=int, default=20000)
    parser.add_argument('--actor-lr', type=float, default=3e-4)
    parser.add_argument('--critic-lr', type=float, default=1e-3)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--tau', type=float, default=0.005)
    parser.add_argument('--alpha', type=float, default=0.2)
    parser.add_argument('--epoch', type=int, default=100)
    parser.add_argument('--step-per-epoch', type=int, default=2400)
    parser.add_argument('--collect-per-step', type=int, default=10)
    parser.add_argument('--batch-size', type=int, default=128)
    parser.add_argument('--layer-num', type=int, default=1)
    parser.add_argument('--training-num', type=int, default=8)
    parser.add_argument('--test-num', type=int, default=100)
    parser.add_argument('--logdir', type=str, default='log')
    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument(
        '--device', type=str,
        default='cuda' if torch.cuda.is_available() else 'cpu')
    args = parser.parse_known_args()[0]
    return args


def test_sac(args=get_args()):
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # train_envs = gym.make(args.task)
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    actor = ActorProb(
        args.layer_num, args.state_shape, args.action_shape,
        args.max_action, args.device
    ).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic1 = Critic(
        args.layer_num, args.state_shape, args.action_shape, args.device
    ).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(
        args.layer_num, args.state_shape, args.action_shape, args.device
    ).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
    policy = SACPolicy(
        actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
        args.tau, args.gamma, args.alpha,
        [env.action_space.low[0], env.action_space.high[0]],
        reward_normalization=True, ignore_done=True)
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # train_collector.collect(n_step=args.buffer_size)
    # log
    writer = SummaryWriter(args.logdir + '/' + 'sac')

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()


if __name__ == '__main__':
    test_sac()
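`ActorProb` returns a `(mu, sigma)` pair rather than a deterministic action. A self-contained illustration of how such a pair can be turned into a bounded stochastic action, which is the role it plays inside SAC (illustration only, not tianshou's internal code; the Ant-v2 sizes are assumed):

```python
import torch

mu = torch.zeros(1, 8)            # Ant-v2 has an 8-dimensional action space
sigma = torch.ones(1, 8) * 0.5
dist = torch.distributions.Normal(mu, sigma)
raw = dist.rsample()              # reparameterized sample, keeps gradients
action = torch.tanh(raw)          # squash into [-1, 1]
max_action = 1.0                  # env.action_space.high[0] for Ant-v2
print((max_action * action).shape)   # torch.Size([1, 8])
```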
examples/ant_v2_td3.py (new file, 114 lines)

import gym
import torch
import pprint
import argparse
import numpy as np
from torch.utils.tensorboard import SummaryWriter

from tianshou.policy import TD3Policy
from tianshou.trainer import offpolicy_trainer
from tianshou.data import Collector, ReplayBuffer
from tianshou.env import VectorEnv, SubprocVectorEnv

if __name__ == '__main__':
    from continuous_net import Actor, Critic
else:  # pytest
    from test.continuous.net import Actor, Critic


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=str, default='Ant-v2')
    parser.add_argument('--seed', type=int, default=1626)
    parser.add_argument('--buffer-size', type=int, default=20000)
    parser.add_argument('--actor-lr', type=float, default=3e-4)
    parser.add_argument('--critic-lr', type=float, default=1e-3)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--tau', type=float, default=0.005)
    parser.add_argument('--exploration-noise', type=float, default=0.1)
    parser.add_argument('--policy-noise', type=float, default=0.2)
    parser.add_argument('--noise-clip', type=float, default=0.5)
    parser.add_argument('--update-actor-freq', type=int, default=2)
    parser.add_argument('--epoch', type=int, default=100)
    parser.add_argument('--step-per-epoch', type=int, default=2400)
    parser.add_argument('--collect-per-step', type=int, default=10)
    parser.add_argument('--batch-size', type=int, default=128)
    parser.add_argument('--layer-num', type=int, default=1)
    parser.add_argument('--training-num', type=int, default=8)
    parser.add_argument('--test-num', type=int, default=100)
    parser.add_argument('--logdir', type=str, default='log')
    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument(
        '--device', type=str,
        default='cuda' if torch.cuda.is_available() else 'cpu')
    args = parser.parse_known_args()[0]
    return args


def test_td3(args=get_args()):
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # train_envs = gym.make(args.task)
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    actor = Actor(
        args.layer_num, args.state_shape, args.action_shape,
        args.max_action, args.device
    ).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic1 = Critic(
        args.layer_num, args.state_shape, args.action_shape, args.device
    ).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(
        args.layer_num, args.state_shape, args.action_shape, args.device
    ).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
    policy = TD3Policy(
        actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
        args.tau, args.gamma, args.exploration_noise, args.policy_noise,
        args.update_actor_freq, args.noise_clip,
        [env.action_space.low[0], env.action_space.high[0]],
        reward_normalization=True, ignore_done=True)
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # train_collector.collect(n_step=args.buffer_size)
    # log
    writer = SummaryWriter(args.logdir + '/' + 'td3')

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()


if __name__ == '__main__':
    test_td3()
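The `--policy-noise` and `--noise-clip` arguments configure TD3's target policy smoothing. A self-contained sketch of that step, for illustration only (not tianshou's implementation):

```python
import torch

policy_noise, noise_clip, max_action = 0.2, 0.5, 1.0
target_act = torch.zeros(4, 8)    # e.g. a batch of 4 Ant-v2 actions
noise = (torch.randn_like(target_act) * policy_noise).clamp(-noise_clip, noise_clip)
smoothed = (target_act + noise).clamp(-max_action, max_action)
print(smoothed.shape)             # torch.Size([4, 8])
```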
examples/continuous_net.py (new file, 79 lines)

import torch
import numpy as np
from torch import nn


class Actor(nn.Module):
    def __init__(self, layer_num, state_shape, action_shape,
                 max_action, device='cpu'):
        super().__init__()
        self.device = device
        self.model = [
            nn.Linear(np.prod(state_shape), 128),
            nn.ReLU(inplace=True)]
        for i in range(layer_num):
            self.model += [nn.Linear(128, 128), nn.ReLU(inplace=True)]
        self.model += [nn.Linear(128, np.prod(action_shape))]
        self.model = nn.Sequential(*self.model)
        self._max = max_action

    def forward(self, s, **kwargs):
        s = torch.tensor(s, device=self.device, dtype=torch.float)
        batch = s.shape[0]
        s = s.view(batch, -1)
        logits = self.model(s)
        logits = self._max * torch.tanh(logits)
        return logits, None


class ActorProb(nn.Module):
    def __init__(self, layer_num, state_shape, action_shape,
                 max_action, device='cpu'):
        super().__init__()
        self.device = device
        self.model = [
            nn.Linear(np.prod(state_shape), 128),
            nn.ReLU(inplace=True)]
        for i in range(layer_num):
            self.model += [nn.Linear(128, 128), nn.ReLU(inplace=True)]
        self.model = nn.Sequential(*self.model)
        self.mu = nn.Linear(128, np.prod(action_shape))
        self.sigma = nn.Linear(128, np.prod(action_shape))
        self._max = max_action

    def forward(self, s, **kwargs):
        if not isinstance(s, torch.Tensor):
            s = torch.tensor(s, device=self.device, dtype=torch.float)
        batch = s.shape[0]
        s = s.view(batch, -1)
        logits = self.model(s)
        mu = self._max * torch.tanh(self.mu(logits))
        sigma = torch.exp(self.sigma(logits))
        return (mu, sigma), None


class Critic(nn.Module):
    def __init__(self, layer_num, state_shape, action_shape=0, device='cpu'):
        super().__init__()
        self.device = device
        self.model = [
            nn.Linear(np.prod(state_shape) + np.prod(action_shape), 128),
            nn.ReLU(inplace=True)]
        for i in range(layer_num):
            self.model += [nn.Linear(128, 128), nn.ReLU(inplace=True)]
        self.model += [nn.Linear(128, 1)]
        self.model = nn.Sequential(*self.model)

    def forward(self, s, a=None):
        if not isinstance(s, torch.Tensor):
            s = torch.tensor(s, device=self.device, dtype=torch.float)
        if a is not None and not isinstance(a, torch.Tensor):
            a = torch.tensor(a, device=self.device, dtype=torch.float)
        batch = s.shape[0]
        s = s.view(batch, -1)
        if a is None:
            logits = self.model(s)
        else:
            a = a.view(batch, -1)
            logits = self.model(torch.cat([s, a], dim=1))
        return logits
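A quick shape check for the three networks above, runnable from the examples/ directory so that `continuous_net` imports (the Ant-v2 sizes, 111-dimensional observations and 8-dimensional actions, are assumed for concreteness):

```python
import numpy as np
from continuous_net import Actor, ActorProb, Critic

state_shape, action_shape, max_action = (111,), (8,), 1.0
obs = np.zeros((4, 111), dtype=np.float32)   # dummy batch of 4 observations
act = np.zeros((4, 8), dtype=np.float32)     # dummy batch of 4 actions

logits, _ = Actor(1, state_shape, action_shape, max_action)(obs)
(mu, sigma), _ = ActorProb(1, state_shape, action_shape, max_action)(obs)
q = Critic(1, state_shape, action_shape)(obs, act)
print(logits.shape, mu.shape, sigma.shape, q.shape)
# torch.Size([4, 8]) torch.Size([4, 8]) torch.Size([4, 8]) torch.Size([4, 1])
```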
examples/discrete_net.py (new file, 81 lines)

import torch
import numpy as np
from torch import nn
import torch.nn.functional as F


class Net(nn.Module):
    def __init__(self, layer_num, state_shape, action_shape=0, device='cpu'):
        super().__init__()
        self.device = device
        self.model = [
            nn.Linear(np.prod(state_shape), 128),
            nn.ReLU(inplace=True)]
        for i in range(layer_num):
            self.model += [nn.Linear(128, 128), nn.ReLU(inplace=True)]
        if action_shape:
            self.model += [nn.Linear(128, np.prod(action_shape))]
        self.model = nn.Sequential(*self.model)

    def forward(self, s, state=None, info={}):
        if not isinstance(s, torch.Tensor):
            s = torch.tensor(s, device=self.device, dtype=torch.float)
        batch = s.shape[0]
        s = s.view(batch, -1)
        logits = self.model(s)
        return logits, state


class Actor(nn.Module):
    def __init__(self, preprocess_net, action_shape):
        super().__init__()
        self.preprocess = preprocess_net
        self.last = nn.Linear(128, np.prod(action_shape))

    def forward(self, s, state=None, info={}):
        logits, h = self.preprocess(s, state)
        logits = F.softmax(self.last(logits), dim=-1)
        return logits, h


class Critic(nn.Module):
    def __init__(self, preprocess_net):
        super().__init__()
        self.preprocess = preprocess_net
        self.last = nn.Linear(128, 1)

    def forward(self, s):
        logits, h = self.preprocess(s, None)
        logits = self.last(logits)
        return logits


class DQN(nn.Module):

    def __init__(self, h, w, action_shape, device='cpu'):
        super(DQN, self).__init__()
        self.device = device

        self.conv1 = nn.Conv2d(1, 16, kernel_size=5, stride=2)
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2)
        self.bn2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2)
        self.bn3 = nn.BatchNorm2d(32)

        def conv2d_size_out(size, kernel_size=5, stride=2):
            return (size - (kernel_size - 1) - 1) // stride + 1

        convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w)))
        convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h)))
        linear_input_size = convw * convh * 32
        self.head = nn.Linear(linear_input_size, action_shape)

    def forward(self, x, state=None, info={}):
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x, device=self.device, dtype=torch.float)
        x = x.permute(0, 3, 1, 2)
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        return self.head(x.view(x.size(0), -1)), state
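The size of `DQN`'s final linear layer comes from chaining three kernel-5, stride-2 convolutions. A standalone check of that arithmetic, assuming an 84x84 input frame (the actual frame size depends on what `create_atari_environment` produces):

```python
def conv2d_size_out(size, kernel_size=5, stride=2):
    return (size - (kernel_size - 1) - 1) // stride + 1

w = h = 84
for _ in range(3):
    w, h = conv2d_size_out(w), conv2d_size_out(h)
print(w, h, w * h * 32)   # 7 7 1568 -> nn.Linear(1568, action_shape)
```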
examples/point_maze_td3.py (new file, 119 lines)

import gym
import torch
import pprint
import argparse
import numpy as np
from torch.utils.tensorboard import SummaryWriter

from tianshou.policy import TD3Policy
from tianshou.trainer import offpolicy_trainer
from tianshou.data import Collector, ReplayBuffer
from tianshou.env import VectorEnv, SubprocVectorEnv

if __name__ == '__main__':
    from continuous_net import Actor, Critic
else:  # pytest
    from test.continuous.net import Actor, Critic


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=str, default='PointMaze-v0')
    parser.add_argument('--seed', type=int, default=1626)
    parser.add_argument('--buffer-size', type=int, default=20000)
    parser.add_argument('--actor-lr', type=float, default=3e-5)
    parser.add_argument('--critic-lr', type=float, default=1e-4)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--tau', type=float, default=0.005)
    parser.add_argument('--exploration-noise', type=float, default=0.1)
    parser.add_argument('--policy-noise', type=float, default=0.2)
    parser.add_argument('--noise-clip', type=float, default=0.5)
    parser.add_argument('--update-actor-freq', type=int, default=2)
    parser.add_argument('--epoch', type=int, default=100)
    parser.add_argument('--step-per-epoch', type=int, default=2400)
    parser.add_argument('--collect-per-step', type=int, default=10)
    parser.add_argument('--batch-size', type=int, default=128)
    parser.add_argument('--layer-num', type=int, default=1)
    parser.add_argument('--training-num', type=int, default=8)
    parser.add_argument('--test-num', type=int, default=100)
    parser.add_argument('--logdir', type=str, default='log')
    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument(
        '--device', type=str,
        default='cuda' if torch.cuda.is_available() else 'cpu')
    parser.add_argument('--max_episode_steps', type=int, default=2000)

    args = parser.parse_known_args()[0]
    return args


def test_td3(args=get_args()):
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # train_envs = gym.make(args.task)
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    actor = Actor(
        args.layer_num, args.state_shape, args.action_shape,
        args.max_action, args.device
    ).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic1 = Critic(
        args.layer_num, args.state_shape, args.action_shape, args.device
    ).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(
        args.layer_num, args.state_shape, args.action_shape, args.device
    ).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
    policy = TD3Policy(
        actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
        args.tau, args.gamma, args.exploration_noise, args.policy_noise,
        args.update_actor_freq, args.noise_clip,
        [env.action_space.low[0], env.action_space.high[0]],
        reward_normalization=True, ignore_done=True)
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # train_collector.collect(n_step=args.buffer_size)
    # log
    writer = SummaryWriter(args.logdir + '/' + 'td3')

    def stop_fn(x):
        if env.spec.reward_threshold:
            return x >= env.spec.reward_threshold
        else:
            return False

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_step=1000, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()


if __name__ == '__main__':
    test_td3()
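'PointMaze-v0' is not part of stock gym; it comes from the point-robot maze environment this commit mentions and has to be registered before `gym.make` can resolve the id. A generic registration sketch, with a hypothetical entry point, just to show the mechanism:

```python
from gym.envs.registration import register

register(
    id='PointMaze-v0',
    entry_point='point_maze.envs:PointMazeEnv',  # hypothetical module path
    max_episode_steps=2000,
)
```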
examples/pong_a2c.py (new file, 108 lines)

import gym
import torch
import pprint
import argparse
import numpy as np
from torch.utils.tensorboard import SummaryWriter

from tianshou.policy import A2CPolicy
from tianshou.env import SubprocVectorEnv
from tianshou.trainer import onpolicy_trainer
from tianshou.data import Collector, ReplayBuffer
from tianshou.env.atari import create_atari_environment

if __name__ == '__main__':
    from discrete_net import Net, Actor, Critic
else:  # pytest
    from test.discrete.net import Net, Actor, Critic


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=str, default='Pong')
    parser.add_argument('--seed', type=int, default=1626)
    parser.add_argument('--buffer-size', type=int, default=20000)
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--gamma', type=float, default=0.9)
    parser.add_argument('--epoch', type=int, default=100)
    parser.add_argument('--step-per-epoch', type=int, default=1000)
    parser.add_argument('--collect-per-step', type=int, default=100)
    parser.add_argument('--repeat-per-collect', type=int, default=1)
    parser.add_argument('--batch-size', type=int, default=64)
    parser.add_argument('--layer-num', type=int, default=2)
    parser.add_argument('--training-num', type=int, default=8)
    parser.add_argument('--test-num', type=int, default=8)
    parser.add_argument('--logdir', type=str, default='log')
    parser.add_argument('--render', type=float, default=0.)

    parser.add_argument(
        '--device', type=str,
        default='cuda' if torch.cuda.is_available() else 'cpu')
    # a2c special
    parser.add_argument('--vf-coef', type=float, default=0.5)
    parser.add_argument('--ent-coef', type=float, default=0.001)
    parser.add_argument('--max-grad-norm', type=float, default=None)
    parser.add_argument('--max_episode_steps', type=int, default=2000)
    args = parser.parse_known_args()[0]
    return args


def test_a2c(args=get_args()):
    env = create_atari_environment(
        args.task, max_episode_steps=args.max_episode_steps)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.env.action_space.shape or env.env.action_space.n
    # train_envs = gym.make(args.task)
    train_envs = SubprocVectorEnv(
        [lambda: create_atari_environment(
            args.task, max_episode_steps=args.max_episode_steps)
            for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: create_atari_environment(
            args.task, max_episode_steps=args.max_episode_steps)
            for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net(args.layer_num, args.state_shape, device=args.device)
    actor = Actor(net, args.action_shape).to(args.device)
    critic = Critic(net).to(args.device)
    optim = torch.optim.Adam(list(
        actor.parameters()) + list(critic.parameters()), lr=args.lr)
    dist = torch.distributions.Categorical
    policy = A2CPolicy(
        actor, critic, optim, dist, args.gamma, vf_coef=args.vf_coef,
        ent_coef=args.ent_coef, max_grad_norm=args.max_grad_norm)
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    writer = SummaryWriter(args.logdir + '/' + 'a2c')

    def stop_fn(x):
        if env.env.spec.reward_threshold:
            return x >= env.spec.reward_threshold
        else:
            return False

    # trainer
    result = onpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
        args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = create_atari_environment(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()


if __name__ == '__main__':
    test_a2c()
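The `dist` passed to `A2CPolicy` is the class `torch.distributions.Categorical`; the discrete `Actor` feeds it softmax probabilities. A self-contained illustration, assuming Pong's six-action space:

```python
import torch

probs = torch.softmax(torch.randn(1, 6), dim=-1)   # fake actor output, 6 actions
dist = torch.distributions.Categorical(probs)
action = dist.sample()
log_prob = dist.log_prob(action)   # the quantity the policy-gradient loss weights
print(action.item(), log_prob.item())
```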
examples/pong_dqn.py (new file, 112 lines)

import gym
import torch
import pprint
import argparse
import numpy as np
from torch.utils.tensorboard import SummaryWriter

from tianshou.policy import DQNPolicy
from tianshou.env import SubprocVectorEnv
from tianshou.trainer import offpolicy_trainer
from tianshou.data import Collector, ReplayBuffer
from tianshou.env.atari import create_atari_environment

if __name__ == '__main__':
    from discrete_net import DQN
else:  # pytest
    from test.discrete.net import DQN


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=str, default='Pong')
    parser.add_argument('--seed', type=int, default=1626)
    parser.add_argument('--eps-test', type=float, default=0.05)
    parser.add_argument('--eps-train', type=float, default=0.1)
    parser.add_argument('--buffer-size', type=int, default=20000)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--gamma', type=float, default=0.9)
    parser.add_argument('--n-step', type=int, default=1)
    parser.add_argument('--target-update-freq', type=int, default=320)
    parser.add_argument('--epoch', type=int, default=100)
    parser.add_argument('--step-per-epoch', type=int, default=1000)
    parser.add_argument('--collect-per-step', type=int, default=10)
    parser.add_argument('--batch-size', type=int, default=64)
    parser.add_argument('--layer-num', type=int, default=3)
    parser.add_argument('--training-num', type=int, default=8)
    parser.add_argument('--test-num', type=int, default=8)
    parser.add_argument('--logdir', type=str, default='log')
    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument(
        '--device', type=str,
        default='cuda' if torch.cuda.is_available() else 'cpu')
    args = parser.parse_known_args()[0]
    return args


def test_dqn(args=get_args()):
    env = create_atari_environment(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.env.action_space.shape or env.env.action_space.n
    # train_envs = gym.make(args.task)
    train_envs = SubprocVectorEnv(
        [lambda: create_atari_environment(args.task)
            for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: create_atari_environment(args.task)
            for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = DQN(
        args.state_shape[0], args.state_shape[1], args.action_shape, args.device)
    net = net.to(args.device)
    optim = torch.optim.Adam(net.parameters(), lr=args.lr)
    policy = DQNPolicy(
        net, optim, args.gamma, args.n_step,
        use_target_network=args.target_update_freq > 0,
        target_update_freq=args.target_update_freq)
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # policy.set_eps(1)
    train_collector.collect(n_step=args.batch_size * 4)
    print(len(train_collector.buffer))
    # log
    writer = SummaryWriter(args.logdir + '/' + 'dqn')

    def stop_fn(x):
        if env.env.spec.reward_threshold:
            return x >= env.spec.reward_threshold
        else:
            return False

    def train_fn(x):
        policy.set_eps(args.eps_train)

    def test_fn(x):
        policy.set_eps(args.eps_test)

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, train_fn=train_fn, test_fn=test_fn,
        stop_fn=stop_fn, writer=writer, task=args.task)

    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = create_atari_environment(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()


if __name__ == '__main__':
    test_dqn(get_args())
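`train_fn` above pins exploration at `--eps-train` for the whole run. A common variant is to decay epsilon linearly toward `--eps-test`; a sketch of a drop-in replacement for the `train_fn` defined inside `test_dqn` (it assumes, as the script's own `train_fn(x)`/`test_fn(x)` signatures suggest, that the trainer passes the current epoch index):

```python
def train_fn(x):
    # Linear decay from eps_train to eps_test over the training epochs (sketch).
    frac = min(x / max(args.epoch, 1), 1.0)
    eps = args.eps_train + frac * (args.eps_test - args.eps_train)
    policy.set_eps(eps)
```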
examples/pong_ppo.py (new file, 112 lines)

import gym
import torch
import pprint
import argparse
import numpy as np
from torch.utils.tensorboard import SummaryWriter

from tianshou.policy import PPOPolicy
from tianshou.env import SubprocVectorEnv
from tianshou.trainer import onpolicy_trainer
from tianshou.data import Collector, ReplayBuffer
from tianshou.env.atari import create_atari_environment

if __name__ == '__main__':
    from discrete_net import Net, Actor, Critic
else:  # pytest
    from test.discrete.net import Net, Actor, Critic


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=str, default='Pong')
    parser.add_argument('--seed', type=int, default=1626)
    parser.add_argument('--buffer-size', type=int, default=20000)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--epoch', type=int, default=100)
    parser.add_argument('--step-per-epoch', type=int, default=1000)
    parser.add_argument('--collect-per-step', type=int, default=100)
    parser.add_argument('--repeat-per-collect', type=int, default=2)
    parser.add_argument('--batch-size', type=int, default=64)
    parser.add_argument('--layer-num', type=int, default=1)
    parser.add_argument('--training-num', type=int, default=8)
    parser.add_argument('--test-num', type=int, default=8)
    parser.add_argument('--logdir', type=str, default='log')
    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument(
        '--device', type=str,
        default='cuda' if torch.cuda.is_available() else 'cpu')
    # ppo special
    parser.add_argument('--vf-coef', type=float, default=0.5)
    parser.add_argument('--ent-coef', type=float, default=0.0)
    parser.add_argument('--eps-clip', type=float, default=0.2)
    parser.add_argument('--max-grad-norm', type=float, default=0.5)
    parser.add_argument('--max_episode_steps', type=int, default=2000)
    args = parser.parse_known_args()[0]
    return args


def test_ppo(args=get_args()):
    env = create_atari_environment(
        args.task, max_episode_steps=args.max_episode_steps)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space().shape or env.action_space().n
    # train_envs = gym.make(args.task)
    train_envs = SubprocVectorEnv(
        [lambda: create_atari_environment(
            args.task, max_episode_steps=args.max_episode_steps)
            for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: create_atari_environment(
            args.task, max_episode_steps=args.max_episode_steps)
            for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net(args.layer_num, args.state_shape, device=args.device)
    actor = Actor(net, args.action_shape).to(args.device)
    critic = Critic(net).to(args.device)
    optim = torch.optim.Adam(list(
        actor.parameters()) + list(critic.parameters()), lr=args.lr)
    dist = torch.distributions.Categorical
    policy = PPOPolicy(
        actor, critic, optim, dist, args.gamma,
        max_grad_norm=args.max_grad_norm,
        eps_clip=args.eps_clip,
        vf_coef=args.vf_coef,
        ent_coef=args.ent_coef,
        action_range=None)
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    writer = SummaryWriter(args.logdir + '/' + 'ppo')

    def stop_fn(x):
        if env.env.spec.reward_threshold:
            return x >= env.spec.reward_threshold
        else:
            return False

    # trainer
    result = onpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
        args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = create_atari_environment(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_step=2000, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()


if __name__ == '__main__':
    test_ppo()
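`--eps-clip` is the clipping radius of PPO's surrogate objective. A self-contained numerical sketch of that objective (illustration only, not tianshou's implementation):

```python
import torch

eps_clip = 0.2
ratio = torch.tensor([0.5, 1.0, 1.5])       # pi_new / pi_old for three samples
advantage = torch.tensor([1.0, -1.0, 2.0])
surr1 = ratio * advantage
surr2 = ratio.clamp(1 - eps_clip, 1 + eps_clip) * advantage
loss = -torch.min(surr1, surr2).mean()      # clipped surrogate, to be minimized
print(loss.item())
```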
setup.py (1 addition)

@@ -55,6 +55,7 @@ setup(
         ],
         'atari': [
             'atari_py',
+            'cv2'
         ],
         'mujoco': [
             'mujoco_py',
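One caveat about the new extra: `cv2` is the importable module name, while the distribution that provides it on PyPI is `opencv-python`, so a `pip install tianshou[atari]` would normally declare the dependency along these lines (sketch, not what this commit ships):

```python
extras_require={
    'atari': [
        'atari_py',
        'opencv-python',  # provides the cv2 module; the PyPI name differs from the import name
    ],
},
```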
Modified test script (test_ddpg):

@@ -34,6 +34,7 @@ def get_args():
     parser.add_argument('--training-num', type=int, default=8)
     parser.add_argument('--test-num', type=int, default=100)
     parser.add_argument('--logdir', type=str, default='log')
+    parser.add_argument('--render', type=float, default=0.)
     parser.add_argument(
         '--device', type=str,
         default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -79,7 +80,7 @@ def test_ddpg(args=get_args()):
         policy, train_envs, ReplayBuffer(args.buffer_size))
     test_collector = Collector(policy, test_envs)
     # log
-    writer = SummaryWriter(args.logdir)
+    writer = SummaryWriter(args.logdir + '/' + 'ddpg')

     def stop_fn(x):
         return x >= env.spec.reward_threshold
@@ -88,7 +89,7 @@ def test_ddpg(args=get_args()):
     result = offpolicy_trainer(
         policy, train_collector, test_collector, args.epoch,
         args.step_per_epoch, args.collect_per_step, args.test_num,
-        args.batch_size, stop_fn=stop_fn, writer=writer)
+        args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
     assert stop_fn(result['best_reward'])
     train_collector.close()
     test_collector.close()
@@ -97,7 +98,7 @@ def test_ddpg(args=get_args()):
     # Let's watch its performance!
     env = gym.make(args.task)
     collector = Collector(policy, env)
-    result = collector.collect(n_episode=1, render=1 / 35)
+    result = collector.collect(n_episode=1, render=args.render)
     print(f'Final reward: {result["rew"]}, length: {result["len"]}')
     collector.close()
Modified test script (_test_ppo):

@@ -32,6 +32,7 @@ def get_args():
     parser.add_argument('--training-num', type=int, default=16)
     parser.add_argument('--test-num', type=int, default=100)
     parser.add_argument('--logdir', type=str, default='log')
+    parser.add_argument('--render', type=float, default=0.)
     parser.add_argument(
         '--device', type=str,
         default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -87,7 +88,7 @@ def _test_ppo(args=get_args()):
     test_collector = Collector(policy, test_envs)
     train_collector.collect(n_step=args.step_per_epoch)
     # log
-    writer = SummaryWriter(args.logdir)
+    writer = SummaryWriter(args.logdir + '/' + 'ppo')

     def stop_fn(x):
         return x >= env.spec.reward_threshold
@@ -96,7 +97,7 @@ def _test_ppo(args=get_args()):
     result = onpolicy_trainer(
         policy, train_collector, test_collector, args.epoch,
         args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
-        args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer)
+        args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
     assert stop_fn(result['best_reward'])
     train_collector.close()
     test_collector.close()
@@ -105,7 +106,7 @@ def _test_ppo(args=get_args()):
     # Let's watch its performance!
     env = gym.make(args.task)
     collector = Collector(policy, env)
-    result = collector.collect(n_episode=1, render=1 / 35)
+    result = collector.collect(n_episode=1, render=args.render)
     print(f'Final reward: {result["rew"]}, length: {result["len"]}')
     collector.close()
Modified test script (test_sac):

@@ -34,6 +34,7 @@ def get_args():
     parser.add_argument('--training-num', type=int, default=8)
     parser.add_argument('--test-num', type=int, default=100)
     parser.add_argument('--logdir', type=str, default='log')
+    parser.add_argument('--render', type=float, default=0.)
     parser.add_argument(
         '--device', type=str,
         default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -84,7 +85,7 @@ def test_sac(args=get_args()):
     test_collector = Collector(policy, test_envs)
     # train_collector.collect(n_step=args.buffer_size)
     # log
-    writer = SummaryWriter(args.logdir)
+    writer = SummaryWriter(args.logdir + '/' + 'sac')

     def stop_fn(x):
         return x >= env.spec.reward_threshold
@@ -93,7 +94,7 @@ def test_sac(args=get_args()):
     result = offpolicy_trainer(
         policy, train_collector, test_collector, args.epoch,
         args.step_per_epoch, args.collect_per_step, args.test_num,
-        args.batch_size, stop_fn=stop_fn, writer=writer)
+        args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
     assert stop_fn(result['best_reward'])
     train_collector.close()
     test_collector.close()
@@ -102,7 +103,7 @@ def test_sac(args=get_args()):
     # Let's watch its performance!
     env = gym.make(args.task)
     collector = Collector(policy, env)
-    result = collector.collect(n_episode=1, render=1 / 35)
+    result = collector.collect(n_episode=1, render=args.render)
     print(f'Final reward: {result["rew"]}, length: {result["len"]}')
     collector.close()
Modified test script (test_td3):

@@ -37,6 +37,7 @@ def get_args():
     parser.add_argument('--training-num', type=int, default=8)
     parser.add_argument('--test-num', type=int, default=100)
     parser.add_argument('--logdir', type=str, default='log')
+    parser.add_argument('--render', type=float, default=0.)
     parser.add_argument(
         '--device', type=str,
         default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -88,7 +89,7 @@ def test_td3(args=get_args()):
     test_collector = Collector(policy, test_envs)
     # train_collector.collect(n_step=args.buffer_size)
     # log
-    writer = SummaryWriter(args.logdir)
+    writer = SummaryWriter(args.logdir + '/' + 'td3')

     def stop_fn(x):
         return x >= env.spec.reward_threshold
@@ -97,7 +98,7 @@ def test_td3(args=get_args()):
     result = offpolicy_trainer(
         policy, train_collector, test_collector, args.epoch,
         args.step_per_epoch, args.collect_per_step, args.test_num,
-        args.batch_size, stop_fn=stop_fn, writer=writer)
+        args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
     assert stop_fn(result['best_reward'])
     train_collector.close()
     test_collector.close()
@@ -106,7 +107,7 @@ def test_td3(args=get_args()):
     # Let's watch its performance!
     env = gym.make(args.task)
     collector = Collector(policy, env)
-    result = collector.collect(n_episode=1, render=1 / 35)
+    result = collector.collect(n_episode=1, render=args.render)
     print(f'Final reward: {result["rew"]}, length: {result["len"]}')
     collector.close()
Modified network module (DQN class added after the existing Critic; the converted tensor is now assigned back to x so that array inputs work):

@@ -48,3 +48,33 @@ class Critic(nn.Module):
         logits, h = self.preprocess(s, None)
         logits = self.last(logits)
         return logits
+
+
+class DQN(nn.Module):
+
+    def __init__(self, h, w, action_shape, device='cpu'):
+        super(DQN, self).__init__()
+        self.device = device
+
+        self.conv1 = nn.Conv2d(3, 16, kernel_size=5, stride=2)
+        self.bn1 = nn.BatchNorm2d(16)
+        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2)
+        self.bn2 = nn.BatchNorm2d(32)
+        self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2)
+        self.bn3 = nn.BatchNorm2d(32)
+
+        def conv2d_size_out(size, kernel_size=5, stride=2):
+            return (size - (kernel_size - 1) - 1) // stride + 1
+
+        convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w)))
+        convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h)))
+        linear_input_size = convw * convh * 32
+        self.head = nn.Linear(linear_input_size, action_shape)
+
+    def forward(self, x, state=None, info={}):
+        if not isinstance(x, torch.Tensor):
+            x = torch.tensor(x, device=self.device, dtype=torch.float)
+        x = F.relu(self.bn1(self.conv1(x)))
+        x = F.relu(self.bn2(self.conv2(x)))
+        x = F.relu(self.bn3(self.conv3(x)))
+        return self.head(x.view(x.size(0), -1)), state
Modified test script (test_a2c):

@@ -32,6 +32,8 @@ def get_args():
     parser.add_argument('--training-num', type=int, default=32)
     parser.add_argument('--test-num', type=int, default=100)
     parser.add_argument('--logdir', type=str, default='log')
+    parser.add_argument('--render', type=float, default=0.)
+
     parser.add_argument(
         '--device', type=str,
         default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -73,7 +75,7 @@ def test_a2c(args=get_args()):
         policy, train_envs, ReplayBuffer(args.buffer_size))
     test_collector = Collector(policy, test_envs)
     # log
-    writer = SummaryWriter(args.logdir)
+    writer = SummaryWriter(args.logdir + '/' + 'ppo')

     def stop_fn(x):
         return x >= env.spec.reward_threshold
@@ -82,7 +84,7 @@ def test_a2c(args=get_args()):
     result = onpolicy_trainer(
         policy, train_collector, test_collector, args.epoch,
         args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
-        args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer)
+        args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
     assert stop_fn(result['best_reward'])
     train_collector.close()
     test_collector.close()
@@ -91,7 +93,7 @@ def test_a2c(args=get_args()):
     # Let's watch its performance!
     env = gym.make(args.task)
     collector = Collector(policy, env)
-    result = collector.collect(n_episode=1, render=1 / 35)
+    result = collector.collect(n_episode=1, render=args.render)
     print(f'Final reward: {result["rew"]}, length: {result["len"]}')
     collector.close()
|
|||||||
@@ -35,6 +35,7 @@ def get_args():
     parser.add_argument('--training-num', type=int, default=8)
     parser.add_argument('--test-num', type=int, default=100)
     parser.add_argument('--logdir', type=str, default='log')
+    parser.add_argument('--render', type=float, default=0.)
     parser.add_argument(
         '--device', type=str,
         default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -73,7 +74,7 @@ def test_dqn(args=get_args()):
     train_collector.collect(n_step=args.batch_size)
     print(len(train_collector.buffer))
     # log
-    writer = SummaryWriter(args.logdir)
+    writer = SummaryWriter(args.logdir + '/' + 'ppo')

     def stop_fn(x):
         return x >= env.spec.reward_threshold
@@ -89,7 +90,7 @@ def test_dqn(args=get_args()):
         policy, train_collector, test_collector, args.epoch,
         args.step_per_epoch, args.collect_per_step, args.test_num,
         args.batch_size, train_fn=train_fn, test_fn=test_fn,
-        stop_fn=stop_fn, writer=writer)
+        stop_fn=stop_fn, writer=writer, task=args.task)

     assert stop_fn(result['best_reward'])
     train_collector.close()
@@ -99,7 +100,7 @@ def test_dqn(args=get_args()):
         # Let's watch its performance!
         env = gym.make(args.task)
         collector = Collector(policy, env)
-        result = collector.collect(n_episode=1, render=1 / 35)
+        result = collector.collect(n_episode=1, render=args.render)
         print(f'Final reward: {result["rew"]}, length: {result["len"]}')
         collector.close()

@@ -86,6 +86,7 @@ def get_args():
     parser.add_argument('--training-num', type=int, default=8)
     parser.add_argument('--test-num', type=int, default=100)
     parser.add_argument('--logdir', type=str, default='log')
+    parser.add_argument('--render', type=float, default=0.)
     parser.add_argument(
         '--device', type=str,
         default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -121,7 +122,7 @@ def test_pg(args=get_args()):
         policy, train_envs, ReplayBuffer(args.buffer_size))
     test_collector = Collector(policy, test_envs)
     # log
-    writer = SummaryWriter(args.logdir)
+    writer = SummaryWriter(args.logdir + '/' + 'ppo')

     def stop_fn(x):
         return x >= env.spec.reward_threshold
@@ -130,7 +131,7 @@ def test_pg(args=get_args()):
     result = onpolicy_trainer(
         policy, train_collector, test_collector, args.epoch,
         args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
-        args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer)
+        args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
     assert stop_fn(result['best_reward'])
     train_collector.close()
     test_collector.close()
@@ -139,7 +140,7 @@ def test_pg(args=get_args()):
         # Let's watch its performance!
         env = gym.make(args.task)
         collector = Collector(policy, env)
-        result = collector.collect(n_episode=1, render=1 / 35)
+        result = collector.collect(n_episode=1, render=args.render)
         print(f'Final reward: {result["rew"]}, length: {result["len"]}')
         collector.close()

@@ -32,6 +32,7 @@ def get_args():
     parser.add_argument('--training-num', type=int, default=32)
     parser.add_argument('--test-num', type=int, default=100)
     parser.add_argument('--logdir', type=str, default='log')
+    parser.add_argument('--render', type=float, default=0.)
     parser.add_argument(
         '--device', type=str,
         default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -78,7 +79,7 @@ def test_ppo(args=get_args()):
         policy, train_envs, ReplayBuffer(args.buffer_size))
     test_collector = Collector(policy, test_envs)
     # log
-    writer = SummaryWriter(args.logdir)
+    writer = SummaryWriter(args.logdir + '/' + 'ppo')

     def stop_fn(x):
         return x >= env.spec.reward_threshold
@@ -87,7 +88,7 @@ def test_ppo(args=get_args()):
     result = onpolicy_trainer(
         policy, train_collector, test_collector, args.epoch,
         args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
-        args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer)
+        args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
     assert stop_fn(result['best_reward'])
     train_collector.close()
     test_collector.close()
@@ -96,7 +97,7 @@ def test_ppo(args=get_args()):
         # Let's watch its performance!
         env = gym.make(args.task)
         collector = Collector(policy, env)
-        result = collector.collect(n_episode=1, render=1 / 35)
+        result = collector.collect(n_episode=1, render=args.render)
         print(f'Final reward: {result["rew"]}, length: {result["len"]}')
         collector.close()

@@ -37,7 +37,7 @@ class Batch(object):
             else:
                 raise TypeError(
                     'No support for append with type {} in class Batch.'
                     .format(type(batch.__dict__[k])))

     def split(self, size=None, permute=True):
         length = min([
@@ -2,7 +2,7 @@ import time
 import torch
 import numpy as np
 from copy import deepcopy
-
+import warnings
 from tianshou.env import BaseVectorEnv
 from tianshou.data import Batch, ReplayBuffer
 from tianshou.utils import MovAvg
@@ -87,6 +87,7 @@ class Collector(object):
         return np.array([data])

     def collect(self, n_step=0, n_episode=0, render=0):
+        warning_count = 0
         if not self._multi_env:
             n_episode = np.sum(n_episode)
         start_time = time.time()
@@ -97,6 +98,10 @@ class Collector(object):
         reward_sum = 0
         length_sum = 0
         while True:
+            if warning_count >= 100000:
+                warnings.warn(
+                    'There are already many steps in an episode. You should add a time limitation to your environment!',
+                    Warning)
             if self._multi_env:
                 batch_data = Batch(
                     obs=self._obs, act=self._act, rew=self._rew,
@@ -131,11 +136,14 @@ class Collector(object):
                     'rew': self._rew[i], 'done': self._done[i],
                     'obs_next': obs_next[i], 'info': self._info[i]}
                 if self._cached_buf:
+                    warning_count += 1
                     self._cached_buf[i].add(**data)
                 elif self._multi_buf:
+                    warning_count += 1
                     self.buffer[i].add(**data)
                     cur_step += 1
                 else:
+                    warning_count += 1
                     self.buffer.add(**data)
                     cur_step += 1
                 if self._done[i]:
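The new warning fires once 100000 steps have been collected without a terminal state. One way to avoid it, sketched below under the assumption of a plain Gym environment (the 1000-step cap is illustrative, not something this commit sets), is to bound episode length with gym's TimeLimit wrapper before handing the env to a Collector:

import gym
from gym.wrappers import TimeLimit

# cap episodes at 1000 steps so collect() always sees done=True eventually
env = TimeLimit(gym.make('Ant-v2').unwrapped, max_episode_steps=1000)
# collector = Collector(policy, env)  # then use the env as in the examples above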
@@ -14,7 +14,7 @@ class OUNoise(object):
         if self.x is None or self.x.shape != size:
             self.x = 0
         self.x = self.x + self.alpha * (mu - self.x) + \
             self.beta * np.random.normal(size=size)
         return self.x

     def reset(self):
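For reference, the recursion above is an Ornstein-Uhlenbeck-style process: the noise state decays back toward mu at rate alpha while beta-scaled Gaussian noise is injected each step. A standalone sketch (the alpha, beta and mu values are made up for illustration and are not the class defaults):

import numpy as np

alpha, beta, mu = 0.15, 0.3, 0.0
x = np.zeros(4)  # one noise value per action dimension
for _ in range(5):
    # same update rule as OUNoise.__call__: mean-revert toward mu, then add noise
    x = x + alpha * (mu - x) + beta * np.random.normal(size=x.shape)
    print(x)  # temporally correlated exploration noise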
@@ -39,6 +39,7 @@ class A2CPolicy(PGPolicy):
             a_loss = -(dist.log_prob(a) * (r - v).detach()).mean()
             vf_loss = F.mse_loss(r[:, None], v)
             ent_loss = dist.entropy().mean()
+
             loss = a_loss + self._w_vf * vf_loss - self._w_ent * ent_loss
             loss.backward()
             if self._grad_norm:
@@ -34,6 +34,9 @@ class PGPolicy(BasePolicy):

     def learn(self, batch, batch_size=None, repeat=1):
         losses = []
+
+        batch.returns = (batch.returns - batch.returns.mean()) \
+            / (batch.returns.std() + self._eps)
         r = batch.returns
         batch.returns = (r - r.mean()) / (r.std() + self._eps)
         for _ in range(repeat):
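Both PGPolicy and PPOPolicy now standardize the returns at the start of learn(). A minimal sketch of that transform, with an assumed eps value standing in for the policy's own self._eps:

import torch

returns = torch.tensor([3.0, 10.0, -2.0, 5.0])  # made-up discounted returns
eps = 1e-8  # assumed here; the policies use their configured self._eps
normalized = (returns - returns.mean()) / (returns.std() + eps)
print(normalized.mean().item(), normalized.std().item())  # ~0, ~1

Since the pre-existing `r = batch.returns` lines apply the same standardization again, the second pass sees data that already has zero mean and unit standard deviation, so it is effectively a no-op.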
@@ -58,6 +58,9 @@ class PPOPolicy(PGPolicy):

     def learn(self, batch, batch_size=None, repeat=1):
         losses, clip_losses, vf_losses, ent_losses = [], [], [], []
+
+        batch.returns = (batch.returns - batch.returns.mean()) \
+            / (batch.returns.std() + self._eps)
         r = batch.returns
         batch.returns = (r - r.mean()) / (r.std() + self._eps)
         batch.act = torch.tensor(batch.act)
@@ -79,6 +82,7 @@ class PPOPolicy(PGPolicy):
                 clip_losses.append(clip_loss.detach().cpu().numpy())
                 vf_loss = F.smooth_l1_loss(self.critic(b.obs), target_v)
                 vf_losses.append(vf_loss.detach().cpu().numpy())
+
                 e_loss = dist.entropy().mean()
                 ent_losses.append(e_loss.detach().cpu().numpy())
                 loss = clip_loss + self._w_vf * vf_loss - self._w_ent * e_loss
@@ -87,7 +91,7 @@ class PPOPolicy(PGPolicy):
                 loss.backward()
                 nn.utils.clip_grad_norm_(list(
                     self.actor.parameters()) + list(self.critic.parameters()),
                     self._max_grad_norm)
                 self.optim.step()
             self.sync_weight()
         return {
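The clip_loss combined above is PPO's clipped surrogate objective. A tiny hedged sketch of that term with made-up numbers (ratio is the new/old action probability ratio, adv the advantage estimate, and the 0.2 clip range is only an assumed default, not read from this file):

import torch

ratio = torch.tensor([0.8, 1.3, 1.0])   # pi_new(a|s) / pi_old(a|s)
adv = torch.tensor([1.0, -2.0, 0.5])    # advantage estimates
eps_clip = 0.2                          # assumed clip range
surr1 = ratio * adv
surr2 = ratio.clamp(1 - eps_clip, 1 + eps_clip) * adv
clip_loss = -torch.min(surr1, surr2).mean()
print(clip_loss)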
@@ -8,7 +8,7 @@ from tianshou.trainer import test_episode, gather_info
 def offpolicy_trainer(policy, train_collector, test_collector, max_epoch,
                       step_per_epoch, collect_per_step, episode_per_test,
                       batch_size, train_fn=None, test_fn=None, stop_fn=None,
-                      writer=None, verbose=True):
+                      writer=None, verbose=True, task=''):
     global_step = 0
     best_epoch, best_reward = -1, -1
     stat = {}
@@ -47,7 +47,7 @@ def offpolicy_trainer(policy, train_collector, test_collector, max_epoch,
                         data[k] = f'{result[k]:.2f}'
                         if writer:
                             writer.add_scalar(
-                                k, result[k], global_step=global_step)
+                                k + '_' + task, result[k], global_step=global_step)
                     for k in losses.keys():
                         if stat.get(k) is None:
                             stat[k] = MovAvg()
@@ -55,7 +55,7 @@ def offpolicy_trainer(policy, train_collector, test_collector, max_epoch,
                         data[k] = f'{stat[k].get():.6f}'
                         if writer:
                             writer.add_scalar(
-                                k, stat[k].get(), global_step=global_step)
+                                k + '_' + task, stat[k].get(), global_step=global_step)
                     t.update(1)
                     t.set_postfix(**data)
             if t.n <= t.total:
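With the new task argument, every scalar the trainers log is suffixed with the environment name, so curves from different tasks no longer collapse onto the same TensorBoard tag. A hedged sketch of the resulting tags (the log directory and values are made up):

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('log/td3')          # e.g. args.logdir + '/' + 'td3'
task, global_step = 'Ant-v2', 2400
result = {'rew': 123.4, 'len': 1000.0}
for k in result.keys():
    # produces tags 'rew_Ant-v2' and 'len_Ant-v2' instead of bare 'rew'/'len'
    writer.add_scalar(k + '_' + task, result[k], global_step=global_step)
writer.close()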
@@ -9,7 +9,7 @@ def onpolicy_trainer(policy, train_collector, test_collector, max_epoch,
                      step_per_epoch, collect_per_step, repeat_per_collect,
                      episode_per_test, batch_size,
                      train_fn=None, test_fn=None, stop_fn=None,
-                     writer=None, verbose=True):
+                     writer=None, verbose=True, task=''):
     global_step = 0
     best_epoch, best_reward = -1, -1
     stat = {}
@@ -52,15 +52,15 @@ def onpolicy_trainer(policy, train_collector, test_collector, max_epoch,
                         data[k] = f'{result[k]:.2f}'
                         if writer:
                             writer.add_scalar(
-                                k, result[k], global_step=global_step)
+                                k + '_' + task, result[k], global_step=global_step)
                     for k in losses.keys():
                         if stat.get(k) is None:
                             stat[k] = MovAvg()
                         stat[k].add(losses[k])
                         data[k] = f'{stat[k].get():.6f}'
-                        if writer:
+                        if writer and global_step:
                            writer.add_scalar(
-                                k, stat[k].get(), global_step=global_step)
+                                k + '_' + task, stat[k].get(), global_step=global_step)
                     t.update(step)
                     t.set_postfix(**data)
             if t.n <= t.total: