add examples, fix some bugs (#5)
* update atari.py
* fix setup.py, pass the pytest
* add a "render" argument
* change the TensorBoard writer
* change device, render, and TensorBoard log location
* remove some wrong local files
* fix some tab mistakes and the env names in continuous/test_xx.py
* add examples and a point-robot maze environment
* fix some bugs found while testing the examples
* add a DQN network and fix some args
* change the TensorBoard writer's frequency back so that PPO and A2C can log normally
* add a warning to the collector
* remove some unrelated files
* reformat
* fix a bug in test_dqn caused by selecting the wrong model
Parent: acb93502cf
Commit: 77068af526
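The example scripts added below all follow the same layout: parse arguments, build vectorized train/test environments, construct the networks and policy, wrap them in collectors, run a trainer, and finally watch one rendered episode. A quick way to try one of them is sketched here; the runpy invocation and the render interval are illustrative only and assume the command is issued from the repository root with the MuJoCo dependencies installed.

import runpy
import sys

# Equivalent to: python examples/ant_v2_ddpg.py --render 0.033
sys.argv = ['ant_v2_ddpg.py', '--render', '0.033']
runpy.run_path('examples/ant_v2_ddpg.py', run_name='__main__')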
docs/_static/images/Ant-v2.png (new binary file, 183 KiB; not shown)
examples/ant_v2_ddpg.py (new file, 105 lines)
@@ -0,0 +1,105 @@
import gym
import torch
import pprint
import argparse
import numpy as np
from torch.utils.tensorboard import SummaryWriter

from tianshou.policy import DDPGPolicy
from tianshou.trainer import offpolicy_trainer
from tianshou.data import Collector, ReplayBuffer
from tianshou.env import VectorEnv, SubprocVectorEnv

if __name__ == '__main__':
    from continuous_net import Actor, Critic
else:  # pytest
    from test.continuous.net import Actor, Critic


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=str, default='Ant-v2')
    parser.add_argument('--seed', type=int, default=1626)
    parser.add_argument('--buffer-size', type=int, default=20000)
    parser.add_argument('--actor-lr', type=float, default=1e-4)
    parser.add_argument('--critic-lr', type=float, default=1e-3)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--tau', type=float, default=0.005)
    parser.add_argument('--exploration-noise', type=float, default=0.1)
    parser.add_argument('--epoch', type=int, default=100)
    parser.add_argument('--step-per-epoch', type=int, default=2400)
    parser.add_argument('--collect-per-step', type=int, default=4)
    parser.add_argument('--batch-size', type=int, default=128)
    parser.add_argument('--layer-num', type=int, default=1)
    parser.add_argument('--training-num', type=int, default=8)
    parser.add_argument('--test-num', type=int, default=100)
    parser.add_argument('--logdir', type=str, default='log')
    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument(
        '--device', type=str,
        default='cuda' if torch.cuda.is_available() else 'cpu')
    args = parser.parse_known_args()[0]
    return args


def test_ddpg(args=get_args()):
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # train_envs = gym.make(args.task)
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    actor = Actor(
        args.layer_num, args.state_shape, args.action_shape,
        args.max_action, args.device
    ).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic = Critic(
        args.layer_num, args.state_shape, args.action_shape, args.device
    ).to(args.device)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    policy = DDPGPolicy(
        actor, actor_optim, critic, critic_optim,
        args.tau, args.gamma, args.exploration_noise,
        [env.action_space.low[0], env.action_space.high[0]],
        reward_normalization=True, ignore_done=True)
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    writer = SummaryWriter(args.logdir + '/' + 'ddpg')

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()


if __name__ == '__main__':
    test_ddpg()
examples/ant_v2_sac.py (new file, 110 lines)
@@ -0,0 +1,110 @@
import gym
import torch
import pprint
import argparse
import numpy as np
from torch.utils.tensorboard import SummaryWriter

from tianshou.policy import SACPolicy
from tianshou.trainer import offpolicy_trainer
from tianshou.data import Collector, ReplayBuffer
from tianshou.env import VectorEnv, SubprocVectorEnv

if __name__ == '__main__':
    from continuous_net import ActorProb, Critic
else:  # pytest
    from test.continuous.net import ActorProb, Critic


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=str, default='Ant-v2')
    parser.add_argument('--seed', type=int, default=1626)
    parser.add_argument('--buffer-size', type=int, default=20000)
    parser.add_argument('--actor-lr', type=float, default=3e-4)
    parser.add_argument('--critic-lr', type=float, default=1e-3)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--tau', type=float, default=0.005)
    parser.add_argument('--alpha', type=float, default=0.2)
    parser.add_argument('--epoch', type=int, default=100)
    parser.add_argument('--step-per-epoch', type=int, default=2400)
    parser.add_argument('--collect-per-step', type=int, default=10)
    parser.add_argument('--batch-size', type=int, default=128)
    parser.add_argument('--layer-num', type=int, default=1)
    parser.add_argument('--training-num', type=int, default=8)
    parser.add_argument('--test-num', type=int, default=100)
    parser.add_argument('--logdir', type=str, default='log')
    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument(
        '--device', type=str,
        default='cuda' if torch.cuda.is_available() else 'cpu')
    args = parser.parse_known_args()[0]
    return args


def test_sac(args=get_args()):
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # train_envs = gym.make(args.task)
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    actor = ActorProb(
        args.layer_num, args.state_shape, args.action_shape,
        args.max_action, args.device
    ).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic1 = Critic(
        args.layer_num, args.state_shape, args.action_shape, args.device
    ).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(
        args.layer_num, args.state_shape, args.action_shape, args.device
    ).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
    policy = SACPolicy(
        actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
        args.tau, args.gamma, args.alpha,
        [env.action_space.low[0], env.action_space.high[0]],
        reward_normalization=True, ignore_done=True)
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # train_collector.collect(n_step=args.buffer_size)
    # log
    writer = SummaryWriter(args.logdir + '/' + 'sac')

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()


if __name__ == '__main__':
    test_sac()
examples/ant_v2_td3.py (new file, 114 lines)
@@ -0,0 +1,114 @@
import gym
import torch
import pprint
import argparse
import numpy as np
from torch.utils.tensorboard import SummaryWriter

from tianshou.policy import TD3Policy
from tianshou.trainer import offpolicy_trainer
from tianshou.data import Collector, ReplayBuffer
from tianshou.env import VectorEnv, SubprocVectorEnv

if __name__ == '__main__':
    from continuous_net import Actor, Critic
else:  # pytest
    from test.continuous.net import Actor, Critic


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=str, default='Ant-v2')
    parser.add_argument('--seed', type=int, default=1626)
    parser.add_argument('--buffer-size', type=int, default=20000)
    parser.add_argument('--actor-lr', type=float, default=3e-4)
    parser.add_argument('--critic-lr', type=float, default=1e-3)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--tau', type=float, default=0.005)
    parser.add_argument('--exploration-noise', type=float, default=0.1)
    parser.add_argument('--policy-noise', type=float, default=0.2)
    parser.add_argument('--noise-clip', type=float, default=0.5)
    parser.add_argument('--update-actor-freq', type=int, default=2)
    parser.add_argument('--epoch', type=int, default=100)
    parser.add_argument('--step-per-epoch', type=int, default=2400)
    parser.add_argument('--collect-per-step', type=int, default=10)
    parser.add_argument('--batch-size', type=int, default=128)
    parser.add_argument('--layer-num', type=int, default=1)
    parser.add_argument('--training-num', type=int, default=8)
    parser.add_argument('--test-num', type=int, default=100)
    parser.add_argument('--logdir', type=str, default='log')
    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument(
        '--device', type=str,
        default='cuda' if torch.cuda.is_available() else 'cpu')
    args = parser.parse_known_args()[0]
    return args


def test_td3(args=get_args()):
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # train_envs = gym.make(args.task)
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    actor = Actor(
        args.layer_num, args.state_shape, args.action_shape,
        args.max_action, args.device
    ).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic1 = Critic(
        args.layer_num, args.state_shape, args.action_shape, args.device
    ).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(
        args.layer_num, args.state_shape, args.action_shape, args.device
    ).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
    policy = TD3Policy(
        actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
        args.tau, args.gamma, args.exploration_noise, args.policy_noise,
        args.update_actor_freq, args.noise_clip,
        [env.action_space.low[0], env.action_space.high[0]],
        reward_normalization=True, ignore_done=True)
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # train_collector.collect(n_step=args.buffer_size)
    # log
    writer = SummaryWriter(args.logdir + '/' + 'td3')

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()


if __name__ == '__main__':
    test_td3()
examples/continuous_net.py (new file, 79 lines)
@@ -0,0 +1,79 @@
import torch
import numpy as np
from torch import nn


class Actor(nn.Module):
    def __init__(self, layer_num, state_shape, action_shape,
                 max_action, device='cpu'):
        super().__init__()
        self.device = device
        self.model = [
            nn.Linear(np.prod(state_shape), 128),
            nn.ReLU(inplace=True)]
        for i in range(layer_num):
            self.model += [nn.Linear(128, 128), nn.ReLU(inplace=True)]
        self.model += [nn.Linear(128, np.prod(action_shape))]
        self.model = nn.Sequential(*self.model)
        self._max = max_action

    def forward(self, s, **kwargs):
        if not isinstance(s, torch.Tensor):
            s = torch.tensor(s, device=self.device, dtype=torch.float)
        batch = s.shape[0]
        s = s.view(batch, -1)
        logits = self.model(s)
        logits = self._max * torch.tanh(logits)
        return logits, None


class ActorProb(nn.Module):
    def __init__(self, layer_num, state_shape, action_shape,
                 max_action, device='cpu'):
        super().__init__()
        self.device = device
        self.model = [
            nn.Linear(np.prod(state_shape), 128),
            nn.ReLU(inplace=True)]
        for i in range(layer_num):
            self.model += [nn.Linear(128, 128), nn.ReLU(inplace=True)]
        self.model = nn.Sequential(*self.model)
        self.mu = nn.Linear(128, np.prod(action_shape))
        self.sigma = nn.Linear(128, np.prod(action_shape))
        self._max = max_action

    def forward(self, s, **kwargs):
        if not isinstance(s, torch.Tensor):
            s = torch.tensor(s, device=self.device, dtype=torch.float)
        batch = s.shape[0]
        s = s.view(batch, -1)
        logits = self.model(s)
        mu = self._max * torch.tanh(self.mu(logits))
        sigma = torch.exp(self.sigma(logits))
        return (mu, sigma), None


class Critic(nn.Module):
    def __init__(self, layer_num, state_shape, action_shape=0, device='cpu'):
        super().__init__()
        self.device = device
        self.model = [
            nn.Linear(np.prod(state_shape) + np.prod(action_shape), 128),
            nn.ReLU(inplace=True)]
        for i in range(layer_num):
            self.model += [nn.Linear(128, 128), nn.ReLU(inplace=True)]
        self.model += [nn.Linear(128, 1)]
        self.model = nn.Sequential(*self.model)

    def forward(self, s, a=None):
        if not isinstance(s, torch.Tensor):
            s = torch.tensor(s, device=self.device, dtype=torch.float)
        if a is not None and not isinstance(a, torch.Tensor):
            a = torch.tensor(a, device=self.device, dtype=torch.float)
        batch = s.shape[0]
        s = s.view(batch, -1)
        if a is None:
            logits = self.model(s)
        else:
            a = a.view(batch, -1)
            logits = self.model(torch.cat([s, a], dim=1))
        return logits
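As a quick sanity check of the shapes these modules produce, here is a short sketch; the dimensions are arbitrary and the import assumes the file above is available on the Python path as continuous_net.

import numpy as np

from continuous_net import Actor, Critic

obs_dim, act_dim, batch = 17, 6, 4
actor = Actor(layer_num=1, state_shape=(obs_dim,), action_shape=(act_dim,),
              max_action=1.0)
critic = Critic(layer_num=1, state_shape=(obs_dim,), action_shape=(act_dim,))
obs = np.zeros((batch, obs_dim), dtype=np.float32)
act, _ = actor(obs)            # torch.Size([4, 6]), bounded by max_action * tanh
q = critic(obs, act.detach())  # torch.Size([4, 1])
print(act.shape, q.shape)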
examples/discrete_net.py (new file, 81 lines)
@@ -0,0 +1,81 @@
import torch
import numpy as np
from torch import nn
import torch.nn.functional as F


class Net(nn.Module):
    def __init__(self, layer_num, state_shape, action_shape=0, device='cpu'):
        super().__init__()
        self.device = device
        self.model = [
            nn.Linear(np.prod(state_shape), 128),
            nn.ReLU(inplace=True)]
        for i in range(layer_num):
            self.model += [nn.Linear(128, 128), nn.ReLU(inplace=True)]
        if action_shape:
            self.model += [nn.Linear(128, np.prod(action_shape))]
        self.model = nn.Sequential(*self.model)

    def forward(self, s, state=None, info={}):
        if not isinstance(s, torch.Tensor):
            s = torch.tensor(s, device=self.device, dtype=torch.float)
        batch = s.shape[0]
        s = s.view(batch, -1)
        logits = self.model(s)
        return logits, state


class Actor(nn.Module):
    def __init__(self, preprocess_net, action_shape):
        super().__init__()
        self.preprocess = preprocess_net
        self.last = nn.Linear(128, np.prod(action_shape))

    def forward(self, s, state=None, info={}):
        logits, h = self.preprocess(s, state)
        logits = F.softmax(self.last(logits), dim=-1)
        return logits, h


class Critic(nn.Module):
    def __init__(self, preprocess_net):
        super().__init__()
        self.preprocess = preprocess_net
        self.last = nn.Linear(128, 1)

    def forward(self, s):
        logits, h = self.preprocess(s, None)
        logits = self.last(logits)
        return logits


class DQN(nn.Module):

    def __init__(self, h, w, action_shape, device='cpu'):
        super(DQN, self).__init__()
        self.device = device

        self.conv1 = nn.Conv2d(1, 16, kernel_size=5, stride=2)
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2)
        self.bn2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2)
        self.bn3 = nn.BatchNorm2d(32)

        def conv2d_size_out(size, kernel_size=5, stride=2):
            return (size - (kernel_size - 1) - 1) // stride + 1

        convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w)))
        convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h)))
        linear_input_size = convw * convh * 32
        self.head = nn.Linear(linear_input_size, action_shape)

    def forward(self, x, state=None, info={}):
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x, device=self.device, dtype=torch.float)
        x = x.permute(0, 3, 1, 2)
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        return self.head(x.view(x.size(0), -1)), state
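The conv2d_size_out arithmetic above fixes the size of the final linear layer; a worked example, assuming a hypothetical 84x84 input frame (the actual frame size is whatever create_atari_environment produces):

def conv2d_size_out(size, kernel_size=5, stride=2):
    return (size - (kernel_size - 1) - 1) // stride + 1

h = w = 84
for _ in range(3):          # three 5x5, stride-2 convolutions
    h, w = conv2d_size_out(h), conv2d_size_out(w)
print(h, w, h * w * 32)     # 7 7 1568 -> nn.Linear(1568, action_shape)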
examples/point_maze_td3.py (new file, 119 lines)
@@ -0,0 +1,119 @@
import gym
import torch
import pprint
import argparse
import numpy as np
from torch.utils.tensorboard import SummaryWriter

from tianshou.policy import TD3Policy
from tianshou.trainer import offpolicy_trainer
from tianshou.data import Collector, ReplayBuffer
from tianshou.env import VectorEnv, SubprocVectorEnv

if __name__ == '__main__':
    from continuous_net import Actor, Critic
else:  # pytest
    from test.continuous.net import Actor, Critic


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=str, default='PointMaze-v0')
    parser.add_argument('--seed', type=int, default=1626)
    parser.add_argument('--buffer-size', type=int, default=20000)
    parser.add_argument('--actor-lr', type=float, default=3e-5)
    parser.add_argument('--critic-lr', type=float, default=1e-4)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--tau', type=float, default=0.005)
    parser.add_argument('--exploration-noise', type=float, default=0.1)
    parser.add_argument('--policy-noise', type=float, default=0.2)
    parser.add_argument('--noise-clip', type=float, default=0.5)
    parser.add_argument('--update-actor-freq', type=int, default=2)
    parser.add_argument('--epoch', type=int, default=100)
    parser.add_argument('--step-per-epoch', type=int, default=2400)
    parser.add_argument('--collect-per-step', type=int, default=10)
    parser.add_argument('--batch-size', type=int, default=128)
    parser.add_argument('--layer-num', type=int, default=1)
    parser.add_argument('--training-num', type=int, default=8)
    parser.add_argument('--test-num', type=int, default=100)
    parser.add_argument('--logdir', type=str, default='log')
    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument(
        '--device', type=str,
        default='cuda' if torch.cuda.is_available() else 'cpu')
    parser.add_argument('--max_episode_steps', type=int, default=2000)

    args = parser.parse_known_args()[0]
    return args


def test_td3(args=get_args()):
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # train_envs = gym.make(args.task)
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    actor = Actor(
        args.layer_num, args.state_shape, args.action_shape,
        args.max_action, args.device
    ).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic1 = Critic(
        args.layer_num, args.state_shape, args.action_shape, args.device
    ).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(
        args.layer_num, args.state_shape, args.action_shape, args.device
    ).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
    policy = TD3Policy(
        actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
        args.tau, args.gamma, args.exploration_noise, args.policy_noise,
        args.update_actor_freq, args.noise_clip,
        [env.action_space.low[0], env.action_space.high[0]],
        reward_normalization=True, ignore_done=True)
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # train_collector.collect(n_step=args.buffer_size)
    # log
    writer = SummaryWriter(args.logdir + '/' + 'td3')

    def stop_fn(x):
        if env.spec.reward_threshold:
            return x >= env.spec.reward_threshold
        else:
            return False

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_step=1000, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()


if __name__ == '__main__':
    test_td3()
examples/pong_a2c.py (new file, 108 lines)
@@ -0,0 +1,108 @@
import gym
import torch
import pprint
import argparse
import numpy as np
from torch.utils.tensorboard import SummaryWriter

from tianshou.policy import A2CPolicy
from tianshou.env import SubprocVectorEnv
from tianshou.trainer import onpolicy_trainer
from tianshou.data import Collector, ReplayBuffer
from tianshou.env.atari import create_atari_environment

if __name__ == '__main__':
    from discrete_net import Net, Actor, Critic
else:  # pytest
    from test.discrete.net import Net, Actor, Critic


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=str, default='Pong')
    parser.add_argument('--seed', type=int, default=1626)
    parser.add_argument('--buffer-size', type=int, default=20000)
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--gamma', type=float, default=0.9)
    parser.add_argument('--epoch', type=int, default=100)
    parser.add_argument('--step-per-epoch', type=int, default=1000)
    parser.add_argument('--collect-per-step', type=int, default=100)
    parser.add_argument('--repeat-per-collect', type=int, default=1)
    parser.add_argument('--batch-size', type=int, default=64)
    parser.add_argument('--layer-num', type=int, default=2)
    parser.add_argument('--training-num', type=int, default=8)
    parser.add_argument('--test-num', type=int, default=8)
    parser.add_argument('--logdir', type=str, default='log')
    parser.add_argument('--render', type=float, default=0.)

    parser.add_argument(
        '--device', type=str,
        default='cuda' if torch.cuda.is_available() else 'cpu')
    # a2c special
    parser.add_argument('--vf-coef', type=float, default=0.5)
    parser.add_argument('--ent-coef', type=float, default=0.001)
    parser.add_argument('--max-grad-norm', type=float, default=None)
    parser.add_argument('--max_episode_steps', type=int, default=2000)
    args = parser.parse_known_args()[0]
    return args


def test_a2c(args=get_args()):
    env = create_atari_environment(
        args.task, max_episode_steps=args.max_episode_steps)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.env.action_space.shape or env.env.action_space.n
    # train_envs = gym.make(args.task)
    train_envs = SubprocVectorEnv(
        [lambda: create_atari_environment(
            args.task, max_episode_steps=args.max_episode_steps)
         for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: create_atari_environment(
            args.task, max_episode_steps=args.max_episode_steps)
         for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net(args.layer_num, args.state_shape, device=args.device)
    actor = Actor(net, args.action_shape).to(args.device)
    critic = Critic(net).to(args.device)
    optim = torch.optim.Adam(list(
        actor.parameters()) + list(critic.parameters()), lr=args.lr)
    dist = torch.distributions.Categorical
    policy = A2CPolicy(
        actor, critic, optim, dist, args.gamma, vf_coef=args.vf_coef,
        ent_coef=args.ent_coef, max_grad_norm=args.max_grad_norm)
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    writer = SummaryWriter(args.logdir + '/' + 'a2c')

    def stop_fn(x):
        if env.env.spec.reward_threshold:
            return x >= env.env.spec.reward_threshold
        else:
            return False

    # trainer
    result = onpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
        args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer,
        task=args.task)
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = create_atari_environment(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()


if __name__ == '__main__':
    test_a2c()
examples/pong_dqn.py (new file, 112 lines)
@@ -0,0 +1,112 @@
import gym
import torch
import pprint
import argparse
import numpy as np
from torch.utils.tensorboard import SummaryWriter

from tianshou.policy import DQNPolicy
from tianshou.env import SubprocVectorEnv
from tianshou.trainer import offpolicy_trainer
from tianshou.data import Collector, ReplayBuffer
from tianshou.env.atari import create_atari_environment

if __name__ == '__main__':
    from discrete_net import DQN
else:  # pytest
    from test.discrete.net import DQN


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=str, default='Pong')
    parser.add_argument('--seed', type=int, default=1626)
    parser.add_argument('--eps-test', type=float, default=0.05)
    parser.add_argument('--eps-train', type=float, default=0.1)
    parser.add_argument('--buffer-size', type=int, default=20000)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--gamma', type=float, default=0.9)
    parser.add_argument('--n-step', type=int, default=1)
    parser.add_argument('--target-update-freq', type=int, default=320)
    parser.add_argument('--epoch', type=int, default=100)
    parser.add_argument('--step-per-epoch', type=int, default=1000)
    parser.add_argument('--collect-per-step', type=int, default=10)
    parser.add_argument('--batch-size', type=int, default=64)
    parser.add_argument('--layer-num', type=int, default=3)
    parser.add_argument('--training-num', type=int, default=8)
    parser.add_argument('--test-num', type=int, default=8)
    parser.add_argument('--logdir', type=str, default='log')
    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument(
        '--device', type=str,
        default='cuda' if torch.cuda.is_available() else 'cpu')
    args = parser.parse_known_args()[0]
    return args


def test_dqn(args=get_args()):
    env = create_atari_environment(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.env.action_space.shape or env.env.action_space.n
    # train_envs = gym.make(args.task)
    train_envs = SubprocVectorEnv(
        [lambda: create_atari_environment(args.task)
         for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: create_atari_environment(args.task)
         for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = DQN(
        args.state_shape[0], args.state_shape[1],
        args.action_shape, args.device)
    net = net.to(args.device)
    optim = torch.optim.Adam(net.parameters(), lr=args.lr)
    policy = DQNPolicy(
        net, optim, args.gamma, args.n_step,
        use_target_network=args.target_update_freq > 0,
        target_update_freq=args.target_update_freq)
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # policy.set_eps(1)
    train_collector.collect(n_step=args.batch_size * 4)
    print(len(train_collector.buffer))
    # log
    writer = SummaryWriter(args.logdir + '/' + 'dqn')

    def stop_fn(x):
        if env.env.spec.reward_threshold:
            return x >= env.env.spec.reward_threshold
        else:
            return False

    def train_fn(x):
        policy.set_eps(args.eps_train)

    def test_fn(x):
        policy.set_eps(args.eps_test)

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, train_fn=train_fn, test_fn=test_fn,
        stop_fn=stop_fn, writer=writer, task=args.task)

    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = create_atari_environment(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()


if __name__ == '__main__':
    test_dqn(get_args())
examples/pong_ppo.py (new file, 112 lines)
@@ -0,0 +1,112 @@
import gym
import torch
import pprint
import argparse
import numpy as np
from torch.utils.tensorboard import SummaryWriter

from tianshou.policy import PPOPolicy
from tianshou.env import SubprocVectorEnv
from tianshou.trainer import onpolicy_trainer
from tianshou.data import Collector, ReplayBuffer
from tianshou.env.atari import create_atari_environment

if __name__ == '__main__':
    from discrete_net import Net, Actor, Critic
else:  # pytest
    from test.discrete.net import Net, Actor, Critic


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=str, default='Pong')
    parser.add_argument('--seed', type=int, default=1626)
    parser.add_argument('--buffer-size', type=int, default=20000)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--epoch', type=int, default=100)
    parser.add_argument('--step-per-epoch', type=int, default=1000)
    parser.add_argument('--collect-per-step', type=int, default=100)
    parser.add_argument('--repeat-per-collect', type=int, default=2)
    parser.add_argument('--batch-size', type=int, default=64)
    parser.add_argument('--layer-num', type=int, default=1)
    parser.add_argument('--training-num', type=int, default=8)
    parser.add_argument('--test-num', type=int, default=8)
    parser.add_argument('--logdir', type=str, default='log')
    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument(
        '--device', type=str,
        default='cuda' if torch.cuda.is_available() else 'cpu')
    # ppo special
    parser.add_argument('--vf-coef', type=float, default=0.5)
    parser.add_argument('--ent-coef', type=float, default=0.0)
    parser.add_argument('--eps-clip', type=float, default=0.2)
    parser.add_argument('--max-grad-norm', type=float, default=0.5)
    parser.add_argument('--max_episode_steps', type=int, default=2000)
    args = parser.parse_known_args()[0]
    return args


def test_ppo(args=get_args()):
    env = create_atari_environment(
        args.task, max_episode_steps=args.max_episode_steps)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space().shape or env.action_space().n
    # train_envs = gym.make(args.task)
    train_envs = SubprocVectorEnv(
        [lambda: create_atari_environment(
            args.task, max_episode_steps=args.max_episode_steps)
         for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: create_atari_environment(
            args.task, max_episode_steps=args.max_episode_steps)
         for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net(args.layer_num, args.state_shape, device=args.device)
    actor = Actor(net, args.action_shape).to(args.device)
    critic = Critic(net).to(args.device)
    optim = torch.optim.Adam(list(
        actor.parameters()) + list(critic.parameters()), lr=args.lr)
    dist = torch.distributions.Categorical
    policy = PPOPolicy(
        actor, critic, optim, dist, args.gamma,
        max_grad_norm=args.max_grad_norm,
        eps_clip=args.eps_clip,
        vf_coef=args.vf_coef,
        ent_coef=args.ent_coef,
        action_range=None)
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    writer = SummaryWriter(args.logdir + '/' + 'ppo')

    def stop_fn(x):
        if env.env.spec.reward_threshold:
            return x >= env.env.spec.reward_threshold
        else:
            return False

    # trainer
    result = onpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
        args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer,
        task=args.task)
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = create_atari_environment(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_step=2000, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()


if __name__ == '__main__':
    test_ppo()
setup.py
@@ -55,6 +55,7 @@ setup(
        ],
        'atari': [
            'atari_py',
+            'cv2'
        ],
        'mujoco': [
            'mujoco_py',
@@ -34,6 +34,7 @@ def get_args():
    parser.add_argument('--training-num', type=int, default=8)
    parser.add_argument('--test-num', type=int, default=100)
    parser.add_argument('--logdir', type=str, default='log')
+    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument(
        '--device', type=str,
        default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -79,7 +80,7 @@ def test_ddpg(args=get_args()):
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
-    writer = SummaryWriter(args.logdir)
+    writer = SummaryWriter(args.logdir + '/' + 'ddpg')

    def stop_fn(x):
        return x >= env.spec.reward_threshold
@@ -88,7 +89,7 @@ def test_ddpg(args=get_args()):
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
-        args.batch_size, stop_fn=stop_fn, writer=writer)
+        args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
@@ -97,7 +98,7 @@ def test_ddpg(args=get_args()):
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
-        result = collector.collect(n_episode=1, render=1 / 35)
+        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
@@ -32,6 +32,7 @@ def get_args():
    parser.add_argument('--training-num', type=int, default=16)
    parser.add_argument('--test-num', type=int, default=100)
    parser.add_argument('--logdir', type=str, default='log')
+    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument(
        '--device', type=str,
        default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -87,7 +88,7 @@ def _test_ppo(args=get_args()):
    test_collector = Collector(policy, test_envs)
    train_collector.collect(n_step=args.step_per_epoch)
    # log
-    writer = SummaryWriter(args.logdir)
+    writer = SummaryWriter(args.logdir + '/' + 'ppo')

    def stop_fn(x):
        return x >= env.spec.reward_threshold
@@ -96,7 +97,7 @@ def _test_ppo(args=get_args()):
    result = onpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
-        args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer)
+        args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
@@ -105,7 +106,7 @@ def _test_ppo(args=get_args()):
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
-        result = collector.collect(n_episode=1, render=1 / 35)
+        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
@@ -34,6 +34,7 @@ def get_args():
    parser.add_argument('--training-num', type=int, default=8)
    parser.add_argument('--test-num', type=int, default=100)
    parser.add_argument('--logdir', type=str, default='log')
+    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument(
        '--device', type=str,
        default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -84,7 +85,7 @@ def test_sac(args=get_args()):
    test_collector = Collector(policy, test_envs)
    # train_collector.collect(n_step=args.buffer_size)
    # log
-    writer = SummaryWriter(args.logdir)
+    writer = SummaryWriter(args.logdir + '/' + 'sac')

    def stop_fn(x):
        return x >= env.spec.reward_threshold
@@ -93,7 +94,7 @@ def test_sac(args=get_args()):
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
-        args.batch_size, stop_fn=stop_fn, writer=writer)
+        args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
@@ -102,7 +103,7 @@ def test_sac(args=get_args()):
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
-        result = collector.collect(n_episode=1, render=1 / 35)
+        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
@@ -37,6 +37,7 @@ def get_args():
    parser.add_argument('--training-num', type=int, default=8)
    parser.add_argument('--test-num', type=int, default=100)
    parser.add_argument('--logdir', type=str, default='log')
+    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument(
        '--device', type=str,
        default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -88,7 +89,7 @@ def test_td3(args=get_args()):
    test_collector = Collector(policy, test_envs)
    # train_collector.collect(n_step=args.buffer_size)
    # log
-    writer = SummaryWriter(args.logdir)
+    writer = SummaryWriter(args.logdir + '/' + 'td3')

    def stop_fn(x):
        return x >= env.spec.reward_threshold
@@ -97,7 +98,7 @@ def test_td3(args=get_args()):
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
-        args.batch_size, stop_fn=stop_fn, writer=writer)
+        args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
@@ -106,7 +107,7 @@ def test_td3(args=get_args()):
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
-        result = collector.collect(n_episode=1, render=1 / 35)
+        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
@@ -48,3 +48,33 @@ class Critic(nn.Module):
        logits, h = self.preprocess(s, None)
        logits = self.last(logits)
        return logits
+
+
+class DQN(nn.Module):
+
+    def __init__(self, h, w, action_shape, device='cpu'):
+        super(DQN, self).__init__()
+        self.device = device
+
+        self.conv1 = nn.Conv2d(3, 16, kernel_size=5, stride=2)
+        self.bn1 = nn.BatchNorm2d(16)
+        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2)
+        self.bn2 = nn.BatchNorm2d(32)
+        self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2)
+        self.bn3 = nn.BatchNorm2d(32)
+
+        def conv2d_size_out(size, kernel_size=5, stride=2):
+            return (size - (kernel_size - 1) - 1) // stride + 1
+
+        convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w)))
+        convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h)))
+        linear_input_size = convw * convh * 32
+        self.head = nn.Linear(linear_input_size, action_shape)
+
+    def forward(self, x, state=None, info={}):
+        if not isinstance(x, torch.Tensor):
+            x = torch.tensor(x, device=self.device, dtype=torch.float)
+        x = F.relu(self.bn1(self.conv1(x)))
+        x = F.relu(self.bn2(self.conv2(x)))
+        x = F.relu(self.bn3(self.conv3(x)))
+        return self.head(x.view(x.size(0), -1)), state
@@ -32,6 +32,8 @@ def get_args():
    parser.add_argument('--training-num', type=int, default=32)
    parser.add_argument('--test-num', type=int, default=100)
    parser.add_argument('--logdir', type=str, default='log')
+    parser.add_argument('--render', type=float, default=0.)
+
    parser.add_argument(
        '--device', type=str,
        default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -73,7 +75,7 @@ def test_a2c(args=get_args()):
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
-    writer = SummaryWriter(args.logdir)
+    writer = SummaryWriter(args.logdir + '/' + 'ppo')

    def stop_fn(x):
        return x >= env.spec.reward_threshold
@@ -82,7 +84,7 @@ def test_a2c(args=get_args()):
    result = onpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
-        args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer)
+        args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
@@ -91,7 +93,7 @@ def test_a2c(args=get_args()):
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
-        result = collector.collect(n_episode=1, render=1 / 35)
+        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
@@ -35,6 +35,7 @@ def get_args():
    parser.add_argument('--training-num', type=int, default=8)
    parser.add_argument('--test-num', type=int, default=100)
    parser.add_argument('--logdir', type=str, default='log')
+    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument(
        '--device', type=str,
        default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -73,7 +74,7 @@ def test_dqn(args=get_args()):
    train_collector.collect(n_step=args.batch_size)
    print(len(train_collector.buffer))
    # log
-    writer = SummaryWriter(args.logdir)
+    writer = SummaryWriter(args.logdir + '/' + 'ppo')

    def stop_fn(x):
        return x >= env.spec.reward_threshold
@@ -89,7 +90,7 @@ def test_dqn(args=get_args()):
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, train_fn=train_fn, test_fn=test_fn,
-        stop_fn=stop_fn, writer=writer)
+        stop_fn=stop_fn, writer=writer, task=args.task)

    assert stop_fn(result['best_reward'])
    train_collector.close()
@@ -99,7 +100,7 @@ def test_dqn(args=get_args()):
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
-        result = collector.collect(n_episode=1, render=1 / 35)
+        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
@@ -86,6 +86,7 @@ def get_args():
    parser.add_argument('--training-num', type=int, default=8)
    parser.add_argument('--test-num', type=int, default=100)
    parser.add_argument('--logdir', type=str, default='log')
+    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument(
        '--device', type=str,
        default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -121,7 +122,7 @@ def test_pg(args=get_args()):
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
-    writer = SummaryWriter(args.logdir)
+    writer = SummaryWriter(args.logdir + '/' + 'ppo')

    def stop_fn(x):
        return x >= env.spec.reward_threshold
@@ -130,7 +131,7 @@ def test_pg(args=get_args()):
    result = onpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
-        args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer)
+        args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
@@ -139,7 +140,7 @@ def test_pg(args=get_args()):
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
-        result = collector.collect(n_episode=1, render=1 / 35)
+        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
@@ -32,6 +32,7 @@ def get_args():
    parser.add_argument('--training-num', type=int, default=32)
    parser.add_argument('--test-num', type=int, default=100)
    parser.add_argument('--logdir', type=str, default='log')
+    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument(
        '--device', type=str,
        default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -78,7 +79,7 @@ def test_ppo(args=get_args()):
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
-    writer = SummaryWriter(args.logdir)
+    writer = SummaryWriter(args.logdir + '/' + 'ppo')

    def stop_fn(x):
        return x >= env.spec.reward_threshold
@@ -87,7 +88,7 @@ def test_ppo(args=get_args()):
    result = onpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
-        args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer)
+        args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
@@ -96,7 +97,7 @@ def test_ppo(args=get_args()):
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
-        result = collector.collect(n_episode=1, render=1 / 35)
+        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
@@ -37,7 +37,7 @@ class Batch(object):
            else:
                raise TypeError(
                    'No support for append with type {} in class Batch.'
-                        .format(type(batch.__dict__[k])))
+                    .format(type(batch.__dict__[k])))

    def split(self, size=None, permute=True):
        length = min([
@@ -2,7 +2,7 @@ import time
import torch
import numpy as np
from copy import deepcopy

+import warnings
from tianshou.env import BaseVectorEnv
from tianshou.data import Batch, ReplayBuffer
from tianshou.utils import MovAvg
@@ -87,6 +87,7 @@ class Collector(object):
        return np.array([data])

    def collect(self, n_step=0, n_episode=0, render=0):
+        warning_count = 0
        if not self._multi_env:
            n_episode = np.sum(n_episode)
        start_time = time.time()
@@ -97,6 +98,10 @@ class Collector(object):
        reward_sum = 0
        length_sum = 0
        while True:
+            if warning_count >= 100000:
+                warnings.warn(
+                    'There are already many steps in an episode. You should add a time limitation to your environment!',
+                    Warning)
            if self._multi_env:
                batch_data = Batch(
                    obs=self._obs, act=self._act, rew=self._rew,
@@ -131,11 +136,14 @@ class Collector(object):
                        'rew': self._rew[i], 'done': self._done[i],
                        'obs_next': obs_next[i], 'info': self._info[i]}
                    if self._cached_buf:
+                        warning_count += 1
                        self._cached_buf[i].add(**data)
                    elif self._multi_buf:
+                        warning_count += 1
                        self.buffer[i].add(**data)
                        cur_step += 1
                    else:
+                        warning_count += 1
                        self.buffer.add(**data)
                        cur_step += 1
                    if self._done[i]:
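The new warning fires once a single collect() call has performed 100000 additions without an episode finishing, nudging the user to bound episode length. One way to do that, sketched with gym's TimeLimit wrapper (the environment id and step limit are illustrative only):

import gym
from gym.wrappers import TimeLimit

# Ensure every episode terminates after at most 1000 steps.
env = TimeLimit(gym.make('PointMaze-v0'), max_episode_steps=1000)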
@@ -14,7 +14,7 @@ class OUNoise(object):
        if self.x is None or self.x.shape != size:
            self.x = 0
        self.x = self.x + self.alpha * (mu - self.x) + \
-                self.beta * np.random.normal(size=size)
+            self.beta * np.random.normal(size=size)
        return self.x

    def reset(self):
@@ -39,6 +39,7 @@ class A2CPolicy(PGPolicy):
            a_loss = -(dist.log_prob(a) * (r - v).detach()).mean()
            vf_loss = F.mse_loss(r[:, None], v)
            ent_loss = dist.entropy().mean()
+
            loss = a_loss + self._w_vf * vf_loss - self._w_ent * ent_loss
            loss.backward()
            if self._grad_norm:
@@ -34,6 +34,9 @@ class PGPolicy(BasePolicy):

    def learn(self, batch, batch_size=None, repeat=1):
        losses = []
+
        batch.returns = (batch.returns - batch.returns.mean()) \
            / (batch.returns.std() + self._eps)
+        r = batch.returns
+        batch.returns = (r - r.mean()) / (r.std() + self._eps)
        for _ in range(repeat):
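In isolation, the return normalization added here does the following (illustrative numbers; self._eps is assumed to be a small constant such as 1e-8):

import torch

eps = 1e-8  # stand-in for self._eps
returns = torch.tensor([1., 2., 3., 4.])
normalized = (returns - returns.mean()) / (returns.std() + eps)
print(normalized)  # roughly zero-mean, unit-std returns for the gradient update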
@@ -58,6 +58,9 @@ class PPOPolicy(PGPolicy):

    def learn(self, batch, batch_size=None, repeat=1):
        losses, clip_losses, vf_losses, ent_losses = [], [], [], []
+
        batch.returns = (batch.returns - batch.returns.mean()) \
            / (batch.returns.std() + self._eps)
+        r = batch.returns
+        batch.returns = (r - r.mean()) / (r.std() + self._eps)
        batch.act = torch.tensor(batch.act)
@@ -79,6 +82,7 @@ class PPOPolicy(PGPolicy):
            clip_losses.append(clip_loss.detach().cpu().numpy())
            vf_loss = F.smooth_l1_loss(self.critic(b.obs), target_v)
            vf_losses.append(vf_loss.detach().cpu().numpy())
+
            e_loss = dist.entropy().mean()
            ent_losses.append(e_loss.detach().cpu().numpy())
            loss = clip_loss + self._w_vf * vf_loss - self._w_ent * e_loss
@@ -87,7 +91,7 @@ class PPOPolicy(PGPolicy):
            loss.backward()
            nn.utils.clip_grad_norm_(list(
                self.actor.parameters()) + list(self.critic.parameters()),
-                    self._max_grad_norm)
+                self._max_grad_norm)
            self.optim.step()
            self.sync_weight()
        return {
@@ -8,7 +8,7 @@ from tianshou.trainer import test_episode, gather_info
def offpolicy_trainer(policy, train_collector, test_collector, max_epoch,
                      step_per_epoch, collect_per_step, episode_per_test,
                      batch_size, train_fn=None, test_fn=None, stop_fn=None,
-                      writer=None, verbose=True):
+                      writer=None, verbose=True, task=''):
    global_step = 0
    best_epoch, best_reward = -1, -1
    stat = {}
@@ -47,7 +47,7 @@ def offpolicy_trainer(policy, train_collector, test_collector, max_epoch,
                data[k] = f'{result[k]:.2f}'
                if writer:
                    writer.add_scalar(
-                        k, result[k], global_step=global_step)
+                        k + '_' + task, result[k], global_step=global_step)
            for k in losses.keys():
                if stat.get(k) is None:
                    stat[k] = MovAvg()
@@ -55,7 +55,7 @@ def offpolicy_trainer(policy, train_collector, test_collector, max_epoch,
                data[k] = f'{stat[k].get():.6f}'
                if writer:
                    writer.add_scalar(
-                        k, stat[k].get(), global_step=global_step)
+                        k + '_' + task, stat[k].get(), global_step=global_step)
            t.update(1)
            t.set_postfix(**data)
        if t.n <= t.total:
@@ -9,7 +9,7 @@ def onpolicy_trainer(policy, train_collector, test_collector, max_epoch,
                     step_per_epoch, collect_per_step, repeat_per_collect,
                     episode_per_test, batch_size,
                     train_fn=None, test_fn=None, stop_fn=None,
-                     writer=None, verbose=True):
+                     writer=None, verbose=True, task=''):
    global_step = 0
    best_epoch, best_reward = -1, -1
    stat = {}
@@ -52,15 +52,15 @@ def onpolicy_trainer(policy, train_collector, test_collector, max_epoch,
                data[k] = f'{result[k]:.2f}'
                if writer:
                    writer.add_scalar(
-                        k, result[k], global_step=global_step)
+                        k + '_' + task, result[k], global_step=global_step)
            for k in losses.keys():
                if stat.get(k) is None:
                    stat[k] = MovAvg()
                stat[k].add(losses[k])
                data[k] = f'{stat[k].get():.6f}'
-                if writer:
+                if writer and global_step:
                    writer.add_scalar(
-                        k, stat[k].get(), global_step=global_step)
+                        k + '_' + task, stat[k].get(), global_step=global_step)
            t.update(step)
            t.set_postfix(**data)
        if t.n <= t.total:
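The new task argument only changes the TensorBoard tag names; a standalone sketch with illustrative values:

from torch.utils.tensorboard import SummaryWriter

# With task='Ant-v2' the trainers log under tags such as 'rew_Ant-v2'
# rather than a bare 'rew', so runs on different tasks can share a logdir.
writer = SummaryWriter('log/ddpg')
k, task, result = 'rew', 'Ant-v2', {'rew': 123.4}
writer.add_scalar(k + '_' + task, result[k], global_step=100)
writer.close()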