add examples, fix some bugs (#5)

* update atari.py

* fix setup.py to pass pytest

* fix setup.py to pass pytest

* add a "render" argument

* change the tensorboard writer

* change the tensorboard writer

* change device, render, tensorboard log location

* change device, render, tensorboard log location

* remove some stray local files

* fix some tab mistakes and the env names in continuous/test_xx.py

* add examples and point robot maze environment

* fix some bugs during testing examples

* add dqn network and fix some args

* change back the tensorboard writer's frequency to ensure PPO and A2C can write logs normally

* add a warning to collector

* remove some unrelated files

* reformat

* fix a bug in test_dqn caused by selecting the wrong model
Minghao Zhang 2020-03-28 07:27:18 +08:00 committed by GitHub
parent acb93502cf
commit 77068af526
28 changed files with 1031 additions and 35 deletions

docs/_static/images/Ant-v2.png (new binary file, 183 KiB, not shown)

examples/ant_v2_ddpg.py (new file)

@@ -0,0 +1,105 @@
import gym
import torch
import pprint
import argparse
import numpy as np
from torch.utils.tensorboard import SummaryWriter
from tianshou.policy import DDPGPolicy
from tianshou.trainer import offpolicy_trainer
from tianshou.data import Collector, ReplayBuffer
from tianshou.env import VectorEnv, SubprocVectorEnv
if __name__ == '__main__':
from continuous_net import Actor, Critic
else: # pytest
from test.continuous.net import Actor, Critic
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='Ant-v2')
parser.add_argument('--seed', type=int, default=1626)
parser.add_argument('--buffer-size', type=int, default=20000)
parser.add_argument('--actor-lr', type=float, default=1e-4)
parser.add_argument('--critic-lr', type=float, default=1e-3)
parser.add_argument('--gamma', type=float, default=0.99)
parser.add_argument('--tau', type=float, default=0.005)
parser.add_argument('--exploration-noise', type=float, default=0.1)
parser.add_argument('--epoch', type=int, default=100)
parser.add_argument('--step-per-epoch', type=int, default=2400)
parser.add_argument('--collect-per-step', type=int, default=4)
parser.add_argument('--batch-size', type=int, default=128)
parser.add_argument('--layer-num', type=int, default=1)
parser.add_argument('--training-num', type=int, default=8)
parser.add_argument('--test-num', type=int, default=100)
parser.add_argument('--logdir', type=str, default='log')
parser.add_argument('--render', type=float, default=0.)
parser.add_argument(
'--device', type=str,
default='cuda' if torch.cuda.is_available() else 'cpu')
args = parser.parse_known_args()[0]
return args
def test_ddpg(args=get_args()):
env = gym.make(args.task)
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
args.max_action = env.action_space.high[0]
# train_envs = gym.make(args.task)
train_envs = VectorEnv(
[lambda: gym.make(args.task) for _ in range(args.training_num)])
# test_envs = gym.make(args.task)
test_envs = SubprocVectorEnv(
[lambda: gym.make(args.task) for _ in range(args.test_num)])
# seed
np.random.seed(args.seed)
torch.manual_seed(args.seed)
train_envs.seed(args.seed)
test_envs.seed(args.seed)
# model
actor = Actor(
args.layer_num, args.state_shape, args.action_shape,
args.max_action, args.device
).to(args.device)
actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
critic = Critic(
args.layer_num, args.state_shape, args.action_shape, args.device
).to(args.device)
critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
policy = DDPGPolicy(
actor, actor_optim, critic, critic_optim,
args.tau, args.gamma, args.exploration_noise,
[env.action_space.low[0], env.action_space.high[0]],
reward_normalization=True, ignore_done=True)
# collector
train_collector = Collector(
policy, train_envs, ReplayBuffer(args.buffer_size))
test_collector = Collector(policy, test_envs)
# log
writer = SummaryWriter(args.logdir + '/' + 'ddpg')
def stop_fn(x):
return x >= env.spec.reward_threshold
# trainer
result = offpolicy_trainer(
policy, train_collector, test_collector, args.epoch,
args.step_per_epoch, args.collect_per_step, args.test_num,
args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
assert stop_fn(result['best_reward'])
train_collector.close()
test_collector.close()
if __name__ == '__main__':
pprint.pprint(result)
# Let's watch its performance!
env = gym.make(args.task)
collector = Collector(policy, env)
result = collector.collect(n_episode=1, render=args.render)
print(f'Final reward: {result["rew"]}, length: {result["len"]}')
collector.close()
if __name__ == '__main__':
test_ddpg()

examples/ant_v2_sac.py (new file)

@@ -0,0 +1,110 @@
import gym
import torch
import pprint
import argparse
import numpy as np
from torch.utils.tensorboard import SummaryWriter
from tianshou.policy import SACPolicy
from tianshou.trainer import offpolicy_trainer
from tianshou.data import Collector, ReplayBuffer
from tianshou.env import VectorEnv, SubprocVectorEnv
if __name__ == '__main__':
from continuous_net import ActorProb, Critic
else: # pytest
from test.continuous.net import ActorProb, Critic
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='Ant-v2')
parser.add_argument('--seed', type=int, default=1626)
parser.add_argument('--buffer-size', type=int, default=20000)
parser.add_argument('--actor-lr', type=float, default=3e-4)
parser.add_argument('--critic-lr', type=float, default=1e-3)
parser.add_argument('--gamma', type=float, default=0.99)
parser.add_argument('--tau', type=float, default=0.005)
parser.add_argument('--alpha', type=float, default=0.2)
parser.add_argument('--epoch', type=int, default=100)
parser.add_argument('--step-per-epoch', type=int, default=2400)
parser.add_argument('--collect-per-step', type=int, default=10)
parser.add_argument('--batch-size', type=int, default=128)
parser.add_argument('--layer-num', type=int, default=1)
parser.add_argument('--training-num', type=int, default=8)
parser.add_argument('--test-num', type=int, default=100)
parser.add_argument('--logdir', type=str, default='log')
parser.add_argument('--render', type=float, default=0.)
parser.add_argument(
'--device', type=str,
default='cuda' if torch.cuda.is_available() else 'cpu')
args = parser.parse_known_args()[0]
return args
def test_sac(args=get_args()):
env = gym.make(args.task)
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
args.max_action = env.action_space.high[0]
# train_envs = gym.make(args.task)
train_envs = VectorEnv(
[lambda: gym.make(args.task) for _ in range(args.training_num)])
# test_envs = gym.make(args.task)
test_envs = SubprocVectorEnv(
[lambda: gym.make(args.task) for _ in range(args.test_num)])
# seed
np.random.seed(args.seed)
torch.manual_seed(args.seed)
train_envs.seed(args.seed)
test_envs.seed(args.seed)
# model
actor = ActorProb(
args.layer_num, args.state_shape, args.action_shape,
args.max_action, args.device
).to(args.device)
actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
critic1 = Critic(
args.layer_num, args.state_shape, args.action_shape, args.device
).to(args.device)
critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
critic2 = Critic(
args.layer_num, args.state_shape, args.action_shape, args.device
).to(args.device)
critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
policy = SACPolicy(
actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
args.tau, args.gamma, args.alpha,
[env.action_space.low[0], env.action_space.high[0]],
reward_normalization=True, ignore_done=True)
# collector
train_collector = Collector(
policy, train_envs, ReplayBuffer(args.buffer_size))
test_collector = Collector(policy, test_envs)
# train_collector.collect(n_step=args.buffer_size)
# log
writer = SummaryWriter(args.logdir + '/' + 'sac')
def stop_fn(x):
return x >= env.spec.reward_threshold
# trainer
result = offpolicy_trainer(
policy, train_collector, test_collector, args.epoch,
args.step_per_epoch, args.collect_per_step, args.test_num,
args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
assert stop_fn(result['best_reward'])
train_collector.close()
test_collector.close()
if __name__ == '__main__':
pprint.pprint(result)
# Let's watch its performance!
env = gym.make(args.task)
collector = Collector(policy, env)
result = collector.collect(n_episode=1, render=args.render)
print(f'Final reward: {result["rew"]}, length: {result["len"]}')
collector.close()
if __name__ == '__main__':
test_sac()

examples/ant_v2_td3.py (new file)

@@ -0,0 +1,114 @@
import gym
import torch
import pprint
import argparse
import numpy as np
from torch.utils.tensorboard import SummaryWriter
from tianshou.policy import TD3Policy
from tianshou.trainer import offpolicy_trainer
from tianshou.data import Collector, ReplayBuffer
from tianshou.env import VectorEnv, SubprocVectorEnv
if __name__ == '__main__':
from continuous_net import Actor, Critic
else: # pytest
from test.continuous.net import Actor, Critic
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='Ant-v2')
parser.add_argument('--seed', type=int, default=1626)
parser.add_argument('--buffer-size', type=int, default=20000)
parser.add_argument('--actor-lr', type=float, default=3e-4)
parser.add_argument('--critic-lr', type=float, default=1e-3)
parser.add_argument('--gamma', type=float, default=0.99)
parser.add_argument('--tau', type=float, default=0.005)
parser.add_argument('--exploration-noise', type=float, default=0.1)
parser.add_argument('--policy-noise', type=float, default=0.2)
parser.add_argument('--noise-clip', type=float, default=0.5)
parser.add_argument('--update-actor-freq', type=int, default=2)
parser.add_argument('--epoch', type=int, default=100)
parser.add_argument('--step-per-epoch', type=int, default=2400)
parser.add_argument('--collect-per-step', type=int, default=10)
parser.add_argument('--batch-size', type=int, default=128)
parser.add_argument('--layer-num', type=int, default=1)
parser.add_argument('--training-num', type=int, default=8)
parser.add_argument('--test-num', type=int, default=100)
parser.add_argument('--logdir', type=str, default='log')
parser.add_argument('--render', type=float, default=0.)
parser.add_argument(
'--device', type=str,
default='cuda' if torch.cuda.is_available() else 'cpu')
args = parser.parse_known_args()[0]
return args
def test_td3(args=get_args()):
env = gym.make(args.task)
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
args.max_action = env.action_space.high[0]
# train_envs = gym.make(args.task)
train_envs = VectorEnv(
[lambda: gym.make(args.task) for _ in range(args.training_num)])
# test_envs = gym.make(args.task)
test_envs = SubprocVectorEnv(
[lambda: gym.make(args.task) for _ in range(args.test_num)])
# seed
np.random.seed(args.seed)
torch.manual_seed(args.seed)
train_envs.seed(args.seed)
test_envs.seed(args.seed)
# model
actor = Actor(
args.layer_num, args.state_shape, args.action_shape,
args.max_action, args.device
).to(args.device)
actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
critic1 = Critic(
args.layer_num, args.state_shape, args.action_shape, args.device
).to(args.device)
critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
critic2 = Critic(
args.layer_num, args.state_shape, args.action_shape, args.device
).to(args.device)
critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
policy = TD3Policy(
actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
args.tau, args.gamma, args.exploration_noise, args.policy_noise,
args.update_actor_freq, args.noise_clip,
[env.action_space.low[0], env.action_space.high[0]],
reward_normalization=True, ignore_done=True)
# collector
train_collector = Collector(
policy, train_envs, ReplayBuffer(args.buffer_size))
test_collector = Collector(policy, test_envs)
# train_collector.collect(n_step=args.buffer_size)
# log
writer = SummaryWriter(args.logdir + '/' + 'td3')
def stop_fn(x):
return x >= env.spec.reward_threshold
# trainer
result = offpolicy_trainer(
policy, train_collector, test_collector, args.epoch,
args.step_per_epoch, args.collect_per_step, args.test_num,
args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
assert stop_fn(result['best_reward'])
train_collector.close()
test_collector.close()
if __name__ == '__main__':
pprint.pprint(result)
# Let's watch its performance!
env = gym.make(args.task)
collector = Collector(policy, env)
result = collector.collect(n_episode=1, render=args.render)
print(f'Final reward: {result["rew"]}, length: {result["len"]}')
collector.close()
if __name__ == '__main__':
test_td3()

examples/continuous_net.py (new file)

@@ -0,0 +1,79 @@
import torch
import numpy as np
from torch import nn
class Actor(nn.Module):
def __init__(self, layer_num, state_shape, action_shape,
max_action, device='cpu'):
super().__init__()
self.device = device
self.model = [
nn.Linear(np.prod(state_shape), 128),
nn.ReLU(inplace=True)]
for i in range(layer_num):
self.model += [nn.Linear(128, 128), nn.ReLU(inplace=True)]
self.model += [nn.Linear(128, np.prod(action_shape))]
self.model = nn.Sequential(*self.model)
self._max = max_action
def forward(self, s, **kwargs):
s = torch.tensor(s, device=self.device, dtype=torch.float)
batch = s.shape[0]
s = s.view(batch, -1)
logits = self.model(s)
logits = self._max * torch.tanh(logits)
return logits, None
class ActorProb(nn.Module):
def __init__(self, layer_num, state_shape, action_shape,
max_action, device='cpu'):
super().__init__()
self.device = device
self.model = [
nn.Linear(np.prod(state_shape), 128),
nn.ReLU(inplace=True)]
for i in range(layer_num):
self.model += [nn.Linear(128, 128), nn.ReLU(inplace=True)]
self.model = nn.Sequential(*self.model)
self.mu = nn.Linear(128, np.prod(action_shape))
self.sigma = nn.Linear(128, np.prod(action_shape))
self._max = max_action
def forward(self, s, **kwargs):
if not isinstance(s, torch.Tensor):
s = torch.tensor(s, device=self.device, dtype=torch.float)
batch = s.shape[0]
s = s.view(batch, -1)
logits = self.model(s)
mu = self._max * torch.tanh(self.mu(logits))
sigma = torch.exp(self.sigma(logits))
return (mu, sigma), None
class Critic(nn.Module):
def __init__(self, layer_num, state_shape, action_shape=0, device='cpu'):
super().__init__()
self.device = device
self.model = [
nn.Linear(np.prod(state_shape) + np.prod(action_shape), 128),
nn.ReLU(inplace=True)]
for i in range(layer_num):
self.model += [nn.Linear(128, 128), nn.ReLU(inplace=True)]
self.model += [nn.Linear(128, 1)]
self.model = nn.Sequential(*self.model)
def forward(self, s, a=None):
if not isinstance(s, torch.Tensor):
s = torch.tensor(s, device=self.device, dtype=torch.float)
if a is not None and not isinstance(a, torch.Tensor):
a = torch.tensor(a, device=self.device, dtype=torch.float)
batch = s.shape[0]
s = s.view(batch, -1)
if a is None:
logits = self.model(s)
else:
a = a.view(batch, -1)
logits = self.model(torch.cat([s, a], dim=1))
return logits
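
For reference, a minimal forward-pass check of the networks above (a sketch, not part of the commit; it assumes this file is importable as `continuous_net`, the module name used by the example scripts, and uses Ant-v2's 111-dimensional observations and 8-dimensional actions):

import numpy as np
import torch

from continuous_net import Actor, Critic  # module name assumed from the examples above

obs = np.random.randn(4, 111).astype(np.float32)  # Ant-v2 observations are 111-dimensional
act = np.random.randn(4, 8).astype(np.float32)    # Ant-v2 actions are 8-dimensional

actor = Actor(layer_num=1, state_shape=(111,), action_shape=(8,), max_action=1.0)
critic = Critic(layer_num=1, state_shape=(111,), action_shape=(8,))

with torch.no_grad():
    logits, _ = actor(obs)  # actions squashed into [-max_action, max_action] by tanh
    q = critic(obs, act)    # state-action value, shape (4, 1)
print(logits.shape, q.shape)  # torch.Size([4, 8]) torch.Size([4, 1])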

examples/discrete_net.py (new file)

@@ -0,0 +1,81 @@
import torch
import numpy as np
from torch import nn
import torch.nn.functional as F
class Net(nn.Module):
def __init__(self, layer_num, state_shape, action_shape=0, device='cpu'):
super().__init__()
self.device = device
self.model = [
nn.Linear(np.prod(state_shape), 128),
nn.ReLU(inplace=True)]
for i in range(layer_num):
self.model += [nn.Linear(128, 128), nn.ReLU(inplace=True)]
if action_shape:
self.model += [nn.Linear(128, np.prod(action_shape))]
self.model = nn.Sequential(*self.model)
def forward(self, s, state=None, info={}):
if not isinstance(s, torch.Tensor):
s = torch.tensor(s, device=self.device, dtype=torch.float)
batch = s.shape[0]
s = s.view(batch, -1)
logits = self.model(s)
return logits, state
class Actor(nn.Module):
def __init__(self, preprocess_net, action_shape):
super().__init__()
self.preprocess = preprocess_net
self.last = nn.Linear(128, np.prod(action_shape))
def forward(self, s, state=None, info={}):
logits, h = self.preprocess(s, state)
logits = F.softmax(self.last(logits), dim=-1)
return logits, h
class Critic(nn.Module):
def __init__(self, preprocess_net):
super().__init__()
self.preprocess = preprocess_net
self.last = nn.Linear(128, 1)
def forward(self, s):
logits, h = self.preprocess(s, None)
logits = self.last(logits)
return logits
class DQN(nn.Module):
def __init__(self, h, w, action_shape, device='cpu'):
super(DQN, self).__init__()
self.device = device
self.conv1 = nn.Conv2d(1, 16, kernel_size=5, stride=2)
self.bn1 = nn.BatchNorm2d(16)
self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2)
self.bn2 = nn.BatchNorm2d(32)
self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2)
self.bn3 = nn.BatchNorm2d(32)
def conv2d_size_out(size, kernel_size=5, stride=2):
return (size - (kernel_size - 1) - 1) // stride + 1
convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w)))
convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h)))
linear_input_size = convw * convh * 32
self.head = nn.Linear(linear_input_size, action_shape)
def forward(self, x, state=None, info={}):
if not isinstance(x, torch.Tensor):
x = torch.tensor(x, device=self.device, dtype=torch.float)
x = x.permute(0, 3, 1, 2)
x = F.relu(self.bn1(self.conv1(x)))
x = F.relu(self.bn2(self.conv2(x)))
x = F.relu(self.bn3(self.conv3(x)))
return self.head(x.view(x.size(0), -1)), state
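
As a quick sanity check of the sizing logic above (not part of the commit): for a standard 84x84 Atari frame, three 5x5/stride-2 convolutions shrink each spatial dimension 84 -> 40 -> 18 -> 7, so the linear head receives 7 * 7 * 32 = 1568 features.

def conv2d_size_out(size, kernel_size=5, stride=2):
    # same formula as in DQN.__init__ above
    return (size - (kernel_size - 1) - 1) // stride + 1

w = h = 84
for _ in range(3):
    w, h = conv2d_size_out(w), conv2d_size_out(h)
print(w, h, w * h * 32)  # 7 7 1568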

examples/point_maze_td3.py (new file)

@@ -0,0 +1,119 @@
import gym
import torch
import pprint
import argparse
import numpy as np
from torch.utils.tensorboard import SummaryWriter
from tianshou.policy import TD3Policy
from tianshou.trainer import offpolicy_trainer
from tianshou.data import Collector, ReplayBuffer
from tianshou.env import VectorEnv, SubprocVectorEnv
if __name__ == '__main__':
from continuous_net import Actor, Critic
else: # pytest
from test.continuous.net import Actor, Critic
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='PointMaze-v0')
parser.add_argument('--seed', type=int, default=1626)
parser.add_argument('--buffer-size', type=int, default=20000)
parser.add_argument('--actor-lr', type=float, default=3e-5)
parser.add_argument('--critic-lr', type=float, default=1e-4)
parser.add_argument('--gamma', type=float, default=0.99)
parser.add_argument('--tau', type=float, default=0.005)
parser.add_argument('--exploration-noise', type=float, default=0.1)
parser.add_argument('--policy-noise', type=float, default=0.2)
parser.add_argument('--noise-clip', type=float, default=0.5)
parser.add_argument('--update-actor-freq', type=int, default=2)
parser.add_argument('--epoch', type=int, default=100)
parser.add_argument('--step-per-epoch', type=int, default=2400)
parser.add_argument('--collect-per-step', type=int, default=10)
parser.add_argument('--batch-size', type=int, default=128)
parser.add_argument('--layer-num', type=int, default=1)
parser.add_argument('--training-num', type=int, default=8)
parser.add_argument('--test-num', type=int, default=100)
parser.add_argument('--logdir', type=str, default='log')
parser.add_argument('--render', type=float, default=0.)
parser.add_argument(
'--device', type=str,
default='cuda' if torch.cuda.is_available() else 'cpu')
parser.add_argument('--max_episode_steps', type=int, default=2000)
args = parser.parse_known_args()[0]
return args
def test_td3(args=get_args()):
env = gym.make(args.task)
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
args.max_action = env.action_space.high[0]
# train_envs = gym.make(args.task)
train_envs = VectorEnv(
[lambda: gym.make(args.task) for _ in range(args.training_num)])
# test_envs = gym.make(args.task)
test_envs = SubprocVectorEnv(
[lambda: gym.make(args.task) for _ in range(args.test_num)])
# seed
np.random.seed(args.seed)
torch.manual_seed(args.seed)
train_envs.seed(args.seed)
test_envs.seed(args.seed)
# model
actor = Actor(
args.layer_num, args.state_shape, args.action_shape,
args.max_action, args.device
).to(args.device)
actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
critic1 = Critic(
args.layer_num, args.state_shape, args.action_shape, args.device
).to(args.device)
critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
critic2 = Critic(
args.layer_num, args.state_shape, args.action_shape, args.device
).to(args.device)
critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
policy = TD3Policy(
actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
args.tau, args.gamma, args.exploration_noise, args.policy_noise,
args.update_actor_freq, args.noise_clip,
[env.action_space.low[0], env.action_space.high[0]],
reward_normalization=True, ignore_done=True)
# collector
train_collector = Collector(
policy, train_envs, ReplayBuffer(args.buffer_size))
test_collector = Collector(policy, test_envs)
# train_collector.collect(n_step=args.buffer_size)
# log
writer = SummaryWriter(args.logdir + '/' + 'td3')
def stop_fn(x):
if env.spec.reward_threshold:
return x >= env.spec.reward_threshold
else:
return False
# trainer
result = offpolicy_trainer(
policy, train_collector, test_collector, args.epoch,
args.step_per_epoch, args.collect_per_step, args.test_num,
args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
assert stop_fn(result['best_reward'])
train_collector.close()
test_collector.close()
if __name__ == '__main__':
pprint.pprint(result)
# Let's watch its performance!
env = gym.make(args.task)
collector = Collector(policy, env)
result = collector.collect(n_step=1000, render=args.render)
print(f'Final reward: {result["rew"]}, length: {result["len"]}')
collector.close()
if __name__ == '__main__':
test_td3()
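
The script expects a 'PointMaze-v0' entry in the gym registry, provided by the point robot maze environment this commit mentions. For reference, a sketch of how such an entry is typically registered; the entry_point module path here is hypothetical:

from gym.envs.registration import register

register(
    id='PointMaze-v0',
    entry_point='point_maze.envs:PointMazeEnv',  # hypothetical package path
    max_episode_steps=2000,  # matches the --max_episode_steps default above
)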

examples/pong_a2c.py (new file)

@@ -0,0 +1,108 @@
import gym
import torch
import pprint
import argparse
import numpy as np
from torch.utils.tensorboard import SummaryWriter
from tianshou.policy import A2CPolicy
from tianshou.env import SubprocVectorEnv
from tianshou.trainer import onpolicy_trainer
from tianshou.data import Collector, ReplayBuffer
from tianshou.env.atari import create_atari_environment
if __name__ == '__main__':
from discrete_net import Net, Actor, Critic
else: # pytest
from test.discrete.net import Net, Actor, Critic
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='Pong')
parser.add_argument('--seed', type=int, default=1626)
parser.add_argument('--buffer-size', type=int, default=20000)
parser.add_argument('--lr', type=float, default=3e-4)
parser.add_argument('--gamma', type=float, default=0.9)
parser.add_argument('--epoch', type=int, default=100)
parser.add_argument('--step-per-epoch', type=int, default=1000)
parser.add_argument('--collect-per-step', type=int, default=100)
parser.add_argument('--repeat-per-collect', type=int, default=1)
parser.add_argument('--batch-size', type=int, default=64)
parser.add_argument('--layer-num', type=int, default=2)
parser.add_argument('--training-num', type=int, default=8)
parser.add_argument('--test-num', type=int, default=8)
parser.add_argument('--logdir', type=str, default='log')
parser.add_argument('--render', type=float, default=0.)
parser.add_argument(
'--device', type=str,
default='cuda' if torch.cuda.is_available() else 'cpu')
# a2c special
parser.add_argument('--vf-coef', type=float, default=0.5)
parser.add_argument('--ent-coef', type=float, default=0.001)
parser.add_argument('--max-grad-norm', type=float, default=None)
parser.add_argument('--max_episode_steps', type=int, default=2000)
args = parser.parse_known_args()[0]
return args
def test_a2c(args=get_args()):
env = create_atari_environment(args.task, max_episode_steps=args.max_episode_steps)
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.env.action_space.shape or env.env.action_space.n
# train_envs = gym.make(args.task)
train_envs = SubprocVectorEnv(
[lambda: create_atari_environment(args.task, max_episode_steps=args.max_episode_steps) for _ in
range(args.training_num)])
# test_envs = gym.make(args.task)
test_envs = SubprocVectorEnv(
[lambda: create_atari_environment(args.task, max_episode_steps=args.max_episode_steps) for _ in
range(args.test_num)])
# seed
np.random.seed(args.seed)
torch.manual_seed(args.seed)
train_envs.seed(args.seed)
test_envs.seed(args.seed)
# model
net = Net(args.layer_num, args.state_shape, device=args.device)
actor = Actor(net, args.action_shape).to(args.device)
critic = Critic(net).to(args.device)
optim = torch.optim.Adam(list(
actor.parameters()) + list(critic.parameters()), lr=args.lr)
dist = torch.distributions.Categorical
policy = A2CPolicy(
actor, critic, optim, dist, args.gamma, vf_coef=args.vf_coef,
ent_coef=args.ent_coef, max_grad_norm=args.max_grad_norm)
# collector
train_collector = Collector(
policy, train_envs, ReplayBuffer(args.buffer_size))
test_collector = Collector(policy, test_envs)
# log
writer = SummaryWriter(args.logdir + '/' + 'a2c')
def stop_fn(x):
if env.env.spec.reward_threshold:
return x >= env.env.spec.reward_threshold
else:
return False
# trainer
result = onpolicy_trainer(
policy, train_collector, test_collector, args.epoch,
args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
train_collector.close()
test_collector.close()
if __name__ == '__main__':
pprint.pprint(result)
# Let's watch its performance!
env = create_atari_environment(args.task)
collector = Collector(policy, env)
result = collector.collect(n_episode=1, render=args.render)
print(f'Final reward: {result["rew"]}, length: {result["len"]}')
collector.close()
if __name__ == '__main__':
test_a2c()
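
The Atari helper is called above with just a task name and an optional max_episode_steps. A small inspection sketch (attribute layout assumed from the code above, which reads observation_space directly but reaches the action space through .env):

from tianshou.env.atari import create_atari_environment

env = create_atari_environment('Pong', max_episode_steps=2000)
print(env.observation_space.shape)  # frame shape fed to the Net/DQN models
print(env.env.action_space.n)       # number of discrete actions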

examples/pong_dqn.py (new file)

@@ -0,0 +1,112 @@
import gym
import torch
import pprint
import argparse
import numpy as np
from torch.utils.tensorboard import SummaryWriter
from tianshou.policy import DQNPolicy
from tianshou.env import SubprocVectorEnv
from tianshou.trainer import offpolicy_trainer
from tianshou.data import Collector, ReplayBuffer
from tianshou.env.atari import create_atari_environment
if __name__ == '__main__':
from discrete_net import DQN
else: # pytest
from test.discrete.net import DQN
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='Pong')
parser.add_argument('--seed', type=int, default=1626)
parser.add_argument('--eps-test', type=float, default=0.05)
parser.add_argument('--eps-train', type=float, default=0.1)
parser.add_argument('--buffer-size', type=int, default=20000)
parser.add_argument('--lr', type=float, default=1e-3)
parser.add_argument('--gamma', type=float, default=0.9)
parser.add_argument('--n-step', type=int, default=1)
parser.add_argument('--target-update-freq', type=int, default=320)
parser.add_argument('--epoch', type=int, default=100)
parser.add_argument('--step-per-epoch', type=int, default=1000)
parser.add_argument('--collect-per-step', type=int, default=10)
parser.add_argument('--batch-size', type=int, default=64)
parser.add_argument('--layer-num', type=int, default=3)
parser.add_argument('--training-num', type=int, default=8)
parser.add_argument('--test-num', type=int, default=8)
parser.add_argument('--logdir', type=str, default='log')
parser.add_argument('--render', type=float, default=0.)
parser.add_argument(
'--device', type=str,
default='cuda' if torch.cuda.is_available() else 'cpu')
args = parser.parse_known_args()[0]
return args
def test_dqn(args=get_args()):
env = create_atari_environment(args.task)
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.env.action_space.shape or env.env.action_space.n
# train_envs = gym.make(args.task)
train_envs = SubprocVectorEnv(
[lambda: create_atari_environment(args.task) for _ in range(args.training_num)])
# test_envs = gym.make(args.task)
test_envs = SubprocVectorEnv(
[lambda: create_atari_environment(args.task) for _ in range(args.test_num)])
# seed
np.random.seed(args.seed)
torch.manual_seed(args.seed)
train_envs.seed(args.seed)
test_envs.seed(args.seed)
# model
net = DQN(args.state_shape[0], args.state_shape[1], args.action_shape, args.device)
net = net.to(args.device)
optim = torch.optim.Adam(net.parameters(), lr=args.lr)
policy = DQNPolicy(
net, optim, args.gamma, args.n_step,
use_target_network=args.target_update_freq > 0,
target_update_freq=args.target_update_freq)
# collector
train_collector = Collector(
policy, train_envs, ReplayBuffer(args.buffer_size))
test_collector = Collector(policy, test_envs)
# policy.set_eps(1)
train_collector.collect(n_step=args.batch_size * 4)
print(len(train_collector.buffer))
# log
writer = SummaryWriter(args.logdir + '/' + 'dqn')
def stop_fn(x):
if env.env.spec.reward_threshold:
return x >= env.env.spec.reward_threshold
else:
return False
def train_fn(x):
policy.set_eps(args.eps_train)
def test_fn(x):
policy.set_eps(args.eps_test)
# trainer
result = offpolicy_trainer(
policy, train_collector, test_collector, args.epoch,
args.step_per_epoch, args.collect_per_step, args.test_num,
args.batch_size, train_fn=train_fn, test_fn=test_fn,
stop_fn=stop_fn, writer=writer, task=args.task)
train_collector.close()
test_collector.close()
if __name__ == '__main__':
pprint.pprint(result)
# Let's watch its performance!
env = create_atari_environment(args.task)
collector = Collector(policy, env)
result = collector.collect(n_episode=1, render=args.render)
print(f'Final reward: {result["rew"]}, length: {result["len"]}')
collector.close()
if __name__ == '__main__':
test_dqn(get_args())
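
The train_fn/test_fn hooks above set a fixed exploration rate each epoch. A common extension (not part of this commit) is to anneal epsilon over training; a sketch reusing `policy` and `args` from test_dqn above, assuming the trainer passes the current epoch index to the hook:

def train_fn_annealed(epoch):
    # decay from eps_train towards eps_test by 10% per epoch (schedule is illustrative)
    policy.set_eps(max(args.eps_test, args.eps_train * 0.9 ** epoch))

Passing train_fn=train_fn_annealed to offpolicy_trainer instead of the fixed-epsilon train_fn keeps the rest of the script unchanged.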

examples/pong_ppo.py (new file)

@@ -0,0 +1,112 @@
import gym
import torch
import pprint
import argparse
import numpy as np
from torch.utils.tensorboard import SummaryWriter
from tianshou.policy import PPOPolicy
from tianshou.env import SubprocVectorEnv
from tianshou.trainer import onpolicy_trainer
from tianshou.data import Collector, ReplayBuffer
from tianshou.env.atari import create_atari_environment
if __name__ == '__main__':
from discrete_net import Net, Actor, Critic
else: # pytest
from test.discrete.net import Net, Actor, Critic
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='Pong')
parser.add_argument('--seed', type=int, default=1626)
parser.add_argument('--buffer-size', type=int, default=20000)
parser.add_argument('--lr', type=float, default=1e-3)
parser.add_argument('--gamma', type=float, default=0.99)
parser.add_argument('--epoch', type=int, default=100)
parser.add_argument('--step-per-epoch', type=int, default=1000)
parser.add_argument('--collect-per-step', type=int, default=100)
parser.add_argument('--repeat-per-collect', type=int, default=2)
parser.add_argument('--batch-size', type=int, default=64)
parser.add_argument('--layer-num', type=int, default=1)
parser.add_argument('--training-num', type=int, default=8)
parser.add_argument('--test-num', type=int, default=8)
parser.add_argument('--logdir', type=str, default='log')
parser.add_argument('--render', type=float, default=0.)
parser.add_argument(
'--device', type=str,
default='cuda' if torch.cuda.is_available() else 'cpu')
# ppo special
parser.add_argument('--vf-coef', type=float, default=0.5)
parser.add_argument('--ent-coef', type=float, default=0.0)
parser.add_argument('--eps-clip', type=float, default=0.2)
parser.add_argument('--max-grad-norm', type=float, default=0.5)
parser.add_argument('--max_episode_steps', type=int, default=2000)
args = parser.parse_known_args()[0]
return args
def test_ppo(args=get_args()):
env = create_atari_environment(args.task, max_episode_steps=args.max_episode_steps)
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.env.action_space.shape or env.env.action_space.n
# train_envs = gym.make(args.task)
train_envs = SubprocVectorEnv(
[lambda: create_atari_environment(args.task, max_episode_steps=args.max_episode_steps) for _ in
range(args.training_num)])
# test_envs = gym.make(args.task)
test_envs = SubprocVectorEnv(
[lambda: create_atari_environment(args.task, max_episode_steps=args.max_episode_steps) for _ in
range(args.test_num)])
# seed
np.random.seed(args.seed)
torch.manual_seed(args.seed)
train_envs.seed(args.seed)
test_envs.seed(args.seed)
# model
net = Net(args.layer_num, args.state_shape, device=args.device)
actor = Actor(net, args.action_shape).to(args.device)
critic = Critic(net).to(args.device)
optim = torch.optim.Adam(list(
actor.parameters()) + list(critic.parameters()), lr=args.lr)
dist = torch.distributions.Categorical
policy = PPOPolicy(
actor, critic, optim, dist, args.gamma,
max_grad_norm=args.max_grad_norm,
eps_clip=args.eps_clip,
vf_coef=args.vf_coef,
ent_coef=args.ent_coef,
action_range=None)
# collector
train_collector = Collector(
policy, train_envs, ReplayBuffer(args.buffer_size))
test_collector = Collector(policy, test_envs)
# log
writer = SummaryWriter(args.logdir + '/' + 'ppo')
def stop_fn(x):
if env.env.spec.reward_threshold:
return x >= env.env.spec.reward_threshold
else:
return False
# trainer
result = onpolicy_trainer(
policy, train_collector, test_collector, args.epoch,
args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
train_collector.close()
test_collector.close()
if __name__ == '__main__':
pprint.pprint(result)
# Let's watch its performance!
env = create_atari_environment(args.task)
collector = Collector(policy, env)
result = collector.collect(n_step=2000, render=args.render)
print(f'Final reward: {result["rew"]}, length: {result["len"]}')
collector.close()
if __name__ == '__main__':
test_ppo()
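
The discrete Actor above ends in a softmax, and the Pong scripts hand those probabilities to torch.distributions.Categorical. A small self-contained sketch of that interface (values are illustrative):

import torch

probs = torch.tensor([[0.1, 0.2, 0.3, 0.4]])  # one observation, four discrete actions
dist = torch.distributions.Categorical(probs)
action = dist.sample()            # e.g. tensor([3])
log_prob = dist.log_prob(action)  # log-probability used by the policy-gradient losses
print(action, log_prob)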

@@ -55,6 +55,7 @@ setup(
        ],
        'atari': [
            'atari_py',
+            'cv2'
        ],
        'mujoco': [
            'mujoco_py',

@@ -34,6 +34,7 @@ def get_args():
    parser.add_argument('--training-num', type=int, default=8)
    parser.add_argument('--test-num', type=int, default=100)
    parser.add_argument('--logdir', type=str, default='log')
+    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument(
        '--device', type=str,
        default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -79,7 +80,7 @@ def test_ddpg(args=get_args()):
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
-    writer = SummaryWriter(args.logdir)
+    writer = SummaryWriter(args.logdir + '/' + 'ddpg')

    def stop_fn(x):
        return x >= env.spec.reward_threshold
@@ -88,7 +89,7 @@ def test_ddpg(args=get_args()):
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
-        args.batch_size, stop_fn=stop_fn, writer=writer)
+        args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
@@ -97,7 +98,7 @@ def test_ddpg(args=get_args()):
    # Let's watch its performance!
    env = gym.make(args.task)
    collector = Collector(policy, env)
-    result = collector.collect(n_episode=1, render=1 / 35)
+    result = collector.collect(n_episode=1, render=args.render)
    print(f'Final reward: {result["rew"]}, length: {result["len"]}')
    collector.close()

@@ -32,6 +32,7 @@ def get_args():
    parser.add_argument('--training-num', type=int, default=16)
    parser.add_argument('--test-num', type=int, default=100)
    parser.add_argument('--logdir', type=str, default='log')
+    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument(
        '--device', type=str,
        default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -87,7 +88,7 @@ def _test_ppo(args=get_args()):
    test_collector = Collector(policy, test_envs)
    train_collector.collect(n_step=args.step_per_epoch)
    # log
-    writer = SummaryWriter(args.logdir)
+    writer = SummaryWriter(args.logdir + '/' + 'ppo')

    def stop_fn(x):
        return x >= env.spec.reward_threshold
@@ -96,7 +97,7 @@ def _test_ppo(args=get_args()):
    result = onpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
-        args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer)
+        args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
@@ -105,7 +106,7 @@ def _test_ppo(args=get_args()):
    # Let's watch its performance!
    env = gym.make(args.task)
    collector = Collector(policy, env)
-    result = collector.collect(n_episode=1, render=1 / 35)
+    result = collector.collect(n_episode=1, render=args.render)
    print(f'Final reward: {result["rew"]}, length: {result["len"]}')
    collector.close()

@@ -34,6 +34,7 @@ def get_args():
    parser.add_argument('--training-num', type=int, default=8)
    parser.add_argument('--test-num', type=int, default=100)
    parser.add_argument('--logdir', type=str, default='log')
+    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument(
        '--device', type=str,
        default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -84,7 +85,7 @@ def test_sac(args=get_args()):
    test_collector = Collector(policy, test_envs)
    # train_collector.collect(n_step=args.buffer_size)
    # log
-    writer = SummaryWriter(args.logdir)
+    writer = SummaryWriter(args.logdir + '/' + 'sac')

    def stop_fn(x):
        return x >= env.spec.reward_threshold
@@ -93,7 +94,7 @@ def test_sac(args=get_args()):
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
-        args.batch_size, stop_fn=stop_fn, writer=writer)
+        args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
@@ -102,7 +103,7 @@ def test_sac(args=get_args()):
    # Let's watch its performance!
    env = gym.make(args.task)
    collector = Collector(policy, env)
-    result = collector.collect(n_episode=1, render=1 / 35)
+    result = collector.collect(n_episode=1, render=args.render)
    print(f'Final reward: {result["rew"]}, length: {result["len"]}')
    collector.close()

@@ -37,6 +37,7 @@ def get_args():
    parser.add_argument('--training-num', type=int, default=8)
    parser.add_argument('--test-num', type=int, default=100)
    parser.add_argument('--logdir', type=str, default='log')
+    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument(
        '--device', type=str,
        default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -88,7 +89,7 @@ def test_td3(args=get_args()):
    test_collector = Collector(policy, test_envs)
    # train_collector.collect(n_step=args.buffer_size)
    # log
-    writer = SummaryWriter(args.logdir)
+    writer = SummaryWriter(args.logdir + '/' + 'td3')

    def stop_fn(x):
        return x >= env.spec.reward_threshold
@@ -97,7 +98,7 @@ def test_td3(args=get_args()):
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
-        args.batch_size, stop_fn=stop_fn, writer=writer)
+        args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
@@ -106,7 +107,7 @@ def test_td3(args=get_args()):
    # Let's watch its performance!
    env = gym.make(args.task)
    collector = Collector(policy, env)
-    result = collector.collect(n_episode=1, render=1 / 35)
+    result = collector.collect(n_episode=1, render=args.render)
    print(f'Final reward: {result["rew"]}, length: {result["len"]}')
    collector.close()

@@ -48,3 +48,33 @@ class Critic(nn.Module):
        logits, h = self.preprocess(s, None)
        logits = self.last(logits)
        return logits
+
+
+class DQN(nn.Module):
+
+    def __init__(self, h, w, action_shape, device='cpu'):
+        super(DQN, self).__init__()
+        self.device = device
+
+        self.conv1 = nn.Conv2d(3, 16, kernel_size=5, stride=2)
+        self.bn1 = nn.BatchNorm2d(16)
+        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2)
+        self.bn2 = nn.BatchNorm2d(32)
+        self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2)
+        self.bn3 = nn.BatchNorm2d(32)
+
+        def conv2d_size_out(size, kernel_size=5, stride=2):
+            return (size - (kernel_size - 1) - 1) // stride + 1
+
+        convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w)))
+        convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h)))
+        linear_input_size = convw * convh * 32
+        self.head = nn.Linear(linear_input_size, action_shape)
+
+    def forward(self, x, state=None, info={}):
+        if not isinstance(x, torch.Tensor):
+            x = torch.tensor(x, device=self.device, dtype=torch.float)
+        x = F.relu(self.bn1(self.conv1(x)))
+        x = F.relu(self.bn2(self.conv2(x)))
+        x = F.relu(self.bn3(self.conv3(x)))
+        return self.head(x.view(x.size(0), -1)), state

@@ -32,6 +32,8 @@ def get_args():
    parser.add_argument('--training-num', type=int, default=32)
    parser.add_argument('--test-num', type=int, default=100)
    parser.add_argument('--logdir', type=str, default='log')
+    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument(
        '--device', type=str,
        default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -73,7 +75,7 @@ def test_a2c(args=get_args()):
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
-    writer = SummaryWriter(args.logdir)
+    writer = SummaryWriter(args.logdir + '/' + 'ppo')

    def stop_fn(x):
        return x >= env.spec.reward_threshold
@@ -82,7 +84,7 @@ def test_a2c(args=get_args()):
    result = onpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
-        args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer)
+        args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
@@ -91,7 +93,7 @@ def test_a2c(args=get_args()):
    # Let's watch its performance!
    env = gym.make(args.task)
    collector = Collector(policy, env)
-    result = collector.collect(n_episode=1, render=1 / 35)
+    result = collector.collect(n_episode=1, render=args.render)
    print(f'Final reward: {result["rew"]}, length: {result["len"]}')
    collector.close()

@@ -35,6 +35,7 @@ def get_args():
    parser.add_argument('--training-num', type=int, default=8)
    parser.add_argument('--test-num', type=int, default=100)
    parser.add_argument('--logdir', type=str, default='log')
+    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument(
        '--device', type=str,
        default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -73,7 +74,7 @@ def test_dqn(args=get_args()):
    train_collector.collect(n_step=args.batch_size)
    print(len(train_collector.buffer))
    # log
-    writer = SummaryWriter(args.logdir)
+    writer = SummaryWriter(args.logdir + '/' + 'ppo')

    def stop_fn(x):
        return x >= env.spec.reward_threshold
@@ -89,7 +90,7 @@ def test_dqn(args=get_args()):
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, train_fn=train_fn, test_fn=test_fn,
-        stop_fn=stop_fn, writer=writer)
+        stop_fn=stop_fn, writer=writer, task=args.task)
    assert stop_fn(result['best_reward'])
    train_collector.close()
@@ -99,7 +100,7 @@ def test_dqn(args=get_args()):
    # Let's watch its performance!
    env = gym.make(args.task)
    collector = Collector(policy, env)
-    result = collector.collect(n_episode=1, render=1 / 35)
+    result = collector.collect(n_episode=1, render=args.render)
    print(f'Final reward: {result["rew"]}, length: {result["len"]}')
    collector.close()

@@ -86,6 +86,7 @@ def get_args():
    parser.add_argument('--training-num', type=int, default=8)
    parser.add_argument('--test-num', type=int, default=100)
    parser.add_argument('--logdir', type=str, default='log')
+    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument(
        '--device', type=str,
        default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -121,7 +122,7 @@ def test_pg(args=get_args()):
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
-    writer = SummaryWriter(args.logdir)
+    writer = SummaryWriter(args.logdir + '/' + 'ppo')

    def stop_fn(x):
        return x >= env.spec.reward_threshold
@@ -130,7 +131,7 @@ def test_pg(args=get_args()):
    result = onpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
-        args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer)
+        args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
@@ -139,7 +140,7 @@ def test_pg(args=get_args()):
    # Let's watch its performance!
    env = gym.make(args.task)
    collector = Collector(policy, env)
-    result = collector.collect(n_episode=1, render=1 / 35)
+    result = collector.collect(n_episode=1, render=args.render)
    print(f'Final reward: {result["rew"]}, length: {result["len"]}')
    collector.close()

@@ -32,6 +32,7 @@ def get_args():
    parser.add_argument('--training-num', type=int, default=32)
    parser.add_argument('--test-num', type=int, default=100)
    parser.add_argument('--logdir', type=str, default='log')
+    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument(
        '--device', type=str,
        default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -78,7 +79,7 @@ def test_ppo(args=get_args()):
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
-    writer = SummaryWriter(args.logdir)
+    writer = SummaryWriter(args.logdir + '/' + 'ppo')

    def stop_fn(x):
        return x >= env.spec.reward_threshold
@@ -87,7 +88,7 @@ def test_ppo(args=get_args()):
    result = onpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
-        args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer)
+        args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
@@ -96,7 +97,7 @@ def test_ppo(args=get_args()):
    # Let's watch its performance!
    env = gym.make(args.task)
    collector = Collector(policy, env)
-    result = collector.collect(n_episode=1, render=1 / 35)
+    result = collector.collect(n_episode=1, render=args.render)
    print(f'Final reward: {result["rew"]}, length: {result["len"]}')
    collector.close()

@@ -2,7 +2,7 @@ import time
import torch
import numpy as np
from copy import deepcopy
+import warnings
from tianshou.env import BaseVectorEnv
from tianshou.data import Batch, ReplayBuffer
from tianshou.utils import MovAvg
@@ -87,6 +87,7 @@ class Collector(object):
        return np.array([data])

    def collect(self, n_step=0, n_episode=0, render=0):
+        warning_count = 0
        if not self._multi_env:
            n_episode = np.sum(n_episode)
        start_time = time.time()
@@ -97,6 +98,10 @@ class Collector(object):
        reward_sum = 0
        length_sum = 0
        while True:
+            if warning_count >= 100000:
+                warnings.warn(
+                    'There are already many steps in an episode. You should add a time limitation to your environment!',
+                    Warning)
            if self._multi_env:
                batch_data = Batch(
                    obs=self._obs, act=self._act, rew=self._rew,
@@ -131,11 +136,14 @@ class Collector(object):
                        'rew': self._rew[i], 'done': self._done[i],
                        'obs_next': obs_next[i], 'info': self._info[i]}
                    if self._cached_buf:
+                        warning_count += 1
                        self._cached_buf[i].add(**data)
                    elif self._multi_buf:
+                        warning_count += 1
                        self.buffer[i].add(**data)
                        cur_step += 1
                    else:
+                        warning_count += 1
                        self.buffer.add(**data)
                        cur_step += 1
                    if self._done[i]:
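
The warning above fires once roughly 100000 transitions have been stored during a single collect() call, which usually means the environment never reports done. One way to follow its advice (a sketch, assuming 'PointMaze-v0' has been registered as used by examples/point_maze_td3.py) is gym's TimeLimit wrapper:

import gym
from gym.wrappers import TimeLimit

# cap episodes at 2000 steps, matching the --max_episode_steps default in the examples
env = TimeLimit(gym.make('PointMaze-v0'), max_episode_steps=2000)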

@@ -39,6 +39,7 @@ class A2CPolicy(PGPolicy):
            a_loss = -(dist.log_prob(a) * (r - v).detach()).mean()
            vf_loss = F.mse_loss(r[:, None], v)
            ent_loss = dist.entropy().mean()
            loss = a_loss + self._w_vf * vf_loss - self._w_ent * ent_loss
            loss.backward()
            if self._grad_norm:

@@ -34,6 +34,9 @@ class PGPolicy(BasePolicy):
    def learn(self, batch, batch_size=None, repeat=1):
        losses = []
+        batch.returns = (batch.returns - batch.returns.mean()) \
+            / (batch.returns.std() + self._eps)
        r = batch.returns
        batch.returns = (r - r.mean()) / (r.std() + self._eps)
        for _ in range(repeat):

@@ -58,6 +58,9 @@ class PPOPolicy(PGPolicy):
    def learn(self, batch, batch_size=None, repeat=1):
        losses, clip_losses, vf_losses, ent_losses = [], [], [], []
+        batch.returns = (batch.returns - batch.returns.mean()) \
+            / (batch.returns.std() + self._eps)
        r = batch.returns
        batch.returns = (r - r.mean()) / (r.std() + self._eps)
        batch.act = torch.tensor(batch.act)
@@ -79,6 +82,7 @@ class PPOPolicy(PGPolicy):
                clip_losses.append(clip_loss.detach().cpu().numpy())
                vf_loss = F.smooth_l1_loss(self.critic(b.obs), target_v)
                vf_losses.append(vf_loss.detach().cpu().numpy())
                e_loss = dist.entropy().mean()
                ent_losses.append(e_loss.detach().cpu().numpy())
                loss = clip_loss + self._w_vf * vf_loss - self._w_ent * e_loss
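
The lines added to PGPolicy and PPOPolicy standardize the returns before the update; in isolation the operation looks like this (the concrete value of self._eps is assumed here, it only guards against a zero standard deviation):

import torch

returns = torch.tensor([1., 5., 10.])
eps = 1e-8  # stands in for self._eps
normalized = (returns - returns.mean()) / (returns.std() + eps)
# normalized now has zero mean and approximately unit standard deviation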

@@ -8,7 +8,7 @@ from tianshou.trainer import test_episode, gather_info
def offpolicy_trainer(policy, train_collector, test_collector, max_epoch,
                      step_per_epoch, collect_per_step, episode_per_test,
                      batch_size, train_fn=None, test_fn=None, stop_fn=None,
-                      writer=None, verbose=True):
+                      writer=None, verbose=True, task=''):
    global_step = 0
    best_epoch, best_reward = -1, -1
    stat = {}
@@ -47,7 +47,7 @@ def offpolicy_trainer(policy, train_collector, test_collector, max_epoch,
                    data[k] = f'{result[k]:.2f}'
                    if writer:
                        writer.add_scalar(
-                            k, result[k], global_step=global_step)
+                            k + '_' + task, result[k], global_step=global_step)
                for k in losses.keys():
                    if stat.get(k) is None:
                        stat[k] = MovAvg()
@@ -55,7 +55,7 @@ def offpolicy_trainer(policy, train_collector, test_collector, max_epoch,
                    data[k] = f'{stat[k].get():.6f}'
                    if writer:
                        writer.add_scalar(
-                            k, stat[k].get(), global_step=global_step)
+                            k + '_' + task, stat[k].get(), global_step=global_step)
                t.update(1)
                t.set_postfix(**data)
            if t.n <= t.total:

@@ -9,7 +9,7 @@ def onpolicy_trainer(policy, train_collector, test_collector, max_epoch,
                     step_per_epoch, collect_per_step, repeat_per_collect,
                     episode_per_test, batch_size,
                     train_fn=None, test_fn=None, stop_fn=None,
-                     writer=None, verbose=True):
+                     writer=None, verbose=True, task=''):
    global_step = 0
    best_epoch, best_reward = -1, -1
    stat = {}
@@ -52,15 +52,15 @@ def onpolicy_trainer(policy, train_collector, test_collector, max_epoch,
                    data[k] = f'{result[k]:.2f}'
                    if writer:
                        writer.add_scalar(
-                            k, result[k], global_step=global_step)
+                            k + '_' + task, result[k], global_step=global_step)
                for k in losses.keys():
                    if stat.get(k) is None:
                        stat[k] = MovAvg()
                    stat[k].add(losses[k])
                    data[k] = f'{stat[k].get():.6f}'
-                    if writer:
+                    if writer and global_step:
                        writer.add_scalar(
-                            k, stat[k].get(), global_step=global_step)
+                            k + '_' + task, stat[k].get(), global_step=global_step)
                t.update(step)
                t.set_postfix(**data)
            if t.n <= t.total:
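
With the `task` suffix above, scalars from different tasks no longer collide when they share a log directory. A small sketch of the resulting tag scheme (the tag name 'rew' follows the result keys printed by the examples; the values are illustrative):

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('log/ddpg')  # mirrors args.logdir + '/' + 'ddpg' in the examples
task = 'Ant-v2'
for step, rew in enumerate([-50.0, 120.0, 480.0]):
    writer.add_scalar('rew' + '_' + task, rew, global_step=step)  # shows up as 'rew_Ant-v2'
writer.close()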