Add files via upload

Flange authored 2024-04-20 01:23:05 +08:00, committed by GitHub
parent b60ebbefff
commit 1f7c319266
9 changed files with 2642 additions and 0 deletions

MuJoCo/ppo.py (new file, 311 lines)
@@ -0,0 +1,311 @@
import argparse
import os
import random
import time
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.normal import Normal
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
def get_args():
parser = argparse.ArgumentParser()
    parser.add_argument('--exp_name', type=str, default=os.path.splitext(os.path.basename(__file__))[0])
parser.add_argument('--seed', type=int, default=1)
parser.add_argument('--torch_deterministic', type=bool, default=True)
parser.add_argument('--cuda', type=bool, default=True)
parser.add_argument('--env_id', type=str, default='Humanoid-v4')
parser.add_argument('--total_time_steps', type=int, default=int(1e7))
parser.add_argument('--learning_rate', type=float, default=3e-4)
parser.add_argument('--num_envs', type=int, default=8)
parser.add_argument('--num_steps', type=int, default=256)
parser.add_argument('--anneal_lr', type=bool, default=True)
parser.add_argument('--gamma', type=float, default=0.99)
parser.add_argument('--gae_lambda', type=float, default=0.95)
parser.add_argument('--num_mini_batches', type=int, default=4)
parser.add_argument('--update_epochs', type=int, default=10)
parser.add_argument('--norm_adv', type=bool, default=True)
parser.add_argument('--clip_value_loss', type=bool, default=True)
parser.add_argument('--c_1', type=float, default=0.5)
parser.add_argument('--c_2', type=float, default=0.0)
parser.add_argument('--max_grad_norm', type=float, default=0.5)
parser.add_argument('--clip_epsilon', type=float, default=0.2)
a = parser.parse_args()
a.batch_size = int(a.num_envs * a.num_steps)
a.minibatch_size = int(a.batch_size // a.num_mini_batches)
return a
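# make_env builds one wrapped environment instance: actions are clipped to the valid
# range, observations are normalized and clipped to [-10, 10], and rewards are
# normalized (with discount gamma) and clipped to [-10, 10], the usual preprocessing
# for PPO-style agents on MuJoCo continuous-control tasks.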
def make_env(env_id, gamma):
def thunk():
env = gym.make(env_id)
env = gym.wrappers.FlattenObservation(env)
env = gym.wrappers.RecordEpisodeStatistics(env)
env = gym.wrappers.ClipAction(env)
env = gym.wrappers.NormalizeObservation(env)
env = gym.wrappers.TransformObservation(env, lambda o: np.clip(o, -10, 10))
env = gym.wrappers.NormalizeReward(env, gamma=gamma)
env = gym.wrappers.TransformReward(env, lambda r: float(np.clip(r, -10, 10)))
return env
return thunk
def layer_init(layer, s=np.sqrt(2), bias_const=0.0):
torch.nn.init.orthogonal_(layer.weight, s)
torch.nn.init.constant_(layer.bias, bias_const)
return layer
class Agent(nn.Module):
def __init__(self, e):
super().__init__()
self.critic = nn.Sequential(
layer_init(nn.Linear(np.array(e.single_observation_space.shape).prod(), 64)),
nn.Tanh(),
layer_init(nn.Linear(64, 64)),
nn.Tanh(),
layer_init(nn.Linear(64, 1), s=1.0),
)
self.actor_mean = nn.Sequential(
layer_init(nn.Linear(np.array(e.single_observation_space.shape).prod(), 64)),
nn.Tanh(),
layer_init(nn.Linear(64, 64)),
nn.Tanh(),
layer_init(nn.Linear(64, np.array(e.single_action_space.shape).prod()), s=0.01),
)
self.actor_log_std = nn.Parameter(torch.zeros(1, np.array(e.single_action_space.shape).prod()))
def get_value(self, x):
return self.critic(x)
def get_action_and_value(self, x, a=None, show_all=False):
action_mean = self.actor_mean(x)
action_log_std = self.actor_log_std.expand_as(action_mean)
action_std = torch.exp(action_log_std)
probs = Normal(action_mean, action_std)
if a is None:
a = probs.sample()
if show_all:
return a, probs.log_prob(a).sum(1), probs.entropy().sum(1), self.critic(x), probs
return a, probs.log_prob(a).sum(1), probs.entropy().sum(1), self.critic(x)
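# compute_kld below is the closed-form KL divergence between two univariate Gaussians,
# applied element-wise and summed over action dimensions by the caller:
#   KL(N(mu_1, sigma_1^2) || N(mu_2, sigma_2^2))
#     = log(sigma_2 / sigma_1) + (sigma_1^2 + (mu_1 - mu_2)^2) / (2 * sigma_2^2) - 1/2,
# which is algebraically identical to the expression returned. In this PPO script the
# KL is only logged as a diagnostic; the SPO variant uses it inside the policy loss.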
def compute_kld(mu_1, sigma_1, mu_2, sigma_2):
return torch.log(sigma_2 / sigma_1) + ((mu_1 - mu_2) ** 2 + (sigma_1 ** 2 - sigma_2 ** 2)) / (2 * sigma_2 ** 2)
def main(env_id, seed):
args = get_args()
args.env_id = env_id
args.seed = seed
run_name = (
'ppo' +
'_epoch_' + str(args.update_epochs) +
'_seed_' + str(args.seed)
)
# Save training logs
path_string = str(args.env_id) + '/' + run_name
writer = SummaryWriter(path_string)
writer.add_text(
'Hyperparameter',
'|param|value|\n|-|-|\n%s' % ('\n'.join([f'|{key}|{value}|' for key, value in vars(args).items()])),
)
# Random seed
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.backends.cudnn.deterministic = args.torch_deterministic
device = torch.device('cuda' if torch.cuda.is_available() and args.cuda else 'cpu')
# Initialize environments
envs = gym.vector.SyncVectorEnv(
[make_env(args.env_id, args.gamma) for _ in range(args.num_envs)]
)
assert isinstance(envs.single_action_space, gym.spaces.Box), 'only continuous action space is supported'
agent = Agent(envs).to(device)
optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5)
# Initialize buffer
obs = torch.zeros((args.num_steps, args.num_envs) + envs.single_observation_space.shape).to(device)
actions = torch.zeros((args.num_steps, args.num_envs) + envs.single_action_space.shape).to(device)
log_probs = torch.zeros((args.num_steps, args.num_envs)).to(device)
rewards = torch.zeros((args.num_steps, args.num_envs)).to(device)
dones = torch.zeros((args.num_steps, args.num_envs)).to(device)
values = torch.zeros((args.num_steps, args.num_envs)).to(device)
mean = torch.zeros((args.num_steps, args.num_envs) + envs.single_action_space.shape).to(device)
std = torch.zeros((args.num_steps, args.num_envs) + envs.single_action_space.shape).to(device)
# Data collection
global_step = 0
start_time = time.time()
next_obs, _ = envs.reset(seed=args.seed)
next_obs = torch.Tensor(next_obs).to(device)
next_done = torch.zeros(args.num_envs).to(device)
num_updates = args.total_time_steps // args.batch_size
for update in tqdm(range(1, num_updates + 1)):
# Linear decay of learning rate
if args.anneal_lr:
frac = 1.0 - (update - 1.0) / num_updates
lr_now = frac * args.learning_rate
optimizer.param_groups[0]['lr'] = lr_now
for step in range(0, args.num_steps):
global_step += 1 * args.num_envs
obs[step] = next_obs
dones[step] = next_done
# Compute the logarithm of the action probability output by the old policy network
with torch.no_grad():
action, log_prob, _, value, mean_std = agent.get_action_and_value(next_obs, show_all=True)
values[step] = value.flatten()
actions[step] = action
log_probs[step] = log_prob
            # Mean and standard deviation of the action distribution, shape (num_steps, num_envs, num_actions)
mean[step] = mean_std.loc
std[step] = mean_std.scale
# Update the environments
next_obs, reward, terminations, truncations, info = envs.step(action.cpu().numpy())
done = np.logical_or(terminations, truncations)
rewards[step] = torch.tensor(reward).to(device).view(-1)
next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(device)
if 'final_info' not in info:
continue
for item in info['final_info']:
if item is None:
continue
writer.add_scalar('charts/episodic_return', item['episode']['r'][0], global_step)
# Use GAE (Generalized Advantage Estimation) technique to estimate the advantage function
with torch.no_grad():
next_value = agent.get_value(next_obs).reshape(1, -1)
advantages = torch.zeros_like(rewards).to(device)
last_gae_lam = 0
for t in reversed(range(args.num_steps)):
if t == args.num_steps - 1:
next_non_terminal = 1.0 - next_done
next_values = next_value
else:
next_non_terminal = 1.0 - dones[t + 1]
next_values = values[t + 1]
delta = rewards[t] + args.gamma * next_values * next_non_terminal - values[t]
advantages[t] = last_gae_lam = delta + args.gamma * args.gae_lambda * next_non_terminal * last_gae_lam
returns = advantages + values
# ---------------------- We have collected enough data, now let's start training ---------------------- #
# Flatten each batch
b_obs = obs.reshape((-1,) + envs.single_observation_space.shape)
b_log_probs = log_probs.reshape(-1)
b_actions = actions.reshape((-1,) + envs.single_action_space.shape)
b_advantages = advantages.reshape(-1)
b_returns = returns.reshape(-1)
b_values = values.reshape(-1)
        # Obtain the mean and standard deviation for the whole batch
b_mean = mean.reshape(args.batch_size, -1)
b_std = std.reshape(args.batch_size, -1)
# Update the policy network and value network
b_index = np.arange(args.batch_size)
for epoch in range(1, args.update_epochs + 1):
np.random.shuffle(b_index)
for start in range(0, args.batch_size, args.minibatch_size):
end = start + args.minibatch_size
mb_index = b_index[start:end]
# The latest outputs of the policy network and value network
_, new_log_prob, entropy, new_value, new_mean_std = agent.get_action_and_value(b_obs[mb_index],
b_actions[mb_index],
show_all=True)
# Compute KL divergence
new_mean = new_mean_std.loc.reshape(args.minibatch_size, -1)
new_std = new_mean_std.scale.reshape(args.minibatch_size, -1)
d = compute_kld(b_mean[mb_index], b_std[mb_index], new_mean, new_std).sum(1)
writer.add_scalar('charts/average_kld', d.mean(), global_step)
writer.add_scalar('others/min_kld', d.min(), global_step)
writer.add_scalar('others/max_kld', d.max(), global_step)
log_ratio = new_log_prob - b_log_probs[mb_index]
ratios = log_ratio.exp()
mb_advantages = b_advantages[mb_index]
# Advantage normalization
if args.norm_adv:
mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-12)
# Policy loss
pg_loss1 = -mb_advantages * ratios
pg_loss2 = -mb_advantages * torch.clamp(ratios, 1 - args.clip_epsilon, 1 + args.clip_epsilon)
pg_loss = torch.max(pg_loss1, pg_loss2).mean()
# Value loss
new_value = new_value.view(-1)
if args.clip_value_loss:
v_loss_un_clipped = (new_value - b_returns[mb_index]) ** 2
v_clipped = b_values[mb_index] + torch.clamp(
new_value - b_values[mb_index],
-args.clip_epsilon,
args.clip_epsilon,
)
v_loss_clipped = (v_clipped - b_returns[mb_index]) ** 2
v_loss_max = torch.max(v_loss_un_clipped, v_loss_clipped)
v_loss = 0.5 * v_loss_max.mean()
else:
v_loss = 0.5 * ((new_value - b_returns[mb_index]) ** 2).mean()
# Policy entropy
entropy_loss = entropy.mean()
# Total loss
loss = pg_loss + v_loss * args.c_1 - entropy_loss * args.c_2
# Save the data during the training process
writer.add_scalar('losses/policy_loss', pg_loss.item(), global_step)
writer.add_scalar('losses/value_loss', v_loss.item(), global_step)
writer.add_scalar('losses/entropy', entropy_loss.item(), global_step)
# Update network parameters
optimizer.zero_grad()
loss.backward()
nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
optimizer.step()
y_pre, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy()
var_y = np.var(y_true)
explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pre) / var_y
writer.add_scalar('others/explained_variance', explained_var, global_step)
# Save the data during the training process
writer.add_scalar('charts/learning_rate', optimizer.param_groups[0]['lr'], global_step)
writer.add_scalar('charts/SPS', int(global_step / (time.time() - start_time)), global_step)
envs.close()
writer.close()
def run():
for env_id in ['Humanoid-v4']:
for seed in range(1, 6):
print(env_id, 'seed:', seed)
main(env_id, seed)
if __name__ == '__main__':
run()

MuJoCo/spo.py (new file, 320 lines)
@@ -0,0 +1,320 @@
import argparse
import os
import random
import time
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.normal import Normal
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
def get_args():
parser = argparse.ArgumentParser()
    parser.add_argument('--exp_name', type=str, default=os.path.splitext(os.path.basename(__file__))[0])
parser.add_argument('--seed', type=int, default=1)
parser.add_argument('--torch_deterministic', type=bool, default=True)
parser.add_argument('--cuda', type=bool, default=True)
parser.add_argument('--env_id', type=str, default='Humanoid-v4')
parser.add_argument('--total_time_steps', type=int, default=int(1e7))
parser.add_argument('--learning_rate', type=float, default=3e-4)
parser.add_argument('--num_envs', type=int, default=8)
parser.add_argument('--num_steps', type=int, default=256)
parser.add_argument('--anneal_lr', type=bool, default=True)
parser.add_argument('--gamma', type=float, default=0.99)
parser.add_argument('--gae_lambda', type=float, default=0.95)
parser.add_argument('--num_mini_batches', type=int, default=4)
parser.add_argument('--update_epochs', type=int, default=10)
parser.add_argument('--norm_adv', type=bool, default=True)
parser.add_argument('--clip_value_loss', type=bool, default=True)
parser.add_argument('--c_1', type=float, default=0.5)
parser.add_argument('--c_2', type=float, default=0.0)
parser.add_argument('--max_grad_norm', type=float, default=0.5)
parser.add_argument('--kld_max', type=float, default=0.02)
a = parser.parse_args()
a.batch_size = int(a.num_envs * a.num_steps)
a.minibatch_size = int(a.batch_size // a.num_mini_batches)
return a
def make_env(env_id, gamma):
def thunk():
env = gym.make(env_id)
env = gym.wrappers.FlattenObservation(env)
env = gym.wrappers.RecordEpisodeStatistics(env)
env = gym.wrappers.ClipAction(env)
env = gym.wrappers.NormalizeObservation(env)
env = gym.wrappers.TransformObservation(env, lambda o: np.clip(o, -10, 10))
env = gym.wrappers.NormalizeReward(env, gamma=gamma)
env = gym.wrappers.TransformReward(env, lambda r: float(np.clip(r, -10, 10)))
return env
return thunk
def layer_init(layer, s=np.sqrt(2), bias_const=0.0):
torch.nn.init.orthogonal_(layer.weight, s)
torch.nn.init.constant_(layer.bias, bias_const)
return layer
class Agent(nn.Module):
def __init__(self, e):
super().__init__()
self.critic = nn.Sequential(
layer_init(nn.Linear(np.array(e.single_observation_space.shape).prod(), 64)),
nn.Tanh(),
layer_init(nn.Linear(64, 64)),
nn.Tanh(),
layer_init(nn.Linear(64, 1), s=1.0),
)
self.actor_mean = nn.Sequential(
layer_init(nn.Linear(np.array(e.single_observation_space.shape).prod(), 64)),
nn.Tanh(),
layer_init(nn.Linear(64, 64)),
nn.Tanh(),
layer_init(nn.Linear(64, np.array(e.single_action_space.shape).prod()), s=0.01),
)
self.actor_log_std = nn.Parameter(torch.zeros(1, np.array(e.single_action_space.shape).prod()))
def get_value(self, x):
return self.critic(x)
def get_action_and_value(self, x, a=None, show_all=False):
action_mean = self.actor_mean(x)
action_log_std = self.actor_log_std.expand_as(action_mean)
action_std = torch.exp(action_log_std)
probs = Normal(action_mean, action_std)
if a is None:
a = probs.sample()
if show_all:
return a, probs.log_prob(a).sum(1), probs.entropy().sum(1), self.critic(x), probs
return a, probs.log_prob(a).sum(1), probs.entropy().sum(1), self.critic(x)
def compute_kld(mu_1, sigma_1, mu_2, sigma_2):
return torch.log(sigma_2 / sigma_1) + ((mu_1 - mu_2) ** 2 + (sigma_1 ** 2 - sigma_2 ** 2)) / (2 * sigma_2 ** 2)
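# The SPO policy loss built inside main() below reweights the usual surrogate
# -A * ratio by the factor (clip(d, 0, kld_max) / d + sign(A) - 1) * sign(A), where d is
# the per-state KL divergence between the old and the current policy:
#   - for A > 0 the factor equals clip(d, 0, kld_max) / d, so the incentive to raise an
#     action's probability is attenuated once the KL exceeds kld_max;
#   - for A < 0 the factor equals 2 - clip(d, 0, kld_max) / d, so the push away from the
#     action is strengthened (up to 2x) as the KL grows.
# The first minibatch of the first epoch uses the plain surrogate, since the policy has
# not yet changed there and d is numerically zero, which would zero out the factor for
# positive advantages.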
def main(env_id, seed):
args = get_args()
args.env_id = env_id
args.seed = seed
run_name = (
'spo' +
'_epoch_' + str(args.update_epochs) +
'_seed_' + str(args.seed)
)
# Save training logs
path_string = str(args.env_id) + '/' + run_name
writer = SummaryWriter(path_string)
writer.add_text(
'Hyperparameter',
'|param|value|\n|-|-|\n%s' % ('\n'.join([f'|{key}|{value}|' for key, value in vars(args).items()])),
)
# Random seed
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.backends.cudnn.deterministic = args.torch_deterministic
device = torch.device('cuda' if torch.cuda.is_available() and args.cuda else 'cpu')
# Initialize environments
envs = gym.vector.SyncVectorEnv(
[make_env(args.env_id, args.gamma) for _ in range(args.num_envs)]
)
assert isinstance(envs.single_action_space, gym.spaces.Box), 'only continuous action space is supported'
agent = Agent(envs).to(device)
optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5)
# Initialize buffer
obs = torch.zeros((args.num_steps, args.num_envs) + envs.single_observation_space.shape).to(device)
actions = torch.zeros((args.num_steps, args.num_envs) + envs.single_action_space.shape).to(device)
log_probs = torch.zeros((args.num_steps, args.num_envs)).to(device)
rewards = torch.zeros((args.num_steps, args.num_envs)).to(device)
dones = torch.zeros((args.num_steps, args.num_envs)).to(device)
values = torch.zeros((args.num_steps, args.num_envs)).to(device)
mean = torch.zeros((args.num_steps, args.num_envs) + envs.single_action_space.shape).to(device)
std = torch.zeros((args.num_steps, args.num_envs) + envs.single_action_space.shape).to(device)
# Data collection
global_step = 0
start_time = time.time()
next_obs, _ = envs.reset(seed=args.seed)
next_obs = torch.Tensor(next_obs).to(device)
next_done = torch.zeros(args.num_envs).to(device)
num_updates = args.total_time_steps // args.batch_size
for update in tqdm(range(1, num_updates + 1)):
# Linear decay of learning rate
if args.anneal_lr:
frac = 1.0 - (update - 1.0) / num_updates
lr_now = frac * args.learning_rate
optimizer.param_groups[0]['lr'] = lr_now
for step in range(0, args.num_steps):
global_step += 1 * args.num_envs
obs[step] = next_obs
dones[step] = next_done
# Compute the logarithm of the action probability output by the old policy network
with torch.no_grad():
action, log_prob, _, value, mean_std = agent.get_action_and_value(next_obs, show_all=True)
values[step] = value.flatten()
actions[step] = action
log_probs[step] = log_prob
            # Mean and standard deviation of the action distribution, shape (num_steps, num_envs, num_actions)
mean[step] = mean_std.loc
std[step] = mean_std.scale
# Update the environments
next_obs, reward, terminations, truncations, info = envs.step(action.cpu().numpy())
done = np.logical_or(terminations, truncations)
rewards[step] = torch.tensor(reward).to(device).view(-1)
next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(device)
if 'final_info' not in info:
continue
for item in info['final_info']:
if item is None:
continue
writer.add_scalar('charts/episodic_return', item['episode']['r'][0], global_step)
# Use GAE (Generalized Advantage Estimation) technique to estimate the advantage function
with torch.no_grad():
next_value = agent.get_value(next_obs).reshape(1, -1)
advantages = torch.zeros_like(rewards).to(device)
last_gae_lam = 0
for t in reversed(range(args.num_steps)):
if t == args.num_steps - 1:
next_non_terminal = 1.0 - next_done
next_values = next_value
else:
next_non_terminal = 1.0 - dones[t + 1]
next_values = values[t + 1]
delta = rewards[t] + args.gamma * next_values * next_non_terminal - values[t]
advantages[t] = last_gae_lam = delta + args.gamma * args.gae_lambda * next_non_terminal * last_gae_lam
returns = advantages + values
# ---------------------- We have collected enough data, now let's start training ---------------------- #
# Flatten each batch
b_obs = obs.reshape((-1,) + envs.single_observation_space.shape)
b_log_probs = log_probs.reshape(-1)
b_actions = actions.reshape((-1,) + envs.single_action_space.shape)
b_advantages = advantages.reshape(-1)
b_returns = returns.reshape(-1)
b_values = values.reshape(-1)
        # Obtain the mean and standard deviation for the whole batch
b_mean = mean.reshape(args.batch_size, -1)
b_std = std.reshape(args.batch_size, -1)
# Update the policy network and value network
b_index = np.arange(args.batch_size)
for epoch in range(1, args.update_epochs + 1):
np.random.shuffle(b_index)
t = 0
for start in range(0, args.batch_size, args.minibatch_size):
t += 1
end = start + args.minibatch_size
mb_index = b_index[start:end]
# The latest outputs of the policy network and value network
_, new_log_prob, entropy, new_value, new_mean_std = agent.get_action_and_value(b_obs[mb_index],
b_actions[mb_index],
show_all=True)
# Compute KL divergence
new_mean = new_mean_std.loc.reshape(args.minibatch_size, -1)
new_std = new_mean_std.scale.reshape(args.minibatch_size, -1)
d = compute_kld(b_mean[mb_index], b_std[mb_index], new_mean, new_std).sum(1)
writer.add_scalar('charts/average_kld', d.mean(), global_step)
writer.add_scalar('others/min_kld', d.min(), global_step)
writer.add_scalar('others/max_kld', d.max(), global_step)
log_ratio = new_log_prob - b_log_probs[mb_index]
ratios = log_ratio.exp()
mb_advantages = b_advantages[mb_index]
# Advantage normalization
if args.norm_adv:
mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-12)
# Policy loss (main code of SPO)
if epoch == 1 and t == 1:
pg_loss = (-mb_advantages * ratios).mean()
else:
# d_clip
d_clip = torch.clamp(input=d, min=0, max=args.kld_max)
# d_clip / d
ratio = d_clip / (d + 1e-12)
# sign_a
sign_a = torch.sign(mb_advantages)
# (d_clip / d + sign_a - 1) * sign_a
result = (ratio + sign_a - 1) * sign_a
pg_loss = (-mb_advantages * ratios * result).mean()
# Value loss
new_value = new_value.view(-1)
if args.clip_value_loss:
v_loss_un_clipped = (new_value - b_returns[mb_index]) ** 2
v_clipped = b_values[mb_index] + torch.clamp(
new_value - b_values[mb_index],
-0.2,
0.2,
)
v_loss_clipped = (v_clipped - b_returns[mb_index]) ** 2
v_loss_max = torch.max(v_loss_un_clipped, v_loss_clipped)
v_loss = 0.5 * v_loss_max.mean()
else:
v_loss = 0.5 * ((new_value - b_returns[mb_index]) ** 2).mean()
# Policy entropy
entropy_loss = entropy.mean()
# Total loss
loss = pg_loss + v_loss * args.c_1 - entropy_loss * args.c_2
# Save the data during the training process
writer.add_scalar('losses/policy_loss', pg_loss.item(), global_step)
writer.add_scalar('losses/value_loss', v_loss.item(), global_step)
writer.add_scalar('losses/entropy', entropy_loss.item(), global_step)
# Update network parameters
optimizer.zero_grad()
loss.backward()
nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
optimizer.step()
y_pre, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy()
var_y = np.var(y_true)
explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pre) / var_y
writer.add_scalar('others/explained_variance', explained_var, global_step)
# Save the data during the training process
writer.add_scalar('charts/learning_rate', optimizer.param_groups[0]['lr'], global_step)
writer.add_scalar('charts/SPS', int(global_step / (time.time() - start_time)), global_step)
envs.close()
writer.close()
def run():
for env_id in ['Humanoid-v4']:
for seed in range(1, 6):
print(env_id, 'seed:', seed)
main(env_id, seed)
if __name__ == '__main__':
run()

spo.pdf (new binary file, not shown)

v4/ppo.py (new file, 314 lines)
@@ -0,0 +1,314 @@
import argparse
import os
import random
import time
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from stable_baselines3.common.atari_wrappers import (
ClipRewardEnv,
EpisodicLifeEnv,
FireResetEnv,
MaxAndSkipEnv,
NoopResetEnv
)
from torch.distributions.categorical import Categorical
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
def get_args():
parser = argparse.ArgumentParser()
    parser.add_argument('--exp_name', type=str, default=os.path.splitext(os.path.basename(__file__))[0])
parser.add_argument('--gym_id', type=str, default='BreakoutNoFrameskip-v4')
parser.add_argument('--learning_rate', type=float, default=2.5e-4)
parser.add_argument('--seed', type=int, default=1)
parser.add_argument('--total_steps', type=int, default=int(1e7))
parser.add_argument('--use_cuda', type=bool, default=True)
parser.add_argument('--num_envs', type=int, default=8)
parser.add_argument('--num_steps', type=int, default=128)
parser.add_argument('--lr_decay', type=bool, default=True)
parser.add_argument('--use_gae', type=bool, default=True)
parser.add_argument('--gae_lambda', type=float, default=0.95)
parser.add_argument('--gamma', type=float, default=0.99)
parser.add_argument('--num_mini_batches', type=int, default=4)
parser.add_argument('--update_epochs', type=int, default=4)
parser.add_argument('--norm_adv', type=bool, default=True)
parser.add_argument('--clip_value_loss', type=bool, default=True)
parser.add_argument('--c_1', type=float, default=1.0)
parser.add_argument('--c_2', type=float, default=0.01)
parser.add_argument('--max_grad_norm', type=float, default=0.5)
parser.add_argument('--clip_epsilon', type=float, default=0.2)
a = parser.parse_args()
a.batch_size = int(a.num_envs * a.num_steps)
a.minibatch_size = int(a.batch_size // a.num_mini_batches)
return a
def make_env(gym_id, seed):
def thunk():
env = gym.make(gym_id)
env = gym.wrappers.RecordEpisodeStatistics(env)
env = NoopResetEnv(env, noop_max=30)
env = MaxAndSkipEnv(env, skip=4)
env = EpisodicLifeEnv(env)
if 'FIRE' in env.unwrapped.get_action_meanings():
env = FireResetEnv(env)
env = ClipRewardEnv(env)
env = gym.wrappers.ResizeObservation(env, (84, 84))
env = gym.wrappers.GrayScaleObservation(env)
env = gym.wrappers.FrameStack(env, 4)
env.seed(seed)
env.action_space.seed(seed)
env.observation_space.seed(seed)
return env
return thunk
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
torch.nn.init.orthogonal_(layer.weight, std)
torch.nn.init.constant_(layer.bias, bias_const)
return layer
class Agent(nn.Module):
def __init__(self, e):
super(Agent, self).__init__()
self.network = nn.Sequential(
layer_init(nn.Conv2d(4, 32, 8, stride=4)),
nn.ReLU(),
layer_init(nn.Conv2d(32, 64, 4, stride=2)),
nn.ReLU(),
layer_init(nn.Conv2d(64, 64, 3, stride=1)),
nn.ReLU(),
nn.Flatten(),
layer_init(nn.Linear(64 * 7 * 7, 512)),
nn.ReLU(),
)
self.actor = layer_init(nn.Linear(512, e.single_action_space.n), std=0.01)
self.critic = layer_init(nn.Linear(512, 1), std=1)
def get_value(self, x):
return self.critic(self.network(x / 255.0))
def get_action_and_value(self, x, a=None, show_all=False):
hidden = self.network(x / 255.0)
log = self.actor(hidden)
p = Categorical(logits=log)
if a is None:
a = p.sample()
if show_all:
return a, p.log_prob(a), p.entropy(), self.critic(hidden), p.probs
return a, p.log_prob(a), p.entropy(), self.critic(hidden)
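# For the discrete (Atari) case the rollout stores the full categorical distribution, and
# the KL divergence between the old and current policy is computed directly as
#   KL(pi_old || pi_new) = sum_a pi_old(a|s) * log(pi_old(a|s) / pi_new(a|s)).
# In this PPO script it is logged purely as a diagnostic; the clipped surrogate objective
# does not depend on it.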
def main(env_id, seed):
args = get_args()
args.gym_id = env_id
args.seed = seed
run_name = (
'ppo' +
'_epoch_' + str(args.update_epochs) +
'_seed_' + str(args.seed)
)
# Save training logs
path_string = str(args.gym_id).split('NoFrameskip')[0] + '/' + run_name
writer = SummaryWriter(path_string)
writer.add_text(
'Hyperparameter',
'|param|value|\n|-|-|\n%s' % ('\n'.join([f'|{key}|{value}|' for key, value in vars(args).items()])),
)
# Random seed
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
# Initialize environments
device = torch.device('cuda' if torch.cuda.is_available() and args.use_cuda else 'cpu')
envs = gym.vector.SyncVectorEnv(
[make_env(args.gym_id, args.seed + i) for i in range(args.num_envs)]
)
assert isinstance(envs.single_action_space, gym.spaces.Discrete), 'only discrete action space is supported'
agent = Agent(envs).to(device)
optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5)
# Initialize buffer
obs = torch.zeros((args.num_steps, args.num_envs) + envs.single_observation_space.shape).to(device)
actions = torch.zeros((args.num_steps, args.num_envs) + envs.single_action_space.shape).to(device)
probs = torch.zeros((args.num_steps, args.num_envs, envs.single_action_space.n)).to(device)
log_probs = torch.zeros((args.num_steps, args.num_envs)).to(device)
rewards = torch.zeros((args.num_steps, args.num_envs)).to(device)
dones = torch.zeros((args.num_steps, args.num_envs)).to(device)
values = torch.zeros((args.num_steps, args.num_envs)).to(device)
# Data collection
global_step = 0
start_time = time.time()
next_obs = torch.Tensor(envs.reset()).to(device)
next_done = torch.zeros(args.num_envs).to(device)
num_updates = int(args.total_steps // args.batch_size)
for update in tqdm(range(1, num_updates + 1)):
# Linear decay of learning rate
if args.lr_decay:
frac = 1.0 - (update - 1.0) / num_updates
lr_now = frac * args.learning_rate
optimizer.param_groups[0]['lr'] = lr_now
for step in range(0, args.num_steps):
global_step += 1 * args.num_envs
obs[step] = next_obs
dones[step] = next_done
# Compute the logarithm of the action probability output by the old policy network
with torch.no_grad():
action, log_prob, _, value, prob = agent.get_action_and_value(next_obs, show_all=True)
values[step] = value.flatten()
actions[step] = action
probs[step] = prob
log_probs[step] = log_prob
# Update the environments
next_obs, reward, done, info = envs.step(action.cpu().numpy())
rewards[step] = torch.tensor(reward).to(device).view(-1)
next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(device)
for item in info:
if 'episode' in item.keys():
writer.add_scalar('charts/episodic_return', item['episode']['r'], global_step)
break
# Use GAE (Generalized Advantage Estimation) technique to estimate the advantage function
with torch.no_grad():
next_value = agent.get_value(next_obs).reshape(1, -1)
if args.use_gae:
advantages = torch.zeros_like(rewards).to(device)
last_gae_lam = 0
for t in reversed(range(args.num_steps)):
if t == args.num_steps - 1:
next_non_terminal = 1.0 - next_done
next_values = next_value
else:
next_non_terminal = 1.0 - dones[t + 1]
next_values = values[t + 1]
delta = rewards[t] + args.gamma * next_values * next_non_terminal - values[t]
advantages[t] = last_gae_lam = (
delta + args.gamma * args.gae_lambda * next_non_terminal * last_gae_lam
)
returns = advantages + values
else:
returns = torch.zeros_like(rewards).to(device)
for t in reversed(range(args.num_steps)):
if t == args.num_steps - 1:
next_non_terminal = 1.0 - next_done
next_return = next_value
else:
next_non_terminal = 1.0 - dones[t + 1]
next_return = returns[t + 1]
returns[t] = rewards[t] + args.gamma * next_non_terminal * next_return
advantages = returns - values
# ---------------------- We have collected enough data, now let's start training ---------------------- #
# Flatten each batch
b_obs = obs.reshape((-1,) + envs.single_observation_space.shape)
b_probs = probs.reshape((-1, envs.single_action_space.n))
b_log_probs = log_probs.reshape(-1)
b_actions = actions.reshape((-1,) + envs.single_action_space.shape)
b_advantages = advantages.reshape(-1)
b_returns = returns.reshape(-1)
b_values = values.reshape(-1)
# Update the policy network and value network
b_index = np.arange(args.batch_size)
for epoch in range(1, args.update_epochs + 1):
np.random.shuffle(b_index)
for start in range(0, args.batch_size, args.minibatch_size):
end = start + args.minibatch_size
mb_index = b_index[start:end]
# The latest outputs of the policy network and value network
_, new_log_prob, entropy, new_value, new_probs = (
agent.get_action_and_value(b_obs[mb_index], b_actions.long()[mb_index], show_all=True)
)
# Compute KL divergence
d = torch.sum(
b_probs[mb_index] * torch.log((b_probs[mb_index] + 1e-12) / (new_probs + 1e-12)), 1
)
writer.add_scalar('charts/average_kld', d.mean(), global_step)
writer.add_scalar('others/min_kld', d.min(), global_step)
writer.add_scalar('others/max_kld', d.max(), global_step)
log_ratio = new_log_prob - b_log_probs[mb_index]
ratio = log_ratio.exp()
# Advantage normalization
mb_advantages = b_advantages[mb_index]
if args.norm_adv:
mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-12)
# Policy loss
pg_loss1 = -mb_advantages * ratio
pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - args.clip_epsilon, 1 + args.clip_epsilon)
pg_loss = torch.max(pg_loss1, pg_loss2).mean()
# Value loss
new_value = new_value.view(-1)
if args.clip_value_loss:
v_loss_un_clipped = (new_value - b_returns[mb_index]) ** 2
v_clipped = b_values[mb_index] + torch.clamp(
new_value - b_values[mb_index],
-args.clip_epsilon,
args.clip_epsilon,
)
v_loss_clipped = (v_clipped - b_returns[mb_index]) ** 2
v_loss_max = torch.max(v_loss_un_clipped, v_loss_clipped)
v_loss = 0.5 * v_loss_max.mean()
else:
v_loss = 0.5 * ((new_value - b_returns[mb_index]) ** 2).mean()
# Policy entropy
entropy_loss = entropy.mean()
# Total loss
loss = pg_loss + v_loss * args.c_1 - entropy_loss * args.c_2
# Save the data during the training process
writer.add_scalar('losses/value_loss', v_loss.item(), global_step)
writer.add_scalar('losses/policy_loss', pg_loss.item(), global_step)
writer.add_scalar('losses/entropy', entropy_loss.item(), global_step)
writer.add_scalar('losses/delta', torch.abs(ratio - 1).mean().item(), global_step)
# Update network parameters
optimizer.zero_grad()
loss.backward()
nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
optimizer.step()
y_pre, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy()
var_y = np.var(y_true)
explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pre) / var_y
# Save the data during the training process
writer.add_scalar('charts/learning_rate', optimizer.param_groups[0]['lr'], global_step)
writer.add_scalar('others/explained_variance', explained_var, global_step)
writer.add_scalar('charts/SPS', int(global_step / (time.time() - start_time)), global_step)
envs.close()
writer.close()
def run():
for env_id in ['Breakout']:
for seed in [1, 2, 3]:
print(env_id + 'NoFrameskip-v4', 'seed:', seed)
main(env_id + 'NoFrameskip-v4', seed)
if __name__ == '__main__':
run()

v4/spo.py (new file, 325 lines)
@@ -0,0 +1,325 @@
import argparse
import os
import random
import time
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from stable_baselines3.common.atari_wrappers import (
ClipRewardEnv,
EpisodicLifeEnv,
FireResetEnv,
MaxAndSkipEnv,
NoopResetEnv
)
from torch.distributions.categorical import Categorical
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
def get_args():
parser = argparse.ArgumentParser()
    parser.add_argument('--exp_name', type=str, default=os.path.splitext(os.path.basename(__file__))[0])
parser.add_argument('--gym_id', type=str, default='BreakoutNoFrameskip-v4')
parser.add_argument('--learning_rate', type=float, default=2.5e-4)
parser.add_argument('--seed', type=int, default=1)
parser.add_argument('--total_steps', type=int, default=int(1e7))
parser.add_argument('--use_cuda', type=bool, default=True)
parser.add_argument('--num_envs', type=int, default=8)
parser.add_argument('--num_steps', type=int, default=128)
parser.add_argument('--lr_decay', type=bool, default=True)
parser.add_argument('--use_gae', type=bool, default=True)
parser.add_argument('--gae_lambda', type=float, default=0.95)
parser.add_argument('--gamma', type=float, default=0.99)
parser.add_argument('--num_mini_batches', type=int, default=4)
parser.add_argument('--update_epochs', type=int, default=8)
parser.add_argument('--norm_adv', type=bool, default=True)
parser.add_argument('--clip_value_loss', type=bool, default=True)
parser.add_argument('--c_1', type=float, default=1.0)
parser.add_argument('--c_2', type=float, default=0.01)
parser.add_argument('--max_grad_norm', type=float, default=0.5)
parser.add_argument('--kld_max', type=float, default=0.02)
a = parser.parse_args()
a.batch_size = int(a.num_envs * a.num_steps)
a.minibatch_size = int(a.batch_size // a.num_mini_batches)
return a
def make_env(gym_id, seed):
def thunk():
env = gym.make(gym_id)
env = gym.wrappers.RecordEpisodeStatistics(env)
env = NoopResetEnv(env, noop_max=30)
env = MaxAndSkipEnv(env, skip=4)
env = EpisodicLifeEnv(env)
if 'FIRE' in env.unwrapped.get_action_meanings():
env = FireResetEnv(env)
env = ClipRewardEnv(env)
env = gym.wrappers.ResizeObservation(env, (84, 84))
env = gym.wrappers.GrayScaleObservation(env)
env = gym.wrappers.FrameStack(env, 4)
env.seed(seed)
env.action_space.seed(seed)
env.observation_space.seed(seed)
return env
return thunk
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
torch.nn.init.orthogonal_(layer.weight, std)
torch.nn.init.constant_(layer.bias, bias_const)
return layer
class Agent(nn.Module):
def __init__(self, e):
super(Agent, self).__init__()
self.network = nn.Sequential(
layer_init(nn.Conv2d(4, 32, 8, stride=4)),
nn.ReLU(),
layer_init(nn.Conv2d(32, 64, 4, stride=2)),
nn.ReLU(),
layer_init(nn.Conv2d(64, 64, 3, stride=1)),
nn.ReLU(),
nn.Flatten(),
layer_init(nn.Linear(64 * 7 * 7, 512)),
nn.ReLU(),
)
self.actor = layer_init(nn.Linear(512, e.single_action_space.n), std=0.01)
self.critic = layer_init(nn.Linear(512, 1), std=1)
def get_value(self, x):
return self.critic(self.network(x / 255.0))
def get_action_and_value(self, x, a=None, show_all=False):
hidden = self.network(x / 255.0)
log = self.actor(hidden)
p = Categorical(logits=log)
if a is None:
a = p.sample()
if show_all:
return a, p.log_prob(a), p.entropy(), self.critic(hidden), p.probs
return a, p.log_prob(a), p.entropy(), self.critic(hidden)
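# This is the discrete (Atari) counterpart of MuJoCo/spo.py: the KL divergence d between
# the stored categorical distribution and the current one is clipped at kld_max and used
# to reweight the surrogate objective in the policy loss below, instead of PPO's ratio
# clipping.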
def main(env_id, seed):
args = get_args()
args.gym_id = env_id
args.seed = seed
run_name = (
'spo_' + str(args.kld_max) +
'_epoch_' + str(args.update_epochs) +
'_seed_' + str(args.seed)
)
# Save training logs
path_string = str(args.gym_id).split('NoFrameskip')[0] + '/' + run_name
writer = SummaryWriter(path_string)
writer.add_text(
'Hyperparameter',
'|param|value|\n|-|-|\n%s' % ('\n'.join([f'|{key}|{value}|' for key, value in vars(args).items()])),
)
# Random seed
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
# Initialize environments
device = torch.device('cuda' if torch.cuda.is_available() and args.use_cuda else 'cpu')
envs = gym.vector.SyncVectorEnv(
[make_env(args.gym_id, args.seed + i) for i in range(args.num_envs)]
)
assert isinstance(envs.single_action_space, gym.spaces.Discrete), 'only discrete action space is supported'
agent = Agent(envs).to(device)
optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5)
# Initialize buffer
obs = torch.zeros((args.num_steps, args.num_envs) + envs.single_observation_space.shape).to(device)
actions = torch.zeros((args.num_steps, args.num_envs) + envs.single_action_space.shape).to(device)
probs = torch.zeros((args.num_steps, args.num_envs, envs.single_action_space.n)).to(device)
log_probs = torch.zeros((args.num_steps, args.num_envs)).to(device)
rewards = torch.zeros((args.num_steps, args.num_envs)).to(device)
dones = torch.zeros((args.num_steps, args.num_envs)).to(device)
values = torch.zeros((args.num_steps, args.num_envs)).to(device)
# Data collection
global_step = 0
start_time = time.time()
next_obs = torch.Tensor(envs.reset()).to(device)
next_done = torch.zeros(args.num_envs).to(device)
num_updates = int(args.total_steps // args.batch_size)
for update in tqdm(range(1, num_updates + 1)):
# Linear decay of learning rate
if args.lr_decay:
frac = 1.0 - (update - 1.0) / num_updates
lr_now = frac * args.learning_rate
optimizer.param_groups[0]['lr'] = lr_now
for step in range(0, args.num_steps):
global_step += 1 * args.num_envs
obs[step] = next_obs
dones[step] = next_done
# Compute the logarithm of the action probability output by the old policy network
with torch.no_grad():
action, log_prob, _, value, prob = agent.get_action_and_value(next_obs, show_all=True)
values[step] = value.flatten()
actions[step] = action
probs[step] = prob
log_probs[step] = log_prob
# Update the environments
next_obs, reward, done, info = envs.step(action.cpu().numpy())
rewards[step] = torch.tensor(reward).to(device).view(-1)
next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(device)
for item in info:
if 'episode' in item.keys():
writer.add_scalar('charts/episodic_return', item['episode']['r'], global_step)
break
# Use GAE (Generalized Advantage Estimation) technique to estimate the advantage function
with torch.no_grad():
next_value = agent.get_value(next_obs).reshape(1, -1)
if args.use_gae:
advantages = torch.zeros_like(rewards).to(device)
last_gae_lam = 0
for t in reversed(range(args.num_steps)):
if t == args.num_steps - 1:
next_non_terminal = 1.0 - next_done
next_values = next_value
else:
next_non_terminal = 1.0 - dones[t + 1]
next_values = values[t + 1]
delta = rewards[t] + args.gamma * next_values * next_non_terminal - values[t]
advantages[t] = last_gae_lam = (
delta + args.gamma * args.gae_lambda * next_non_terminal * last_gae_lam
)
returns = advantages + values
else:
returns = torch.zeros_like(rewards).to(device)
for t in reversed(range(args.num_steps)):
if t == args.num_steps - 1:
next_non_terminal = 1.0 - next_done
next_return = next_value
else:
next_non_terminal = 1.0 - dones[t + 1]
next_return = returns[t + 1]
returns[t] = rewards[t] + args.gamma * next_non_terminal * next_return
advantages = returns - values
# ---------------------- We have collected enough data, now let's start training ---------------------- #
# Flatten each batch
b_obs = obs.reshape((-1,) + envs.single_observation_space.shape)
b_probs = probs.reshape((-1, envs.single_action_space.n))
b_log_probs = log_probs.reshape(-1)
b_actions = actions.reshape((-1,) + envs.single_action_space.shape)
b_advantages = advantages.reshape(-1)
b_returns = returns.reshape(-1)
b_values = values.reshape(-1)
# Update the policy network and value network
b_index = np.arange(args.batch_size)
for epoch in range(1, args.update_epochs + 1):
np.random.shuffle(b_index)
t = 0
for start in range(0, args.batch_size, args.minibatch_size):
t += 1
end = start + args.minibatch_size
mb_index = b_index[start:end]
# The latest outputs of the policy network and value network
_, new_log_prob, entropy, new_value, new_probs = (
agent.get_action_and_value(b_obs[mb_index], b_actions.long()[mb_index], show_all=True)
)
# Compute KL divergence
d = torch.sum(
b_probs[mb_index] * torch.log((b_probs[mb_index] + 1e-12) / (new_probs + 1e-12)), 1
)
writer.add_scalar('charts/average_kld', d.mean(), global_step)
writer.add_scalar('others/min_kld', d.min(), global_step)
writer.add_scalar('others/max_kld', d.max(), global_step)
log_ratio = new_log_prob - b_log_probs[mb_index]
ratios = log_ratio.exp()
# Advantage normalization
mb_advantages = b_advantages[mb_index]
if args.norm_adv:
mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-12)
# Policy loss (main code of SPO)
if epoch == 1 and t == 1:
pg_loss = (-mb_advantages * ratios).mean()
else:
d_clip = torch.clamp(input=d, min=0, max=args.kld_max)
# d_clip / d
ratio = d_clip / (d + 1e-12)
# sign_a
sign_a = torch.sign(mb_advantages)
# (d_clip / d + sign_a - 1) * sign_a
result = (ratio + sign_a - 1) * sign_a
pg_loss = (-mb_advantages * ratios * result).mean()
# Value loss
new_value = new_value.view(-1)
if args.clip_value_loss:
v_loss_un_clipped = (new_value - b_returns[mb_index]) ** 2
v_clipped = b_values[mb_index] + torch.clamp(
new_value - b_values[mb_index],
-0.2,
0.2,
)
v_loss_clipped = (v_clipped - b_returns[mb_index]) ** 2
v_loss_max = torch.max(v_loss_un_clipped, v_loss_clipped)
v_loss = 0.5 * v_loss_max.mean()
else:
v_loss = 0.5 * ((new_value - b_returns[mb_index]) ** 2).mean()
# Policy entropy
entropy_loss = entropy.mean()
# Total loss
loss = pg_loss + v_loss * args.c_1 - entropy_loss * args.c_2
# Save the data during the training process
writer.add_scalar('losses/value_loss', v_loss.item(), global_step)
writer.add_scalar('losses/policy_loss', pg_loss.item(), global_step)
writer.add_scalar('losses/entropy', entropy_loss.item(), global_step)
writer.add_scalar('losses/delta', torch.abs(ratios - 1).mean().item(), global_step)
# Update network parameters
optimizer.zero_grad()
loss.backward()
nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
optimizer.step()
y_pre, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy()
var_y = np.var(y_true)
explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pre) / var_y
# Save the data during the training process
writer.add_scalar('charts/learning_rate', optimizer.param_groups[0]['lr'], global_step)
writer.add_scalar('others/explained_variance', explained_var, global_step)
writer.add_scalar('charts/SPS', int(global_step / (time.time() - start_time)), global_step)
envs.close()
writer.close()
def run():
for env_id in ['Breakout']:
for seed in [1, 2, 3]:
print(env_id + 'NoFrameskip-v4', 'seed:', seed)
main(env_id + 'NoFrameskip-v4', seed)
if __name__ == '__main__':
run()

v5/ppo_clip.py (new file, 338 lines)
@@ -0,0 +1,338 @@
import argparse
import os
import time
import gymnasium as gym
import numpy as np
import torch
from stable_baselines3.common.atari_wrappers import FireResetEnv, EpisodicLifeEnv, ClipRewardEnv
from torch import nn, optim
from torch.distributions import Categorical
from torch.nn.utils.clip_grad import clip_grad_norm_
from torch.utils.tensorboard.writer import SummaryWriter
from tqdm import tqdm
def get_args():
parser = argparse.ArgumentParser()
    parser.add_argument('--exp_name', type=str, default=os.path.splitext(os.path.basename(__file__))[0])
parser.add_argument('--env_id', type=str, default='ALE/Breakout-v5')
parser.add_argument('--seed', type=int, default=1)
parser.add_argument('--use_cuda', type=bool, default=True)
parser.add_argument('--learning_rate', type=float, default=2.5e-4)
parser.add_argument('--lr_decay', type=bool, default=True)
parser.add_argument('--total_steps', type=int, default=int(1e7))
parser.add_argument('--num_envs', type=int, default=8)
parser.add_argument('--num_steps', type=int, default=128)
parser.add_argument('--update_epochs', type=int, default=8)
parser.add_argument('--num_mini_batches', type=int, default=4)
parser.add_argument('--gae_lambda', type=float, default=0.95)
parser.add_argument('--gamma', type=float, default=0.99)
parser.add_argument('--clip_value_loss', type=bool, default=True)
parser.add_argument('--c_1', type=float, default=1.0)
parser.add_argument('--c_2', type=float, default=0.01)
parser.add_argument('--clip_grad_norm', type=float, default=0.5)
parser.add_argument('--clip_epsilon', type=float, default=0.2)
args = parser.parse_args()
args.device = torch.device('cuda' if torch.cuda.is_available() and args.use_cuda else 'cpu')
args.batch_size = int(args.num_envs * args.num_steps)
args.minibatch_size = int(args.batch_size // args.num_mini_batches)
args.num_updates = int(args.total_steps // args.batch_size)
return args
def make_env(env_id):
def thunk():
env = gym.make(env_id, frameskip=1, repeat_action_probability=0.0, full_action_space=False)
env = gym.wrappers.RecordEpisodeStatistics(env)
if 'FIRE' in env.unwrapped.get_action_meanings():
env = FireResetEnv(env)
env = EpisodicLifeEnv(env)
env = ClipRewardEnv(env)
env = gym.wrappers.AtariPreprocessing(env, scale_obs=True)
env = gym.wrappers.FrameStack(env, 4)
return env
return thunk
def compute_advantages(rewards, flags, values, last_value, args):
advantages = torch.zeros((args.num_steps, args.num_envs)).to(args.device)
adv = torch.zeros(args.num_envs).to(args.device)
for i in reversed(range(args.num_steps)):
returns = rewards[i] + args.gamma * flags[i] * last_value
delta = returns - values[i]
adv = delta + args.gamma * args.gae_lambda * flags[i] * adv
advantages[i] = adv
last_value = values[i]
return advantages
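# compute_advantages above implements Generalized Advantage Estimation (GAE):
#   delta_t = r_t + gamma * (1 - done_t) * V(s_{t+1}) - V(s_t)
#   A_t     = delta_t + gamma * lambda * (1 - done_t) * A_{t+1},
# iterating backwards over the rollout with flags[t] = 1 - done_t and last_value as the
# bootstrap value of the state following the final step.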
class Buffer:
def __init__(self, num_steps, num_envs, observation_shape, action_dim, device):
self.states = np.zeros((num_steps, num_envs, *observation_shape), dtype=np.float32)
self.actions = np.zeros((num_steps, num_envs), dtype=np.int64)
self.rewards = np.zeros((num_steps, num_envs), dtype=np.float32)
self.flags = np.zeros((num_steps, num_envs), dtype=np.float32)
self.log_probs = np.zeros((num_steps, num_envs), dtype=np.float32)
self.probs = np.zeros((num_steps, num_envs, action_dim), dtype=np.float32)
self.values = np.zeros((num_steps, num_envs), dtype=np.float32)
self.step = 0
self.num_steps = num_steps
self.device = device
def push(self, state, action, reward, flag, log_prob, prob, value):
self.states[self.step] = state
self.actions[self.step] = action
self.rewards[self.step] = reward
self.flags[self.step] = flag
self.log_probs[self.step] = log_prob
self.probs[self.step] = prob
self.values[self.step] = value
self.step = (self.step + 1) % self.num_steps
def get(self):
return (
torch.from_numpy(self.states).to(self.device),
torch.from_numpy(self.actions).to(self.device),
torch.from_numpy(self.rewards).to(self.device),
torch.from_numpy(self.flags).to(self.device),
torch.from_numpy(self.log_probs).to(self.device),
torch.from_numpy(self.values).to(self.device),
)
def get_probs(self):
return torch.from_numpy(self.probs).to(self.device)
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
torch.nn.init.orthogonal_(layer.weight, std)
torch.nn.init.constant_(layer.bias, bias_const)
return layer
class Agent(nn.Module):
def __init__(self, action_dim, device):
super().__init__()
self.encoder = nn.Sequential(
layer_init(nn.Conv2d(4, 32, 8, stride=4)),
nn.ReLU(),
layer_init(nn.Conv2d(32, 64, 4, stride=2)),
nn.ReLU(),
layer_init(nn.Conv2d(64, 64, 3, stride=1)),
nn.ReLU(),
nn.Flatten(),
layer_init(nn.Linear(64 * 7 * 7, 512)),
nn.ReLU()
)
self.actor_net = layer_init(nn.Linear(512, action_dim), std=0.01)
self.critic_net = layer_init(nn.Linear(512, 1), std=1)
if device.type == 'cuda':
self.cuda()
def forward(self, state):
hidden = self.encoder(state)
actor_value = self.actor_net(hidden)
distribution = Categorical(logits=actor_value)
action = distribution.sample()
log_prob = distribution.log_prob(action)
value = self.critic_net(hidden).squeeze(-1)
return action, log_prob, value, distribution.probs
def evaluate(self, states, actions):
hidden = self.encoder(states)
actor_values = self.actor_net(hidden)
distribution = Categorical(logits=actor_values)
log_probs = distribution.log_prob(actions)
entropy = distribution.entropy()
values = self.critic_net(hidden).squeeze(-1)
return log_probs, values, entropy, distribution.probs
def critic(self, state):
return self.critic_net(self.encoder(state)).squeeze(-1)
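# Agent.forward is used during rollouts: it samples an action and returns it together
# with its log-probability, the state value, and the full action distribution.
# Agent.evaluate is used during updates: it re-scores stored state-action pairs and
# returns the new log-probabilities, values, entropy, and distribution (the latter only
# for the KL diagnostic).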
def train(env_id, seed):
args = get_args()
args.env_id = env_id
args.seed = seed
run_name = (
'ppo_clip' +
'_epoch_' + str(args.update_epochs) +
'_seed_' + str(args.seed)
)
# Save training logs
path_string = str(args.env_id)[4:] + '/' + run_name
writer = SummaryWriter(path_string)
writer.add_text(
'Hyperparameter',
'|param|value|\n|-|-|\n%s' % ('\n'.join([f'|{key}|{value}|' for key, value in vars(args).items()])),
)
# Initialize environments
envs = gym.vector.AsyncVectorEnv([make_env(args.env_id) for _ in range(args.num_envs)])
# State space and action space
observation_shape = envs.single_observation_space.shape
action_dim = envs.single_action_space.n
# Random seed
if args.seed:
numpy_rng = np.random.default_rng(args.seed)
torch.manual_seed(args.seed)
state, _ = envs.reset(seed=args.seed)
else:
numpy_rng = np.random.default_rng()
state, _ = envs.reset()
# Initialize agent
agent = Agent(action_dim, args.device)
# Initialize optimizer
optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate)
# Initialize buffer
rollout_buffer = Buffer(args.num_steps, args.num_envs, observation_shape, action_dim, args.device)
global_step = 0
start_time = time.time()
# Data collection
for _ in tqdm(range(args.num_updates)):
# Linear decay of learning rate
if args.lr_decay:
optimizer.param_groups[0]['lr'] -= (args.learning_rate - 1e-12) / args.num_updates
for _ in range(args.num_steps):
global_step += 1 * args.num_envs
with torch.no_grad():
action, log_prob, value, prob = agent(torch.from_numpy(state).to(args.device).float())
action = action.cpu().numpy()
# Update the environments
next_state, reward, terminated, truncated, all_info = envs.step(action)
# Save data
flag = 1.0 - np.logical_or(terminated, truncated)
log_prob = log_prob.cpu().numpy()
prob = prob.cpu().numpy()
value = value.cpu().numpy()
rollout_buffer.push(state, action, reward, flag, log_prob, prob, value)
state = next_state
if 'final_info' not in all_info:
continue
for info in all_info['final_info']:
if info is None:
continue
if 'episode' in info.keys():
writer.add_scalar('charts/episodic_return', info['episode']['r'], global_step)
break
# ---------------------- We have collected enough data, now let's start training ---------------------- #
states, actions, rewards, flags, log_probs, values = rollout_buffer.get()
probs = rollout_buffer.get_probs()
with torch.no_grad():
last_value = agent.critic(torch.from_numpy(next_state).to(args.device).float())
# Use GAE (Generalized Advantage Estimation) technique to estimate the advantage function and TD target
advantages = compute_advantages(rewards, flags, values, last_value, args)
td_target = advantages + values
# Flatten each batch
states = states.reshape(-1, *observation_shape)
actions = actions.reshape(-1)
log_probs = log_probs.reshape(-1)
probs = probs.reshape((-1, action_dim))
td_target = td_target.reshape(-1)
advantages = advantages.reshape(-1)
values = values.reshape(-1)
batch_indexes = np.arange(args.batch_size)
# Update the policy network and value network
for e in range(1, args.update_epochs + 1):
numpy_rng.shuffle(batch_indexes)
for start in range(0, args.batch_size, args.minibatch_size):
end = start + args.minibatch_size
index = batch_indexes[start:end]
# The latest outputs of the policy network and value network
new_log_probs, td_predict, entropy, new_probs = agent.evaluate(states[index], actions[index])
log_ratio = new_log_probs - log_probs[index]
ratios = log_ratio.exp()
# Compute KL divergence
d = torch.sum(
probs[index] * torch.log((probs[index] + 1e-12) / (new_probs + 1e-12)), 1
)
writer.add_scalar('charts/average_kld', d.mean(), global_step)
writer.add_scalar('others/min_kld', d.min(), global_step)
writer.add_scalar('others/max_kld', d.max(), global_step)
# Advantage normalization
b_advantages = advantages[index]
b_advantages = (b_advantages - b_advantages.mean()) / (b_advantages.std() + 1e-12)
# Policy loss
policy_loss_1 = b_advantages * ratios
policy_loss_2 = b_advantages * torch.clamp(
ratios, 1.0 - args.clip_epsilon, 1.0 + args.clip_epsilon
)
policy_loss = -torch.min(policy_loss_1, policy_loss_2).mean()
# Value loss
if args.clip_value_loss:
v_loss_un_clipped = (td_predict - td_target[index]) ** 2
                    # Clip around the old value estimates (as in the other scripts);
                    # clipping around td_target would make the clipped term a no-op.
                    v_clipped = values[index] + torch.clamp(
                        td_predict - values[index],
-args.clip_epsilon,
args.clip_epsilon,
)
v_loss_clipped = (v_clipped - td_target[index]) ** 2
v_loss_max = torch.max(v_loss_un_clipped, v_loss_clipped)
value_loss = 0.5 * v_loss_max.mean()
else:
value_loss = 0.5 * ((td_predict - td_target[index]) ** 2).mean()
# Policy entropy
entropy_loss = entropy.mean()
# Total loss
loss = policy_loss + value_loss * args.c_1 - entropy_loss * args.c_2
# Save the data during the training process
writer.add_scalar('losses/value_loss', value_loss.item(), global_step)
writer.add_scalar('losses/policy_loss', policy_loss.item(), global_step)
writer.add_scalar('losses/entropy', entropy_loss.item(), global_step)
writer.add_scalar('losses/delta', torch.abs(ratios - 1).mean().item(), global_step)
# Update network parameters
optimizer.zero_grad()
loss.backward()
clip_grad_norm_(agent.parameters(), args.clip_grad_norm)
optimizer.step()
explained_var = (
np.nan if torch.var(td_target) == 0 else 1 - torch.var(td_target - values) / torch.var(td_target)
)
writer.add_scalar('charts/learning_rate', optimizer.param_groups[0]['lr'], global_step)
writer.add_scalar('charts/SPS', int(global_step / (time.time() - start_time)), global_step)
writer.add_scalar('others/explained_var', explained_var, global_step)
envs.close()
writer.close()
def main():
for env_id in ['Breakout']:
for seed in [1, 2, 3]:
print(env_id, seed)
train('ALE/' + env_id + '-v5', seed)
if __name__ == '__main__':
main()

v5/ppo_early_stop.py (new file, 348 lines)
@@ -0,0 +1,348 @@
import argparse
import os
import time
import gymnasium as gym
import numpy as np
import torch
from stable_baselines3.common.atari_wrappers import FireResetEnv, EpisodicLifeEnv, ClipRewardEnv
from torch import nn, optim
from torch.distributions import Categorical
from torch.nn.utils.clip_grad import clip_grad_norm_
from torch.utils.tensorboard.writer import SummaryWriter
from tqdm import tqdm
def get_args():
parser = argparse.ArgumentParser()
    parser.add_argument('--exp_name', type=str, default=os.path.splitext(os.path.basename(__file__))[0])
parser.add_argument('--env_id', type=str, default='ALE/Breakout-v5')
parser.add_argument('--seed', type=int, default=1)
parser.add_argument('--use_cuda', type=bool, default=True)
parser.add_argument('--learning_rate', type=float, default=2.5e-4)
parser.add_argument('--lr_decay', type=bool, default=True)
parser.add_argument('--total_steps', type=int, default=int(1e7))
parser.add_argument('--num_envs', type=int, default=8)
parser.add_argument('--num_steps', type=int, default=128)
parser.add_argument('--update_epochs', type=int, default=8)
parser.add_argument('--num_mini_batches', type=int, default=4)
parser.add_argument('--gae_lambda', type=float, default=0.95)
parser.add_argument('--gamma', type=float, default=0.99)
parser.add_argument('--clip_value_loss', type=bool, default=True)
parser.add_argument('--c_1', type=float, default=1.0)
parser.add_argument('--c_2', type=float, default=0.01)
parser.add_argument('--clip_grad_norm', type=float, default=0.5)
parser.add_argument('--clip_epsilon', type=float, default=0.2)
args = parser.parse_args()
args.device = torch.device('cuda' if torch.cuda.is_available() and args.use_cuda else 'cpu')
args.batch_size = int(args.num_envs * args.num_steps)
args.minibatch_size = int(args.batch_size // args.num_mini_batches)
args.num_updates = int(args.total_steps // args.batch_size)
return args
def make_env(env_id):
def thunk():
env = gym.make(env_id, frameskip=1, repeat_action_probability=0.0, full_action_space=False)
env = gym.wrappers.RecordEpisodeStatistics(env)
if 'FIRE' in env.unwrapped.get_action_meanings():
env = FireResetEnv(env)
env = EpisodicLifeEnv(env)
env = ClipRewardEnv(env)
env = gym.wrappers.AtariPreprocessing(env, scale_obs=True)
env = gym.wrappers.FrameStack(env, 4)
return env
return thunk
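# Generalized Advantage Estimation (GAE):
# delta_t = r_t + gamma * (1 - done_t) * V(s_{t+1}) - V(s_t)
# A_t = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
# flags stores (1 - done_t), so terminal steps cut both the bootstrap and the recursion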
def compute_advantages(rewards, flags, values, last_value, args):
advantages = torch.zeros((args.num_steps, args.num_envs)).to(args.device)
adv = torch.zeros(args.num_envs).to(args.device)
for i in reversed(range(args.num_steps)):
returns = rewards[i] + args.gamma * flags[i] * last_value
delta = returns - values[i]
adv = delta + args.gamma * args.gae_lambda * flags[i] * adv
advantages[i] = adv
last_value = values[i]
return advantages
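# Fixed-size rollout buffer: one slot per (step, env); arrays are overwritten in place each update and converted to torch tensors in get()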
class Buffer:
def __init__(self, num_steps, num_envs, observation_shape, action_dim, device):
self.states = np.zeros((num_steps, num_envs, *observation_shape), dtype=np.float32)
self.actions = np.zeros((num_steps, num_envs), dtype=np.int64)
self.rewards = np.zeros((num_steps, num_envs), dtype=np.float32)
self.flags = np.zeros((num_steps, num_envs), dtype=np.float32)
self.log_probs = np.zeros((num_steps, num_envs), dtype=np.float32)
self.probs = np.zeros((num_steps, num_envs, action_dim), dtype=np.float32)
self.values = np.zeros((num_steps, num_envs), dtype=np.float32)
self.step = 0
self.num_steps = num_steps
self.device = device
def push(self, state, action, reward, flag, log_prob, prob, value):
self.states[self.step] = state
self.actions[self.step] = action
self.rewards[self.step] = reward
self.flags[self.step] = flag
self.log_probs[self.step] = log_prob
self.probs[self.step] = prob
self.values[self.step] = value
self.step = (self.step + 1) % self.num_steps
def get(self):
return (
torch.from_numpy(self.states).to(self.device),
torch.from_numpy(self.actions).to(self.device),
torch.from_numpy(self.rewards).to(self.device),
torch.from_numpy(self.flags).to(self.device),
torch.from_numpy(self.log_probs).to(self.device),
torch.from_numpy(self.values).to(self.device),
)
def get_probs(self):
return torch.from_numpy(self.probs).to(self.device)
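# Orthogonal weight initialization with a configurable gain and zero bias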
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
torch.nn.init.orthogonal_(layer.weight, std)
torch.nn.init.constant_(layer.bias, bias_const)
return layer
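# Nature-CNN encoder (three conv layers + a 512-unit fully connected layer) shared by
# the actor head and the critic head; the policy head uses a small gain (0.01) so the
# initial action distribution is close to uniform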
class Agent(nn.Module):
def __init__(self, action_dim, device):
super().__init__()
self.encoder = nn.Sequential(
layer_init(nn.Conv2d(4, 32, 8, stride=4)),
nn.ReLU(),
layer_init(nn.Conv2d(32, 64, 4, stride=2)),
nn.ReLU(),
layer_init(nn.Conv2d(64, 64, 3, stride=1)),
nn.ReLU(),
nn.Flatten(),
layer_init(nn.Linear(64 * 7 * 7, 512)),
nn.ReLU()
)
self.actor_net = layer_init(nn.Linear(512, action_dim), std=0.01)
self.critic_net = layer_init(nn.Linear(512, 1), std=1)
if device.type == 'cuda':
self.cuda()
def forward(self, state):
hidden = self.encoder(state)
actor_value = self.actor_net(hidden)
distribution = Categorical(logits=actor_value)
action = distribution.sample()
log_prob = distribution.log_prob(action)
value = self.critic_net(hidden).squeeze(-1)
return action, log_prob, value, distribution.probs
def evaluate(self, states, actions):
hidden = self.encoder(states)
actor_values = self.actor_net(hidden)
distribution = Categorical(logits=actor_values)
log_probs = distribution.log_prob(actions)
entropy = distribution.entropy()
values = self.critic_net(hidden).squeeze(-1)
return log_probs, values, entropy, distribution.probs
def critic(self, state):
return self.critic_net(self.encoder(state)).squeeze(-1)
def train(env_id, seed):
args = get_args()
args.env_id = env_id
args.seed = seed
run_name = (
'ppo_es' +
'_epoch_' + str(args.update_epochs) +
'_seed_' + str(args.seed)
)
# Save training logs
path_string = str(args.env_id)[4:] + '/' + run_name
writer = SummaryWriter(path_string)
writer.add_text(
'Hyperparameter',
'|param|value|\n|-|-|\n%s' % ('\n'.join([f'|{key}|{value}|' for key, value in vars(args).items()])),
)
# Initialize environments
envs = gym.vector.AsyncVectorEnv([make_env(args.env_id) for _ in range(args.num_envs)])
# State space and action space
observation_shape = envs.single_observation_space.shape
action_dim = envs.single_action_space.n
# Random seed
if args.seed:
numpy_rng = np.random.default_rng(args.seed)
torch.manual_seed(args.seed)
state, _ = envs.reset(seed=args.seed)
else:
numpy_rng = np.random.default_rng()
state, _ = envs.reset()
# Initialize agent
agent = Agent(action_dim, args.device)
# Initialize optimizer
optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate)
# Initialize buffer
rollout_buffer = Buffer(args.num_steps, args.num_envs, observation_shape, action_dim, args.device)
global_step = 0
start_time = time.time()
# Data collection
for _ in tqdm(range(args.num_updates)):
# Linear decay of learning rate
if args.lr_decay:
optimizer.param_groups[0]['lr'] -= (args.learning_rate - 1e-12) / args.num_updates
for _ in range(args.num_steps):
global_step += 1 * args.num_envs
with torch.no_grad():
action, log_prob, value, prob = agent(torch.from_numpy(state).to(args.device).float())
action = action.cpu().numpy()
# Update the environments
next_state, reward, terminated, truncated, all_info = envs.step(action)
# Save data
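# flag is 1.0 while the episode continues and 0.0 on terminal or truncated steps; it masks bootstrapping in GAE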
flag = 1.0 - np.logical_or(terminated, truncated)
log_prob = log_prob.cpu().numpy()
prob = prob.cpu().numpy()
value = value.cpu().numpy()
rollout_buffer.push(state, action, reward, flag, log_prob, prob, value)
state = next_state
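# With this gymnasium vector API, 'final_info' only appears on steps where at least one sub-environment finished an episode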
if 'final_info' not in all_info:
continue
for info in all_info['final_info']:
if info is None:
continue
if 'episode' in info.keys():
writer.add_scalar('charts/episodic_return', info['episode']['r'], global_step)
break
# ---------------------- We have collected enough data, now let's start training ---------------------- #
states, actions, rewards, flags, log_probs, values = rollout_buffer.get()
probs = rollout_buffer.get_probs()
with torch.no_grad():
last_value = agent.critic(torch.from_numpy(next_state).to(args.device).float())
# Use GAE (Generalized Advantage Estimation) technique to estimate the advantage function and TD target
advantages = compute_advantages(rewards, flags, values, last_value, args)
td_target = advantages + values
# Flatten each batch
states = states.reshape(-1, *observation_shape)
actions = actions.reshape(-1)
log_probs = log_probs.reshape(-1)
probs = probs.reshape((-1, action_dim))
td_target = td_target.reshape(-1)
advantages = advantages.reshape(-1)
values = values.reshape(-1)
batch_indexes = np.arange(args.batch_size)
# Update the policy network and value network
flag = True  # set to False once the KL target is exceeded, which also stops the remaining epochs
for e in range(1, args.update_epochs + 1):
numpy_rng.shuffle(batch_indexes)
for start in range(0, args.batch_size, args.minibatch_size):
end = start + args.minibatch_size
index = batch_indexes[start:end]
# The latest outputs of the policy network and value network
new_log_probs, td_predict, entropy, new_probs = agent.evaluate(states[index], actions[index])
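# Importance ratio pi_new(a|s) / pi_old(a|s), computed in log space for numerical stability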
log_ratio = new_log_probs - log_probs[index]
ratios = log_ratio.exp()
# Per-state KL divergence KL(pi_old || pi_new), computed from the stored action probabilities
d = torch.sum(
probs[index] * torch.log((probs[index] + 1e-12) / (new_probs + 1e-12)), 1
)
writer.add_scalar('charts/average_kld', d.mean(), global_step)
writer.add_scalar('others/min_kld', d.min(), global_step)
writer.add_scalar('others/max_kld', d.max(), global_step)
# Early stopping: abandon the rest of this epoch once the mean KL exceeds the 0.02 target
if d.mean() > 0.02:
flag = False
break
# Advantage normalization
b_advantages = advantages[index]
b_advantages = (b_advantages - b_advantages.mean()) / (b_advantages.std() + 1e-12)
# Policy loss: clipped surrogate -E[min(ratio * A, clip(ratio, 1 - eps, 1 + eps) * A)]
policy_loss_1 = b_advantages * ratios
policy_loss_2 = b_advantages * torch.clamp(
ratios, 1.0 - args.clip_epsilon, 1.0 + args.clip_epsilon
)
policy_loss = -torch.min(policy_loss_1, policy_loss_2).mean()
# Value loss
if args.clip_value_loss:
v_loss_un_clipped = (td_predict - td_target[index]) ** 2
v_clipped = td_target[index] + torch.clamp(
td_predict - td_target[index],
-args.clip_epsilon,
args.clip_epsilon,
)
v_loss_clipped = (v_clipped - td_target[index]) ** 2
v_loss_max = torch.max(v_loss_un_clipped, v_loss_clipped)
value_loss = 0.5 * v_loss_max.mean()
else:
value_loss = 0.5 * ((td_predict - td_target[index]) ** 2).mean()
# Policy entropy
entropy_loss = entropy.mean()
# Total loss
loss = policy_loss + value_loss * args.c_1 - entropy_loss * args.c_2
# Save the data during the training process
writer.add_scalar('losses/value_loss', value_loss.item(), global_step)
writer.add_scalar('losses/policy_loss', policy_loss.item(), global_step)
writer.add_scalar('losses/entropy', entropy_loss.item(), global_step)
writer.add_scalar('losses/delta', torch.abs(ratios - 1).mean().item(), global_step)
# Update network parameters
optimizer.zero_grad()
loss.backward()
clip_grad_norm_(agent.parameters(), args.clip_grad_norm)
optimizer.step()
# Early stopping: the KL target was exceeded inside the minibatch loop, so skip the remaining epochs as well
if flag is False:
break
explained_var = (
np.nan if torch.var(td_target) == 0 else 1 - torch.var(td_target - values) / torch.var(td_target)
)
writer.add_scalar('charts/learning_rate', optimizer.param_groups[0]['lr'], global_step)
writer.add_scalar('charts/SPS', int(global_step / (time.time() - start_time)), global_step)
writer.add_scalar('others/explained_var', explained_var, global_step)
envs.close()
writer.close()
def main():
for env_id in ['Breakout']:
for seed in [1, 2, 3]:
print(env_id, seed)
train('ALE/' + env_id + '-v5', seed)
if __name__ == '__main__':
main()

339
v5/ppo_penalty.py Normal file
View File

@ -0,0 +1,339 @@
import argparse
import os
import time
import gymnasium as gym
import numpy as np
import torch
from stable_baselines3.common.atari_wrappers import FireResetEnv, EpisodicLifeEnv, ClipRewardEnv
from torch import nn, optim
from torch.distributions import Categorical
from torch.nn.utils.clip_grad import clip_grad_norm_
from torch.utils.tensorboard.writer import SummaryWriter
from tqdm import tqdm
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--exp_name', type=str, default=os.path.splitext(os.path.basename(__file__))[0])
parser.add_argument('--env_id', type=str, default='ALE/Breakout-v5')
parser.add_argument('--seed', type=int, default=1)
parser.add_argument('--use_cuda', type=bool, default=True)
parser.add_argument('--learning_rate', type=float, default=2.5e-4)
parser.add_argument('--lr_decay', type=bool, default=True)
parser.add_argument('--total_steps', type=int, default=int(1e7))
parser.add_argument('--num_envs', type=int, default=8)
parser.add_argument('--num_steps', type=int, default=128)
parser.add_argument('--update_epochs', type=int, default=8)
parser.add_argument('--num_mini_batches', type=int, default=4)
parser.add_argument('--gae_lambda', type=float, default=0.95)
parser.add_argument('--gamma', type=float, default=0.99)
parser.add_argument('--clip_value_loss', type=bool, default=True)
parser.add_argument('--c_1', type=float, default=1.0)
parser.add_argument('--c_2', type=float, default=0.01)
parser.add_argument('--clip_grad_norm', type=float, default=0.5)
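# beta is the initial KL-penalty coefficient; it is halved or doubled during training depending on the observed KL (see the policy loss)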
parser.add_argument('--beta', type=float, default=1.0)
args = parser.parse_args()
args.device = torch.device('cuda' if torch.cuda.is_available() and args.use_cuda else 'cpu')
args.batch_size = int(args.num_envs * args.num_steps)
args.minibatch_size = int(args.batch_size // args.num_mini_batches)
args.num_updates = int(args.total_steps // args.batch_size)
return args
def make_env(env_id):
def thunk():
env = gym.make(env_id, frameskip=1, repeat_action_probability=0.0, full_action_space=False)
env = gym.wrappers.RecordEpisodeStatistics(env)
if 'FIRE' in env.unwrapped.get_action_meanings():
env = FireResetEnv(env)
env = EpisodicLifeEnv(env)
env = ClipRewardEnv(env)
env = gym.wrappers.AtariPreprocessing(env, scale_obs=True)
env = gym.wrappers.FrameStack(env, 4)
return env
return thunk
def compute_advantages(rewards, flags, values, last_value, args):
advantages = torch.zeros((args.num_steps, args.num_envs)).to(args.device)
adv = torch.zeros(args.num_envs).to(args.device)
for i in reversed(range(args.num_steps)):
returns = rewards[i] + args.gamma * flags[i] * last_value
delta = returns - values[i]
adv = delta + args.gamma * args.gae_lambda * flags[i] * adv
advantages[i] = adv
last_value = values[i]
return advantages
class Buffer:
def __init__(self, num_steps, num_envs, observation_shape, action_dim, device):
self.states = np.zeros((num_steps, num_envs, *observation_shape), dtype=np.float32)
self.actions = np.zeros((num_steps, num_envs), dtype=np.int64)
self.rewards = np.zeros((num_steps, num_envs), dtype=np.float32)
self.flags = np.zeros((num_steps, num_envs), dtype=np.float32)
self.log_probs = np.zeros((num_steps, num_envs), dtype=np.float32)
self.probs = np.zeros((num_steps, num_envs, action_dim), dtype=np.float32)
self.values = np.zeros((num_steps, num_envs), dtype=np.float32)
self.step = 0
self.num_steps = num_steps
self.device = device
def push(self, state, action, reward, flag, log_prob, prob, value):
self.states[self.step] = state
self.actions[self.step] = action
self.rewards[self.step] = reward
self.flags[self.step] = flag
self.log_probs[self.step] = log_prob
self.probs[self.step] = prob
self.values[self.step] = value
self.step = (self.step + 1) % self.num_steps
def get(self):
return (
torch.from_numpy(self.states).to(self.device),
torch.from_numpy(self.actions).to(self.device),
torch.from_numpy(self.rewards).to(self.device),
torch.from_numpy(self.flags).to(self.device),
torch.from_numpy(self.log_probs).to(self.device),
torch.from_numpy(self.values).to(self.device),
)
def get_probs(self):
return torch.from_numpy(self.probs).to(self.device)
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
torch.nn.init.orthogonal_(layer.weight, std)
torch.nn.init.constant_(layer.bias, bias_const)
return layer
class Agent(nn.Module):
def __init__(self, action_dim, device):
super().__init__()
self.encoder = nn.Sequential(
layer_init(nn.Conv2d(4, 32, 8, stride=4)),
nn.ReLU(),
layer_init(nn.Conv2d(32, 64, 4, stride=2)),
nn.ReLU(),
layer_init(nn.Conv2d(64, 64, 3, stride=1)),
nn.ReLU(),
nn.Flatten(),
layer_init(nn.Linear(64 * 7 * 7, 512)),
nn.ReLU()
)
self.actor_net = layer_init(nn.Linear(512, action_dim), std=0.01)
self.critic_net = layer_init(nn.Linear(512, 1), std=1)
if device.type == 'cuda':
self.cuda()
def forward(self, state):
hidden = self.encoder(state)
actor_value = self.actor_net(hidden)
distribution = Categorical(logits=actor_value)
action = distribution.sample()
log_prob = distribution.log_prob(action)
value = self.critic_net(hidden).squeeze(-1)
return action, log_prob, value, distribution.probs
def evaluate(self, states, actions):
hidden = self.encoder(states)
actor_values = self.actor_net(hidden)
distribution = Categorical(logits=actor_values)
log_probs = distribution.log_prob(actions)
entropy = distribution.entropy()
values = self.critic_net(hidden).squeeze(-1)
return log_probs, values, entropy, distribution.probs
def critic(self, state):
return self.critic_net(self.encoder(state)).squeeze(-1)
def train(env_id, seed):
args = get_args()
args.env_id = env_id
args.seed = seed
run_name = (
'ppo_penalty' +
'_epoch_' + str(args.update_epochs) +
'_seed_' + str(args.seed)
)
# Save training logs
path_string = str(args.env_id)[4:] + '/' + run_name
writer = SummaryWriter(path_string)
writer.add_text(
'Hyperparameter',
'|param|value|\n|-|-|\n%s' % ('\n'.join([f'|{key}|{value}|' for key, value in vars(args).items()])),
)
# Initialize environments
envs = gym.vector.AsyncVectorEnv([make_env(args.env_id) for _ in range(args.num_envs)])
# State space and action space
observation_shape = envs.single_observation_space.shape
action_dim = envs.single_action_space.n
# Random seed
if args.seed:
numpy_rng = np.random.default_rng(args.seed)
torch.manual_seed(args.seed)
state, _ = envs.reset(seed=args.seed)
else:
numpy_rng = np.random.default_rng()
state, _ = envs.reset()
# Initialize agent
agent = Agent(action_dim, args.device)
# Initialize optimizer
optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate)
# Initialize buffer
rollout_buffer = Buffer(args.num_steps, args.num_envs, observation_shape, action_dim, args.device)
global_step = 0
start_time = time.time()
# Data collection
for _ in tqdm(range(args.num_updates)):
# Linear decay of learning rate
if args.lr_decay:
optimizer.param_groups[0]['lr'] -= (args.learning_rate - 1e-12) / args.num_updates
for _ in range(args.num_steps):
global_step += 1 * args.num_envs
with torch.no_grad():
action, log_prob, value, prob = agent(torch.from_numpy(state).to(args.device).float())
action = action.cpu().numpy()
# Update the environments
next_state, reward, terminated, truncated, all_info = envs.step(action)
# Save data
flag = 1.0 - np.logical_or(terminated, truncated)
log_prob = log_prob.cpu().numpy()
prob = prob.cpu().numpy()
value = value.cpu().numpy()
rollout_buffer.push(state, action, reward, flag, log_prob, prob, value)
state = next_state
if 'final_info' not in all_info:
continue
for info in all_info['final_info']:
if info is None:
continue
if 'episode' in info.keys():
writer.add_scalar('charts/episodic_return', info['episode']['r'], global_step)
break
# ---------------------- We have collected enough data, now let's start training ---------------------- #
states, actions, rewards, flags, log_probs, values = rollout_buffer.get()
probs = rollout_buffer.get_probs()
with torch.no_grad():
last_value = agent.critic(torch.from_numpy(next_state).to(args.device).float())
# Use GAE (Generalized Advantage Estimation) technique to estimate the advantage function and TD target
advantages = compute_advantages(rewards, flags, values, last_value, args)
td_target = advantages + values
# Flatten each batch
states = states.reshape(-1, *observation_shape)
actions = actions.reshape(-1)
log_probs = log_probs.reshape(-1)
probs = probs.reshape((-1, action_dim))
td_target = td_target.reshape(-1)
advantages = advantages.reshape(-1)
values = values.reshape(-1)
batch_indexes = np.arange(args.batch_size)
# Update the policy network and value network
for e in range(1, args.update_epochs + 1):
numpy_rng.shuffle(batch_indexes)
for start in range(0, args.batch_size, args.minibatch_size):
end = start + args.minibatch_size
index = batch_indexes[start:end]
# The latest outputs of the policy network and value network
new_log_probs, td_predict, entropy, new_probs = agent.evaluate(states[index], actions[index])
log_ratio = new_log_probs - log_probs[index]
ratios = log_ratio.exp()
# Compute KL divergence
d = torch.sum(
probs[index] * torch.log((probs[index] + 1e-12) / (new_probs + 1e-12)), 1
)
writer.add_scalar('charts/average_kld', d.mean(), global_step)
writer.add_scalar('others/min_kld', d.min(), global_step)
writer.add_scalar('others/max_kld', d.max(), global_step)
# Advantage normalization
b_advantages = advantages[index]
b_advantages = (b_advantages - b_advantages.mean()) / (b_advantages.std() + 1e-12)
# Policy loss with an adaptive KL penalty: beta is halved when the mean KL falls well below the 0.02 target and doubled when it overshoots, then the penalized surrogate is minimized
if d.mean() < 0.02 / 1.5:
args.beta = args.beta / 2
if d.mean() > 0.02 * 1.5:
args.beta = args.beta * 2
policy_loss = -(b_advantages * ratios - args.beta * d).mean()
# Value loss
if args.clip_value_loss:
v_loss_un_clipped = (td_predict - td_target[index]) ** 2
v_clipped = td_target[index] + torch.clamp(
td_predict - td_target[index],
-0.2,
0.2,
)
v_loss_clipped = (v_clipped - td_target[index]) ** 2
v_loss_max = torch.max(v_loss_un_clipped, v_loss_clipped)
value_loss = 0.5 * v_loss_max.mean()
else:
value_loss = 0.5 * ((td_predict - td_target[index]) ** 2).mean()
# Policy entropy
entropy_loss = entropy.mean()
# Total loss
loss = policy_loss + value_loss * args.c_1 - entropy_loss * args.c_2
# Save the data during the training process
writer.add_scalar('losses/value_loss', value_loss.item(), global_step)
writer.add_scalar('losses/policy_loss', policy_loss.item(), global_step)
writer.add_scalar('losses/entropy', entropy_loss.item(), global_step)
writer.add_scalar('losses/delta', torch.abs(ratios - 1).mean().item(), global_step)
# Update network parameters
optimizer.zero_grad()
loss.backward()
clip_grad_norm_(agent.parameters(), args.clip_grad_norm)
optimizer.step()
explained_var = (
np.nan if torch.var(td_target) == 0 else 1 - torch.var(td_target - values) / torch.var(td_target)
)
writer.add_scalar('charts/learning_rate', optimizer.param_groups[0]['lr'], global_step)
writer.add_scalar('charts/SPS', int(global_step / (time.time() - start_time)), global_step)
writer.add_scalar('others/explained_var', explained_var, global_step)
envs.close()
writer.close()
def main():
for env_id in ['Breakout']:
for seed in [1, 2, 3]:
print(env_id, seed)
train('ALE/' + env_id + '-v5', seed)
if __name__ == '__main__':
main()

347
v5/spo.py Normal file
View File

@ -0,0 +1,347 @@
import argparse
import os
import time
import gymnasium as gym
import numpy as np
import torch
from stable_baselines3.common.atari_wrappers import FireResetEnv, EpisodicLifeEnv, ClipRewardEnv
from torch import nn, optim
from torch.distributions import Categorical
from torch.nn.utils.clip_grad import clip_grad_norm_
from torch.utils.tensorboard.writer import SummaryWriter
from tqdm import tqdm
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--exp_name', type=str, default=os.path.splitext(os.path.basename(__file__))[0])
parser.add_argument('--env_id', type=str, default='ALE/Breakout-v5')
parser.add_argument('--seed', type=int, default=1)
parser.add_argument('--use_cuda', type=bool, default=True)
parser.add_argument('--learning_rate', type=float, default=2.5e-4)
parser.add_argument('--lr_decay', type=bool, default=True)
parser.add_argument('--total_steps', type=int, default=int(1e7))
parser.add_argument('--num_envs', type=int, default=8)
parser.add_argument('--num_steps', type=int, default=128)
parser.add_argument('--update_epochs', type=int, default=8)
parser.add_argument('--num_mini_batches', type=int, default=4)
parser.add_argument('--gae_lambda', type=float, default=0.95)
parser.add_argument('--gamma', type=float, default=0.99)
parser.add_argument('--clip_value_loss', type=bool, default=True)
parser.add_argument('--c_1', type=float, default=1.0)
parser.add_argument('--c_2', type=float, default=0.01)
parser.add_argument('--clip_grad_norm', type=float, default=0.5)
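# kld_max is the per-state KL budget that SPO clips against in the policy loss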
parser.add_argument('--kld_max', type=float, default=0.02)
args = parser.parse_args()
args.device = torch.device('cuda' if torch.cuda.is_available() and args.use_cuda else 'cpu')
args.batch_size = int(args.num_envs * args.num_steps)
args.minibatch_size = int(args.batch_size // args.num_mini_batches)
args.num_updates = int(args.total_steps // args.batch_size)
return args
def make_env(env_id):
def thunk():
env = gym.make(env_id, frameskip=1, repeat_action_probability=0.0, full_action_space=False)
env = gym.wrappers.RecordEpisodeStatistics(env)
if 'FIRE' in env.unwrapped.get_action_meanings():
env = FireResetEnv(env)
env = EpisodicLifeEnv(env)
env = ClipRewardEnv(env)
env = gym.wrappers.AtariPreprocessing(env, scale_obs=True)
env = gym.wrappers.FrameStack(env, 4)
return env
return thunk
def compute_advantages(rewards, flags, values, last_value, args):
advantages = torch.zeros((args.num_steps, args.num_envs)).to(args.device)
adv = torch.zeros(args.num_envs).to(args.device)
for i in reversed(range(args.num_steps)):
returns = rewards[i] + args.gamma * flags[i] * last_value
delta = returns - values[i]
adv = delta + args.gamma * args.gae_lambda * flags[i] * adv
advantages[i] = adv
last_value = values[i]
return advantages
class Buffer:
def __init__(self, num_steps, num_envs, observation_shape, action_dim, device):
self.states = np.zeros((num_steps, num_envs, *observation_shape), dtype=np.float32)
self.actions = np.zeros((num_steps, num_envs), dtype=np.int64)
self.rewards = np.zeros((num_steps, num_envs), dtype=np.float32)
self.flags = np.zeros((num_steps, num_envs), dtype=np.float32)
self.log_probs = np.zeros((num_steps, num_envs), dtype=np.float32)
self.probs = np.zeros((num_steps, num_envs, action_dim), dtype=np.float32)
self.values = np.zeros((num_steps, num_envs), dtype=np.float32)
self.step = 0
self.num_steps = num_steps
self.device = device
def push(self, state, action, reward, flag, log_prob, prob, value):
self.states[self.step] = state
self.actions[self.step] = action
self.rewards[self.step] = reward
self.flags[self.step] = flag
self.log_probs[self.step] = log_prob
self.probs[self.step] = prob
self.values[self.step] = value
self.step = (self.step + 1) % self.num_steps
def get(self):
return (
torch.from_numpy(self.states).to(self.device),
torch.from_numpy(self.actions).to(self.device),
torch.from_numpy(self.rewards).to(self.device),
torch.from_numpy(self.flags).to(self.device),
torch.from_numpy(self.log_probs).to(self.device),
torch.from_numpy(self.values).to(self.device),
)
def get_probs(self):
return torch.from_numpy(self.probs).to(self.device)
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
torch.nn.init.orthogonal_(layer.weight, std)
torch.nn.init.constant_(layer.bias, bias_const)
return layer
class Agent(nn.Module):
def __init__(self, action_dim, device):
super().__init__()
self.encoder = nn.Sequential(
layer_init(nn.Conv2d(4, 32, 8, stride=4)),
nn.ReLU(),
layer_init(nn.Conv2d(32, 64, 4, stride=2)),
nn.ReLU(),
layer_init(nn.Conv2d(64, 64, 3, stride=1)),
nn.ReLU(),
nn.Flatten(),
layer_init(nn.Linear(64 * 7 * 7, 512)),
nn.ReLU()
)
self.actor_net = layer_init(nn.Linear(512, action_dim), std=0.01)
self.critic_net = layer_init(nn.Linear(512, 1), std=1)
if device.type == 'cuda':
self.cuda()
def forward(self, state):
hidden = self.encoder(state)
actor_value = self.actor_net(hidden)
distribution = Categorical(logits=actor_value)
action = distribution.sample()
log_prob = distribution.log_prob(action)
value = self.critic_net(hidden).squeeze(-1)
return action, log_prob, value, distribution.probs
def evaluate(self, states, actions):
hidden = self.encoder(states)
actor_values = self.actor_net(hidden)
distribution = Categorical(logits=actor_values)
log_probs = distribution.log_prob(actions)
entropy = distribution.entropy()
values = self.critic_net(hidden).squeeze(-1)
return log_probs, values, entropy, distribution.probs
def critic(self, state):
return self.critic_net(self.encoder(state)).squeeze(-1)
def train(env_id, seed):
args = get_args()
args.env_id = env_id
args.seed = seed
run_name = (
'spo_' + str(args.kld_max) +
'_epoch_' + str(args.update_epochs) +
'_seed_' + str(args.seed)
)
# Save training logs
path_string = str(args.env_id)[4:] + '/' + run_name
writer = SummaryWriter(path_string)
writer.add_text(
'Hyperparameter',
'|param|value|\n|-|-|\n%s' % ('\n'.join([f'|{key}|{value}|' for key, value in vars(args).items()])),
)
# Initialize environments
envs = gym.vector.AsyncVectorEnv([make_env(args.env_id) for _ in range(args.num_envs)])
# State space and action space
observation_shape = envs.single_observation_space.shape
action_dim = envs.single_action_space.n
# Random seed
if args.seed:
numpy_rng = np.random.default_rng(args.seed)
torch.manual_seed(args.seed)
state, _ = envs.reset(seed=args.seed)
else:
numpy_rng = np.random.default_rng()
state, _ = envs.reset()
# Initialize agent
agent = Agent(action_dim, args.device)
# Initialize optimizer
optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate)
# Initialize buffer
rollout_buffer = Buffer(args.num_steps, args.num_envs, observation_shape, action_dim, args.device)
global_step = 0
start_time = time.time()
# Data collection
for _ in tqdm(range(args.num_updates)):
# Linear decay of learning rate
if args.lr_decay:
optimizer.param_groups[0]['lr'] -= (args.learning_rate - 1e-12) / args.num_updates
for _ in range(args.num_steps):
global_step += 1 * args.num_envs
with torch.no_grad():
action, log_prob, value, prob = agent(torch.from_numpy(state).to(args.device).float())
action = action.cpu().numpy()
# Update the environments
next_state, reward, terminated, truncated, all_info = envs.step(action)
# Save data
flag = 1.0 - np.logical_or(terminated, truncated)
log_prob = log_prob.cpu().numpy()
prob = prob.cpu().numpy()
value = value.cpu().numpy()
rollout_buffer.push(state, action, reward, flag, log_prob, prob, value)
state = next_state
if 'final_info' not in all_info:
continue
for info in all_info['final_info']:
if info is None:
continue
if 'episode' in info.keys():
writer.add_scalar('charts/episodic_return', info['episode']['r'], global_step)
break
# ---------------------- We have collected enough data, now let's start training ---------------------- #
states, actions, rewards, flags, log_probs, values = rollout_buffer.get()
probs = rollout_buffer.get_probs()
with torch.no_grad():
last_value = agent.critic(torch.from_numpy(next_state).to(args.device).float())
# Use GAE (Generalized Advantage Estimation) technique to estimate the advantage function and TD target
advantages = compute_advantages(rewards, flags, values, last_value, args)
td_target = advantages + values
# Flatten each batch
states = states.reshape(-1, *observation_shape)
actions = actions.reshape(-1)
log_probs = log_probs.reshape(-1)
probs = probs.reshape((-1, action_dim))
td_target = td_target.reshape(-1)
advantages = advantages.reshape(-1)
values = values.reshape(-1)
batch_indexes = np.arange(args.batch_size)
# Update the policy network and value network
for e in range(1, args.update_epochs + 1):
numpy_rng.shuffle(batch_indexes)
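# t indexes minibatches within the epoch; together with e it identifies the very first minibatch of an update, which the policy loss treats specially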
t = 0
for start in range(0, args.batch_size, args.minibatch_size):
t += 1
end = start + args.minibatch_size
index = batch_indexes[start:end]
# The latest outputs of the policy network and value network
new_log_probs, td_predict, entropy, new_probs = agent.evaluate(states[index], actions[index])
log_ratio = new_log_probs - log_probs[index]
ratios = log_ratio.exp()
# Compute KL divergence
d = torch.sum(
probs[index] * torch.log((probs[index] + 1e-12) / (new_probs + 1e-12)), 1
)
writer.add_scalar('charts/average_kld', d.mean(), global_step)
writer.add_scalar('others/min_kld', d.min(), global_step)
writer.add_scalar('others/max_kld', d.max(), global_step)
# Advantage normalization
b_advantages = advantages[index]
b_advantages = (b_advantages - b_advantages.mean()) / (b_advantages.std() + 1e-12)
# Policy loss (main code of SPO)
# The very first minibatch of an update uses the plain surrogate, since the new and
# old policies still coincide (KL = 0) and the KL-based scaling below is degenerate
if e == 1 and t == 1:
policy_loss = (-b_advantages * ratios).mean()
else:
# Clip the per-state KL at the allowed budget kld_max
d_clip = torch.clamp(input=d, min=0, max=args.kld_max)
# d_clip / d equals 1 while the KL is within budget and decays toward 0 beyond it
ratio = d_clip / (d + 1e-12)
# Sign of the normalized advantage
sign_a = torch.sign(b_advantages)
# Scaling factor (d_clip / d + sign_a - 1) * sign_a: equal to 1 while the KL is within
# budget, otherwise it rescales the surrogate according to the advantage's sign
result = (ratio + sign_a - 1) * sign_a
policy_loss = (-b_advantages * ratios * result).mean()
# Value loss
if args.clip_value_loss:
v_loss_un_clipped = (td_predict - td_target[index]) ** 2
v_clipped = td_target[index] + torch.clamp(
td_predict - td_target[index],
-0.2,
0.2,
)
v_loss_clipped = (v_clipped - td_target[index]) ** 2
v_loss_max = torch.max(v_loss_un_clipped, v_loss_clipped)
value_loss = 0.5 * v_loss_max.mean()
else:
value_loss = 0.5 * ((td_predict - td_target[index]) ** 2).mean()
# Policy entropy
entropy_loss = entropy.mean()
# Total loss
loss = policy_loss + value_loss * args.c_1 - entropy_loss * args.c_2
# Save the data during the training process
writer.add_scalar('losses/value_loss', value_loss.item(), global_step)
writer.add_scalar('losses/policy_loss', policy_loss.item(), global_step)
writer.add_scalar('losses/entropy', entropy_loss.item(), global_step)
writer.add_scalar('losses/delta', torch.abs(ratios - 1).mean().item(), global_step)
# Update network parameters
optimizer.zero_grad()
loss.backward()
clip_grad_norm_(agent.parameters(), args.clip_grad_norm)
optimizer.step()
explained_var = (
np.nan if torch.var(td_target) == 0 else 1 - torch.var(td_target - values) / torch.var(td_target)
)
writer.add_scalar('charts/learning_rate', optimizer.param_groups[0]['lr'], global_step)
writer.add_scalar('charts/SPS', int(global_step / (time.time() - start_time)), global_step)
writer.add_scalar('others/explained_var', explained_var, global_step)
envs.close()
writer.close()
def main():
for env_id in ['Breakout']:
for seed in [1, 2, 3]:
print(env_id, seed)
train('ALE/' + env_id + '-v5', seed)
if __name__ == '__main__':
main()