add initial code
This commit is contained in:
parent 0e16595372
commit 9c801f0adc

420  PPO/pytorch/ppo_pong_pytorch.py  (new file)
@@ -0,0 +1,420 @@
import gym
from gym.envs.registration import register

import torch
import torch.nn as nn
from torch.distributions import Categorical
from torch.distributions.kl import kl_divergence
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torch.utils.tensorboard import SummaryWriter

import numpy as np
import sys
import numpy
import time
import datetime

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
dataType = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor

class Utils():
    def prepro(self, I):
        I = I[35:195]      # crop the playing area
        I = I[::2, ::2, 0] # downsample by factor of 2 and keep one colour channel
        I[I == 144] = 0    # erase background (background type 1)
        I[I == 109] = 0    # erase background (background type 2)
        I[I != 0] = 1      # everything else (paddles, ball) just set to 1
        X = I.astype(np.float32).ravel() # flatten into a single 1D array
        return X
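
# Note (illustration only, not part of the original file): prepro crops the 210x160x3 Atari
# frame to the 160x160 playing area, keeps every second pixel of one colour channel and
# flattens the result, so the output is an 80 * 80 = 6400 element float32 vector -- the
# state_dim used by the Pong scripts. A quick hypothetical check:
#   dummy_frame = np.zeros((210, 160, 3), dtype = np.uint8)
#   assert Utils().prepro(dummy_frame).shape == (6400,)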

class Actor_Model(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Actor_Model, self).__init__()

        self.nn_layer = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, action_dim),
            nn.Softmax(-1)
        ).float().to(device)

    def forward(self, states):
        return self.nn_layer(states)

class Critic_Model(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Critic_Model, self).__init__()

        self.nn_layer = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        ).float().to(device)

    def forward(self, states):
        return self.nn_layer(states)
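
# Note (illustration only, not part of the original file): the actor ends in Softmax(-1), so
# it outputs a probability vector over the action_dim discrete actions, while the critic
# outputs a single state value. The Discrete helper below wraps the actor output in
# torch.distributions.Categorical for sampling, log-probabilities, entropy and KL, e.g.
#   actor = Actor_Model(state_dim = 4, action_dim = 2)
#   probs = actor(torch.zeros(1, 4).to(device))   # shape (1, 2), each row sums to 1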

class Memory(Dataset):
    def __init__(self):
        self.actions = []
        self.states = []
        self.rewards = []
        self.dones = []
        self.next_states = []

    def __len__(self):
        return len(self.dones)

    def __getitem__(self, idx):
        return (np.array(self.states[idx], dtype = np.float32),
                np.array(self.actions[idx], dtype = np.float32),
                np.array([self.rewards[idx]], dtype = np.float32),
                np.array([self.dones[idx]], dtype = np.float32),
                np.array(self.next_states[idx], dtype = np.float32))

    def save_eps(self, state, action, reward, done, next_state):
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.dones.append(done)
        self.next_states.append(next_state)

    def clear_memory(self):
        del self.actions[:]
        del self.states[:]
        del self.rewards[:]
        del self.dones[:]
        del self.next_states[:]
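
# Usage sketch (illustration only, not part of the original file): because Memory subclasses
# torch.utils.data.Dataset, update_ppo() below can hand it straight to a DataLoader.
# Hypothetical example:
#   memory = Memory()
#   memory.save_eps(state = [0.0] * 4, action = 1, reward = 1.0, done = 0.0, next_state = [0.1] * 4)
#   loader = DataLoader(memory, batch_size = 2, shuffle = False)
#   for states, actions, rewards, dones, next_states in loader:
#       pass  # each tensor carries a leading batch dimension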

class Discrete():
    def sample(self, datas):
        distribution = Categorical(datas)
        return distribution.sample().float().to(device)

    def entropy(self, datas):
        distribution = Categorical(datas)
        return distribution.entropy().float().to(device)

    def logprob(self, datas, value_data):
        distribution = Categorical(datas)
        return distribution.log_prob(value_data).unsqueeze(1).float().to(device)

    def kl_divergence(self, datas1, datas2):
        distribution1 = Categorical(datas1)
        distribution2 = Categorical(datas2)

        return kl_divergence(distribution1, distribution2).unsqueeze(1).float().to(device)

class PolicyFunction():
    def __init__(self, gamma = 0.99, lam = 0.95):
        self.gamma = gamma
        self.lam = lam

    def monte_carlo_discounted(self, rewards, dones):
        running_add = 0
        returns = []

        for step in reversed(range(len(rewards))):
            running_add = rewards[step] + (1.0 - dones[step]) * self.gamma * running_add
            returns.insert(0, running_add)

        return torch.stack(returns)

    def temporal_difference(self, reward, next_value, done):
        q_values = reward + (1 - done) * self.gamma * next_value
        return q_values

    def generalized_advantage_estimation(self, values, rewards, next_values, dones):
        gae = 0
        adv = []

        delta = rewards + (1.0 - dones) * self.gamma * next_values - values
        for step in reversed(range(len(rewards))):
            gae = delta[step] + (1.0 - dones[step]) * self.gamma * self.lam * gae
            adv.insert(0, gae)

        return torch.stack(adv)
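
# A worked note on generalized_advantage_estimation above (illustration only, not part of
# the original file). Per step it computes
#   delta_t = r_t + gamma * (1 - d_t) * V(s_{t+1}) - V(s_t)
#   A_t     = delta_t + gamma * lam * (1 - d_t) * A_{t+1}
# accumulated backwards over the trajectory. A hypothetical call with (T, 1) shaped tensors,
# matching what the DataLoader delivers:
#   pf  = PolicyFunction(gamma = 0.99, lam = 0.95)
#   adv = pf.generalized_advantage_estimation(
#       values      = torch.tensor([[0.5], [0.6], [0.7]]),
#       rewards     = torch.tensor([[0.0], [0.0], [1.0]]),
#       next_values = torch.tensor([[0.6], [0.7], [0.0]]),
#       dones       = torch.tensor([[0.0], [0.0], [1.0]]))   # adv has shape (3, 1)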

class TrulyPPO():
    def __init__(self, policy_kl_range, policy_params, value_clip, vf_loss_coef, entropy_coef, gamma, lam):
        self.policy_kl_range = policy_kl_range
        self.policy_params = policy_params
        self.value_clip = value_clip
        self.vf_loss_coef = vf_loss_coef
        self.entropy_coef = entropy_coef

        self.distributions = Discrete()
        self.policy_function = PolicyFunction(gamma, lam)

    # Loss for PPO
    def compute_loss(self, action_probs, old_action_probs, values, old_values, next_values, actions, rewards, dones):
        # Don't backpropagate through the old networks
        Old_values = old_values.detach()
        Old_action_probs = old_action_probs.detach()

        # Getting the generalized advantage estimates and returns
        Advantages = self.policy_function.generalized_advantage_estimation(values, rewards, next_values, dones)
        Returns = (Advantages + values).detach()
        Advantages = ((Advantages - Advantages.mean()) / (Advantages.std() + 1e-6)).detach()

        # Finding the ratio (pi_theta / pi_theta__old):
        logprobs = self.distributions.logprob(action_probs, actions)
        Old_logprobs = self.distributions.logprob(Old_action_probs, actions).detach()

        # Finding the surrogate loss
        ratios = (logprobs - Old_logprobs).exp() # ratios = exp(logprobs - old_logprobs) = pi_theta / pi_theta_old
        Kl = self.distributions.kl_divergence(old_action_probs, action_probs)

        pg_targets = torch.where(
            (Kl >= self.policy_kl_range) & (ratios > 1),
            ratios * Advantages - self.policy_params * Kl,
            ratios * Advantages
        )
        pg_loss = pg_targets.mean()

        # Getting the entropy of the action probability
        dist_entropy = self.distributions.entropy(action_probs).mean()

        # Getting the critic loss by using the clipped critic value
        if self.value_clip is None:
            critic_loss = ((Returns - values).pow(2) * 0.5).mean()
        else:
            vpredclipped = old_values + torch.clamp(values - Old_values, -self.value_clip, self.value_clip) # Minimize the difference between old value and new value
            vf_losses1 = (Returns - values).pow(2) * 0.5       # Mean Squared Error
            vf_losses2 = (Returns - vpredclipped).pow(2) * 0.5 # Mean Squared Error
            critic_loss = torch.max(vf_losses1, vf_losses2).mean()

        # We need to maximize the policy objective and the entropy bonus while minimizing the
        # critic loss, so the first two enter the total loss with a minus sign
        loss = (critic_loss * self.vf_loss_coef) - (dist_entropy * self.entropy_coef) - pg_loss
        return loss
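
# How the Truly PPO surrogate above behaves (explanatory note, not part of the original file):
# each sample contributes
#   ratio * A                                   if KL(old, new) < policy_kl_range or ratio <= 1
#   ratio * A - policy_params * KL(old, new)    otherwise,
# so once the new policy drifts more than policy_kl_range away from the old one while still
# increasing the ratio, the KL term (scaled by policy_params) rolls the update back instead
# of hard-clipping the ratio as in vanilla PPO. The total loss negates this term because the
# optimizers in Agent.training_ppo minimize.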

class Agent():
    def __init__(self, state_dim, action_dim, is_training_mode, policy_kl_range, policy_params, value_clip, entropy_coef, vf_loss_coef,
                 batchsize, PPO_epochs, gamma, lam, learning_rate):
        self.policy_kl_range = policy_kl_range
        self.policy_params = policy_params
        self.value_clip = value_clip
        self.entropy_coef = entropy_coef
        self.vf_loss_coef = vf_loss_coef
        self.batchsize = batchsize
        self.PPO_epochs = PPO_epochs
        self.is_training_mode = is_training_mode
        self.action_dim = action_dim

        self.actor = Actor_Model(state_dim, action_dim)
        self.actor_old = Actor_Model(state_dim, action_dim)
        self.actor_optimizer = Adam(self.actor.parameters(), lr = learning_rate)

        self.critic = Critic_Model(state_dim, action_dim)
        self.critic_old = Critic_Model(state_dim, action_dim)
        self.critic_optimizer = Adam(self.critic.parameters(), lr = learning_rate)

        self.memory = Memory()
        self.policy_function = PolicyFunction(gamma, lam)

        self.distributions = Discrete()
        self.policy_loss = TrulyPPO(policy_kl_range, policy_params, value_clip, vf_loss_coef, entropy_coef, gamma, lam)

        if is_training_mode:
            self.actor.train()
            self.critic.train()
        else:
            self.actor.eval()
            self.critic.eval()

    def save_eps(self, state, action, reward, done, next_state):
        self.memory.save_eps(state, action, reward, done, next_state)

    def act(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device).detach()
        action_probs = self.actor(state)

        # We don't need to sample the action in test mode;
        # we only sample in training mode in order to explore the actions
        if self.is_training_mode:
            # Sample the action
            action = self.distributions.sample(action_probs)
        else:
            action = torch.argmax(action_probs, 1)

        return action.int().cpu().item()

    # Get the loss and do backpropagation
    def training_ppo(self, states, actions, rewards, dones, next_states):
        action_probs, values = self.actor(states), self.critic(states)
        old_action_probs, old_values = self.actor_old(states), self.critic_old(states)
        next_values = self.critic(next_states)

        loss = self.policy_loss.compute_loss(action_probs, old_action_probs, values, old_values, next_values, actions, rewards, dones)

        self.actor_optimizer.zero_grad()
        self.critic_optimizer.zero_grad()

        loss.backward()

        self.actor_optimizer.step()
        self.critic_optimizer.step()

    # Update the model
    def update_ppo(self):
        dataloader = DataLoader(self.memory, self.batchsize, shuffle = False)

        # Optimize the policy for K epochs:
        for _ in range(self.PPO_epochs):
            for states, actions, rewards, dones, next_states in dataloader:
                self.training_ppo(states.float().to(device), actions.float().to(device), rewards.float().to(device), dones.float().to(device), next_states.float().to(device))

        # Clear the memory
        self.memory.clear_memory()

        # Copy the new weights into the old policy:
        self.actor_old.load_state_dict(self.actor.state_dict())
        self.critic_old.load_state_dict(self.critic.state_dict())

    def save_weights(self):
        torch.save({
            'model_state_dict': self.actor.state_dict(),
            'optimizer_state_dict': self.actor_optimizer.state_dict()
        }, 'SlimeVolley/actor.tar')

        torch.save({
            'model_state_dict': self.critic.state_dict(),
            'optimizer_state_dict': self.critic_optimizer.state_dict()
        }, 'SlimeVolley/critic.tar')

    def load_weights(self):
        actor_checkpoint = torch.load('SlimeVolley/actor.tar')
        self.actor.load_state_dict(actor_checkpoint['model_state_dict'])
        self.actor_optimizer.load_state_dict(actor_checkpoint['optimizer_state_dict'])

        critic_checkpoint = torch.load('SlimeVolley/critic.tar')
        self.critic.load_state_dict(critic_checkpoint['model_state_dict'])
        self.critic_optimizer.load_state_dict(critic_checkpoint['optimizer_state_dict'])

class Runner():
    def __init__(self, env, agent, render, training_mode, n_update):
        self.env = env
        self.agent = agent
        self.render = render
        self.training_mode = training_mode

        self.n_update = n_update
        self.t_updates = 0
        self.utils = Utils()

    def run_episode(self):
        ############################################
        obs = self.env.reset()
        obs = self.utils.prepro(obs)
        state = obs    # the first state is the preprocessed first frame

        done = False
        total_reward = 0
        eps_time = 0
        ############################################
        for _ in range(10000):
            action = self.agent.act(state)
            action_gym = action + 1 if action != 0 else 0

            next_obs, reward, done, _ = self.env.step(action_gym)
            next_obs = self.utils.prepro(next_obs)
            next_state = next_obs - obs    # use the frame difference as the state

            eps_time += 1
            self.t_updates += 1
            total_reward += reward

            if self.training_mode:
                self.agent.save_eps(state.tolist(), action, reward, float(done), next_state.tolist())

            state = next_state
            obs = next_obs

            if self.render:
                self.env.render()

            if self.training_mode and self.n_update is not None and self.t_updates == self.n_update:
                self.agent.update_ppo()
                self.t_updates = 0

            if done:
                break

        if self.training_mode and self.n_update is None:
            self.agent.update_ppo()

        return total_reward, eps_time

def main():
    ############## Hyperparameters ##############
    load_weights = False        # Set this to True if you want to load a saved agent
    save_weights = False        # Set this to True if you want to save the agent
    training_mode = True        # Set this to True to train the agent; set it to False to only test a trained agent
    reward_threshold = 495      # Reward threshold: learning stops once the reward passes it. Set to None to turn this off
    using_google_drive = False

    render = False              # Set this to True to display the screen. Turn it off if you run this in Google Colab
    n_update = 128              # How many steps to collect before updating the policy. Recommended: 128 for Discrete
    n_plot_batch = 100000000    # How many episodes between plots of the results
    n_episode = 100000          # How many episodes to run
    n_saved = 10                # How many episodes to run before saving the weights

    policy_kl_range = 0.0008    # Recommended: 0.0008 for Discrete
    policy_params = 20          # Recommended: 20 for Discrete
    value_clip = 1.0            # How much the critic value is clipped. Recommended: the highest or lowest possible reward
    entropy_coef = 0.05         # Weight of the entropy bonus (more entropy means more exploration)
    vf_loss_coef = 1.0          # Just set to 1
    batchsize = 32              # Batch size used by the DataLoader during each update
    PPO_epochs = 4              # How many epochs per update

    gamma = 0.99                # Just set to 0.99
    lam = 0.95                  # Just set to 0.95
    learning_rate = 2.5e-4      # Just set to 2.5e-4
    #############################################
    writer = SummaryWriter()

    env_name = 'PongDeterministic-v4' # Set the env you want
    env = gym.make(env_name)

    state_dim = 80 * 80    # size of the preprocessed frame produced by Utils.prepro
    action_dim = 3         # stay, up, down (mapped to the gym actions in Runner.run_episode)

    agent = Agent(state_dim, action_dim, training_mode, policy_kl_range, policy_params, value_clip, entropy_coef, vf_loss_coef,
                  batchsize, PPO_epochs, gamma, lam, learning_rate)

    runner = Runner(env, agent, render, training_mode, n_update)
    #############################################
    if using_google_drive:
        from google.colab import drive
        drive.mount('/test')

    if load_weights:
        agent.load_weights()
        print('Weight Loaded')

    print('Run the training!!')
    start = time.time()

    try:
        for i_episode in range(1, n_episode + 1):
            total_reward, eps_time = runner.run_episode()

            print('Episode: {} \t t_reward: {} \t time: {} \t '.format(i_episode, total_reward, eps_time))
            writer.add_scalar('rewards', total_reward, i_episode)

            if save_weights:
                if i_episode % n_saved == 0:
                    agent.save_weights()
                    print('weights saved')

    except KeyboardInterrupt:
        print('\nTraining has been shut down \n')

    finally:
        finish = time.time()
        timedelta = finish - start
        print('Timelength: {}'.format(str(datetime.timedelta(seconds = timedelta))))

if __name__ == '__main__':
    main()
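
# Usage note (not part of the original file; assumes gym, torch and tensorboard are installed):
#   python ppo_pong_pytorch.py
# Episode rewards are logged through SummaryWriter, so they can be inspected with
#   tensorboard --logdir runs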

412  PPO/pytorch/ppo_pytorch.py  (new file)
@@ -0,0 +1,412 @@
|
||||
import gym
|
||||
from gym.envs.registration import register
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.distributions import Categorical
|
||||
from torch.distributions.kl import kl_divergence
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
from torch.optim import Adam
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
|
||||
import numpy as np
|
||||
import sys
|
||||
import numpy
|
||||
import time
|
||||
import datetime
|
||||
|
||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
||||
dataType = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
|
||||
|
||||
class Utils():
|
||||
def prepro(self, I):
|
||||
I = I[35:195] # crop
|
||||
I = I[::2,::2, 0] # downsample by factor of 2
|
||||
I[I == 144] = 0 # erase background (background type 1)
|
||||
I[I == 109] = 0 # erase background (background type 2)
|
||||
I[I != 0] = 1 # everything else (paddles, ball) just set to 1
|
||||
X = I.astype(np.float32).ravel() # Combine items in 1 array
|
||||
return X
|
||||
|
||||
class Actor_Model(nn.Module):
|
||||
def __init__(self, state_dim, action_dim):
|
||||
super(Actor_Model, self).__init__()
|
||||
|
||||
self.nn_layer = nn.Sequential(
|
||||
nn.Linear(state_dim, 64),
|
||||
nn.ReLU(),
|
||||
nn.Linear(64, 64),
|
||||
nn.ReLU(),
|
||||
nn.Linear(64, action_dim),
|
||||
nn.Softmax(-1)
|
||||
).float().to(device)
|
||||
|
||||
def forward(self, states):
|
||||
return self.nn_layer(states)
|
||||
|
||||
class Critic_Model(nn.Module):
|
||||
def __init__(self, state_dim, action_dim):
|
||||
super(Critic_Model, self).__init__()
|
||||
|
||||
self.nn_layer = nn.Sequential(
|
||||
nn.Linear(state_dim, 64),
|
||||
nn.ReLU(),
|
||||
nn.Linear(64, 64),
|
||||
nn.ReLU(),
|
||||
nn.Linear(64, 1)
|
||||
).float().to(device)
|
||||
|
||||
def forward(self, states):
|
||||
return self.nn_layer(states)
|
||||
|
||||
class Memory(Dataset):
|
||||
def __init__(self):
|
||||
self.actions = []
|
||||
self.states = []
|
||||
self.rewards = []
|
||||
self.dones = []
|
||||
self.next_states = []
|
||||
|
||||
def __len__(self):
|
||||
return len(self.dones)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
return np.array(self.states[idx], dtype = np.float32), np.array(self.actions[idx], dtype = np.float32), np.array([self.rewards[idx]], dtype = np.float32), np.array([self.dones[idx]], dtype = np.float32), np.array(self.next_states[idx], dtype = np.float32)
|
||||
|
||||
def save_eps(self, state, action, reward, done, next_state):
|
||||
self.states.append(state)
|
||||
self.actions.append(action)
|
||||
self.rewards.append(reward)
|
||||
self.dones.append(done)
|
||||
self.next_states.append(next_state)
|
||||
|
||||
def clear_memory(self):
|
||||
del self.actions[:]
|
||||
del self.states[:]
|
||||
del self.rewards[:]
|
||||
del self.dones[:]
|
||||
del self.next_states[:]
|
||||
|
||||
class Discrete():
|
||||
def sample(self, datas):
|
||||
distribution = Categorical(datas)
|
||||
return distribution.sample().float().to(device)
|
||||
|
||||
def entropy(self, datas):
|
||||
distribution = Categorical(datas)
|
||||
return distribution.entropy().float().to(device)
|
||||
|
||||
def logprob(self, datas, value_data):
|
||||
distribution = Categorical(datas)
|
||||
return distribution.log_prob(value_data).unsqueeze(1).float().to(device)
|
||||
|
||||
def kl_divergence(self, datas1, datas2):
|
||||
distribution1 = Categorical(datas1)
|
||||
distribution2 = Categorical(datas2)
|
||||
|
||||
return kl_divergence(distribution1, distribution2).unsqueeze(1).float().to(device)
|
||||
|
||||
class PolicyFunction():
|
||||
def __init__(self, gamma = 0.99, lam = 0.95):
|
||||
self.gamma = gamma
|
||||
self.lam = lam
|
||||
|
||||
def monte_carlo_discounted(self, rewards, dones):
|
||||
running_add = 0
|
||||
returns = []
|
||||
|
||||
for step in reversed(range(len(rewards))):
|
||||
running_add = rewards[step] + (1.0 - dones[step]) * self.gamma * running_add
|
||||
returns.insert(0, running_add)
|
||||
|
||||
return torch.stack(returns)
|
||||
|
||||
def temporal_difference(self, reward, next_value, done):
|
||||
q_values = reward + (1 - done) * self.gamma * next_value
|
||||
return q_values
|
||||
|
||||
def generalized_advantage_estimation(self, values, rewards, next_values, dones):
|
||||
gae = 0
|
||||
adv = []
|
||||
|
||||
delta = rewards + (1.0 - dones) * self.gamma * next_values - values
|
||||
for step in reversed(range(len(rewards))):
|
||||
gae = delta[step] + (1.0 - dones[step]) * self.gamma * self.lam * gae
|
||||
adv.insert(0, gae)
|
||||
|
||||
return torch.stack(adv)
|
||||
|
||||
class TrulyPPO():
|
||||
def __init__(self, policy_kl_range, policy_params, value_clip, vf_loss_coef, entropy_coef, gamma, lam):
|
||||
self.policy_kl_range = policy_kl_range
|
||||
self.policy_params = policy_params
|
||||
self.value_clip = value_clip
|
||||
self.vf_loss_coef = vf_loss_coef
|
||||
self.entropy_coef = entropy_coef
|
||||
|
||||
self.distributions = Discrete()
|
||||
self.policy_function = PolicyFunction(gamma, lam)
|
||||
|
||||
# Loss for PPO
|
||||
def compute_loss(self, action_probs, old_action_probs, values, old_values, next_values, actions, rewards, dones):
|
||||
# Don't use old value in backpropagation
|
||||
Old_values = old_values.detach()
|
||||
Old_action_probs = old_action_probs.detach()
|
||||
|
||||
# Getting general advantages estimator and returns
|
||||
Advantages = self.policy_function.generalized_advantage_estimation(values, rewards, next_values, dones)
|
||||
Returns = (Advantages + values).detach()
|
||||
Advantages = ((Advantages - Advantages.mean()) / (Advantages.std() + 1e-6)).detach()
|
||||
|
||||
# Finding the ratio (pi_theta / pi_theta__old):
|
||||
logprobs = self.distributions.logprob(action_probs, actions)
|
||||
Old_logprobs = self.distributions.logprob(Old_action_probs, actions).detach()
|
||||
|
||||
# Finding Surrogate Loss
|
||||
ratios = (logprobs - Old_logprobs).exp() # ratios = exp(logprobs - old_logprobs) = pi_theta / pi_theta_old
|
||||
Kl = self.distributions.kl_divergence(old_action_probs, action_probs)
|
||||
|
||||
pg_targets = torch.where(
|
||||
(Kl >= self.policy_kl_range) & (ratios > 1),
|
||||
ratios * Advantages - self.policy_params * Kl,
|
||||
ratios * Advantages
|
||||
)
|
||||
pg_loss = pg_targets.mean()
|
||||
|
||||
# Getting Entropy from the action probability
|
||||
dist_entropy = self.distributions.entropy(action_probs).mean()
|
||||
|
||||
# Getting Critic loss by using Clipped critic value
|
||||
if self.value_clip is None:
|
||||
critic_loss = ((Returns - values).pow(2) * 0.5).mean()
|
||||
else:
|
||||
vpredclipped = old_values + torch.clamp(values - Old_values, -self.value_clip, self.value_clip) # Minimize the difference between old value and new value
|
||||
vf_losses1 = (Returns - values).pow(2) * 0.5 # Mean Squared Error
|
||||
vf_losses2 = (Returns - vpredclipped).pow(2) * 0.5 # Mean Squared Error
|
||||
critic_loss = torch.max(vf_losses1, vf_losses2).mean()
|
||||
|
||||
# We need to maximize the policy objective to make the agent keep finding better rewards
|
||||
# and minimize Critic Loss
|
||||
loss = (critic_loss * self.vf_loss_coef) - (dist_entropy * self.entropy_coef) - pg_loss
|
||||
return loss
|
||||
|
||||
class Agent():
|
||||
def __init__(self, state_dim, action_dim, is_training_mode, policy_kl_range, policy_params, value_clip, entropy_coef, vf_loss_coef,
|
||||
batchsize, PPO_epochs, gamma, lam, learning_rate):
|
||||
self.policy_kl_range = policy_kl_range
|
||||
self.policy_params = policy_params
|
||||
self.value_clip = value_clip
|
||||
self.entropy_coef = entropy_coef
|
||||
self.vf_loss_coef = vf_loss_coef
|
||||
self.batchsize = batchsize
|
||||
self.PPO_epochs = PPO_epochs
|
||||
self.is_training_mode = is_training_mode
|
||||
self.action_dim = action_dim
|
||||
|
||||
self.actor = Actor_Model(state_dim, action_dim)
|
||||
self.actor_old = Actor_Model(state_dim, action_dim)
|
||||
self.actor_optimizer = Adam(self.actor.parameters(), lr = learning_rate)
|
||||
|
||||
self.critic = Critic_Model(state_dim, action_dim)
|
||||
self.critic_old = Critic_Model(state_dim, action_dim)
|
||||
self.critic_optimizer = Adam(self.critic.parameters(), lr = learning_rate)
|
||||
|
||||
self.memory = Memory()
|
||||
self.policy_function = PolicyFunction(gamma, lam)
|
||||
|
||||
self.distributions = Discrete()
|
||||
self.policy_loss = TrulyPPO(policy_kl_range, policy_params, value_clip, vf_loss_coef, entropy_coef, gamma, lam)
|
||||
|
||||
if is_training_mode:
|
||||
self.actor.train()
|
||||
self.critic.train()
|
||||
else:
|
||||
self.actor.eval()
|
||||
self.critic.eval()
|
||||
|
||||
def save_eps(self, state, action, reward, done, next_state):
|
||||
self.memory.save_eps(state, action, reward, done, next_state)
|
||||
|
||||
def act(self, state):
|
||||
state = torch.FloatTensor(state).unsqueeze(0).to(device).detach()
|
||||
action_probs = self.actor(state)
|
||||
|
||||
# We don't need to sample the action in test mode;
# we only sample in training mode in order to explore the actions
|
||||
if self.is_training_mode:
|
||||
# Sample the action
|
||||
action = self.distributions.sample(action_probs)
|
||||
else:
|
||||
action = torch.argmax(action_probs, 1)
|
||||
|
||||
return action.int().cpu().item()
|
||||
|
||||
# Get loss and Do backpropagation
|
||||
def training_ppo(self, states, actions, rewards, dones, next_states):
|
||||
action_probs, values = self.actor(states), self.critic(states)
|
||||
old_action_probs, old_values = self.actor_old(states), self.critic_old(states)
|
||||
next_values = self.critic(next_states)
|
||||
|
||||
loss = self.policy_loss.compute_loss(action_probs, old_action_probs, values, old_values, next_values, actions, rewards, dones)
|
||||
|
||||
self.actor_optimizer.zero_grad()
|
||||
self.critic_optimizer.zero_grad()
|
||||
|
||||
loss.backward()
|
||||
|
||||
self.actor_optimizer.step()
|
||||
self.critic_optimizer.step()
|
||||
|
||||
# Update the model
|
||||
def update_ppo(self):
|
||||
dataloader = DataLoader(self.memory, self.batchsize, shuffle = False)
|
||||
|
||||
# Optimize policy for K epochs:
|
||||
for _ in range(self.PPO_epochs):
|
||||
for states, actions, rewards, dones, next_states in dataloader:
|
||||
self.training_ppo(states.float().to(device), actions.float().to(device), rewards.float().to(device), dones.float().to(device), next_states.float().to(device))
|
||||
|
||||
# Clear the memory
|
||||
self.memory.clear_memory()
|
||||
|
||||
# Copy new weights into old policy:
|
||||
self.actor_old.load_state_dict(self.actor.state_dict())
|
||||
self.critic_old.load_state_dict(self.critic.state_dict())
|
||||
|
||||
def save_weights(self):
|
||||
torch.save({
|
||||
'model_state_dict': self.actor.state_dict(),
|
||||
'optimizer_state_dict': self.actor_optimizer.state_dict()
|
||||
}, 'SlimeVolley/actor.tar')
|
||||
|
||||
torch.save({
|
||||
'model_state_dict': self.critic.state_dict(),
|
||||
'optimizer_state_dict': self.critic_optimizer.state_dict()
|
||||
}, 'SlimeVolley/critic.tar')
|
||||
|
||||
def load_weights(self):
|
||||
actor_checkpoint = torch.load('SlimeVolley/actor.tar')
|
||||
self.actor.load_state_dict(actor_checkpoint['model_state_dict'])
|
||||
self.actor_optimizer.load_state_dict(actor_checkpoint['optimizer_state_dict'])
|
||||
|
||||
critic_checkpoint = torch.load('SlimeVolley/critic.tar')
|
||||
self.critic.load_state_dict(critic_checkpoint['model_state_dict'])
|
||||
self.critic_optimizer.load_state_dict(critic_checkpoint['optimizer_state_dict'])
|
||||
|
||||
class Runner():
|
||||
def __init__(self, env, agent, render, training_mode, n_update):
|
||||
self.env = env
|
||||
self.agent = agent
|
||||
self.render = render
|
||||
self.training_mode = training_mode
|
||||
|
||||
self.n_update = n_update
|
||||
self.t_updates = 0
|
||||
|
||||
def run_episode(self):
|
||||
############################################
|
||||
state = self.env.reset()
|
||||
done = False
|
||||
total_reward = 0
|
||||
eps_time = 0
|
||||
############################################
|
||||
for _ in range(10000):
|
||||
action = self.agent.act(state)
|
||||
next_state, reward, done, _ = self.env.step(action)
|
||||
|
||||
eps_time += 1
|
||||
self.t_updates += 1
|
||||
total_reward += reward
|
||||
|
||||
if self.training_mode:
|
||||
self.agent.save_eps(state.tolist(), action, reward, float(done), next_state.tolist())
|
||||
|
||||
state = next_state
|
||||
|
||||
if self.render:
|
||||
self.env.render()
|
||||
|
||||
if self.training_mode and self.n_update is not None and self.t_updates == self.n_update:
|
||||
self.agent.update_ppo()
|
||||
self.t_updates = 0
|
||||
|
||||
if done:
|
||||
break
|
||||
|
||||
if self.training_mode and self.n_update is None:
|
||||
self.agent.update_ppo()
|
||||
|
||||
return total_reward, eps_time
|
||||
|
||||
def main():
|
||||
############## Hyperparameters ##############
|
||||
load_weights = False # If you want to load the agent, set this to True
|
||||
save_weights = False # If you want to save the agent, set this to True
|
||||
training_mode = True # If you want to train the agent, set this to True. But set this otherwise if you only want to test it
|
||||
reward_threshold = 495 # Reward threshold: learning stops once the reward passes it. Set to None to turn this off
|
||||
using_google_drive = False
|
||||
|
||||
render = False # If you want to display the image, set this to True. Turn this off if you run this in Google Collab
|
||||
n_update = 128 # How many steps to collect before updating the Policy. Recommended: 128 for Discrete
|
||||
n_plot_batch = 100000000 # How many episode you want to plot the result
|
||||
n_episode = 100000 # How many episode you want to run
|
||||
n_saved = 10 # How many episode to run before saving the weights
|
||||
|
||||
policy_kl_range = 0.0008 # Recommended set to 0.0008 for Discrete
|
||||
policy_params = 20 # Recommended set to 20 for Discrete
|
||||
value_clip = 1.0 # How many value will be clipped. Recommended set to the highest or lowest possible reward
|
||||
entropy_coef = 0.05 # How much randomness of action you will get
|
||||
vf_loss_coef = 1.0 # Just set to 1
|
||||
batchsize = 32 # How many batch per update. size of batch = n_update / minibatch. Recommended set to 4 for Discrete
|
||||
PPO_epochs = 4 # How many epoch per update. Recommended set to 10 for Discrete
|
||||
|
||||
gamma = 0.99 # Just set to 0.99
|
||||
lam = 0.95 # Just set to 0.95
|
||||
learning_rate = 2.5e-4 # Just set to 2.5e-4
|
||||
#############################################
|
||||
writer = SummaryWriter()
|
||||
|
||||
env_name = 'CartPole-v1' # Set the env you want
|
||||
env = gym.make(env_name)
|
||||
|
||||
state_dim = env.observation_space.shape[0]
|
||||
action_dim = env.action_space.n
|
||||
|
||||
agent = Agent(state_dim, action_dim, training_mode, policy_kl_range, policy_params, value_clip, entropy_coef, vf_loss_coef,
|
||||
batchsize, PPO_epochs, gamma, lam, learning_rate)
|
||||
|
||||
runner = Runner(env, agent, render, training_mode, n_update)
|
||||
#############################################
|
||||
if using_google_drive:
|
||||
from google.colab import drive
|
||||
drive.mount('/test')
|
||||
|
||||
if load_weights:
|
||||
agent.load_weights()
|
||||
print('Weight Loaded')
|
||||
|
||||
print('Run the training!!')
|
||||
start = time.time()
|
||||
|
||||
try:
|
||||
for i_episode in range(1, n_episode + 1):
|
||||
total_reward, eps_time = runner.run_episode()
|
||||
|
||||
print('Episode: {} \t t_reward: {} \t time: {} \t '.format(i_episode, total_reward, eps_time))
|
||||
writer.add_scalar('rewards', total_reward, i_episode)
|
||||
|
||||
if save_weights:
|
||||
if i_episode % n_saved == 0:
|
||||
agent.save_weights()
|
||||
print('weights saved')
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print('\nTraining has been Shutdown \n')
|
||||
|
||||
finally:
|
||||
finish = time.time()
|
||||
timedelta = finish - start
|
||||
print('Timelength: {}'.format(str( datetime.timedelta(seconds = timedelta) )))
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()

424  PPO/tensorflow 2/ppo_pong_tensorflow.py  (new file)
@@ -0,0 +1,424 @@
|
||||
import gym
|
||||
from gym.envs.registration import register
|
||||
|
||||
import tensorflow as tf
|
||||
import tensorflow_probability as tfp
|
||||
from tensorflow.keras.layers import Dense
|
||||
from tensorflow.keras import Model
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import sys
|
||||
import numpy
|
||||
|
||||
class Utils():
|
||||
def prepro(self, I):
|
||||
I = I[35:195] # crop
|
||||
I = I[::2,::2, 0] # downsample by factor of 2
|
||||
I[I == 144] = 0 # erase background (background type 1)
|
||||
I[I == 109] = 0 # erase background (background type 2)
|
||||
I[I != 0] = 1 # everything else (paddles, ball) just set to 1
|
||||
X = I.astype(np.float32).ravel() # Combine items in 1 array
|
||||
return X
|
||||
|
||||
class Actor_Model(Model):
|
||||
def __init__(self, state_dim, action_dim):
|
||||
super(Actor_Model, self).__init__()
|
||||
self.d1 = Dense(640, activation='relu')
|
||||
self.d2 = Dense(640, activation='relu')
|
||||
self.dout = Dense(action_dim, activation='softmax')
|
||||
|
||||
def call(self, x):
|
||||
x = self.d1(x)
|
||||
x = self.d2(x)
|
||||
return self.dout(x)
|
||||
|
||||
class Critic_Model(Model):
|
||||
def __init__(self, state_dim, action_dim):
|
||||
super(Critic_Model, self).__init__()
|
||||
self.d1 = Dense(640, activation='relu')
|
||||
self.d2 = Dense(640, activation='relu')
|
||||
self.dout = Dense(1, activation='linear')
|
||||
|
||||
def call(self, x):
|
||||
x = self.d1(x)
|
||||
x = self.d2(x)
|
||||
return self.dout(x)
|
||||
|
||||
class Memory():
|
||||
def __init__(self):
|
||||
self.actions = []
|
||||
self.states = []
|
||||
self.rewards = []
|
||||
self.dones = []
|
||||
self.next_states = []
|
||||
|
||||
def __len__(self):
|
||||
return len(self.dones)
|
||||
|
||||
def get_all_items(self):
|
||||
states = tf.constant(self.states, dtype = tf.float32)
|
||||
actions = tf.constant(self.actions, dtype = tf.float32)
|
||||
rewards = tf.expand_dims(tf.constant(self.rewards, dtype = tf.float32), 1)
|
||||
dones = tf.expand_dims(tf.constant(self.dones, dtype = tf.float32), 1)
|
||||
next_states = tf.constant(self.next_states, dtype = tf.float32)
|
||||
|
||||
return tf.data.Dataset.from_tensor_slices((states, actions, rewards, dones, next_states))
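
# Usage sketch (illustration only, not part of the original file): get_all_items() turns the
# stored Python lists into a tf.data.Dataset, which update_ppo() below batches, e.g.
#   memory = Memory()
#   memory.save_eps(state = [0.0] * 4, action = 1.0, reward = 1.0, done = 0.0, next_state = [0.1] * 4)
#   for states, actions, rewards, dones, next_states in memory.get_all_items().batch(1):
#       pass  # each element is a tf.Tensor with a leading batch dimension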
|
||||
|
||||
def save_eps(self, state, action, reward, done, next_state):
|
||||
self.rewards.append(reward)
|
||||
self.states.append(state)
|
||||
self.actions.append(action)
|
||||
self.dones.append(done)
|
||||
self.next_states.append(next_state)
|
||||
|
||||
def clear_memory(self):
|
||||
del self.actions[:]
|
||||
del self.states[:]
|
||||
del self.rewards[:]
|
||||
del self.dones[:]
|
||||
del self.next_states[:]
|
||||
|
||||
class Distributions():
|
||||
def sample(self, datas):
|
||||
distribution = tfp.distributions.Categorical(probs = datas)
|
||||
return distribution.sample()
|
||||
|
||||
def entropy(self, datas):
|
||||
distribution = tfp.distributions.Categorical(probs = datas)
|
||||
return distribution.entropy()
|
||||
|
||||
def logprob(self, datas, value_data):
|
||||
distribution = tfp.distributions.Categorical(probs = datas)
|
||||
return tf.expand_dims(distribution.log_prob(value_data), 1)
|
||||
|
||||
def kl_divergence(self, datas1, datas2):
|
||||
distribution1 = tfp.distributions.Categorical(probs = datas1)
|
||||
distribution2 = tfp.distributions.Categorical(probs = datas2)
|
||||
|
||||
return tf.expand_dims(tfp.distributions.kl_divergence(distribution1, distribution2), 1)
|
||||
|
||||
class PolicyFunction():
|
||||
def __init__(self, gamma = 0.99, lam = 0.95):
|
||||
self.gamma = gamma
|
||||
self.lam = lam
|
||||
|
||||
def monte_carlo_discounted(self, rewards, dones):
|
||||
running_add = 0
|
||||
returns = []
|
||||
|
||||
for step in reversed(range(len(rewards))):
|
||||
running_add = rewards[step] + (1.0 - dones[step]) * self.gamma * running_add
|
||||
returns.insert(0, running_add)
|
||||
|
||||
return tf.stack(returns)
|
||||
|
||||
def temporal_difference(self, reward, next_value, done):
|
||||
q_values = reward + (1 - done) * self.gamma * next_value
|
||||
return q_values
|
||||
|
||||
def generalized_advantage_estimation(self, values, rewards, next_values, dones):
|
||||
gae = 0
|
||||
adv = []
|
||||
|
||||
delta = rewards + (1.0 - dones) * self.gamma * next_values - values
|
||||
for step in reversed(range(len(rewards))):
|
||||
gae = delta[step] + (1.0 - dones[step]) * self.gamma * self.lam * gae
|
||||
adv.insert(0, gae)
|
||||
|
||||
return tf.stack(adv)
|
||||
|
||||
class Agent():
|
||||
def __init__(self, state_dim, action_dim, is_training_mode, policy_kl_range, policy_params, value_clip, entropy_coef, vf_loss_coef,
|
||||
minibatch, PPO_epochs, gamma, lam, learning_rate):
|
||||
self.policy_kl_range = policy_kl_range
|
||||
self.policy_params = policy_params
|
||||
self.value_clip = value_clip
|
||||
self.entropy_coef = entropy_coef
|
||||
self.vf_loss_coef = vf_loss_coef
|
||||
self.minibatch = minibatch
|
||||
self.PPO_epochs = PPO_epochs
|
||||
self.is_training_mode = is_training_mode
|
||||
self.action_dim = action_dim
|
||||
|
||||
self.actor = Actor_Model(state_dim, action_dim)
|
||||
self.actor_old = Actor_Model(state_dim, action_dim)
|
||||
|
||||
self.critic = Critic_Model(state_dim, action_dim)
|
||||
self.critic_old = Critic_Model(state_dim, action_dim)
|
||||
|
||||
self.optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate)
|
||||
self.memory = Memory()
|
||||
self.policy_function = PolicyFunction(gamma, lam)
|
||||
self.distributions = Distributions()
|
||||
|
||||
def save_eps(self, state, action, reward, done, next_state):
|
||||
self.memory.save_eps(state, action, reward, done, next_state)
|
||||
|
||||
# Loss for PPO
|
||||
def get_loss(self, action_probs, values, old_action_probs, old_values, next_values, actions, rewards, dones):
|
||||
# Don't use old value in backpropagation
|
||||
Old_values = tf.stop_gradient(old_values)
|
||||
|
||||
# Getting general advantages estimator
|
||||
Advantages = self.policy_function.generalized_advantage_estimation(values, rewards, next_values, dones)
|
||||
Returns = tf.stop_gradient(Advantages + values)
|
||||
Advantages = tf.stop_gradient((Advantages - tf.math.reduce_mean(Advantages)) / (tf.math.reduce_std(Advantages) + 1e-6))
|
||||
|
||||
# Finding the ratio (pi_theta / pi_theta__old):
|
||||
logprobs = self.distributions.logprob(action_probs, actions)
|
||||
Old_logprobs = tf.stop_gradient(self.distributions.logprob(old_action_probs, actions))
|
||||
ratios = tf.math.exp(logprobs - Old_logprobs) # ratios = exp(logprobs - old_logprobs) = pi_theta / pi_theta_old
|
||||
|
||||
# Finding KL Divergence
|
||||
Kl = self.distributions.kl_divergence(old_action_probs, action_probs)
|
||||
|
||||
# Combining TR-PPO with Rollback (Truly PPO)
|
||||
pg_loss = tf.where(
|
||||
tf.logical_and(Kl >= self.policy_kl_range, ratios > 1),
|
||||
ratios * Advantages - self.policy_params * Kl,
|
||||
ratios * Advantages
|
||||
)
|
||||
pg_loss = tf.math.reduce_mean(pg_loss)
|
||||
|
||||
# Getting entropy from the action probability
|
||||
dist_entropy = tf.math.reduce_mean(self.distributions.entropy(action_probs))
|
||||
|
||||
# Getting critic loss by using Clipped critic value
|
||||
vpredclipped = old_values + tf.clip_by_value(values - Old_values, -self.value_clip, self.value_clip) # Minimize the difference between old value and new value
|
||||
vf_losses1 = tf.math.square(Returns - values) * 0.5 # Mean Squared Error
|
||||
vf_losses2 = tf.math.square(Returns - vpredclipped) * 0.5 # Mean Squared Error
|
||||
critic_loss = tf.math.reduce_mean(tf.math.maximum(vf_losses1, vf_losses2))
|
||||
|
||||
# We need to maximize the policy objective to make the agent keep finding better rewards
|
||||
# and minimize Critic Loss
|
||||
loss = (critic_loss * self.vf_loss_coef) - (dist_entropy * self.entropy_coef) - pg_loss
|
||||
return loss
|
||||
|
||||
@tf.function
|
||||
def act(self, state):
|
||||
state = tf.expand_dims(tf.cast(state, dtype = tf.float32), 0)
|
||||
action_probs = self.actor(state)
|
||||
|
||||
# We don't need to sample the action in test mode;
# we only sample in training mode in order to explore the actions
|
||||
if self.is_training_mode:
|
||||
# Sample the action
|
||||
action = self.distributions.sample(action_probs)
|
||||
else:
|
||||
action = tf.math.argmax(action_probs, 1)
|
||||
|
||||
return action
|
||||
|
||||
# Get loss and Do backpropagation
|
||||
@tf.function
|
||||
def training_ppo(self, states, actions, rewards, dones, next_states):
|
||||
with tf.GradientTape() as tape:
|
||||
action_probs, values = self.actor(states), self.critic(states)
|
||||
old_action_probs, old_values = self.actor_old(states), self.critic_old(states)
|
||||
next_values = self.critic(next_states)
|
||||
|
||||
loss = self.get_loss(action_probs, values, old_action_probs, old_values, next_values, actions, rewards, dones)
|
||||
|
||||
gradients = tape.gradient(loss, self.actor.trainable_variables + self.critic.trainable_variables)
|
||||
self.optimizer.apply_gradients(zip(gradients, self.actor.trainable_variables + self.critic.trainable_variables))
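# Note (explanatory, not part of the original file): unlike the PyTorch scripts above, this
# TensorFlow agent trains the actor and the critic jointly -- a single GradientTape records
# the combined loss and one Adam optimizer applies the gradients to
# actor.trainable_variables + critic.trainable_variables in a single step.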
|
||||
|
||||
# Update the model
|
||||
def update_ppo(self):
|
||||
batch_size = int(len(self.memory) / self.minibatch)
|
||||
|
||||
# Optimize policy for K epochs:
|
||||
for _ in range(self.PPO_epochs):
|
||||
for states, actions, rewards, dones, next_states in self.memory.get_all_items().batch(batch_size):
|
||||
self.training_ppo(states, actions, rewards, dones, next_states)
|
||||
|
||||
# Clear the memory
|
||||
self.memory.clear_memory()
|
||||
|
||||
# Copy new weights into old policy:
|
||||
self.actor_old.set_weights(self.actor.get_weights())
|
||||
self.critic_old.set_weights(self.critic.get_weights())
|
||||
|
||||
def save_weights(self):
|
||||
self.actor.save_weights('bipedalwalker_w/actor_ppo', save_format='tf')
|
||||
self.actor_old.save_weights('bipedalwalker_w/actor_old_ppo', save_format='tf')
|
||||
self.critic.save_weights('bipedalwalker_w/critic_ppo', save_format='tf')
|
||||
self.critic_old.save_weights('bipedalwalker_w/critic_old_ppo', save_format='tf')
|
||||
|
||||
def load_weights(self):
|
||||
self.actor.load_weights('bipedalwalker_w/actor_ppo')
|
||||
self.actor_old.load_weights('bipedalwalker_w/actor_old_ppo')
|
||||
self.critic.load_weights('bipedalwalker_w/critic_ppo')
|
||||
self.critic_old.load_weights('bipedalwalker_w/critic_old_ppo')
|
||||
|
||||
def plot(datas):
|
||||
print('----------')
|
||||
|
||||
plt.plot(datas)
|
||||
plt.plot()
|
||||
plt.xlabel('Episode')
|
||||
plt.ylabel('Datas')
|
||||
plt.show()
|
||||
|
||||
print('Max :', np.max(datas))
|
||||
print('Min :', np.min(datas))
|
||||
print('Avg :', np.mean(datas))
|
||||
|
||||
def run_episode(env, agent, state_dim, render, training_mode, t_updates, n_update):
|
||||
utils = Utils()
|
||||
############################################
|
||||
obs = env.reset()
|
||||
obs = utils.prepro(obs)
|
||||
state = obs
|
||||
|
||||
done = False
|
||||
total_reward = 0
|
||||
eps_time = 0
|
||||
############################################
|
||||
|
||||
while not done:
|
||||
action = int(agent.act(state))
|
||||
action_gym = action + 1 if action != 0 else 0
|
||||
|
||||
next_obs, reward, done, _ = env.step(action_gym)
|
||||
next_obs = utils.prepro(next_obs)
|
||||
next_state = next_obs - obs
|
||||
|
||||
eps_time += 1
|
||||
t_updates += 1
|
||||
total_reward += reward
|
||||
|
||||
if training_mode:
|
||||
agent.save_eps(state.tolist(), float(action), float(reward), float(done), next_state.tolist())
|
||||
|
||||
state = next_state
|
||||
obs = next_obs
|
||||
|
||||
if render:
|
||||
env.render()
|
||||
|
||||
if training_mode:
|
||||
if t_updates % n_update == 0:
|
||||
agent.update_ppo()
|
||||
t_updates = 0
|
||||
|
||||
if done:
|
||||
return total_reward, eps_time, t_updates
|
||||
|
||||
def main():
|
||||
############## Hyperparameters ##############
|
||||
load_weights = False # If you want to load the agent, set this to True
|
||||
save_weights = False # If you want to save the agent, set this to True
|
||||
training_mode = True # If you want to train the agent, set this to True. But set this otherwise if you only want to test it
|
||||
reward_threshold = 300 # Reward threshold: learning stops once the reward passes it. Set to None to turn this off
|
||||
using_google_drive = False
|
||||
|
||||
render = False # Set this to True to display the screen. Turn it off if you run this in Google Colab
n_update = 128 # How many steps to collect before updating the Policy. Recommended: 128 for Discrete
|
||||
n_plot_batch = 100000000 # How many episode you want to plot the result
|
||||
n_episode = 100000 # How many episode you want to run
|
||||
n_saved = 10 # How many episode to run before saving the weights
|
||||
|
||||
policy_kl_range = 0.0008 # Set to 0.0008 for Discrete
|
||||
policy_params = 20 # Set to 20 for Discrete
|
||||
value_clip = 1.0 # How many value will be clipped. Recommended set to the highest or lowest possible reward
|
||||
entropy_coef = 0.05 # How much randomness of action you will get
|
||||
vf_loss_coef = 1.0 # Just set to 1
|
||||
minibatch = 4 # How many minibatches per update: size of each batch = collected steps / minibatch. Recommended: 4 for Discrete
|
||||
PPO_epochs = 4 # How many epoch per update
|
||||
|
||||
gamma = 0.99 # Just set to 0.99
|
||||
lam = 0.95 # Just set to 0.95
|
||||
learning_rate = 2.5e-4 # Just set to 2.5e-4
|
||||
#############################################
|
||||
env_name = 'PongDeterministic-v4' # Set the env you want
|
||||
env = gym.make(env_name)
|
||||
|
||||
state_dim = 80 * 80
|
||||
action_dim = 3
|
||||
|
||||
print(action_dim)
|
||||
|
||||
agent = Agent(state_dim, action_dim, training_mode, policy_kl_range, policy_params, value_clip, entropy_coef, vf_loss_coef,
|
||||
minibatch, PPO_epochs, gamma, lam, learning_rate)
|
||||
#############################################
|
||||
if using_google_drive:
|
||||
from google.colab import drive
|
||||
drive.mount('/test')
|
||||
|
||||
if load_weights:
|
||||
agent.load_weights()
|
||||
print('Weight Loaded')
|
||||
|
||||
rewards = []
|
||||
batch_rewards = []
|
||||
batch_solved_reward = []
|
||||
|
||||
times = []
|
||||
batch_times = []
|
||||
|
||||
t_updates = 0
|
||||
|
||||
for i_episode in range(1, n_episode + 1):
|
||||
total_reward, time, t_updates = run_episode(env, agent, state_dim, render, training_mode, t_updates, n_update)
|
||||
print('Episode {} \t t_reward: {} \t time: {} \t '.format(i_episode, total_reward, time))
|
||||
batch_rewards.append(int(total_reward))
|
||||
batch_times.append(time)
|
||||
|
||||
if save_weights:
|
||||
if i_episode % n_saved == 0:
|
||||
agent.save_weights()
|
||||
print('weights saved')
|
||||
|
||||
if reward_threshold:
|
||||
if len(batch_solved_reward) == 100:
|
||||
if np.mean(batch_solved_reward) >= reward_threshold :
|
||||
for reward in batch_rewards:
|
||||
rewards.append(reward)
|
||||
|
||||
for time in batch_times:
|
||||
times.append(time)
|
||||
|
||||
print('You solved task after {} episode'.format(len(rewards)))
|
||||
break
|
||||
|
||||
else:
|
||||
del batch_solved_reward[0]
|
||||
batch_solved_reward.append(total_reward)
|
||||
|
||||
else:
|
||||
batch_solved_reward.append(total_reward)
|
||||
|
||||
if i_episode % n_plot_batch == 0 and i_episode != 0:
|
||||
# Plot the reward, times for every n_plot_batch
|
||||
plot(batch_rewards)
|
||||
plot(batch_times)
|
||||
|
||||
for reward in batch_rewards:
|
||||
rewards.append(reward)
|
||||
|
||||
for time in batch_times:
|
||||
times.append(time)
|
||||
|
||||
batch_rewards = []
|
||||
batch_times = []
|
||||
|
||||
print('========== Cummulative ==========')
|
||||
# Plot the reward, times for every episode
|
||||
plot(rewards)
|
||||
plot(times)
|
||||
|
||||
print('========== Final ==========')
|
||||
# Plot the reward, times for every episode
|
||||
|
||||
for reward in batch_rewards:
|
||||
rewards.append(reward)
|
||||
|
||||
for time in batch_times:
|
||||
times.append(time)
|
||||
|
||||
plot(rewards)
|
||||
plot(times)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()

409  PPO/tensorflow 2/ppo_tensorflow.py  (new file)
@@ -0,0 +1,409 @@
|
||||
import gym
|
||||
from gym.envs.registration import register
|
||||
|
||||
import tensorflow as tf
|
||||
import tensorflow_probability as tfp
|
||||
from tensorflow.keras.layers import Dense
|
||||
from tensorflow.keras import Model
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import sys
|
||||
import numpy
|
||||
|
||||
class Utils():
|
||||
def prepro(self, I):
|
||||
I = I[35:195] # crop
|
||||
I = I[::2,::2, 0] # downsample by factor of 2
|
||||
I[I == 144] = 0 # erase background (background type 1)
|
||||
I[I == 109] = 0 # erase background (background type 2)
|
||||
I[I != 0] = 1 # everything else (paddles, ball) just set to 1
|
||||
X = I.astype(np.float32).ravel() # Combine items in 1 array
|
||||
return X
|
||||
|
||||
class Actor_Model(Model):
|
||||
def __init__(self, state_dim, action_dim):
|
||||
super(Actor_Model, self).__init__()
|
||||
self.d1 = Dense(32, activation='relu')
|
||||
self.d2 = Dense(32, activation='relu')
|
||||
self.dout = Dense(action_dim, activation='softmax')
|
||||
|
||||
def call(self, x):
|
||||
x = self.d1(x)
|
||||
x = self.d2(x)
|
||||
return self.dout(x)
|
||||
|
||||
class Critic_Model(Model):
|
||||
def __init__(self, state_dim, action_dim):
|
||||
super(Critic_Model, self).__init__()
|
||||
self.d1 = Dense(32, activation='relu')
|
||||
self.d2 = Dense(32, activation='relu')
|
||||
self.dout = Dense(1, activation='linear')
|
||||
|
||||
def call(self, x):
|
||||
x = self.d1(x)
|
||||
x = self.d2(x)
|
||||
return self.dout(x)
|
||||
|
||||
class Memory():
|
||||
def __init__(self):
|
||||
self.actions = []
|
||||
self.states = []
|
||||
self.rewards = []
|
||||
self.dones = []
|
||||
self.next_states = []
|
||||
|
||||
def __len__(self):
|
||||
return len(self.dones)
|
||||
|
||||
def get_all_items(self):
|
||||
states = tf.constant(self.states, dtype = tf.float32)
|
||||
actions = tf.constant(self.actions, dtype = tf.float32)
|
||||
rewards = tf.expand_dims(tf.constant(self.rewards, dtype = tf.float32), 1)
|
||||
dones = tf.expand_dims(tf.constant(self.dones, dtype = tf.float32), 1)
|
||||
next_states = tf.constant(self.next_states, dtype = tf.float32)
|
||||
|
||||
return tf.data.Dataset.from_tensor_slices((states, actions, rewards, dones, next_states))
|
||||
|
||||
def save_eps(self, state, action, reward, done, next_state):
|
||||
self.rewards.append(reward)
|
||||
self.states.append(state)
|
||||
self.actions.append(action)
|
||||
self.dones.append(done)
|
||||
self.next_states.append(next_state)
|
||||
|
||||
def clear_memory(self):
|
||||
del self.actions[:]
|
||||
del self.states[:]
|
||||
del self.rewards[:]
|
||||
del self.dones[:]
|
||||
del self.next_states[:]
|
||||
|
||||
class Distributions():
|
||||
def sample(self, datas):
|
||||
distribution = tfp.distributions.Categorical(probs = datas)
|
||||
return distribution.sample()
|
||||
|
||||
def entropy(self, datas):
|
||||
distribution = tfp.distributions.Categorical(probs = datas)
|
||||
return distribution.entropy()
|
||||
|
||||
def logprob(self, datas, value_data):
|
||||
distribution = tfp.distributions.Categorical(probs = datas)
|
||||
return tf.expand_dims(distribution.log_prob(value_data), 1)
|
||||
|
||||
def kl_divergence(self, datas1, datas2):
|
||||
distribution1 = tfp.distributions.Categorical(probs = datas1)
|
||||
distribution2 = tfp.distributions.Categorical(probs = datas2)
|
||||
|
||||
return tf.expand_dims(tfp.distributions.kl_divergence(distribution1, distribution2), 1)
|
||||
|
||||
class PolicyFunction():
|
||||
def __init__(self, gamma = 0.99, lam = 0.95):
|
||||
self.gamma = gamma
|
||||
self.lam = lam
|
||||
|
||||
def monte_carlo_discounted(self, rewards, dones):
|
||||
running_add = 0
|
||||
returns = []
|
||||
|
||||
for step in reversed(range(len(rewards))):
|
||||
running_add = rewards[step] + (1.0 - dones[step]) * self.gamma * running_add
|
||||
returns.insert(0, running_add)
|
||||
|
||||
return tf.stack(returns)
|
||||
|
||||
def temporal_difference(self, reward, next_value, done):
|
||||
q_values = reward + (1 - done) * self.gamma * next_value
|
||||
return q_values
|
||||
|
||||
def generalized_advantage_estimation(self, values, rewards, next_values, dones):
|
||||
gae = 0
|
||||
adv = []
|
||||
|
||||
delta = rewards + (1.0 - dones) * self.gamma * next_values - values
|
||||
for step in reversed(range(len(rewards))):
|
||||
gae = delta[step] + (1.0 - dones[step]) * self.gamma * self.lam * gae
|
||||
adv.insert(0, gae)
|
||||
|
||||
return tf.stack(adv)
|
||||
|
||||
class Agent():
|
||||
def __init__(self, state_dim, action_dim, is_training_mode, policy_kl_range, policy_params, value_clip, entropy_coef, vf_loss_coef,
|
||||
minibatch, PPO_epochs, gamma, lam, learning_rate):
|
||||
self.policy_kl_range = policy_kl_range
|
||||
self.policy_params = policy_params
|
||||
self.value_clip = value_clip
|
||||
self.entropy_coef = entropy_coef
|
||||
self.vf_loss_coef = vf_loss_coef
|
||||
self.minibatch = minibatch
|
||||
self.PPO_epochs = PPO_epochs
|
||||
self.is_training_mode = is_training_mode
|
||||
self.action_dim = action_dim
|
||||
|
||||
self.actor = Actor_Model(state_dim, action_dim)
|
||||
self.actor_old = Actor_Model(state_dim, action_dim)
|
||||
|
||||
self.critic = Critic_Model(state_dim, action_dim)
|
||||
self.critic_old = Critic_Model(state_dim, action_dim)
|
||||
|
||||
self.optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate)
|
||||
self.memory = Memory()
|
||||
self.policy_function = PolicyFunction(gamma, lam)
|
||||
self.distributions = Distributions()
|
||||
|
||||
def save_eps(self, state, action, reward, done, next_state):
|
||||
self.memory.save_eps(state, action, reward, done, next_state)
|
||||
|
||||
# Loss for PPO
|
||||
def get_loss(self, action_probs, values, old_action_probs, old_values, next_values, actions, rewards, dones):
|
||||
# Don't use old value in backpropagation
|
||||
Old_values = tf.stop_gradient(old_values)
|
||||
|
||||
# Getting general advantages estimator
|
||||
Advantages = self.policy_function.generalized_advantage_estimation(values, rewards, next_values, dones)
|
||||
Returns = tf.stop_gradient(Advantages + values)
|
||||
Advantages = tf.stop_gradient((Advantages - tf.math.reduce_mean(Advantages)) / (tf.math.reduce_std(Advantages) + 1e-6))
|
||||
|
||||
# Finding the ratio (pi_theta / pi_theta__old):
|
||||
logprobs = self.distributions.logprob(action_probs, actions)
|
||||
Old_logprobs = tf.stop_gradient(self.distributions.logprob(old_action_probs, actions))
|
||||
ratios = tf.math.exp(logprobs - Old_logprobs) # ratios = exp(logprobs - old_logprobs) = pi_theta / pi_theta_old
|
||||
|
||||
# Finding KL Divergence
|
||||
Kl = self.distributions.kl_divergence(old_action_probs, action_probs)
|
||||
|
||||
# Combining TR-PPO with Rollback (Truly PPO)
|
||||
pg_loss = tf.where(
|
||||
tf.logical_and(Kl >= self.policy_kl_range, ratios > 1),
|
||||
ratios * Advantages - self.policy_params * Kl,
|
||||
ratios * Advantages
|
||||
)
|
||||
pg_loss = tf.math.reduce_mean(pg_loss)
|
||||
|
||||
# Getting entropy from the action probability
|
||||
dist_entropy = tf.math.reduce_mean(self.distributions.entropy(action_probs))
|
||||
|
||||
# Getting critic loss by using Clipped critic value
|
||||
vpredclipped = old_values + tf.clip_by_value(values - Old_values, -self.value_clip, self.value_clip) # Minimize the difference between old value and new value
|
||||
vf_losses1 = tf.math.square(Returns - values) * 0.5 # Mean Squared Error
|
||||
vf_losses2 = tf.math.square(Returns - vpredclipped) * 0.5 # Mean Squared Error
|
||||
critic_loss = tf.math.reduce_mean(tf.math.maximum(vf_losses1, vf_losses2))
|
||||
|
||||
# We need to maximize the policy objective to make the agent keep finding better rewards
|
||||
# and minimize Critic Loss
|
||||
loss = (critic_loss * self.vf_loss_coef) - (dist_entropy * self.entropy_coef) - pg_loss
|
||||
return loss
|
||||
|
||||
@tf.function
|
||||
def act(self, state):
|
||||
state = tf.expand_dims(tf.cast(state, dtype = tf.float32), 0)
|
||||
action_probs = self.actor(state)
|
||||
|
||||
# We don't need to sample the action in Test Mode
|
||||
# we only sample the action in Training Mode in order to explore the action space
|
||||
if self.is_training_mode:
|
||||
# Sample the action
|
||||
action = self.distributions.sample(action_probs)
|
||||
else:
|
||||
action = tf.math.argmax(action_probs, 1)
|
||||
|
||||
return action
|
||||
|
||||
# Get loss and Do backpropagation
|
||||
@tf.function
|
||||
def training_ppo(self, states, actions, rewards, dones, next_states):
|
||||
with tf.GradientTape() as tape:
|
||||
action_probs, values = self.actor(states), self.critic(states)
|
||||
old_action_probs, old_values = self.actor_old(states), self.critic_old(states)
|
||||
next_values = self.critic(next_states)
|
||||
|
||||
loss = self.get_loss(action_probs, values, old_action_probs, old_values, next_values, actions, rewards, dones)
|
||||
|
||||
gradients = tape.gradient(loss, self.actor.trainable_variables + self.critic.trainable_variables)
|
||||
self.optimizer.apply_gradients(zip(gradients, self.actor.trainable_variables + self.critic.trainable_variables))
|
||||
|
||||
# Update the model
|
||||
def update_ppo(self):
|
||||
batch_size = int(len(self.memory) / self.minibatch)
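# The stored rollout is split into `minibatch` batches per epoch; with the defaults
# in main() (n_update = 32, minibatch = 2) each gradient step uses a batch of 16 transitions.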
|
||||
|
||||
# Optimize policy for K epochs:
|
||||
for _ in range(self.PPO_epochs):
|
||||
for states, actions, rewards, dones, next_states in self.memory.get_all_items().batch(batch_size):
|
||||
self.training_ppo(states, actions, rewards, dones, next_states)
|
||||
|
||||
# Clear the memory
|
||||
self.memory.clear_memory()
|
||||
|
||||
# Copy new weights into old policy:
|
||||
self.actor_old.set_weights(self.actor.get_weights())
|
||||
self.critic_old.set_weights(self.critic.get_weights())
|
||||
|
||||
def save_weights(self):
|
||||
self.actor.save_weights('bipedalwalker_w/actor_ppo', save_format='tf')
|
||||
self.actor_old.save_weights('bipedalwalker_w/actor_old_ppo', save_format='tf')
|
||||
self.critic.save_weights('bipedalwalker_w/critic_ppo', save_format='tf')
|
||||
self.critic_old.save_weights('bipedalwalker_w/critic_old_ppo', save_format='tf')
|
||||
|
||||
def load_weights(self):
|
||||
self.actor.load_weights('bipedalwalker_w/actor_ppo')
|
||||
self.actor_old.load_weights('bipedalwalker_w/actor_old_ppo')
|
||||
self.critic.load_weights('bipedalwalker_w/critic_ppo')
|
||||
self.critic_old.load_weights('bipedalwalker_w/critic_old_ppo')
|
||||
|
||||
def plot(datas):
|
||||
print('----------')
|
||||
|
||||
plt.plot(datas)
|
||||
plt.plot()
|
||||
plt.xlabel('Episode')
|
||||
plt.ylabel('Datas')
|
||||
plt.show()
|
||||
|
||||
print('Max :', np.max(datas))
|
||||
print('Min :', np.min(datas))
|
||||
print('Avg :', np.mean(datas))
|
||||
|
||||
def run_episode(env, agent, state_dim, render, training_mode, t_updates, n_update):
|
||||
############################################
|
||||
state = env.reset()
|
||||
done = False
|
||||
total_reward = 0
|
||||
eps_time = 0
|
||||
############################################
|
||||
|
||||
while not done:
|
||||
action = int(agent.act(state))
|
||||
next_state, reward, done, _ = env.step(action)
|
||||
|
||||
eps_time += 1
|
||||
t_updates += 1
|
||||
total_reward += reward
|
||||
|
||||
if training_mode:
|
||||
agent.save_eps(state.tolist(), action, reward, float(done), next_state.tolist())
|
||||
|
||||
state = next_state
|
||||
|
||||
if render:
|
||||
env.render()
|
||||
|
||||
if training_mode:
|
||||
if t_updates % n_update == 0:
|
||||
agent.update_ppo()
|
||||
t_updates = 0
|
||||
|
||||
if done:
|
||||
return total_reward, eps_time, t_updates
|
||||
|
||||
def main():
|
||||
############## Hyperparameters ##############
|
||||
load_weights = False # If you want to load the agent, set this to True
|
||||
save_weights = False # If you want to save the agent, set this to True
|
||||
training_mode = True # If you want to train the agent, set this to True. But set this otherwise if you only want to test it
|
||||
reward_threshold = 300 # Set the reward threshold. Training stops once the mean reward of the last 100 episodes passes it. Set to None to turn this off
|
||||
using_google_drive = False
|
||||
|
||||
render = False # If you want to display the environment, set this to True. Turn this off if you run this in Google Colab
|
||||
n_update = 32 # How many timesteps to collect before you update the Policy. Recommended set to 128 for Discrete
|
||||
n_plot_batch = 100000000 # Plot the results every this many episodes
|
||||
n_episode = 100000 # How many episode you want to run
|
||||
n_saved = 10 # How many episode to run before saving the weights
|
||||
|
||||
policy_kl_range = 0.0008 # Set to 0.0008 for Discrete
|
||||
policy_params = 20 # Set to 20 for Discrete
|
||||
value_clip = 1.0 # How far the value estimate may move from the old value. Recommended set to the highest or lowest possible reward
|
||||
entropy_coef = 0.05 # Weight of the entropy bonus. Higher values encourage more randomness in actions
|
||||
vf_loss_coef = 1.0 # Just set to 1
|
||||
minibatch = 2 # How many batches per update. Size of batch = n_update / minibatch. Recommended set to 4 for Discrete
|
||||
PPO_epochs = 4 # How many epoch per update
|
||||
|
||||
gamma = 0.99 # Just set to 0.99
|
||||
lam = 0.95 # Just set to 0.95
|
||||
learning_rate = 2.5e-4 # Just set to 2.5e-4
|
||||
#############################################
|
||||
env_name = 'Env Name' # Set the env you want
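# e.g. env_name = 'CartPole-v1' (illustrative choice only; any Gym env with a Discrete action space fits this script)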
|
||||
env = gym.make(env_name)
|
||||
|
||||
state_dim = env.observation_space.shape[0]
|
||||
action_dim = env.action_space.n
|
||||
|
||||
print(action_dim)
|
||||
|
||||
agent = Agent(state_dim, action_dim, training_mode, policy_kl_range, policy_params, value_clip, entropy_coef, vf_loss_coef,
|
||||
minibatch, PPO_epochs, gamma, lam, learning_rate)
|
||||
#############################################
|
||||
if using_google_drive:
|
||||
from google.colab import drive
|
||||
drive.mount('/test')
|
||||
|
||||
if load_weights:
|
||||
agent.load_weights()
|
||||
print('Weight Loaded')
|
||||
|
||||
rewards = []
|
||||
batch_rewards = []
|
||||
batch_solved_reward = []
|
||||
|
||||
times = []
|
||||
batch_times = []
|
||||
|
||||
t_updates = 0
|
||||
|
||||
for i_episode in range(1, n_episode + 1):
|
||||
total_reward, time, t_updates = run_episode(env, agent, state_dim, render, training_mode, t_updates, n_update)
|
||||
print('Episode {} \t t_reward: {} \t time: {} \t '.format(i_episode, total_reward, time))
|
||||
batch_rewards.append(int(total_reward))
|
||||
batch_times.append(time)
|
||||
|
||||
if save_weights:
|
||||
if i_episode % n_saved == 0:
|
||||
agent.save_weights()
|
||||
print('weights saved')
|
||||
|
||||
if reward_threshold:
|
||||
if len(batch_solved_reward) == 100:
|
||||
if np.mean(batch_solved_reward) >= reward_threshold:
|
||||
print('You solved the task after {} episodes'.format(len(rewards)))
|
||||
break
|
||||
|
||||
else:
|
||||
del batch_solved_reward[0]
|
||||
batch_solved_reward.append(total_reward)
|
||||
|
||||
else:
|
||||
batch_solved_reward.append(total_reward)
|
||||
|
||||
if i_episode % n_plot_batch == 0 and i_episode != 0:
|
||||
# Plot the reward, times for every n_plot_batch
|
||||
plot(batch_rewards)
|
||||
plot(batch_times)
|
||||
|
||||
for reward in batch_rewards:
|
||||
rewards.append(reward)
|
||||
|
||||
for time in batch_times:
|
||||
times.append(time)
|
||||
|
||||
batch_rewards = []
|
||||
batch_times = []
|
||||
|
||||
print('========== Cumulative ==========')
|
||||
# Plot the reward, times for every episode
|
||||
plot(rewards)
|
||||
plot(times)
|
||||
|
||||
print('========== Final ==========')
|
||||
# Plot the reward, times for every episode
|
||||
|
||||
for reward in batch_rewards:
|
||||
rewards.append(reward)
|
||||
|
||||
for time in batch_times:
|
||||
times.append(time)
|
||||
|
||||
plot(rewards)
|
||||
plot(times)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
403
PPO_continous/pytorch/ppo_continous_pytorch.py
Normal file
@ -0,0 +1,403 @@
|
||||
import gym
|
||||
from gym.envs.registration import register
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.distributions import Normal
|
||||
from torch.distributions.kl import kl_divergence
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
from torch.optim import Adam
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
|
||||
import numpy as np
|
||||
import sys
|
||||
import numpy
|
||||
import time
|
||||
import datetime
|
||||
|
||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
||||
dataType = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
|
||||
|
||||
class Actor_Model(nn.Module):
|
||||
def __init__(self, state_dim, action_dim):
|
||||
super(Actor_Model, self).__init__()
|
||||
|
||||
self.nn_layer = nn.Sequential(
|
||||
nn.Linear(state_dim, 256),
|
||||
nn.ReLU(),
|
||||
nn.Linear(256, 64),
|
||||
nn.ReLU(),
|
||||
nn.Linear(64, action_dim),
|
||||
nn.Tanh()
|
||||
).float().to(device)
|
||||
|
||||
def forward(self, states):
|
||||
return self.nn_layer(states)
|
||||
|
||||
class Critic_Model(nn.Module):
|
||||
def __init__(self, state_dim, action_dim):
|
||||
super(Critic_Model, self).__init__()
|
||||
|
||||
self.nn_layer = nn.Sequential(
|
||||
nn.Linear(state_dim, 256),
|
||||
nn.ReLU(),
|
||||
nn.Linear(256, 64),
|
||||
nn.ReLU(),
|
||||
nn.Linear(64, 1)
|
||||
).float().to(device)
|
||||
|
||||
def forward(self, states):
|
||||
return self.nn_layer(states)
|
||||
|
||||
class Memory(Dataset):
|
||||
def __init__(self):
|
||||
self.actions = []
|
||||
self.states = []
|
||||
self.rewards = []
|
||||
self.dones = []
|
||||
self.next_states = []
|
||||
|
||||
def __len__(self):
|
||||
return len(self.dones)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
return np.array(self.states[idx], dtype = np.float32), np.array(self.actions[idx], dtype = np.float32), np.array([self.rewards[idx]], dtype = np.float32), np.array([self.dones[idx]], dtype = np.float32), np.array(self.next_states[idx], dtype = np.float32)
|
||||
|
||||
def save_eps(self, state, action, reward, done, next_state):
|
||||
self.states.append(state)
|
||||
self.actions.append(action)
|
||||
self.rewards.append(reward)
|
||||
self.dones.append(done)
|
||||
self.next_states.append(next_state)
|
||||
|
||||
def clear_memory(self):
|
||||
del self.actions[:]
|
||||
del self.states[:]
|
||||
del self.rewards[:]
|
||||
del self.dones[:]
|
||||
del self.next_states[:]
|
||||
|
||||
class Continous():
|
||||
def sample(self, mean, std):
|
||||
distribution = Normal(mean, std)
|
||||
return distribution.sample().float().to(device)
|
||||
|
||||
def entropy(self, mean, std):
|
||||
distribution = Normal(mean, std)
|
||||
return distribution.entropy().float().to(device)
|
||||
|
||||
def logprob(self, mean, std, value_data):
|
||||
distribution = Normal(mean, std)
|
||||
return distribution.log_prob(value_data).float().to(device)
|
||||
|
||||
def kl_divergence(self, mean1, std1, mean2, std2):
|
||||
distribution1 = Normal(mean1, std1)
|
||||
distribution2 = Normal(mean2, std2)
|
||||
|
||||
return kl_divergence(distribution1, distribution2).float().to(device)
|
||||
|
||||
class PolicyFunction():
|
||||
def __init__(self, gamma = 0.99, lam = 0.95):
|
||||
self.gamma = gamma
|
||||
self.lam = lam
|
||||
|
||||
def monte_carlo_discounted(self, rewards, dones):
|
||||
running_add = 0
|
||||
returns = []
|
||||
|
||||
for step in reversed(range(len(rewards))):
|
||||
running_add = rewards[step] + (1.0 - dones[step]) * self.gamma * running_add
|
||||
returns.insert(0, running_add)
|
||||
|
||||
return torch.stack(returns)
|
||||
|
||||
def temporal_difference(self, reward, next_value, done):
|
||||
q_values = reward + (1 - done) * self.gamma * next_value
|
||||
return q_values
|
||||
|
||||
def generalized_advantage_estimation(self, values, rewards, next_values, dones):
|
||||
gae = 0
|
||||
adv = []
|
||||
|
||||
delta = rewards + (1.0 - dones) * self.gamma * next_values - values
|
||||
for step in reversed(range(len(rewards))):
|
||||
gae = delta[step] + (1.0 - dones[step]) * self.gamma * self.lam * gae
|
||||
adv.insert(0, gae)
|
||||
|
||||
return torch.stack(adv)
|
||||
|
||||
class TrulyPPO():
|
||||
def __init__(self, policy_kl_range, policy_params, value_clip, vf_loss_coef, entropy_coef, gamma, lam):
|
||||
self.policy_kl_range = policy_kl_range
|
||||
self.policy_params = policy_params
|
||||
self.value_clip = value_clip
|
||||
self.vf_loss_coef = vf_loss_coef
|
||||
self.entropy_coef = entropy_coef
|
||||
|
||||
self.distributions = Continous()
|
||||
self.policy_function = PolicyFunction(gamma, lam)
|
||||
|
||||
def compute_loss(self, action_mean, action_std, old_action_mean, old_action_std, values, old_values, next_values, actions, rewards, dones):
|
||||
# Don't use old value in backpropagation
|
||||
Old_values = old_values.detach()
|
||||
Old_action_mean = old_action_mean.detach()
|
||||
|
||||
# Getting general advantages estimator and returns
|
||||
Advantages = self.policy_function.generalized_advantage_estimation(values, rewards, next_values, dones)
|
||||
Returns = (Advantages + values).detach()
|
||||
Advantages = ((Advantages - Advantages.mean()) / (Advantages.std() + 1e-6)).detach()
|
||||
|
||||
# Finding the ratio (pi_theta / pi_theta__old):
|
||||
logprobs = self.distributions.logprob(action_mean, action_std, actions)
|
||||
Old_logprobs = self.distributions.logprob(Old_action_mean, old_action_std, actions).detach()
|
||||
|
||||
# Finding Surrogate Loss
|
||||
ratios = (logprobs - Old_logprobs).exp() # ratio = pi(a|s) / pi_old(a|s), i.e. exp(logprob - old_logprob)
|
||||
Kl = self.distributions.kl_divergence(Old_action_mean, old_action_std, action_mean, action_std)
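# Truly PPO rollback: once the KL divergence passes policy_kl_range while the ratio
# is still above 1, the surrogate is penalized by policy_params * Kl.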
|
||||
|
||||
pg_targets = torch.where(
|
||||
(Kl >= self.policy_kl_range) & (ratios > 1),
|
||||
ratios * Advantages - self.policy_params * Kl,
|
||||
ratios * Advantages
|
||||
)
|
||||
pg_loss = pg_targets.mean()
|
||||
|
||||
# Getting entropy from the action probability
|
||||
dist_entropy = self.distributions.entropy(action_mean, action_std).mean()
|
||||
|
||||
# Getting Critic loss by using Clipped critic value
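# With value_clip set to None this falls back to a plain 0.5 * MSE against the returns;
# otherwise the new value may move at most value_clip away from the old value before
# the elementwise maximum of the two losses is taken.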
|
||||
if self.value_clip is None:
|
||||
critic_loss = ((Returns - values).pow(2) * 0.5).mean()
|
||||
else:
|
||||
vpredclipped = old_values + torch.clamp(values - Old_values, -self.value_clip, self.value_clip) # Minimize the difference between old value and new value
|
||||
vf_losses1 = (Returns - values).pow(2) * 0.5 # Mean Squared Error
|
||||
vf_losses2 = (Returns - vpredclipped).pow(2) * 0.5 # Mean Squared Error
|
||||
critic_loss = torch.max(vf_losses1, vf_losses2).mean()
|
||||
|
||||
# We need to maximize the Policy Loss so the agent keeps finding better rewards
|
||||
# and minimize Critic Loss
|
||||
loss = (critic_loss * self.vf_loss_coef) - (dist_entropy * self.entropy_coef) - pg_loss
|
||||
return loss
|
||||
|
||||
class Agent():
|
||||
def __init__(self, state_dim, action_dim, is_training_mode, policy_kl_range, policy_params, value_clip, entropy_coef, vf_loss_coef,
|
||||
minibatch, PPO_epochs, gamma, lam, learning_rate):
|
||||
self.policy_kl_range = policy_kl_range
|
||||
self.policy_params = policy_params
|
||||
self.value_clip = value_clip
|
||||
self.entropy_coef = entropy_coef
|
||||
self.vf_loss_coef = vf_loss_coef
|
||||
self.minibatch = minibatch
|
||||
self.PPO_epochs = PPO_epochs
|
||||
self.is_training_mode = is_training_mode
|
||||
self.action_dim = action_dim
|
||||
self.std = torch.ones([1, action_dim]).float().to(device)
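# The Gaussian policy uses a fixed std of 1 for every action dimension;
# only the action mean produced by the actor (tanh output) is trained.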
|
||||
|
||||
self.actor = Actor_Model(state_dim, action_dim)
|
||||
self.actor_old = Actor_Model(state_dim, action_dim)
|
||||
self.actor_optimizer = Adam(self.actor.parameters(), lr = learning_rate)
|
||||
|
||||
self.critic = Critic_Model(state_dim, action_dim)
|
||||
self.critic_old = Critic_Model(state_dim, action_dim)
|
||||
self.critic_optimizer = Adam(self.critic.parameters(), lr = learning_rate)
|
||||
|
||||
self.memory = Memory()
|
||||
self.policy_function = PolicyFunction(gamma, lam)
|
||||
|
||||
self.distributions = Continous()
|
||||
self.policy_loss = TrulyPPO(policy_kl_range, policy_params, value_clip, vf_loss_coef, entropy_coef, gamma, lam)
|
||||
|
||||
if is_training_mode:
|
||||
self.actor.train()
|
||||
self.critic.train()
|
||||
else:
|
||||
self.actor.eval()
|
||||
self.critic.eval()
|
||||
|
||||
def save_eps(self, state, action, reward, done, next_state):
|
||||
self.memory.save_eps(state, action, reward, done, next_state)
|
||||
|
||||
def act(self, state):
|
||||
state = torch.FloatTensor(state).unsqueeze(0).to(device).detach()
|
||||
action_mean = self.actor(state)
|
||||
|
||||
# We don't need to sample the action in Test Mode
|
||||
# we only sample the action in Training Mode in order to explore the action space
|
||||
if self.is_training_mode:
|
||||
# Sample the action
|
||||
action = self.distributions.sample(action_mean, self.std)
|
||||
else:
|
||||
action = action_mean
|
||||
|
||||
return action.squeeze(0).cpu().numpy()
|
||||
|
||||
# Get loss and Do backpropagation
|
||||
def training_ppo(self, states, actions, rewards, dones, next_states):
|
||||
action_mean, values = self.actor(states), self.critic(states)
|
||||
old_action_mean, old_values = self.actor_old(states), self.critic_old(states)
|
||||
next_values = self.critic(next_states)
|
||||
|
||||
loss = self.policy_loss.compute_loss(action_mean, self.std, old_action_mean, self.std, values, old_values, next_values, actions, rewards, dones)
|
||||
|
||||
self.actor_optimizer.zero_grad()
|
||||
self.critic_optimizer.zero_grad()
|
||||
|
||||
loss.backward()
|
||||
|
||||
self.actor_optimizer.step()
|
||||
self.critic_optimizer.step()
|
||||
|
||||
# Update the model
|
||||
def update_ppo(self):
|
||||
batch_size = int(len(self.memory) / self.minibatch)
|
||||
dataloader = DataLoader(self.memory, batch_size, shuffle = False)
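# The rollout is iterated in `minibatch` chunks per epoch; with the defaults in main()
# (n_update = 1024, minibatch = 32) each DataLoader batch holds 32 transitions.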
|
||||
|
||||
# Optimize policy for K epochs:
|
||||
for _ in range(self.PPO_epochs):
|
||||
for states, actions, rewards, dones, next_states in dataloader:
|
||||
self.training_ppo(states.float().to(device), actions.float().to(device), rewards.float().to(device), dones.float().to(device), next_states.float().to(device))
|
||||
|
||||
# Clear the memory
|
||||
self.memory.clear_memory()
|
||||
|
||||
# Copy new weights into old policy:
|
||||
self.actor_old.load_state_dict(self.actor.state_dict())
|
||||
self.critic_old.load_state_dict(self.critic.state_dict())
|
||||
|
||||
def save_weights(self):
|
||||
torch.save({
|
||||
'model_state_dict': self.actor.state_dict(),
|
||||
'optimizer_state_dict': self.actor_optimizer.state_dict()
|
||||
}, '/test/My Drive/Bipedal4/actor.tar')
|
||||
|
||||
torch.save({
|
||||
'model_state_dict': self.critic.state_dict(),
|
||||
'optimizer_state_dict': self.critic_optimizer.state_dict()
|
||||
}, '/test/My Drive/Bipedal4/critic.tar')
|
||||
|
||||
def load_weights(self):
|
||||
actor_checkpoint = torch.load('/test/My Drive/Bipedal4/actor.tar')
|
||||
self.actor.load_state_dict(actor_checkpoint['model_state_dict'])
|
||||
self.actor_optimizer.load_state_dict(actor_checkpoint['optimizer_state_dict'])
|
||||
|
||||
critic_checkpoint = torch.load('/test/My Drive/Bipedal4/critic.tar')
|
||||
self.critic.load_state_dict(critic_checkpoint['model_state_dict'])
|
||||
self.critic_optimizer.load_state_dict(critic_checkpoint['optimizer_state_dict'])
|
||||
|
||||
class Runner():
|
||||
def __init__(self, env, agent, render, training_mode, n_update):
|
||||
self.env = env
|
||||
self.agent = agent
|
||||
self.render = render
|
||||
self.training_mode = training_mode
|
||||
|
||||
self.n_update = n_update
|
||||
self.t_updates = 0
|
||||
|
||||
def run_episode(self):
|
||||
############################################
|
||||
state = self.env.reset()
|
||||
done = False
|
||||
total_reward = 0
|
||||
eps_time = 0
|
||||
############################################
|
||||
for _ in range(10000):
|
||||
action = self.agent.act(state)
|
||||
next_state, reward, done, _ = self.env.step(action)
|
||||
|
||||
eps_time += 1
|
||||
self.t_updates += 1
|
||||
total_reward += reward
|
||||
|
||||
if self.training_mode:
|
||||
self.agent.save_eps(state.tolist(), action, reward, float(done), next_state.tolist())
|
||||
|
||||
state = next_state
|
||||
|
||||
if self.render:
|
||||
self.env.render()
|
||||
|
||||
if self.training_mode and self.n_update is not None and self.t_updates == self.n_update:
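# Enough transitions collected: run a PPO update and reset the step counter.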
|
||||
self.agent.update_ppo()
|
||||
self.t_updates = 0
|
||||
|
||||
if done:
|
||||
break
|
||||
|
||||
if self.training_mode and self.n_update is None:
|
||||
self.agent.update_ppo()
|
||||
|
||||
return total_reward, eps_time
|
||||
|
||||
def main():
|
||||
############## Hyperparameters ##############
|
||||
load_weights = False # If you want to load the agent, set this to True
|
||||
save_weights = False # If you want to save the agent, set this to True
|
||||
training_mode = True # If you want to train the agent, set this to True. But set this otherwise if you only want to test it
|
||||
reward_threshold = 495 # Set the reward threshold. Training will stop once the reward has passed this threshold. Set to None to turn this off
|
||||
using_google_drive = False
|
||||
|
||||
render = False # If you want to display the environment, set this to True. Turn this off if you run this in Google Colab
|
||||
n_update = 1024 # How many timesteps to collect before you update the Policy. Recommended set to 128 for Discrete
|
||||
n_plot_batch = 100000000 # Plot the results every this many episodes
|
||||
n_episode = 100000 # How many episode you want to run
|
||||
n_saved = 10 # How many episode to run before saving the weights
|
||||
|
||||
policy_kl_range = 0.03 # Recommended set to 0.0008 for Discrete
|
||||
policy_params = 5 # Recommended set to 20 for Discrete
|
||||
value_clip = 1.0 # How far the value estimate may move from the old value. Recommended set to the highest or lowest possible reward
|
||||
entropy_coef = 0.0 # Weight of the entropy bonus. Higher values encourage more randomness in actions
|
||||
vf_loss_coef = 1.0 # Just set to 1
|
||||
minibatch = 32 # How many batches per update. Size of batch = n_update / minibatch. Recommended set to 4 for Discrete
|
||||
PPO_epochs = 10 # How many epoch per update. Recommended set to 10 for Discrete
|
||||
|
||||
gamma = 0.99 # Just set to 0.99
|
||||
lam = 0.95 # Just set to 0.95
|
||||
learning_rate = 3e-4 # Just set to 3e-4
|
||||
#############################################
|
||||
writer = SummaryWriter()
|
||||
|
||||
env_name = 'BipedalWalker-v3' # Set the env you want
|
||||
env = gym.make(env_name)
|
||||
|
||||
state_dim = env.observation_space.shape[0]
|
||||
action_dim = env.action_space.shape[0]
|
||||
|
||||
agent = Agent(state_dim, action_dim, training_mode, policy_kl_range, policy_params, value_clip, entropy_coef, vf_loss_coef,
|
||||
minibatch, PPO_epochs, gamma, lam, learning_rate)
|
||||
|
||||
runner = Runner(env, agent, render, training_mode, n_update)
|
||||
#############################################
|
||||
if using_google_drive:
|
||||
from google.colab import drive
|
||||
drive.mount('/test')
|
||||
|
||||
if load_weights:
|
||||
agent.load_weights()
|
||||
print('Weight Loaded')
|
||||
|
||||
print('Run the training!!')
|
||||
start = time.time()
|
||||
|
||||
try:
|
||||
for i_episode in range(1, n_episode + 1):
|
||||
total_reward, eps_time = runner.run_episode()
|
||||
|
||||
print('Episode: {} \t t_reward: {} \t time: {} \t '.format(i_episode, total_reward, eps_time))
|
||||
writer.add_scalar('rewards', total_reward, i_episode)
|
||||
|
||||
if save_weights:
|
||||
if i_episode % n_saved == 0:
|
||||
agent.save_weights()
|
||||
print('weights saved')
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print('\nTraining has been shut down\n')
|
||||
|
||||
finally:
|
||||
finish = time.time()
|
||||
timedelta = finish - start
|
||||
print('Time length: {}'.format(str( datetime.timedelta(seconds = timedelta) )))
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
411
PPO_continous/tensorflow/ppo_continous_tensorflow.py
Normal file
@ -0,0 +1,411 @@
|
||||
import gym
|
||||
from gym.envs.registration import register
|
||||
|
||||
import tensorflow as tf
|
||||
import tensorflow_probability as tfp
|
||||
from tensorflow.keras.layers import Dense
|
||||
from tensorflow.keras import Model
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import sys
|
||||
import numpy
|
||||
|
||||
class Utils():
|
||||
def prepro(self, I):
|
||||
I = I[35:195] # crop
|
||||
I = I[::2,::2, 0] # downsample by factor of 2
|
||||
I[I == 144] = 0 # erase background (background type 1)
|
||||
I[I == 109] = 0 # erase background (background type 2)
|
||||
I[I != 0] = 1 # everything else (paddles, ball) just set to 1
|
||||
X = I.astype(np.float32).ravel() # Combine items in 1 array
|
||||
return X
|
||||
|
||||
class Actor_Model(Model):
|
||||
def __init__(self, state_dim, action_dim):
|
||||
super(Actor_Model, self).__init__()
|
||||
self.d1 = Dense(64, activation='relu')
|
||||
self.d2 = Dense(64, activation='relu')
|
||||
self.dout = Dense(action_dim, activation='tanh')
|
||||
|
||||
def call(self, x):
|
||||
x = self.d1(x)
|
||||
x = self.d2(x)
|
||||
return self.dout(x)
|
||||
|
||||
class Critic_Model(Model):
|
||||
def __init__(self, state_dim, action_dim):
|
||||
super(Critic_Model, self).__init__()
|
||||
self.d1 = Dense(64, activation='relu')
|
||||
self.d2 = Dense(64, activation='relu')
|
||||
self.dout = Dense(1, activation='linear')
|
||||
|
||||
def call(self, x):
|
||||
x = self.d1(x)
|
||||
x = self.d2(x)
|
||||
return self.dout(x)
|
||||
|
||||
class Memory():
|
||||
def __init__(self):
|
||||
self.actions = []
|
||||
self.states = []
|
||||
self.rewards = []
|
||||
self.dones = []
|
||||
self.next_states = []
|
||||
|
||||
def __len__(self):
|
||||
return len(self.dones)
|
||||
|
||||
def get_all_items(self):
|
||||
states = tf.constant(self.states, dtype = tf.float32)
|
||||
actions = tf.constant(self.actions, dtype = tf.float32)
|
||||
rewards = tf.expand_dims(tf.constant(self.rewards, dtype = tf.float32), 1)
|
||||
dones = tf.expand_dims(tf.constant(self.dones, dtype = tf.float32), 1)
|
||||
next_states = tf.constant(self.next_states, dtype = tf.float32)
|
||||
|
||||
return tf.data.Dataset.from_tensor_slices((states, actions, rewards, dones, next_states))
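# Wrapping the rollout in a tf.data.Dataset lets update_ppo stream it in batches.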
|
||||
|
||||
def save_eps(self, state, action, reward, done, next_state):
|
||||
self.rewards.append(reward)
|
||||
self.states.append(state)
|
||||
self.actions.append(action)
|
||||
self.dones.append(done)
|
||||
self.next_states.append(next_state)
|
||||
|
||||
def clear_memory(self):
|
||||
del self.actions[:]
|
||||
del self.states[:]
|
||||
del self.rewards[:]
|
||||
del self.dones[:]
|
||||
del self.next_states[:]
|
||||
|
||||
class Distributions():
|
||||
def sample(self, mean, std):
|
||||
distribution = tfp.distributions.Normal(mean, std)
|
||||
return distribution.sample()
|
||||
|
||||
def entropy(self, mean, std):
|
||||
distribution = tfp.distributions.Normal(mean, std)
|
||||
return distribution.entropy()
|
||||
|
||||
def logprob(self, mean, std, value_data):
|
||||
distribution = tfp.distributions.Normal(mean, std)
|
||||
return distribution.log_prob(value_data)
|
||||
|
||||
def kl_divergence(self, mean1, std1, mean2, std2):
|
||||
distribution1 = tfp.distributions.Normal(mean1, std1)
|
||||
distribution2 = tfp.distributions.Normal(mean2, std2)
|
||||
|
||||
return tfp.distributions.kl_divergence(distribution1, distribution2)
|
||||
|
||||
class PolicyFunction():
|
||||
def __init__(self, gamma = 0.99, lam = 0.95):
|
||||
self.gamma = gamma
|
||||
self.lam = lam
|
||||
|
||||
def monte_carlo_discounted(self, rewards, dones):
|
||||
running_add = 0
|
||||
returns = []
|
||||
|
||||
for step in reversed(range(len(rewards))):
|
||||
running_add = rewards[step] + (1.0 - dones[step]) * self.gamma * running_add
|
||||
returns.insert(0, running_add)
|
||||
|
||||
return tf.stack(returns)
|
||||
|
||||
def temporal_difference(self, reward, next_value, done):
|
||||
q_values = reward + (1 - done) * self.gamma * next_value
|
||||
return q_values
|
||||
|
||||
def generalized_advantage_estimation(self, values, rewards, next_values, dones):
|
||||
gae = 0
|
||||
adv = []
|
||||
|
||||
delta = rewards + (1.0 - dones) * self.gamma * next_values - values
|
||||
for step in reversed(range(len(rewards))):
|
||||
gae = delta[step] + (1.0 - dones[step]) * self.gamma * self.lam * gae
|
||||
adv.insert(0, gae)
|
||||
|
||||
return tf.stack(adv)
|
||||
|
||||
class Agent():
|
||||
def __init__(self, state_dim, action_dim, is_training_mode, policy_kl_range, policy_params, value_clip, entropy_coef, vf_loss_coef,
|
||||
minibatch, PPO_epochs, gamma, lam, learning_rate):
|
||||
self.policy_kl_range = policy_kl_range
|
||||
self.policy_params = policy_params
|
||||
self.value_clip = value_clip
|
||||
self.entropy_coef = entropy_coef
|
||||
self.vf_loss_coef = vf_loss_coef
|
||||
self.minibatch = minibatch
|
||||
self.PPO_epochs = PPO_epochs
|
||||
self.is_training_mode = is_training_mode
|
||||
self.action_dim = action_dim
|
||||
self.std = tf.ones([1, action_dim])
|
||||
|
||||
self.actor = Actor_Model(state_dim, action_dim)
|
||||
self.actor_old = Actor_Model(state_dim, action_dim)
|
||||
|
||||
self.critic = Critic_Model(state_dim, action_dim)
|
||||
self.critic_old = Critic_Model(state_dim, action_dim)
|
||||
|
||||
self.optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate)
|
||||
self.memory = Memory()
|
||||
self.policy_function = PolicyFunction(gamma, lam)
|
||||
self.distributions = Distributions()
|
||||
|
||||
def save_eps(self, state, action, reward, done, next_state):
|
||||
self.memory.save_eps(state, action, reward, done, next_state)
|
||||
|
||||
# Loss for PPO
|
||||
def get_loss(self, action_mean, values, old_action_mean, old_values, next_values, actions, rewards, dones):
|
||||
# Don't use old value in backpropagation
|
||||
Old_values = tf.stop_gradient(old_values)
|
||||
|
||||
# Getting general advantages estimator
|
||||
Advantages = self.policy_function.generalized_advantage_estimation(values, rewards, next_values, dones)
|
||||
Returns = tf.stop_gradient(Advantages + values)
|
||||
Advantages = tf.stop_gradient((Advantages - tf.math.reduce_mean(Advantages)) / (tf.math.reduce_std(Advantages) + 1e-6))
|
||||
|
||||
# Finding the ratio (pi_theta / pi_theta__old):
|
||||
logprobs = self.distributions.logprob(action_mean, self.std, actions)
|
||||
Old_logprobs = tf.stop_gradient(self.distributions.logprob(old_action_mean, self.std, actions))
|
||||
ratios = tf.math.exp(logprobs - Old_logprobs) # ratio = pi(a|s) / pi_old(a|s), i.e. exp(logprob - old_logprob)
|
||||
|
||||
# Finding KL Divergence
|
||||
Kl = self.distributions.kl_divergence(old_action_mean, self.std, action_mean, self.std)
|
||||
|
||||
# Combining TR-PPO with Rollback (Truly PPO)
|
||||
pg_loss = tf.where(
|
||||
tf.logical_and(Kl >= self.policy_kl_range, ratios > 1),
|
||||
ratios * Advantages - self.policy_params * Kl,
|
||||
ratios * Advantages
|
||||
)
|
||||
pg_loss = tf.math.reduce_mean(pg_loss)
|
||||
|
||||
# Getting entropy from the action probability
|
||||
dist_entropy = tf.math.reduce_mean(self.distributions.entropy(action_mean, self.std))
|
||||
|
||||
# Getting critic loss by using Clipped critic value
|
||||
vpredclipped = old_values + tf.clip_by_value(values - Old_values, -self.value_clip, self.value_clip) # Minimize the difference between old value and new value
|
||||
vf_losses1 = tf.math.square(Returns - values) * 0.5 # Mean Squared Error
|
||||
vf_losses2 = tf.math.square(Returns - vpredclipped) * 0.5 # Mean Squared Error
|
||||
critic_loss = tf.math.reduce_mean(tf.math.maximum(vf_losses1, vf_losses2))
|
||||
|
||||
# We need to maximize the Policy Loss so the agent keeps finding better rewards
|
||||
# and minimize Critic Loss
|
||||
loss = (critic_loss * self.vf_loss_coef) - (dist_entropy * self.entropy_coef) - pg_loss
|
||||
return loss
|
||||
|
||||
@tf.function
|
||||
def act(self, state):
|
||||
state = tf.expand_dims(tf.cast(state, dtype = tf.float32), 0)
|
||||
action_mean = self.actor(state)
|
||||
|
||||
# We don't need to sample the action in Test Mode
|
||||
# we only sample the action in Training Mode in order to explore the action space
|
||||
if self.is_training_mode:
|
||||
# Sample the action
|
||||
action = self.distributions.sample(action_mean, self.std)
|
||||
else:
|
||||
action = action_mean
|
||||
|
||||
return tf.squeeze(action, 0)
|
||||
|
||||
# Get loss and Do backpropagation
|
||||
@tf.function
|
||||
def training_ppo(self, states, actions, rewards, dones, next_states):
|
||||
with tf.GradientTape() as tape:
|
||||
action_mean, values = self.actor(states), self.critic(states)
|
||||
old_action_mean, old_values = self.actor_old(states), self.critic_old(states)
|
||||
next_values = self.critic(next_states)
|
||||
|
||||
loss = self.get_loss(action_mean, values, old_action_mean, old_values, next_values, actions, rewards, dones)
|
||||
|
||||
gradients = tape.gradient(loss, self.actor.trainable_variables + self.critic.trainable_variables)
|
||||
self.optimizer.apply_gradients(zip(gradients, self.actor.trainable_variables + self.critic.trainable_variables))
|
||||
|
||||
# Update the model
|
||||
def update_ppo(self):
|
||||
batch_size = int(len(self.memory) / self.minibatch)
|
||||
|
||||
# Optimize policy for K epochs:
|
||||
for _ in range(self.PPO_epochs):
|
||||
for states, actions, rewards, dones, next_states in self.memory.get_all_items().batch(batch_size):
|
||||
self.training_ppo(states, actions, rewards, dones, next_states)
|
||||
|
||||
# Clear the memory
|
||||
self.memory.clear_memory()
|
||||
|
||||
# Copy new weights into old policy:
|
||||
self.actor_old.set_weights(self.actor.get_weights())
|
||||
self.critic_old.set_weights(self.critic.get_weights())
|
||||
|
||||
def save_weights(self):
|
||||
self.actor.save_weights('bipedalwalker_w/actor_ppo', save_format='tf')
|
||||
self.actor_old.save_weights('bipedalwalker_w/actor_old_ppo', save_format='tf')
|
||||
self.critic.save_weights('bipedalwalker_w/critic_ppo', save_format='tf')
|
||||
self.critic_old.save_weights('bipedalwalker_w/critic_old_ppo', save_format='tf')
|
||||
|
||||
def load_weights(self):
|
||||
self.actor.load_weights('bipedalwalker_w/actor_ppo')
|
||||
self.actor_old.load_weights('bipedalwalker_w/actor_old_ppo')
|
||||
self.critic.load_weights('bipedalwalker_w/critic_ppo')
|
||||
self.critic_old.load_weights('bipedalwalker_w/critic_old_ppo')
|
||||
|
||||
def plot(datas):
|
||||
print('----------')
|
||||
|
||||
plt.plot(datas)
|
||||
plt.plot()
|
||||
plt.xlabel('Episode')
|
||||
plt.ylabel('Datas')
|
||||
plt.show()
|
||||
|
||||
print('Max :', np.max(datas))
|
||||
print('Min :', np.min(datas))
|
||||
print('Avg :', np.mean(datas))
|
||||
|
||||
def run_episode(env, agent, state_dim, render, training_mode, t_updates, n_update):
|
||||
############################################
|
||||
state = env.reset()
|
||||
done = False
|
||||
total_reward = 0
|
||||
eps_time = 0
|
||||
############################################
|
||||
|
||||
while not done:
|
||||
action = agent.act(state).numpy()
|
||||
next_state, reward, done, _ = env.step(action)
|
||||
|
||||
eps_time += 1
|
||||
t_updates += 1
|
||||
total_reward += reward
|
||||
|
||||
if training_mode:
|
||||
agent.save_eps(state.tolist(), action, reward, float(done), next_state.tolist())
|
||||
|
||||
state = next_state
|
||||
|
||||
if render:
|
||||
env.render()
|
||||
|
||||
if training_mode:
|
||||
if t_updates % n_update == 0:
|
||||
agent.update_ppo()
|
||||
t_updates = 0
|
||||
|
||||
if done:
|
||||
return total_reward, eps_time, t_updates
|
||||
|
||||
def main():
|
||||
############## Hyperparameters ##############
|
||||
load_weights = False # If you want to load the agent, set this to True
|
||||
save_weights = False # If you want to save the agent, set this to True
|
||||
training_mode = True # If you want to train the agent, set this to True. But set this otherwise if you only want to test it
|
||||
reward_threshold = 300 # Set the reward threshold. Training stops once the mean reward of the last 100 episodes passes it. Set to None to turn this off
|
||||
using_google_drive = False
|
||||
|
||||
render = False # If you want to display the environment, set this to True. Turn this off if you run this in Google Colab
|
||||
n_update = 1024 # How many timesteps to collect before you update the Policy. Recommended set to 128 for Discrete
|
||||
n_plot_batch = 100000000 # Plot the results every this many episodes
|
||||
n_episode = 100000 # How many episode you want to run
|
||||
n_saved = 10 # How many episode to run before saving the weights
|
||||
|
||||
policy_kl_range = 0.03 # Set to 0.0008 for Discrete
|
||||
policy_params = 5 # Set to 20 for Discrete
|
||||
value_clip = 1.0 # How far the value estimate may move from the old value. Recommended set to the highest or lowest possible reward
|
||||
entropy_coef = 0.05 # Weight of the entropy bonus. Higher values encourage more randomness in actions
|
||||
vf_loss_coef = 1.0 # Just set to 1
|
||||
minibatch = 32 # How many batches per update. Size of batch = n_update / minibatch. Recommended set to 4 for Discrete
|
||||
PPO_epochs = 10 # How many epoch per update
|
||||
|
||||
gamma = 0.99 # Just set to 0.99
|
||||
lam = 0.95 # Just set to 0.95
|
||||
learning_rate = 3e-4 # Just set to 3e-4
|
||||
#############################################
|
||||
env_name = 'Env Name' # Set the env you want
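# e.g. env_name = 'BipedalWalker-v3' (illustrative choice only; any Gym env with a Box action space fits this script)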
|
||||
env = gym.make(env_name)
|
||||
|
||||
state_dim = env.observation_space.shape[0]
|
||||
action_dim = env.action_space.shape[0]
|
||||
|
||||
print(action_dim)
|
||||
|
||||
agent = Agent(state_dim, action_dim, training_mode, policy_kl_range, policy_params, value_clip, entropy_coef, vf_loss_coef,
|
||||
minibatch, PPO_epochs, gamma, lam, learning_rate)
|
||||
#############################################
|
||||
if using_google_drive:
|
||||
from google.colab import drive
|
||||
drive.mount('/test')
|
||||
|
||||
if load_weights:
|
||||
agent.load_weights()
|
||||
print('Weight Loaded')
|
||||
|
||||
rewards = []
|
||||
batch_rewards = []
|
||||
batch_solved_reward = []
|
||||
|
||||
times = []
|
||||
batch_times = []
|
||||
|
||||
total_time = 0
|
||||
t_updates = 0
|
||||
|
||||
for i_episode in range(1, n_episode + 1):
|
||||
total_reward, time, t_updates = run_episode(env, agent, state_dim, render, training_mode, t_updates, n_update)
|
||||
print('Episode {} \t t_reward: {} \t time: {} \t '.format(i_episode, total_reward, time))
|
||||
batch_rewards.append(int(total_reward))
|
||||
batch_times.append(time)
|
||||
|
||||
if save_weights:
|
||||
if i_episode % n_saved == 0:
|
||||
agent.save_weights()
|
||||
print('weights saved')
|
||||
|
||||
if reward_threshold:
|
||||
if len(batch_solved_reward) == 100:
|
||||
if np.mean(batch_solved_reward) >= reward_threshold:
|
||||
print('You solved the task after {} episodes'.format(len(rewards)))
|
||||
break
|
||||
|
||||
else:
|
||||
del batch_solved_reward[0]
|
||||
batch_solved_reward.append(total_reward)
|
||||
|
||||
else:
|
||||
batch_solved_reward.append(total_reward)
|
||||
|
||||
if i_episode % n_plot_batch == 0 and i_episode != 0:
|
||||
# Plot the reward, times for every n_plot_batch
|
||||
plot(batch_rewards)
|
||||
plot(batch_times)
|
||||
|
||||
for reward in batch_rewards:
|
||||
rewards.append(reward)
|
||||
|
||||
for time in batch_times:
|
||||
times.append(time)
|
||||
|
||||
batch_rewards = []
|
||||
batch_times = []
|
||||
|
||||
print('========== Cumulative ==========')
|
||||
# Plot the reward, times for every episode
|
||||
plot(rewards)
|
||||
plot(times)
|
||||
|
||||
print('========== Final ==========')
|
||||
# Plot the reward, times for every episode
|
||||
|
||||
for reward in batch_rewards:
|
||||
rewards.append(reward)
|
||||
|
||||
for time in batch_times:
|
||||
times.append(time)
|
||||
|
||||
plot(rewards)
|
||||
plot(times)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|