add initial code
This commit is contained in:
parent 0e16595372
commit 9c801f0adc

420  PPO/pytorch/ppo_pong_pytorch.py  (new file)
@@ -0,0 +1,420 @@
import gym
from gym.envs.registration import register

import torch
import torch.nn as nn
from torch.distributions import Categorical
from torch.distributions.kl import kl_divergence
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torch.utils.tensorboard import SummaryWriter

import numpy as np
import sys
import numpy
import time
import datetime

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
dataType = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor

class Utils():
    def prepro(self, I):
        I = I[35:195]      # crop the playing area
        I = I[::2, ::2, 0] # downsample by factor of 2 and keep one colour channel
        I[I == 144] = 0    # erase background (background type 1)
        I[I == 109] = 0    # erase background (background type 2)
        I[I != 0] = 1      # everything else (paddles, ball) just set to 1
        X = I.astype(np.float32).ravel() # flatten into a single 1D array
        return X
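
# Note (illustration only, not part of the original file): prepro crops the 210x160x3 Atari
# frame to the 160x160 playing area, keeps every second pixel of one colour channel and
# flattens the result, so the output is an 80 * 80 = 6400 element float32 vector -- the
# state_dim used by the Pong scripts. A quick hypothetical check:
#   dummy_frame = np.zeros((210, 160, 3), dtype = np.uint8)
#   assert Utils().prepro(dummy_frame).shape == (6400,)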

class Actor_Model(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Actor_Model, self).__init__()

        self.nn_layer = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, action_dim),
            nn.Softmax(-1)
        ).float().to(device)

    def forward(self, states):
        return self.nn_layer(states)

class Critic_Model(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Critic_Model, self).__init__()

        self.nn_layer = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        ).float().to(device)

    def forward(self, states):
        return self.nn_layer(states)
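
# Note (illustration only, not part of the original file): the actor ends in Softmax(-1), so
# it outputs a probability vector over the action_dim discrete actions, while the critic
# outputs a single state value. The Discrete helper below wraps the actor output in
# torch.distributions.Categorical for sampling, log-probabilities, entropy and KL, e.g.
#   actor = Actor_Model(state_dim = 4, action_dim = 2)
#   probs = actor(torch.zeros(1, 4).to(device))   # shape (1, 2), each row sums to 1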

class Memory(Dataset):
    def __init__(self):
        self.actions = []
        self.states = []
        self.rewards = []
        self.dones = []
        self.next_states = []

    def __len__(self):
        return len(self.dones)

    def __getitem__(self, idx):
        return (np.array(self.states[idx], dtype = np.float32),
                np.array(self.actions[idx], dtype = np.float32),
                np.array([self.rewards[idx]], dtype = np.float32),
                np.array([self.dones[idx]], dtype = np.float32),
                np.array(self.next_states[idx], dtype = np.float32))

    def save_eps(self, state, action, reward, done, next_state):
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.dones.append(done)
        self.next_states.append(next_state)

    def clear_memory(self):
        del self.actions[:]
        del self.states[:]
        del self.rewards[:]
        del self.dones[:]
        del self.next_states[:]
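
# Usage sketch (illustration only, not part of the original file): because Memory subclasses
# torch.utils.data.Dataset, update_ppo() below can hand it straight to a DataLoader.
# Hypothetical example:
#   memory = Memory()
#   memory.save_eps(state = [0.0] * 4, action = 1, reward = 1.0, done = 0.0, next_state = [0.1] * 4)
#   loader = DataLoader(memory, batch_size = 2, shuffle = False)
#   for states, actions, rewards, dones, next_states in loader:
#       pass  # each tensor carries a leading batch dimension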

class Discrete():
    def sample(self, datas):
        distribution = Categorical(datas)
        return distribution.sample().float().to(device)

    def entropy(self, datas):
        distribution = Categorical(datas)
        return distribution.entropy().float().to(device)

    def logprob(self, datas, value_data):
        distribution = Categorical(datas)
        return distribution.log_prob(value_data).unsqueeze(1).float().to(device)

    def kl_divergence(self, datas1, datas2):
        distribution1 = Categorical(datas1)
        distribution2 = Categorical(datas2)

        return kl_divergence(distribution1, distribution2).unsqueeze(1).float().to(device)

class PolicyFunction():
    def __init__(self, gamma = 0.99, lam = 0.95):
        self.gamma = gamma
        self.lam = lam

    def monte_carlo_discounted(self, rewards, dones):
        running_add = 0
        returns = []

        for step in reversed(range(len(rewards))):
            running_add = rewards[step] + (1.0 - dones[step]) * self.gamma * running_add
            returns.insert(0, running_add)

        return torch.stack(returns)

    def temporal_difference(self, reward, next_value, done):
        q_values = reward + (1 - done) * self.gamma * next_value
        return q_values

    def generalized_advantage_estimation(self, values, rewards, next_values, dones):
        gae = 0
        adv = []

        delta = rewards + (1.0 - dones) * self.gamma * next_values - values
        for step in reversed(range(len(rewards))):
            gae = delta[step] + (1.0 - dones[step]) * self.gamma * self.lam * gae
            adv.insert(0, gae)

        return torch.stack(adv)
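
# A worked note on generalized_advantage_estimation above (illustration only, not part of
# the original file). Per step it computes
#   delta_t = r_t + gamma * (1 - d_t) * V(s_{t+1}) - V(s_t)
#   A_t     = delta_t + gamma * lam * (1 - d_t) * A_{t+1}
# accumulated backwards over the trajectory. A hypothetical call with (T, 1) shaped tensors,
# matching what the DataLoader delivers:
#   pf  = PolicyFunction(gamma = 0.99, lam = 0.95)
#   adv = pf.generalized_advantage_estimation(
#       values      = torch.tensor([[0.5], [0.6], [0.7]]),
#       rewards     = torch.tensor([[0.0], [0.0], [1.0]]),
#       next_values = torch.tensor([[0.6], [0.7], [0.0]]),
#       dones       = torch.tensor([[0.0], [0.0], [1.0]]))   # adv has shape (3, 1)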

class TrulyPPO():
    def __init__(self, policy_kl_range, policy_params, value_clip, vf_loss_coef, entropy_coef, gamma, lam):
        self.policy_kl_range = policy_kl_range
        self.policy_params = policy_params
        self.value_clip = value_clip
        self.vf_loss_coef = vf_loss_coef
        self.entropy_coef = entropy_coef

        self.distributions = Discrete()
        self.policy_function = PolicyFunction(gamma, lam)

    # Loss for PPO
    def compute_loss(self, action_probs, old_action_probs, values, old_values, next_values, actions, rewards, dones):
        # Don't backpropagate through the old networks
        Old_values = old_values.detach()
        Old_action_probs = old_action_probs.detach()

        # Getting the generalized advantage estimates and returns
        Advantages = self.policy_function.generalized_advantage_estimation(values, rewards, next_values, dones)
        Returns = (Advantages + values).detach()
        Advantages = ((Advantages - Advantages.mean()) / (Advantages.std() + 1e-6)).detach()

        # Finding the ratio (pi_theta / pi_theta__old):
        logprobs = self.distributions.logprob(action_probs, actions)
        Old_logprobs = self.distributions.logprob(Old_action_probs, actions).detach()

        # Finding the surrogate loss
        ratios = (logprobs - Old_logprobs).exp() # ratios = exp(logprobs - old_logprobs) = pi_theta / pi_theta_old
        Kl = self.distributions.kl_divergence(old_action_probs, action_probs)

        pg_targets = torch.where(
            (Kl >= self.policy_kl_range) & (ratios > 1),
            ratios * Advantages - self.policy_params * Kl,
            ratios * Advantages
        )
        pg_loss = pg_targets.mean()

        # Getting the entropy of the action probability
        dist_entropy = self.distributions.entropy(action_probs).mean()

        # Getting the critic loss by using the clipped critic value
        if self.value_clip is None:
            critic_loss = ((Returns - values).pow(2) * 0.5).mean()
        else:
            vpredclipped = old_values + torch.clamp(values - Old_values, -self.value_clip, self.value_clip) # Minimize the difference between old value and new value
            vf_losses1 = (Returns - values).pow(2) * 0.5       # Mean Squared Error
            vf_losses2 = (Returns - vpredclipped).pow(2) * 0.5 # Mean Squared Error
            critic_loss = torch.max(vf_losses1, vf_losses2).mean()

        # We need to maximize the policy objective and the entropy bonus while minimizing the
        # critic loss, so the first two enter the total loss with a minus sign
        loss = (critic_loss * self.vf_loss_coef) - (dist_entropy * self.entropy_coef) - pg_loss
        return loss
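
# How the Truly PPO surrogate above behaves (explanatory note, not part of the original file):
# each sample contributes
#   ratio * A                                   if KL(old, new) < policy_kl_range or ratio <= 1
#   ratio * A - policy_params * KL(old, new)    otherwise,
# so once the new policy drifts more than policy_kl_range away from the old one while still
# increasing the ratio, the KL term (scaled by policy_params) rolls the update back instead
# of hard-clipping the ratio as in vanilla PPO. The total loss negates this term because the
# optimizers in Agent.training_ppo minimize.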

class Agent():
    def __init__(self, state_dim, action_dim, is_training_mode, policy_kl_range, policy_params, value_clip, entropy_coef, vf_loss_coef,
                 batchsize, PPO_epochs, gamma, lam, learning_rate):
        self.policy_kl_range = policy_kl_range
        self.policy_params = policy_params
        self.value_clip = value_clip
        self.entropy_coef = entropy_coef
        self.vf_loss_coef = vf_loss_coef
        self.batchsize = batchsize
        self.PPO_epochs = PPO_epochs
        self.is_training_mode = is_training_mode
        self.action_dim = action_dim

        self.actor = Actor_Model(state_dim, action_dim)
        self.actor_old = Actor_Model(state_dim, action_dim)
        self.actor_optimizer = Adam(self.actor.parameters(), lr = learning_rate)

        self.critic = Critic_Model(state_dim, action_dim)
        self.critic_old = Critic_Model(state_dim, action_dim)
        self.critic_optimizer = Adam(self.critic.parameters(), lr = learning_rate)

        self.memory = Memory()
        self.policy_function = PolicyFunction(gamma, lam)

        self.distributions = Discrete()
        self.policy_loss = TrulyPPO(policy_kl_range, policy_params, value_clip, vf_loss_coef, entropy_coef, gamma, lam)

        if is_training_mode:
            self.actor.train()
            self.critic.train()
        else:
            self.actor.eval()
            self.critic.eval()

    def save_eps(self, state, action, reward, done, next_state):
        self.memory.save_eps(state, action, reward, done, next_state)

    def act(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device).detach()
        action_probs = self.actor(state)

        # We don't need to sample the action in test mode;
        # we only sample in training mode in order to explore the actions
        if self.is_training_mode:
            # Sample the action
            action = self.distributions.sample(action_probs)
        else:
            action = torch.argmax(action_probs, 1)

        return action.int().cpu().item()

    # Get the loss and do backpropagation
    def training_ppo(self, states, actions, rewards, dones, next_states):
        action_probs, values = self.actor(states), self.critic(states)
        old_action_probs, old_values = self.actor_old(states), self.critic_old(states)
        next_values = self.critic(next_states)

        loss = self.policy_loss.compute_loss(action_probs, old_action_probs, values, old_values, next_values, actions, rewards, dones)

        self.actor_optimizer.zero_grad()
        self.critic_optimizer.zero_grad()

        loss.backward()

        self.actor_optimizer.step()
        self.critic_optimizer.step()

    # Update the model
    def update_ppo(self):
        dataloader = DataLoader(self.memory, self.batchsize, shuffle = False)

        # Optimize the policy for K epochs:
        for _ in range(self.PPO_epochs):
            for states, actions, rewards, dones, next_states in dataloader:
                self.training_ppo(states.float().to(device), actions.float().to(device), rewards.float().to(device), dones.float().to(device), next_states.float().to(device))

        # Clear the memory
        self.memory.clear_memory()

        # Copy the new weights into the old policy:
        self.actor_old.load_state_dict(self.actor.state_dict())
        self.critic_old.load_state_dict(self.critic.state_dict())

    def save_weights(self):
        torch.save({
            'model_state_dict': self.actor.state_dict(),
            'optimizer_state_dict': self.actor_optimizer.state_dict()
        }, 'SlimeVolley/actor.tar')

        torch.save({
            'model_state_dict': self.critic.state_dict(),
            'optimizer_state_dict': self.critic_optimizer.state_dict()
        }, 'SlimeVolley/critic.tar')

    def load_weights(self):
        actor_checkpoint = torch.load('SlimeVolley/actor.tar')
        self.actor.load_state_dict(actor_checkpoint['model_state_dict'])
        self.actor_optimizer.load_state_dict(actor_checkpoint['optimizer_state_dict'])

        critic_checkpoint = torch.load('SlimeVolley/critic.tar')
        self.critic.load_state_dict(critic_checkpoint['model_state_dict'])
        self.critic_optimizer.load_state_dict(critic_checkpoint['optimizer_state_dict'])

class Runner():
    def __init__(self, env, agent, render, training_mode, n_update):
        self.env = env
        self.agent = agent
        self.render = render
        self.training_mode = training_mode

        self.n_update = n_update
        self.t_updates = 0
        self.utils = Utils()

    def run_episode(self):
        ############################################
        obs = self.env.reset()
        obs = self.utils.prepro(obs)
        state = obs    # the first state is the preprocessed first frame

        done = False
        total_reward = 0
        eps_time = 0
        ############################################
        for _ in range(10000):
            action = self.agent.act(state)
            action_gym = action + 1 if action != 0 else 0

            next_obs, reward, done, _ = self.env.step(action_gym)
            next_obs = self.utils.prepro(next_obs)
            next_state = next_obs - obs    # use the frame difference as the state

            eps_time += 1
            self.t_updates += 1
            total_reward += reward

            if self.training_mode:
                self.agent.save_eps(state.tolist(), action, reward, float(done), next_state.tolist())

            state = next_state
            obs = next_obs

            if self.render:
                self.env.render()

            if self.training_mode and self.n_update is not None and self.t_updates == self.n_update:
                self.agent.update_ppo()
                self.t_updates = 0

            if done:
                break

        if self.training_mode and self.n_update is None:
            self.agent.update_ppo()

        return total_reward, eps_time

def main():
    ############## Hyperparameters ##############
    load_weights = False        # Set this to True if you want to load a saved agent
    save_weights = False        # Set this to True if you want to save the agent
    training_mode = True        # Set this to True to train the agent; set it to False to only test a trained agent
    reward_threshold = 495      # Reward threshold: learning stops once the reward passes it. Set to None to turn this off
    using_google_drive = False

    render = False              # Set this to True to display the screen. Turn it off if you run this in Google Colab
    n_update = 128              # How many steps to collect before updating the policy. Recommended: 128 for Discrete
    n_plot_batch = 100000000    # How many episodes between plots of the results
    n_episode = 100000          # How many episodes to run
    n_saved = 10                # How many episodes to run before saving the weights

    policy_kl_range = 0.0008    # Recommended: 0.0008 for Discrete
    policy_params = 20          # Recommended: 20 for Discrete
    value_clip = 1.0            # How much the critic value is clipped. Recommended: the highest or lowest possible reward
    entropy_coef = 0.05         # Weight of the entropy bonus (more entropy means more exploration)
    vf_loss_coef = 1.0          # Just set to 1
    batchsize = 32              # Batch size used by the DataLoader during each update
    PPO_epochs = 4              # How many epochs per update

    gamma = 0.99                # Just set to 0.99
    lam = 0.95                  # Just set to 0.95
    learning_rate = 2.5e-4      # Just set to 2.5e-4
    #############################################
    writer = SummaryWriter()

    env_name = 'PongDeterministic-v4' # Set the env you want
    env = gym.make(env_name)

    state_dim = 80 * 80    # size of the preprocessed frame produced by Utils.prepro
    action_dim = 3         # stay, up, down (mapped to the gym actions in Runner.run_episode)

    agent = Agent(state_dim, action_dim, training_mode, policy_kl_range, policy_params, value_clip, entropy_coef, vf_loss_coef,
                  batchsize, PPO_epochs, gamma, lam, learning_rate)

    runner = Runner(env, agent, render, training_mode, n_update)
    #############################################
    if using_google_drive:
        from google.colab import drive
        drive.mount('/test')

    if load_weights:
        agent.load_weights()
        print('Weight Loaded')

    print('Run the training!!')
    start = time.time()

    try:
        for i_episode in range(1, n_episode + 1):
            total_reward, eps_time = runner.run_episode()

            print('Episode: {} \t t_reward: {} \t time: {} \t '.format(i_episode, total_reward, eps_time))
            writer.add_scalar('rewards', total_reward, i_episode)

            if save_weights:
                if i_episode % n_saved == 0:
                    agent.save_weights()
                    print('weights saved')

    except KeyboardInterrupt:
        print('\nTraining has been shut down \n')

    finally:
        finish = time.time()
        timedelta = finish - start
        print('Timelength: {}'.format(str(datetime.timedelta(seconds = timedelta))))

if __name__ == '__main__':
    main()
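
# Usage note (not part of the original file; assumes gym, torch and tensorboard are installed):
#   python ppo_pong_pytorch.py
# Episode rewards are logged through SummaryWriter, so they can be inspected with
#   tensorboard --logdir runs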

412  PPO/pytorch/ppo_pytorch.py  (new file)
@@ -0,0 +1,412 @@
|
||||
import gym
|
||||
from gym.envs.registration import register
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.distributions import Categorical
|
||||
from torch.distributions.kl import kl_divergence
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
from torch.optim import Adam
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
|
||||
import numpy as np
|
||||
import sys
|
||||
import numpy
|
||||
import time
|
||||
import datetime
|
||||
|
||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
||||
dataType = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
|
||||
|
||||
class Utils():
|
||||
def prepro(self, I):
|
||||
I = I[35:195] # crop
|
||||
I = I[::2,::2, 0] # downsample by factor of 2
|
||||
I[I == 144] = 0 # erase background (background type 1)
|
||||
I[I == 109] = 0 # erase background (background type 2)
|
||||
I[I != 0] = 1 # everything else (paddles, ball) just set to 1
|
||||
X = I.astype(np.float32).ravel() # Combine items in 1 array
|
||||
return X
|
||||
|
||||
class Actor_Model(nn.Module):
|
||||
def __init__(self, state_dim, action_dim):
|
||||
super(Actor_Model, self).__init__()
|
||||
|
||||
self.nn_layer = nn.Sequential(
|
||||
nn.Linear(state_dim, 64),
|
||||
nn.ReLU(),
|
||||
nn.Linear(64, 64),
|
||||
nn.ReLU(),
|
||||
nn.Linear(64, action_dim),
|
||||
nn.Softmax(-1)
|
||||
).float().to(device)
|
||||
|
||||
def forward(self, states):
|
||||
return self.nn_layer(states)
|
||||
|
||||
class Critic_Model(nn.Module):
|
||||
def __init__(self, state_dim, action_dim):
|
||||
super(Critic_Model, self).__init__()
|
||||
|
||||
self.nn_layer = nn.Sequential(
|
||||
nn.Linear(state_dim, 64),
|
||||
nn.ReLU(),
|
||||
nn.Linear(64, 64),
|
||||
nn.ReLU(),
|
||||
nn.Linear(64, 1)
|
||||
).float().to(device)
|
||||
|
||||
def forward(self, states):
|
||||
return self.nn_layer(states)
|
||||
|
||||
class Memory(Dataset):
|
||||
def __init__(self):
|
||||
self.actions = []
|
||||
self.states = []
|
||||
self.rewards = []
|
||||
self.dones = []
|
||||
self.next_states = []
|
||||
|
||||
def __len__(self):
|
||||
return len(self.dones)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
return np.array(self.states[idx], dtype = np.float32), np.array(self.actions[idx], dtype = np.float32), np.array([self.rewards[idx]], dtype = np.float32), np.array([self.dones[idx]], dtype = np.float32), np.array(self.next_states[idx], dtype = np.float32)
|
||||
|
||||
def save_eps(self, state, action, reward, done, next_state):
|
||||
self.states.append(state)
|
||||
self.actions.append(action)
|
||||
self.rewards.append(reward)
|
||||
self.dones.append(done)
|
||||
self.next_states.append(next_state)
|
||||
|
||||
def clear_memory(self):
|
||||
del self.actions[:]
|
||||
del self.states[:]
|
||||
del self.rewards[:]
|
||||
del self.dones[:]
|
||||
del self.next_states[:]
|
||||
|
||||
class Discrete():
|
||||
def sample(self, datas):
|
||||
distribution = Categorical(datas)
|
||||
return distribution.sample().float().to(device)
|
||||
|
||||
def entropy(self, datas):
|
||||
distribution = Categorical(datas)
|
||||
return distribution.entropy().float().to(device)
|
||||
|
||||
def logprob(self, datas, value_data):
|
||||
distribution = Categorical(datas)
|
||||
return distribution.log_prob(value_data).unsqueeze(1).float().to(device)
|
||||
|
||||
def kl_divergence(self, datas1, datas2):
|
||||
distribution1 = Categorical(datas1)
|
||||
distribution2 = Categorical(datas2)
|
||||
|
||||
return kl_divergence(distribution1, distribution2).unsqueeze(1).float().to(device)
|
||||
|
||||
class PolicyFunction():
|
||||
def __init__(self, gamma = 0.99, lam = 0.95):
|
||||
self.gamma = gamma
|
||||
self.lam = lam
|
||||
|
||||
def monte_carlo_discounted(self, rewards, dones):
|
||||
running_add = 0
|
||||
returns = []
|
||||
|
||||
for step in reversed(range(len(rewards))):
|
||||
running_add = rewards[step] + (1.0 - dones[step]) * self.gamma * running_add
|
||||
returns.insert(0, running_add)
|
||||
|
||||
return torch.stack(returns)
|
||||
|
||||
def temporal_difference(self, reward, next_value, done):
|
||||
q_values = reward + (1 - done) * self.gamma * next_value
|
||||
return q_values
|
||||
|
||||
def generalized_advantage_estimation(self, values, rewards, next_values, dones):
|
||||
gae = 0
|
||||
adv = []
|
||||
|
||||
delta = rewards + (1.0 - dones) * self.gamma * next_values - values
|
||||
for step in reversed(range(len(rewards))):
|
||||
gae = delta[step] + (1.0 - dones[step]) * self.gamma * self.lam * gae
|
||||
adv.insert(0, gae)
|
||||
|
||||
return torch.stack(adv)
|
||||
|
||||
class TrulyPPO():
|
||||
def __init__(self, policy_kl_range, policy_params, value_clip, vf_loss_coef, entropy_coef, gamma, lam):
|
||||
self.policy_kl_range = policy_kl_range
|
||||
self.policy_params = policy_params
|
||||
self.value_clip = value_clip
|
||||
self.vf_loss_coef = vf_loss_coef
|
||||
self.entropy_coef = entropy_coef
|
||||
|
||||
self.distributions = Discrete()
|
||||
self.policy_function = PolicyFunction(gamma, lam)
|
||||
|
||||
# Loss for PPO
|
||||
def compute_loss(self, action_probs, old_action_probs, values, old_values, next_values, actions, rewards, dones):
|
||||
# Don't use old value in backpropagation
|
||||
Old_values = old_values.detach()
|
||||
Old_action_probs = old_action_probs.detach()
|
||||
|
||||
# Getting general advantages estimator and returns
|
||||
Advantages = self.policy_function.generalized_advantage_estimation(values, rewards, next_values, dones)
|
||||
Returns = (Advantages + values).detach()
|
||||
Advantages = ((Advantages - Advantages.mean()) / (Advantages.std() + 1e-6)).detach()
|
||||
|
||||
# Finding the ratio (pi_theta / pi_theta__old):
|
||||
logprobs = self.distributions.logprob(action_probs, actions)
|
||||
Old_logprobs = self.distributions.logprob(Old_action_probs, actions).detach()
|
||||
|
||||
# Finding Surrogate Loss
|
||||
ratios = (logprobs - Old_logprobs).exp() # ratios = exp(logprobs - old_logprobs) = pi_theta / pi_theta_old
|
||||
Kl = self.distributions.kl_divergence(old_action_probs, action_probs)
|
||||
|
||||
pg_targets = torch.where(
|
||||
(Kl >= self.policy_kl_range) & (ratios > 1),
|
||||
ratios * Advantages - self.policy_params * Kl,
|
||||
ratios * Advantages
|
||||
)
|
||||
pg_loss = pg_targets.mean()
|
||||
|
||||
# Getting Entropy from the action probability
|
||||
dist_entropy = self.distributions.entropy(action_probs).mean()
|
||||
|
||||
# Getting Critic loss by using Clipped critic value
|
||||
if self.value_clip is None:
|
||||
critic_loss = ((Returns - values).pow(2) * 0.5).mean()
|
||||
else:
|
||||
vpredclipped = old_values + torch.clamp(values - Old_values, -self.value_clip, self.value_clip) # Minimize the difference between old value and new value
|
||||
vf_losses1 = (Returns - values).pow(2) * 0.5 # Mean Squared Error
|
||||
vf_losses2 = (Returns - vpredclipped).pow(2) * 0.5 # Mean Squared Error
|
||||
critic_loss = torch.max(vf_losses1, vf_losses2).mean()
|
||||
|
||||
# We need to maximize the policy objective to make the agent keep finding better rewards
|
||||
# and minimize Critic Loss
|
||||
loss = (critic_loss * self.vf_loss_coef) - (dist_entropy * self.entropy_coef) - pg_loss
|
||||
return loss
|
||||
|
||||
class Agent():
|
||||
def __init__(self, state_dim, action_dim, is_training_mode, policy_kl_range, policy_params, value_clip, entropy_coef, vf_loss_coef,
|
||||
batchsize, PPO_epochs, gamma, lam, learning_rate):
|
||||
self.policy_kl_range = policy_kl_range
|
||||
self.policy_params = policy_params
|
||||
self.value_clip = value_clip
|
||||
self.entropy_coef = entropy_coef
|
||||
self.vf_loss_coef = vf_loss_coef
|
||||
self.batchsize = batchsize
|
||||
self.PPO_epochs = PPO_epochs
|
||||
self.is_training_mode = is_training_mode
|
||||
self.action_dim = action_dim
|
||||
|
||||
self.actor = Actor_Model(state_dim, action_dim)
|
||||
self.actor_old = Actor_Model(state_dim, action_dim)
|
||||
self.actor_optimizer = Adam(self.actor.parameters(), lr = learning_rate)
|
||||
|
||||
self.critic = Critic_Model(state_dim, action_dim)
|
||||
self.critic_old = Critic_Model(state_dim, action_dim)
|
||||
self.critic_optimizer = Adam(self.critic.parameters(), lr = learning_rate)
|
||||
|
||||
self.memory = Memory()
|
||||
self.policy_function = PolicyFunction(gamma, lam)
|
||||
|
||||
self.distributions = Discrete()
|
||||
self.policy_loss = TrulyPPO(policy_kl_range, policy_params, value_clip, vf_loss_coef, entropy_coef, gamma, lam)
|
||||
|
||||
if is_training_mode:
|
||||
self.actor.train()
|
||||
self.critic.train()
|
||||
else:
|
||||
self.actor.eval()
|
||||
self.critic.eval()
|
||||
|
||||
def save_eps(self, state, action, reward, done, next_state):
|
||||
self.memory.save_eps(state, action, reward, done, next_state)
|
||||
|
||||
def act(self, state):
|
||||
state = torch.FloatTensor(state).unsqueeze(0).to(device).detach()
|
||||
action_probs = self.actor(state)
|
||||
|
||||
# We don't need to sample the action in test mode;
# we only sample in training mode in order to explore the actions
|
||||
if self.is_training_mode:
|
||||
# Sample the action
|
||||
action = self.distributions.sample(action_probs)
|
||||
else:
|
||||
action = torch.argmax(action_probs, 1)
|
||||
|
||||
return action.int().cpu().item()
|
||||
|
||||
# Get loss and Do backpropagation
|
||||
def training_ppo(self, states, actions, rewards, dones, next_states):
|
||||
action_probs, values = self.actor(states), self.critic(states)
|
||||
old_action_probs, old_values = self.actor_old(states), self.critic_old(states)
|
||||
next_values = self.critic(next_states)
|
||||
|
||||
loss = self.policy_loss.compute_loss(action_probs, old_action_probs, values, old_values, next_values, actions, rewards, dones)
|
||||
|
||||
self.actor_optimizer.zero_grad()
|
||||
self.critic_optimizer.zero_grad()
|
||||
|
||||
loss.backward()
|
||||
|
||||
self.actor_optimizer.step()
|
||||
self.critic_optimizer.step()
|
||||
|
||||
# Update the model
|
||||
def update_ppo(self):
|
||||
dataloader = DataLoader(self.memory, self.batchsize, shuffle = False)
|
||||
|
||||
# Optimize policy for K epochs:
|
||||
for _ in range(self.PPO_epochs):
|
||||
for states, actions, rewards, dones, next_states in dataloader:
|
||||
self.training_ppo(states.float().to(device), actions.float().to(device), rewards.float().to(device), dones.float().to(device), next_states.float().to(device))
|
||||
|
||||
# Clear the memory
|
||||
self.memory.clear_memory()
|
||||
|
||||
# Copy new weights into old policy:
|
||||
self.actor_old.load_state_dict(self.actor.state_dict())
|
||||
self.critic_old.load_state_dict(self.critic.state_dict())
|
||||
|
||||
def save_weights(self):
|
||||
torch.save({
|
||||
'model_state_dict': self.actor.state_dict(),
|
||||
'optimizer_state_dict': self.actor_optimizer.state_dict()
|
||||
}, 'SlimeVolley/actor.tar')
|
||||
|
||||
torch.save({
|
||||
'model_state_dict': self.critic.state_dict(),
|
||||
'optimizer_state_dict': self.critic_optimizer.state_dict()
|
||||
}, 'SlimeVolley/critic.tar')
|
||||
|
||||
def load_weights(self):
|
||||
actor_checkpoint = torch.load('SlimeVolley/actor.tar')
|
||||
self.actor.load_state_dict(actor_checkpoint['model_state_dict'])
|
||||
self.actor_optimizer.load_state_dict(actor_checkpoint['optimizer_state_dict'])
|
||||
|
||||
critic_checkpoint = torch.load('SlimeVolley/critic.tar')
|
||||
self.critic.load_state_dict(critic_checkpoint['model_state_dict'])
|
||||
self.critic_optimizer.load_state_dict(critic_checkpoint['optimizer_state_dict'])
|
||||
|
||||
class Runner():
|
||||
def __init__(self, env, agent, render, training_mode, n_update):
|
||||
self.env = env
|
||||
self.agent = agent
|
||||
self.render = render
|
||||
self.training_mode = training_mode
|
||||
|
||||
self.n_update = n_update
|
||||
self.t_updates = 0
|
||||
|
||||
def run_episode(self):
|
||||
############################################
|
||||
state = self.env.reset()
|
||||
done = False
|
||||
total_reward = 0
|
||||
eps_time = 0
|
||||
############################################
|
||||
for _ in range(10000):
|
||||
action = self.agent.act(state)
|
||||
next_state, reward, done, _ = self.env.step(action)
|
||||
|
||||
eps_time += 1
|
||||
self.t_updates += 1
|
||||
total_reward += reward
|
||||
|
||||
if self.training_mode:
|
||||
self.agent.save_eps(state.tolist(), action, reward, float(done), next_state.tolist())
|
||||
|
||||
state = next_state
|
||||
|
||||
if self.render:
|
||||
self.env.render()
|
||||
|
||||
if self.training_mode and self.n_update is not None and self.t_updates == self.n_update:
|
||||
self.agent.update_ppo()
|
||||
self.t_updates = 0
|
||||
|
||||
if done:
|
||||
break
|
||||
|
||||
if self.training_mode and self.n_update is None:
|
||||
self.agent.update_ppo()
|
||||
|
||||
return total_reward, eps_time
|
||||
|
||||
def main():
|
||||
############## Hyperparameters ##############
|
||||
load_weights = False # If you want to load the agent, set this to True
|
||||
save_weights = False # If you want to save the agent, set this to True
|
||||
training_mode = True # If you want to train the agent, set this to True. But set this otherwise if you only want to test it
|
||||
reward_threshold = 495 # Reward threshold: learning stops once the reward passes it. Set to None to turn this off
|
||||
using_google_drive = False
|
||||
|
||||
render = False # If you want to display the image, set this to True. Turn this off if you run this in Google Collab
|
||||
n_update = 128 # How many steps to collect before updating the Policy. Recommended: 128 for Discrete
|
||||
n_plot_batch = 100000000 # How many episode you want to plot the result
|
||||
n_episode = 100000 # How many episode you want to run
|
||||
n_saved = 10 # How many episode to run before saving the weights
|
||||
|
||||
policy_kl_range = 0.0008 # Recommended set to 0.0008 for Discrete
|
||||
policy_params = 20 # Recommended set to 20 for Discrete
|
||||
value_clip = 1.0 # How many value will be clipped. Recommended set to the highest or lowest possible reward
|
||||
entropy_coef = 0.05 # How much randomness of action you will get
|
||||
vf_loss_coef = 1.0 # Just set to 1
|
||||
batchsize = 32 # How many batch per update. size of batch = n_update / minibatch. Recommended set to 4 for Discrete
|
||||
PPO_epochs = 4 # How many epoch per update. Recommended set to 10 for Discrete
|
||||
|
||||
gamma = 0.99 # Just set to 0.99
|
||||
lam = 0.95 # Just set to 0.95
|
||||
learning_rate = 2.5e-4 # Just set to 2.5e-4
|
||||
#############################################
|
||||
writer = SummaryWriter()
|
||||
|
||||
env_name = 'CartPole-v1' # Set the env you want
|
||||
env = gym.make(env_name)
|
||||
|
||||
state_dim = env.observation_space.shape[0]
|
||||
action_dim = env.action_space.n
|
||||
|
||||
agent = Agent(state_dim, action_dim, training_mode, policy_kl_range, policy_params, value_clip, entropy_coef, vf_loss_coef,
|
||||
batchsize, PPO_epochs, gamma, lam, learning_rate)
|
||||
|
||||
runner = Runner(env, agent, render, training_mode, n_update)
|
||||
#############################################
|
||||
if using_google_drive:
|
||||
from google.colab import drive
|
||||
drive.mount('/test')
|
||||
|
||||
if load_weights:
|
||||
agent.load_weights()
|
||||
print('Weight Loaded')
|
||||
|
||||
print('Run the training!!')
|
||||
start = time.time()
|
||||
|
||||
try:
|
||||
for i_episode in range(1, n_episode + 1):
|
||||
total_reward, eps_time = runner.run_episode()
|
||||
|
||||
print('Episode: {} \t t_reward: {} \t time: {} \t '.format(i_episode, total_reward, eps_time))
|
||||
writer.add_scalar('rewards', total_reward, i_episode)
|
||||
|
||||
if save_weights:
|
||||
if i_episode % n_saved == 0:
|
||||
agent.save_weights()
|
||||
print('weights saved')
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print('\nTraining has been Shutdown \n')
|
||||
|
||||
finally:
|
||||
finish = time.time()
|
||||
timedelta = finish - start
|
||||
print('Timelength: {}'.format(str( datetime.timedelta(seconds = timedelta) )))
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()

424  PPO/tensorflow 2/ppo_pong_tensorflow.py  (new file)
@@ -0,0 +1,424 @@
|
||||
import gym
|
||||
from gym.envs.registration import register
|
||||
|
||||
import tensorflow as tf
|
||||
import tensorflow_probability as tfp
|
||||
from tensorflow.keras.layers import Dense
|
||||
from tensorflow.keras import Model
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import sys
|
||||
import numpy
|
||||
|
||||
class Utils():
|
||||
def prepro(self, I):
|
||||
I = I[35:195] # crop
|
||||
I = I[::2,::2, 0] # downsample by factor of 2
|
||||
I[I == 144] = 0 # erase background (background type 1)
|
||||
I[I == 109] = 0 # erase background (background type 2)
|
||||
I[I != 0] = 1 # everything else (paddles, ball) just set to 1
|
||||
X = I.astype(np.float32).ravel() # Combine items in 1 array
|
||||
return X
|
||||
|
||||
class Actor_Model(Model):
|
||||
def __init__(self, state_dim, action_dim):
|
||||
super(Actor_Model, self).__init__()
|
||||
self.d1 = Dense(640, activation='relu')
|
||||
self.d2 = Dense(640, activation='relu')
|
||||
self.dout = Dense(action_dim, activation='softmax')
|
||||
|
||||
def call(self, x):
|
||||
x = self.d1(x)
|
||||
x = self.d2(x)
|
||||
return self.dout(x)
|
||||
|
||||
class Critic_Model(Model):
|
||||
def __init__(self, state_dim, action_dim):
|
||||
super(Critic_Model, self).__init__()
|
||||
self.d1 = Dense(640, activation='relu')
|
||||
self.d2 = Dense(640, activation='relu')
|
||||
self.dout = Dense(1, activation='linear')
|
||||
|
||||
def call(self, x):
|
||||
x = self.d1(x)
|
||||
x = self.d2(x)
|
||||
return self.dout(x)
|
||||
|
||||
class Memory():
|
||||
def __init__(self):
|
||||
self.actions = []
|
||||
self.states = []
|
||||
self.rewards = []
|
||||
self.dones = []
|
||||
self.next_states = []
|
||||
|
||||
def __len__(self):
|
||||
return len(self.dones)
|
||||
|
||||
def get_all_items(self):
|
||||
states = tf.constant(self.states, dtype = tf.float32)
|
||||
actions = tf.constant(self.actions, dtype = tf.float32)
|
||||
rewards = tf.expand_dims(tf.constant(self.rewards, dtype = tf.float32), 1)
|
||||
dones = tf.expand_dims(tf.constant(self.dones, dtype = tf.float32), 1)
|
||||
next_states = tf.constant(self.next_states, dtype = tf.float32)
|
||||
|
||||
return tf.data.Dataset.from_tensor_slices((states, actions, rewards, dones, next_states))
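
# Usage sketch (illustration only, not part of the original file): get_all_items() turns the
# stored Python lists into a tf.data.Dataset, which update_ppo() below batches, e.g.
#   memory = Memory()
#   memory.save_eps(state = [0.0] * 4, action = 1.0, reward = 1.0, done = 0.0, next_state = [0.1] * 4)
#   for states, actions, rewards, dones, next_states in memory.get_all_items().batch(1):
#       pass  # each element is a tf.Tensor with a leading batch dimension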
|
||||
|
||||
def save_eps(self, state, action, reward, done, next_state):
|
||||
self.rewards.append(reward)
|
||||
self.states.append(state)
|
||||
self.actions.append(action)
|
||||
self.dones.append(done)
|
||||
self.next_states.append(next_state)
|
||||
|
||||
def clear_memory(self):
|
||||
del self.actions[:]
|
||||
del self.states[:]
|
||||
del self.rewards[:]
|
||||
del self.dones[:]
|
||||
del self.next_states[:]
|
||||
|
||||
class Distributions():
|
||||
def sample(self, datas):
|
||||
distribution = tfp.distributions.Categorical(probs = datas)
|
||||
return distribution.sample()
|
||||
|
||||
def entropy(self, datas):
|
||||
distribution = tfp.distributions.Categorical(probs = datas)
|
||||
return distribution.entropy()
|
||||
|
||||
def logprob(self, datas, value_data):
|
||||
distribution = tfp.distributions.Categorical(probs = datas)
|
||||
return tf.expand_dims(distribution.log_prob(value_data), 1)
|
||||
|
||||
def kl_divergence(self, datas1, datas2):
|
||||
distribution1 = tfp.distributions.Categorical(probs = datas1)
|
||||
distribution2 = tfp.distributions.Categorical(probs = datas2)
|
||||
|
||||
return tf.expand_dims(tfp.distributions.kl_divergence(distribution1, distribution2), 1)
|
||||
|
||||
class PolicyFunction():
|
||||
def __init__(self, gamma = 0.99, lam = 0.95):
|
||||
self.gamma = gamma
|
||||
self.lam = lam
|
||||
|
||||
def monte_carlo_discounted(self, rewards, dones):
|
||||
running_add = 0
|
||||
returns = []
|
||||
|
||||
for step in reversed(range(len(rewards))):
|
||||
running_add = rewards[step] + (1.0 - dones[step]) * self.gamma * running_add
|
||||
returns.insert(0, running_add)
|
||||
|
||||
return tf.stack(returns)
|
||||
|
||||
def temporal_difference(self, reward, next_value, done):
|
||||
q_values = reward + (1 - done) * self.gamma * next_value
|
||||
return q_values
|
||||
|
||||
def generalized_advantage_estimation(self, values, rewards, next_values, dones):
|
||||
gae = 0
|
||||
adv = []
|
||||
|
||||
delta = rewards + (1.0 - dones) * self.gamma * next_values - values
|
||||
for step in reversed(range(len(rewards))):
|
||||
gae = delta[step] + (1.0 - dones[step]) * self.gamma * self.lam * gae
|
||||
adv.insert(0, gae)
|
||||
|
||||
return tf.stack(adv)
|
||||
|
||||
class Agent():
|
||||
def __init__(self, state_dim, action_dim, is_training_mode, policy_kl_range, policy_params, value_clip, entropy_coef, vf_loss_coef,
|
||||
minibatch, PPO_epochs, gamma, lam, learning_rate):
|
||||
self.policy_kl_range = policy_kl_range
|
||||
self.policy_params = policy_params
|
||||
self.value_clip = value_clip
|
||||
self.entropy_coef = entropy_coef
|
||||
self.vf_loss_coef = vf_loss_coef
|
||||
self.minibatch = minibatch
|
||||
self.PPO_epochs = PPO_epochs
|
||||
self.is_training_mode = is_training_mode
|
||||
self.action_dim = action_dim
|
||||
|
||||
self.actor = Actor_Model(state_dim, action_dim)
|
||||
self.actor_old = Actor_Model(state_dim, action_dim)
|
||||
|
||||
self.critic = Critic_Model(state_dim, action_dim)
|
||||
self.critic_old = Critic_Model(state_dim, action_dim)
|
||||
|
||||
self.optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate)
|
||||
self.memory = Memory()
|
||||
self.policy_function = PolicyFunction(gamma, lam)
|
||||
self.distributions = Distributions()
|
||||
|
||||
def save_eps(self, state, action, reward, done, next_state):
|
||||
self.memory.save_eps(state, action, reward, done, next_state)
|
||||
|
||||
# Loss for PPO
|
||||
def get_loss(self, action_probs, values, old_action_probs, old_values, next_values, actions, rewards, dones):
|
||||
# Don't use old value in backpropagation
|
||||
Old_values = tf.stop_gradient(old_values)
|
||||
|
||||
# Getting general advantages estimator
|
||||
Advantages = self.policy_function.generalized_advantage_estimation(values, rewards, next_values, dones)
|
||||
Returns = tf.stop_gradient(Advantages + values)
|
||||
Advantages = tf.stop_gradient((Advantages - tf.math.reduce_mean(Advantages)) / (tf.math.reduce_std(Advantages) + 1e-6))
|
||||
|
||||
# Finding the ratio (pi_theta / pi_theta__old):
|
||||
logprobs = self.distributions.logprob(action_probs, actions)
|
||||
Old_logprobs = tf.stop_gradient(self.distributions.logprob(old_action_probs, actions))
|
||||
ratios = tf.math.exp(logprobs - Old_logprobs) # ratios = exp(logprobs - old_logprobs) = pi_theta / pi_theta_old
|
||||
|
||||
# Finding KL Divergence
|
||||
Kl = self.distributions.kl_divergence(old_action_probs, action_probs)
|
||||
|
||||
# Combining TR-PPO with Rollback (Truly PPO)
|
||||
pg_loss = tf.where(
|
||||
tf.logical_and(Kl >= self.policy_kl_range, ratios > 1),
|
||||
ratios * Advantages - self.policy_params * Kl,
|
||||
ratios * Advantages
|
||||
)
|
||||
pg_loss = tf.math.reduce_mean(pg_loss)
|
||||
|
||||
# Getting entropy from the action probability
|
||||
dist_entropy = tf.math.reduce_mean(self.distributions.entropy(action_probs))
|
||||
|
||||
# Getting critic loss by using Clipped critic value
|
||||
vpredclipped = old_values + tf.clip_by_value(values - Old_values, -self.value_clip, self.value_clip) # Minimize the difference between old value and new value
|
||||
vf_losses1 = tf.math.square(Returns - values) * 0.5 # Mean Squared Error
|
||||
vf_losses2 = tf.math.square(Returns - vpredclipped) * 0.5 # Mean Squared Error
|
||||
critic_loss = tf.math.reduce_mean(tf.math.maximum(vf_losses1, vf_losses2))
|
||||
|
||||
# We need to maximize the policy objective to make the agent keep finding better rewards
|
||||
# and minimize Critic Loss
|
||||
loss = (critic_loss * self.vf_loss_coef) - (dist_entropy * self.entropy_coef) - pg_loss
|
||||
return loss
|
||||
|
||||
@tf.function
|
||||
def act(self, state):
|
||||
state = tf.expand_dims(tf.cast(state, dtype = tf.float32), 0)
|
||||
action_probs = self.actor(state)
|
||||
|
||||
# We don't need to sample the action in test mode;
# we only sample in training mode in order to explore the actions
|
||||
if self.is_training_mode:
|
||||
# Sample the action
|
||||
action = self.distributions.sample(action_probs)
|
||||
else:
|
||||
action = tf.math.argmax(action_probs, 1)
|
||||
|
||||
return action
|
||||
|
||||
# Get loss and Do backpropagation
|
||||
@tf.function
|
||||
def training_ppo(self, states, actions, rewards, dones, next_states):
|
||||
with tf.GradientTape() as tape:
|
||||
action_probs, values = self.actor(states), self.critic(states)
|
||||
old_action_probs, old_values = self.actor_old(states), self.critic_old(states)
|
||||
next_values = self.critic(next_states)
|
||||
|
||||
loss = self.get_loss(action_probs, values, old_action_probs, old_values, next_values, actions, rewards, dones)
|
||||
|
||||
gradients = tape.gradient(loss, self.actor.trainable_variables + self.critic.trainable_variables)
|
||||
self.optimizer.apply_gradients(zip(gradients, self.actor.trainable_variables + self.critic.trainable_variables))
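# Note (explanatory, not part of the original file): unlike the PyTorch scripts above, this
# TensorFlow agent trains the actor and the critic jointly -- a single GradientTape records
# the combined loss and one Adam optimizer applies the gradients to
# actor.trainable_variables + critic.trainable_variables in a single step.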
|
||||
|
||||
# Update the model
|
||||
def update_ppo(self):
|
||||
batch_size = int(len(self.memory) / self.minibatch)
|
||||
|
||||
# Optimize policy for K epochs:
|
||||
for _ in range(self.PPO_epochs):
|
||||
for states, actions, rewards, dones, next_states in self.memory.get_all_items().batch(batch_size):
|
||||
self.training_ppo(states, actions, rewards, dones, next_states)
|
||||
|
||||
# Clear the memory
|
||||
self.memory.clear_memory()
|
||||
|
||||
# Copy new weights into old policy:
|
||||
self.actor_old.set_weights(self.actor.get_weights())
|
||||
self.critic_old.set_weights(self.critic.get_weights())
|
||||
|
||||
def save_weights(self):
|
||||
self.actor.save_weights('bipedalwalker_w/actor_ppo', save_format='tf')
|
||||
self.actor_old.save_weights('bipedalwalker_w/actor_old_ppo', save_format='tf')
|
||||
self.critic.save_weights('bipedalwalker_w/critic_ppo', save_format='tf')
|
||||
self.critic_old.save_weights('bipedalwalker_w/critic_old_ppo', save_format='tf')
|
||||
|
||||
def load_weights(self):
|
||||
self.actor.load_weights('bipedalwalker_w/actor_ppo')
|
||||
self.actor_old.load_weights('bipedalwalker_w/actor_old_ppo')
|
||||
self.critic.load_weights('bipedalwalker_w/critic_ppo')
|
||||
self.critic_old.load_weights('bipedalwalker_w/critic_old_ppo')
|
||||
|
||||
def plot(datas):
|
||||
print('----------')
|
||||
|
||||
plt.plot(datas)
|
||||
plt.plot()
|
||||
plt.xlabel('Episode')
|
||||
plt.ylabel('Datas')
|
||||
plt.show()
|
||||
|
||||
print('Max :', np.max(datas))
|
||||
print('Min :', np.min(datas))
|
||||
print('Avg :', np.mean(datas))
|
||||
|
||||
def run_episode(env, agent, state_dim, render, training_mode, t_updates, n_update):
|
||||
utils = Utils()
|
||||
############################################
|
||||
obs = env.reset()
|
||||
obs = utils.prepro(obs)
|
||||
state = obs
|
||||
|
||||
done = False
|
||||
total_reward = 0
|
||||
eps_time = 0
|
||||
############################################
|
||||
|
||||
while not done:
|
||||
action = int(agent.act(state))
|
||||
action_gym = action + 1 if action != 0 else 0
|
||||
|
||||
next_obs, reward, done, _ = env.step(action_gym)
|
||||
next_obs = utils.prepro(next_obs)
|
||||
next_state = next_obs - obs
|
||||
|
||||
eps_time += 1
|
||||
t_updates += 1
|
||||
total_reward += reward
|
||||
|
||||
if training_mode:
|
||||
agent.save_eps(state.tolist(), float(action), float(reward), float(done), next_state.tolist())
|
||||
|
||||
state = next_state
|
||||
obs = next_obs
|
||||
|
||||
if render:
|
||||
env.render()
|
||||
|
||||
if training_mode:
|
||||
if t_updates % n_update == 0:
|
||||
agent.update_ppo()
|
||||
t_updates = 0
|
||||
|
||||
if done:
|
||||
return total_reward, eps_time, t_updates
|
||||
|
||||
def main():
|
||||
############## Hyperparameters ##############
|
||||
load_weights = False # If you want to load the agent, set this to True
|
||||
save_weights = False # If you want to save the agent, set this to True
|
||||
training_mode = True # If you want to train the agent, set this to True. But set this otherwise if you only want to test it
|
||||
reward_threshold = 300 # Reward threshold: learning stops once the reward passes it. Set to None to turn this off
|
||||
using_google_drive = False
|
||||
|
||||
render = False # Set this to True to display the screen. Turn it off if you run this in Google Colab
n_update = 128 # How many steps to collect before updating the Policy. Recommended: 128 for Discrete
|
||||
n_plot_batch = 100000000 # How many episode you want to plot the result
|
||||
n_episode = 100000 # How many episode you want to run
|
||||
n_saved = 10 # How many episode to run before saving the weights
|
||||
|
||||
policy_kl_range = 0.0008 # Set to 0.0008 for Discrete
|
||||
policy_params = 20 # Set to 20 for Discrete
|
||||
value_clip = 1.0 # How many value will be clipped. Recommended set to the highest or lowest possible reward
|
||||
entropy_coef = 0.05 # How much randomness of action you will get
|
||||
vf_loss_coef = 1.0 # Just set to 1
|
||||
minibatch = 4 # How many minibatches per update: size of each batch = collected steps / minibatch. Recommended: 4 for Discrete
|
||||
PPO_epochs = 4 # How many epoch per update
|
||||
|
||||
gamma = 0.99 # Just set to 0.99
|
||||
lam = 0.95 # Just set to 0.95
|
||||
learning_rate = 2.5e-4 # Just set to 2.5e-4
|
||||
#############################################
|
||||
env_name = 'PongDeterministic-v4' # Set the env you want
|
||||
env = gym.make(env_name)
|
||||
|
||||
state_dim = 80 * 80
|
||||
action_dim = 3
|
||||
|
||||
print(action_dim)
|
||||
|
||||
agent = Agent(state_dim, action_dim, training_mode, policy_kl_range, policy_params, value_clip, entropy_coef, vf_loss_coef,
|
||||
minibatch, PPO_epochs, gamma, lam, learning_rate)
|
||||
#############################################
|
||||
if using_google_drive:
|
||||
from google.colab import drive
|
||||
drive.mount('/test')
|
||||
|
||||
if load_weights:
|
||||
agent.load_weights()
|
||||
print('Weight Loaded')
|
||||
|
||||
rewards = []
|
||||
batch_rewards = []
|
||||
batch_solved_reward = []
|
||||
|
||||
times = []
|
||||
batch_times = []
|
||||
|
||||
t_updates = 0
|
||||
|
||||
for i_episode in range(1, n_episode + 1):
|
||||
total_reward, time, t_updates = run_episode(env, agent, state_dim, render, training_mode, t_updates, n_update)
|
||||
print('Episode {} \t t_reward: {} \t time: {} \t '.format(i_episode, total_reward, time))
|
||||
batch_rewards.append(int(total_reward))
|
||||
batch_times.append(time)
|
||||
|
||||
if save_weights:
|
||||
if i_episode % n_saved == 0:
|
||||
agent.save_weights()
|
||||
print('weights saved')
|
||||
|
||||
if reward_threshold:
|
||||
if len(batch_solved_reward) == 100:
|
||||
if np.mean(batch_solved_reward) >= reward_threshold :
|
||||
for reward in batch_rewards:
|
||||
rewards.append(reward)
|
||||
|
||||
for time in batch_times:
|
||||
times.append(time)
|
||||
|
||||
print('You solved task after {} episode'.format(len(rewards)))
|
||||
break
|
||||
|
||||
else:
|
||||
del batch_solved_reward[0]
|
||||
batch_solved_reward.append(total_reward)
|
||||
|
||||
else:
|
||||
batch_solved_reward.append(total_reward)
|
||||
|
||||
if i_episode % n_plot_batch == 0 and i_episode != 0:
|
||||
# Plot the reward, times for every n_plot_batch
|
||||
plot(batch_rewards)
|
||||
plot(batch_times)
|
||||
|
||||
for reward in batch_rewards:
|
||||
rewards.append(reward)
|
||||
|
||||
for time in batch_times:
|
||||
times.append(time)
|
||||
|
||||
batch_rewards = []
|
||||
batch_times = []
|
||||
|
||||
print('========== Cummulative ==========')
|
||||
# Plot the reward, times for every episode
|
||||
plot(rewards)
|
||||
plot(times)
|
||||
|
||||
print('========== Final ==========')
|
||||
# Plot the reward, times for every episode
|
||||
|
||||
for reward in batch_rewards:
|
||||
rewards.append(reward)
|
||||
|
||||
for time in batch_times:
|
||||
times.append(time)
|
||||
|
||||
plot(rewards)
|
||||
plot(times)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()

409  PPO/tensorflow 2/ppo_tensorflow.py  (new file)
@@ -0,0 +1,409 @@
|
||||
import gym
|
||||
from gym.envs.registration import register
|
||||
|
||||
import tensorflow as tf
|
||||
import tensorflow_probability as tfp
|
||||
from tensorflow.keras.layers import Dense
|
||||
from tensorflow.keras import Model
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import sys
|
||||
import numpy
|
||||
|
||||
class Utils():
|
||||
def prepro(self, I):
|
||||
I = I[35:195] # crop
|
||||
I = I[::2,::2, 0] # downsample by factor of 2
|
||||
I[I == 144] = 0 # erase background (background type 1)
|
||||
I[I == 109] = 0 # erase background (background type 2)
|
||||
I[I != 0] = 1 # everything else (paddles, ball) just set to 1
|
||||
X = I.astype(np.float32).ravel() # Combine items in 1 array
|
||||
return X
|
||||
|
||||
class Actor_Model(Model):
|
||||
def __init__(self, state_dim, action_dim):
|
||||
super(Actor_Model, self).__init__()
|
||||
self.d1 = Dense(32, activation='relu')
|
||||
self.d2 = Dense(32, activation='relu')
|
||||
self.dout = Dense(action_dim, activation='softmax')
|
||||
|
||||
def call(self, x):
|
||||
x = self.d1(x)
|
||||
x = self.d2(x)
|
||||
return self.dout(x)
|
||||
|
||||
class Critic_Model(Model):
|
||||
def __init__(self, state_dim, action_dim):
|
||||
super(Critic_Model, self).__init__()
|
||||
self.d1 = Dense(32, activation='relu')
|
||||
self.d2 = Dense(32, activation='relu')
|
||||
self.dout = Dense(1, activation='linear')
|
||||
|
||||
def call(self, x):
|
||||
x = self.d1(x)
|
||||
x = self.d2(x)
|
||||
return self.dout(x)
|
||||
|
||||
class Memory():
|
||||
def __init__(self):
|
||||
self.actions = []
|
||||
self.states = []
|
||||
self.rewards = []
|
||||
self.dones = []
|
||||
self.next_states = []
|
||||
|
||||
def __len__(self):
|
||||
return len(self.dones)
|
||||
|
||||
def get_all_items(self):
|
||||
states = tf.constant(self.states, dtype = tf.float32)
|
||||
actions = tf.constant(self.actions, dtype = tf.float32)
|
||||
rewards = tf.expand_dims(tf.constant(self.rewards, dtype = tf.float32), 1)
|
||||
dones = tf.expand_dims(tf.constant(self.dones, dtype = tf.float32), 1)
|
||||
next_states = tf.constant(self.next_states, dtype = tf.float32)
|
||||
|
||||
return tf.data.Dataset.from_tensor_slices((states, actions, rewards, dones, next_states))
|
||||
|
||||
def save_eps(self, state, action, reward, done, next_state):
|
||||
self.rewards.append(reward)
|
||||
self.states.append(state)
|
||||
self.actions.append(action)
|
||||
self.dones.append(done)
|
||||
self.next_states.append(next_state)
|
||||
|
||||
def clear_memory(self):
|
||||
del self.actions[:]
|
||||
del self.states[:]
|
||||
del self.rewards[:]
|
||||
del self.dones[:]
|
||||
del self.next_states[:]
|
||||
|
||||
class Distributions():
|
||||
def sample(self, datas):
|
||||
distribution = tfp.distributions.Categorical(probs = datas)
|
||||
return distribution.sample()
|
||||
|
||||
def entropy(self, datas):
|
||||
distribution = tfp.distributions.Categorical(probs = datas)
|
||||
return distribution.entropy()
|
||||
|
||||
def logprob(self, datas, value_data):
|
||||
distribution = tfp.distributions.Categorical(probs = datas)
|
||||
return tf.expand_dims(distribution.log_prob(value_data), 1)
|
||||
|
||||
def kl_divergence(self, datas1, datas2):
|
||||
distribution1 = tfp.distributions.Categorical(probs = datas1)
|
||||
distribution2 = tfp.distributions.Categorical(probs = datas2)
|
||||
|
||||
return tf.expand_dims(tfp.distributions.kl_divergence(distribution1, distribution2), 1)
|
||||
|
||||
class PolicyFunction():
|
||||
def __init__(self, gamma = 0.99, lam = 0.95):
|
||||
self.gamma = gamma
|
||||
self.lam = lam
|
||||
|
||||
def monte_carlo_discounted(self, rewards, dones):
|
||||
running_add = 0
|
||||
returns = []
|
||||
|
||||
for step in reversed(range(len(rewards))):
|
||||
running_add = rewards[step] + (1.0 - dones[step]) * self.gamma * running_add
|
||||
returns.insert(0, running_add)
|
||||
|
||||
return tf.stack(returns)
|
||||
|
||||
def temporal_difference(self, reward, next_value, done):
|
||||
q_values = reward + (1 - done) * self.gamma * next_value
|
||||
return q_values
|
||||
|
||||
def generalized_advantage_estimation(self, values, rewards, next_values, dones):
|
||||
gae = 0
|
||||
adv = []
|
||||
|
||||
delta = rewards + (1.0 - dones) * self.gamma * next_values - values
|
||||
for step in reversed(range(len(rewards))):
|
||||
gae = delta[step] + (1.0 - dones[step]) * self.gamma * self.lam * gae
|
||||
adv.insert(0, gae)
|
||||
|
||||
return tf.stack(adv)
|
||||
|
||||
class Agent():
|
||||
def __init__(self, state_dim, action_dim, is_training_mode, policy_kl_range, policy_params, value_clip, entropy_coef, vf_loss_coef,
|
||||
minibatch, PPO_epochs, gamma, lam, learning_rate):
|
||||
self.policy_kl_range = policy_kl_range
|
||||
self.policy_params = policy_params
|
||||
self.value_clip = value_clip
|
||||
self.entropy_coef = entropy_coef
|
||||
self.vf_loss_coef = vf_loss_coef
|
||||
self.minibatch = minibatch
|
||||
self.PPO_epochs = PPO_epochs
|
||||
self.is_training_mode = is_training_mode
|
||||
self.action_dim = action_dim
|
||||
|
||||
self.actor = Actor_Model(state_dim, action_dim)
|
||||
self.actor_old = Actor_Model(state_dim, action_dim)
|
||||
|
||||
self.critic = Critic_Model(state_dim, action_dim)
|
||||
self.critic_old = Critic_Model(state_dim, action_dim)
|
||||
|
||||
self.optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate)
|
||||
self.memory = Memory()
|
||||
self.policy_function = PolicyFunction(gamma, lam)
|
||||
self.distributions = Distributions()
|
||||
|
||||
def save_eps(self, state, action, reward, done, next_state):
|
||||
self.memory.save_eps(state, action, reward, done, next_state)
|
||||
|
||||
# Loss for PPO
|
||||
def get_loss(self, action_probs, values, old_action_probs, old_values, next_values, actions, rewards, dones):
|
||||
# Don't use old value in backpropagation
|
||||
Old_values = tf.stop_gradient(old_values)
|
||||
|
||||
# Getting general advantages estimator
|
||||
Advantages = self.policy_function.generalized_advantage_estimation(values, rewards, next_values, dones)
|
||||
Returns = tf.stop_gradient(Advantages + values)
|
||||
Advantages = tf.stop_gradient((Advantages - tf.math.reduce_mean(Advantages)) / (tf.math.reduce_std(Advantages) + 1e-6))
|
||||
|
||||
# Finding the ratio (pi_theta / pi_theta__old):
|
||||
logprobs = self.distributions.logprob(action_probs, actions)
|
||||
Old_logprobs = tf.stop_gradient(self.distributions.logprob(old_action_probs, actions))
|
||||
ratios = tf.math.exp(logprobs - Old_logprobs) # ratios = exp(logprobs - old_logprobs) = pi_theta / pi_theta_old
|
||||
|
||||
# Finding KL Divergence
|
||||
Kl = self.distributions.kl_divergence(old_action_probs, action_probs)
|
||||
|
||||
# Combining TR-PPO with Rollback (Truly PPO)
|
||||
pg_loss = tf.where(
|
||||
tf.logical_and(Kl >= self.policy_kl_range, ratios > 1),
|
||||
ratios * Advantages - self.policy_params * Kl,
|
||||
ratios * Advantages
|
||||
)
|
||||
pg_loss = tf.math.reduce_mean(pg_loss)
|
||||
|
||||
# Getting entropy from the action probability
|
||||
dist_entropy = tf.math.reduce_mean(self.distributions.entropy(action_probs))
|
||||
|
||||
# Getting critic loss by using Clipped critic value
|
||||
vpredclipped = old_values + tf.clip_by_value(values - Old_values, -self.value_clip, self.value_clip) # Minimize the difference between old value and new value
|
||||
vf_losses1 = tf.math.square(Returns - values) * 0.5 # Mean Squared Error
|
||||
vf_losses2 = tf.math.square(Returns - vpredclipped) * 0.5 # Mean Squared Error
|
||||
critic_loss = tf.math.reduce_mean(tf.math.maximum(vf_losses1, vf_losses2))
|
||||
|
||||
# We need to maximize the policy objective to make the agent keep finding better rewards
|
||||
# and minimize Critic Loss
|
||||
loss = (critic_loss * self.vf_loss_coef) - (dist_entropy * self.entropy_coef) - pg_loss
|
||||
return loss
|
||||
|
||||
@tf.function
|
||||
def act(self, state):
|
||||
state = tf.expand_dims(tf.cast(state, dtype = tf.float32), 0)
|
||||
action_probs = self.actor(state)
|
||||
|
||||
# We don't need to sample the action in Test Mode
|
||||
# we only sample the action in Training Mode in order to explore the action space
|
||||
if self.is_training_mode:
|
||||
# Sample the action
|
||||
action = self.distributions.sample(action_probs)
|
||||
else:
|
||||
action = tf.math.argmax(action_probs, 1)
|
||||
|
||||
return action
|
||||
|
||||
# Get loss and Do backpropagation
|
||||
@tf.function
|
||||
def training_ppo(self, states, actions, rewards, dones, next_states):
|
||||
with tf.GradientTape() as tape:
|
||||
action_probs, values = self.actor(states), self.critic(states)
|
||||
old_action_probs, old_values = self.actor_old(states), self.critic_old(states)
|
||||
next_values = self.critic(next_states)
|
||||
|
||||
loss = self.get_loss(action_probs, values, old_action_probs, old_values, next_values, actions, rewards, dones)
|
||||
|
||||
gradients = tape.gradient(loss, self.actor.trainable_variables + self.critic.trainable_variables)
|
||||
self.optimizer.apply_gradients(zip(gradients, self.actor.trainable_variables + self.critic.trainable_variables))
|
||||
|
||||
# Update the model
|
||||
def update_ppo(self):
|
||||
batch_size = int(len(self.memory) / self.minibatch)
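# The stored rollout is split into `minibatch` batches per epoch; with the defaults
# in main() (n_update = 32, minibatch = 2) each gradient step uses a batch of 16 transitions.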
|
||||
|
||||
# Optimize policy for K epochs:
|
||||
for _ in range(self.PPO_epochs):
|
||||
for states, actions, rewards, dones, next_states in self.memory.get_all_items().batch(batch_size):
|
||||
self.training_ppo(states, actions, rewards, dones, next_states)
|
||||
|
||||
# Clear the memory
|
||||
self.memory.clear_memory()
|
||||
|
||||
# Copy new weights into old policy:
|
||||
self.actor_old.set_weights(self.actor.get_weights())
|
||||
self.critic_old.set_weights(self.critic.get_weights())
|
||||
|
||||
def save_weights(self):
|
||||
self.actor.save_weights('bipedalwalker_w/actor_ppo', save_format='tf')
|
||||
self.actor_old.save_weights('bipedalwalker_w/actor_old_ppo', save_format='tf')
|
||||
self.critic.save_weights('bipedalwalker_w/critic_ppo', save_format='tf')
|
||||
self.critic_old.save_weights('bipedalwalker_w/critic_old_ppo', save_format='tf')
|
||||
|
||||
def load_weights(self):
|
||||
self.actor.load_weights('bipedalwalker_w/actor_ppo')
|
||||
self.actor_old.load_weights('bipedalwalker_w/actor_old_ppo')
|
||||
self.critic.load_weights('bipedalwalker_w/critic_ppo')
|
||||
self.critic_old.load_weights('bipedalwalker_w/critic_old_ppo')
|
||||
|
||||
def plot(datas):
|
||||
print('----------')
|
||||
|
||||
plt.plot(datas)
|
||||
plt.plot()
|
||||
plt.xlabel('Episode')
|
||||
plt.ylabel('Datas')
|
||||
plt.show()
|
||||
|
||||
print('Max :', np.max(datas))
|
||||
print('Min :', np.min(datas))
|
||||
print('Avg :', np.mean(datas))
|
||||
|
||||
def run_episode(env, agent, state_dim, render, training_mode, t_updates, n_update):
|
||||
############################################
|
||||
state = env.reset()
|
||||
done = False
|
||||
total_reward = 0
|
||||
eps_time = 0
|
||||
############################################
|
||||
|
||||
while not done:
|
||||
action = int(agent.act(state))
|
||||
next_state, reward, done, _ = env.step(action)
|
||||
|
||||
eps_time += 1
|
||||
t_updates += 1
|
||||
total_reward += reward
|
||||
|
||||
if training_mode:
|
||||
agent.save_eps(state.tolist(), action, reward, float(done), next_state.tolist())
|
||||
|
||||
state = next_state
|
||||
|
||||
if render:
|
||||
env.render()
|
||||
|
||||
if training_mode:
|
||||
if t_updates % n_update == 0:
|
||||
agent.update_ppo()
|
||||
t_updates = 0
|
||||
|
||||
if done:
|
||||
return total_reward, eps_time, t_updates
|
||||
|
||||
def main():
|
||||
############## Hyperparameters ##############
|
||||
load_weights = False # If you want to load the agent, set this to True
|
||||
save_weights = False # If you want to save the agent, set this to True
|
||||
training_mode = True # If you want to train the agent, set this to True. But set this otherwise if you only want to test it
|
||||
reward_threshold = 300 # Set the reward threshold. Training stops once the mean reward of the last 100 episodes passes it. Set to None to turn this off
|
||||
using_google_drive = False
|
||||
|
||||
render = False # If you want to display the environment, set this to True. Turn this off if you run this in Google Colab
|
||||
n_update = 32 # How many timesteps to collect before you update the Policy. Recommended set to 128 for Discrete
|
||||
n_plot_batch = 100000000 # Plot the results every this many episodes
|
||||
n_episode = 100000 # How many episode you want to run
|
||||
n_saved = 10 # How many episode to run before saving the weights
|
||||
|
||||
policy_kl_range = 0.0008 # Set to 0.0008 for Discrete
|
||||
policy_params = 20 # Set to 20 for Discrete
|
||||
value_clip = 1.0 # How far the value estimate may move from the old value. Recommended set to the highest or lowest possible reward
|
||||
entropy_coef = 0.05 # Weight of the entropy bonus. Higher values encourage more randomness in actions
|
||||
vf_loss_coef = 1.0 # Just set to 1
|
||||
minibatch = 2 # How many batches per update. Size of batch = n_update / minibatch. Recommended set to 4 for Discrete
|
||||
PPO_epochs = 4 # How many epoch per update
|
||||
|
||||
gamma = 0.99 # Just set to 0.99
|
||||
lam = 0.95 # Just set to 0.95
|
||||
learning_rate = 2.5e-4 # Just set to 2.5e-4
|
||||
#############################################
|
||||
env_name = 'Env Name' # Set the env you want
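# e.g. env_name = 'CartPole-v1' (illustrative choice only; any Gym env with a Discrete action space fits this script)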
|
||||
env = gym.make(env_name)
|
||||
|
||||
state_dim = env.observation_space.shape[0]
|
||||
action_dim = env.action_space.n
|
||||
|
||||
print(action_dim)
|
||||
|
||||
agent = Agent(state_dim, action_dim, training_mode, policy_kl_range, policy_params, value_clip, entropy_coef, vf_loss_coef,
|
||||
minibatch, PPO_epochs, gamma, lam, learning_rate)
|
||||
#############################################
|
||||
if using_google_drive:
|
||||
from google.colab import drive
|
||||
drive.mount('/test')
|
||||
|
||||
if load_weights:
|
||||
agent.load_weights()
|
||||
print('Weight Loaded')
|
||||
|
||||
rewards = []
|
||||
batch_rewards = []
|
||||
batch_solved_reward = []
|
||||
|
||||
times = []
|
||||
batch_times = []
|
||||
|
||||
t_updates = 0
|
||||
|
||||
for i_episode in range(1, n_episode + 1):
|
||||
total_reward, time, t_updates = run_episode(env, agent, state_dim, render, training_mode, t_updates, n_update)
|
||||
print('Episode {} \t t_reward: {} \t time: {} \t '.format(i_episode, total_reward, time))
|
||||
batch_rewards.append(int(total_reward))
|
||||
batch_times.append(time)
|
||||
|
||||
if save_weights:
|
||||
if i_episode % n_saved == 0:
|
||||
agent.save_weights()
|
||||
print('weights saved')
|
||||
|
||||
if reward_threshold:
|
||||
if len(batch_solved_reward) == 100:
|
||||
if np.mean(batch_solved_reward) >= reward_threshold:
|
||||
print('You solved the task after {} episodes'.format(len(rewards)))
|
||||
break
|
||||
|
||||
else:
|
||||
del batch_solved_reward[0]
|
||||
batch_solved_reward.append(total_reward)
|
||||
|
||||
else:
|
||||
batch_solved_reward.append(total_reward)
|
||||
|
||||
if i_episode % n_plot_batch == 0 and i_episode != 0:
|
||||
# Plot the reward, times for every n_plot_batch
|
||||
plot(batch_rewards)
|
||||
plot(batch_times)
|
||||
|
||||
for reward in batch_rewards:
|
||||
rewards.append(reward)
|
||||
|
||||
for time in batch_times:
|
||||
times.append(time)
|
||||
|
||||
batch_rewards = []
|
||||
batch_times = []
|
||||
|
||||
print('========== Cumulative ==========')
|
||||
# Plot the reward, times for every episode
|
||||
plot(rewards)
|
||||
plot(times)
|
||||
|
||||
print('========== Final ==========')
|
||||
# Plot the reward, times for every episode
|
||||
|
||||
for reward in batch_rewards:
|
||||
rewards.append(reward)
|
||||
|
||||
for time in batch_times:
|
||||
times.append(time)
|
||||
|
||||
plot(rewards)
|
||||
plot(times)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
403
PPO_continous/pytorch/ppo_continous_pytorch.py
Normal file
@ -0,0 +1,403 @@
|
||||
import gym
|
||||
from gym.envs.registration import register
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.distributions import Normal
|
||||
from torch.distributions.kl import kl_divergence
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
from torch.optim import Adam
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
|
||||
import numpy as np
|
||||
import sys
|
||||
import numpy
|
||||
import time
|
||||
import datetime
|
||||
|
||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
||||
dataType = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
|
||||
|
||||
class Actor_Model(nn.Module):
|
||||
def __init__(self, state_dim, action_dim):
|
||||
super(Actor_Model, self).__init__()
|
||||
|
||||
self.nn_layer = nn.Sequential(
|
||||
nn.Linear(state_dim, 256),
|
||||
nn.ReLU(),
|
||||
nn.Linear(256, 64),
|
||||
nn.ReLU(),
|
||||
nn.Linear(64, action_dim),
|
||||
nn.Tanh()
|
||||
).float().to(device)
|
||||
|
||||
def forward(self, states):
|
||||
return self.nn_layer(states)
|
||||
|
||||
class Critic_Model(nn.Module):
|
||||
def __init__(self, state_dim, action_dim):
|
||||
super(Critic_Model, self).__init__()
|
||||
|
||||
self.nn_layer = nn.Sequential(
|
||||
nn.Linear(state_dim, 256),
|
||||
nn.ReLU(),
|
||||
nn.Linear(256, 64),
|
||||
nn.ReLU(),
|
||||
nn.Linear(64, 1)
|
||||
).float().to(device)
|
||||
|
||||
def forward(self, states):
|
||||
return self.nn_layer(states)
|
||||
|
||||
class Memory(Dataset):
|
||||
def __init__(self):
|
||||
self.actions = []
|
||||
self.states = []
|
||||
self.rewards = []
|
||||
self.dones = []
|
||||
self.next_states = []
|
||||
|
||||
def __len__(self):
|
||||
return len(self.dones)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
return np.array(self.states[idx], dtype = np.float32), np.array(self.actions[idx], dtype = np.float32), np.array([self.rewards[idx]], dtype = np.float32), np.array([self.dones[idx]], dtype = np.float32), np.array(self.next_states[idx], dtype = np.float32)
|
||||
|
||||
def save_eps(self, state, action, reward, done, next_state):
|
||||
self.states.append(state)
|
||||
self.actions.append(action)
|
||||
self.rewards.append(reward)
|
||||
self.dones.append(done)
|
||||
self.next_states.append(next_state)
|
||||
|
||||
def clear_memory(self):
|
||||
del self.actions[:]
|
||||
del self.states[:]
|
||||
del self.rewards[:]
|
||||
del self.dones[:]
|
||||
del self.next_states[:]
|
||||
|
||||
class Continous():
|
||||
def sample(self, mean, std):
|
||||
distribution = Normal(mean, std)
|
||||
return distribution.sample().float().to(device)
|
||||
|
||||
def entropy(self, mean, std):
|
||||
distribution = Normal(mean, std)
|
||||
return distribution.entropy().float().to(device)
|
||||
|
||||
def logprob(self, mean, std, value_data):
|
||||
distribution = Normal(mean, std)
|
||||
return distribution.log_prob(value_data).float().to(device)
|
||||
|
||||
def kl_divergence(self, mean1, std1, mean2, std2):
|
||||
distribution1 = Normal(mean1, std1)
|
||||
distribution2 = Normal(mean2, std2)
|
||||
|
||||
return kl_divergence(distribution1, distribution2).float().to(device)
|
||||
|
||||
class PolicyFunction():
|
||||
def __init__(self, gamma = 0.99, lam = 0.95):
|
||||
self.gamma = gamma
|
||||
self.lam = lam
|
||||
|
||||
def monte_carlo_discounted(self, rewards, dones):
|
||||
running_add = 0
|
||||
returns = []
|
||||
|
||||
for step in reversed(range(len(rewards))):
|
||||
running_add = rewards[step] + (1.0 - dones[step]) * self.gamma * running_add
|
||||
returns.insert(0, running_add)
|
||||
|
||||
return torch.stack(returns)
|
||||
|
||||
def temporal_difference(self, reward, next_value, done):
|
||||
q_values = reward + (1 - done) * self.gamma * next_value
|
||||
return q_values
|
||||
|
||||
def generalized_advantage_estimation(self, values, rewards, next_values, dones):
|
||||
gae = 0
|
||||
adv = []
|
||||
|
||||
delta = rewards + (1.0 - dones) * self.gamma * next_values - values
|
||||
for step in reversed(range(len(rewards))):
|
||||
gae = delta[step] + (1.0 - dones[step]) * self.gamma * self.lam * gae
|
||||
adv.insert(0, gae)
|
||||
|
||||
return torch.stack(adv)
|
||||
|
||||
class TrulyPPO():
|
||||
def __init__(self, policy_kl_range, policy_params, value_clip, vf_loss_coef, entropy_coef, gamma, lam):
|
||||
self.policy_kl_range = policy_kl_range
|
||||
self.policy_params = policy_params
|
||||
self.value_clip = value_clip
|
||||
self.vf_loss_coef = vf_loss_coef
|
||||
self.entropy_coef = entropy_coef
|
||||
|
||||
self.distributions = Continous()
|
||||
self.policy_function = PolicyFunction(gamma, lam)
|
||||
|
||||
def compute_loss(self, action_mean, action_std, old_action_mean, old_action_std, values, old_values, next_values, actions, rewards, dones):
|
||||
# Don't use old value in backpropagation
|
||||
Old_values = old_values.detach()
|
||||
Old_action_mean = old_action_mean.detach()
|
||||
|
||||
# Getting general advantages estimator and returns
|
||||
Advantages = self.policy_function.generalized_advantage_estimation(values, rewards, next_values, dones)
|
||||
Returns = (Advantages + values).detach()
|
||||
Advantages = ((Advantages - Advantages.mean()) / (Advantages.std() + 1e-6)).detach()
|
||||
|
||||
# Finding the ratio (pi_theta / pi_theta__old):
|
||||
logprobs = self.distributions.logprob(action_mean, action_std, actions)
|
||||
Old_logprobs = self.distributions.logprob(Old_action_mean, old_action_std, actions).detach()
|
||||
|
||||
# Finding Surrogate Loss
|
||||
ratios = (logprobs - Old_logprobs).exp() # ratio = pi(a|s) / pi_old(a|s), i.e. exp(logprob - old_logprob)
|
||||
Kl = self.distributions.kl_divergence(Old_action_mean, old_action_std, action_mean, action_std)
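# Truly PPO rollback: once the KL divergence passes policy_kl_range while the ratio
# is still above 1, the surrogate is penalized by policy_params * Kl.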
|
||||
|
||||
pg_targets = torch.where(
|
||||
(Kl >= self.policy_kl_range) & (ratios > 1),
|
||||
ratios * Advantages - self.policy_params * Kl,
|
||||
ratios * Advantages
|
||||
)
|
||||
pg_loss = pg_targets.mean()
|
||||
|
||||
# Getting entropy from the action probability
|
||||
dist_entropy = self.distributions.entropy(action_mean, action_std).mean()
|
||||
|
||||
# Getting Critic loss by using Clipped critic value
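# With value_clip set to None this falls back to a plain 0.5 * MSE against the returns;
# otherwise the new value may move at most value_clip away from the old value before
# the elementwise maximum of the two losses is taken.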
|
||||
if self.value_clip is None:
|
||||
critic_loss = ((Returns - values).pow(2) * 0.5).mean()
|
||||
else:
|
||||
vpredclipped = old_values + torch.clamp(values - Old_values, -self.value_clip, self.value_clip) # Minimize the difference between old value and new value
|
||||
vf_losses1 = (Returns - values).pow(2) * 0.5 # Mean Squared Error
|
||||
vf_losses2 = (Returns - vpredclipped).pow(2) * 0.5 # Mean Squared Error
|
||||
critic_loss = torch.max(vf_losses1, vf_losses2).mean()
|
||||
|
||||
# We need to maximize the Policy Loss so the agent keeps finding better rewards
|
||||
# and minimize Critic Loss
|
||||
loss = (critic_loss * self.vf_loss_coef) - (dist_entropy * self.entropy_coef) - pg_loss
|
||||
return loss
|
||||
|
||||
class Agent():
|
||||
def __init__(self, state_dim, action_dim, is_training_mode, policy_kl_range, policy_params, value_clip, entropy_coef, vf_loss_coef,
|
||||
minibatch, PPO_epochs, gamma, lam, learning_rate):
|
||||
self.policy_kl_range = policy_kl_range
|
||||
self.policy_params = policy_params
|
||||
self.value_clip = value_clip
|
||||
self.entropy_coef = entropy_coef
|
||||
self.vf_loss_coef = vf_loss_coef
|
||||
self.minibatch = minibatch
|
||||
self.PPO_epochs = PPO_epochs
|
||||
self.is_training_mode = is_training_mode
|
||||
self.action_dim = action_dim
|
||||
self.std = torch.ones([1, action_dim]).float().to(device)
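# The Gaussian policy uses a fixed std of 1 for every action dimension;
# only the action mean produced by the actor (tanh output) is trained.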
|
||||
|
||||
self.actor = Actor_Model(state_dim, action_dim)
|
||||
self.actor_old = Actor_Model(state_dim, action_dim)
|
||||
self.actor_optimizer = Adam(self.actor.parameters(), lr = learning_rate)
|
||||
|
||||
self.critic = Critic_Model(state_dim, action_dim)
|
||||
self.critic_old = Critic_Model(state_dim, action_dim)
|
||||
self.critic_optimizer = Adam(self.critic.parameters(), lr = learning_rate)
|
||||
|
||||
self.memory = Memory()
|
||||
self.policy_function = PolicyFunction(gamma, lam)
|
||||
|
||||
self.distributions = Continous()
|
||||
self.policy_loss = TrulyPPO(policy_kl_range, policy_params, value_clip, vf_loss_coef, entropy_coef, gamma, lam)
|
||||
|
||||
if is_training_mode:
|
||||
self.actor.train()
|
||||
self.critic.train()
|
||||
else:
|
||||
self.actor.eval()
|
||||
self.critic.eval()
|
||||
|
||||
def save_eps(self, state, action, reward, done, next_state):
|
||||
self.memory.save_eps(state, action, reward, done, next_state)
|
||||
|
||||
def act(self, state):
|
||||
state = torch.FloatTensor(state).unsqueeze(0).to(device).detach()
|
||||
action_mean = self.actor(state)
|
||||
|
||||
# We don't need to sample the action in Test Mode
|
||||
# we only sample the action in Training Mode in order to explore the action space
|
||||
if self.is_training_mode:
|
||||
# Sample the action
|
||||
action = self.distributions.sample(action_mean, self.std)
|
||||
else:
|
||||
action = action_mean
|
||||
|
||||
return action.squeeze(0).cpu().numpy()
|
||||
|
||||
# Get loss and Do backpropagation
|
||||
def training_ppo(self, states, actions, rewards, dones, next_states):
|
||||
action_mean, values = self.actor(states), self.critic(states)
|
||||
old_action_mean, old_values = self.actor_old(states), self.critic_old(states)
|
||||
next_values = self.critic(next_states)
|
||||
|
||||
loss = self.policy_loss.compute_loss(action_mean, self.std, old_action_mean, self.std, values, old_values, next_values, actions, rewards, dones)
|
||||
|
||||
self.actor_optimizer.zero_grad()
|
||||
self.critic_optimizer.zero_grad()
|
||||
|
||||
loss.backward()
|
||||
|
||||
self.actor_optimizer.step()
|
||||
self.critic_optimizer.step()
|
||||
|
||||
# Update the model
|
||||
def update_ppo(self):
|
||||
batch_size = int(len(self.memory) / self.minibatch)
|
||||
dataloader = DataLoader(self.memory, batch_size, shuffle = False)
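# The rollout is iterated in `minibatch` chunks per epoch; with the defaults in main()
# (n_update = 1024, minibatch = 32) each DataLoader batch holds 32 transitions.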
|
||||
|
||||
# Optimize policy for K epochs:
|
||||
for _ in range(self.PPO_epochs):
|
||||
for states, actions, rewards, dones, next_states in dataloader:
|
||||
self.training_ppo(states.float().to(device), actions.float().to(device), rewards.float().to(device), dones.float().to(device), next_states.float().to(device))
|
||||
|
||||
# Clear the memory
|
||||
self.memory.clear_memory()
|
||||
|
||||
# Copy new weights into old policy:
|
||||
self.actor_old.load_state_dict(self.actor.state_dict())
|
||||
self.critic_old.load_state_dict(self.critic.state_dict())
|
||||
|
||||
def save_weights(self):
|
||||
torch.save({
|
||||
'model_state_dict': self.actor.state_dict(),
|
||||
'optimizer_state_dict': self.actor_optimizer.state_dict()
|
||||
}, '/test/My Drive/Bipedal4/actor.tar')
|
||||
|
||||
torch.save({
|
||||
'model_state_dict': self.critic.state_dict(),
|
||||
'optimizer_state_dict': self.critic_optimizer.state_dict()
|
||||
}, '/test/My Drive/Bipedal4/critic.tar')
|
||||
|
||||
def load_weights(self):
|
||||
actor_checkpoint = torch.load('/test/My Drive/Bipedal4/actor.tar')
|
||||
self.actor.load_state_dict(actor_checkpoint['model_state_dict'])
|
||||
self.actor_optimizer.load_state_dict(actor_checkpoint['optimizer_state_dict'])
|
||||
|
||||
critic_checkpoint = torch.load('/test/My Drive/Bipedal4/critic.tar')
|
||||
self.critic.load_state_dict(critic_checkpoint['model_state_dict'])
|
||||
self.critic_optimizer.load_state_dict(critic_checkpoint['optimizer_state_dict'])
|
||||
|
||||
class Runner():
|
||||
def __init__(self, env, agent, render, training_mode, n_update):
|
||||
self.env = env
|
||||
self.agent = agent
|
||||
self.render = render
|
||||
self.training_mode = training_mode
|
||||
|
||||
self.n_update = n_update
|
||||
self.t_updates = 0
|
||||
|
||||
def run_episode(self):
|
||||
############################################
|
||||
state = self.env.reset()
|
||||
done = False
|
||||
total_reward = 0
|
||||
eps_time = 0
|
||||
############################################
|
||||
for _ in range(10000):
|
||||
action = self.agent.act(state)
|
||||
next_state, reward, done, _ = self.env.step(action)
|
||||
|
||||
eps_time += 1
|
||||
self.t_updates += 1
|
||||
total_reward += reward
|
||||
|
||||
if self.training_mode:
|
||||
self.agent.save_eps(state.tolist(), action, reward, float(done), next_state.tolist())
|
||||
|
||||
state = next_state
|
||||
|
||||
if self.render:
|
||||
self.env.render()
|
||||
|
||||
if self.training_mode and self.n_update is not None and self.t_updates == self.n_update:
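# Enough transitions collected: run a PPO update and reset the step counter.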
|
||||
self.agent.update_ppo()
|
||||
self.t_updates = 0
|
||||
|
||||
if done:
|
||||
break
|
||||
|
||||
if self.training_mode and self.n_update is None:
|
||||
self.agent.update_ppo()
|
||||
|
||||
return total_reward, eps_time
|
||||
|
||||
def main():
|
||||
############## Hyperparameters ##############
|
||||
load_weights = False # If you want to load the agent, set this to True
|
||||
save_weights = False # If you want to save the agent, set this to True
|
||||
training_mode = True # If you want to train the agent, set this to True. But set this otherwise if you only want to test it
|
||||
reward_threshold = 495 # Set the reward threshold. Training will stop once the reward has passed this threshold. Set to None to turn this off
|
||||
using_google_drive = False
|
||||
|
||||
render = False # If you want to display the environment, set this to True. Turn this off if you run this in Google Colab
|
||||
n_update = 1024 # How many timesteps to collect before you update the Policy. Recommended set to 128 for Discrete
|
||||
n_plot_batch = 100000000 # Plot the results every this many episodes
|
||||
n_episode = 100000 # How many episode you want to run
|
||||
n_saved = 10 # How many episode to run before saving the weights
|
||||
|
||||
policy_kl_range = 0.03 # Recommended set to 0.0008 for Discrete
|
||||
policy_params = 5 # Recommended set to 20 for Discrete
|
||||
value_clip = 1.0 # How far the value estimate may move from the old value. Recommended set to the highest or lowest possible reward
|
||||
entropy_coef = 0.0 # Weight of the entropy bonus. Higher values encourage more randomness in actions
|
||||
vf_loss_coef = 1.0 # Just set to 1
|
||||
minibatch = 32 # How many batches per update. Size of batch = n_update / minibatch. Recommended set to 4 for Discrete
|
||||
PPO_epochs = 10 # How many epoch per update. Recommended set to 10 for Discrete
|
||||
|
||||
gamma = 0.99 # Just set to 0.99
|
||||
lam = 0.95 # Just set to 0.95
|
||||
learning_rate = 3e-4 # Just set to 3e-4
|
||||
#############################################
|
||||
writer = SummaryWriter()
|
||||
|
||||
env_name = 'BipedalWalker-v3' # Set the env you want
|
||||
env = gym.make(env_name)
|
||||
|
||||
state_dim = env.observation_space.shape[0]
|
||||
action_dim = env.action_space.shape[0]
|
||||
|
||||
agent = Agent(state_dim, action_dim, training_mode, policy_kl_range, policy_params, value_clip, entropy_coef, vf_loss_coef,
|
||||
minibatch, PPO_epochs, gamma, lam, learning_rate)
|
||||
|
||||
runner = Runner(env, agent, render, training_mode, n_update)
|
||||
#############################################
|
||||
if using_google_drive:
|
||||
from google.colab import drive
|
||||
drive.mount('/test')
|
||||
|
||||
if load_weights:
|
||||
agent.load_weights()
|
||||
print('Weight Loaded')
|
||||
|
||||
print('Run the training!!')
|
||||
start = time.time()
|
||||
|
||||
try:
|
||||
for i_episode in range(1, n_episode + 1):
|
||||
total_reward, eps_time = runner.run_episode()
|
||||
|
||||
print('Episode: {} \t t_reward: {} \t time: {} \t '.format(i_episode, total_reward, eps_time))
|
||||
writer.add_scalar('rewards', total_reward, i_episode)
|
||||
|
||||
if save_weights:
|
||||
if i_episode % n_saved == 0:
|
||||
agent.save_weights()
|
||||
print('weights saved')
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print('\nTraining has been shut down\n')
|
||||
|
||||
finally:
|
||||
finish = time.time()
|
||||
timedelta = finish - start
|
||||
print('Time length: {}'.format(str( datetime.timedelta(seconds = timedelta) )))
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
411
PPO_continous/tensorflow/ppo_continous_tensorflow.py
Normal file
@ -0,0 +1,411 @@
|
||||
import gym
|
||||
from gym.envs.registration import register
|
||||
|
||||
import tensorflow as tf
|
||||
import tensorflow_probability as tfp
|
||||
from tensorflow.keras.layers import Dense
|
||||
from tensorflow.keras import Model
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import sys
|
||||
import numpy
|
||||
|
||||
class Utils():
|
||||
def prepro(self, I):
|
||||
I = I[35:195] # crop
|
||||
I = I[::2,::2, 0] # downsample by factor of 2
|
||||
I[I == 144] = 0 # erase background (background type 1)
|
||||
I[I == 109] = 0 # erase background (background type 2)
|
||||
I[I != 0] = 1 # everything else (paddles, ball) just set to 1
|
||||
X = I.astype(np.float32).ravel() # Combine items in 1 array
|
||||
return X
|
||||
|
||||
class Actor_Model(Model):
|
||||
def __init__(self, state_dim, action_dim):
|
||||
super(Actor_Model, self).__init__()
|
||||
self.d1 = Dense(64, activation='relu')
|
||||
self.d2 = Dense(64, activation='relu')
|
||||
self.dout = Dense(action_dim, activation='tanh')
|
||||
|
||||
def call(self, x):
|
||||
x = self.d1(x)
|
||||
x = self.d2(x)
|
||||
return self.dout(x)
|
||||
|
||||
class Critic_Model(Model):
|
||||
def __init__(self, state_dim, action_dim):
|
||||
super(Critic_Model, self).__init__()
|
||||
self.d1 = Dense(64, activation='relu')
|
||||
self.d2 = Dense(64, activation='relu')
|
||||
self.dout = Dense(1, activation='linear')
|
||||
|
||||
def call(self, x):
|
||||
x = self.d1(x)
|
||||
x = self.d2(x)
|
||||
return self.dout(x)
|
||||
|
||||
class Memory():
|
||||
def __init__(self):
|
||||
self.actions = []
|
||||
self.states = []
|
||||
self.rewards = []
|
||||
self.dones = []
|
||||
self.next_states = []
|
||||
|
||||
def __len__(self):
|
||||
return len(self.dones)
|
||||
|
||||
def get_all_items(self):
|
||||
states = tf.constant(self.states, dtype = tf.float32)
|
||||
actions = tf.constant(self.actions, dtype = tf.float32)
|
||||
rewards = tf.expand_dims(tf.constant(self.rewards, dtype = tf.float32), 1)
|
||||
dones = tf.expand_dims(tf.constant(self.dones, dtype = tf.float32), 1)
|
||||
next_states = tf.constant(self.next_states, dtype = tf.float32)
|
||||
|
||||
return tf.data.Dataset.from_tensor_slices((states, actions, rewards, dones, next_states))
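# Wrapping the rollout in a tf.data.Dataset lets update_ppo stream it in batches.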
|
||||
|
||||
def save_eps(self, state, action, reward, done, next_state):
|
||||
self.rewards.append(reward)
|
||||
self.states.append(state)
|
||||
self.actions.append(action)
|
||||
self.dones.append(done)
|
||||
self.next_states.append(next_state)
|
||||
|
||||
def clear_memory(self):
|
||||
del self.actions[:]
|
||||
del self.states[:]
|
||||
del self.rewards[:]
|
||||
del self.dones[:]
|
||||
del self.next_states[:]
|
||||
|
||||
class Distributions():
|
||||
def sample(self, mean, std):
|
||||
distribution = tfp.distributions.Normal(mean, std)
|
||||
return distribution.sample()
|
||||
|
||||
def entropy(self, mean, std):
|
||||
distribution = tfp.distributions.Normal(mean, std)
|
||||
return distribution.entropy()
|
||||
|
||||
def logprob(self, mean, std, value_data):
|
||||
distribution = tfp.distributions.Normal(mean, std)
|
||||
return distribution.log_prob(value_data)
|
||||
|
||||
def kl_divergence(self, mean1, std1, mean2, std2):
|
||||
distribution1 = tfp.distributions.Normal(mean1, std1)
|
||||
distribution2 = tfp.distributions.Normal(mean2, std2)
|
||||
|
||||
return tfp.distributions.kl_divergence(distribution1, distribution2)
|
||||
|
||||
class PolicyFunction():
|
||||
def __init__(self, gamma = 0.99, lam = 0.95):
|
||||
self.gamma = gamma
|
||||
self.lam = lam
|
||||
|
||||
def monte_carlo_discounted(self, rewards, dones):
|
||||
running_add = 0
|
||||
returns = []
|
||||
|
||||
for step in reversed(range(len(rewards))):
|
||||
running_add = rewards[step] + (1.0 - dones[step]) * self.gamma * running_add
|
||||
returns.insert(0, running_add)
|
||||
|
||||
return tf.stack(returns)
|
||||
|
||||
def temporal_difference(self, reward, next_value, done):
|
||||
q_values = reward + (1 - done) * self.gamma * next_value
|
||||
return q_values
|
||||
|
||||
def generalized_advantage_estimation(self, values, rewards, next_values, dones):
|
||||
gae = 0
|
||||
adv = []
|
||||
|
||||
delta = rewards + (1.0 - dones) * self.gamma * next_values - values
|
||||
for step in reversed(range(len(rewards))):
|
||||
gae = delta[step] + (1.0 - dones[step]) * self.gamma * self.lam * gae
|
||||
adv.insert(0, gae)
|
||||
|
||||
return tf.stack(adv)
|
||||
|
||||
class Agent():
|
||||
def __init__(self, state_dim, action_dim, is_training_mode, policy_kl_range, policy_params, value_clip, entropy_coef, vf_loss_coef,
|
||||
minibatch, PPO_epochs, gamma, lam, learning_rate):
|
||||
self.policy_kl_range = policy_kl_range
|
||||
self.policy_params = policy_params
|
||||
self.value_clip = value_clip
|
||||
self.entropy_coef = entropy_coef
|
||||
self.vf_loss_coef = vf_loss_coef
|
||||
self.minibatch = minibatch
|
||||
self.PPO_epochs = PPO_epochs
|
||||
self.is_training_mode = is_training_mode
|
||||
self.action_dim = action_dim
|
||||
self.std = tf.ones([1, action_dim])
|
||||
|
||||
self.actor = Actor_Model(state_dim, action_dim)
|
||||
self.actor_old = Actor_Model(state_dim, action_dim)
|
||||
|
||||
self.critic = Critic_Model(state_dim, action_dim)
|
||||
self.critic_old = Critic_Model(state_dim, action_dim)
|
||||
|
||||
self.optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate)
|
||||
self.memory = Memory()
|
||||
self.policy_function = PolicyFunction(gamma, lam)
|
||||
self.distributions = Distributions()
|
||||
|
||||
def save_eps(self, state, action, reward, done, next_state):
|
||||
self.memory.save_eps(state, action, reward, done, next_state)
|
||||
|
||||
# Loss for PPO
|
||||
def get_loss(self, action_mean, values, old_action_mean, old_values, next_values, actions, rewards, dones):
|
||||
# Don't use old value in backpropagation
|
||||
Old_values = tf.stop_gradient(old_values)
|
||||
|
||||
# Getting general advantages estimator
|
||||
Advantages = self.policy_function.generalized_advantage_estimation(values, rewards, next_values, dones)
|
||||
Returns = tf.stop_gradient(Advantages + values)
|
||||
Advantages = tf.stop_gradient((Advantages - tf.math.reduce_mean(Advantages)) / (tf.math.reduce_std(Advantages) + 1e-6))
|
||||
|
||||
# Finding the ratio (pi_theta / pi_theta__old):
|
||||
logprobs = self.distributions.logprob(action_mean, self.std, actions)
|
||||
Old_logprobs = tf.stop_gradient(self.distributions.logprob(old_action_mean, self.std, actions))
|
||||
ratios = tf.math.exp(logprobs - Old_logprobs) # ratio = pi(a|s) / pi_old(a|s), i.e. exp(logprob - old_logprob)
|
||||
|
||||
# Finding KL Divergence
|
||||
Kl = self.distributions.kl_divergence(old_action_mean, self.std, action_mean, self.std)
|
||||
|
||||
# Combining TR-PPO with Rollback (Truly PPO)
|
||||
pg_loss = tf.where(
|
||||
tf.logical_and(Kl >= self.policy_kl_range, ratios > 1),
|
||||
ratios * Advantages - self.policy_params * Kl,
|
||||
ratios * Advantages
|
||||
)
|
||||
pg_loss = tf.math.reduce_mean(pg_loss)
|
||||
|
||||
# Getting entropy from the action probability
|
||||
dist_entropy = tf.math.reduce_mean(self.distributions.entropy(action_mean, self.std))
|
||||
|
||||
# Getting critic loss by using Clipped critic value
|
||||
vpredclipped = old_values + tf.clip_by_value(values - Old_values, -self.value_clip, self.value_clip) # Minimize the difference between old value and new value
|
||||
vf_losses1 = tf.math.square(Returns - values) * 0.5 # Mean Squared Error
|
||||
vf_losses2 = tf.math.square(Returns - vpredclipped) * 0.5 # Mean Squared Error
|
||||
critic_loss = tf.math.reduce_mean(tf.math.maximum(vf_losses1, vf_losses2))
|
||||
|
||||
# We need to maximize the Policy Loss so the agent keeps finding better rewards
|
||||
# and minimize Critic Loss
|
||||
loss = (critic_loss * self.vf_loss_coef) - (dist_entropy * self.entropy_coef) - pg_loss
|
||||
return loss
|
||||
|
||||
@tf.function
|
||||
def act(self, state):
|
||||
state = tf.expand_dims(tf.cast(state, dtype = tf.float32), 0)
|
||||
action_mean = self.actor(state)
|
||||
|
||||
# We don't need to sample the action in Test Mode
|
||||
# we only sample the action in Training Mode in order to explore the action space
|
||||
if self.is_training_mode:
|
||||
# Sample the action
|
||||
action = self.distributions.sample(action_mean, self.std)
|
||||
else:
|
||||
action = action_mean
|
||||
|
||||
return tf.squeeze(action, 0)
|
||||
|
||||
# Get loss and Do backpropagation
|
||||
@tf.function
|
||||
def training_ppo(self, states, actions, rewards, dones, next_states):
|
||||
with tf.GradientTape() as tape:
|
||||
action_mean, values = self.actor(states), self.critic(states)
|
||||
old_action_mean, old_values = self.actor_old(states), self.critic_old(states)
|
||||
next_values = self.critic(next_states)
|
||||
|
||||
loss = self.get_loss(action_mean, values, old_action_mean, old_values, next_values, actions, rewards, dones)
|
||||
|
||||
gradients = tape.gradient(loss, self.actor.trainable_variables + self.critic.trainable_variables)
|
||||
self.optimizer.apply_gradients(zip(gradients, self.actor.trainable_variables + self.critic.trainable_variables))
|
||||
|
||||
# Update the model
|
||||
def update_ppo(self):
|
||||
batch_size = int(len(self.memory) / self.minibatch)
|
||||
|
||||
# Optimize policy for K epochs:
|
||||
for _ in range(self.PPO_epochs):
|
||||
for states, actions, rewards, dones, next_states in self.memory.get_all_items().batch(batch_size):
|
||||
self.training_ppo(states, actions, rewards, dones, next_states)
|
||||
|
||||
# Clear the memory
|
||||
self.memory.clear_memory()
|
||||
|
||||
# Copy new weights into old policy:
|
||||
self.actor_old.set_weights(self.actor.get_weights())
|
||||
self.critic_old.set_weights(self.critic.get_weights())
|
||||
|
||||
def save_weights(self):
|
||||
self.actor.save_weights('bipedalwalker_w/actor_ppo', save_format='tf')
|
||||
self.actor_old.save_weights('bipedalwalker_w/actor_old_ppo', save_format='tf')
|
||||
self.critic.save_weights('bipedalwalker_w/critic_ppo', save_format='tf')
|
||||
self.critic_old.save_weights('bipedalwalker_w/critic_old_ppo', save_format='tf')
|
||||
|
||||
def load_weights(self):
|
||||
self.actor.load_weights('bipedalwalker_w/actor_ppo')
|
||||
self.actor_old.load_weights('bipedalwalker_w/actor_old_ppo')
|
||||
self.critic.load_weights('bipedalwalker_w/critic_ppo')
|
||||
self.critic_old.load_weights('bipedalwalker_w/critic_old_ppo')
|
||||
|
||||
def plot(datas):
|
||||
print('----------')
|
||||
|
||||
plt.plot(datas)
|
||||
plt.plot()
|
||||
plt.xlabel('Episode')
|
||||
plt.ylabel('Datas')
|
||||
plt.show()
|
||||
|
||||
print('Max :', np.max(datas))
|
||||
print('Min :', np.min(datas))
|
||||
print('Avg :', np.mean(datas))
|
||||
|
||||
def run_episode(env, agent, state_dim, render, training_mode, t_updates, n_update):
|
||||
############################################
|
||||
state = env.reset()
|
||||
done = False
|
||||
total_reward = 0
|
||||
eps_time = 0
|
||||
############################################
|
||||
|
||||
while not done:
|
||||
action = agent.act(state).numpy()
|
||||
next_state, reward, done, _ = env.step(action)
|
||||
|
||||
eps_time += 1
|
||||
t_updates += 1
|
||||
total_reward += reward
|
||||
|
||||
if training_mode:
|
||||
agent.save_eps(state.tolist(), action, reward, float(done), next_state.tolist())
|
||||
|
||||
state = next_state
|
||||
|
||||
if render:
|
||||
env.render()
|
||||
|
||||
if training_mode:
|
||||
if t_updates % n_update == 0:
|
||||
agent.update_ppo()
|
||||
t_updates = 0
|
||||
|
||||
if done:
|
||||
return total_reward, eps_time, t_updates
|
||||
|
||||
def main():
|
||||
############## Hyperparameters ##############
|
||||
load_weights = False # If you want to load the agent, set this to True
|
||||
save_weights = False # If you want to save the agent, set this to True
|
||||
training_mode = True # If you want to train the agent, set this to True. But set this otherwise if you only want to test it
|
||||
reward_threshold = 300 # Set the reward threshold. Training stops once the mean reward of the last 100 episodes passes it. Set to None to turn this off
|
||||
using_google_drive = False
|
||||
|
||||
render = False # If you want to display the environment, set this to True. Turn this off if you run this in Google Colab
|
||||
n_update = 1024 # How many timesteps to collect before you update the Policy. Recommended set to 128 for Discrete
|
||||
n_plot_batch = 100000000 # Plot the results every this many episodes
|
||||
n_episode = 100000 # How many episode you want to run
|
||||
n_saved = 10 # How many episode to run before saving the weights
|
||||
|
||||
policy_kl_range = 0.03 # Set to 0.0008 for Discrete
|
||||
policy_params = 5 # Set to 20 for Discrete
|
||||
value_clip = 1.0 # How far the value estimate may move from the old value. Recommended set to the highest or lowest possible reward
|
||||
entropy_coef = 0.05 # Weight of the entropy bonus. Higher values encourage more randomness in actions
|
||||
vf_loss_coef = 1.0 # Just set to 1
|
||||
minibatch = 32 # How many batches per update. Size of batch = n_update / minibatch. Recommended set to 4 for Discrete
|
||||
PPO_epochs = 10 # How many epoch per update
|
||||
|
||||
gamma = 0.99 # Just set to 0.99
|
||||
lam = 0.95 # Just set to 0.95
|
||||
learning_rate = 3e-4 # Just set to 3e-4
|
||||
#############################################
|
||||
env_name = 'Env Name' # Set the env you want
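# e.g. env_name = 'BipedalWalker-v3' (illustrative choice only; any Gym env with a Box action space fits this script)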
|
||||
env = gym.make(env_name)
|
||||
|
||||
state_dim = env.observation_space.shape[0]
|
||||
action_dim = env.action_space.shape[0]
|
||||
|
||||
print(action_dim)
|
||||
|
||||
agent = Agent(state_dim, action_dim, training_mode, policy_kl_range, policy_params, value_clip, entropy_coef, vf_loss_coef,
|
||||
minibatch, PPO_epochs, gamma, lam, learning_rate)
|
||||
#############################################
|
||||
if using_google_drive:
|
||||
from google.colab import drive
|
||||
drive.mount('/test')
|
||||
|
||||
if load_weights:
|
||||
agent.load_weights()
|
||||
print('Weight Loaded')
|
||||
|
||||
rewards = []
|
||||
batch_rewards = []
|
||||
batch_solved_reward = []
|
||||
|
||||
times = []
|
||||
batch_times = []
|
||||
|
||||
total_time = 0
|
||||
t_updates = 0
|
||||
|
||||
for i_episode in range(1, n_episode + 1):
|
||||
total_reward, time, t_updates = run_episode(env, agent, state_dim, render, training_mode, t_updates, n_update)
|
||||
print('Episode {} \t t_reward: {} \t time: {} \t '.format(i_episode, total_reward, time))
|
||||
batch_rewards.append(int(total_reward))
|
||||
batch_times.append(time)
|
||||
|
||||
if save_weights:
|
||||
if i_episode % n_saved == 0:
|
||||
agent.save_weights()
|
||||
print('weights saved')
|
||||
|
||||
if reward_threshold:
|
||||
if len(batch_solved_reward) == 100:
|
||||
if np.mean(batch_solved_reward) >= reward_threshold:
|
||||
print('You solved the task after {} episodes'.format(len(rewards)))
|
||||
break
|
||||
|
||||
else:
|
||||
del batch_solved_reward[0]
|
||||
batch_solved_reward.append(total_reward)
|
||||
|
||||
else:
|
||||
batch_solved_reward.append(total_reward)
|
||||
|
||||
if i_episode % n_plot_batch == 0 and i_episode != 0:
|
||||
# Plot the reward, times for every n_plot_batch
|
||||
plot(batch_rewards)
|
||||
plot(batch_times)
|
||||
|
||||
for reward in batch_rewards:
|
||||
rewards.append(reward)
|
||||
|
||||
for time in batch_times:
|
||||
times.append(time)
|
||||
|
||||
batch_rewards = []
|
||||
batch_times = []
|
||||
|
||||
print('========== Cumulative ==========')
|
||||
# Plot the reward, times for every episode
|
||||
plot(rewards)
|
||||
plot(times)
|
||||
|
||||
print('========== Final ==========')
|
||||
# Plot the reward, times for every episode
|
||||
|
||||
for reward in batch_rewards:
|
||||
rewards.append(reward)
|
||||
|
||||
for time in batch_times:
|
||||
times.append(time)
|
||||
|
||||
plot(rewards)
|
||||
plot(times)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|