diff --git a/examples/actor_critic.py b/examples/actor_critic.py
index 2e3a8a5..588b02c 100755
--- a/examples/actor_critic.py
+++ b/examples/actor_critic.py
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-
 import tensorflow as tf
 import time
 import numpy as np
@@ -13,7 +11,6 @@ if __name__ == '__main__':
     observation_dim = env.observation_space.shape
     action_dim = env.action_space.n
 
-    clip_param = 0.2
     num_batches = 10
     batch_size = 512
 
diff --git a/examples/ddpg.py b/examples/ddpg.py
index 17a67a7..44958b0 100644
--- a/examples/ddpg.py
+++ b/examples/ddpg.py
@@ -1,20 +1,12 @@
-#!/usr/bin/env python
-from __future__ import absolute_import
-
 import tensorflow as tf
 import gym
 import numpy as np
 import time
-import argparse
 
 import tianshou as ts
 
 
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--render", action="store_true", default=False)
-    args = parser.parse_args()
-
     env = gym.make('Pendulum-v0')
     observation_dim = env.observation_space.shape
     action_dim = env.action_space.shape
diff --git a/examples/dqn.py b/examples/dqn.py
new file mode 100644
index 0000000..13fddeb
--- /dev/null
+++ b/examples/dqn.py
@@ -0,0 +1,83 @@
+import tensorflow as tf
+import gym
+import numpy as np
+import time
+
+import tianshou as ts
+
+
+if __name__ == '__main__':
+    env = gym.make('CartPole-v0')
+    observation_dim = env.observation_space.shape
+    action_dim = env.action_space.n
+
+    # hyper-parameters
+    batch_size = 32
+
+    seed = 123
+    np.random.seed(seed)
+    tf.set_random_seed(seed)
+
+    ### 1. build network with pure tf
+    observation_ph = tf.placeholder(tf.float32, shape=(None,) + observation_dim)
+
+    def my_network():
+        net = tf.layers.dense(observation_ph, 32, activation=tf.nn.tanh)
+        net = tf.layers.dense(net, 32, activation=tf.nn.tanh)
+
+        action_values = tf.layers.dense(net, action_dim, activation=None)
+
+        return None, action_values  # no policy head
+
+    ### 2. build policy, loss, optimizer
+    dqn = ts.value_function.DQN(my_network, observation_placeholder=observation_ph, has_old_net=True)
+    pi = ts.policy.DQN(dqn)
+
+    dqn_loss = ts.losses.value_mse(dqn)
+
+    total_loss = dqn_loss
+    optimizer = tf.train.AdamOptimizer(1e-4)
+    train_op = optimizer.minimize(total_loss, var_list=list(dqn.trainable_variables))
+
+    ### 3. define data collection
+    replay_buffer = ts.data.VanillaReplayBuffer(capacity=2e4, nstep=1)
+
+    process_functions = [ts.data.advantage_estimation.nstep_q_return(1, dqn)]
+    managed_networks = [dqn]
+
+    data_collector = ts.data.DataCollector(
+        env=env,
+        policy=pi,
+        data_buffer=replay_buffer,
+        process_functions=process_functions,
+        managed_networks=managed_networks
+    )
+
+    ### 4. start training
+    config = tf.ConfigProto()
+    config.gpu_options.allow_growth = True
+    with tf.Session(config=config) as sess:
+        sess.run(tf.global_variables_initializer())
+
+        # sync target network in the beginning
+        pi.sync_weights()
+
+        start_time = time.time()
+        data_collector.collect(num_timesteps=5000)
+        for i in range(int(1e8)):  # number of training steps
+            # collect data
+            data_collector.collect(num_timesteps=4)
+
+            # update network
+            feed_dict = data_collector.next_batch(batch_size)
+            sess.run(train_op, feed_dict=feed_dict)
+
+            if i % 5000 == 0:
+                print('Step {}, elapsed time: {:.1f} min'.format(i, (time.time() - start_time) / 60))
+                # epsilon 0.05 as in nature paper
+                pi.set_epsilon_test(0.05)
+                ts.data.test_policy_in_env(pi, env, num_timesteps=1000)
+
+            # update target network
+            if i % 1000 == 0:
+                pi.sync_weights()
\ No newline at end of file
diff --git a/examples/ppo.py b/examples/ppo.py
new file mode 100644
index 0000000..7e10991
--- /dev/null
+++ b/examples/ppo.py
@@ -0,0 +1,80 @@
+import tensorflow as tf
+import gym
+import numpy as np
+import time
+
+import tianshou as ts
+
+
+if __name__ == '__main__':
+    env = gym.make('CartPole-v0')
+    observation_dim = env.observation_space.shape
+    action_dim = env.action_space.n
+
+    clip_param = 0.2
+    num_batches = 10
+    batch_size = 512
+
+    seed = 0
+    np.random.seed(seed)
+    tf.set_random_seed(seed)
+
+    ### 1. build network with pure tf
+    observation_ph = tf.placeholder(tf.float32, shape=(None,) + observation_dim)
+
+    def my_policy():
+        net = tf.layers.dense(observation_ph, 32, activation=tf.nn.tanh)
+        net = tf.layers.dense(net, 32, activation=tf.nn.tanh)
+
+        action_logits = tf.layers.dense(net, action_dim, activation=None)
+        action_dist = tf.distributions.Categorical(logits=action_logits)
+
+        return action_dist, None
+
+    ### 2. build policy, loss, optimizer
+    pi = ts.policy.Distributional(my_policy, observation_placeholder=observation_ph, has_old_net=True)
+
+    ppo_loss_clip = ts.losses.ppo_clip(pi, clip_param)
+
+    total_loss = ppo_loss_clip
+    optimizer = tf.train.AdamOptimizer(1e-4)
+    train_op = optimizer.minimize(total_loss, var_list=list(pi.trainable_variables))
+
+    ### 3. define data collection
+    data_buffer = ts.data.BatchSet()
+
+    data_collector = ts.data.DataCollector(
+        env=env,
+        policy=pi,
+        data_buffer=data_buffer,
+        process_functions=[ts.data.advantage_estimation.full_return],
+        managed_networks=[pi],
+    )
+
+    ### 4. start training
+    config = tf.ConfigProto()
+    config.gpu_options.allow_growth = True
+    with tf.Session(config=config) as sess:
+        sess.run(tf.global_variables_initializer())
+
+        # assign actor to pi_old
+        pi.sync_weights()
+
+        start_time = time.time()
+        for i in range(1000):
+            # collect data
+            data_collector.collect(num_episodes=50)
+
+            # print current return
+            print('Epoch {}:'.format(i))
+            data_buffer.statistics()
+
+            # update network
+            for _ in range(num_batches):
+                feed_dict = data_collector.next_batch(batch_size)
+                sess.run(train_op, feed_dict=feed_dict)
+
+            # assigning pi_old to be current pi
+            pi.sync_weights()
+
+        print('Elapsed time: {:.1f} min'.format((time.time() - start_time) / 60))
\ No newline at end of file