add dqn and ppo examples, a bit of clean-up

haoshengzou 2018-06-14 11:18:39 +08:00
parent 6f206759ab
commit f8c359b094
4 changed files with 163 additions and 11 deletions


@@ -1,5 +1,3 @@
-from __future__ import absolute_import
 import tensorflow as tf
 import time
 import numpy as np
@@ -13,7 +11,6 @@ if __name__ == '__main__':
     observation_dim = env.observation_space.shape
     action_dim = env.action_space.n
-    clip_param = 0.2
     num_batches = 10
     batch_size = 512


@@ -1,20 +1,12 @@
-#!/usr/bin/env python
-from __future__ import absolute_import
 import tensorflow as tf
 import gym
 import numpy as np
 import time
-import argparse
 import tianshou as ts
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--render", action="store_true", default=False)
-    args = parser.parse_args()
     env = gym.make('Pendulum-v0')
     observation_dim = env.observation_space.shape
     action_dim = env.action_space.shape
examples/dqn.py (new file)

@@ -0,0 +1,83 @@
import tensorflow as tf
import gym
import numpy as np
import time

import tianshou as ts


if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    observation_dim = env.observation_space.shape
    action_dim = env.action_space.n

    # hyper-parameters
    batch_size = 32
    seed = 123
    np.random.seed(seed)
    tf.set_random_seed(seed)

    ### 1. build network with pure tf
    observation_ph = tf.placeholder(tf.float32, shape=(None,) + observation_dim)

    def my_network():
        net = tf.layers.dense(observation_ph, 32, activation=tf.nn.tanh)
        net = tf.layers.dense(net, 32, activation=tf.nn.tanh)
        action_values = tf.layers.dense(net, action_dim, activation=None)

        return None, action_values  # no policy head

    ### 2. build policy, loss, optimizer
    dqn = ts.value_function.DQN(my_network, observation_placeholder=observation_ph, has_old_net=True)
    pi = ts.policy.DQN(dqn)

    dqn_loss = ts.losses.value_mse(dqn)
    total_loss = dqn_loss

    optimizer = tf.train.AdamOptimizer(1e-4)
    train_op = optimizer.minimize(total_loss, var_list=list(dqn.trainable_variables))

    ### 3. define data collection
    replay_buffer = ts.data.VanillaReplayBuffer(capacity=2e4, nstep=1)

    process_functions = [ts.data.advantage_estimation.nstep_q_return(1, dqn)]
    managed_networks = [dqn]

    data_collector = ts.data.DataCollector(
        env=env,
        policy=pi,
        data_buffer=replay_buffer,
        process_functions=process_functions,
        managed_networks=managed_networks
    )

    ### 4. start training
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        # sync target network in the beginning
        pi.sync_weights()

        start_time = time.time()
        data_collector.collect(num_timesteps=5000)

        for i in range(int(1e8)):  # number of training steps
            # collect data
            data_collector.collect(num_timesteps=4)

            # update network
            feed_dict = data_collector.next_batch(batch_size)
            sess.run(train_op, feed_dict=feed_dict)

            if i % 5000 == 0:
                print('Step {}, elapsed time: {:.1f} min'.format(i, (time.time() - start_time) / 60))
                # epsilon 0.05 as in nature paper
                pi.set_epsilon_test(0.05)
                ts.data.test_policy_in_env(pi, env, num_timesteps=1000)

            # update target network
            if i % 1000 == 0:
                pi.sync_weights()
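For context on step 3 above: ts.data.advantage_estimation.nstep_q_return(1, dqn) is expected to attach standard 1-step Q-learning targets, r_t + gamma * max_a Q_target(s_{t+1}, a), which ts.losses.value_mse(dqn) then regresses Q(s_t, a_t) toward. The sketch below only illustrates that target computation under those assumptions; one_step_q_targets and its arguments are hypothetical names, not part of the tianshou API.

import numpy as np

def one_step_q_targets(rewards, next_q_values, dones, gamma=0.99):
    # Illustrative only: standard 1-step TD target r_t + gamma * max_a Q_target(s_{t+1}, a).
    # rewards:       shape (batch,)
    # next_q_values: shape (batch, num_actions), taken from the target network
    # dones:         shape (batch,), 1.0 where the episode terminated at t+1
    bootstrap = np.max(next_q_values, axis=1)            # max_a Q_target(s', a)
    return rewards + gamma * (1.0 - dones) * bootstrap   # no bootstrapping past terminal states

The dones mask zeroes the bootstrap term at episode ends, which is why periodically calling pi.sync_weights() (to refresh the target network) is the only other ingredient the training loop above needs.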

examples/ppo.py (new file)

@@ -0,0 +1,80 @@
import tensorflow as tf
import gym
import numpy as np
import time

import tianshou as ts


if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    observation_dim = env.observation_space.shape
    action_dim = env.action_space.n

    clip_param = 0.2
    num_batches = 10
    batch_size = 512

    seed = 0
    np.random.seed(seed)
    tf.set_random_seed(seed)

    ### 1. build network with pure tf
    observation_ph = tf.placeholder(tf.float32, shape=(None,) + observation_dim)

    def my_policy():
        net = tf.layers.dense(observation_ph, 32, activation=tf.nn.tanh)
        net = tf.layers.dense(net, 32, activation=tf.nn.tanh)

        action_logits = tf.layers.dense(net, action_dim, activation=None)
        action_dist = tf.distributions.Categorical(logits=action_logits)

        return action_dist, None

    ### 2. build policy, loss, optimizer
    pi = ts.policy.Distributional(my_policy, observation_placeholder=observation_ph, has_old_net=True)

    ppo_loss_clip = ts.losses.ppo_clip(pi, clip_param)
    total_loss = ppo_loss_clip

    optimizer = tf.train.AdamOptimizer(1e-4)
    train_op = optimizer.minimize(total_loss, var_list=list(pi.trainable_variables))

    ### 3. define data collection
    data_buffer = ts.data.BatchSet()

    data_collector = ts.data.DataCollector(
        env=env,
        policy=pi,
        data_buffer=data_buffer,
        process_functions=[ts.data.advantage_estimation.full_return],
        managed_networks=[pi],
    )

    ### 4. start training
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        # assign actor to pi_old
        pi.sync_weights()

        start_time = time.time()
        for i in range(1000):
            # collect data
            data_collector.collect(num_episodes=50)

            # print current return
            print('Epoch {}:'.format(i))
            data_buffer.statistics()

            # update network
            for _ in range(num_batches):
                feed_dict = data_collector.next_batch(batch_size)
                sess.run(train_op, feed_dict=feed_dict)

            # assign pi_old to be the current pi
            pi.sync_weights()

            print('Elapsed time: {:.1f} min'.format((time.time() - start_time) / 60))
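For reference, ts.losses.ppo_clip(pi, clip_param) presumably constructs the clipped surrogate objective from the PPO paper, using the old-network snapshot that pi.sync_weights() maintains (has_old_net=True). Below is a minimal TensorFlow 1.x sketch of that objective under those assumptions; ppo_clip_loss and its tensor arguments are illustrative names, not the library's API.

import tensorflow as tf

def ppo_clip_loss(log_prob, log_prob_old, advantage, clip_param=0.2):
    # Illustrative only: clipped surrogate objective, negated so it can be minimized.
    #   L = -E[ min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t) ],  r_t = pi(a|s) / pi_old(a|s)
    ratio = tf.exp(log_prob - tf.stop_gradient(log_prob_old))        # pi / pi_old
    clipped_ratio = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param)
    surrogate = tf.minimum(ratio * advantage, clipped_ratio * advantage)
    return -tf.reduce_mean(surrogate)

The clipping keeps the probability ratio within [1 - clip_param, 1 + clip_param], so each of the num_batches gradient steps per epoch stays close to the policy that collected the data; syncing pi_old at the end of every epoch then resets the trust region for the next round of collection.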