design exploration and evaluators for off-policy algos

haoshengzou 2018-03-04 13:53:29 +08:00
parent 2eb056a721
commit 54a7b1343d
3 changed files with 25 additions and 13 deletions


@ -78,13 +78,10 @@ if __name__ == '__main__':
critic.sync_weights()
start_time = time.time()
for i in range(100):
data_collector.collect(num_timesteps=1e3) # warm-up
for i in range(int(1e8)):
# collect data
data_collector.collect(num_episodes=50)
# print current return
print('Epoch {}:'.format(i))
data_collector.statistics()
data_collector.collect()
# update network
for _ in range(num_batches):
@ -92,3 +89,7 @@ if __name__ == '__main__':
sess.run([actor_train_op, critic_train_op], feed_dict=feed_dict)
print('Elapsed time: {:.1f} min'.format((time.time() - start_time) / 60))
# test every 1000 training steps
if i % 1000 == 0:
test(env, actor)
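
The first hunk swaps a fixed 100-epoch loop for a step-driven one: a timestep-based warm-up fills the buffer, then each training step makes one collection call, runs num_batches gradient updates on sampled minibatches, and evaluates every 1000 steps. Below is a minimal, self-contained sketch of that control flow; the Collector, train_step, and test stand-ins are hypothetical and only mimic the shape of this repository's DataCollector / TensorFlow session usage.

    import time
    import random


    class Collector:
        """Hypothetical stand-in for the DataCollector in this diff."""

        def __init__(self):
            self.buffer = []

        def collect(self, num_timesteps=1):
            # Append fake transitions; a real collector would step the env.
            self.buffer.extend(random.random() for _ in range(int(num_timesteps)))

        def next_batch(self, batch_size):
            # Sample a minibatch from the replay buffer.
            return random.sample(self.buffer, min(batch_size, len(self.buffer)))


    def train_step(batch):
        # Placeholder for sess.run([actor_train_op, critic_train_op], feed_dict=...).
        return sum(batch) / len(batch)


    def test(step):
        print('step {}: evaluation run'.format(step))


    if __name__ == '__main__':
        collector = Collector()
        num_batches, batch_size = 4, 32
        collector.collect(num_timesteps=1e3)  # warm-up: pre-fill the buffer
        start_time = time.time()
        for i in range(10000):                # stands in for range(int(1e8))
            collector.collect()               # one collection call per training step
            for _ in range(num_batches):      # several gradient updates per step
                train_step(collector.next_batch(batch_size))
            if i % 1000 == 0:                 # periodic evaluation
                test(i)
        print('Elapsed time: {:.1f} min'.format((time.time() - start_time) / 60))
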


@ -77,17 +77,28 @@ if __name__ == '__main__':
pi.sync_weights() # TODO: automate this for policies with target network
start_time = time.time()
for i in range(100):
epsilon = 0.5
pi.set_epsilon_train(epsilon)
data_collector.collect(num_timesteps=1e3) # warm-up
for i in range(int(1e8)): # number of training steps
# anneal epsilon step-wise
if (i + 1) % 1e4 == 0 and epsilon > 0.1:
epsilon -= 0.1
pi.set_epsilon_train(epsilon)
# collect data
data_collector.collect()
# print current return
print('Epoch {}:'.format(i))
data_collector.statistics()
# update network
for _ in range(num_batches):
feed_dict = data_collector.next_batch(batch_size)
sess.run(train_op, feed_dict=feed_dict)
print('Elapsed time: {:.1f} min'.format((time.time() - start_time) / 60))
# test every 1000 training steps
# tester could share some code with batch!
if i % 1000 == 0:
# epsilon 0.05 as in nature paper
pi.set_epsilon_test(0.05)
test(env, pi) # go for act_test of pi, not act
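
This second hunk drives exploration with an epsilon that starts at 0.5, drops by 0.1 every 1e4 training steps until it reaches 0.1, and is pinned to 0.05 for evaluation as in the Nature DQN paper. The same step-wise annealing can be expressed as a pure function of the step count; the sketch below uses hypothetical names and is not the repository's set_epsilon_train / set_epsilon_test API.

    def annealed_epsilon(step, start=0.5, floor=0.1, decrement=0.1, interval=int(1e4)):
        """Step-wise schedule matching the loop above: drop epsilon by
        `decrement` every `interval` training steps, never below `floor`."""
        return max(floor, start - decrement * (step // interval))


    if __name__ == '__main__':
        # epsilon stays at 0.5 for the first 1e4 steps, then 0.4, 0.3, ... down to 0.1
        for step in (0, int(1e4), int(2e4), int(5e4), int(1e6)):
            print(step, annealed_epsilon(step))
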


@ -59,7 +59,7 @@ class DataCollector(object):
if self.process_mode == 'minibatch':
pass
# flatten rank-2 list to numpy array
# flatten rank-2 list to numpy array, construct feed_dict
return
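
The 'minibatch' branch is still a placeholder; the updated comment only records the plan to flatten the rank-2 (episode by timestep) lists into flat numpy arrays and assemble a feed_dict. A small sketch of that flattening under assumed data shapes; the placeholder keys below are hypothetical stand-ins for the graph's real tensors.

    import numpy as np


    def flatten_episodes(episodes):
        """Flatten a rank-2 list (one inner list per episode) into a single
        numpy array, concatenating along the time axis."""
        return np.concatenate([np.asarray(ep) for ep in episodes], axis=0)


    if __name__ == '__main__':
        # two episodes of different lengths, each entry a 3-dim observation
        observations = [[[0.0, 0.1, 0.2], [1.0, 1.1, 1.2]],
                        [[2.0, 2.1, 2.2]]]
        rewards = [[1.0, 0.0], [0.5]]
        # hypothetical placeholder names standing in for the graph's tensors
        feed_dict = {'observation_ph': flatten_episodes(observations),
                     'reward_ph': flatten_episodes(rewards)}
        print(feed_dict['observation_ph'].shape)  # (3, 3)
        print(feed_dict['reward_ph'].shape)       # (3,)
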