import gc
import logging

import numpy as np

from . import utils


# TODO: Refactor with tf.train.slice_input_producer, tf.train.Coordinator, tf.train.QueueRunner
class Batch(object):
    """
    Class for batch datasets: collects multiple on-policy observations
    (with the corresponding actions, rewards, etc.) from the environment.
    """
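
    # Example usage (a minimal sketch; ``MyPolicy``, ``compute_advantage`` and ``value_net``
    # are hypothetical user-provided objects, and a gym-style environment with ``gym``
    # imported is assumed):
    #
    #     env = gym.make('CartPole-v0')
    #     pi = MyPolicy(...)          # must expose act(observation, my_feed_dict)
    #     training_data = Batch(env, pi,
    #                           reward_processors=[compute_advantage],
    #                           networks=[value_net])
    #     training_data.collect(num_episodes=20)
    #     feed_dict = training_data.next_batch(batch_size=64)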

    def __init__(self, env, pi, reward_processors, networks, render=False):
        """
        Constructor.

        :param env: the environment to interact with; a gym-style ``reset``/``step`` interface is assumed.
        :param pi: the policy used to choose actions, called as ``pi.act(observation, my_feed_dict)``.
        :param reward_processors: list of functions to process reward; each maps the raw-data dict
            to a dict of derived quantities (e.g. advantages) keyed to match network placeholders.
        :param networks: list of networks to be optimized, so as to match data in feed_dict.
        :param render: whether to render the environment during collection.
        """
        self._env = env
        self._pi = pi
        self.raw_data = {}
        self.data = {}

        self.reward_processors = reward_processors
        self.networks = networks
        self.render = render

        # gather every placeholder managed by the given networks, so that
        # next_batch() knows which data keys have to be fed
        self.required_placeholders = {}
        for net in self.networks:
            self.required_placeholders.update(net.managed_placeholders)
        self.require_advantage = 'advantage' in self.required_placeholders
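
        # ``managed_placeholders`` is assumed to be a dict mapping data keys to
        # TF placeholders, for example (illustrative names and shapes only):
        #     {'observation': tf.placeholder(tf.float32, [None, observation_dim]),
        #      'advantage':   tf.placeholder(tf.float32, [None])}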

        self._is_first_collect = True

    def collect(self, num_timesteps=0, num_episodes=0, my_feed_dict=None,
                process_reward=True, epsilon_greedy=0):
        # specify how many data to collect here, or fix it in __init__()
        if my_feed_dict is None:  # avoid the mutable-default-argument pitfall
            my_feed_dict = {}
        assert sum([num_timesteps > 0, num_episodes > 0]) == 1, \
            "One and only one collection number specification permitted!"

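        # Typical calls (illustrative values): ``collect(num_episodes=20)`` to gather whole
        # episodes, or ``collect(num_timesteps=2000, epsilon_greedy=0.1)`` for a fixed step budget.
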
        if num_timesteps > 0:
            # TODO (YouQiaoben): the original draft here was copied from openai/baselines and left
            # unfinished; below is a minimal fixed-length rollout mirroring the episode-based branch.
            if self._is_first_collect:
                ob = self._env.reset()
                self._is_first_collect = False
            else:
                ob = self.raw_data['observation'][-1]  # resume from the last observation

            ac = self._env.action_space.sample()  # not used as an action, just so we have the datatype

            # initialize history arrays
            observations = np.array([ob for _ in range(num_timesteps)])
            actions = np.array([ac for _ in range(num_timesteps)])
            rewards = np.zeros(num_timesteps, 'float32')
            episode_start_flags = np.zeros(num_timesteps, 'int32')

            new = True  # marks if we're on the first timestep of an episode
            for t in range(num_timesteps):
                if self.render:
                    self._env.render()
                # the same simple epsilon-greedy exploration as in the episode-based branch
                if epsilon_greedy > 0 and np.random.random() < epsilon_greedy:
                    ac = np.random.randint(low=0, high=self._env.action_space.n)
                else:
                    ac = self._pi.act(ob, my_feed_dict)

                observations[t] = ob
                actions[t] = ac
                episode_start_flags[t] = new

                ob, rew, new, _ = self._env.step(ac)
                rewards[t] = rew
                if new:  # end of episode, discard s_T
                    ob = self._env.reset()

            self.observations = observations
            self.actions = actions
            self.rewards = rewards
            self.episode_start_flags = episode_start_flags

            self.raw_data = {'observation': self.observations, 'action': self.actions,
                             'reward': self.rewards, 'end_flag': self.episode_start_flags}

        if num_episodes > 0:  # YouQiaoben: fix memory growth; both del and gc.collect() fail
            # initialize raw-data lists
            if not self._is_first_collect:
                del self.observations
                del self.actions
                del self.rewards
                del self.episode_start_flags

            observations = []
            actions = []
            rewards = []
            episode_start_flags = []

            for _ in range(num_episodes):
                # t_count = 0  # debug counter for the memory-growth test below
                ob = self._env.reset()
                observations.append(ob)
                episode_start_flags.append(True)

                while True:
                    # a simple implementation of epsilon-greedy exploration
                    if epsilon_greedy > 0 and np.random.random() < epsilon_greedy:
                        ac = np.random.randint(low=0, high=self._env.action_space.n)
                    else:
                        ac = self._pi.act(ob, my_feed_dict)
                    actions.append(ac)

                    if self.render:
                        self._env.render()
                    ob, reward, done, _ = self._env.step(ac)
                    rewards.append(reward)

                    # t_count += 1
                    # if t_count >= 100:  # force episode stop, just to test if memory still grows
                    #     break

                    if done:  # end of episode, discard s_T
                        # TODO: for num_timesteps collection, the terminal flag has to be stored
                        # instead of the start flag!
                        break
                    else:
                        observations.append(ob)
                        episode_start_flags.append(False)

            self.observations = np.array(observations)
            self.actions = np.array(actions)
            self.rewards = np.array(rewards)
            self.episode_start_flags = np.array(episode_start_flags)

            del observations
            del actions
            del rewards
            del episode_start_flags

            self.raw_data = {'observation': self.observations, 'action': self.actions,
                             'reward': self.rewards, 'end_flag': self.episode_start_flags}

            self._is_first_collect = False

        if process_reward:
            self.apply_advantage_estimation_function()

        gc.collect()

    def apply_advantage_estimation_function(self):
        for processor in self.reward_processors:
            self.data.update(processor(self.raw_data))
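
    # A reward processor is assumed to map the raw-data dict to a dict of derived quantities
    # whose keys match network placeholders. A minimal sketch (the 'return' key name and the
    # value of gamma are illustrative, not fixed by this class):
    #
    #     def discounted_return_processor(raw_data, gamma=0.99):
    #         rewards = raw_data['reward']
    #         start_flags = raw_data['end_flag']  # True at the first timestep of each episode
    #         returns = np.zeros_like(rewards, dtype='float32')
    #         running = 0.
    #         for t in reversed(range(rewards.shape[0])):
    #             running = rewards[t] + gamma * running
    #             returns[t] = running
    #             if start_flags[t]:  # earlier timesteps belong to the previous episode
    #                 running = 0.
    #         return {'return': returns}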

    def next_batch(self, batch_size, standardize_advantage=True):
        rand_idx = np.random.choice(self.raw_data['observation'].shape[0], batch_size)

        # Maybe re-compute advantage here, but only on rand_idx -- yet how would the feed_dict
        # be constructed then? Left as an idea for an online mode:
        # if self.online:
        #     self.data_batch.update(self.apply_advantage_estimation_function(rand_idx))

        feed_dict = {}
        for key, placeholder in self.required_placeholders.items():
            found, data_key = utils.internal_key_match(key, self.raw_data.keys())
            if found:
                feed_dict[placeholder] = self.raw_data[data_key][rand_idx]
            else:
                found, data_key = utils.internal_key_match(key, self.data.keys())
                if found:
                    feed_dict[placeholder] = self.data[data_key][rand_idx]

            if not found:
                raise TypeError('Placeholder {} has no value to feed!'.format(str(placeholder.name)))

        if standardize_advantage and self.require_advantage:
            advantage_value = feed_dict[self.required_placeholders['advantage']]
            advantage_mean = np.mean(advantage_value)
            advantage_std = np.std(advantage_value)
            if advantage_std < 1e-3:
                logging.warning('advantage_std too small (< 1e-3) for advantage standardization; '
                                'may cause numerical issues')
            feed_dict[self.required_placeholders['advantage']] = (advantage_value - advantage_mean) / advantage_std

        # TODO: maybe move all advantage estimation functions to tf, as in tensorforce (though tensorforce
        # is still not fully understood after reading); maybe tf.stop_gradient for targets/advantages.
        # This would simplify the data collector, as it would only need to collect raw (s, a, r, done) data.

        return feed_dict

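    # The returned feed_dict is intended to be passed straight to a TF session run call, e.g.
    # (illustrative; ``sess``, ``train_op`` and ``training_data`` are assumed to be defined elsewhere):
    #     sess.run(train_op, feed_dict=training_data.next_batch(batch_size=64))
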
    # TODO: this will definitely be refactored with a proper logger
    def statistics(self):
        """
        Compute and print basic statistics (returns, episode lengths) of the currently sampled paths.

        :return: None; the statistics are printed to stdout.
        """
        rewards = self.raw_data['reward']
        episode_start_flags = self.raw_data['end_flag']
        num_timesteps = rewards.shape[0]

        returns = []
        episode_lengths = []
        max_return = 0
        num_episodes = 1
        episode_start_idx = 0
        for i in range(1, num_timesteps):
            if episode_start_flags[i] or (
                    i == num_timesteps - 1):  # found the start of the next episode or the end of all episodes
                if episode_start_flags[i]:
                    num_episodes += 1
                if i < rewards.shape[0] - 1:
                    t = i - 1
                else:
                    t = i
                Gt = 0
                episode_lengths.append(t - episode_start_idx)
                while t >= episode_start_idx:
                    Gt += rewards[t]
                    t -= 1

                returns.append(Gt)
                if Gt > max_return:
                    max_return = Gt
                episode_start_idx = i

        print('AverageReturn: {}'.format(np.mean(returns)))
        print('StdReturn    : {}'.format(np.std(returns)))
        print('NumEpisodes  : {}'.format(num_episodes))
        print('MinMaxReturns: {}..., {}'.format(np.sort(returns)[:3], np.sort(returns)[-3:]))
        print('AverageLength: {}'.format(np.mean(episode_lengths)))
        print('MinMaxLengths: {}..., {}'.format(np.sort(episode_lengths)[:3], np.sort(episode_lengths)[-3:]))