import gc
import logging

import numpy as np

from . import utils


# TODO: Refactor with tf.train.slice_input_producer, tf.train.Coordinator, tf.train.QueueRunner
class Batch(object):
    """
    Class for batch datasets: collects multiple on-policy observations
    (with the corresponding actions, rewards, etc.) from the environment.
    """
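
    # Example usage (a minimal sketch; ``MyPolicy``, ``compute_advantage`` and ``value_net``
    # are hypothetical user-provided objects, and a gym-style environment with ``gym``
    # imported is assumed):
    #
    #     env = gym.make('CartPole-v0')
    #     pi = MyPolicy(...)          # must expose act(observation, my_feed_dict)
    #     training_data = Batch(env, pi,
    #                           reward_processors=[compute_advantage],
    #                           networks=[value_net])
    #     training_data.collect(num_episodes=20)
    #     feed_dict = training_data.next_batch(batch_size=64)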

    def __init__(self, env, pi, reward_processors, networks, render=False):
        """
        Constructor.

        :param env: the environment to interact with; a gym-style ``reset``/``step`` interface is assumed.
        :param pi: the policy used to choose actions, called as ``pi.act(observation, my_feed_dict)``.
        :param reward_processors: list of functions to process reward; each maps the raw-data dict
            to a dict of derived quantities (e.g. advantages) keyed to match network placeholders.
        :param networks: list of networks to be optimized, so as to match data in feed_dict.
        :param render: whether to render the environment during collection.
        """
        self._env = env
        self._pi = pi
        self.raw_data = {}
        self.data = {}

        self.reward_processors = reward_processors
        self.networks = networks
        self.render = render

        # gather every placeholder managed by the given networks, so that
        # next_batch() knows which data keys have to be fed
        self.required_placeholders = {}
        for net in self.networks:
            self.required_placeholders.update(net.managed_placeholders)
        self.require_advantage = 'advantage' in self.required_placeholders
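
        # ``managed_placeholders`` is assumed to be a dict mapping data keys to
        # TF placeholders, for example (illustrative names and shapes only):
        #     {'observation': tf.placeholder(tf.float32, [None, observation_dim]),
        #      'advantage':   tf.placeholder(tf.float32, [None])}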

        self._is_first_collect = True

    def collect(self, num_timesteps=0, num_episodes=0, my_feed_dict=None,
                process_reward=True, epsilon_greedy=0):
        # specify how many data to collect here, or fix it in __init__()
        if my_feed_dict is None:  # avoid the mutable-default-argument pitfall
            my_feed_dict = {}
        assert sum([num_timesteps > 0, num_episodes > 0]) == 1, \
            "One and only one collection number specification permitted!"

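        # Typical calls (illustrative values): ``collect(num_episodes=20)`` to gather whole
        # episodes, or ``collect(num_timesteps=2000, epsilon_greedy=0.1)`` for a fixed step budget.
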
        if num_timesteps > 0:
            # TODO (YouQiaoben): the original draft here was copied from openai/baselines and left
            # unfinished; below is a minimal fixed-length rollout mirroring the episode-based branch.
            if self._is_first_collect:
                ob = self._env.reset()
                self._is_first_collect = False
            else:
                ob = self.raw_data['observation'][-1]  # resume from the last observation

            ac = self._env.action_space.sample()  # not used as an action, just so we have the datatype

            # initialize history arrays
            observations = np.array([ob for _ in range(num_timesteps)])
            actions = np.array([ac for _ in range(num_timesteps)])
            rewards = np.zeros(num_timesteps, 'float32')
            episode_start_flags = np.zeros(num_timesteps, 'int32')

            new = True  # marks if we're on the first timestep of an episode
            for t in range(num_timesteps):
                if self.render:
                    self._env.render()
                # the same simple epsilon-greedy exploration as in the episode-based branch
                if epsilon_greedy > 0 and np.random.random() < epsilon_greedy:
                    ac = np.random.randint(low=0, high=self._env.action_space.n)
                else:
                    ac = self._pi.act(ob, my_feed_dict)

                observations[t] = ob
                actions[t] = ac
                episode_start_flags[t] = new

                ob, rew, new, _ = self._env.step(ac)
                rewards[t] = rew
                if new:  # end of episode, discard s_T
                    ob = self._env.reset()

            self.observations = observations
            self.actions = actions
            self.rewards = rewards
            self.episode_start_flags = episode_start_flags

            self.raw_data = {'observation': self.observations, 'action': self.actions,
                             'reward': self.rewards, 'end_flag': self.episode_start_flags}

        if num_episodes > 0:  # YouQiaoben: fix memory growth; both del and gc.collect() fail
            # initialize raw-data lists
            if not self._is_first_collect:
                del self.observations
                del self.actions
                del self.rewards
                del self.episode_start_flags

            observations = []
            actions = []
            rewards = []
            episode_start_flags = []

            for _ in range(num_episodes):
                # t_count = 0  # debug counter for the memory-growth test below
                ob = self._env.reset()
                observations.append(ob)
                episode_start_flags.append(True)

                while True:
                    # a simple implementation of epsilon-greedy exploration
                    if epsilon_greedy > 0 and np.random.random() < epsilon_greedy:
                        ac = np.random.randint(low=0, high=self._env.action_space.n)
                    else:
                        ac = self._pi.act(ob, my_feed_dict)
                    actions.append(ac)

                    if self.render:
                        self._env.render()
                    ob, reward, done, _ = self._env.step(ac)
                    rewards.append(reward)

                    # t_count += 1
                    # if t_count >= 100:  # force episode stop, just to test if memory still grows
                    #     break

                    if done:  # end of episode, discard s_T
                        # TODO: for num_timesteps collection, the terminal flag has to be stored
                        # instead of the start flag!
                        break
                    else:
                        observations.append(ob)
                        episode_start_flags.append(False)

            self.observations = np.array(observations)
            self.actions = np.array(actions)
            self.rewards = np.array(rewards)
            self.episode_start_flags = np.array(episode_start_flags)

            del observations
            del actions
            del rewards
            del episode_start_flags

            self.raw_data = {'observation': self.observations, 'action': self.actions,
                             'reward': self.rewards, 'end_flag': self.episode_start_flags}

            self._is_first_collect = False

        if process_reward:
            self.apply_advantage_estimation_function()

        gc.collect()

    def apply_advantage_estimation_function(self):
        for processor in self.reward_processors:
            self.data.update(processor(self.raw_data))
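
    # A reward processor is assumed to map the raw-data dict to a dict of derived quantities
    # whose keys match network placeholders. A minimal sketch (the 'return' key name and the
    # value of gamma are illustrative, not fixed by this class):
    #
    #     def discounted_return_processor(raw_data, gamma=0.99):
    #         rewards = raw_data['reward']
    #         start_flags = raw_data['end_flag']  # True at the first timestep of each episode
    #         returns = np.zeros_like(rewards, dtype='float32')
    #         running = 0.
    #         for t in reversed(range(rewards.shape[0])):
    #             running = rewards[t] + gamma * running
    #             returns[t] = running
    #             if start_flags[t]:  # earlier timesteps belong to the previous episode
    #                 running = 0.
    #         return {'return': returns}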

    def next_batch(self, batch_size, standardize_advantage=True):
        rand_idx = np.random.choice(self.raw_data['observation'].shape[0], batch_size)

        # Maybe re-compute advantage here, but only on rand_idx -- yet how would the feed_dict
        # be constructed then? Left as an idea for an online mode:
        # if self.online:
        #     self.data_batch.update(self.apply_advantage_estimation_function(rand_idx))

        feed_dict = {}
        for key, placeholder in self.required_placeholders.items():
            found, data_key = utils.internal_key_match(key, self.raw_data.keys())
            if found:
                feed_dict[placeholder] = self.raw_data[data_key][rand_idx]
            else:
                found, data_key = utils.internal_key_match(key, self.data.keys())
                if found:
                    feed_dict[placeholder] = self.data[data_key][rand_idx]

            if not found:
                raise TypeError('Placeholder {} has no value to feed!'.format(str(placeholder.name)))

        if standardize_advantage and self.require_advantage:
            advantage_value = feed_dict[self.required_placeholders['advantage']]
            advantage_mean = np.mean(advantage_value)
            advantage_std = np.std(advantage_value)
            if advantage_std < 1e-3:
                logging.warning('advantage_std too small (< 1e-3) for advantage standardization; '
                                'may cause numerical issues')
            feed_dict[self.required_placeholders['advantage']] = (advantage_value - advantage_mean) / advantage_std

        # TODO: maybe move all advantage estimation functions to tf, as in tensorforce (though tensorforce
        # is still not fully understood after reading); maybe tf.stop_gradient for targets/advantages.
        # This would simplify the data collector, as it would only need to collect raw (s, a, r, done) data.

        return feed_dict

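    # The returned feed_dict is intended to be passed straight to a TF session run call, e.g.
    # (illustrative; ``sess``, ``train_op`` and ``training_data`` are assumed to be defined elsewhere):
    #     sess.run(train_op, feed_dict=training_data.next_batch(batch_size=64))
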
    # TODO: this will definitely be refactored with a proper logger
    def statistics(self):
        """
        Compute and print basic statistics (returns, episode lengths) of the currently sampled paths.

        :return: None; the statistics are printed to stdout.
        """
        rewards = self.raw_data['reward']
        episode_start_flags = self.raw_data['end_flag']
        num_timesteps = rewards.shape[0]

        returns = []
        episode_lengths = []
        max_return = 0
        num_episodes = 1
        episode_start_idx = 0
        for i in range(1, num_timesteps):
            if episode_start_flags[i] or (
                    i == num_timesteps - 1):  # found the start of the next episode or the end of all episodes
                if episode_start_flags[i]:
                    num_episodes += 1
                if i < rewards.shape[0] - 1:
                    t = i - 1
                else:
                    t = i
                Gt = 0
                episode_lengths.append(t - episode_start_idx)
                while t >= episode_start_idx:
                    Gt += rewards[t]
                    t -= 1

                returns.append(Gt)
                if Gt > max_return:
                    max_return = Gt
                episode_start_idx = i

        print('AverageReturn: {}'.format(np.mean(returns)))
        print('StdReturn    : {}'.format(np.std(returns)))
        print('NumEpisodes  : {}'.format(num_episodes))
        print('MinMaxReturns: {}..., {}'.format(np.sort(returns)[:3], np.sort(returns)[-3:]))
        print('AverageLength: {}'.format(np.mean(episode_lengths)))
        print('MinMaxLengths: {}..., {}'.format(np.sort(episode_lengths)[:3], np.sort(episode_lengths)[-3:]))