rtz19970824 2017-12-08 23:41:51 +08:00
commit 03a6880050
12 changed files with 659 additions and 240 deletions

1 .gitignore vendored

@@ -3,3 +3,4 @@ leela-zero
*.pyc
parameters
*.swp
*.sublime*


@@ -10,31 +10,31 @@ Tianshou(天授) is a reinforcement learning platform. The following image illus
## core
### Model
    DQN, Policy-Value Network of AlphaGo Zero, PPO-specific, TRPO-specific
### Policy Wrapper
    Stochastic policies (OnehotCategorical, Gaussian), deterministic policies (policy as in DQN, DDPG)
    Specific network architectures from the original papers of DQN, TRPO, A3C, etc.; Policy-Value Network of AlphaGo Zero
### Algorithm
#### Loss design
    Actor-Critic (Variations), DQN (Variations), DDPG, TRPO, PPO
#### losses
    policy gradient (and its variants), DQN (and its variants), DDPG, TRPO, PPO
#### Optimization method
    SGD, ADAM, TRPO, natural gradient, etc.
#### optimizer
    TRPO, natural gradient (and TensorFlow optimizers such as SGD and Adam)
### Planning
    MCTS
## data
    Training style - Monte Carlo or Temporal Difference
    Training style - Batch, Replay (and its variants)
    Reward Reshaping/ Advantage Estimation Function
    Importance weight
    Advantage Estimation Function
    Multithread Read/Write
## environment
    DQN repeat frames etc.
    DQN repeat frames, Reward Reshaping, image preprocessing (not sure where)
## simulator
    Go, Othello/Reversi, Warzone
@@ -43,3 +43,17 @@ Tianshou(天授) is a reinforcement learning platform. The following image illus
## TODO
Parallelization of search-based methods.
YongRen: Policy Wrapper, in order of Gaussian, DQN and DDPG
TongzhengRen: losses, in order of ppo, pg, DQN, DDPG with management of placeholders
YouQiaoben: data/Batch, implement num_timesteps, fix memory growth in num_episodes; adv_estimate.gae_lambda (need to write a value network in tf)
ShihongSong: data/Replay; then adv_estimate.dqn after YongRen's DQN
HaoshengZou: collaborate mainly on Policy and losses; interfaces and architecture
Note: install openai/gym first to run the Atari environment. Interfaces between modules may not be finalized, and the management of placeholders and `feed_dict` may have to be done manually for the time being.
Without preprocessing and other tricks, this example will not train to any meaningful result. Code should pass two tests: the individual module test and a full run of this example code.

89 examples/ppo_example.py Executable file

@@ -0,0 +1,89 @@
#!/usr/bin/env python
import tensorflow as tf
import numpy as np
import time
import gym
# our lib imports here!
import sys
sys.path.append('..')
import tianshou.core.losses as losses
from tianshou.data.Batch import Batch
import tianshou.data.adv_estimate as adv_estimate
import tianshou.core.policy as policy
def policy_net(obs, act_dim, scope=None):
"""
Constructs the policy network. NOT NEEDED IN THE LIBRARY! this is pure tf
:param obs: Placeholder for the observation. A tensor of shape (bs, x, y, channels)
:param act_dim: int. The number of actions.
:param scope: str. Specifying the scope of the variables.
"""
# with tf.variable_scope(scope):
net = tf.layers.conv2d(obs, 16, 8, 4, 'valid', activation=tf.nn.relu)
net = tf.layers.conv2d(net, 32, 4, 2, 'valid', activation=tf.nn.relu)
net = tf.layers.flatten(net)
net = tf.layers.dense(net, 256, activation=tf.nn.relu)
act_logits = tf.layers.dense(net, act_dim)
return act_logits
if __name__ == '__main__': # a clean version with only policy net, no value net
env = gym.make('PongNoFrameskip-v4')
obs_dim = env.observation_space.shape
act_dim = env.action_space.n
clip_param = 0.2
nb_batches = 2
# 1. build network with pure tf
obs = tf.placeholder(tf.float32, shape=(None,) + obs_dim) # network input
with tf.variable_scope('pi'):
act_logits = policy_net(obs, act_dim, 'pi')
train_var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) # TODO: better management of TRAINABLE_VARIABLES
with tf.variable_scope('pi_old'):
act_logits_old = policy_net(obs, act_dim, 'pi_old')
# 2. build losses, optimizers
pi = policy.OnehotCategorical(act_logits, obs_placeholder=obs) # YongRen: policy.Gaussian (could reference the policy in TRPO paper, my code is adapted from zhusuan.distributions) policy.DQN etc.
# for a continuous action space, you may need to switch to a different environment
pi_old = policy.OnehotCategorical(act_logits_old, obs_placeholder=obs)
act = tf.placeholder(dtype=tf.int32, shape=[None]) # batch of integer actions
Dgrad = tf.placeholder(dtype=tf.float32, shape=[None]) # return/advantage values used to weight the policy gradient
ppo_loss_clip = losses.ppo_clip(act, Dgrad, clip_param, pi, pi_old) # TongzhengRen: losses.vpg ... management of placeholders and feed_dict
total_loss = ppo_loss_clip
optimizer = tf.train.AdamOptimizer(1e-3)
train_op = optimizer.minimize(total_loss, var_list=train_var_list)
# 3. define data collection
training_data = Batch(env, pi, adv_estimate.full_return) # YouQiaoben: finish and polish Batch, adv_estimate.gae_lambda as in PPO paper
# ShihongSong: Replay(env, pi, adv_estimate.target_network), use your ReplayMemory, interact as follows. Simplify your adv_estimate.dqn to run before YongRen's DQN
# maybe a dict to manage the elements to be collected
# 4. start training
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
minibatch_count = 0
collection_count = 0
while True: # until some stopping criterion met...
# collect data
training_data.collect(num_episodes=2) # YouQiaoben, ShihongSong
collection_count += 1
print('Collected {} times.'.format(collection_count))
# update network
for _ in range(nb_batches):
data = training_data.next_batch(64) # YouQiaoben, ShihongSong
# TODO: auto managing of the placeholders? or add this to params of data.Batch
sess.run(train_op, feed_dict={obs: data['obs'], act: data['acs'], Dgrad: data['Gts']})
minibatch_count += 1
print('Trained {} minibatches.'.format(minibatch_count))


@@ -1,239 +1,21 @@
# MCTS
# policy
This is an implementation of Monte Carlo Tree Search for various reinforcement learning applications.
YongRen
## MCTS with deterministic environments
### base, stochastic
The agent interacts with a deterministic environment; that is, the next state and reward for a state-action pair are deterministic.
follow OnehotCategorical to write Gaussian, can be in the same file as stochastic.py
### Node
### deterministic
Action nodes are not needed here since every state-action pair only leads to one state.
Not sure how to write this yet, but it should at least have an act() method to interact with the environment
Elements for a node:
DQN should have an effective argmax_{actions}() method to use as a value network
+ state: the state for current node
+ parent node: the parent node of this node on the tree
+ parent action: the action that leads to this node
+ children: the next states the agent can reach by choosing an action from this node
+ prior: some external information for this node (defaults to uniform)
Optional elements (for UCT or Thompson Sampling):
+ W: the list of sums of sampled values collected for each child node (for UCT)
+ N: the list of visit counts for each child node (for UCT)
+ Q: the estimated value of each child node, i.e. W/N (for UCT)
+ U: the upper-bound value for each child node (for UCT)
+ R: the list of one-step rewards of each child node
+ alpha, beta: parameters of the posterior distribution over the value of each child node (for Thompson Sampling, Beta distribution)
+ mu, sigma: parameters of the posterior distribution over the value of each child node (for Thompson Sampling, Gaussian distribution)
# losses
### Selection
In the selection part, an action is chosen for the current node.
TongzhengRen
If the action has been chosen before, go to the corresponding child node and continue selection.
If not, stop selection and start expansion.
### Expansion
Send the state-action pair to the simulator, which returns the next state and a reward. Then initialize a new node with the next state; prior information may be supplied at initialization.
Then go to rollout.
### Rollout
At the new leaf node, use a quick policy to play the game to some terminal state and return the collected reward along the trajectory to the leaf node. Use this collected reward to initialize the value of this node.
Another way is to send this state to some external estimator and use the returned result to initialize the value of this node.
Then turn to backpropagation to send this value back up.
### Backpropagation
From the leaf node to the root node, update all nodes that have been passed in this iteration.
For each node, a value is returned by its child node. Add this value (possibly multiplied by a discounting factor gamma) to the stored reward for that child to get a new value. The new value is used to update the Q value of the corresponding child.
For UCT methods, the new value is added to W and N is incremented by 1; Q and U can then be computed.
For Thompson Sampling, the new value is treated as a sample to update the posterior distribution.
Then return the new value to the node's parent.
Backpropagation stops when the root node is reached. Then selection starts again.
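To make the bookkeeping above concrete, here is a minimal sketch of a UCT state node and the backpropagation step for the deterministic case (illustrative only; class and attribute names are not part of the library):

    import math


    class UCTNode(object):
        """A state node storing the per-child W/N/R statistics described above."""

        def __init__(self, state, parent=None, parent_action=None):
            self.state = state
            self.parent = parent
            self.parent_action = parent_action
            self.children = {}  # action -> child UCTNode
            self.R = {}         # action -> one-step reward returned by the simulator
            self.W = {}         # action -> sum of sampled values
            self.N = {}         # action -> visit count

        def select_action(self, actions, c=1.4):
            """UCT selection: maximize Q + U; untried actions are selected first."""
            total = sum(self.N.get(a, 0) for a in actions)
            best_action, best_score = None, -float('inf')
            for a in actions:
                if self.N.get(a, 0) == 0:
                    return a
                q = self.W[a] / self.N[a]
                u = c * math.sqrt(math.log(total) / self.N[a])
                if q + u > best_score:
                    best_action, best_score = a, q + u
            return best_action


    def backpropagate(leaf, value, gamma=1.0):
        """Send the rollout value back up, adding the stored one-step rewards."""
        node, action = leaf.parent, leaf.parent_action
        while node is not None:
            value = node.R.get(action, 0.0) + gamma * value
            node.W[action] = node.W.get(action, 0.0) + value
            node.N[action] = node.N.get(action, 0) + 1
            node, action = node.parent, node.parent_action
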
## MCTS with random environments
The agent interacts with a random environment; that is, the next state and reward for a state-action pair are not deterministic. We do not know the hidden dynamics and reward distribution; we can only draw samples from the simulator.
### Node
Both state nodes and action nodes are needed here.
#### State nodes
Elements for a state node:
+ state: the state for current node
+ parent node: the parent action node of this node on the tree
+ children: the action nodes chosen from this node
+ prior: some external information for this node (defaults to uniform)
Optional elements (for UCT or Thompson Sampling):
+ W: the list of sums of sampled values collected for each child action node (for UCT)
+ N: the list of visit counts for each child action node (for UCT)
+ Q: the estimated value of each child action node, i.e. W/N (for UCT)
+ U: the upper-bound value for each child action node (for UCT)
+ R: the list of expected one-step rewards of each child action node
+ alpha, beta: parameters of the posterior distribution over the value of each child node (for Thompson Sampling, Beta distribution)
+ mu, sigma: parameters of the posterior distribution over the value of each child node (for Thompson Sampling, Gaussian distribution)
#### Action nodes
Elements for an action node:
+ action: the action for current node
+ parent node: the parent state node of this node on the tree
+ children: the state nodes sampled by this action
Optional elements (for UCT or Thompson Sampling):
+ V: the estimated value of the children state nodes (for UCT)
+ N: the number of times each child state node has been sampled (for UCT)
### Selection
In the selection part, an action is chosen for the current state node. The state-action pair is then sent to the simulator, which returns the next state and a reward.
If the next state has been seen from this action node before, go to the corresponding child node and continue selection.
If not, stop selection and start expansion.
### Expansion
Initialize a new node with the next state and add it to the children of the parent action node. Then generate and initialize all possible children of this node.
The prior information may be given for initialization.
Then go to rollout.
### Rollout
At the new state node, choose one action as a leaf node. Then use a quick policy to play the game to some terminal state and return the collected reward along the trajectory to the leaf node.
Then turn to backpropagation to send this value back up.
### Backpropagation
From the leaf node to the root node, update all nodes that have been passed in this iteration.
#### For state nodes
For each state node, a V value is returned by its child action node. Add this value (possibly multiplied by a discounting factor gamma) to the stored expected one-step reward for that action node to get a new value. The new value is used to update the statistics for that child action node.
For UCT methods, the new value is added to W and N is incremented by 1; Q and U can then be computed.
For Thompson Sampling, the new value is treated as a sample to update the posterior distribution.
Then return the new value to the parent action node.
#### For action nodes
For an action node, a Q value is returned by its child state node.
For UCT, update the running average V with this Q value and increment N by 1.
For Thompson Sampling, simply pass the Q value on to the parent state node as a sample.
Backpropagation stops when the root node is reached. Then selection starts again.
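As a minimal illustration of the alternating update described above, the two node types and the backpropagation step might look as follows (a sketch only; names are illustrative, not part of the library):

    class ActionNode(object):
        def __init__(self, action, parent_state_node):
            self.action = action
            self.parent = parent_state_node
            self.children = {}       # sampled next state -> StateNode
            self.V, self.N = 0.0, 0  # running value estimate and visit count


    class StateNode(object):
        def __init__(self, state, parent_action_node=None):
            self.state = state
            self.parent = parent_action_node
            self.children = {}                   # action -> ActionNode
            self.W, self.N, self.R = {}, {}, {}  # per-action sums, counts, expected rewards


    def backpropagate(state_node, action, value, gamma=1.0):
        """Alternate between state nodes and action nodes while walking to the root."""
        while state_node is not None:
            value = state_node.R.get(action, 0.0) + gamma * value
            state_node.W[action] = state_node.W.get(action, 0.0) + value
            state_node.N[action] = state_node.N.get(action, 0) + 1
            action_node = state_node.parent
            if action_node is None:  # reached the root state node
                break
            action_node.N += 1
            action_node.V += (value - action_node.V) / action_node.N  # new averaged V
            state_node, action = action_node.parent, action_node.action
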
## MCTS for POMDPs
The agent interacts with a partially observed environment; that is, we can only see observations and choose actions. A simulator is needed here: every time we send a state-action pair to the simulator, it returns a new state, an observation and a reward. A prior distribution over states is needed for the root node.
### Node
We use observation nodes and action nodes here.
#### Observation nodes
Elements for an observation node:
+ observation: the observation for current node
+ parent node: the parent action node of this node on the tree
+ children: the action nodes chosen from this node
+ prior: some external information for this node (defaults to uniform)
Optional elements (for UCT or Thompson Sampling):
+ h: history information for this node
+ W: the list of sums of sampled values collected for each child action node (for UCT)
+ N: the list of visit counts for each child action node (for UCT)
+ Q: the estimated value of each child action node, i.e. W/N (for UCT)
+ U: the upper-bound value for each child action node (for UCT)
+ R: the list of expected one-step rewards of each child action node
+ alpha, beta: parameters of the posterior distribution over the value of each child node (for Thompson Sampling, Beta distribution)
+ mu, sigma: parameters of the posterior distribution over the value of each child node (for Thompson Sampling, Gaussian distribution)
#### Action nodes
Elements for an action node:
+ action: the action for current node
+ parent node: the parent observation node of this node on the tree
+ children: the observation nodes sampled by this action
Optional elements (for UCT or Thompson Sampling):
+ V: the estimated value of the children observation nodes (for UCT)
+ N: the number of times each child observation node has been sampled (for UCT)
### Selection
In the selection part, we first need to sample a state from the root node's prior distribution.
At each observation node, an action *a* is chosen. Here we have a state *s* for this observation. Then we send the *(s,a)* pair to the simulator and then a new state *s'* and a reward are returned by the simulator.
If the next state has been seen from this action node before, go to the corresponding child node with the new state *s'* and continue selection.
If not, stop selection and start expansion.
### Expansion
Initialize a new observation node and add it to the children of its parent action node. Then generate and initialize all possible children of this node.
The prior information may be given for initialization.
Then go to rollout.
### Rollout
At the new observation node, choose one action as a leaf node. Then use a quick policy to play the game to some terminal state and return the collected reward along the trajectory to the leaf node.
Then turn to backpropagation to send this value back up.
### Backpropagation
From the leaf node to the root node, update all nodes that have been passed in this iteration.
#### For observation nodes
For each observation node, a V value is returned by its child action node. Add this value (possibly multiplied by a discounting factor gamma) to the stored expected one-step reward for that action node to get a new value. The new value is used to update the statistics for that child action node.
For UCT methods, the new value is added to W and N is incremented by 1; Q and U can then be computed.
For Thompson Sampling, the new value is treated as a sample to update the posterior distribution.
Then return the new value to the parent action node.
#### For action nodes
For an action node, a Q value is returned by its child observation node.
For UCT, update the running average V with this Q value and increment N by 1.
For Thompson Sampling, simply pass the Q value on to the parent observation node as a sample.
Backpropagation stops when the root node is reached. Then selection starts again.
These seem to be plain Python functions, though the management of placeholders may require some discussion; they may also be written in a functional form.

25 tianshou/core/losses.py Normal file

@@ -0,0 +1,25 @@
import tensorflow as tf
import baselines.common.tf_util as U
def ppo_clip(sampled_action, Dgrad, clip_param, pi, pi_old):
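    """
    Clipped surrogate objective of PPO, negated for minimization:
        L = -E[ min(r_t * A_t, clip(r_t, 1 - clip_param, 1 + clip_param) * A_t) ],
    where r_t = pi(a_t | s_t) / pi_old(a_t | s_t) and Dgrad carries the advantage/return estimates A_t.
    """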
log_pi_act = pi.log_prob(sampled_action)
log_pi_old_act = pi_old.log_prob(sampled_action)
ratio = tf.exp(log_pi_act - log_pi_old_act)
clipped_ratio = tf.clip_by_value(ratio, 1. - clip_param, 1. + clip_param)
ppo_clip_loss = -tf.reduce_mean(tf.minimum(ratio * Dgrad, clipped_ratio * Dgrad))
return ppo_clip_loss
def L_VF(Gt, pi, St): # TODO: do we really have to specify St, or it's implicit in policy/value net
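    """Squared-error value-function loss: mean of (V(s_t) - G_t)^2; assumes pi exposes a `vpred` value-prediction tensor."""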
return U.mean(tf.square(pi.vpred - Gt))
def entropy_reg(pi):
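    """Entropy regularizer: the negative mean entropy of the policy (to be added to the total loss); assumes a baselines-style `pd` distribution attribute."""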
return - U.mean(pi.pd.entropy())
def KL_diff(pi, pi_old):
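    """Mean KL divergence between pi_old and pi; assumes baselines-style `pd` distribution attributes."""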
kloldnew = pi_old.pd.kl(pi.pd)
meankl = U.mean(kloldnew)
return meankl


@@ -0,0 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from .base import *
from .stochastic import *


@@ -0,0 +1,211 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
import warnings
import tensorflow as tf
# from zhusuan.utils import add_name_scope
__all__ = [
'StochasticPolicy',
]
class StochasticPolicy(object):
"""
The :class:`StochasticPolicy` class is the base class for various stochastic
policies, which support batch inputs, generating batches of actions and
evaluating probabilities (or log probabilities) at batches of sampled actions.
The typical input shape for a :class:`StochasticPolicy` is
``batch_shape + input_shape``, where ``input_shape`` represents the shape
of the non-batch input parameter and :attr:`batch_shape` represents how many
independent inputs are fed into the policy.
Samples generated are of shape
``([n_samples]+ )batch_shape + value_shape``. The first additional axis
is omitted only when passed `n_samples` is None (by default), in which
case one sample is generated. :attr:`value_shape` is the non-batch value
shape of the distribution. For a univariate distribution, its
:attr:`value_shape` is [].
There are cases where a batch of random variables are grouped into a
single event so that their probabilities should be computed together. This
is achieved by setting `group_ndims` argument, which defaults to 0.
The last `group_ndims` number of axes in :attr:`batch_shape` are
grouped into a single event. For example,
``Normal(..., group_ndims=1)`` will set the last axis of its
:attr:`batch_shape` to a single event, i.e., a multivariate Normal with
identity covariance matrix.
When evaluating probabilities at given values, the given Tensor should be
broadcastable to shape ``(... + )batch_shape + value_shape``. The returned
Tensor has shape ``(... + )batch_shape[:-group_ndims]``.
.. seealso::
:doc:`/concepts`
For both discrete and continuous policies, the parameter `dtype` represents
the type of samples. For discrete policies it can be set by the user; for
continuous ones it is determined automatically from the parameter types.
The value type of `prob` and `log_prob` will be `param_dtype`, which is
deduced from the parameter(s) at initialization. `dtype` must be
among `int16`, `int32`, `int64`, `float16`, `float32` and `float64`.
When two or more parameters are tensors with different types, a `TypeError`
will be raised.
:param act_dtype: The data type of actions sampled from the policy.
:param param_dtype: The parameter(s) type of the underlying distribution.
:param is_continuous: Whether the action space is continuous.
:param obs_placeholder: The placeholder for observations, which is fed when
    sampling actions.
:param group_ndims: A 0-D `int32` Tensor representing the number of
    dimensions in :attr:`batch_shape` (counted from the end) that are
    grouped into a single event, so that their probabilities are calculated
    together. Default is 0, which means a single value is an event.
    See above for more detailed explanation.
"""
def __init__(self,
act_dtype,
param_dtype,
is_continuous,
obs_placeholder,
group_ndims=0, # maybe useful for repeat_action
**kwargs):
self._act_dtype = act_dtype
self._param_dtype = param_dtype
self._is_continuous = is_continuous
self._obs_placeholder = obs_placeholder
if isinstance(group_ndims, int):
if group_ndims < 0:
raise ValueError("group_ndims must be non-negative.")
self._group_ndims = group_ndims
else:
group_ndims = tf.convert_to_tensor(group_ndims, tf.int32)
_assert_rank_op = tf.assert_rank(
group_ndims, 0,
message="group_ndims should be a scalar (0-D Tensor).")
_assert_nonnegative_op = tf.assert_greater_equal(
group_ndims, 0,
message="group_ndims must be non-negative.")
with tf.control_dependencies([_assert_rank_op,
_assert_nonnegative_op]):
self._group_ndims = tf.identity(group_ndims)
@property
def act_dtype(self):
"""The sample data type of the policy."""
return self._act_dtype
@property
def param_dtype(self):
"""The parameter(s) type of the distribution."""
return self._param_dtype
@property
def is_continuous(self):
"""Whether the distribution is continuous."""
return self._is_continuous
@property
def group_ndims(self):
"""
The number of dimensions in :attr:`batch_shape` (counted from the end)
that are grouped into a single event, so that their probabilities are
calculated together. See `Distribution` for more detailed explanation.
"""
return self._group_ndims
# @add_name_scope
    def act(self, observation):
        """
        act(observation)

        Sample a single action for the given observation by delegating to the
        subclass's :meth:`_act`.

        :param observation: A single observation (without the batch axis); it is
            fed to the policy's observation placeholder.
        :return: A sampled action.
        """
        return self._act(observation)
def _act(self, observation):
"""
Private method for subclasses to rewrite the :meth:`act` method.
"""
raise NotImplementedError()
# @add_name_scope
def log_prob(self, sampled_action):
"""
log_prob(sampled_action)
Compute the log probability density (mass) function at the given sampled action.
:param sampled_action: A Tensor. The value at which to evaluate the log probability
density (mass) function. Must be able to broadcast to have a shape
of ``(... + )batch_shape + value_shape``.
:return: A Tensor of shape ``(... + )batch_shape[:-group_ndims]``.
"""
log_p = self._log_prob(sampled_action)
return tf.reduce_sum(log_p, tf.range(-self._group_ndims, 0))
# @add_name_scope
def prob(self, sampled_action):
"""
prob(sampled_action)
Compute the probability density (mass) function at the given sampled action.
:param sampled_action: A Tensor. The value at which to evaluate the probability
density (mass) function. Must be able to broadcast to have a shape
of ``(... + )batch_shape + value_shape``.
:return: A Tensor of shape ``(... + )batch_shape[:-group_ndims]``.
"""
p = self._prob(sampled_action)
return tf.reduce_prod(p, tf.range(-self._group_ndims, 0))
def _log_prob(self, sampled_action):
"""
Private method for subclasses to rewrite the :meth:`log_prob` method.
"""
raise NotImplementedError()
def _prob(self, sampled_action):
"""
Private method for subclasses to rewrite the :meth:`prob` method.
"""
raise NotImplementedError()


@@ -0,0 +1,106 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
import numpy as np
import tensorflow as tf
from .base import StochasticPolicy
__all__ = [
'OnehotCategorical',
'OnehotDiscrete',
]
class OnehotCategorical(StochasticPolicy):
"""
The class of one-hot Categorical distribution.
See :class:`~zhusuan.distributions.base.Distribution` for details.
:param logits: A N-D (N >= 1) `float` Tensor of shape (...,
n_categories). Each slice `[i, j, ..., k, :]` represents the
un-normalized log probabilities for all categories.
.. math:: \\mathrm{logits} \\propto \\log p
:param dtype: The value type of samples from the distribution.
:param group_ndims: A 0-D `int32` Tensor representing the number of
dimensions in `batch_shape` (counted from the end) that are grouped
into a single event, so that their probabilities are calculated
together. Default is 0, which means a single value is an event.
See :class:`~zhusuan.distributions.base.Distribution` for more detailed
explanation.
A single sample is a N-D Tensor with the same shape as logits. Each slice
`[i, j, ..., k, :]` is a one-hot vector of the selected category.
"""
def __init__(self, logits, obs_placeholder, dtype=None, group_ndims=0, **kwargs):
self._logits = tf.convert_to_tensor(logits)
if dtype is None:
dtype = tf.int32
# assert_same_float_and_int_dtype([], dtype)
tf.assert_rank(self._logits, rank=2) # TODO: flexible policy output rank?
self._n_categories = self._logits.get_shape()[-1].value
super(OnehotCategorical, self).__init__(
act_dtype=dtype,
param_dtype=self._logits.dtype,
is_continuous=False,
obs_placeholder=obs_placeholder,
group_ndims=group_ndims,
**kwargs)
@property
def logits(self):
"""The un-normalized log probabilities."""
return self._logits
@property
def n_categories(self):
"""The number of categories in the distribution."""
return self._n_categories
def _act(self, observation):
sess = tf.get_default_session() # TODO: this may be ugly. also maybe huge problem when parallel
sampled_action = sess.run(tf.multinomial(self.logits, num_samples=1), feed_dict={self._obs_placeholder: observation[None]})
sampled_action = sampled_action[0, 0]
return sampled_action
    def _log_prob(self, sampled_action):
        # sampled_action is a batch of integer action indices, so the sparse version applies directly
        return -tf.nn.sparse_softmax_cross_entropy_with_logits(labels=sampled_action, logits=self.logits)
# given = tf.cast(given, self.param_dtype)
# given, logits = maybe_explicit_broadcast(
# given, self.logits, 'given', 'logits')
# if (given.get_shape().ndims == 2) or (logits.get_shape().ndims == 2):
# given_flat = given
# logits_flat = logits
# else:
# given_flat = tf.reshape(given, [-1, self.n_categories])
# logits_flat = tf.reshape(logits, [-1, self.n_categories])
# log_p_flat = -tf.nn.softmax_cross_entropy_with_logits(
# labels=given_flat, logits=logits_flat)
# if (given.get_shape().ndims == 2) or (logits.get_shape().ndims == 2):
# log_p = log_p_flat
# else:
# log_p = tf.reshape(log_p_flat, tf.shape(logits)[:-1])
# if given.get_shape() and logits.get_shape():
# log_p.set_shape(tf.broadcast_static_shape(
# given.get_shape(), logits.get_shape())[:-1])
# return log_p
def _prob(self, sampled_action):
return tf.exp(self._log_prob(sampled_action))
OnehotDiscrete = OnehotCategorical
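# A possible sketch of the planned Gaussian policy (hypothetical, not part of this
# commit), following OnehotCategorical as suggested in the README; attribute and
# argument names are illustrative only.
class Gaussian(StochasticPolicy):
    """Diagonal Gaussian policy parameterized by mean and log standard deviation tensors."""
    def __init__(self, mean, logstd, obs_placeholder, group_ndims=1, **kwargs):
        self._mean = tf.convert_to_tensor(mean, dtype=tf.float32)
        self._logstd = tf.convert_to_tensor(logstd, dtype=tf.float32)
        self._std = tf.exp(self._logstd)
        # reparameterized sample: mean + std * standard normal noise
        self._sampled_action = self._mean + self._std * tf.random_normal(tf.shape(self._mean))
        super(Gaussian, self).__init__(
            act_dtype=tf.float32,
            param_dtype=self._mean.dtype,
            is_continuous=True,
            obs_placeholder=obs_placeholder,
            group_ndims=group_ndims,
            **kwargs)

    def _act(self, observation):
        sess = tf.get_default_session()
        sampled_action = sess.run(self._sampled_action,
                                  feed_dict={self._obs_placeholder: observation[None]})
        return sampled_action[0]

    def _log_prob(self, sampled_action):
        return -0.5 * np.log(2 * np.pi) - self._logstd \
            - 0.5 * tf.square((sampled_action - self._mean) / self._std)

    def _prob(self, sampled_action):
        return tf.exp(self._log_prob(sampled_action))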

127 tianshou/data/Batch.py Normal file

@@ -0,0 +1,127 @@
import numpy as np
import gc
class Batch(object):
"""
class for batch datasets. Collect multiple states (actions, rewards, etc.) on-policy.
"""
def __init__(self, env, pi, adv_estimation_func): # how to name the function?
self.env = env
self.pi = pi
self.adv_estimation_func = adv_estimation_func
self.is_first_collect = True
def collect(self, num_timesteps=0, num_episodes=0, apply_func=True): # specify how many data to collect here, or fix it in __init__()
assert sum([num_timesteps > 0, num_episodes > 0]) == 1, "One and only one collection number specification permitted!"
        if num_timesteps > 0:
            # YouQiaoben: finish/refine this implementation; the minimal sketch below mirrors
            # the num_episodes branch. `current_ob` is a new attribute (assumption) used to
            # resume collection across calls.
            if self.is_first_collect or not hasattr(self, 'current_ob'):
                ob = self.env.reset()
                self.is_first_collect = False
            else:
                ob = self.current_ob  # resume from where the previous collect() stopped

            obs, acs, rews, news = [], [], [], []
            new = True  # marks the first timestep of an episode

            for _ in range(num_timesteps):
                obs.append(ob)
                news.append(new)
                ac = self.pi.act(ob)
                acs.append(ac)
                ob, rew, new, _ = self.env.step(ac)
                rews.append(rew)
                if new:  # episode ended, restart the environment
                    ob = self.env.reset()

            self.current_ob = ob
            self.obs = np.array(obs)
            self.acs = np.array(acs)
            self.rews = np.array(rews)
            self.news = np.array(news)
            self.raw_data = {'obs': self.obs, 'acs': self.acs, 'rews': self.rews, 'news': self.news}
if num_episodes > 0: # YouQiaoben: fix memory growth, both del and gc.collect() fail
# initialize rawdata lists
if not self.is_first_collect:
del self.obs
del self.acs
del self.rews
del self.news
obs = []
acs = []
rews = []
news = []
t_count = 0
for e in range(num_episodes):
ob = self.env.reset()
obs.append(ob)
news.append(True)
while True:
ac = self.pi.act(ob)
acs.append(ac)
ob, rew, done, _ = self.env.step(ac)
rews.append(rew)
t_count += 1
if t_count >= 200: # force episode stop
break
if done: # end of episode, discard s_T
break
else:
obs.append(ob)
news.append(False)
self.obs = np.array(obs)
self.acs = np.array(acs)
self.rews = np.array(rews)
self.news = np.array(news)
del obs
del acs
del rews
del news
self.raw_data = {'obs': self.obs, 'acs': self.acs, 'rews': self.rews, 'news': self.news}
self.is_first_collect = False
if apply_func:
self.apply_adv_estimation_func()
gc.collect()
def apply_adv_estimation_func(self):
self.data = self.adv_estimation_func(self.raw_data)
def next_batch(self, batch_size): # YouQiaoben: reference how other libraries iterate over minibatches
rand_idx = np.random.choice(self.data['obs'].shape[0], batch_size)
return {key: value[rand_idx] for key, value in self.data.items()}

22 tianshou/data/README.md Normal file

@@ -0,0 +1,22 @@
# Batch
YouQiaoben
fix as stated in ppo_example.py
# Replay
ShihongSong
A Replay.py file; it must have collect() and next_batch() methods for training.
Integrate the previous ReplayBuffer code.
# adv_estimate
YouQiaoben (gae_lambda), ShihongSong (dqn after policy.DQN)
These seem to be plain Python functions; they may also be written in a functional form.
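As a rough illustration, a Replay class mirroring the Batch interface might look like the sketch below (all names are tentative and the design is not final):

    import numpy as np

    class Replay(object):
        """Off-policy data collection with a fixed-size replay memory (sketch only)."""
        def __init__(self, env, pi, adv_estimation_func, capacity=100000):
            self.env, self.pi = env, pi
            self.adv_estimation_func = adv_estimation_func
            self.capacity = capacity
            self.memory = []  # list of (ob, ac, rew, done) tuples
            self._ob = self.env.reset()

        def collect(self, num_timesteps=1):
            for _ in range(num_timesteps):
                ac = self.pi.act(self._ob)
                ob_next, rew, done, _ = self.env.step(ac)
                self.memory.append((self._ob, ac, rew, done))
                if len(self.memory) > self.capacity:
                    self.memory.pop(0)
                self._ob = self.env.reset() if done else ob_next

        def next_batch(self, batch_size):
            idx = np.random.choice(len(self.memory), batch_size)
            obs, acs, rews, news = zip(*[self.memory[i] for i in idx])
            raw = {'obs': np.array(obs), 'acs': np.array(acs),
                   'rews': np.array(rews), 'news': np.array(news)}
            return self.adv_estimation_func(raw)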



@@ -0,0 +1,37 @@
import numpy as np
def full_return(raw_data):
"""
Naively compute the full undiscounted return for each timestep within its episode.
:param raw_data: dict with keys 'obs', 'acs', 'rews', 'news', as produced by Batch.collect().
"""
obs = raw_data['obs']
acs = raw_data['acs']
rews = raw_data['rews']
news = raw_data['news']
num_timesteps = rews.shape[0]
data = {}
data['obs'] = obs
data['acs'] = acs
Gts = rews.copy()
episode_start_idx = 0
for i in range(1, num_timesteps):
if news[i] or (i == num_timesteps - 1): # found one full episode
if i < rews.shape[0] - 1:
t = i - 1
else:
t = i
Gt = 0
while t >= episode_start_idx:
Gt += rews[t]
Gts[t] = Gt
t -= 1
episode_start_idx = i
data['Gts'] = Gts
return data
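# Illustrative sanity check (not part of the library): two episodes of lengths 3
# and 2 with unit rewards give undiscounted returns [3, 2, 1] and [2, 1].
if __name__ == '__main__':
    toy_raw_data = {
        'obs': np.zeros((5, 1)),
        'acs': np.zeros(5, dtype='int32'),
        'rews': np.ones(5, dtype='float32'),
        'news': np.array([True, False, False, True, False]),
    }
    print(full_return(toy_raw_data)['Gts'])  # [3. 2. 1. 2. 1.]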