rtz19970824 2017-12-08 23:41:51 +08:00
commit 03a6880050
12 changed files with 659 additions and 240 deletions

1 .gitignore vendored

@@ -3,3 +3,4 @@ leela-zero
*.pyc
parameters
*.swp
*.sublime*


@@ -10,31 +10,31 @@ Tianshou(天授) is a reinforcement learning platform. The following image illus
## core
### Model
    DQN, Policy-Value Network of AlphaGo Zero, PPO-specific, TRPO-specific
### Policy Wrapper
    Stochastic policies (OnehotCategorical, Gaussian), deterministic policies (policy as in DQN, DDPG)
    Specific network architectures from the original papers of DQN, TRPO, A3C, etc.; Policy-Value Network of AlphaGo Zero
### Algorithm
#### Loss design
    Actor-Critic (Variations), DQN (Variations), DDPG, TRPO, PPO
#### losses
    policy gradient (and its variants), DQN (and its variants), DDPG, TRPO, PPO
#### Optimization method
    SGD, ADAM, TRPO, natural gradient, etc.
#### optimizer
    TRPO, natural gradient (and TensorFlow optimizers such as SGD and Adam)
### Planning
    MCTS
## data
    Training style - Monte Carlo or Temporal Difference
    Training style - Batch, Replay (and its variants)
    Reward Reshaping/ Advantage Estimation Function
    Importance weight
    Advantage Estimation Function
    Multithread Read/Write
## environment
    DQN repeat frames etc.
    DQN repeat frames, Reward Reshaping, image preprocessing (not sure where)
## simulator
    Go, Othello/Reversi, Warzone
@@ -43,3 +43,17 @@ Tianshou(天授) is a reinforcement learning platform. The following image illus
## TODO
Parallelization of search-based methods.
YongRen: Policy Wrapper, in order of Gaussian, DQN and DDPG
TongzhengRen: losses, in order of ppo, pg, DQN, DDPG with management of placeholders
YouQiaoben: data/Batch, implement num_timesteps, fix memory growth in num_episodes; adv_estimate.gae_lambda (need to write a value network in tf)
ShihongSong: data/Replay; then adv_estimate.dqn after YongRen's DQN
HaoshengZou: collaborate mainly on Policy and losses; interfaces and architecture
Note: install openai/gym first to run the Atari environment. Interfaces between modules may not be finalized, and the management of placeholders and `feed_dict` may have to be done manually for the time being.
Without preprocessing and other tricks, this example will not train to any meaningful result. Code should pass two tests: the individual module test and a full run of this example code.

89 examples/ppo_example.py Executable file

@@ -0,0 +1,89 @@
#!/usr/bin/env python
import tensorflow as tf
import numpy as np
import time
import gym
# our lib imports here!
import sys
sys.path.append('..')
import tianshou.core.losses as losses
from tianshou.data.Batch import Batch
import tianshou.data.adv_estimate as adv_estimate
import tianshou.core.policy as policy
def policy_net(obs, act_dim, scope=None):
"""
Constructs the policy network. NOT NEEDED IN THE LIBRARY! this is pure tf
:param obs: Placeholder for the observation. A tensor of shape (bs, x, y, channels)
:param act_dim: int. The number of actions.
:param scope: str. Specifying the scope of the variables.
"""
# with tf.variable_scope(scope):
net = tf.layers.conv2d(obs, 16, 8, 4, 'valid', activation=tf.nn.relu)
net = tf.layers.conv2d(net, 32, 4, 2, 'valid', activation=tf.nn.relu)
net = tf.layers.flatten(net)
net = tf.layers.dense(net, 256, activation=tf.nn.relu)
act_logits = tf.layers.dense(net, act_dim)
return act_logits
if __name__ == '__main__': # a clean version with only policy net, no value net
env = gym.make('PongNoFrameskip-v4')
obs_dim = env.observation_space.shape
act_dim = env.action_space.n
clip_param = 0.2
nb_batches = 2
# 1. build network with pure tf
obs = tf.placeholder(tf.float32, shape=(None,) + obs_dim) # network input
with tf.variable_scope('pi'):
act_logits = policy_net(obs, act_dim, 'pi')
train_var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) # TODO: better management of TRAINABLE_VARIABLES
with tf.variable_scope('pi_old'):
act_logits_old = policy_net(obs, act_dim, 'pi_old')
# 2. build losses, optimizers
pi = policy.OnehotCategorical(act_logits, obs_placeholder=obs) # YongRen: policy.Gaussian (could reference the policy in TRPO paper, my code is adapted from zhusuan.distributions) policy.DQN etc.
# for a continuous action space, you may need to switch to a different environment
pi_old = policy.OnehotCategorical(act_logits_old, obs_placeholder=obs)
act = tf.placeholder(dtype=tf.int32, shape=[None]) # batch of integer actions
Dgrad = tf.placeholder(dtype=tf.float32, shape=[None]) # return/advantage values used to weight the policy gradient
ppo_loss_clip = losses.ppo_clip(act, Dgrad, clip_param, pi, pi_old) # TongzhengRen: losses.vpg ... management of placeholders and feed_dict
total_loss = ppo_loss_clip
optimizer = tf.train.AdamOptimizer(1e-3)
train_op = optimizer.minimize(total_loss, var_list=train_var_list)
# 3. define data collection
training_data = Batch(env, pi, adv_estimate.full_return) # YouQiaoben: finish and polish Batch, adv_estimate.gae_lambda as in PPO paper
# ShihongSong: Replay(env, pi, adv_estimate.target_network), use your ReplayMemory, interact as follows. Simplify your adv_estimate.dqn to run before YongRen's DQN
# maybe a dict to manage the elements to be collected
# 4. start training
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
minibatch_count = 0
collection_count = 0
while True: # until some stopping criterion met...
# collect data
training_data.collect(num_episodes=2) # YouQiaoben, ShihongSong
collection_count += 1
print('Collected {} times.'.format(collection_count))
# update network
for _ in range(nb_batches):
data = training_data.next_batch(64) # YouQiaoben, ShihongSong
# TODO: auto managing of the placeholders? or add this to params of data.Batch
sess.run(train_op, feed_dict={obs: data['obs'], act: data['acs'], Dgrad: data['Gts']})
minibatch_count += 1
print('Trained {} minibatches.'.format(minibatch_count))


@@ -1,239 +1,21 @@
# MCTS
# policy
This is an implementation of Monte Carlo Tree Search for various reinforcement learning applications.
YongRen
## MCTS with deterministic environments
### base, stochastic
The agent interacts with a deterministic environment; that is, the next state and reward for a state-action pair are deterministic.
follow OnehotCategorical to write Gaussian, can be in the same file as stochastic.py
### Node
### deterministic
Action nodes are not needed here since every state-action pair only leads to one state.
Not sure how to write this yet, but it should at least have an act() method to interact with the environment
Elements for a node:
DQN should have an effective argmax_{actions}() method to use as a value network
+ state: the state for current node
+ parent node: the parent node of this node on the tree
+ parent action: the action that leads to this node
+ children: the next states the agent can reach by choosing an action from this node
+ prior: some external information for this node (defaults to uniform)
Optional elements (for UCT or Thompson Sampling):
+ W: the list of sums of sampled values collected for each child node (for UCT)
+ N: the list of visit counts for each child node (for UCT)
+ Q: the estimated value of each child node, i.e. W/N (for UCT)
+ U: the upper-bound value for each child node (for UCT)
+ R: the list of one-step rewards of each child node
+ alpha, beta: parameters of the posterior distribution over the value of each child node (for Thompson Sampling, Beta distribution)
+ mu, sigma: parameters of the posterior distribution over the value of each child node (for Thompson Sampling, Gaussian distribution)
# losses
### Selection
In the selection part, an action is chosen for the current node.
TongzhengRen
If the action has been chosen before, go to the corresponding child node and continue selection.
If not, stop selection and start expansion.
### Expansion
Send the state-action pair to the simulator, which returns the next state and a reward. Then initialize a new node with the next state; prior information may be supplied at initialization.
Then go to rollout.
### Rollout
At the new leaf node, use a quick policy to play the game to some terminal state and return the collected reward along the trajectory to the leaf node. Use this collected reward to initialize the value of this node.
Another way is to send this state to some external estimator and use the returned result to initialize the value of this node.
Then turn to backpropagation to send this value back up.
### Backpropagation
From the leaf node to the root node, update all nodes that have been passed in this iteration.
For each node, a value is returned by its child node. Add this value (possibly multiplied by a discounting factor gamma) to the stored reward for that child to get a new value. The new value is used to update the Q value of the corresponding child.
For UCT methods, the new value is added to W and N is incremented by 1; Q and U can then be computed.
For Thompson Sampling, the new value is treated as a sample to update the posterior distribution.
Then return the new value to the node's parent.
Backpropagation stops when the root node is reached. Then selection starts again.
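To make the bookkeeping above concrete, here is a minimal sketch of a UCT state node and the backpropagation step for the deterministic case (illustrative only; class and attribute names are not part of the library):

    import math


    class UCTNode(object):
        """A state node storing the per-child W/N/R statistics described above."""

        def __init__(self, state, parent=None, parent_action=None):
            self.state = state
            self.parent = parent
            self.parent_action = parent_action
            self.children = {}  # action -> child UCTNode
            self.R = {}         # action -> one-step reward returned by the simulator
            self.W = {}         # action -> sum of sampled values
            self.N = {}         # action -> visit count

        def select_action(self, actions, c=1.4):
            """UCT selection: maximize Q + U; untried actions are selected first."""
            total = sum(self.N.get(a, 0) for a in actions)
            best_action, best_score = None, -float('inf')
            for a in actions:
                if self.N.get(a, 0) == 0:
                    return a
                q = self.W[a] / self.N[a]
                u = c * math.sqrt(math.log(total) / self.N[a])
                if q + u > best_score:
                    best_action, best_score = a, q + u
            return best_action


    def backpropagate(leaf, value, gamma=1.0):
        """Send the rollout value back up, adding the stored one-step rewards."""
        node, action = leaf.parent, leaf.parent_action
        while node is not None:
            value = node.R.get(action, 0.0) + gamma * value
            node.W[action] = node.W.get(action, 0.0) + value
            node.N[action] = node.N.get(action, 0) + 1
            node, action = node.parent, node.parent_action
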
## MCTS with random environments
The agent interacts with a random environment; that is, the next state and reward for a state-action pair are not deterministic. We do not know the hidden dynamics and reward distribution; we can only draw samples from the simulator.
### Node
Both state nodes and action nodes are needed here.
#### State nodes
Elements for a state node:
+ state: the state for current node
+ parent node: the parent action node of this node on the tree
+ children: the action nodes chosen from this node
+ prior: some external information for this node (defaults to uniform)
Optional elements (for UCT or Thompson Sampling):
+ W: the list of sums of sampled values collected for each child action node (for UCT)
+ N: the list of visit counts for each child action node (for UCT)
+ Q: the estimated value of each child action node, i.e. W/N (for UCT)
+ U: the upper-bound value for each child action node (for UCT)
+ R: the list of expected one-step rewards of each child action node
+ alpha, beta: parameters of the posterior distribution over the value of each child node (for Thompson Sampling, Beta distribution)
+ mu, sigma: parameters of the posterior distribution over the value of each child node (for Thompson Sampling, Gaussian distribution)
#### Action nodes
Elements for an action node:
+ action: the action for current node
+ parent node: the parent state node of this node on the tree
+ children: the state nodes sampled by this action
Optional elements (for UCT or Thompson Sampling):
+ V: the estimated value of the children state nodes (for UCT)
+ N: the number of times each child state node has been sampled (for UCT)
### Selection
In the selection part, an action is chosen for the current state node. The state-action pair is then sent to the simulator, which returns the next state and a reward.
If the next state has been seen from this action node before, go to the corresponding child node and continue selection.
If not, stop selection and start expansion.
### Expansion
Initialize a new node with the next state and add it to the children of the parent action node. Then generate and initialize all possible children of this node.
The prior information may be given for initialization.
Then go to rollout.
### Rollout
At the new state node, choose one action as a leaf node. Then use a quick policy to play the game to some terminal state and return the collected reward along the trajectory to the leaf node.
Then turn to backpropagation to send this value back up.
### Backpropagation
From the leaf node to the root node, update all nodes that have been passed in this iteration.
#### For state nodes
For each state node, a V value is returned by its child action node. Add this value (possibly multiplied by a discounting factor gamma) to the stored expected one-step reward for that action node to get a new value. The new value is used to update the statistics for that child action node.
For UCT methods, the new value is added to W and N is incremented by 1; Q and U can then be computed.
For Thompson Sampling, the new value is treated as a sample to update the posterior distribution.
Then return the new value to the parent action node.
#### For action nodes
For an action node, a Q value is returned by its child state node.
For UCT, update the running average V with this Q value and increment N by 1.
For Thompson Sampling, simply pass the Q value on to the parent state node as a sample.
Backpropagation stops when the root node is reached. Then selection starts again.
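As a minimal illustration of the alternating update described above, the two node types and the backpropagation step might look as follows (a sketch only; names are illustrative, not part of the library):

    class ActionNode(object):
        def __init__(self, action, parent_state_node):
            self.action = action
            self.parent = parent_state_node
            self.children = {}       # sampled next state -> StateNode
            self.V, self.N = 0.0, 0  # running value estimate and visit count


    class StateNode(object):
        def __init__(self, state, parent_action_node=None):
            self.state = state
            self.parent = parent_action_node
            self.children = {}                   # action -> ActionNode
            self.W, self.N, self.R = {}, {}, {}  # per-action sums, counts, expected rewards


    def backpropagate(state_node, action, value, gamma=1.0):
        """Alternate between state nodes and action nodes while walking to the root."""
        while state_node is not None:
            value = state_node.R.get(action, 0.0) + gamma * value
            state_node.W[action] = state_node.W.get(action, 0.0) + value
            state_node.N[action] = state_node.N.get(action, 0) + 1
            action_node = state_node.parent
            if action_node is None:  # reached the root state node
                break
            action_node.N += 1
            action_node.V += (value - action_node.V) / action_node.N  # new averaged V
            state_node, action = action_node.parent, action_node.action
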
## MCTS for POMDPs
The agent interacts with a partially observed environment; that is, we can only see observations and choose actions. A simulator is needed here: every time we send a state-action pair to the simulator, it returns a new state, an observation and a reward. A prior distribution over states is needed for the root node.
### Node
We use observation nodes and action nodes here.
#### Observation nodes
Elements for an observation node:
+ observation: the observation for current node
+ parent node: the parent action node of this node on the tree
+ children: the action nodes chosen from this node
+ prior: some external information for this node (defaults to uniform)
Optional elements (for UCT or Thompson Sampling):
+ h: history information for this node
+ W: the list of sums of sampled values collected for each child action node (for UCT)
+ N: the list of visit counts for each child action node (for UCT)
+ Q: the estimated value of each child action node, i.e. W/N (for UCT)
+ U: the upper-bound value for each child action node (for UCT)
+ R: the list of expected one-step rewards of each child action node
+ alpha, beta: parameters of the posterior distribution over the value of each child node (for Thompson Sampling, Beta distribution)
+ mu, sigma: parameters of the posterior distribution over the value of each child node (for Thompson Sampling, Gaussian distribution)
#### Action nodes
Elements for an action node:
+ action: the action for current node
+ parent node: the parent observation node of this node on the tree
+ children: the observation nodes sampled by this action
Optional elements (for UCT or Thompson Sampling):
+ V: the estimated value of the children observation nodes (for UCT)
+ N: the number of times each child observation node has been sampled (for UCT)
### Selection
In the selection part, we first need to sample a state from the root node's prior distribution.
At each observation node, an action *a* is chosen. Here we have a state *s* for this observation. Then we send the *(s,a)* pair to the simulator and then a new state *s'* and a reward are returned by the simulator.
If the next state has been seen from this action node before, go to the corresponding child node with the new state *s'* and continue selection.
If not, stop selection and start expansion.
### Expansion
Initialize a new observation node and add it to the children of its parent action node. Then generate and initialize all possible children of this node.
The prior information may be given for initialization.
Then go to rollout.
### Rollout
At the new observation node, choose one action as a leaf node. Then use a quick policy to play the game to some terminal state and return the collected reward along the trajectory to the leaf node.
Then turn to backpropagation to send this value back up.
### Backpropagation
From the leaf node to the root node, update all nodes that have been passed in this iteration.
#### For observation nodes
For each observation node, a V value is returned by its child action node. Add this value (possibly multiplied by a discounting factor gamma) to the stored expected one-step reward for that action node to get a new value. The new value is used to update the statistics for that child action node.
For UCT methods, the new value is added to W and N is incremented by 1; Q and U can then be computed.
For Thompson Sampling, the new value is treated as a sample to update the posterior distribution.
Then return the new value to the parent action node.
#### For action nodes
For an action node, a Q value is returned by its child observation node.
For UCT, update the running average V with this Q value and increment N by 1.
For Thompson Sampling, simply pass the Q value on to the parent observation node as a sample.
Backpropagation stops when the root node is reached. Then selection starts again.
These seem to be plain Python functions, though the management of placeholders may require some discussion; they may also be written in a functional form.

25 tianshou/core/losses.py Normal file

@@ -0,0 +1,25 @@
import tensorflow as tf
import baselines.common.tf_util as U
def ppo_clip(sampled_action, Dgrad, clip_param, pi, pi_old):
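    """
    Clipped surrogate objective of PPO, negated for minimization:
        L = -E[ min(r_t * A_t, clip(r_t, 1 - clip_param, 1 + clip_param) * A_t) ],
    where r_t = pi(a_t | s_t) / pi_old(a_t | s_t) and Dgrad carries the advantage/return estimates A_t.
    """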
log_pi_act = pi.log_prob(sampled_action)
log_pi_old_act = pi_old.log_prob(sampled_action)
ratio = tf.exp(log_pi_act - log_pi_old_act)
clipped_ratio = tf.clip_by_value(ratio, 1. - clip_param, 1. + clip_param)
ppo_clip_loss = -tf.reduce_mean(tf.minimum(ratio * Dgrad, clipped_ratio * Dgrad))
return ppo_clip_loss
def L_VF(Gt, pi, St): # TODO: do we really have to specify St, or it's implicit in policy/value net
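    """Squared-error value-function loss: mean of (V(s_t) - G_t)^2; assumes pi exposes a `vpred` value-prediction tensor."""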
return U.mean(tf.square(pi.vpred - Gt))
def entropy_reg(pi):
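    """Entropy regularizer: the negative mean entropy of the policy (to be added to the total loss); assumes a baselines-style `pd` distribution attribute."""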
return - U.mean(pi.pd.entropy())
def KL_diff(pi, pi_old):
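    """Mean KL divergence between pi_old and pi; assumes baselines-style `pd` distribution attributes."""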
kloldnew = pi_old.pd.kl(pi.pd)
meankl = U.mean(kloldnew)
return meankl


@@ -0,0 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from .base import *
from .stochastic import *


@@ -0,0 +1,211 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
import warnings
import tensorflow as tf
# from zhusuan.utils import add_name_scope
__all__ = [
'StochasticPolicy',
]
class StochasticPolicy(object):
"""
The :class:`StochasticPolicy` class is the base class for various stochastic
policies, which support batch inputs, generating batches of actions and
evaluating probabilities (or log probabilities) at batches of sampled actions.
The typical input shape for a :class:`StochasticPolicy` is
``batch_shape + input_shape``, where ``input_shape`` represents the shape
of the non-batch input parameter and :attr:`batch_shape` represents how many
independent inputs are fed into the policy.
Samples generated are of shape
``([n_samples]+ )batch_shape + value_shape``. The first additional axis
is omitted only when passed `n_samples` is None (by default), in which
case one sample is generated. :attr:`value_shape` is the non-batch value
shape of the distribution. For a univariate distribution, its
:attr:`value_shape` is [].
There are cases where a batch of random variables are grouped into a
single event so that their probabilities should be computed together. This
is achieved by setting `group_ndims` argument, which defaults to 0.
The last `group_ndims` number of axes in :attr:`batch_shape` are
grouped into a single event. For example,
``Normal(..., group_ndims=1)`` will set the last axis of its
:attr:`batch_shape` to a single event, i.e., a multivariate Normal with
identity covariance matrix.
When evaluating probabilities at given values, the given Tensor should be
broadcastable to shape ``(... + )batch_shape + value_shape``. The returned
Tensor has shape ``(... + )batch_shape[:-group_ndims]``.
.. seealso::
:doc:`/concepts`
For both discrete and continuous policies, the parameter `dtype` represents
the type of samples. For discrete policies it can be set by the user; for
continuous ones it is determined automatically from the parameter types.
The value type of `prob` and `log_prob` will be `param_dtype`, which is
deduced from the parameter(s) at initialization. `dtype` must be
among `int16`, `int32`, `int64`, `float16`, `float32` and `float64`.
When two or more parameters are tensors with different types, a `TypeError`
will be raised.
:param act_dtype: The data type of actions sampled from the policy.
:param param_dtype: The parameter(s) type of the underlying distribution.
:param is_continuous: Whether the action space is continuous.
:param obs_placeholder: The placeholder for observations, which is fed when
    sampling actions.
:param group_ndims: A 0-D `int32` Tensor representing the number of
    dimensions in :attr:`batch_shape` (counted from the end) that are
    grouped into a single event, so that their probabilities are calculated
    together. Default is 0, which means a single value is an event.
    See above for more detailed explanation.
"""
def __init__(self,
act_dtype,
param_dtype,
is_continuous,
obs_placeholder,
group_ndims=0, # maybe useful for repeat_action
**kwargs):
self._act_dtype = act_dtype
self._param_dtype = param_dtype
self._is_continuous = is_continuous
self._obs_placeholder = obs_placeholder
if isinstance(group_ndims, int):
if group_ndims < 0:
raise ValueError("group_ndims must be non-negative.")
self._group_ndims = group_ndims
else:
group_ndims = tf.convert_to_tensor(group_ndims, tf.int32)
_assert_rank_op = tf.assert_rank(
group_ndims, 0,
message="group_ndims should be a scalar (0-D Tensor).")
_assert_nonnegative_op = tf.assert_greater_equal(
group_ndims, 0,
message="group_ndims must be non-negative.")
with tf.control_dependencies([_assert_rank_op,
_assert_nonnegative_op]):
self._group_ndims = tf.identity(group_ndims)
@property
def act_dtype(self):
"""The sample data type of the policy."""
return self._act_dtype
@property
def param_dtype(self):
"""The parameter(s) type of the distribution."""
return self._param_dtype
@property
def is_continuous(self):
"""Whether the distribution is continuous."""
return self._is_continuous
@property
def group_ndims(self):
"""
The number of dimensions in :attr:`batch_shape` (counted from the end)
that are grouped into a single event, so that their probabilities are
calculated together. See `Distribution` for more detailed explanation.
"""
return self._group_ndims
# @add_name_scope
    def act(self, observation):
        """
        act(observation)

        Sample a single action for the given observation by delegating to the
        subclass's :meth:`_act`.

        :param observation: A single observation (without the batch axis); it is
            fed to the policy's observation placeholder.
        :return: A sampled action.
        """
        return self._act(observation)
def _act(self, observation):
"""
Private method for subclasses to rewrite the :meth:`act` method.
"""
raise NotImplementedError()
# @add_name_scope
def log_prob(self, sampled_action):
"""
log_prob(sampled_action)
Compute the log probability density (mass) function at the given sampled action.
:param sampled_action: A Tensor. The value at which to evaluate the log probability
density (mass) function. Must be able to broadcast to have a shape
of ``(... + )batch_shape + value_shape``.
:return: A Tensor of shape ``(... + )batch_shape[:-group_ndims]``.
"""
log_p = self._log_prob(sampled_action)
return tf.reduce_sum(log_p, tf.range(-self._group_ndims, 0))
# @add_name_scope
def prob(self, sampled_action):
"""
prob(sampled_action)
Compute the probability density (mass) function at the given sampled action.
:param sampled_action: A Tensor. The value at which to evaluate the probability
density (mass) function. Must be able to broadcast to have a shape
of ``(... + )batch_shape + value_shape``.
:return: A Tensor of shape ``(... + )batch_shape[:-group_ndims]``.
"""
p = self._prob(sampled_action)
return tf.reduce_prod(p, tf.range(-self._group_ndims, 0))
def _log_prob(self, sampled_action):
"""
Private method for subclasses to rewrite the :meth:`log_prob` method.
"""
raise NotImplementedError()
def _prob(self, sampled_action):
"""
Private method for subclasses to rewrite the :meth:`prob` method.
"""
raise NotImplementedError()


@@ -0,0 +1,106 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
import numpy as np
import tensorflow as tf
from .base import StochasticPolicy
__all__ = [
'OnehotCategorical',
'OnehotDiscrete',
]
class OnehotCategorical(StochasticPolicy):
"""
The class of one-hot Categorical distribution.
See :class:`~zhusuan.distributions.base.Distribution` for details.
:param logits: A N-D (N >= 1) `float` Tensor of shape (...,
n_categories). Each slice `[i, j, ..., k, :]` represents the
un-normalized log probabilities for all categories.
.. math:: \\mathrm{logits} \\propto \\log p
:param dtype: The value type of samples from the distribution.
:param group_ndims: A 0-D `int32` Tensor representing the number of
dimensions in `batch_shape` (counted from the end) that are grouped
into a single event, so that their probabilities are calculated
together. Default is 0, which means a single value is an event.
See :class:`~zhusuan.distributions.base.Distribution` for more detailed
explanation.
A single sample is a N-D Tensor with the same shape as logits. Each slice
`[i, j, ..., k, :]` is a one-hot vector of the selected category.
"""
def __init__(self, logits, obs_placeholder, dtype=None, group_ndims=0, **kwargs):
self._logits = tf.convert_to_tensor(logits)
if dtype is None:
dtype = tf.int32
# assert_same_float_and_int_dtype([], dtype)
tf.assert_rank(self._logits, rank=2) # TODO: flexible policy output rank?
self._n_categories = self._logits.get_shape()[-1].value
super(OnehotCategorical, self).__init__(
act_dtype=dtype,
param_dtype=self._logits.dtype,
is_continuous=False,
obs_placeholder=obs_placeholder,
group_ndims=group_ndims,
**kwargs)
@property
def logits(self):
"""The un-normalized log probabilities."""
return self._logits
@property
def n_categories(self):
"""The number of categories in the distribution."""
return self._n_categories
def _act(self, observation):
sess = tf.get_default_session() # TODO: this may be ugly. also maybe huge problem when parallel
sampled_action = sess.run(tf.multinomial(self.logits, num_samples=1), feed_dict={self._obs_placeholder: observation[None]})
sampled_action = sampled_action[0, 0]
return sampled_action
    def _log_prob(self, sampled_action):
        # sampled_action is a batch of integer action indices, so the sparse version applies directly
        return -tf.nn.sparse_softmax_cross_entropy_with_logits(labels=sampled_action, logits=self.logits)
# given = tf.cast(given, self.param_dtype)
# given, logits = maybe_explicit_broadcast(
# given, self.logits, 'given', 'logits')
# if (given.get_shape().ndims == 2) or (logits.get_shape().ndims == 2):
# given_flat = given
# logits_flat = logits
# else:
# given_flat = tf.reshape(given, [-1, self.n_categories])
# logits_flat = tf.reshape(logits, [-1, self.n_categories])
# log_p_flat = -tf.nn.softmax_cross_entropy_with_logits(
# labels=given_flat, logits=logits_flat)
# if (given.get_shape().ndims == 2) or (logits.get_shape().ndims == 2):
# log_p = log_p_flat
# else:
# log_p = tf.reshape(log_p_flat, tf.shape(logits)[:-1])
# if given.get_shape() and logits.get_shape():
# log_p.set_shape(tf.broadcast_static_shape(
# given.get_shape(), logits.get_shape())[:-1])
# return log_p
def _prob(self, sampled_action):
return tf.exp(self._log_prob(sampled_action))
OnehotDiscrete = OnehotCategorical
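# A possible sketch of the planned Gaussian policy (hypothetical, not part of this
# commit), following OnehotCategorical as suggested in the README; attribute and
# argument names are illustrative only.
class Gaussian(StochasticPolicy):
    """Diagonal Gaussian policy parameterized by mean and log standard deviation tensors."""
    def __init__(self, mean, logstd, obs_placeholder, group_ndims=1, **kwargs):
        self._mean = tf.convert_to_tensor(mean, dtype=tf.float32)
        self._logstd = tf.convert_to_tensor(logstd, dtype=tf.float32)
        self._std = tf.exp(self._logstd)
        # reparameterized sample: mean + std * standard normal noise
        self._sampled_action = self._mean + self._std * tf.random_normal(tf.shape(self._mean))
        super(Gaussian, self).__init__(
            act_dtype=tf.float32,
            param_dtype=self._mean.dtype,
            is_continuous=True,
            obs_placeholder=obs_placeholder,
            group_ndims=group_ndims,
            **kwargs)

    def _act(self, observation):
        sess = tf.get_default_session()
        sampled_action = sess.run(self._sampled_action,
                                  feed_dict={self._obs_placeholder: observation[None]})
        return sampled_action[0]

    def _log_prob(self, sampled_action):
        return -0.5 * np.log(2 * np.pi) - self._logstd \
            - 0.5 * tf.square((sampled_action - self._mean) / self._std)

    def _prob(self, sampled_action):
        return tf.exp(self._log_prob(sampled_action))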

127 tianshou/data/Batch.py Normal file

@@ -0,0 +1,127 @@
import numpy as np
import gc
class Batch(object):
"""
class for batch datasets. Collect multiple states (actions, rewards, etc.) on-policy.
"""
def __init__(self, env, pi, adv_estimation_func): # how to name the function?
self.env = env
self.pi = pi
self.adv_estimation_func = adv_estimation_func
self.is_first_collect = True
def collect(self, num_timesteps=0, num_episodes=0, apply_func=True): # specify how many data to collect here, or fix it in __init__()
assert sum([num_timesteps > 0, num_episodes > 0]) == 1, "One and only one collection number specification permitted!"
        if num_timesteps > 0:
            # YouQiaoben: finish/refine this implementation; the minimal sketch below mirrors
            # the num_episodes branch. `current_ob` is a new attribute (assumption) used to
            # resume collection across calls.
            if self.is_first_collect or not hasattr(self, 'current_ob'):
                ob = self.env.reset()
                self.is_first_collect = False
            else:
                ob = self.current_ob  # resume from where the previous collect() stopped

            obs, acs, rews, news = [], [], [], []
            new = True  # marks the first timestep of an episode

            for _ in range(num_timesteps):
                obs.append(ob)
                news.append(new)
                ac = self.pi.act(ob)
                acs.append(ac)
                ob, rew, new, _ = self.env.step(ac)
                rews.append(rew)
                if new:  # episode ended, restart the environment
                    ob = self.env.reset()

            self.current_ob = ob
            self.obs = np.array(obs)
            self.acs = np.array(acs)
            self.rews = np.array(rews)
            self.news = np.array(news)
            self.raw_data = {'obs': self.obs, 'acs': self.acs, 'rews': self.rews, 'news': self.news}
if num_episodes > 0: # YouQiaoben: fix memory growth, both del and gc.collect() fail
# initialize rawdata lists
if not self.is_first_collect:
del self.obs
del self.acs
del self.rews
del self.news
obs = []
acs = []
rews = []
news = []
t_count = 0
for e in range(num_episodes):
ob = self.env.reset()
obs.append(ob)
news.append(True)
while True:
ac = self.pi.act(ob)
acs.append(ac)
ob, rew, done, _ = self.env.step(ac)
rews.append(rew)
t_count += 1
if t_count >= 200: # force episode stop
break
if done: # end of episode, discard s_T
break
else:
obs.append(ob)
news.append(False)
self.obs = np.array(obs)
self.acs = np.array(acs)
self.rews = np.array(rews)
self.news = np.array(news)
del obs
del acs
del rews
del news
self.raw_data = {'obs': self.obs, 'acs': self.acs, 'rews': self.rews, 'news': self.news}
self.is_first_collect = False
if apply_func:
self.apply_adv_estimation_func()
gc.collect()
def apply_adv_estimation_func(self):
self.data = self.adv_estimation_func(self.raw_data)
def next_batch(self, batch_size): # YouQiaoben: reference how other libraries iterate over minibatches
rand_idx = np.random.choice(self.data['obs'].shape[0], batch_size)
return {key: value[rand_idx] for key, value in self.data.items()}

22 tianshou/data/README.md Normal file

@@ -0,0 +1,22 @@
# Batch
YouQiaoben
fix as stated in ppo_example.py
# Replay
ShihongSong
A Replay.py file; it must have collect() and next_batch() methods for training.
Integrate the previous ReplayBuffer code.
# adv_estimate
YouQiaoben (gae_lambda), ShihongSong (dqn after policy.DQN)
These seem to be plain Python functions; they may also be written in a functional form.
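As a rough illustration, a Replay class mirroring the Batch interface might look like the sketch below (all names are tentative and the design is not final):

    import numpy as np

    class Replay(object):
        """Off-policy data collection with a fixed-size replay memory (sketch only)."""
        def __init__(self, env, pi, adv_estimation_func, capacity=100000):
            self.env, self.pi = env, pi
            self.adv_estimation_func = adv_estimation_func
            self.capacity = capacity
            self.memory = []  # list of (ob, ac, rew, done) tuples
            self._ob = self.env.reset()

        def collect(self, num_timesteps=1):
            for _ in range(num_timesteps):
                ac = self.pi.act(self._ob)
                ob_next, rew, done, _ = self.env.step(ac)
                self.memory.append((self._ob, ac, rew, done))
                if len(self.memory) > self.capacity:
                    self.memory.pop(0)
                self._ob = self.env.reset() if done else ob_next

        def next_batch(self, batch_size):
            idx = np.random.choice(len(self.memory), batch_size)
            obs, acs, rews, news = zip(*[self.memory[i] for i in idx])
            raw = {'obs': np.array(obs), 'acs': np.array(acs),
                   'rews': np.array(rews), 'news': np.array(news)}
            return self.adv_estimation_func(raw)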



@@ -0,0 +1,37 @@
import numpy as np
def full_return(raw_data):
"""
Naively compute the full undiscounted return for each timestep within its episode.
:param raw_data: dict with keys 'obs', 'acs', 'rews', 'news', as produced by Batch.collect().
"""
obs = raw_data['obs']
acs = raw_data['acs']
rews = raw_data['rews']
news = raw_data['news']
num_timesteps = rews.shape[0]
data = {}
data['obs'] = obs
data['acs'] = acs
Gts = rews.copy()
episode_start_idx = 0
for i in range(1, num_timesteps):
if news[i] or (i == num_timesteps - 1): # found one full episode
if i < rews.shape[0] - 1:
t = i - 1
else:
t = i
Gt = 0
while t >= episode_start_idx:
Gt += rews[t]
Gts[t] = Gt
t -= 1
episode_start_idx = i
data['Gts'] = Gts
return data
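# Illustrative sanity check (not part of the library): two episodes of lengths 3
# and 2 with unit rewards give undiscounted returns [3, 2, 1] and [2, 1].
if __name__ == '__main__':
    toy_raw_data = {
        'obs': np.zeros((5, 1)),
        'acs': np.zeros(5, dtype='int32'),
        'rews': np.ones(5, dtype='float32'),
        'news': np.array([True, False, False, True, False]),
    }
    print(full_return(toy_raw_data)['Gts'])  # [3. 2. 1. 2. 1.]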