diff --git a/examples/dqn_example.py b/examples/dqn_example.py
index 7d20731..b676475 100644
--- a/examples/dqn_example.py
+++ b/examples/dqn_example.py
@@ -37,6 +37,9 @@ if __name__ == '__main__':
     action_dim = env.action_space.n

     # 1. build network with pure tf
+    # TODO:
+    # pass the observation placeholder to the replay buffer, or find a more reasonable
+    # way for the replay buffer to access this placeholder.
     observation = tf.placeholder(tf.float32, shape=(None,) + observation_dim, name="dqn_observation") # network input

     with tf.variable_scope('q_net'):
@@ -59,6 +62,7 @@
     optimizer = tf.train.AdamOptimizer(1e-3)
     train_op = optimizer.minimize(total_loss, var_list=train_var_list, global_step=tf.train.get_global_step())

     # 3. define data collection
+    # the configuration should be passed as parameters; different replay buffers take different parameters.
     replay_memory = get_replay_buffer('rank_based', env, q_values, q_net, target_net, {'size': 1000, 'batch_size': 64, 'learn_start': 20})
     # ShihongSong: Replay(env, q_net, advantage_estimation.qlearning_target(target_network)), use your ReplayMemory, interact as follows. Simplify your advantage_estimation.dqn to run before YongRen's DQN
@@ -70,6 +74,7 @@
     minibatch_count = 0
     collection_count = 0

+    # collect some data before sampling; collect_freq must be larger than batch_size
    collect_freq = 100
    while True: # until some stopping criterion met...
        # collect data
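
[reviewer note, not part of the patch] One way to resolve the two TODOs above is to hand the observation placeholder to the replay buffer explicitly and to pass each buffer's configuration as keyword arguments. A minimal sketch, assuming a hypothetical observation_placeholder keyword that does not exist in the current get_replay_buffer signature:

    # hypothetical plumbing: the buffer receives the placeholder it needs directly,
    # instead of reaching into the graph; 'observation_placeholder' is an assumed keyword
    replay_memory = get_replay_buffer('rank_based', env, q_values, q_net, target_net,
                                      {'size': 1000, 'batch_size': 64, 'learn_start': 20},
                                      observation_placeholder=observation)
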
""" + # TODO: ensure thread safety sess = tf.get_default_session() sampled_action = sess.run(tf.multinomial(self.logits, num_samples=1), feed_dict={self._observation_placeholder: observation[None]}) @@ -33,10 +36,16 @@ class DQN(QValuePolicy): @property def logits(self): + """ + :return: action values + """ return self._logits @property def n_categories(self): + """ + :return: dimension of action space if not continuous + """ return self._n_categories def values(self, observation): diff --git a/tianshou/data/replay_buffer/naive.py b/tianshou/data/replay_buffer/naive.py index 50ba1c3..5eb4dd7 100644 --- a/tianshou/data/replay_buffer/naive.py +++ b/tianshou/data/replay_buffer/naive.py @@ -23,6 +23,10 @@ class NaiveExperience(ReplayBuffer): self.n_entries += 1 def _begin_act(self): + """ + if the previous interaction is ended or the interaction hasn't started + then begin act from the state of env.reset() + """ self.observation = self._env.reset() self.action = self._env.action_space.sample() done = False @@ -33,6 +37,10 @@ class NaiveExperience(ReplayBuffer): self.observation, _, done, _ = self._env.step(self.action) def collect(self): + """ + collect data for replay memory and update the priority according to the given data. + store the previous action, previous observation, reward, action, observation in the replay memory. + """ sess = tf.get_default_session() current_data = dict() current_data['previous_action'] = self.action @@ -59,6 +67,13 @@ class NaiveExperience(ReplayBuffer): return [self.memory[idx] for idx in idxs], [1] * len(idxs), idxs def next_batch(self, batch_size): + """ + collect a batch of data from replay buffer, update the priority and calculate the necessary statistics for + updating q value network. + :param batch_size: int batch size. + :return: a batch of data, with target storing the target q value and wi, rewards storing the coefficient + for gradient of q value network. + """ data = dict() observations = list() actions = list() diff --git a/tianshou/data/replay_buffer/proportional.py b/tianshou/data/replay_buffer/proportional.py index 63aab66..52a231d 100644 --- a/tianshou/data/replay_buffer/proportional.py +++ b/tianshou/data/replay_buffer/proportional.py @@ -45,6 +45,10 @@ class PropotionalExperience(ReplayBuffer): self._begin_act() def _begin_act(self): + """ + if the previous interaction is ended or the interaction hasn't started + then begin act from the state of env.reset() + """ self.observation = self._env.reset() self.action = self._env.action_space.sample() done = False @@ -66,12 +70,6 @@ class PropotionalExperience(ReplayBuffer): """ self.tree.add(data, priority**self.alpha) - def collect(self): - pass - - def next_batch(self, batch_size): - pass - def sample(self, conf): """ The method return samples randomly. @@ -117,6 +115,10 @@ class PropotionalExperience(ReplayBuffer): return out, weights, indices def collect(self): + """ + collect data for replay memory and update the priority according to the given data. + store the previous action, previous observation, reward, action, observation in the replay memory. + """ sess = tf.get_default_session() current_data = dict() current_data['previous_action'] = self.action @@ -134,6 +136,13 @@ class PropotionalExperience(ReplayBuffer): self._begin_act() def next_batch(self, batch_size): + """ + collect a batch of data from replay buffer, update the priority and calculate the necessary statistics for + updating q value network. + :param batch_size: int batch size. 
diff --git a/tianshou/data/replay_buffer/proportional.py b/tianshou/data/replay_buffer/proportional.py
index 63aab66..52a231d 100644
--- a/tianshou/data/replay_buffer/proportional.py
+++ b/tianshou/data/replay_buffer/proportional.py
@@ -45,6 +45,10 @@ class PropotionalExperience(ReplayBuffer):
         self._begin_act()

     def _begin_act(self):
+        """
+        If the previous interaction has ended, or no interaction has started yet,
+        begin acting from the state returned by env.reset().
+        """
         self.observation = self._env.reset()
         self.action = self._env.action_space.sample()
         done = False
@@ -66,12 +70,6 @@ class PropotionalExperience(ReplayBuffer):
         """
         self.tree.add(data, priority**self.alpha)

-    def collect(self):
-        pass
-
-    def next_batch(self, batch_size):
-        pass
-
     def sample(self, conf):
         """ The method return samples randomly.
@@ -117,6 +115,10 @@ class PropotionalExperience(ReplayBuffer):
         return out, weights, indices

     def collect(self):
+        """
+        Collect data for the replay buffer and update the priorities using the collected data.
+        Store the previous action, previous observation, reward, action and observation in the replay buffer.
+        """
         sess = tf.get_default_session()
         current_data = dict()
         current_data['previous_action'] = self.action
@@ -134,6 +136,13 @@ class PropotionalExperience(ReplayBuffer):
             self._begin_act()

     def next_batch(self, batch_size):
+        """
+        Sample a batch of data from the replay buffer, update the priorities and compute the statistics
+        needed for updating the Q-value network.
+        :param batch_size: int, the batch size.
+        :return: a batch of data, where 'target' stores the target Q values and 'wi', 'rewards' store
+        the coefficients for the gradient of the Q-value network.
+        """
         data = dict()
         observations = list()
         actions = list()
diff --git a/tianshou/data/replay_buffer/rank_based.py b/tianshou/data/replay_buffer/rank_based.py
index da56763..b71ca68 100644
--- a/tianshou/data/replay_buffer/rank_based.py
+++ b/tianshou/data/replay_buffer/rank_based.py
@@ -107,6 +107,10 @@ class RankBasedExperience(ReplayBuffer):
         return self.index

     def _begin_act(self):
+        """
+        If the previous interaction has ended, or no interaction has started yet,
+        begin acting from the state returned by env.reset().
+        """
         self.observation = self._env.reset()
         self.action = self._env.action_space.sample()
         done = False
@@ -117,6 +121,10 @@ class RankBasedExperience(ReplayBuffer):
             self.observation, _, done, _ = self._env.step(self.action)

     def collect(self):
+        """
+        Collect data for the replay buffer and update the priorities using the collected data.
+        Store the previous action, previous observation, reward, action and observation in the replay buffer.
+        """
         sess = tf.get_default_session()
         current_data = dict()
         current_data['previous_action'] = self.action
@@ -131,6 +139,13 @@ class RankBasedExperience(ReplayBuffer):
             self._begin_act()

     def next_batch(self, batch_size):
+        """
+        Sample a batch of data from the replay buffer, update the priorities and compute the statistics
+        needed for updating the Q-value network.
+        :param batch_size: int, the batch size.
+        :return: a batch of data, where 'target' stores the target Q values and 'wi', 'rewards' store
+        the coefficients for the gradient of the Q-value network.
+        """
         data = dict()
         observations = list()
         actions = list()
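
[reviewer note, not part of the patch] All three buffers now document the same collect()/next_batch() contract, so training code can be written once against it. A sketch of a consumer, under the assumption that the returned dict uses the field names from the docstrings ('observations', 'actions', 'target', 'wi') and that matching placeholders exist in the graph:

    # hypothetical training step built on the documented contract
    for _ in range(collect_freq):            # collect_freq > batch_size, as noted in the example
        replay_memory.collect()              # interact with the env and store one transition
    data = replay_memory.next_batch(64)      # sample a (prioritized) minibatch
    sess.run(train_op, feed_dict={observation: data['observations'],
                                  action_ph: data['actions'],        # assumed placeholder names
                                  target_ph: data['target'],
                                  weight_ph: data['wi']})
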