from __future__ import absolute_import
import tensorflow as tf
import numpy as np

from .base import PolicyBase

__all__ = [
    'DQN',
]

class DQN(PolicyBase):
"""
use DQN from value_function as a member
Policy derived from a Deep-Q Network (DQN). It should be constructed from a :class:`tianshou.core.value_function.DQN`.
Action is the argmax of the Q-values (usually with further :math:`\epsilon`-greedy).
It can only be applied to discrete action spaces.
:param dqn: A :class:`tianshou.core.value_function.DQN`. The Q-value network to derive this policy.
:param epsilon_train: A float in range :math:`[0, 1]`. The :math:`\epsilon` used in :math:`\epsilon`-greedy
during training while interacting with the environment.
:param epsilon_test: A float in range :math:`[0, 1]`. The :math:`\epsilon` used in :math:`\epsilon`-greedy
during test while interacting with the environment.
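
    Example (a minimal usage sketch; ``dqn_value`` denotes an already constructed
    :class:`tianshou.core.value_function.DQN` and ``observation`` a single observation
    from the environment)::

        pi = DQN(dqn_value, epsilon_train=0.1, epsilon_test=0.05)
        action = pi.act(observation)       # epsilon-greedy with epsilon_train
        action = pi.act_test(observation)  # epsilon-greedy with epsilon_test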
"""

    def __init__(self, dqn, epsilon_train=0.1, epsilon_test=0.05):
self.action_value = dqn
self.action = tf.argmax(dqn.value_tensor_all_actions, axis=1)
self.epsilon_train = epsilon_train
self.epsilon_test = epsilon_test

    def act(self, observation, my_feed_dict={}):
        """
        Return action given observation, with :math:`\epsilon`-greedy using ``self.epsilon_train``.

        :param observation: An array-like with rank the same as a single observation of the
            environment. It should not have the leading "batch_size" dimension; this method
            adds the "batch_size" dimension (of size 1) as the first dimension internally.
        :param my_feed_dict: Optional. A dict defaulting to empty. Specifies extra feed values
            for placeholders other than the observation, e.g., dropout and batch_norm
            placeholders of the underlying network.
        :return: A numpy array. The action for the given single observation, with the
            "batch_size" dimension squeezed out.
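
        Example (a sketch; ``keep_prob_ph`` is a hypothetical dropout placeholder of the
        underlying network, not defined in this module)::

            action = pi.act(observation, my_feed_dict={keep_prob_ph: 1.0})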
"""
        # run the graph to compute the greedy action for this single observation,
        # adding the "batch_size" dimension to the observation first
        sess = tf.get_default_session()
        feed_dict = {self.action_value.observation_placeholder: observation[None]}
        feed_dict.update(my_feed_dict)
        action = sess.run(self.action, feed_dict=feed_dict)

        # epsilon-greedy: with probability ``epsilon_train``, replace the greedy action
        # with a uniformly random action
        if np.random.rand() < self.epsilon_train:
            action = np.random.randint(self.action_value.num_actions)
return np.squeeze(action)

    def act_test(self, observation, my_feed_dict={}):
        """
        Return action given observation, with :math:`\epsilon`-greedy using ``self.epsilon_test``.

        :param observation: An array-like with rank the same as a single observation of the
            environment. It should not have the leading "batch_size" dimension; this method
            adds the "batch_size" dimension (of size 1) as the first dimension internally.
        :param my_feed_dict: Optional. A dict defaulting to empty. Specifies extra feed values
            for placeholders other than the observation, e.g., dropout and batch_norm
            placeholders of the underlying network.
        :return: A numpy array. The action for the given single observation, with the
            "batch_size" dimension squeezed out.
"""
        # run the graph to compute the greedy action for this single observation
        sess = tf.get_default_session()
        feed_dict = {self.action_value.observation_placeholder: observation[None]}
        feed_dict.update(my_feed_dict)
        action = sess.run(self.action, feed_dict=feed_dict)

        # epsilon-greedy: with probability ``epsilon_test``, replace the greedy action
        # with a uniformly random action
        if np.random.rand() < self.epsilon_test:
            action = np.random.randint(self.action_value.num_actions)
return np.squeeze(action)

    @property
    def q_net(self):
        """The :class:`tianshou.core.value_function.DQN` that this policy is based on."""
return self.action_value

    def sync_weights(self):
        """
Sync the variables of the "old net" to be the same as the current network.
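
        Typically called periodically during training to update the target network.
        Example (a sketch; ``train_step`` and ``sync_interval`` are illustrative names)::

            if train_step % sync_interval == 0:
                pi.sync_weights()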
"""
if self.action_value.sync_weights_ops is not None:
self.action_value.sync_weights()

    def set_epsilon_train(self, epsilon):
        """
        Set the :math:`\epsilon` in :math:`\epsilon`-greedy during training.

        :param epsilon: A float in range :math:`[0, 1]`.
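
        Example (a sketch of linearly annealing the exploration rate; ``step`` is an
        illustrative counter of environment interactions)::

            pi.set_epsilon_train(max(0.05, 1.0 - 0.95 * step / 10000.))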
"""
self.epsilon_train = epsilon

    def set_epsilon_test(self, epsilon):
        """
        Set the :math:`\epsilon` in :math:`\epsilon`-greedy during testing.

        :param epsilon: A float in range :math:`[0, 1]`.
"""
self.epsilon_test = epsilon