from __future__ import absolute_import
import tensorflow as tf
import numpy as np

from .base import PolicyBase

__all__ = [
    'DQN',
]

class DQN(PolicyBase):
"""
use DQN from value_function as a member
Policy derived from a Deep-Q Network (DQN). It should be constructed from a :class:`tianshou.core.value_function.DQN`.
Action is the argmax of the Q-values (usually with further :math:`\epsilon`-greedy).
It can only be applied to discrete action spaces.
:param dqn: A :class:`tianshou.core.value_function.DQN`. The Q-value network to derive this policy.
:param epsilon_train: A float in range :math:`[0, 1]`. The :math:`\epsilon` used in :math:`\epsilon`-greedy
during training while interacting with the environment.
:param epsilon_test: A float in range :math:`[0, 1]`. The :math:`\epsilon` used in :math:`\epsilon`-greedy
during test while interacting with the environment.
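
    Example (a minimal usage sketch; ``dqn_value`` denotes an already constructed
    :class:`tianshou.core.value_function.DQN` and ``observation`` a single observation
    from the environment)::

        pi = DQN(dqn_value, epsilon_train=0.1, epsilon_test=0.05)
        action = pi.act(observation)       # epsilon-greedy with epsilon_train
        action = pi.act_test(observation)  # epsilon-greedy with epsilon_test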
"""

    def __init__(self, dqn, epsilon_train=0.1, epsilon_test=0.05):
self.action_value = dqn
self.action = tf.argmax(dqn.value_tensor_all_actions, axis=1)
self.epsilon_train = epsilon_train
self.epsilon_test = epsilon_test

    def act(self, observation, my_feed_dict={}):
        """
        Return action given observation, with :math:`\epsilon`-greedy using ``self.epsilon_train``.

        :param observation: An array-like with rank the same as a single observation of the
            environment. It should not have the leading "batch_size" dimension; this method
            adds the "batch_size" dimension (of size 1) as the first dimension internally.
        :param my_feed_dict: Optional. A dict defaulting to empty. Specifies extra feed values
            for placeholders other than the observation, e.g., dropout and batch_norm
            placeholders of the underlying network.
        :return: A numpy array. The action for the given single observation, with the
            "batch_size" dimension squeezed out.
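
        Example (a sketch; ``keep_prob_ph`` is a hypothetical dropout placeholder of the
        underlying network, not defined in this module)::

            action = pi.act(observation, my_feed_dict={keep_prob_ph: 1.0})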
"""
        # run the graph to compute the greedy action for this single observation,
        # adding the "batch_size" dimension to the observation first
        sess = tf.get_default_session()
        feed_dict = {self.action_value.observation_placeholder: observation[None]}
        feed_dict.update(my_feed_dict)
        action = sess.run(self.action, feed_dict=feed_dict)

        # epsilon-greedy: with probability ``epsilon_train``, replace the greedy action
        # with a uniformly random action
        if np.random.rand() < self.epsilon_train:
            action = np.random.randint(self.action_value.num_actions)
return np.squeeze(action)

    def act_test(self, observation, my_feed_dict={}):
        """
        Return action given observation, with :math:`\epsilon`-greedy using ``self.epsilon_test``.

        :param observation: An array-like with rank the same as a single observation of the
            environment. It should not have the leading "batch_size" dimension; this method
            adds the "batch_size" dimension (of size 1) as the first dimension internally.
        :param my_feed_dict: Optional. A dict defaulting to empty. Specifies extra feed values
            for placeholders other than the observation, e.g., dropout and batch_norm
            placeholders of the underlying network.
        :return: A numpy array. The action for the given single observation, with the
            "batch_size" dimension squeezed out.
"""
        # run the graph to compute the greedy action for this single observation
        sess = tf.get_default_session()
        feed_dict = {self.action_value.observation_placeholder: observation[None]}
        feed_dict.update(my_feed_dict)
        action = sess.run(self.action, feed_dict=feed_dict)

        # epsilon-greedy: with probability ``epsilon_test``, replace the greedy action
        # with a uniformly random action
        if np.random.rand() < self.epsilon_test:
            action = np.random.randint(self.action_value.num_actions)
return np.squeeze(action)

    @property
    def q_net(self):
        """The :class:`tianshou.core.value_function.DQN` that this policy is based on."""
return self.action_value

    def sync_weights(self):
        """
Sync the variables of the "old net" to be the same as the current network.
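
        Typically called periodically during training to update the target network.
        Example (a sketch; ``train_step`` and ``sync_interval`` are illustrative names)::

            if train_step % sync_interval == 0:
                pi.sync_weights()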
"""
if self.action_value.sync_weights_ops is not None:
self.action_value.sync_weights()

    def set_epsilon_train(self, epsilon):
        """
        Set the :math:`\epsilon` in :math:`\epsilon`-greedy during training.

        :param epsilon: A float in range :math:`[0, 1]`.
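
        Example (a sketch of linearly annealing the exploration rate; ``step`` is an
        illustrative counter of environment interactions)::

            pi.set_epsilon_train(max(0.05, 1.0 - 0.95 * step / 10000.))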
"""
self.epsilon_train = epsilon

    def set_epsilon_test(self, epsilon):
        """
        Set the :math:`\epsilon` in :math:`\epsilon`-greedy during testing.

        :param epsilon: A float in range :math:`[0, 1]`.
"""
self.epsilon_test = epsilon