import torch
import numpy as np
from copy import deepcopy
from typing import Any, Dict, Union, Optional

from tianshou.policy import BasePolicy
from tianshou.data import Batch, ReplayBuffer, to_torch_as, to_numpy


class DQNPolicy(BasePolicy):
"""Implementation of Deep Q Network. arXiv:1312.5602.
Implementation of Double Q-Learning. arXiv:1509.06461.
2020-04-06 19:36:59 +08:00
Implementation of Dueling DQN. arXiv:1511.06581 (the dueling DQN is
implemented in the network side, not here).
2020-04-06 19:36:59 +08:00
:param torch.nn.Module model: a model following the rules in
:class:`~tianshou.policy.BasePolicy`. (s -> logits)
:param torch.optim.Optimizer optim: a torch.optim for optimizing the model.
:param float discount_factor: in [0, 1].
:param int estimation_step: greater than 1, the number of steps to look
ahead.
:param int target_update_freq: the target network update frequency (0 if
you do not use the target network).
:param bool reward_normalization: normalize the reward to Normal(0, 1),
defaults to False.
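
    A rough usage sketch (``Net`` stands in for any network that maps
    observations to action logits and is not defined in this module; the
    optimizer and hyper-parameters are only illustrative)::

        model = Net(state_shape, action_shape)
        optim = torch.optim.Adam(model.parameters(), lr=1e-3)
        policy = DQNPolicy(model, optim, discount_factor=0.99,
                           estimation_step=3, target_update_freq=320)
        policy.set_eps(0.1)  # epsilon-greedy exploration while collecting data
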
.. seealso::
Please refer to :class:`~tianshou.policy.BasePolicy` for more detailed
explanation.
"""

    def __init__(
self,
model: torch.nn.Module,
optim: torch.optim.Optimizer,
discount_factor: float = 0.99,
estimation_step: int = 1,
target_update_freq: int = 0,
reward_normalization: bool = False,
**kwargs: Any,
) -> None:
super().__init__(**kwargs)
self.model = model
self.optim = optim
self.eps = 0.0
assert 0.0 <= discount_factor <= 1.0, "discount factor should be in [0, 1]"
self._gamma = discount_factor
assert estimation_step > 0, "estimation_step should be greater than 0"
self._n_step = estimation_step
self._target = target_update_freq > 0
self._freq = target_update_freq
self._iter = 0
if self._target:
self.model_old = deepcopy(self.model)
self.model_old.eval()
self._rew_norm = reward_normalization

    def set_eps(self, eps: float) -> None:
"""Set the eps for epsilon-greedy exploration."""
self.eps = eps

    def train(self, mode: bool = True) -> "DQNPolicy":
"""Set the module in training mode, except for the target network."""
self.training = mode
self.model.train(mode)
return self

    def sync_weight(self) -> None:
"""Synchronize the weight for the target network."""
self.model_old.load_state_dict(self.model.state_dict())

    def _target_q(
self, buffer: ReplayBuffer, indice: np.ndarray
) -> torch.Tensor:
batch = buffer[indice] # batch.obs_next: s_{t+n}
# target_Q = Q_old(s_, argmax(Q_new(s_, *)))
if self._target:
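            # Double DQN target: the online network (model) picks the action,
            # while the target network (model_old) evaluates its Q-value.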
a = self(batch, input="obs_next").act
target_q = self(batch, model="model_old", input="obs_next").logits
target_q = target_q[np.arange(len(a)), a]
else:
target_q = self(batch, input="obs_next").logits.max(dim=1)[0]
return target_q

    def process_fn(
self, batch: Batch, buffer: ReplayBuffer, indice: np.ndarray
) -> Batch:
"""Compute the n-step return for Q-learning targets.
More details can be found at
:meth:`~tianshou.policy.BasePolicy.compute_nstep_return`.
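
        Roughly speaking, the target computed here is the n-step return

        .. math::

            G_t = r_t + \gamma r_{t+1} + \dots + \gamma^{n-1} r_{t+n-1}
                + \gamma^n (1 - d_{t+n}) \hat{Q}(s_{t+n}),

        where :math:`d_{t+n}` is the done flag and :math:`\hat{Q}(s_{t+n})`
        is the bootstrap value returned by :meth:`_target_q` (double DQN if a
        target network is used).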
"""
batch = self.compute_nstep_return(
batch, buffer, indice, self._target_q,
self._gamma, self._n_step, self._rew_norm)
return batch

    def compute_q_value(self, logits: torch.Tensor) -> torch.Tensor:
"""Compute the q value based on the network's raw output logits."""
return logits

    def forward(
self,
batch: Batch,
state: Optional[Union[dict, Batch, np.ndarray]] = None,
model: str = "model",
input: str = "obs",
**kwargs: Any,
) -> Batch:
"""Compute action over the given batch data.
If you need to mask the action, please add a "mask" into batch.obs, for
example, if we have an environment that has "0/1/2" three actions:
Add multi-agent example: tic-tac-toe (#122) * make fileds with empty Batch rather than None after reset * dummy code * remove dummy * add reward_length argument for collector * Improve Batch (#126) * make sure the key type of Batch is string, and add unit tests * add is_empty() function and unit tests * enable cat of mixing dict and Batch, just like stack * bugfix for reward_length * add get_final_reward_fn argument to collector to deal with marl * minor polish * remove multibuf * minor polish * improve and implement Batch.cat_ * bugfix for buffer.sample with field impt_weight * restore the usage of a.cat_(b) * fix 2 bugs in batch and add corresponding unittest * code fix for update * update is_empty to recognize empty over empty; bugfix for len * bugfix for update and add testcase * add testcase of update * make fileds with empty Batch rather than None after reset * dummy code * remove dummy * add reward_length argument for collector * bugfix for reward_length * add get_final_reward_fn argument to collector to deal with marl * make sure the key type of Batch is string, and add unit tests * add is_empty() function and unit tests * enable cat of mixing dict and Batch, just like stack * dummy code * remove dummy * add multi-agent example: tic-tac-toe * move TicTacToeEnv to a separate file * remove dummy MANet * code refactor * move tic-tac-toe example to test * update doc with marl-example * fix docs * reduce the threshold * revert * update player id to start from 1 and change player to agent; keep coding * add reward_length argument for collector * Improve Batch (#128) * minor polish * improve and implement Batch.cat_ * bugfix for buffer.sample with field impt_weight * restore the usage of a.cat_(b) * fix 2 bugs in batch and add corresponding unittest * code fix for update * update is_empty to recognize empty over empty; bugfix for len * bugfix for update and add testcase * add testcase of update * fix docs * fix docs * fix docs [ci skip] * fix docs [ci skip] Co-authored-by: Trinkle23897 <463003665@qq.com> * refact * re-implement Batch.stack and add testcases * add doc for Batch.stack * reward_metric * modify flag * minor fix * reuse _create_values and refactor stack_ & cat_ * fix pep8 * fix reward stat in collector * fix stat of collector, simplify test/base/env.py * fix docs * minor fix * raise exception for stacking with partial keys and axis!=0 * minor fix * minor fix * minor fix * marl-examples * add condense; bugfix for torch.Tensor; code refactor * marl example can run now * enable tic tac toe with larger board size and win-size * add test dependency * Fix padding of inconsistent keys with Batch.stack and Batch.cat (#130) * re-implement Batch.stack and add testcases * add doc for Batch.stack * reuse _create_values and refactor stack_ & cat_ * fix pep8 * fix docs * raise exception for stacking with partial keys and axis!=0 * minor fix * minor fix Co-authored-by: Trinkle23897 <463003665@qq.com> * stash * let agent learn to play as agent 2 which is harder * code refactor * Improve collector (#125) * remove multibuf * reward_metric * make fileds with empty Batch rather than None after reset * many fixes and refactor Co-authored-by: Trinkle23897 <463003665@qq.com> * marl for tic-tac-toe and general gomoku * update default gamma to 0.1 for tic tac toe to win earlier * fix name typo; change default game config; add rew_norm option * fix pep8 * test commit * mv test dir name * add rew flag * fix torch.optim import error and madqn rew_norm * remove useless kwargs * Vector env enable select 
worker (#132) * Enable selecting worker for vector env step method. * Update collector to match new vecenv selective worker behavior. * Bug fix. * Fix rebase Co-authored-by: Alexis Duburcq <alexis.duburcq@wandercraft.eu> * show the last move of tictactoe by capital letters * add multi-agent tutorial * fix link * Standardized behavior of Batch.cat and misc code refactor (#137) * code refactor; remove unused kwargs; add reward_normalization for dqn * bugfix for __setitem__ with torch.Tensor; add Batch.condense * minor fix * support cat with empty Batch * remove the dependency of is_empty on len; specify the semantic of empty Batch by test cases * support stack with empty Batch * remove condense * refactor code to reflect the shared / partial / reserved categories of keys * add is_empty(recursive=False) * doc fix * docfix and bugfix for _is_batch_set * add doc for key reservation * bugfix for algebra operators * fix cat with lens hint * code refactor * bugfix for storing None * use ValueError instead of exception * hide lens away from users * add comment for __cat * move the computation of the initial value of lens in cat_ itself. * change the place of doc string * doc fix for Batch doc string * change recursive to recurse * doc string fix * minor fix for batch doc * write tutorials to specify the standard of Batch (#142) * add doc for len exceptions * doc move; unify is_scalar_value function * remove some issubclass check * bugfix for shape of Batch(a=1) * keep moving doc * keep writing batch tutorial * draft version of Batch tutorial done * improving doc * keep improving doc * batch tutorial done * rename _is_number * rename _is_scalar * shape property do not raise exception * restore some doc string * grammarly [ci skip] * grammarly + fix warning of building docs * polish docs * trim and re-arrange batch tutorial * go straight to the point * minor fix for batch doc * add shape / len in basic usage * keep improving tutorial * unify _to_array_with_correct_type to remove duplicate code * delegate type convertion to Batch.__init__ * further delegate type convertion to Batch.__init__ * bugfix for setattr * add a _parse_value function * remove dummy function call * polish docs Co-authored-by: Trinkle23897 <463003665@qq.com> * bugfix for mapolicy * pretty code * remove debug code; remove condense * doc fix * check before get_agents in tutorials/tictactoe * tutorial * fix * minor fix for batch doc * minor polish * faster test_ttt * improve tic-tac-toe environment * change default epoch and step-per-epoch for tic-tac-toe * fix mapolicy * minor polish for mapolicy * 90% to 80% (need to change the tutorial) * win rate * show step number at board * simplify mapolicy * minor polish for mapolicy * remove MADQN * fix pep8 * change legal_actions to mask (need to update docs) * simplify maenv * fix typo * move basevecenv to single file * separate RandomAgent * update docs * grammarly * fix pep8 * win rate typo * format in cheatsheet * use bool mask directly * update doc for boolean mask Co-authored-by: Trinkle23897 <463003665@qq.com> Co-authored-by: Alexis DUBURCQ <alexis.duburcq@gmail.com> Co-authored-by: Alexis Duburcq <alexis.duburcq@wandercraft.eu>
2020-07-21 14:59:49 +08:00
::
batch == Batch(
obs=Batch(
obs="original obs, with batch_size=1 for demonstration",
mask=np.array([[False, True, False]]),
# action 1 is available
# action 0 and 2 are unavailable
),
...
)
        :return: A :class:`~tianshou.data.Batch` which has 3 keys:

            * ``act`` the action.
            * ``logits`` the network's raw output.
            * ``state`` the hidden state.

        Epsilon-greedy exploration is not applied in this method; it is added
        afterwards by :meth:`exploration_noise` according to ``self.eps``.

        .. seealso::

            Please refer to :meth:`~tianshou.policy.BasePolicy.forward` for
            more detailed explanation.
        """
model = getattr(self, model)
obs = batch[input]
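        # If the observation is a Batch carrying extra fields (e.g. an action
        # mask), feed only the nested raw observation to the network.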
obs_ = obs.obs if hasattr(obs, "obs") else obs
logits, h = model(obs_, state=state, info=batch.info)
q = self.compute_q_value(logits)
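        # Cache the size of the action space on first use; exploration_noise()
        # relies on it to sample random actions.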
if not hasattr(self, "max_action_num"):
self.max_action_num = q.shape[1]
act: np.ndarray = to_numpy(q.max(dim=1)[1])
if hasattr(obs, "mask"):
            # some actions are masked and cannot be selected
q_: np.ndarray = to_numpy(q)
q_[~obs.mask] = -np.inf
act = q_.argmax(axis=1)
return Batch(logits=logits, act=act, state=h)

    def learn(self, batch: Batch, **kwargs: Any) -> Dict[str, float]:
if self._target and self._iter % self._freq == 0:
self.sync_weight()
self.optim.zero_grad()
weight = batch.pop("weight", 1.0)
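        # "weight" holds the importance-sampling weights supplied by a
        # prioritized replay buffer; it defaults to 1.0 for a plain buffer.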
q = self(batch).logits
q = q[np.arange(len(q)), batch.act]
r = to_torch_as(batch.returns.flatten(), q)
td = r - q
loss = (td.pow(2) * weight).mean()
batch.weight = td # prio-buffer
loss.backward()
self.optim.step()
self._iter += 1
return {"loss": loss.item()}

    def exploration_noise(self, act: np.ndarray, batch: Batch) -> np.ndarray:
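        """Apply epsilon-greedy exploration: with probability ``self.eps``,
        replace each greedy action with a random action that respects the
        optional action mask in the observation."""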
if not np.isclose(self.eps, 0.0):
for i in range(len(act)):
if np.random.rand() < self.eps:
q_ = np.random.rand(self.max_action_num)
if hasattr(batch["obs"], "mask"):
q_[~batch["obs"].mask[i]] = -np.inf
act[i] = q_.argmax()
return act