applied formatter to envs

2023-04-23 22:52:30 +09:00 · 2023-04-23 22:52:30 +09:00 · 6f0e6c6963
commit 6f0e6c6963
parent 628b856c63
4 changed files with 417 additions and 400 deletions
--- a/envs/atari.py
+++ b/envs/atari.py
@ -2,127 +2,145 @@ import numpy as np
 class Atari:
    LOCK = None
-  LOCK = None
+    def __init__(
        self,
        name,
        action_repeat=4,
        size=(84, 84),
        gray=True,
        noops=0,
        lives="unused",
        sticky=True,
        actions="all",
        length=108000,
        resize="opencv",
        seed=None,
    ):
        assert size[0] == size[1]
        assert lives in ("unused", "discount", "reset"), lives
        assert actions in ("all", "needed"), actions
        assert resize in ("opencv", "pillow"), resize
        if self.LOCK is None:
            import multiprocessing as mp
-  def __init__(
+            mp = mp.get_context("spawn")
-      self, name, action_repeat=4, size=(84, 84), gray=True, noops=0, lives='unused',
+            self.LOCK = mp.Lock()
-      sticky=True, actions='all', length=108000, resize='opencv', seed=None):
+        self._resize = resize
-    assert size[0] == size[1]
+        if self._resize == "opencv":
-    assert lives in ('unused', 'discount', 'reset'), lives
+            import cv2
    assert actions in ('all', 'needed'), actions
    assert resize in ('opencv', 'pillow'), resize
    if self.LOCK is None:
      import multiprocessing as mp
      mp = mp.get_context('spawn')
      self.LOCK = mp.Lock()
    self._resize = resize
    if self._resize == 'opencv':
      import cv2
      self._cv2 = cv2
    if self._resize == 'pillow':
      from PIL import Image
      self._image = Image
    import gym.envs.atari
    if name == 'james_bond':
      name = 'jamesbond'
    self._repeat = action_repeat
    self._size = size
    self._gray = gray
    self._noops = noops
    self._lives = lives
    self._sticky = sticky
    self._length = length
    self._random = np.random.RandomState(seed)
    with self.LOCK:
      self._env = gym.envs.atari.AtariEnv(
          game=name,
          obs_type='image',
          frameskip=1, repeat_action_probability=0.25 if sticky else 0.0,
          full_action_space=(actions == 'all'))
    assert self._env.unwrapped.get_action_meanings()[0] == 'NOOP'
    shape = self._env.observation_space.shape
    self._buffer = [np.zeros(shape, np.uint8) for _ in range(2)]
    self._ale = self._env.unwrapped.ale
    self._last_lives = None
    self._done = True
    self._step = 0
-  @property
+            self._cv2 = cv2
-  def action_space(self):
+        if self._resize == "pillow":
-    space = self._env.action_space
+            from PIL import Image
    space.discrete = True
    return space
-  def step(self, action):
+            self._image = Image
-    # if action['reset'] or self._done:
+        import gym.envs.atari
    #   with self.LOCK:
    #     self._reset()
    #   self._done = False
    #   self._step = 0
    #   return self._obs(0.0, is_first=True)
    total = 0.0
    dead = False
    if len(action.shape) >= 1:
      action = np.argmax(action)
    for repeat in range(self._repeat):
      _, reward, over, info = self._env.step(action)
      self._step += 1
      total += reward
      if repeat == self._repeat - 2:
        self._screen(self._buffer[1])
      if over:
        break
      if self._lives != 'unused':
        current = self._ale.lives()
        if current < self._last_lives:
          dead = True
          self._last_lives = current
          break
    if not self._repeat:
      self._buffer[1][:] = self._buffer[0][:]
    self._screen(self._buffer[0])
    self._done = over or (self._length and self._step >= self._length) or dead
    return self._obs(
        total,
        is_last=self._done or (dead and self._lives == 'reset'),
        is_terminal=dead or over)
-  def reset(self):
+        if name == "james_bond":
-    self._env.reset()
+            name = "jamesbond"
-    if self._noops:
+        self._repeat = action_repeat
-      for _ in range(self._random.randint(self._noops)):
+        self._size = size
-         _, _, dead, _ = self._env.step(0)
+        self._gray = gray
-         if dead:
+        self._noops = noops
-           self._env.reset()
+        self._lives = lives
-    self._last_lives = self._ale.lives()
+        self._sticky = sticky
-    self._screen(self._buffer[0])
+        self._length = length
-    self._buffer[1].fill(0)
+        self._random = np.random.RandomState(seed)
        with self.LOCK:
            self._env = gym.envs.atari.AtariEnv(
                game=name,
                obs_type="image",
                frameskip=1,
                repeat_action_probability=0.25 if sticky else 0.0,
                full_action_space=(actions == "all"),
            )
        assert self._env.unwrapped.get_action_meanings()[0] == "NOOP"
        shape = self._env.observation_space.shape
        self._buffer = [np.zeros(shape, np.uint8) for _ in range(2)]
        self._ale = self._env.unwrapped.ale
        self._last_lives = None
        self._done = True
        self._step = 0
-    self._done = False
+    @property
-    self._step = 0
+    def action_space(self):
-    obs, reward, is_terminal, _ = self._obs(0.0, is_first=True)
+        space = self._env.action_space
-    return obs
+        space.discrete = True
        return space
-  def _obs(self, reward, is_first=False, is_last=False, is_terminal=False):
+    def step(self, action):
-    np.maximum(self._buffer[0], self._buffer[1], out=self._buffer[0])
+        # if action['reset'] or self._done:
-    image = self._buffer[0]
+        #   with self.LOCK:
-    if image.shape[:2] != self._size:
+        #     self._reset()
-      if self._resize == 'opencv':
+        #   self._done = False
-        image = self._cv2.resize(
+        #   self._step = 0
-            image, self._size, interpolation=self._cv2.INTER_AREA)
+        #   return self._obs(0.0, is_first=True)
-      if self._resize == 'pillow':
+        total = 0.0
-        image = self._image.fromarray(image)
+        dead = False
-        image = image.resize(self._size, self._image.NEAREST)
+        if len(action.shape) >= 1:
-        image = np.array(image)
+            action = np.argmax(action)
-    if self._gray:
+        for repeat in range(self._repeat):
-      weights = [0.299, 0.587, 1 - (0.299 + 0.587)]
+            _, reward, over, info = self._env.step(action)
-      image = np.tensordot(image, weights, (-1, 0)).astype(image.dtype)
+            self._step += 1
-      image = image[:, :, None]
+            total += reward
-    return {'image':image, 'is_terminal':is_terminal}, reward, is_last, {}
+            if repeat == self._repeat - 2:
                self._screen(self._buffer[1])
            if over:
                break
            if self._lives != "unused":
                current = self._ale.lives()
                if current < self._last_lives:
                    dead = True
                    self._last_lives = current
                    break
        if not self._repeat:
            self._buffer[1][:] = self._buffer[0][:]
        self._screen(self._buffer[0])
        self._done = over or (self._length and self._step >= self._length) or dead
        return self._obs(
            total,
            is_last=self._done or (dead and self._lives == "reset"),
            is_terminal=dead or over,
        )
-  def _screen(self, array):
+    def reset(self):
-    self._ale.getScreenRGB2(array)
+        self._env.reset()
        if self._noops:
            for _ in range(self._random.randint(self._noops)):
                _, _, dead, _ = self._env.step(0)
                if dead:
                    self._env.reset()
        self._last_lives = self._ale.lives()
        self._screen(self._buffer[0])
        self._buffer[1].fill(0)
-  def close(self):
+        self._done = False
-    return self._env.close()
+        self._step = 0
        obs, reward, is_terminal, _ = self._obs(0.0, is_first=True)
        return obs
    def _obs(self, reward, is_first=False, is_last=False, is_terminal=False):
        np.maximum(self._buffer[0], self._buffer[1], out=self._buffer[0])
        image = self._buffer[0]
        if image.shape[:2] != self._size:
            if self._resize == "opencv":
                image = self._cv2.resize(
                    image, self._size, interpolation=self._cv2.INTER_AREA
                )
            if self._resize == "pillow":
                image = self._image.fromarray(image)
                image = image.resize(self._size, self._image.NEAREST)
                image = np.array(image)
        if self._gray:
            weights = [0.299, 0.587, 1 - (0.299 + 0.587)]
            image = np.tensordot(image, weights, (-1, 0)).astype(image.dtype)
            image = image[:, :, None]
        return {"image": image, "is_terminal": is_terminal}, reward, is_last, {}
    def _screen(self, array):
        """Pass ``array`` to the emulator's ``getScreenRGB2`` call.

        NOTE(review): presumably this fills ``array`` in place with the
        current RGB frame; callers ignore the return value. Confirm against
        the ALE API.
        """
        self._ale.getScreenRGB2(array)
    def close(self):
        """Close the wrapped environment and return the result of its ``close()``."""
        return self._env.close()
--- a/envs/dmc.py
+++ b/envs/dmc.py
@ -3,62 +3,60 @@ import numpy as np
 class DeepMindControl:
    def __init__(self, name, action_repeat=1, size=(64, 64), camera=None):
        domain, task = name.split("_", 1)
        if domain == "cup":  # Only domain with multiple words.
            domain = "ball_in_cup"
        if isinstance(domain, str):
            from dm_control import suite
-  def __init__(self, name, action_repeat=1, size=(64, 64), camera=None):
+            self._env = suite.load(domain, task)
-    domain, task = name.split('_', 1)
+        else:
-    if domain == 'cup':  # Only domain with multiple words.
+            assert task is None
-      domain = 'ball_in_cup'
+            self._env = domain()
-    if isinstance(domain, str):
+        self._action_repeat = action_repeat
-      from dm_control import suite
+        self._size = size
-      self._env = suite.load(domain, task)
+        if camera is None:
-    else:
+            camera = dict(quadruped=2).get(domain, 0)
-      assert task is None
+        self._camera = camera
      self._env = domain()
    self._action_repeat = action_repeat
    self._size = size
    if camera is None:
      camera = dict(quadruped=2).get(domain, 0)
    self._camera = camera
-  @property
+    @property
-  def observation_space(self):
+    def observation_space(self):
-    spaces = {}
+        spaces = {}
-    for key, value in self._env.observation_spec().items():
+        for key, value in self._env.observation_spec().items():
-      spaces[key] = gym.spaces.Box(
+            spaces[key] = gym.spaces.Box(-np.inf, np.inf, value.shape, dtype=np.float32)
-          -np.inf, np.inf, value.shape, dtype=np.float32)
+        spaces["image"] = gym.spaces.Box(0, 255, self._size + (3,), dtype=np.uint8)
-    spaces['image'] = gym.spaces.Box(
+        return gym.spaces.Dict(spaces)
        0, 255, self._size + (3,), dtype=np.uint8)
    return gym.spaces.Dict(spaces)
-  @property
+    @property
-  def action_space(self):
+    def action_space(self):
-    spec = self._env.action_spec()
+        spec = self._env.action_spec()
-    return gym.spaces.Box(spec.minimum, spec.maximum, dtype=np.float32)
+        return gym.spaces.Box(spec.minimum, spec.maximum, dtype=np.float32)
-  def step(self, action):
+    def step(self, action):
-    assert np.isfinite(action).all(), action
+        assert np.isfinite(action).all(), action
-    reward = 0
+        reward = 0
-    for _ in range(self._action_repeat):
+        for _ in range(self._action_repeat):
-      time_step = self._env.step(action)
+            time_step = self._env.step(action)
-      reward += time_step.reward or 0
+            reward += time_step.reward or 0
-      if time_step.last():
+            if time_step.last():
-        break
+                break
-    obs = dict(time_step.observation)
+        obs = dict(time_step.observation)
-    obs['image'] = self.render()
+        obs["image"] = self.render()
-    # There is no terminal state in DMC
+        # There is no terminal state in DMC
-    obs['is_terminal'] = False
+        obs["is_terminal"] = False
-    done = time_step.last()
+        done = time_step.last()
-    info = {'discount': np.array(time_step.discount, np.float32)}
+        info = {"discount": np.array(time_step.discount, np.float32)}
-    return obs, reward, done, info
+        return obs, reward, done, info
-  def reset(self):
+    def reset(self):
-    time_step = self._env.reset()
+        time_step = self._env.reset()
-    obs = dict(time_step.observation)
+        obs = dict(time_step.observation)
-    obs['image'] = self.render()
+        obs["image"] = self.render()
-    obs['is_terminal'] = False
+        obs["is_terminal"] = False
-    return obs
+        return obs
-  def render(self, *args, **kwargs):
+    def render(self, *args, **kwargs):
-    if kwargs.get('mode', 'rgb_array') != 'rgb_array':
+        if kwargs.get("mode", "rgb_array") != "rgb_array":
-      raise ValueError("Only render mode 'rgb_array' is supported.")
+            raise ValueError("Only render mode 'rgb_array' is supported.")
-    return self._env.physics.render(*self._size, camera_id=self._camera)
+        return self._env.physics.render(*self._size, camera_id=self._camera)
--- a/envs/dmlab.py
+++ b/envs/dmlab.py
@ -4,98 +4,105 @@ import deepmind_lab
 class DeepMindLabyrinth(object):
    # Default value of the ``action_set`` constructor argument: nine raw
    # actions, each a 7-component integer tuple. ``step`` picks one row by
    # index and converts it to an ``np.intc`` array before passing it to the
    # underlying environment.
    # NOTE(review): the meaning of each of the seven components is not shown
    # in this file; the per-row labels below are the only description.
    ACTION_SET_DEFAULT = (
        (0, 0, 0, 1, 0, 0, 0),  # Forward
        (0, 0, 0, -1, 0, 0, 0),  # Backward
        (0, 0, -1, 0, 0, 0, 0),  # Strafe Left
        (0, 0, 1, 0, 0, 0, 0),  # Strafe Right
        (-20, 0, 0, 0, 0, 0, 0),  # Look Left
        (20, 0, 0, 0, 0, 0, 0),  # Look Right
        (-20, 0, 0, 1, 0, 0, 0),  # Look Left + Forward
        (20, 0, 0, 1, 0, 0, 0),  # Look Right + Forward
        (0, 0, 0, 0, 1, 0, 0),  # Fire
    )
-  ACTION_SET_DEFAULT = (
+    ACTION_SET_MEDIUM = (
-      (0, 0, 0, 1, 0, 0, 0),    # Forward
+        (0, 0, 0, 1, 0, 0, 0),  # Forward
-      (0, 0, 0, -1, 0, 0, 0),   # Backward
+        (0, 0, 0, -1, 0, 0, 0),  # Backward
-      (0, 0, -1, 0, 0, 0, 0),   # Strafe Left
+        (0, 0, -1, 0, 0, 0, 0),  # Strafe Left
-      (0, 0, 1, 0, 0, 0, 0),    # Strafe Right
+        (0, 0, 1, 0, 0, 0, 0),  # Strafe Right
-      (-20, 0, 0, 0, 0, 0, 0),  # Look Left
+        (-20, 0, 0, 0, 0, 0, 0),  # Look Left
-      (20, 0, 0, 0, 0, 0, 0),   # Look Right
+        (20, 0, 0, 0, 0, 0, 0),  # Look Right
-      (-20, 0, 0, 1, 0, 0, 0),  # Look Left + Forward
+        (0, 0, 0, 0, 0, 0, 0),  # Idle.
-      (20, 0, 0, 1, 0, 0, 0),   # Look Right + Forward
+    )
      (0, 0, 0, 0, 1, 0, 0),    # Fire
  )
-  ACTION_SET_MEDIUM = (
+    ACTION_SET_SMALL = (
-      (0, 0, 0, 1, 0, 0, 0),    # Forward
+        (0, 0, 0, 1, 0, 0, 0),  # Forward
-      (0, 0, 0, -1, 0, 0, 0),   # Backward
+        (-20, 0, 0, 0, 0, 0, 0),  # Look Left
-      (0, 0, -1, 0, 0, 0, 0),   # Strafe Left
+        (20, 0, 0, 0, 0, 0, 0),  # Look Right
-      (0, 0, 1, 0, 0, 0, 0),    # Strafe Right
+    )
      (-20, 0, 0, 0, 0, 0, 0),  # Look Left
      (20, 0, 0, 0, 0, 0, 0),   # Look Right
      (0, 0, 0, 0, 0, 0, 0),    # Idle.
  )
-  ACTION_SET_SMALL = (
+    def __init__(
-      (0, 0, 0, 1, 0, 0, 0),    # Forward
+        self,
-      (-20, 0, 0, 0, 0, 0, 0),  # Look Left
+        level,
-      (20, 0, 0, 0, 0, 0, 0),   # Look Right
+        mode,
-  )
+        action_repeat=4,
        render_size=(64, 64),
        action_set=ACTION_SET_DEFAULT,
        level_cache=None,
        seed=None,
        runfiles_path=None,
    ):
        assert mode in ("train", "test")
        if runfiles_path:
            print("Setting DMLab runfiles path:", runfiles_path)
            deepmind_lab.set_runfiles_path(runfiles_path)
        self._config = {}
        self._config["width"] = render_size[0]
        self._config["height"] = render_size[1]
        self._config["logLevel"] = "WARN"
        if mode == "test":
            self._config["allowHoldOutLevels"] = "true"
            self._config["mixerSeed"] = 0x600D5EED
        self._action_repeat = action_repeat
        self._random = np.random.RandomState(seed)
        self._env = deepmind_lab.Lab(
            level="contributed/dmlab30/" + level,
            observations=["RGB_INTERLEAVED"],
            config={k: str(v) for k, v in self._config.items()},
            level_cache=level_cache,
        )
        self._action_set = action_set
        self._last_image = None
        self._done = True
-  def __init__(
+    @property
-      self, level, mode, action_repeat=4, render_size=(64, 64),
+    def observation_space(self):
-      action_set=ACTION_SET_DEFAULT, level_cache=None, seed=None,
+        shape = (self._config["height"], self._config["width"], 3)
-      runfiles_path=None):
+        space = gym.spaces.Box(low=0, high=255, shape=shape, dtype=np.uint8)
-    assert mode in ('train', 'test')
+        return gym.spaces.Dict({"image": space})
    if runfiles_path:
      print('Setting DMLab runfiles path:', runfiles_path)
      deepmind_lab.set_runfiles_path(runfiles_path)
    self._config = {}
    self._config['width'] = render_size[0]
    self._config['height'] = render_size[1]
    self._config['logLevel'] = 'WARN'
    if mode == 'test':
      self._config['allowHoldOutLevels'] = 'true'
      self._config['mixerSeed'] = 0x600D5EED
    self._action_repeat = action_repeat
    self._random = np.random.RandomState(seed)
    self._env = deepmind_lab.Lab(
        level='contributed/dmlab30/'+level,
        observations=['RGB_INTERLEAVED'],
        config={k: str(v) for k, v in self._config.items()},
        level_cache=level_cache)
    self._action_set = action_set
    self._last_image = None
    self._done = True
-  @property
+    @property
-  def observation_space(self):
+    def action_space(self):
-    shape = (self._config['height'], self._config['width'], 3)
+        return gym.spaces.Discrete(len(self._action_set))
    space = gym.spaces.Box(low=0, high=255, shape=shape, dtype=np.uint8)
    return gym.spaces.Dict({'image': space})
-  @property
+    def reset(self):
-  def action_space(self):
+        self._done = False
-    return gym.spaces.Discrete(len(self._action_set))
+        self._env.reset(seed=self._random.randint(0, 2**31 - 1))
        obs = self._get_obs()
        return obs
-  def reset(self):
+    def step(self, action):
-    self._done = False
+        raw_action = np.array(self._action_set[action], np.intc)
-    self._env.reset(seed=self._random.randint(0, 2 ** 31 - 1))
+        reward = self._env.step(raw_action, num_steps=self._action_repeat)
-    obs = self._get_obs()
+        self._done = not self._env.is_running()
-    return obs
+        obs = self._get_obs()
        return obs, reward, self._done, {}
-  def step(self, action):
+    def render(self, *args, **kwargs):
-    raw_action = np.array(self._action_set[action], np.intc)
+        if kwargs.get("mode", "rgb_array") != "rgb_array":
-    reward = self._env.step(raw_action, num_steps=self._action_repeat)
+            raise ValueError("Only render mode 'rgb_array' is supported.")
-    self._done = not self._env.is_running()
+        del args  # Unused
-    obs = self._get_obs()
+        del kwargs  # Unused
-    return obs, reward, self._done, {}
+        return self._last_image
-  def render(self, *args, **kwargs):
+    def close(self):
-    if kwargs.get('mode', 'rgb_array') != 'rgb_array':
+        self._env.close()
      raise ValueError("Only render mode 'rgb_array' is supported.")
    del args  # Unused
    del kwargs  # Unused
    return self._last_image
-  def close(self):
+    def _get_obs(self):
-    self._env.close()
+        if self._done:
-
+            image = 0 * self._last_image
-  def _get_obs(self):
+        else:
-    if self._done:
+            image = self._env.observations()["RGB_INTERLEAVED"]
-      image = 0 * self._last_image
+        self._last_image = image
-    else:
+        return {"image": image}
      image = self._env.observations()['RGB_INTERLEAVED']
    self._last_image = image
    return {'image': image}
--- a/envs/wrappers.py
+++ b/envs/wrappers.py
@ -3,186 +3,180 @@ import numpy as np
 class CollectDataset:
    def __init__(self, env, callbacks=None, precision=32):
        self._env = env
        self._callbacks = callbacks or ()
        self._precision = precision
        self._episode = None
-  def __init__(self, env, callbacks=None, precision=32):
+    def __getattr__(self, name):
-    self._env = env
+        return getattr(self._env, name)
    self._callbacks = callbacks or ()
    self._precision = precision
    self._episode = None
-  def __getattr__(self, name):
+    def step(self, action):
-    return getattr(self._env, name)
+        obs, reward, done, info = self._env.step(action)
        obs = {k: self._convert(v) for k, v in obs.items()}
        transition = obs.copy()
        if isinstance(action, dict):
            transition.update(action)
        else:
            transition["action"] = action
        transition["reward"] = reward
        transition["discount"] = info.get("discount", np.array(1 - float(done)))
        self._episode.append(transition)
        if done:
            for key, value in self._episode[1].items():
                if key not in self._episode[0]:
                    self._episode[0][key] = 0 * value
            episode = {k: [t[k] for t in self._episode] for k in self._episode[0]}
            episode = {k: self._convert(v) for k, v in episode.items()}
            info["episode"] = episode
            for callback in self._callbacks:
                callback(episode)
        return obs, reward, done, info
-  def step(self, action):
+    def reset(self):
-    obs, reward, done, info = self._env.step(action)
+        obs = self._env.reset()
-    obs = {k: self._convert(v) for k, v in obs.items()}
+        transition = obs.copy()
-    transition = obs.copy()
+        # Missing keys will be filled with a zeroed out version of the first
-    if isinstance(action, dict):
+        # transition, because we do not know what action information the agent will
-      transition.update(action)
+        # pass yet.
-    else:
+        transition["reward"] = 0.0
-      transition['action'] = action
+        transition["discount"] = 1.0
-    transition['reward'] = reward
+        self._episode = [transition]
-    transition['discount'] = info.get('discount', np.array(1 - float(done)))
+        return obs
    self._episode.append(transition)
    if done:
      for key, value in self._episode[1].items():
        if key not in self._episode[0]:
          self._episode[0][key] = 0 * value
      episode = {k: [t[k] for t in self._episode] for k in self._episode[0]}
      episode = {k: self._convert(v) for k, v in episode.items()}
      info['episode'] = episode
      for callback in self._callbacks:
        callback(episode)
    return obs, reward, done, info
-  def reset(self):
+    def _convert(self, value):
-    obs = self._env.reset()
+        value = np.array(value)
-    transition = obs.copy()
+        if np.issubdtype(value.dtype, np.floating):
-    # Missing keys will be filled with a zeroed out version of the first
+            dtype = {16: np.float16, 32: np.float32, 64: np.float64}[self._precision]
-    # transition, because we do not know what action information the agent will
+        elif np.issubdtype(value.dtype, np.signedinteger):
-    # pass yet.
+            dtype = {16: np.int16, 32: np.int32, 64: np.int64}[self._precision]
-    transition['reward'] = 0.0
+        elif np.issubdtype(value.dtype, np.uint8):
-    transition['discount'] = 1.0
+            dtype = np.uint8
-    self._episode = [transition]
+        elif np.issubdtype(value.dtype, np.bool):
-    return obs
+            dtype = np.bool
-
+        else:
-  def _convert(self, value):
+            raise NotImplementedError(value.dtype)
-    value = np.array(value)
+        return value.astype(dtype)
    if np.issubdtype(value.dtype, np.floating):
      dtype = {16: np.float16, 32: np.float32, 64: np.float64}[self._precision]
    elif np.issubdtype(value.dtype, np.signedinteger):
      dtype = {16: np.int16, 32: np.int32, 64: np.int64}[self._precision]
    elif np.issubdtype(value.dtype, np.uint8):
      dtype = np.uint8
    elif np.issubdtype(value.dtype, np.bool):
      dtype = np.bool
    else:
      raise NotImplementedError(value.dtype)
    return value.astype(dtype)
 class TimeLimit:
    def __init__(self, env, duration):
        self._env = env
        self._duration = duration
        self._step = None
-  def __init__(self, env, duration):
+    def __getattr__(self, name):
-    self._env = env
+        return getattr(self._env, name)
    self._duration = duration
    self._step = None
-  def __getattr__(self, name):
+    def step(self, action):
-    return getattr(self._env, name)
+        assert self._step is not None, "Must reset environment."
        obs, reward, done, info = self._env.step(action)
        self._step += 1
        if self._step >= self._duration:
            done = True
            if "discount" not in info:
                info["discount"] = np.array(1.0).astype(np.float32)
            self._step = None
        return obs, reward, done, info
-  def step(self, action):
+    def reset(self):
-    assert self._step is not None, 'Must reset environment.'
+        self._step = 0
-    obs, reward, done, info = self._env.step(action)
+        return self._env.reset()
    self._step += 1
    if self._step >= self._duration:
      done = True
      if 'discount' not in info:
        info['discount'] = np.array(1.0).astype(np.float32)
      self._step = None
    return obs, reward, done, info
  def reset(self):
    self._step = 0
    return self._env.reset()
 class NormalizeActions:
    def __init__(self, env):
        self._env = env
        self._mask = np.logical_and(
            np.isfinite(env.action_space.low), np.isfinite(env.action_space.high)
        )
        self._low = np.where(self._mask, env.action_space.low, -1)
        self._high = np.where(self._mask, env.action_space.high, 1)
-  def __init__(self, env):
+    def __getattr__(self, name):
-    self._env = env
+        return getattr(self._env, name)
    self._mask = np.logical_and(
        np.isfinite(env.action_space.low),
        np.isfinite(env.action_space.high))
    self._low = np.where(self._mask, env.action_space.low, -1)
    self._high = np.where(self._mask, env.action_space.high, 1)
-  def __getattr__(self, name):
+    @property
-    return getattr(self._env, name)
+    def action_space(self):
        low = np.where(self._mask, -np.ones_like(self._low), self._low)
        high = np.where(self._mask, np.ones_like(self._low), self._high)
        return gym.spaces.Box(low, high, dtype=np.float32)
-  @property
+    def step(self, action):
-  def action_space(self):
+        original = (action + 1) / 2 * (self._high - self._low) + self._low
-    low = np.where(self._mask, -np.ones_like(self._low), self._low)
+        original = np.where(self._mask, original, action)
-    high = np.where(self._mask, np.ones_like(self._low), self._high)
+        return self._env.step(original)
    return gym.spaces.Box(low, high, dtype=np.float32)
  def step(self, action):
    original = (action + 1) / 2 * (self._high - self._low) + self._low
    original = np.where(self._mask, original, action)
    return self._env.step(original)
 class OneHotAction:
    def __init__(self, env):
        """Wrap an environment whose action space is ``gym.spaces.Discrete``.

        Args:
            env: Environment to delegate to; its ``action_space`` is asserted
                to be a ``gym.spaces.Discrete`` instance.
        """
        assert isinstance(env.action_space, gym.spaces.Discrete)
        # Unseeded generator kept on the instance.
        self._random = np.random.RandomState()
        self._env = env
-  def __init__(self, env):
+    def __getattr__(self, name):
-    assert isinstance(env.action_space, gym.spaces.Discrete)
+        return getattr(self._env, name)
    self._env = env
    self._random = np.random.RandomState()
-  def __getattr__(self, name):
+    @property
-    return getattr(self._env, name)
+    def action_space(self):
        shape = (self._env.action_space.n,)
        space = gym.spaces.Box(low=0, high=1, shape=shape, dtype=np.float32)
        space.sample = self._sample_action
        space.discrete = True
        return space
-  @property
+    def step(self, action):
-  def action_space(self):
+        index = np.argmax(action).astype(int)
-    shape = (self._env.action_space.n,)
+        reference = np.zeros_like(action)
-    space = gym.spaces.Box(low=0, high=1, shape=shape, dtype=np.float32)
+        reference[index] = 1
-    space.sample = self._sample_action
+        if not np.allclose(reference, action):
-    space.discrete = True
+            raise ValueError(f"Invalid one-hot action:\n{action}")
-    return space
+        return self._env.step(index)
-  def step(self, action):
+    def reset(self):
-    index = np.argmax(action).astype(int)
+        return self._env.reset()
    reference = np.zeros_like(action)
    reference[index] = 1
    if not np.allclose(reference, action):
      raise ValueError(f'Invalid one-hot action:\n{action}')
    return self._env.step(index)
-  def reset(self):
+    def _sample_action(self):
-    return self._env.reset()
+        actions = self._env.action_space.n
-
+        index = self._random.randint(0, actions)
-  def _sample_action(self):
+        reference = np.zeros(actions, dtype=np.float32)
-    actions = self._env.action_space.n
+        reference[index] = 1.0
-    index = self._random.randint(0, actions)
+        return reference
    reference = np.zeros(actions, dtype=np.float32)
    reference[index] = 1.0
    return reference
 class RewardObs:
    def __init__(self, env):
        """Store the environment to wrap.

        Args:
            env: Environment kept as ``self._env``.
        """
        self._env = env
-  def __init__(self, env):
+    def __getattr__(self, name):
-    self._env = env
+        return getattr(self._env, name)
-  def __getattr__(self, name):
+    @property
-    return getattr(self._env, name)
+    def observation_space(self):
        spaces = self._env.observation_space.spaces
        assert "reward" not in spaces
        spaces["reward"] = gym.spaces.Box(-np.inf, np.inf, dtype=np.float32)
        return gym.spaces.Dict(spaces)
-  @property
+    def step(self, action):
-  def observation_space(self):
+        obs, reward, done, info = self._env.step(action)
-    spaces = self._env.observation_space.spaces
+        obs["reward"] = reward
-    assert 'reward' not in spaces
+        return obs, reward, done, info
    spaces['reward'] = gym.spaces.Box(-np.inf, np.inf, dtype=np.float32)
    return gym.spaces.Dict(spaces)
-  def step(self, action):
+    def reset(self):
-    obs, reward, done, info = self._env.step(action)
+        obs = self._env.reset()
-    obs['reward'] = reward
+        obs["reward"] = 0.0
-    return obs, reward, done, info
+        return obs
  def reset(self):
    obs = self._env.reset()
    obs['reward'] = 0.0
    return obs
 class SelectAction:
    def __init__(self, env, key):
        self._env = env
        self._key = key
-  def __init__(self, env, key):
+    def __getattr__(self, name):
-    self._env = env
+        return getattr(self._env, name)
    self._key = key
-  def __getattr__(self, name):
+    def step(self, action):
-    return getattr(self._env, name)
+        return self._env.step(action[self._key])
  def step(self, action):
    return self._env.step(action[self._key])