Tianshou/test/offline/test_discrete_crr.py

import argparse
import os
import pickle
import pprint

import gym
import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter

from tianshou.data import Collector, VectorReplayBuffer
from tianshou.env import DummyVectorEnv
from tianshou.policy import DiscreteCRRPolicy
from tianshou.trainer import offline_trainer
from tianshou.utils import TensorboardLogger
from tianshou.utils.net.common import ActorCritic, Net
from tianshou.utils.net.discrete import Actor, Critic

# The data-gathering helpers live next to this file. When the file is executed
# directly they are imported as a plain module; otherwise (e.g. collected by
# pytest) the package-qualified path is used.
if __name__ == "__main__":
    from gather_cartpole_data import expert_file_name, gather_data
else:  # pytest
    from test.offline.gather_cartpole_data import expert_file_name, gather_data


def get_args():
    """Build the option parser for the discrete CRR test and parse ``sys.argv``.

    Unrecognised command-line arguments are ignored rather than rejected,
    because only the known part of ``parse_known_args`` is returned.

    :return: an ``argparse.Namespace`` holding the parsed options.
    """
    default_device = "cuda" if torch.cuda.is_available() else "cpu"

    # (flag, add_argument keyword arguments), registered in this order.
    option_table = (
        ("--task", {"type": str, "default": "CartPole-v0"}),
        ("--reward-threshold", {"type": float, "default": None}),
        ("--seed", {"type": int, "default": 1626}),
        ("--lr", {"type": float, "default": 7e-4}),
        ("--gamma", {"type": float, "default": 0.99}),
        ("--n-step", {"type": int, "default": 3}),
        ("--target-update-freq", {"type": int, "default": 320}),
        ("--epoch", {"type": int, "default": 5}),
        ("--update-per-epoch", {"type": int, "default": 1000}),
        ("--batch-size", {"type": int, "default": 64}),
        ("--hidden-sizes", {"type": int, "nargs": "*", "default": [64, 64]}),
        ("--test-num", {"type": int, "default": 100}),
        ("--logdir", {"type": str, "default": "log"}),
        ("--render", {"type": float, "default": 0.}),
        ("--load-buffer-name", {"type": str, "default": expert_file_name()}),
        ("--device", {"type": str, "default": default_device}),
    )

    cli = argparse.ArgumentParser()
    for flag, options in option_table:
        cli.add_argument(flag, **options)

    known, _unknown = cli.parse_known_args()
    return known


def test_discrete_crr(args=get_args()):
    """Train a discrete CRR policy offline and check the reward it reaches.

    Builds the actor and critic networks, loads a replay buffer from
    ``args.load_buffer_name`` (or calls ``gather_data()`` when that path is not
    an existing file), runs ``offline_trainer`` and asserts that the best test
    reward reaches ``args.reward_threshold``.

    :param args: option namespace as produced by ``get_args()``. It is mutated
        in place: ``state_shape`` and ``action_shape`` are set, and
        ``reward_threshold`` is filled in when it is ``None``.
    :raises AssertionError: if the best reward is below the threshold.
    """
    # envs
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    if args.reward_threshold is None:
        # Per-task override; other tasks fall back to the env spec's threshold.
        default_reward_threshold = {"CartPole-v0": 180}
        args.reward_threshold = default_reward_threshold.get(
            args.task, env.spec.reward_threshold
        )
    test_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)]
    )
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    test_envs.seed(args.seed)
    # model
    # The same ``net`` instance is handed to both the actor and the critic.
    net = Net(args.state_shape, args.hidden_sizes[0], device=args.device)
    actor = Actor(
        net,
        args.action_shape,
        hidden_sizes=args.hidden_sizes,
        device=args.device,
        softmax_output=False
    )
    critic = Critic(
        net,
        hidden_sizes=args.hidden_sizes,
        last_size=np.prod(args.action_shape),
        device=args.device
    )
    # A single optimizer is built from the combined actor/critic module.
    actor_critic = ActorCritic(actor, critic)
    optim = torch.optim.Adam(actor_critic.parameters(), lr=args.lr)

    policy = DiscreteCRRPolicy(
        actor,
        critic,
        optim,
        args.gamma,
        target_update_freq=args.target_update_freq,
    ).to(args.device)
    # buffer
    # os.path.isfile() is already False for a missing path, so a separate
    # os.path.exists() check is unnecessary.
    if os.path.isfile(args.load_buffer_name):
        if args.load_buffer_name.endswith(".hdf5"):
            buffer = VectorReplayBuffer.load_hdf5(args.load_buffer_name)
        else:
            # NOTE: unpickling can execute arbitrary code; only point
            # --load-buffer-name at trusted files.
            # The context manager closes the file handle once loading is done.
            with open(args.load_buffer_name, "rb") as buffer_file:
                buffer = pickle.load(buffer_file)
    else:
        buffer = gather_data()

    # collector
    test_collector = Collector(policy, test_envs, exploration_noise=True)

    log_path = os.path.join(args.logdir, args.task, 'discrete_crr')
    writer = SummaryWriter(log_path)
    logger = TensorboardLogger(writer)

    def save_best_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(mean_rewards):
        return mean_rewards >= args.reward_threshold

    result = offline_trainer(
        policy,
        buffer,
        test_collector,
        args.epoch,
        args.update_per_epoch,
        args.test_num,
        args.batch_size,
        stop_fn=stop_fn,
        save_best_fn=save_best_fn,
        logger=logger
    )

    assert stop_fn(result['best_reward'])

    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        policy.eval()
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        rews, lens = result["rews"], result["lens"]
        print(f"Final reward: {rews.mean()}, length: {lens.mean()}")


# Script entry point: parse the command line and run the test directly.
if __name__ == "__main__":
    test_discrete_crr(get_args())
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`import argparse`
Add discrete Critic Regularized Regression (#367) 2021-05-18 22:29:56 -07:00			`import os`
			`import pickle`
			`import pprint`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00
			`import gym`
Add discrete Critic Regularized Regression (#367) 2021-05-18 22:29:56 -07:00			`import numpy as np`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`import torch`
Add discrete Critic Regularized Regression (#367) 2021-05-18 22:29:56 -07:00			`from torch.utils.tensorboard import SummaryWriter`

make unit test faster (#522) * test cache expert data in offline training * faster cql test * faster tests * use dummy * test ray dependency 2022-02-08 11:24:52 -05:00			`from tianshou.data import Collector, VectorReplayBuffer`
Add discrete Critic Regularized Regression (#367) 2021-05-18 22:29:56 -07:00			`from tianshou.env import DummyVectorEnv`
			`from tianshou.policy import DiscreteCRRPolicy`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`from tianshou.trainer import offline_trainer`
			`from tianshou.utils import TensorboardLogger`
Fix critic network for Discrete CRR (#485) - Fixes an inconsistency in the implementation of Discrete CRR. Now it uses `Critic` class for its critic, following conventions in other actor-critic policies; - Updates several offline policies to use `ActorCritic` class for its optimizer to eliminate randomness caused by parameter sharing between actor and critic; - Add `writer.flush()` in TensorboardLogger to ensure real-time result; - Enable `test_collector=None` in 3 trainers to turn off testing during training; - Updates the Atari offline results in README.md; - Moves Atari offline RL examples to `examples/offline`; tests to `test/offline` per review comments. 2021-11-28 07:10:28 -08:00			`from tianshou.utils.net.common import ActorCritic, Net`
			`from tianshou.utils.net.discrete import Actor, Critic`

			`if __name__ == "__main__":`
make unit test faster (#522) * test cache expert data in offline training * faster cql test * faster tests * use dummy * test ray dependency 2022-02-08 11:24:52 -05:00			`from gather_cartpole_data import expert_file_name, gather_data`
Fix critic network for Discrete CRR (#485) - Fixes an inconsistency in the implementation of Discrete CRR. Now it uses `Critic` class for its critic, following conventions in other actor-critic policies; - Updates several offline policies to use `ActorCritic` class for its optimizer to eliminate randomness caused by parameter sharing between actor and critic; - Add `writer.flush()` in TensorboardLogger to ensure real-time result; - Enable `test_collector=None` in 3 trainers to turn off testing during training; - Updates the Atari offline results in README.md; - Moves Atari offline RL examples to `examples/offline`; tests to `test/offline` per review comments. 2021-11-28 07:10:28 -08:00			`else: # pytest`
make unit test faster (#522) * test cache expert data in offline training * faster cql test * faster tests * use dummy * test ray dependency 2022-02-08 11:24:52 -05:00			`from test.offline.gather_cartpole_data import expert_file_name, gather_data`
Add discrete Critic Regularized Regression (#367) 2021-05-18 22:29:56 -07:00

			`def get_args():`
			`parser = argparse.ArgumentParser()`
			`parser.add_argument("--task", type=str, default="CartPole-v0")`
Fixed hardcoded reward_treshold (#548) 2022-03-04 03:35:39 +01:00			`parser.add_argument("--reward-threshold", type=float, default=None)`
Add discrete Critic Regularized Regression (#367) 2021-05-18 22:29:56 -07:00			`parser.add_argument("--seed", type=int, default=1626)`
			`parser.add_argument("--lr", type=float, default=7e-4)`
			`parser.add_argument("--gamma", type=float, default=0.99)`
			`parser.add_argument("--n-step", type=int, default=3)`
			`parser.add_argument("--target-update-freq", type=int, default=320)`
			`parser.add_argument("--epoch", type=int, default=5)`
			`parser.add_argument("--update-per-epoch", type=int, default=1000)`
			`parser.add_argument("--batch-size", type=int, default=64)`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[64, 64])`
Add discrete Critic Regularized Regression (#367) 2021-05-18 22:29:56 -07:00			`parser.add_argument("--test-num", type=int, default=100)`
			`parser.add_argument("--logdir", type=str, default="log")`
			`parser.add_argument("--render", type=float, default=0.)`
make unit test faster (#522) * test cache expert data in offline training * faster cql test * faster tests * use dummy * test ray dependency 2022-02-08 11:24:52 -05:00			`parser.add_argument("--load-buffer-name", type=str, default=expert_file_name())`
Add discrete Critic Regularized Regression (#367) 2021-05-18 22:29:56 -07:00			`parser.add_argument(`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`"--device",`
			`type=str,`
Add discrete Critic Regularized Regression (#367) 2021-05-18 22:29:56 -07:00			`default="cuda" if torch.cuda.is_available() else "cpu",`
			`)`
			`args = parser.parse_known_args()[0]`
			`return args`


			`def test_discrete_crr(args=get_args()):`
			`# envs`
			`env = gym.make(args.task)`
			`args.state_shape = env.observation_space.shape or env.observation_space.n`
			`args.action_shape = env.action_space.shape or env.action_space.n`
Fixed hardcoded reward_treshold (#548) 2022-03-04 03:35:39 +01:00			`if args.reward_threshold is None:`
			`default_reward_threshold = {"CartPole-v0": 180}`
			`args.reward_threshold = default_reward_threshold.get(`
			`args.task, env.spec.reward_threshold`
			`)`
Add discrete Critic Regularized Regression (#367) 2021-05-18 22:29:56 -07:00			`test_envs = DummyVectorEnv(`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`[lambda: gym.make(args.task) for _ in range(args.test_num)]`
			`)`
Add discrete Critic Regularized Regression (#367) 2021-05-18 22:29:56 -07:00			`# seed`
			`np.random.seed(args.seed)`
			`torch.manual_seed(args.seed)`
			`test_envs.seed(args.seed)`
			`# model`
Fix critic network for Discrete CRR (#485) - Fixes an inconsistency in the implementation of Discrete CRR. Now it uses `Critic` class for its critic, following conventions in other actor-critic policies; - Updates several offline policies to use `ActorCritic` class for its optimizer to eliminate randomness caused by parameter sharing between actor and critic; - Add `writer.flush()` in TensorboardLogger to ensure real-time result; - Enable `test_collector=None` in 3 trainers to turn off testing during training; - Updates the Atari offline results in README.md; - Moves Atari offline RL examples to `examples/offline`; tests to `test/offline` per review comments. 2021-11-28 07:10:28 -08:00			`net = Net(args.state_shape, args.hidden_sizes[0], device=args.device)`
			`actor = Actor(`
			`net,`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`args.action_shape,`
			`hidden_sizes=args.hidden_sizes,`
			`device=args.device,`
Fix critic network for Discrete CRR (#485) - Fixes an inconsistency in the implementation of Discrete CRR. Now it uses `Critic` class for its critic, following conventions in other actor-critic policies; - Updates several offline policies to use `ActorCritic` class for its optimizer to eliminate randomness caused by parameter sharing between actor and critic; - Add `writer.flush()` in TensorboardLogger to ensure real-time result; - Enable `test_collector=None` in 3 trainers to turn off testing during training; - Updates the Atari offline results in README.md; - Moves Atari offline RL examples to `examples/offline`; tests to `test/offline` per review comments. 2021-11-28 07:10:28 -08:00			`softmax_output=False`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`)`
Fix critic network for Discrete CRR (#485) - Fixes an inconsistency in the implementation of Discrete CRR. Now it uses `Critic` class for its critic, following conventions in other actor-critic policies; - Updates several offline policies to use `ActorCritic` class for its optimizer to eliminate randomness caused by parameter sharing between actor and critic; - Add `writer.flush()` in TensorboardLogger to ensure real-time result; - Enable `test_collector=None` in 3 trainers to turn off testing during training; - Updates the Atari offline results in README.md; - Moves Atari offline RL examples to `examples/offline`; tests to `test/offline` per review comments. 2021-11-28 07:10:28 -08:00			`critic = Critic(`
			`net,`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`hidden_sizes=args.hidden_sizes,`
Fix critic network for Discrete CRR (#485) - Fixes an inconsistency in the implementation of Discrete CRR. Now it uses `Critic` class for its critic, following conventions in other actor-critic policies; - Updates several offline policies to use `ActorCritic` class for its optimizer to eliminate randomness caused by parameter sharing between actor and critic; - Add `writer.flush()` in TensorboardLogger to ensure real-time result; - Enable `test_collector=None` in 3 trainers to turn off testing during training; - Updates the Atari offline results in README.md; - Moves Atari offline RL examples to `examples/offline`; tests to `test/offline` per review comments. 2021-11-28 07:10:28 -08:00			`last_size=np.prod(args.action_shape),`
			`device=args.device`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`)`
Fix critic network for Discrete CRR (#485) - Fixes an inconsistency in the implementation of Discrete CRR. Now it uses `Critic` class for its critic, following conventions in other actor-critic policies; - Updates several offline policies to use `ActorCritic` class for its optimizer to eliminate randomness caused by parameter sharing between actor and critic; - Add `writer.flush()` in TensorboardLogger to ensure real-time result; - Enable `test_collector=None` in 3 trainers to turn off testing during training; - Updates the Atari offline results in README.md; - Moves Atari offline RL examples to `examples/offline`; tests to `test/offline` per review comments. 2021-11-28 07:10:28 -08:00			`actor_critic = ActorCritic(actor, critic)`
			`optim = torch.optim.Adam(actor_critic.parameters(), lr=args.lr)`
Add discrete Critic Regularized Regression (#367) 2021-05-18 22:29:56 -07:00
			`policy = DiscreteCRRPolicy(`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`actor,`
			`critic,`
			`optim,`
			`args.gamma,`
Add discrete Critic Regularized Regression (#367) 2021-05-18 22:29:56 -07:00			`target_update_freq=args.target_update_freq,`
			`).to(args.device)`
			`# buffer`
Fix critic network for Discrete CRR (#485) - Fixes an inconsistency in the implementation of Discrete CRR. Now it uses `Critic` class for its critic, following conventions in other actor-critic policies; - Updates several offline policies to use `ActorCritic` class for its optimizer to eliminate randomness caused by parameter sharing between actor and critic; - Add `writer.flush()` in TensorboardLogger to ensure real-time result; - Enable `test_collector=None` in 3 trainers to turn off testing during training; - Updates the Atari offline results in README.md; - Moves Atari offline RL examples to `examples/offline`; tests to `test/offline` per review comments. 2021-11-28 07:10:28 -08:00			`if os.path.exists(args.load_buffer_name) and os.path.isfile(args.load_buffer_name):`
make unit test faster (#522) * test cache expert data in offline training * faster cql test * faster tests * use dummy * test ray dependency 2022-02-08 11:24:52 -05:00			`if args.load_buffer_name.endswith(".hdf5"):`
			`buffer = VectorReplayBuffer.load_hdf5(args.load_buffer_name)`
			`else:`
			`buffer = pickle.load(open(args.load_buffer_name, "rb"))`
Fix critic network for Discrete CRR (#485) - Fixes an inconsistency in the implementation of Discrete CRR. Now it uses `Critic` class for its critic, following conventions in other actor-critic policies; - Updates several offline policies to use `ActorCritic` class for its optimizer to eliminate randomness caused by parameter sharing between actor and critic; - Add `writer.flush()` in TensorboardLogger to ensure real-time result; - Enable `test_collector=None` in 3 trainers to turn off testing during training; - Updates the Atari offline results in README.md; - Moves Atari offline RL examples to `examples/offline`; tests to `test/offline` per review comments. 2021-11-28 07:10:28 -08:00			`else:`
			`buffer = gather_data()`
Add discrete Critic Regularized Regression (#367) 2021-05-18 22:29:56 -07:00
			`# collector`
			`test_collector = Collector(policy, test_envs, exploration_noise=True)`

Fix critic network for Discrete CRR (#485) - Fixes an inconsistency in the implementation of Discrete CRR. Now it uses `Critic` class for its critic, following conventions in other actor-critic policies; - Updates several offline policies to use `ActorCritic` class for its optimizer to eliminate randomness caused by parameter sharing between actor and critic; - Add `writer.flush()` in TensorboardLogger to ensure real-time result; - Enable `test_collector=None` in 3 trainers to turn off testing during training; - Updates the Atari offline results in README.md; - Moves Atari offline RL examples to `examples/offline`; tests to `test/offline` per review comments. 2021-11-28 07:10:28 -08:00			`log_path = os.path.join(args.logdir, args.task, 'discrete_crr')`
Add discrete Critic Regularized Regression (#367) 2021-05-18 22:29:56 -07:00			`writer = SummaryWriter(log_path)`
Add Weights and Biases Logger (#427) - rename BasicLogger to TensorboardLogger - refactor logger code - add WandbLogger Co-authored-by: Jiayi Weng <trinkle23897@gmail.com> 2021-08-30 10:35:02 -04:00			`logger = TensorboardLogger(writer)`
Add discrete Critic Regularized Regression (#367) 2021-05-18 22:29:56 -07:00
rename save_fn to save_best_fn to avoid ambiguity (#575) This PR also introduces `tianshou.utils.deprecation` for a unified deprecation wrapper. 2022-03-21 16:29:27 -04:00			`def save_best_fn(policy):`
Add discrete Critic Regularized Regression (#367) 2021-05-18 22:29:56 -07:00			`torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))`

			`def stop_fn(mean_rewards):`
Fixed hardcoded reward_treshold (#548) 2022-03-04 03:35:39 +01:00			`return mean_rewards >= args.reward_threshold`
Add discrete Critic Regularized Regression (#367) 2021-05-18 22:29:56 -07:00
			`result = offline_trainer(`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`policy,`
			`buffer,`
			`test_collector,`
			`args.epoch,`
			`args.update_per_epoch,`
			`args.test_num,`
			`args.batch_size,`
			`stop_fn=stop_fn,`
rename save_fn to save_best_fn to avoid ambiguity (#575) This PR also introduces `tianshou.utils.deprecation` for a unified deprecation wrapper. 2022-03-21 16:29:27 -04:00			`save_best_fn=save_best_fn,`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`logger=logger`
			`)`
Add discrete Critic Regularized Regression (#367) 2021-05-18 22:29:56 -07:00
			`assert stop_fn(result['best_reward'])`

			`if __name__ == '__main__':`
			`pprint.pprint(result)`
			`# Let's watch its performance!`
			`env = gym.make(args.task)`
			`policy.eval()`
			`collector = Collector(policy, env)`
			`result = collector.collect(n_episode=1, render=args.render)`
			`rews, lens = result["rews"], result["lens"]`
			`print(f"Final reward: {rews.mean()}, length: {lens.mean()}")`


			`if __name__ == "__main__":`
			`test_discrete_crr(get_args())`