diff --git a/flightpolicy/envs/vec_env_wrapper.py b/flightpolicy/envs/vec_env_wrapper.py index 8455fff..7840a11 100644 --- a/flightpolicy/envs/vec_env_wrapper.py +++ b/flightpolicy/envs/vec_env_wrapper.py @@ -25,7 +25,6 @@ class FlightEnvVec(VecEnv): self.world_box = np.zeros([6], dtype=np.float32) self.wrapper.getWorldBox(self.world_box) # xyz_min, xyz_max self.reward_names = self.wrapper.getRewardNames() - self.pretrained = False # observations self._traj_cost = np.zeros([self.num_envs, 1], dtype=np.float32) # cost of current pred diff --git a/flightpolicy/yopo/buffers.py b/flightpolicy/yopo/buffers.py index bd792b1..4400fa6 100644 --- a/flightpolicy/yopo/buffers.py +++ b/flightpolicy/yopo/buffers.py @@ -152,7 +152,6 @@ class ReplayBuffer(BaseBuffer): device: Union[th.device, str] = "cpu", n_envs: int = 1, optimize_memory_usage: bool = False, - handle_timeout_termination: bool = True, ): super(ReplayBuffer, self).__init__(buffer_size, observation_dim, device, n_envs=n_envs) @@ -165,15 +164,10 @@ class ReplayBuffer(BaseBuffer): self.optimize_memory_usage = optimize_memory_usage - self.observations = np.zeros((self.buffer_size, self.n_envs) + observation_dim, dtype=np.float32) + self.observations = np.zeros((self.buffer_size, self.n_envs, observation_dim), dtype=np.float32) self.goals = np.zeros((self.buffer_size, self.n_envs, 3), dtype=np.float32) self.depths = np.zeros((self.buffer_size, self.n_envs, 1, image_WxH[1], image_WxH[0]), dtype=np.float32) - self.map_ids = np.zeros((self.buffer_size, self.n_envs, 1), dtype=np.float32) - - # Handle timeouts termination properly if needed - # see https://github.com/DLR-RM/stable-baselines3/issues/284 - self.handle_timeout_termination = handle_timeout_termination - self.timeouts = np.zeros((self.buffer_size, self.n_envs), dtype=np.float32) + self.map_ids = np.zeros((self.buffer_size, self.n_envs, 1), dtype=np.int16) if psutil is not None: total_memory_usage = self.observations.nbytes + self.goals.nbytes + self.depths.nbytes + self.map_ids.nbytes @@ -187,16 +181,11 @@ class ReplayBuffer(BaseBuffer): f"replay buffer {total_memory_usage:.2f}GB > {mem_available:.2f}GB" ) - def add( - self, + def add(self, obs: np.ndarray, goal: np.ndarray, depth: np.ndarray, - map_id: int, - infos: List[Dict[str, Any]], - ) -> None: - - # TODO: 删了obs的格式调整,检查下还能不能正常放 + map_id: int) -> None: # Copy to avoid modification by reference self.observations[self.pos] = np.array(obs).copy() @@ -204,9 +193,6 @@ class ReplayBuffer(BaseBuffer): self.depths[self.pos] = np.array(depth).copy() self.map_ids[self.pos] = np.array(map_id).copy() - if self.handle_timeout_termination: - self.timeouts[self.pos] = np.array([info.get("TimeLimit.truncated", False) for info in infos]) - self.pos += 1 if self.pos == self.buffer_size: self.full = True diff --git a/flightpolicy/yopo/yopo_algorithm.py b/flightpolicy/yopo/yopo_algorithm.py index e29d57c..4239a74 100644 --- a/flightpolicy/yopo/yopo_algorithm.py +++ b/flightpolicy/yopo/yopo_algorithm.py @@ -11,7 +11,7 @@ import numpy as np import torch as th from torch.nn import functional as F from stable_baselines3.common.type_aliases import RolloutReturn, TrainFreq, TrainFrequencyUnit -from stable_baselines3.common.utils import should_collect_more_steps, get_schedule_fn, configure_logger +from stable_baselines3.common.utils import should_collect_more_steps, get_schedule_fn, configure_logger, update_learning_rate from stable_baselines3.common.vec_env import VecEnv from stable_baselines3.common.utils import get_device @@ -119,7 +119,7 @@ class YopoAlgorithm: cost_losses = [] # Performance (score) of prediction score_losses = [] # Accuracy of the predicted score for step, (depth, pos, quat, obs_b, map_id) in enumerate(data_loader): # obs: body frame - if depth.shape[0] != self.batch_size: # batch size == num of env + if depth.shape[0] != self.batch_size: # batch size == num of env continue n_updates = n_updates + 1 depth = depth.to(self.device) @@ -172,82 +172,51 @@ class YopoAlgorithm: path = policy_path + "/epoch{}_iter{}.pth".format(epoch_, step) th.save({"state_dict": self.policy.state_dict(), "data": self.policy.get_constructor_parameters()}, path) - # 模仿学习: 已弃用(暂未删除以备后续使用) - # 0、reset_state、get_depth、reset_goal - # 1、执行若干步(env_num * 200) - # 2、训练若干步(batch_size = env_num, 训200次=1eposide) - # 3、reset_state、get_depth、reset_goal def imitation_learning( self, total_timesteps, - callback=None, - log_interval=4, - eval_env=None, - eval_freq=-1, - n_eval_episodes=5, - tb_log_name="YOPO", - eval_log_path=None, + log_interval, reset_num_timesteps=True, ): - # 0. 初始化第一次观测 - total_timesteps, callback = self._setup_learn( - total_timesteps, - eval_env, - callback, - eval_freq, - n_eval_episodes, - eval_log_path, - reset_num_timesteps, - tb_log_name, - ) - self.pretrained = self.env.pretrained - callback.on_training_start(locals(), globals()) + # 0. setup learn and init the first observation + self._setup_learn(total_timesteps, reset_num_timesteps) while self.num_timesteps < total_timesteps: - # 1. 数据收集 + # 1. Rollout and Collect Data into Buffer rollout = self.collect_rollouts( self.env, train_freq=self.train_freq, - action_noise=self.action_noise, - callback=callback, - replay_buffer=self.replay_buffer, - log_interval=log_interval, + replay_buffer=self.replay_buffer ) - if rollout.continue_training is False: break - # 2. 训练模型 + # 2. Train the Policy if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts: - # If no `gradient_steps` is specified, - # do as many gradients steps as steps performed during the rollout + # If no `gradient_steps` is specified, do as many gradients steps as steps performed during the rollout gradient_steps = self.gradient_steps if self.gradient_steps >= 0 else rollout.episode_timesteps - # Special case when the user passes `gradient_steps=0` - if gradient_steps > 0: + if gradient_steps > 0: # Special case when the user passes `gradient_steps=0` self.train(batch_size=self.batch_size, gradient_steps=gradient_steps) self.reset_state() iteration = int(self.num_timesteps / (self.train_freq.frequency * self.env.num_envs)) - # 3. 重置环境 + # 3. reset the environment if self.change_env_freq > 0 and iteration % self.change_env_freq == 0: self.env.spawnTreesAndSavePointcloud() self._map_id = self._map_id + 1 self.reset_state() - # 4. 终端打印log + # 4. print the log and save weight if log_interval is not None and iteration % log_interval[0] == 0: self._dump_logs() - if log_interval is not None and iteration % log_interval[1] == 0: policy_path = self.logger.get_dir() + "/Policy" os.makedirs(policy_path, exist_ok=True) path = policy_path + "/epoch0_iter{}.pth".format(iteration) th.save({"state_dict": self.policy.state_dict(), "data": self.policy.get_constructor_parameters()}, path) - callback.on_training_end() - def test_policy(self, num_rollouts: int = 10): max_ep_length = 400 self.policy.set_training_mode(False) @@ -294,19 +263,16 @@ class YopoAlgorithm: def train(self, gradient_steps: int, batch_size: int) -> None: """ - Sample the replay buffer and do the updates - (gradient descent and update target networks) + Imitation learning: sample data from the replay buffer and train the Policy """ - # Switch to train mode (this affects batch norm / dropout) - self.policy.set_training_mode(True) - # Update learning rate according to schedule (TODO in supervised learning) - self._update_learning_rate(self.policy.optimizer) + self.policy.set_training_mode(True) # Switch to train mode (this affects batch norm / dropout) + update_learning_rate(self.policy.optimizer, self.lr_schedule(self._current_progress_remaining)) cost_losses = [] score_losses = [] # dy, dz, r, p, vx, vy, vz for _ in range(gradient_steps): # Sample replay buffer - replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env) + replay_data = self.replay_buffer.sample(batch_size) depth = th.from_numpy(replay_data.depths).to(self.device) pos = replay_data.observations[:, 0:3] vel_acc_b = replay_data.observations[:, 3:9] @@ -354,69 +320,43 @@ class YopoAlgorithm: def collect_rollouts( self, env, - callback, train_freq, replay_buffer, - action_noise=None, - log_interval=None, ) -> RolloutReturn: self.policy.set_training_mode(False) - num_collected_steps, num_collected_episodes = 0, 0 - assert isinstance(env, VecEnv), "You must pass a VecEnv" assert train_freq.frequency > 0, "Should at least collect one step or episode." - if env.num_envs > 1: assert train_freq.unit == TrainFrequencyUnit.STEP, "You must use only one env when doing episodic training." - - callback.on_rollout_start() continue_training = True - """ - 1、pred endstate - 2、get obs: self._last_obs = env.step(endstate) - 3、get depth: self._last_depth = env.getDepthImage() - 4、record to buffer and back to 1 - """ while should_collect_more_steps(train_freq, num_collected_steps, num_collected_episodes): - # 1. pred endstate used latest policy or pre-trained policy - sampled_endstate = self._sample_action(action_noise, env.num_envs) + # 1. pred endstate used latest policy + sampled_endstate = self._sample_action() - # 2. perform action + # 2. perform action and get new observation new_obs, rewards, dones = env.step(sampled_endstate) self.num_timesteps += env.num_envs num_collected_steps += 1 - # Give access to local variables - callback.update_locals(locals()) - # Only stop training if return value is False, not when it is None. - if callback.on_step() is False: - return RolloutReturn(num_collected_steps * env.num_envs, num_collected_episodes, - continue_training=False) - # 3. store the last obs, depth, and goal - # self._update_info_buffer(infos, dones) self._store_transition(replay_buffer) - self._update_current_progress_remaining(self.num_timesteps, self._total_timesteps) + self._current_progress_remaining = 1.0 - float(self.num_timesteps) / float(self._total_timesteps) - # 4. update the obs, depth, goal, and reset the goal for the done-env + # 4. update the obs, depth, and reset the goal for the done-env self._last_obs = new_obs self._last_depth = env.getDepthImage() for idx, done in enumerate(dones): if done: - # Update stats num_collected_episodes += 1 - self._episode_num += 1 # reset goal for the 'done' env self._last_goal[idx] = self.get_random_goal(self._last_obs[idx]) - callback.on_rollout_end() - return RolloutReturn(num_collected_steps * env.num_envs, num_collected_episodes, continue_training) def prapare_input_observation(self, obs): @@ -468,38 +408,23 @@ class YopoAlgorithm: costs_[i][indices] = 0.0 return costs_ - def _setup_learn( - self, - total_timesteps, - eval_env=None, - callback=None, - eval_freq=10000, - n_eval_episodes=5, - log_path=None, - reset_num_timesteps=True, - tb_log_name="run", - ): + def _setup_learn(self, total_timesteps, reset_num_timesteps=True): + # reset the time info + self.start_time = time.time() + if reset_num_timesteps: + self.num_timesteps = 0 # steps of sampling + self._n_updates = 0 # steps of policy updating + self._total_timesteps = total_timesteps + self._num_timesteps_at_start = self.num_timesteps + # ----------------- Init the First Observation ----------------- - # super()._setup_learn() 中: self._last_obs = self.env.reset() - total_timesteps_, callback_ = super()._setup_learn( - total_timesteps, - eval_env, - callback, - eval_freq, - n_eval_episodes, - log_path, - reset_num_timesteps, - tb_log_name, - ) + self._last_obs = self.env.reset() self._last_depth = self.env.getDepthImage() self._last_goal = np.zeros([self.env.num_envs, 3], dtype=np.float32) for i in range(0, self.env.num_envs): self._last_goal[i] = self.get_random_goal(self._last_obs[i]) self._map_id = np.zeros((self.env.num_envs, 1), dtype=np.float32) - return total_timesteps_, callback_ - - def _sample_action(self) -> np.ndarray: """ use pretrained model or current model to sample the actions (endstate) @@ -509,7 +434,7 @@ class YopoAlgorithm: obs = self._last_obs.copy() goal_w = self._last_goal.copy() depth = th.from_numpy(self._last_depth).to(self.device) - # wxyz 四元数的逆[w, -x, -y, -z] + # [w, x, y, z] inv() of quat: [w, -x, -y, -z] quat_bw = -obs[:, 9:13] quat_bw[:, 0] = -quat_bw[:, 0] vel_acc_norm_b = self.normalize_obs(obs[:, 3:9]) @@ -546,12 +471,7 @@ class YopoAlgorithm: depth = deepcopy(self._last_depth) map_id = deepcopy(self._map_id) - replay_buffer.add( - obs, - goal, - depth, - map_id - ) + replay_buffer.add(obs, goal, depth, map_id) def get_random_goal(self, uav_state=None): world = self.env.world_box @@ -564,8 +484,8 @@ class YopoAlgorithm: random_goal = random_numbers * world_scale + world_center # 2. Use goal in front of the UAV (for better imitation learning) else: - q_wb = uav_state[9:] - p_wb = uav_state[0:3] + q_wb = uav_state[9:].copy() + p_wb = uav_state[0:3].copy() goal = np.random.randn(3) + np.array([2, 0, 0]) goal_dir = goal / np.linalg.norm(goal) random_goal_b = 50 * goal_dir