From e3f7af7f0156c8a49f0af6330e2e3ae8c0ba0802 Mon Sep 17 00:00:00 2001
From: TJU_Lu
Date: Tue, 17 Dec 2024 22:03:45 +0800
Subject: [PATCH] move network inference to the depth callback

---
 run/test_yopo_ros.py | 131 +++++++++++++++++++++----------------------
 1 file changed, 64 insertions(+), 67 deletions(-)

diff --git a/run/test_yopo_ros.py b/run/test_yopo_ros.py
index cd40dfd..40c17a0 100644
--- a/run/test_yopo_ros.py
+++ b/run/test_yopo_ros.py
@@ -82,7 +82,7 @@ class YopoNet:
         self.odom_ref_sub = rospy.Subscriber("/juliett/state_ref/odom", Odometry, self.callback_odometry_ref, queue_size=1, tcp_nodelay=True)
         self.depth_sub = rospy.Subscriber(depth_topic, Image, self.callback_depth, queue_size=1, tcp_nodelay=True)
         self.goal_sub = rospy.Subscriber("/move_base_simple/goal", PoseStamped, self.callback_set_goal, queue_size=1)
-        self.timer_net = rospy.Timer(rospy.Duration(1. / self.config['network_frequency']), self.test_policy)
+        # self.timer_net = rospy.Timer(rospy.Duration(1. / self.config['network_frequency']), self.test_policy)
         print("YOPO Net Node Ready!")
         rospy.spin()

@@ -134,6 +134,13 @@ class YopoNet:
         return obs_norm

     def callback_depth(self, data):
+        # start a new round if there is no new odom_ref (odom_ref publishing stops on arrival, and the odom rate is assumed to be higher than the depth rate)
+        if not self.new_odom:
+            self.odom_ref_init = False
+            return
+        self.new_odom = False
+
+        # 1. Depth Image Process
         min_dis, max_dis = 0.03, 20.0
         scale = {'435': 0.001, 'flightmare': 1.0}.get(self.env, 1.0)

@@ -147,83 +154,76 @@
         depth_ = np.minimum(depth_ * scale, max_dis) / max_dis

         # interpolated the nan value (experiment shows that treating nan directly as 0 produces similar results)
-        start = time.time()
+        interpolation_start = time.time()
         nan_mask = np.isnan(depth_) | (depth_ < min_dis)
         interpolated_image = cv2.inpaint(np.uint8(depth_ * 255), np.uint8(nan_mask), 1, cv2.INPAINT_NS)
         interpolated_image = interpolated_image.astype(np.float32) / 255.0
         depth_ = interpolated_image.reshape([1, 1, self.height, self.width])
         if self.verbose:
-            self.time_interpolation = self.time_interpolation + (time.time() - start)
+            self.time_interpolation = self.time_interpolation + (time.time() - interpolation_start)
             self.count_interpolation = self.count_interpolation + 1
             print(f"Time Consuming: depth-interpolation: {1000 * self.time_interpolation / self.count_interpolation:.2f}ms")
         # cv2.imshow("1", depth_[0][0])
         # cv2.waitKey(1)
         self.depth = depth_.astype(np.float32)
-        self.new_depth = True

-    # TODO: Move the test_policy to callback_depth directly?
-    def test_policy(self, _timer):
-        if self.new_depth and self.new_odom:
-            self.new_odom, self.new_depth = False, False
-            obs = self.process_odom()
-            odom_sec = self.odom.header.stamp.to_sec()
+        # 2. YOPO Network Inference
+        obs = self.process_odom()
+        odom_sec = self.odom.header.stamp.to_sec()

-            # input prepare
-            time0 = time.time()
-            depth = torch.from_numpy(self.depth).to(self.device, non_blocking=True)    # (non_blocking: copying speed 3x)
-            obs_norm_input = self.prepare_input_observation(obs)
-            obs_norm_input = obs_norm_input.to(self.device, non_blocking=True)
-            # torch.cuda.synchronize()
+        # input prepare
+        time0 = time.time()
+        depth = torch.from_numpy(self.depth).to(self.device, non_blocking=True)    # (non_blocking: copying speed 3x)
+        obs_norm_input = self.prepare_input_observation(obs)
+        obs_norm_input = obs_norm_input.to(self.device, non_blocking=True)
+        # torch.cuda.synchronize()

-            time1 = time.time()
-            # Forward (TensorRT: inference speed increased by 5x)
-            with torch.no_grad():
-                network_output = self.policy(depth, obs_norm_input)
-            network_output = network_output.cpu().numpy()   # torch.cuda.synchronize() is not needed here
-            time2 = time.time()
-            # Replacing PyTorch operation on CUDA with NumPy operation on CPU (speed increased by 10x)
-            endstate_pred, score_pred = self.process_output(network_output, return_all_preds=self.visualize)
-            time3 = time.time()
+        time1 = time.time()
+        # Forward (TensorRT: inference speed increased by 5x)
+        with torch.no_grad():
+            network_output = self.policy(depth, obs_norm_input)
+        network_output = network_output.cpu().numpy()   # torch.cuda.synchronize() is not needed here
+        time2 = time.time()
+        # Replacing PyTorch operation on CUDA with NumPy operation on CPU (speed increased by 10x)
+        endstate_pred, score_pred = self.process_output(network_output, return_all_preds=self.visualize)
+        time3 = time.time()

-            # Vectorization: transform the prediction(P V A in body frame) to the world frame with the attitude (without the position)
-            endstate_c = endstate_pred.T.reshape(-1, 3, 3)
-            endstate_w = np.matmul(self.Rotation_wc, endstate_c)
-            endstate_w = endstate_w.reshape(-1, 9).T
+        # Vectorization: transform the prediction(P V A in body frame) to the world frame with the attitude (without the position)
+        endstate_c = endstate_pred.T.reshape(-1, 3, 3)
+        endstate_w = np.matmul(self.Rotation_wc, endstate_c)
+        endstate_w = endstate_w.reshape(-1, 9).T

-            if self.verbose:
-                self.time_prepare = self.time_prepare + (time1 - time0)
-                self.time_forward = self.time_forward + (time2 - time1)
-                self.time_process = self.time_process + (time3 - time2)
-                self.count = self.count + 1
-                print(f"Time Consuming: data-prepare: {1000 * self.time_prepare / self.count:.2f}ms; "
-                      f"network-inference: {1000 * self.time_forward / self.count:.2f}ms; "
-                      f"post-process: {1000 * self.time_process / self.count:.2f}ms")
+        if self.verbose:
+            self.time_prepare = self.time_prepare + (time1 - time0)
+            self.time_forward = self.time_forward + (time2 - time1)
+            self.time_process = self.time_process + (time3 - time2)
+            self.count = self.count + 1
+            print(f"Time Consuming: data-prepare: {1000 * self.time_prepare / self.count:.2f}ms; "
+                  f"network-inference: {1000 * self.time_forward / self.count:.2f}ms; "
+                  f"post-process: {1000 * self.time_process / self.count:.2f}ms")

-            # publish
-            if not self.visualize:
-                endstate_pred_to_pub = Float32MultiArray(data=endstate_w.reshape(-1))
-                endstate_pred_to_pub.layout.data_offset = int(1000 * odom_sec) % 1000000    # odometry timestamp (ms) used for the prediction
-                self.endstate_pub.publish(endstate_pred_to_pub)
-            else:
-                action_id = np.argmin(score_pred)
-                best_endstate_pred = endstate_w[:, action_id].reshape(-1)
-                endstate_pred_to_pub = Float32MultiArray(data=best_endstate_pred)
-                endstate_pred_to_pub.layout.data_offset = int(1000 * odom_sec) % 1000000    # odometry timestamp (ms) used for the prediction
-                self.endstate_pub.publish(endstate_pred_to_pub)
-                # visualization
-                endstate_score_preds = np.vstack([endstate_w, score_pred])
-                all_endstate_pred = Float32MultiArray(data=endstate_score_preds.T.reshape(-1))
-                all_endstate_pred.layout.dim.append(MultiArrayDimension())
-                all_endstate_pred.layout.dim[0].size = endstate_score_preds.shape[1]
-                all_endstate_pred.layout.dim[0].label = "primitive_num"
-                all_endstate_pred.layout.dim.append(MultiArrayDimension())
-                all_endstate_pred.layout.dim[1].size = endstate_score_preds.shape[0]
-                all_endstate_pred.layout.dim[1].label = "endstate_and_score_num"
-                self.all_endstate_pub.publish(all_endstate_pred)
-                self.goal_pub.publish(Float32MultiArray(data=self.goal))
-        # start a new round
-        elif not self.new_odom:
-            self.odom_ref_init = False
+        # publish
+        if not self.visualize:
+            endstate_pred_to_pub = Float32MultiArray(data=endstate_w.reshape(-1))
+            endstate_pred_to_pub.layout.data_offset = int(1000 * odom_sec) % 1000000    # odometry timestamp (ms) used for the prediction
+            self.endstate_pub.publish(endstate_pred_to_pub)
+        else:
+            action_id = np.argmin(score_pred)
+            best_endstate_pred = endstate_w[:, action_id].reshape(-1)
+            endstate_pred_to_pub = Float32MultiArray(data=best_endstate_pred)
+            endstate_pred_to_pub.layout.data_offset = int(1000 * odom_sec) % 1000000    # odometry timestamp (ms) used for the prediction
+            self.endstate_pub.publish(endstate_pred_to_pub)
+            # visualization
+            endstate_score_preds = np.vstack([endstate_w, score_pred])
+            all_endstate_pred = Float32MultiArray(data=endstate_score_preds.T.reshape(-1))
+            all_endstate_pred.layout.dim.append(MultiArrayDimension())
+            all_endstate_pred.layout.dim[0].size = endstate_score_preds.shape[1]
+            all_endstate_pred.layout.dim[0].label = "primitive_num"
+            all_endstate_pred.layout.dim.append(MultiArrayDimension())
+            all_endstate_pred.layout.dim[1].size = endstate_score_preds.shape[0]
+            all_endstate_pred.layout.dim[1].label = "endstate_and_score_num"
+            self.all_endstate_pub.publish(all_endstate_pred)
+            self.goal_pub.publish(Float32MultiArray(data=self.goal))

     def process_output(self, network_output, return_all_preds=False):
         if network_output.shape[0] != 1:
@@ -313,17 +313,14 @@ def parser():
 # In realworld flight: visualize=False; use_tensorrt=True, and ensure the pitch_angle consistent with your platform
 # When modifying the pitch_angle, there's no need to re-collect and re-train, as all predictions are in the camera coordinate system
+# Change the flight speed in traj_opt.yaml; there is no need to re-collect or re-train
 def main():
     args = parser().parse_args()
     rsg_root = os.path.dirname(os.path.abspath(__file__))
-    if args.use_tensorrt:
-        weight = args.trt_file
-    else:
-        weight = rsg_root + "/saved/YOPO_{}/Policy/epoch{}_iter{}.pth".format(args.trial, args.epoch, args.iter)
+    weight = args.trt_file if args.use_tensorrt else f"{rsg_root}/saved/YOPO_{args.trial}/Policy/epoch{args.epoch}_iter{args.iter}.pth"
     print("load weight from:", weight)
     settings = {'use_tensorrt': args.use_tensorrt,
-                'network_frequency': 30,
                 'img_height': 96,
                 'img_width': 160,
                 'goal': [20, 20, 2],  # the goal
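
Note: for reviewers, the following is a minimal, self-contained sketch of the depth pre-processing that callback_depth performs before inference (clamp/normalize to [0, 1], inpaint NaN and too-close pixels, reshape to the network's NCHW input). It mirrors the logic of the patched code but is not a drop-in replacement; the helper name preprocess_depth, the resize step, and the synthetic test frame are illustrative assumptions, not part of this patch.

import cv2
import numpy as np

def preprocess_depth(depth_m, height=96, width=160, min_dis=0.03, max_dis=20.0):
    # Resize the metric depth image to the assumed network resolution.
    depth = cv2.resize(depth_m.astype(np.float32), (width, height), interpolation=cv2.INTER_NEAREST)
    # Mark invalid pixels (NaN or closer than min_dis) before normalization.
    invalid = np.isnan(depth) | (depth < min_dis)
    # Clamp to max_dis and scale to [0, 1]; NaNs become 0 so the uint8 cast below is safe.
    depth = np.minimum(np.nan_to_num(depth), max_dis) / max_dis
    # Inpaint the invalid pixels on an 8-bit copy (Navier-Stokes inpainting, radius 1 px).
    inpainted = cv2.inpaint(np.uint8(depth * 255), np.uint8(invalid), 1, cv2.INPAINT_NS)
    # Back to float32 in [0, 1], shaped (1, 1, H, W) to match the network input.
    return (inpainted.astype(np.float32) / 255.0).reshape(1, 1, height, width)

if __name__ == "__main__":
    # Synthetic 5 m depth frame with a small NaN hole, just to exercise the function.
    frame = np.full((96, 160), 5.0, dtype=np.float32)
    frame[40:45, 80:90] = np.nan
    out = preprocess_depth(frame)
    print(out.shape, bool(np.isnan(out).any()))   # (1, 1, 96, 160) False

Running inference directly in the depth callback (instead of the removed 30 Hz timer) ties planning to the camera rate and removes the stale-depth handshake between the timer and the callbacks, while the new_odom check keeps each prediction paired with fresh reference odometry.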