
Commit 538c7bb

Merge pull request #23 from NVIDIA-ISAAC-ROS/hotfix-release-dp3.1-1
Add DOPE Custom Model tutorial
2 parents 79edf43 + c392dbb commit 538c7bb

File tree

5 files changed: +63 additions, −12 deletions


README.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -246,6 +246,7 @@ To continue your exploration, check out the following suggested examples:
 - [`DOPE` with `Triton`](docs/dope-triton.md)
 - [`Centerpose` with `Triton`](docs/centerpose.md)
 - [`DOPE` with non-standard input image sizes](docs/dope-custom-size.md)
+- [Train your own `DOPE` model](docs/dope-custom-model.md)
 
 ### Use Different Models
```
docs/dope-custom-model.md

Lines changed: 33 additions & 0 deletions (new file)

# Training your own DOPE model

## Overview

The DOPE network architecture is intended to be trained on objects of a specific class, which means that using DOPE for pose estimation of a custom object class requires training a custom model for that class.

[NVIDIA Isaac Sim](https://developer.nvidia.com/isaac-sim) offers a convenient workflow for training a custom DOPE model using synthetic data generation (SDG).

## Tutorial Walkthrough

1. Clone the [Isaac Sim DOPE Training repository](https://github.com/andrewyguo/dope_training#deep-object-pose-estimation-dope---training) and follow the training instructions to prepare a custom DOPE model.
2. Using the [Isaac Sim DOPE inference script](https://github.com/andrewyguo/dope_training/tree/master/inference), test the custom DOPE model's inference capability and ensure that the quality is acceptable for your use case.
3. Follow steps 1-5 of the main DOPE [quickstart](../README.md#quickstart).
4. At step 6, move the prepared `.pth` model output from the Isaac Sim DOPE training script into the `/tmp/models` path inside the Docker container:

    ```bash
    docker cp custom_model.pth isaac_ros_dev-x86_64-container:/tmp/models
    ```

5. At step 7, run the `dope_converter.py` script with the custom model:

    ```bash
    python3 /workspaces/isaac_ros-dev/src/isaac_ros_pose_estimation/isaac_ros_dope/scripts/dope_converter.py --format onnx --input /tmp/models/custom_model.pth
    ```

6. Proceed through steps 8-9.
7. At step 10, launch the ROS 2 launch file with the custom model:

    ```bash
    ros2 launch isaac_ros_dope isaac_ros_dope_tensor_rt.launch.py model_file_path:=/tmp/models/custom_model.onnx engine_file_path:=/tmp/models/custom_model.plan
    ```

8. Continue with the rest of the quickstart. You should now be able to detect poses of custom objects.

isaac_ros_dope/config/dope_config.yaml

Lines changed: 7 additions & 2 deletions

```diff
@@ -60,5 +60,10 @@ dope:
         "PeasAndCarrots" : [ 5.8512001037597656, 7.0636000633239746, 6.5918002128601074 ]
       }
 
-    # 9 element camera matrix (assuming 640x480 image)
-    camera_matrix: [463.51, 0.0, 321.652, 0.0, 616.44, 232.260, 0.0, 0.0, 1.0]
+    # 9 element camera matrix (using default from Ketchup demo)
+    # Taken from: https://github.com/andrewyguo/dope_training/blob/master/inference/config/camera_info.yaml
+    camera_matrix: [
+      364.16501736, 0.0, 121.36296296,
+      0.0, 364.16501736, 121.36296296,
+      0.0, 0.0, 1.0
+    ]
```
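As a sanity check on the new intrinsics, recall that a 3x3 camera matrix maps a 3D point in the camera frame to pixel coordinates via a perspective divide. A minimal NumPy sketch using the values from the config above (the `project` helper is illustrative, not part of the package):

```python
import numpy as np

# Camera intrinsics from the updated dope_config.yaml:
# fx, fy on the diagonal; cx, cy in the last column.
K = np.array([
    [364.16501736, 0.0, 121.36296296],
    [0.0, 364.16501736, 121.36296296],
    [0.0, 0.0, 1.0],
])

def project(point_cam):
    """Project a 3D point (camera frame, meters) to pixel coordinates."""
    uvw = K @ np.asarray(point_cam, dtype=float)
    return uvw[:2] / uvw[2]  # perspective divide

# A point on the optical axis projects to the principal point (cx, cy).
print(project([0.0, 0.0, 1.0]))  # → [121.36296296 121.36296296]
```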

isaac_ros_dope/gxf/dope/dope_decoder.cpp

Lines changed: 10 additions & 10 deletions

```diff
@@ -182,24 +182,24 @@ FindObjects(const std::array<cv::Mat, kInputMapsChannels> &maps) {
   for (size_t pp = 0; pp < peaks.size(); ++pp) {
     const auto peak = peaks[pp];
 
-    // Compute the weighted average for localizing the peak, using a 5x5
+    // Compute the weighted average for localizing the peak, using an 11x11
     // window
     Vector2f peak_sum(0, 0);
     float weight_sum = 0.0f;
-    for (int ii = -2; ii <= 2; ++ii) {
-      for (int jj = -2; jj <= 2; ++jj) {
-        const int row = peak[0] + ii;
-        const int col = peak[1] + jj;
+    const int WINDOW_SIZE = 11;
+    for (int ii = -(WINDOW_SIZE - 1) / 2; ii <= (WINDOW_SIZE - 1) / 2; ++ii) {
+      for (int jj = -(WINDOW_SIZE - 1) / 2; jj <= (WINDOW_SIZE - 1) / 2; ++jj) {
+        const int col = peak[0] + ii;
+        const int row = peak[1] + jj;
 
-        if (col < 0 || col >= image.size[1] || row < 0 ||
-            row >= image.size[0]) {
+        if (col < 0 || col >= image.cols || row < 0 || row >= image.rows) {
           continue;
         }
 
         const float weight = image.at<float>(row, col);
         weight_sum += weight;
-        peak_sum[0] += row * weight;
-        peak_sum[1] += col * weight;
+        peak_sum[0] += col * weight;
+        peak_sum[1] += row * weight;
       }
     }
```
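The sub-pixel peak refinement this hunk widens can be sketched in NumPy. This is a simplified stand-in for the C++ decoder, not its actual code; the `refine_peak` name, signature, and (row, col) convention are illustrative:

```python
import numpy as np

def refine_peak(belief_map, peak, window_size=11):
    """Refine an integer peak location to sub-pixel precision by taking
    the weight-averaged position over a window centered on the peak.

    belief_map : 2D float array (the network's belief map for one vertex)
    peak       : (row, col) integer coordinates of the detected maximum
    """
    half = (window_size - 1) // 2
    rows, cols = belief_map.shape
    peak_sum = np.zeros(2)
    weight_sum = 0.0
    for di in range(-half, half + 1):
        for dj in range(-half, half + 1):
            row, col = peak[0] + di, peak[1] + dj
            if not (0 <= row < rows and 0 <= col < cols):
                continue  # skip window cells that fall outside the image
            w = belief_map[row, col]
            peak_sum += w * np.array([row, col])
            weight_sum += w
    if weight_sum == 0.0:
        return np.asarray(peak, dtype=float)
    return peak_sum / weight_sum
```

A wider window averages over more of the belief blob, which makes the refined location less sensitive to per-pixel noise around the maximum.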

```diff
@@ -322,7 +322,7 @@ ExtractPose(const DopeObjectKeypoints &object,
   cv::Mat cv_keypoints_2d;
   cv::eigen2cv(object.second, cv_keypoints_2d);
   if (!cv::solvePnP(cv_keypoints_3d.t(), cv_keypoints_2d.t(), camera_matrix,
-                    dist_coeffs, rvec, tvec)) {
+                    dist_coeffs, rvec, tvec, false, cv::SOLVEPNP_EPNP)) {
     GXF_LOG_ERROR("cv::solvePnP failed");
     return nvidia::gxf::Unexpected{GXF_FAILURE};
   }
```

isaac_ros_dope/launch/isaac_ros_dope_tensor_rt.launch.py

Lines changed: 12 additions & 0 deletions

```diff
@@ -38,6 +38,14 @@ def generate_launch_description():
             'network_image_height',
             default_value='480',
             description='The input image height that the network expects'),
+        DeclareLaunchArgument(
+            'encoder_image_mean',
+            default_value='[0.5, 0.5, 0.5]',
+            description='The mean for image normalization'),
+        DeclareLaunchArgument(
+            'encoder_image_stddev',
+            default_value='[0.5, 0.5, 0.5]',
+            description='The standard deviation for image normalization'),
         DeclareLaunchArgument(
             'model_file_path',
             default_value=f'{default_model_file_path}',
@@ -87,6 +95,8 @@ def generate_launch_description():
     # DNN Image Encoder parameters
     network_image_width = LaunchConfiguration('network_image_width')
     network_image_height = LaunchConfiguration('network_image_height')
+    encoder_image_mean = LaunchConfiguration('encoder_image_mean')
+    encoder_image_stddev = LaunchConfiguration('encoder_image_stddev')
 
     # Tensor RT parameters
     model_file_path = LaunchConfiguration('model_file_path')
@@ -110,6 +120,8 @@ def generate_launch_description():
         parameters=[{
             'network_image_width': network_image_width,
             'network_image_height': network_image_height,
+            'image_mean': encoder_image_mean,
+            'image_stddev': encoder_image_stddev,
         }],
         remappings=[('encoded_tensor', 'tensor_pub')])
```
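The new `encoder_image_mean` / `encoder_image_stddev` arguments correspond to the standard per-channel normalization an image encoder applies before inference. Assuming that convention, a minimal NumPy sketch (the `normalize_image` helper is illustrative, not the encoder's actual API):

```python
import numpy as np

def normalize_image(image_uint8, mean=(0.5, 0.5, 0.5), stddev=(0.5, 0.5, 0.5)):
    """Scale an HxWx3 uint8 image to [0, 1], then normalize each channel."""
    img = image_uint8.astype(np.float32) / 255.0
    return (img - np.asarray(mean, np.float32)) / np.asarray(stddev, np.float32)

# With the default mean/stddev of 0.5, pixel values map from [0, 255]
# to the range [-1, 1] that many DOPE checkpoints expect.
```

Exposing these as launch arguments lets a custom-trained model that was trained with different normalization statistics be served without editing the launch file.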
