[CPU] Remove custom OP: box_head_nms (#5729)

jiayisunx · web-flow · commit bea75b235d95 · 2025-07-15T22:49:20.000+08:00
diff --git a/csrc/cpu/aten/Nms.cpp b/csrc/cpu/aten/Nms.cpp
@@ -14,7 +14,6 @@ namespace cpu {
 IPEX_DEFINE_DISPATCH(nms_cpu_kernel_stub);
 IPEX_DEFINE_DISPATCH(batch_score_nms_cpu_kernel_stub);
 IPEX_DEFINE_DISPATCH(rpn_nms_cpu_kernel_stub);
-IPEX_DEFINE_DISPATCH(box_head_nms_cpu_kernel_stub);
 
 } // namespace cpu
 } // namespace torch_ipex
@@ -99,47 +98,6 @@ std::tuple<std::vector<at::Tensor>, std::vector<at::Tensor>> rpn_nms(
   return result;
 }
 
-std::tuple<
-    std::vector<at::Tensor>,
-    std::vector<at::Tensor>,
-    std::vector<at::Tensor>>
-box_head_nms(
-    const std::vector<at::Tensor>& batch_bboxes,
-    const std::vector<at::Tensor>& batch_scores,
-    const std::vector<std::tuple<int64_t, int64_t>>& image_shapes,
-    const double score_thresh,
-    const double threshold,
-    const int64_t detections_per_img,
-    const int64_t num_classes) {
-#if defined(IPEX_DISP_OP)
-  printf("IpexExternal::box_head_nms\n");
-#endif
-  RECORD_FUNCTION("IpexExternal::box_head_nms", c10::ArrayRef<c10::IValue>({}));
-
-  /*
-  pointer to cpu::box_head_nms_cpu_kernel_impl(
-      batch_bboxes,
-      batch_scores,
-      image_shapes,
-      score_thresh,
-      threshold,
-      detections_per_img,
-      num_classes);
-  */
-  auto&& result = cpu::box_head_nms_cpu_kernel_stub(
-      kCPU,
-      batch_bboxes,
-      batch_scores,
-      image_shapes,
-      score_thresh,
-      threshold,
-      detections_per_img,
-      num_classes);
-
-  static_cast<void>(result); // Avoid warnings in case not used
-  return result;
-}
-
 template <typename scalar_t>
 at::Tensor scale_back_batch_kernel(
     const at::Tensor& _ipex_bboxes_in,
@@ -260,7 +218,6 @@ static auto dispatch =
         .op("torch_ipex::nms", &torch_ipex::nms)
         .op("torch_ipex::batch_score_nms", &torch_ipex::batch_score_nms)
         .op("torch_ipex::rpn_nms", &torch_ipex::rpn_nms)
-        .op("torch_ipex::box_head_nms", &torch_ipex::box_head_nms)
         .op("torch_ipex::parallel_scale_back_batch",
             &torch_ipex::parallel_scale_back_batch);
 }
@@ -319,32 +276,6 @@ std::tuple<std::vector<at::Tensor>, std::vector<at::Tensor>> rpn_nms(
       max_output);
 }
 
-std::tuple<
-    std::vector<at::Tensor>,
-    std::vector<at::Tensor>,
-    std::vector<at::Tensor>>
-box_head_nms(
-    const std::vector<at::Tensor>& batch_bboxes,
-    const std::vector<at::Tensor>& batch_scores,
-    const std::vector<std::tuple<int64_t, int64_t>>& image_shapes,
-    const double score_thresh,
-    const double threshold,
-    const int64_t detections_per_img,
-    const int64_t num_classes) {
-  c10::impl::ExcludeDispatchKeyGuard no_autocastCPU(DispatchKey::AutocastCPU);
-  static auto op = torch::Dispatcher::singleton()
-                       .findSchemaOrThrow("torch_ipex::box_head_nms", "")
-                       .typed<decltype(box_head_nms)>();
-  return op.call(
-      cpu_cached_cast(at::kFloat, batch_bboxes),
-      cpu_cached_cast(at::kFloat, batch_scores),
-      image_shapes,
-      score_thresh,
-      threshold,
-      detections_per_img,
-      num_classes);
-}
-
 std::tuple<at::Tensor, at::Tensor> parallel_scale_back_batch(
     const at::Tensor& bboxes_in,
     const at::Tensor& scores_in,
@@ -368,7 +299,6 @@ TORCH_LIBRARY_IMPL(torch_ipex, AutocastCPU, m) {
   m.impl("nms", torch_ipex::autocast::nms);
   m.impl("batch_score_nms", torch_ipex::autocast::batch_score_nms);
   m.impl("rpn_nms", torch_ipex::autocast::rpn_nms);
-  m.impl("box_head_nms", torch_ipex::autocast::box_head_nms);
   m.impl(
       "parallel_scale_back_batch",
       torch_ipex::autocast::parallel_scale_back_batch);
diff --git a/csrc/cpu/aten/Nms.h b/csrc/cpu/aten/Nms.h
@@ -29,19 +29,6 @@ rpn_nms_cpu_kernel_impl(
     const float threshold,
     const int max_output);
 
-std::tuple<
-    std::vector<at::Tensor>,
-    std::vector<at::Tensor>,
-    std::vector<at::Tensor>>
-box_head_nms_cpu_kernel_impl(
-    const std::vector<at::Tensor>& batch_bboxes,
-    const std::vector<at::Tensor>& batch_scores,
-    const std::vector<std::tuple<int64_t, int64_t>>& image_shapes,
-    const float score_thresh,
-    const float threshold,
-    const int detections_per_img,
-    const int num_classes);
-
 } // namespace
 
 using nms_cpu_kernel_fn = at::Tensor (*)(
@@ -71,19 +58,6 @@ using rpn_nms_cpu_kernel_fn =
         const int);
 IPEX_DECLARE_DISPATCH(rpn_nms_cpu_kernel_fn, rpn_nms_cpu_kernel_stub);
 
-using box_head_nms_cpu_kernel_fn = std::tuple<
-    std::vector<at::Tensor>,
-    std::vector<at::Tensor>,
-    std::vector<at::Tensor>> (*)(
-    const std::vector<at::Tensor>&,
-    const std::vector<at::Tensor>&,
-    const std::vector<std::tuple<int64_t, int64_t>>&,
-    const float,
-    const float,
-    const int,
-    const int);
-IPEX_DECLARE_DISPATCH(box_head_nms_cpu_kernel_fn, box_head_nms_cpu_kernel_stub);
-
 } // namespace cpu
 } // namespace torch_ipex
 
@@ -156,42 +130,6 @@ std::tuple<std::vector<at::Tensor>, std::vector<at::Tensor>> rpn_nms(
     const double threshold,
     const int64_t max_output);
 
-/// \brief Perform batch non-maximum suppression (NMS) for MaskRCNN box_head
-/// part.
-///
-/// C++ version of batch NMS for MaskRCNN box_head part.
-/// Refer to
-/// https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/modeling/roi_heads/box_head/inference.py#L79.
-///
-/// \param batch_bboxes: predicted loc in ltrb format, BS tensors in vector,
-/// and the size of each tensor: [number_boxes, 4]. \param batch_scores:
-/// predicted score, BS tensors in vector, and the size of each tensor:
-/// [number_boxes]. \param image_shapes: the shapes of images, BS tuples in
-/// vector. \param score_thresh: the threshold of score. \param threshold: IOU
-/// threshold(scalar) to suppress bboxs which has the IOU val larger than the
-/// threshold. \param detections_per_img: the max number of detections per
-/// image. \param num_classes: class number of objects.
-///
-/// \return result is a tuple. There are 3 vectors of tensors in the tuple:
-///   bboxes_out_: the selected out bboxes coordinate, BS tensors in vector,
-///   and the size of each tensor: [selected_box_number, 4]. scores_out_: the
-///   score of each selected out bboxes, BS tensors in vector, and the size of
-///   each tensor: [selected_box_number]. labels_out_: the label of each
-///   selected out bboxes, BS tensors in vector, and the size of each tensor:
-///   [selected_box_number].
-std::tuple<
-    std::vector<at::Tensor>,
-    std::vector<at::Tensor>,
-    std::vector<at::Tensor>>
-box_head_nms(
-    const std::vector<at::Tensor>& batch_bboxes,
-    const std::vector<at::Tensor>& batch_scores,
-    const std::vector<std::tuple<int64_t, int64_t>>& image_shapes,
-    const double score_thresh,
-    const double threshold,
-    const int64_t detections_per_img,
-    const int64_t num_classes);
-
 /// \brief Do scale and transform from xywh to ltrb for predicted loc and do
 /// Softmax along the last dim for predicted score.
 ///
diff --git a/csrc/cpu/aten/kernels/NmsKrnl.cpp b/csrc/cpu/aten/kernels/NmsKrnl.cpp
@@ -560,116 +560,6 @@ std::tuple<std::vector<at::Tensor>, std::vector<at::Tensor>> rpn_nms_kernel(
   return std::make_tuple(bboxes_out, scores_out);
 }
 
-template <typename scalar_t>
-std::tuple<
-    std::vector<at::Tensor>,
-    std::vector<at::Tensor>,
-    std::vector<at::Tensor>>
-box_head_nms_kernel(
-    const std::vector<at::Tensor>& batch_bboxes,
-    const std::vector<at::Tensor>& batch_scores,
-    const std::vector<std::tuple<int64_t, int64_t>>& image_shapes,
-    const float score_thresh,
-    const float threshold,
-    const int detections_per_img,
-    const int num_classes) {
-  auto nbatch = batch_scores.size(); // number of batches
-  auto nbatch_x_nclass =
-      nbatch * num_classes; // (number of batches) * (number of labels)
-
-  std::vector<at::Tensor> bboxes_out(nbatch_x_nclass);
-  std::vector<at::Tensor> scores_out(nbatch_x_nclass);
-  std::vector<at::Tensor> labels_out(nbatch_x_nclass);
-
-#ifdef _OPENMP
-#if (_OPENMP >= 201307)
-#pragma omp parallel for simd schedule( \
-    static) if (omp_get_max_threads() > 1 && !omp_in_parallel())
-#else
-#pragma omp parallel for schedule( \
-    static) if (omp_get_max_threads() > 1 && !omp_in_parallel())
-#endif
-#endif
-  for (int bs = 0; bs < nbatch; bs++) {
-    at::Tensor bboxes = batch_bboxes[bs].reshape({-1, 4});
-    at::Tensor scores = batch_scores[bs];
-    auto image_shape = image_shapes[bs];
-    bboxes.slice(1, 0, 1).clamp_(0, std::get<0>(image_shape) - 1);
-    bboxes.slice(1, 1, 2).clamp_(0, std::get<1>(image_shape) - 1);
-    bboxes.slice(1, 2, 3).clamp_(0, std::get<0>(image_shape) - 1);
-    bboxes.slice(1, 3, 4).clamp_(0, std::get<1>(image_shape) - 1);
-    bboxes = bboxes.reshape({-1, num_classes * 4});
-    scores = scores.reshape({-1, num_classes});
-    at::Tensor indexes = scores > score_thresh;
-
-    for (int j = 1; j < num_classes; j++) {
-      at::Tensor index =
-          at::nonzero(indexes.slice(1, j, j + 1).squeeze(1)).squeeze(1);
-      at::Tensor score =
-          scores.slice(1, j, j + 1).squeeze(1).index_select(0, index);
-      at::Tensor bbox =
-          bboxes.slice(1, j * 4, (j + 1) * 4).index_select(0, index);
-      if (score.size(0) == 0) {
-        continue;
-      }
-      auto iter = bs * num_classes + j;
-      if (threshold > 0) {
-        at::Tensor keep =
-            nms_cpu_kernel<scalar_t, /*sorted*/ false>(bbox, score, threshold);
-        bboxes_out[iter] = bbox.index_select(0, keep);
-        scores_out[iter] = score.index_select(0, keep);
-        labels_out[iter] = at::full({keep.sizes()}, j, torch::kInt64);
-      } else {
-        bboxes_out[iter] = bbox;
-        scores_out[iter] = score;
-        labels_out[iter] = at::full({score.sizes()}, j, torch::kInt64);
-      }
-    }
-  }
-
-  std::vector<at::Tensor> bboxes_out_(nbatch);
-  std::vector<at::Tensor> scores_out_(nbatch);
-  std::vector<at::Tensor> labels_out_(nbatch);
-
-#ifdef _OPENMP
-#if (_OPENMP >= 201307)
-#pragma omp parallel for simd schedule( \
-    static) if (omp_get_max_threads() > 1 && !omp_in_parallel())
-#else
-#pragma omp parallel for schedule( \
-    static) if (omp_get_max_threads() > 1 && !omp_in_parallel())
-#endif
-#endif
-  for (int bs = 0; bs < nbatch; bs++) {
-    std::vector<at::Tensor> valid_bboxes_out =
-        remove_empty(bboxes_out, bs * num_classes, (bs + 1) * num_classes);
-    std::vector<at::Tensor> valid_scores_out =
-        remove_empty(scores_out, bs * num_classes, (bs + 1) * num_classes);
-    std::vector<at::Tensor> valid_labels_out =
-        remove_empty(labels_out, bs * num_classes, (bs + 1) * num_classes);
-    if (valid_bboxes_out.size() > 0) {
-      bboxes_out_[bs] = at::cat(valid_bboxes_out, 0);
-      scores_out_[bs] = at::cat(valid_scores_out, 0);
-      labels_out_[bs] = at::cat(valid_labels_out, 0);
-    } else {
-      bboxes_out_[bs] = at::empty({0, 4}, torch::kFloat);
-      scores_out_[bs] = at::empty({0}, torch::kFloat);
-      labels_out_[bs] = at::empty({0}, torch::kInt64);
-    }
-    auto number_of_detections = bboxes_out_[bs].size(0);
-    if (number_of_detections > detections_per_img && detections_per_img > 0) {
-      auto out_ = scores_out_[bs].kthvalue(
-          number_of_detections - detections_per_img + 1);
-      at::Tensor keep =
-          at::nonzero(scores_out_[bs] >= std::get<0>(out_).item()).squeeze(1);
-      bboxes_out_[bs] = bboxes_out_[bs].index_select(0, keep);
-      scores_out_[bs] = scores_out_[bs].index_select(0, keep);
-      labels_out_[bs] = labels_out_[bs].index_select(0, keep);
-    }
-  }
-  return std::make_tuple(bboxes_out_, scores_out_, labels_out_);
-}
-
 at::Tensor nms_cpu_kernel_impl(
     const at::Tensor& dets,
     const at::Tensor& scores,
@@ -718,37 +608,6 @@ rpn_nms_cpu_kernel_impl(
   return result;
 }
 
-std::tuple<
-    std::vector<at::Tensor>,
-    std::vector<at::Tensor>,
-    std::vector<at::Tensor>>
-box_head_nms_cpu_kernel_impl(
-    const std::vector<at::Tensor>& batch_bboxes,
-    const std::vector<at::Tensor>& batch_scores,
-    const std::vector<std::tuple<int64_t, int64_t>>& image_shapes,
-    const float score_thresh,
-    const float threshold,
-    const int detections_per_img,
-    const int num_classes) {
-  std::tuple<
-      std::vector<at::Tensor>,
-      std::vector<at::Tensor>,
-      std::vector<at::Tensor>>
-      result;
-  AT_DISPATCH_FLOATING_TYPES(
-      batch_bboxes[0].scalar_type(), "box_head_nms", [&] {
-        result = box_head_nms_kernel<scalar_t>(
-            batch_bboxes,
-            batch_scores,
-            image_shapes,
-            score_thresh,
-            threshold,
-            detections_per_img,
-            num_classes);
-      });
-  return result;
-}
-
 } // anonymous namespace
 
 IPEX_REGISTER_DISPATCH(nms_cpu_kernel_stub, &nms_cpu_kernel_impl);
@@ -759,9 +618,5 @@ IPEX_REGISTER_DISPATCH(
 
 IPEX_REGISTER_DISPATCH(rpn_nms_cpu_kernel_stub, &rpn_nms_cpu_kernel_impl);
 
-IPEX_REGISTER_DISPATCH(
-    box_head_nms_cpu_kernel_stub,
-    &box_head_nms_cpu_kernel_impl);
-
 } // namespace cpu
 } // namespace torch_ipex
diff --git a/tests/cpu/test_nms.py b/tests/cpu/test_nms.py