Skip to content

Commit bea75b2

Browse files
authored
[CPU] Remove custom OP: box_head_nms (#5729)
1 parent 902f942 commit bea75b2

File tree

4 files changed

+0
-396
lines changed

4 files changed

+0
-396
lines changed

csrc/cpu/aten/Nms.cpp

Lines changed: 0 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ namespace cpu {
14 14
IPEX_DEFINE_DISPATCH(nms_cpu_kernel_stub);
15 15
IPEX_DEFINE_DISPATCH(batch_score_nms_cpu_kernel_stub);
16 16
IPEX_DEFINE_DISPATCH(rpn_nms_cpu_kernel_stub);
17-
IPEX_DEFINE_DISPATCH(box_head_nms_cpu_kernel_stub);
18 17

19 18
} // namespace cpu
20 19
} // namespace torch_ipex
@@ -99,47 +98,6 @@ std::tuple<std::vector<at::Tensor>, std::vector<at::Tensor>> rpn_nms(
99 98
return result;
100 99
}
101 100

102-
std::tuple<
103-
std::vector<at::Tensor>,
104-
std::vector<at::Tensor>,
105-
std::vector<at::Tensor>>
106-
box_head_nms(
107-
const std::vector<at::Tensor>& batch_bboxes,
108-
const std::vector<at::Tensor>& batch_scores,
109-
const std::vector<std::tuple<int64_t, int64_t>>& image_shapes,
110-
const double score_thresh,
111-
const double threshold,
112-
const int64_t detections_per_img,
113-
const int64_t num_classes) {
114-
#if defined(IPEX_DISP_OP)
115-
printf("IpexExternal::box_head_nms\n");
116-
#endif
117-
RECORD_FUNCTION("IpexExternal::box_head_nms", c10::ArrayRef<c10::IValue>({}));
118-
119-
/*
120-
pointer to cpu::box_head_nms_cpu_kernel_impl(
121-
batch_bboxes,
122-
batch_scores,
123-
image_shapes,
124-
score_thresh,
125-
threshold,
126-
detections_per_img,
127-
num_classes);
128-
*/
129-
auto&& result = cpu::box_head_nms_cpu_kernel_stub(
130-
kCPU,
131-
batch_bboxes,
132-
batch_scores,
133-
image_shapes,
134-
score_thresh,
135-
threshold,
136-
detections_per_img,
137-
num_classes);
138-
139-
static_cast<void>(result); // Avoid warnings in case not used
140-
return result;
141-
}
142-
143 101
template <typename scalar_t>
144 102
at::Tensor scale_back_batch_kernel(
145 103
const at::Tensor& _ipex_bboxes_in,
@@ -260,7 +218,6 @@ static auto dispatch =
260 218
.op("torch_ipex::nms", &torch_ipex::nms)
261 219
.op("torch_ipex::batch_score_nms", &torch_ipex::batch_score_nms)
262 220
.op("torch_ipex::rpn_nms", &torch_ipex::rpn_nms)
263-
.op("torch_ipex::box_head_nms", &torch_ipex::box_head_nms)
264 221
.op("torch_ipex::parallel_scale_back_batch",
265 222
&torch_ipex::parallel_scale_back_batch);
266 223
}
@@ -319,32 +276,6 @@ std::tuple<std::vector<at::Tensor>, std::vector<at::Tensor>> rpn_nms(
319 276
max_output);
320 277
}
321 278

322-
std::tuple<
323-
std::vector<at::Tensor>,
324-
std::vector<at::Tensor>,
325-
std::vector<at::Tensor>>
326-
box_head_nms(
327-
const std::vector<at::Tensor>& batch_bboxes,
328-
const std::vector<at::Tensor>& batch_scores,
329-
const std::vector<std::tuple<int64_t, int64_t>>& image_shapes,
330-
const double score_thresh,
331-
const double threshold,
332-
const int64_t detections_per_img,
333-
const int64_t num_classes) {
334-
c10::impl::ExcludeDispatchKeyGuard no_autocastCPU(DispatchKey::AutocastCPU);
335-
static auto op = torch::Dispatcher::singleton()
336-
.findSchemaOrThrow("torch_ipex::box_head_nms", "")
337-
.typed<decltype(box_head_nms)>();
338-
return op.call(
339-
cpu_cached_cast(at::kFloat, batch_bboxes),
340-
cpu_cached_cast(at::kFloat, batch_scores),
341-
image_shapes,
342-
score_thresh,
343-
threshold,
344-
detections_per_img,
345-
num_classes);
346-
}
347-
348 279
std::tuple<at::Tensor, at::Tensor> parallel_scale_back_batch(
349 280
const at::Tensor& bboxes_in,
350 281
const at::Tensor& scores_in,
@@ -368,7 +299,6 @@ TORCH_LIBRARY_IMPL(torch_ipex, AutocastCPU, m) {
368 299
m.impl("nms", torch_ipex::autocast::nms);
369 300
m.impl("batch_score_nms", torch_ipex::autocast::batch_score_nms);
370 301
m.impl("rpn_nms", torch_ipex::autocast::rpn_nms);
371-
m.impl("box_head_nms", torch_ipex::autocast::box_head_nms);
372 302
m.impl(
373 303
"parallel_scale_back_batch",
374 304
torch_ipex::autocast::parallel_scale_back_batch);

csrc/cpu/aten/Nms.h

Lines changed: 0 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -29,19 +29,6 @@ rpn_nms_cpu_kernel_impl(
29 29
const float threshold,
30 30
const int max_output);
31 31

32-
std::tuple<
33-
std::vector<at::Tensor>,
34-
std::vector<at::Tensor>,
35-
std::vector<at::Tensor>>
36-
box_head_nms_cpu_kernel_impl(
37-
const std::vector<at::Tensor>& batch_bboxes,
38-
const std::vector<at::Tensor>& batch_scores,
39-
const std::vector<std::tuple<int64_t, int64_t>>& image_shapes,
40-
const float score_thresh,
41-
const float threshold,
42-
const int detections_per_img,
43-
const int num_classes);
44-
45 32
} // namespace
46 33

47 34
using nms_cpu_kernel_fn = at::Tensor (*)(
@@ -71,19 +58,6 @@ using rpn_nms_cpu_kernel_fn =
71 58
const int);
72 59
IPEX_DECLARE_DISPATCH(rpn_nms_cpu_kernel_fn, rpn_nms_cpu_kernel_stub);
73 60

74-
using box_head_nms_cpu_kernel_fn = std::tuple<
75-
std::vector<at::Tensor>,
76-
std::vector<at::Tensor>,
77-
std::vector<at::Tensor>> (*)(
78-
const std::vector<at::Tensor>&,
79-
const std::vector<at::Tensor>&,
80-
const std::vector<std::tuple<int64_t, int64_t>>&,
81-
const float,
82-
const float,
83-
const int,
84-
const int);
85-
IPEX_DECLARE_DISPATCH(box_head_nms_cpu_kernel_fn, box_head_nms_cpu_kernel_stub);
86-
87 61
} // namespace cpu
88 62
} // namespace torch_ipex
89 63

@@ -156,42 +130,6 @@ std::tuple<std::vector<at::Tensor>, std::vector<at::Tensor>> rpn_nms(
156 130
const double threshold,
157 131
const int64_t max_output);
158 132

159-
/// \brief Perform batch non-maximum suppression (NMS) for MaskRCNN box_head
160-
/// part.
161-
///
162-
/// C++ version of batch NMS for MaskRCNN box_head part.
163-
/// Refer to
164-
/// https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/modeling/roi_heads/box_head/inference.py#L79.
165-
///
166-
/// \param batch_bboxes: predicted loc in ltrb format, BS tensors in vector,
167-
/// and the size of each tensor: [number_boxes, 4]. \param batch_scores:
168-
/// predicted score, BS tensors in vector, and the size of each tensor:
169-
/// [number_boxes]. \param image_shapes: the shapes of images, BS tuples in
170-
/// vector. \param score_thresh: the threshold of score. \param threshold: IOU
171-
/// threshold(scalar) to suppress bboxs which has the IOU val larger than the
172-
/// threshold. \param detections_per_img: the max number of detections per
173-
/// image. \param num_classes: class number of objects.
174-
///
175-
/// \return result is a tuple. There are 3 vectors of tensors in the tuple:
176-
/// bboxes_out_: the selected out bboxes coordinate, BS tensors in vector,
177-
/// and the size of each tensor: [selected_box_number, 4]. scores_out_: the
178-
/// score of each selected out bboxes, BS tensors in vector, and the size of
179-
/// each tensor: [selected_box_number]. labels_out_: the label of each
180-
/// selected out bboxes, BS tensors in vector, and the size of each tensor:
181-
/// [selected_box_number].
182-
std::tuple<
183-
std::vector<at::Tensor>,
184-
std::vector<at::Tensor>,
185-
std::vector<at::Tensor>>
186-
box_head_nms(
187-
const std::vector<at::Tensor>& batch_bboxes,
188-
const std::vector<at::Tensor>& batch_scores,
189-
const std::vector<std::tuple<int64_t, int64_t>>& image_shapes,
190-
const double score_thresh,
191-
const double threshold,
192-
const int64_t detections_per_img,
193-
const int64_t num_classes);
194-
195 133
/// \brief Do scale and transform from xywh to ltrb for predicted loc and do
196 134
/// Softmax along the last dim for predicted score.
197 135
///

csrc/cpu/aten/kernels/NmsKrnl.cpp

Lines changed: 0 additions & 145 deletions
Original file line numberDiff line numberDiff line change
@@ -560,116 +560,6 @@ std::tuple<std::vector<at::Tensor>, std::vector<at::Tensor>> rpn_nms_kernel(
560 560
return std::make_tuple(bboxes_out, scores_out);
561 561
}
562 562

563-
template <typename scalar_t>
564-
std::tuple<
565-
std::vector<at::Tensor>,
566-
std::vector<at::Tensor>,
567-
std::vector<at::Tensor>>
568-
box_head_nms_kernel(
569-
const std::vector<at::Tensor>& batch_bboxes,
570-
const std::vector<at::Tensor>& batch_scores,
571-
const std::vector<std::tuple<int64_t, int64_t>>& image_shapes,
572-
const float score_thresh,
573-
const float threshold,
574-
const int detections_per_img,
575-
const int num_classes) {
576-
auto nbatch = batch_scores.size(); // number of batches
577-
auto nbatch_x_nclass =
578-
nbatch * num_classes; // (number of batches) * (number of labels)
579-
580-
std::vector<at::Tensor> bboxes_out(nbatch_x_nclass);
581-
std::vector<at::Tensor> scores_out(nbatch_x_nclass);
582-
std::vector<at::Tensor> labels_out(nbatch_x_nclass);
583-
584-
#ifdef _OPENMP
585-
#if (_OPENMP >= 201307)
586-
#pragma omp parallel for simd schedule( \
587-
static) if (omp_get_max_threads() > 1 && !omp_in_parallel())
588-
#else
589-
#pragma omp parallel for schedule( \
590-
static) if (omp_get_max_threads() > 1 && !omp_in_parallel())
591-
#endif
592-
#endif
593-
for (int bs = 0; bs < nbatch; bs++) {
594-
at::Tensor bboxes = batch_bboxes[bs].reshape({-1, 4});
595-
at::Tensor scores = batch_scores[bs];
596-
auto image_shape = image_shapes[bs];
597-
bboxes.slice(1, 0, 1).clamp_(0, std::get<0>(image_shape) - 1);
598-
bboxes.slice(1, 1, 2).clamp_(0, std::get<1>(image_shape) - 1);
599-
bboxes.slice(1, 2, 3).clamp_(0, std::get<0>(image_shape) - 1);
600-
bboxes.slice(1, 3, 4).clamp_(0, std::get<1>(image_shape) - 1);
601-
bboxes = bboxes.reshape({-1, num_classes * 4});
602-
scores = scores.reshape({-1, num_classes});
603-
at::Tensor indexes = scores > score_thresh;
604-
605-
for (int j = 1; j < num_classes; j++) {
606-
at::Tensor index =
607-
at::nonzero(indexes.slice(1, j, j + 1).squeeze(1)).squeeze(1);
608-
at::Tensor score =
609-
scores.slice(1, j, j + 1).squeeze(1).index_select(0, index);
610-
at::Tensor bbox =
611-
bboxes.slice(1, j * 4, (j + 1) * 4).index_select(0, index);
612-
if (score.size(0) == 0) {
613-
continue;
614-
}
615-
auto iter = bs * num_classes + j;
616-
if (threshold > 0) {
617-
at::Tensor keep =
618-
nms_cpu_kernel<scalar_t, /*sorted*/ false>(bbox, score, threshold);
619-
bboxes_out[iter] = bbox.index_select(0, keep);
620-
scores_out[iter] = score.index_select(0, keep);
621-
labels_out[iter] = at::full({keep.sizes()}, j, torch::kInt64);
622-
} else {
623-
bboxes_out[iter] = bbox;
624-
scores_out[iter] = score;
625-
labels_out[iter] = at::full({score.sizes()}, j, torch::kInt64);
626-
}
627-
}
628-
}
629-
630-
std::vector<at::Tensor> bboxes_out_(nbatch);
631-
std::vector<at::Tensor> scores_out_(nbatch);
632-
std::vector<at::Tensor> labels_out_(nbatch);
633-
634-
#ifdef _OPENMP
635-
#if (_OPENMP >= 201307)
636-
#pragma omp parallel for simd schedule( \
637-
static) if (omp_get_max_threads() > 1 && !omp_in_parallel())
638-
#else
639-
#pragma omp parallel for schedule( \
640-
static) if (omp_get_max_threads() > 1 && !omp_in_parallel())
641-
#endif
642-
#endif
643-
for (int bs = 0; bs < nbatch; bs++) {
644-
std::vector<at::Tensor> valid_bboxes_out =
645-
remove_empty(bboxes_out, bs * num_classes, (bs + 1) * num_classes);
646-
std::vector<at::Tensor> valid_scores_out =
647-
remove_empty(scores_out, bs * num_classes, (bs + 1) * num_classes);
648-
std::vector<at::Tensor> valid_labels_out =
649-
remove_empty(labels_out, bs * num_classes, (bs + 1) * num_classes);
650-
if (valid_bboxes_out.size() > 0) {
651-
bboxes_out_[bs] = at::cat(valid_bboxes_out, 0);
652-
scores_out_[bs] = at::cat(valid_scores_out, 0);
653-
labels_out_[bs] = at::cat(valid_labels_out, 0);
654-
} else {
655-
bboxes_out_[bs] = at::empty({0, 4}, torch::kFloat);
656-
scores_out_[bs] = at::empty({0}, torch::kFloat);
657-
labels_out_[bs] = at::empty({0}, torch::kInt64);
658-
}
659-
auto number_of_detections = bboxes_out_[bs].size(0);
660-
if (number_of_detections > detections_per_img && detections_per_img > 0) {
661-
auto out_ = scores_out_[bs].kthvalue(
662-
number_of_detections - detections_per_img + 1);
663-
at::Tensor keep =
664-
at::nonzero(scores_out_[bs] >= std::get<0>(out_).item()).squeeze(1);
665-
bboxes_out_[bs] = bboxes_out_[bs].index_select(0, keep);
666-
scores_out_[bs] = scores_out_[bs].index_select(0, keep);
667-
labels_out_[bs] = labels_out_[bs].index_select(0, keep);
668-
}
669-
}
670-
return std::make_tuple(bboxes_out_, scores_out_, labels_out_);
671-
}
672-
673 563
at::Tensor nms_cpu_kernel_impl(
674 564
const at::Tensor& dets,
675 565
const at::Tensor& scores,
@@ -718,37 +608,6 @@ rpn_nms_cpu_kernel_impl(
718 608
return result;
719 609
}
720 610

721-
std::tuple<
722-
std::vector<at::Tensor>,
723-
std::vector<at::Tensor>,
724-
std::vector<at::Tensor>>
725-
box_head_nms_cpu_kernel_impl(
726-
const std::vector<at::Tensor>& batch_bboxes,
727-
const std::vector<at::Tensor>& batch_scores,
728-
const std::vector<std::tuple<int64_t, int64_t>>& image_shapes,
729-
const float score_thresh,
730-
const float threshold,
731-
const int detections_per_img,
732-
const int num_classes) {
733-
std::tuple<
734-
std::vector<at::Tensor>,
735-
std::vector<at::Tensor>,
736-
std::vector<at::Tensor>>
737-
result;
738-
AT_DISPATCH_FLOATING_TYPES(
739-
batch_bboxes[0].scalar_type(), "box_head_nms", [&] {
740-
result = box_head_nms_kernel<scalar_t>(
741-
batch_bboxes,
742-
batch_scores,
743-
image_shapes,
744-
score_thresh,
745-
threshold,
746-
detections_per_img,
747-
num_classes);
748-
});
749-
return result;
750-
}
751-
752 611
} // anonymous namespace
753 612

754 613
IPEX_REGISTER_DISPATCH(nms_cpu_kernel_stub, &nms_cpu_kernel_impl);
@@ -759,9 +618,5 @@ IPEX_REGISTER_DISPATCH(
759 618

760 619
IPEX_REGISTER_DISPATCH(rpn_nms_cpu_kernel_stub, &rpn_nms_cpu_kernel_impl);
761 620

762-
IPEX_REGISTER_DISPATCH(
763-
box_head_nms_cpu_kernel_stub,
764-
&box_head_nms_cpu_kernel_impl);
765-
766 621
} // namespace cpu
767 622
} // namespace torch_ipex

0 commit comments

Comments
 (0)