Skip to content

Commit ec64f44

Browse files
LielinJiang authored and wanghaoshuang committed
Make roi_perspective_transform op return mask and transform matrix,test=release/1.5 (#19391)
* make_roi_perspective_transform_op_return_mask_and_matrix
* make_roi_perspective_transform_op_return_mask_and_matrix
1 parent 1460648 commit ec64f44

File tree

5 files changed: +97 −27 lines changed

paddle/fluid/API.spec

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -353,7 +353,7 @@ paddle.fluid.layers.rpn_target_assign (ArgSpec(args=['bbox_pred', 'cls_logits',
353353
paddle.fluid.layers.retinanet_target_assign (ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'gt_labels', 'is_crowd', 'im_info', 'num_classes', 'positive_overlap', 'negative_overlap'], varargs=None, keywords=None, defaults=(1, 0.5, 0.4)), ('document', 'fa1d1c9d5e0111684c0db705f86a2595'))
354354
paddle.fluid.layers.sigmoid_focal_loss (ArgSpec(args=['x', 'label', 'fg_num', 'gamma', 'alpha'], varargs=None, keywords=None, defaults=(2, 0.25)), ('document', 'aeac6aae100173b3fc7f102cf3023a3d'))
355355
paddle.fluid.layers.anchor_generator (ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None)), ('document', '0aaacaf9858b8270a8ab5b0aacdd94b7'))
356-
paddle.fluid.layers.roi_perspective_transform (ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,)), ('document', 'd1ddc75629fedee46f82e631e22c79dc'))
356+
paddle.fluid.layers.roi_perspective_transform (ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,)), ('document', 'a82016342789ba9d85737e405f824ff1'))
357357
paddle.fluid.layers.generate_proposal_labels (ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random', 'is_cls_agnostic', 'is_cascade_rcnn'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True, False, False)), ('document', 'e87c1131e98715d3657a96c44db1b910'))
358358
paddle.fluid.layers.generate_proposals (ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None)), ('document', 'b7d707822b6af2a586bce608040235b1'))
359359
paddle.fluid.layers.generate_mask_labels (ArgSpec(args=['im_info', 'gt_classes', 'is_crowd', 'gt_segms', 'rois', 'labels_int32', 'num_classes', 'resolution'], varargs=None, keywords=None, defaults=None), ('document', 'b319b10ddaf17fb4ddf03518685a17ef'))

paddle/fluid/operators/detection/roi_perspective_transform_op.cc

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,9 @@ class CPUROIPerspectiveTransformOpKernel : public framework::OpKernel<T> {
243243
auto* in = ctx.Input<framework::Tensor>("X");
244244
auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
245245
auto* out = ctx.Output<framework::Tensor>("Out");
246-
246+
auto* mask = ctx.Output<framework::Tensor>("Mask");
247+
auto* out_transform_matrix =
248+
ctx.Output<framework::Tensor>("TransformMatrix");
247249
auto transformed_height = ctx.Attr<int>("transformed_height");
248250
auto transformed_width = ctx.Attr<int>("transformed_width");
249251
auto spatial_scale = ctx.Attr<float>("spatial_scale");
@@ -255,6 +257,7 @@ class CPUROIPerspectiveTransformOpKernel : public framework::OpKernel<T> {
255257
int rois_num = rois->dims()[0];
256258

257259
const T* input_data = in->data<T>();
260+
int* mask_data = mask->mutable_data<int>(ctx.GetPlace());
258261

259262
framework::Tensor roi2image;
260263
roi2image.Resize({rois_num});
@@ -269,6 +272,9 @@ class CPUROIPerspectiveTransformOpKernel : public framework::OpKernel<T> {
269272
T* output_data = out->mutable_data<T>(ctx.GetPlace());
270273
const T* rois_data = rois->data<T>();
271274

275+
T* transform_matrix =
276+
out_transform_matrix->mutable_data<T>({rois_num, 9}, ctx.GetPlace());
277+
272278
for (int n = 0; n < rois_num; ++n) {
273279
const T* n_rois = rois_data + n * 8;
274280
T roi_x[4];
@@ -279,10 +285,12 @@ class CPUROIPerspectiveTransformOpKernel : public framework::OpKernel<T> {
279285
}
280286
int image_id = roi2image_data[n];
281287
// Get transform matrix
282-
T transform_matrix[9];
288+
T matrix[9];
283289
get_transform_matrix<T>(transformed_width, transformed_height, roi_x,
284-
roi_y, transform_matrix);
285-
290+
roi_y, matrix);
291+
for (int i = 0; i < 9; i++) {
292+
transform_matrix[n * 9 + i] = matrix[i];
293+
}
286294
for (int c = 0; c < channels; ++c) {
287295
for (int out_h = 0; out_h < transformed_height; ++out_h) {
288296
for (int out_w = 0; out_w < transformed_width; ++out_w) {
@@ -291,20 +299,26 @@ class CPUROIPerspectiveTransformOpKernel : public framework::OpKernel<T> {
291299
c * transformed_height * transformed_width +
292300
out_h * transformed_width + out_w;
293301
T in_w, in_h;
294-
get_source_coords<T>(transform_matrix, out_w, out_h, &in_w, &in_h);
302+
get_source_coords<T>(matrix, out_w, out_h, &in_w, &in_h);
295303
if (in_quad<T>(in_w, in_h, roi_x, roi_y)) {
296304
if (GT<T>(-0.5, in_w) ||
297305
GT<T>(in_w, static_cast<T>(in_width - 0.5)) ||
298306
GT<T>(-0.5, in_h) ||
299307
GT<T>(in_h, static_cast<T>(in_height - 0.5))) {
300308
output_data[out_index] = 0.0;
309+
mask_data[(n * transformed_height + out_h) * transformed_width +
310+
out_w] = 0;
301311
} else {
302312
bilinear_interpolate(input_data, channels, in_width, in_height,
303313
image_id, c, in_w, in_h,
304314
output_data + out_index);
315+
mask_data[(n * transformed_height + out_h) * transformed_width +
316+
out_w] = 1;
305317
}
306318
} else {
307319
output_data[out_index] = 0.0;
320+
mask_data[(n * transformed_height + out_h) * transformed_width +
321+
out_w] = 0;
308322
}
309323
}
310324
}
@@ -467,7 +481,6 @@ class ROIPerspectiveTransformOp : public framework::OperatorWithKernel {
467481
"Output(Out) of ROIPerspectiveTransformOp should not be null.");
468482
auto input_dims = ctx->GetInputDim("X");
469483
auto rois_dims = ctx->GetInputDim("ROIs");
470-
471484
PADDLE_ENFORCE(input_dims.size() == 4,
472485
"The format of input tensor is NCHW.");
473486
PADDLE_ENFORCE(rois_dims.size() == 2,
@@ -476,7 +489,6 @@ class ROIPerspectiveTransformOp : public framework::OperatorWithKernel {
476489
PADDLE_ENFORCE(rois_dims[1] == 8,
477490
"ROIs should be a 2-D LoDTensor of shape (num_rois, 8)"
478491
"given as [[x0, y0, x1, y1, x2, y2, x3, y3], ...].");
479-
480492
int transformed_height = ctx->Attrs().Get<int>("transformed_height");
481493
int transformed_width = ctx->Attrs().Get<int>("transformed_width");
482494
float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
@@ -493,7 +505,18 @@ class ROIPerspectiveTransformOp : public framework::OperatorWithKernel {
493505
static_cast<int64_t>(transformed_width)});
494506
auto out_dims = framework::make_ddim(out_dims_v);
495507

508+
std::vector<int64_t> mask_dims_v({rois_dims[0], // num_rois
509+
1, // channels
510+
static_cast<int64_t>(transformed_height),
511+
static_cast<int64_t>(transformed_width)});
512+
auto mask_dims = framework::make_ddim(mask_dims_v);
513+
514+
std::vector<int64_t> matrix_dims_v({rois_dims[0], 9});
515+
auto matrix_dims = framework::make_ddim(matrix_dims_v);
516+
496517
ctx->SetOutputDim("Out", out_dims);
518+
ctx->SetOutputDim("Mask", mask_dims);
519+
ctx->SetOutputDim("TransformMatrix", matrix_dims);
497520
ctx->SetOutputDim("Out2InIdx", out_dims);
498521
ctx->SetOutputDim("Out2InWeights", out_dims);
499522
ctx->ShareLoD("ROIs", /*->*/ "Out");
@@ -552,6 +575,16 @@ class ROIPerspectiveTransformOpMaker
552575
"(Tensor), "
553576
"The output of ROIPerspectiveTransformOp is a 4-D tensor with shape "
554577
"(num_rois, channels, transformed_h, transformed_w).");
578+
AddOutput("Mask",
579+
"(Tensor), "
580+
"The output mask of ROIPerspectiveTransformOp is a 4-D tensor "
581+
"with shape "
582+
"(num_rois, 1, transformed_h, transformed_w).");
583+
AddOutput("TransformMatrix",
584+
"(Tensor), "
585+
"The output transform matrix of ROIPerspectiveTransformOp is a "
586+
"2-D tensor with shape "
587+
"(num_rois, 9).");
555588
AddOutput("Out2InIdx",
556589
"(Tensor), "
557590
"An intermediate tensor used to map indexes of input feature map "

paddle/fluid/operators/detection/roi_perspective_transform_op.cu

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -274,11 +274,14 @@ __device__ void get_transform_matrix(const int transformed_width,
274274
}
275275

276276
template <typename T>
277-
__global__ void RoiTransformKernel(
278-
const float* input_data, const float* rois_data, const int* roi2image_data,
279-
int num_rois, int in_height, int in_width, int channels,
280-
int transformed_height, int transformed_width, float spatial_scale,
281-
T* output_data, int* out2in_idx, T* out2in_w) {
277+
__global__ void RoiTransformKernel(const float* input_data,
278+
const float* rois_data,
279+
const int* roi2image_data, int num_rois,
280+
int in_height, int in_width, int channels,
281+
int transformed_height,
282+
int transformed_width, float spatial_scale,
283+
T* output_data, int* out2in_idx, T* out2in_w,
284+
int* mask, T* transform_matrix) {
282285
int output_size =
283286
num_rois * transformed_height * transformed_width * channels;
284287

@@ -306,7 +309,9 @@ __global__ void RoiTransformKernel(
306309
T matrix[9];
307310
get_transform_matrix<T>(transformed_width, transformed_height, roi_x, roi_y,
308311
matrix);
309-
312+
for (int i = 0; i < 9; i++) {
313+
transform_matrix[n * 9 + i] = matrix[i];
314+
}
310315
// Get source coords
311316
T in_w;
312317
T in_h;
@@ -317,17 +322,20 @@ __global__ void RoiTransformKernel(
317322
GT<T>(-0.5, in_h) || GT<T>(in_h, static_cast<T>(in_height - 0.5))) {
318323
// Skip if source coords is not in input image
319324
output_data[index] = 0.0;
325+
mask[(n * transformed_height + out_h) * transformed_width + out_w] = 0;
320326
} else {
321327
// Perform bilinear interpolation
322328
int in_n = roi2image_data[n];
323329
bilinear_interpolate<T>(input_data, channels, in_width, in_height, in_n,
324330
c, in_w, in_h, output_data + index, index,
325331
out2in_idx, out2in_w);
332+
mask[(n * transformed_height + out_h) * transformed_width + out_w] = 1;
326333
}
327334

328335
} else {
329336
// Skip if source coords is not in quad
330337
output_data[index] = 0.0;
338+
mask[(n * transformed_height + out_h) * transformed_width + out_w] = 0;
331339
}
332340
}
333341
}
@@ -341,7 +349,11 @@ class CUDAROIPerspectiveTransformOpKernel : public framework::OpKernel<T> {
341349
auto* out = ctx.Output<framework::Tensor>("Out");
342350
auto* out2in_idx = ctx.Output<framework::Tensor>("Out2InIdx");
343351
auto* out2in_w = ctx.Output<framework::Tensor>("Out2InWeights");
352+
auto* mask = ctx.Output<framework::Tensor>("Mask");
353+
auto* out_transform_matrix =
354+
ctx.Output<framework::Tensor>("TransformMatrix");
344355

356+
int* mask_data = mask->mutable_data<int>(ctx.GetPlace());
345357
int* out2in_idx_data =
346358
out2in_idx->mutable_data<int>({out->numel(), 4}, ctx.GetPlace());
347359
T* out2in_w_data =
@@ -382,10 +394,15 @@ class CUDAROIPerspectiveTransformOpKernel : public framework::OpKernel<T> {
382394
int block = 512;
383395
int grid = (out_size + block - 1) / block;
384396

397+
// Get transform matrix
398+
T* matrix =
399+
out_transform_matrix->mutable_data<T>({rois_num, 9}, ctx.GetPlace());
400+
385401
RoiTransformKernel<T><<<grid, block, 0, stream>>>(
386402
input_data, rois_data, roi2image_dev.data<int>(), rois_num, in_height,
387403
in_width, channels, transformed_height, transformed_width,
388-
spatial_scale, output_data, out2in_idx_data, out2in_w_data);
404+
spatial_scale, output_data, out2in_idx_data, out2in_w_data, mask_data,
405+
matrix);
389406
}
390407
};
391408

python/paddle/fluid/layers/detection.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2100,8 +2100,16 @@ def roi_perspective_transform(input,
21002100
spatial_scale (float): Spatial scale factor to scale ROI coords. Default: 1.0
21012101
21022102
Returns:
2103-
Variable: The output of ROIPerspectiveTransformOp which is a 4-D tensor with shape
2104-
(num_rois, channels, transformed_h, transformed_w).
2103+
tuple: A tuple with three Variables. (out, mask, transform_matrix)
2104+
2105+
out: The output of ROIPerspectiveTransformOp which is a 4-D tensor with shape
2106+
(num_rois, channels, transformed_h, transformed_w).
2107+
2108+
mask: The mask of ROIPerspectiveTransformOp which is a 4-D tensor with shape
2109+
(num_rois, 1, transformed_h, transformed_w).
2110+
2111+
transform_matrix: The transform matrix of ROIPerspectiveTransformOp which is
2112+
a 2-D tensor with shape (num_rois, 9).
21052113
21062114
Examples:
21072115
.. code-block:: python
@@ -2110,11 +2118,13 @@ def roi_perspective_transform(input,
21102118
21112119
x = fluid.layers.data(name='x', shape=[256, 28, 28], dtype='float32')
21122120
rois = fluid.layers.data(name='rois', shape=[8], lod_level=1, dtype='float32')
2113-
out = fluid.layers.roi_perspective_transform(x, rois, 7, 7, 1.0)
2121+
out, mask, transform_matrix = fluid.layers.roi_perspective_transform(x, rois, 7, 7, 1.0)
21142122
"""
21152123
helper = LayerHelper('roi_perspective_transform', **locals())
21162124
dtype = helper.input_dtype()
21172125
out = helper.create_variable_for_type_inference(dtype)
2126+
mask = helper.create_variable_for_type_inference(dtype="int32")
2127+
transform_matrix = helper.create_variable_for_type_inference(dtype)
21182128
out2in_idx = helper.create_variable_for_type_inference(dtype="int32")
21192129
out2in_w = helper.create_variable_for_type_inference(dtype)
21202130
helper.append_op(
@@ -2124,14 +2134,16 @@ def roi_perspective_transform(input,
21242134
outputs={
21252135
"Out": out,
21262136
"Out2InIdx": out2in_idx,
2127-
"Out2InWeights": out2in_w
2137+
"Out2InWeights": out2in_w,
2138+
"Mask": mask,
2139+
"TransformMatrix": transform_matrix
21282140
},
21292141
attrs={
21302142
"transformed_height": transformed_height,
21312143
"transformed_width": transformed_width,
21322144
"spatial_scale": spatial_scale
21332145
})
2134-
return out
2146+
return out, mask, transform_matrix
21352147

21362148

21372149
def generate_proposal_labels(rpn_rois,

python/paddle/fluid/tests/unittests/test_roi_perspective_transform_op.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,9 @@ def roi_transform(in_data, rois, rois_lod, transformed_height,
198198
roi2image[j] = i
199199

200200
out = np.zeros([rois_num, channels, transformed_height, transformed_width])
201-
201+
mask = np.zeros(
202+
[rois_num, 1, transformed_height, transformed_width]).astype('int')
203+
matrix = np.zeros([rois_num, 9], dtype=in_data.dtype)
202204
for n in range(rois_num):
203205
roi_x = []
204206
roi_y = []
@@ -208,7 +210,7 @@ def roi_transform(in_data, rois, rois_lod, transformed_height,
208210
image_id = roi2image[n]
209211
transform_matrix = get_transform_matrix(
210212
transformed_width, transformed_height, roi_x, roi_y)
211-
213+
matrix[n] = transform_matrix
212214
for c in range(channels):
213215
for out_h in range(transformed_height):
214216
for out_w in range(transformed_width):
@@ -219,9 +221,11 @@ def roi_transform(in_data, rois, rois_lod, transformed_height,
219221
in_h, -0.5) and lt_e(in_h, in_height - 0.5):
220222
out[n][c][out_h][out_w] = bilinear_interpolate(
221223
in_data, image_id, c, in_w, in_h)
224+
mask[n][0][out_h][out_w] = 1
222225
else:
223226
out[n][c][out_h][out_w] = 0.0
224-
return out.astype("float32")
227+
mask[n][0][out_h][out_w] = 0
228+
return out.astype("float32"), mask, matrix
225229

226230

227231
class TestROIPoolOp(OpTest):
@@ -236,10 +240,14 @@ def set_data(self):
236240
'transformed_height': self.transformed_height,
237241
'transformed_width': self.transformed_width
238242
}
239-
out = roi_transform(self.x, self.rois, self.rois_lod,
240-
self.transformed_height, self.transformed_width,
241-
self.spatial_scale)
242-
self.outputs = {'Out': out}
243+
out, mask, transform_matrix = roi_transform(
244+
self.x, self.rois, self.rois_lod, self.transformed_height,
245+
self.transformed_width, self.spatial_scale)
246+
self.outputs = {
247+
'Out': out,
248+
'Mask': mask,
249+
'TransformMatrix': transform_matrix
250+
}
243251

244252
def init_test_case(self):
245253
self.batch_size = 2

0 commit comments

Comments
 (0)