Skip to content

Commit 593ad76

Browse files
committed
refactor(op): polish generate_proposals_op
Polish styles in generate_proposals_op. 1. inline lambda functions rather than use std::function, to save a variable. 2. add `static inline` to template functions in .cc * Make them static to prevent generating symbols. * Make them inline to give the compiler a hint to inline them where possible. * Note: if the functions are not static, they cannot be inlined since their symbols must be exported. 3. add `static` to global functions in .cc * Make them static to prevent generating symbols. 4. Use Vector<uint64> instead of manually managing storage between devices. 5. Prefer to use platform::ForRange, so we can optimize `ForRange` by just changing `for_range.h` if it is needed. 6. Do not change the shape of inputs. test=develop
1 parent 7a5f3f7 commit 593ad76

File tree

3 files changed

+190
-178
lines changed

3 files changed

+190
-178
lines changed

paddle/fluid/operators/detection/generate_proposals_op.cc

Lines changed: 97 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
See the License for the specific language governing permissions and
1313
limitations under the License. */
1414

15+
#include <cmath>
16+
#include <cstring>
1517
#include <string>
1618
#include <vector>
1719
#include "paddle/fluid/framework/op_registry.h"
18-
#include "paddle/fluid/framework/var_type.h"
20+
#include "paddle/fluid/operators/detail/safe_ref.h"
1921
#include "paddle/fluid/operators/gather.h"
2022
#include "paddle/fluid/operators/math/math_function.h"
2123

@@ -25,21 +27,17 @@ namespace operators {
2527
using Tensor = framework::Tensor;
2628
using LoDTensor = framework::LoDTensor;
2729

28-
struct AppendProposalsFunctor {
29-
LoDTensor *out_;
30-
int64_t offset_;
31-
Tensor *to_add_;
30+
static const double kBBoxClipDefault = std::log(1000.0 / 16.0);
3231

33-
AppendProposalsFunctor(LoDTensor *out, int64_t offset, Tensor *to_add)
34-
: out_(out), offset_(offset), to_add_(to_add) {}
35-
36-
template <typename T>
37-
void apply() const {
38-
auto *out_data = out_->data<T>();
39-
auto *to_add_data = to_add_->data<T>();
40-
memcpy(out_data + offset_, to_add_data, to_add_->numel() * sizeof(T));
41-
}
42-
};
32+
static void AppendProposals(Tensor *dst, int64_t offset, const Tensor &src) {
33+
auto *out_data = dst->data<void>();
34+
auto *to_add_data = src.data<void>();
35+
size_t size_of_t = framework::SizeOfType(src.type());
36+
offset *= size_of_t;
37+
std::memcpy(
38+
reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(out_data) + offset),
39+
to_add_data, src.numel() * size_of_t);
40+
}
4341

4442
class GenerateProposalsOp : public framework::OperatorWithKernel {
4543
public:
@@ -75,8 +73,9 @@ class GenerateProposalsOp : public framework::OperatorWithKernel {
7573
};
7674

7775
template <class T>
78-
void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
79-
Tensor *bbox_deltas, Tensor *variances, Tensor *proposals) {
76+
static inline void BoxCoder(const platform::DeviceContext &ctx,
77+
Tensor *all_anchors, Tensor *bbox_deltas,
78+
Tensor *variances, Tensor *proposals) {
8079
T *proposals_data = proposals->mutable_data<T>(ctx.GetPlace());
8180

8281
int64_t row = all_anchors->dims()[0];
@@ -108,22 +107,22 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
108107
anchor_center_y;
109108
bbox_width = std::exp(std::min<T>(variances_data[i * len + 2] *
110109
bbox_deltas_data[i * len + 2],
111-
std::log(1000.0 / 16.0))) *
110+
kBBoxClipDefault)) *
112111
anchor_width;
113112
bbox_height = std::exp(std::min<T>(variances_data[i * len + 3] *
114113
bbox_deltas_data[i * len + 3],
115-
std::log(1000.0 / 16.0))) *
114+
kBBoxClipDefault)) *
116115
anchor_height;
117116
} else {
118117
bbox_center_x =
119118
bbox_deltas_data[i * len] * anchor_width + anchor_center_x;
120119
bbox_center_y =
121120
bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y;
122121
bbox_width = std::exp(std::min<T>(bbox_deltas_data[i * len + 2],
123-
std::log(1000.0 / 16.0))) *
122+
kBBoxClipDefault)) *
124123
anchor_width;
125124
bbox_height = std::exp(std::min<T>(bbox_deltas_data[i * len + 3],
126-
std::log(1000.0 / 16.0))) *
125+
kBBoxClipDefault)) *
127126
anchor_height;
128127
}
129128

@@ -136,30 +135,32 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
136135
}
137136

138137
template <class T>
139-
void ClipTiledBoxes(const platform::DeviceContext &ctx, const Tensor &im_info,
140-
Tensor *boxes) {
138+
static inline void ClipTiledBoxes(const platform::DeviceContext &ctx,
139+
const Tensor &im_info, Tensor *boxes) {
141140
T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace());
142141
const T *im_info_data = im_info.data<T>();
142+
T zero(0);
143143
for (int64_t i = 0; i < boxes->numel(); ++i) {
144144
if (i % 4 == 0) {
145145
boxes_data[i] =
146-
std::max(std::min(boxes_data[i], im_info_data[1] - 1), 0.0f);
146+
std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero);
147147
} else if (i % 4 == 1) {
148148
boxes_data[i] =
149-
std::max(std::min(boxes_data[i], im_info_data[0] - 1), 0.0f);
149+
std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero);
150150
} else if (i % 4 == 2) {
151151
boxes_data[i] =
152-
std::max(std::min(boxes_data[i], im_info_data[1] - 1), 0.0f);
152+
std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero);
153153
} else {
154154
boxes_data[i] =
155-
std::max(std::min(boxes_data[i], im_info_data[0] - 1), 0.0f);
155+
std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero);
156156
}
157157
}
158158
}
159159

160160
template <class T>
161-
void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes,
162-
float min_size, const Tensor &im_info, Tensor *keep) {
161+
static inline void FilterBoxes(const platform::DeviceContext &ctx,
162+
Tensor *boxes, float min_size,
163+
const Tensor &im_info, Tensor *keep) {
163164
const T *im_info_data = im_info.data<T>();
164165
T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace());
165166
T im_scale = im_info_data[2];
@@ -185,24 +186,24 @@ void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes,
185186
keep->Resize({keep_len});
186187
}
187188

188-
bool SortScorePairDescend(const std::pair<float, int> &pair1,
189-
const std::pair<float, int> &pair2) {
190-
return pair1.first > pair2.first;
191-
}
192-
193189
template <class T>
194-
void GetMaxScoreIndex(const std::vector<T> &scores,
195-
std::vector<std::pair<T, int>> *sorted_indices) {
190+
static inline std::vector<std::pair<T, int>> GetSortedScoreIndex(
191+
const std::vector<T> &scores) {
192+
std::vector<std::pair<T, int>> sorted_indices;
193+
sorted_indices.reserve(scores.size());
196194
for (size_t i = 0; i < scores.size(); ++i) {
197-
sorted_indices->push_back(std::make_pair(scores[i], i));
195+
sorted_indices.emplace_back(scores[i], i);
198196
}
199197
// Sort the score pair according to the scores in descending order
200-
std::stable_sort(sorted_indices->begin(), sorted_indices->end(),
201-
SortScorePairDescend);
198+
std::stable_sort(sorted_indices.begin(), sorted_indices.end(),
199+
[](const std::pair<T, int> &a, const std::pair<T, int> &b) {
200+
return a.first < b.first;
201+
});
202+
return sorted_indices;
202203
}
203204

204205
template <class T>
205-
T BBoxArea(const T *box, const bool normalized) {
206+
static inline T BBoxArea(const T *box, bool normalized) {
206207
if (box[2] < box[0] || box[3] < box[1]) {
207208
// If coordinate values are is invalid
208209
// (e.g. xmax < xmin or ymax < ymin), return 0.
@@ -220,7 +221,7 @@ T BBoxArea(const T *box, const bool normalized) {
220221
}
221222

222223
template <class T>
223-
T JaccardOverlap(const T *box1, const T *box2, const bool normalized) {
224+
static inline T JaccardOverlap(const T *box1, const T *box2, bool normalized) {
224225
if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
225226
box2[3] < box1[1]) {
226227
return static_cast<T>(0.);
@@ -229,39 +230,49 @@ T JaccardOverlap(const T *box1, const T *box2, const bool normalized) {
229230
const T inter_ymin = std::max(box1[1], box2[1]);
230231
const T inter_xmax = std::min(box1[2], box2[2]);
231232
const T inter_ymax = std::min(box1[3], box2[3]);
232-
const T inter_w = std::max(0.0f, inter_xmax - inter_xmin + 1);
233-
const T inter_h = std::max(0.0f, inter_ymax - inter_ymin + 1);
233+
const T inter_w = std::max(T(0), inter_xmax - inter_xmin + 1);
234+
const T inter_h = std::max(T(0), inter_ymax - inter_ymin + 1);
234235
const T inter_area = inter_w * inter_h;
235236
const T bbox1_area = BBoxArea<T>(box1, normalized);
236237
const T bbox2_area = BBoxArea<T>(box2, normalized);
237238
return inter_area / (bbox1_area + bbox2_area - inter_area);
238239
}
239240
}
240241

242+
template <typename T>
243+
static inline Tensor VectorToTensor(const std::vector<T> &selected_indices,
244+
int selected_num) {
245+
Tensor keep_nms;
246+
keep_nms.Resize({selected_num});
247+
auto *keep_data = keep_nms.mutable_data<T>(platform::CPUPlace());
248+
for (int i = 0; i < selected_num; ++i) {
249+
keep_data[i] = selected_indices[i];
250+
}
251+
return keep_nms;
252+
}
253+
241254
template <class T>
242-
Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, Tensor *scores,
243-
const T nms_threshold, const float eta) {
255+
static inline Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox,
256+
Tensor *scores, T nms_threshold, float eta) {
244257
PADDLE_ENFORCE_NOT_NULL(bbox);
245258
int64_t num_boxes = bbox->dims()[0];
246259
// 4: [xmin ymin xmax ymax]
247260
int64_t box_size = bbox->dims()[1];
248261

249262
std::vector<T> scores_data(num_boxes);
250263
std::copy_n(scores->data<T>(), num_boxes, scores_data.begin());
251-
std::vector<std::pair<T, int>> sorted_indices;
252-
GetMaxScoreIndex<T>(scores_data, &sorted_indices);
264+
std::vector<std::pair<T, int>> sorted_indices =
265+
GetSortedScoreIndex<T>(scores_data);
253266

254267
std::vector<int> selected_indices;
255268
int selected_num = 0;
256269
T adaptive_threshold = nms_threshold;
257270
const T *bbox_data = bbox->data<T>();
258-
bool flag;
259271
while (sorted_indices.size() != 0) {
260-
int idx = sorted_indices.front().second;
261-
flag = true;
262-
for (size_t k = 0; k < selected_indices.size(); ++k) {
272+
int idx = sorted_indices.back().second;
273+
bool flag = true;
274+
for (int kept_idx : selected_indices) {
263275
if (flag) {
264-
const int kept_idx = selected_indices[k];
265276
T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
266277
bbox_data + kept_idx * box_size, false);
267278
flag = (overlap <= adaptive_threshold);
@@ -271,32 +282,29 @@ Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, Tensor *scores,
271282
}
272283
if (flag) {
273284
selected_indices.push_back(idx);
274-
selected_num++;
285+
++selected_num;
275286
}
276-
sorted_indices.erase(sorted_indices.begin());
287+
sorted_indices.erase(sorted_indices.end() - 1);
277288
if (flag && eta < 1 && adaptive_threshold > 0.5) {
278289
adaptive_threshold *= eta;
279290
}
280291
}
281-
Tensor keep_nms;
282-
keep_nms.Resize({selected_num});
283-
int *keep_data = keep_nms.mutable_data<int>(ctx.GetPlace());
284-
for (int i = 0; i < selected_num; ++i) {
285-
keep_data[i] = selected_indices[i];
286-
}
287-
288-
return keep_nms;
292+
return VectorToTensor(selected_indices, selected_num);
289293
}
290294

291-
template <typename DeviceContext, typename T>
295+
template <typename T>
292296
class GenerateProposalsKernel : public framework::OpKernel<T> {
293297
public:
294298
void Compute(const framework::ExecutionContext &context) const override {
295299
auto *scores = context.Input<Tensor>("Scores");
296300
auto *bbox_deltas = context.Input<Tensor>("BboxDeltas");
297301
auto *im_info = context.Input<Tensor>("ImInfo");
298-
auto *anchors = context.Input<Tensor>("Anchors");
299-
auto *variances = context.Input<Tensor>("Variances");
302+
auto anchors = detail::Ref(context.Input<Tensor>("Anchors"),
303+
"Cannot find input Anchors(%s) in scope",
304+
context.Inputs("Anchors")[0]);
305+
auto variances = detail::Ref(context.Input<Tensor>("Variances"),
306+
"Cannot find input Variances(%s) in scope",
307+
context.Inputs("Variances")[0]);
300308

301309
auto *rpn_rois = context.Output<LoDTensor>("RpnRois");
302310
auto *rpn_roi_probs = context.Output<LoDTensor>("RpnRoiProbs");
@@ -307,15 +315,16 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
307315
float min_size = context.Attr<float>("min_size");
308316
float eta = context.Attr<float>("eta");
309317

310-
auto &dev_ctx = context.template device_context<DeviceContext>();
318+
auto &dev_ctx =
319+
context.template device_context<platform::CPUDeviceContext>();
311320

312-
auto scores_dim = scores->dims();
321+
auto &scores_dim = scores->dims();
313322
int64_t num = scores_dim[0];
314323
int64_t c_score = scores_dim[1];
315324
int64_t h_score = scores_dim[2];
316325
int64_t w_score = scores_dim[3];
317326

318-
auto bbox_dim = bbox_deltas->dims();
327+
auto &bbox_dim = bbox_deltas->dims();
319328
int64_t c_bbox = bbox_dim[1];
320329
int64_t h_bbox = bbox_dim[2];
321330
int64_t w_bbox = bbox_dim[3];
@@ -330,17 +339,17 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
330339
scores_swap.mutable_data<T>({num, h_score, w_score, c_score},
331340
dev_ctx.GetPlace());
332341

333-
math::Transpose<DeviceContext, T, 4> trans;
342+
math::Transpose<platform::CPUDeviceContext, T, 4> trans;
334343
std::vector<int> axis = {0, 2, 3, 1};
335344
trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis);
336345
trans(dev_ctx, *scores, &scores_swap, axis);
337346

338347
framework::LoD lod;
339-
std::vector<size_t> lod0(1, 0);
340-
Tensor *anchor = const_cast<framework::Tensor *>(anchors);
341-
anchor->Resize({anchors->numel() / 4, 4});
342-
Tensor *var = const_cast<framework::Tensor *>(variances);
343-
var->Resize({var->numel() / 4, 4});
348+
lod.resize(1);
349+
auto &lod0 = lod[0];
350+
lod0.push_back(0);
351+
anchors.Resize({anchors.numel() / 4, 4});
352+
variances.Resize({variances.numel() / 4, 4});
344353

345354
int64_t num_proposals = 0;
346355
for (int64_t i = 0; i < num; ++i) {
@@ -352,32 +361,25 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
352361
scores_slice.Resize({h_score * w_score * c_score, 1});
353362

354363
std::pair<Tensor, Tensor> tensor_pair =
355-
ProposalForOneImage(dev_ctx, im_info_slice, *anchor, *var,
364+
ProposalForOneImage(dev_ctx, im_info_slice, anchors, variances,
356365
bbox_deltas_slice, scores_slice, pre_nms_top_n,
357366
post_nms_top_n, nms_thresh, min_size, eta);
358-
Tensor proposals = tensor_pair.first;
359-
Tensor scores = tensor_pair.second;
360-
361-
framework::VisitDataType(
362-
framework::ToDataType(rpn_rois->type()),
363-
AppendProposalsFunctor(rpn_rois, 4 * num_proposals, &proposals));
364-
framework::VisitDataType(
365-
framework::ToDataType(rpn_roi_probs->type()),
366-
AppendProposalsFunctor(rpn_roi_probs, num_proposals, &scores));
367+
Tensor &proposals = tensor_pair.first;
368+
Tensor &scores = tensor_pair.second;
367369

370+
AppendProposals(rpn_rois, 4 * num_proposals, proposals);
371+
AppendProposals(rpn_roi_probs, num_proposals, scores);
368372
num_proposals += proposals.dims()[0];
369-
lod0.emplace_back(num_proposals);
373+
lod0.push_back(num_proposals);
370374
}
371-
372-
lod.emplace_back(lod0);
373375
rpn_rois->set_lod(lod);
374376
rpn_roi_probs->set_lod(lod);
375377
rpn_rois->Resize({num_proposals, 4});
376378
rpn_roi_probs->Resize({num_proposals, 1});
377379
}
378380

379381
std::pair<Tensor, Tensor> ProposalForOneImage(
380-
const DeviceContext &ctx, const Tensor &im_info_slice,
382+
const platform::CPUDeviceContext &ctx, const Tensor &im_info_slice,
381383
const Tensor &anchors, const Tensor &variances,
382384
const Tensor &bbox_deltas_slice, // [M, 4]
383385
const Tensor &scores_slice, // [N, 1]
@@ -392,10 +394,9 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
392394
for (int i = 0; i < scores_slice.numel(); ++i) {
393395
index[i] = i;
394396
}
395-
std::function<bool(const int64_t &, const int64_t &)> compare =
396-
[scores_data](const int64_t &i, const int64_t &j) {
397-
return scores_data[i] > scores_data[j];
398-
};
397+
auto compare = [scores_data](const int64_t &i, const int64_t &j) {
398+
return scores_data[i] > scores_data[j];
399+
};
399400

400401
if (pre_nms_top_n <= 0 || pre_nms_top_n >= scores_slice.numel()) {
401402
std::sort(index, index + scores_slice.numel(), compare);
@@ -469,12 +470,12 @@ class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
469470
Generate Proposals OP
470471
471472
This operator proposes rois according to each box with their probability to be a foreground object and
472-
the box can be calculated by anchors. Bbox_deltais and scores are the output of RPN. Final proposals
473+
the box can be calculated by anchors. Bbox_deltas and scores are the output of RPN. Final proposals
473474
could be used to train detection net.
474475
475476
Scores is the probability for each box to be an object. In format of (N, A, H, W) where N is batch size, A is number
476477
of anchors, H and W are height and width of the feature map.
477-
BboxDeltas is the differece between predicted box locatoin and anchor location. In format of (N, 4*A, H, W)
478+
BboxDeltas is the difference between predicted box location and anchor location. In format of (N, 4*A, H, W)
478479
479480
For generating proposals, this operator transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4) and
480481
calculate box locations as proposals candidates. Then clip boxes to image and remove predicted boxes with small area.
@@ -490,6 +491,5 @@ namespace ops = paddle::operators;
490491
REGISTER_OPERATOR(generate_proposals, ops::GenerateProposalsOp,
491492
ops::GenerateProposalsOpMaker,
492493
paddle::framework::EmptyGradOpMaker);
493-
REGISTER_OP_CPU_KERNEL(
494-
generate_proposals,
495-
ops::GenerateProposalsKernel<paddle::platform::CPUDeviceContext, float>);
494+
REGISTER_OP_CPU_KERNEL(generate_proposals, ops::GenerateProposalsKernel<float>,
495+
ops::GenerateProposalsKernel<double>);

0 commit comments

Comments
 (0)