Skip to content

Commit 4020451

Browse files
committed
delete memory copy from linear_chain_crf_op.
1 parent 7a68787 commit 4020451

File tree

2 files changed

+20
-210
lines changed

2 files changed

+20
-210
lines changed

paddle/operators/linear_chain_crf_op.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ class LinearChainCRFOp : public framework::OperatorWithKernel {
187187
const framework::ExecutionContext& ctx) const override {
188188
return framework::OpKernelType(
189189
framework::ToDataType(ctx.Input<LoDTensor>("Emission")->type()),
190-
ctx.device_context());
190+
platform::CPUPlace());
191191
}
192192
};
193193

@@ -248,7 +248,7 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
248248
framework::ToDataType(
249249
ctx.Input<LoDTensor>(framework::GradVarName("LogLikelihood"))
250250
->type()),
251-
ctx.device_context());
251+
platform::CPUPlace());
252252
}
253253
};
254254

paddle/operators/linear_chain_crf_op.h

Lines changed: 18 additions & 208 deletions
Original file line numberDiff line numberDiff line change
@@ -65,57 +65,14 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
6565
const size_t level = 0;
6666
const size_t seq_num = in_lod[level].size() - 1;
6767

68-
// These local variables hold the inputs and outputs, garanteeing them on
69-
// CPU memory, to provide a consistent reference.
70-
// TODO(caoying) Fix this by moving all these local variables into the
71-
// class's data members once we can profile the whole training process.
72-
LoDTensor* emission_weights = nullptr;
73-
LoDTensor emission_weight_tensor;
74-
Tensor* transition_weights = nullptr;
75-
Tensor transition_weight_tensor;
76-
LoDTensor* label = nullptr;
77-
LoDTensor label_tensor;
78-
79-
Tensor* emission_exps = nullptr;
80-
Tensor emission_exps_tensor;
81-
Tensor* transition_exps = nullptr;
82-
Tensor transition_exps_tensor;
83-
Tensor* alpha = nullptr;
84-
Tensor alpha_tensor;
85-
Tensor* ll = nullptr;
86-
Tensor ll_tensor;
87-
88-
if (platform::is_gpu_place(ctx.GetPlace())) {
89-
emission_weights = &emission_weight_tensor;
90-
transition_weights = &transition_weight_tensor;
91-
label = &label_tensor;
92-
93-
CopyInputsToCpuMemory(
94-
ctx.device_context(), *ctx.Input<LoDTensor>("Emission"),
95-
*ctx.Input<Tensor>("Transition"), *ctx.Input<LoDTensor>("Label"),
96-
emission_weights, transition_weights, label);
97-
98-
emission_exps = &emission_exps_tensor;
99-
emission_exps->Resize(emission_weights->dims());
100-
101-
transition_exps = &transition_exps_tensor;
102-
transition_exps->Resize(transition_weights->dims());
103-
104-
alpha = &alpha_tensor;
105-
alpha->Resize(ctx.Output<Tensor>("Alpha")->dims());
106-
107-
ll = &ll_tensor;
108-
} else {
109-
emission_weights =
110-
const_cast<LoDTensor*>(ctx.Input<LoDTensor>("Emission"));
111-
transition_weights = const_cast<Tensor*>(ctx.Input<Tensor>("Transition"));
112-
label = const_cast<LoDTensor*>(ctx.Input<LoDTensor>("Label"));
113-
114-
emission_exps = ctx.Output<Tensor>("EmissionExps");
115-
transition_exps = ctx.Output<Tensor>("TransitionExps");
116-
alpha = ctx.Output<Tensor>("Alpha");
117-
ll = ctx.Output<Tensor>("LogLikelihood");
118-
}
68+
const LoDTensor* emission_weights = ctx.Input<LoDTensor>("Emission");
69+
const Tensor* transition_weights = ctx.Input<Tensor>("Transition");
70+
const LoDTensor* label = ctx.Input<LoDTensor>("Label");
71+
72+
Tensor* emission_exps = ctx.Output<Tensor>("EmissionExps");
73+
Tensor* transition_exps = ctx.Output<Tensor>("TransitionExps");
74+
Tensor* alpha = ctx.Output<Tensor>("Alpha");
75+
Tensor* ll = ctx.Output<Tensor>("LogLikelihood");
11976

12077
// Because the computation codes only runs on CPU, here the memory for all
12178
// the outputs is FIXED to be allocated on the CPU memory.
@@ -173,61 +130,9 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
173130
one_seq, one_seq_row_max, one_seq_exps, *transition_weights,
174131
*transition_exps, one_seq_label, &one_seq_alpha);
175132
}
176-
177-
if (platform::is_gpu_place(ctx.GetPlace())) {
178-
CopyOutputsToGpuMemory(
179-
ctx.device_context(), *emission_exps, *transition_exps, *alpha, *ll,
180-
ctx.Output<Tensor>("EmissionExps"),
181-
ctx.Output<Tensor>("TransitionExps"), ctx.Output<Tensor>("Alpha"),
182-
ctx.Output<Tensor>("LogLikelihood"));
183-
}
184133
};
185134

186135
private:
187-
void CopyInputsToCpuMemory(const platform::DeviceContext& ctx,
188-
const LoDTensor& emission_weights_src,
189-
const Tensor& transition_weights_src,
190-
const LoDTensor& label_src,
191-
LoDTensor* emission_weights_dst,
192-
Tensor* transition_weights_dst,
193-
LoDTensor* label_dst) const {
194-
// Copy the inputs from GPU memory to CPU memory if this operators runs on
195-
// GPU device.
196-
auto copyLoDTensor = [](const platform::DeviceContext& ctx,
197-
const LoDTensor& src, LoDTensor* dst) {
198-
dst->mutable_data<T>(src.dims(), platform::CPUPlace());
199-
framework::Copy(src, platform::CPUPlace(), ctx, dst);
200-
};
201-
202-
copyLoDTensor(ctx, emission_weights_src, emission_weights_dst);
203-
copyLoDTensor(ctx, label_src, label_dst);
204-
205-
transition_weights_dst->mutable_data<T>(transition_weights_src.dims(),
206-
platform::CPUPlace());
207-
framework::Copy(transition_weights_src, platform::CPUPlace(), ctx,
208-
transition_weights_dst);
209-
}
210-
211-
void CopyOutputsToGpuMemory(const platform::DeviceContext& ctx,
212-
const Tensor& emission_exps_src,
213-
const Tensor& transition_exps_src,
214-
const Tensor& alpha_src, const Tensor& ll_src,
215-
Tensor* emission_exps_dst,
216-
Tensor* transition_exps_dst, Tensor* alpha_dst,
217-
Tensor* ll_dst) const {
218-
// Copy the forward results from CPU memory to GPU memory if this
219-
// operators runs on GPU device.
220-
auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src,
221-
Tensor* dst) {
222-
dst->mutable_data<T>(platform::CUDAPlace());
223-
framework::Copy(src, platform::CUDAPlace(), ctx, dst);
224-
};
225-
copyTensor(ctx, emission_exps_src, emission_exps_dst);
226-
copyTensor(ctx, transition_exps_src, transition_exps_dst);
227-
copyTensor(ctx, alpha_src, alpha_dst);
228-
copyTensor(ctx, ll_src, ll_dst);
229-
}
230-
231136
T ForwardOneSequence(const Tensor& emission, const Tensor& emission_row_max,
232137
const Tensor& emission_exps, const Tensor& trans_weights,
233138
const Tensor& trans_weight_exps, const Tensor& label,
@@ -296,63 +201,17 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
296201
auto lod = ctx.Input<LoDTensor>("Label")->lod();
297202
PADDLE_ENFORCE(lod.size(), "Input(Label) must be a sequence.");
298203

299-
// These local variables hold the inputs and outputs, garanteeing them on
300-
// CPU memory, to provide a consistent reference.
301-
// TODO(caoying) Fix this by moving all these local variables into the
302-
// class's data members once we can profile the training process, or
303-
// implementing a real GPU kernel for CRF.
304-
Tensor* label = nullptr;
305-
Tensor label_tensor;
306-
Tensor* emission_exps = nullptr;
307-
Tensor emission_exps_tensor;
308-
Tensor* transition_exps = nullptr;
309-
Tensor transition_exps_tensor;
310-
Tensor* alpha = nullptr;
311-
Tensor alpha_tensor;
312-
Tensor ll_grad_tensor;
313-
T* ll_grad = nullptr;
314-
315-
Tensor* emission_grad = nullptr;
316-
Tensor emission_grad_tensor;
317-
Tensor* transition_grad = nullptr;
318-
Tensor transition_grad_tensor;
319-
320-
if (platform::is_gpu_place(ctx.GetPlace())) {
321-
label = &label_tensor;
322-
emission_exps = &emission_exps_tensor;
323-
transition_exps = &transition_exps_tensor;
324-
alpha = &alpha_tensor;
325-
CopyInputsToCpuMemory(
326-
ctx.device_context(), *ctx.Input<LoDTensor>("Label"),
327-
*ctx.Input<Tensor>("EmissionExps"),
328-
*ctx.Input<Tensor>("TransitionExps"), *ctx.Input<Tensor>("Alpha"),
329-
*ctx.Input<Tensor>(framework::GradVarName("LogLikelihood")), label,
330-
emission_exps, transition_exps, alpha, &ll_grad_tensor);
331-
ll_grad = ll_grad_tensor.data<T>();
332-
333-
if (ctx.Output<Tensor>(framework::GradVarName("Emission"))) {
334-
emission_grad = &emission_grad_tensor;
335-
emission_grad->Resize(emission_exps->dims());
336-
}
204+
const Tensor* label = ctx.Input<LoDTensor>("Label");
205+
const Tensor* emission_exps = ctx.Input<Tensor>("EmissionExps");
206+
const Tensor* transition_exps = ctx.Input<Tensor>("TransitionExps");
207+
const Tensor* alpha = ctx.Input<Tensor>("Alpha");
208+
const T* ll_grad =
209+
ctx.Input<Tensor>(framework::GradVarName("LogLikelihood"))->data<T>();
337210

338-
if (ctx.Output<Tensor>(framework::GradVarName("Transition"))) {
339-
transition_grad = &transition_grad_tensor;
340-
transition_grad->Resize(transition_exps->dims());
341-
}
342-
} else {
343-
label = const_cast<LoDTensor*>(ctx.Input<LoDTensor>("Label"));
344-
emission_exps = const_cast<Tensor*>(ctx.Input<Tensor>("EmissionExps"));
345-
transition_exps =
346-
const_cast<Tensor*>(ctx.Input<Tensor>("TransitionExps"));
347-
alpha = const_cast<Tensor*>(ctx.Input<Tensor>("Alpha"));
348-
ll_grad = const_cast<Tensor*>(
349-
ctx.Input<Tensor>(framework::GradVarName("LogLikelihood")))
350-
->data<T>();
351-
352-
emission_grad = ctx.Output<Tensor>(framework::GradVarName("Emission"));
353-
transition_grad =
354-
ctx.Output<Tensor>(framework::GradVarName("Transition"));
355-
}
211+
Tensor* emission_grad =
212+
ctx.Output<Tensor>(framework::GradVarName("Emission"));
213+
Tensor* transition_grad =
214+
ctx.Output<Tensor>(framework::GradVarName("Transition"));
356215

357216
// TODO(caoying) Fix this constraint. When the Input(Emission) is from the
358217
// data reader operator, it can have no gradients.
@@ -389,58 +248,9 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
389248
one_seq_emission_exps, *transition_exps, one_seq_alpha, one_seq_label,
390249
&one_seq_beta, transition_grad, &one_seq_emission_grad);
391250
}
392-
393-
if (platform::is_gpu_place(ctx.GetPlace())) {
394-
CopyOutputsToGpuMemory(
395-
ctx.device_context(), emission_grad, transition_grad,
396-
ctx.Output<Tensor>(framework::GradVarName("Emission")),
397-
ctx.Output<Tensor>(framework::GradVarName("Transition")));
398-
}
399251
};
400252

401253
private:
402-
void CopyInputsToCpuMemory(const platform::DeviceContext& ctx,
403-
const LoDTensor& label_src,
404-
const Tensor& emission_exps_src,
405-
const Tensor& transition_exps_src,
406-
const Tensor& alpha_src, const Tensor& ll_grad_src,
407-
Tensor* label_dst, Tensor* emission_exps_dst,
408-
Tensor* transition_exps_dst, Tensor* alpha_dst,
409-
Tensor* ll_grad_dst) const {
410-
// Copy the inputs from GPU memory to CPU memory when this operators runs on
411-
// GPU device.
412-
label_dst->mutable_data<T>(label_src.dims(), platform::CPUPlace());
413-
framework::Copy(label_src, platform::CPUPlace(), ctx, label_dst);
414-
415-
auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src,
416-
Tensor* dst) {
417-
dst->mutable_data<T>(src.dims(), platform::CPUPlace());
418-
framework::Copy(src, platform::CPUPlace(), ctx, dst);
419-
};
420-
copyTensor(ctx, emission_exps_src, emission_exps_dst);
421-
copyTensor(ctx, transition_exps_src, transition_exps_dst);
422-
copyTensor(ctx, alpha_src, alpha_dst);
423-
copyTensor(ctx, ll_grad_src, ll_grad_dst);
424-
}
425-
426-
void CopyOutputsToGpuMemory(const platform::DeviceContext& ctx,
427-
const Tensor* emission_grad_src,
428-
const Tensor* transition_grad_src,
429-
Tensor* emission_grad_dst,
430-
Tensor* transition_grad_dst) const {
431-
// Copy the backward results from CPU memory to GPU
432-
// memory if this operators runs on GPU device.
433-
auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor* src,
434-
Tensor* dst) {
435-
if (src && dst) {
436-
dst->mutable_data<T>(platform::CUDAPlace());
437-
framework::Copy(*src, platform::CUDAPlace(), ctx, dst);
438-
}
439-
};
440-
copyTensor(ctx, emission_grad_src, emission_grad_dst);
441-
copyTensor(ctx, transition_grad_src, transition_grad_dst);
442-
}
443-
444254
void BackwardOneSequence(const platform::CPUDeviceContext& ctx,
445255
const T ll_grad, const Tensor& emission_exps,
446256
const Tensor& transition_exps, const Tensor& alpha,

0 commit comments

Comments (0)