@@ -65,57 +65,14 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
     const size_t level = 0;
     const size_t seq_num = in_lod[level].size() - 1;
 
-    // These local variables hold the inputs and outputs, garanteeing them on
-    // CPU memory, to provide a consistent reference.
-    // TODO(caoying) Fix this by moving all these local variables into the
-    // class's data members once we can profile the whole training process.
-    LoDTensor* emission_weights = nullptr;
-    LoDTensor emission_weight_tensor;
-    Tensor* transition_weights = nullptr;
-    Tensor transition_weight_tensor;
-    LoDTensor* label = nullptr;
-    LoDTensor label_tensor;
-
-    Tensor* emission_exps = nullptr;
-    Tensor emission_exps_tensor;
-    Tensor* transition_exps = nullptr;
-    Tensor transition_exps_tensor;
-    Tensor* alpha = nullptr;
-    Tensor alpha_tensor;
-    Tensor* ll = nullptr;
-    Tensor ll_tensor;
-
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-      emission_weights = &emission_weight_tensor;
-      transition_weights = &transition_weight_tensor;
-      label = &label_tensor;
-
-      CopyInputsToCpuMemory(
-          ctx.device_context(), *ctx.Input<LoDTensor>("Emission"),
-          *ctx.Input<Tensor>("Transition"), *ctx.Input<LoDTensor>("Label"),
-          emission_weights, transition_weights, label);
-
-      emission_exps = &emission_exps_tensor;
-      emission_exps->Resize(emission_weights->dims());
-
-      transition_exps = &transition_exps_tensor;
-      transition_exps->Resize(transition_weights->dims());
-
-      alpha = &alpha_tensor;
-      alpha->Resize(ctx.Output<Tensor>("Alpha")->dims());
-
-      ll = &ll_tensor;
-    } else {
-      emission_weights =
-          const_cast<LoDTensor*>(ctx.Input<LoDTensor>("Emission"));
-      transition_weights = const_cast<Tensor*>(ctx.Input<Tensor>("Transition"));
-      label = const_cast<LoDTensor*>(ctx.Input<LoDTensor>("Label"));
-
-      emission_exps = ctx.Output<Tensor>("EmissionExps");
-      transition_exps = ctx.Output<Tensor>("TransitionExps");
-      alpha = ctx.Output<Tensor>("Alpha");
-      ll = ctx.Output<Tensor>("LogLikelihood");
-    }
+    const LoDTensor* emission_weights = ctx.Input<LoDTensor>("Emission");
+    const Tensor* transition_weights = ctx.Input<Tensor>("Transition");
+    const LoDTensor* label = ctx.Input<LoDTensor>("Label");
+
+    Tensor* emission_exps = ctx.Output<Tensor>("EmissionExps");
+    Tensor* transition_exps = ctx.Output<Tensor>("TransitionExps");
+    Tensor* alpha = ctx.Output<Tensor>("Alpha");
+    Tensor* ll = ctx.Output<Tensor>("LogLikelihood");
 
     // Because the computation codes only runs on CPU, here the memory for all
     // the outputs is FIXED to be allocated on the CPU memory.
@@ -173,61 +130,9 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
          one_seq, one_seq_row_max, one_seq_exps, *transition_weights,
          *transition_exps, one_seq_label, &one_seq_alpha);
     }
-
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-      CopyOutputsToGpuMemory(
-          ctx.device_context(), *emission_exps, *transition_exps, *alpha, *ll,
-          ctx.Output<Tensor>("EmissionExps"),
-          ctx.Output<Tensor>("TransitionExps"), ctx.Output<Tensor>("Alpha"),
-          ctx.Output<Tensor>("LogLikelihood"));
-    }
   };
 
  private:
-  void CopyInputsToCpuMemory(const platform::DeviceContext& ctx,
-                             const LoDTensor& emission_weights_src,
-                             const Tensor& transition_weights_src,
-                             const LoDTensor& label_src,
-                             LoDTensor* emission_weights_dst,
-                             Tensor* transition_weights_dst,
-                             LoDTensor* label_dst) const {
-    // Copy the inputs from GPU memory to CPU memory if this operators runs on
-    // GPU device.
-    auto copyLoDTensor = [](const platform::DeviceContext& ctx,
-                            const LoDTensor& src, LoDTensor* dst) {
-      dst->mutable_data<T>(src.dims(), platform::CPUPlace());
-      framework::Copy(src, platform::CPUPlace(), ctx, dst);
-    };
-
-    copyLoDTensor(ctx, emission_weights_src, emission_weights_dst);
-    copyLoDTensor(ctx, label_src, label_dst);
-
-    transition_weights_dst->mutable_data<T>(transition_weights_src.dims(),
-                                            platform::CPUPlace());
-    framework::Copy(transition_weights_src, platform::CPUPlace(), ctx,
-                    transition_weights_dst);
-  }
-
-  void CopyOutputsToGpuMemory(const platform::DeviceContext& ctx,
-                              const Tensor& emission_exps_src,
-                              const Tensor& transition_exps_src,
-                              const Tensor& alpha_src, const Tensor& ll_src,
-                              Tensor* emission_exps_dst,
-                              Tensor* transition_exps_dst, Tensor* alpha_dst,
-                              Tensor* ll_dst) const {
-    // Copy the forward results from CPU memory to GPU memory if this
-    // operators runs on GPU device.
-    auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src,
-                         Tensor* dst) {
-      dst->mutable_data<T>(platform::CUDAPlace());
-      framework::Copy(src, platform::CUDAPlace(), ctx, dst);
-    };
-    copyTensor(ctx, emission_exps_src, emission_exps_dst);
-    copyTensor(ctx, transition_exps_src, transition_exps_dst);
-    copyTensor(ctx, alpha_src, alpha_dst);
-    copyTensor(ctx, ll_src, ll_dst);
-  }
-
   T ForwardOneSequence(const Tensor& emission, const Tensor& emission_row_max,
                        const Tensor& emission_exps, const Tensor& trans_weights,
                        const Tensor& trans_weight_exps, const Tensor& label,
@@ -296,63 +201,17 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
     auto lod = ctx.Input<LoDTensor>("Label")->lod();
     PADDLE_ENFORCE(lod.size(), "Input(Label) must be a sequence.");
 
-    // These local variables hold the inputs and outputs, garanteeing them on
-    // CPU memory, to provide a consistent reference.
-    // TODO(caoying) Fix this by moving all these local variables into the
-    // class's data members once we can profile the training process, or
-    // implementing a real GPU kernel for CRF.
-    Tensor* label = nullptr;
-    Tensor label_tensor;
-    Tensor* emission_exps = nullptr;
-    Tensor emission_exps_tensor;
-    Tensor* transition_exps = nullptr;
-    Tensor transition_exps_tensor;
-    Tensor* alpha = nullptr;
-    Tensor alpha_tensor;
-    Tensor ll_grad_tensor;
-    T* ll_grad = nullptr;
-
-    Tensor* emission_grad = nullptr;
-    Tensor emission_grad_tensor;
-    Tensor* transition_grad = nullptr;
-    Tensor transition_grad_tensor;
-
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-      label = &label_tensor;
-      emission_exps = &emission_exps_tensor;
-      transition_exps = &transition_exps_tensor;
-      alpha = &alpha_tensor;
-      CopyInputsToCpuMemory(
-          ctx.device_context(), *ctx.Input<LoDTensor>("Label"),
-          *ctx.Input<Tensor>("EmissionExps"),
-          *ctx.Input<Tensor>("TransitionExps"), *ctx.Input<Tensor>("Alpha"),
-          *ctx.Input<Tensor>(framework::GradVarName("LogLikelihood")), label,
-          emission_exps, transition_exps, alpha, &ll_grad_tensor);
-      ll_grad = ll_grad_tensor.data<T>();
-
-      if (ctx.Output<Tensor>(framework::GradVarName("Emission"))) {
-        emission_grad = &emission_grad_tensor;
-        emission_grad->Resize(emission_exps->dims());
-      }
+    const Tensor* label = ctx.Input<LoDTensor>("Label");
+    const Tensor* emission_exps = ctx.Input<Tensor>("EmissionExps");
+    const Tensor* transition_exps = ctx.Input<Tensor>("TransitionExps");
+    const Tensor* alpha = ctx.Input<Tensor>("Alpha");
+    const T* ll_grad =
+        ctx.Input<Tensor>(framework::GradVarName("LogLikelihood"))->data<T>();
 
-      if (ctx.Output<Tensor>(framework::GradVarName("Transition"))) {
-        transition_grad = &transition_grad_tensor;
-        transition_grad->Resize(transition_exps->dims());
-      }
-    } else {
-      label = const_cast<LoDTensor*>(ctx.Input<LoDTensor>("Label"));
-      emission_exps = const_cast<Tensor*>(ctx.Input<Tensor>("EmissionExps"));
-      transition_exps =
-          const_cast<Tensor*>(ctx.Input<Tensor>("TransitionExps"));
-      alpha = const_cast<Tensor*>(ctx.Input<Tensor>("Alpha"));
-      ll_grad = const_cast<Tensor*>(
-                    ctx.Input<Tensor>(framework::GradVarName("LogLikelihood")))
-                    ->data<T>();
-
-      emission_grad = ctx.Output<Tensor>(framework::GradVarName("Emission"));
-      transition_grad =
-          ctx.Output<Tensor>(framework::GradVarName("Transition"));
-    }
+    Tensor* emission_grad =
+        ctx.Output<Tensor>(framework::GradVarName("Emission"));
+    Tensor* transition_grad =
+        ctx.Output<Tensor>(framework::GradVarName("Transition"));
 
     // TODO(caoying) Fix this constraint. When the Input(Emission) is from the
     // data reader operator, it can have no gradients.
@@ -389,58 +248,9 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
          one_seq_emission_exps, *transition_exps, one_seq_alpha, one_seq_label,
          &one_seq_beta, transition_grad, &one_seq_emission_grad);
     }
-
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-      CopyOutputsToGpuMemory(
-          ctx.device_context(), emission_grad, transition_grad,
-          ctx.Output<Tensor>(framework::GradVarName("Emission")),
-          ctx.Output<Tensor>(framework::GradVarName("Transition")));
-    }
   };
 
  private:
-  void CopyInputsToCpuMemory(const platform::DeviceContext& ctx,
-                             const LoDTensor& label_src,
-                             const Tensor& emission_exps_src,
-                             const Tensor& transition_exps_src,
-                             const Tensor& alpha_src, const Tensor& ll_grad_src,
-                             Tensor* label_dst, Tensor* emission_exps_dst,
-                             Tensor* transition_exps_dst, Tensor* alpha_dst,
-                             Tensor* ll_grad_dst) const {
-    // Copy the inputs from GPU memory to CPU memory when this operators runs on
-    // GPU device.
-    label_dst->mutable_data<T>(label_src.dims(), platform::CPUPlace());
-    framework::Copy(label_src, platform::CPUPlace(), ctx, label_dst);
-
-    auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src,
-                         Tensor* dst) {
-      dst->mutable_data<T>(src.dims(), platform::CPUPlace());
-      framework::Copy(src, platform::CPUPlace(), ctx, dst);
-    };
-    copyTensor(ctx, emission_exps_src, emission_exps_dst);
-    copyTensor(ctx, transition_exps_src, transition_exps_dst);
-    copyTensor(ctx, alpha_src, alpha_dst);
-    copyTensor(ctx, ll_grad_src, ll_grad_dst);
-  }
-
-  void CopyOutputsToGpuMemory(const platform::DeviceContext& ctx,
-                              const Tensor* emission_grad_src,
-                              const Tensor* transition_grad_src,
-                              Tensor* emission_grad_dst,
-                              Tensor* transition_grad_dst) const {
-    // Copy the backward results from CPU memory to GPU
-    // memory if this operators runs on GPU device.
-    auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor* src,
-                         Tensor* dst) {
-      if (src && dst) {
-        dst->mutable_data<T>(platform::CUDAPlace());
-        framework::Copy(*src, platform::CUDAPlace(), ctx, dst);
-      }
-    };
-    copyTensor(ctx, emission_grad_src, emission_grad_dst);
-    copyTensor(ctx, transition_grad_src, transition_grad_dst);
-  }
-
   void BackwardOneSequence(const platform::CPUDeviceContext& ctx,
                            const T ll_grad, const Tensor& emission_exps,
                            const Tensor& transition_exps, const Tensor& alpha,