@@ -158,7 +158,7 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
158
158
159
159
using Tensor = paddle::framework::Tensor;
160
160
161
- template <paddle::operators:: jit::KernelType KT, typename T, typename PlaceType>
161
+ template <jit::KernelType KT, typename T, typename PlaceType>
162
162
void BenchXYZNKernel () {
163
163
for (int d : TestSizes ()) {
164
164
Tensor x, y, z;
@@ -175,7 +175,7 @@ void BenchXYZNKernel() {
175
175
}
176
176
}
177
177
178
- template <paddle::operators:: jit::KernelType KT, typename T, typename PlaceType>
178
+ template <jit::KernelType KT, typename T, typename PlaceType>
179
179
void BenchAXYNKernel () {
180
180
for (int d : TestSizes ()) {
181
181
const T a = static_cast <T>(3 );
@@ -187,10 +187,23 @@ void BenchAXYNKernel() {
187
187
RandomVec<T>(d, x_data);
188
188
BenchAllImpls<KT, jit::AXYNTuples<T>, PlaceType>(d, &a, x.data <T>(), y_data,
189
189
d);
190
+ // test inplace
191
+ BenchAllImpls<KT, jit::AXYNTuples<T>, PlaceType>(d, &a, x.data <T>(), x_data,
192
+ d);
193
+ }
194
+ }
195
+
196
+ template <jit::KernelType KT, typename T, typename PlaceType>
197
+ void BenchXRNKernel () {
198
+ for (int d : TestSizes ()) {
199
+ Tensor x;
200
+ RandomVec<T>(d, x.mutable_data <T>({d}, PlaceType ()));
201
+ T res;
202
+ BenchAllImpls<KT, jit::XRNTuples<T>, PlaceType>(d, x.data <T>(), &res, d);
190
203
}
191
204
}
192
205
193
- template <paddle::operators:: jit::KernelType KT, typename T, typename PlaceType>
206
+ template <jit::KernelType KT, typename T, typename PlaceType>
194
207
void BenchXYNKernel () {
195
208
for (int d : TestSizes ()) {
196
209
Tensor x, y;
@@ -203,7 +216,7 @@ void BenchXYNKernel() {
203
216
}
204
217
}
205
218
206
- template <paddle::operators:: jit::KernelType KT, typename T, typename PlaceType>
219
+ template <jit::KernelType KT, typename T, typename PlaceType>
207
220
void BenchLSTMKernel () {
208
221
for (bool use_peephole : {true , false }) {
209
222
for (int d : TestSizes ()) {
@@ -240,7 +253,7 @@ void BenchLSTMKernel() {
240
253
}
241
254
}
242
255
243
- template <paddle::operators:: jit::KernelType KT, typename T, typename PlaceType>
256
+ template <jit::KernelType KT, typename T, typename PlaceType>
244
257
void BenchGRUKernel () {
245
258
for (int d : TestSizes ()) {
246
259
const jit::gru_attr_t attr (d, jit::kVSigmoid , jit::kVTanh );
@@ -262,7 +275,7 @@ void BenchGRUKernel() {
262
275
}
263
276
}
264
277
265
- template <paddle::operators:: jit::KernelType KT, typename T, typename PlaceType>
278
+ template <jit::KernelType KT, typename T, typename PlaceType>
266
279
void BenchSeqPoolKernel () {
267
280
std::vector<jit::SeqPoolType> pool_types = {
268
281
jit::SeqPoolType::kSum , jit::SeqPoolType::kAvg , jit::SeqPoolType::kSqrt };
@@ -284,7 +297,7 @@ void BenchSeqPoolKernel() {
284
297
}
285
298
}
286
299
287
- template <paddle::operators:: jit::KernelType KT, typename T, typename PlaceType>
300
+ template <jit::KernelType KT, typename T, typename PlaceType>
288
301
void BenchMatMulKernel () {
289
302
for (int m : {1 , 2 , 3 , 4 }) {
290
303
for (int n : TestSizes ()) {
@@ -305,57 +318,64 @@ void BenchMatMulKernel() {
305
318
}
306
319
}
307
320
321
+ template <jit::KernelType KT, typename T, typename PlaceType>
322
+ void BenchSoftmaxKernel () {
323
+ for (int bs : {1 , 2 , 10 }) {
324
+ for (int n : TestSizes ()) {
325
+ Tensor x, y;
326
+ x.Resize ({bs, n});
327
+ y.Resize ({bs, n});
328
+ RandomVec<T>(bs * n, x.mutable_data <T>(PlaceType ()), -2 .f , 2 .f );
329
+ const T* x_data = x.data <T>();
330
+ T* y_data = y.mutable_data <T>(PlaceType ());
331
+ BenchAllImpls<KT, jit::SoftmaxTuples<T>, PlaceType>(n, x_data, y_data, n,
332
+ bs);
333
+ }
334
+ }
335
+ }
336
+
308
337
using T = float ;
309
- using PlaceType = paddle::platform::CPUPlace;
338
+ using CPUPlace = paddle::platform::CPUPlace;
310
339
311
340
// xyzn
312
- BENCH_FP32_CPU (kVMul ) { BenchXYZNKernel<jit::kVMul , T, PlaceType>(); }
313
-
314
- BENCH_FP32_CPU (kVAdd ) { BenchXYZNKernel<jit::kVAdd , T, PlaceType>(); }
315
-
316
- BENCH_FP32_CPU (kVAddRelu ) { BenchXYZNKernel<jit::kVAddRelu , T, PlaceType>(); }
317
-
318
- BENCH_FP32_CPU (kVSub ) { BenchXYZNKernel<jit::kVSub , T, PlaceType>(); }
341
+ BENCH_FP32_CPU (kVMul ) { BenchXYZNKernel<jit::kVMul , T, CPUPlace>(); }
342
+ BENCH_FP32_CPU (kVAdd ) { BenchXYZNKernel<jit::kVAdd , T, CPUPlace>(); }
343
+ BENCH_FP32_CPU (kVAddRelu ) { BenchXYZNKernel<jit::kVAddRelu , T, CPUPlace>(); }
344
+ BENCH_FP32_CPU (kVSub ) { BenchXYZNKernel<jit::kVSub , T, CPUPlace>(); }
319
345
320
346
// axyn
321
- BENCH_FP32_CPU (kVScal ) { BenchAXYNKernel<jit::kVScal , T, PlaceType>(); }
347
+ BENCH_FP32_CPU (kVScal ) { BenchAXYNKernel<jit::kVScal , T, CPUPlace>(); }
348
+ BENCH_FP32_CPU (kVAddBias ) { BenchAXYNKernel<jit::kVAddBias , T, CPUPlace>(); }
322
349
323
- BENCH_FP32_CPU (kVAddBias ) { BenchAXYNKernel<jit::kVAddBias , T, PlaceType>(); }
350
+ // xrn
351
+ BENCH_FP32_CPU (kHSum ) { BenchXRNKernel<jit::kHSum , T, CPUPlace>(); }
352
+ BENCH_FP32_CPU (kHMax ) { BenchXRNKernel<jit::kHMax , T, CPUPlace>(); }
324
353
325
354
// xyn
326
- BENCH_FP32_CPU (kVRelu ) { BenchXYNKernel<jit::kVRelu , T, PlaceType>(); }
327
-
328
- BENCH_FP32_CPU (kVIdentity ) { BenchXYNKernel<jit::kVIdentity , T, PlaceType>(); }
329
-
330
- BENCH_FP32_CPU (kVSquare ) { BenchXYNKernel<jit::kVSquare , T, PlaceType>(); }
331
-
332
- BENCH_FP32_CPU (kVExp ) { BenchXYNKernel<jit::kVExp , T, PlaceType>(); }
333
-
334
- BENCH_FP32_CPU (kVSigmoid ) { BenchXYNKernel<jit::kVSigmoid , T, PlaceType>(); }
335
-
336
- BENCH_FP32_CPU (kVTanh ) { BenchXYNKernel<jit::kVTanh , T, PlaceType>(); }
355
+ BENCH_FP32_CPU (kVRelu ) { BenchXYNKernel<jit::kVRelu , T, CPUPlace>(); }
356
+ BENCH_FP32_CPU (kVIdentity ) { BenchXYNKernel<jit::kVIdentity , T, CPUPlace>(); }
357
+ BENCH_FP32_CPU (kVSquare ) { BenchXYNKernel<jit::kVSquare , T, CPUPlace>(); }
358
+ BENCH_FP32_CPU (kVExp ) { BenchXYNKernel<jit::kVExp , T, CPUPlace>(); }
359
+ BENCH_FP32_CPU (kVSigmoid ) { BenchXYNKernel<jit::kVSigmoid , T, CPUPlace>(); }
360
+ BENCH_FP32_CPU (kVTanh ) { BenchXYNKernel<jit::kVTanh , T, CPUPlace>(); }
337
361
338
362
// lstm and peephole
339
- BENCH_FP32_CPU (kLSTMCtHt ) { BenchLSTMKernel<jit::kLSTMCtHt , T, PlaceType>(); }
340
-
341
- BENCH_FP32_CPU (kLSTMC1H1 ) { BenchLSTMKernel<jit::kLSTMC1H1 , T, PlaceType>(); }
363
+ BENCH_FP32_CPU (kLSTMCtHt ) { BenchLSTMKernel<jit::kLSTMCtHt , T, CPUPlace>(); }
364
+ BENCH_FP32_CPU (kLSTMC1H1 ) { BenchLSTMKernel<jit::kLSTMC1H1 , T, CPUPlace>(); }
342
365
343
366
// gru functions
344
- BENCH_FP32_CPU (kGRUH1 ) { BenchGRUKernel<jit::kGRUH1 , T, PlaceType>(); }
345
-
346
- BENCH_FP32_CPU (kGRUHtPart1 ) {
347
- BenchGRUKernel<jit::kGRUHtPart1 , T, PlaceType>();
348
- }
349
-
350
- BENCH_FP32_CPU (kGRUHtPart2 ) {
351
- BenchGRUKernel<jit::kGRUHtPart2 , T, PlaceType>();
352
- }
367
+ BENCH_FP32_CPU (kGRUH1 ) { BenchGRUKernel<jit::kGRUH1 , T, CPUPlace>(); }
368
+ BENCH_FP32_CPU (kGRUHtPart1 ) { BenchGRUKernel<jit::kGRUHtPart1 , T, CPUPlace>(); }
369
+ BENCH_FP32_CPU (kGRUHtPart2 ) { BenchGRUKernel<jit::kGRUHtPart2 , T, CPUPlace>(); }
353
370
354
371
// seq pool function
355
- BENCH_FP32_CPU (kSeqPool ) { BenchSeqPoolKernel<jit::kSeqPool , T, PlaceType >(); }
372
+ BENCH_FP32_CPU (kSeqPool ) { BenchSeqPoolKernel<jit::kSeqPool , T, CPUPlace >(); }
356
373
357
374
// matmul
358
- BENCH_FP32_CPU (kMatMul ) { BenchMatMulKernel<jit::kMatMul , T, PlaceType>(); }
375
+ BENCH_FP32_CPU (kMatMul ) { BenchMatMulKernel<jit::kMatMul , T, CPUPlace>(); }
376
+
377
+ // softmax
378
+ BENCH_FP32_CPU (kSoftmax ) { BenchSoftmaxKernel<jit::kSoftmax , T, CPUPlace>(); }
359
379
360
380
// Benchmark all jit kernels including jitcode, mkl and refer.
361
381
// To use this tool, run command: ./benchmark [options...]
0 commit comments