@@ -158,7 +158,7 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
 
 using Tensor = paddle::framework::Tensor;
 
-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+template <jit::KernelType KT, typename T, typename PlaceType>
 void BenchXYZNKernel() {
   for (int d : TestSizes()) {
     Tensor x, y, z;
@@ -175,7 +175,7 @@ void BenchXYZNKernel() {
   }
 }
 
-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+template <jit::KernelType KT, typename T, typename PlaceType>
 void BenchAXYNKernel() {
   for (int d : TestSizes()) {
     const T a = static_cast<T>(3);
@@ -190,7 +190,17 @@ void BenchAXYNKernel() {
   }
 }
 
-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+template <jit::KernelType KT, typename T, typename PlaceType>
+void BenchXRNKernel() {
+  for (int d : TestSizes()) {
+    Tensor x;
+    RandomVec<T>(d, x.mutable_data<T>({d}, PlaceType()));
+    T res;
+    BenchAllImpls<KT, jit::XRNTuples<T>, PlaceType>(d, x.data<T>(), &res, d);
+  }
+}
+
+template <jit::KernelType KT, typename T, typename PlaceType>
 void BenchXYNKernel() {
   for (int d : TestSizes()) {
     Tensor x, y;
@@ -203,7 +213,7 @@ void BenchXYNKernel() {
   }
 }
 
-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+template <jit::KernelType KT, typename T, typename PlaceType>
 void BenchLSTMKernel() {
   for (bool use_peephole : {true, false}) {
     for (int d : TestSizes()) {
@@ -240,7 +250,7 @@ void BenchLSTMKernel() {
   }
 }
 
-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+template <jit::KernelType KT, typename T, typename PlaceType>
 void BenchGRUKernel() {
   for (int d : TestSizes()) {
     const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh);
@@ -262,7 +272,7 @@ void BenchGRUKernel() {
   }
 }
 
-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+template <jit::KernelType KT, typename T, typename PlaceType>
 void BenchSeqPoolKernel() {
   std::vector<jit::SeqPoolType> pool_types = {
       jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt};
@@ -284,7 +294,7 @@ void BenchSeqPoolKernel() {
   }
 }
 
-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+template <jit::KernelType KT, typename T, typename PlaceType>
 void BenchMatMulKernel() {
   for (int m : {1, 2, 3, 4}) {
     for (int n : TestSizes()) {
@@ -305,57 +315,64 @@ void BenchMatMulKernel() {
   }
 }
 
+template <jit::KernelType KT, typename T, typename PlaceType>
+void BenchSoftmaxKernel() {
+  for (int bs : {1, 2, 10}) {
+    for (int n : TestSizes()) {
+      Tensor x, y;
+      x.Resize({bs, n});
+      y.Resize({bs, n});
+      RandomVec<T>(bs * n, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
+      const T* x_data = x.data<T>();
+      T* y_data = y.mutable_data<T>(PlaceType());
+      BenchAllImpls<KT, jit::SoftmaxTuples<T>, PlaceType>(n, x_data, y_data, n,
+                                                          bs);
+    }
+  }
+}
+
 using T = float;
-using PlaceType = paddle::platform::CPUPlace;
+using CPUPlace = paddle::platform::CPUPlace;
 
 // xyzn
-BENCH_FP32_CPU(kVMul) { BenchXYZNKernel<jit::kVMul, T, PlaceType>(); }
-
-BENCH_FP32_CPU(kVAdd) { BenchXYZNKernel<jit::kVAdd, T, PlaceType>(); }
-
-BENCH_FP32_CPU(kVAddRelu) { BenchXYZNKernel<jit::kVAddRelu, T, PlaceType>(); }
-
-BENCH_FP32_CPU(kVSub) { BenchXYZNKernel<jit::kVSub, T, PlaceType>(); }
+BENCH_FP32_CPU(kVMul) { BenchXYZNKernel<jit::kVMul, T, CPUPlace>(); }
+BENCH_FP32_CPU(kVAdd) { BenchXYZNKernel<jit::kVAdd, T, CPUPlace>(); }
+BENCH_FP32_CPU(kVAddRelu) { BenchXYZNKernel<jit::kVAddRelu, T, CPUPlace>(); }
+BENCH_FP32_CPU(kVSub) { BenchXYZNKernel<jit::kVSub, T, CPUPlace>(); }
 
 // axyn
-BENCH_FP32_CPU(kVScal) { BenchAXYNKernel<jit::kVScal, T, PlaceType>(); }
+BENCH_FP32_CPU(kVScal) { BenchAXYNKernel<jit::kVScal, T, CPUPlace>(); }
+BENCH_FP32_CPU(kVAddBias) { BenchAXYNKernel<jit::kVAddBias, T, CPUPlace>(); }
 
-BENCH_FP32_CPU(kVAddBias) { BenchAXYNKernel<jit::kVAddBias, T, PlaceType>(); }
+// xrn
+BENCH_FP32_CPU(kHSum) { BenchXRNKernel<jit::kHSum, T, CPUPlace>(); }
+BENCH_FP32_CPU(kHMax) { BenchXRNKernel<jit::kHMax, T, CPUPlace>(); }
 
 // xyn
-BENCH_FP32_CPU(kVRelu) { BenchXYNKernel<jit::kVRelu, T, PlaceType>(); }
-
-BENCH_FP32_CPU(kVIdentity) { BenchXYNKernel<jit::kVIdentity, T, PlaceType>(); }
-
-BENCH_FP32_CPU(kVSquare) { BenchXYNKernel<jit::kVSquare, T, PlaceType>(); }
-
-BENCH_FP32_CPU(kVExp) { BenchXYNKernel<jit::kVExp, T, PlaceType>(); }
-
-BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel<jit::kVSigmoid, T, PlaceType>(); }
-
-BENCH_FP32_CPU(kVTanh) { BenchXYNKernel<jit::kVTanh, T, PlaceType>(); }
+BENCH_FP32_CPU(kVRelu) { BenchXYNKernel<jit::kVRelu, T, CPUPlace>(); }
+BENCH_FP32_CPU(kVIdentity) { BenchXYNKernel<jit::kVIdentity, T, CPUPlace>(); }
+BENCH_FP32_CPU(kVSquare) { BenchXYNKernel<jit::kVSquare, T, CPUPlace>(); }
+BENCH_FP32_CPU(kVExp) { BenchXYNKernel<jit::kVExp, T, CPUPlace>(); }
+BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel<jit::kVSigmoid, T, CPUPlace>(); }
+BENCH_FP32_CPU(kVTanh) { BenchXYNKernel<jit::kVTanh, T, CPUPlace>(); }
 
 // lstm and peephole
-BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel<jit::kLSTMCtHt, T, PlaceType>(); }
-
-BENCH_FP32_CPU(kLSTMC1H1) { BenchLSTMKernel<jit::kLSTMC1H1, T, PlaceType>(); }
+BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel<jit::kLSTMCtHt, T, CPUPlace>(); }
+BENCH_FP32_CPU(kLSTMC1H1) { BenchLSTMKernel<jit::kLSTMC1H1, T, CPUPlace>(); }
 
 // gru functions
-BENCH_FP32_CPU(kGRUH1) { BenchGRUKernel<jit::kGRUH1, T, PlaceType>(); }
-
-BENCH_FP32_CPU(kGRUHtPart1) {
-  BenchGRUKernel<jit::kGRUHtPart1, T, PlaceType>();
-}
-
-BENCH_FP32_CPU(kGRUHtPart2) {
-  BenchGRUKernel<jit::kGRUHtPart2, T, PlaceType>();
-}
+BENCH_FP32_CPU(kGRUH1) { BenchGRUKernel<jit::kGRUH1, T, CPUPlace>(); }
+BENCH_FP32_CPU(kGRUHtPart1) { BenchGRUKernel<jit::kGRUHtPart1, T, CPUPlace>(); }
+BENCH_FP32_CPU(kGRUHtPart2) { BenchGRUKernel<jit::kGRUHtPart2, T, CPUPlace>(); }
 
 // seq pool function
-BENCH_FP32_CPU(kSeqPool) { BenchSeqPoolKernel<jit::kSeqPool, T, PlaceType>(); }
+BENCH_FP32_CPU(kSeqPool) { BenchSeqPoolKernel<jit::kSeqPool, T, CPUPlace>(); }
 
 // matmul
-BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel<jit::kMatMul, T, PlaceType>(); }
+BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel<jit::kMatMul, T, CPUPlace>(); }
+
+// softmax
+BENCH_FP32_CPU(kSoftmax) { BenchSoftmaxKernel<jit::kSoftmax, T, CPUPlace>(); }
 
 // Benchmark all jit kernels including jitcode, mkl and refer.
 // To use this tool, run command: ./benchmark [options...]
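
Below the diff, a brief hedged sketch of how the softmax kernel exercised by the new BenchSoftmaxKernel might be obtained and called directly, outside the benchmark harness. The jit::Get helper, the include path, and the exact function-pointer signature are inferred from the BenchAllImpls calling pattern above; they are assumptions, not something this diff confirms.

// Hedged sketch (not part of this diff): fetch and run the softmax kernel that
// BenchSoftmaxKernel benchmarks. jit::Get, the include path, and the
// (const T*, T*, int n, int bs) signature are assumptions inferred from
// BenchAllImpls<KT, jit::SoftmaxTuples<T>, PlaceType>(n, x_data, y_data, n, bs).
#include "paddle/fluid/operators/jit/kernels.h"

namespace jit = paddle::operators::jit;

void SoftmaxOnce(const float* x, float* y, int n, int bs) {
  // Picks the best available implementation (jitcode, mkl, or refer) for width n.
  auto softmax = jit::Get<jit::kSoftmax, jit::SoftmaxTuples<float>,
                          paddle::platform::CPUPlace>(n);
  softmax(x, y, n, bs);  // same argument order as the benchmark call above
}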