@@ -7,10 +7,6 @@ __kernel void conv2d_1x1_opt(
7
7
__read_only image2d_t input_image ,
8
8
__read_only image2d_t filter ,
9
9
__read_only image2d_t bias ,
10
- #ifdef BATCH_NORM
11
- __read_only image2d_t new_scale ,
12
- __read_only image2d_t new_biase ,
13
- #endif
14
10
__write_only image2d_t output_image ,
15
11
__private const int stride ,
16
12
__private const int offset ,
@@ -63,12 +59,6 @@ __kernel void conv2d_1x1_opt(
63
59
CL_DTYPE4 output1 = output0 ;
64
60
CL_DTYPE4 output2 = output0 ;
65
61
CL_DTYPE4 output3 = output0 ;
66
- #elif defined(BIASE_ELE )
67
- CL_DTYPE4 output0 = READ_IMG_TYPE (CL_DTYPE_CHAR , bias , SAMPLER , output_pos0 );
68
- CL_DTYPE4 output1 = output0 ;
69
- CL_DTYPE4 output2 = output0 ;
70
- CL_DTYPE4 output3 = output0 ;
71
-
72
62
#else
73
63
CL_DTYPE4 output0 = 0.0f ;
74
64
CL_DTYPE4 output1 = 0.0f ;
@@ -234,24 +224,6 @@ __kernel void conv2d_1x1_opt(
234
224
}
235
225
}
236
226
237
- #ifdef BATCH_NORM
238
- output0 = output0 * READ_IMG_TYPE (
239
- CL_DTYPE_CHAR , new_scale , SAMPLER , (int2 )(out_c , 0 )) +
240
- READ_IMG_TYPE (CL_DTYPE_CHAR , new_biase , SAMPLER , (int2 )(out_c , 0 ));
241
-
242
- output1 = output1 * READ_IMG_TYPE (
243
- CL_DTYPE_CHAR , new_scale , SAMPLER , (int2 )(out_c , 0 )) +
244
- READ_IMG_TYPE (CL_DTYPE_CHAR , new_biase , SAMPLER , (int2 )(out_c , 0 ));
245
-
246
- output2 = output2 * READ_IMG_TYPE (
247
- CL_DTYPE_CHAR , new_scale , SAMPLER , (int2 )(out_c , 0 )) +
248
- READ_IMG_TYPE (CL_DTYPE_CHAR , new_biase , SAMPLER , (int2 )(out_c , 0 ));
249
-
250
- output3 = output3 * READ_IMG_TYPE (
251
- CL_DTYPE_CHAR , new_scale , SAMPLER , (int2 )(out_c , 0 )) +
252
- READ_IMG_TYPE (CL_DTYPE_CHAR , new_biase , SAMPLER , (int2 )(out_c , 0 ));
253
- #endif
254
-
255
227
CL_DTYPE4 alpha0 ,alpha1 ,alpha2 ,alpha3 ;
256
228
#ifdef PRELU_CH //{
257
229
alpha0 = READ_IMG_TYPE (CL_DTYPE_CHAR , prelu_alpha , SAMPLER , (int2 )(out_c , 0 ));
@@ -260,10 +232,18 @@ CL_DTYPE4 alpha0,alpha1,alpha2,alpha3;
260
232
alpha3 = alpha0 ;
261
233
//}
262
234
#elif defined(PRELU_ELE ) //{
263
- alpha0 = READ_IMG_TYPE (CL_DTYPE_CHAR , prelu_alpha , SAMPLER , output_pos0 );
264
- alpha1 = alpha0 ;
265
- alpha2 = alpha0 ;
266
- alpha3 = alpha0 ;
235
+ if (out_w0 < old_w ) {
236
+ alpha0 = READ_IMG_TYPE (CL_DTYPE_CHAR , prelu_alpha , SAMPLER , output_pos0 );
237
+ }
238
+ if (out_w1 < old_w ) {
239
+ alpha1 = READ_IMG_TYPE (CL_DTYPE_CHAR , prelu_alpha , SAMPLER , output_pos1 );
240
+ }
241
+ if (out_w2 < old_w ) {
242
+ alpha2 = READ_IMG_TYPE (CL_DTYPE_CHAR , prelu_alpha , SAMPLER , output_pos2 );
243
+ }
244
+ if (out_w3 < old_w ) {
245
+ alpha3 = READ_IMG_TYPE (CL_DTYPE_CHAR , prelu_alpha , SAMPLER , output_pos3 );
246
+ }
267
247
//}
268
248
#elif defined(PRELU_ALL ) //{
269
249
alpha0 = READ_IMG_TYPE (CL_DTYPE_CHAR , prelu_alpha , SAMPLER , (int2 )(0 , 0 ));
@@ -280,6 +260,13 @@ CL_DTYPE4 alpha0,alpha1,alpha2,alpha3;
280
260
output2 = activation_type4 (output2 , alpha2 );
281
261
output3 = activation_type4 (output3 , alpha3 );
282
262
263
+ #ifdef SCALE_ACTIVATION
264
+ output0 = fuse_scale (output0 , 1.f , 0.f , 0.f );
265
+ output1 = fuse_scale (output1 , 1.f , 0.f , 0.f );
266
+ output2 = fuse_scale (output2 , 1.f , 0.f , 0.f );
267
+ output3 = fuse_scale (output3 , 1.f , 0.f , 0.f );
268
+ #endif
269
+
283
270
if (out_w0 < old_w ) {
284
271
WRITE_IMG_TYPE (CL_DTYPE_CHAR , output_image , output_pos0 , output0 );
285
272
}
@@ -304,10 +291,6 @@ __kernel void conv2d_1x1_simple(
304
291
__read_only image2d_t input_image ,
305
292
__read_only image2d_t filter ,
306
293
__read_only image2d_t bias ,
307
- #ifdef BATCH_NORM
308
- __read_only image2d_t new_scale ,
309
- __read_only image2d_t new_biase ,
310
- #endif
311
294
__write_only image2d_t output_image ,
312
295
__private const int stride ,
313
296
__private const int offset ,
@@ -359,12 +342,6 @@ __kernel void conv2d_1x1_simple(
359
342
CL_DTYPE4 output1 = output0 ;
360
343
CL_DTYPE4 output2 = output0 ;
361
344
CL_DTYPE4 output3 = output0 ;
362
- #elif defined(BIASE_ELE )
363
- CL_DTYPE4 output0 = READ_IMG_TYPE (CL_DTYPE_CHAR , bias , SAMPLER , output_pos0 );
364
- CL_DTYPE4 output1 = output0 ;
365
- CL_DTYPE4 output2 = output0 ;
366
- CL_DTYPE4 output3 = output0 ;
367
-
368
345
#else
369
346
CL_DTYPE4 output0 = 0.0f ;
370
347
CL_DTYPE4 output1 = 0.0f ;
@@ -421,24 +398,6 @@ __kernel void conv2d_1x1_simple(
421
398
output3 = mad (input3 .w , weight3 , output3 );
422
399
}
423
400
424
- #ifdef BATCH_NORM
425
- output0 = output0 * READ_IMG_TYPE (
426
- CL_DTYPE_CHAR , new_scale , SAMPLER , (int2 )(out_c , 0 )) +
427
- READ_IMG_TYPE (CL_DTYPE_CHAR , new_biase , SAMPLER , (int2 )(out_c , 0 ));
428
-
429
- output1 = output1 * READ_IMG_TYPE (
430
- CL_DTYPE_CHAR , new_scale , SAMPLER , (int2 )(out_c , 0 )) +
431
- READ_IMG_TYPE (CL_DTYPE_CHAR , new_biase , SAMPLER , (int2 )(out_c , 0 ));
432
-
433
- output2 = output2 * READ_IMG_TYPE (
434
- CL_DTYPE_CHAR , new_scale , SAMPLER , (int2 )(out_c , 0 )) +
435
- READ_IMG_TYPE (CL_DTYPE_CHAR , new_biase , SAMPLER , (int2 )(out_c , 0 ));
436
-
437
- output3 = output3 * READ_IMG_TYPE (
438
- CL_DTYPE_CHAR , new_scale , SAMPLER , (int2 )(out_c , 0 )) +
439
- READ_IMG_TYPE (CL_DTYPE_CHAR , new_biase , SAMPLER , (int2 )(out_c , 0 ));
440
- #endif
441
-
442
401
CL_DTYPE4 alpha0 ,alpha1 ,alpha2 ,alpha3 ;
443
402
#ifdef PRELU_CH //{
444
403
alpha0 = READ_IMG_TYPE (CL_DTYPE_CHAR , prelu_alpha , SAMPLER , (int2 )(out_c , 0 ));
@@ -447,10 +406,18 @@ CL_DTYPE4 alpha0,alpha1,alpha2,alpha3;
447
406
alpha3 = alpha0 ;
448
407
//}
449
408
#elif defined(PRELU_ELE ) //{
450
- alpha0 = READ_IMG_TYPE (CL_DTYPE_CHAR , prelu_alpha , SAMPLER , output_pos0 );
451
- alpha1 = alpha0 ;
452
- alpha2 = alpha0 ;
453
- alpha3 = alpha0 ;
409
+ if (out_w0 < old_w ) {
410
+ alpha0 = READ_IMG_TYPE (CL_DTYPE_CHAR , prelu_alpha , SAMPLER , output_pos0 );
411
+ }
412
+ if (out_w1 < old_w ) {
413
+ alpha1 = READ_IMG_TYPE (CL_DTYPE_CHAR , prelu_alpha , SAMPLER , output_pos1 );
414
+ }
415
+ if (out_w2 < old_w ) {
416
+ alpha2 = READ_IMG_TYPE (CL_DTYPE_CHAR , prelu_alpha , SAMPLER , output_pos2 );
417
+ }
418
+ if (out_w3 < old_w ) {
419
+ alpha3 = READ_IMG_TYPE (CL_DTYPE_CHAR , prelu_alpha , SAMPLER , output_pos3 );
420
+ }
454
421
//}
455
422
#elif defined(PRELU_ALL ) //{
456
423
alpha0 = READ_IMG_TYPE (CL_DTYPE_CHAR , prelu_alpha , SAMPLER , (int2 )(0 , 0 ));
@@ -467,6 +434,13 @@ CL_DTYPE4 alpha0,alpha1,alpha2,alpha3;
467
434
output2 = activation_type4 (output2 , alpha2 );
468
435
output3 = activation_type4 (output3 , alpha3 );
469
436
437
+ #ifdef SCALE_ACTIVATION
438
+ output0 = fuse_scale (output0 , 1.f , 0.f , 0.f );
439
+ output1 = fuse_scale (output1 , 1.f , 0.f , 0.f );
440
+ output2 = fuse_scale (output2 , 1.f , 0.f , 0.f );
441
+ output3 = fuse_scale (output3 , 1.f , 0.f , 0.f );
442
+ #endif
443
+
470
444
if (out_w0 < old_w ) {
471
445
WRITE_IMG_TYPE (CL_DTYPE_CHAR , output_image , output_pos0 , output0 );
472
446
}
@@ -482,4 +456,4 @@ CL_DTYPE4 alpha0,alpha1,alpha2,alpha3;
482
456
if (out_w3 < old_w ) {
483
457
WRITE_IMG_TYPE (CL_DTYPE_CHAR , output_image , output_pos3 , output3 );
484
458
}
485
- }
459
+ }
0 commit comments