@@ -238,7 +238,7 @@ convolve_simd(
238
238
int curr_local_x = ( lid % ( TILE_X / 4 ) ) * 4 ;
239
239
int curr_y = or * STRIDE_Y + curr_local_y ;
240
240
int curr_x = oc * STRIDE_X + curr_local_x ;
241
- #if INPUT_PAD_W != 0 || INPUT_PAD_H != 0
241
+ #if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
242
242
int saved_y = curr_y ;
243
243
#endif
244
244
in_addr = input_batch_offset
@@ -256,19 +256,22 @@ convolve_simd(
256
256
LOOP (INVEC_SIZE , reg ,
257
257
{
258
258
if (curr_local_y + reg * TILE_Y_STRIDE < TILE_Y || INVEC_SIZE * TILE_Y_STRIDE <= (TILE_Y + 2 ) || reg < INVEC_SIZE - 1 ) {
259
- #if INPUT_PAD_W != 0 || INPUT_PAD_H != 0
259
+ #if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
260
260
if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + 3 >= INPUT_PAD_W && curr_x < input_width + INPUT_PAD_W ) {
261
261
if (curr_x < INPUT_PAD_W ) {
262
262
in_buf .in_vec [reg ].s0 = 0 ;
263
- if (curr_x + 1 >= INPUT_PAD_W )
263
+ if (curr_x + 1 >= INPUT_PAD_W && curr_x + 1 < input_width + INPUT_PAD_W )
264
264
in_buf .in_vec [reg ].s1 = * (inputs + in_offset + 1 );
265
265
else
266
266
in_buf .in_vec [reg ].s1 = 0 ;
267
- if (curr_x + 2 >= INPUT_PAD_W )
267
+ if (curr_x + 2 >= INPUT_PAD_W && curr_x + 2 < input_width + INPUT_PAD_W )
268
268
in_buf .in_vec [reg ].s2 = * (inputs + in_offset + 2 );
269
269
else
270
270
in_buf .in_vec [reg ].s2 = 0 ;
271
- in_buf .in_vec [reg ].s3 = * (inputs + in_offset + 3 );
271
+ if (curr_x + 3 < input_width + INPUT_PAD_W )
272
+ in_buf .in_vec [reg ].s3 = * (inputs + in_offset + 3 );
273
+ else
274
+ in_buf .in_vec [reg ].s3 = 0 ;
272
275
} else {
273
276
VLOAD4 (in_buf .in_vec [reg ], inputs + in_offset );
274
277
if (curr_x + 1 >= input_width + INPUT_PAD_W )
@@ -289,7 +292,7 @@ convolve_simd(
289
292
in_offset += input_width * TILE_Y_STRIDE ;
290
293
});
291
294
in_addr += input_height * input_width ;
292
- #if INPUT_PAD_W != 0 || INPUT_PAD_H != 0
295
+ #if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
293
296
curr_y = saved_y ;
294
297
#endif
295
298
@@ -492,7 +495,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
492
495
// atile is M rows x K columns.
493
496
int curr_x = ( global_y % output_width ) * STRIDE_X ;
494
497
int curr_y = ( global_y / output_width ) * STRIDE_Y ;
495
- #if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
498
+ #if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
496
499
int saved_y = curr_y ;
497
500
#endif
498
501
const __global Dtype * src0_read = src0
@@ -512,7 +515,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
512
515
do
513
516
{
514
517
int patch_row = 0 ;
515
- #if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
518
+ #if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
516
519
curr_y = saved_y ;
517
520
#endif
518
521
@@ -530,7 +533,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
530
533
// ...
531
534
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1 ;
532
535
533
- #if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1
536
+ #if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
534
537
Dtype_t blockA00 = ( (const __global Dtype_t * )src0_read )[ 0 ];
535
538
Dtype * pblockA00 = (Dtype * )(& blockA00 );
536
539
#else
@@ -646,7 +649,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
646
649
// atile is M rows x K columns.
647
650
int curr_x = ( global_y % output_width ) * STRIDE_X ;
648
651
int curr_y = ( global_y / output_width ) * STRIDE_Y ;
649
- #if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
652
+ #if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
650
653
int saved_y = curr_y ;
651
654
#endif
652
655
const __global Dtype * src0_read = src0
@@ -666,14 +669,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
666
669
do
667
670
{
668
671
int patch_row = 0 ;
669
- #if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
672
+ #if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
670
673
curr_y = saved_y ;
671
674
#endif
672
675
do
673
676
{
674
677
// Load atile and interleaved btile.
675
678
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1 ;
676
- #if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1
679
+ #if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
677
680
Dtype_t blockA00 = ( (const __global Dtype_t * )src0_read )[ 0 ];
678
681
Dtype * pblockA00 = (Dtype * )(& blockA00 );
679
682
#else
@@ -873,7 +876,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
873
876
int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X ;
874
877
int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y ;
875
878
int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y ;
876
- #if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
879
+ #if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
877
880
int saved_y0 = curr_y0 ;
878
881
int saved_y1 = curr_y1 ;
879
882
#endif
@@ -911,7 +914,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
911
914
// (0, 2) (8, 2) (16, 2) (24, 2) ... ...
912
915
// ...
913
916
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1 ;
914
- #if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1
917
+ #if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
915
918
Dtype_t blockA00 = ( (const __global Dtype_t * )src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH ;
916
919
Dtype_t blockA01 = ( (const __global Dtype_t * )src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH ;
917
920
Dtype * pblockA00 = (Dtype * )(& blockA00 );
@@ -997,7 +1000,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
997
1000
998
1001
//while( ++patch_row < 1 ); //debug
999
1002
while ( ++ patch_row < KERNEL_HEIGHT );
1000
- #if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1
1003
+ #if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
1001
1004
curr_y0 = saved_y0 ;
1002
1005
curr_y1 = saved_y1 ;
1003
1006
#endif
@@ -1073,7 +1076,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
1073
1076
int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X ;
1074
1077
int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y ;
1075
1078
int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y ;
1076
- #if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
1079
+ #if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
1077
1080
int saved_y0 = curr_y0 ;
1078
1081
int saved_y1 = curr_y1 ;
1079
1082
#endif
@@ -1102,7 +1105,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
1102
1105
{
1103
1106
// Load atile and interleaved btile.
1104
1107
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1 ;
1105
- #if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1
1108
+ #if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
1106
1109
Dtype_t blockA00 = ( (const __global Dtype_t * )src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH ;
1107
1110
Dtype_t blockA01 = ( (const __global Dtype_t * )src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH ;
1108
1111
Dtype * pblockA00 = (Dtype * )(& blockA00 );
@@ -1210,7 +1213,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
1210
1213
1211
1214
//while( ++patch_row < 1 ); //debug
1212
1215
while ( ++ patch_row < KERNEL_HEIGHT );
1213
- #if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1
1216
+ #if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
1214
1217
curr_y0 = saved_y0 ;
1215
1218
curr_y1 = saved_y1 ;
1216
1219
#endif
@@ -1377,7 +1380,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
1377
1380
// atile is M rows x K columns.
1378
1381
int curr_x = ( global_y % output_width ) * STRIDE_X ;
1379
1382
int curr_y = ( global_y / output_width ) * STRIDE_Y ;
1380
- #if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
1383
+ #if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
1381
1384
int saved_y = curr_y ;
1382
1385
#endif
1383
1386
const __global Dtype * src0_read = src0
@@ -1419,7 +1422,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
1419
1422
do
1420
1423
{
1421
1424
int patch_row = 0 ;
1422
- #if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
1425
+ #if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
1423
1426
curr_y = saved_y ;
1424
1427
#endif
1425
1428
__attribute__((opencl_unroll_hint (1 )))
@@ -1437,7 +1440,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
1437
1440
// ...
1438
1441
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1 ;
1439
1442
1440
- #if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1
1443
+ #if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
1441
1444
Dtype_t blockA00 = ( (const __global Dtype_t * )src0_read )[ 0 ];
1442
1445
Dtype * pblockA00 = (Dtype * )(& blockA00 );
1443
1446
#else
@@ -1580,7 +1583,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
1580
1583
int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X ;
1581
1584
int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y ;
1582
1585
int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y ;
1583
- #if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
1586
+ #if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
1584
1587
int saved_y0 = curr_y0 ;
1585
1588
int saved_y1 = curr_y1 ;
1586
1589
#endif
@@ -1618,7 +1621,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
1618
1621
// (0, 2) (8, 2) (16, 2) (24, 2) ... ...
1619
1622
// ...
1620
1623
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1 ;
1621
- #if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1
1624
+ #if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
1622
1625
Dtype_t blockA00 = ( (const __global Dtype_t * )src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH ;
1623
1626
Dtype_t blockA01 = ( (const __global Dtype_t * )src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH ;
1624
1627
Dtype * pblockA00 = (Dtype * )(& blockA00 );
@@ -1692,7 +1695,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
1692
1695
1693
1696
//while( ++patch_row < 1 ); //debug
1694
1697
while ( ++ patch_row < KERNEL_HEIGHT );
1695
- #if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1
1698
+ #if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
1696
1699
curr_y0 = saved_y0 ;
1697
1700
curr_y1 = saved_y1 ;
1698
1701
#endif
0 commit comments