fix prelu elementsise bug for 1x1 conv test=develop (#5801)

daming5432 · web-flow · commit 4fc26f783e52 · 2021-03-29T16:19:05.000+08:00
diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl
@@ -7,10 +7,6 @@ __kernel void conv2d_1x1_opt(
     __read_only image2d_t input_image,
     __read_only image2d_t filter,
     __read_only image2d_t bias,
-#ifdef BATCH_NORM
-    __read_only image2d_t new_scale,
-    __read_only image2d_t new_biase,
-#endif
     __write_only image2d_t output_image,
     __private const int stride,
     __private const int offset,
@@ -63,12 +59,6 @@ __kernel void conv2d_1x1_opt(
   CL_DTYPE4 output1 = output0;
   CL_DTYPE4 output2 = output0;
   CL_DTYPE4 output3 = output0;
-#elif defined(BIASE_ELE)
-  CL_DTYPE4 output0 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, SAMPLER, output_pos0);
-  CL_DTYPE4 output1 = output0;
-  CL_DTYPE4 output2 = output0;
-  CL_DTYPE4 output3 = output0;
-
 #else
   CL_DTYPE4 output0 = 0.0f;
   CL_DTYPE4 output1 = 0.0f;
@@ -234,24 +224,6 @@ __kernel void conv2d_1x1_opt(
     }
   }
 
-#ifdef BATCH_NORM
-  output0 = output0 * READ_IMG_TYPE(
-                          CL_DTYPE_CHAR, new_scale, SAMPLER, (int2)(out_c, 0)) +
-            READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, SAMPLER, (int2)(out_c, 0));
-
-  output1 = output1 * READ_IMG_TYPE(
-                          CL_DTYPE_CHAR, new_scale, SAMPLER, (int2)(out_c, 0)) +
-            READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, SAMPLER, (int2)(out_c, 0));
-
-  output2 = output2 * READ_IMG_TYPE(
-                          CL_DTYPE_CHAR, new_scale, SAMPLER, (int2)(out_c, 0)) +
-            READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, SAMPLER, (int2)(out_c, 0));
-
-  output3 = output3 * READ_IMG_TYPE(
-                          CL_DTYPE_CHAR, new_scale, SAMPLER, (int2)(out_c, 0)) +
-            READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, SAMPLER, (int2)(out_c, 0));
-#endif
-
 CL_DTYPE4 alpha0,alpha1,alpha2,alpha3;
 #ifdef PRELU_CH //{
   alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0));
@@ -260,10 +232,18 @@ CL_DTYPE4 alpha0,alpha1,alpha2,alpha3;
   alpha3 = alpha0;
   //}
 #elif defined(PRELU_ELE) //{
-  alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos0);
-  alpha1 = alpha0;
-  alpha2 = alpha0;
-  alpha3 = alpha0;
+  if (out_w0 < old_w) {
+    alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos0);
+  }
+  if (out_w1 < old_w) {  
+    alpha1 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos1);
+  }
+  if (out_w2 < old_w) {    
+    alpha2 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos2);
+  }
+  if (out_w3 < old_w) {  
+    alpha3 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos3);
+  }
   //}
 #elif defined(PRELU_ALL) //{
   alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0));
@@ -280,6 +260,13 @@ CL_DTYPE4 alpha0,alpha1,alpha2,alpha3;
   output2 = activation_type4(output2, alpha2);
   output3 = activation_type4(output3, alpha3);
 
+#ifdef SCALE_ACTIVATION
+  output0 = fuse_scale(output0, 1.f, 0.f, 0.f);
+  output1 = fuse_scale(output1, 1.f, 0.f, 0.f);
+  output2 = fuse_scale(output2, 1.f, 0.f, 0.f);
+  output3 = fuse_scale(output3, 1.f, 0.f, 0.f);
+#endif
+
   if (out_w0 < old_w) {
     WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0);
   }
@@ -304,10 +291,6 @@ __kernel void conv2d_1x1_simple(
     __read_only image2d_t input_image,
     __read_only image2d_t filter,
     __read_only image2d_t bias,
-#ifdef BATCH_NORM
-    __read_only image2d_t new_scale,
-    __read_only image2d_t new_biase,
-#endif
     __write_only image2d_t output_image,
     __private const int stride,
     __private const int offset,
@@ -359,12 +342,6 @@ __kernel void conv2d_1x1_simple(
   CL_DTYPE4 output1 = output0;
   CL_DTYPE4 output2 = output0;
   CL_DTYPE4 output3 = output0;
-#elif defined(BIASE_ELE)
-  CL_DTYPE4 output0 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, SAMPLER, output_pos0);
-  CL_DTYPE4 output1 = output0;
-  CL_DTYPE4 output2 = output0;
-  CL_DTYPE4 output3 = output0;
-
 #else
   CL_DTYPE4 output0 = 0.0f;
   CL_DTYPE4 output1 = 0.0f;
@@ -421,24 +398,6 @@ __kernel void conv2d_1x1_simple(
     output3 = mad(input3.w, weight3, output3);
   }
 
-#ifdef BATCH_NORM
-  output0 = output0 * READ_IMG_TYPE(
-                          CL_DTYPE_CHAR, new_scale, SAMPLER, (int2)(out_c, 0)) +
-            READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, SAMPLER, (int2)(out_c, 0));
-
-  output1 = output1 * READ_IMG_TYPE(
-                          CL_DTYPE_CHAR, new_scale, SAMPLER, (int2)(out_c, 0)) +
-            READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, SAMPLER, (int2)(out_c, 0));
-
-  output2 = output2 * READ_IMG_TYPE(
-                          CL_DTYPE_CHAR, new_scale, SAMPLER, (int2)(out_c, 0)) +
-            READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, SAMPLER, (int2)(out_c, 0));
-
-  output3 = output3 * READ_IMG_TYPE(
-                          CL_DTYPE_CHAR, new_scale, SAMPLER, (int2)(out_c, 0)) +
-            READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, SAMPLER, (int2)(out_c, 0));
-#endif
-
 CL_DTYPE4 alpha0,alpha1,alpha2,alpha3;
 #ifdef PRELU_CH //{
   alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0));
@@ -447,10 +406,18 @@ CL_DTYPE4 alpha0,alpha1,alpha2,alpha3;
   alpha3 = alpha0;
   //}
 #elif defined(PRELU_ELE) //{
-  alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos0);
-  alpha1 = alpha0;
-  alpha2 = alpha0;
-  alpha3 = alpha0;
+  if (out_w0 < old_w) {
+    alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos0);
+  }
+  if (out_w1 < old_w) {
+    alpha1 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos1);
+  }
+  if (out_w2 < old_w) {
+    alpha2 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos2);
+  }
+  if (out_w3 < old_w) {  
+    alpha3 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos3);
+  }
   //}
 #elif defined(PRELU_ALL) //{
   alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0));
@@ -467,6 +434,13 @@ CL_DTYPE4 alpha0,alpha1,alpha2,alpha3;
   output2 = activation_type4(output2, alpha2);
   output3 = activation_type4(output3, alpha3);
 
+#ifdef SCALE_ACTIVATION
+  output0 = fuse_scale(output0, 1.f, 0.f, 0.f);
+  output1 = fuse_scale(output1, 1.f, 0.f, 0.f);
+  output2 = fuse_scale(output2, 1.f, 0.f, 0.f);
+  output3 = fuse_scale(output3, 1.f, 0.f, 0.f);
+#endif
+
   if (out_w0 < old_w) {
     WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0);
   }
@@ -482,4 +456,4 @@ CL_DTYPE4 alpha0,alpha1,alpha2,alpha3;
   if (out_w3 < old_w) {
     WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos3, output3);
   }
-}
+}