postprocessor: fix width_padding handling

MartinPulec · MartinPulec · commit cc12b8da63bf · 2025-08-22T10:57:10.000+02:00
Align the postprocessor with the preprocessor implementation (per-row byte offset computed via unit_size).

The previous handling, which added width_padding to image_width, was not entirely correct.
diff --git a/src/gpujpeg_postprocessor.cu b/src/gpujpeg_postprocessor.cu
@@ -122,22 +122,20 @@ inline __device__ void gpujpeg_comp_to_raw_store<GPUJPEG_U8>(uint8_t *d_data_raw
 }
 
 template<>
-inline __device__ void gpujpeg_comp_to_raw_store<GPUJPEG_444_U8_P012>(uint8_t *d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r)
+inline __device__ void gpujpeg_comp_to_raw_store<GPUJPEG_444_U8_P012>(uint8_t *d_data_raw, int &image_width, int &image_height, int &offset, int &x, int &y, uchar4 &r)
 {
-    image_position = image_position * 3;
-    d_data_raw[image_position + 0] = r.x;
-    d_data_raw[image_position + 1] = r.y;
-    d_data_raw[image_position + 2] = r.z;
+    d_data_raw[offset + 0] = r.x;
+    d_data_raw[offset + 1] = r.y;
+    d_data_raw[offset + 2] = r.z;
 }
 
 template<>
-inline __device__ void gpujpeg_comp_to_raw_store<GPUJPEG_4444_U8_P0123>(uint8_t *d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r)
+inline __device__ void gpujpeg_comp_to_raw_store<GPUJPEG_4444_U8_P0123>(uint8_t *d_data_raw, int &image_width, int &image_height, int &offset, int &x, int &y, uchar4 &r)
 {
-    image_position = image_position * 4;
-    d_data_raw[image_position + 0] = r.x;
-    d_data_raw[image_position + 1] = r.y;
-    d_data_raw[image_position + 2] = r.z;
-    d_data_raw[image_position + 3] = r.w;
+    d_data_raw[offset + 0] = r.x;
+    d_data_raw[offset + 1] = r.y;
+    d_data_raw[offset + 2] = r.z;
+    d_data_raw[offset + 3] = r.w;
 }
 
 template<>
@@ -159,14 +157,13 @@ inline __device__ void gpujpeg_comp_to_raw_store<GPUJPEG_422_U8_P0P1P2>(uint8_t
 }
 
 template<>
-inline __device__ void gpujpeg_comp_to_raw_store<GPUJPEG_422_U8_P1020>(uint8_t *d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r)
+inline __device__ void gpujpeg_comp_to_raw_store<GPUJPEG_422_U8_P1020>(uint8_t *d_data_raw, int &image_width, int &image_height, int &offset, int &x, int &y, uchar4 &r)
 {
-    image_position = image_position * 2;
-    d_data_raw[image_position + 1] = r.x;
+    d_data_raw[offset + 1] = r.x;
     if ( (x % 2) == 0 )
-        d_data_raw[image_position + 0] = r.y;
+        d_data_raw[offset + 0] = r.y;
     else
-        d_data_raw[image_position + 0] = r.z;
+        d_data_raw[offset + 0] = r.z;
 }
 
 template<>
@@ -244,7 +241,8 @@ struct post_load<in_is_rgb, GPUJPEG_DYNAMIC>
  * @param pixel_count  Number of pixels to copy
  * @return void
  */
-typedef void (*gpujpeg_preprocessor_decode_kernel)(struct gpujpeg_preprocessor_data data, uint8_t* d_data_raw, int image_width, int image_height);
+typedef void (*gpujpeg_preprocessor_decode_kernel)(struct gpujpeg_preprocessor_data data, uint8_t* d_data_raw,
+                                                   int width_padding, int image_width, int image_height);
 
 template<
     enum gpujpeg_color_space color_space_internal,
@@ -256,7 +254,8 @@ template<
     uint8_t s_comp4_samp_factor_h, uint8_t s_comp4_samp_factor_v
 >
 __global__ void
-gpujpeg_preprocessor_comp_to_raw_kernel(struct gpujpeg_preprocessor_data data, uint8_t* d_data_raw, int image_width, int image_height)
+gpujpeg_preprocessor_comp_to_raw_kernel(struct gpujpeg_preprocessor_data data, uint8_t* d_data_raw,
+                                        int image_width_padding, int image_width, int image_height)
 {
     int x  = threadIdx.x;
     int gX = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x;
@@ -276,7 +275,8 @@ gpujpeg_preprocessor_comp_to_raw_kernel(struct gpujpeg_preprocessor_data data, u
     gpujpeg_color_transform<color_space_internal, color_space>::perform(r);
 
     // Save
-    gpujpeg_comp_to_raw_store<pixel_format>(d_data_raw, image_width, image_height, image_position, image_position_x, image_position_y, r);
+    int offset = image_position * unit_size<pixel_format>() + image_width_padding * image_position_y;
+    gpujpeg_comp_to_raw_store<pixel_format>(d_data_raw, image_width, image_height, offset, image_position_x, image_position_y, r);
 }
 
 /**
@@ -508,7 +508,7 @@ gpujpeg_postprocessor_decode(struct gpujpeg_coder* coder, cudaStream_t stream)
     gpujpeg_preprocessor_decode_kernel kernel = (gpujpeg_preprocessor_decode_kernel)coder->preprocessor.kernel;
     assert(kernel != NULL);
 
-    int image_width = coder->param_image.width + coder->param_image.width_padding;
+    int image_width = coder->param_image.width;
     int image_height = coder->param_image.height;
 
     // When saving 4:2:2 data of odd width, the data should have even width, so round it
@@ -537,6 +537,7 @@ gpujpeg_postprocessor_decode(struct gpujpeg_coder* coder, cudaStream_t stream)
     kernel<<<grid, threads, 0, stream>>>(
         coder->preprocessor.data,
         coder->d_data_raw,
+        coder->param_image.width_padding,
         image_width,
         image_height
     );
diff --git a/src/gpujpeg_preprocessor.cu b/src/gpujpeg_preprocessor.cu
@@ -87,9 +87,6 @@ struct gpujpeg_preprocessor_raw_to_comp_store {
 template<enum gpujpeg_pixel_format>
 inline __device__ void raw_to_comp_load(const uint8_t* d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r);
 
-template<enum gpujpeg_pixel_format>
-inline __device__ int unit_size() { return 1; }
-
 template<>
 inline __device__ void raw_to_comp_load<GPUJPEG_U8>(const uint8_t* d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r)
 {
@@ -122,9 +119,6 @@ inline __device__ void raw_to_comp_load<GPUJPEG_420_U8_P0P1P2>(const uint8_t* d_
     r.z = d_data_raw[image_width * image_height + ((image_height + 1) / 2 + y / 2) * ((image_width + 1) / 2) + x / 2];
 }
 
-template<>
-inline __device__ int unit_size<GPUJPEG_444_U8_P012>() { return 3; }
-
 template<>
 inline __device__ void raw_to_comp_load<GPUJPEG_444_U8_P012>(const uint8_t* d_data_raw, int &image_width, int &image_height, int &offset, int &x, int &y, uchar4 &r)
 {
@@ -133,9 +127,6 @@ inline __device__ void raw_to_comp_load<GPUJPEG_444_U8_P012>(const uint8_t* d_da
     r.z = d_data_raw[offset + 2];
 }
 
-template<>
-inline __device__ int unit_size<GPUJPEG_4444_U8_P0123>() { return 4; }
-
 template<>
 inline __device__ void raw_to_comp_load<GPUJPEG_4444_U8_P0123>(const uint8_t* d_data_raw, int &image_width, int &image_height, int &offset, int &x, int &y, uchar4 &r)
 {
@@ -145,9 +136,6 @@ inline __device__ void raw_to_comp_load<GPUJPEG_4444_U8_P0123>(const uint8_t* d_
     r.w = d_data_raw[offset + 3];
 }
 
-template<>
-inline __device__ int unit_size<GPUJPEG_422_U8_P1020>() { return 2; }
-
 template<>
 inline __device__ void raw_to_comp_load<GPUJPEG_422_U8_P1020>(const uint8_t* d_data_raw, int &image_width, int &image_height, int &offset, int &x, int &y, uchar4 &r)
 {
diff --git a/src/gpujpeg_preprocessor_common.cuh b/src/gpujpeg_preprocessor_common.cuh
@@ -108,5 +108,18 @@ gpujpeg_preprocessor_make_sampling_factor_i(int comp_count, int numerator_h, int
            coder->component[2].sampling_factor.horizontal, coder->component[2].sampling_factor.vertical,               \
            coder->component[3].sampling_factor.horizontal, coder->component[3].sampling_factor.vertical)
 
+template<enum gpujpeg_pixel_format>
+inline __device__ int unit_size() { return 1; }
+
+template<>
+inline __device__ int unit_size<GPUJPEG_444_U8_P012>() { return 3; }
+
+template<>
+inline __device__ int unit_size<GPUJPEG_4444_U8_P0123>() { return 4; }
+
+template<>
+inline __device__ int unit_size<GPUJPEG_422_U8_P1020>() { return 2; }
+
+
 #endif // defined GPUJPEG_PREPROCESSOR_COMMON_CUH_DCC657E3_2EDF_47E2_90F4_F7CA26829E81
 /* vi: set expandtab sw=4: */

Original file line number	Diff line number	Diff line change
`@@ -87,9 +87,6 @@ struct gpujpeg_preprocessor_raw_to_comp_store {`
`87`	`87`	`template<enum gpujpeg_pixel_format>`
`88`	`88`	`inline __device__ void raw_to_comp_load(const uint8_t* d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r);`
`89`	`89`
`90`		`-template<enum gpujpeg_pixel_format>`
`91`		`-inline __device__ int unit_size() { return 1; }`
`92`		`-`
`93`	`90`	`template<>`
`94`	`91`	`inline __device__ void raw_to_comp_load<GPUJPEG_U8>(const uint8_t* d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r)`
`95`	`92`	`{`
`@@ -122,9 +119,6 @@ inline __device__ void raw_to_comp_load<GPUJPEG_420_U8_P0P1P2>(const uint8_t* d_`
`122`	`119`	`r.z = d_data_raw[image_width * image_height + ((image_height + 1) / 2 + y / 2) * ((image_width + 1) / 2) + x / 2];`
`123`	`120`	`}`
`124`	`121`
`125`		`-template<>`
`126`		`-inline __device__ int unit_size<GPUJPEG_444_U8_P012>() { return 3; }`
`127`		`-`
`128`	`122`	`template<>`
`129`	`123`	`inline __device__ void raw_to_comp_load<GPUJPEG_444_U8_P012>(const uint8_t* d_data_raw, int &image_width, int &image_height, int &offset, int &x, int &y, uchar4 &r)`
`130`	`124`	`{`
`@@ -133,9 +127,6 @@ inline __device__ void raw_to_comp_load<GPUJPEG_444_U8_P012>(const uint8_t* d_da`
`133`	`127`	`r.z = d_data_raw[offset + 2];`
`134`	`128`	`}`
`135`	`129`
`136`		`-template<>`
`137`		`-inline __device__ int unit_size<GPUJPEG_4444_U8_P0123>() { return 4; }`
`138`		`-`
`139`	`130`	`template<>`
`140`	`131`	`inline __device__ void raw_to_comp_load<GPUJPEG_4444_U8_P0123>(const uint8_t* d_data_raw, int &image_width, int &image_height, int &offset, int &x, int &y, uchar4 &r)`
`141`	`132`	`{`
`@@ -145,9 +136,6 @@ inline __device__ void raw_to_comp_load<GPUJPEG_4444_U8_P0123>(const uint8_t* d_`
`145`	`136`	`r.w = d_data_raw[offset + 3];`
`146`	`137`	`}`
`147`	`138`
`148`		`-template<>`
`149`		`-inline __device__ int unit_size<GPUJPEG_422_U8_P1020>() { return 2; }`
`150`		`-`
`151`	`139`	`template<>`
`152`	`140`	`inline __device__ void raw_to_comp_load<GPUJPEG_422_U8_P1020>(const uint8_t* d_data_raw, int &image_width, int &image_height, int &offset, int &x, int &y, uchar4 &r)`
`153`	`141`	`{`