[MIOpen] Bugfix Use uint64_t instead of long for offsets (#3381)

sbalint98 · web-flow · commit 5104576a3827 · 2025-12-20T18:54:11.000Z
## Motivation When porting OpenCL kernels, the `ulong` datatype was often ported to `unsigned long`. This is problematic since the size of `long` on Windows is 32 bits. This MR replaces the uses of `unsigned long`/`ulong` with `uint64_t` and the uses of `long` with `int64_t`. refs: #3364 ## Test Plan The tests should be extended in a follow-up MR to cover these cases as well. ## Submission Checklist - [ ] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
diff --git a/projects/miopen/src/kernels/MIOpenCol2Im3d.cpp b/projects/miopen/src/kernels/MIOpenCol2Im3d.cpp
@@ -59,7 +59,7 @@ extern "C" __global__ void Col2Im3dU(FLOAT* col,
                                      const unsigned int height,
                                      const unsigned int width,
                                      FLOAT* im,
-                                     const unsigned long im_offset)
+                                     const uint64_t im_offset)
 {
     FLOAT* im_off            = im + im_offset;
     unsigned int gid         = blockIdx.x * blockDim.x + threadIdx.x;
diff --git a/projects/miopen/src/kernels/MIOpenConvFFT.cpp b/projects/miopen/src/kernels/MIOpenConvFFT.cpp
diff --git a/projects/miopen/src/kernels/MIOpenDropoutHIP.cpp b/projects/miopen/src/kernels/MIOpenDropoutHIP.cpp
@@ -30,6 +30,7 @@
 
 // Workaround to overcome redefinition errors while including rocrand header files directly
 #include "miopen_rocrand.hpp"
+#include "miopen_cstdint.hpp"
 
 #ifndef MIOPEN_USE_FP32
 #define MIOPEN_USE_FP32 0
@@ -63,7 +64,7 @@
  * @param states_num The number of elements in the state array.
  */
 extern "C" __global__ void
-InitKernelStateHIP(rocrand_state_xorwow* state, ulong prng_seed, ulong states_num)
+InitKernelStateHIP(rocrand_state_xorwow* state, uint64_t prng_seed, uint64_t states_num)
 {
     // Get the index of the current element
     size_t index  = blockIdx.x * blockDim.x + threadIdx.x;
diff --git a/projects/miopen/src/kernels/MIOpenMultiMarginLoss.cpp b/projects/miopen/src/kernels/MIOpenMultiMarginLoss.cpp
@@ -37,7 +37,7 @@ __device__ void multimarginlossforward2d(const DTYPE* __restrict__ I,
                                          const uint64_t* __restrict__ T,
                                          const DTYPE* __restrict__ W,
                                          void* __restrict__ O,
-                                         const long p,
+                                         const int64_t p,
                                          const float margin,
                                          tensor_view_t<2> I_tv,
                                          tensor_view_t<1> T_tv,
@@ -88,7 +88,7 @@ extern "C" __global__ void MultiMarginLossForward2d(const FLOAT* __restrict__ I,
                                                     const uint64_t* __restrict__ T,
                                                     const FLOAT* __restrict__ W,
                                                     void* __restrict__ O,
-                                                    const long p,
+                                                    const int64_t p,
                                                     const float margin,
                                                     tensor_view_t<2> I_tv,
                                                     tensor_view_t<1> T_tv,
diff --git a/projects/miopen/src/kernels/MIOpenNeuron.cpp b/projects/miopen/src/kernels/MIOpenNeuron.cpp
@@ -32,6 +32,7 @@
 #endif
 
 #include "activation_functions.hpp"
+#include "miopen_cstdint.hpp"
 
 #ifdef LITE
 
@@ -55,8 +56,8 @@ extern "C" __global__ void MIOpenActiveFwdLite(const FP_TYPE* bot,
                                                FP_TYPE gamma,
                                                FP_TYPE beta,
                                                FP_TYPE alpha,
-                                               const long bot_offset,
-                                               const long top_offset)
+                                               const int64_t bot_offset,
+                                               const int64_t top_offset)
 {
     const unsigned int tid   = blockIdx.x * LOCAL_SIZE + threadIdx.x;
     const unsigned int index = tid * MIOPEN_READ_UNIT;
@@ -84,8 +85,8 @@ extern "C" __global__ void MIOpenActiveFwd2DLite(const FP_TYPE* bot,
                                                  FP_TYPE gamma,
                                                  FP_TYPE beta,
                                                  FP_TYPE alpha,
-                                                 const long bot_offset,
-                                                 const long top_offset,
+                                                 const int64_t bot_offset,
+                                                 const int64_t top_offset,
                                                  const uint bot_stride,
                                                  const uint top_stride)
 {
@@ -123,10 +124,10 @@ extern "C" __global__ void MIOpenActiveBwdLite(FP_TYPE* bot_diff,
                                                FP_TYPE gamma,
                                                FP_TYPE beta,
                                                FP_TYPE alpha,
-                                               const long bot_diff_offset,
-                                               const long top_diff_offset,
-                                               const long bot_offset,
-                                               const long top_offset)
+                                               const int64_t bot_diff_offset,
+                                               const int64_t top_diff_offset,
+                                               const int64_t bot_offset,
+                                               const int64_t top_offset)
 {
     const unsigned int tid = blockIdx.x * LOCAL_SIZE + threadIdx.x;
     int index              = tid * MIOPEN_READ_UNIT;
@@ -163,14 +164,14 @@ extern "C" __global__ void MIOpenActiveBwd2DLite(FP_TYPE* bot_diff,
                                                  FP_TYPE gamma,
                                                  FP_TYPE beta,
                                                  FP_TYPE alpha,
-                                                 const long bot_diff_offset,
-                                                 const long top_diff_offset,
-                                                 const long bot_offset,
-                                                 const long top_offset,
-                                                 const uint bot_diff_stride,
-                                                 const uint top_diff_stride,
-                                                 const uint bot_stride,
-                                                 const uint top_stride)
+                                                 const int64_t bot_diff_offset,
+                                                 const int64_t top_diff_offset,
+                                                 const int64_t bot_offset,
+                                                 const int64_t top_offset,
+                                                 const uint32_t bot_diff_stride,
+                                                 const uint32_t top_diff_stride,
+                                                 const uint32_t bot_stride,
+                                                 const uint32_t top_stride)
 {
     const unsigned int x_id = blockIdx.x * LOCAL_SIZE + threadIdx.x;
     const unsigned int y    = blockIdx.y * blockDim.y + threadIdx.y;
@@ -181,10 +182,10 @@ extern "C" __global__ void MIOpenActiveBwd2DLite(FP_TYPE* bot_diff,
     if(y >= height)
         return;
 
-    uint bot_diff_index = y * bot_diff_stride + x_id * MIOPEN_READ_UNIT;
-    uint top_diff_index = y * top_diff_stride + x_id * MIOPEN_READ_UNIT;
-    uint bot_index      = y * bot_stride + x_id * MIOPEN_READ_UNIT;
-    uint top_index      = y * top_stride + x_id * MIOPEN_READ_UNIT;
+    uint32_t bot_diff_index = y * bot_diff_stride + x_id * MIOPEN_READ_UNIT;
+    uint32_t top_diff_index = y * top_diff_stride + x_id * MIOPEN_READ_UNIT;
+    uint32_t bot_index      = y * bot_stride + x_id * MIOPEN_READ_UNIT;
+    uint32_t top_index      = y * top_stride + x_id * MIOPEN_READ_UNIT;
 
     FP_TYPE bot_diff_dat[MIOPEN_READ_UNIT];
     FP_TYPE top_diff_dat[MIOPEN_READ_UNIT];
@@ -215,8 +216,8 @@ __launch_bounds__(
                          FP_TYPE gamma,
                          FP_TYPE beta,
                          FP_TYPE alpha,
-                         const long xOffset,
-                         const long yOffset)
+                         const int64_t xOffset,
+                         const int64_t yOffset)
 {
     const unsigned int x = blockIdx.x * MIOPEN_NRN_GROUP_SZ0 + threadIdx.x; // channel x
 
@@ -339,10 +340,10 @@ __launch_bounds__(
                          FP_TYPE gamma,
                          FP_TYPE beta,
                          FP_TYPE alpha,
-                         const long dxOffset,
-                         const long dyOffset,
-                         const long xOffset,
-                         const long yOffset)
+                         const int64_t dxOffset,
+                         const int64_t dyOffset,
+                         const int64_t xOffset,
+                         const int64_t yOffset)
 {
     const unsigned int x = blockIdx.x * MIOPEN_NRN_GROUP_SZ0 + threadIdx.x;
 
diff --git a/projects/miopen/src/kernels/MIOpenPoolingForwardNaive.cpp b/projects/miopen/src/kernels/MIOpenPoolingForwardNaive.cpp
@@ -46,7 +46,7 @@
 #error "MLO_POOLING_IS2D_KERNEL must be defined"
 #endif
 
-using arg_size_t = unsigned long;
+using arg_size_t = uint64_t;
 
 extern "C" __global__ void mloPoolingForwardNaive(const FLOAT* bot_ptr,
                                                   FLOAT* top_ptr,
diff --git a/projects/miopen/src/kernels/MIOpenTensorKernelsHip.cpp b/projects/miopen/src/kernels/MIOpenTensorKernelsHip.cpp
@@ -166,10 +166,10 @@ extern "C" __global__ void Op2dTensorSquash(const MIOPEN_TYPE* a,
                                             const MIOPEN_TYPE alpha0,
                                             const MIOPEN_TYPE alpha1,
                                             const MIOPEN_TYPE beta,
-                                            const long Aoffset,
-                                            const long Boffset,
-                                            const long Coffset,
-                                            const long total_work,
+                                            const int64_t Aoffset,
+                                            const int64_t Boffset,
+                                            const int64_t Coffset,
+                                            const int64_t total_work,
                                             const int use_apl0,
                                             const int use_apl1,
                                             const int use_bet)
@@ -448,9 +448,9 @@ extern "C" __global__ void Op4dTensorGeneric(MIOPEN_TYPE* a,
                                              const MIOPEN_TYPE beta,
                                              const unsigned int bitmap,
                                              const int work_per_wg,
-                                             const long Aoffset,
-                                             const long Boffset,
-                                             const long Coffset,
+                                             const int64_t Aoffset,
+                                             const int64_t Boffset,
+                                             const int64_t Coffset,
                                              const int num_wg)
 {
     int gid = blockIdx.x;
@@ -544,10 +544,10 @@ extern "C" __global__ void Op4dTensorLite(const MIOPEN_TYPE* a,
                                           const MIOPEN_TYPE alpha0,
                                           const MIOPEN_TYPE alpha1,
                                           const MIOPEN_TYPE beta,
-                                          const long Aoffset,
-                                          const long Boffset,
-                                          const long Coffset,
-                                          const long total_work,
+                                          const int64_t Aoffset,
+                                          const int64_t Boffset,
+                                          const int64_t Coffset,
+                                          const int64_t total_work,
                                           const int use_beta)
 {
     int gid0        = blockIdx.x * blockDim.x + threadIdx.x;
diff --git a/projects/miopen/src/ocl/utilocl.cpp b/projects/miopen/src/ocl/utilocl.cpp
@@ -544,7 +544,7 @@ float Col2Im3dGPU(const Handle& handle,
                   const uint32_t in_h,
                   const uint32_t in_w,
                   Data_t im,
-                  std::size_t im_offset,
+                  const uint64_t im_offset,
                   miopenDataType_t type)
 {
     std::string program_name = "MIOpenCol2Im3d.cpp";

Original file line number	Diff line number	Diff line change
`@@ -59,7 +59,7 @@ extern "C" __global__ void Col2Im3dU(FLOAT* col,`
`59`	`59`	`const unsigned int height,`
`60`	`60`	`const unsigned int width,`
`61`	`61`	`FLOAT* im,`
`62`		`- const unsigned long im_offset)`
	`62`	`+ const uint64_t im_offset)`
`63`	`63`	`{`
`64`	`64`	`FLOAT* im_off = im + im_offset;`
`65`	`65`	`unsigned int gid = blockIdx.x * blockDim.x + threadIdx.x;`
Original file line number	Diff line number	Diff line change
`@@ -544,7 +544,7 @@ float Col2Im3dGPU(const Handle& handle,`
`544`	`544`	`const uint32_t in_h,`
`545`	`545`	`const uint32_t in_w,`
`546`	`546`	`Data_t im,`
`547`		`- std::size_t im_offset,`
	`547`	`+ const uint64_t im_offset,`
`548`	`548`	`miopenDataType_t type)`
`549`	`549`	`{`
`550`	`550`	`std::string program_name = "MIOpenCol2Im3d.cpp";`