
Commit c22226f

Merge branch 'release/2.8' into vmijovic/change_default_to_hipblaslt

2 parents f1e6ec9 + 3658645

6 files changed, +151 −7 lines changed
Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-f9e5bf54a2fe1a6262a41b27b38180cdb6fae6a2
+21876a4bbaf371bcb83df8e6ee4f43a92f524dfe

aten/src/ATen/Context.cpp

Lines changed: 6 additions & 0 deletions

@@ -341,6 +341,9 @@ at::BlasBackend Context::blasPreferredBackend() {
 #if ROCM_VERSION >= 60402
         "gfx1150", "gfx1151",
 #endif
+#if ROCM_VERSION >= 60402
+        "gfx1150", "gfx1151",
+#endif
 #if ROCM_VERSION >= 60500
         "gfx950"
 #endif
@@ -370,6 +373,9 @@ at::BlasBackend Context::blasPreferredBackend() {
 #if ROCM_VERSION >= 60402
         "gfx1150", "gfx1151",
 #endif
+#if ROCM_VERSION >= 60402
+        "gfx1150", "gfx1151",
+#endif
 #if ROCM_VERSION >= 60500
         "gfx950"
 #endif

aten/src/ATen/native/cuda/Blas.cpp

Lines changed: 3 additions & 0 deletions

@@ -275,6 +275,9 @@ static bool isSupportedHipLtROCmArch(int index) {
 #if ROCM_VERSION >= 60402
         "gfx1150", "gfx1151",
 #endif
+#if ROCM_VERSION >= 60402
+        "gfx1150", "gfx1151",
+#endif
 #if ROCM_VERSION >= 60500
         "gfx950"
 #endif
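Each of these hunks adds a second copy of an already-present gfx1150/gfx1151 guard block, apparently an artifact of the merge from release/2.8; the duplicate entries are redundant but benign if the list only feeds a membership test. As a hedged sketch of how such an arch allow-list is typically consulted before preferring hipBLASLt (hypothetical helper, not the code from this commit) — the device's reported gcnArchName is matched against the list:

#include <string>
#include <vector>

// Illustrative only: real gcnArchName strings can carry feature suffixes
// such as "gfx90a:sramecc+:xnack-", so each allow-list entry is matched
// as a prefix of the reported name.
static bool archAllowsHipblaslt(const std::string& gcn_arch_name,
                                const std::vector<std::string>& allowlist) {
  for (const std::string& arch : allowlist) {
    if (gcn_arch_name.rfind(arch, 0) == 0) {
      return true;
    }
  }
  return false;
}
// e.g. archAllowsHipblaslt("gfx1151", {"gfx1150", "gfx1151"}) == true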

aten/src/ATen/native/cuda/Normalization.cuh

Lines changed: 33 additions & 2 deletions

@@ -23,7 +23,7 @@ namespace at::native {
 
 // The maximum number of threads in a block
 #if defined(USE_ROCM)
-constexpr int MAX_BLOCK_SIZE = 256;
+constexpr int MAX_BLOCK_SIZE = 1024;
 #else
 constexpr int MAX_BLOCK_SIZE = 512;
 #endif
@@ -33,7 +33,7 @@ constexpr unsigned MAX_GRID_SIZE = 65535u;
 // Number of threads in a block given an input size up to MAX_BLOCK_SIZE
 static int getNumThreads(int nElem) {
 #if defined(USE_ROCM)
-  int threadSizes[5] = { 16, 32, 64, 128, MAX_BLOCK_SIZE };
+  int threadSizes[5] = { 64, 128, 256, 512, MAX_BLOCK_SIZE };
 #else
   int threadSizes[5] = { 32, 64, 128, 256, MAX_BLOCK_SIZE };
 #endif
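MAX_BLOCK_SIZE quadruples on ROCm and the candidate table shifts up accordingly. The table only lists candidates; the selection that follows it in the file (not part of this diff) returns the smallest candidate covering the plane size. A self-contained sketch of that pattern, assuming the usual smallest-fit loop, with the new ROCm values:

#include <cstdio>

constexpr int MAX_BLOCK_SIZE = 1024;  // new ROCm value from the hunk above

// Assumed smallest-fit selection: return the first candidate block size
// that covers nElem, else the maximum.
static int getNumThreads(int nElem) {
  const int threadSizes[5] = { 64, 128, 256, 512, MAX_BLOCK_SIZE };
  for (int size : threadSizes) {
    if (nElem <= size) return size;
  }
  return MAX_BLOCK_SIZE;
}

int main() {
  // A 300-element plane now gets a 512-thread block; the old ROCm table
  // { 16, 32, 64, 128, 256 } capped the same plane at 256 threads.
  std::printf("%d\n", getNumThreads(300));
  return 0;
}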
@@ -115,9 +115,23 @@ __device__ scalar_t reduce(Op op, PTA tensor, int plane) {
   // first the reductions each thread does separately
   scalar_t sum = static_cast<scalar_t>(0);
   for (int batch = threadIdx.y; batch < tensor.size(0); batch += blockDim.y) {
+#if defined(USE_ROCM)
+    constexpr int UNRL = 4; // load unroll factor
+    scalar_t tmp[UNRL];
+    for (int x = threadIdx.x; x < tensor.size(2); x += blockDim.x*UNRL) {
+#pragma unroll
+      for (int u = 0; u < UNRL; u++)
+        tmp[u] = op(batch, plane, min((int)tensor.size(2)-1, (int)(x+u*blockDim.x)));
+#pragma unroll
+      for (int u = 0; u < UNRL; u++)
+        if (x+u*blockDim.x < tensor.size(2))
+          sum += tmp[u];
+    }
+#else
     for (int x = threadIdx.x; x < tensor.size(2); x += blockDim.x) {
       sum += op(batch, plane, x);
     }
+#endif
   }
   __shared__ scalar_t shared[C10_WARP_SIZE];
   SumReduceOp<scalar_t> reduce_op;
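On ROCm the per-thread reduction now walks the plane in strides of blockDim.x*UNRL, issuing four clamped loads before any bounds-dependent work: lanes that would run past the end re-read the last element and are then masked out of the accumulation. A host-side sketch of the same access pattern (the block width is a stand-in value for the demo):

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  constexpr int UNRL = 4;               // same unroll factor as the kernel
  const int blockDimX = 64;             // hypothetical block width
  std::vector<float> data(1000, 1.0f);  // stand-in for tensor[batch][plane][*]
  const int n = static_cast<int>(data.size());

  float sum = 0.0f;
  for (int t = 0; t < blockDimX; ++t) {             // simulate threadIdx.x
    for (int x = t; x < n; x += blockDimX * UNRL) {
      float tmp[UNRL];
      for (int u = 0; u < UNRL; ++u)                // clamped, branch-free loads
        tmp[u] = data[std::min(n - 1, x + u * blockDimX)];
      for (int u = 0; u < UNRL; ++u)                // mask out-of-range lanes
        if (x + u * blockDimX < n)
          sum += tmp[u];
    }
  }
  std::printf("sum = %.0f\n", sum);  // 1000, same result as the plain loop
  return 0;
}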
@@ -292,13 +306,30 @@ __global__ void batch_norm_collect_statistics_kernel(
   stat_accscalar_t var_n = 0;
   int n = 0;
   for (int batch = threadIdx.y; batch < input.size(0); batch += blockDim.y) {
+#if defined(USE_ROCM)
+    constexpr int UNRL = 4;
+    stat_accscalar_t v_[UNRL];
+    for (int x = threadIdx.x; x < input.size(2); x += blockDim.x*UNRL) {
+      for (int u = 0; u < UNRL; u++)
+        v_[u] = input[batch][plane][min(x+u*blockDim.x, input.size(2)-1)];
+      for (int u = 0; u < UNRL; u++) {
+        if (x+u*blockDim.x < input.size(2)) {
+          stat_accscalar_t d1 = v_[u] - avg;
+          n++;
+          avg += d1 / n;
+          var_n += d1 * (v_[u] - avg);
+        }
+      }
+    }
+#else
     for (int x = threadIdx.x; x < input.size(2); x += blockDim.x) {
       stat_accscalar_t v = input[batch][plane][x];
       stat_accscalar_t d1 = v - avg;
       n++;
       avg += d1 / n;
       var_n += d1 * (v - avg);
     }
+#endif
   }
 
   // first warpSum to get one value per thread to
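The statistics loop is Welford's online mean/variance update: avg is the running mean and var_n the running sum of squared deviations (n times the biased variance), computed in a single pass; the ROCm branch batches the loads but keeps the update order intact. A minimal host-side check of the update rule:

#include <cstdio>
#include <vector>

int main() {
  const std::vector<double> v = {1.0, 2.0, 4.0, 8.0};
  double avg = 0.0, var_n = 0.0;
  int n = 0;
  for (double x : v) {
    double d1 = x - avg;
    ++n;
    avg += d1 / n;
    var_n += d1 * (x - avg);  // uses the *updated* avg, as in the kernel
  }
  // Expect mean = 3.75 and biased variance var_n / n = 7.1875.
  std::printf("mean=%.4f var=%.4f\n", avg, var_n / n);
  return 0;
}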

aten/src/ATen/native/cuda/UpSampleBilinear2d.cu

Lines changed: 101 additions & 2 deletions

@@ -127,6 +127,29 @@ __global__ void upsample_bilinear2d_nhwc_out_frame(
   }
 }
 
+#ifdef USE_ROCM
+// Helper function to compute output pixel range that can contribute to input pixel
+template <typename accscalar_t>
+__device__ __forceinline__ void compute_output_range(
+    int input_pos,
+    accscalar_t scale,
+    int output_size,
+    bool align_corners,
+    int& min_output,
+    int& max_output) {
+  accscalar_t lo, hi;
+  if (align_corners) {
+    lo = static_cast<accscalar_t>(input_pos - 1) / scale;
+    hi = static_cast<accscalar_t>(input_pos + 1) / scale;
+  } else {
+    lo = (input_pos - static_cast<accscalar_t>(0.5)) / scale - static_cast<accscalar_t>(0.5);
+    hi = (input_pos + static_cast<accscalar_t>(1.5)) / scale - static_cast<accscalar_t>(0.5);
+  }
+  min_output = max(0, static_cast<int>(std::ceil(lo)));
+  max_output = min(output_size - 1, static_cast<int>(std::floor(hi)));
+}
+#endif
+
 // Backward (adjoint) operation 1 <- 2 (accumulates)
 template <typename scalar_t, typename accscalar_t>
 C10_LAUNCH_BOUNDS_1(1024)
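The lo/hi bounds invert the forward source-index mapping. With align_corners=false, output pixel h2 samples source position h1r = scale*(h2 + 0.5) - 0.5 and bilinear interpolation reads rows floor(h1r) and floor(h1r)+1, so input row h1 can only receive gradient when h1 - 1 <= h1r <= h1 + 1; solving for h2 yields exactly the (h1 - 0.5)/scale - 0.5 and (h1 + 1.5)/scale - 0.5 bounds above, and the align_corners=true case inverts h1r = scale*h2 the same way. The range is a conservative superset: the weight checks in the kernel below discard output pixels whose interpolation footprint does not actually touch the input pixel.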
@@ -141,8 +164,74 @@ __global__ void upsample_bilinear2d_backward_out_frame(
     const bool align_corners,
     scalar_t* __restrict__ idata,
     const scalar_t* __restrict__ odata) {
-  const size_t o_numel = nc * width2 * height2;
+  // In C++, integer multiplication, like in standard arithmetic, is generally commutative.
   const size_t i_numel = nc * width1 * height1;
+#ifdef USE_ROCM
+  for (size_t index = blockDim.x * blockIdx.x + threadIdx.x; index < i_numel;
+       index += blockDim.x * gridDim.x) {
+    // Decode input pixel coordinates
+    size_t index_temp = index;
+    const int w1 = index_temp % width1;
+    index_temp /= width1;
+    const int h1 = index_temp % height1;
+    const size_t nc_idx = index_temp / height1;
+
+    accscalar_t grad_sum = 0;
+
+    // Find range of output pixels that could interpolate from this input pixel
+    int h2_min, h2_max, w2_min, w2_max;
+    compute_output_range<accscalar_t>(h1, rheight, height2, align_corners, h2_min, h2_max);
+    compute_output_range<accscalar_t>(w1, rwidth, width2, align_corners, w2_min, w2_max);
+
+    // Iterate over potential output pixels
+    for (int h2 = h2_min; h2 <= h2_max; h2++) {
+      for (int w2 = w2_min; w2 <= w2_max; w2++) {
+        // Compute source coordinates for this output pixel
+        const accscalar_t h1r = area_pixel_compute_source_index<accscalar_t>(
+            rheight, h2, align_corners, /*cubic=*/false);
+        const int h1_base = (int)h1r;
+        const int h1p = (h1_base < height1 - 1) ? 1 : 0;
+        const accscalar_t h1lambda = h1r - h1_base;
+        const accscalar_t h0lambda = static_cast<accscalar_t>(1) - h1lambda;
+
+        const accscalar_t w1r = area_pixel_compute_source_index<accscalar_t>(
+            rwidth, w2, align_corners, /*cubic=*/false);
+        const int w1_base = (int)w1r;
+        const int w1p = (w1_base < width1 - 1) ? 1 : 0;
+        const accscalar_t w1lambda = w1r - w1_base;
+        const accscalar_t w0lambda = static_cast<accscalar_t>(1) - w1lambda;
+
+        // Check if our input pixel participates in this interpolation and accumulate all weights
+        // At boundaries, h1p=0 or w1p=0 causes some sampling positions to collapse
+        // to the same pixel, so we need to accumulate weights from all matching positions
+        accscalar_t weight = 0;
+
+        // Check all four interpolation positions and accumulate weights
+        if (h1 == h1_base && w1 == w1_base) {
+          weight += h0lambda * w0lambda; // top-left
+        }
+        if (h1 == h1_base && w1 == w1_base + w1p) {
+          weight += h0lambda * w1lambda; // top-right (may be same as top-left if w1p=0)
+        }
+        if (h1 == h1_base + h1p && w1 == w1_base) {
+          weight += h1lambda * w0lambda; // bottom-left (may be same as top-left if h1p=0)
+        }
+        if (h1 == h1_base + h1p && w1 == w1_base + w1p) {
+          weight += h1lambda * w1lambda; // bottom-right (may collapse to other positions)
+        }
+
+        if (weight > 0) {
+          const size_t output_idx = nc_idx * height2 * width2 + h2 * width2 + w2;
+          grad_sum += weight * static_cast<accscalar_t>(odata[output_idx]);
+        }
+      }
+    }
+
+    // Write accumulated gradient (no atomics needed)
+    idata[index] = static_cast<scalar_t>(grad_sum);
+  }
+#else
+  const size_t o_numel = nc * width2 * height2;
   for (size_t index = blockDim.x * blockIdx.x + threadIdx.x; index < o_numel;
        index += blockDim.x * gridDim.x) {
     size_t index_temp = index;
@@ -191,6 +280,7 @@
         static_cast<scalar_t>(h1lambda * w1lambda * d2val),
         true);
   }
+#endif
 }
 
 template <typename scalar_t, typename accscalar_t>
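Net effect of these two hunks: the parallelization axis flips on ROCm. The stock path (now the #else branch) assigns one thread per output element and scatters each of its four weighted contributions into idata through atomic accumulation (the truncated "..., true);" call in the context above is the tail of one such invocation), whereas the ROCm path assigns one thread per input element and gathers every contribution for the pixel it owns, so each idata element is written exactly once and no atomics are needed.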
@@ -387,7 +477,6 @@ static void upsample_bilinear2d_backward_out_cuda_template(
   // threads are not covering the whole input tensor.
   grad_input.zero_();
 
-  const size_t num_kernels = nbatch * channels * output_height * output_width;
   const int num_threads = std::min(
       at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024);
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@@ -397,6 +486,12 @@
     return;
   }
 
+#ifdef USE_ROCM
+  constexpr bool use_input = true;
+#else
+  constexpr bool use_input = false;
+#endif
+
   AT_DISPATCH_FLOATING_TYPES_AND2(
       at::ScalarType::Half, at::ScalarType::BFloat16,
       grad_output_.scalar_type(), "upsample_bilinear2d_backward_out_frame", [&] {
@@ -414,6 +509,8 @@
       const accscalar_t rwidth = area_pixel_compute_scale<accscalar_t>(
           input_width, output_width, align_corners, scales_w);
 
+      const size_t num_kernels = nbatch * channels * output_height * output_width;
+
       upsample_bilinear2d_backward_nhwc_out_frame<scalar_t, accscalar_t>
           <<<ceil_div(num_kernels, static_cast<size_t>(num_threads)), num_threads, 0, stream>>>(
               input_height,
@@ -444,6 +541,8 @@
      const accscalar_t rwidth = area_pixel_compute_scale<accscalar_t>(
          input_width, output_width, align_corners, scales_w);
 
+      const size_t num_kernels = nbatch * channels * (use_input ? input_height * input_width : output_height * output_width);
+
      upsample_bilinear2d_backward_out_frame<scalar_t, accscalar_t>
          <<<ceil_div(num_kernels, static_cast<size_t>(num_threads)),
              num_threads,
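Moving num_kernels into each dispatch branch follows from the kernel rewrite: the NHWC launch still sizes its grid by output elements, while the generic launch sizes it by use_input — one thread per input element on ROCm, matching the gather loop over i_numel, and one per output element elsewhere.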

cmake/Modules/FindOpenBLAS.cmake

Lines changed: 7 additions & 2 deletions

@@ -29,10 +29,15 @@ SET(Open_BLAS_LIB_SEARCH_PATHS
   $ENV{OpenBLAS}/lib
   $ENV{OpenBLAS_HOME}
   $ENV{OpenBLAS_HOME}/lib
- )
+)
+
+SET(Open_BLAS_LIB_NAME openblas)
+IF(DEFINED ENV{OpenBLAS_LIB_NAME})
+  SET(Open_BLAS_LIB_NAME $ENV{OpenBLAS_LIB_NAME})
+ENDIF()
 
 FIND_PATH(OpenBLAS_INCLUDE_DIR NAMES cblas.h PATHS ${Open_BLAS_INCLUDE_SEARCH_PATHS})
-FIND_LIBRARY(OpenBLAS_LIB NAMES openblas PATHS ${Open_BLAS_LIB_SEARCH_PATHS})
+FIND_LIBRARY(OpenBLAS_LIB NAMES ${Open_BLAS_LIB_NAME} PATHS ${Open_BLAS_LIB_SEARCH_PATHS})
 
 SET(OpenBLAS_FOUND ON)
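With this change the OpenBLAS library base name is overridable from the environment; for example (hypothetical invocation), configuring with OpenBLAS_LIB_NAME=openblas64 set would make FIND_LIBRARY look for libopenblas64 instead of libopenblas, as is common for ILP64 builds installed under a suffixed name. With the variable unset, behavior is unchanged.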
