
Commit b290136: Fix merge conflicts

1 parent b483030 commit b290136

File tree

17 files changed: +7 -1671 lines changed


.ci/docker/build.sh

Lines changed: 1 addition & 13 deletions

@@ -288,7 +288,7 @@ case "$tag" in
     ;;
   *)
     # Catch-all for builds that are not hardcoded.
-    PROTOBUF=yes
+    PROTOBUF=yes
     VISION=yes
     echo "image '$image' did not match an existing build configuration"
     if [[ "$image" == *py* ]]; then
@@ -460,15 +460,3 @@ elif [ "$HAS_TRITON" = "yes" ]; then
   echo "expecting triton to not be installed, but it is"
   exit 0
 fi
-<<<<<<< HEAD
-
-# Sanity check cmake version. Executorch reinstalls cmake and I'm not sure if
-# they support 4.0.0 yet, so exclude them from this check.
-CMAKE_VERSION=$(drun cmake --version)
-if [[ "$EXECUTORCH" != *yes* && "$CMAKE_VERSION" != *4.* ]]; then
-  echo "CMake version is not 4.0.0:"
-  drun cmake --version
-  exit 0
-fi
-=======
->>>>>>> upstream/main
Lines changed: 1 addition & 5 deletions

@@ -1,5 +1 @@
-<<<<<<< HEAD
-d704bc6e69c1a588c8edd3cbb67505d554ed65f6
-=======
-7416ffcb92cdbe98d9f97e4e6f95247e46dfc9fd
->>>>>>> upstream/main
+ac80c4190aa0321f761a08af97e1e1eee41f01d9

.ci/docker/libtorch/build.sh

Lines changed: 0 additions & 4 deletions

@@ -50,15 +50,11 @@ case ${DOCKER_TAG_PREFIX} in
     BASE_TARGET=rocm
     GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
     PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-<<<<<<< HEAD
-    DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
-=======
     # add gfx950, gfx115x conditionally starting in ROCm 7.0
     if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
       PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
     fi
     DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
->>>>>>> upstream/main
     ;;
   *)
     echo "ERROR: Unrecognized DOCKER_TAG_PREFIX: ${DOCKER_TAG_PREFIX}"

CMakeLists.txt

Lines changed: 0 additions & 5 deletions

@@ -903,13 +903,8 @@ cmake_dependent_option(
   USE_FBGEMM_GENAI
   "Whether to build FBGEMM GenAI quantized GEMM kernels.\
 Will be disabled if not supported by the platform"
-<<<<<<< HEAD
-  OFF
-  "USE_CUDA OR USE_ROCM"
-=======
   ${USE_FBGEMM_GENAI_DEFAULT}
   "(USE_CUDA AND NOT MSVC) OR USE_ROCM"
->>>>>>> upstream/main
   OFF)

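Both sides of this conflict are arguments to CMake's cmake_dependent_option() command; the upstream side kept here changes the default value and the gating condition. A rough Python model of that command's documented semantics (a hypothetical sketch with made-up platform values, not PyTorch code):

```python
def cmake_dependent_option(default: bool, depends: bool, force: bool) -> bool:
    # Models cmake_dependent_option(<opt> <doc> <default> <depends> <force>):
    # the option is user-settable with value `default` while `depends` holds,
    # and is forced to `force` (here OFF) otherwise.
    return default if depends else force

# The resolved upstream arguments, with hypothetical platform values:
USE_CUDA, MSVC, USE_ROCM = True, False, False
USE_FBGEMM_GENAI_DEFAULT = True
print(cmake_dependent_option(
    USE_FBGEMM_GENAI_DEFAULT,
    (USE_CUDA and not MSVC) or USE_ROCM,  # the new <depends> expression
    False))                               # forced OFF when the condition fails
# -> True on a non-MSVC CUDA build; always False under MSVC without ROCm
```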

aten/src/ATen/native/cuda/Blas.cpp

Lines changed: 0 additions & 165 deletions

@@ -58,173 +58,8 @@

 namespace at::native {

-<<<<<<< HEAD
-namespace {
-
-// TODO: https://github.com/pytorch/pytorch/pull/59380#pullrequestreview-725310492
-c10::MaybeOwned<Tensor> inline resolve_conj_if_indicated(const Tensor& tensor, bool resolve_conj) {
-  if (resolve_conj && tensor.is_conj()) {
-    return c10::MaybeOwned<Tensor>::owned(tensor.resolve_conj());
-  } else {
-    return c10::MaybeOwned<Tensor>::borrowed(tensor);
-  }
-}
-
-c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor, bool transpose_result) {
-  if (tensor.is_non_overlapping_and_dense()) { // common case
-    transpose_tensor = tensor.is_contiguous();
-    return resolve_conj_if_indicated(tensor, transpose_result ? transpose_tensor : !transpose_tensor);
-  }
-  IntArrayRef tensor_strides = tensor.strides();
-  IntArrayRef tensor_sizes = tensor.sizes();
-  if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max<int64_t>(1, tensor_sizes[0]))) {
-    transpose_tensor = false;
-    return resolve_conj_if_indicated(tensor, !transpose_result);
-  } else if ((tensor_strides[1] == 1) && (tensor_strides[0] >= std::max<int64_t>(1, tensor_sizes[1]))) {
-    transpose_tensor = true;
-    return resolve_conj_if_indicated(tensor, transpose_result);
-  } else {
-    transpose_tensor = true;
-    return c10::MaybeOwned<Tensor>::owned(tensor.clone(at::MemoryFormat::Contiguous));
-  }
-}
-
-c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor) {
-  if (tensor.is_non_overlapping_and_dense()) { // common case
-    transpose_tensor = tensor.is_contiguous();
-    return resolve_conj_if_indicated(tensor, true);
-  }
-
-  IntArrayRef tensor_strides = tensor.strides();
-  IntArrayRef tensor_sizes = tensor.sizes();
-  if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max<int64_t>(1, tensor_sizes[0]))) {
-    transpose_tensor = false;
-    return resolve_conj_if_indicated(tensor, true);
-  } else if ((tensor_strides[1] == 1) &&
-             (tensor_strides[0] >= std::max<int64_t>(1, tensor_sizes[1]))) {
-    transpose_tensor = true;
-    return resolve_conj_if_indicated(tensor, true);
-  } else {
-    transpose_tensor = true;
-    return c10::MaybeOwned<Tensor>::owned(tensor.clone(at::MemoryFormat::Contiguous));
-  }
-}
-
-using at::cuda::blas::ScalingType;
-
-/**
- * @brief Prepares matrices for CUBLAS operation
- *
- * This constructor prepares tensors for CUBLAS
- * The main difference is that PyTorch uses row-major as the default and
- * CUBLAS expects column-major.
- *
- * @details
- * To enable row-major output while using CUBLAS,
- * we use the mathematical identity that (A × B)^T = B^T × A^T.
- *
- * Transpose in this context refers to Cublas's (Fortran) definition of transpose:
- * T = row-major, N = col-major
- *
- * Example:
- * For matrices A (M×K)(row-major) and B (K×N)(row-major):
- * - Standard multiplication: A × B = (M×K) × (K×N) = M×N result (row-major)
- * - Using our transpose trick: (B^T × A^T) = (N×K)(T) × (K×M)(T) = N×M(N)
- * - However, since the output from cublas is column-major this is
- *   equivalent to an output of size MxN row-major as expected
- *
- * The transpose flags are derived from the layouts of the passed in tensors
- *
- * If the operands are in packed float4 format, `k`, `lda` and `ldb` are adjusted
- * to their unpacked values to match what cuBLAS expects.
- *
- * @param mat1 First input matrix
- * @param mat2 Second input matrix
- * @param c Output matrix (result)
- * @param scale_a Optional scaling factor for first matrix
- * @param scale_b Optional scaling factor for second matrix
- * @param scale_result Optional scaling factor for result
- */
-struct cublasCommonArgs {
-  cublasCommonArgs(
-      const Tensor& mat1,
-      const Tensor& mat2,
-      Tensor& c,
-      const std::optional<Tensor>& scale_a = std::nullopt,
-      const std::optional<Tensor>& scale_b = std::nullopt,
-      const std::optional<Tensor>& scale_result = std::nullopt,
-      const std::optional<ScalingType>& scaling_choice_a = std::nullopt,
-      const std::optional<ScalingType>& scaling_choice_b = std::nullopt) {
-    bool transpose_result = false, transpose_a = false, transpose_b = false;
-    result = prepare_matrix_for_cublas(c, transpose_result);
-    mata = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_a, transpose_result);
-    matb = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_b, transpose_result);
-
-    // Handle scale tensors if provided
-    if (scale_a && scale_b) {
-      // By default since we return in row-major we run the gemm
-      // as B.T @ A.T, check transpose_result to determine if we flip the scales
-      scale_mata_ptr = transpose_result ? scale_b->data_ptr() : scale_a->data_ptr();
-      scale_mata_dtype = transpose_result ? scale_b->scalar_type() : scale_a->scalar_type();
-      scaling_mata_type = transpose_result ? scaling_choice_b : scaling_choice_a;
-      scale_matb_ptr = transpose_result ? scale_a->data_ptr() : scale_b->data_ptr();
-      scale_matb_dtype = transpose_result ? scale_a->scalar_type() : scale_b->scalar_type();
-      scaling_matb_type = transpose_result ? scaling_choice_a : scaling_choice_b;
-    }
-
-    if (scale_result) {
-      scale_result_ptr = scale_result->data_ptr();
-      scale_result_dtype = scale_result->scalar_type();
-    }
-
-    // Update transpose flags
-    if (transpose_result) {
-      transpose_a = !transpose_a;
-      transpose_b = !transpose_b;
-    }
-
-    auto sizes_a = mata->sizes();
-    auto sizes_b = matb->sizes();
-
-    m = sizes_a[transpose_result ? 1 : 0];
-    k = sizes_a[transpose_result ? 0 : 1];
-    n = sizes_b[transpose_result ? 0 : 1];
-    lda = mata->stride((transpose_a == transpose_result) ? 1 : 0);
-    ldb = matb->stride((transpose_b == transpose_result) ? 1 : 0);
-    result_ld = result->stride(transpose_result ? 0 : 1);
-    transa = transpose_a ? mata->is_conj() ? 'c' : 't' : 'n';
-    transb = transpose_b ? matb->is_conj() ? 'c' : 't' : 'n';
-
-    // cuBLAS expects unpacked values of `k`, `lda` and `ldb`, adjust for 4x2 packing
-    // if the gemm operands are in packed float4
-    if (mat1.dtype() == at::kFloat4_e2m1fn_x2 && mat2.dtype() == at::kFloat4_e2m1fn_x2) {
-      k = k * 2;
-      lda = lda * 2;
-      ldb = ldb * 2;
-    }
-  }
-
-  // Matrix members
-  char transa, transb;
-  int64_t m, n, k;
-  int64_t lda, ldb, result_ld;
-  c10::MaybeOwned<Tensor> mata, matb, result;
-
-  // Scale members
-  void* scale_mata_ptr = nullptr;
-  void* scale_matb_ptr = nullptr;
-  void* scale_result_ptr = nullptr;
-  std::optional<c10::ScalarType> scale_mata_dtype;
-  std::optional<ScalingType> scaling_mata_type;
-  std::optional<c10::ScalarType> scale_matb_dtype;
-  std::optional<ScalingType> scaling_matb_type;
-  std::optional<c10::ScalarType> scale_result_dtype;
-};
-} // namespace
-=======
 using at::blas::ScalingType;
 using at::blas::SwizzleType;
->>>>>>> upstream/main

 c10::MaybeOwned<Tensor> prepare_batch_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor, int64_t& ld_tensor, bool transpose_result, int64_t m, int64_t n) {
   IntArrayRef tensor_strides = tensor.strides();
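The long doc comment deleted above describes the standard row-major/column-major trick: since cuBLAS is column-major and PyTorch tensors are row-major, computing B^T × A^T and letting cuBLAS write the result column-major yields exactly the row-major A × B. A minimal NumPy sketch of that identity (illustration only, not the PyTorch implementation; A and B are arbitrary example matrices):

```python
import numpy as np

M, K, N = 3, 4, 5
A = np.arange(M * K, dtype=np.float32).reshape(M, K)  # row-major M x K
B = np.arange(K * N, dtype=np.float32).reshape(K, N)  # row-major K x N

# A column-major GEMM reinterprets each row-major buffer as its transpose,
# so asking it for B^T @ A^T produces (A @ B)^T ...
Ct = B.T @ A.T   # N x M, equal to (A @ B)^T by the identity (A B)^T = B^T A^T

# ... and an N x M result laid out column-major occupies memory exactly like
# the row-major M x N matrix A @ B that the caller wanted.
assert np.array_equal(Ct.ravel(order="F"), (A @ B).ravel(order="C"))
```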

aten/src/ATen/native/cuda/Normalization.cuh

Lines changed: 0 additions & 8 deletions

@@ -121,11 +121,7 @@ __device__ scalar_t reduce(Op op, PTA tensor, int plane) {
   for (int x = threadIdx.x; x < tensor.size(2); x += blockDim.x*UNRL) {
     #pragma unroll
     for (int u = 0; u < UNRL; u++)
-<<<<<<< HEAD
-      tmp[u] = op(batch, plane, min((int)tensor.size(2)-1, (int)(x+u*blockDim.x)));
-=======
       tmp[u] = op(batch, plane, std::min((int)tensor.size(2)-1, (int)(x+u*blockDim.x)));
->>>>>>> upstream/main
     #pragma unroll
     for (int u = 0; u < UNRL; u++)
       if (x+u*blockDim.x < tensor.size(2))
@@ -315,11 +311,7 @@ __global__ void batch_norm_collect_statistics_kernel(
   stat_accscalar_t v_[UNRL];
   for (int x = threadIdx.x; x < input.size(2); x += blockDim.x*UNRL) {
     for (int u = 0; u < UNRL; u++)
-<<<<<<< HEAD
-      v_[u] = input[batch][plane][min(x+u*blockDim.x, input.size(2)-1)];
-=======
       v_[u] = input[batch][plane][std::min(x+u*blockDim.x, input.size(2)-1)];
->>>>>>> upstream/main
     for (int u = 0; u < UNRL; u++) {
       if (x+u*blockDim.x < input.size(2)) {
         stat_accscalar_t d1 = v_[u] - avg;
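Both hunks resolve the same conflict in favor of std::min and keep the kernels' clamp-then-mask pattern: each unrolled lane reads a clamped, always-in-bounds index, and the bounds check is applied only when the value is consumed. A small Python sketch of that pattern (a hypothetical single "thread" with UNRL = 4, not the CUDA code):

```python
import numpy as np

UNRL = 4
row = np.arange(10, dtype=np.float32)   # plays the role of tensor.size(2) == 10
total = 0.0
for x in range(0, len(row), UNRL):      # one thread striding by UNRL
    # Clamped loads: min(size-1, x+u) never goes out of bounds, so the
    # load loop can be unrolled without a per-lane branch.
    tmp = [row[min(len(row) - 1, x + u)] for u in range(UNRL)]
    for u in range(UNRL):
        if x + u < len(row):            # mask out the duplicated tail reads
            total += tmp[u]
assert total == row.sum()
```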

aten/src/ATen/native/sparse/cuda/SparseMatMul.cu

Lines changed: 0 additions & 3 deletions

@@ -40,7 +40,6 @@
 #include <thrust/iterator/discard_iterator.h>


-<<<<<<< HEAD
 #if defined(__CUDACC__) && ((CUSPARSE_VERSION >= 11000) || (defined(USE_ROCM) && ROCM_VERSION >= 60300))
 #define IS_CUSPARSE11_AVAILABLE() 1
 #else
@@ -60,8 +59,6 @@
 #endif

 #if IS_CUSPARSE11_AVAILABLE()
-=======
->>>>>>> upstream/main
 #include <library_types.h>

 namespace at::native {

requirements-build.txt

Lines changed: 0 additions & 12 deletions

@@ -1,5 +1,4 @@
 # Build System requirements
-<<<<<<< HEAD
 setuptools>=70.1.0,<80.0 # setuptools develop deprecated on 80.0
 cmake>=3.31.4
 ninja==1.11.1.3
@@ -10,15 +9,4 @@ pyyaml==6.0.2
 requests==2.32.4
 six==1.17.0 # dependency chain: NNPACK -> PeachPy -> six
 typing-extensions==4.14.1
-=======
-setuptools>=70.1.0
-cmake>=3.27
-ninja
-numpy
-packaging
-pyyaml
-requests
-six # dependency chain: NNPACK -> PeachPy -> six
-typing-extensions>=4.10.0
->>>>>>> upstream/main
 pip # not technically needed, but this makes setup.py invocation work

test/dynamo/test_structured_trace.py

Lines changed: 0 additions & 4 deletions

@@ -21,11 +21,7 @@
 from torch._inductor.test_case import TestCase
 from torch._logging._internal import TorchLogsFormatter
 from torch.nn.parallel import DistributedDataParallel as DDP
-<<<<<<< HEAD
-from torch.testing._internal.common_utils import find_free_port, skipIfRocm
-=======
 from torch.testing._internal.common_utils import find_free_port, xfailIfS390X
->>>>>>> upstream/main
 from torch.testing._internal.triton_utils import requires_cuda_and_triton



test/inductor/test_cuda_repro.py

Lines changed: 0 additions & 4 deletions

@@ -39,11 +39,7 @@
     DeterministicGuard,
     freeze_rng_state,
     IS_FBCODE,
-<<<<<<< HEAD
-    skipIfRocm,
-=======
     MI350_ARCH,
->>>>>>> upstream/main
     skipIfRocmArch,
     TEST_WITH_ASAN,
     TEST_WITH_ROCM,

0 commit comments