
Commit 46d9347

Merge branch 'pytorch:main' into temp-ppc64le-wheel-branch-v8
2 parents: 26e9e6f + 103bf64


83 files changed, +2353 -853 lines changed


.ci/pytorch/test.sh

Lines changed: 2 additions & 1 deletion
@@ -1173,8 +1173,9 @@ build_xla() {
   apply_patches
   SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
   # These functions are defined in .circleci/common.sh in pytorch/xla repo
-  retry install_deps_pytorch_xla $XLA_DIR $USE_CACHE
+  retry install_pre_deps_pytorch_xla $XLA_DIR $USE_CACHE
   CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch:${CMAKE_PREFIX_PATH}" XLA_SANDBOX_BUILD=1 build_torch_xla $XLA_DIR
+  retry install_post_deps_pytorch_xla
   assert_git_not_dirty
 }

.github/ci_commit_pins/xla.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-b2b890e962f5fb6f481e5da2eb4a43bb990d0f1b
+760675ad9aa8e7202d4f9f51fe862e8a9bedb713

aten/src/ATen/DLConvertor.cpp

Lines changed: 3 additions & 0 deletions
@@ -71,6 +71,9 @@ DLDataType getDLDataType(const Tensor& t) {
     case ScalarType::Float8_e8m0fnu:
       TORCH_CHECK(false, "float8 types are not supported by dlpack");
       break;
+    case ScalarType::Float4_e2m1fn_x2:
+      TORCH_CHECK(false, "float4 types are not supported by dlpack");
+      break;
     case ScalarType::QInt8:
     case ScalarType::QUInt8:
     case ScalarType::QInt32:

aten/src/ATen/cuda/CUDABlas.cpp

Lines changed: 17 additions & 8 deletions
@@ -1552,6 +1552,8 @@ void scaled_gemm(
     ScalarType result_dtype,
     bool use_fast_accum,
     bool use_rowwise) {
+  // Note: see `cublasCommonArgs` for various non-intuitive manupulations
+  // of input arguments to this function.
 #if CUDA_VERSION >= 11080 || defined(USE_ROCM)
   const auto computeType = CUBLAS_COMPUTE_32F;
   const auto scaleType = CUDA_R_32F;
@@ -1570,7 +1572,7 @@ void scaled_gemm(
 #else
   // rowwise isn't supported using cublaslt or older hipblaslt
   TORCH_INTERNAL_ASSERT(use_rowwise == false, "rowwise scaled_gemm not supported with blaslt");
-#endif
+#endif // if defined(USE_ROCM) && defined(HIPBLASLT_VEC_EXT)
   computeDesc.setAttribute(matmulDescA, mat1_scale_ptr);
   computeDesc.setAttribute(matmulDescB, mat2_scale_ptr);
   if (result_scale_ptr != nullptr) {
@@ -1583,19 +1585,19 @@ void scaled_gemm(
         at::cuda::getCurrentDeviceProperties()->multiProcessorCount -
         at::globalContext()._SMCarveout_EXPERIMENTAL().value());
   }
-#endif
+#endif // ifndef USE_ROCM
 #ifndef USE_ROCM
   const int8_t fastAccuMode = use_fast_accum ? 1 : 0;
   computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_FAST_ACCUM, fastAccuMode);
-#endif
+#endif // ifndef USE_ROCM
   CuBlasLtMatrixLayout Adesc(ScalarTypeToCudaDataType(mat1_dtype), m, k, mat1_ld, transa == 't');
   CuBlasLtMatrixLayout Bdesc(ScalarTypeToCudaDataType(mat2_dtype), k, n, mat2_ld, transb == 't');
 #ifdef USE_ROCM
   // Cdesc is unused, beta is 0. But hipblaslt needs this set to something reasonable.
   CuBlasLtMatrixLayout Cdesc(ScalarTypeToCudaDataType(result_dtype), m, n, result_ld);
 #else
   CuBlasLtMatrixLayout Cdesc(ScalarTypeToCudaDataType(bias_dtype), m, n, result_ld);
-#endif
+#endif // ifdef USE_ROCM
   CuBlasLtMatrixLayout Ddesc(ScalarTypeToCudaDataType(result_dtype), m, n, result_ld);
   if (bias_ptr) {
     computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_POINTER, bias_ptr);
@@ -1609,7 +1611,14 @@ void scaled_gemm(
     computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0);
 #else
     TORCH_CHECK(false, "scaled_gemm with `torch.float8_e8m0fnu` scales is only supported for CUDA 12.8 and above");
-#endif // CUDA_VERSION >= 12080
+#endif // if CUDA_VERSION >= 12080
+  } else if (mat1_scale_dtype == kFloat8_e4m3fn && mat2_scale_dtype == kFloat8_e4m3fn) {
+#if CUDA_VERSION >= 12080
+    computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3);
+    computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3);
+#else
+    TORCH_CHECK(false, "scaled_gemm with `torch.float8_e4m3fn` scales is only supported for CUDA 12.8 and above");
+#endif // if CUDA_VERSION >= 12080
   }

   auto stream = c10::cuda::getCurrentCUDAStream();
@@ -1677,7 +1686,7 @@ void scaled_gemm(
       }
     }
     TORCH_CHECK(found, "could not find valid hipblaslt solution");
-#endif
+#endif // ifndef USE_ROCM
   }
   cublasStatus_t cublasStatus = cublasLtMatmul(
       ltHandle,
@@ -1692,7 +1701,7 @@ void scaled_gemm(
       result_ptr, // unused, since beta_val is 0, but hipblaslt can't handle nullptr
 #else
       nullptr,
-#endif
+#endif // ifdef USE_ROCM
       Cdesc.descriptor(),
       result_ptr,
       Ddesc.descriptor(),
@@ -1725,7 +1734,7 @@ void scaled_gemm(
       " scaleType ",
       scaleType);
   return;
-#endif // CUDA_VERSION >= 11080 || defined(USE_ROCM)
+#endif // if CUDA_VERSION >= 11080 || defined(USE_ROCM)
   TORCH_CHECK(false, "scaled_gemm is only supported for CUDA 11.8 and above");
 }
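
Editor's note: the new else-if branch above wires nvfp4 scales into cuBLASLt: e8m0 scales select the mxfp8 recipe (one scale per 32 elements, VEC32_UE8M0), while e4m3 scales select the nvfp4 recipe (one scale per 16 unpacked elements, VEC16_UE4M3). Below is a minimal standalone sketch of that dispatch; the enum and struct names are made up for illustration and are not PyTorch or cuBLASLt types.

// Standalone sketch of the scale-mode dispatch above. The enum and struct
// are illustrative stand-ins, not PyTorch or cuBLASLt types.
#include <iostream>
#include <stdexcept>

enum class ScaleDtype { E8M0, E4M3FN };

struct BlockScaleRecipe {
  const char* name;     // human-readable recipe name
  int elems_per_scale;  // unpacked elements covered by one scale value
};

// Mirrors the branch in scaled_gemm: e8m0 scales -> mxfp8 recipe (1 scale per
// 32 elements), e4m3 scales -> nvfp4 recipe (1 scale per 16 unpacked elements).
BlockScaleRecipe select_recipe(ScaleDtype a_scale, ScaleDtype b_scale) {
  if (a_scale != b_scale) {
    throw std::invalid_argument("matching scale dtypes expected");
  }
  if (a_scale == ScaleDtype::E8M0) {
    return {"mxfp8 (VEC32_UE8M0)", 32};
  }
  return {"nvfp4 (VEC16_UE4M3)", 16};
}

int main() {
  auto r = select_recipe(ScaleDtype::E4M3FN, ScaleDtype::E4M3FN);
  std::cout << r.name << ": 1 scale per " << r.elems_per_scale << " elements\n";
}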

aten/src/ATen/cuda/CUDADataType.h

Lines changed: 4 additions & 0 deletions
@@ -89,6 +89,10 @@ inline cudaDataType ScalarTypeToCudaDataType(const c10::ScalarType& scalar_type)
       return HIP_R_8F_E4M3_FNUZ;
     case c10::ScalarType::Float8_e5m2fnuz:
       return HIP_R_8F_E5M2_FNUZ;
+#endif
+#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12080)
+    case c10::ScalarType::Float4_e2m1fn_x2:
+      return CUDA_R_4F_E2M1;
 #endif
     default:
       TORCH_INTERNAL_ASSERT(false, "Cannot convert ScalarType ", scalar_type, " to cudaDataType.")

aten/src/ATen/cuda/detail/LazyNVRTC.cpp

Lines changed: 2 additions & 0 deletions
@@ -158,6 +158,8 @@ NVRTC_STUB3(nvrtcGetLoweredName, nvrtcProgram, const char *, const char **)

 CUDA_STUB2(cuModuleLoad, CUmodule*, const char*)
 CUDA_STUB2(cuModuleLoadData, CUmodule *, const void *)
+CUDA_STUB2(cuFuncSetCacheConfig, CUfunction, CUfunc_cache_enum)
+CUDA_STUB3(cuDeviceGetAttribute, int*, CUdevice_attribute_enum, CUdevice)
 CUDA_STUB3(cuModuleGetFunction, CUfunction *, CUmodule, const char *)
 CUDA_STUB4(cuOccupancyMaxActiveBlocksPerMultiprocessor, int *, CUfunction, int, size_t)
 CUDA_STUB2(cuGetErrorString, CUresult, const char **)

aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h

Lines changed: 3 additions & 0 deletions
@@ -62,6 +62,9 @@ namespace at::cuda {
   _(cuFuncSetAttribute) \
   _(cuFuncGetAttribute) \
   _(cuPointerGetAttribute) \
+  _(cuFuncSetCacheConfig) \
+  _(cuDeviceGetAttribute) \
+

 #if defined(CUDA_VERSION) && CUDA_VERSION >= 12000
 #define AT_FORALL_NVRTC_EXTENDED(_) \
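
Editor's note: these two entries extend the NVRTC/driver X-macro list so cuFuncSetCacheConfig and cuDeviceGetAttribute are lazily stubbed like the other driver symbols. Below is a small self-contained sketch of the general X-macro pattern these headers rely on; the macro and symbol names are made up and are not the real AT_FORALL_NVRTC or CUDA_STUB* definitions.

// Illustrative sketch of the X-macro pattern: one list of names, expanded
// into whatever declarations or tables are needed. Names are invented.
#include <iostream>

#define FORALL_DEMO_SYMBOLS(_) \
  _(open_device)               \
  _(set_cache_config)          \
  _(get_attribute)

// Expansion 1: define one stub function per listed symbol.
#define DECLARE_STUB(name) void name() { std::cout << "called " #name "\n"; }
FORALL_DEMO_SYMBOLS(DECLARE_STUB)
#undef DECLARE_STUB

// Expansion 2: build a table of the same names, e.g. for lazy lookup.
#define NAME_ENTRY(name) #name,
const char* const kSymbolNames[] = {FORALL_DEMO_SYMBOLS(NAME_ENTRY)};
#undef NAME_ENTRY

int main() {
  set_cache_config();  // one of the generated stubs
  for (const char* n : kSymbolNames) std::cout << n << "\n";
}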

aten/src/ATen/native/cuda/Blas.cpp

Lines changed: 30 additions & 8 deletions
@@ -121,6 +121,9 @@ c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, b
  *
  * The transpose flags are derived from the layouts of the passed in tensors
  *
+ * If the operands are in packed float4 format, `k`, `lda` and `ldb` are adjusted
+ * to their unpacked values to match what cuBLAS expects.
+ *
  * @param mat1 First input matrix
  * @param mat2 Second input matrix
  * @param c Output matrix (result)
@@ -173,6 +176,14 @@ struct cublasCommonArgs {
     result_ld = result->stride(transpose_result ? 0 : 1);
     transa = transpose_a ? mata->is_conj() ? 'c' : 't' : 'n';
     transb = transpose_b ? matb->is_conj() ? 'c' : 't' : 'n';
+
+    // cuBLAS expects unpacked values of `k`, `lda` and `ldb`, adjust for 4x2 packing
+    // if the gemm operands are in packed float4
+    if (mat1.dtype() == at::kFloat4_e2m1fn_x2 && mat2.dtype() == at::kFloat4_e2m1fn_x2) {
+      k = k * 2;
+      lda = lda * 2;
+      ldb = ldb * 2;
+    }
   }

   // Matrix members
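
Editor's note: Float4_e2m1fn_x2 packs two 4-bit values per stored element, so the packed k, lda and ldb must be doubled before they are handed to cuBLAS. A standalone sketch of that adjustment follows, with illustrative names rather than the actual cublasCommonArgs members.

// Sketch of the packed-fp4 adjustment above: two 4-bit elements per stored
// element, so packed k/lda/ldb are doubled for cuBLAS. Names are illustrative.
#include <cstdint>
#include <iostream>

struct GemmDims {
  int64_t m, n, k;   // logical GEMM dimensions as seen by the backend
  int64_t lda, ldb;  // leading dimensions of the A and B operands
};

GemmDims adjust_for_fp4x2_packing(GemmDims packed, bool operands_are_fp4x2) {
  if (operands_are_fp4x2) {
    packed.k   *= 2;  // two unpacked elements per stored element
    packed.lda *= 2;
    packed.ldb *= 2;
  }
  return packed;
}

int main() {
  // Packed k = 64 means cuBLAS should see an unpacked k of 128.
  GemmDims d = adjust_for_fp4x2_packing({128, 128, 64, 64, 64}, /*operands_are_fp4x2=*/true);
  std::cout << "unpacked k=" << d.k << " lda=" << d.lda << " ldb=" << d.ldb << "\n";
}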
@@ -980,7 +991,7 @@ enum class ScalingType : std::uint8_t {
  * ---------------------------
  * Conditions and corresponding Scaling Types:
  *
- * - If scale tensors are Float8_e8m0fnu:
+ * - If scale tensors are both `Float8_e8m0fnu` or `Float8_e4m3fn`:
  *   - Returns BlockWise (with additional size checks).
  *
  * - If scale_a.numel() == 1 && scale_b.numel() == 1:
@@ -1001,14 +1012,22 @@ ScalingType get_scaling_type(
     int64_t dim_m,
     int64_t dim_k,
     int64_t dim_n) {
-  // Check for BlockWise scaling (FP8_E8M0 types)
-  if (scale_a.scalar_type() == scale_b.scalar_type() &&
-      scale_a.scalar_type() == at::kFloat8_e8m0fnu) {
-    constexpr int64_t BLOCK_SIZE_K = 32;
+  // Check for BlockWise scaling (FP8_E8M0 and FP8_E4M3 types)
+  if ((scale_a.scalar_type() == scale_b.scalar_type()) &&
+      ((scale_a.scalar_type() == at::kFloat8_e8m0fnu) || (scale_a.scalar_type() == at::kFloat8_e4m3fn))) {
+    const bool is_nvfp4 = scale_a.scalar_type() == at::kFloat8_e4m3fn;
+
+    // cuBLAS's mxfp8 gemm: block_size is 1 scale per 32 elements
+    // cuBLAS's nvfp4 gemm: block_size is 1 scale per 16 unpacked elements.
+    const auto BLOCK_SIZE_K = is_nvfp4 ? 16 : 32;
+
     constexpr int64_t BLOCK_SIZE_MN = 128;

+    // adjust for fp4x2 packing if necessary
+    const auto dim_k_unpacked = is_nvfp4 ? dim_k * 2 : dim_k;
+
     auto ceil_div = [](auto a, auto b) { return (a + b - 1) / b; };
-    auto num_k_blocks = ceil_div(dim_k, BLOCK_SIZE_K);
+    auto num_k_blocks = ceil_div(dim_k_unpacked, BLOCK_SIZE_K);
     auto padded_num_k_blocks = ceil_div(num_k_blocks, 4) * 4;

     // TODO: We might want to enforce some structure on the shapes of the scale
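
Editor's note: the block-wise path above counts K-blocks against the recipe's block size (16 unpacked elements for nvfp4, 32 for mxfp8) and pads the block count to a multiple of 4. A small standalone sketch of that arithmetic follows, as an illustrative helper rather than the real get_scaling_type.

// Sketch of the K-block arithmetic in the hunk above. Illustrative only.
#include <cstdint>
#include <iostream>

int64_t ceil_div(int64_t a, int64_t b) { return (a + b - 1) / b; }

int64_t padded_k_blocks(int64_t dim_k, bool is_nvfp4) {
  // fp4 operands are packed two-per-element, so unpack K before counting blocks.
  const int64_t dim_k_unpacked = is_nvfp4 ? dim_k * 2 : dim_k;
  const int64_t block_size_k = is_nvfp4 ? 16 : 32;
  const int64_t num_k_blocks = ceil_div(dim_k_unpacked, block_size_k);
  return ceil_div(num_k_blocks, 4) * 4;  // pad block count to a multiple of 4
}

int main() {
  // Packed K = 96: nvfp4 sees 192 unpacked elements -> 12 blocks of 16.
  std::cout << "nvfp4: " << padded_k_blocks(96, true) << " K-blocks\n";
  // mxfp8 with K = 96 -> 3 blocks of 32, padded up to 4.
  std::cout << "mxfp8: " << padded_k_blocks(96, false) << " K-blocks\n";
}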
@@ -1149,13 +1168,16 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
       mat2.sizes()[1], ") must be divisible by 16");
   // Check types
   TORCH_CHECK(!out_dtype || *out_dtype == out.scalar_type(), "out_dtype must match output matrix type");
-  TORCH_CHECK(isFloat8Type(mat1.scalar_type()), "Expected mat1 to be Float8 matrix got ", mat1.scalar_type());
-  TORCH_CHECK(isFloat8Type(mat2.scalar_type()), "Expected mat2 to be Float8 matrix got ", mat2.scalar_type());
+  TORCH_CHECK(isFloat8Type(mat1.scalar_type()) || mat1.scalar_type() == ScalarType::Float4_e2m1fn_x2, "Expected mat1 to be Float8 or Float4_x2 matrix got ", mat1.scalar_type());
+  TORCH_CHECK(isFloat8Type(mat2.scalar_type()) || mat2.scalar_type() == ScalarType::Float4_e2m1fn_x2, "Expected mat2 to be Float8 or Float4_x2 matrix got ", mat2.scalar_type());
 #ifndef USE_ROCM
   // Type restrictions imposed by CuBLASLt as of CUDA-12.1
   TORCH_CHECK(mat1.scalar_type() != ScalarType::Float8_e5m2 || mat2.scalar_type() != ScalarType::Float8_e5m2,
         "Multiplication of two Float8_e5m2 matrices is not supported");
 #endif
+  if (use_fast_accum) {
+    TORCH_CHECK(mat1.scalar_type() != ScalarType::Float4_e2m1fn_x2 && mat2.scalar_type() != ScalarType::Float4_e2m1fn_x2, "`use_fast_accum` is not supported when `mat1` or `mat2` tensors have the `Float4_e2m1fn_x2` dtype.");
+  }
   if (bias) {
     TORCH_CHECK(out.scalar_type() != kFloat, "Bias is not supported when out_dtype is set to Float32");
     TORCH_CHECK(bias->scalar_type() == ScalarType::BFloat16 || bias->scalar_type() == ScalarType::Half,
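
Editor's note: with this hunk, _scaled_mm_out_cuda accepts float8 or packed float4 operands but rejects use_fast_accum whenever either operand is Float4_e2m1fn_x2. A compact sketch of that acceptance logic follows, using a stand-in enum instead of c10::ScalarType; only the logic mirrors the hunk.

// Sketch of the operand checks added above. The enum is a stand-in.
#include <iostream>

enum class Dtype { Float8_e4m3fn, Float8_e5m2, Float4_e2m1fn_x2, BFloat16 };

bool is_float8(Dtype t) {
  return t == Dtype::Float8_e4m3fn || t == Dtype::Float8_e5m2;
}

bool operands_ok(Dtype mat1, Dtype mat2, bool use_fast_accum) {
  const bool dtypes_ok =
      (is_float8(mat1) || mat1 == Dtype::Float4_e2m1fn_x2) &&
      (is_float8(mat2) || mat2 == Dtype::Float4_e2m1fn_x2);
  const bool fast_accum_ok =
      !use_fast_accum ||
      (mat1 != Dtype::Float4_e2m1fn_x2 && mat2 != Dtype::Float4_e2m1fn_x2);
  return dtypes_ok && fast_accum_ok;
}

int main() {
  std::cout << operands_ok(Dtype::Float4_e2m1fn_x2, Dtype::Float4_e2m1fn_x2, false)  // 1
            << operands_ok(Dtype::Float4_e2m1fn_x2, Dtype::Float4_e2m1fn_x2, true)   // 0
            << "\n";
}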

aten/src/ATen/native/cuda/Shape.cu

Lines changed: 5 additions & 2 deletions
@@ -507,7 +507,8 @@ TORCH_IMPL_FUNC(cat_out_cuda)
           kBool,
           kBFloat16,
           AT_EXPAND(AT_FLOAT8_TYPES),
-          AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES));
+          AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES),
+          kFloat4_e2m1fn_x2);
     }
   } else if (materialized.size() > 1 &&
       result.dim() <= CAT_ARRAY_MAX_INPUT_DIMS &&
@@ -542,7 +543,9 @@ TORCH_IMPL_FUNC(cat_out_cuda)
           kFloat8_e4m3fnuz,
           kFloat8_e5m2,
           kFloat8_e5m2fnuz,
-          AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES));
+          AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES),
+          // TODO(#146647): extend this to other shell dtypes
+          kFloat4_e2m1fn_x2);
     }
   } else {
     int64_t offset = 0;

aten/src/ATen/native/sparse/cuda/SparseSemiStructuredLinear.cu

Lines changed: 1 addition & 7 deletions
@@ -75,12 +75,6 @@ Tensor two_four_sgemm(
     using LayoutC = LayoutOutput;
     constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;

-    using BiasTileThreadMap = cutlass::epilogue::threadblock::OutputTileThreadLayout<
-        ThreadblockShape,
-        WarpShape,
-        ElementC,
-        AlignmentC,
-        NumEVTEpilogueStages>;
     using OutputTileThreadMap = cutlass::epilogue::threadblock::OutputTileThreadLayout<
         ThreadblockShape,
         WarpShape,
@@ -94,7 +88,7 @@ Tensor two_four_sgemm(
         cutlass::epilogue::threadblock::VisitorScalarBroadcast<ElementC>;
     using BiasTensor =
         cutlass::epilogue::threadblock::VisitorColBroadcast<
-            BiasTileThreadMap,
+            OutputTileThreadMap,
             ElementC,
             cute::Stride<cute::_1, cute::_0, int64_t>>;
     using Bias = std::conditional_t<use_bias, BiasTensor, BiasScalar>;
