CPU Optimizations for FP8 #2559
```diff
@@ -129,7 +129,8 @@ GemmParam CanonicalizeGemmInput(const transformer_engine::Tensor &A, const cubla
   ret.Atype = A.data.dtype;
   ret.A_scale_inv = A.scale_inv.dptr;
   ret.lda = is_A_transposed ? k : m;
-  if (!nvte_is_non_tn_fp8_gemm_supported() && !is_A_transposed) {
+  int is_nvte_non_tn_fp8_gemm_supported = nvte_is_non_tn_fp8_gemm_supported();
+  if (!is_nvte_non_tn_fp8_gemm_supported && !is_A_transposed) {
     // Hopper only supports TN GEMMs for FP8. "Column-wise data" is transpose of data.
     if (A.has_columnwise_data() && is_fp8_dtype(A.columnwise_data.dtype)) {
       ret.A = A.columnwise_data.dptr;
@@ -140,7 +141,7 @@ GemmParam CanonicalizeGemmInput(const transformer_engine::Tensor &A, const cubla
     } else {
       NVTE_CHECK(!is_fp8_dtype(ret.Atype), "Input A is missing column-wise usage");
     }
-  } else if (nvte_is_non_tn_fp8_gemm_supported() && !A.has_data()) {
+  } else if (is_nvte_non_tn_fp8_gemm_supported && !A.has_data()) {
     // Blackwell supports any GEMM layout for FP8, so we can use column-wise/transposed
     // data with the mirrored transpose-flag if we don't have row-wise data.
     NVTE_CHECK(A.has_columnwise_data() && is_fp8_dtype(A.columnwise_data.dtype),
@@ -220,7 +221,8 @@ GemmParam CanonicalizeGemmInput(const transformer_engine::Tensor &A, const cubla
   ret.Btype = B.data.dtype;
   ret.B_scale_inv = B.scale_inv.dptr;
   ret.ldb = is_B_transposed ? n : k;
-  if (!nvte_is_non_tn_fp8_gemm_supported() && is_B_transposed) {
+  int is_nvte_non_tn_fp8_gemm_supported = nvte_is_non_tn_fp8_gemm_supported();
```
Suggested change:

```diff
-  int is_nvte_non_tn_fp8_gemm_supported = nvte_is_non_tn_fp8_gemm_supported();
+  // Move declaration to line 104 (after GemmParam ret;) to avoid redeclaration
```
[P0] Variable `is_nvte_non_tn_fp8_gemm_supported` is already declared at line 132. Declaring it again here causes a compilation error. Remove this declaration and reuse the variable from line 132.
Suggested change:

```diff
-  int is_nvte_non_tn_fp8_gemm_supported = nvte_is_non_tn_fp8_gemm_supported();
+  // Reuse is_nvte_non_tn_fp8_gemm_supported from line 132
```
[P0] Variable `is_nvte_non_tn_fp8_gemm_supported` is declared twice in the same function scope (first at line 132). This will cause a compilation error. Remove this redeclaration and reuse the variable from line 132.
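For reference, a minimal sketch of the restructuring both comments ask for. The stub types and the simplified parameter list below are placeholders, not the real signature from the file; only the "declare once at function scope, reuse for A and B" shape is the point.

```cpp
// Simplified stand-ins so the structure reads on its own; the real GemmParam and
// capability query live in the files touched by this PR.
struct GemmParam { /* fields elided */ };
int nvte_is_non_tn_fp8_gemm_supported();

GemmParam CanonicalizeGemmInput(bool is_A_transposed, bool is_B_transposed) {
  GemmParam ret;
  // Query the capability once per call, at function scope.
  const int is_nvte_non_tn_fp8_gemm_supported = nvte_is_non_tn_fp8_gemm_supported();

  if (!is_nvte_non_tn_fp8_gemm_supported && !is_A_transposed) {
    // A-operand Hopper path (TN-only FP8 GEMM) goes here.
  }

  if (!is_nvte_non_tn_fp8_gemm_supported && is_B_transposed) {
    // B-operand Hopper path reuses the same local; no second declaration.
  }
  return ret;
}
```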
```diff
@@ -961,7 +961,7 @@ void nvte_destroy_quantization_config(NVTEQuantizationConfig config) {
 }
 
 int nvte_is_non_tn_fp8_gemm_supported() {
-  int num_devices = transformer_engine::cuda::num_devices();
+  static int num_devices = transformer_engine::cuda::num_devices();
 
   static std::vector<int> cache(num_devices, -1);
   static std::vector<std::once_flag> flags(num_devices);
```
Suggested change:

```diff
   static int num_devices = transformer_engine::cuda::num_devices();
   static std::vector<int> cache(num_devices, -1);
   static std::vector<std::once_flag> flags(num_devices);
+  int device_id = transformer_engine::cuda::current_device();
+  NVTE_CHECK(device_id < num_devices, "Device ID exceeds cached device count");
```
**Critical bug: Potential out-of-bounds access when `device_id >= num_devices`**

Making `num_devices` static causes it to be evaluated only once, at the first call. This creates a serious bug if the device ID is not within the range `[0, num_devices)`:

- **Problem:** CUDA device IDs are not necessarily contiguous. For example, with `CUDA_VISIBLE_DEVICES=0,2` you have 2 visible devices but device IDs are 0 and 2.
- **Bug scenario:** If `num_devices` is initialized to 2, but later code calls `current_device()` which returns 2, then `cache[device_id]` at line 975 will access `cache[2]`, which is out of bounds (cache size is 2, valid indices are 0-1).
- **Impact:** This will cause undefined behavior, likely a segmentation fault or memory corruption.

Fix: Either:

- Use `current_device()` as the cache key and bounds-check it
- Map actual device IDs to cache indices
- Remove the `static` keyword and re-evaluate `num_devices` on each call (sacrifices some of the optimization but ensures correctness)
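A minimal sketch of the first suggested option, reusing the `transformer_engine::cuda` helpers and `NVTE_CHECK` as they appear in this diff. `query_non_tn_fp8_support()` is a hypothetical stand-in for the real capability probe, which is not visible here.

```cpp
#include <mutex>   // std::once_flag, std::call_once
#include <vector>

// Hypothetical probe for whether a given device supports non-TN FP8 GEMMs.
int query_non_tn_fp8_support(int device_id);

int nvte_is_non_tn_fp8_gemm_supported() {
  // Evaluated once; the per-device caches below are sized to it for the process lifetime.
  static int num_devices = transformer_engine::cuda::num_devices();
  static std::vector<int> cache(num_devices, -1);
  static std::vector<std::once_flag> flags(num_devices);

  // Key the cache by the caller's current device and bounds-check before indexing.
  int device_id = transformer_engine::cuda::current_device();
  NVTE_CHECK(device_id >= 0 && device_id < num_devices,
             "Device ID exceeds cached device count");

  std::call_once(flags[device_id],
                 [&] { cache[device_id] = query_non_tn_fp8_support(device_id); });
  return cache[device_id];
}
```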
```diff
@@ -76,8 +76,6 @@ def get_tensor_device(tensor: torch.Tensor) -> int:
     The order of attributes checked is important to also
     minimize overhead.
     """
-    if hasattr(tensor, "device"):
-        return tensor.device.index
     if hasattr(tensor, "_rowwise_data") and tensor._rowwise_data is not None:
         return tensor._rowwise_data.device.index
     if hasattr(tensor, "_columnwise_data") and tensor._columnwise_data is not None:
@@ -86,6 +84,8 @@ def get_tensor_device(tensor: torch.Tensor) -> int:
         return tensor._data.device.index
     if hasattr(tensor, "_transpose") and tensor._transpose is not None:
         return tensor._transpose.device.index
+    if hasattr(tensor, "device"):
+        return tensor.device.index
     return torch.cuda.current_device()
```
Comment on lines 79 to 87 (Contributor):

The reordering optimizes for quantized tensors but creates a performance regression for regular tensors. Before, regular tensors returned from the `device` check immediately; consider restoring a fast path such as:

```python
if type(tensor).__name__ == 'Tensor':
    return tensor.device.index if hasattr(tensor, 'device') else torch.cuda.current_device()
```
Comment on lines 79 to 88 (Contributor):

**Performance regression for regular tensors**

The reordering of attribute checks optimizes for quantized tensors but creates a performance regression for regular `torch.Tensor` inputs.

Before: regular tensors hit the `device` check first and returned immediately. After: they fall through the `_rowwise_data`, `_columnwise_data`, `_data`, and `_transpose` checks before reaching the `device` check.

Impact: every call with a plain tensor now pays for several failed attribute checks.

Concern: If regular tensors are passed to this function more often than quantized tensors, the reordering may cost more than it saves.

Recommendation: Profile both paths or add an early isinstance check:

```python
def get_tensor_device(tensor: torch.Tensor) -> int:
    # Fast path for regular tensors
    if type(tensor).__name__ == 'Tensor':
        return tensor.device.index if hasattr(tensor, 'device') else torch.cuda.current_device()
    # Quantized tensor path
    if hasattr(tensor, "_rowwise_data") and tensor._rowwise_data is not None:
        return tensor._rowwise_data.device.index
    # ... etc
```
vthumbe1503 marked this conversation as resolved.

vthumbe1503 marked this conversation as resolved.
Comment on lines 79 to 89 (Contributor):

**Reordering device attribute checks changes behavior for regular tensors**

The original code checked the `device` attribute first, so plain tensors returned immediately; they now fall through the quantized-tensor attribute checks before that attribute is consulted.

Impact: extra attribute lookups on every call with a regular tensor.

Concern: This optimization seems to hurt performance for the common case (regular tensors) while only helping the less common case (quantized tensors).

Question: What is the relative frequency of this function being called with regular tensors vs quantized tensors? If regular tensors are more common, this change may degrade overall performance.
```diff
@@ -35,9 +35,9 @@ PyTypeObject *Float8BlockwiseQuantizerClass = nullptr;
 PyTypeObject *NVFP4TensorPythonClass = nullptr;
 PyTypeObject *NVFP4TensorStoragePythonClass = nullptr;
 PyTypeObject *NVFP4QuantizerClass = nullptr;
 bool is_extension_initialized = false;
```
Suggested change:

```diff
 bool is_extension_initialized = false;
+std::once_flag extension_init_flag;
```
And in `init_extension()`:

```cpp
void init_extension() {
  std::call_once(extension_init_flag, []() {
    init_float8_extension();
    init_mxfp8_extension();
    init_float8blockwise_extension();
    init_nvfp4_extensions();
  });
}
```
**Removed null-check guards without proper synchronization guarantee**

The original code had null checks (`if (Float8TensorPythonClass) return;`) to prevent redundant initialization. These were removed in favor of `std::call_once`, but the individual `init_*` functions no longer have guards.

Issue: If `init_extension()` is called multiple times from different threads, `std::call_once` ensures the lambda runs once. However, if `init_float8_extension()` is called directly (not through `init_extension()`), it will re-import modules and reassign global pointers without protection.

Verification needed: Confirm that these functions are NEVER called directly, only through `init_extension()`. If they can be called directly, the null checks should be restored:
```cpp
void init_float8_extension() {
  if (Float8TensorPythonClass) return;  // Keep this guard
  auto fp8_module = py::module_::import(...);
  ...
}
```
Using `std::call_once` ensures thread-safe initialization. However, the individual init functions (`init_float8_extension`, etc.) previously had null-check guards that were removed. If these functions are ever called directly (not through `init_extension()`), they'll re-import modules and reassign global pointers without protection. Verify they're only called through `init_extension()`.
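A small, self-contained sketch of the guarded variant discussed above, assuming the global-pointer layout shown in this diff; `setup_float8_types()` is a hypothetical stand-in for the module import and class lookups the real function performs.

```cpp
#include <Python.h>
#include <mutex>  // std::once_flag, std::call_once

PyTypeObject *Float8TensorPythonClass = nullptr;
std::once_flag extension_init_flag;

// Hypothetical stand-in for the real per-extension setup (module import, class lookup).
void setup_float8_types();

void init_float8_extension() {
  // Guard retained: a direct call cannot redo the import or reassign the global pointer.
  if (Float8TensorPythonClass != nullptr) return;
  setup_float8_types();
}

void init_extension() {
  // call_once keeps the combined initialization thread-safe and idempotent.
  std::call_once(extension_init_flag, [] {
    init_float8_extension();
    // init_mxfp8_extension(); etc., as in the suggestion above.
  });
}
```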
Variable `is_nvte_non_tn_fp8_gemm_supported` is redeclared in the same scope - it was already declared at line 224 for the B matrix configuration. This causes a compilation error or shadowing issue. The variable should only be declared once at the beginning of the function scope and reused for both A and B matrix configurations.