Merge branch 'main' into grouped_tensor_python

ksivaman · web-flow · commit cf61339e4737 · 2026-01-16T13:07:10.000+05:30
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -92,7 +92,7 @@ jobs:
       options: --user root
     steps:
       - name: 'Dependencies'
-        run: pip install pybind11[global]
+        run: pip install cmake==3.21.0 pybind11[global]
       - name: 'Checkout'
         uses: actions/checkout@v3
         with:
@@ -144,7 +144,7 @@ jobs:
       - name: 'Dependencies'
         run: |
           docker exec builder bash -c '\
-            pip install pybind11[global] einops onnxscript && \
+            pip install cmake==3.21.0 pybind11[global] einops onnxscript && \
             pip install torch --no-cache-dir --index-url https://download.pytorch.org/whl/cu130
           '
       - name: 'Build'
diff --git a/examples/jax/encoder/test_model_parallel_encoder.py b/examples/jax/encoder/test_model_parallel_encoder.py
@@ -3,6 +3,7 @@
 # See LICENSE for license information.
 """Encoder training on multi-GPU with tesnor parallelism"""
 import argparse
+import os
 import unittest
 from functools import partial
 
@@ -489,6 +490,9 @@ class TestEncoder(unittest.TestCase):
 
     def setUp(self):
         """Run 5 epochs for testing"""
+        # TODO(jberchtold): Fused attention is disabled by default here (unless NVTE_FUSED_ATTN is already set); remove once cuDNN fused attention supports determinism on Blackwell
+        if "NVTE_FUSED_ATTN" not in os.environ:
+            os.environ["NVTE_FUSED_ATTN"] = "0"
         self.args = encoder_parser(["--epochs", "5"])
 
     @unittest.skipIf(not is_bf16_supported(), "Device compute capability 8.0+ is required for BF16")
diff --git a/transformer_engine/common/CMakeLists.txt b/transformer_engine/common/CMakeLists.txt
@@ -232,12 +232,24 @@ add_library(transformer_engine SHARED ${transformer_engine_SOURCES})
 target_include_directories(transformer_engine PUBLIC
                            "${CMAKE_CURRENT_SOURCE_DIR}/include")
 
-# CUTLASS kernels require SM90a and cause hang in debug build
+# Grouped GEMM kernels require SM90a
 set_property(
   SOURCE gemm/cutlass_grouped_gemm.cu
   APPEND
   PROPERTY
-  COMPILE_OPTIONS "--generate-code=arch=compute_90a,code=sm_90a;-g0")
+  COMPILE_OPTIONS "--generate-code=arch=compute_90a,code=sm_90a")
+
+# CUTLASS kernels can hang in debug builds, so compile them without debug info (-g0) and with device optimizations enabled (-dopt=on)
+set(CUTLASS_KERNEL_SOURCES
+    gemm/cutlass_grouped_gemm.cu
+    hadamard_transform/group_hadamard_transform_cast_fusion.cu
+    hadamard_transform/group_row_cast_col_hadamard_transform_cast_fusion.cu
+    hadamard_transform/hadamard_transform_cast_fusion.cu)
+set_property(
+  SOURCE ${CUTLASS_KERNEL_SOURCES}
+  APPEND
+  PROPERTY
+  COMPILE_OPTIONS "-g0;-dopt=on")
 
 # Configure dependencies
 target_link_libraries(transformer_engine PUBLIC