v4.4.1 update (#3080)

Junkai-Wu · web-flow · commit 4370102f9dac · 2026-02-27T14:01:08.000-05:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,12 @@
 
 # CUTLASS 4.x
 
+## [4.4.1](https://github.com/NVIDIA/cutlass/releases/tag/v4.4.1) (2026-02-27)
+
+### CuTe DSL
+* Bug fixing and improvements
+  - Fixed a segfault issue with tvm-ffi on aarch64
+
 ## [4.4.0](https://github.com/NVIDIA/cutlass/releases/tag/v4.4.0) (2026-02-14)
 
 ### CuTe DSL
@@ -139,7 +145,7 @@
   - Fixed order issue in `make_smem_layout_a` in utils/hopper_helpers.py
 
 ### CUTLASS C++
-* Work around a driver bug which will cause occasionally errors when executing kernels.
+* Work around a driver TMA descriptor related bug which will cause occasional errors on Blackwell when the tensor's backing memory allocation is less than 128KB and it is not a dense non-overlapping tensor.
 
 ## [4.3.3](https://github.com/NVIDIA/cutlass/releases/tag/v4.3.3) (2025-12-12)
 
diff --git a/README.md b/README.md
@@ -1,9 +1,9 @@
 ![ALT](./media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition")
 # Overview
 
-# CUTLASS 4.4.0
+# CUTLASS 4.4.1
 
-_CUTLASS 4.4.0 - Feb 2026_
+_CUTLASS 4.4.1 - Feb 2026_
 
 CUTLASS is a collection of abstractions for implementing high-performance matrix-matrix multiplication (GEMM)
 and related computations at all levels and scales within CUDA. It incorporates strategies for
@@ -84,6 +84,7 @@ To get started quickly - please refer :
   - Fixed `cute.printf` with f-string
   - Fixed an indexing issue of scalar tensor
   - Fixed small K reference check error for cta_tile_n = 256 case with overlapping accumulator optimization in [Blackwell SM100 persistent dense blockscaled GEMM with static scheduling](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/blackwell/dense_blockscaled_gemm_persistent.py).
+  - Fixed a segfault issue with tvm-ffi on aarch64
 
 * API changes
   - Deprecate get_num_tmem_alloc_cols from blackwell_helpers.py. Use the one from tmem_allocator.py instead.
diff --git a/examples/python/CuTeDSL/hopper/cta_norm.py b/examples/python/CuTeDSL/hopper/cta_norm.py
@@ -170,10 +170,10 @@ def kernel(
         print(f"[DSL INFO]   pred = {pred.type}")
         for i in range(cute.size(tXrX, mode=[1])):
             if pred[i]:
-                cute.autovec_copy(tXgX[None, i], tXrX[None, i])  # LDG.128
-                cute.autovec_copy(tWgW[None, i], tWrW[None, i])  # LDG.128
+                cute.autovec_copy(tXgX[None, i], tXrX[None, i])  # Global load
+                cute.autovec_copy(tWgW[None, i], tWrW[None, i])  # Global load
                 if cutlass.const_expr(self.norm_type == "layer"):
-                    cute.autovec_copy(tBgB[None, i], tBrB[None, i])  # LDG.128
+                    cute.autovec_copy(tBgB[None, i], tBrB[None, i])  # Global load
         if cutlass.const_expr(self.norm_type == "layer"):
             tYrY = self.apply_layernorm(tXrX, tWrW, tBrB, eps, tidx, pred)
         elif cutlass.const_expr(self.norm_type == "rms"):
@@ -421,4 +421,4 @@ def eval(func, name):
         warmup_iterations=args.warmup_iterations,
         iterations=args.iterations,
     )
-    print("\nPASS")
+    print("\nPASS")
diff --git a/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp b/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
@@ -212,7 +212,8 @@ struct CollectiveMma<
 
   static_assert(cute::is_same_v<ElementAccumulator, ElementBlockScale>,
              "ElementAccumulator and ElementBlockScale should be same datatype");
-  using NumSplitsM = cute::C<get<0>(TileShape_{}) / 128>;
+  // For TileShapeM < 128, NumSplitsM should be 1
+  using NumSplitsM = cute::conditional_t<get<0>(TileShape_{}) < _128{}, _1, cute::C<get<0>(TileShape_{}) / 128>>;
   static_assert(NumSplitsM{} == 1 || NumSplitsM{} == 2);
 
   struct SharedStorage {
diff --git a/include/cutlass/version.h b/include/cutlass/version.h
@@ -36,7 +36,7 @@
 
 #define CUTLASS_MAJOR 4
 #define CUTLASS_MINOR 4
-#define CUTLASS_PATCH 0
+#define CUTLASS_PATCH 1
 
 #ifdef CUTLASS_VERSIONS_GENERATED
 #include "cutlass/version_extended.h"
diff --git a/python/CuTeDSL/cutlass/base_dsl/tvm_ffi_builder/mlir_builder.py b/python/CuTeDSL/cutlass/base_dsl/tvm_ffi_builder/mlir_builder.py
@@ -371,6 +371,64 @@ def define_global_string(self, content: str) -> str:
             self.const_str_table[content] = symbol
         return symbol
 
+    def get_or_load_global_func_ptr_from_text(
+        self,
+        current_block: ir.Block,
+        function_name: str,
+    ) -> ir.Value:
+        """Get or create a function pointer global in .text section and load it.
+
+        This creates a constant global function pointer in the .text section
+        (for AArch64 ADRP range compatibility) and performs a volatile load
+        to prevent optimization.
+
+        This forces the function pointer to be local to the code, bypassing GOT entry
+        ADRP lookup issues on AArch64 when GOT and .text section are more than 4GB
+        apart which can happen when ASLR is applied.
+        """
+        # Check if we've already created this global
+        if function_name not in self.const_func_ptr_table:
+            symbol = f"__func_ptr_{function_name}"
+
+            module_body = self.module.body
+            with ir.InsertionPoint(module_body):
+                # 1. Create the global constant
+                # We use 'private' linkage so it doesn't conflict across modules
+                global_ptr = llvm.GlobalOp(
+                    self.ptr_type,
+                    symbol,
+                    ir.Attribute.parse("#llvm.linkage<private>"),
+                    # Initialization via block below
+                )
+
+                # 2. Set the necessary attributes for JIT safety and AArch64 range
+                # We use 'constant' to mark it as immutable
+                # We use 'section = ".text"' to force it into the code block
+                global_ptr.attributes["constant"] = ir.UnitAttr.get()
+                global_ptr.attributes["section"] = ir.StringAttr.get(".text")
+
+                # 3. Add a constructor block to the GlobalOp to initialize it
+                # with the address of the target function
+                initializer_block = global_ptr.initializer.blocks.append()
+                with ir.InsertionPoint(initializer_block):
+                    # Get the address of the external function
+                    func_addr = llvm.AddressOfOp(self.ptr_type, function_name).res
+                    # Return the address as the initial value of the global
+                    llvm.return_(arg=func_addr)
+
+                self.const_func_ptr_table[function_name] = symbol
+        else:
+            symbol = self.const_func_ptr_table[function_name]
+
+        # Load it with volatile semantics in the current block
+        with ir.InsertionPoint(current_block):
+            symbol_addr = self.address_of(symbol, self.ptr_type)
+            # Perform a volatile load to prevent optimization
+            load_op = llvm.load(self.ptr_type, symbol_addr)
+            # Set volatile attribute to prevent optimization
+            load_op.owner.attributes["volatile_"] = ir.UnitAttr.get()
+            return load_op
+
 
     # function
     def function(
diff --git a/python/CuTeDSL/cutlass/cutlass_dsl/tvm_ffi_provider.py b/python/CuTeDSL/cutlass/cutlass_dsl/tvm_ffi_provider.py
@@ -129,13 +129,16 @@ def insert_lazy_init_cuda(self, current_block: ir.Block, context: CallContext):
             cuda_global_state_ptr = self.address_of(
                 self.cuda_global_state_symbol, self.ptr_type
             )
-            cuda_init_ptr = self.address_of("cuda_init", self.ptr_type)
-            cuda_load_to_device_ptr = self.address_of(
-                "cuda_load_to_device", self.ptr_type
-            )
-            set_error_ptr = self.address_of(
-                "TVMFFIErrorSetRaisedFromCStr", self.ptr_type
-            )
+
+        cuda_init_ptr = context.builder.get_or_load_global_func_ptr_from_text(
+            current_block, "cuda_init"
+        )
+        cuda_load_to_device_ptr = context.builder.get_or_load_global_func_ptr_from_text(
+            current_block, "cuda_load_to_device"
+        )
+        set_error_ptr = context.builder.get_or_load_global_func_ptr_from_text(
+            current_block, "TVMFFIErrorSetRaisedFromCStr"
+        )
 
         with ir.InsertionPoint(current_block):
             # Call the callback function with the loaded ptr value
@@ -530,7 +533,7 @@ class TVMFFIJitCompiledFunction(tvm_ffi.Function, TVMFFIJitCompiledFunctionBase)
     """TVM FFI Function that directly subclasses the tvm_ffi.Function for pos only arguments."""
 
     def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
+        TVMFFIJitCompiledFunctionBase.__init__(self, *args, **kwargs)
         # initialize the tvm_ffi.Function from the current execution engine
         if self.__chandle__() != 0:
             raise DSLRuntimeError("TVM FFI function is already initialized")
diff --git a/python/CuTeDSL/requirements-cu13.txt b/python/CuTeDSL/requirements-cu13.txt
@@ -1,3 +1,3 @@
 # Use `pip install -r requirements-cu13.txt` with the present file to install a
 # wheel consistent with the present state of the github repository
-nvidia-cutlass-dsl[cu13]==4.4.0
+nvidia-cutlass-dsl[cu13]==4.4.1
diff --git a/python/CuTeDSL/requirements.txt b/python/CuTeDSL/requirements.txt
@@ -1,3 +1,3 @@
 # Use `pip install -r requirements.txt` with the present file to install a
 # wheel consistent with the present state of the github repository
-nvidia-cutlass-dsl==4.4.0
+nvidia-cutlass-dsl==4.4.1
diff --git a/python/cutlass_cppgen/__init__.py b/python/cutlass_cppgen/__init__.py
@@ -133,7 +133,7 @@ def get_option_registry():
         this._option_registry = OptionRegistry(device_cc())
     return this._option_registry
 
-this.__version__ = '4.4.0'
+this.__version__ = '4.4.1'
 
 from cutlass_cppgen.backend import create_memory_pool
 from cutlass_cppgen.emit.pytorch import pytorch
diff --git a/python/setup_cutlass.py b/python/setup_cutlass.py
@@ -51,7 +51,7 @@
 
 setup(
     name='cutlass_cppgen',
-    version='4.4.0',
+    version='4.4.1',
     description='CUTLASS Pythonic Interface',
     package_dir={'': '.'},
     packages=[
diff --git a/python/setup_library.py b/python/setup_library.py
@@ -36,7 +36,7 @@
 def perform_setup():
     setup(
         name='cutlass_library',
-        version='4.4.0',
+        version='4.4.1',
         description='CUTLASS library generation scripts',
         packages=['cutlass_library']
     )
diff --git a/python/setup_pycute.py b/python/setup_pycute.py
@@ -36,7 +36,7 @@
 def perform_setup():
     setup(
         name='pycute',
-        version='4.4.0',
+        version='4.4.1',
         description='Python implementation of CuTe',
         packages=['pycute'],
     )