Skip to content

Commit 4370102

Browse files
authored
v4.4.1 update (#3080)
1 parent c213bfd commit 4370102

File tree

13 files changed

+92
-23
lines changed

13 files changed

+92
-23
lines changed

CHANGELOG.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,12 @@
22

33
# CUTLASS 4.x
44

5+
## [4.4.1](https://github.com/NVIDIA/cutlass/releases/tag/v4.4.1) (2026-02-27)
6+
7+
### CuTe DSL
8+
* Bug fixing and improvements
9+
- Fixed a segfault issue with tvm-ffi on aarch64
10+
511
## [4.4.0](https://github.com/NVIDIA/cutlass/releases/tag/v4.4.0) (2026-02-14)
612

713
### CuTe DSL
@@ -139,7 +145,7 @@
139145
- Fixed order issue in `make_smem_layout_a` in utils/hopper_helpers.py
140146

141147
### CUTLASS C++
142-
* Work around a driver bug which will cause occasionally errors when executing kernels.
148+
* Work around a driver TMA descriptor related bug which will cause occasional errors on Blackwell when the tensor's backing memory allocation is less than 128KB and it is not a dense non-overlapping tensor.
143149

144150
## [4.3.3](https://github.com/NVIDIA/cutlass/releases/tag/v4.3.3) (2025-12-12)
145151

README.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
![ALT](./media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition")
22
# Overview
33

4-
# CUTLASS 4.4.0
4+
# CUTLASS 4.4.1
55

6-
_CUTLASS 4.4.0 - Feb 2026_
6+
_CUTLASS 4.4.1 - Feb 2026_
77

88
CUTLASS is a collection of abstractions for implementing high-performance matrix-matrix multiplication (GEMM)
99
and related computations at all levels and scales within CUDA. It incorporates strategies for
@@ -84,6 +84,7 @@ To get started quickly - please refer :
8484
- Fixed `cute.printf` with f-string
8585
- Fixed an indexing issue of scalar tensor
8686
- Fixed small K reference check error for cta_tile_n = 256 case with overlapping accumulator optimization in [Blackwell SM100 persistent dense blockscaled GEMM with static scheduling](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/blackwell/dense_blockscaled_gemm_persistent.py).
87+
- Fixed a segfault issue with tvm-ffi on aarch64
8788

8889
* API changes
8990
- Deprecate get_num_tmem_alloc_cols from blackwell_helpers.py. Use the one from tmem_allocator.py instead.

examples/python/CuTeDSL/hopper/cta_norm.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -170,10 +170,10 @@ def kernel(
170170
print(f"[DSL INFO] pred = {pred.type}")
171171
for i in range(cute.size(tXrX, mode=[1])):
172172
if pred[i]:
173-
cute.autovec_copy(tXgX[None, i], tXrX[None, i]) # LDG.128
174-
cute.autovec_copy(tWgW[None, i], tWrW[None, i]) # LDG.128
173+
cute.autovec_copy(tXgX[None, i], tXrX[None, i]) # Global load
174+
cute.autovec_copy(tWgW[None, i], tWrW[None, i]) # Global load
175175
if cutlass.const_expr(self.norm_type == "layer"):
176-
cute.autovec_copy(tBgB[None, i], tBrB[None, i]) # LDG.128
176+
cute.autovec_copy(tBgB[None, i], tBrB[None, i]) # Global load
177177
if cutlass.const_expr(self.norm_type == "layer"):
178178
tYrY = self.apply_layernorm(tXrX, tWrW, tBrB, eps, tidx, pred)
179179
elif cutlass.const_expr(self.norm_type == "rms"):
@@ -421,4 +421,4 @@ def eval(func, name):
421421
warmup_iterations=args.warmup_iterations,
422422
iterations=args.iterations,
423423
)
424-
print("\nPASS")
424+
print("\nPASS")

include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,8 @@ struct CollectiveMma<
212212

213213
static_assert(cute::is_same_v<ElementAccumulator, ElementBlockScale>,
214214
"ElementAccumulator and ElementBlockScale should be same datatype");
215-
using NumSplitsM = cute::C<get<0>(TileShape_{}) / 128>;
215+
// For TileShapeM < 128, NumSplitsM should be 1
216+
using NumSplitsM = cute::conditional_t<get<0>(TileShape_{}) < _128{}, _1, cute::C<get<0>(TileShape_{}) / 128>>;
216217
static_assert(NumSplitsM{} == 1 || NumSplitsM{} == 2);
217218

218219
struct SharedStorage {

include/cutlass/version.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636

3737
#define CUTLASS_MAJOR 4
3838
#define CUTLASS_MINOR 4
39-
#define CUTLASS_PATCH 0
39+
#define CUTLASS_PATCH 1
4040

4141
#ifdef CUTLASS_VERSIONS_GENERATED
4242
#include "cutlass/version_extended.h"

python/CuTeDSL/cutlass/base_dsl/tvm_ffi_builder/mlir_builder.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -371,6 +371,64 @@ def define_global_string(self, content: str) -> str:
371371
self.const_str_table[content] = symbol
372372
return symbol
373373

374+
def get_or_load_global_func_ptr_from_text(
375+
self,
376+
current_block: ir.Block,
377+
function_name: str,
378+
) -> ir.Value:
379+
"""Get or create a function pointer global in .text section and load it.
380+
381+
This creates a constant global function pointer in the .text section
382+
(for AArch64 ADRP range compatibility) and performs a volatile load
383+
to prevent optimization.
384+
385+
This forces the function pointer to be local to the code, bypassing GOT entry
386+
ADRP lookup issues on AArch64 when GOT and .text section are more than 4GB
387+
apart which can happen when ASLR is applied.
388+
"""
389+
# Check if we've already created this global
390+
if function_name not in self.const_func_ptr_table:
391+
symbol = f"__func_ptr_{function_name}"
392+
393+
module_body = self.module.body
394+
with ir.InsertionPoint(module_body):
395+
# 1. Create the global constant
396+
# We use 'private' linkage so it doesn't conflict across modules
397+
global_ptr = llvm.GlobalOp(
398+
self.ptr_type,
399+
symbol,
400+
ir.Attribute.parse("#llvm.linkage<private>"),
401+
# Initialization via block below
402+
)
403+
404+
# 2. Set the necessary attributes for JIT safety and AArch64 range
405+
# We use 'constant' to mark it as immutable
406+
# We use 'section = ".text"' to force it into the code block
407+
global_ptr.attributes["constant"] = ir.UnitAttr.get()
408+
global_ptr.attributes["section"] = ir.StringAttr.get(".text")
409+
410+
# 3. Add a constructor block to the GlobalOp to initialize it
411+
# with the address of the target function
412+
initializer_block = global_ptr.initializer.blocks.append()
413+
with ir.InsertionPoint(initializer_block):
414+
# Get the address of the external function
415+
func_addr = llvm.AddressOfOp(self.ptr_type, function_name).res
416+
# Return the address as the initial value of the global
417+
llvm.return_(arg=func_addr)
418+
419+
self.const_func_ptr_table[function_name] = symbol
420+
else:
421+
symbol = self.const_func_ptr_table[function_name]
422+
423+
# Load it with volatile semantics in the current block
424+
with ir.InsertionPoint(current_block):
425+
symbol_addr = self.address_of(symbol, self.ptr_type)
426+
# Perform a volatile load to prevent optimization
427+
load_op = llvm.load(self.ptr_type, symbol_addr)
428+
# Set volatile attribute to prevent optimization
429+
load_op.owner.attributes["volatile_"] = ir.UnitAttr.get()
430+
return load_op
431+
374432

375433
# function
376434
def function(

python/CuTeDSL/cutlass/cutlass_dsl/tvm_ffi_provider.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -129,13 +129,16 @@ def insert_lazy_init_cuda(self, current_block: ir.Block, context: CallContext):
129129
cuda_global_state_ptr = self.address_of(
130130
self.cuda_global_state_symbol, self.ptr_type
131131
)
132-
cuda_init_ptr = self.address_of("cuda_init", self.ptr_type)
133-
cuda_load_to_device_ptr = self.address_of(
134-
"cuda_load_to_device", self.ptr_type
135-
)
136-
set_error_ptr = self.address_of(
137-
"TVMFFIErrorSetRaisedFromCStr", self.ptr_type
138-
)
132+
133+
cuda_init_ptr = context.builder.get_or_load_global_func_ptr_from_text(
134+
current_block, "cuda_init"
135+
)
136+
cuda_load_to_device_ptr = context.builder.get_or_load_global_func_ptr_from_text(
137+
current_block, "cuda_load_to_device"
138+
)
139+
set_error_ptr = context.builder.get_or_load_global_func_ptr_from_text(
140+
current_block, "TVMFFIErrorSetRaisedFromCStr"
141+
)
139142

140143
with ir.InsertionPoint(current_block):
141144
# Call the callback function with the loaded ptr value
@@ -530,7 +533,7 @@ class TVMFFIJitCompiledFunction(tvm_ffi.Function, TVMFFIJitCompiledFunctionBase)
530533
"""TVM FFI Function that directly subclasses the tvm_ffi.Function for pos only arguments."""
531534

532535
def __init__(self, *args, **kwargs):
533-
super().__init__(*args, **kwargs)
536+
TVMFFIJitCompiledFunctionBase.__init__(self, *args, **kwargs)
534537
# initialize the tvm_ffi.Function from the current execution engine
535538
if self.__chandle__() != 0:
536539
raise DSLRuntimeError("TVM FFI function is already initialized")
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
# Use `pip install -r requirements-cu13.txt` with the present file to install a
22
# wheel consistent with the present state of the github repository
3-
nvidia-cutlass-dsl[cu13]==4.4.0
3+
nvidia-cutlass-dsl[cu13]==4.4.1

python/CuTeDSL/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
# Use `pip install -r requirements.txt` with the present file to install a
22
# wheel consistent with the present state of the github repository
3-
nvidia-cutlass-dsl==4.4.0
3+
nvidia-cutlass-dsl==4.4.1

python/cutlass_cppgen/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ def get_option_registry():
133133
this._option_registry = OptionRegistry(device_cc())
134134
return this._option_registry
135135

136-
this.__version__ = '4.4.0'
136+
this.__version__ = '4.4.1'
137137

138138
from cutlass_cppgen.backend import create_memory_pool
139139
from cutlass_cppgen.emit.pytorch import pytorch

0 commit comments

Comments
 (0)