Commit 40dd0c4

[Frontend] Make sure aggregate members are added to the cache key (#8528)
Each aggregate class tracks its callable members, and when the aggregate is referenced by name, the cache keys of all its members are computed. This does require `def __init__` to be marked as `@constexpr_function`.
1 parent 7578e3e commit 40dd0c4
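To make the mechanism concrete, here is a minimal sketch of the pattern the changed files follow. The decorators match those used in the diffs below; the `Counter` class, the `kernel`, and the import paths are illustrative assumptions, not code from this commit.

    from triton.experimental import gluon
    from triton.experimental.gluon import language as gl
    from triton.language.core import _aggregate as aggregate  # import style of the gluon examples (assumed)


    @aggregate
    class Counter:
        value: gl.tensor
        limit: gl.constexpr

        # __init__ runs as plain Python at compile time, so the constexpr
        # field must be wrapped explicitly.
        @gluon.constexpr_function
        def __init__(self, value, limit):
            self.value = value
            self.limit = gl.constexpr(limit)

        @gluon.jit
        def bump(self):
            return Counter((self.value + 1) % self.limit, self.limit)


    @gluon.jit
    def kernel(x):
        # Referencing Counter by name folds the cache keys of Counter.__init__
        # and Counter.bump into kernel's cache key, so editing either method
        # triggers recompilation.
        c = Counter(x, 4)
        c = c.bump()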

7 files changed (+38 −12 lines)

python/examples/gluon/01-attention-forward.py

Lines changed: 9 additions & 2 deletions
@@ -52,10 +52,11 @@ class BarrierCounter:
     phase: gl.tensor
     num_barriers: gl.constexpr
 
+    @gluon.constexpr_function
     def __init__(self, index, phase, num_barriers):
         self.index = index
         self.phase = phase
-        self.num_barriers = num_barriers
+        self.num_barriers = gl.constexpr(num_barriers)
 
     @gluon.must_use_result
     @gluon.jit
@@ -79,6 +80,7 @@ class ChannelType:
     num_buffers: gl.constexpr
     num_consumers: gl.constexpr
 
+    @gluon.constexpr_function
     def __init__(self, mem, ready_bars, empty_bars, num_buffers, num_consumers):
         self.mem = mem
         self.ready_bars = ready_bars
@@ -143,6 +145,7 @@ class Producer:
     channel: ChannelType
     counter: BarrierCounter
 
+    @gluon.constexpr_function
     def __init__(self, channel, counter):
         self.channel = channel
         self.counter = counter
@@ -158,6 +161,7 @@ class Consumer:
     channel: ChannelType
     counter: BarrierCounter
 
+    @gluon.constexpr_function
     def __init__(self, channel, counter):
         self.channel = channel
         self.counter = counter
@@ -234,6 +238,7 @@ class AttentionConfig:
     num_kv_buffers: gl.constexpr
     use_exp2_turnstile: gl.constexpr
 
+    @gluon.constexpr_function
     def __init__(self, qk_scale, Z, H, N_CTX, BLOCK_M, BLOCK_N, HEAD_DIM, GROUP_SIZE_N, NUM_SMS, STAGE, dtype,
                  num_warps):
         self.qk_scale = qk_scale
@@ -250,7 +255,7 @@ def __init__(self, qk_scale, Z, H, N_CTX, BLOCK_M, BLOCK_N, HEAD_DIM, GROUP_SIZE
         self.num_warps = gl.constexpr(num_warps)
 
         self.SPLIT_D_FACTOR = gl.constexpr(2)
-        self.SPLIT_EXP_FACTOR = 256 // HEAD_DIM
+        self.SPLIT_EXP_FACTOR = gl.constexpr(256 // HEAD_DIM)
         self.SPLIT_QK_LOAD_FACTOR = gl.constexpr(2 if STAGE == 1 else 1)
         self.SPLIT_M = gl.constexpr(self.BLOCK_M // 2)
         self.SPLIT_D = gl.constexpr(self.HEAD_DIM // self.SPLIT_D_FACTOR)
@@ -305,6 +310,7 @@ class ProgramScheduler:
     num_pid_in_group: gl.tensor
     num_tiles: gl.tensor
 
+    @gluon.constexpr_function
     def __init__(self, config, start_pid, num_pid_n, num_pid_in_group, num_tiles):
         self.config = config
         self.start_pid = start_pid
@@ -339,6 +345,7 @@ class AttentionProgram:
     offset_y: gl.tensor
     qo_offset_y: gl.tensor
 
+    @gluon.constexpr_function
     def __init__(self, config, start_m, off_hz, offset_y, qo_offset_y):
         self.config = config
         self.start_m = start_m
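A note on the `gl.constexpr(...)` wraps that accompany the decorator in these hunks (a presumed rationale; the commit message does not spell it out): once `__init__` is a `@gluon.constexpr_function`, it executes as ordinary Python at compile time, so its arguments arrive as plain Python values, and fields annotated `gl.constexpr` must be wrapped by hand where the JIT frontend previously did it implicitly:

    @gluon.constexpr_function
    def __init__(self, index, phase, num_barriers):
        self.index = index                              # gl.tensor field, stored as-is
        self.phase = phase                              # gl.tensor field, stored as-is
        self.num_barriers = gl.constexpr(num_barriers)  # num_barriers is a plain int here;
                                                        # the constexpr field needs an explicit wrap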

python/triton/experimental/gluon/language/nvidia/blackwell/float2.py

Lines changed: 1 addition & 0 deletions
@@ -72,6 +72,7 @@ def _fma_f32x2(a, b, c):
 class Float2Tensor:
     value: ttgl.tensor
 
+    @constexpr_function
     def __init__(self, value: ttgl.tensor):
         self.value = value
 

python/triton/language/core.py

Lines changed: 12 additions & 6 deletions
@@ -1544,13 +1544,15 @@ def _get_instance(this_cls):
     def __new__(this_cls, *args, _semantic=None, _generator=None, **kwargs):
         # Call into the user-defined constructor.
         instance = this_cls._get_instance()
-        if isinstance(cls.__init__, JITCallable):
-            raise ValueError(f"{cls.__name__}.__init__ cannot be a @triton.jit function")
         extra_kwargs = {}
-        if "_semantic" in inspect.signature(cls.__init__).parameters:
-            extra_kwargs["_semantic"] = _semantic
-        if "_generator" in inspect.signature(cls.__init__).parameters:
-            extra_kwargs["_generator"] = _generator
+        if isinstance(cls.__init__, JITCallable):
+            # raise ValueError(f"{cls.__name__}.__init__ cannot be a @triton.jit function")
+            pass
+        else:
+            if "_semantic" in inspect.signature(cls.__init__).parameters:
+                extra_kwargs["_semantic"] = _semantic
+            if "_generator" in inspect.signature(cls.__init__).parameters:
+                extra_kwargs["_generator"] = _generator
         cls.__init__(instance, *args, **extra_kwargs, **kwargs)
 
         # Require that the user-defined constructor initialized all fields.
@@ -1577,11 +1579,15 @@ def type(self):
         return _aggregate_type(aggregate_value,
                                [(name, getattr(self, name).type) for name in cls.__annotations__.keys()])
 
+    hash_attrs = [cls.__init__]
+
     for (name, member) in inspect.getmembers(cls):
         if inspect.isfunction(member) or inspect.ismethod(member) or isinstance(member, JITCallable):
             if name != "__init__":
                 setattr(aggregate_value, name, member)
+                hash_attrs.append(member)
 
+    aggregate_value.hash_attrs = hash_attrs
     aggregate_value.__name__ = cls.__name__
     aggregate_value.__module__ = cls.__module__
     aggregate_value.__qualname__ = cls.__qualname__
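A runnable toy of the member collection this hunk performs, in plain Python with no Triton and the `JITCallable` branch omitted; `Demo` is a hypothetical class:

    import inspect


    class Demo:
        def __init__(self, x):
            self.x = x

        def double(self):
            return self.x * 2


    # Mirrors the loop above: __init__ seeds the list, and every other
    # function member is appended, so all callables feed the cache key.
    hash_attrs = [Demo.__init__]
    for name, member in inspect.getmembers(Demo):
        if inspect.isfunction(member) and name != "__init__":
            hash_attrs.append(member)

    print([f.__name__ for f in hash_attrs])  # ['__init__', 'double']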

python/triton/runtime/jit.py

Lines changed: 5 additions & 0 deletions
@@ -122,6 +122,11 @@ def record_reference(self, val, var_dict=None, name=None):
         if val is None or type(val) is ModuleType:
             return
 
+        if getattr(val, "__triton_aggregate__", False):
+            for attr in val.hash_attrs:
+                self.record_reference(attr)
+            return
+
         if getattr(val, "__triton_builtin__", False):
             return
 
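`record_reference` is part of the dependency walk that computes a kernel's cache key. With this change, meeting an aggregate recurses into each tracked callable, so the key changes whenever any member changes. Below is a simplified stand-in for that walk, not the real `JITFunction` internals:

    # Simplified model of the dependency walk (assumed shape, for illustration).
    def collect_keys(val, keys):
        if getattr(val, "__triton_aggregate__", False):
            # An aggregate contributes the cache keys of all its callable
            # members, mirroring the record_reference hunk above.
            for attr in val.hash_attrs:
                collect_keys(attr, keys)
            return
        key = getattr(val, "cache_key", None)  # JIT functions expose a cache_key
        if key is not None:
            keys.append(key)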

python/tutorials/gluon/07-persistence.py

Lines changed: 5 additions & 1 deletion
@@ -97,6 +97,7 @@ class WGMMA:
     acc: Union[warpgroup_mma_accumulator, gl.tensor]
     use_acc: gl.tensor
 
+    @gluon.constexpr_function
     def __init__(self, acc, use_acc):
         self.acc = acc
         self.use_acc = use_acc
@@ -136,12 +137,13 @@ class MMAv5:
     counter: gl.tensor
     reg_layout: gl.constexpr
 
+    @gluon.constexpr_function
     def __init__(self, use_acc, acc_tmem, bar, counter, reg_layout):
         self.use_acc = use_acc
         self.acc_tmem = acc_tmem
         self.bar = bar
         self.counter = counter
-        self.reg_layout = reg_layout
+        self.reg_layout = gl.constexpr(reg_layout)
 
     @gluon.jit
     def initialize(dtype: gl.constexpr, BLOCK_M: gl.constexpr, BLOCK_N: gl.constexpr, num_warps: gl.constexpr):
@@ -342,6 +344,7 @@ class PersistentTileScheduler:
     pid_end: gl.tensor
     num_pid_m: gl.tensor
 
+    @gluon.constexpr_function
     def __init__(self, pid_start, pid_end, num_pid_m):
         self.pid_start = pid_start
         self.pid_end = pid_end
@@ -523,6 +526,7 @@ class GroupedPersistentTileSchedulerImpl:
     num_pid_in_group: gl.tensor
     num_pid: gl.tensor
 
+    @gluon.constexpr_function
     def __init__(self, start_pid, num_pid_m, num_pid_in_group, num_pid):
         self.start_pid = start_pid
         self.num_pid_m = num_pid_m

python/tutorials/gluon/08-warp-specialization.py

Lines changed: 5 additions & 3 deletions
@@ -400,6 +400,7 @@ class PartitionArgs:
     SUBTILE_FACTOR: gl.constexpr
     num_warps: gl.constexpr
 
+    @gluon.constexpr_function
     def __init__(self, a_desc, b_desc, c_desc, a_bufs, b_bufs, load_empty_bars, load_ready_bars, acc_bufs,
                  acc_empty_bars, acc_ready_bars, SUBTILE_FACTOR, num_warps):
         self.a_desc = a_desc
@@ -412,8 +413,8 @@ def __init__(self, a_desc, b_desc, c_desc, a_bufs, b_bufs, load_empty_bars, load
         self.acc_bufs = acc_bufs
         self.acc_empty_bars = acc_empty_bars
         self.acc_ready_bars = acc_ready_bars
-        self.SUBTILE_FACTOR = SUBTILE_FACTOR
-        self.num_warps = num_warps
+        self.SUBTILE_FACTOR = gl.constexpr(SUBTILE_FACTOR)
+        self.num_warps = gl.constexpr(num_warps)
 
 
 # Counter abstraction for tracking barrier index and phase.
@@ -423,10 +424,11 @@ class Counter:
     phase: gl.tensor
     num_barriers: gl.constexpr
 
+    @gluon.constexpr_function
     def __init__(self, index, phase, num_barriers):
         self.index = index
         self.phase = phase
-        self.num_barriers = num_barriers
+        self.num_barriers = gl.constexpr(num_barriers)
 
     @gluon.jit
     def create(phase, num_barriers: gl.constexpr):

third_party/amd/python/examples/gluon/f16_gemm_gfx1250.py

Lines changed: 1 addition & 0 deletions
@@ -19,6 +19,7 @@ class PersistentTileScheduler:
     pid_end: ttgl.tensor
     num_pid_m: ttgl.tensor
 
+    @gluon.constexpr_function
     def __init__(self, pid_start, pid_end, num_pid_m):
         self.pid_start = pid_start
         self.pid_end = pid_end
