triton-lang · Jokeren · Apr 16, 2025 · Apr 9, 2025 · Apr 9, 2025 · Apr 15, 2025
@@ -34,8 +34,20 @@ def Triton_Dialect : Dialect {
 
   let extraClassDeclaration = [{
     void registerTypes();
+
+    static TritonDialect *getLoaded(MLIRContext *ctx) { // Looks this dialect up in `ctx`. NOTE(review): presumably null when the dialect was never loaded; callers in this patch dereference the result unchecked - confirm.
+      return ctx->getLoadedDialect<TritonDialect>();
+    }
+    static TritonDialect *getLoaded(Operation *op) { // Convenience overload: resolves the context from `op` and forwards to the overload above.
+      return getLoaded(op->getContext());
+    }
   }];
 
+  let discardableAttrs = (ins // Dialect-level discardable attributes; the C++ hunks of this patch reach them through getNumStagesAttrHelper() / getLatencyAttrHelper().
+     "::mlir::IntegerAttr":$num_stages, // Read from scf.for loops by getNumStagesOrDefault.
+     "::mlir::IntegerAttr":$latency // Read and written per operation by the latency (de)serialization helpers.
+  );
+
   let hasConstantMaterializer = 1;
   let useDefaultTypePrinterParser = 1;
   let usePropertiesForAttributes = 1;

@@ -18,7 +18,6 @@ static const char *kWarpSpecializeAttrName = "tt.warp_specialize";
 static const char *kLoopStageAttrName = "loop.stage";
 static const char *kLoopClusterAttrName = "loop.cluster";
 static const char *kScheduledMaxStageAttrName = "tt.scheduled_max_stage";
-static const char *kLatencyAttrName = "tt.latency";
 
 bool loopHasDistGreaterThanOne(scf::ForOp forOp);
 bool isOuterLoop(scf::ForOp forOp);

@@ -37,18 +37,20 @@ bool preCondition(scf::ForOp forOp) {
 }
 
 bool hasLatenciesAssigned(scf::ForOp forOp) {
+  auto helper = TritonDialect::getLoaded(forOp)->getLatencyAttrHelper(); // Accessor for the dialect's `latency` discardable attribute; replaces the raw "tt_latency" string lookup.
   for (auto &op : forOp.getBody()->without_terminator()) {
-    if (op.hasAttr("tt_latency"))
+    if (helper.getAttr(&op)) // One annotated non-terminator op in the loop body is enough to answer true.
       return true;
   }
   return false;
 }
 
 void assignUserProvidedLatencies(scf::ForOp forOp,
                                  DenseMap<Operation *, int> &opLatency) {
+  auto helper = TritonDialect::getLoaded(forOp)->getLatencyAttrHelper(); // Accessor for the dialect's `latency` discardable attribute; replaces the raw "tt_latency" string lookup.
   for (auto &op : forOp.getBody()->without_terminator()) {
-    if (auto latencyAttr = op.getAttr("tt_latency")) {
-      opLatency[&op] = mlir::cast<IntegerAttr>(latencyAttr).getInt();
+    if (auto latencyAttr = helper.getAttr(&op)) { // Ops without the attribute get no entry in opLatency.
+      opLatency[&op] = latencyAttr.getInt(); // The removed line needed mlir::cast<IntegerAttr>; the helper result exposes getInt() directly.
     }
   }
 }

@@ -248,19 +248,20 @@ int mlir::triton::getCopyVecBytes(RankedTensorType registerTy,
 
 void mlir::triton::serializeLatencies(ModuleOp module,
                                       DenseMap<Operation *, int> &opLatency) {
+  auto helper = TritonDialect::getLoaded(module)->getLatencyAttrHelper(); // Accessor for the dialect's `latency` discardable attribute; replaces the removed kLatencyAttrName constant.
+  auto builder = Builder(module); // Only used to create the i32 attribute values below.
   for (auto &[op, latency] : opLatency) {
-    op->setAttr(
-        kLatencyAttrName,
-        IntegerAttr::get(IntegerType::get(module.getContext(), 32), latency));
+    helper.setAttr(op, builder.getI32IntegerAttr(latency)); // Writes each map entry onto its operation as a 32-bit integer attribute, like the removed IntegerType::get(ctx, 32) form.
   }
 }
 
 DenseMap<Operation *, int> mlir::triton::deserializeLatencies(Operation *op) {
+  auto helper = TritonDialect::getLoaded(op)->getLatencyAttrHelper(); // Accessor for the dialect's `latency` discardable attribute; replaces the removed kLatencyAttrName constant.
   DenseMap<Operation *, int> opLatency;
   op->walk([&](Operation *op) {
-    if (op->hasAttr(kLatencyAttrName)) {
-      opLatency[op] = op->getAttrOfType<IntegerAttr>(kLatencyAttrName).getInt();
-      op->removeAttr(kLatencyAttrName);
+    if (auto attr = helper.getAttr(op)) { // `op` is the walked operation here: the lambda parameter shadows the function argument.
+      opLatency[op] = attr.getInt();
+      helper.removeAttr(op); // The attribute is stripped once its value is recorded, so it does not stay on the IR.
     }
   });
   return opLatency;
@@ -408,9 +409,8 @@ int mlir::triton::getNumStagesOrDefault(scf::ForOp forOp,
                                         int defaultNumStages) {
   // Use the attribute attached to the loop if it exists otherwise use the
   // global control.
-  if (!forOp->hasAttr(mlir::triton::kNumStagesAttrName))
-    return defaultNumStages;
-  return mlir::cast<IntegerAttr>(
-             forOp->getAttr(mlir::triton::kNumStagesAttrName))
-      .getInt();
+  auto helper = TritonDialect::getLoaded(forOp)->getNumStagesAttrHelper(); // Accessor for the dialect's `num_stages` discardable attribute; replaces the kNumStagesAttrName string lookup.
+  if (auto attr = helper.getAttr(forOp)) // Per-loop override is present on this scf.for.
+    return attr.getInt();
+  return defaultNumStages; // No attribute on the loop: fall back to the caller-supplied default.
 }
@@ -15,8 +15,6 @@ namespace gpu {
 #define GEN_PASS_DEF_TRITONGPUTESTPIPELINESCHEDULELOOP
 #include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc"
 
-static const char *kLatencyAttrName = "tt.latency";
-
 struct TestPipelineScheduleLoop
     : public impl::TritonGPUTestPipelineScheduleLoopBase<
           TestPipelineScheduleLoop> {

@@ -101,9 +101,9 @@ tt.func public @matmul_kernel_tma_persistent(%arg0: !tt.ptr<i8, 0> {tt.nv_tma_de
     // CHECK: [[RHS_MBAR:%.*]] = ttg.memdesc_subview [[RHS_BARS]][[[RHS_BUF_IDX]]]
     // CHECK-NEXT: ttng.wait_barrier [[RHS_MBAR]], [[RHS_PHASE]]
 
-    %4 = tt.descriptor_load %1[%c0_i32, %arg6] {tt_latency = 1 : i32} : !tt.tensordesc<tensor<128x64xf16, #shared>> -> tensor<128x64xf16, #blocked>
+    %4 = tt.descriptor_load %1[%c0_i32, %arg6] {tt.latency = 1 : i32} : !tt.tensordesc<tensor<128x64xf16, #shared>> -> tensor<128x64xf16, #blocked>
     %5 = ttg.local_alloc %4 : (tensor<128x64xf16, #blocked>) -> !ttg.memdesc<128x64xf16, #shared, #smem>
-    %6 = tt.descriptor_load %2[%c0_i32, %arg6] {tt_latency = 3 : i32} : !tt.tensordesc<tensor<256x64xf16, #shared>> -> tensor<256x64xf16, #blocked>
+    %6 = tt.descriptor_load %2[%c0_i32, %arg6] {tt.latency = 3 : i32} : !tt.tensordesc<tensor<256x64xf16, #shared>> -> tensor<256x64xf16, #blocked>
     %7 = ttg.local_alloc %6 : (tensor<256x64xf16, #blocked>) -> !ttg.memdesc<256x64xf16, #shared, #smem>
     %8 = ttg.memdesc_trans %7 {order = array<i32: 1, 0>} : !ttg.memdesc<256x64xf16, #shared, #smem> -> !ttg.memdesc<64x256xf16, #shared1, #smem>
     %9 = ttng.warp_group_dot %5, %8, %arg7 {inputPrecision = 0 : i32} : !ttg.memdesc<128x64xf16, #shared, #smem> * !ttg.memdesc<64x256xf16, #shared1, #smem> -> tensor<128x256xf32, #mma>

@@ -168,7 +168,7 @@ tt.func @prologue_backward_slice(%ub: i32, %cond: i1) {
     // CHECK: op.with_region
     "op.with_region"() ({
       "use"(%1) : (i32) -> ()
-    }) {tt_latency = 2 : i32} : () -> ()
+    }) {tt.latency = 2 : i32} : () -> ()
     // CHECK: loop.cluster = 1 : i32, loop.stage = 0 : i32
 
   } {tt.num_stages = 3 : i32}
@@ -186,7 +186,7 @@ tt.func @epilogue_forward_slice(%ub: i32, %cond: i1) {
   // CHECK: scf.for
   scf.for %i = %c0_i32 to %ub step %c1_i32 : i32 {
     // CHECK: "latency.op"() {loop.cluster = 3 : i32, loop.stage = 0 : i32
-    %0 = "latency.op"() {tt_latency = 2 : i32} : () -> i32
+    %0 = "latency.op"() {tt.latency = 2 : i32} : () -> i32
     // CHECK: scf.if
     %1 = scf.if %cond -> i32 {
       scf.yield %0 : i32
@@ -219,7 +219,7 @@ tt.func @prologue_latency(%ub: i32, %cond: i1) {
       scf.yield %0 : i32
     } else {
       scf.yield %c0_i32 : i32
-    } {tt_latency = 2 : i32}
+    } {tt.latency = 2 : i32}
     // CHECK: loop.cluster = 0 : i32, loop.stage = 0 : i32
 
   } {tt.num_stages = 3 : i32}