
Commit 1d2b89d
Merge commit '29009f1b136b738d354ffcb4e89c4bd3f2343832'
2 parents: 82d5122 + 29009f1

38 files changed: +439 additions, -275 deletions


include/triton/Dialect/TritonGPU/IR/Traits.h

Lines changed: 6 additions & 0 deletions
@@ -22,6 +22,12 @@ class LocalLoadTrait
   // Optional: Add methods or verification logic here
 };
 
+template <typename ConcreteType>
+class MemWaitOpTrait
+    : public mlir::OpTrait::TraitBase<ConcreteType, MemWaitOpTrait> {
+  // Optional: Add methods or verification logic here
+};
+
 } // namespace OpTrait
 } // namespace mlir
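The new MemWaitOpTrait is an empty marker trait in MLIR's usual CRTP style: it carries no state, and its only job is to let passes ask whether an op is "wait-like" generically instead of listing concrete op types (see the Membar.cpp change below). For readers unfamiliar with the pattern, here is a minimal, MLIR-free C++ sketch of the same mechanism; the real query (Operation::hasTrait) is dynamic, while this sketch uses a static check to stay self-contained, and WaitLikeTrait, FakeAsyncWaitOp, and hasWaitTrait are illustrative names, not Triton APIs.

#include <iostream>
#include <type_traits>

// Empty marker trait: no data, only a compile-time tag. MLIR's
// OpTrait::TraitBase works the same way (with extra verification hooks);
// the CRTP parameter is deliberately unused here.
template <typename ConcreteType> class WaitLikeTrait {};

// An op "declares" the trait by inheriting from it, analogous to listing
// MemWaitOpTrait in the op's TableGen definition.
struct FakeAsyncWaitOp : WaitLikeTrait<FakeAsyncWaitOp> {};
struct FakeLoadOp {};

// Generic query, analogous in spirit to op->hasTrait<...>() in MLIR.
template <typename OpT> constexpr bool hasWaitTrait() {
  return std::is_base_of<WaitLikeTrait<OpT>, OpT>::value;
}

int main() {
  std::cout << hasWaitTrait<FakeAsyncWaitOp>() << "\n"; // 1
  std::cout << hasWaitTrait<FakeLoadOp>() << "\n";      // 0
}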

include/triton/Dialect/TritonGPU/IR/TritonGPUAttrBase.td

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@ include "triton/Dialect/TritonGPU/IR/TritonGPUDialect.td"
 // Traits used across several attrs.
 def MemDescViewTrait : NativeOpTrait<"MemDescViewTrait">;
 def LocalLoadTrait : NativeOpTrait<"LocalLoadTrait">;
+def MemWaitOpTrait : NativeOpTrait<"MemWaitOpTrait">;
 
 // Common parameter helpers.
 def LinearLayoutParam : AttrOrTypeParameter<"LinearLayout",

include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ def TTG_ConvertLayoutOp : TTG_Op<"convert_layout",
   let assemblyFormat = "$src attr-dict `:` type($src) `->` type($result)";
 }
 
-def TTG_AsyncWaitOp : TTG_Op<"async_wait"> {
+def TTG_AsyncWaitOp : TTG_Op<"async_wait", [MemWaitOpTrait]> {
   let summary = "async wait";
 
   let arguments = (ins Variadic<TTG_AsyncToken>:$asyncToken, I32Attr:$num);

include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td

Lines changed: 1 addition & 1 deletion
@@ -401,7 +401,7 @@ def TTNG_AsyncTMAScatterOp : TTNG_Op<"async_tma_scatter"> {
   let hasVerifier = 1;
 }
 
-def TTNG_TMAStoreWaitOp : TTNG_Op<"async_tma_store_wait"> {
+def TTNG_TMAStoreWaitOp : TTNG_Op<"async_tma_store_wait", [MemWaitOpTrait]> {
   let summary = "wait until all the inputs are read.";
   let arguments = (ins I32Attr:$pendings);
   let description = [{

lib/Analysis/AxisInfo.cpp

Lines changed: 17 additions & 26 deletions
@@ -1079,11 +1079,10 @@ AxisInfoAnalysis::AxisInfoAnalysis(DataFlowSolver &solver,
 LogicalResult AxisInfoAnalysis::visitOperation(
     Operation *op, ArrayRef<const dataflow::Lattice<AxisInfo> *> operands,
     ArrayRef<dataflow::Lattice<AxisInfo> *> results) {
-  // TODO: For sure not the right way to do this
-  // but why is scf.if not initialized otherwise?
+  // If any operands are not yet ready, skip this operation for now.
   for (auto op : operands)
     if (op->getValue().getRank() == 0)
-      setToEntryState((dataflow::Lattice<AxisInfo> *)op);
+      return success();
   AxisInfo curr = visitors.apply(op, operands);
   if (curr.getRank() == 0) {
     setAllToEntryStates(results);

@@ -1112,9 +1111,11 @@ void AxisInfoAnalysis::visitForOpInductionVar(
   ProgramPoint *programPoint = getProgramPointAfter(op);
   auto *lbLattice = getLatticeElementFor(programPoint, op.getLowerBound());
   auto *stepLattice = getLatticeElementFor(programPoint, op.getStep());
-  for (auto op_iter : {lbLattice, stepLattice})
-    if (op_iter->getValue().getRank() == 0)
-      setToEntryState((dataflow::Lattice<AxisInfo> *)op_iter);
+  // If lb or step is not yet ready, skip this operation for now.
+  if (lbLattice->getValue().getRank() == 0 ||
+      stepLattice->getValue().getRank() == 0) {
+    return;
+  }
 
   AxisInfo::DimVectorT knownContiguity(1, 1);
   AxisInfo::DimVectorT knownDivisibility(1, 1);

@@ -1188,24 +1189,15 @@ void AxisInfo::initDimVectorFromHint(Attribute attr, DimVectorT *vec) {
       initPessimisticStateFromFunc(blockArg.getArgNumber(), fun,
                                    &knownContiguity, &knownDivisibility,
                                    &knownConstancy);
-    } else if (isa<RegionBranchOpInterface, gpu::WarpSpecializePartitionsOp>(
-                   op)) {
-      // scf::ForOp, scf::IfOp, scf::WhileOp, gpu::WarpSpecializePartitionsOp
-      // Control flow operations are initialized with "unknown" state:
-      // the maximum possible divisibility, contiguity, and constancy.
+    } else if (isa<gpu::WarpSpecializePartitionsOp>(op)) {
+      // Initialize the arguments to gpu::WarpSpecializePartitionsOp with
+      // "unknown" state: the maximum possible divisibility, contiguity, and
+      // constancy.
       knownDivisibility = DimVectorT(rank, kMaxDivisor);
       knownConstancy = DimVectorT(rank, kMaxDivisor);
       knownContiguity = DimVectorT(rank, kMaxDivisor);
     }
   } else if (Operation *op = value.getDefiningOp()) {
-    if (isa<RegionBranchOpInterface>(op)) {
-      // scf::ForOp, scf::IfOp, scf::WhileOp
-      // Control flow operations are initialized with "unknown" state:
-      // the maximum possible divisibility, contiguity, and constancy.
-      knownDivisibility = DimVectorT(rank, kMaxDivisor);
-      knownConstancy = DimVectorT(rank, kMaxDivisor);
-      knownContiguity = DimVectorT(rank, kMaxDivisor);
-    }
     // Other operations are conservatively initialized with the lowest possible
     // divisibility, contiguity, and constancy unless they have specified.
     AxisInfo::initDimVectorFromHint(op->getDiscardableAttr("tt.divisibility"),

@@ -1358,13 +1350,12 @@ void ModuleAxisInfoAnalysis::initialize(FunctionOpInterface funcOp,
   auto *axisInfoMap = getFuncData(funcOp);
   auto updateAxisInfoMap = [&](Value value) {
     auto axisInfo = analysis->getLatticeElement(value)->getValue();
-    AxisInfo curAxisInfo;
-    if (axisInfoMap->count(value)) {
-      curAxisInfo = AxisInfo::join(axisInfo, axisInfoMap->lookup(value));
-    } else {
-      curAxisInfo = axisInfo;
-    }
-    (*axisInfoMap)[value] = curAxisInfo;
+    // If we could not determine the AxisInfo for this value, assume the
+    // pessimistic state.
+    if (axisInfo.getRank() == 0)
+      axisInfo = AxisInfo::getPessimisticValueState(value);
+    auto &valInfo = (*axisInfoMap)[value];
+    valInfo = AxisInfo::join(axisInfo, valInfo);
   };
   funcOp.walk([&](Operation *op) {
     for (auto value : op->getResults()) {
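The rewritten updateAxisInfoMap drops the explicit count/lookup branching: an undetermined AxisInfo (rank 0) is first replaced by the pessimistic state, and the result is then joined straight into the map slot, relying on the join treating a default-constructed entry as an identity. The sketch below shows the same fallback-then-join shape with a toy single-field lattice; the gcd-based join and all names are illustrative, not the real AxisInfo semantics.

#include <iostream>
#include <map>
#include <numeric>
#include <string>

// Toy stand-in for AxisInfo: a single divisibility fact, 0 = "not computed".
struct ToyInfo {
  int divisibility = 0;
  static ToyInfo pessimistic() { return {1}; } // weakest valid fact
  static ToyInfo join(const ToyInfo &a, const ToyInfo &b) {
    if (a.divisibility == 0) return b; // joining with "not computed" is a no-op
    if (b.divisibility == 0) return a;
    return {std::gcd(a.divisibility, b.divisibility)}; // keep the common fact
  }
};

int main() {
  std::map<std::string, ToyInfo> infoMap;
  auto update = [&](const std::string &value, ToyInfo info) {
    // Same shape as the patched updateAxisInfoMap: fall back to the
    // pessimistic state when the analysis produced nothing, then join into
    // whatever the map already holds (a default entry joins as identity).
    if (info.divisibility == 0)
      info = ToyInfo::pessimistic();
    ToyInfo &slot = infoMap[value];
    slot = ToyInfo::join(info, slot);
  };

  update("%x", {8}); // first sighting: divisibility 8
  update("%x", {4}); // joined with 8 -> gcd 4
  update("%y", {0}); // unknown -> pessimistic 1
  std::cout << infoMap["%x"].divisibility << " "
            << infoMap["%y"].divisibility << "\n"; // prints "4 1"
}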

lib/Analysis/Membar.cpp

Lines changed: 1 addition & 1 deletion
@@ -171,7 +171,7 @@ void MembarAnalysis::update(Operation *op, BlockInfo *blockInfo,
     return;
   }
 
-  if (isa<triton::gpu::AsyncWaitOp, triton::nvidia_gpu::TMAStoreWaitOp>(op) &&
+  if (op->hasTrait<mlir::OpTrait::MemWaitOpTrait>() &&
       !isa<gpu::BarrierOp, triton::gpu::LocalBarrierOp>(op->getNextNode())) {
     // If the current op is an async wait and the next op is not a barrier we
     // insert a barrier op and sync
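With the trait in place, Membar no longer needs to know AsyncWaitOp or TMAStoreWaitOp by name: any op carrying MemWaitOpTrait gets a trailing barrier unless the next op already is one, and future wait ops opt in simply by declaring the trait. Below is a standalone sketch of that "insert a barrier after each wait unless a barrier follows" rule over a toy instruction list; the enum and helper names are illustrative, not Triton APIs.

#include <iostream>
#include <vector>

enum class Kind { Compute, AsyncWait, TmaStoreWait, Barrier };

// Toy stand-in for the trait query: which kinds count as memory waits.
static bool isMemWait(Kind k) {
  return k == Kind::AsyncWait || k == Kind::TmaStoreWait;
}

// Walk the op list and append a Barrier after every wait-like op whose
// successor is not already a barrier, mirroring the Membar rule above.
static std::vector<Kind> insertBarriers(const std::vector<Kind> &ops) {
  std::vector<Kind> out;
  for (size_t i = 0; i < ops.size(); ++i) {
    out.push_back(ops[i]);
    bool nextIsBarrier = i + 1 < ops.size() && ops[i + 1] == Kind::Barrier;
    if (isMemWait(ops[i]) && !nextIsBarrier)
      out.push_back(Kind::Barrier);
  }
  return out;
}

int main() {
  std::vector<Kind> ops = {Kind::Compute, Kind::AsyncWait, Kind::Compute,
                           Kind::TmaStoreWait, Kind::Barrier};
  // Only the AsyncWait needs a barrier inserted; the TmaStoreWait is
  // already followed by one.
  std::cout << insertBarriers(ops).size() << "\n"; // prints 6
}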

lib/Conversion/TritonInstrumentToLLVM/InstrumentationToLLVM.cpp

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ Value createMemDescToI64(RewriterBase &rewriter, Location loc,
                          const LLVMTypeConverter *typeConverter,
                          ttg::MemDescType memDescTy, Value sharedMemStruct) {
   TritonLLVMOpBuilder b(loc, rewriter);
-  if (isa<ttng::TensorMemoryEncodingAttr>(memDescTy.getEncoding())) {
+  if (isa<ttng::TensorMemorySpaceAttr>(memDescTy.getMemorySpace())) {
     return b.ptrtoint(rewriter.getIntegerType(64), sharedMemStruct);
   }
   assert(isa<ttg::SharedEncodingTrait>(memDescTy.getEncoding()) &&

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 2 additions & 2 deletions
@@ -2505,9 +2505,9 @@ LogicalResult DotOperandEncodingAttr::verify(
     return emitError()
            << "ttg.dot_op kWidth parameter must be 4/8/16 for WMMA v2 "
               "(including packed cases for `scaled_dot`)";
-  if (parentAttr.getVersion() == 3 && !llvm::is_contained({2, 8, 16}, kWidth))
+  if (parentAttr.getVersion() == 3 && kWidth == 0)
     return emitError()
-           << "ttg.dot_op kWidth parameter must be 2/8/16 for WMMA v3";
+           << "ttg.dot_op kWidth parameter is mandatory for WMMA v3 ";
   return success();
 }

lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp

Lines changed: 25 additions & 16 deletions
@@ -127,7 +127,7 @@ class LayoutRematerialization {
   }
 
   void cleanup();
-  void backwardRematerialization();
+  bool backwardRematerialization();
   void backwardRematerialization(ConvertLayoutOp convertOp);
   // TODO: Merge the three hoistConvert*(); functions as they are duplicate code
   void hoistConvertDotOperand();

@@ -1019,7 +1019,8 @@ LogicalResult LayoutRematerialization::getRematerializableSlice(
   return success();
 }
 
-void LayoutRematerialization::backwardRematerialization() {
+bool LayoutRematerialization::backwardRematerialization() {
+  bool changed = false;
   // Go through each ConvertLayoutOp.
   SmallVector<ConvertLayoutOp> convertOps;
   funcOp.walk(

@@ -1031,8 +1032,11 @@ void LayoutRematerialization::backwardRematerialization() {
       // backward slices.
       addRematValue(convertOp.getSrc(), convertOp.getType().getEncoding(),
                     convertOp.getResult());
+    } else {
+      changed = true;
     }
   }
+  return changed;
 }
 
 void LayoutRematerialization::hoistConvertOnTopOfExtOrBroadcast() {

@@ -1593,12 +1597,14 @@ void LayoutRematerialization::hoistConvertIntoConditionals(
     rewriteSlice(slice, layout, convertOp, mapping);
 }
 
-void backwardRematerialization(ModuleOp module) {
-  module.walk([](FuncOp funcOp) {
+bool backwardRematerialization(ModuleOp module) {
+  bool changed = false;
+  module.walk([&](FuncOp funcOp) {
     LayoutRematerialization layoutRemat(funcOp);
-    layoutRemat.backwardRematerialization();
+    changed |= layoutRemat.backwardRematerialization();
     layoutRemat.cleanup();
   });
+  return changed;
 }
 
 void hoistConvert(ModuleOp module) {

@@ -1659,17 +1665,20 @@ class TritonGPURemoveLayoutConversionsPass
 
     cleanupConvertOps();
 
-    // 2. For remaining convert ops, try to rematerialize the slice of producer
-    // operation to avoid having to convert.
-    backwardRematerialization(m);
-    LLVM_DEBUG({
-      DBGS() << "Module after backward remat:\n";
-      m.dump();
-    });
-
-    // Cleanup dummy converts created during backward remat.
-    cleanupConvertOps();
-
+    bool changed = false;
+    do {
+      changed = false;
+      // 2. For remaining convert ops, try to rematerialize the slice of
+      // producer operation to avoid having to convert.
+      changed = backwardRematerialization(m);
+      LLVM_DEBUG({
+        DBGS() << "Module after backward remat:\n";
+        m.dump();
+      });
+
+      // Cleanup dummy converts created during backward remat.
+      cleanupConvertOps();
+    } while (changed);
    // 3. For remaining converts, try to hoist them above cast generating larger
    // size types in order to reduce the cost of the convert op.
    hoistConvert(m);
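The pass now reports from backwardRematerialization whether any convert was actually removed, and the driver repeats backward rematerialization plus cleanup until a round makes no change, i.e. it runs the simplification to a fixed point instead of exactly once. A minimal sketch of that driver shape, using a toy one-round "simplify" step on a list of integers (everything here is illustrative, not the actual pass):

#include <iostream>
#include <vector>

// Toy "one round of rewriting": drop one adjacent equal pair and report
// whether anything changed -- the analogue of backwardRematerialization()
// now returning bool.
static bool simplifyOnce(std::vector<int> &xs) {
  for (size_t i = 0; i + 1 < xs.size(); ++i) {
    if (xs[i] == xs[i + 1]) {
      xs.erase(xs.begin() + i, xs.begin() + i + 2);
      return true;
    }
  }
  return false;
}

int main() {
  std::vector<int> xs = {1, 2, 2, 1, 3};
  // Same driver shape as the patched pass: keep iterating while a round
  // reports progress, so rewrites enabled by earlier rounds get picked up
  // (here, removing {2,2} exposes the {1,1} pair).
  bool changed = false;
  do {
    changed = simplifyOnce(xs);
  } while (changed);
  for (int x : xs)
    std::cout << x << " "; // prints "3"
  std::cout << "\n";
}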

python/src/gluon_ir.cc

Lines changed: 41 additions & 2 deletions
@@ -387,8 +387,47 @@ void init_gluon_ir(py::module &&m) {
               std::vector<int64_t> &shape) -> py::object {
             auto ctx = self.getContext();
             auto linearLayout = ttg::toLinearLayout(shape, layout);
-            auto attr = ttg::LinearEncodingAttr::get(ctx, linearLayout);
-            return layoutToGluon(attr);
+
+            if (isa<ttg::DistributedEncodingTrait>(layout)) {
+              auto attr = ttg::LinearEncodingAttr::get(ctx, linearLayout);
+              return layoutToGluon(attr);
+            }
+            if (isa<ttg::SharedEncodingTrait>(layout)) {
+              auto alignment =
+                  cast<ttg::SharedEncodingTrait>(layout).getAlignment();
+              auto attr = ttg::SharedLinearEncodingAttr::get(ctx, linearLayout,
+                                                             alignment);
+              return layoutToGluon(attr);
+            }
+
+            // TensorMemory encodings: keep the LinearLayout but wrap as
+            // print-only Python object carrying row/col bases -> dim0/dim1.
+            auto inNamesRange = linearLayout.getInDimNames();
+            auto inNames = llvm::to_vector(inNamesRange);
+            bool isTmemLayout =
+                (inNames.size() == 2 && inNames[0].str() == "row" &&
+                 inNames[1].str() == "col");
+            if (!isTmemLayout)
+              throw std::invalid_argument(
+                  "Unsupported layout in to_linear_layout");
+
+            // Build Py _TensorMemoryLinearLayout(row_bases, col_bases, shape,
+            // repr)
+            py::object tmemCls =
+                py::module::import(
+                    "triton.experimental.gluon.language.nvidia.blackwell")
+                    .attr("_TensorMemoryLinearLayout");
+            auto bases = linearLayout.getBases();
+            auto rowBases = bases[mlir::StringAttr::get(ctx, "row")];
+            auto colBases = bases[mlir::StringAttr::get(ctx, "col")];
+            auto outDims = linearLayout.getOutDims();
+            std::vector<int> shapeVec;
+            for (auto &od : outDims)
+              shapeVec.push_back(od.second);
+
+            py::object pyObj = tmemCls(py::cast(rowBases), py::cast(colBases),
+                                       py::cast(shapeVec));
+            return pyObj;
           })
       .def("get_dot_operand_layout",
            [](GluonOpBuilder &self, unsigned opIdx, Attribute parent,
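The tensor-memory branch treats a LinearLayout as a table of basis vectors keyed by named input dimensions ("row", "col") plus named output dimensions whose sizes give the shape; the binding checks the input-dim names, pulls out the two basis lists, and collects the output sizes before handing everything to the Python-side wrapper. A standalone sketch of that data shuffling with plain standard-library containers follows; the struct and field names are illustrative, not the LinearLayout API, and the check is simplified relative to the ordered name test in the binding.

#include <iostream>
#include <map>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

// Toy layout: per input dim, a list of basis vectors over the output dims;
// output dims are listed with their sizes, which together form the shape.
struct ToyLinearLayout {
  std::map<std::string, std::vector<std::vector<int>>> bases;
  std::vector<std::pair<std::string, int>> outDims;
};

struct TmemWrapper {
  std::vector<std::vector<int>> rowBases, colBases;
  std::vector<int> shape;
};

static TmemWrapper wrapTensorMemory(const ToyLinearLayout &ll) {
  // Accept only layouts whose input dims are exactly {"row", "col"},
  // in the spirit of the isTmemLayout check in the binding.
  if (ll.bases.size() != 2 || !ll.bases.count("row") || !ll.bases.count("col"))
    throw std::invalid_argument("unsupported layout");
  TmemWrapper w;
  w.rowBases = ll.bases.at("row");
  w.colBases = ll.bases.at("col");
  for (const auto &od : ll.outDims) // output-dim sizes become the shape
    w.shape.push_back(od.second);
  return w;
}

int main() {
  ToyLinearLayout ll;
  ll.bases["row"] = {{1, 0}, {2, 0}}; // row bit i maps to offset 1<<i in dim0
  ll.bases["col"] = {{0, 1}, {0, 2}}; // col bit i maps to offset 1<<i in dim1
  ll.outDims = {{"dim0", 4}, {"dim1", 4}};
  auto w = wrapTensorMemory(ll);
  std::cout << w.shape[0] << "x" << w.shape[1] << "\n"; // prints "4x4"
}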
