intel
diff --git a/‎CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions b/‎CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎Makefile‎
Lines changed: 6 additions & 1 deletion b/‎Makefile‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎include/triton/Dialect/TritonGPU/Transforms/Utility.h‎
Lines changed: 0 additions & 3 deletions b/‎include/triton/Dialect/TritonGPU/Transforms/Utility.h‎
Lines changed: 0 additions & 3 deletions
diff --git a/‎include/triton/Tools/Sys/GetEnv.hpp‎
Lines changed: 5 additions & 0 deletions b/‎include/triton/Tools/Sys/GetEnv.hpp‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎lib/Dialect/TritonGPU/Transforms/HoistTMEMAlloc.cpp‎
Lines changed: 1 addition & 51 deletions b/‎lib/Dialect/TritonGPU/Transforms/HoistTMEMAlloc.cpp‎
Lines changed: 1 addition & 51 deletions
diff --git a/‎lib/Dialect/TritonGPU/Transforms/OptimizeAccumulatorInit.cpp‎
Lines changed: 12 additions & 0 deletions b/‎lib/Dialect/TritonGPU/Transforms/OptimizeAccumulatorInit.cpp‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎lib/Dialect/TritonGPU/Transforms/Pipeliner/ScheduleLoops.cpp‎
Lines changed: 26 additions & 0 deletions b/‎lib/Dialect/TritonGPU/Transforms/Pipeliner/ScheduleLoops.cpp‎
Lines changed: 26 additions & 0 deletions
diff --git a/‎lib/Dialect/TritonGPU/Transforms/Utility.cpp‎
Lines changed: 0 additions & 12 deletions b/‎lib/Dialect/TritonGPU/Transforms/Utility.cpp‎
Lines changed: 0 additions & 12 deletions
diff --git a/‎lib/Dialect/TritonGPU/Transforms/WarpSpecialization/PartitionLoops.cpp‎
Lines changed: 11 additions & 6 deletions b/‎lib/Dialect/TritonGPU/Transforms/WarpSpecialization/PartitionLoops.cpp‎
Lines changed: 11 additions & 6 deletions
diff --git a/‎python/src/gluon_ir.cc‎
Lines changed: 86 additions & 0 deletions b/‎python/src/gluon_ir.cc‎
Lines changed: 86 additions & 0 deletions
@@ -300,6 +300,7 @@ if(TRITON_BUILD_PYTHON_MODULE)
   add_compile_definitions(TRITON_BACKENDS_TUPLE=${TRITON_BACKENDS_TUPLE})
   add_library(triton SHARED ${PYTHON_SRC_PATH}/main.cc
                   ${PYTHON_SRC_PATH}/ir.cc
+                  ${PYTHON_SRC_PATH}/gluon_ir.cc
                   ${PYTHON_SRC_PATH}/passes.cc
                   ${PYTHON_SRC_PATH}/interpreter.cc
                   ${PYTHON_SRC_PATH}/llvm.cc)
 
@@ -28,7 +28,7 @@ test-lit:
 test-cpp:
 	ninja -C $(BUILD_DIR) check-triton-unit-tests
 
-.PHONY: test-python
+.PHONY: test-unit
 test-unit: all
 	cd python/test/unit && $(PYTEST) -s -n 8 --ignore=cuda/test_flashattention.py \
 		--ignore=language/test_line_info.py --ignore=language/test_subprocess.py --ignore=test_debug.py
@@ -40,6 +40,11 @@ test-unit: all
 	$(PYTEST) -vs python/tutorials/06-fused-attention.py
 	TRITON_ALWAYS_COMPILE=1 TRITON_DISABLE_LINE_INFO=0 LLVM_PASS_PLUGIN_PATH=python/triton/instrumentation/libGPUInstrumentationTestLib.so \
 		$(PYTEST) --capture=tee-sys -rfs -vvv python/test/unit/instrumentation/test_gpuhello.py
+	$(PYTEST) -s -n 8 python/test/gluon
+
+.PHONY: test-gluon
+test-gluon: all
+	$(PYTEST) -s -n 8 python/test/gluon
 
 .PHONY: test-regression
 test-regression: all
 
@@ -247,9 +247,6 @@ SetVector<Value> getNestedOperands(Operation *op);
 // Erase the given loop carried values from the loop, where `loop` is replaced
 // with a new loop.
 void eraseLoopCarriedValues(scf::ForOp &loop, llvm::BitVector indices);
-
-// Get a boolean if the Value is an arith::ConstantOp
-std::optional<bool> getBoolFromConstant(Value cst);
 } // namespace mlir
 
 namespace mlir::triton {
 
@@ -4,6 +4,7 @@
 #include <algorithm>
 #include <assert.h>
 #include <cstdlib>
+#include <mutex>
 #include <set>
 #include <sstream>
 #include <string>
@@ -75,7 +76,10 @@ inline void assertIsRecognized(const std::string &env) {
   assert((is_invalidating || is_neutral) && errmsg.c_str());
 }
 
+// Guards the std::getenv calls made by the env helpers in this header.
+// Declared `inline` (not `static`) so every translation unit that includes
+// the header shares one mutex instead of getting its own private copy.
+inline std::mutex getenv_mutex;
+
 inline std::string getStrEnv(const std::string &env) {
+  std::lock_guard<std::mutex> lock(getenv_mutex);
   assertIsRecognized(env);
   const char *cstr = std::getenv(env.c_str());
   if (!cstr)
@@ -86,6 +90,7 @@ inline std::string getStrEnv(const std::string &env) {
 
 // return value of a cache-invalidating boolean environment variable
 inline bool getBoolEnv(const std::string &env) {
+  std::lock_guard<std::mutex> lock(getenv_mutex);
   assertIsRecognized(env);
   const char *s = std::getenv(env.c_str());
   std::string str(s ? s : "");
 
@@ -1,5 +1,3 @@
-#include "mlir/Dialect/Arith/IR/Arith.h"
-#include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/Dominance.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
@@ -94,53 +92,6 @@ class RemoveUnusedTMEMLoad : public OpRewritePattern<TMEMTokenLoadOp> {
   }
 };
 
-class RemoveUnusedTMEMStore : public OpRewritePattern<TMEMTokenStoreOp> {
-public:
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(TMEMTokenStoreOp store,
-                                PatternRewriter &rewriter) const override {
-    auto pred = getBoolFromConstant(store.getPred());
-    if (!pred || pred.value() == false)
-      return failure(); // we've already processed this
-    auto tok = store.getToken();
-    if (!tok.hasOneUse())
-      return failure();
-    auto loop = dyn_cast<scf::ForOp>(*tok.getUsers().begin());
-    if (!loop)
-      return failure();
-    auto loopTok = loop.getBody()->getArgument(
-        tok.getUses().begin()->getOperandNumber() - 2);
-    if (!loopTok.hasOneUse())
-      return failure();
-    auto mma =
-        dyn_cast<nvidia_gpu::MMAv5OpInterface>(*loopTok.getUsers().begin());
-    if (!mma)
-      return failure();
-    auto useD = dyn_cast<BlockArgument>(mma.useAccumulator());
-    if (!useD)
-      return failure();
-    auto parent = useD.getParentBlock()->getParentOp();
-    if (parent != loop)
-      return failure();
-    auto loopInit = loop.getInitArgs()[useD.getArgNumber() - 1];
-    auto val = getBoolFromConstant(loopInit);
-    if (!val)
-      return failure();
-    if (val.value() == true)
-      return failure();
-    auto loc = store.getLoc();
-    rewriter.setInsertionPoint(store);
-    Value diff = rewriter.create<arith::SubIOp>(loc, loop.getUpperBound(),
-                                                loop.getLowerBound());
-    Value zero = rewriter.create<arith::ConstantIntOp>(loc, 0, diff.getType());
-    Value cond = rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::sle,
-                                                diff, zero);
-    store.getPredMutable().assign(cond);
-    return success();
-  }
-};
-
 // Load-store forwarding pattern.
 class CombineTMEMLoadAndStore : public OpRewritePattern<TMEMTokenStoreOp> {
 public:
@@ -460,8 +411,7 @@ struct HoistTMEMAlloc
     mlir::RewritePatternSet patterns(&getContext());
     patterns.add<RotateTMEMStoreInLoop, RotateTMEMLoadInLoop,
                  CombineTMEMLoadAndStore, CombineTMEMStoreAndSelect,
-                 SinkTMEMLoad, RemoveUnusedTMEMLoad, RemoveUnusedTMEMStore>(
-        &getContext());
+                 SinkTMEMLoad, RemoveUnusedTMEMLoad>(&getContext());
     scf::ForOp::getCanonicalizationPatterns(patterns, &getContext());
     if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
       llvm_unreachable("Failed to hoist tmem_store");
 
@@ -171,6 +171,18 @@ findZeroInitOp(Value accUse, scf::ForOp forOp, bool &loopArgIsZero) {
   return std::nullopt;
 }
 
+// Returns the boolean held by `cst` when it is defined by an
+// arith::ConstantOp whose value is a BoolAttr; std::nullopt in every other
+// case (no defining constant op, or a non-boolean constant).
+std::optional<bool> getBoolFromConstant(Value cst) {
+  if (auto definingConst = cst.getDefiningOp<arith::ConstantOp>()) {
+    assert(definingConst.getValue());
+    if (auto flagAttr = dyn_cast<BoolAttr>(definingConst.getValue()))
+      return flagAttr.getValue();
+  }
+  return std::nullopt;
+}
+
 } // namespace
 
 class OptimizeAccumulatorInitPass
 
@@ -167,6 +167,32 @@ CoarseSchedule getInitialSchedule(scf::ForOp forOp,
   CoarseSchedule schedule;
   if (forOp->hasAttr(kWarpSpecializeAttrName) &&
       succeeded(schedule.deSerialize(forOp))) {
+    // The loop was partitioned from a warp-specialized loop, meaning it can
+    // have a partial view of the original loop stages. Re-schedule the loop
+    // rooted at the stages of the latency ops to prune unnecessary stages.
+    auto isLatencyOp = [&](Operation &op) {
+      return opLatency.count(&op) ||
+             isa<LocalStoreOp, LocalLoadOp, ttng::TMEMLoadOp, ttng::TMEMStoreOp,
+                 AsyncCopyGlobalToLocalOp, ttng::AsyncTMACopyGlobalToLocalOp,
+                 ttng::AsyncTMAGatherOp, ttng::MMAv5OpInterface,
+                 ttng::WaitBarrierOp, ttng::ArriveBarrierOp>(op);
+    };
+
+    // If there are no latency ops or all latency ops are in the same stage, we
+    // don't need to pipeline the loop. Return a new schedule with everything
+    // assigned to the same stage.
+    DenseSet<int> latencyStages;
+    auto ops = forOp.getBody()->without_terminator();
+    for (Operation &op : llvm::make_filter_range(ops, isLatencyOp))
+      latencyStages.insert(schedule[&op].first);
+    if (latencyStages.size() <= 1) {
+      CoarseSchedule normalized(/*numStages=*/1);
+      auto cluster = normalized.clusters.newAtFront();
+      for (Operation &op : ops)
+        normalized.insert(&op, 0, cluster);
+      return normalized;
+    }
+
     schedule.shrinkToFit();
     return schedule;
   }
 
@@ -1400,18 +1400,6 @@ void eraseLoopCarriedValues(scf::ForOp &loop, llvm::BitVector indices) {
   loop = newLoop;
 }
 
-std::optional<bool> getBoolFromConstant(Value cst) {
-  auto constantOp = cst.getDefiningOp<arith::ConstantOp>();
-  if (!constantOp) {
-    return std::nullopt;
-  }
-  assert(constantOp.getValue());
-  if (auto boolAttr = dyn_cast<BoolAttr>(constantOp.getValue())) {
-    return boolAttr.getValue();
-  }
-  return std::nullopt;
-}
-
 } // namespace mlir
 
 namespace mlir::triton {
 
@@ -192,6 +192,10 @@ LogicalResult triton::gpu::partitionLoop(scf::ForOp loop) {
   // explicit captures are the leaves of the subgraph.
   SetVector<Operation *> opsToClone;
   SmallVector<Value> explicitCaptures;
+  SmallVector<IRMapping> mappings(wsOp.getPartitionNumWarps().size());
+  SmallVector<OpBuilder> builders;
+  for (Region *region : wsOp.getPartitionRegions())
+    builders.push_back(OpBuilder::atBlockBegin(&region->front()));
   for (unsigned i = 0; i < captures.size(); ++i) {
     Value capture = captures[i];
 
@@ -215,10 +219,11 @@ LogicalResult triton::gpu::partitionLoop(scf::ForOp loop) {
           tensorTy.getShape(), tensorTy.getElementType(), sharedEnc,
           SharedMemorySpaceAttr::get(tensorTy.getContext()));
       auto alloc = b.create<LocalAllocOp>(memdescTy, capture);
-      for (Region *region : wsOp.getPartitionRegions()) {
-        b.setInsertionPointToStart(&region->front());
-        Value value = b.create<LocalLoadOp>(tensorTy, alloc);
+      for (auto [i, region] : llvm::enumerate(wsOp.getPartitionRegions())) {
+        Value value =
+            builders[i].create<LocalLoadOp>(capture.getLoc(), tensorTy, alloc);
         replaceAllUsesInRegionWith(capture, value, *region);
+        mappings[i].map(capture, value);
       }
       capture = alloc;
     }
@@ -228,9 +233,9 @@ LogicalResult triton::gpu::partitionLoop(scf::ForOp loop) {
 
   // Clone the ops into each region in topological order.
   opsToClone = topologicalSort(opsToClone);
-  for (Region *region : wsOp.getPartitionRegions()) {
-    b.setInsertionPointToStart(&region->front());
-    IRMapping mapping;
+  for (auto [i, region] : llvm::enumerate(wsOp.getPartitionRegions())) {
+    OpBuilder &b = builders[i];
+    IRMapping &mapping = mappings[i];
     for (Operation *op : opsToClone) {
       Value copy = b.clone(*op, mapping)->getResult(0);
       mapping.map(op->getResult(0), copy);
 
@@ -0,0 +1,86 @@
+#include "ir.h"
+#include "pybind11/pybind11.h"
+#include <pybind11/stl.h>
+
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Types.h"
+#include "triton/Conversion/TritonGPUToLLVM/Utility.h"
+#include "triton/Dialect/TritonGPU/IR/Attributes.h"
+#include "triton/Dialect/TritonGPU/IR/Dialect.h"
+
+using namespace mlir;
+namespace py = pybind11;
+namespace ttg = triton::gpu;
+
+struct GluonOpBuilder : public TritonOpBuilder {}; // Adds no new members.
+
+// Registers the `GluonOpBuilder` class on module `m`. The bound methods let
+// Python build TritonGPU tensor / memory-descriptor types, layout attributes,
+// and the convert_layout / local_alloc / local_store / local_load ops.
+void init_gluon_ir(py::module &&m) {
+  py::class_<GluonOpBuilder, TritonOpBuilder>(
+      m, "GluonOpBuilder", py::module_local(), py::dynamic_attr())
+      .def(py::init<MLIRContext *>())
+      // Ranked tensor type that carries `layout` as its encoding.
+      .def("get_distributed_ty",
+           [](GluonOpBuilder &self, Type &elementType,
+              std::vector<int64_t> &shape, Attribute layout) -> Type {
+             return RankedTensorType::get(shape, elementType, layout);
+           })
+      // Mutable memory-descriptor type placed in the shared memory space.
+      .def("get_shared_mem_desc_ty",
+           [](GluonOpBuilder &self, Type &elementType,
+              std::vector<int64_t> &shape, Attribute layout,
+              std::vector<int64_t> &allocShape) -> Type {
+             auto ctx = self.getContext();
+             return ttg::MemDescType::get(shape, elementType, layout,
+                                          ttg::SharedMemorySpaceAttr::get(ctx),
+                                          /*mutableMemory=*/true,
+                                          /*allocShape=*/allocShape);
+           })
+      // Blocked encoding; the three cta* vectors form its CTA layout.
+      .def("get_blocked_layout",
+           [](GluonOpBuilder &self, std::vector<unsigned> &sizePerThread,
+              std::vector<unsigned> &threadsPerWarp,
+              std::vector<unsigned> &warpsPerCta, std::vector<unsigned> &order,
+              std::vector<unsigned> &ctasPerCga,
+              std::vector<unsigned> &ctaSplitNum,
+              std::vector<unsigned> &ctaOrder) -> Attribute {
+             auto ctx = self.getContext();
+             auto ctaLayout = ttg::CTALayoutAttr::get(ctx, ctasPerCga,
+                                                      ctaSplitNum, ctaOrder);
+             return ttg::BlockedEncodingAttr::get(ctx, sizePerThread,
+                                                  threadsPerWarp, warpsPerCta,
+                                                  order, ctaLayout);
+           })
+      // Slice encoding over dimension `dim` of a distributed `parent`.
+      .def("get_slice_layout",
+           [](GluonOpBuilder &self, unsigned dim,
+              Attribute parent) -> Attribute {
+             auto ctx = self.getContext();
+             // `parent` comes from Python, so check its kind and raise a
+             // TypeError instead of relying on the assertion inside cast<>.
+             auto dist = dyn_cast<ttg::DistributedEncodingTrait>(parent);
+             if (!dist)
+               throw py::type_error(
+                   "get_slice_layout: parent must be a distributed encoding");
+             return ttg::SliceEncodingAttr::get(ctx, dim, dist);
+           })
+      // NVMMA shared encoding. Note that the attribute builder takes
+      // `transposed` before `elementBitwidth`, unlike this binding.
+      .def("get_nvmma_shared_layout",
+           [](GluonOpBuilder &self, unsigned swizzleByteWidth,
+              unsigned elementBitwidth, bool transposed, bool fp4Padded,
+              std::vector<unsigned> &ctasPerCga,
+              std::vector<unsigned> &ctaSplitNum,
+              std::vector<unsigned> &ctaOrder) -> Attribute {
+             auto ctx = self.getContext();
+             auto ctaLayout = ttg::CTALayoutAttr::get(ctx, ctasPerCga,
+                                                      ctaSplitNum, ctaOrder);
+             return ttg::NVMMASharedEncodingAttr::get(
+                 ctx, swizzleByteWidth, transposed, elementBitwidth, fp4Padded,
+                 ctaLayout);
+           })
+      .def("create_convert_layout",
+           [](GluonOpBuilder &self, Type resultTy, Value value) -> Value {
+             return self.create<ttg::ConvertLayoutOp>(resultTy, value);
+           })
+      .def("create_local_alloc",
+           [](GluonOpBuilder &self, Type resultTy, Value value) -> Value {
+             return self.create<ttg::LocalAllocOp>(resultTy, value);
+           })
+      // The op is created with (value, memDesc), the reverse of the order in
+      // which this binding receives them.
+      .def("create_local_store",
+           [](GluonOpBuilder &self, Value memDesc, Value value) {
+             self.create<ttg::LocalStoreOp>(value, memDesc);
+           })
+      .def("create_local_load",
+           [](GluonOpBuilder &self, Type resultTy, Value memDesc) -> Value {
+             return self.create<ttg::LocalLoadOp>(resultTy, memDesc);
+           });
+}