Commit ecb52e4

Merge OpenAI Triton commit 34758e4 (#4552)
This PR changes the Triton base from 0e9706c to 34758e4 (Jun 18). Pass rate: 97.12%.
2 parents 0275e97 + 5ef3686 commit ecb52e4

34 files changed (+636, -406 lines)

include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td

Lines changed: 1 addition & 1 deletion
@@ -88,7 +88,7 @@ def TTG_AsyncCopyGlobalToLocalOp : TTG_Op<"async_copy_global_to_local", [
   let description = [{
     This operation copies data from global memory to local memory asynchronously.
     This is analogue to tt.load except the data are copied to local memory pointed
-    by by the memory descriptor instead of a distributed tensor. The rest of the
+    to by the memory descriptor instead of a distributed tensor. The rest of the
     operands are the same as tt.load.
   }];

include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td

Lines changed: 9 additions & 0 deletions
@@ -262,6 +262,15 @@ def TTNG_ArriveBarrierOp : TTNG_Op<"arrive_barrier"> {
   let hasVerifier = 1;
 }
 
+def TTNG_AsyncCopyMbarrierArriveOp : TTNG_Op<"async_copy_mbarrier_arrive"> {
+  let summary = "arrive on mbarrier once all previously issued copies are completed";
+  let arguments = (ins
+    Arg<TTG_MemDescType, "", [MemWrite<SharedMemory>]>:$barrier,
+    UnitAttr:$noIncrement
+  );
+  let assemblyFormat = "$barrier attr-dict `:` qualified(type($barrier))";
+}
+
 def TTNG_AsyncTMACopyGlobalToLocalOp : TTNG_Op<"async_tma_copy_global_to_local"> {
   let summary = "copy data based on descriptor from global memory to local memory asynchronously";
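
For reference, the new op gives cp.async traffic an mbarrier-based completion path: the barrier is arrived on once every previously issued async copy has landed in shared memory. Below is a minimal Gluon-level sketch of the pattern, trimmed from the async_copy_mbarrier_kernel test added to python/test/gluon/test_core.py later in this commit; the shapes and layouts come from that test and are illustrative, not requirements of the op.

from triton.experimental import gluon
from triton.experimental.gluon import language as ttgl
from triton.experimental.gluon.language.nvidia.ampere import async_copy, mbarrier


@gluon.jit
def async_copy_mbarrier_sketch(inp, xnumel, XBLOCK: ttgl.constexpr, YBLOCK: ttgl.constexpr):
    # Stage an [XBLOCK, YBLOCK] tile of `inp` into shared memory.
    smem = ttgl.allocate_shared_memory(inp.dtype.element_ty, [XBLOCK, YBLOCK],
                                       ttgl.SwizzledSharedLayout(1, 1, 1, order=[1, 0]))
    layout: ttgl.constexpr = ttgl.BlockedLayout([1, 4], [1, 32], [4, 1], [1, 0])
    xindex = ttgl.arange(0, XBLOCK, ttgl.SliceLayout(1, layout))[:, None]
    yindex = ttgl.arange(0, YBLOCK, ttgl.SliceLayout(0, layout))[None, :]
    # Issue the asynchronous global->shared copy (ttg.async_copy_global_to_local).
    async_copy.async_copy_global_to_shared(smem, inp + xindex * YBLOCK + yindex,
                                           xindex < xnumel)
    # ttng.async_copy_mbarrier_arrive: the copy hardware arrives on the barrier once
    # the data is resident in shared memory; the kernel then waits on phase 0.
    bar = ttgl.allocate_shared_memory(ttgl.int64, [1], mbarrier.MBarrierLayout())
    mbarrier.init(bar, count=1)
    async_copy.mbarrier_arrive(bar)
    mbarrier.arrive(bar)
    mbarrier.wait(bar, 0)
    val = smem.load(layout)  # safe to read now that the copy has completed
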

include/triton/Tools/Sys/GetEnv.hpp

Lines changed: 0 additions & 3 deletions
@@ -15,7 +15,6 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
     // clang-format off
     "AMDGCN_ENABLE_DUMP",
     "AMDGCN_USE_BUFFER_OPS",
-    "DISABLE_FAST_REDUCTION",
     "DISABLE_LLVM_OPT",
     "DISABLE_MMA_V3",
     "DISABLE_MMA_V5",
@@ -30,7 +29,6 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
     "MLIR_DISABLE_MULTITHREADING",
     "TRITON_DEFAULT_FP_FUSION",
     "TRITON_DISABLE_LINE_INFO",
-    "TRITON_DISABLE_RESHAPE_ENCODING_INFERENCE",
     "TRITON_ENABLE_LLVM_DEBUG",
     "TRITON_HIP_GLOBAL_PREFETCH",
     "TRITON_HIP_LOCAL_PREFETCH",
@@ -42,7 +40,6 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
     "TRITON_OVERRIDE_ARCH",
     "USE_IR_LOC",
     "NVPTX_ENABLE_DUMP",
-    "STORE_TMEM_TO_GLOBAL_BYPASS_SMEM",
     "ALLOW_LHS_TMEM_LAYOUT_CONVERSION",
     "TRITON_F32_DEFAULT",
     "TRITON_PREFER_TMEM_16x256_LAYOUT",

lib/Conversion/TritonGPUToLLVM/AssertOpToLLVM.cpp

Lines changed: 2 additions & 2 deletions
@@ -84,9 +84,9 @@ struct AssertOpConversion : public ConvertOpToLLVMPattern<triton::AssertOp> {
     // Split a block after the call.
     Block *thenBlock = rewriter.splitBlock(ifBlock, op->getIterator());
     rewriter.setInsertionPointToEnd(ifBlock);
-    rewriter.create<cf::BranchOp>(loc, thenBlock);
+    rewriter.create<LLVM::BrOp>(loc, thenBlock);
     rewriter.setInsertionPointToEnd(prevBlock);
-    rewriter.create<cf::CondBranchOp>(loc, condition, ifBlock, thenBlock);
+    rewriter.create<LLVM::CondBrOp>(loc, condition, ifBlock, thenBlock);
     rewriter.setInsertionPointToStart(thenBlock);
   }

lib/Conversion/TritonGPUToLLVM/ReduceScanCommon.h

Lines changed: 3 additions & 3 deletions
@@ -97,12 +97,12 @@ inline SmallVector<Value> applyCombineOp(Location loc,
     thenBlockArgs.push_back(undef);
     thenBlock->addArgument(ty, loc);
   }
-  rewriter.create<cf::CondBranchOp>(loc, pred, &newCombine, combineArgs,
-                                    thenBlock, thenBlockArgs);
+  rewriter.create<LLVM::CondBrOp>(loc, pred, &newCombine, combineArgs,
+                                  thenBlock, thenBlockArgs);
 
   // Split a block after the call.
   rewriter.setInsertionPointToEnd(&newCombine);
-  rewriter.replaceOpWithNewOp<cf::BranchOp>(returnOp, thenBlock, results);
+  rewriter.replaceOpWithNewOp<LLVM::BrOp>(returnOp, results, thenBlock);
   rewriter.setInsertionPointToStart(thenBlock);
   return SmallVector<Value>(thenBlock->getArguments());
 }

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 0 additions & 7 deletions
@@ -2277,13 +2277,6 @@ struct TritonGPUInferLayoutInterface
       return success();
     }
 
-    // Feature flag to disable this routine while it's relatively new.
-    // TODO(jlebar): Remove this once we're confident in the code.
-    if (triton::tools::getBoolEnv(
-            "TRITON_DISABLE_RESHAPE_ENCODING_INFERENCE")) {
-      return failure();
-    }
-
     // Cowardly refuse to handle encodings with multiple CTAs. CTAsPerCGA
     // should be like the other fields in blocked encoding, but I'm not sure how
     // to handle CTASplitNum.

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/LoadMMASpecialization.cpp

Lines changed: 14 additions & 11 deletions
@@ -47,7 +47,7 @@ struct PipelinedLoad {
 
   SmallVector<Operation *, 1> allocOps;
   SmallVector<Operation *, 1> liveBeforeOps;
-  SmallVector<Operation *, 0> liveUntilOps;
+  SmallVector<std::pair<Operation *, bool>, 0> liveUntilOps;
   SmallVector<Operation *, 1> asyncUsers;
 };
 
@@ -252,8 +252,6 @@ LogicalResult PipelinedLoad::determineLiveRange(Block &container,
     // memory must be live until after this operation.
     Operation *lastShmemSink =
        findNearestCommonPostDominator(shmemTerminals, postDomInfo);
-    if (lastShmemSink)
-      lastShmemSink = lastShmemSink->getNextNode();
 
     // The memory only needs to be live until before the first register user.
     Operation *liveUntilReg = findNearestCommonDominator(regSink, domInfo);
@@ -262,14 +260,16 @@
 
     // The memory is live until before the first register user or after the last
    // shmem terminal, whichever is later.
-    Operation *liveUntilOp;
+    std::pair<Operation *, bool> liveUntilOp{nullptr, false};
     if (lastShmemSink && liveUntilReg) {
-      liveUntilOp = liveUntilReg->isBeforeInBlock(lastShmemSink) ? lastShmemSink
-                                                                 : liveUntilReg;
+      if (liveUntilReg->isBeforeInBlock(lastShmemSink))
+        liveUntilOp = {lastShmemSink, /*after=*/true};
+      else
+        liveUntilOp = {liveUntilReg, /*after=*/false};
     } else if (liveUntilReg) {
-      liveUntilOp = liveUntilReg;
+      liveUntilOp = {liveUntilReg, /*after=*/false};
     } else {
-      liveUntilOp = lastShmemSink;
+      liveUntilOp = {lastShmemSink, /*after=*/true};
     }
     liveUntilOps.push_back(liveUntilOp);
   }
@@ -316,7 +316,7 @@ void PipelinedLoadGroup::allocateAref(scf::ForOp &loop, int numStages) {
   for (PipelinedLoad &load : loads) {
     distinctAsyncUsers.insert(load.asyncUsers.begin(), load.asyncUsers.end());
     int numLiveUntil =
-        llvm::count_if(load.liveUntilOps, [](Operation *op) { return !!op; });
+        llvm::count_if(load.liveUntilOps, [](auto p) { return !!p.first; });
     maxLiveUntil = std::max(maxLiveUntil, numLiveUntil);
   }
   int arriveCount = distinctAsyncUsers.size() + maxLiveUntil;
@@ -390,8 +390,11 @@ LogicalResult PipelinedLoadGroup::lowerLoads(WarpSchedule &schedule,
 
     SmallVector<Operation *> liveUntilOps;
     for (PipelinedLoad &load : loads) {
-      if (Operation *liveUntilOp = load.liveUntilOps[i])
-        liveUntilOps.push_back(liveUntilOp);
+      auto [liveUntilOp, after] = load.liveUntilOps[i];
+      if (liveUntilOp) {
+        liveUntilOps.push_back(after ? liveUntilOp->getNextNode()
+                                     : liveUntilOp);
+      }
     }
     if (!liveUntilOps.empty()) {
       Operation *liveUntilOp =

python/src/gluon_ir.cc

Lines changed: 24 additions & 1 deletion
@@ -130,7 +130,7 @@ py::object layoutToGluon(Attribute layout) {
     return layouts.DistributedLinearLayout(
         ll.getBases().lookup(kReg), ll.getBases().lookup(kLane),
         ll.getBases().lookup(kWarp), ll.getBases().lookup(kBlock),
-        ll.getOutDimSizes());
+        toStdVector(ArrayRef(llvm::to_vector(ll.getOutDimSizes()))));
   } else if (auto nvmma = dyn_cast<ttg::NVMMASharedEncodingAttr>(layout)) {
     auto ctaLayout = nvmma.getCTALayout();
     return layouts.NVMMASharedLayout(
@@ -279,6 +279,29 @@ void init_gluon_ir(py::module &&m) {
                 blockTy.getShape(), blockTy.getElementType(), layout);
             return triton::TensorDescType::get(ctx, blockTyLayout, isSigned);
           })
+      .def("create_async_copy_global_to_local",
+           [](GluonOpBuilder &self, Value smem, Value pointer, Value mask,
+              tt::CacheModifier cacheModifier,
+              tt::EvictionPolicy evictionPolicy, bool isVolatile) {
+             self.create<ttg::AsyncCopyGlobalToLocalOp>(
+                 pointer, smem, mask, /*other*/ Value{}, cacheModifier,
+                 evictionPolicy, isVolatile);
+           })
+      .def("create_async_copy_mbarrier_arrive",
+           [](GluonOpBuilder &self, Value mbarrier, bool incrementCount) {
+             self.create<ttng::AsyncCopyMbarrierArriveOp>(mbarrier,
+                                                          !incrementCount);
+           })
+      .def("create_async_commit_group",
+           [](GluonOpBuilder &self) {
+             ValueRange tokens;
+             self.create<ttg::AsyncCommitGroupOp>(tokens);
+           })
+      .def("create_async_wait_group",
+           [](GluonOpBuilder &self, int num) {
+             ValueRange tokens;
+             self.create<ttg::AsyncWaitOp>(tokens, num);
+           })
       .def("create_convert_layout",
            [](GluonOpBuilder &self, Type resultTy, Value value) -> Value {
              return self.create<ttg::ConvertLayoutOp>(resultTy, value);
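
Besides the copy itself and the mbarrier arrive, the bindings above also expose the classic commit-group/wait-group completion path (create_async_commit_group and create_async_wait_group, backed by ttg::AsyncCommitGroupOp and ttg::AsyncWaitOp). The test added below only exercises the mbarrier variant; here is a sketch of how its tail would look with group-based completion instead, assuming the Gluon-side wrappers are named async_copy.commit_group and async_copy.wait_group (those wrapper names are not shown in this diff).

# smem, inp, xindex, yindex, mask and block_layout are the names defined in
# async_copy_mbarrier_kernel below; only the completion mechanism differs.
async_copy.async_copy_global_to_shared(smem, inp + xindex * YBLOCK + yindex, mask)
async_copy.commit_group()  # close the current batch of async copies (AsyncCommitGroupOp)
async_copy.wait_group(0)   # block until no copy groups remain in flight (AsyncWaitOp with num = 0)
val = smem.load(block_layout)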

python/test/gluon/test_core.py

Lines changed: 36 additions & 2 deletions
@@ -1,9 +1,10 @@
 import torch
 import pytest
 
-from triton._internal_testing import is_cuda
+from triton._internal_testing import is_ampere_or_newer, is_hopper
 from triton.experimental import gluon
 from triton.experimental.gluon import language as ttgl
+from triton.experimental.gluon.language.nvidia.ampere import async_copy, mbarrier
 from triton.experimental.gluon.language.nvidia.hopper import tma
 
 
@@ -45,7 +46,7 @@ def tma_kernel(desc):
     alloc._keep_alive()
 
 
-@pytest.mark.skipif(not is_cuda() or torch.cuda.get_device_capability()[0] < 9, reason="Requires Hopper")
+@pytest.mark.skipif(not is_hopper(), reason="Requires Hopper")
 def test_tma():
     out = torch.ones((16, 16), dtype=torch.float16, device="cuda")
     layout = ttgl.NVMMASharedLayout(
@@ -59,3 +60,36 @@ def test_tma():
     desc = gluon.nvidia.hopper.TensorDescriptor.from_tensor(out, [16, 16], layout)
     tma_kernel[(1, )](desc)
     torch.testing.assert_close(out, torch.zeros_like(out))
+
+
+@gluon.jit
+def async_copy_mbarrier_kernel(out, inp, xnumel, XBLOCK: ttgl.constexpr, YBLOCK: ttgl.constexpr):
+    smem = ttgl.allocate_shared_memory(inp.dtype.element_ty, [XBLOCK, YBLOCK],
+                                       ttgl.SwizzledSharedLayout(1, 1, 1, order=[1, 0]))
+    block_layout: ttgl.constexpr = ttgl.BlockedLayout([1, 4], [1, 32], [4, 1], [1, 0])
+    xindex = ttgl.arange(0, XBLOCK, ttgl.SliceLayout(1, block_layout))[:, None]
+    yindex = ttgl.arange(0, YBLOCK, ttgl.SliceLayout(0, block_layout))[None, :]
+    mask = xindex < xnumel
+    async_copy.async_copy_global_to_shared(
+        smem,
+        inp + xindex * YBLOCK + yindex,
+        mask,
+    )
+    mbar = ttgl.allocate_shared_memory(ttgl.int64, [1], mbarrier.MBarrierLayout())
+    mbarrier.init(mbar, count=1)
+    async_copy.mbarrier_arrive(mbar)
+    mbarrier.arrive(mbar)
+    mbarrier.wait(mbar, 0)
+
+    val = smem.load(block_layout)
+    ttgl.store(out + xindex * YBLOCK + yindex, val)
+
+
+@pytest.mark.skipif(not is_ampere_or_newer(), reason="Requires Ampere")
+def test_async_copy_mbarrier():
+    tensor_opts = dict(dtype=torch.float, device="cuda")
+    out = torch.empty((32, 32), **tensor_opts)
+    inp = torch.randn((20, 32), **tensor_opts)
+    async_copy_mbarrier_kernel[(1, )](out, inp, inp.shape[0], XBLOCK=32, YBLOCK=32)
+    torch.testing.assert_close(out[:20], inp)
+    torch.testing.assert_close(out[20:], torch.zeros((12, 32), **tensor_opts))
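
On an Ampere-or-newer CUDA GPU, the new test can be run on its own with pytest's -k filter, e.g. pytest python/test/gluon/test_core.py -k test_async_copy_mbarrier.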
