Commit 983aa34

Merge OpenAI Triton commit e7fb841 (#5577)
This PR changes the Triton base from b116579 to e7fb841 (Nov 19). Pass rate: 95.42%
2 parents ba1d008 + c23297d commit 983aa34


71 files changed: +2382 −746 lines changed


.github/workflows/integration-tests-amd.yml

Lines changed: 6 additions & 1 deletion
@@ -13,7 +13,7 @@ jobs:
   integration-tests-amd:
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 45
-    continue-on-error: ${{ matrix.runner[1] == 'gfx90a' || matrix.runner[0] == 'gfx950' }}
+    continue-on-error: ${{ matrix.runner[1] == 'gfx90a' || matrix.runner[0] == 'amd-gfx950' }}
     strategy:
       matrix:
         runner: ${{ fromJson(inputs.matrix) }}
@@ -39,6 +39,7 @@ jobs:
         --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
         --env-file /etc/podinfo/gha-gpu-isolation-settings
         --volume /home/runner/.triton:/github/home/.triton
+        --volume /triton-data:/triton-data
     env:
       RUNNER_TYPE: ${{ matrix.runner[1] }}
       TRITON_BUILD_WITH_CCACHE: "true"
@@ -104,6 +105,10 @@ jobs:
           pip uninstall -y triton pytorch-triton-rocm

           ccache --zero-stats
+          if [ "${{ matrix.runner[0] }}" = "amd-gfx950" ]; then
+            pip install --cache-dir /triton-data/pip-cache -r python/requirements.txt
+            pip install --cache-dir /triton-data/pip-cache -r python/test-requirements.txt
+          fi
           make dev-install
       - name: Print ccache stats
         run: ccache --print-stats

Makefile

Lines changed: 2 additions & 0 deletions
@@ -82,6 +82,8 @@ test-python: test-unit test-regression test-interpret test-proton

 .PHONY: test-nogpu
 test-nogpu: test-lit test-cpp
+	$(PYTEST) python/test/gluon/test_frontend.py
+	$(PYTEST) python/test/unit/language/test_frontend.py

 .PHONY: test
 test: test-lit test-cpp test-python

include/triton/Dialect/Triton/IR/Utility.h

Lines changed: 8 additions & 0 deletions
@@ -200,6 +200,14 @@ bool isHostSideDescriptor(Value v);
 bool isKernel(FunctionOpInterface funcOp);

 unsigned getBitwidth(RankedTensorType ty);
+
+// If the value "anchor" is compared against a statically-computed bound, return
+// inclusive lower and upper bounds lb <= anchor <= ub. Depending on the
+// comparison operator, one of the bounds is a computed one while the other is
+// derived from the data type of anchor.
+std::optional<ConstantIntRanges> getBoundFromCmpOp(arith::CmpIOp cmpOp,
+                                                   Value anchor);
+
 } // namespace triton
 } // namespace mlir

include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td

Lines changed: 5 additions & 1 deletion
@@ -88,6 +88,9 @@ def TTG_AsyncCopyGlobalToLocalOp : TTG_Op<"async_copy_global_to_local", [
     This is analogue to tt.load except the data are copied to local memory pointed
     to by the memory descriptor instead of a distributed tensor. The rest of the
     operands are the same as tt.load.
+    Contiguity is the maximum number of elements that can be loaded in a single
+    vector with the given layout and mask.
+    This allows the op to use async_copy_global_to_local even if the alignment cannot be proven from the IR.
   }];

   let arguments = (ins
@@ -97,7 +100,8 @@ def TTG_AsyncCopyGlobalToLocalOp : TTG_Op<"async_copy_global_to_local", [
     Optional<TT_Type>:$other,
     DefaultValuedAttr<TT_CacheModifierAttr, "triton::CacheModifier::NONE">:$cache,
     DefaultValuedAttr<TT_EvictionPolicyAttr, "triton::EvictionPolicy::NORMAL">:$evict,
-    DefaultValuedAttr<BoolAttr, "false">:$isVolatile
+    DefaultValuedAttr<BoolAttr, "false">:$isVolatile,
+    DefaultValuedAttr<I32Attr, "1">:$contiguity
   );

   let results = (outs TTG_AsyncToken:$token);
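
To make the new attribute concrete, here is a minimal C++ sketch of how a lowering might consult it when choosing a vector width. Only the generated getContiguity()/setContiguity() accessors come from this change; the helper name pickCopyVecElems and the 16-byte-per-cp.async cap are assumptions for illustration, and the snippet presumes the usual MLIR/Triton headers.

// Sketch only: a possible consumer of the new contiguity attribute.
static unsigned pickCopyVecElems(mlir::triton::gpu::AsyncCopyGlobalToLocalOp copyOp,
                                 unsigned elemBits) {
  unsigned contiguity = copyOp.getContiguity();       // defaults to 1
  unsigned maxElems = std::max(1u, 128u / elemBits);  // assumed 16-byte cap per copy
  return std::min(contiguity, maxElems);
}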

include/triton/Tools/Sys/GetEnv.hpp

Lines changed: 1 addition & 0 deletions
@@ -31,6 +31,7 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
     "MLIR_DISABLE_MULTITHREADING",
     "TRITON_DEFAULT_FP_FUSION",
     "TRITON_DISABLE_LINE_INFO",
+    "TRITON_DUMP_MIR",
     "TRITON_ENABLE_LLVM_DEBUG",
     "TRITON_HIP_USE_ASYNC_COPY",
     "TRITON_HIP_USE_BLOCK_PINGPONG",

lib/Dialect/Triton/IR/Utility.cpp

Lines changed: 78 additions & 0 deletions
@@ -128,3 +128,81 @@ unsigned tt::getBitwidth(RankedTensorType ty) {
   auto isPtr = isa<PointerType>(ty.getElementType());
   return isPtr ? kPtrBitWidth : std::max(ty.getElementTypeBitWidth(), 8u);
 }
+
+std::optional<ConstantIntRanges> tt::getBoundFromCmpOp(arith::CmpIOp cmpOp,
+                                                       Value anchor) {
+  bool isSigned = true;
+  switch (cmpOp.getPredicate()) {
+  case arith::CmpIPredicate::uge:
+  case arith::CmpIPredicate::ugt:
+  case arith::CmpIPredicate::ule:
+  case arith::CmpIPredicate::ult:
+    isSigned = false;
+  default:
+    break;
+  }
+
+  bool anchorIsLhs = cmpOp.getLhs() == anchor;
+  auto maybeConstantIntValue = getConstantIntValue(
+      getAsOpFoldResult(anchorIsLhs ? cmpOp.getRhs() : cmpOp.getLhs()));
+  if (auto constValue = maybeConstantIntValue) {
+    unsigned bitWidth = ConstantIntRanges::getStorageBitwidth(anchor.getType());
+    assert(bitWidth > 0 && "expected non-zero bitwidth");
+    APInt apVal = {bitWidth, static_cast<uint64_t>(*constValue), isSigned};
+    APInt min, max;
+    if (isSigned) {
+      min = APInt::getSignedMinValue(bitWidth);
+      if (llvm::isa_and_nonnull<mlir::triton::GetProgramIdOp,
+                                mlir::triton::GetNumProgramsOp>(
+              anchor.getDefiningOp())) {
+        min = APInt::getZero(bitWidth);
+      } else
+        min = APInt::getSignedMinValue(bitWidth);
+      max = APInt::getSignedMaxValue(bitWidth);
+    } else {
+      min = APInt::getMinValue(bitWidth);
+      max = APInt::getMaxValue(bitWidth);
+    }
+
+    switch (cmpOp.getPredicate()) {
+    case arith::CmpIPredicate::eq:
+      return mlir::ConstantIntRanges::constant(apVal);
+    case arith::CmpIPredicate::uge:
+    case arith::CmpIPredicate::sge: {
+      // K >= apVal implies K ∈ [apVal, max]
+      if (anchorIsLhs)
+        return mlir::ConstantIntRanges::range(apVal, max, isSigned);
+      // apVal >= K implies K ∈ [min, apVal]
+      return mlir::ConstantIntRanges::range(min, apVal, isSigned);
+    }
+    case arith::CmpIPredicate::ugt:
+    case arith::CmpIPredicate::sgt: {
+      // K > apVal implies K >= apVal + 1 implies K ∈ [apVal + 1, max]
+      if (anchorIsLhs)
+        return mlir::ConstantIntRanges::range(apVal + 1, max, isSigned);
+      // apVal > K implies apVal - 1 >= K implies K ∈ [min, apVal - 1]
+      return mlir::ConstantIntRanges::range(min, apVal - 1, isSigned);
+    }
+    case arith::CmpIPredicate::ule:
+    case arith::CmpIPredicate::sle: {
+      // K <= apVal implies K ∈ [min, apVal]
+      if (anchorIsLhs)
+        return mlir::ConstantIntRanges::range(min, apVal, isSigned);
+      // apVal <= K implies K ∈ [apVal, max]
+      return mlir::ConstantIntRanges::range(apVal, max, isSigned);
+    }
+    case arith::CmpIPredicate::ult:
+    case arith::CmpIPredicate::slt: {
+      // K < apVal implies K <= apVal - 1 implies K ∈ [min, apVal - 1]
+      if (anchorIsLhs)
+        return mlir::ConstantIntRanges::range(min, apVal - 1, isSigned);
+      // apVal < K implies apVal + 1 <= K implies K ∈ [apVal + 1, max]
+      return mlir::ConstantIntRanges::range(apVal + 1, max, isSigned);
+    }
+    default:
+      emitRemark(cmpOp.getLoc(), "unsupported cmp predicate for assumption");
+      return {};
+    }
+  }
+  return {};
+}
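
As a usage illustration (a sketch under assumptions, not code from this commit): a caller that holds an assumed condition such as `%cond = arith.cmpi slt, %pid, %c1024`, where %pid comes from tt.get_program_id, could query the helper as below and would get back the inclusive range 0 <= %pid <= 1023, since the lower bound falls back to zero for GetProgramIdOp anchors. The function name printAssumedBound and the surrounding control flow are hypothetical; the snippet presumes the same MLIR/Triton headers as Utility.cpp.

// Hypothetical caller of the new helper, for illustration only.
static void printAssumedBound(mlir::Value cond, mlir::Value anchor) {
  auto cmpOp = cond.getDefiningOp<mlir::arith::CmpIOp>();
  if (!cmpOp)
    return;
  if (std::optional<mlir::ConstantIntRanges> bound =
          mlir::triton::getBoundFromCmpOp(cmpOp, anchor)) {
    // Print the signed bounds derived from the comparison.
    llvm::errs() << "lb = " << bound->smin() << ", ub = " << bound->smax()
                 << "\n";
  }
}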

lib/Dialect/TritonGPU/IR/Types.cpp

Lines changed: 6 additions & 6 deletions
@@ -157,6 +157,12 @@ LogicalResult MemDescType::verify(function_ref<InFlightDiagnostic()> emitError,
              << "memorySpace must be SharedMemorySpace for shared encoding. "
              << "Got " << memorySpace;
     }
+    auto rank = cast<LayoutEncodingTrait>(enc).getRank();
+    if (!(rank == shape.size() || rank == shape.size() - 1)) {
+      return emitError() << "rank must be equal to or one less than "
+                         << "the shape size. Got " << rank << " and "
+                         << shape.size();
+    }
   } else if (auto enc = dyn_cast<nvidia_gpu::TensorMemoryScalesEncodingAttr>(
                  encoding)) {
     if (memorySpace != nvidia_gpu::TensorMemorySpaceAttr::get(ctx)) {
@@ -177,12 +183,6 @@ LogicalResult MemDescType::verify(function_ref<InFlightDiagnostic()> emitError,
   // additional rules to verify.
   if (auto enc = dyn_cast<PaddedSharedEncodingAttr>(encoding)) {
     auto rank = enc.getRank();
-
-    if (rank != shape.size() && rank != shape.size() - 1) {
-      return emitError() << "padding rank must be equal to or one less than "
-                         << "the shape size when pipelining.";
-    }
-
     // Ensure linear component's outDims match the alloc size ignoring
     // pipelining dimension
     auto outDims = standardOutDimNames(ctx, rank);
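
The relocated check accepts encodings whose rank matches the shape rank or is one less, the extra leading dimension being the multi-buffering stage. A minimal restatement of that rule, with a hypothetical helper name and hand-picked example values, not code from the verifier:

// Illustration only: the rank rule now applied to every shared encoding,
// not just PaddedSharedEncodingAttr.
static bool rankMatchesShape(unsigned encRank, size_t shapeRank) {
  return encRank == shapeRank || encRank == shapeRank - 1;
}
// rankMatchesShape(2, 3) -> true: e.g. a (numStages, M, K) pipelined buffer
//                                 carrying a 2-D shared encoding.
// rankMatchesShape(2, 2) -> true
// rankMatchesShape(2, 4) -> false: rejected by the verifier.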

lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp

Lines changed: 13 additions & 1 deletion
@@ -1,5 +1,6 @@
 #include "mlir/Support/LLVM.h"
 #include "mlir/Transforms/Passes.h"
+#include "triton/Analysis/AxisInfo.h"
 #include "triton/Analysis/Utility.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
@@ -32,7 +33,11 @@ namespace gpu {
 // global data.
 struct ClipAsyncCopySizePerThread
     : public OpRewritePattern<AsyncCopyGlobalToLocalOp> {
+  ModuleAxisInfoAnalysis &axisInfoAnalysis;
   using OpRewritePattern::OpRewritePattern;
+  ClipAsyncCopySizePerThread(ModuleAxisInfoAnalysis &axisInfoAnalysis,
+                             MLIRContext *context)
+      : OpRewritePattern(context), axisInfoAnalysis(axisInfoAnalysis) {}

   LogicalResult matchAndRewrite(AsyncCopyGlobalToLocalOp copyOp,
                                 PatternRewriter &rewriter) const override {
@@ -94,12 +99,18 @@ struct ClipAsyncCopySizePerThread
     if (other)
       other = convertBlockLayout(other, newBlockEnc);

+    unsigned contiguity = axisInfoAnalysis.getContiguity(src);
+    if (mask)
+      contiguity = std::min<unsigned>(contiguity,
+                                      axisInfoAnalysis.getMaskAlignment(mask));
+
     rewriter.modifyOpInPlace(copyOp, [&]() {
       copyOp.getSrcMutable().assign(src);
       if (mask)
         copyOp.getMaskMutable().assign(mask);
       if (other)
         copyOp.getOtherMutable().assign(other);
+      copyOp.setContiguity(contiguity);
     });

     return success();
@@ -112,10 +123,11 @@ struct CoalesceAsyncCopyPass

   void runOnOperation() override {
     ModuleOp m = getOperation();
+    triton::ModuleAxisInfoAnalysis axisInfoAnalysis(m);
     MLIRContext *context = &getContext();

     mlir::RewritePatternSet patterns(context);
-    patterns.add<ClipAsyncCopySizePerThread>(context);
+    patterns.add<ClipAsyncCopySizePerThread>(axisInfoAnalysis, context);

     if (failed(applyPatternsGreedily(m, std::move(patterns))))
       signalPassFailure();
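
The value recorded on the op is the AxisInfo contiguity of the source pointer, clamped by the mask alignment whenever a mask is present. A compact, self-contained restatement of that rule follows; the helper clampContiguity is hypothetical and is not part of the pass.

#include <algorithm>
#include <optional>

// Sketch of the rule the pattern applies before calling setContiguity().
static unsigned clampContiguity(unsigned ptrContiguity,
                                std::optional<unsigned> maskAlignment) {
  unsigned contiguity = ptrContiguity;   // AxisInfo contiguity of the src pointer
  if (maskAlignment)                     // a mask can only shorten the proven run
    contiguity = std::min(contiguity, *maskAlignment);
  return contiguity;
}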

lib/Dialect/TritonGPU/Transforms/Pipeliner/LowerLoops.cpp

Lines changed: 15 additions & 3 deletions
@@ -156,7 +156,7 @@ static Value createAlloc(scf::ForOp &forOp, Operation *loadOp,
 }

 void createAsyncCopy(scf::ForOp forOp, tt::LoadOp loadOp, Value alloc,
-                     Value insertIdx, Value extractIdx,
+                     Value insertIdx, Value extractIdx, int contiguity,
                      CoarseSchedule &schedule) {
   OpBuilderForStage builder(loadOp.getLoc(), forOp, schedule);
   Value zero = arith::ConstantIntOp::create(builder, forOp.getLoc(), 0, 32);
@@ -176,7 +176,7 @@ void createAsyncCopy(scf::ForOp forOp, tt::LoadOp loadOp, Value alloc,
   Value view = createSingleBufferView(builder, alloc, insertIdx);
   Operation *copy = ttg::AsyncCopyGlobalToLocalOp::create(
       builder, src, view, mask, other, loadOp.getCache(), loadOp.getEvict(),
-      loadOp.getIsVolatile());
+      loadOp.getIsVolatile(), contiguity);
   Operation *commit =
       ttg::AsyncCommitGroupOp::create(builder, copy->getResult(0));

@@ -274,6 +274,7 @@ void createTMAAsyncGather(scf::ForOp forOp, tt::DescriptorGatherOp gatherOp,

 struct AsyncLoad {
   int stageDiff;
+  int contiguity = 1;
   Value alloc;
   Value barrier;
   Operation *waitOp;
@@ -459,6 +460,7 @@ scf::ForOp lowerLoads(scf::ForOp forOp, CoarseSchedule &schedule,
     }
     SharedEncodingTrait sharedEncoding;
     bool canUseAsyncCp = false;
+    int contiguity = 1;
     if (!isa<RankedTensorType>(op.getResultTypes()[0])) {
       canUseAsyncCp = op.getResultTypes()[0].getIntOrFloatBitWidth() >= 32;
       sharedEncoding = ttg::SwizzledSharedEncodingAttr::get(
@@ -478,6 +480,15 @@ scf::ForOp lowerLoads(scf::ForOp forOp, CoarseSchedule &schedule,
           cast<RankedTensorType>(op.getResultTypes()[0]), sharedEncoding);

       canUseAsyncCp &= copyVecBytes >= 4;
+      if (canUseAsyncCp) {
+        auto loadOp = cast<tt::LoadOp>(op);
+        auto ptr = loadOp.getPtr();
+        unsigned vec = axisInfoAnalysis.getContiguity(ptr);
+        if (auto mask = loadOp.getMask())
+          vec = std::min<unsigned>(vec,
+                                   axisInfoAnalysis.getMaskAlignment(mask));
+        contiguity = vec;
+      }
     }
     if (canUseAsyncCp || isTMALoad(&op)) {
       if (loadRequiresAdditionalBuffer(&op)) {
@@ -486,6 +497,7 @@ scf::ForOp lowerLoads(scf::ForOp forOp, CoarseSchedule &schedule,
       }
       auto &asyncLoad = asyncLoads[&op];
       asyncLoad.stageDiff = stageDiff;
+      asyncLoad.contiguity = contiguity;
       asyncLoad.sharedEncoding = sharedEncoding;
     } else if (stageDiff > 1) {
       // Distance-1 loads can in most cases be pipelined in registers without
@@ -589,7 +601,7 @@ scf::ForOp lowerLoads(scf::ForOp forOp, CoarseSchedule &schedule,
     auto [insertIdx, extractIdx, phase, _] = loadGroups[asyncLoad.stageDiff];
     if (auto loadOp = dyn_cast<tt::LoadOp>(op)) {
      createAsyncCopy(forOp, loadOp, asyncLoad.alloc, insertIdx, extractIdx,
-                      schedule);
+                      asyncLoad.contiguity, schedule);
       hasAsyncLoads = true;
     } else if (auto loadOp = dyn_cast<tt::DescriptorLoadOp>(op)) {
       createTMAAsyncLoad(forOp, loadOp, asyncLoad.alloc, insertIdx, extractIdx,

python/src/gluon_ir.cc

Lines changed: 1 addition & 2 deletions
@@ -867,8 +867,7 @@ void init_gluon_ir(py::module &&m) {
           })
       .def("create_async_tdm_copy_global_to_local",
            [](GluonOpBuilder &self, Value descPtr, std::vector<Value> &indices,
-              Value result, Value barrier) {
-             Value pred = self.create<arith::ConstantIntOp>(1, 1);
+              Value result, Value pred, Value barrier) {
             self.create<ttag::AsyncTDMCopyGlobalToLocalOp>(
                 descPtr, indices, result, pred, barrier);
           })
