
Commit fdefe85

Merge commit 'd57cbee8633eaa8f691d87503670b00562d21c5d'
2 parents: 9d7bc59 + d57cbee


61 files changed: +4262 -3625 lines


.github/workflows/integration-tests-nvidia.yml

Lines changed: 1 addition & 1 deletion
@@ -88,7 +88,7 @@ jobs:
         if [ "${{ matrix.runner[0] }}" == "nvidia-gb200" ]; then
           source /venv/bin/activate
         fi
-        make test-unit
+        make NUM_PROCS=24 test-unit
       - name: Run interpreter tests
         if: ${{ matrix.runner[0] == 'nvidia-h100' }}
         run: make test-interpret

CMakeLists.txt

Lines changed: 14 additions & 0 deletions
@@ -321,6 +321,20 @@ if(TRITON_BUILD_PYTHON_MODULE)
       target_link_libraries(triton PRIVATE z)
     endif()
     target_link_options(triton PRIVATE ${LLVM_LDFLAGS})
+
+    if (NOT DEFINED LLVM_SYSPATH)
+      message(FATAL_ERROR "LLVM_SYSPATH must be set.")
+    endif()
+
+    if (NOT DEFINED TRITON_WHEEL_DIR)
+      message(FATAL_ERROR "TRITON_WHEEL_DIR must be set.")
+    endif()
+
+    configure_file(
+        "${LLVM_SYSPATH}/bin/FileCheck"
+        "${TRITON_WHEEL_DIR}/FileCheck"
+        COPYONLY)
 endif()
 
 if (UNIX AND NOT APPLE)

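Both variables are now hard requirements when building the Python module: configuring without them fails with the FATAL_ERROR messages added above, and FileCheck is copied from the LLVM install into the wheel staging directory at configure time. A minimal sketch of a manual configure with the two variables set (the paths are placeholders, not from this commit; in the normal wheel build these are presumably supplied by the Python build scripts):

cmake -S . -B build \
  -DTRITON_BUILD_PYTHON_MODULE=ON \
  -DLLVM_SYSPATH=/opt/llvm-install \
  -DTRITON_WHEEL_DIR=/tmp/triton-wheel
# configure_file(... COPYONLY) then places /opt/llvm-install/bin/FileCheck
# at /tmp/triton-wheel/FileCheck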
Makefile

Lines changed: 7 additions & 6 deletions
@@ -7,6 +7,7 @@ BUILD_DIR := $(shell cd python; $(PYTHON) -c 'from build_helpers import get_cmak
 TRITON_OPT := $(BUILD_DIR)/bin/triton-opt
 PYTEST := $(PYTHON) -m pytest
 LLVM_BUILD_PATH ?= ".llvm-project/build"
+NUM_PROCS ?= 8
 
 # Incremental builds
 
@@ -30,25 +31,25 @@ test-cpp:
 
 .PHONY: test-unit
 test-unit: all
-	cd python/test/unit && $(PYTEST) -s -n 8 --ignore=language/test_line_info.py \
+	cd python/test/unit && $(PYTEST) -s -n $(NUM_PROCS) --ignore=language/test_line_info.py \
 		--ignore=language/test_subprocess.py --ignore=test_debug.py
-	$(PYTEST) -s -n 8 python/test/unit/language/test_subprocess.py
-	$(PYTEST) -s -n 8 python/test/unit/test_debug.py --forked
+	$(PYTEST) -s -n $(NUM_PROCS) python/test/unit/language/test_subprocess.py
+	$(PYTEST) -s -n $(NUM_PROCS) python/test/unit/test_debug.py --forked
 	$(PYTEST) -s -n 8 python/triton_kernels/tests/
 	TRITON_DISABLE_LINE_INFO=0 $(PYTEST) -s python/test/unit/language/test_line_info.py
 	# Run attention separately to avoid out of gpu memory
 	$(PYTEST) -vs python/tutorials/06-fused-attention.py
 	TRITON_ALWAYS_COMPILE=1 TRITON_DISABLE_LINE_INFO=0 LLVM_PASS_PLUGIN_PATH=python/triton/instrumentation/libGPUInstrumentationTestLib.so \
 		$(PYTEST) --capture=tee-sys -rfs -vvv python/test/unit/instrumentation/test_gpuhello.py
-	$(PYTEST) -s -n 8 python/test/gluon
+	$(PYTEST) -s -n $(NUM_PROCS) python/test/gluon
 
 .PHONY: test-gluon
 test-gluon: all
-	$(PYTEST) -s -n 8 python/test/gluon
+	$(PYTEST) -s -n $(NUM_PROCS) python/test/gluon
 
 .PHONY: test-regression
 test-regression: all
-	$(PYTEST) -s -n 8 python/test/regression
+	$(PYTEST) -s -n $(NUM_PROCS) python/test/regression
 
 .PHONY: test-interpret
 test-interpret: all

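With the worker count now behind NUM_PROCS (default 8), the pytest parallelism can be overridden per invocation instead of being hard-wired, which is what the nvidia-gb200 workflow change above relies on. For example:

make test-unit                 # uses the default of 8 pytest workers
make NUM_PROCS=24 test-unit    # matches the CI invocation for gb200 runners

Note that the python/triton_kernels/tests/ line in test-unit still hardcodes -n 8 in this commit.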
include/triton/Dialect/TritonGPU/Transforms/Utility.h

Lines changed: 7 additions & 0 deletions
@@ -263,6 +263,13 @@ void replaceUsesWithLocalLoad(
     OpBuilder &builder, OpResult old,
     TypedValue<triton::gpu::MemDescType> alloc,
     TypedValue<triton::gpu::AsyncTokenType> token = {});
+
+// Return true if the value comes from a load or a block argument.
+// This will skip convert layouts and memdesc views.
+// This is a helper useful to know if value is likely to come from shared memory
+// after converting loads into async loads.
+bool comesFromLoadOrBlockArg(Value v);
+
 } // namespace mlir::triton
 
 #endif // TRITON_DIALECT_TRITONGPU_TRANSFORMS_UTILITY_H_

lib/Dialect/Triton/IR/Ops.cpp

Lines changed: 2 additions & 2 deletions
@@ -368,8 +368,8 @@ OpFoldResult MakeRangeOp::fold(FoldAdaptor adaptor) {
 LogicalResult MakeRangeOp::verify() {
   int64_t start = getStartAttr().getInt();
   int64_t end = getEndAttr().getInt();
-  if (start > end) {
-    return this->emitOpError() << "start must be less than or equal to end";
+  if (start >= end) {
+    return this->emitOpError() << "start must be less than end";
   }
   auto ty = getType();
   if (ty.getShape().size() != 1) {

lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp

Lines changed: 0 additions & 23 deletions
@@ -322,29 +322,6 @@ class BlockedToMMA : public mlir::OpRewritePattern<DotOp> {
     if (!(versionMajor >= 1 && versionMajor <= 3))
       return failure();
 
-    // If both of the operands are not loads, we fallback to MMAv2
-    // otherwise the reg-smem roundtrip will tank the MMAv3 performance
-    auto comesFromLoadOrBlockArg = [](Value v) -> bool {
-      // Peel out the original cvt dot_op<..., #blocked>
-      // and any other potential cvt/trans ops
-      while (true) {
-        if (auto cvtOp = v.getDefiningOp<ConvertLayoutOp>()) {
-          v = cvtOp.getSrc();
-          continue;
-        }
-        if (auto transOp = v.getDefiningOp<TransOp>()) {
-          v = transOp.getSrc();
-          continue;
-        }
-        break;
-      }
-      // We also accept block arguments as they appear in many MLIR tests
-      // If this is problematic we can totally drop them
-      return isa<BlockArgument>(v) ||
-             (v.getDefiningOp() &&
-              isa<LoadOp, DescriptorLoadOp>(v.getDefiningOp()));
-    };
-
     bool aFromLoad = comesFromLoadOrBlockArg(dotOp.getA());
     bool bFromLoad = comesFromLoadOrBlockArg(dotOp.getB());
     auto origDotOp = dotOp;

lib/Dialect/TritonGPU/Transforms/Utility.cpp

Lines changed: 25 additions & 0 deletions
@@ -1554,4 +1554,29 @@ void replaceUsesWithLocalLoad(OpBuilder &builder, OpResult old,
     alloc.erase();
   }
 }
+
+bool comesFromLoadOrBlockArg(Value v) {
+  // Peel out the original cvt dot_op<..., #blocked>
+  // and any other potential cvt/trans ops
+  while (true) {
+    Operation *def = v.getDefiningOp();
+    if (!def)
+      break;
+    if (auto cvtOp = dyn_cast<ttg::ConvertLayoutOp>(def)) {
+      v = cvtOp.getSrc();
+      continue;
+    }
+    if (def->hasTrait<OpTrait::MemDescViewTrait>()) {
+      v = def->getOperand(0);
+      continue;
+    }
+    break;
+  }
+  // We also accept block arguments as they appear in many MLIR tests
+  // If this is problematic we can totally drop them
+  return isa<BlockArgument>(v) ||
+         (v.getDefiningOp() &&
+          isa<LoadOp, DescriptorLoadOp, DescriptorGatherOp>(v.getDefiningOp()));
+}
+
 } // namespace mlir::triton

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/PartitionScheduling.cpp

Lines changed: 13 additions & 6 deletions
@@ -226,11 +226,17 @@ static std::optional<WarpSchedule> getInitialSchedule(scf::ForOp loop) {
     return std::nullopt;
 
   // Propagate defs of exp.
-  for (auto expOp : loop.getOps<math::Exp2Op>()) {
-    auto tensorTy = dyn_cast<RankedTensorType>(expOp.getType());
-    if (tensorTy && tensorTy.getNumElements() > 256) {
-      schedule.trySchedule(defaultPartition, expOp);
-      scheduleDependencies(loop, schedule, defaultPartition, expOp);
+  for (Operation &op : loop.getOps()) {
+    if (!isa<math::Exp2Op, ElementwiseInlineAsmOp>(op))
+      continue;
+    int elementCount = 0;
+    for (Type type : op.getResultTypes()) {
+      if (auto tensorTy = dyn_cast<RankedTensorType>(type))
+        elementCount += tensorTy.getNumElements();
+    }
+    if (elementCount > 256) {
+      schedule.trySchedule(defaultPartition, &op);
+      scheduleDependencies(loop, schedule, defaultPartition, &op);
     }
   }
 
@@ -242,7 +248,8 @@ static std::optional<WarpSchedule> getInitialSchedule(scf::ForOp loop) {
   while (userPartitions.size() < mmas.size()) {
     userPartitions.push_back(schedule.addPartition(userPartitions.size()));
   }
-  for (auto [mmaOp, userPartition] : llvm::zip(mmas, userPartitions)) {
+  for (auto [mmaOp, userPartition] :
+       llvm::reverse(llvm::zip(mmas, userPartitions))) {
     scheduleUsers(loop, schedule, userPartition, mmaOp);
   }

lib/Dialect/TritonNvidiaGPU/Transforms/PromoteLHSToTMem.cpp

Lines changed: 2 additions & 1 deletion
@@ -69,7 +69,8 @@ template <class MMAOpTy> class LHSToTMem : public OpRewritePattern<MMAOpTy> {
         isDistributedLayoutTMemCompatible(tcGen5MMAOp, srcType, lhsMemDescType);
     Attribute newLayout = srcLayout;
     if (!layoutTmemCompatible) {
-      if (triton::tools::getBoolEnv("ALLOW_LHS_TMEM_LAYOUT_CONVERSION")) {
+      if (!comesFromLoadOrBlockArg(src) ||
+          triton::tools::getBoolEnv("ALLOW_LHS_TMEM_LAYOUT_CONVERSION")) {
         newLayout = getLHSTMemLayout(tcGen5MMAOp, srcType);
       } else {
         return failure();

python/src/gluon_ir.cc

Lines changed: 10 additions & 1 deletion
@@ -11,6 +11,7 @@
 
 using namespace mlir;
 namespace py = pybind11;
+namespace tt = triton;
 namespace ttg = triton::gpu;
 namespace ttng = triton::nvidia_gpu;
 
@@ -298,7 +299,15 @@ void init_gluon_ir(py::module &&m) {
             self.create<ttng::AsyncTMAScatterOp>(descPtr, xOffsets, yOffset,
                                                  src);
           })
-
+      .def("create_broadcast",
+           [](TritonOpBuilder &self, Value &arg, Type retTy) -> Value {
+             return self.create<tt::BroadcastOp>(retTy, arg);
+           })
+      .def(
+          "create_expand_dims",
+          [](TritonOpBuilder &self, Value &arg, int axis, Type retTy) -> Value {
+            return self.create<tt::ExpandDimsOp>(retTy, arg, axis);
+          })
       .def("create_warp_return",
            [](GluonOpBuilder &self) -> Operation * {
             return self.create<ttg::WarpReturnOp>();
