
Commit f3a0aec

Merge OpenAI Triton commit 3c2e6f8 (#5475)
This PR changes the Triton base from 9f21c06 to 3c2e6f8 (Oct 28). Pass rate: 94.91%
2 parents a9cd5c7 + 2f659b3 commit f3a0aec

177 files changed: +4397 / -3033 lines


CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -299,6 +299,7 @@ if(TRITON_BUILD_PYTHON_MODULE)
   add_library(triton SHARED ${PYTHON_SRC_PATH}/main.cc
               ${PYTHON_SRC_PATH}/ir.cc
               ${PYTHON_SRC_PATH}/gluon_ir.cc
+              ${PYTHON_SRC_PATH}/linear_layout.cc
               ${PYTHON_SRC_PATH}/passes.cc
               ${PYTHON_SRC_PATH}/interpreter.cc
               ${PYTHON_SRC_PATH}/llvm.cc

Makefile

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@ test-microbenchmark: all
 test-interpret: all
 	cd python/test/unit && TRITON_INTERPRET=1 $(PYTEST) --tb=short -s -n 16 -m interpreter cuda language/test_core.py language/test_standard.py \
 	language/test_random.py language/test_block_pointer.py language/test_subprocess.py language/test_line_info.py \
-	language/test_tuple.py runtime/test_autotuner.py::test_kwargs[False] \
+	language/test_tuple.py runtime/test_launch.py runtime/test_autotuner.py::test_kwargs[False] \
 	../../tutorials/06-fused-attention.py::test_op --device=cpu

 .PHONY: test-proton

include/triton/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVMBase.h

Lines changed: 31 additions & 2 deletions
@@ -166,11 +166,40 @@ struct ElementwiseOpConversion
                                    ConversionPatternRewriter &rewriter,
                                    Type elemTy, MultipleOperandsRange operands,
                                    Location loc) const {
-    return {rewriter.create<DestOp>(loc, elemTy, operands[0],
-                                    adaptor.getAttributes().getValue())};
+    return {DestOp::create(rewriter, loc, elemTy, operands[0],
+                           adaptor.getAttributes().getValue())};
   }
 };

+template <typename SourceOp>
+struct ElementwiseToIntrinsicOpConversion
+    : public ElementwiseOpConversionBase<
+          SourceOp, ElementwiseToIntrinsicOpConversion<SourceOp>> {
+  using Base =
+      ElementwiseOpConversionBase<SourceOp, ElementwiseToIntrinsicOpConversion>;
+  using OpAdaptor = typename Base::OpAdaptor;
+
+  using Base::Base;
+
+  explicit ElementwiseToIntrinsicOpConversion(
+      LLVMTypeConverter &typeConverter,
+      ModuleAxisInfoAnalysis &axisAnalysisPass, StringRef intrinsic,
+      PatternBenefit benefit = patternBenefitDefault)
+      : Base(typeConverter, axisAnalysisPass, benefit), intrinsic(intrinsic) {}
+
+  SmallVector<Value> createDestOps(SourceOp op, OpAdaptor adaptor,
+                                   ConversionPatternRewriter &rewriter,
+                                   Type elemTy, MultipleOperandsRange operands,
+                                   Location loc) const {
+    return {LLVM::createLLVMIntrinsicCallOp(rewriter, loc, intrinsic, elemTy,
+                                            operands[0])
+                .getResult(0)};
+  }
+
+private:
+  StringRef intrinsic;
+};
+
 } // namespace gpu

 } // namespace mlir::triton
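The new ElementwiseToIntrinsicOpConversion lowers an elementwise op directly to a named LLVM intrinsic call, with the intrinsic name taken as a constructor argument. A minimal registration sketch, assuming it is added from one of the existing populate* functions where patterns, typeConverter, axisInfoAnalysis, and benefit are already in scope; the source op and intrinsic name below are illustrative stand-ins, not taken from this commit:

// Sketch only: registering the new pattern for a hypothetical source op.
// math::ExpOp and the NVVM intrinsic name are illustrative, not from this diff.
patterns.add<ElementwiseToIntrinsicOpConversion<math::ExpOp>>(
    typeConverter, axisInfoAnalysis, /*intrinsic=*/"llvm.nvvm.ex2.approx.f",
    benefit);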

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 79 additions & 78 deletions
Large diffs are not rendered by default.

include/triton/Dialect/TritonGPU/IR/Dialect.h

Lines changed: 15 additions & 0 deletions
@@ -45,6 +45,11 @@ constexpr static char AttrNumWarpsName[] = "ttg.num-warps";
 constexpr static char AttrNumCTAsName[] = "ttg.num-ctas";
 constexpr static char AttrTargetName[] = "ttg.target";
 constexpr static char AttrNumThreadsPerWarp[] = "ttg.threads-per-warp";
+// FIXME: rename to match above
+constexpr static char kPartitionAttrName[] = "ttg.partition";
+constexpr static char kPartitionOutputsAttrName[] = "ttg.partition.outputs";
+constexpr static char kPartitionStagesAttrName[] = "ttg.partition.stages";
+constexpr static char kWarpSpecializeTagAttrName[] = "ttg.warp_specialize.tag";

 // Find the contextual number of warps on which this operation is executed.
 int lookupNumWarps(Operation *op);
@@ -266,6 +271,12 @@ void dumpHWLayout(RankedTensorType tensorType);
 // Return a string representation of the layout of the tensor.
 std::string getLayoutStr(RankedTensorType tensorType, bool useHWPointOfView);

+// Return a string representation of the shared layout of the tensor.
+std::string getSharedLayoutStr(LinearLayout &ll, bool useHWPointOfView);
+
+// Return a string representation of the distributed layout of the tensor.
+std::string getDistributedLayoutStr(LinearLayout &ll, bool useHWPointOfView);
+
 template <typename T>
 llvm::SmallVector<T> expandMatrixShapeWithBatch(llvm::ArrayRef<T> s);
@@ -287,6 +298,10 @@ LogicalResult verifyMemoryOpTypes(Operation *op, ShapedType srcTy,
                                   ShapedType dstTy);
 // Verify a memory allocation operation.
 LogicalResult verifyAllocOp(Operation *op, Value src, MemDescType dstTy);
+
+std::optional<SetVector<int>> getPartitionIds(Operation *op);
+std::optional<int> getNumOutputPartitionIds(Operation *op);
+std::optional<SetVector<int>> getOutputPartitionIds(Operation *op, int idx);
 } // namespace mlir::triton::gpu

 #endif // TRITON_DIALECT_TRITONGPU_IR_DIALECT_H_
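The partition attribute names now sit beside the other ttg.* attribute constants, and getPartitionIds and friends are declared at dialect scope. A minimal usage sketch, assuming an Operation *op carrying the ttg.partition attribute; the helper implementations are presumably defined elsewhere among the 177 changed files, not in this header:

// Sketch only: query an op's partition ids through the newly declared helper.
// Assumes `op` is an Operation * inside a warp-specialized region.
if (std::optional<SetVector<int>> ids = getPartitionIds(op)) {
  for (int id : *ids)
    llvm::dbgs() << "op assigned to partition " << id << "\n";
}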

include/triton/Dialect/TritonGPU/Transforms/Partition.h

Lines changed: 4 additions & 7 deletions
@@ -16,11 +16,6 @@ class ForOp;
 } // namespace scf
 } // namespace mlir

-static constexpr char kPartitionAttrName[] = "ttg.partition";
-static constexpr char kPartitionOutputsAttrName[] = "ttg.partition.outputs";
-static constexpr char kPartitionStagesAttrName[] = "ttg.partition.stages";
-static constexpr char kWarpSpecializeTagAttrName[] = "ttg.warp_specialize.tag";
-
 //===----------------------------------------------------------------------===//
 // PartitionSet
 //===----------------------------------------------------------------------===//
@@ -40,6 +35,7 @@ class Partition {
   ArrayRef<Operation *> getOps() const { return ops; }
   void addOp(Operation *op) { ops.push_back(op); }
   bool hasOp(Operation *op) const;
+  bool empty() const { return ops.empty(); }

   // Iterate the inputs of the partition. Input values are those that originate
   // from a different partition or a previous iteration of the current
@@ -127,8 +123,9 @@ void setPartition(Operation *op, const SetVector<Partition *> &partitions);
 // which does not work with Partition instances and iterate* functions, since
 // it does not keep the op attributes and the op list of a partition in sync.
 void setPartition(Operation *op, const SetVector<int> &partitionIds);
-
-std::optional<SetVector<int>> getPartitionIds(Operation *op);
+void setPartitionOutputs(Operation *op,
+                         ArrayRef<SetVector<int>> partitionOutputsIds);
+SmallVector<SetVector<int>, 4> getPartitionOutputs(Operation *op);

 } // namespace mlir::triton::gpu
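The per-op getPartitionIds declaration moves to Dialect.h (see above), while this header gains a per-result interface. A minimal sketch of the new functions, assuming an op whose first result is consumed by partition 0 (illustrative values only; the attribute encoding is handled by the helpers):

// Sketch only: record which partitions consume each result, then read it back.
SmallVector<SetVector<int>, 4> outputs(op->getNumResults());
outputs[0].insert(0); // result #0 feeds partition 0
setPartitionOutputs(op, outputs);
SmallVector<SetVector<int>, 4> consumers = getPartitionOutputs(op);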

include/triton/Dialect/TritonGPU/Transforms/PartitionBuilder.h

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ template <typename OpT, typename... Args>
 OpT createInto(OpBuilder &b, Location loc,
                std::optional<SetVector<int>> partitionSet,
                StageCluster stageCluster, Args &&...args) {
-  auto op = b.create<OpT>(loc, std::forward<Args>(args)...);
+  auto op = OpT::create(b, loc, std::forward<Args>(args)...);
   if (partitionSet) {
     setPartition(op, *partitionSet);
     setStageCluster(b, op, stageCluster);
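This is the same OpBuilder-to-static-create migration applied in ElementwiseOpToLLVMBase.h and Membar.cpp in this commit; the caller-facing createInto interface is unchanged. A usage sketch, assuming an arith::AddIOp payload with lhs and rhs values already in scope (illustrative, not from this diff):

// Sketch only: build an op into a partition set and stage cluster as before.
auto add = createInto<arith::AddIOp>(b, loc, partitionIds, stageCluster,
                                     lhs, rhs);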

include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h

Lines changed: 0 additions & 7 deletions
@@ -184,13 +184,6 @@ getLastUseOfPipelinedOp(ArrayRef<Operation *> ops, scf::ForOp forOp,

 // Clean up attributes passing over schedules across stages in pipelining
 void removePipeliningAttributes(ModuleOp moduleOp);
-
-// For LoadOp, DescriptorLoad, and DescriptorGather ops, determine if
-// they should be pipelined.
-bool isPipeliningBeneficial(Operation *op,
-                            triton::ModuleAxisInfoAnalysis &axisInfoAnalysis,
-                            bool filterSmall = true);
-
 } // namespace triton
 } // namespace mlir
include/triton/Tools/LinearLayout.h

Lines changed: 2 additions & 0 deletions
@@ -869,6 +869,8 @@ inline std::ostream &operator<<(std::ostream &os, const ColumnAction &action) {
   return os;
 }

+std::unique_ptr<uint64_t[]> getMatrix(const LinearLayout &layout);
+
 } // namespace mlir::triton

 #endif // TRITON_TOOLS_LINEARLAYOUT_H
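getMatrix is now exposed in the public header, presumably so the new python/src/linear_layout.cc source added in CMakeLists.txt can reach it (an assumption; that binding code is not shown in this view). A trivial call sketch, assuming ll is an existing LinearLayout:

// Sketch only: obtain the bit-packed matrix backing a linear layout
// (exact packing is defined by the implementation, not shown here).
std::unique_ptr<uint64_t[]> mat = getMatrix(ll);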

lib/Analysis/Membar.cpp

Lines changed: 1 addition & 1 deletion
@@ -159,7 +159,7 @@ void MembarOrFenceAnalysis::visitTerminator(

 void MembarAnalysis::insertBarrier(Operation *op, OpBuilder *builder) {
   OpBuilder::InsertionGuard g(*builder);
-  auto barrierOp = builder->create<triton::gpu::LocalBarrierOp>(op->getLoc());
+  auto barrierOp = triton::gpu::LocalBarrierOp::create(*builder, op->getLoc());
 }

 void MembarAnalysis::update(Operation *op, BlockInfo *blockInfo,
