Commit 6e2fff0

Merge commit '1e0e51c4aeb3e1beea000da5d0e494f8b9ac40dd'
2 parents: 79015d5 + 1e0e51c

56 files changed: +826 additions, −679 deletions


Makefile

Lines changed: 2 additions & 2 deletions
@@ -45,14 +45,14 @@ test-regression: all

 .PHONY: test-interpret
 test-interpret: all
-        cd python/test/unit && TRITON_INTERPRET=1 $(PYTEST) -s -n 16 -m interpreter language/test_core.py language/test_standard.py \
+        cd python/test/unit && TRITON_INTERPRET=1 $(PYTEST) -s -n 16 -m interpreter cuda language/test_core.py language/test_standard.py \
         language/test_random.py language/test_block_pointer.py language/test_subprocess.py language/test_line_info.py \
         runtime/test_autotuner.py::test_kwargs[False] \
         ../../tutorials/06-fused-attention.py::test_op --device=cpu

 .PHONY: test-proton
 test-proton: all
-        $(PYTEST) -s third_party/proton/test
+        $(PYTEST) -s -n 8 third_party/proton/test

 .PHONY: test-python
 test-python: test-unit test-regression test-interpret test-proton
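
For local runs these targets are invoked as make test-interpret and make test-proton: the interpreter suite now also collects the cuda tests alongside the language tests, and the proton suite gains worker parallelism via -n 8 (pytest's worker-count flag from pytest-xdist).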

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 3 additions & 1 deletion
@@ -581,6 +581,7 @@ def TT_TransOp : TT_Op<"trans", [Pure,
   let assemblyFormat = "$src attr-dict `:` type($src) `->` type($result)";

   let hasFolder = 1;
+  let hasVerifier = 1;
 }

 //
@@ -830,7 +831,8 @@ def TT_MakeRangeOp : TT_Op<"make_range", [Pure]> {
 def TT_ElementwiseInlineAsmOp : TT_Op<"elementwise_inline_asm", [
     Elementwise,
     SameOperandsAndResultEncoding,
-    DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
+    DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
+    DeclareOpInterfaceMethods<ConditionallySpeculatable>
 ]> {
   let summary = "inline assembly applying an elementwise operation to a group of packed elements.";
   let description = [{
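
ConditionallySpeculatable is the upstream MLIR interface that transforms such as loop-invariant code motion consult before hoisting an op out of a loop. The matching getSpeculatability() implementation lands in the lib/Dialect/Triton/IR/Ops.cpp hunk below, where only inline asm marked pure is treated as speculatable.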

include/triton/Dialect/TritonGPU/IR/Dialect.h

Lines changed: 1 addition & 2 deletions
@@ -14,8 +14,7 @@
 #include <unordered_map>

 // LinearLayoutCache Utils
-using CacheKey =
-    std::tuple<std::vector<int64_t>, mlir::Attribute, std::optional<int32_t>>;
+using CacheKey = std::tuple<std::vector<int64_t>, mlir::Attribute>;

 namespace llvm {
 template <typename T> size_t hash_value(const std::vector<T> &vec) {
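
Dropping the optional bit width means layouts are now memoized purely on (shape, encoding). A standalone sketch of that compute-on-miss pattern, with std::string standing in for mlir::Attribute and LinearLayout so it compiles outside MLIR; the getOrCompute interface here is an assumption for illustration, not part of this commit:

    #include <cstdint>
    #include <map>
    #include <string>
    #include <tuple>
    #include <vector>

    using FakeAttribute = std::string;    // stand-in for mlir::Attribute
    using FakeLinearLayout = std::string; // stand-in for LinearLayout

    // Mirrors the simplified key: shape + encoding, no element bit width.
    using CacheKey = std::tuple<std::vector<int64_t>, FakeAttribute>;

    class LinearLayoutCacheSketch {
      std::map<CacheKey, FakeLinearLayout> cache;

    public:
      // Look up a layout, computing and storing it on the first miss. Two
      // tensors with the same shape and encoding now share one cache entry
      // regardless of their element type.
      template <typename ComputeFn>
      const FakeLinearLayout &getOrCompute(const CacheKey &key,
                                           ComputeFn compute) {
        auto it = cache.find(key);
        if (it == cache.end())
          it = cache.emplace(key, compute()).first;
        return it->second;
      }
    };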

include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h

Lines changed: 1 addition & 3 deletions
@@ -41,8 +41,7 @@ class NVMMASharedEncodingAttr;
 // shared layouts with nvmma_shared layout) but is otherwise unused.
 //
 // Returns std::nullopt if the given layout can't be converted to an LL.
-LinearLayout toLinearLayout(ArrayRef<int64_t> shape, Attribute layout,
-                            std::optional<int32_t> elemBitWidth = std::nullopt);
+LinearLayout toLinearLayout(ArrayRef<int64_t> shape, Attribute layout);

 // Convert the shared encoding of a tensor with `nvmma_shared` layout to a
 // LinearLayout that maps from a linear shared memory offset to tensor index.
@@ -51,7 +50,6 @@ LinearLayout toLinearLayout(ArrayRef<int64_t> shape, Attribute layout,
 // swizzling.
 LinearLayout sharedToLinearLayoutLeadingOffset(ArrayRef<int64_t> shape,
                                                NVMMASharedEncodingAttr shared,
-                                               int32_t elemBitWidth,
                                                bool disableSwizzle = false);

 // Given a linear layout where the input dimensions contain a "block" dimension,
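
The caller-side effect of this signature change is visible in the lib/Conversion/TritonGPUToLLVM/Utility.cpp hunks later in this commit; the before/after shape of a call site, quoted from there rather than written fresh:

    // Before: every caller threaded the element bit width through.
    LinearLayout sharedLayout = triton::gpu::toLinearLayout(
        shape, sharedTy.getEncoding(), elemLlvmTy.getIntOrFloatBitWidth());

    // After: the bit width travels with the encoding itself (see the
    // NVMMASharedEncodingAttr change below), so the call shrinks to:
    LinearLayout sharedLayout =
        triton::gpu::toLinearLayout(shape, sharedTy.getEncoding());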

include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td

Lines changed: 3 additions & 2 deletions
@@ -423,6 +423,7 @@ def NVMMASharedEncodingAttr :
     ins
     "unsigned":$swizzlingByteWidth,
     "bool":$transposed,
+    "unsigned":$elementBitWidth,
     "CTALayoutAttr":$CTALayout
   );

@@ -433,7 +434,7 @@ def NVMMASharedEncodingAttr :
                    "Type":$eltTy), [{
     auto shapePerCTA = getShapePerCTA(CTALayout.getCTASplitNum(), shape);
     int32_t swizzlingByteWidth = 0;
-    int32_t eleBitWidth = eltTy.getIntOrFloatBitWidth();
+    unsigned eleBitWidth = eltTy.getIntOrFloatBitWidth();

     // get proper shared memory swizzling mode from the contiguous dimension
     // size of the origin blocked layout.
@@ -448,7 +449,7 @@ def NVMMASharedEncodingAttr :
       llvm_unreachable("unsupported shared memory layout for MMAv3");
     }
     bool transposed = order[0] == 0;
-    return $_get(context, swizzlingByteWidth, transposed, CTALayout);
+    return $_get(context, swizzlingByteWidth, transposed, eleBitWidth, CTALayout);
   }]>
 ];
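
With the new ins parameter, the generated builders take the bit width between transposed and the CTA layout, as the updated $_get call above shows. A minimal sketch of a direct construction under that parameter order (the variable names are illustrative):

    auto enc = NVMMASharedEncodingAttr::get(
        ctx, /*swizzlingByteWidth=*/128, /*transposed=*/false,
        /*elementBitWidth=*/eltTy.getIntOrFloatBitWidth(), ctaLayout);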

include/triton/Dialect/TritonGPU/IR/TritonGPUDialect.td

Lines changed: 1 addition & 2 deletions
@@ -44,8 +44,7 @@ def TritonGPU_Dialect : Dialect {
       return cast<IntegerAttr>(threadsPerWarp).getInt();
     }

-    LinearLayout toLinearLayout(ArrayRef<int64_t> shape, Attribute layout,
-                                std::optional<int32_t> elemBitWidth);
+    LinearLayout toLinearLayout(ArrayRef<int64_t> shape, Attribute layout);

   private:
     LinearLayoutCache llCache;
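
The conversion lives on the dialect so results can be memoized in llCache. A hedged usage sketch; the getLoadedDialect lookup is standard MLIR, but this exact call site is illustrative rather than taken from the commit:

    // Illustrative only: ask the loaded TritonGPU dialect for the cached
    // linear layout of a tensor's shape and encoding.
    auto *dialect =
        ctx->getLoadedDialect<mlir::triton::gpu::TritonGPUDialect>();
    LinearLayout ll = dialect->toLinearLayout(tensorTy.getShape(),
                                              tensorTy.getEncoding());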

include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td

Lines changed: 1 addition & 1 deletion
@@ -225,7 +225,7 @@ def TTG_MemDescSubviewOp : TTG_Op<"memdesc_subview", [Pure]> {

 def TTG_MemDescTransOp : TTG_Op<"memdesc_trans", [Pure,
                                                   TransposeOpInterface,
-                                                  DeclareOpInterfaceMethods<InferTypeOpInterface>,
+                                                  InferTypeOpWithLayoutEquivalence,
                                                   SameOperandsAndResultElementType]> {
   let summary = "transpose the descriptor";

lib/Conversion/TritonGPUToLLVM/Utility.cpp

Lines changed: 5 additions & 7 deletions
@@ -317,8 +317,8 @@ bool emitTransferBetweenRegistersAndShared(
   StringAttr kWarp = str_attr("warp");

   auto shape = sharedTy.getShape();
-  LinearLayout sharedLayout = triton::gpu::toLinearLayout(
-      shape, sharedTy.getEncoding(), elemLlvmTy.getIntOrFloatBitWidth());
+  LinearLayout sharedLayout =
+      triton::gpu::toLinearLayout(shape, sharedTy.getEncoding());
   LinearLayout regToSharedLayout = regLayout.invertAndCompose(sharedLayout);

   // TODO(jlebar): We don't currently support loading from shared memory in a
@@ -363,8 +363,7 @@ bool emitTransferBetweenRegistersAndShared(
   auto allocShape = sharedTy.getAllocShape();
   LinearLayout invertAllocSharedLayout =
       triton::gpu::toLinearLayout(allocShape.take_back(sharedTy.getRank()),
-                                  sharedTy.getEncoding(),
-                                  elemLlvmTy.getIntOrFloatBitWidth())
+                                  sharedTy.getEncoding())
           .pseudoinvert();

   int numElems = regToSharedLayout.getInDimSize(kRegister);
@@ -386,9 +385,8 @@ bool emitTransferBetweenRegistersAndShared(
     const SharedMemoryObject &smemObj, Location loc, RewriterBase &rewriter,
     const TargetInfoBase &target,
     std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback) {
-  auto regLayout = triton::gpu::toLinearLayout(
-      registerTy.getShape(), registerTy.getEncoding(),
-      elemLlvmTy.getIntOrFloatBitWidth());
+  auto regLayout = triton::gpu::toLinearLayout(registerTy.getShape(),
+                                               registerTy.getEncoding());
   return emitTransferBetweenRegistersAndShared(
       regLayout, sharedTy, elemLlvmTy, maxVecElems, smemObj, loc, rewriter,
       target, perVectorCallback);

lib/Dialect/Triton/IR/Ops.cpp

Lines changed: 23 additions & 0 deletions
@@ -209,6 +209,23 @@ OpFoldResult TransOp::fold(FoldAdaptor adaptor) {
   return {};
 }

+LogicalResult TransOp::verify() {
+  auto order = getOrder();
+  auto srcTy = cast<RankedTensorType>(getSrc().getType());
+  if (order.size() != srcTy.getShape().size()) {
+    return emitError("order must have the same size as the source tensor");
+  }
+  if (!isPermutationOfIota(order)) {
+    return emitError("order must be a permutation of 0..n-1");
+  }
+  SmallVector<int64_t> retShape = applyPermutation(srcTy.getShape(), order);
+  if (retShape != getType().getShape()) {
+    return emitError(
+        "result shape must match the permutation of the source shape");
+  }
+  return success();
+}
+
 LogicalResult TransOp::inferReturnTypes(
     MLIRContext *context, std::optional<Location> location,
     TransOp::Adaptor adaptor, SmallVectorImpl<Type> &inferredReturnTypes) {
@@ -1037,6 +1054,12 @@ void ElementwiseInlineAsmOp::getEffects(
                         SideEffects::DefaultResource::get());
 }

+Speculation::Speculatability ElementwiseInlineAsmOp::getSpeculatability() {
+  if (getPure())
+    return Speculation::Speculatable;
+  return Speculation::NotSpeculatable;
+}
+
 LogicalResult ElementwiseInlineAsmOp::verify() {
   if (getNumOperands() >= 1) {
     auto tensorType = dyn_cast<RankedTensorType>(getOperand(0).getType());
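
isPermutationOfIota is used by the new verifier but not defined in this hunk; a plausible implementation, written here as an assumption rather than quoted from the codebase:

    // Returns true iff `order` is exactly a permutation of 0..n-1, which is
    // what TransOp::verify() above requires of its `order` attribute.
    static bool isPermutationOfIota(ArrayRef<int32_t> order) {
      SmallVector<int32_t> sorted(order.begin(), order.end());
      llvm::sort(sorted);
      for (int32_t i = 0; i < (int32_t)sorted.size(); ++i)
        if (sorted[i] != i)
          return false;
      return true;
    }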

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 10 additions & 3 deletions
@@ -1927,6 +1927,7 @@ Attribute NVMMASharedEncodingAttr::parse(AsmParser &parser, Type type) {

   unsigned swizzlingByteWidth;
   bool transposed;
+  unsigned elementBitWidth;
   std::optional<SmallVector<unsigned>> CTAsPerCGA;
   std::optional<SmallVector<unsigned>> CTASplitNum;
   std::optional<SmallVector<unsigned>> CTAOrder;
@@ -1938,6 +1939,9 @@ Attribute NVMMASharedEncodingAttr::parse(AsmParser &parser, Type type) {
     } else if (attr.getName() == "transposed") {
       if (parseBool(parser, attr, transposed, "transposed").failed())
         return {};
+    } else if (attr.getName() == "elementBitWidth") {
+      if (parseUInt(parser, attr, elementBitWidth, "elementBitWidth").failed())
+        return {};
     } else if (attr.getName() == "CTAsPerCGA") {
       if (parseIntArrayAttr(parser, attr, CTAsPerCGA.emplace(), "CTAsPerCGA")
               .failed())
@@ -1963,13 +1967,15 @@ Attribute NVMMASharedEncodingAttr::parse(AsmParser &parser, Type type) {
     return {};

   return parser.getChecked<NVMMASharedEncodingAttr>(
-      parser.getContext(), swizzlingByteWidth, transposed, *CTALayout);
+      parser.getContext(), swizzlingByteWidth, transposed, elementBitWidth,
+      *CTALayout);
 }

 void NVMMASharedEncodingAttr::print(AsmPrinter &printer) const {
   printer << "<{"
           << "swizzlingByteWidth = " << getSwizzlingByteWidth() //
-          << ", transposed = " << getTransposed();
+          << ", transposed = " << getTransposed() //
+          << ", elementBitWidth = " << getElementBitWidth();
   maybePrintCTALayout(getContext(), printer, getCTALayout(),
                       /*rank=*/2);
   printer << "}>";
@@ -2611,7 +2617,8 @@ struct TritonGPUInferLayoutInterface
       return failure();
     }
     resultEncoding = NVMMASharedEncodingAttr::get(
-        ctx, enc.getSwizzlingByteWidth(), !enc.getTransposed(), *ctaLayout);
+        ctx, enc.getSwizzlingByteWidth(), !enc.getTransposed(),
+        enc.getElementBitWidth(), *ctaLayout);
     return success();
   }