
Commit aa25956

Merge branch 'main' into fast-sub-group-transpose-extend
2 parents 6f74535 + 61fd54d commit aa25956

21 files changed: +470 additions, -122 deletions


.github/actions/setup-pytorch/action.yml

Lines changed: 1 addition & 1 deletion

@@ -82,7 +82,7 @@ runs:
     uses: ./.github/actions/load
     env:
       # Increase this value to reset cache
-      CACHE_NUMBER: 11
+      CACHE_NUMBER: 12
     with:
       path: pytorch
       key: pytorch-$PYTORCH_CACHE_KEY-$CACHE_NUMBER

.github/pins/pytorch-upstream.txt

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-8321eec009c8c79145ebccd51fdfc336e5f8b848
+487873f7cafeb0fd390eaefe40496b804bceabbd

benchmarks/triton_kernels_benchmark/benchmark_driver.py

Lines changed: 61 additions & 3 deletions

@@ -399,19 +399,77 @@ def format_of(ty):
     return src


+def serialize_kernel_metadata(arg, args_dict):
+    args_dict["num_warps"] = arg.num_warps
+    args_dict["threads_per_warp"] = arg.threads_per_warp
+    args_dict["shared_memory"] = arg.shared
+    args_dict["kernel_name"] = arg.name
+    args_dict["spv_name"] = f"{arg.name}.spv"
+
+
+def serialize_args(args, constants, signature):
+    import numbers
+    dir_path = os.getenv("TRITON_XPU_DUMP_SPIRV_KERNEL_ARGS")
+    if not os.path.exists(dir_path):
+        os.makedirs(dir_path)
+    print(f"Path to directory consisting of SPIR-V Runner data: {dir_path}")
+
+    cnt = 0
+    args_dict = {"gridX": args[cnt], "gridY": args[cnt + 1], "gridZ": args[cnt + 2]}
+    args_dict["argument_list"] = []
+    counts = {"tensors": 0, "scalars": 0, "karg_cnt": 0}
+    cnt = 4
+    for arg in args[cnt:]:
+        if type(arg).__name__ == "KernelMetadata":
+            serialize_kernel_metadata(arg, args_dict)
+
+        if isinstance(arg, torch.Tensor):
+            cpu_tensor = arg.cpu()
+            tensor_path = os.path.join(dir_path, f"tensor_{counts['tensors']}.pt")
+            with open(tensor_path, "wb") as f:
+                torch.save(cpu_tensor, f)
+            new_arg = {
+                "name": f"tensor_{counts['tensors']}", "type": "tensor", "dtype": str(arg.dtype), "ctype":
+                signature[counts["karg_cnt"]]
+            }
+            args_dict["argument_list"].append(new_arg)
+            counts["karg_cnt"] += 1
+            counts["tensors"] += 1
+
+        if isinstance(arg, numbers.Number):
+            if counts["karg_cnt"] not in constants:
+                new_arg = {
+                    "name": f"scalarArg_{counts['scalars']}", "type": "scalar", "value": args[cnt], "ctype":
+                    signature[counts["karg_cnt"]]
+                }
+                args_dict["argument_list"].append(new_arg)
+            counts["karg_cnt"] += 1
+            counts["scalars"] += 1
+        cnt += 1
+    # Dump argument info as a JSON file
+    json_path = os.path.join(dir_path, "args_data.json")
+    with open(json_path, "w", encoding="utf-8") as json_file:
+        import json
+        json.dump(args_dict, json_file, indent=4)
+
+
 class XPULauncher:

     def __init__(self, src, metadata):  # pylint: disable=unused-argument
         ids = {"ids_of_const_exprs": src.fn.constexprs if hasattr(src, "fn") else tuple()}
         constants = src.constants if hasattr(src, "constants") else {}
         cst_key = lambda i: src.fn.arg_names.index(i) if isinstance(i, str) else i
-        constants = {cst_key(key): value for key, value in constants.items()}
-        signature = {cst_key(key): value for key, value in src.signature.items()}
-        src = make_launcher(constants, signature, ids)
+        self.constants = {cst_key(key): value for key, value in constants.items()}
+        self.signature = {cst_key(key): value for key, value in src.signature.items()}
+        src = make_launcher(self.constants, self.signature, ids)
         mod = compile_module_from_src(src, "__triton_launcher")
         self.launch = mod.launch

     def __call__(self, *args, **kwargs):
+        # Serialize KernelArguments for SPIR-V Runner
+        serialize_kernel_args = os.getenv("TRITON_XPU_DUMP_SPIRV_KERNEL_ARGS", None)
+        if serialize_kernel_args:
+            serialize_args(args, self.constants, self.signature)
         self.launch(*args, **kwargs)
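
The serialization path above only runs when TRITON_XPU_DUMP_SPIRV_KERNEL_ARGS names a dump directory, since XPULauncher.__call__ checks the variable at launch time. Below is a minimal sketch of driving it from a script; the kernel, shapes, and dump path are hypothetical, while the environment variable and the dumped file names (tensor_<i>.pt, args_data.json) come from the diff.

import os

# Assumption: the dump directory must be set before any kernel launch.
os.environ["TRITON_XPU_DUMP_SPIRV_KERNEL_ARGS"] = "/tmp/spirv_runner_dump"

import torch
import triton
import triton.language as tl


@triton.jit
def add_kernel(x_ptr, y_ptr, out_ptr, n_elements, BLOCK: tl.constexpr):
    pid = tl.program_id(0)
    offsets = pid * BLOCK + tl.arange(0, BLOCK)
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask=mask)
    y = tl.load(y_ptr + offsets, mask=mask)
    tl.store(out_ptr + offsets, x + y, mask=mask)


x = torch.randn(1024, device="xpu")
y = torch.randn(1024, device="xpu")
out = torch.empty_like(x)
add_kernel[(4, )](x, y, out, x.numel(), BLOCK=256)

# After the launch, /tmp/spirv_runner_dump should contain tensor_<i>.pt files
# and args_data.json describing the grid sizes, kernel metadata and argument list.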

lib/Analysis/AxisInfo.cpp

Lines changed: 1 addition & 1 deletion

@@ -895,7 +895,7 @@ class ShLIOpAxisInfoVisitor final : public BinaryOpVisitorImpl<arith::ShLIOp> {
       lhsDivisibility = 1;
     }
     auto numBits = log2Int(lhsDivisibility);
-    return multiplyDivisor(lhsDivisibility, 1 << shift);
+    return multiplyDivisor(lhsDivisibility, 1ll << shift);
   }

   int64_t getConstancy(arith::ShLIOp op, const AxisInfo &lhs,
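
A note on the fix above: the divisor is a 64-bit quantity, so the shifted value can exceed 32 bits. With a plain 1 the shift is performed on a 32-bit int (undefined behavior in C++ once the shift amount reaches the type width), whereas 1ll keeps the whole computation in 64 bits. A rough Python illustration of the width problem only (not a model of the C++ undefined behavior):

shift = 40
full = 1 << shift              # Python ints are arbitrary precision: 1099511627776
truncated = full & 0xFFFFFFFF  # what a 32-bit result could hold: 0
print(full, truncated)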

lib/Conversion/TritonGPUToLLVM/ScanOpToLLVM.cpp

Lines changed: 5 additions & 5 deletions

@@ -76,7 +76,7 @@ static void warpScan(SmallVector<SmallVector<Value>> &srcValues,
        acc[j] = select(mask, tempAcc[j], acc[j]);
      }
    }
-    srcValues[srcIndex] = acc;
+    srcValues[srcIndex] = std::move(acc);
  }
}

@@ -128,8 +128,8 @@ static void AddPartialReduce(SmallVector<SmallVector<Value>> &srcValues,
                             ConversionPatternRewriter &rewriter,
                             const TargetInfoBase &targetInfo,
                             ScanLoweringHelper &helper,
-                             SmallVector<Value> smemBases,
-                             SmallVector<Type> smemTypes, Value warpId,
+                             ArrayRef<Value> smemBases,
+                             ArrayRef<Type> smemTypes, Value warpId,
                             Value laneIdAxis, Value parallelLaneId) {
  Location loc = helper.getLoc();
  unsigned numParallelLane = helper.getNonAxisNumThreadsPerCTA();

@@ -224,7 +224,7 @@ static void AddPartialReduce(SmallVector<SmallVector<Value>> &srcValues,
                         srcValues[srcIndex - i * elementStride][j]);
      }
    }
-    srcValues[srcIndex - i * elementStride] = laneValue;
+    srcValues[srcIndex - i * elementStride] = std::move(laneValue);
  }
  // For the next chunk start back from the value containing the
  // accumulated value of all the warps.

@@ -303,7 +303,7 @@ static void AddPartialReduceOneWarp(SmallVector<SmallVector<Value>> &srcValues,
                    srcValues[srcIndex - i * elementStride][j], laneValue[j]);
      }
    }
-    srcValues[srcIndex - i * elementStride] = laneValue;
+    srcValues[srcIndex - i * elementStride] = std::move(laneValue);
  }
  // For the next chunk start back from the value containing the
  // accumulated value of all the warps.

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 10 additions & 10 deletions

@@ -1044,16 +1044,16 @@ SmallVector<unsigned> DotOperandEncodingAttr::getCTASplitNum() const {
   return res;
 }
 SmallVector<unsigned> DotOperandEncodingAttr::getWarpsPerCTA() const {
-  auto parentLayout = getParent();
-  assert(parentLayout && "DotOperandEncodingAttr must have a parent");
-  if (auto distributedLayout =
-          mlir::dyn_cast<DistributedEncodingTrait>(parentLayout)) {
-    return distributedLayout.getWarpsPerCTA();
-  } else {
-    llvm::report_fatal_error(
-        "DotOperandEncodingAttr non-DistributedEncodingAttr parent not "
-        "supported yet");
-  }
+  auto distributedLayout = mlir::cast<DistributedEncodingTrait>(getParent());
+  auto warps = distributedLayout.getWarpsPerCTA();
+  // FIXME: This is a temporary solution to avoid distribute-to-warps.mlir
+  // failure.
+  if (mlir::triton::tools::getBoolEnv("TRITON_INTEL_ADVANCED_PATH"))
+    return warps;
+  auto rank = warps.size();
+  auto kDim = getOpIdx() == 0 ? rank - 1 : rank - 2;
+  warps[kDim] = 1;
+  return warps;
 }
 SmallVector<unsigned> DotOperandEncodingAttr::getWarpOrder() const {
   return ::getWarpOrder(*this);

lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp

Lines changed: 67 additions & 1 deletion

@@ -5,6 +5,7 @@
 #include "triton/Dialect/TritonGPU/IR/Attributes.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h"
+#include "triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h"
 #include "triton/Tools/LinearLayout.h"
 #include "triton/Tools/StrUtil.h"
 #include "llvm/ADT/DenseMap.h"

@@ -822,16 +823,81 @@ SliceEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {
   return ret;
 }

+LinearLayout ampereDotToLinearLayout(ArrayRef<int64_t> shape,
+                                     DotOperandEncodingAttr dot) {
+  // TODO,BE. Implement ampereMMA in terms of this one
+  int rank = shape.size();
+  auto mma = cast<NvidiaMmaEncodingAttr>(dot.getParent());
+  int kWidth = dot.getKWidth();
+  bool isA = dot.getOpIdx() == 0;
+
+  assert(mma.isAmpere());
+  assert((rank == 2 && mma.getInstrShape() == ArrayRef<unsigned>({16, 8})) ||
+         (rank == 3 && mma.getInstrShape() == ArrayRef<unsigned>({1, 16, 8})));
+
+  MLIRContext *ctx = mma.getContext();
+  SmallVector<StringAttr> dimNames = standardOutDimNames(ctx, rank);
+
+  // Implement A. For B transpose in the end
+  std::vector<std::vector<int32_t>> registers;
+  std::vector<std::vector<int32_t>> lanes;
+  int32_t i = 1;
+  // kWidth contiguous elements
+  while (i < kWidth) {
+    registers.push_back({i, 0});
+    i *= 2;
+  }
+  // 4 threads per chunk
+  for (int j = 0; j < 2; j++) {
+    lanes.push_back({i, 0});
+    i *= 2;
+  }
+  // 8 threads going down
+  lanes.push_back({0, 1});
+  lanes.push_back({0, 2});
+  lanes.push_back({0, 4});
+  // 2 tiles in column-major order
+  // Just one if it's the B operand
+  if (isA) {
+    registers.push_back({0, 8});
+  }
+  registers.push_back({i, 0});
+
+  if (!isA) {
+    for (auto &r : registers) {
+      std::swap(r[0], r[1]);
+    }
+    for (auto &l : lanes) {
+      std::swap(l[0], l[1]);
+    }
+  }
+
+  LinearLayout ctaLayout(
+      {{S("register"), registers}, {S("lane"), lanes}},
+      llvm::to_vector(llvm::reverse(ArrayRef(dimNames).take_back(2))));
+
+  auto order = dot.getCTAOrder();
+  assert(order[0] == 1 && order[1] == 0);
+  ctaLayout *= identityND(S("warp"), dot.getWarpsPerCTA(), order, dimNames);
+
+  return combineCtaCgaWithShape(ctaLayout, mma.getCTALayout(), shape);
+}
+
 std::optional<LinearLayout>
 DotOperandEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {
-
   if (auto mfmaLayout = llvm::dyn_cast<AMDMfmaEncodingAttr>(getParent())) {
     return dotOperandMfmaToLinearLayout(*this, shape);
   }
   if (auto dpasLayout = llvm::dyn_cast<intel::DpasEncodingAttr>(getParent())) {
     return dotOperandDpasToLinearLayout(*this, shape);
   }

+  // TODO Activate in a follow-up PR
+  // else if (auto mma = mlir::dyn_cast<NvidiaMmaEncodingAttr>(getParent())) {
+  //  if (mma.isAmpere()) {
+  //    return ampereDotToLinearLayout(shape, *this);
+  //  }
+  //}
   return std::nullopt;
 }
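
To make the basis construction in ampereDotToLinearLayout concrete, the small Python mirror below enumerates the register and lane bases for a given kWidth. It is only an illustration of the loops in the new function, not a Triton API.

def ampere_dot_bases(k_width, is_a):
    # Mirrors the register/lane construction in ampereDotToLinearLayout.
    registers, lanes = [], []
    i = 1
    # kWidth contiguous elements go to registers
    while i < k_width:
        registers.append([i, 0])
        i *= 2
    # 4 threads per chunk
    for _ in range(2):
        lanes.append([i, 0])
        i *= 2
    # 8 threads going down
    lanes += [[0, 1], [0, 2], [0, 4]]
    # 2 tiles in column-major order; just one for the B operand
    if is_a:
        registers.append([0, 8])
    registers.append([i, 0])
    # For B, transpose every basis vector
    if not is_a:
        registers = [[r[1], r[0]] for r in registers]
        lanes = [[l[1], l[0]] for l in lanes]
    return registers, lanes


# kWidth = 2, operand A:
#   registers = [[1, 0], [0, 8], [8, 0]]
#   lanes     = [[2, 0], [4, 0], [0, 1], [0, 2], [0, 4]]
print(ampere_dot_bases(2, True))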

python/test/unit/runtime/test_bindings.py

Lines changed: 3 additions & 3 deletions

@@ -59,6 +59,8 @@ def walk_fn(op):
         torch.empty((32, 32), device=device),  # out_ptr
         16,  # BLOCK_SIZE
     ]
+    target = triton.runtime.driver.active.get_current_target()
+    backend = triton.compiler.compiler.make_backend(target)
     src = triton.compiler.compiler.ASTSource(
         fn=kernel,
         signature={

@@ -69,12 +71,10 @@ def walk_fn(op):
         constants={kernel.arg_names[i]: arg
                    for i, arg in enumerate(args)
                    if not isinstance(arg, torch.Tensor)},
-        attrs=kernel._get_config(*args, ),
+        attrs=backend.get_attrs_descriptor(args, kernel.params),
     )

     context = triton._C.libtriton.ir.context()
-    target = triton.runtime.driver.active.get_current_target()
-    backend = triton.compiler.compiler.make_backend(target)
     options = backend.parse_options(dict())
     codegen_fns = dict()
     module_map = backend.get_module_map()
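
The test now creates the backend before building the ASTSource, because the attribute descriptor is derived by the backend rather than by the kernel's private _get_config. A self-contained sketch of that ordering with a hypothetical kernel and argument list; only make_backend, get_attrs_descriptor, and kernel.params are taken from the diff.

import torch
import triton
import triton.language as tl


@triton.jit
def copy_kernel(in_ptr, out_ptr, BLOCK_SIZE: tl.constexpr):
    offsets = tl.arange(0, BLOCK_SIZE)
    tl.store(out_ptr + offsets, tl.load(in_ptr + offsets))


# Hypothetical arguments matching the kernel's parameters.
args = [torch.empty(16, device="xpu"), torch.empty(16, device="xpu"), 16]

# Backend first: it now owns the attribute-descriptor logic.
target = triton.runtime.driver.active.get_current_target()
backend = triton.compiler.compiler.make_backend(target)
attrs = backend.get_attrs_descriptor(args, copy_kernel.params)
print(attrs)  # per-argument specialization hints derived by the backend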

python/test/unit/runtime/test_subproc.py

Lines changed: 4 additions & 3 deletions

@@ -3,6 +3,7 @@

 import triton
 import triton.language as tl
+from triton.backends.compiler import AttrsDescriptor
 from triton.compiler import ASTSource

 target = triton.runtime.driver.active.get_current_target()

@@ -25,7 +26,7 @@ def kernel_sub(a, b, o, N: tl.constexpr):


 def test_compile_in_subproc() -> None:
-    config = triton.compiler.AttrsDescriptor(tuple(range(4)), ())
+    config = AttrsDescriptor.from_hints({i: 16 for i in range(4)})
     multiprocessing.set_start_method('fork')
     proc = multiprocessing.Process(target=compile_fn, args=(config, ))
     proc.start()

@@ -47,7 +48,7 @@ def kernel_dot(Z):


 def test_compile_in_forked_subproc(fresh_triton_cache) -> None:
-    config = triton.compiler.AttrsDescriptor(tuple(range(1)), ())
+    config = AttrsDescriptor.from_hints({0: 16})
     assert multiprocessing.get_start_method() == 'fork'
     proc = multiprocessing.Process(target=compile_fn_dot, args=(config, ))
     proc.start()

@@ -86,7 +87,7 @@ def test_compile_in_forked_subproc_with_forced_gc(fresh_triton_cache) -> None:
     gc.disable()

     # stage 1.p
-    config = triton.compiler.AttrsDescriptor(tuple(range(1)), ())
+    config = AttrsDescriptor.from_hints({0: 16})
     compile_empty_kernel_with_gc(config)

     # stage 2.p
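
AttrsDescriptor.from_hints replaces the old positional constructor in these tests; it takes a mapping from parameter index to a hint value. Reading 16 as a divide-by-16 hint is an interpretation on my part, not something the diff states, so treat the sketch below accordingly.

from triton.backends.compiler import AttrsDescriptor

# New style, as used in the updated tests: one entry per parameter index.
config_one = AttrsDescriptor.from_hints({0: 16})
config_four = AttrsDescriptor.from_hints({i: 16 for i in range(4)})

# These replace the old positional form, e.g.
#     triton.compiler.AttrsDescriptor(tuple(range(4)), ())
# which listed the affected parameter indices directly.
print(config_one, config_four)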
