intel
diff --git a/‎test/Analysis/intel/test-alignment.mlir‎ renamed to ‎test/Analysis/intel/test-axis-info.mlir‎
Lines changed: 15 additions & 0 deletions b/‎test/Analysis/intel/test-alignment.mlir‎ renamed to ‎test/Analysis/intel/test-axis-info.mlir‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎test/lib/Analysis/intel/TestAxisInfo.cpp‎
Lines changed: 1 addition & 1 deletion b/‎test/lib/Analysis/intel/TestAxisInfo.cpp‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎third_party/intel/backend/driver.py‎
Lines changed: 62 additions & 3 deletions b/‎third_party/intel/backend/driver.py‎
Lines changed: 62 additions & 3 deletions
diff --git a/‎third_party/intel/lib/Analysis/AxisInfo.cpp‎
Lines changed: 64 additions & 7 deletions b/‎third_party/intel/lib/Analysis/AxisInfo.cpp‎
Lines changed: 64 additions & 7 deletions
diff --git a/‎utils/SPIRVRunner/CMakeLists.txt‎
Lines changed: 14 additions & 2 deletions b/‎utils/SPIRVRunner/CMakeLists.txt‎
Lines changed: 14 additions & 2 deletions
diff --git a/‎utils/SPIRVRunner/README.md‎
Lines changed: 25 additions & 9 deletions b/‎utils/SPIRVRunner/README.md‎
Lines changed: 25 additions & 9 deletions
@@ -876,3 +876,18 @@ module {
     tt.return %int_min : i64
   }
 }
+
+// -----
+
+// CHECK-LABEL: @make_tensor_ptr
+tt.func public @make_tensor_ptr(%arg0: !tt.ptr<f16>, %arg1: !tt.ptr<f8E5M2> {tt.divisibility = 32 : i32}, %arg2: i64 {tt.divisibility = 16 : i32}) {
+  %c0_i32 = arith.constant 0 : i32
+  %c1_i64 = arith.constant 1 : i64
+  %c32_i64 = arith.constant 32 : i64
+  %c128_i64 = arith.constant 128 : i64
+  // Case 1: %arg0 carries no divisibility hint and both strides are the constant 1,
+  // so the expected contiguity equals the shape operands and divisibility stays 1.
+  // CHECK: %0 = tt.make_tensor_ptr %arg0, {{.*}} => contiguity = [128, 32], divisibility = [1, 1], constancy = [1, 1], constant_value = <none>
+  %0 = tt.make_tensor_ptr %arg0, [%c128_i64, %c32_i64], [%c1_i64, %c1_i64], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : !tt.ptr<tensor<128x32xf16>>
+  // Case 2: %arg1 is divisible by 32, stride 0 is the constant 1 and stride 1 is %arg2
+  // (divisible by 16), so only dim 0 is contiguous and its expected divisibility is 16.
+  // CHECK: %1 = tt.make_tensor_ptr %arg1, {{.*}} => contiguity = [32, 1], divisibility = [16, 1], constancy = [1, 1], constant_value = <none>
+  %1 = tt.make_tensor_ptr %arg1, [%c32_i64, %c32_i64], [%c1_i64, %arg2], [%c0_i32, %c0_i32] {order = array<i32: 0, 1>} : <tensor<64x16xf8E5M2>>
+  tt.return
+}
@@ -13,7 +13,7 @@ struct TestAxisInfoPass
 
   StringRef getArgument() const final { return "test-print-axis-info"; }
   StringRef getDescription() const final {
-    return "print the result of the alignment analysis pass";
+    return "print the result of the axis analysis pass";
   }
 
   void runOnOperation() override {
 
@@ -435,19 +435,78 @@ def format_of(ty):
     return src
 
 
+def serialize_kernel_metadata(arg, args_dict):
+    args_dict['num_warps'] = arg.num_warps
+    args_dict['threads_per_warp'] = arg.threads_per_warp
+    args_dict['shared_memory'] = arg.shared
+    args_dict['kernel_name'] = arg.name
+    args_dict['spv_name'] = f"{arg.name}.spv"
+
+
+def serialize_args(args, constants, signature):
+    import torch
+    import numbers
+    dir_path = os.getenv('TRITON_XPU_DUMP_SPIRV_KERNEL_ARGS')
+    if not os.path.exists(dir_path):
+        os.makedirs(dir_path)
+        print(f"Path to directory consisting of SPIR-V Runner data: {dir_path}")
+
+    cnt = 0
+    args_dict = {"gridX": args[cnt], "gridY": args[cnt + 1], "gridZ": args[cnt + 2]}
+    args_dict['argument_list'] = []
+    counts = {"tensors": 0, "scalars": 0, "karg_cnt": 0}
+    cnt = 4
+    for arg in args[cnt:]:
+        if type(arg).__name__ == "KernelMetadata":
+            serialize_kernel_metadata(arg, args_dict)
+
+        if isinstance(arg, torch.Tensor):
+            cpu_tensor = arg.cpu()
+            tensor_path = os.path.join(dir_path, f"tensor_{counts['tensors']}.pt")
+            with open(tensor_path, 'wb') as f:
+                torch.save(cpu_tensor, f)
+            new_arg = {
+                "name": f"tensor_{counts['tensors']}", "type": "tensor", "dtype": str(arg.dtype), "ctype":
+                signature[counts['karg_cnt']]
+            }
+            args_dict['argument_list'].append(new_arg)
+            counts['karg_cnt'] += 1
+            counts['tensors'] += 1
+
+        if isinstance(arg, numbers.Number):
+            if counts['karg_cnt'] not in constants:
+                new_arg = {
+                    "name": f"scalarArg_{counts['scalars']}", "type": "scalar", "value": args[cnt], "ctype":
+                    signature[counts['karg_cnt']]
+                }
+                args_dict['argument_list'].append(new_arg)
+            counts['karg_cnt'] += 1
+            counts['scalars'] += 1
+        cnt += 1
+    # Dump argument info as a JSON file
+    json_path = os.path.join(dir_path, 'args_data.json')
+    with open(json_path, 'w') as json_file:
+        import json
+        json.dump(args_dict, json_file, indent=4)
+
+
 class XPULauncher(object):
+    # Callable wrapper around the compiled launcher module. It keeps the
+    # index-keyed constants and signature so that a launch can be serialized.
 
     def __init__(self, src, metadata):
+        # NOTE: `metadata` is not read in this constructor.
         ids = {"ids_of_const_exprs": src.fn.constexprs if hasattr(src, "fn") else tuple()}
         constants = src.constants if hasattr(src, "constants") else dict()
+        # Normalize keys: a string key is replaced by its position in
+        # src.fn.arg_names; any other key is kept unchanged.
         cst_key = lambda i: src.fn.arg_names.index(i) if isinstance(i, str) else i
-        constants = {cst_key(key): value for key, value in constants.items()}
-        signature = {cst_key(key): value for key, value in src.signature.items()}
-        src = make_launcher(constants, signature, ids)
+        self.constants = {cst_key(key): value for key, value in constants.items()}
+        self.signature = {cst_key(key): value for key, value in src.signature.items()}
+        src = make_launcher(self.constants, self.signature, ids)
         mod = compile_module_from_src(src, "__triton_launcher")
         self.launch = mod.launch
 
     def __call__(self, *args, **kwargs):
+        # Dumping happens before the launch and only when the environment
+        # variable is set to a non-empty value.
+        # Serialize KernelArguments for SPIR-V Runner
+        serialize_kernel_args = os.getenv('TRITON_XPU_DUMP_SPIRV_KERNEL_ARGS', None)
+        if serialize_kernel_args:
+            serialize_args(args, self.constants, self.signature)
         self.launch(*args, **kwargs)
 
 
 
@@ -50,6 +50,12 @@ int64_t multiplyDivisor(int64_t lhs, int64_t rhs) {
   return lhs * rhs;
 }
 
+// Returns the ranked tensor type associated with `ptrTy`: the pointee type
+// when `ptrTy` is a tensor pointer, otherwise `ptrTy` itself if it is a
+// RankedTensorType. Callers test the result for null in the remaining case.
+RankedTensorType getRankedTensorType(Type ptrTy) {
+  if (!isTensorPointerType(ptrTy))
+    return dyn_cast<RankedTensorType>(ptrTy);
+
+  Type pointeeTy = cast<PointerType>(ptrTy).getPointeeType();
+  return cast<RankedTensorType>(pointeeTy);
+}
+
 class AxisInfoVisitor {
 public:
   AxisInfoVisitor() = default;
@@ -409,7 +415,7 @@ class DivOpAxisInfoVisitor final : public BinaryOpVisitorImpl<OpTy> {
 
   int64_t getConstancy(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs,
                        int dim) override {
-    auto resTy = dyn_cast<RankedTensorType>(op.getType());
+    auto resTy = getRankedTensorType(op.getType());
     if (!resTy)
       return BinaryOpVisitorImpl<OpTy>::getConstancy(op, lhs, rhs, dim);
     auto shape = resTy.getShape();
@@ -464,7 +470,7 @@ class RemOpAxisInfoVisitor final : public BinaryOpVisitorImpl<OpTy> {
 private:
   int64_t getContiguity(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs,
                         int dim) override {
-    auto resTy = dyn_cast<RankedTensorType>(op.getType());
+    auto resTy = getRankedTensorType(op.getType());
     if (!resTy)
       return BinaryOpVisitorImpl<OpTy>::getContiguity(op, lhs, rhs, dim);
     auto shape = resTy.getShape();
@@ -498,7 +504,7 @@ class RemOpAxisInfoVisitor final : public BinaryOpVisitorImpl<OpTy> {
 
   int64_t getConstancy(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs,
                        int dim) override {
-    auto resTy = dyn_cast<RankedTensorType>(op.getType());
+    auto resTy = getRankedTensorType(op.getType());
     if (!resTy)
       return BinaryOpVisitorImpl<OpTy>::getConstancy(op, lhs, rhs, dim);
     auto shape = resTy.getShape();
@@ -647,7 +653,7 @@ class CmpOpAxisInfoVisitor final : public AxisInfoVisitorImpl<OpTy> {
   AxisInfo
   getAxisInfo(OpTy op,
               ArrayRef<const dataflow::Lattice<AxisInfo> *> operands) override {
-    auto resTy = dyn_cast<RankedTensorType>(op.getType());
+    auto resTy = getRankedTensorType(op.getType());
     if (!resTy)
       return AxisInfo();
     auto shape = resTy.getShape();
@@ -995,6 +1001,55 @@ class MaxMinOpAxisInfoVisitor final : public AxisInfoVisitorImpl<OpTy> {
   }
 };
 
+// Derives the AxisInfo of a tt.make_tensor_ptr result from the lattice values
+// of its base pointer, shape and stride operands. Only the 2D form is handled
+// (enforced by the assertion below).
+class MakeTensorPtrOpAxisInfoVisitor final
+    : public AxisInfoVisitorImpl<triton::MakeTensorPtrOp> {
+public:
+  using AxisInfoVisitorImpl<triton::MakeTensorPtrOp>::AxisInfoVisitorImpl;
+
+  AxisInfo
+  getAxisInfo(triton::MakeTensorPtrOp op,
+              ArrayRef<const dataflow::Lattice<AxisInfo> *> operands) override {
+    LDBG("MakeTensorPtrOpAxisInfoVisitor: " << *op);
+    assert(op.getShape().size() == 2 && operands.size() == 7 &&
+           "MakeTensorPtrOp should have 2D shape");
+
+    // Operand layout used here: [0] base pointer, [1..2] shape, [3..4] strides.
+    // The last two operands are not read (presumably the offsets; confirm).
+    AxisInfo ptrInfo = operands[0]->getValue();
+    AxisInfo shapeInfo0 = operands[1]->getValue();
+    AxisInfo shapeInfo1 = operands[2]->getValue();
+    AxisInfo strideInfo0 = operands[3]->getValue();
+    AxisInfo strideInfo1 = operands[4]->getValue();
+
+    // Each optional is empty when the corresponding operand is not a known
+    // constant.
+    std::optional<int64_t> shape0 = shapeInfo0.getConstantValue();
+    std::optional<int64_t> shape1 = shapeInfo1.getConstantValue();
+    std::optional<int64_t> stride0 = strideInfo0.getConstantValue();
+    std::optional<int64_t> stride1 = strideInfo1.getConstantValue();
+
+    // A dimension whose stride is the constant 1 and whose extent is a known
+    // constant is reported as contiguous over that whole extent; otherwise its
+    // contiguity is 1.
+    AxisInfo::DimVectorT contiguity{
+        shape0.has_value() && (stride0 == 1) ? shape0.value() : 1,
+        shape1.has_value() && (stride1 == 1) ? shape1.value() : 1};
+
+    int64_t ptrDivisibility = ptrInfo.getDivisibility()[0];
+    int64_t strideDivisibility0 = strideInfo0.getDivisibility()[0];
+    int64_t strideDivisibility1 = strideInfo1.getDivisibility()[0];
+
+    LDBG("ptrDivisibility: " << ptrDivisibility);
+    LDBG("strideDivisibility0: " << strideDivisibility0);
+    LDBG("strideDivisibility1: " << strideDivisibility1);
+
+    // Divisibility defaults to 1. Only when the base pointer has a divisibility
+    // greater than 1 does a contiguous dimension get the minimum of the pointer
+    // divisibility and the divisibility of the *other* dimension's stride.
+    // NOTE(review): the cross-indexing looks intentional (the other stride is
+    // presumably the pitch between contiguous runs); confirm with the author.
+    AxisInfo::DimVectorT divisibility{1, 1};
+    if (ptrDivisibility > 1) {
+      if (contiguity[0] > 1)
+        divisibility[0] = std::min(ptrDivisibility, strideDivisibility1);
+      if (contiguity[1] > 1)
+        divisibility[1] = std::min(ptrDivisibility, strideDivisibility0);
+    }
+
+    // Constancy is always reported as 1 in both dimensions.
+    AxisInfo::DimVectorT constancy{1, 1};
+
+    return AxisInfo(contiguity, divisibility, constancy);
+  }
+};
+
 //===----------------------------------------------------------------------===//
 // AxisInfoAnalysis
 //===----------------------------------------------------------------------===//
@@ -1042,11 +1097,13 @@ AxisInfoAnalysis::AxisInfoAnalysis(DataFlowSolver &solver)
                   MaxMinOpAxisInfoVisitor<arith::MinSIOp>,
                   MaxMinOpAxisInfoVisitor<arith::MinUIOp>>();
   visitors.append<LoadOpAxisInfoVisitor>();
+  visitors.append<MakeTensorPtrOpAxisInfoVisitor>();
 }
 
 LogicalResult AxisInfoAnalysis::visitOperation(
     Operation *op, ArrayRef<const dataflow::Lattice<AxisInfo> *> operands,
     ArrayRef<dataflow::Lattice<AxisInfo> *> results) {
+  LDBG("visitOperation: << " << *op);
   // TODO: For sure not the right way to do this
   // but why is scf.if not initialized otherwise?
   for (auto op : operands)
@@ -1204,7 +1261,7 @@ void AxisInfo::initPessimisticStateFromFunc(int argNumber, T funcOp,
 }
 
 unsigned ModuleAxisInfoAnalysis::getPtrContiguity(Value ptr) {
-  auto tensorTy = dyn_cast<RankedTensorType>(ptr.getType());
+  auto tensorTy = getRankedTensorType(ptr.getType());
   if (!tensorTy)
     return 1;
   auto layout = tensorTy.getEncoding();
@@ -1226,7 +1283,7 @@ unsigned ModuleAxisInfoAnalysis::getPtrContiguity(Value ptr) {
 }
 
 unsigned ModuleAxisInfoAnalysis::getPtrAlignment(Value ptr) {
-  auto tensorTy = dyn_cast<RankedTensorType>(ptr.getType());
+  auto tensorTy = getRankedTensorType(ptr.getType());
   if (!tensorTy)
     return 1;
   auto *axisInfo = getAxisInfo(ptr);
@@ -1254,7 +1311,7 @@ unsigned ModuleAxisInfoAnalysis::getPtrAlignment(Value ptr) {
 }
 
 unsigned ModuleAxisInfoAnalysis::getMaskAlignment(Value mask) {
-  auto tensorTy = dyn_cast<RankedTensorType>(mask.getType());
+  auto tensorTy = getRankedTensorType(mask.getType());
   if (!tensorTy)
     return 1;
   auto *axisInfo = getAxisInfo(mask);
 
@@ -1,13 +1,24 @@
 cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
 project(reproducer)
-
 set(CMAKE_CXX_COMPILER icpx)
 set(BUILD_SHARED_LIBS OFF)
 
 list(APPEND CMAKE_PREFIX_PATH "/opt/intel/oneapi/tbb/latest/lib/cmake/tbb/")
 
 find_package(Torch REQUIRED)
 
+# Fetch nlohmann/json at a pinned tag. The configure, build and install steps
+# are disabled: the target only consumes headers from the checked-out sources.
+include(ExternalProject)
+ExternalProject_Add(
+    json
+    GIT_REPOSITORY https://github.com/nlohmann/json.git
+    GIT_TAG v3.11.2
+    PREFIX ${CMAKE_BINARY_DIR}/nlohmann_json
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND ""
+    INSTALL_COMMAND ""
+)
+# Header directory inside the ExternalProject checkout (<PREFIX>/src/<name>/include).
+set(JSON_INCLUDE_DIR ${CMAKE_BINARY_DIR}/nlohmann_json/src/json/include/)
+
 # Add preview-breaking-changes for ABI compatibility with SYCL library linked by PyTorch: https://github.com/pytorch/pytorch/commit/92bebb46fa9fd60523d8aeb7b5f1a3f488c4cd93
 set(COMPILE_FLAGS "-fsycl -Wall -fpreview-breaking-changes")
 set(LINK_FLAGS "-fsycl -lze_loader")
@@ -16,9 +27,10 @@ set(SYCL_FUNCTIONS_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/in
 
 set(TARGET_NAME SPIRVRunner)
 add_executable(${TARGET_NAME} ${TARGET_NAME}.cpp)
-target_include_directories(${TARGET_NAME} PRIVATE "/opt/intel/oneapi/compiler/latest/include" ${SYCL_FUNCTIONS_INCLUDE_DIR})
+target_include_directories(${TARGET_NAME} PRIVATE "/opt/intel/oneapi/compiler/latest/include" ${SYCL_FUNCTIONS_INCLUDE_DIR} ${JSON_INCLUDE_DIR})
 set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_FLAGS "${COMPILE_FLAGS}")
 set_target_properties(${TARGET_NAME} PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+add_dependencies(${TARGET_NAME} json)
 
 target_link_libraries(${TARGET_NAME} "${TORCH_LIBRARIES}")
 set_property(TARGET ${TARGET_NAME} PROPERTY CXX_STANDARD 17)
@@ -17,28 +17,44 @@ CMAKE_PREFIX_PATH=/abs/path/to/TorchConfig.cmake/FromAbove/ cmake -DCMAKE_BUILD_
 make -j
 ```
 
-## Configuring
+## Configuration
 
-`SPIRVRunner` is configured to run the `add_kernel.spv` SPIRV binary with inputs `x.py` and `y.py`. `add_kernel.spv` was generated from the `01-vector-add.py` tutorial.
+### Generate Data
 
-Kernels of different shapes require modifying parameters manually in the `SPIRVRunner`. Two places require modification:
+To use this utility, the Triton application must be run with the following environment variable set.
+Set it to the path of the directory where the serialized JSON, tensors and SPIR-V binary will be stored. It is recommended to clear the Triton cache.
+
+```
+export TRITON_XPU_DUMP_SPIRV_KERNEL_ARGS=< Absolute path to SPV Dumps >
+```
+
+The following input data is generated:
+
+1. args_data.json - (Kernel Arguments / Grid Configuration)
+2. tensors  (Tensors used by the kernel (.pt))
+3. SPIR-V binary (.spv)
 
-1. `launchKernel`: Add input Tensors to the function signature, add arguments as variables within the function. Arguments can be pulled from the `args` variable to `XPULauncher.__call__` method in `driver.py`. Arguments should be passed to the `sycl_kernel_launch` function. Note that we currently rely on `sycl::memcpy` to move the PyTorch Tensor to XPU. In later versions of PyTorch we should be able to delegate this responsibility to `PyTorch`, and pass the raw XPU `data_ptr()` from `PyTorch` to the kernel.
-2. `sycl_kernel_launch`: Place all `arg*` parameters into the `params` array and add an appropriate call to `set_scalar_arg` for each param, which tells `SYCL` what the arguments are for the kernel we are going to launch.
 
 ## Running
 
-Once the `SPIRVRunner` has been appropriately configured for the kernel and inputs, run the binary with no arguments:
+Help:
+`./build/SPIRVRunner` < Output Tensor Name >
+
+Note: `Output Tensor Name` is the tensor that needs to be copied back to the CPU and written to disk. The name must match the tensor's name (tensor_) and number as specified in the JSON file. Please refer to the `args_data.json` file.
+
+### Demo (01-vector-add.py)
+
+`SPIRVRunner` is configured to run the `add_kernel.spv` SPIRV binary with inputs `tensor_0.pt` and `tensor_1.pt` and output `tensor_2.pt`. `add_kernel.spv` was generated from the `01-vector-add.py` tutorial.
 
-`./build/SPIRVRunner`
+SPIRVRunner Usage:
+`./build/SPIRVRunner tensor_2`
 
 Expected output follows:
 
 ```
 Running on device: Intel(R) Data Center GPU Max 1100
-Tensor a: [98432], Float (393728 bytes)
-Tensor b: [98432], Float (393728 bytes)
 Read 3772 byte kernel.
+create kernel:add_kernel
 Loaded kernel with 0 registers and 0 register spills.
 Tensor output: [98432], Float (393728 bytes)
 Kernel return output: 1.37129
Original file line number	Diff line number	Diff line change
`@@ -13,7 +13,7 @@ struct TestAxisInfoPass`
`13`	`13`
`14`	`14`	`StringRef getArgument() const final { return "test-print-axis-info"; }`
`15`	`15`	`StringRef getDescription() const final {`
`16`		`- return "print the result of the alignment analysis pass";`
	`16`	`+ return "print the result of the axis analysis pass";`
`17`	`17`	`}`
`18`	`18`
`19`	`19`	`void runOnOperation() override {`