intel
diff --git a/‎.github/actions/setup-pytorch/action.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/actions/setup-pytorch/action.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/pins/pytorch-upstream.txt‎
Lines changed: 1 addition & 1 deletion b/‎.github/pins/pytorch-upstream.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/integration-tests.yml‎
Lines changed: 1 addition & 2 deletions b/‎.github/workflows/integration-tests.yml‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎.github/workflows/integration-tests.yml.in‎
Lines changed: 1 addition & 2 deletions b/‎.github/workflows/integration-tests.yml.in‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎.github/workflows/llvm-build.yml‎
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/llvm-build.yml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎.gitignore‎
Lines changed: 3 additions & 0 deletions b/‎.gitignore‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎benchmarks/triton_kernels_benchmark/benchmark_driver.py‎
Lines changed: 61 additions & 3 deletions b/‎benchmarks/triton_kernels_benchmark/benchmark_driver.py‎
Lines changed: 61 additions & 3 deletions
diff --git a/‎benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchmark.py‎
Lines changed: 3 additions & 2 deletions b/‎benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchmark.py‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎bin/RegisterTritonDialects.h‎
Lines changed: 0 additions & 2 deletions b/‎bin/RegisterTritonDialects.h‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎cmake/llvm-hash.txt‎
Lines changed: 1 addition & 1 deletion b/‎cmake/llvm-hash.txt‎
Lines changed: 1 addition & 1 deletion
@@ -82,7 +82,7 @@ runs:
       uses: ./.github/actions/load
       env:
         # Increase this value to reset cache
-        CACHE_NUMBER: 11
+        CACHE_NUMBER: 12
       with:
         path: pytorch
         key: pytorch-$PYTORCH_CACHE_KEY-$CACHE_NUMBER
 
@@ -1 +1 @@
-8321eec009c8c79145ebccd51fdfc336e5f8b848
+487873f7cafeb0fd390eaefe40496b804bceabbd
@@ -10,8 +10,7 @@ name: Integration Tests
 on:
   workflow_dispatch:
   pull_request:
-    # You can name your branch dev-foo to get CI runs.
-    branches: [main, 'dev-**']
+    branches-ignore: ['llvm-**']
   merge_group:
     branches: [main, 'dev-**']
     types: [checks_requested]
 
@@ -9,8 +9,7 @@ name: Integration Tests
 on:
   workflow_dispatch:
   pull_request:
-    # You can name your branch dev-foo to get CI runs.
-    branches: [main, 'dev-**']
+    branches-ignore: ['llvm-**']
   merge_group:
     branches: [main, 'dev-**']
     types: [checks_requested]
 
@@ -157,8 +157,8 @@ jobs:
         cp -r /usr/aarch64-linux-gnu/lib ./arm-sysroot
         cp -r /usr/aarch64-linux-gnu/include ./arm-sysroot
         LINKER=$(pwd)/arm-sysroot/lib/ld-linux-aarch64.so.1
-        wget http://ftp.de.debian.org/debian/pool/main/g/gcc-defaults/gcc-aarch64-linux-gnu_14.1.0-2_amd64.deb
-        dpkg-deb -x gcc-aarch64-linux-gnu_14.1.0-2_amd64.deb ./arm-sysroot
+        wget http://ftp.de.debian.org/debian/pool/main/g/gcc-defaults/gcc-aarch64-linux-gnu_14.2.0-1_amd64.deb
+        dpkg-deb -x gcc-aarch64-linux-gnu_14.2.0-1_amd64.deb ./arm-sysroot
         export LD_LIBRARY_PATH=$(pwd)/arm-sysroot/lib:$LD_LIBRARY_PATH
         sudo ln -s $LINKER /lib/ld-linux-aarch64.so.1
         SYSROOT="$(pwd)/arm-sysroot"
 
@@ -31,6 +31,9 @@ python/triton/language/extra
 # Proton
 python/triton/profiler
 
+# Instrumentation
+python/triton/instrumentation
+
 # Python caches
 __pycache__/
 *.py[cod]
 
@@ -399,19 +399,77 @@ def format_of(ty):
     return src
 
 
+def serialize_kernel_metadata(arg, args_dict):  # Copy launch metadata fields from `arg` into `args_dict` (mutated in place; nothing is returned).
+    args_dict["num_warps"] = arg.num_warps
+    args_dict["threads_per_warp"] = arg.threads_per_warp
+    args_dict["shared_memory"] = arg.shared  # stored under "shared_memory" but read from `arg.shared`
+    args_dict["kernel_name"] = arg.name
+    args_dict["spv_name"] = f"{arg.name}.spv"  # kernel name plus ".spv"; presumably the SPIR-V file name the runner expects — TODO confirm
+
+
+def serialize_args(args, constants, signature):  # Dump the launch grid, kernel metadata, tensors and scalar arguments into the directory named by TRITON_XPU_DUMP_SPIRV_KERNEL_ARGS.
+    import numbers
+    dir_path = os.getenv("TRITON_XPU_DUMP_SPIRV_KERNEL_ARGS")  # NOTE(review): None if the variable is unset; the visible caller only calls this when it is set
+    if not os.path.exists(dir_path):
+        os.makedirs(dir_path)
+        print(f"Path to directory consisting of SPIR-V Runner data: {dir_path}")  # printed only when the directory was just created
+
+    cnt = 0
+    args_dict = {"gridX": args[cnt], "gridY": args[cnt + 1], "gridZ": args[cnt + 2]}  # the first three positional args are recorded as the grid
+    args_dict["argument_list"] = []
+    counts = {"tensors": 0, "scalars": 0, "karg_cnt": 0}  # karg_cnt is the index used to look up `signature` and `constants`
+    cnt = 4  # args[3] is not recorded; scanning starts at args[4]
+    for arg in args[cnt:]:
+        if type(arg).__name__ == "KernelMetadata":  # matched by class name rather than isinstance
+            serialize_kernel_metadata(arg, args_dict)
+
+        if isinstance(arg, torch.Tensor):
+            cpu_tensor = arg.cpu()  # the tensor is saved from its CPU copy
+            tensor_path = os.path.join(dir_path, f"tensor_{counts['tensors']}.pt")  # name depends only on the per-call tensor counter, so a later call reuses the same file names
+            with open(tensor_path, "wb") as f:
+                torch.save(cpu_tensor, f)
+            new_arg = {
+                "name": f"tensor_{counts['tensors']}", "type": "tensor", "dtype": str(arg.dtype), "ctype":
+                signature[counts["karg_cnt"]]
+            }
+            args_dict["argument_list"].append(new_arg)
+            counts["karg_cnt"] += 1
+            counts["tensors"] += 1
+
+        if isinstance(arg, numbers.Number):  # NOTE(review): bool values also satisfy numbers.Number
+            if counts["karg_cnt"] not in constants:  # arguments whose index is a key of `constants` are left out of the JSON list
+                new_arg = {
+                    "name": f"scalarArg_{counts['scalars']}", "type": "scalar", "value": args[cnt], "ctype":  # args[cnt] is the current `arg`: cnt advances once per iteration
+                    signature[counts["karg_cnt"]]
+                }
+                args_dict["argument_list"].append(new_arg)
+            counts["karg_cnt"] += 1  # both counters advance even when the argument was skipped as a constant
+            counts["scalars"] += 1
+        cnt += 1
+    # Dump argument info as a JSON file
+    json_path = os.path.join(dir_path, "args_data.json")  # fixed file name inside dir_path
+    with open(json_path, "w", encoding="utf-8") as json_file:
+        import json
+        json.dump(args_dict, json_file, indent=4)
+
+
class XPULauncher:  # Builds a launcher module in __init__ and forwards each call to that module's `launch` entry point.
 
     def __init__(self, src, metadata):  # pylint: disable=unused-argument
         ids = {"ids_of_const_exprs": src.fn.constexprs if hasattr(src, "fn") else tuple()}
         constants = src.constants if hasattr(src, "constants") else {}
         cst_key = lambda i: src.fn.arg_names.index(i) if isinstance(i, str) else i  # string keys become their position in src.fn.arg_names; other keys pass through
-        constants = {cst_key(key): value for key, value in constants.items()}
-        signature = {cst_key(key): value for key, value in src.signature.items()}
-        src = make_launcher(constants, signature, ids)
+        self.constants = {cst_key(key): value for key, value in constants.items()}  # kept on the instance so __call__ can pass it to serialize_args
+        self.signature = {cst_key(key): value for key, value in src.signature.items()}  # kept on the instance for the same reason
+        src = make_launcher(self.constants, self.signature, ids)  # `src` is rebound to the value make_launcher returns
         mod = compile_module_from_src(src, "__triton_launcher")
         self.launch = mod.launch
 
     def __call__(self, *args, **kwargs):
+        # Serialize KernelArguments for SPIR-V Runner
+        serialize_kernel_args = os.getenv("TRITON_XPU_DUMP_SPIRV_KERNEL_ARGS", None)  # read on every call
+        if serialize_kernel_args:  # unset or empty disables the dump
+            serialize_args(args, self.constants, self.signature)  # runs before the launch; kwargs are not passed to it
         self.launch(*args, **kwargs)
 
 
 
@@ -171,7 +171,7 @@ def forward(q, k, v, causal, sm_scale):
     assert Lk in {16, 32, 64, 128}
     o = torch.empty_like(q, dtype=torch.float32)
     BLOCK_M = 128
-    BLOCK_N = 64 if Lk <= 64 else 32
+    BLOCK_N = 64
     num_stages = 3
     num_warps = 8 if Lq == 64 else 16
     stage = 3 if causal else 1
@@ -205,7 +205,8 @@ def forward(q, k, v, causal, sm_scale):
             BLOCK_DMODEL=Lk,  #
             STAGE=stage,  #
             num_warps=num_warps,  #
-            num_stages=num_stages  #
+            num_stages=num_stages,  #
+            grf_mode='large',  #
         )
     return o
 
 
@@ -88,8 +88,6 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
   mlir::registerTritonAMDGPUStreamPipeline();
   mlir::registerTritonAMDGPUStreamPipelineV2();
   mlir::registerTritonAMDGPUCanonicalizePointers();
-  mlir::triton::registerTritonAMDGPUInsertInstructionSchedHints();
-  mlir::triton::registerTritonAMDGPULowerInstructionSchedHints();
 
   // TODO: register Triton & TritonGPU passes
   registry.insert<mlir::triton::TritonDialect, mlir::cf::ControlFlowDialect,
 
@@ -1 +1 @@
-61f8a7f618901797ee8663389a29722f29216a96
+b5cc222d7429fe6f18c787f633d5262fac2e676f
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-8321eec009c8c79145ebccd51fdfc336e5f8b848`
	`1`	`+487873f7cafeb0fd390eaefe40496b804bceabbd`
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-61f8a7f618901797ee8663389a29722f29216a96`
	`1`	`+b5cc222d7429fe6f18c787f633d5262fac2e676f`