From 08a049f36a96415b83385a3340e2784cf823157d Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Mon, 1 Dec 2025 01:37:32 +0000
Subject: [PATCH 1/3] [LLVM] Bump to 75aa01b89

---
 third_party/llvm-project | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/llvm-project b/third_party/llvm-project
index 24b87b8d..75aa01b8 160000
--- a/third_party/llvm-project
+++ b/third_party/llvm-project
@@ -1 +1 @@
-Subproject commit 24b87b8d4891d90afd8c4033a4997dedecbdd107
+Subproject commit 75aa01b89553bf4213a3b0e83829b6d0689941b9

From 2ec323158ff62a568001a54632d846f32885142e Mon Sep 17 00:00:00 2001
From: Maksim Levental
Date: Mon, 1 Dec 2025 13:59:46 -0800
Subject: [PATCH 2/3] fix linalg.fill

---
 .../tests/dialect/test_transform.py          | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/projects/eudsl-python-extras/tests/dialect/test_transform.py b/projects/eudsl-python-extras/tests/dialect/test_transform.py
index 4794902f..f38c1a79 100644
--- a/projects/eudsl-python-extras/tests/dialect/test_transform.py
+++ b/projects/eudsl-python-extras/tests/dialect/test_transform.py
@@ -702,7 +702,7 @@ def matmul_i8_i8(
         B: T.tensor(K, N, T.i8()),
     ):
         empty = tensor.empty(M, N, T.i8())
-        filled = linalg_dialect.fill(arith.constant(0), outs=[empty])
+        filled = linalg_dialect.fill(arith.constant(0, type=T.i8()), outs=[empty])
         return linalg.matmul(A, B, filled)
 
     @module(attrs={"transform.target_tag": StringAttr.get("payload")})
@@ -856,8 +856,8 @@ def main(variant_op: any_op_t()):
   module attributes {transform.target_tag = "payload"} {
     func.func @matmul_i8_i8(%arg0: tensor<16x256xi8>, %arg1: tensor<256x256xi8>) -> tensor<16x256xi8> {
       %0 = tensor.empty() : tensor<16x256xi8>
-      %c0_i32 = arith.constant 0 : i32
-      %1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<16x256xi8>) -> tensor<16x256xi8>
+      %c0_i8 = arith.constant 0 : i8
+      %1 = linalg.fill ins(%c0_i8 : i8) outs(%0 : tensor<16x256xi8>) -> tensor<16x256xi8>
       %2 = linalg.matmul {cast = #linalg.type_fn} ins(%arg0, %arg1 : tensor<16x256xi8>, tensor<256x256xi8>) outs(%1 : tensor<16x256xi8>) -> tensor<16x256xi8>
       return %2 : tensor<16x256xi8>
     }
@@ -924,7 +924,7 @@ def matmul_i8_i8(
         B: T.tensor(K, N, T.i8()),
     ):
         empty = tensor.empty(M, N, T.i8())
-        filled = linalg_dialect.fill(arith.constant(0), outs=[empty])
+        filled = linalg_dialect.fill(arith.constant(0, type=T.i8()), outs=[empty])
         return linalg.matmul(A, B, filled)
 
     @module(attrs={"transform.target_tag": StringAttr.get("payload")})
@@ -997,13 +997,13 @@ def main(variant_op: any_op_t()):
 module {
   module attributes {transform.target_tag = "payload"} {
     func.func @matmul_i8_i8(%arg0: tensor<16x256xi8>, %arg1: tensor<256x256xi8>) -> tensor<16x256xi8> {
-      %c0_i32 = arith.constant 0 : i32
+      %c0_i8 = arith.constant 0 : i8
       %0 = tensor.empty() : tensor<16x256xi8>
       %1 = tensor.empty() : tensor<1x4x16x64xi8>
       %pack = linalg.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [16, 64] into %1 : tensor<16x256xi8> -> tensor<1x4x16x64xi8>
       %2 = tensor.empty() : tensor<4x1x64x64xi8>
       %3 = tensor.empty() : tensor<1x1x16x64xi8>
-      %4 = linalg.fill ins(%c0_i32 : i32) outs(%3 : tensor<1x1x16x64xi8>) -> tensor<1x1x16x64xi8>
+      %4 = linalg.fill ins(%c0_i8 : i8) outs(%3 : tensor<1x1x16x64xi8>) -> tensor<1x1x16x64xi8>
       %5 = scf.forall (%arg2, %arg3) in (1, 4) shared_outs(%arg4 = %0) -> (tensor<16x256xi8>) {
         %6 = affine.apply #map(%arg3)
         %extracted_slice = tensor.extract_slice %arg1[0, %6] [256, 64] [1, 1] : tensor<256x256xi8> to tensor<256x64xi8>
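Note on PATCH 2/3: the test previously built the fill value with a bare arith.constant(0), which materializes an i32 zero; after the LLVM bump the i32 input no longer matches the i8 element type of the destination tensor, so the constant must be requested as i8 explicitly. A minimal sketch of the fixed pattern, assuming the same eudsl-python-extras helpers imported by test_transform.py (T, tensor, arith, and linalg as linalg_dialect); shapes here are only illustrative:

    # Destination tensor with i8 elements.
    M, N = 16, 256
    empty = tensor.empty(M, N, T.i8())

    # Before: arith.constant(0) produced an i32 zero, mismatching the
    # i8 output element type of the fill.
    # After: build the zero with the matching element type.
    zero_i8 = arith.constant(0, type=T.i8())
    filled = linalg_dialect.fill(zero_i8, outs=[empty])

This is what makes the expected IR in the test print %c0_i8 = arith.constant 0 : i8 instead of an i32 constant.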
From ca1aa4c9ba1d113afb82bacf29e8cbbc8c1f8f51 Mon Sep 17 00:00:00 2001
From: Maksim Levental
Date: Mon, 1 Dec 2025 16:10:12 -0800
Subject: [PATCH 3/3] fix example

---
 .../examples/cuda_matmul_opt.py              | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/projects/eudsl-python-extras/examples/cuda_matmul_opt.py b/projects/eudsl-python-extras/examples/cuda_matmul_opt.py
index 7f48c833..12623784 100644
--- a/projects/eudsl-python-extras/examples/cuda_matmul_opt.py
+++ b/projects/eudsl-python-extras/examples/cuda_matmul_opt.py
@@ -415,7 +415,7 @@ def sgemm_shared_mem_1d_block_tiling[
     inner_row_B = tid / BN
 
     thread_results = memref.alloca((TM,), dtype)
-    linalg.fill(0, thread_results)
+    linalg.fill(0.0, thread_results)
 
     for bk_idx in range_(0, K, BK):
         # Move blocktile to beginning of A's row and B's column
@@ -483,13 +483,13 @@ def sgemm_shared_mem_2d_block_tiling[
     stride_B = num_threads_blocktile // BN
 
     thread_results = memref.alloca((TM, TN), dtype)
-    linalg.fill(0, thread_results)
+    linalg.fill(0.0, thread_results)
 
     reg_M = memref.alloca((TM,), dtype)
-    linalg.fill(0, reg_M)
+    linalg.fill(0.0, reg_M)
 
     reg_N = memref.alloca((TN,), dtype)
-    linalg.fill(0, reg_N)
+    linalg.fill(0.0, reg_N)
 
     for bk_idx in range_(0, K, BK):
         A_ = A[c_row : c_row + BM, bk_idx : bk_idx + BK]
@@ -579,13 +579,13 @@ def sgemm_shared_mem_2d_block_tiling_vectorize[
     inner_row_B = tid / (BN // VECTOR_WIDTH)
 
     thread_results = memref.alloca((TM, TN), dtype)
-    linalg.fill(0, thread_results)
+    linalg.fill(0.0, thread_results)
 
     reg_M = memref.alloca((TM,), dtype)
-    linalg.fill(0, reg_M)
+    linalg.fill(0.0, reg_M)
 
     reg_N = memref.alloca((TN,), dtype)
-    linalg.fill(0, reg_N)
+    linalg.fill(0.0, reg_N)
 
     for bk_idx in range_(0, K, BK):
         A_ = A[c_row : c_row + BM, bk_idx : bk_idx + BK]
@@ -708,13 +708,13 @@ def sgemm_warp_tiling[
 
     # allocate thread-local cache for results in registerfile
     thread_results = memref.alloca((WMITER * TM, WNITER * TN), dtype)
-    linalg.fill(0, thread_results)
+    linalg.fill(0.0, thread_results)
 
     reg_M = memref.alloca((WMITER, TM), dtype)
-    linalg.fill(0, reg_M)
+    linalg.fill(0.0, reg_M)
 
     reg_N = memref.alloca((WNITER, TN), dtype)
-    linalg.fill(0, reg_N)
+    linalg.fill(0.0, reg_N)
 
     for bk_idx in range_(0, K, BK):
         A_ = A[c_row : c_row + BM, bk_idx : bk_idx + BK]
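Note on PATCH 3/3: the CUDA matmul example allocates its register tiles with a floating-point dtype, so filling them with the integer literal 0 creates the same input/output type mismatch in the fill; the float literal 0.0 yields a splat of the buffer's element type. A minimal sketch, assuming the memref/linalg helpers used in cuda_matmul_opt.py, with TM and dtype standing in for the example's tile size and element type:

    # Hypothetical tile size and element type (the example's dtype is a float type).
    TM = 8
    dtype = T.f32()

    # Thread-local accumulator registers.
    thread_results = memref.alloca((TM,), dtype)

    # Before: linalg.fill(0, ...) built an integer zero, mismatching the
    # float element type. After: a float literal matches dtype.
    linalg.fill(0.0, thread_results)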