From 08a049f36a96415b83385a3340e2784cf823157d Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Mon, 1 Dec 2025 01:37:32 +0000
Subject: [PATCH 1/3] [LLVM] Bump to 75aa01b89

---
 third_party/llvm-project | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/llvm-project b/third_party/llvm-project
index 24b87b8d..75aa01b8 160000
--- a/third_party/llvm-project
+++ b/third_party/llvm-project
@@ -1 +1 @@
-Subproject commit 24b87b8d4891d90afd8c4033a4997dedecbdd107
+Subproject commit 75aa01b89553bf4213a3b0e83829b6d0689941b9

From 2ec323158ff62a568001a54632d846f32885142e Mon Sep 17 00:00:00 2001
From: Maksim Levental
Date: Mon, 1 Dec 2025 13:59:46 -0800
Subject: [PATCH 2/3] fix linalg.fill

---
 .../tests/dialect/test_transform.py          | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/projects/eudsl-python-extras/tests/dialect/test_transform.py b/projects/eudsl-python-extras/tests/dialect/test_transform.py
index 4794902f..f38c1a79 100644
--- a/projects/eudsl-python-extras/tests/dialect/test_transform.py
+++ b/projects/eudsl-python-extras/tests/dialect/test_transform.py
@@ -702,7 +702,7 @@ def matmul_i8_i8(
         B: T.tensor(K, N, T.i8()),
     ):
         empty = tensor.empty(M, N, T.i8())
-        filled = linalg_dialect.fill(arith.constant(0), outs=[empty])
+        filled = linalg_dialect.fill(arith.constant(0, type=T.i8()), outs=[empty])
         return linalg.matmul(A, B, filled)
 
     @module(attrs={"transform.target_tag": StringAttr.get("payload")})
@@ -856,8 +856,8 @@ def main(variant_op: any_op_t()):
   module attributes {transform.target_tag = "payload"} {
     func.func @matmul_i8_i8(%arg0: tensor<16x256xi8>, %arg1: tensor<256x256xi8>) -> tensor<16x256xi8> {
       %0 = tensor.empty() : tensor<16x256xi8>
-      %c0_i32 = arith.constant 0 : i32
-      %1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<16x256xi8>) -> tensor<16x256xi8>
+      %c0_i8 = arith.constant 0 : i8
+      %1 = linalg.fill ins(%c0_i8 : i8) outs(%0 : tensor<16x256xi8>) -> tensor<16x256xi8>
       %2 = linalg.matmul {cast = #linalg.type_fn} ins(%arg0, %arg1 : tensor<16x256xi8>, tensor<256x256xi8>) outs(%1 : tensor<16x256xi8>) -> tensor<16x256xi8>
       return %2 : tensor<16x256xi8>
     }
@@ -924,7 +924,7 @@ def matmul_i8_i8(
         B: T.tensor(K, N, T.i8()),
     ):
         empty = tensor.empty(M, N, T.i8())
-        filled = linalg_dialect.fill(arith.constant(0), outs=[empty])
+        filled = linalg_dialect.fill(arith.constant(0, type=T.i8()), outs=[empty])
         return linalg.matmul(A, B, filled)
 
     @module(attrs={"transform.target_tag": StringAttr.get("payload")})
@@ -997,13 +997,13 @@ def main(variant_op: any_op_t()):
 module {
   module attributes {transform.target_tag = "payload"} {
     func.func @matmul_i8_i8(%arg0: tensor<16x256xi8>, %arg1: tensor<256x256xi8>) -> tensor<16x256xi8> {
-      %c0_i32 = arith.constant 0 : i32
+      %c0_i8 = arith.constant 0 : i8
       %0 = tensor.empty() : tensor<16x256xi8>
       %1 = tensor.empty() : tensor<1x4x16x64xi8>
       %pack = linalg.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [16, 64] into %1 : tensor<16x256xi8> -> tensor<1x4x16x64xi8>
       %2 = tensor.empty() : tensor<4x1x64x64xi8>
       %3 = tensor.empty() : tensor<1x1x16x64xi8>
-      %4 = linalg.fill ins(%c0_i32 : i32) outs(%3 : tensor<1x1x16x64xi8>) -> tensor<1x1x16x64xi8>
+      %4 = linalg.fill ins(%c0_i8 : i8) outs(%3 : tensor<1x1x16x64xi8>) -> tensor<1x1x16x64xi8>
       %5 = scf.forall (%arg2, %arg3) in (1, 4) shared_outs(%arg4 = %0) -> (tensor<16x256xi8>) {
         %6 = affine.apply #map(%arg3)
         %extracted_slice = tensor.extract_slice %arg1[0, %6] [256, 64] [1, 1] : tensor<256x256xi8> to tensor<256x64xi8>
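Note on PATCH 2/3: the test previously built the fill value with a bare arith.constant(0), which materializes an i32 zero; after the LLVM bump the i32 input no longer matches the i8 element type of the destination tensor, so the constant must be requested as i8 explicitly. A minimal sketch of the fixed pattern, assuming the same eudsl-python-extras helpers imported by test_transform.py (T, tensor, arith, and linalg as linalg_dialect); shapes here are only illustrative:

    # Destination tensor with i8 elements.
    M, N = 16, 256
    empty = tensor.empty(M, N, T.i8())

    # Before: arith.constant(0) produced an i32 zero, mismatching the
    # i8 output element type of the fill.
    # After: build the zero with the matching element type.
    zero_i8 = arith.constant(0, type=T.i8())
    filled = linalg_dialect.fill(zero_i8, outs=[empty])

This is what makes the expected IR in the test print %c0_i8 = arith.constant 0 : i8 instead of an i32 constant.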
From ca1aa4c9ba1d113afb82bacf29e8cbbc8c1f8f51 Mon Sep 17 00:00:00 2001
From: Maksim Levental
Date: Mon, 1 Dec 2025 16:10:12 -0800
Subject: [PATCH 3/3] fix example

---
 .../examples/cuda_matmul_opt.py              | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/projects/eudsl-python-extras/examples/cuda_matmul_opt.py b/projects/eudsl-python-extras/examples/cuda_matmul_opt.py
index 7f48c833..12623784 100644
--- a/projects/eudsl-python-extras/examples/cuda_matmul_opt.py
+++ b/projects/eudsl-python-extras/examples/cuda_matmul_opt.py
@@ -415,7 +415,7 @@ def sgemm_shared_mem_1d_block_tiling[
     inner_row_B = tid / BN
 
     thread_results = memref.alloca((TM,), dtype)
-    linalg.fill(0, thread_results)
+    linalg.fill(0.0, thread_results)
 
     for bk_idx in range_(0, K, BK):
         # Move blocktile to beginning of A's row and B's column
@@ -483,13 +483,13 @@ def sgemm_shared_mem_2d_block_tiling[
     stride_B = num_threads_blocktile // BN
 
     thread_results = memref.alloca((TM, TN), dtype)
-    linalg.fill(0, thread_results)
+    linalg.fill(0.0, thread_results)
 
     reg_M = memref.alloca((TM,), dtype)
-    linalg.fill(0, reg_M)
+    linalg.fill(0.0, reg_M)
 
     reg_N = memref.alloca((TN,), dtype)
-    linalg.fill(0, reg_N)
+    linalg.fill(0.0, reg_N)
 
     for bk_idx in range_(0, K, BK):
         A_ = A[c_row : c_row + BM, bk_idx : bk_idx + BK]
@@ -579,13 +579,13 @@ def sgemm_shared_mem_2d_block_tiling_vectorize[
     inner_row_B = tid / (BN // VECTOR_WIDTH)
 
     thread_results = memref.alloca((TM, TN), dtype)
-    linalg.fill(0, thread_results)
+    linalg.fill(0.0, thread_results)
 
     reg_M = memref.alloca((TM,), dtype)
-    linalg.fill(0, reg_M)
+    linalg.fill(0.0, reg_M)
 
     reg_N = memref.alloca((TN,), dtype)
-    linalg.fill(0, reg_N)
+    linalg.fill(0.0, reg_N)
 
     for bk_idx in range_(0, K, BK):
         A_ = A[c_row : c_row + BM, bk_idx : bk_idx + BK]
@@ -708,13 +708,13 @@ def sgemm_warp_tiling[
 
     # allocate thread-local cache for results in registerfile
     thread_results = memref.alloca((WMITER * TM, WNITER * TN), dtype)
-    linalg.fill(0, thread_results)
+    linalg.fill(0.0, thread_results)
 
     reg_M = memref.alloca((WMITER, TM), dtype)
-    linalg.fill(0, reg_M)
+    linalg.fill(0.0, reg_M)
 
     reg_N = memref.alloca((WNITER, TN), dtype)
-    linalg.fill(0, reg_N)
+    linalg.fill(0.0, reg_N)
 
     for bk_idx in range_(0, K, BK):
         A_ = A[c_row : c_row + BM, bk_idx : bk_idx + BK]
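Note on PATCH 3/3: the CUDA matmul example allocates its register tiles with a floating-point dtype, so filling them with the integer literal 0 creates the same input/output type mismatch in the fill; the float literal 0.0 yields a splat of the buffer's element type. A minimal sketch, assuming the memref/linalg helpers used in cuda_matmul_opt.py, with TM and dtype standing in for the example's tile size and element type:

    # Hypothetical tile size and element type (the example's dtype is a float type).
    TM = 8
    dtype = T.f32()

    # Thread-local accumulator registers.
    thread_results = memref.alloca((TM,), dtype)

    # Before: linalg.fill(0, ...) built an integer zero, mismatching the
    # float element type. After: a float literal matches dtype.
    linalg.fill(0.0, thread_results)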