llvm · makslevental · Dec 2, 2025 · Dec 1, 2025 · Dec 1, 2025 · Dec 2, 2025
diff --git a/projects/eudsl-python-extras/examples/cuda_matmul_opt.py b/projects/eudsl-python-extras/examples/cuda_matmul_opt.py
@@ -415,7 +415,7 @@ def sgemm_shared_mem_1d_block_tiling[
     inner_row_B = tid / BN
 
     thread_results = memref.alloca((TM,), dtype)
-    linalg.fill(0, thread_results)
+    linalg.fill(0.0, thread_results)
 
     for bk_idx in range_(0, K, BK):
         # Move blocktile to beginning of A's row and B's column
@@ -483,13 +483,13 @@ def sgemm_shared_mem_2d_block_tiling[
     stride_B = num_threads_blocktile // BN
 
     thread_results = memref.alloca((TM, TN), dtype)
-    linalg.fill(0, thread_results)
+    linalg.fill(0.0, thread_results)
 
     reg_M = memref.alloca((TM,), dtype)
-    linalg.fill(0, reg_M)
+    linalg.fill(0.0, reg_M)
 
     reg_N = memref.alloca((TN,), dtype)
-    linalg.fill(0, reg_N)
+    linalg.fill(0.0, reg_N)
 
     for bk_idx in range_(0, K, BK):
         A_ = A[c_row : c_row + BM, bk_idx : bk_idx + BK]
@@ -579,13 +579,13 @@ def sgemm_shared_mem_2d_block_tiling_vectorize[
     inner_row_B = tid / (BN // VECTOR_WIDTH)
 
     thread_results = memref.alloca((TM, TN), dtype)
-    linalg.fill(0, thread_results)
+    linalg.fill(0.0, thread_results)
 
     reg_M = memref.alloca((TM,), dtype)
-    linalg.fill(0, reg_M)
+    linalg.fill(0.0, reg_M)
 
     reg_N = memref.alloca((TN,), dtype)
-    linalg.fill(0, reg_N)
+    linalg.fill(0.0, reg_N)
 
     for bk_idx in range_(0, K, BK):
         A_ = A[c_row : c_row + BM, bk_idx : bk_idx + BK]
@@ -708,13 +708,13 @@ def sgemm_warp_tiling[
 
     # allocate thread-local cache for results in registerfile
     thread_results = memref.alloca((WMITER * TM, WNITER * TN), dtype)
-    linalg.fill(0, thread_results)
+    linalg.fill(0.0, thread_results)
 
     reg_M = memref.alloca((WMITER, TM), dtype)
-    linalg.fill(0, reg_M)
+    linalg.fill(0.0, reg_M)
 
     reg_N = memref.alloca((WNITER, TN), dtype)
-    linalg.fill(0, reg_N)
+    linalg.fill(0.0, reg_N)
 
     for bk_idx in range_(0, K, BK):
         A_ = A[c_row : c_row + BM, bk_idx : bk_idx + BK]

diff --git a/projects/eudsl-python-extras/tests/dialect/test_transform.py b/projects/eudsl-python-extras/tests/dialect/test_transform.py
@@ -702,7 +702,7 @@ def matmul_i8_i8(
         B: T.tensor(K, N, T.i8()),
     ):
         empty = tensor.empty(M, N, T.i8())
-        filled = linalg_dialect.fill(arith.constant(0), outs=[empty])
+        filled = linalg_dialect.fill(arith.constant(0, type=T.i8()), outs=[empty])
         return linalg.matmul(A, B, filled)
 
     @module(attrs={"transform.target_tag": StringAttr.get("payload")})
@@ -856,8 +856,8 @@ def main(variant_op: any_op_t()):
           module attributes {transform.target_tag = "payload"} {
             func.func @matmul_i8_i8(%arg0: tensor<16x256xi8>, %arg1: tensor<256x256xi8>) -> tensor<16x256xi8> {
               %0 = tensor.empty() : tensor<16x256xi8>
-              %c0_i32 = arith.constant 0 : i32
-              %1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<16x256xi8>) -> tensor<16x256xi8>
+              %c0_i8 = arith.constant 0 : i8
+              %1 = linalg.fill ins(%c0_i8 : i8) outs(%0 : tensor<16x256xi8>) -> tensor<16x256xi8>
               %2 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%arg0, %arg1 : tensor<16x256xi8>, tensor<256x256xi8>) outs(%1 : tensor<16x256xi8>) -> tensor<16x256xi8>
               return %2 : tensor<16x256xi8>
             }
@@ -924,7 +924,7 @@ def matmul_i8_i8(
         B: T.tensor(K, N, T.i8()),
     ):
         empty = tensor.empty(M, N, T.i8())
-        filled = linalg_dialect.fill(arith.constant(0), outs=[empty])
+        filled = linalg_dialect.fill(arith.constant(0, type=T.i8()), outs=[empty])
         return linalg.matmul(A, B, filled)
 
     @module(attrs={"transform.target_tag": StringAttr.get("payload")})
@@ -997,13 +997,13 @@ def main(variant_op: any_op_t()):
         module {
           module attributes {transform.target_tag = "payload"} {
             func.func @matmul_i8_i8(%arg0: tensor<16x256xi8>, %arg1: tensor<256x256xi8>) -> tensor<16x256xi8> {
-              %c0_i32 = arith.constant 0 : i32
+              %c0_i8 = arith.constant 0 : i8
               %0 = tensor.empty() : tensor<16x256xi8>
               %1 = tensor.empty() : tensor<1x4x16x64xi8>
               %pack = linalg.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [16, 64] into %1 : tensor<16x256xi8> -> tensor<1x4x16x64xi8>
               %2 = tensor.empty() : tensor<4x1x64x64xi8>
               %3 = tensor.empty() : tensor<1x1x16x64xi8>
-              %4 = linalg.fill ins(%c0_i32 : i32) outs(%3 : tensor<1x1x16x64xi8>) -> tensor<1x1x16x64xi8>
+              %4 = linalg.fill ins(%c0_i8 : i8) outs(%3 : tensor<1x1x16x64xi8>) -> tensor<1x1x16x64xi8>
               %5 = scf.forall (%arg2, %arg3) in (1, 4) shared_outs(%arg4 = %0) -> (tensor<16x256xi8>) {
                 %6 = affine.apply #map(%arg3)
                 %extracted_slice = tensor.extract_slice %arg1[0, %6] [256, 64] [1, 1] : tensor<256x256xi8> to tensor<256x64xi8>

diff --git a/third_party/llvm-project b/third_party/llvm-project
+4 −2		clang/include/clang/Basic/arm_mve.td
+5 −4		clang/include/clang/Basic/arm_mve_defs.td
+4 −5		clang/lib/AST/RecordLayoutBuilder.cpp
+692 −320		clang/test/CodeGen/arm-mve-intrinsics/ternary.c
+11 −4		clang/utils/TableGen/MveEmitter.cpp
+47 −4		flang/lib/Lower/Runtime.cpp
+26 −2		flang/test/Lower/pause-statement.f90
+70 −87		lld/ELF/SyntheticSections.cpp
+11 −19		lld/ELF/SyntheticSections.h
+54 −56		llvm/docs/LangRef.rst
+2 −1		llvm/include/llvm/Analysis/TargetTransformInfo.h
+12 −1		llvm/include/llvm/CodeGen/BasicTTIImpl.h
+5 −15		llvm/include/llvm/CodeGen/ISDOpcodes.h
+3 −0		llvm/include/llvm/IR/IntrinsicsARM.td
+0 −7		llvm/include/llvm/MC/MCObjectFileInfo.h
+3 −3		llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+0 −4		llvm/lib/MC/MCObjectFileInfo.cpp
+12 −0		llvm/lib/Target/ARM/ARMInstrMVE.td
+16 −0		llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+4 −0		llvm/lib/Target/RISCV/RISCVISelLowering.h
+19 −0		llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+4 −0		llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+2 −1		llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+65 −65		llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
+112 −12		llvm/test/CodeGen/Thumb2/mve-intrinsics/strict-intrinsics.ll
+3 −12		llvm/test/CodeGen/X86/addcarry.ll
+89 −0		llvm/test/Transforms/LoopVectorize/select-umin-first-index.ll
+2 −4		llvm/tools/llc/llc.cpp
+2 −0		llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+9 −8		mlir/docs/Dialects/Linalg/OpDSL.md
+6 −12		mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
+1 −0		mlir/lib/Dialect/ControlFlow/IR/CMakeLists.txt
+33 −1		mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp
+21 −2		mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp
+4 −4		mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
+2 −2		mlir/test/Dialect/Affine/value-bounds-reification.mlir
+22 −0		mlir/test/Dialect/ControlFlow/canonicalize.mlir
+1 −25		mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir
+4 −4		mlir/test/Dialect/Linalg/generalize-named-polymorphic-ops.mlir
+18 −0		mlir/test/Dialect/Linalg/invalid.mlir
+2 −2		mlir/test/Integration/Dialect/Linalg/CPU/test-matmul-masked-vec.mlir
+2 −2		mlir/test/Integration/Dialect/Transform/match_matmul.mlir
+12 −14		mlir/test/python/integration/dialects/linalg/opsrun.py
+1 −0		utils/bazel/llvm-project-overlay/mlir/BUILD.bazel