iree-org
diff --git a/tests/kernel/wave/asm/test_waveasm_e2e.py b/tests/kernel/wave/asm/test_waveasm_e2e.py
Lines changed: 40 additions & 13 deletions
diff --git a/wave_lang/kernel/wave/asm/waveasm_e2e.py b/wave_lang/kernel/wave/asm/waveasm_e2e.py
Lines changed: 26 additions & 7 deletions
diff --git a/waveasm/include/waveasm/Dialect/WaveASMOps.td b/waveasm/include/waveasm/Dialect/WaveASMOps.td
Lines changed: 30 additions & 1 deletion
diff --git a/waveasm/include/waveasm/Transforms/Passes.td b/waveasm/include/waveasm/Transforms/Passes.td
Lines changed: 23 additions & 0 deletions
diff --git a/waveasm/include/waveasm/Transforms/TranslateFromMLIR.h b/waveasm/include/waveasm/Transforms/TranslateFromMLIR.h
Lines changed: 42 additions & 3 deletions
diff --git a/waveasm/lib/Transforms/AssemblyEmitter.cpp b/waveasm/lib/Transforms/AssemblyEmitter.cpp
Lines changed: 12 additions & 0 deletions
@@ -48,7 +48,7 @@
 
 import pytest
 
-from tests.kernel.common.utils import require_cdna4
+from tests.kernel.common.utils import param_bool, require_cdna4
 from wave_lang.kernel.wave.asm.waveasm_e2e import (
     WaveASMCompiler,
     capture_wave_kernel_info,
@@ -1319,6 +1319,8 @@ def _dbuf_mxfp4_helper(
     compiler,
     backend,
     dump_asm,
+    dynamic_dims=False,
+    use_buffer_ops=True,
 ):
     """Shared helper for double-buffered MXFP4 scheduled GEMM tests.
 
@@ -1349,6 +1351,8 @@ def _dbuf_mxfp4_helper(
     from wave_lang.kernel.wave.utils.mxfp_utils import (
         generate_gemm_afp4wfp4_inputs,
         torchScaledGemmMXFP4,
+        b_preshuffle,
+        e8m0_shuffle,
     )
 
     # Get tagged kernel + options (same as 7.1_schedule.py)
@@ -1359,8 +1363,9 @@ def _dbuf_mxfp4_helper(
             shape,
             block,
             wave_shape=(1, 4),
+            reorder_workgroups=not dynamic_dims,
         )
-        schedule = get_mxfp4_asymmetric_schedule()
+        schedule = get_mxfp4_asymmetric_schedule(is_bscale_shuffled=True)
     else:
         gemm, options = get_tagged_mxfp4_gemm(
             shape,
@@ -1373,8 +1378,24 @@ def _dbuf_mxfp4_helper(
     options.backend = "asm"
     options.wave_runtime = True
     options.compile_to_mlir = False
+    options.use_buffer_ops = use_buffer_ops
     options = set_default_run_config(options)
 
+    import wave_lang.kernel.lang as tkl
+
+    M = tkl.sym.M
+    N = tkl.sym.N
+    m, n, k = shape
+
+    dynamic_symbols = []
+    dynamic_values = {}
+    if dynamic_dims:
+        dynamic_symbols = [M, N]
+        dynamic_values = {M: m, N: n}
+        del options.subs[M]
+        del options.subs[N]
+        options.dynamic_symbols = dynamic_symbols
+
     # Generate MXFP4 inputs and reference output
     x, w, x_scales, w_scales = generate_gemm_afp4wfp4_inputs(shape)
     torch_out = torchScaledGemmMXFP4(x, w, x_scales, w_scales)
@@ -1384,7 +1405,9 @@ def _dbuf_mxfp4_helper(
     c = torch.zeros(shape[0], shape[1], dtype=torch.float32).cuda()
 
     # Capture MLIR with schedule applied
-    kernel_info = capture_wave_kernel_info(options, gemm, schedule=schedule)
+    kernel_info = capture_wave_kernel_info(
+        options, gemm, schedule=schedule, dynamic_values=dynamic_values
+    )
 
     # Verify MLIR contains scaled_mfma operation
     assert (
@@ -1424,8 +1447,10 @@ def _dbuf_mxfp4_helper(
 
     # Execute on GPU
     # Kernel signature: (a, a_scale, b, b_scale, c)
-    # For preshuffle B: transform B data and B scales to preshuffled layout
+    # For preshuffle B: transform all inputs to match kernel expectations.
+    # a_scale_preshuffle=True (default) means a_scales must also be shuffled.
     if num_waves <= 4:
+        x_scales = e8m0_shuffle(x_scales).contiguous()
         w_input = b_preshuffle(w.T.contiguous()).contiguous()
         w_scales_input = e8m0_shuffle(w_scales).contiguous()
     else:
@@ -1439,6 +1464,7 @@ def _dbuf_mxfp4_helper(
         block=block_size,
         shared_memory_bytes=lds_size,
         func_name=kernel_name,
+        dynamic_dims=[dynamic_values[s] for s in dynamic_symbols],
     )
 
     # Numerical correctness validation (same tolerance as existing MXFP4 test)
@@ -1453,25 +1479,26 @@ def _dbuf_mxfp4_helper(
     )
 
 
-@pytest.mark.xfail(
-    reason="Asymmetric schedule with wave_shape=(1,4) requires ~323 VGPRs, "
-    "exceeding the 256 hardware encoding limit. Needs LDS scale layout "
-    "fix or spilling to resolve.",
-)
-def test_dbuf_4wave_mxfp4_gemm_cpp_backend(compiler, backend, dump_asm):
+@param_bool("dynamic_dims", "dyn")
+@param_bool("use_buffer_ops", "bufops")
+def test_dbuf_4wave_mxfp4_gemm_cpp_backend(
+    dynamic_dims, use_buffer_ops, compiler, backend, dump_asm
+):
     """End-to-end test for asymmetric MXFP4 GEMM with 4 waves.
 
-    Uses get_mxfp4_asymmetric_schedule() with wave_shape=(1,4) and
-    B direct from global (no LDS).
+    Uses get_mxfp4_asymmetric_schedule() with wave_shape=(1,4),
+    preshuffle B, and block=(128,256,256) matching 7.1_schedule.py.
     """
     _dbuf_mxfp4_helper(
         shape=(1024, 1024, 8192),
-        block=(256, 256, 256),
+        block=(128, 256, 256),
         num_waves=4,
         use_stagger=False,
         compiler=compiler,
         backend=backend,
         dump_asm=dump_asm,
+        dynamic_dims=dynamic_dims,
+        use_buffer_ops=use_buffer_ops,
     )
 
 
 
@@ -33,7 +33,7 @@
 import tempfile
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Optional, List, Tuple
+from typing import Dict, Optional, List, Tuple
 
 import torch
 
@@ -436,7 +436,12 @@ def capture_wave_mlir(options, kernel_func) -> str:
     return mlir_text
 
 
-def capture_wave_kernel_info(options, kernel_func, schedule=None) -> CapturedKernelInfo:
+def capture_wave_kernel_info(
+    options,
+    kernel_func,
+    schedule=None,
+    dynamic_values: Optional[Dict] = None,
+) -> CapturedKernelInfo:
     """
     Capture MLIR and kernel launch info from Wave compilation.
 
@@ -447,6 +452,9 @@ def capture_wave_kernel_info(options, kernel_func, schedule=None) -> CapturedKer
         options: WaveCompileOptions
         kernel_func: Decorated wave kernel function
         schedule: Optional WaveSchedule to apply during compilation
+        dynamic_values: Optional dict mapping dynamic symbols to their concrete
+            values. Used for grid computation when symbols are not in
+            options.subs (i.e. truly dynamic shapes).
 
     Returns:
         CapturedKernelInfo with all launch information
@@ -517,13 +525,19 @@ def capture_wave_kernel_info(options, kernel_func, schedule=None) -> CapturedKer
         dynamic_syms = list(getattr(options, "dynamic_symbols", None) or [])
         grid_symbols = list(kernel_func.bound_scalar_symbols.keys()) + dynamic_syms
         grid_values = []
+        dv = dynamic_values or {}
         for sym in grid_symbols:
-            if sym not in options.subs:
+            if sym in options.subs:
+                grid_values.append(options.subs[sym])
+            elif sym in dv:
+                grid_values.append(dv[sym])
+            else:
                 raise ValueError(
-                    f"Grid symbol {sym} not found in options.subs. "
-                    f"Available: {list(options.subs.keys())}"
+                    f"Grid symbol {sym} not found in options.subs or "
+                    f"dynamic_values. "
+                    f"Available subs: {list(options.subs.keys())}, "
+                    f"dynamic_values: {list(dv.keys())}"
                 )
-            grid_values.append(options.subs[sym])
         grid = launch_info.grid(grid_values)
         grid = tuple(int(x) for x in grid)
 
@@ -617,6 +631,7 @@ def run_with_wave_runtime(
     block: Tuple[int, int, int],
     shared_memory_bytes: int = 0,
     func_name: str = "isolated_benchmark",
+    dynamic_dims: Optional[List[int]] = None,
 ):
     """
     Execute a compiled GPU binary using wave_runtime.
@@ -629,6 +644,8 @@ def run_with_wave_runtime(
         block: Block dimensions (x, y, z)
         shared_memory_bytes: Shared memory size
         func_name: Function name in the binary (default: "isolated_benchmark")
+        dynamic_dims: Optional list of concrete values for dynamic dimension
+            symbols, passed as additional kernel arguments.
     """
     import wave_runtime
 
@@ -660,11 +677,13 @@ def run_with_wave_runtime(
     kern_args = [tensor.data_ptr() for tensor in all_tensors]
     kernel_args = wave_runtime.Int64Vector(kern_args)
 
+    dyn_dims = wave_runtime.Int64Vector(dynamic_dims or [])
+
     # Prepare dynamic stride arguments
     stride_args = get_dynamic_stride_args(all_tensors)
 
     # Launch
-    wave_runtime.launch(kernel_launch_info, kernel_args, [], [], stride_args)
+    wave_runtime.launch(kernel_launch_info, kernel_args, dyn_dims, [], stride_args)
 
     # Sync
     torch.cuda.synchronize()
@@ -591,7 +591,14 @@ def WaveASM_V_LSHL_OR_B32 : VALUTernaryOp<"v_lshl_or_b32">;
 def WaveASM_V_LSHL_ADD_U32 : VALUTernaryOp<"v_lshl_add_u32">;
 
 // Conditional mask and lane operations
-def WaveASM_V_CNDMASK_B32 : VALUTernaryOp<"v_cndmask_b32">;
+// V_CNDMASK_B32 implicitly reads VCC, so it must NOT have Pure or
+// ArithmeticOp traits (which would make CSE treat two instances with
+// identical explicit operands as equivalent even when VCC differs).
+def WaveASM_V_CNDMASK_B32 : WAVEASMOp<"v_cndmask_b32", []> {
+  let arguments = (ins WaveASM_VALUSrc:$src0, WaveASM_VALUSrc:$src1, WaveASM_VALUSrc:$src2);
+  let results = (outs WaveASM_AnyVGPR:$dst);
+  let assemblyFormat = "$src0 `,` $src1 `,` $src2 attr-dict `:` type($src0) `,` type($src1) `,` type($src2) `->` type($dst)";
+}
 
 // Lane read operations (VGPR -> SGPR)
 def WaveASM_V_READLANE_B32 : WAVEASMOp<"v_readlane_b32", [Pure]> {
@@ -908,6 +915,28 @@ def WaveASM_S_MOV_B32_M0 : WAVEASMOp<"s_mov_b32_m0", [WaveASM_SpecialRegOp]> {
   let assemblyFormat = "$src attr-dict `:` type($src)";
 }
 
+def WaveASM_S_AND_SAVEEXEC_B64 : WAVEASMOp<"s_and_saveexec_b64", [WaveASM_SpecialRegOp]> {
+  let summary = "Save exec to dst, then AND exec with VCC (implicit)";
+  let description = [{
+    dst = exec; exec &= vcc.
+    VCC is read implicitly (set by a preceding V_CMP).
+    Used for conditional execution: lanes where VCC is 0 become inactive.
+    The saved exec is restored later via s_mov_b64_exec.
+  }];
+  let results = (outs WaveASM_AnySGPR:$dst);
+  let assemblyFormat = "attr-dict `->` type($dst)";
+}
+
+def WaveASM_S_MOV_B64_EXEC : WAVEASMOp<"s_mov_b64_exec", [WaveASM_SpecialRegOp]> {
+  let summary = "Restore exec from saved SGPR pair";
+  let description = [{
+    exec = src.
+    Used to restore exec after a conditional execution region.
+  }];
+  let arguments = (ins WaveASM_AnySGPR:$src);
+  let assemblyFormat = "$src attr-dict `:` type($src)";
+}
+
 //===----------------------------------------------------------------------===//
 // VMEM Atomic Instructions
 //===----------------------------------------------------------------------===//
 
@@ -223,6 +223,29 @@ def WAVEASMScalePackElimination : Pass<"waveasm-scale-pack-elimination"> {
   let dependentDialects = ["::waveasm::WaveASMDialect"];
 }
 
+//===----------------------------------------------------------------------===//
+// Extract Scalarization Pass
+//===----------------------------------------------------------------------===//
+
+def WAVEASMExtractScalarization
+    : Pass<"waveasm-extract-scalarization"> {
+  let summary = "Scalarize vector.extract from broadcast+dense-const patterns";
+  let description = [{
+    Pre-translation pass that rewrites
+      vector.extract[k]( index_cast?( select?( addi(broadcast(x), dense<[...]>) )))
+    into scalar operations: arith.addi %x, dense[k], with an optional scalar
+    arith.select if the original chain included one.
+
+    This eliminates non-splat dense vector constants before the WaveASM
+    translator runs, so translation handlers only see ordinary scalar IR.
+  }];
+
+  let dependentDialects = [
+    "::mlir::arith::ArithDialect",
+    "::mlir::vector::VectorDialect"
+  ];
+}
+
 //===----------------------------------------------------------------------===//
 // Memory Offset Optimization Pass
 //===----------------------------------------------------------------------===//
 
@@ -341,9 +341,18 @@ class TranslationContext {
     int64_t srdBaseIndex; // SGPR index for SRD (e.g., 8 for s[8:11])
   };
 
+  /// Information about a pending scalar kernel argument load (index, i32, etc.)
+  struct PendingScalarArg {
+    mlir::Value blockArg; // The MLIR block argument
+    int64_t argIndex;     // Position in function signature
+  };
+
   /// Queue an SRD setup for a binding
   void queueSRDSetup(mlir::Value memref, int64_t argIndex, int64_t bufferSize);
 
+  /// Queue a scalar argument load from the kernarg buffer
+  void queueScalarArgLoad(mlir::Value blockArg, int64_t argIndex);
+
   /// Emit all pending SRD setup instructions (called at start of kernel body)
   void emitSRDPrologue();
 
@@ -398,8 +407,10 @@ class TranslationContext {
   /// Update buffer size for a pending SRD (called when we see reinterpret_cast)
   void updateSRDBufferSize(mlir::Value memref, int64_t bufferSize);
 
-  /// Get the number of kernel arguments (based on pending SRD count)
-  size_t getNumKernelArgs() const { return pendingSRDs.size(); }
+  /// Get the number of kernel arguments (bindings + scalar args)
+  size_t getNumKernelArgs() const {
+    return pendingSRDs.size() + pendingScalarArgs.size();
+  }
 
   //===--------------------------------------------------------------------===//
   // Split Vector Result Tracking
@@ -514,6 +525,30 @@ class TranslationContext {
     return ldsBaseOffsetMap.contains(memref);
   }
 
+  //===--------------------------------------------------------------------===//
+  // Dynamic Stride Tracking (for memref.reinterpret_cast with runtime strides)
+  //===--------------------------------------------------------------------===//
+
+  /// Store a dynamic (runtime) stride value for a memref dimension.
+  /// \p strideValue is the mapped WaveASM SSA value holding the element stride.
+  void setDynamicStride(mlir::Value memref, unsigned dim,
+                        mlir::Value strideValue) {
+    dynamicStrideMap[memref][dim] = strideValue;
+  }
+
+  /// Get the dynamic stride recorded for a memref dimension.
+  /// Returns nullopt if none was recorded (e.g. the stride is static).
+  std::optional<mlir::Value> getDynamicStride(mlir::Value memref,
+                                              unsigned dim) const {
+    auto it = dynamicStrideMap.find(memref);
+    if (it == dynamicStrideMap.end())
+      return std::nullopt;
+    auto dimIt = it->second.find(dim);
+    if (dimIt == it->second.end())
+      return std::nullopt;
+    return dimIt->second;
+  }
+
   /// Track a pending per-workgroup SRD base adjustment for a linearized memref
   struct PendingSRDBaseAdjust {
     mlir::Value elementOffset;
@@ -690,6 +725,9 @@ class TranslationContext {
 
   llvm::DenseMap<mlir::Value, PendingSRDBaseAdjust> pendingSRDBaseAdjustMap;
   llvm::SmallVector<PendingSRD, 4> pendingSRDs;
+  llvm::SmallVector<PendingScalarArg, 2> pendingScalarArgs;
+  llvm::DenseMap<mlir::Value, llvm::DenseMap<unsigned, mlir::Value>>
+      dynamicStrideMap;
   llvm::StringMap<mlir::Value> exprCache;
   int64_t nextSRDIndex =
       -1; // Will be computed lazily, starts after user+system SGPRs
@@ -739,7 +777,8 @@ struct VOffsetResult {
 VOffsetResult computeVOffsetFromIndices(mlir::MemRefType memrefType,
                                         mlir::ValueRange indices,
                                         TranslationContext &ctx,
-                                        mlir::Location loc);
+                                        mlir::Location loc,
+                                        mlir::Value base = nullptr);
 
 /// Emit inline SRD base adjustment for per-workgroup buffer addressing.
 /// Allocates a new SRD (5 SGPRs: base pair, hi/lo temporaries, offset temp),
 
@@ -471,6 +471,18 @@ std::optional<std::string> KernelGenerator::generateOp(Operation *op) {
         return result;
       })
 
+      .Case<S_AND_SAVEEXEC_B64>(
+          [&](S_AND_SAVEEXEC_B64 saveOp) -> std::optional<std::string> {
+            std::string dst = resolveValue(saveOp.getDst());
+            return "  s_and_saveexec_b64 " + dst + ", vcc";
+          })
+
+      .Case<S_MOV_B64_EXEC>(
+          [&](S_MOV_B64_EXEC restoreOp) -> std::optional<std::string> {
+            std::string src = resolveValue(restoreOp.getSrc());
+            return "  s_mov_b64 exec, " + src;
+          })
+
       .Case<S_BRANCH>([&](S_BRANCH branchOp) {
         return std::string("  s_branch ") +
                branchOp.getTarget().getRootReference().str();