meta-pytorch
diff --git a/oink/benchmarks/README.md b/oink/benchmarks/README.md
Lines changed: 6 additions & 0 deletions
diff --git a/oink/benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py b/oink/benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py
Lines changed: 49 additions & 6 deletions
diff --git a/oink/benchmarks/benchmark/benchmark_rmsnorm_bwd_sm100.py b/oink/benchmarks/benchmark/benchmark_rmsnorm_bwd_sm100.py
Lines changed: 4 additions & 7 deletions
@@ -96,6 +96,12 @@ CUDA_VISIBLE_DEVICES=0 python oink/benchmarks/benchmark/benchmark_fused_add_rmsn
   --json /tmp/fused_add_rmsnorm_sm100_bf16.json
 ```
 
+Note on the Quack baseline: Oink exposes an **in-place** fused op (it overwrites `x` and `residual`).
+Quack’s fused kernel produces `out` and `residual_out` out-of-place, so by default
+(`--quack-baseline kernel_inplace`) the benchmark times `quack::_rmsnorm_fwd` **plus** two explicit
+copies (`x.copy_(out)`, `residual.copy_(residual_out)`) to match the in-place semantics (integration-realistic).
+Use `--quack-baseline kernel` to time only the Quack fused kernel with preallocated outputs.
+
 ### RMSNorm backward
 
 ```bash
 
@@ -31,6 +31,15 @@
 DSv3 suite (Oink vs Quack, multi-shape):
   CUDA_VISIBLE_DEVICES=0 python oink/benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py --dtype bf16 --dsv3 \\
     --json /tmp/kernelagent_oink_sm100_suite_bf16/fused_add_rmsnorm_dsv3.json
+
+Quack baseline note:
+- Oink exposes an **in-place** fused op (writes `x` and `residual` in-place).
+- Quack provides an equivalent fused kernel, but it typically produces `out` and
+  `residual_out` (out-of-place) and does not expose a public "update my input
+  buffers in-place" API.
+- For integration realism (vLLM-style semantics) the default baseline times the
+  Quack fused kernel plus 2 explicit copies that apply the in-place updates, so the
+  benchmark covers the full semantic cost. Pass `--quack-baseline kernel` to time the kernel only.
 """
 
 from __future__ import annotations
@@ -177,6 +186,7 @@ def bench_one(
     warmup_ms: int,
     iters_ms: int,
     verify: bool,
+    quack_baseline: str,
 ) -> Dict[str, Any]:
     device = torch.device("cuda")
     x = torch.randn((M, N), device=device, dtype=dtype)
@@ -212,23 +222,40 @@ def fn():
     row.update(stats)
 
     if quack_rmsnorm_fwd_mut is not None:
-        out_q = torch.empty_like(x)
-        res_out_q = torch.empty_like(residual)
+        x_q = x.clone()
+        residual_q = residual.clone()
+        out_q = torch.empty_like(x_q)
+        res_out_q = torch.empty_like(residual_q)
 
-        def fn_q():
+        def fn_q_kernel():
             quack_rmsnorm_fwd_mut(
-                x,
+                x_q,
                 w,
                 out_q,
                 None,  # bias
                 None,  # rstd
                 None,  # mean
-                residual,
+                residual_q,
                 res_out_q,
                 1e-6,
                 False,  # is_layernorm
             )
 
+        if quack_baseline == "kernel":
+            fn_q = fn_q_kernel
+        elif quack_baseline == "kernel_inplace":
+
+            def fn_q():
+                fn_q_kernel()
+                # Apply the same in-place semantics as vLLM expects:
+                # - x is overwritten with y
+                # - residual is overwritten with z = x + residual
+                x_q.copy_(out_q)
+                residual_q.copy_(res_out_q)
+
+        else:
+            raise ValueError(f"Unknown quack_baseline: {quack_baseline}")
+
         ms_q = do_bench_triton(fn_q, warmup_ms=warmup_ms, rep_ms=iters_ms)
         gbps_q = bytes_io / (ms_q * 1e-3) / 1e9
         row.update(
@@ -287,6 +314,18 @@ def main() -> None:
     p.add_argument(
         "--iters", type=int, default=200, help="rep_ms for do_bench (default: 200)"
     )
+    p.add_argument(
+        "--quack-baseline",
+        type=str,
+        default="kernel_inplace",
+        choices=["kernel", "kernel_inplace"],
+        help=(
+            "How to time Quack for the in-place fused op.\n"
+            "- kernel: Quack fused kernel only (preallocated out/residual_out).\n"
+            "- kernel_inplace: Quack fused kernel + 2 explicit copies to apply "
+            "in-place semantics (integration-realistic)."
+        ),
+    )
     p.add_argument("--skip-verify", action="store_true")
     p.add_argument("--json", type=str, default=None)
     args = p.parse_args()
@@ -309,6 +348,7 @@ def main() -> None:
                 warmup_ms=int(args.warmup_ms),
                 iters_ms=int(args.iters),
                 verify=not bool(args.skip_verify),
+                quack_baseline=str(args.quack_baseline),
             )
         )
 
@@ -324,7 +364,10 @@ def main() -> None:
                 warmup_ms=int(args.warmup_ms),
                 rep_ms=int(args.iters),
                 method="triton.testing.do_bench(mean)",
-                note="Oink fused_add_rmsnorm_inplace_ vs Quack quack::_rmsnorm_fwd(residual=..., residual_out=...) when available",
+                note=(
+                    "Oink fused_add_rmsnorm_inplace_ vs Quack baseline "
+                    f"({args.quack_baseline}) when available"
+                ),
             ),
         )
 
 
@@ -17,7 +17,6 @@
 import argparse
 import csv
 import os
-import sys
 from dataclasses import dataclass
 from typing import List, Optional, Tuple
 
@@ -30,19 +29,17 @@
 # Ensure SM100 (GB200) architecture is recognized by CuTeDSL when running outside vLLM.
 os.environ.setdefault("CUTE_DSL_ARCH", "sm_100a")
 
-# Make the in-repo KernelAgent Oink package importable without an editable install.
-_HERE = os.path.dirname(os.path.abspath(__file__))
-_OINK_SRC = os.path.abspath(os.path.join(_HERE, "..", "src"))
-if _OINK_SRC not in sys.path:
-    sys.path.insert(0, _OINK_SRC)
-
 from bench_utils import (  # noqa: E402
     ErrorStatsAccumulator,
     collect_device_meta,
+    ensure_oink_src_on_path,
     error_stats_to_row,
     iter_row_blocks,
     write_json,
 )
+
+ensure_oink_src_on_path()
+
 from kernelagent_oink.blackwell import rmsnorm as oink_rmsnorm  # noqa: E402
 
 try: