Commit ac2c774

test: add a diagnostic script for prefix caching naning (#1987)

Signed-off-by: Terry Kong <terryk@nvidia.com>

1 parent 42f3043

File tree

3 files changed: +155 -1 lines changed


docs/adding-new-models.md

Lines changed: 51 additions & 1 deletion
@@ -311,4 +311,54 @@ The different compilation modes offer distinct trade-offs between accuracy and p
- **Eager vs CUDA Graph failures are normal** - don't panic if this fails
- **Focus on patterns** - some models are more sensitive than others
- **Use as guidance** - helps choose reliable compilation settings
- **Balance precision vs performance** - choose what works for your use case

## [5.prefix_caching_nan.py](https://github.com/NVIDIA-NeMo/RL/blob/main/tools/model_diagnostics/5.prefix_caching_nan.py)

Tests that prefix caching doesn't produce NaN logprobs when prior generation is rolled back into the prompt (the standard RL / multi-turn pattern). In vLLM >= 0.14, the second request can return all-NaN logprobs with `token_id=0` (`<unk>`) for every token after the first.

```sh
# Single version (requires 2+ GPUs for TP=2)
uv run --no-project --with "vllm==0.14.0" tools/model_diagnostics/5.prefix_caching_nan.py

# Test across multiple vLLM versions:
for ver in 0.11.2 0.13.0 0.14.0 0.15.0 0.15.1; do
  uv run --no-project --with "vllm==$ver" tools/model_diagnostics/5.prefix_caching_nan.py 2>&1 | tee "prefix_caching_nan_vllm_${ver}.log"
done
```
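The "prompt length: 13990 chars" figure in the logs below is fully determined by the script's constants; a quick sketch reproducing just the prompt construction:

```python
# Rebuild the counting prompt exactly as the diagnostic script does.
COUNT_UP_TO = 3000

numbers = " ".join(str(i) for i in range(1, COUNT_UP_TO + 1))
prompt = (
    "You are a counting assistant. Output ONLY numbers separated by spaces.\n\n"
    f"User: Continue counting: {numbers} "
)

print(len(prompt))  # 13990, matching the iteration-1 prompt length in the logs
```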

Expected pass output (vLLM 0.13.0):
```
Iteration 1 — prompt length: 13990 chars
tokens: 2048, finish_reason: length
text (first 100): '3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 '

Iteration 2 — prompt length: 16038 chars
tokens: 2048, finish_reason: length
text (first 100): '1 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 343'

[nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16] ALL GOOD!
```

Expected fail output (vLLM 0.14.0):
```
Iteration 1 — prompt length: 13990 chars
tokens: 2048, finish_reason: length
text (first 100): '3000\n\nAssistant: 2600 2601 2602 2603 2604 2605 2606 2607 2609 2610 ...'

Iteration 2 — prompt length: 16047 chars
tokens: 2048, finish_reason: length
text (first 100): '3'

Sample logprobs from iteration 2:
token[0] id=1051: logprob=-0.0005862186080776155 decoded='3'
token[1] id=0: logprob=nan decoded='<unk>'
token[2] id=0: logprob=nan decoded='<unk>'
token[2047] id=0: logprob=nan decoded='<unk>'

AssertionError: FAIL: 2047/2048 logprobs are NaN on iteration 2 (prefix caching is broken in vLLM 0.14.0)
```

Note: the `ERROR ... Engine core proc EngineCore_DP0 died unexpectedly` message that may appear after the assertion is just vLLM's engine shutting down ungracefully after the process exits — it is not a separate issue.

The script generates from a counting prompt, appends the output back into the prompt, and generates again. On the second generation, prefix caching reuses the KV cache from the first request's prefix. The bug causes the cached prefix to produce corrupted activations, resulting in `token_id=0` (`<unk>`) with `logprob=nan` for all tokens after the first.
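The failure check itself is small enough to sketch standalone. This assumes logprobs arrive as a list of per-step dicts mapping token id to logprob (the shape vLLM returns), with plain floats standing in for vLLM's `Logprob` objects:

```python
import math

def count_nan_logprobs(logprobs):
    """Count generation steps whose sampled-token logprob is NaN.

    `logprobs` mimics vLLM's per-step structure: a list of dicts
    mapping token_id -> logprob. None entries are skipped, and only
    the first entry per step (the sampled token) is inspected.
    """
    nan_count = 0
    for step in logprobs:
        if step is None:
            continue
        for _tid, lp in step.items():
            if isinstance(lp, float) and math.isnan(lp):
                nan_count += 1
            break
    return nan_count

# Healthy iteration: finite logprobs throughout.
print(count_nan_logprobs([{101: -0.02}, {102: -0.5}]))  # 0
# Broken iteration: token_id=0 with logprob=nan after the first token.
print(count_nan_logprobs([{1051: -0.0006}, {0: float("nan")}, {0: float("nan")}]))  # 2
```

A nonzero count on the second request is exactly the condition that trips the script's final assertion.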

pyrefly.toml

Lines changed: 1 addition & 0 deletions
@@ -134,6 +134,7 @@ project-includes = [
"tools/model_diagnostics/1.max_model_len_respected.py",
"tools/model_diagnostics/2.long_generation_decode_vs_prefill.py",
"tools/model_diagnostics/4.vllm_precision_compilation_test.py",
"tools/model_diagnostics/5.prefix_caching_nan.py",
]

# Disable TypedDict mutation errors since TypedDict objects are regular dicts at runtime

tools/model_diagnostics/5.prefix_caching_nan.py
Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
```py
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prefix caching NaN reproducer.

Tests that prefix caching doesn't produce NaN logprobs when prior generation
is rolled back into the prompt (the standard RL / multi-turn pattern).

Known failure: vLLM >= 0.14 may return token_id=0 (<unk>) with logprob=nan
for every token after the first on the second request.

Usage:
    python 5.prefix_caching_nan.py
    python 5.prefix_caching_nan.py --model meta-llama/Llama-3.1-8B-Instruct
"""

import argparse
import math

MODEL = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
TP = 2
MAX_TOKENS = 2048
MAX_MODEL_LEN = 32768
COUNT_UP_TO = 3000

parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, default=MODEL)
parser.add_argument("--tp", type=int, default=TP)
args = parser.parse_args()

# Import vLLM only after argument parsing so arg errors surface quickly.
import vllm
from vllm import LLM, SamplingParams

print(f"vLLM version: {vllm.__version__}")

numbers = " ".join(str(i) for i in range(1, COUNT_UP_TO + 1))
prompt = (
    "You are a counting assistant. Output ONLY numbers separated by spaces.\n\n"
    f"User: Continue counting: {numbers} "
)

llm = LLM(
    model=args.model,
    tensor_parallel_size=args.tp,
    enable_prefix_caching=True,
    max_model_len=MAX_MODEL_LEN,
    gpu_memory_utilization=0.90,
    trust_remote_code=True,
)
sampling_params = SamplingParams(temperature=0.0, max_tokens=MAX_TOKENS, logprobs=1)

# Iteration 1: initial generation (builds the prefix cache)
print(f"\nIteration 1 — prompt length: {len(prompt)} chars")
out1 = llm.generate([prompt], sampling_params)[0].outputs[0]
print(f"  tokens: {len(out1.token_ids)}, finish_reason: {out1.finish_reason}")
print(f"  text (first 100): {out1.text[:100]!r}")

# Iteration 2: extend prompt with prior output (triggers prefix cache reuse)
prompt += out1.text
print(f"\nIteration 2 — prompt length: {len(prompt)} chars")
out2 = llm.generate([prompt], sampling_params)[0].outputs[0]
print(f"  tokens: {len(out2.token_ids)}, finish_reason: {out2.finish_reason}")
print(f"  text (first 100): {out2.text[:100]!r}")

# Check for NaN logprobs (only the sampled token of each step is inspected)
nan_count = 0
if out2.logprobs:
    for step in out2.logprobs:
        if step is None:
            continue
        for _tid, lp_obj in step.items():
            lp = lp_obj.logprob if hasattr(lp_obj, "logprob") else lp_obj
            if isinstance(lp, float) and math.isnan(lp):
                nan_count += 1
            break

if nan_count > 0:
    print("\nSample logprobs from iteration 2:")
    for idx in [0, 1, 2, len(out2.logprobs) - 1]:
        if idx < len(out2.logprobs) and out2.logprobs[idx] is not None:
            for tid, lp_obj in out2.logprobs[idx].items():
                lp = lp_obj.logprob if hasattr(lp_obj, "logprob") else lp_obj
                decoded = (
                    lp_obj.decoded_token if hasattr(lp_obj, "decoded_token") else "?"
                )
                print(f"  token[{idx}] id={tid}: logprob={lp} decoded={decoded!r}")
                break

assert nan_count == 0, (
    f"FAIL: {nan_count}/{len(out2.token_ids)} logprobs are NaN on iteration 2 "
    f"(prefix caching is broken in vLLM {vllm.__version__})"
)
print(f"\n[{args.model}] ALL GOOD!")
```
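The generate, append, generate-again pattern the script exercises generalizes to any number of turns. A minimal sketch, with a hypothetical `generate` stub standing in for `llm.generate(...)`:

```python
def generate(prompt: str) -> str:
    # Stand-in for a real llm.generate(...) call; returns a fixed continuation.
    return " next-chunk"

def rollout(prompt: str, turns: int) -> str:
    # Each turn appends the model's output back into the prompt, so the next
    # request's prefix exactly matches the previous request's prompt+output.
    # This is the case prefix caching must handle without corrupting the KV cache.
    for _ in range(turns):
        prompt += generate(prompt)
    return prompt

print(rollout("count: 1 2 3", turns=3))  # "count: 1 2 3 next-chunk next-chunk next-chunk"
```

With real generation, every turn after the first hits the prefix cache; the diagnostic above only needs two turns to expose the NaN corruption.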
