
Commit a34a161

Authored by kylesayrs, dsikka, and HDCharles
[Examples] Correct out-of-date warning for kv cache examples (vllm-project#2209)
## Purpose ##

* As of the attention refactor, CT inference with kv cache quantization is supported. Fix the incorrect information.

## Changes ##

* Remove note about CT inference not being supported
* Standardize sample generation code
* Remove note about gemma in transformers==4.49.0 (out of supported versions)

---------

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
Co-authored-by: Dipika Sikka <dipikasikka1@gmail.com>
Co-authored-by: HDCharles <39544797+HDCharles@users.noreply.github.com>
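The "standardize sample generation" change replaces the old `input_ids`-only call with a pattern that moves every tensor the tokenizer returns (both `input_ids` and `attention_mask`) to the model's device before unpacking the dict into `generate()`. A minimal stdlib-only sketch of that device-move pattern, using stub classes in place of torch/transformers objects so it runs without dependencies:

```python
# Sketch of the standardized sample-generation pattern from the updated
# examples. FakeTensor and FakeModel are hypothetical stand-ins for
# torch.Tensor and a transformers model; only the dict-move logic is real.

class FakeTensor:
    def __init__(self, data, device="cpu"):
        self.data = data
        self.device = device

    def to(self, device):
        # Mimics torch.Tensor.to(): returns a copy on the target device.
        return FakeTensor(self.data, device)


class FakeModel:
    device = "cuda:0"

    def generate(self, input_ids=None, attention_mask=None, max_new_tokens=100):
        # A real model would decode autoregressively; here we just check
        # that the inputs arrived on the model's device and echo them.
        assert input_ids.device == self.device, "inputs must be on the model device"
        return [input_ids]


def fake_tokenize(text):
    # Stands in for tokenizer(text, return_tensors="pt"), which returns a
    # dict-like BatchEncoding of CPU tensors.
    return {
        "input_ids": FakeTensor([101, 102]),
        "attention_mask": FakeTensor([1, 1]),
    }


model = FakeModel()

# The pattern used in all three updated examples: tokenize, move every
# tensor to the model's device, then unpack the dict into generate().
sample = fake_tokenize("Hello my name is")
sample = {key: value.to(model.device) for key, value in sample.items()}
output = model.generate(**sample, max_new_tokens=100)
print(output[0].device)  # prints "cuda:0"
```

The dict comprehension is the key difference from the old code: passing only `input_ids` silently drops the attention mask, whereas moving and unpacking the whole encoding forwards it too.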
1 parent f3f14af commit a34a161

File tree

3 files changed: +10 additions, −32 deletions


examples/quantization_kv_cache/gemma2_fp8_kv_example.py

Lines changed: 4 additions & 14 deletions
@@ -78,23 +78,13 @@ def process_and_tokenize(example):
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
 )
 
-print(
-    "Note: Inference with the quantized kv_cache is not supported. ",
-    "Please use vLLM for inference with the quantized kv_cache.",
-)
 # Confirm generations of the quantized model look sane.
-
-# NOTE: transformers 4.49.0 results in a generation error with gemma2.
-# Consider either downgrading your transformers version to a previous version
-# or use vLLM for sample generation.
-# Note: compile is disabled: https://github.com/huggingface/transformers/issues/38333
 print("\n\n")
-dispatch_for_generation(model)
 print("========== SAMPLE GENERATION ==============")
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
-    model.device
-)
-output = model.generate(input_ids, max_new_tokens=100, disable_compile=True)
+dispatch_for_generation(model)
+sample = tokenizer("Hello my name is", return_tensors="pt")
+sample = {key: value.to(model.device) for key, value in sample.items()}
+output = model.generate(**sample, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")

examples/quantization_kv_cache/llama3_fp8_kv_example.py

Lines changed: 3 additions & 10 deletions
@@ -1,5 +1,4 @@
 from datasets import load_dataset
-from loguru import logger
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from llmcompressor import oneshot
@@ -79,19 +78,13 @@ def process_and_tokenize(example):
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
 )
 
-logger.info(
-    "Running sample generation. ",
-    "Note: Inference with the quantized kv_cache is not supported. ",
-    "Please use vLLM for inference with the quantized kv_cache.",
-)
 # Confirm generations of the quantized model look sane.
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
-    model.device
-)
-output = model.generate(input_ids, max_new_tokens=100)
+sample = tokenizer("Hello my name is", return_tensors="pt")
+sample = {key: value.to(model.device) for key, value in sample.items()}
+output = model.generate(**sample, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")

examples/quantization_kv_cache/phi3.5_fp8_kv_example.py

Lines changed: 3 additions & 8 deletions
@@ -80,18 +80,13 @@ def process_and_tokenize(example):
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
 )
 
-print(
-    "Note: Inference with the quantized kv_cache is not supported. ",
-    "Please use vLLM for inference with the quantized kv_cache.",
-)
 # Confirm generations of the quantized model look sane.
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
-    model.device
-)
-output = model.generate(input_ids, max_new_tokens=100)
+sample = tokenizer("Hello my name is", return_tensors="pt")
+sample = {key: value.to(model.device) for key, value in sample.items()}
+output = model.generate(**sample, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
