@@ -86,6 +86,11 @@ def generate(
     if extra_kwargs is not None:
         kwargs.update(extra_kwargs)
 
+    # if we didn't specify last_n_tokens and only_last_token is set to True, set last_n_tokens to 1, otherwise use default
+    # we do this since the output shape of only_last_token is different and therefore would change the logic in generate
+    if "last_n_tokens" not in kwargs and kwargs.get("only_last_token", False):
+        kwargs["last_n_tokens"] = 1
+
     is_fp8 = "fp8" in kwargs["attn_name"]
     if isinstance(input_ids, torch.Tensor):
         if len(input_ids.shape) == 1:
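A minimal sketch of the backward-compatibility default added in this hunk, assuming callers may still pass the old `only_last_token` flag; `_normalize_last_n_tokens` is a hypothetical helper name used only for illustration, not part of the repo:

```python
# Hypothetical helper mirroring the kwargs normalization above: an explicit
# last_n_tokens always wins, and only_last_token=True alone maps to last_n_tokens=1.
def _normalize_last_n_tokens(kwargs: dict) -> dict:
    if "last_n_tokens" not in kwargs and kwargs.get("only_last_token", False):
        kwargs["last_n_tokens"] = 1
    return kwargs

assert _normalize_last_n_tokens({"only_last_token": True})["last_n_tokens"] == 1
assert "last_n_tokens" not in _normalize_last_n_tokens({"only_last_token": False})
assert _normalize_last_n_tokens({"only_last_token": True, "last_n_tokens": 3})["last_n_tokens"] == 3
```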
@@ -233,7 +238,7 @@ def generate(
     kwargs["current_tkv_mask"] = None
     kwargs["left_padded_prompt_mask"] = None
     kwargs["use_cache"] = use_cache
-    only_last_token = kwargs.get("only_last_token", False)
+    last_n_tokens = kwargs.get("last_n_tokens", 0)
 
     prompt_length = input_ids.shape[1]
 
@@ -296,21 +301,20 @@ def generate(
                     t1._scale = current_kv_scales[layer_idx][0][seq_i].reshape(-1)
                     t2._scale = current_kv_scales[layer_idx][1][seq_i].reshape(-1)
 
-            only_last_token = kwargs.get("only_last_token", False)
+            last_n_tokens = kwargs.get("last_n_tokens", 0)
             output, current_kv_cache = model(
                 input_ids_i,
                 slot_mapping=slot_mapping_i,
                 position_ids=position_ids_i,
                 mask=mask_i,
                 past_key_value_states=current_kv_cache,
                 use_cache=kwargs["use_cache"],
-                only_last_token=only_last_token,
+                last_n_tokens=last_n_tokens,
                 attn_name=kwargs["attn_name"],
             )
 
             # only last token must be handled here to properly stack the tensors
-            if not only_last_token:
-                output = output[:, -1, :]
+            output = output[:, -1, :]
 
             # TODO: Figure out how to do this cleanly
             if "fp8" in kwargs["attn_name"]:
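To see why the `output[:, -1, :]` slice is now unconditional, here is an illustrative shape sketch (assumed shapes, not the repo's API): with `last_n_tokens`, each per-sequence prefill forward keeps a token dimension, so the final position is selected before the batch-size-1 results are stacked.

```python
import torch

vocab = 32000
# one forward pass per prompt (batch size 1 prefill), with differing kept lengths
per_seq_outputs = [torch.randn(1, n, vocab) for n in (5, 9, 1)]
# keep only the final position of each sequence so the tensors can be stacked
last_logits = [out[:, -1, :] for out in per_seq_outputs]   # each [1, vocab]
logits = torch.cat(last_logits, dim=0)                     # [3, vocab]
assert logits.shape == (3, vocab)
```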
@@ -341,6 +345,7 @@ def generate(
         kwargs["position_ids"] = kwargs["position_ids"].clone(
             memory_format=torch.contiguous_format
         )
+        kwargs["last_n_tokens"] = 1
 
         # we no longer have a global pos_i, each sequence has its own pos_i
         slot_mapping = []
@@ -396,8 +401,7 @@ def generate(
         # typically this is done outside of prefill/decode logic, but since this logic already exists as part of the
         # conditional for prefill (since prefill does this within a loop for each batch size 1 prefill), we also provide
         # this same logic as part of the decode conditional
-        if not only_last_token:
-            logits = logits[:, -1, :]
+        logits = logits[:, -1, :]
 
         output = (logits, past_key_value_states)
 
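A toy sketch of the decode-side convention this change assumes: each decode step forces `last_n_tokens=1`, so the model output stays `[batch, 1, vocab]` and the same `logits[:, -1, :]` slice applies to both prefill and decode before the next token is chosen. `pick_next_token` is an illustrative name only.

```python
import torch

def pick_next_token(logits_btv: torch.Tensor) -> torch.Tensor:
    last = logits_btv[:, -1, :]           # [batch, vocab]; a no-op squeeze when the token dim is 1
    return torch.argmax(last, dim=-1)     # greedy choice, purely for illustration

decode_logits = torch.randn(4, 1, 32000)  # batch of 4, one position per decode step
assert pick_next_token(decode_logits).shape == (4,)
```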