automate generating cuda/cpu metrics from DPP #156
Status: Open. JRosenkranz wants to merge 7 commits into main from generate_metrics_dpp.
Changes shown from 6 of 7 commits.

Commits:
- eb2af75 (JRosenkranz): updated with a method to generate metrics in dpp
- 986cb0e (JRosenkranz): added a check against a threshold file
- bce63ae (JRosenkranz): added a per-sequence failure rate
- b453a56 (JRosenkranz): fixed merge conflicts
- 85f7b53 (JRosenkranz): fixed validation info loading bug when batch size > 9
- 70e82a1 (Ssukriti): merge main
- 8303606 (Ssukriti): set default arg
@@ -16,13 +16,13 @@
 from torch import distributed as dist
 from torch.fx.experimental import _config as fx_config
 from transformers import AutoTokenizer
+import numpy as np

 from aiu_fms_testing_utils.testing.validation import (
     GoldenTokenHook,
     LogitsExtractorHook,
     capture_level_1_metrics,
     extract_validation_information,
     filter_failed_level_1_cases,
     find_validation_info_path,
     get_validation_info_path,
     load_validation_information,
@@ -109,12 +109,26 @@
 )

 parser.add_argument(
-    "--cross_entropy_threshold",
+    "--default_cross_entropy_threshold",
     type=float,
     default=2.5,
     help="threshold to denote passing/failing a given iteration",
 )

+parser.add_argument(
+    "--cross_entropy_threshold_path",
+    type=str,
+    default=None,
+    help="path to a file with all expected cross-entropy loss thresholds per program, per sequence",
+)
+
+parser.add_argument(
+    "--per_sequence_failure_rate_threshold",
+    type=float,
+    default=0.1,
+    help="the threshold which denotes whether to pass or fail the test for a given sequence.",
+)
+
 parser.add_argument(
     "--failure_rate_threshold",
     type=float,
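For context, the file consumed via --cross_entropy_threshold_path (and produced via --generate_metrics_path, per the writer near the end of this diff) maps "<program_id>,<sample_key>" keys to per-sentence thresholds. A hypothetical sketch of its contents, with invented program IDs, sample keys, and values:

```python
# Hypothetical thresholds file contents (all keys and values invented for illustration).
# Outer keys are f"{program_id},{sample_key}"; inner keys are sentence indices as
# strings, matching how the --generate_metrics_path writer serializes them below.
example_thresholds = {
    "0,sample_a": {"0": 2.31, "1": 2.47},
    "3,sample_b": {"0": 2.18, "1": 2.52},
}
```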
@@ -179,6 +193,12 @@
     action="store_true",
     help="set to true ensure that all prompts hit the same prompt program for a given test",
 )
+parser.add_argument(
+    "--generate_metrics_path",
+    type=str,
+    default=None,
+    help="if set, will bypass AIU model processing and generate cross-entropy loss thresholds used for testing, and save the metrics to the given path",
+)

 args = parser.parse_args()
@@ -187,9 +207,19 @@
 model_variant = args.model_variant
 DATASET_PATH = args.dataset_path
 save_validation_info_outputs = args.save_validation_info_outputs
+generate_metrics = args.generate_metrics_path is not None
 tokenizer = AutoTokenizer.from_pretrained(model_variant)
 custom_shape = None

+default_cross_entropy_threshold = float(args.default_cross_entropy_threshold)
+program_threshold_dict = {}
+# if the path exists, load it as a json
+if args.cross_entropy_threshold_path is not None and os.path.exists(
+    args.cross_entropy_threshold_path
+):
+    with open(args.cross_entropy_threshold_path, "r") as f:
+        program_threshold_dict = json.load(f)
+
 if args.dataset_type == "custom":
     if local_rank == 0:
         dprint(
@@ -352,13 +382,17 @@ def __load_validation_info(

 distributed_kwargs = {}
 if USE_DISTRIBUTED:
+    if generate_metrics:
+        torch.cuda.set_device(local_rank)
Contributor comment (on the added torch.cuda.set_device(local_rank) line): I ran into a blocker testing this PR, as the machines in our cluster do not have GPUs. I think generating the metrics file for every new model will be a challenge from a CI/CD perspective, since we do not have access to GPU machines. So we need to think about this a bit.
     if args.dist_timeout > 0:
         # Default timeout:
         # https://docs.pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group
-        dist.init_process_group(timeout=datetime.timedelta(minutes=args.dist_timeout))
+        dist.init_process_group(
+            timeout=datetime.timedelta(minutes=args.dist_timeout), backend="gloo"
+        )
         dprint(f"NOTICE: init_process_group timeout set to {args.dist_timeout} minutes")
     else:
-        dist.init_process_group()
+        dist.init_process_group(backend="gloo")
     aiu_dist_setup(dist.get_rank(), dist.get_world_size())
     distributed_kwargs["distributed_strategy"] = "tp"
     distributed_kwargs["group"] = dist.group.WORLD
@@ -369,7 +403,7 @@ def __load_validation_info(
 with stagger_region(args.stagger_load):
     model = get_model(
         architecture="hf_pretrained",
-        device_type="cpu",
+        device_type="cuda" if generate_metrics else "cpu",
         data_type=None if is_fp8 else torch.float16,
         fused_weights=False,
         **model_path_kwargs,
@@ -378,7 +412,8 @@

 model.eval()
 fx_config.backed_size_oblivious = True
-model.compile(backend="sendnn", options={"sendnn.dynamic": True})
+if not generate_metrics:
+    model.compile(backend="sendnn", options={"sendnn.dynamic": True})

 __maybe_prepare_fp8_weights(model, is_fp8)
@@ -411,15 +446,17 @@
     and dist.get_world_size() == 4
 ):
     extra_kwargs["_kvcache_num_blocks_hint"] = KVCACHE_NUM_BLOCKS_HINT
-warmup_model(
-    model,
-    input_ids,
-    max_new_tokens=max_new_tokens,
-    compile_dynamic_sendnn=True,
-    stagger_update_lazyhandle=args.stagger_update_lazyhandle,
-    prefill_chunk_size=args.prefill_chunk_size,
-    **extra_kwargs,
-)
+
+if not generate_metrics:
+    warmup_model(
+        model,
+        input_ids,
+        max_new_tokens=max_new_tokens,
+        compile_dynamic_sendnn=True,
+        stagger_update_lazyhandle=args.stagger_update_lazyhandle,
+        prefill_chunk_size=args.prefill_chunk_size,
+        **extra_kwargs,
+    )

 if USE_DISTRIBUTED:
     # wait for rank0 to be finished as it is the only one generating the criteria json
@@ -629,18 +666,19 @@ def parse_program_limit(limit_str: str) -> tuple[int, str]:
 # metric calculator based on the cross-entropy and mean diff for each decode step
 def __metric_calculator(r: torch.Tensor, t: torch.Tensor):
     cross_entropy = torch.nn.CrossEntropyLoss()(
-        r, t.softmax(dim=1).to(dtype=torch.float32)
+        r, t.softmax(dim=1).to(device="cpu", dtype=torch.float32)
     )
     diff = torch.mean(
         torch.abs(
             r.softmax(dim=1).to(dtype=torch.float32)
-            - t.softmax(dim=1).to(dtype=torch.float32)
+            - t.softmax(dim=1).to(device="cpu", dtype=torch.float32)
         )
     )
     return (cross_entropy, diff)


-failed_cases = []
+per_sentence_failed_cases = []
+aggregate_failed_cases = []
 # for each program and valid prompt (batch size, sequence length)
 for program_id, valid_prompt, input_ids, extra_kwargs, sample_key in valid_prompts:
     extra_kwargs["attn_name"] = ATTN_NAME
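The device="cpu" additions pin the comparison to the host: in generate-metrics mode the tested logits (t) presumably come off a CUDA model while the reference (r) stays on CPU. A minimal standalone sketch of the same calculation, with dummy shapes and invented values:

```python
import torch

# metric calculator mirroring the diff: cross-entropy plus the mean absolute
# probability difference between reference (r) and test (t) logits
def metric_calculator(r: torch.Tensor, t: torch.Tensor):
    # move t to the CPU so it can be compared against a CPU-resident r,
    # even when t was produced on a CUDA device
    cross_entropy = torch.nn.CrossEntropyLoss()(
        r, t.softmax(dim=1).to(device="cpu", dtype=torch.float32)
    )
    diff = torch.mean(
        torch.abs(
            r.softmax(dim=1).to(dtype=torch.float32)
            - t.softmax(dim=1).to(device="cpu", dtype=torch.float32)
        )
    )
    return cross_entropy, diff

r = torch.randn(4, 32000)  # e.g. (batch, vocab) reference logits on CPU
t = torch.randn(4, 32000, device="cuda" if torch.cuda.is_available() else "cpu")
ce, mean_diff = metric_calculator(r, t)
print(f"cross-entropy={ce.item():.4f}, mean diff={mean_diff.item():.6f}")
```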
@@ -696,6 +734,14 @@ def __metric_calculator(r: torch.Tensor, t: torch.Tensor):
     )

     if args.test_type == "metrics":
+        # if we are generating metrics, all inputs should be on cuda device
+        if generate_metrics:
+            input_ids = input_ids.to("cuda")
+            extra_kwargs = {
+                k: v.to("cuda") if isinstance(v, torch.Tensor) else v
+                for k, v in extra_kwargs.items()
+            }
+
         aiu_validation_info = extract_validation_information(
             model,
             input_ids,
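That dict comprehension moves only tensor-valued entries of extra_kwargs, leaving strings and flags untouched. A small illustration with made-up keys and an availability guard (both are assumptions, not taken from this PR):

```python
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
input_ids = torch.randint(0, 32000, (2, 64)).to(device)
extra_kwargs = {"mask": torch.ones(2, 64), "attn_name": "sdpa"}  # hypothetical entries

# move only tensor values to the target device; pass everything else through
extra_kwargs = {
    k: v.to(device) if isinstance(v, torch.Tensor) else v
    for k, v in extra_kwargs.items()
}
print({k: getattr(v, "device", type(v).__name__) for k, v in extra_kwargs.items()})
```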
@@ -733,12 +779,70 @@ def __metric_calculator(r: torch.Tensor, t: torch.Tensor):
                     f'For Program {program_id} in sentence {sentence_idx + 1}: the metric for token {token_idx} is {metrics_value}, AIU ID="{aiu_token.item()}" | STR="{aiu_str}" -- CPU ID="{cpu_token.item()}" | CPU STR="{cpu_str}"'
                 )

-        ce_fail_responses = filter_failed_level_1_cases(
-            level_1_metrics, lambda m: m[0] >= args.cross_entropy_threshold
-        )
-        failure_rate = len(ce_fail_responses) / len(level_1_metrics)
-        if failure_rate >= args.failure_rate_threshold:
-            failed_cases.append((program_id, valid_prompt, failure_rate))
+        # if generating metrics, get the 99th percentile ce threshold per sentence
+        # otherwise test the thresholds
+        if generate_metrics:
+            sentence_ce_dict = {}
+            for sentence_idx, token_idx, metrics_value in level_1_metrics:
+                sentence_ce_dict.setdefault(sentence_idx, [])
+                sentence_ce_dict[sentence_idx].append(metrics_value[0])
+
+            sentence_ce_threshold = {
+                k: np.percentile(v, 99) for k, v in sentence_ce_dict.items()
+            }
+            if local_rank == 0:
+                dprint(
+                    f"Program {str(program_id.program_id)} produced the following thresholds:\n{sentence_ce_threshold}"
+                )
+            program_threshold_dict[(str(program_id.program_id), sample_key)] = (
+                sentence_ce_threshold
+            )
+        else:
+            sentence_failures_dict = {}
+            for sentence_idx, token_idx, metrics_value in level_1_metrics:
+                program_threshold_key = f"{str(program_id.program_id)},{sample_key}"
+                if (
+                    len(program_threshold_dict) != 0
+                    and program_threshold_key not in program_threshold_dict
+                    and local_rank == 0
+                ):
+                    dprint(
+                        f"could not find the following key {program_threshold_key}, defaulting to {default_cross_entropy_threshold}"
+                    )
+                ce_threshold = program_threshold_dict.get(
+                    program_threshold_key,
+                    {str(sentence_idx): default_cross_entropy_threshold},
+                )[str(sentence_idx)]
+                sentence_failures_dict.setdefault(sentence_idx, 0)
+                if metrics_value[0].item() >= ce_threshold:
+                    sentence_failures_dict[sentence_idx] += 1
+
+            for sentence_idx, failure_count in sentence_failures_dict.items():
+                per_sentence_failure_rate = failure_count / max_new_tokens
+                if (
+                    per_sentence_failure_rate
+                    >= args.per_sequence_failure_rate_threshold
+                ):
+                    per_sentence_failed_cases.append(
+                        (
+                            program_id,
+                            valid_prompt,
+                            sentence_idx,
+                            per_sentence_failure_rate,
+                        )
+                    )
+
+            aggregate_failure_rate = sum(sentence_failures_dict.values()) / (
+                max_new_tokens * len(sentence_failures_dict)
+            )
+            if aggregate_failure_rate >= args.failure_rate_threshold:
+                aggregate_failed_cases.append(
+                    (
+                        program_id,
+                        valid_prompt,
+                        aggregate_failure_rate,
+                    )
+                )

     elif args.test_type == "tokens":
         aiu_validation_info = extract_validation_information(
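The generation branch derives each sentence's threshold as the 99th percentile of that sentence's per-token cross-entropy values. A toy sketch of that one step, with invented loss values:

```python
import numpy as np

# per-sentence cross-entropy losses collected over decode steps (invented values)
sentence_ce_dict = {0: [1.9, 2.1, 2.4, 2.0], 1: [2.2, 2.6, 2.3, 2.5]}

# the 99th percentile per sentence becomes that sentence's pass/fail threshold
sentence_ce_threshold = {k: np.percentile(v, 99) for k, v in sentence_ce_dict.items()}
print(sentence_ce_threshold)  # e.g. {0: 2.391, 1: 2.597}
```

On the testing side, a sentence with, say, 14 of 128 decode steps above its threshold has a per-sentence failure rate of 14/128 ≈ 0.109, which would trip the default --per_sequence_failure_rate_threshold of 0.1.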
@@ -808,12 +912,30 @@ def __metric_calculator(r: torch.Tensor, t: torch.Tensor):
             dprint(f"AIU tokens:\n{aiu_tokens_generated}")
             dprint(f"AIU output:\n{tokenizer.decode(aiu_tokens_generated)}")

+if generate_metrics and local_rank == 0:
+    with open(args.generate_metrics_path, "w") as f:
+        json_dict = {}
+        for program_seq, sentence_ce_threshold_dict in program_threshold_dict.items():
+            program_seq_key = ",".join(program_seq)
+            json_dict[program_seq_key] = {}
+            for sentence_i, ce_threshold in sentence_ce_threshold_dict.items():
+                json_dict[program_seq_key][sentence_i] = float(ce_threshold)
+
+        json.dump(json_dict, f, indent=4)
+
 if not args.skip_validation and local_rank == 0:
-    if len(failed_cases) != 0:
-        dprint("the test failed with the following cases:")
-        for failed_case in failed_cases:
+    if len(aggregate_failed_cases) != 0:
+        dprint("the test failed with the following aggregate cases:")
+        for failed_case in aggregate_failed_cases:
             dprint(
                 f"Program ID: {failed_case[0]}, Prompt Shape: {failed_case[1]}, Failure Rate: {failed_case[2]}"
             )
-    else:
+
+    if len(per_sentence_failed_cases) != 0:
+        dprint("the test failed with the following per sentence cases:")
+        for failed_case in per_sentence_failed_cases:
+            dprint(
+                f"Program ID: {failed_case[0]}, Prompt Shape: {failed_case[1]}, Sentence Index: {failed_case[2]}, Failure Rate: {failed_case[3]}"
+            )
+
+    if len(aggregate_failed_cases) == 0 and len(per_sentence_failed_cases) == 0:
         dprint("all tests passed")
Review comment (on the --default_cross_entropy_threshold rename): Let's not change the argument name, as this is a breaking change and too many teams have adopted DPP. We don't have documentation yet. I am reverting it back, as it's not a necessary change.