42 commits (all by cfsarmiento):

- af98438 (Feb 20, 2026): Adding a print statement for compilation
- 9f5eb17 (Feb 20, 2026): Moving the print to the real compilation stage
- 86bcbf9 (Feb 20, 2026): using dprint
- d0fbe1e (Feb 20, 2026): Forcing the output of the print statements
- 232672e (Feb 20, 2026): Seeing if this works for seeing the prints
- 730a194 (Feb 20, 2026): Seeing if taking it away from the context manager works
- 13654d3 (Feb 20, 2026): Adding flush=True to dprint
- 820b336 (Feb 20, 2026): Not using dprint
- 0f52bee (Feb 20, 2026): seeing if these changes get picked up
- 8aea687 (Feb 20, 2026): Adding compilation start/end prints
- 19aecb7 (Feb 23, 2026): Using prometheus to get static metric reads
- 9e4fbc9 (Feb 23, 2026): Added peak usage reporting
- dfa7a68 (Feb 23, 2026): Optimizing condition
- f9bb511 (Feb 23, 2026): Adding some error handling
- fb0f3af (Feb 23, 2026): Accidentally used a set instead of a dictionary
- 15e953d (Feb 23, 2026): Disabling SSL
- 2a7d951 (Feb 23, 2026): debug statement
- d5806af (Feb 24, 2026): Output formatting
- 3aed132 (Feb 24, 2026): Removing debug statements
- 406b849 (Feb 24, 2026): Rounding to 2 instead of 3
- 87e8714 (Feb 24, 2026): Adding logging for inference
- d7f314a (Feb 24, 2026): Adding imports to support type annotations
- 3296767 (Feb 24, 2026): Additional formatting
- 36756b7 (Feb 24, 2026): More formatting
- 363f971 (Feb 24, 2026): Updating README with instructions on Prometheus setup
- c8ec9ef (Feb 24, 2026): Cleanup
- 5bcddc3 (Feb 24, 2026): Revert "Cleanup"
- 2f12a97 (Feb 24, 2026): Delete log.txt
- b5f0d30 (Feb 24, 2026): removing package
- 661871f (Feb 24, 2026): removing unneeded imports
- d8635b5 (Feb 25, 2026): Making a function for repeated stage prints
- 6e7ea31 (Feb 25, 2026): Using function in compilation stage
- d0c5494 (Feb 25, 2026): fixing linting errors
- bd96102 (Feb 26, 2026): Adding type annotations for profile
- 60013ae (Feb 27, 2026): More explicit inference start/end steps
- 5e7a5e0 (Mar 3, 2026): Changing capitalization on inference run
- 53e41b6 (Mar 5, 2026): Adding flag for reporting resource utilization
- b80377d (Mar 5, 2026): Fixing conditional
- ae5724a (Mar 5, 2026): Making sure we don't see pip show output
- c227601 (Mar 22, 2026): Adding more graceful error handling
- dfd9bbe (Mar 22, 2026): Updating README with reporting instructions
- 0d0f08d (Mar 22, 2026): Fixing linting errors
34 changes: 33 additions & 1 deletion aiu_fms_testing_utils/scripts/README.md
@@ -1,6 +1,6 @@
# Scripts for using Foundation Model Stack (FMS) on AIU hardware

The scripts provided here allow you to run FMS on an AIU device for a variety of models.

Let's look at some of the example usage below.

@@ -76,3 +76,35 @@ python3 scripts/validation.py --architecture=hf_configured --model_path=/home/de

To run a logits-based validation, pass `--validation_level=1` to the validation script. This will check for the logits output to match at every step of the model through cross-entropy loss. You can control the acceptable threshold with `--logits_loss_threshold`.

## Setup the environment for reporting resource usage

When running `drive_paged_programs.py` you may want to see how much CPU and memory the run consumes. These metrics are collected through Prometheus, so if you are running in a container environment (non-OpenShift), you need to set up a simple Prometheus server to collect them. To do this:

1. Run `podman network create promnet`
2. Run `podman run -d --name node-exporter --network promnet quay.io/prometheus/node-exporter:latest`
3. Create a file called `prometheus.yml` that has the following contents:

```yaml
global:
  scrape_interval: 5s

scrape_configs:
  - job_name: "node"
    static_configs:
      - targets: ["node-exporter:9100"]
```

4. Run `podman run -d --name prometheus --network promnet -p 9091:9090 -v "$PWD/prometheus.yml:/etc/prometheus/prometheus.yml:Z" quay.io/prometheus/prometheus:latest --config.file=/etc/prometheus/prometheus.yml`
5. Check the server status by running `curl -s "http://localhost:9091/api/v1/targets" | python3 -m json.tool | grep health` and ensure that the "health" field says "up".
6. Before running DPP, run `export PROMETHEUS_URL="http://localhost:9091"`

If you are running in OpenShift, the steps above are not necessary. Instead, set `PROMETHEUS_URL` to an OpenShift route that already exposes Prometheus. If the Prometheus instance on the cluster is protected, also set `PROMETHEUS_API_KEY` to your OpenShift OAuth token, which you can get by running `oc whoami -t`.

When running a DPP test, set the `--report_resource_utilization` flag to see the outputs. DPP always runs regardless of whether this flag is set, Prometheus is installed, or the environment variables are set; these instructions only enable the resource utilization output.

Sample test to run with resource utilization outputs:

```bash
torchrun --nproc-per-node=4 aiu-fms-testing-utils/scripts/drive_paged_programs.py --model_variant=/ibm-granite/granite-3.3-8b-instruct --program_criteria_json_path=path/to/program_criterion.json --dataset_type=sharegpt --skip_validation --programs "*:0,<8192" --prioritize_large_batch_sizes --enforce_homogeneous_prompt_programs --prefill_chunk_size=1024 --dataset_path=ShareGPT_V3_unfiltered_cleaned_split.json --report_resource_utilization
```
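The reporting path described above reads instant metrics from a Prometheus server over its HTTP API (`/api/v1/query`). As a rough illustration of what such a static metric read looks like, independent of this PR's actual implementation, the sketch below builds an instant-query URL and pulls out the first sample value; the PromQL expression and helper names are assumptions for the example:

```python
import json
import os
import urllib.parse
import urllib.request
from typing import Optional


def build_instant_query_url(base_url: str, promql: str) -> str:
    """Build a Prometheus instant-query URL: <base>/api/v1/query?query=<expr>."""
    params = urllib.parse.urlencode({"query": promql})
    return f"{base_url.rstrip('/')}/api/v1/query?{params}"


def first_sample_value(response: dict) -> Optional[float]:
    """Extract the first sample value from an instant-query JSON response.

    Each result entry has the shape {"metric": {...}, "value": [ts, "val"]}.
    """
    results = response.get("data", {}).get("result", [])
    if not results:
        return None
    return float(results[0]["value"][1])


if __name__ == "__main__" and "PROMETHEUS_URL" in os.environ:
    # Hypothetical example query: available memory reported by node-exporter.
    url = build_instant_query_url(
        os.environ["PROMETHEUS_URL"], "node_memory_MemAvailable_bytes"
    )
    with urllib.request.urlopen(url) as resp:
        print(first_sample_value(json.load(resp)))
```

This only runs the network request when `PROMETHEUS_URL` is set, mirroring the opt-in behavior described above.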
53 changes: 52 additions & 1 deletion aiu_fms_testing_utils/scripts/drive_paged_programs.py
@@ -1,6 +1,6 @@
import argparse
from dataclasses import dataclass
import datetime
from datetime import datetime
import itertools
import json
import os
@@ -44,6 +44,10 @@
get_programs_prompts,
)
from aiu_fms_testing_utils.testing.utils import format_kwargs_to_string
from aiu_fms_testing_utils.utils.resource_collection import (
instantiate_prometheus,
print_step,
)

# Constants
PAD_MULTIPLE = 64
@@ -276,6 +280,11 @@ def parse_cli_args() -> argparse.Namespace:
action="store_true",
help="set to true to ensure that all prompts hit the same prompt program for a given test",
)
parser.add_argument(
"--report_resource_utilization",
action="store_true",
help="set to true to report CPU/memory utilization during compilation and inference stages",
)

return parser.parse_args()

@@ -1251,6 +1260,8 @@ def generate_validation_info_and_test(
timing: str,
prefill_chunk_size: int,
model_variant: str,
print_utilization: bool = False,
profile: Optional[Any] = None,
) -> list[Any]:
"""Generates tokens using AIU and CPU models and validates the results.

@@ -1271,8 +1282,12 @@
f"program id: {valid_prompt.program_id}, valid prompt: {valid_prompt.shape}, input shape: {valid_prompt.input_ids.shape}"
)

# Start inference
if not skip_validation:
# Generate or load CPU validation info
cpu_metric_start = print_step(
profile, print_utilization, "started", "CPU Inference"
)
cpu_validation_info = generate_cpu_validation(
model_variant=model_variant,
max_new_tokens=max_new_tokens,
@@ -1287,7 +1302,18 @@
cpu_dtype=env_config.cpu_dtype,
tokenizer=tokenizer,
)
print_step(
profile,
print_utilization,
"completed",
"CPU Inference",
cpu_metric_start,
)

# Generate AIU validation info
aiu_metric_start = print_step(
profile, print_utilization, "started", "AIU Inference"
)
aiu_validation_info = generate_aiu_validation(
test_type=test_type,
max_new_tokens=max_new_tokens,
@@ -1298,6 +1324,13 @@
cpu_validation_info=cpu_validation_info,
extra_kwargs=valid_prompt.extra_kwargs,
)
print_step(
profile,
print_utilization,
"completed",
"AIU Inference",
aiu_metric_start,
)

if test_type == "metrics":
failure_rate = evaluate_cross_entropy_metrics(
@@ -1325,6 +1358,10 @@
else:
raise ValueError("test type must be one of metrics or tokens")
else:
# Generate AIU validation info
aiu_metric_start = print_step(
profile, print_utilization, "started", "AIU Inference"
)
aiu_validation_info = generate_aiu_validation(
test_type=test_type,
max_new_tokens=max_new_tokens,
@@ -1335,6 +1372,13 @@
cpu_validation_info=None,
extra_kwargs=valid_prompt.extra_kwargs,
)
print_step(
profile,
print_utilization,
"completed",
"AIU Inference",
aiu_metric_start,
)

if local_rank == 0:
for sentence_idx, test_sentence in enumerate(
@@ -1392,6 +1436,9 @@ def main() -> None:
tokenizer=tokenizer,
)

# Instantiate the Prometheus client for resource metric collection
p = instantiate_prometheus(args.report_resource_utilization)

# Model Loading
model_kwargs: Dict[str, Any] = _get_model_kwargs(model_variant=args.model_variant)
distributed_kwargs: Dict[str, Any] = _get_distributed_kwargs(
@@ -1448,6 +1495,8 @@
compile_dynamic_sendnn=True,
stagger_update_lazyhandle=args.stagger_update_lazyhandle,
prefill_chunk_size=args.prefill_chunk_size,
print_utilization=args.report_resource_utilization,
profile=p,
**extra_kwargs,
)
if args.distributed:
@@ -1490,6 +1539,8 @@
timing=args.timing,
prefill_chunk_size=args.prefill_chunk_size,
model_variant=args.model_variant,
print_utilization=args.report_resource_utilization,
profile=p,
)

if not args.skip_validation and local_rank == 0:
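The diff above calls `instantiate_prometheus(args.report_resource_utilization)` and threads the returned `p` through warmup and validation, but the helper's body (in `aiu_fms_testing_utils/utils/resource_collection.py`) is not shown. A minimal sketch of the graceful-fallback behavior the README describes (DPP always runs, even without Prometheus) might look like the following; the use of the `prometheus-api-client` package and its `PrometheusConnect` class is an assumption, not confirmed by this diff:

```python
import os
from typing import Any, Optional


def instantiate_prometheus(report_resource_utilization: bool) -> Optional[Any]:
    """Best-effort Prometheus client setup: degrade to None, never crash.

    Hypothetical reconstruction of the helper used by drive_paged_programs.py;
    any failure here disables reporting but must not stop the DPP run.
    """
    if not report_resource_utilization:
        return None
    url = os.environ.get("PROMETHEUS_URL")
    if url is None:
        print("PROMETHEUS_URL not set; skipping resource utilization reporting")
        return None
    try:
        # prometheus-api-client is a plausible client here (an assumption);
        # the "Disabling SSL" commit suggests disable_ssl-style behavior.
        from prometheus_api_client import PrometheusConnect

        api_key = os.environ.get("PROMETHEUS_API_KEY")
        headers = {"Authorization": f"Bearer {api_key}"} if api_key else None
        return PrometheusConnect(url=url, headers=headers, disable_ssl=True)
    except Exception as exc:
        print(f"Could not connect to Prometheus ({exc}); continuing without reporting")
        return None
```

With this shape, the call sites in the diff can pass `p` unconditionally and let `None` mean "no reporting".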
13 changes: 11 additions & 2 deletions aiu_fms_testing_utils/utils/__init__.py
@@ -1,5 +1,5 @@
# Standard
from typing import Optional, List, Tuple
from typing import Optional, List, Tuple, Any
import json
import os
import random
@@ -12,7 +12,7 @@
from aiu_fms_testing_utils.utils.aiu_setup import dprint, rank, world_size
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from aiu_fms_testing_utils.testing.utils import format_kwargs_to_string

from aiu_fms_testing_utils.utils.resource_collection import print_step
from fms.utils.generation import pad_input_ids
import torch
import torch.nn as nn
@@ -55,6 +55,8 @@ def warmup_model(
use_cache: bool = True,
stagger_update_lazyhandle: int = 0,
prefill_chunk_size: int = 0,
print_utilization: bool = False,
profile: Optional[Any] = None,
**extra_kwargs,
):
import torch_sendnn
@@ -72,9 +74,13 @@
attention_specific_kwargs["contiguous_cache"] = True
attention_specific_kwargs["max_seq_len"] = input_ids.shape[1] + max_new_tokens

# Start the warmup
dprint("AIU warmup")
pt_compile_model_time = time.time()

## Report on initial resource usage
metric_start = print_step(profile, print_utilization, "started", "Compilation")

# adjust inputs depending on attn_type and dynamic shapes
_warmup_input_ids = input_ids
_extra_kwargs = extra_kwargs
@@ -103,6 +109,9 @@
**attention_specific_kwargs,
)
pt_compile_model_time = time.time() - pt_compile_model_time

# Get completed metric read
print_step(profile, print_utilization, "completed", "Compilation", metric_start)
dprint(f"PT compile complete, took {pt_compile_model_time:.3f}s")
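The warmup diff brackets compilation with `print_step(profile, print_utilization, "started"/"completed", stage, metric_start)` calls, returning a handle at start that is consumed at completion. The helper itself lives in `resource_collection.py` and is not part of this diff; a hypothetical sketch consistent with the call sites (stage banner, baseline returned on "started", elapsed time rounded to 2 decimals on "completed", as the commit log suggests) could be:

```python
import time
from typing import Any, Optional


def print_step(
    profile: Optional[Any],
    print_utilization: bool,
    state: str,
    stage: str,
    metric_start: Optional[dict] = None,
) -> Optional[dict]:
    """Print a stage banner and return/consume a baseline metric snapshot.

    Hypothetical reconstruction matching the call sites in this diff; a real
    implementation would also record CPU and memory readings taken through
    the Prometheus client in `profile` when one is available.
    """
    if not print_utilization:
        return None
    print(f"{stage} {state}", flush=True)
    if state == "started":
        # Baseline snapshot handed back to the matching "completed" call.
        return {"stage": stage, "t0": time.monotonic()}
    if metric_start is not None:
        elapsed = time.monotonic() - metric_start["t0"]
        print(f"{stage} took {round(elapsed, 2)}s", flush=True)
    return None
```

Centralizing the stage prints this way matches the "Making a function for repeated stage prints" commit: every stage gets the same start/end format for free.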

