From af984385e1d096d8a8cad5975e00d148fa0fa07c Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Fri, 20 Feb 2026 13:15:22 -0500 Subject: [PATCH 01/42] Adding a print statement for compilation Signed-off-by: Christian Sarmiento --- aiu_fms_testing_utils/scripts/drive_paged_programs.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/aiu_fms_testing_utils/scripts/drive_paged_programs.py b/aiu_fms_testing_utils/scripts/drive_paged_programs.py index 0e343aeb..8336bf09 100644 --- a/aiu_fms_testing_utils/scripts/drive_paged_programs.py +++ b/aiu_fms_testing_utils/scripts/drive_paged_programs.py @@ -679,9 +679,11 @@ def load_model( model.eval() if device_type == "spyre": + dprint("Compilation started") with scoped_environ(model_config.env_updates()): - # Temporarily set environment variables needed for compile + # Temporarily set environment variables needed for compile model.compile(backend="sendnn", options={"sendnn.dynamic": True}) + dprint("Compilation completed") _maybe_prepare_fp8_weights(model, is_fp8) From 9f5eb1777ce7ae1486a7fb2086a1a35fc8761120 Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Fri, 20 Feb 2026 13:40:37 -0500 Subject: [PATCH 02/42] Moving the print to the real compilation stage Added a small function to easily format the timestamp Signed-off-by: Christian Sarmiento --- aiu_fms_testing_utils/scripts/drive_paged_programs.py | 4 +--- aiu_fms_testing_utils/utils/__init__.py | 6 ++++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/aiu_fms_testing_utils/scripts/drive_paged_programs.py b/aiu_fms_testing_utils/scripts/drive_paged_programs.py index 8336bf09..0e343aeb 100644 --- a/aiu_fms_testing_utils/scripts/drive_paged_programs.py +++ b/aiu_fms_testing_utils/scripts/drive_paged_programs.py @@ -679,11 +679,9 @@ def load_model( model.eval() if device_type == "spyre": - dprint("Compilation started") with scoped_environ(model_config.env_updates()): - # Temporarily set environment variables needed for compile 
+ # Temporarily set environment variables needed for compile model.compile(backend="sendnn", options={"sendnn.dynamic": True}) - dprint("Compilation completed") _maybe_prepare_fp8_weights(model, is_fp8) diff --git a/aiu_fms_testing_utils/utils/__init__.py b/aiu_fms_testing_utils/utils/__init__.py index 1bbb82ec..13e1cedb 100644 --- a/aiu_fms_testing_utils/utils/__init__.py +++ b/aiu_fms_testing_utils/utils/__init__.py @@ -6,6 +6,7 @@ import requests import time import bisect +from datetime import datetime # Third Party @@ -46,6 +47,9 @@ def stagger_region(limit: int): torch.distributed.barrier() dprint("Stagger: All Complete") +def timestamp_print(given_string): + timestamp = datetime.now().strftime("%Y-%m-%d:%H:%M:%S") + print(f"[{timestamp}] {given_string}") def warmup_model( model: nn.Module, @@ -91,6 +95,7 @@ def warmup_model( extra_kwargs = {**_extra_kwargs, "last_n_tokens": 64 if "paged" in attn_name else 1} + timestamp_print("Compilation started") with stagger_region(stagger_update_lazyhandle): with torch_sendnn.warmup_mode(): generate( @@ -102,6 +107,7 @@ def warmup_model( extra_kwargs=extra_kwargs, **attention_specific_kwargs, ) + timestamp_print("Compilation complete") pt_compile_model_time = time.time() - pt_compile_model_time dprint(f"PT compile complete, took {pt_compile_model_time:.3f}s") From 86bcbf90dacf1771d1f71cf8567f1db70145a1f2 Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Fri, 20 Feb 2026 14:03:45 -0500 Subject: [PATCH 03/42] using dprint Signed-off-by: Christian Sarmiento --- aiu_fms_testing_utils/utils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiu_fms_testing_utils/utils/__init__.py b/aiu_fms_testing_utils/utils/__init__.py index 13e1cedb..2cb049c2 100644 --- a/aiu_fms_testing_utils/utils/__init__.py +++ b/aiu_fms_testing_utils/utils/__init__.py @@ -49,7 +49,7 @@ def stagger_region(limit: int): def timestamp_print(given_string): timestamp = datetime.now().strftime("%Y-%m-%d:%H:%M:%S") - 
print(f"[{timestamp}] {given_string}") + dprint(f"[{timestamp}] {given_string}") def warmup_model( model: nn.Module, From d0fbe1ed43a04cc9aa598a1edc63333e7ab5f238 Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Fri, 20 Feb 2026 15:37:46 -0500 Subject: [PATCH 04/42] Forcing the output of the print statements Signed-off-by: Christian Sarmiento --- aiu_fms_testing_utils/utils/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/aiu_fms_testing_utils/utils/__init__.py b/aiu_fms_testing_utils/utils/__init__.py index 2cb049c2..aa910e8b 100644 --- a/aiu_fms_testing_utils/utils/__init__.py +++ b/aiu_fms_testing_utils/utils/__init__.py @@ -7,6 +7,7 @@ import time import bisect from datetime import datetime +import sys # Third Party @@ -50,6 +51,7 @@ def stagger_region(limit: int): def timestamp_print(given_string): timestamp = datetime.now().strftime("%Y-%m-%d:%H:%M:%S") dprint(f"[{timestamp}] {given_string}") + sys.stdout.flush() # forcing output def warmup_model( model: nn.Module, From 232672ee29a764e93de5b798458714557ee6199f Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Fri, 20 Feb 2026 16:24:49 -0500 Subject: [PATCH 05/42] Seeing if this works for seeing the prints Signed-off-by: Christian Sarmiento --- aiu_fms_testing_utils/utils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiu_fms_testing_utils/utils/__init__.py b/aiu_fms_testing_utils/utils/__init__.py index aa910e8b..28f77e57 100644 --- a/aiu_fms_testing_utils/utils/__init__.py +++ b/aiu_fms_testing_utils/utils/__init__.py @@ -50,7 +50,7 @@ def stagger_region(limit: int): def timestamp_print(given_string): timestamp = datetime.now().strftime("%Y-%m-%d:%H:%M:%S") - dprint(f"[{timestamp}] {given_string}") + print(f"[{timestamp}] {given_string}", flush=True) sys.stdout.flush() # forcing output def warmup_model( From 730a1942932ac39b42945eb41ccc4c065d8797b4 Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Fri, 20 Feb 2026 17:00:08 -0500 Subject: 
[PATCH 06/42] Seeing if taking it away from the context manager works Signed-off-by: Christian Sarmiento --- aiu_fms_testing_utils/utils/__init__.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/aiu_fms_testing_utils/utils/__init__.py b/aiu_fms_testing_utils/utils/__init__.py index 28f77e57..ee36b5df 100644 --- a/aiu_fms_testing_utils/utils/__init__.py +++ b/aiu_fms_testing_utils/utils/__init__.py @@ -48,10 +48,6 @@ def stagger_region(limit: int): torch.distributed.barrier() dprint("Stagger: All Complete") -def timestamp_print(given_string): - timestamp = datetime.now().strftime("%Y-%m-%d:%H:%M:%S") - print(f"[{timestamp}] {given_string}", flush=True) - sys.stdout.flush() # forcing output def warmup_model( model: nn.Module, @@ -97,7 +93,8 @@ def warmup_model( extra_kwargs = {**_extra_kwargs, "last_n_tokens": 64 if "paged" in attn_name else 1} - timestamp_print("Compilation started") + timestamp = datetime.now().strftime("%Y-%m-%d:%H:%M:%S") + dprint(f"[{timestamp}] Compilation started") with stagger_region(stagger_update_lazyhandle): with torch_sendnn.warmup_mode(): generate( @@ -109,7 +106,8 @@ def warmup_model( extra_kwargs=extra_kwargs, **attention_specific_kwargs, ) - timestamp_print("Compilation complete") + timestamp = datetime.now().strftime("%Y-%m-%d:%H:%M:%S") + dprint(f"[{timestamp}] Compilation ended") pt_compile_model_time = time.time() - pt_compile_model_time dprint(f"PT compile complete, took {pt_compile_model_time:.3f}s") From 13654d31da619cfdb9358dc5717ec87be0946c6b Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Fri, 20 Feb 2026 17:28:34 -0500 Subject: [PATCH 07/42] Adding flush=True to dprint Signed-off-by: Christian Sarmiento --- aiu_fms_testing_utils/utils/__init__.py | 4 ++-- aiu_fms_testing_utils/utils/aiu_setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/aiu_fms_testing_utils/utils/__init__.py b/aiu_fms_testing_utils/utils/__init__.py index ee36b5df..f5f5a464 100644 --- 
a/aiu_fms_testing_utils/utils/__init__.py +++ b/aiu_fms_testing_utils/utils/__init__.py @@ -76,6 +76,8 @@ def warmup_model( dprint("AIU warmup") pt_compile_model_time = time.time() + timestamp = datetime.now().strftime("%Y-%m-%d:%H:%M:%S") + dprint(f"[{timestamp}] Compilation started") # adjust inputs depending on attn_type and dynamic shapes _warmup_input_ids = input_ids @@ -93,8 +95,6 @@ def warmup_model( extra_kwargs = {**_extra_kwargs, "last_n_tokens": 64 if "paged" in attn_name else 1} - timestamp = datetime.now().strftime("%Y-%m-%d:%H:%M:%S") - dprint(f"[{timestamp}] Compilation started") with stagger_region(stagger_update_lazyhandle): with torch_sendnn.warmup_mode(): generate( diff --git a/aiu_fms_testing_utils/utils/aiu_setup.py b/aiu_fms_testing_utils/utils/aiu_setup.py index fb2dedf2..3a089873 100644 --- a/aiu_fms_testing_utils/utils/aiu_setup.py +++ b/aiu_fms_testing_utils/utils/aiu_setup.py @@ -20,7 +20,7 @@ def dprint_str(text): def dprint(text): - print(dprint_str(text)) + print(dprint_str(text), flush=True) # ============================================================== From 820b336a716ca8134001b7de8faec67c927ac78e Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Fri, 20 Feb 2026 17:31:14 -0500 Subject: [PATCH 08/42] Not using dprint Signed-off-by: Christian Sarmiento --- aiu_fms_testing_utils/utils/__init__.py | 4 ++-- aiu_fms_testing_utils/utils/aiu_setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/aiu_fms_testing_utils/utils/__init__.py b/aiu_fms_testing_utils/utils/__init__.py index f5f5a464..c12e6134 100644 --- a/aiu_fms_testing_utils/utils/__init__.py +++ b/aiu_fms_testing_utils/utils/__init__.py @@ -77,7 +77,7 @@ def warmup_model( dprint("AIU warmup") pt_compile_model_time = time.time() timestamp = datetime.now().strftime("%Y-%m-%d:%H:%M:%S") - dprint(f"[{timestamp}] Compilation started") + print(f"[{timestamp}] Compilation started", flush=True) # adjust inputs depending on attn_type and dynamic shapes 
_warmup_input_ids = input_ids @@ -107,7 +107,7 @@ def warmup_model( **attention_specific_kwargs, ) timestamp = datetime.now().strftime("%Y-%m-%d:%H:%M:%S") - dprint(f"[{timestamp}] Compilation ended") + print(f"[{timestamp}] Compilation ended", flush=True) pt_compile_model_time = time.time() - pt_compile_model_time dprint(f"PT compile complete, took {pt_compile_model_time:.3f}s") diff --git a/aiu_fms_testing_utils/utils/aiu_setup.py b/aiu_fms_testing_utils/utils/aiu_setup.py index 3a089873..fb2dedf2 100644 --- a/aiu_fms_testing_utils/utils/aiu_setup.py +++ b/aiu_fms_testing_utils/utils/aiu_setup.py @@ -20,7 +20,7 @@ def dprint_str(text): def dprint(text): - print(dprint_str(text), flush=True) + print(dprint_str(text)) # ============================================================== From 0f52beea91431976303ecda6ca003eb21b4f0c49 Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Fri, 20 Feb 2026 18:06:44 -0500 Subject: [PATCH 09/42] seeing if these changes get picked up Signed-off-by: Christian Sarmiento --- aiu_fms_testing_utils/utils/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aiu_fms_testing_utils/utils/__init__.py b/aiu_fms_testing_utils/utils/__init__.py index c12e6134..23b15b60 100644 --- a/aiu_fms_testing_utils/utils/__init__.py +++ b/aiu_fms_testing_utils/utils/__init__.py @@ -74,7 +74,7 @@ def warmup_model( attention_specific_kwargs["contiguous_cache"] = True attention_specific_kwargs["max_seq_len"] = input_ids.shape[1] + max_new_tokens - dprint("AIU warmup") + dprint("AIU warmup -- changed") pt_compile_model_time = time.time() timestamp = datetime.now().strftime("%Y-%m-%d:%H:%M:%S") print(f"[{timestamp}] Compilation started", flush=True) @@ -109,7 +109,7 @@ def warmup_model( timestamp = datetime.now().strftime("%Y-%m-%d:%H:%M:%S") print(f"[{timestamp}] Compilation ended", flush=True) pt_compile_model_time = time.time() - pt_compile_model_time - dprint(f"PT compile complete, took {pt_compile_model_time:.3f}s") + 
dprint(f"PT compile complete, took {pt_compile_model_time:.3f}s -- changed") def __download_file(url, filename): From 8aea68784f609a32de5283d6572c21c231bc3363 Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Fri, 20 Feb 2026 18:33:10 -0500 Subject: [PATCH 10/42] Adding compilation start/end prints Signed-off-by: Christian Sarmiento --- aiu_fms_testing_utils/utils/__init__.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/aiu_fms_testing_utils/utils/__init__.py b/aiu_fms_testing_utils/utils/__init__.py index 23b15b60..dcaefbb7 100644 --- a/aiu_fms_testing_utils/utils/__init__.py +++ b/aiu_fms_testing_utils/utils/__init__.py @@ -48,6 +48,10 @@ def stagger_region(limit: int): torch.distributed.barrier() dprint("Stagger: All Complete") +def timestamp_print(given_string): + timestamp = datetime.now().strftime("%Y-%m-%d:%H:%M:%S") + print(f"[{timestamp}] {given_string}") + def warmup_model( model: nn.Module, @@ -74,10 +78,9 @@ def warmup_model( attention_specific_kwargs["contiguous_cache"] = True attention_specific_kwargs["max_seq_len"] = input_ids.shape[1] + max_new_tokens - dprint("AIU warmup -- changed") + dprint("AIU warmup") pt_compile_model_time = time.time() - timestamp = datetime.now().strftime("%Y-%m-%d:%H:%M:%S") - print(f"[{timestamp}] Compilation started", flush=True) + timestamp_print("Compilation started") # adjust inputs depending on attn_type and dynamic shapes _warmup_input_ids = input_ids @@ -106,10 +109,9 @@ def warmup_model( extra_kwargs=extra_kwargs, **attention_specific_kwargs, ) - timestamp = datetime.now().strftime("%Y-%m-%d:%H:%M:%S") - print(f"[{timestamp}] Compilation ended", flush=True) pt_compile_model_time = time.time() - pt_compile_model_time - dprint(f"PT compile complete, took {pt_compile_model_time:.3f}s -- changed") + timestamp_print("Compilation complete") + dprint(f"PT compile complete, took {pt_compile_model_time:.3f}s") def __download_file(url, filename): From 
19aecb74c5479410df7502542ab19531bc1d0243 Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Mon, 23 Feb 2026 14:20:21 -0500 Subject: [PATCH 11/42] Using prometheus to get static metric reads This adds logic to get a read at the start and end of compilation using Prometheus. Next, I will add logic to get the peak usage and later on I will refine the error handling so that this also works if you don't have the necessary environment variables set. Signed-off-by: Christian Sarmiento --- aiu_fms_testing_utils/utils/__init__.py | 26 ++++++- .../utils/resource_collection.py | 76 +++++++++++++++++++ 2 files changed, 99 insertions(+), 3 deletions(-) create mode 100644 aiu_fms_testing_utils/utils/resource_collection.py diff --git a/aiu_fms_testing_utils/utils/__init__.py b/aiu_fms_testing_utils/utils/__init__.py index dcaefbb7..a7a29c90 100644 --- a/aiu_fms_testing_utils/utils/__init__.py +++ b/aiu_fms_testing_utils/utils/__init__.py @@ -6,7 +6,7 @@ import requests import time import bisect -from datetime import datetime +from datetime import datetime, timezone import sys # Third Party @@ -14,6 +14,7 @@ from aiu_fms_testing_utils.utils.aiu_setup import dprint, rank, world_size from transformers.tokenization_utils_base import PreTrainedTokenizerBase from aiu_fms_testing_utils.testing.utils import format_kwargs_to_string +from aiu_fms_testing_utils.utils.resource_collection import instantiate_prometheus, get_static_read from fms.utils.generation import pad_input_ids import torch @@ -49,9 +50,16 @@ def stagger_region(limit: int): dprint("Stagger: All Complete") def timestamp_print(given_string): + timestamp = datetime.now().strftime("%Y-%m-%d:%H:%M:%S") print(f"[{timestamp}] {given_string}") +def print_comp_resource_metrics(cpu_val, mem_val, stage): + + if cpu_val is None or mem_val is None: + timestamp_print(f"Compilation {stage}") + else: + timestamp_print(f"Compilation {stage} - CPU: {cpu_val}, Memory: {mem_val}") def warmup_model( model: nn.Module, @@ -78,9 +86,17 @@ 
def warmup_model( attention_specific_kwargs["contiguous_cache"] = True attention_specific_kwargs["max_seq_len"] = input_ids.shape[1] + max_new_tokens + # Instantiate the Prometheus client for resource metric collection + p = instantiate_prometheus() + + # Start the warmup dprint("AIU warmup") pt_compile_model_time = time.time() - timestamp_print("Compilation started") + + ## Report on initial resource usage + metric_start = datetime.now(timezone.utc) + initial_cpu, initial_mem = get_static_read(p, metric_start) + print_comp_resource_metrics(initial_cpu, initial_mem, "started") # adjust inputs depending on attn_type and dynamic shapes _warmup_input_ids = input_ids @@ -110,7 +126,11 @@ def warmup_model( **attention_specific_kwargs, ) pt_compile_model_time = time.time() - pt_compile_model_time - timestamp_print("Compilation complete") + + # Get completed metric read + metric_end = datetime.now(timezone.utc) + end_cpu, end_mem = get_static_read(p, metric_end) + print_comp_resource_metrics(end_cpu, end_mem, "completed") dprint(f"PT compile complete, took {pt_compile_model_time:.3f}s") diff --git a/aiu_fms_testing_utils/utils/resource_collection.py b/aiu_fms_testing_utils/utils/resource_collection.py new file mode 100644 index 00000000..93f35325 --- /dev/null +++ b/aiu_fms_testing_utils/utils/resource_collection.py @@ -0,0 +1,76 @@ +import os + +from prometheus_api_client import PrometheusConnect + + +def instantiate_prometheus(): + """ + Top-level method that will instantiate the Prometheus Client to collect + resource usage metrics. + + Returns: + - PrometheusConnect(url=connection_url, headers=request_headers): the instantiated + Prometheus client. 
+ """ + + # Get required env variables + connection_url = os.environ["PROMETHEUS_URL"] + api_token = os.environ.get("PROMETHEUS_API_KEY") + + # Define necessary headers + request_headers = {"Authorization": f"Bearer {api_token}"} if api_token else None + + return PrometheusConnect(url=connection_url, headers=request_headers) + + +def get_value(given_res): + """ + Helper method to get the given value from a Prometheus response + + Args: + - given_res: The response object obtained from the Prometheus client that has our value. + + Returns: + - value: the value for the given resource metric we want to report that was obtained from + the response, represented as a float if present, otherwise None. + """ + + # Iterate through to save our output to a list + values = [] + for series in given_res or []: + try: + values.append(float(series["value"][1])) + except Exception: + pass + value = values[0] if values else None + + return value + + +def get_static_read(client, recorded_time): + """ + Top-level method that will get a read on CPU and Memory usage give a single + moment in time. + + Args: + - client: the Prometheus client to use to get our metrics. + - recorded_time: the time that we want to get the metric read at. + + Returns: + - cpu_value: this is the reported value for percentage of CPU usage at the given + recorded time. + - mem_value: this is the reported value for memory usage at the given + recorded time in gigabytes. 
+ """ + + # Make the request for CPU and Mem + cpu_query = '100 * (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[2m])))' + mem_query = '100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))' + cpu_response = client.custom_query(query=cpu_query, params={"time", recorded_time.timestamp()}) + mem_response = client.custom_query(query=mem_query, params={"time", recorded_time.timestamp()}) + + ## Get the CPU & Mem metrics out of the response + cpu_value = get_value(cpu_response) + mem_value = get_value(mem_response) + + return cpu_value, mem_value From 9e4fbc976055cde9a9f443bf1804b966d35a843f Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Mon, 23 Feb 2026 14:52:46 -0500 Subject: [PATCH 12/42] Added peak usage reporting Signed-off-by: Christian Sarmiento --- aiu_fms_testing_utils/utils/__init__.py | 20 +++++-- .../utils/resource_collection.py | 60 ++++++++++++++++--- 2 files changed, 68 insertions(+), 12 deletions(-) diff --git a/aiu_fms_testing_utils/utils/__init__.py b/aiu_fms_testing_utils/utils/__init__.py index a7a29c90..729d0345 100644 --- a/aiu_fms_testing_utils/utils/__init__.py +++ b/aiu_fms_testing_utils/utils/__init__.py @@ -14,8 +14,9 @@ from aiu_fms_testing_utils.utils.aiu_setup import dprint, rank, world_size from transformers.tokenization_utils_base import PreTrainedTokenizerBase from aiu_fms_testing_utils.testing.utils import format_kwargs_to_string -from aiu_fms_testing_utils.utils.resource_collection import instantiate_prometheus, get_static_read - +from aiu_fms_testing_utils.utils.resource_collection import ( + instantiate_prometheus, get_static_read, get_peak_read +) from fms.utils.generation import pad_input_ids import torch import torch.nn as nn @@ -56,10 +57,14 @@ def timestamp_print(given_string): def print_comp_resource_metrics(cpu_val, mem_val, stage): - if cpu_val is None or mem_val is None: - timestamp_print(f"Compilation {stage}") + if stage != "peak": + if cpu_val is None or mem_val is None: + 
timestamp_print(f"Compilation {stage}") + else: + timestamp_print(f"Compilation {stage} - CPU: {cpu_val}, Memory: {mem_val}") + else: - timestamp_print(f"Compilation {stage} - CPU: {cpu_val}, Memory: {mem_val}") + dprint(f"Peak Resource Utilization - CPU: {cpu_val}, Memory: {mem_val}") def warmup_model( model: nn.Module, @@ -131,6 +136,11 @@ def warmup_model( metric_end = datetime.now(timezone.utc) end_cpu, end_mem = get_static_read(p, metric_end) print_comp_resource_metrics(end_cpu, end_mem, "completed") + + # Get the peak usage during compilation + peak_cpu, peak_mem = get_peak_read(p, metric_start, metric_end) + print_comp_resource_metrics(peak_cpu, peak_mem, "peak") + dprint(f"PT compile complete, took {pt_compile_model_time:.3f}s") diff --git a/aiu_fms_testing_utils/utils/resource_collection.py b/aiu_fms_testing_utils/utils/resource_collection.py index 93f35325..ef5c6033 100644 --- a/aiu_fms_testing_utils/utils/resource_collection.py +++ b/aiu_fms_testing_utils/utils/resource_collection.py @@ -23,12 +23,13 @@ def instantiate_prometheus(): return PrometheusConnect(url=connection_url, headers=request_headers) -def get_value(given_res): +def get_value(given_res, query_type="static"): """ Helper method to get the given value from a Prometheus response Args: - given_res: The response object obtained from the Prometheus client that has our value. 
+ - query_type: The type of query we are processing, "static" or "range" Returns: - value: the value for the given resource metric we want to report that was obtained from @@ -37,12 +38,23 @@ def get_value(given_res): # Iterate through to save our output to a list values = [] - for series in given_res or []: - try: - values.append(float(series["value"][1])) - except Exception: - pass - value = values[0] if values else None + value = None + if query_type == "static": ## For start/end reads + for series in given_res or []: + try: + values.append(float(series["value"][1])) + except Exception: + pass + value = values[0] if values else None + + else: ## For peak reads + for series in given_res or []: + for timestamp, val in series.get("values", []): + try: + values.append(float(val)) + except Exception: + pass + value = max(values) if values else None return value @@ -74,3 +86,37 @@ def get_static_read(client, recorded_time): mem_value = get_value(mem_response) return cpu_value, mem_value + + +def get_peak_read(client, start, end): + """ + Top-level method that will get the peak resource usage during a given interval. + + Args: + - client: the Prometheus client to use to get our metrics. + - start: the recorded start time for the interval. + - end: the recorded end time for the interval. + + Returns: + - peak_cpu_value: this is the peak reported value for percentage of CPU usage over the + given interval. + - peak_mem_value: this is the peak reported value for memory usage over the given interval + in gigabytes. 
+ + """ + + # Make the request for CPU and Mem + cpu_query = '100 * (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[2m])))' + mem_query = '100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))' + cpu_response = client.custom_query_range( + query=cpu_query, start_time=start, end_time=end, step="3s" + ) + mem_response = client.custom_query_range( + query=mem_query, start_time=start, end_time=end, step="3s" + ) + + ## Get the CPU & Mem metrics out of the response + peak_cpu_value = get_value(cpu_response, "range") + peak_mem_value = get_value(mem_response, "range") + + return peak_cpu_value, peak_mem_value From dfa7a68811aedb278f2a20d4e0ad9174f93d9a8b Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Mon, 23 Feb 2026 15:09:54 -0500 Subject: [PATCH 13/42] Optimizing condition Signed-off-by: Christian Sarmiento --- aiu_fms_testing_utils/utils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiu_fms_testing_utils/utils/__init__.py b/aiu_fms_testing_utils/utils/__init__.py index 729d0345..6275c971 100644 --- a/aiu_fms_testing_utils/utils/__init__.py +++ b/aiu_fms_testing_utils/utils/__init__.py @@ -63,7 +63,7 @@ def print_comp_resource_metrics(cpu_val, mem_val, stage): else: timestamp_print(f"Compilation {stage} - CPU: {cpu_val}, Memory: {mem_val}") - else: + elif cpu_val is not None and mem_val is not None: dprint(f"Peak Resource Utilization - CPU: {cpu_val}, Memory: {mem_val}") def warmup_model( From f9bb5114c701b795c07c88906e4a63e87321d3b8 Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Mon, 23 Feb 2026 15:43:44 -0500 Subject: [PATCH 14/42] Adding some error handling Signed-off-by: Christian Sarmiento --- .../utils/resource_collection.py | 72 +++++++++++-------- 1 file changed, 43 insertions(+), 29 deletions(-) diff --git a/aiu_fms_testing_utils/utils/resource_collection.py b/aiu_fms_testing_utils/utils/resource_collection.py index ef5c6033..d4f55311 100644 --- 
a/aiu_fms_testing_utils/utils/resource_collection.py +++ b/aiu_fms_testing_utils/utils/resource_collection.py @@ -9,18 +9,24 @@ def instantiate_prometheus(): resource usage metrics. Returns: - - PrometheusConnect(url=connection_url, headers=request_headers): the instantiated - Prometheus client. + - client: the instantiated Prometheus client. """ - # Get required env variables - connection_url = os.environ["PROMETHEUS_URL"] - api_token = os.environ.get("PROMETHEUS_API_KEY") + client = None + try: + # Get required env variables + connection_url = os.environ["PROMETHEUS_URL"] + api_token = os.environ.get("PROMETHEUS_API_KEY") - # Define necessary headers - request_headers = {"Authorization": f"Bearer {api_token}"} if api_token else None + # Define necessary headers + request_headers = {"Authorization": f"Bearer {api_token}"} if api_token else None - return PrometheusConnect(url=connection_url, headers=request_headers) + client = PrometheusConnect(url=connection_url, headers=request_headers) + + except Exception as e: + print(f"WARNING: Cannot instantiate Prometheus. Make sure PROMETHEUS_URL and PROMETHEUS_API_KEY are set in your environment if you want resource metrics. Error: {e}") + + return client def get_value(given_res, query_type="static"): @@ -75,15 +81,19 @@ def get_static_read(client, recorded_time): recorded time in gigabytes. 
""" - # Make the request for CPU and Mem - cpu_query = '100 * (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[2m])))' - mem_query = '100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))' - cpu_response = client.custom_query(query=cpu_query, params={"time", recorded_time.timestamp()}) - mem_response = client.custom_query(query=mem_query, params={"time", recorded_time.timestamp()}) + cpu_value = None + mem_value = None + if client is not None: + + # Make the request for CPU and Mem + cpu_query = '100 * (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[2m])))' + mem_query = '100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))' + cpu_response = client.custom_query(query=cpu_query, params={"time", recorded_time.timestamp()}) + mem_response = client.custom_query(query=mem_query, params={"time", recorded_time.timestamp()}) - ## Get the CPU & Mem metrics out of the response - cpu_value = get_value(cpu_response) - mem_value = get_value(mem_response) + ## Get the CPU & Mem metrics out of the response + cpu_value = get_value(cpu_response) + mem_value = get_value(mem_response) return cpu_value, mem_value @@ -105,18 +115,22 @@ def get_peak_read(client, start, end): """ - # Make the request for CPU and Mem - cpu_query = '100 * (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[2m])))' - mem_query = '100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))' - cpu_response = client.custom_query_range( - query=cpu_query, start_time=start, end_time=end, step="3s" - ) - mem_response = client.custom_query_range( - query=mem_query, start_time=start, end_time=end, step="3s" - ) - - ## Get the CPU & Mem metrics out of the response - peak_cpu_value = get_value(cpu_response, "range") - peak_mem_value = get_value(mem_response, "range") + peak_cpu_value = None + peak_mem_value = None + if client is not None: + + # Make the request for CPU and Mem + cpu_query = '100 * (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[2m])))' + 
mem_query = '100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))' + cpu_response = client.custom_query_range( + query=cpu_query, start_time=start, end_time=end, step="3s" + ) + mem_response = client.custom_query_range( + query=mem_query, start_time=start, end_time=end, step="3s" + ) + + ## Get the CPU & Mem metrics out of the response + peak_cpu_value = get_value(cpu_response, "range") + peak_mem_value = get_value(mem_response, "range") return peak_cpu_value, peak_mem_value From fb0f3af4d337485f17a4a1c73974a7ee6345d8e9 Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Mon, 23 Feb 2026 16:23:56 -0500 Subject: [PATCH 15/42] Accidentally used a set instead of a dictionary Signed-off-by: Christian Sarmiento --- aiu_fms_testing_utils/utils/resource_collection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aiu_fms_testing_utils/utils/resource_collection.py b/aiu_fms_testing_utils/utils/resource_collection.py index d4f55311..67a498e3 100644 --- a/aiu_fms_testing_utils/utils/resource_collection.py +++ b/aiu_fms_testing_utils/utils/resource_collection.py @@ -88,8 +88,8 @@ def get_static_read(client, recorded_time): # Make the request for CPU and Mem cpu_query = '100 * (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[2m])))' mem_query = '100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))' - cpu_response = client.custom_query(query=cpu_query, params={"time", recorded_time.timestamp()}) - mem_response = client.custom_query(query=mem_query, params={"time", recorded_time.timestamp()}) + cpu_response = client.custom_query(query=cpu_query, params={"time": recorded_time.timestamp()}) + mem_response = client.custom_query(query=mem_query, params={"time": recorded_time.timestamp()}) ## Get the CPU & Mem metrics out of the response cpu_value = get_value(cpu_response) From 15e953da9bc77d8c058acadfe47a4ca48ada08eb Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Mon, 23 Feb 2026 16:27:38 -0500 Subject: 
[PATCH 16/42] Disabling SSL Signed-off-by: Christian Sarmiento --- aiu_fms_testing_utils/utils/resource_collection.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/aiu_fms_testing_utils/utils/resource_collection.py b/aiu_fms_testing_utils/utils/resource_collection.py index 67a498e3..87b43a8b 100644 --- a/aiu_fms_testing_utils/utils/resource_collection.py +++ b/aiu_fms_testing_utils/utils/resource_collection.py @@ -1,3 +1,4 @@ +from gc import disable import os from prometheus_api_client import PrometheusConnect @@ -21,7 +22,7 @@ def instantiate_prometheus(): # Define necessary headers request_headers = {"Authorization": f"Bearer {api_token}"} if api_token else None - client = PrometheusConnect(url=connection_url, headers=request_headers) + client = PrometheusConnect(url=connection_url, headers=request_headers, disable_ssl=True) except Exception as e: print(f"WARNING: Cannot instantiate Prometheus. Make sure PROMETHEUS_URL and PROMETHEUS_API_KEY are set in your environment if you want resource metrics. 
Error: {e}") From 2a7d9516481981d9794fa693eb757e9531ea3a06 Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Mon, 23 Feb 2026 18:55:03 -0500 Subject: [PATCH 17/42] debug statement Signed-off-by: Christian Sarmiento --- .../utils/resource_collection.py | 4 +- log.txt | 162 ++++++++++++++++++ 2 files changed, 165 insertions(+), 1 deletion(-) create mode 100644 log.txt diff --git a/aiu_fms_testing_utils/utils/resource_collection.py b/aiu_fms_testing_utils/utils/resource_collection.py index 87b43a8b..11002ccd 100644 --- a/aiu_fms_testing_utils/utils/resource_collection.py +++ b/aiu_fms_testing_utils/utils/resource_collection.py @@ -85,16 +85,18 @@ def get_static_read(client, recorded_time): cpu_value = None mem_value = None if client is not None: + print("we have client") # Make the request for CPU and Mem cpu_query = '100 * (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[2m])))' mem_query = '100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))' cpu_response = client.custom_query(query=cpu_query, params={"time": recorded_time.timestamp()}) mem_response = client.custom_query(query=mem_query, params={"time": recorded_time.timestamp()}) - + print(f"cpu response: {cpu_response}") ## Get the CPU & Mem metrics out of the response cpu_value = get_value(cpu_response) mem_value = get_value(mem_response) + print(f"Returned CPU value: {cpu_value}, Mem: {mem_value}") return cpu_value, mem_value diff --git a/log.txt b/log.txt new file mode 100644 index 00000000..86b2accb --- /dev/null +++ b/log.txt @@ -0,0 +1,162 @@ +(vllm) [root@zaiu pytorch_workspace]# PYTHONPATH=./aiu-fms-testing-utils:$PYTHONPATH torchrun --nproc-per-node=4 aiu-fms-testing-utils/scripts/drive_paged_programs.py --model_variant=/ibm-granite/granite-3.3-8b-instruct --program_criteria_json_path=z-spyre-runtimes/testing/acceptance/fms/criterion/program_criterion.json --dataset_type=sharegpt --skip_validation --programs "*:0,<8192" --prioritize_large_batch_sizes 
--enforce_homogeneous_prompt_programs --prefill_chunk_size=1024 --dataset_path=ShareGPT_V3_unfiltered_cleaned_split.json +[ 0/ 4]: When skipping validation, only test_type will be ignored +[ 3/ 4]: When skipping validation, only test_type will be ignored +[ 1/ 4]: When skipping validation, only test_type will be ignored +[ 2/ 4]: When skipping validation, only test_type will be ignored +[ 0/ 4]: Unknown model configuration. Using VLLM_DT_MAX_BATCH_TKV_LIMIT from environment: 16384 +[ 3/ 4]: Unknown model configuration. Using VLLM_DT_MAX_BATCH_TKV_LIMIT from environment: 16384 +[ 2/ 4]: Unknown model configuration. Using VLLM_DT_MAX_BATCH_TKV_LIMIT from environment: 16384 +[ 1/ 4]: Unknown model configuration. Using VLLM_DT_MAX_BATCH_TKV_LIMIT from environment: 16384 +WARNING:torchao.kernel.intmm:Warning: Detected no triton, on systems without Triton certain kernels will not work +WARNING:torchao.kernel.intmm:Warning: Detected no triton, on systems without Triton certain kernels will not work +WARNING:torchao.kernel.intmm:Warning: Detected no triton, on systems without Triton certain kernels will not work +WARNING:torchao.kernel.intmm:Warning: Detected no triton, on systems without Triton certain kernels will not work +[ 0/ 4]: AIU warmup +[ 3/ 4]: AIU warmup +[ 2/ 4]: AIU warmup +Traceback (most recent call last): +Traceback (most recent call last): + File "/opt/vllm/lib64/python3.12/site-packages/requests/models.py", line 976, in json + File "/opt/vllm/lib64/python3.12/site-packages/requests/models.py", line 976, in json +Traceback (most recent call last): + return complexjson.loads(self.text, **kwargs)return complexjson.loads(self.text, **kwargs) File "/opt/vllm/lib64/python3.12/site-packages/requests/models.py", line 976, in json + + + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^ File "/usr/lib64/python3.12/json/__init__.py", line 346, in loads + + File "/usr/lib64/python3.12/json/__init__.py", line 346, in loads + return 
complexjson.loads(self.text, **kwargs) + return _default_decoder.decode(s) +return _default_decoder.decode(s) + ^^^^^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^ File "/usr/lib64/python3.12/json/__init__.py", line 346, in loads + +^ + File "/usr/lib64/python3.12/json/decoder.py", line 338, in decode + File "/usr/lib64/python3.12/json/decoder.py", line 338, in decode + obj, end = self.raw_decode(s, idx=_w(s, 0).end())obj, end = self.raw_decode(s, idx=_w(s, 0).end()) + + return _default_decoder.decode(s) + ^^ ^^ ^^ ^^ ^^ ^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^ File "/usr/lib64/python3.12/json/decoder.py", line 338, in decode +^^^^ + + File "/usr/lib64/python3.12/json/decoder.py", line 356, in raw_decode + File "/usr/lib64/python3.12/json/decoder.py", line 356, in raw_decode + obj, end = self.raw_decode(s, idx=_w(s, 0).end()) + raise JSONDecodeError("Expecting value", s, err.value) from Noneraise JSONDecodeError("Expecting value", s, err.value) from None + + json.decoder json.decoder. 
.JSONDecodeError JSONDecodeError : : Expecting value: line 1 column 1 (char 0)Expecting value: line 1 column 1 (char 0)^ + +^^ + +^^During handling of the above exception, another exception occurred: +During handling of the above exception, another exception occurred: +^^ +^ +^Traceback (most recent call last): +^Traceback (most recent call last): +^^^^^ File "/pytorch_workspace/aiu-fms-testing-utils/scripts/drive_paged_programs.py", line 1507, in +^ File "/pytorch_workspace/aiu-fms-testing-utils/scripts/drive_paged_programs.py", line 1507, in +^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib64/python3.12/json/decoder.py", line 356, in raw_decode + raise JSONDecodeError("Expecting value", s, err.value) from None +json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0) + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "/pytorch_workspace/aiu-fms-testing-utils/scripts/drive_paged_programs.py", line 1507, in + main()main() + + File "/pytorch_workspace/aiu-fms-testing-utils/scripts/drive_paged_programs.py", line 1444, in main + File "/pytorch_workspace/aiu-fms-testing-utils/scripts/drive_paged_programs.py", line 1444, in main + main() + File "/pytorch_workspace/aiu-fms-testing-utils/scripts/drive_paged_programs.py", line 1444, in main + warmup_model(warmup_model( + + File "/pytorch_workspace/aiu-fms-testing-utils/aiu_fms_testing_utils/utils/__init__.py", line 103, in warmup_model + File "/pytorch_workspace/aiu-fms-testing-utils/aiu_fms_testing_utils/utils/__init__.py", line 103, in warmup_model + initial_cpu, initial_mem = get_static_read(p, metric_start)initial_cpu, initial_mem = get_static_read(p, metric_start) + + ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^warmup_model(^^ +^^^^^^^^^^^^^ File "/pytorch_workspace/aiu-fms-testing-utils/aiu_fms_testing_utils/utils/__init__.py", line 103, in warmup_model +^^^^^^^^^^^^^^^^ +^ + File 
"/pytorch_workspace/aiu-fms-testing-utils/aiu_fms_testing_utils/utils/resource_collection.py", line 92, in get_static_read + File "/pytorch_workspace/aiu-fms-testing-utils/aiu_fms_testing_utils/utils/resource_collection.py", line 92, in get_static_read + initial_cpu, initial_mem = get_static_read(p, metric_start) + cpu_response = client.custom_query(query=cpu_query, params={"time": recorded_time.timestamp()})cpu_response = client.custom_query(query=cpu_query, params={"time": recorded_time.timestamp()}) + + ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^ File "/pytorch_workspace/aiu-fms-testing-utils/aiu_fms_testing_utils/utils/resource_collection.py", line 92, in get_static_read +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^cpu_response = client.custom_query(query=cpu_query, params={"time": recorded_time.timestamp()})^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^ ^^ ^^ ^^ ^^ ^^ ^^ ^^ ^ ^^ ^^ ^^ ^^ ^^ ^^ ^^ ^^ ^^ ^^^^^^^^^^^^^^^^^^^^^ +^^^^^^ File "/opt/vllm/lib64/python3.12/site-packages/prometheus_api_client/prometheus_connect.py", line 473, in custom_query +^^^^^^^ +^^^^ File "/opt/vllm/lib64/python3.12/site-packages/prometheus_api_client/prometheus_connect.py", line 473, in custom_query +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/opt/vllm/lib64/python3.12/site-packages/prometheus_api_client/prometheus_connect.py", line 473, in custom_query + data = response.json()["data"]["result"]data = response.json()["data"]["result"] + + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + File "/opt/vllm/lib64/python3.12/site-packages/requests/models.py", line 980, in json + File "/opt/vllm/lib64/python3.12/site-packages/requests/models.py", line 980, in json + data = response.json()["data"]["result"] + ^^^^^^^^^^^^^^^ + File "/opt/vllm/lib64/python3.12/site-packages/requests/models.py", line 980, in json + raise RequestsJSONDecodeError(e.msg, e.doc, e.pos) +raise RequestsJSONDecodeError(e.msg, e.doc, e.pos) 
+requests.exceptionsrequests.exceptions..JSONDecodeErrorJSONDecodeError: : Expecting value: line 1 column 1 (char 0)Expecting value: line 1 column 1 (char 0) + + raise RequestsJSONDecodeError(e.msg, e.doc, e.pos) +requests.exceptions.JSONDecodeError: Expecting value: line 1 column 1 (char 0) +W0223 22:12:21.239000 191 torch/distributed/elastic/multiprocessing/api.py:900] Sending process 194 closing signal SIGTERM +E0223 22:12:21.363000 191 torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 0 (pid: 193) of binary: /opt/vllm/bin/python3 +Traceback (most recent call last): + File "/opt/vllm/bin/torchrun", line 10, in + sys.exit(main()) + ^^^^^^ + File "/opt/vllm/lib64/python3.12/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper + return f(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^ + File "/opt/vllm/lib64/python3.12/site-packages/torch/distributed/run.py", line 892, in main + run(args) + File "/opt/vllm/lib64/python3.12/site-packages/torch/distributed/run.py", line 883, in run + elastic_launch( + File "/opt/vllm/lib64/python3.12/site-packages/torch/distributed/launcher/api.py", line 139, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/opt/vllm/lib64/python3.12/site-packages/torch/distributed/launcher/api.py", line 270, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +aiu-fms-testing-utils/scripts/drive_paged_programs.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2026-02-23_22:12:21 + host : host.containers.internal + rank : 2 (local_rank: 2) + exitcode : 1 (pid: 195) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[2]: + time : 2026-02-23_22:12:21 + host : 
host.containers.internal + rank : 3 (local_rank: 3) + exitcode : 1 (pid: 196) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2026-02-23_22:12:21 + host : host.containers.internal + rank : 0 (local_rank: 0) + exitcode : 1 (pid: 193) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +============================================================ From d5806afc8d180548fa5dc9266f523dcf652844d5 Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Mon, 23 Feb 2026 19:50:29 -0500 Subject: [PATCH 18/42] Output formatting Signed-off-by: Christian Sarmiento --- aiu_fms_testing_utils/utils/__init__.py | 4 ++-- aiu_fms_testing_utils/utils/resource_collection.py | 7 +++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/aiu_fms_testing_utils/utils/__init__.py b/aiu_fms_testing_utils/utils/__init__.py index 6275c971..c8bc452b 100644 --- a/aiu_fms_testing_utils/utils/__init__.py +++ b/aiu_fms_testing_utils/utils/__init__.py @@ -61,10 +61,10 @@ def print_comp_resource_metrics(cpu_val, mem_val, stage): if cpu_val is None or mem_val is None: timestamp_print(f"Compilation {stage}") else: - timestamp_print(f"Compilation {stage} - CPU: {cpu_val}, Memory: {mem_val}") + timestamp_print(f"Compilation {stage} - CPU: {cpu_val:.3f}%, Memory: {mem_val:.3f}GB") elif cpu_val is not None and mem_val is not None: - dprint(f"Peak Resource Utilization - CPU: {cpu_val}, Memory: {mem_val}") + dprint(f"Peak Resource Utilization - CPU: {cpu_val:.3f}%, Memory: {mem_val:.3f}GB") def warmup_model( model: nn.Module, diff --git a/aiu_fms_testing_utils/utils/resource_collection.py b/aiu_fms_testing_utils/utils/resource_collection.py index 11002ccd..d444379d 100644 --- a/aiu_fms_testing_utils/utils/resource_collection.py +++ 
b/aiu_fms_testing_utils/utils/resource_collection.py @@ -25,7 +25,7 @@ def instantiate_prometheus(): client = PrometheusConnect(url=connection_url, headers=request_headers, disable_ssl=True) except Exception as e: - print(f"WARNING: Cannot instantiate Prometheus. Make sure PROMETHEUS_URL and PROMETHEUS_API_KEY are set in your environment if you want resource metrics. Error: {e}") + print(f"WARNING: Cannot instantiate Prometheus. Make sure PROMETHEUS_URL and PROMETHEUS_API_KEY are set in your environment if you are trying to collect resource metrics. Error: {e}") return client @@ -85,11 +85,10 @@ def get_static_read(client, recorded_time): cpu_value = None mem_value = None if client is not None: - print("we have client") # Make the request for CPU and Mem cpu_query = '100 * (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[2m])))' - mem_query = '100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))' + mem_query = '(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / 1024 / 1024 / 1024' cpu_response = client.custom_query(query=cpu_query, params={"time": recorded_time.timestamp()}) mem_response = client.custom_query(query=mem_query, params={"time": recorded_time.timestamp()}) print(f"cpu response: {cpu_response}") @@ -124,7 +123,7 @@ def get_peak_read(client, start, end): # Make the request for CPU and Mem cpu_query = '100 * (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[2m])))' - mem_query = '100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))' + mem_query = '(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / 1024 / 1024 / 1024' cpu_response = client.custom_query_range( query=cpu_query, start_time=start, end_time=end, step="3s" ) From 3aed13220a01ed8de886b32b4c23da092e57ae9e Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Mon, 23 Feb 2026 19:52:59 -0500 Subject: [PATCH 19/42] Removing debug statements Signed-off-by: Christian Sarmiento --- aiu_fms_testing_utils/utils/resource_collection.py 
| 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/aiu_fms_testing_utils/utils/resource_collection.py b/aiu_fms_testing_utils/utils/resource_collection.py index d444379d..b16b5377 100644 --- a/aiu_fms_testing_utils/utils/resource_collection.py +++ b/aiu_fms_testing_utils/utils/resource_collection.py @@ -91,11 +91,10 @@ def get_static_read(client, recorded_time): mem_query = '(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / 1024 / 1024 / 1024' cpu_response = client.custom_query(query=cpu_query, params={"time": recorded_time.timestamp()}) mem_response = client.custom_query(query=mem_query, params={"time": recorded_time.timestamp()}) - print(f"cpu response: {cpu_response}") + ## Get the CPU & Mem metrics out of the response cpu_value = get_value(cpu_response) mem_value = get_value(mem_response) - print(f"Returned CPU value: {cpu_value}, Mem: {mem_value}") return cpu_value, mem_value From 406b8497492c09140f19ef3bb770563829a70a21 Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Mon, 23 Feb 2026 19:57:19 -0500 Subject: [PATCH 20/42] Rounding to 2 instead of 3 Signed-off-by: Christian Sarmiento --- aiu_fms_testing_utils/utils/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aiu_fms_testing_utils/utils/__init__.py b/aiu_fms_testing_utils/utils/__init__.py index c8bc452b..0f04ce0f 100644 --- a/aiu_fms_testing_utils/utils/__init__.py +++ b/aiu_fms_testing_utils/utils/__init__.py @@ -61,10 +61,10 @@ def print_comp_resource_metrics(cpu_val, mem_val, stage): if cpu_val is None or mem_val is None: timestamp_print(f"Compilation {stage}") else: - timestamp_print(f"Compilation {stage} - CPU: {cpu_val:.3f}%, Memory: {mem_val:.3f}GB") + timestamp_print(f"Compilation {stage} - CPU: {cpu_val:.2f}%, Memory: {mem_val:.2f}GB") elif cpu_val is not None and mem_val is not None: - dprint(f"Peak Resource Utilization - CPU: {cpu_val:.3f}%, Memory: {mem_val:.3f}GB") + dprint(f"Peak Resource Utilization - CPU: 
{cpu_val:.2f}%, Memory: {mem_val:.2f}GB") def warmup_model( model: nn.Module, From 87e87144af81d902a6f88e23f2cd01b495937628 Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Mon, 23 Feb 2026 20:58:42 -0500 Subject: [PATCH 21/42] Adding logging for inference Moved some functions and variable instantiation around to avoid duplicate code. Signed-off-by: Christian Sarmiento --- .../scripts/drive_paged_programs.py | 47 ++++++++++++++++++- aiu_fms_testing_utils/utils/__init__.py | 43 +++++++++++------ 2 files changed, 75 insertions(+), 15 deletions(-) diff --git a/aiu_fms_testing_utils/scripts/drive_paged_programs.py b/aiu_fms_testing_utils/scripts/drive_paged_programs.py index 0e343aeb..c41d97c1 100644 --- a/aiu_fms_testing_utils/scripts/drive_paged_programs.py +++ b/aiu_fms_testing_utils/scripts/drive_paged_programs.py @@ -1,6 +1,6 @@ import argparse from dataclasses import dataclass -import datetime +from datetime import datetime, timezone import itertools import json import os @@ -37,6 +37,7 @@ sample_sharegpt_requests, stagger_region, warmup_model, + print_comp_resource_metrics ) from aiu_fms_testing_utils.utils.aiu_setup import aiu_dist_setup, dprint, local_rank from aiu_fms_testing_utils.utils.paged import ( @@ -44,6 +45,9 @@ get_programs_prompts, ) from aiu_fms_testing_utils.testing.utils import format_kwargs_to_string +from aiu_fms_testing_utils.utils.resource_collection import ( + instantiate_prometheus, get_static_read, get_peak_read +) # Constants PAD_MULTIPLE = 64 @@ -1251,6 +1255,7 @@ def generate_validation_info_and_test( timing: str, prefill_chunk_size: int, model_variant: str, + profile: PrometheusConnect | None ) -> list[Any]: """Generates tokens using AIU and CPU models and validates the results. 
@@ -1271,7 +1276,12 @@ def generate_validation_info_and_test( f"program id: {valid_prompt.program_id}, valid prompt: {valid_prompt.shape}, input shape: {valid_prompt.input_ids.shape}" ) + # Start inference + metric_start = datetime.now(timezone.utc) + initial_cpu, initial_mem = get_static_read(profile, metric_start) + print_comp_resource_metrics(initial_cpu, initial_mem, "started", "inference") if not skip_validation: + # Generate or load CPU validation info cpu_validation_info = generate_cpu_validation( model_variant=model_variant, @@ -1288,6 +1298,16 @@ def generate_validation_info_and_test( tokenizer=tokenizer, ) + ## Get completed metric read + cpu_inference_metric_end = datetime.now(timezone.utc) + end_cpu_inference_cpu, end_mem_inference_cpu = get_static_read(profile, cpu_inference_metric_end) + print_comp_resource_metrics(end_cpu_inference_cpu, end_mem_inference_cpu, "completed", "CPU inference") + + ## Get the peak usage during compilation + peak_cpu_inference_cpu, peak_mem_inference_cpu = get_peak_read(profile, metric_start, cpu_inference_metric_end) + print_comp_resource_metrics(peak_cpu_inference_cpu, peak_mem_inference_cpu, "peak", "CPU inference") + + # Generate AIU validation info aiu_validation_info = generate_aiu_validation( test_type=test_type, max_new_tokens=max_new_tokens, @@ -1299,6 +1319,15 @@ def generate_validation_info_and_test( extra_kwargs=valid_prompt.extra_kwargs, ) + ## Get completed metric read + aiu_inference_metric_end = datetime.now(timezone.utc) + end_cpu_inference_aiu, end_mem_inference_aiu = get_static_read(profile, aiu_inference_metric_end) + print_comp_resource_metrics(end_cpu_inference_aiu, end_mem_inference_aiu, "completed", "AIU inference") + + ## Get the peak usage during compilation + peak_cpu_inference_aiu, peak_mem_inference_aiu = get_peak_read(profile, metric_start, aiu_inference_metric_end) + print_comp_resource_metrics(peak_cpu_inference_aiu, peak_mem_inference_aiu, "peak", "AIU inference") + if test_type == 
"metrics": failure_rate = evaluate_cross_entropy_metrics( cross_entropy_threshold=cross_entropy_threshold, @@ -1325,6 +1354,8 @@ def generate_validation_info_and_test( else: raise ValueError("test type must be one of metrics or tokens") else: + + # Generate AIU validation info aiu_validation_info = generate_aiu_validation( test_type=test_type, max_new_tokens=max_new_tokens, @@ -1336,6 +1367,15 @@ def generate_validation_info_and_test( extra_kwargs=valid_prompt.extra_kwargs, ) + ## Get completed metric read + aiu_inference_metric_end = datetime.now(timezone.utc) + end_cpu_inference_aiu, end_mem_inference_aiu = get_static_read(profile, aiu_inference_metric_end) + print_comp_resource_metrics(end_cpu_inference_aiu, end_mem_inference_aiu, "completed", "AIU inference") + + ## Get the peak usage during compilation + peak_cpu_inference_aiu, peak_mem_inference_aiu = get_peak_read(profile, metric_start, aiu_inference_metric_end) + print_comp_resource_metrics(peak_cpu_inference_aiu, peak_mem_inference_aiu, "peak", "AIU inference") + if local_rank == 0: for sentence_idx, test_sentence in enumerate( aiu_validation_info.get_info("tokens") @@ -1392,6 +1432,9 @@ def main() -> None: tokenizer=tokenizer, ) + # Instantiate the Prometheus client for resource metric collection + p = instantiate_prometheus() + # Model Loading model_kwargs: Dict[str, Any] = _get_model_kwargs(model_variant=args.model_variant) distributed_kwargs: Dict[str, Any] = _get_distributed_kwargs( @@ -1448,6 +1491,7 @@ def main() -> None: compile_dynamic_sendnn=True, stagger_update_lazyhandle=args.stagger_update_lazyhandle, prefill_chunk_size=args.prefill_chunk_size, + profile=p, **extra_kwargs, ) if args.distributed: @@ -1490,6 +1534,7 @@ def main() -> None: timing=args.timing, prefill_chunk_size=args.prefill_chunk_size, model_variant=args.model_variant, + profile=p ) if not args.skip_validation and local_rank == 0: diff --git a/aiu_fms_testing_utils/utils/__init__.py b/aiu_fms_testing_utils/utils/__init__.py index 
0f04ce0f..bc04742b 100644 --- a/aiu_fms_testing_utils/utils/__init__.py +++ b/aiu_fms_testing_utils/utils/__init__.py @@ -15,7 +15,7 @@ from transformers.tokenization_utils_base import PreTrainedTokenizerBase from aiu_fms_testing_utils.testing.utils import format_kwargs_to_string from aiu_fms_testing_utils.utils.resource_collection import ( - instantiate_prometheus, get_static_read, get_peak_read + get_static_read, get_peak_read ) from fms.utils.generation import pad_input_ids import torch @@ -51,20 +51,37 @@ def stagger_region(limit: int): dprint("Stagger: All Complete") def timestamp_print(given_string): + """ + Helper method that will add a timestamp before the given string that needs to be + printed. + + Args: + - given_string: the string that is to be printed with the timestamp. + """ timestamp = datetime.now().strftime("%Y-%m-%d:%H:%M:%S") print(f"[{timestamp}] {given_string}") -def print_comp_resource_metrics(cpu_val, mem_val, stage): +def print_comp_resource_metrics(cpu_val, mem_val, stage, step): + """ + Helper method that will do a timestamp print for a specific step to report resource + usage. + + Args: + - cpu_val: the value for CPU usage as a percentage that we want to print. + - mem_val: the value for memory usage in gigabytes we want to print. + - stage: The stage of the step we are in, either "peak" or "started". + - step: The step that we performing in the script, either "compilation" or "inference". 
+ """ if stage != "peak": if cpu_val is None or mem_val is None: - timestamp_print(f"Compilation {stage}") + timestamp_print(f"{step.title()} {stage}") else: - timestamp_print(f"Compilation {stage} - CPU: {cpu_val:.2f}%, Memory: {mem_val:.2f}GB") + timestamp_print(f"{step.title()} {stage} - CPU: {cpu_val:.2f}%, Memory: {mem_val:.2f} GB") elif cpu_val is not None and mem_val is not None: - dprint(f"Peak Resource Utilization - CPU: {cpu_val:.2f}%, Memory: {mem_val:.2f}GB") + dprint(f"Peak Resource Utilization - CPU: {cpu_val:.2f}%, Memory: {mem_val:.2f} GB") def warmup_model( model: nn.Module, @@ -74,6 +91,7 @@ def warmup_model( use_cache: bool = True, stagger_update_lazyhandle: int = 0, prefill_chunk_size: int = 0, + profile: PrometheusConnect | None = None, **extra_kwargs, ): import torch_sendnn @@ -91,17 +109,14 @@ def warmup_model( attention_specific_kwargs["contiguous_cache"] = True attention_specific_kwargs["max_seq_len"] = input_ids.shape[1] + max_new_tokens - # Instantiate the Prometheus client for resource metric collection - p = instantiate_prometheus() - # Start the warmup dprint("AIU warmup") pt_compile_model_time = time.time() ## Report on initial resource usage metric_start = datetime.now(timezone.utc) - initial_cpu, initial_mem = get_static_read(p, metric_start) - print_comp_resource_metrics(initial_cpu, initial_mem, "started") + initial_cpu, initial_mem = get_static_read(profile, metric_start) + print_comp_resource_metrics(initial_cpu, initial_mem, "started", "compilation") # adjust inputs depending on attn_type and dynamic shapes _warmup_input_ids = input_ids @@ -134,12 +149,12 @@ def warmup_model( # Get completed metric read metric_end = datetime.now(timezone.utc) - end_cpu, end_mem = get_static_read(p, metric_end) - print_comp_resource_metrics(end_cpu, end_mem, "completed") + end_cpu, end_mem = get_static_read(profile, metric_end) + print_comp_resource_metrics(end_cpu, end_mem, "completed", "compilation") # Get the peak usage during compilation - 
peak_cpu, peak_mem = get_peak_read(p, metric_start, metric_end) - print_comp_resource_metrics(peak_cpu, peak_mem, "peak") + peak_cpu, peak_mem = get_peak_read(profile, metric_start, metric_end) + print_comp_resource_metrics(peak_cpu, peak_mem, "peak", "compilation") dprint(f"PT compile complete, took {pt_compile_model_time:.3f}s") From d7f314a9b0edc2814012edf50947d394474b3785 Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Mon, 23 Feb 2026 21:01:24 -0500 Subject: [PATCH 22/42] Adding imports to support type annotations Signed-off-by: Christian Sarmiento --- aiu_fms_testing_utils/scripts/drive_paged_programs.py | 1 + aiu_fms_testing_utils/utils/__init__.py | 1 + aiu_fms_testing_utils/utils/resource_collection.py | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/aiu_fms_testing_utils/scripts/drive_paged_programs.py b/aiu_fms_testing_utils/scripts/drive_paged_programs.py index c41d97c1..504378b6 100644 --- a/aiu_fms_testing_utils/scripts/drive_paged_programs.py +++ b/aiu_fms_testing_utils/scripts/drive_paged_programs.py @@ -48,6 +48,7 @@ from aiu_fms_testing_utils.utils.resource_collection import ( instantiate_prometheus, get_static_read, get_peak_read ) +from prometheus_api_client import PrometheusConnect # Constants PAD_MULTIPLE = 64 diff --git a/aiu_fms_testing_utils/utils/__init__.py b/aiu_fms_testing_utils/utils/__init__.py index bc04742b..73ff62d4 100644 --- a/aiu_fms_testing_utils/utils/__init__.py +++ b/aiu_fms_testing_utils/utils/__init__.py @@ -23,6 +23,7 @@ import math import contextlib import warnings +from prometheus_api_client import PrometheusConnect @contextlib.contextmanager diff --git a/aiu_fms_testing_utils/utils/resource_collection.py b/aiu_fms_testing_utils/utils/resource_collection.py index b16b5377..b15f3b6e 100644 --- a/aiu_fms_testing_utils/utils/resource_collection.py +++ b/aiu_fms_testing_utils/utils/resource_collection.py @@ -1,4 +1,4 @@ -from gc import disable +# Imports import os from prometheus_api_client 
import PrometheusConnect From 329676706c0ee2191757155a7c2eee16d1de6551 Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Mon, 23 Feb 2026 21:19:36 -0500 Subject: [PATCH 23/42] Additional formatting Signed-off-by: Christian Sarmiento --- .../scripts/drive_paged_programs.py | 14 +++++++------- aiu_fms_testing_utils/utils/__init__.py | 10 +++++----- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/aiu_fms_testing_utils/scripts/drive_paged_programs.py b/aiu_fms_testing_utils/scripts/drive_paged_programs.py index 504378b6..f1016368 100644 --- a/aiu_fms_testing_utils/scripts/drive_paged_programs.py +++ b/aiu_fms_testing_utils/scripts/drive_paged_programs.py @@ -1280,7 +1280,7 @@ def generate_validation_info_and_test( # Start inference metric_start = datetime.now(timezone.utc) initial_cpu, initial_mem = get_static_read(profile, metric_start) - print_comp_resource_metrics(initial_cpu, initial_mem, "started", "inference") + print_comp_resource_metrics(initial_cpu, initial_mem, "started", "Inference") if not skip_validation: # Generate or load CPU validation info @@ -1302,11 +1302,11 @@ def generate_validation_info_and_test( ## Get completed metric read cpu_inference_metric_end = datetime.now(timezone.utc) end_cpu_inference_cpu, end_mem_inference_cpu = get_static_read(profile, cpu_inference_metric_end) - print_comp_resource_metrics(end_cpu_inference_cpu, end_mem_inference_cpu, "completed", "CPU inference") + print_comp_resource_metrics(end_cpu_inference_cpu, end_mem_inference_cpu, "completed", "CPU Inference") ## Get the peak usage during compilation peak_cpu_inference_cpu, peak_mem_inference_cpu = get_peak_read(profile, metric_start, cpu_inference_metric_end) - print_comp_resource_metrics(peak_cpu_inference_cpu, peak_mem_inference_cpu, "peak", "CPU inference") + print_comp_resource_metrics(peak_cpu_inference_cpu, peak_mem_inference_cpu, "peak", "CPU Inference") # Generate AIU validation info aiu_validation_info = generate_aiu_validation( @@ -1323,11 
+1323,11 @@ def generate_validation_info_and_test( ## Get completed metric read aiu_inference_metric_end = datetime.now(timezone.utc) end_cpu_inference_aiu, end_mem_inference_aiu = get_static_read(profile, aiu_inference_metric_end) - print_comp_resource_metrics(end_cpu_inference_aiu, end_mem_inference_aiu, "completed", "AIU inference") + print_comp_resource_metrics(end_cpu_inference_aiu, end_mem_inference_aiu, "completed", "AIU Inference") ## Get the peak usage during compilation peak_cpu_inference_aiu, peak_mem_inference_aiu = get_peak_read(profile, metric_start, aiu_inference_metric_end) - print_comp_resource_metrics(peak_cpu_inference_aiu, peak_mem_inference_aiu, "peak", "AIU inference") + print_comp_resource_metrics(peak_cpu_inference_aiu, peak_mem_inference_aiu, "peak", "AIU Inference") if test_type == "metrics": failure_rate = evaluate_cross_entropy_metrics( @@ -1371,11 +1371,11 @@ def generate_validation_info_and_test( ## Get completed metric read aiu_inference_metric_end = datetime.now(timezone.utc) end_cpu_inference_aiu, end_mem_inference_aiu = get_static_read(profile, aiu_inference_metric_end) - print_comp_resource_metrics(end_cpu_inference_aiu, end_mem_inference_aiu, "completed", "AIU inference") + print_comp_resource_metrics(end_cpu_inference_aiu, end_mem_inference_aiu, "completed", "AIU Inference") ## Get the peak usage during compilation peak_cpu_inference_aiu, peak_mem_inference_aiu = get_peak_read(profile, metric_start, aiu_inference_metric_end) - print_comp_resource_metrics(peak_cpu_inference_aiu, peak_mem_inference_aiu, "peak", "AIU inference") + print_comp_resource_metrics(peak_cpu_inference_aiu, peak_mem_inference_aiu, "peak", "AIU Inference") if local_rank == 0: for sentence_idx, test_sentence in enumerate( diff --git a/aiu_fms_testing_utils/utils/__init__.py b/aiu_fms_testing_utils/utils/__init__.py index 73ff62d4..b3859175 100644 --- a/aiu_fms_testing_utils/utils/__init__.py +++ b/aiu_fms_testing_utils/utils/__init__.py @@ -77,9 +77,9 @@ def 
print_comp_resource_metrics(cpu_val, mem_val, stage, step): if stage != "peak": if cpu_val is None or mem_val is None: - timestamp_print(f"{step.title()} {stage}") + timestamp_print(f"{step} {stage}") else: - timestamp_print(f"{step.title()} {stage} - CPU: {cpu_val:.2f}%, Memory: {mem_val:.2f} GB") + timestamp_print(f"{step} {stage} - CPU: {cpu_val:.2f}%, Memory: {mem_val:.2f} GB") elif cpu_val is not None and mem_val is not None: dprint(f"Peak Resource Utilization - CPU: {cpu_val:.2f}%, Memory: {mem_val:.2f} GB") @@ -117,7 +117,7 @@ def warmup_model( ## Report on initial resource usage metric_start = datetime.now(timezone.utc) initial_cpu, initial_mem = get_static_read(profile, metric_start) - print_comp_resource_metrics(initial_cpu, initial_mem, "started", "compilation") + print_comp_resource_metrics(initial_cpu, initial_mem, "started", "Compilation") # adjust inputs depending on attn_type and dynamic shapes _warmup_input_ids = input_ids @@ -151,11 +151,11 @@ def warmup_model( # Get completed metric read metric_end = datetime.now(timezone.utc) end_cpu, end_mem = get_static_read(profile, metric_end) - print_comp_resource_metrics(end_cpu, end_mem, "completed", "compilation") + print_comp_resource_metrics(end_cpu, end_mem, "completed", "Compilation") # Get the peak usage during compilation peak_cpu, peak_mem = get_peak_read(profile, metric_start, metric_end) - print_comp_resource_metrics(peak_cpu, peak_mem, "peak", "compilation") + print_comp_resource_metrics(peak_cpu, peak_mem, "peak", "Compilation") dprint(f"PT compile complete, took {pt_compile_model_time:.3f}s") From 36756b7e9d40cc0b4eba5ffa2a55488504aa96b2 Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Mon, 23 Feb 2026 21:31:55 -0500 Subject: [PATCH 24/42] More formatting Signed-off-by: Christian Sarmiento --- .../scripts/drive_paged_programs.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/aiu_fms_testing_utils/scripts/drive_paged_programs.py 
b/aiu_fms_testing_utils/scripts/drive_paged_programs.py index f1016368..dfacc63e 100644 --- a/aiu_fms_testing_utils/scripts/drive_paged_programs.py +++ b/aiu_fms_testing_utils/scripts/drive_paged_programs.py @@ -1302,11 +1302,11 @@ def generate_validation_info_and_test( ## Get completed metric read cpu_inference_metric_end = datetime.now(timezone.utc) end_cpu_inference_cpu, end_mem_inference_cpu = get_static_read(profile, cpu_inference_metric_end) - print_comp_resource_metrics(end_cpu_inference_cpu, end_mem_inference_cpu, "completed", "CPU Inference") + print_comp_resource_metrics(end_cpu_inference_cpu, end_mem_inference_cpu, "completed", "CPU inference") ## Get the peak usage during compilation peak_cpu_inference_cpu, peak_mem_inference_cpu = get_peak_read(profile, metric_start, cpu_inference_metric_end) - print_comp_resource_metrics(peak_cpu_inference_cpu, peak_mem_inference_cpu, "peak", "CPU Inference") + print_comp_resource_metrics(peak_cpu_inference_cpu, peak_mem_inference_cpu, "peak", "CPU inference") # Generate AIU validation info aiu_validation_info = generate_aiu_validation( @@ -1323,11 +1323,11 @@ def generate_validation_info_and_test( ## Get completed metric read aiu_inference_metric_end = datetime.now(timezone.utc) end_cpu_inference_aiu, end_mem_inference_aiu = get_static_read(profile, aiu_inference_metric_end) - print_comp_resource_metrics(end_cpu_inference_aiu, end_mem_inference_aiu, "completed", "AIU Inference") + print_comp_resource_metrics(end_cpu_inference_aiu, end_mem_inference_aiu, "completed", "AIU inference") ## Get the peak usage during compilation peak_cpu_inference_aiu, peak_mem_inference_aiu = get_peak_read(profile, metric_start, aiu_inference_metric_end) - print_comp_resource_metrics(peak_cpu_inference_aiu, peak_mem_inference_aiu, "peak", "AIU Inference") + print_comp_resource_metrics(peak_cpu_inference_aiu, peak_mem_inference_aiu, "peak", "AIU inference") if test_type == "metrics": failure_rate = evaluate_cross_entropy_metrics( @@ 
-1371,11 +1371,11 @@ def generate_validation_info_and_test( ## Get completed metric read aiu_inference_metric_end = datetime.now(timezone.utc) end_cpu_inference_aiu, end_mem_inference_aiu = get_static_read(profile, aiu_inference_metric_end) - print_comp_resource_metrics(end_cpu_inference_aiu, end_mem_inference_aiu, "completed", "AIU Inference") + print_comp_resource_metrics(end_cpu_inference_aiu, end_mem_inference_aiu, "completed", "AIU inference") ## Get the peak usage during compilation peak_cpu_inference_aiu, peak_mem_inference_aiu = get_peak_read(profile, metric_start, aiu_inference_metric_end) - print_comp_resource_metrics(peak_cpu_inference_aiu, peak_mem_inference_aiu, "peak", "AIU Inference") + print_comp_resource_metrics(peak_cpu_inference_aiu, peak_mem_inference_aiu, "peak", "AIU inference") if local_rank == 0: for sentence_idx, test_sentence in enumerate( From 363f97130a2e60546a1542c477afec3ba0431000 Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Tue, 24 Feb 2026 09:24:46 -0500 Subject: [PATCH 25/42] Updating README with instructions on Prometheus setup Signed-off-by: Christian Sarmiento --- README.md | 25 +++++++++++++++++++++++++ aiu_fms_testing_utils/scripts/README.md | 3 +-- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6d30905f..9e7c4410 100644 --- a/README.md +++ b/README.md @@ -139,6 +139,31 @@ export TORCH_SENDNN_LOG=CRITICAL export DT_DEEPRT_VERBOSE=-1 ``` +### Setup the environment for reporting resource usage + +When running `drive_paged_programs.py` you may want to see how much CPU and memory usage is +happening. This is done using Prometheus, thus if you are running in a container, you want to set up a simple Prometheus server to start collecting these metrics. To do this, do the following: + +1. Run `podman network create promnet` +2. Run `podman run -d --name node-exporter --network promnet quay.io/prometheus/node-exporter:latest` +3. 
Create a file called `prometheus.yml` that has the following contents: + +```yaml +global: +  scrape_interval: 5s + +scrape_configs: +  - job_name: "node" +    static_configs: +      - targets: ["node-exporter:9100"] +``` + +4. Run `podman run -d --name prometheus --network promnet -p 9091:9090   -v "$PWD/prometheus.yml:/etc/prometheus/prometheus.yml:Z"   quay.io/prometheus/prometheus:latest   --config.file=/etc/prometheus/prometheus.yml` +5. Check the status of the server by running `curl -s "http://localhost:9091/api/v1/targets" | python3 -m json.tool | grep health` and ensuring that "health" says "up". +6. When you are about to run DPP, run `export PROMETHEUS_URL="http://localhost:9091"` + +If you are running in OpenShift, you are going to want to set `PROMETHEUS_URL` to an OpenShift route that has Prometheus set up. Additionally, you are going to want to set `PROMETHEUS_API_KEY` to your OpenShift OAuth token if the Prometheus instance on the cluster is protected. You can get this token by running `oc whoami -t`. + ## How to use Foundation Model Stack (FMS) on AIU hardware The [scripts](https://github.com/foundation-model-stack/aiu-fms-testing-utils/tree/main/scripts) directory provides various scripts to use FMS on AIU hardware for many use cases. These scripts provide robust support for passing desired command line options for running encoder and decoder models along with other use cases. Refer to the documentation on [using different scripts](https://github.com/foundation-model-stack/aiu-fms-testing-utils/blob/main/scripts/README.md) for more details. diff --git a/aiu_fms_testing_utils/scripts/README.md b/aiu_fms_testing_utils/scripts/README.md index e2432bf1..a652fe6a 100644 --- a/aiu_fms_testing_utils/scripts/README.md +++ b/aiu_fms_testing_utils/scripts/README.md @@ -1,6 +1,6 @@ # Scripts for using Foundation Model Stack (FMS) on AIU hardware -The scripts provided here allow you to run FMS on AIU device for a variety of models. 
+The scripts provided here allow you to run FMS on AIU device for a variety of models. Let's look at some of the example usage below. @@ -75,4 +75,3 @@ python3 scripts/validation.py --architecture=hf_configured --model_path=/home/de ``` To run a logits-based validation, pass `--validation_level=1` to the validation script. This will check for the logits output to match at every step of the model through cross-entropy loss. You can control the acceptable threshold with `--logits_loss_threshold`. - From c8ec9ef89f1cc92c9c0d5fa661f515efdb75716e Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Tue, 24 Feb 2026 11:49:05 -0500 Subject: [PATCH 26/42] Cleanup Signed-off-by: Christian Sarmiento --- log.txt | 162 ------------------------------------------------- pyproject.toml | 2 +- 2 files changed, 1 insertion(+), 163 deletions(-) delete mode 100644 log.txt diff --git a/log.txt b/log.txt deleted file mode 100644 index 86b2accb..00000000 --- a/log.txt +++ /dev/null @@ -1,162 +0,0 @@ -(vllm) [root@zaiu pytorch_workspace]# PYTHONPATH=./aiu-fms-testing-utils:$PYTHONPATH torchrun --nproc-per-node=4 aiu-fms-testing-utils/scripts/drive_paged_programs.py --model_variant=/ibm-granite/granite-3.3-8b-instruct --program_criteria_json_path=z-spyre-runtimes/testing/acceptance/fms/criterion/program_criterion.json --dataset_type=sharegpt --skip_validation --programs "*:0,<8192" --prioritize_large_batch_sizes --enforce_homogeneous_prompt_programs --prefill_chunk_size=1024 --dataset_path=ShareGPT_V3_unfiltered_cleaned_split.json -[ 0/ 4]: When skipping validation, only test_type will be ignored -[ 3/ 4]: When skipping validation, only test_type will be ignored -[ 1/ 4]: When skipping validation, only test_type will be ignored -[ 2/ 4]: When skipping validation, only test_type will be ignored -[ 0/ 4]: Unknown model configuration. Using VLLM_DT_MAX_BATCH_TKV_LIMIT from environment: 16384 -[ 3/ 4]: Unknown model configuration. 
Using VLLM_DT_MAX_BATCH_TKV_LIMIT from environment: 16384 -[ 2/ 4]: Unknown model configuration. Using VLLM_DT_MAX_BATCH_TKV_LIMIT from environment: 16384 -[ 1/ 4]: Unknown model configuration. Using VLLM_DT_MAX_BATCH_TKV_LIMIT from environment: 16384 -WARNING:torchao.kernel.intmm:Warning: Detected no triton, on systems without Triton certain kernels will not work -WARNING:torchao.kernel.intmm:Warning: Detected no triton, on systems without Triton certain kernels will not work -WARNING:torchao.kernel.intmm:Warning: Detected no triton, on systems without Triton certain kernels will not work -WARNING:torchao.kernel.intmm:Warning: Detected no triton, on systems without Triton certain kernels will not work -[ 0/ 4]: AIU warmup -[ 3/ 4]: AIU warmup -[ 2/ 4]: AIU warmup -Traceback (most recent call last): -Traceback (most recent call last): - File "/opt/vllm/lib64/python3.12/site-packages/requests/models.py", line 976, in json - File "/opt/vllm/lib64/python3.12/site-packages/requests/models.py", line 976, in json -Traceback (most recent call last): - return complexjson.loads(self.text, **kwargs)return complexjson.loads(self.text, **kwargs) File "/opt/vllm/lib64/python3.12/site-packages/requests/models.py", line 976, in json - - - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -^^^^^ File "/usr/lib64/python3.12/json/__init__.py", line 346, in loads - - File "/usr/lib64/python3.12/json/__init__.py", line 346, in loads - return complexjson.loads(self.text, **kwargs) - return _default_decoder.decode(s) -return _default_decoder.decode(s) - ^^^^^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -^^^^^^ File "/usr/lib64/python3.12/json/__init__.py", line 346, in loads - -^ - File "/usr/lib64/python3.12/json/decoder.py", line 338, in decode - File "/usr/lib64/python3.12/json/decoder.py", line 338, in decode - obj, end = self.raw_decode(s, idx=_w(s, 0).end())obj, end = self.raw_decode(s, idx=_w(s, 0).end()) - - 
return _default_decoder.decode(s) - ^^ ^^ ^^ ^^ ^^ ^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -^^^^^^^ File "/usr/lib64/python3.12/json/decoder.py", line 338, in decode -^^^^ - - File "/usr/lib64/python3.12/json/decoder.py", line 356, in raw_decode - File "/usr/lib64/python3.12/json/decoder.py", line 356, in raw_decode - obj, end = self.raw_decode(s, idx=_w(s, 0).end()) - raise JSONDecodeError("Expecting value", s, err.value) from Noneraise JSONDecodeError("Expecting value", s, err.value) from None - - json.decoder json.decoder. .JSONDecodeError JSONDecodeError : : Expecting value: line 1 column 1 (char 0)Expecting value: line 1 column 1 (char 0)^ - -^^ - -^^During handling of the above exception, another exception occurred: -During handling of the above exception, another exception occurred: -^^ -^ -^Traceback (most recent call last): -^Traceback (most recent call last): -^^^^^ File "/pytorch_workspace/aiu-fms-testing-utils/scripts/drive_paged_programs.py", line 1507, in -^ File "/pytorch_workspace/aiu-fms-testing-utils/scripts/drive_paged_programs.py", line 1507, in -^^^^^^^^^^^^^^^^^^^^^^ - File "/usr/lib64/python3.12/json/decoder.py", line 356, in raw_decode - raise JSONDecodeError("Expecting value", s, err.value) from None -json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0) - -During handling of the above exception, another exception occurred: - -Traceback (most recent call last): - File "/pytorch_workspace/aiu-fms-testing-utils/scripts/drive_paged_programs.py", line 1507, in - main()main() - - File "/pytorch_workspace/aiu-fms-testing-utils/scripts/drive_paged_programs.py", line 1444, in main - File "/pytorch_workspace/aiu-fms-testing-utils/scripts/drive_paged_programs.py", line 1444, in main - main() - File "/pytorch_workspace/aiu-fms-testing-utils/scripts/drive_paged_programs.py", line 1444, in main - warmup_model(warmup_model( - - File 
"/pytorch_workspace/aiu-fms-testing-utils/aiu_fms_testing_utils/utils/__init__.py", line 103, in warmup_model - File "/pytorch_workspace/aiu-fms-testing-utils/aiu_fms_testing_utils/utils/__init__.py", line 103, in warmup_model - initial_cpu, initial_mem = get_static_read(p, metric_start)initial_cpu, initial_mem = get_static_read(p, metric_start) - - ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^warmup_model(^^ -^^^^^^^^^^^^^ File "/pytorch_workspace/aiu-fms-testing-utils/aiu_fms_testing_utils/utils/__init__.py", line 103, in warmup_model -^^^^^^^^^^^^^^^^ -^ - File "/pytorch_workspace/aiu-fms-testing-utils/aiu_fms_testing_utils/utils/resource_collection.py", line 92, in get_static_read - File "/pytorch_workspace/aiu-fms-testing-utils/aiu_fms_testing_utils/utils/resource_collection.py", line 92, in get_static_read - initial_cpu, initial_mem = get_static_read(p, metric_start) - cpu_response = client.custom_query(query=cpu_query, params={"time": recorded_time.timestamp()})cpu_response = client.custom_query(query=cpu_query, params={"time": recorded_time.timestamp()}) - - ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -^^^^^^ File "/pytorch_workspace/aiu-fms-testing-utils/aiu_fms_testing_utils/utils/resource_collection.py", line 92, in get_static_read -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^cpu_response = client.custom_query(query=cpu_query, params={"time": recorded_time.timestamp()})^^ -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^ ^^ ^^ ^^ ^^ ^^ ^^ ^^ ^ ^^ ^^ ^^ ^^ ^^ ^^ ^^ ^^ ^^ ^^^^^^^^^^^^^^^^^^^^^ -^^^^^^ File "/opt/vllm/lib64/python3.12/site-packages/prometheus_api_client/prometheus_connect.py", line 473, in custom_query -^^^^^^^ -^^^^ File "/opt/vllm/lib64/python3.12/site-packages/prometheus_api_client/prometheus_connect.py", line 473, in custom_query -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/opt/vllm/lib64/python3.12/site-packages/prometheus_api_client/prometheus_connect.py", line 473, in custom_query - data = 
response.json()["data"]["result"]data = response.json()["data"]["result"] - - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - File "/opt/vllm/lib64/python3.12/site-packages/requests/models.py", line 980, in json - File "/opt/vllm/lib64/python3.12/site-packages/requests/models.py", line 980, in json - data = response.json()["data"]["result"] - ^^^^^^^^^^^^^^^ - File "/opt/vllm/lib64/python3.12/site-packages/requests/models.py", line 980, in json - raise RequestsJSONDecodeError(e.msg, e.doc, e.pos) -raise RequestsJSONDecodeError(e.msg, e.doc, e.pos) -requests.exceptionsrequests.exceptions..JSONDecodeErrorJSONDecodeError: : Expecting value: line 1 column 1 (char 0)Expecting value: line 1 column 1 (char 0) - - raise RequestsJSONDecodeError(e.msg, e.doc, e.pos) -requests.exceptions.JSONDecodeError: Expecting value: line 1 column 1 (char 0) -W0223 22:12:21.239000 191 torch/distributed/elastic/multiprocessing/api.py:900] Sending process 194 closing signal SIGTERM -E0223 22:12:21.363000 191 torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 0 (pid: 193) of binary: /opt/vllm/bin/python3 -Traceback (most recent call last): - File "/opt/vllm/bin/torchrun", line 10, in - sys.exit(main()) - ^^^^^^ - File "/opt/vllm/lib64/python3.12/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper - return f(*args, **kwargs) - ^^^^^^^^^^^^^^^^^^ - File "/opt/vllm/lib64/python3.12/site-packages/torch/distributed/run.py", line 892, in main - run(args) - File "/opt/vllm/lib64/python3.12/site-packages/torch/distributed/run.py", line 883, in run - elastic_launch( - File "/opt/vllm/lib64/python3.12/site-packages/torch/distributed/launcher/api.py", line 139, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/opt/vllm/lib64/python3.12/site-packages/torch/distributed/launcher/api.py", line 270, in launch_agent - raise ChildFailedError( 
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -aiu-fms-testing-utils/scripts/drive_paged_programs.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2026-02-23_22:12:21 - host : host.containers.internal - rank : 2 (local_rank: 2) - exitcode : 1 (pid: 195) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2026-02-23_22:12:21 - host : host.containers.internal - rank : 3 (local_rank: 3) - exitcode : 1 (pid: 196) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2026-02-23_22:12:21 - host : host.containers.internal - rank : 0 (local_rank: 0) - exitcode : 1 (pid: 193) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ diff --git a/pyproject.toml b/pyproject.toml index 500ea2a9..add580ed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ dependencies = [ "sentencepiece>=0.2.0,<0.3.0", "numpy>=1.26.4,<2.3.0", "transformers>=4.45,<=4.58", -"torch==2.10.0", +"torch==2.7.1", ] # This section installs command line executables that point to specific scripts From 5bcddc302ec78f4bd124b08fc9d892da0eb759fd Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Tue, 24 Feb 2026 11:51:05 -0500 Subject: [PATCH 27/42] Revert "Cleanup" This reverts commit 963f39c83552fce4550e74b2e724641505eaf8dd. 
Signed-off-by: Christian Sarmiento --- log.txt | 162 +++++++++++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 2 +- 2 files changed, 163 insertions(+), 1 deletion(-) create mode 100644 log.txt diff --git a/log.txt b/log.txt new file mode 100644 index 00000000..86b2accb --- /dev/null +++ b/log.txt @@ -0,0 +1,162 @@ +(vllm) [root@zaiu pytorch_workspace]# PYTHONPATH=./aiu-fms-testing-utils:$PYTHONPATH torchrun --nproc-per-node=4 aiu-fms-testing-utils/scripts/drive_paged_programs.py --model_variant=/ibm-granite/granite-3.3-8b-instruct --program_criteria_json_path=z-spyre-runtimes/testing/acceptance/fms/criterion/program_criterion.json --dataset_type=sharegpt --skip_validation --programs "*:0,<8192" --prioritize_large_batch_sizes --enforce_homogeneous_prompt_programs --prefill_chunk_size=1024 --dataset_path=ShareGPT_V3_unfiltered_cleaned_split.json +[ 0/ 4]: When skipping validation, only test_type will be ignored +[ 3/ 4]: When skipping validation, only test_type will be ignored +[ 1/ 4]: When skipping validation, only test_type will be ignored +[ 2/ 4]: When skipping validation, only test_type will be ignored +[ 0/ 4]: Unknown model configuration. Using VLLM_DT_MAX_BATCH_TKV_LIMIT from environment: 16384 +[ 3/ 4]: Unknown model configuration. Using VLLM_DT_MAX_BATCH_TKV_LIMIT from environment: 16384 +[ 2/ 4]: Unknown model configuration. Using VLLM_DT_MAX_BATCH_TKV_LIMIT from environment: 16384 +[ 1/ 4]: Unknown model configuration. 
Using VLLM_DT_MAX_BATCH_TKV_LIMIT from environment: 16384 +WARNING:torchao.kernel.intmm:Warning: Detected no triton, on systems without Triton certain kernels will not work +WARNING:torchao.kernel.intmm:Warning: Detected no triton, on systems without Triton certain kernels will not work +WARNING:torchao.kernel.intmm:Warning: Detected no triton, on systems without Triton certain kernels will not work +WARNING:torchao.kernel.intmm:Warning: Detected no triton, on systems without Triton certain kernels will not work +[ 0/ 4]: AIU warmup +[ 3/ 4]: AIU warmup +[ 2/ 4]: AIU warmup +Traceback (most recent call last): +Traceback (most recent call last): + File "/opt/vllm/lib64/python3.12/site-packages/requests/models.py", line 976, in json + File "/opt/vllm/lib64/python3.12/site-packages/requests/models.py", line 976, in json +Traceback (most recent call last): + return complexjson.loads(self.text, **kwargs)return complexjson.loads(self.text, **kwargs) File "/opt/vllm/lib64/python3.12/site-packages/requests/models.py", line 976, in json + + + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^ File "/usr/lib64/python3.12/json/__init__.py", line 346, in loads + + File "/usr/lib64/python3.12/json/__init__.py", line 346, in loads + return complexjson.loads(self.text, **kwargs) + return _default_decoder.decode(s) +return _default_decoder.decode(s) + ^^^^^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^ File "/usr/lib64/python3.12/json/__init__.py", line 346, in loads + +^ + File "/usr/lib64/python3.12/json/decoder.py", line 338, in decode + File "/usr/lib64/python3.12/json/decoder.py", line 338, in decode + obj, end = self.raw_decode(s, idx=_w(s, 0).end())obj, end = self.raw_decode(s, idx=_w(s, 0).end()) + + return _default_decoder.decode(s) + ^^ ^^ ^^ ^^ ^^ ^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^ File "/usr/lib64/python3.12/json/decoder.py", line 
338, in decode +^^^^ + + File "/usr/lib64/python3.12/json/decoder.py", line 356, in raw_decode + File "/usr/lib64/python3.12/json/decoder.py", line 356, in raw_decode + obj, end = self.raw_decode(s, idx=_w(s, 0).end()) + raise JSONDecodeError("Expecting value", s, err.value) from Noneraise JSONDecodeError("Expecting value", s, err.value) from None + + json.decoder json.decoder. .JSONDecodeError JSONDecodeError : : Expecting value: line 1 column 1 (char 0)Expecting value: line 1 column 1 (char 0)^ + +^^ + +^^During handling of the above exception, another exception occurred: +During handling of the above exception, another exception occurred: +^^ +^ +^Traceback (most recent call last): +^Traceback (most recent call last): +^^^^^ File "/pytorch_workspace/aiu-fms-testing-utils/scripts/drive_paged_programs.py", line 1507, in +^ File "/pytorch_workspace/aiu-fms-testing-utils/scripts/drive_paged_programs.py", line 1507, in +^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib64/python3.12/json/decoder.py", line 356, in raw_decode + raise JSONDecodeError("Expecting value", s, err.value) from None +json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0) + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "/pytorch_workspace/aiu-fms-testing-utils/scripts/drive_paged_programs.py", line 1507, in + main()main() + + File "/pytorch_workspace/aiu-fms-testing-utils/scripts/drive_paged_programs.py", line 1444, in main + File "/pytorch_workspace/aiu-fms-testing-utils/scripts/drive_paged_programs.py", line 1444, in main + main() + File "/pytorch_workspace/aiu-fms-testing-utils/scripts/drive_paged_programs.py", line 1444, in main + warmup_model(warmup_model( + + File "/pytorch_workspace/aiu-fms-testing-utils/aiu_fms_testing_utils/utils/__init__.py", line 103, in warmup_model + File "/pytorch_workspace/aiu-fms-testing-utils/aiu_fms_testing_utils/utils/__init__.py", line 103, in warmup_model + initial_cpu, initial_mem = 
get_static_read(p, metric_start)initial_cpu, initial_mem = get_static_read(p, metric_start) + + ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^warmup_model(^^ +^^^^^^^^^^^^^ File "/pytorch_workspace/aiu-fms-testing-utils/aiu_fms_testing_utils/utils/__init__.py", line 103, in warmup_model +^^^^^^^^^^^^^^^^ +^ + File "/pytorch_workspace/aiu-fms-testing-utils/aiu_fms_testing_utils/utils/resource_collection.py", line 92, in get_static_read + File "/pytorch_workspace/aiu-fms-testing-utils/aiu_fms_testing_utils/utils/resource_collection.py", line 92, in get_static_read + initial_cpu, initial_mem = get_static_read(p, metric_start) + cpu_response = client.custom_query(query=cpu_query, params={"time": recorded_time.timestamp()})cpu_response = client.custom_query(query=cpu_query, params={"time": recorded_time.timestamp()}) + + ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^ File "/pytorch_workspace/aiu-fms-testing-utils/aiu_fms_testing_utils/utils/resource_collection.py", line 92, in get_static_read +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^cpu_response = client.custom_query(query=cpu_query, params={"time": recorded_time.timestamp()})^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^ ^^ ^^ ^^ ^^ ^^ ^^ ^^ ^ ^^ ^^ ^^ ^^ ^^ ^^ ^^ ^^ ^^ ^^^^^^^^^^^^^^^^^^^^^ +^^^^^^ File "/opt/vllm/lib64/python3.12/site-packages/prometheus_api_client/prometheus_connect.py", line 473, in custom_query +^^^^^^^ +^^^^ File "/opt/vllm/lib64/python3.12/site-packages/prometheus_api_client/prometheus_connect.py", line 473, in custom_query +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/opt/vllm/lib64/python3.12/site-packages/prometheus_api_client/prometheus_connect.py", line 473, in custom_query + data = response.json()["data"]["result"]data = response.json()["data"]["result"] + + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + File "/opt/vllm/lib64/python3.12/site-packages/requests/models.py", line 980, in json + File 
"/opt/vllm/lib64/python3.12/site-packages/requests/models.py", line 980, in json + data = response.json()["data"]["result"] + ^^^^^^^^^^^^^^^ + File "/opt/vllm/lib64/python3.12/site-packages/requests/models.py", line 980, in json + raise RequestsJSONDecodeError(e.msg, e.doc, e.pos) +raise RequestsJSONDecodeError(e.msg, e.doc, e.pos) +requests.exceptionsrequests.exceptions..JSONDecodeErrorJSONDecodeError: : Expecting value: line 1 column 1 (char 0)Expecting value: line 1 column 1 (char 0) + + raise RequestsJSONDecodeError(e.msg, e.doc, e.pos) +requests.exceptions.JSONDecodeError: Expecting value: line 1 column 1 (char 0) +W0223 22:12:21.239000 191 torch/distributed/elastic/multiprocessing/api.py:900] Sending process 194 closing signal SIGTERM +E0223 22:12:21.363000 191 torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 0 (pid: 193) of binary: /opt/vllm/bin/python3 +Traceback (most recent call last): + File "/opt/vllm/bin/torchrun", line 10, in + sys.exit(main()) + ^^^^^^ + File "/opt/vllm/lib64/python3.12/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper + return f(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^ + File "/opt/vllm/lib64/python3.12/site-packages/torch/distributed/run.py", line 892, in main + run(args) + File "/opt/vllm/lib64/python3.12/site-packages/torch/distributed/run.py", line 883, in run + elastic_launch( + File "/opt/vllm/lib64/python3.12/site-packages/torch/distributed/launcher/api.py", line 139, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/opt/vllm/lib64/python3.12/site-packages/torch/distributed/launcher/api.py", line 270, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +aiu-fms-testing-utils/scripts/drive_paged_programs.py FAILED 
+------------------------------------------------------------ +Failures: +[1]: + time : 2026-02-23_22:12:21 + host : host.containers.internal + rank : 2 (local_rank: 2) + exitcode : 1 (pid: 195) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[2]: + time : 2026-02-23_22:12:21 + host : host.containers.internal + rank : 3 (local_rank: 3) + exitcode : 1 (pid: 196) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2026-02-23_22:12:21 + host : host.containers.internal + rank : 0 (local_rank: 0) + exitcode : 1 (pid: 193) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +============================================================ diff --git a/pyproject.toml b/pyproject.toml index add580ed..500ea2a9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ dependencies = [ "sentencepiece>=0.2.0,<0.3.0", "numpy>=1.26.4,<2.3.0", "transformers>=4.45,<=4.58", -"torch==2.7.1", +"torch==2.10.0", ] # This section installs command line executables that point to specific scripts From 2f12a97ec1f870df2c42ec6f5c720f8b61107324 Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Tue, 24 Feb 2026 16:50:03 -0500 Subject: [PATCH 28/42] Delete log.txt Signed-off-by: Christian Sarmiento --- log.txt | 162 -------------------------------------------------------- 1 file changed, 162 deletions(-) delete mode 100644 log.txt diff --git a/log.txt b/log.txt deleted file mode 100644 index 86b2accb..00000000 --- a/log.txt +++ /dev/null @@ -1,162 +0,0 @@ -(vllm) [root@zaiu pytorch_workspace]# PYTHONPATH=./aiu-fms-testing-utils:$PYTHONPATH torchrun --nproc-per-node=4 aiu-fms-testing-utils/scripts/drive_paged_programs.py --model_variant=/ibm-granite/granite-3.3-8b-instruct 
--program_criteria_json_path=z-spyre-runtimes/testing/acceptance/fms/criterion/program_criterion.json --dataset_type=sharegpt --skip_validation --programs "*:0,<8192" --prioritize_large_batch_sizes --enforce_homogeneous_prompt_programs --prefill_chunk_size=1024 --dataset_path=ShareGPT_V3_unfiltered_cleaned_split.json -[ 0/ 4]: When skipping validation, only test_type will be ignored -[ 3/ 4]: When skipping validation, only test_type will be ignored -[ 1/ 4]: When skipping validation, only test_type will be ignored -[ 2/ 4]: When skipping validation, only test_type will be ignored -[ 0/ 4]: Unknown model configuration. Using VLLM_DT_MAX_BATCH_TKV_LIMIT from environment: 16384 -[ 3/ 4]: Unknown model configuration. Using VLLM_DT_MAX_BATCH_TKV_LIMIT from environment: 16384 -[ 2/ 4]: Unknown model configuration. Using VLLM_DT_MAX_BATCH_TKV_LIMIT from environment: 16384 -[ 1/ 4]: Unknown model configuration. Using VLLM_DT_MAX_BATCH_TKV_LIMIT from environment: 16384 -WARNING:torchao.kernel.intmm:Warning: Detected no triton, on systems without Triton certain kernels will not work -WARNING:torchao.kernel.intmm:Warning: Detected no triton, on systems without Triton certain kernels will not work -WARNING:torchao.kernel.intmm:Warning: Detected no triton, on systems without Triton certain kernels will not work -WARNING:torchao.kernel.intmm:Warning: Detected no triton, on systems without Triton certain kernels will not work -[ 0/ 4]: AIU warmup -[ 3/ 4]: AIU warmup -[ 2/ 4]: AIU warmup -Traceback (most recent call last): -Traceback (most recent call last): - File "/opt/vllm/lib64/python3.12/site-packages/requests/models.py", line 976, in json - File "/opt/vllm/lib64/python3.12/site-packages/requests/models.py", line 976, in json -Traceback (most recent call last): - return complexjson.loads(self.text, **kwargs)return complexjson.loads(self.text, **kwargs) File "/opt/vllm/lib64/python3.12/site-packages/requests/models.py", line 976, in json - - - 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -^^^^^ File "/usr/lib64/python3.12/json/__init__.py", line 346, in loads - - File "/usr/lib64/python3.12/json/__init__.py", line 346, in loads - return complexjson.loads(self.text, **kwargs) - return _default_decoder.decode(s) -return _default_decoder.decode(s) - ^^^^^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -^^^^^^ File "/usr/lib64/python3.12/json/__init__.py", line 346, in loads - -^ - File "/usr/lib64/python3.12/json/decoder.py", line 338, in decode - File "/usr/lib64/python3.12/json/decoder.py", line 338, in decode - obj, end = self.raw_decode(s, idx=_w(s, 0).end())obj, end = self.raw_decode(s, idx=_w(s, 0).end()) - - return _default_decoder.decode(s) - ^^ ^^ ^^ ^^ ^^ ^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -^^^^^^^ File "/usr/lib64/python3.12/json/decoder.py", line 338, in decode -^^^^ - - File "/usr/lib64/python3.12/json/decoder.py", line 356, in raw_decode - File "/usr/lib64/python3.12/json/decoder.py", line 356, in raw_decode - obj, end = self.raw_decode(s, idx=_w(s, 0).end()) - raise JSONDecodeError("Expecting value", s, err.value) from Noneraise JSONDecodeError("Expecting value", s, err.value) from None - - json.decoder json.decoder. 
.JSONDecodeError JSONDecodeError : : Expecting value: line 1 column 1 (char 0)Expecting value: line 1 column 1 (char 0)^ - -^^ - -^^During handling of the above exception, another exception occurred: -During handling of the above exception, another exception occurred: -^^ -^ -^Traceback (most recent call last): -^Traceback (most recent call last): -^^^^^ File "/pytorch_workspace/aiu-fms-testing-utils/scripts/drive_paged_programs.py", line 1507, in -^ File "/pytorch_workspace/aiu-fms-testing-utils/scripts/drive_paged_programs.py", line 1507, in -^^^^^^^^^^^^^^^^^^^^^^ - File "/usr/lib64/python3.12/json/decoder.py", line 356, in raw_decode - raise JSONDecodeError("Expecting value", s, err.value) from None -json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0) - -During handling of the above exception, another exception occurred: - -Traceback (most recent call last): - File "/pytorch_workspace/aiu-fms-testing-utils/scripts/drive_paged_programs.py", line 1507, in - main()main() - - File "/pytorch_workspace/aiu-fms-testing-utils/scripts/drive_paged_programs.py", line 1444, in main - File "/pytorch_workspace/aiu-fms-testing-utils/scripts/drive_paged_programs.py", line 1444, in main - main() - File "/pytorch_workspace/aiu-fms-testing-utils/scripts/drive_paged_programs.py", line 1444, in main - warmup_model(warmup_model( - - File "/pytorch_workspace/aiu-fms-testing-utils/aiu_fms_testing_utils/utils/__init__.py", line 103, in warmup_model - File "/pytorch_workspace/aiu-fms-testing-utils/aiu_fms_testing_utils/utils/__init__.py", line 103, in warmup_model - initial_cpu, initial_mem = get_static_read(p, metric_start)initial_cpu, initial_mem = get_static_read(p, metric_start) - - ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^warmup_model(^^ -^^^^^^^^^^^^^ File "/pytorch_workspace/aiu-fms-testing-utils/aiu_fms_testing_utils/utils/__init__.py", line 103, in warmup_model -^^^^^^^^^^^^^^^^ -^ - File 
"/pytorch_workspace/aiu-fms-testing-utils/aiu_fms_testing_utils/utils/resource_collection.py", line 92, in get_static_read - File "/pytorch_workspace/aiu-fms-testing-utils/aiu_fms_testing_utils/utils/resource_collection.py", line 92, in get_static_read - initial_cpu, initial_mem = get_static_read(p, metric_start) - cpu_response = client.custom_query(query=cpu_query, params={"time": recorded_time.timestamp()})cpu_response = client.custom_query(query=cpu_query, params={"time": recorded_time.timestamp()}) - - ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -^^^^^^ File "/pytorch_workspace/aiu-fms-testing-utils/aiu_fms_testing_utils/utils/resource_collection.py", line 92, in get_static_read -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^cpu_response = client.custom_query(query=cpu_query, params={"time": recorded_time.timestamp()})^^ -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^ ^^ ^^ ^^ ^^ ^^ ^^ ^^ ^ ^^ ^^ ^^ ^^ ^^ ^^ ^^ ^^ ^^ ^^^^^^^^^^^^^^^^^^^^^ -^^^^^^ File "/opt/vllm/lib64/python3.12/site-packages/prometheus_api_client/prometheus_connect.py", line 473, in custom_query -^^^^^^^ -^^^^ File "/opt/vllm/lib64/python3.12/site-packages/prometheus_api_client/prometheus_connect.py", line 473, in custom_query -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/opt/vllm/lib64/python3.12/site-packages/prometheus_api_client/prometheus_connect.py", line 473, in custom_query - data = response.json()["data"]["result"]data = response.json()["data"]["result"] - - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - File "/opt/vllm/lib64/python3.12/site-packages/requests/models.py", line 980, in json - File "/opt/vllm/lib64/python3.12/site-packages/requests/models.py", line 980, in json - data = response.json()["data"]["result"] - ^^^^^^^^^^^^^^^ - File "/opt/vllm/lib64/python3.12/site-packages/requests/models.py", line 980, in json - raise RequestsJSONDecodeError(e.msg, e.doc, e.pos) -raise RequestsJSONDecodeError(e.msg, e.doc, e.pos) 
-requests.exceptionsrequests.exceptions..JSONDecodeErrorJSONDecodeError: : Expecting value: line 1 column 1 (char 0)Expecting value: line 1 column 1 (char 0) - - raise RequestsJSONDecodeError(e.msg, e.doc, e.pos) -requests.exceptions.JSONDecodeError: Expecting value: line 1 column 1 (char 0) -W0223 22:12:21.239000 191 torch/distributed/elastic/multiprocessing/api.py:900] Sending process 194 closing signal SIGTERM -E0223 22:12:21.363000 191 torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 0 (pid: 193) of binary: /opt/vllm/bin/python3 -Traceback (most recent call last): - File "/opt/vllm/bin/torchrun", line 10, in - sys.exit(main()) - ^^^^^^ - File "/opt/vllm/lib64/python3.12/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper - return f(*args, **kwargs) - ^^^^^^^^^^^^^^^^^^ - File "/opt/vllm/lib64/python3.12/site-packages/torch/distributed/run.py", line 892, in main - run(args) - File "/opt/vllm/lib64/python3.12/site-packages/torch/distributed/run.py", line 883, in run - elastic_launch( - File "/opt/vllm/lib64/python3.12/site-packages/torch/distributed/launcher/api.py", line 139, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/opt/vllm/lib64/python3.12/site-packages/torch/distributed/launcher/api.py", line 270, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -aiu-fms-testing-utils/scripts/drive_paged_programs.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2026-02-23_22:12:21 - host : host.containers.internal - rank : 2 (local_rank: 2) - exitcode : 1 (pid: 195) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2026-02-23_22:12:21 - host : 
host.containers.internal - rank : 3 (local_rank: 3) - exitcode : 1 (pid: 196) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2026-02-23_22:12:21 - host : host.containers.internal - rank : 0 (local_rank: 0) - exitcode : 1 (pid: 193) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ From b5f0d305fd016b0bf8af7e8b826ced865ed602b1 Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Tue, 24 Feb 2026 16:53:12 -0500 Subject: [PATCH 29/42] removing package Signed-off-by: Christian Sarmiento --- aiu_fms_testing_utils/scripts/drive_paged_programs.py | 3 +-- aiu_fms_testing_utils/utils/__init__.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/aiu_fms_testing_utils/scripts/drive_paged_programs.py b/aiu_fms_testing_utils/scripts/drive_paged_programs.py index dfacc63e..d6425536 100644 --- a/aiu_fms_testing_utils/scripts/drive_paged_programs.py +++ b/aiu_fms_testing_utils/scripts/drive_paged_programs.py @@ -48,7 +48,6 @@ from aiu_fms_testing_utils.utils.resource_collection import ( instantiate_prometheus, get_static_read, get_peak_read ) -from prometheus_api_client import PrometheusConnect # Constants PAD_MULTIPLE = 64 @@ -1256,7 +1255,7 @@ def generate_validation_info_and_test( timing: str, prefill_chunk_size: int, model_variant: str, - profile: PrometheusConnect | None + profile ) -> list[Any]: """Generates tokens using AIU and CPU models and validates the results. 
diff --git a/aiu_fms_testing_utils/utils/__init__.py b/aiu_fms_testing_utils/utils/__init__.py index b3859175..18cc8850 100644 --- a/aiu_fms_testing_utils/utils/__init__.py +++ b/aiu_fms_testing_utils/utils/__init__.py @@ -92,7 +92,7 @@ def warmup_model( use_cache: bool = True, stagger_update_lazyhandle: int = 0, prefill_chunk_size: int = 0, - profile: PrometheusConnect | None = None, + profile = None, **extra_kwargs, ): import torch_sendnn From 661871f2975b35b22a3d748bb63d4bcfb645a807 Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Tue, 24 Feb 2026 17:06:36 -0500 Subject: [PATCH 30/42] removing unneeded imports Signed-off-by: Christian Sarmiento --- aiu_fms_testing_utils/utils/__init__.py | 1 - aiu_fms_testing_utils/utils/resource_collection.py | 5 ++++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/aiu_fms_testing_utils/utils/__init__.py b/aiu_fms_testing_utils/utils/__init__.py index 18cc8850..34cee9f4 100644 --- a/aiu_fms_testing_utils/utils/__init__.py +++ b/aiu_fms_testing_utils/utils/__init__.py @@ -23,7 +23,6 @@ import math import contextlib import warnings -from prometheus_api_client import PrometheusConnect @contextlib.contextmanager diff --git a/aiu_fms_testing_utils/utils/resource_collection.py b/aiu_fms_testing_utils/utils/resource_collection.py index b15f3b6e..4215beea 100644 --- a/aiu_fms_testing_utils/utils/resource_collection.py +++ b/aiu_fms_testing_utils/utils/resource_collection.py @@ -1,7 +1,10 @@ # Imports import os -from prometheus_api_client import PrometheusConnect +try: + from prometheus_api_client import PrometheusConnect +except Exception as e: + print("WARNING: Cannot import `prometheus_api_client`. 
Make sure the package is installed if you are trying to report resource utilization.") def instantiate_prometheus(): From d8635b55420eb36b39db078e82283790a41e8c7d Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Tue, 24 Feb 2026 21:22:19 -0500 Subject: [PATCH 31/42] Making a function for repeated stage prints Signed-off-by: Christian Sarmiento --- .../scripts/drive_paged_programs.py | 41 +++---------------- .../utils/resource_collection.py | 19 +++++++++ 2 files changed, 25 insertions(+), 35 deletions(-) diff --git a/aiu_fms_testing_utils/scripts/drive_paged_programs.py b/aiu_fms_testing_utils/scripts/drive_paged_programs.py index d6425536..fe5ff409 100644 --- a/aiu_fms_testing_utils/scripts/drive_paged_programs.py +++ b/aiu_fms_testing_utils/scripts/drive_paged_programs.py @@ -36,8 +36,7 @@ sample_rag_factoid_requests, sample_sharegpt_requests, stagger_region, - warmup_model, - print_comp_resource_metrics + warmup_model ) from aiu_fms_testing_utils.utils.aiu_setup import aiu_dist_setup, dprint, local_rank from aiu_fms_testing_utils.utils.paged import ( @@ -45,9 +44,7 @@ get_programs_prompts, ) from aiu_fms_testing_utils.testing.utils import format_kwargs_to_string -from aiu_fms_testing_utils.utils.resource_collection import ( - instantiate_prometheus, get_static_read, get_peak_read -) +from aiu_fms_testing_utils.utils.resource_collection import instantiate_prometheus # Constants PAD_MULTIPLE = 64 @@ -1277,9 +1274,7 @@ def generate_validation_info_and_test( ) # Start inference - metric_start = datetime.now(timezone.utc) - initial_cpu, initial_mem = get_static_read(profile, metric_start) - print_comp_resource_metrics(initial_cpu, initial_mem, "started", "Inference") + metric_start = print_step(profile, "started", "Inference") if not skip_validation: # Generate or load CPU validation info @@ -1297,15 +1292,7 @@ def generate_validation_info_and_test( cpu_dtype=env_config.cpu_dtype, tokenizer=tokenizer, ) - - ## Get completed metric read - 
cpu_inference_metric_end = datetime.now(timezone.utc) - end_cpu_inference_cpu, end_mem_inference_cpu = get_static_read(profile, cpu_inference_metric_end) - print_comp_resource_metrics(end_cpu_inference_cpu, end_mem_inference_cpu, "completed", "CPU inference") - - ## Get the peak usage during compilation - peak_cpu_inference_cpu, peak_mem_inference_cpu = get_peak_read(profile, metric_start, cpu_inference_metric_end) - print_comp_resource_metrics(peak_cpu_inference_cpu, peak_mem_inference_cpu, "peak", "CPU inference") + print_step(profile, "completed", "CPU inference", metric_start) # Generate AIU validation info aiu_validation_info = generate_aiu_validation( @@ -1318,15 +1305,7 @@ def generate_validation_info_and_test( cpu_validation_info=cpu_validation_info, extra_kwargs=valid_prompt.extra_kwargs, ) - - ## Get completed metric read - aiu_inference_metric_end = datetime.now(timezone.utc) - end_cpu_inference_aiu, end_mem_inference_aiu = get_static_read(profile, aiu_inference_metric_end) - print_comp_resource_metrics(end_cpu_inference_aiu, end_mem_inference_aiu, "completed", "AIU inference") - - ## Get the peak usage during compilation - peak_cpu_inference_aiu, peak_mem_inference_aiu = get_peak_read(profile, metric_start, aiu_inference_metric_end) - print_comp_resource_metrics(peak_cpu_inference_aiu, peak_mem_inference_aiu, "peak", "AIU inference") + print_step(profile, "completed", "AIU inference", metric_start) if test_type == "metrics": failure_rate = evaluate_cross_entropy_metrics( @@ -1366,15 +1345,7 @@ def generate_validation_info_and_test( cpu_validation_info=None, extra_kwargs=valid_prompt.extra_kwargs, ) - - ## Get completed metric read - aiu_inference_metric_end = datetime.now(timezone.utc) - end_cpu_inference_aiu, end_mem_inference_aiu = get_static_read(profile, aiu_inference_metric_end) - print_comp_resource_metrics(end_cpu_inference_aiu, end_mem_inference_aiu, "completed", "AIU inference") - - ## Get the peak usage during compilation - 
peak_cpu_inference_aiu, peak_mem_inference_aiu = get_peak_read(profile, metric_start, aiu_inference_metric_end) - print_comp_resource_metrics(peak_cpu_inference_aiu, peak_mem_inference_aiu, "peak", "AIU inference") + print_step(profile, "completed", "AIU inference", metric_start) if local_rank == 0: for sentence_idx, test_sentence in enumerate( diff --git a/aiu_fms_testing_utils/utils/resource_collection.py b/aiu_fms_testing_utils/utils/resource_collection.py index 4215beea..621578fa 100644 --- a/aiu_fms_testing_utils/utils/resource_collection.py +++ b/aiu_fms_testing_utils/utils/resource_collection.py @@ -1,6 +1,8 @@ # Imports import os +from datetime import datetime, timezone +from aiu_fms_testing_utils.utils import print_comp_resource_metrics try: from prometheus_api_client import PrometheusConnect except Exception as e: @@ -138,3 +140,20 @@ def get_peak_read(client, start, end): peak_mem_value = get_value(mem_response, "range") return peak_cpu_value, peak_mem_value + + +def print_step(p, step, stage, start_time=None): + """ + """ + + ## Get metric read + timestep = datetime.now(timezone.utc) + cpu_usage, mem_usage = get_static_read(p, timestep) + print_comp_resource_metrics(cpu_usage, mem_usage, step, stage) + + ## Get and print the peak usage + if start_time is not None: + peak_cpu_inference_cpu, peak_mem_inference_cpu = get_peak_read(p, start_time, timestep) + print_comp_resource_metrics(peak_cpu_inference_cpu, peak_mem_inference_cpu, "peak", stage) + + return timestep From 6e7ea31830d2a79662ef14511eaaa3126f0e3fe7 Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Tue, 24 Feb 2026 21:44:43 -0500 Subject: [PATCH 32/42] Using function in compilation stage Signed-off-by: Christian Sarmiento --- .../scripts/drive_paged_programs.py | 5 +- aiu_fms_testing_utils/utils/__init__.py | 49 +--------------- .../utils/resource_collection.py | 57 +++++++++++++++++-- 3 files changed, 58 insertions(+), 53 deletions(-) diff --git 
a/aiu_fms_testing_utils/scripts/drive_paged_programs.py b/aiu_fms_testing_utils/scripts/drive_paged_programs.py index fe5ff409..c39b9f42 100644 --- a/aiu_fms_testing_utils/scripts/drive_paged_programs.py +++ b/aiu_fms_testing_utils/scripts/drive_paged_programs.py @@ -44,8 +44,9 @@ get_programs_prompts, ) from aiu_fms_testing_utils.testing.utils import format_kwargs_to_string -from aiu_fms_testing_utils.utils.resource_collection import instantiate_prometheus - +from aiu_fms_testing_utils.utils.resource_collection import ( + instantiate_prometheus, print_step +) # Constants PAD_MULTIPLE = 64 diff --git a/aiu_fms_testing_utils/utils/__init__.py b/aiu_fms_testing_utils/utils/__init__.py index 34cee9f4..51643bc0 100644 --- a/aiu_fms_testing_utils/utils/__init__.py +++ b/aiu_fms_testing_utils/utils/__init__.py @@ -14,9 +14,7 @@ from aiu_fms_testing_utils.utils.aiu_setup import dprint, rank, world_size from transformers.tokenization_utils_base import PreTrainedTokenizerBase from aiu_fms_testing_utils.testing.utils import format_kwargs_to_string -from aiu_fms_testing_utils.utils.resource_collection import ( - get_static_read, get_peak_read -) +from aiu_fms_testing_utils.utils.resource_collection import print_step from fms.utils.generation import pad_input_ids import torch import torch.nn as nn @@ -50,38 +48,6 @@ def stagger_region(limit: int): torch.distributed.barrier() dprint("Stagger: All Complete") -def timestamp_print(given_string): - """ - Helper method that will add a timestamp before the given string that needs to be - printed. - - Args: - - given_string: the string that is to be printed with the timestamp. - """ - - timestamp = datetime.now().strftime("%Y-%m-%d:%H:%M:%S") - print(f"[{timestamp}] {given_string}") - -def print_comp_resource_metrics(cpu_val, mem_val, stage, step): - """ - Helper method that will do a timestamp print for a specific step to report resource - usage. - - Args: - - cpu_val: the value for CPU usage as a percentage that we want to print. 
- - mem_val: the value for memory usage in gigabytes we want to print. - - stage: The stage of the step we are in, either "peak" or "started". - - step: The step that we performing in the script, either "compilation" or "inference". - """ - - if stage != "peak": - if cpu_val is None or mem_val is None: - timestamp_print(f"{step} {stage}") - else: - timestamp_print(f"{step} {stage} - CPU: {cpu_val:.2f}%, Memory: {mem_val:.2f} GB") - - elif cpu_val is not None and mem_val is not None: - dprint(f"Peak Resource Utilization - CPU: {cpu_val:.2f}%, Memory: {mem_val:.2f} GB") def warmup_model( model: nn.Module, @@ -114,9 +80,7 @@ def warmup_model( pt_compile_model_time = time.time() ## Report on initial resource usage - metric_start = datetime.now(timezone.utc) - initial_cpu, initial_mem = get_static_read(profile, metric_start) - print_comp_resource_metrics(initial_cpu, initial_mem, "started", "Compilation") + metric_start = print_step(profile, "started", "Compilation") # adjust inputs depending on attn_type and dynamic shapes _warmup_input_ids = input_ids @@ -148,14 +112,7 @@ def warmup_model( pt_compile_model_time = time.time() - pt_compile_model_time # Get completed metric read - metric_end = datetime.now(timezone.utc) - end_cpu, end_mem = get_static_read(profile, metric_end) - print_comp_resource_metrics(end_cpu, end_mem, "completed", "Compilation") - - # Get the peak usage during compilation - peak_cpu, peak_mem = get_peak_read(profile, metric_start, metric_end) - print_comp_resource_metrics(peak_cpu, peak_mem, "peak", "Compilation") - + print_step(profile, "completed", "Compilation", metric_start) dprint(f"PT compile complete, took {pt_compile_model_time:.3f}s") diff --git a/aiu_fms_testing_utils/utils/resource_collection.py b/aiu_fms_testing_utils/utils/resource_collection.py index 621578fa..a775bfe2 100644 --- a/aiu_fms_testing_utils/utils/resource_collection.py +++ b/aiu_fms_testing_utils/utils/resource_collection.py @@ -2,7 +2,7 @@ import os from datetime import 
datetime, timezone -from aiu_fms_testing_utils.utils import print_comp_resource_metrics +from aiu_fms_testing_utils.utils.aiu_setup import dprint try: from prometheus_api_client import PrometheusConnect except Exception as e: @@ -142,18 +142,65 @@ def get_peak_read(client, start, end): return peak_cpu_value, peak_mem_value +def timestamp_print(given_string): + """ + Helper method that will add a timestamp before the given string that needs to be + printed. + + Args: + - given_string: the string that is to be printed with the timestamp. + """ + + timestamp = datetime.now().strftime("%Y-%m-%d:%H:%M:%S") + print(f"[{timestamp}] {given_string}") + + +def print_comp_resource_metrics(cpu_val, mem_val, stage, step): + """ + Helper method that will do a timestamp print for a specific step to report resource + usage. + + Args: + - cpu_val: the value for CPU usage as a percentage that we want to print. + - mem_val: the value for memory usage in gigabytes we want to print. + - stage: The stage of the step we are in, either "peak" or "started". + - step: The step that we performing in the script, either "compilation" or "inference". + """ + + if stage != "peak": + if cpu_val is None or mem_val is None: + timestamp_print(f"{step} {stage}") + else: + timestamp_print(f"{step} {stage} - CPU: {cpu_val:.2f}%, Memory: {mem_val:.2f} GB") + + elif cpu_val is not None and mem_val is not None: + dprint(f"Peak Resource Utilization - CPU: {cpu_val:.2f}%, Memory: {mem_val:.2f} GB") + + def print_step(p, step, stage, start_time=None): """ + Print function to print out when a specific stage starts and ends, + as well as reporting resource usage if enabled. + + Args: + - p: the Prometheus profile client to resource utilization collection. + - step: string denoting what step we are at ("inference" or "compilation"). + - stage: string denoting what stage of the step we are at ("started" or "completed"). + - start_time: datetime object that denotes when the step started (optional). 
+ + Returns: + - recorded_time: the time that was recorded when getting a metric read. Returned for + scenarios where we need to use the recorded time in a later step (i.e completed stages). """ ## Get metric read - timestep = datetime.now(timezone.utc) - cpu_usage, mem_usage = get_static_read(p, timestep) + recorded_time = datetime.now(timezone.utc) + cpu_usage, mem_usage = get_static_read(p, recorded_time) print_comp_resource_metrics(cpu_usage, mem_usage, step, stage) ## Get and print the peak usage if start_time is not None: - peak_cpu_inference_cpu, peak_mem_inference_cpu = get_peak_read(p, start_time, timestep) + peak_cpu_inference_cpu, peak_mem_inference_cpu = get_peak_read(p, start_time, recorded_time) print_comp_resource_metrics(peak_cpu_inference_cpu, peak_mem_inference_cpu, "peak", stage) - return timestep + return recorded_time From d0c5494837fdffc794ecb45c7545b97a6a85bf83 Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Tue, 24 Feb 2026 22:35:33 -0500 Subject: [PATCH 33/42] fixing linting errors Signed-off-by: Christian Sarmiento --- aiu_fms_testing_utils/scripts/drive_paged_programs.py | 2 +- aiu_fms_testing_utils/utils/__init__.py | 2 -- aiu_fms_testing_utils/utils/resource_collection.py | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/aiu_fms_testing_utils/scripts/drive_paged_programs.py b/aiu_fms_testing_utils/scripts/drive_paged_programs.py index c39b9f42..f3992733 100644 --- a/aiu_fms_testing_utils/scripts/drive_paged_programs.py +++ b/aiu_fms_testing_utils/scripts/drive_paged_programs.py @@ -1,6 +1,6 @@ import argparse from dataclasses import dataclass -from datetime import datetime, timezone +from datetime import datetime import itertools import json import os diff --git a/aiu_fms_testing_utils/utils/__init__.py b/aiu_fms_testing_utils/utils/__init__.py index 51643bc0..54852c1b 100644 --- a/aiu_fms_testing_utils/utils/__init__.py +++ b/aiu_fms_testing_utils/utils/__init__.py @@ -6,8 +6,6 @@ import requests import time 
import bisect -from datetime import datetime, timezone -import sys # Third Party diff --git a/aiu_fms_testing_utils/utils/resource_collection.py b/aiu_fms_testing_utils/utils/resource_collection.py index a775bfe2..f6f4e002 100644 --- a/aiu_fms_testing_utils/utils/resource_collection.py +++ b/aiu_fms_testing_utils/utils/resource_collection.py @@ -5,7 +5,7 @@ from aiu_fms_testing_utils.utils.aiu_setup import dprint try: from prometheus_api_client import PrometheusConnect -except Exception as e: +except Exception: print("WARNING: Cannot import `prometheus_api_client`. Make sure the package is installed if you are trying to report resource utilization.") From bd961026d6f3042febfe3f5930aebfe3da7b066b Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Thu, 26 Feb 2026 09:47:17 -0500 Subject: [PATCH 34/42] Adding type annotations for profile Signed-off-by: Christian Sarmiento --- aiu_fms_testing_utils/scripts/drive_paged_programs.py | 2 +- aiu_fms_testing_utils/utils/__init__.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/aiu_fms_testing_utils/scripts/drive_paged_programs.py b/aiu_fms_testing_utils/scripts/drive_paged_programs.py index f3992733..34197a4f 100644 --- a/aiu_fms_testing_utils/scripts/drive_paged_programs.py +++ b/aiu_fms_testing_utils/scripts/drive_paged_programs.py @@ -1253,7 +1253,7 @@ def generate_validation_info_and_test( timing: str, prefill_chunk_size: int, model_variant: str, - profile + profile: Optional[Any] = None ) -> list[Any]: """Generates tokens using AIU and CPU models and validates the results. 
diff --git a/aiu_fms_testing_utils/utils/__init__.py b/aiu_fms_testing_utils/utils/__init__.py index 54852c1b..65c3a480 100644 --- a/aiu_fms_testing_utils/utils/__init__.py +++ b/aiu_fms_testing_utils/utils/__init__.py @@ -1,5 +1,5 @@ # Standard -from typing import Optional, List, Tuple +from typing import Optional, List, Tuple, Any import json import os import random @@ -55,7 +55,7 @@ def warmup_model( use_cache: bool = True, stagger_update_lazyhandle: int = 0, prefill_chunk_size: int = 0, - profile = None, + profile: Optional[Any] = None, **extra_kwargs, ): import torch_sendnn From 60013ae989afbc7090ad5efc46b2646f75e4fc28 Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Fri, 27 Feb 2026 12:28:43 -0500 Subject: [PATCH 35/42] More explicit inference start/end steps Signed-off-by: Christian Sarmiento --- aiu_fms_testing_utils/scripts/drive_paged_programs.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/aiu_fms_testing_utils/scripts/drive_paged_programs.py b/aiu_fms_testing_utils/scripts/drive_paged_programs.py index 34197a4f..dd5c4025 100644 --- a/aiu_fms_testing_utils/scripts/drive_paged_programs.py +++ b/aiu_fms_testing_utils/scripts/drive_paged_programs.py @@ -1275,10 +1275,10 @@ def generate_validation_info_and_test( ) # Start inference - metric_start = print_step(profile, "started", "Inference") if not skip_validation: # Generate or load CPU validation info + cpu_metric_start = print_step(profile, "started", "CPU Inference") cpu_validation_info = generate_cpu_validation( model_variant=model_variant, max_new_tokens=max_new_tokens, @@ -1293,9 +1293,10 @@ def generate_validation_info_and_test( cpu_dtype=env_config.cpu_dtype, tokenizer=tokenizer, ) - print_step(profile, "completed", "CPU inference", metric_start) + print_step(profile, "completed", "CPU inference", cpu_metric_start) # Generate AIU validation info + aiu_metric_start = print_step(profile, "started", "AIU Inference") aiu_validation_info = generate_aiu_validation( 
test_type=test_type, max_new_tokens=max_new_tokens, @@ -1306,7 +1307,7 @@ def generate_validation_info_and_test( cpu_validation_info=cpu_validation_info, extra_kwargs=valid_prompt.extra_kwargs, ) - print_step(profile, "completed", "AIU inference", metric_start) + print_step(profile, "completed", "AIU inference", aiu_metric_start) if test_type == "metrics": failure_rate = evaluate_cross_entropy_metrics( @@ -1336,6 +1337,7 @@ def generate_validation_info_and_test( else: # Generate AIU validation info + aiu_metric_start = print_step(profile, "started", "AIU Inference") aiu_validation_info = generate_aiu_validation( test_type=test_type, max_new_tokens=max_new_tokens, @@ -1346,7 +1348,7 @@ def generate_validation_info_and_test( cpu_validation_info=None, extra_kwargs=valid_prompt.extra_kwargs, ) - print_step(profile, "completed", "AIU inference", metric_start) + print_step(profile, "completed", "AIU inference", aiu_metric_start) if local_rank == 0: for sentence_idx, test_sentence in enumerate( From 5e7a5e0cad333bc19580c6328d1252a797bcb46e Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Tue, 3 Mar 2026 16:02:44 -0500 Subject: [PATCH 36/42] Changing capitalization on inference run Signed-off-by: Christian Sarmiento --- aiu_fms_testing_utils/scripts/drive_paged_programs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/aiu_fms_testing_utils/scripts/drive_paged_programs.py b/aiu_fms_testing_utils/scripts/drive_paged_programs.py index dd5c4025..67f2d615 100644 --- a/aiu_fms_testing_utils/scripts/drive_paged_programs.py +++ b/aiu_fms_testing_utils/scripts/drive_paged_programs.py @@ -1293,7 +1293,7 @@ def generate_validation_info_and_test( cpu_dtype=env_config.cpu_dtype, tokenizer=tokenizer, ) - print_step(profile, "completed", "CPU inference", cpu_metric_start) + print_step(profile, "completed", "CPU Inference", cpu_metric_start) # Generate AIU validation info aiu_metric_start = print_step(profile, "started", "AIU Inference") @@ -1307,7 
+1307,7 @@ def generate_validation_info_and_test( cpu_validation_info=cpu_validation_info, extra_kwargs=valid_prompt.extra_kwargs, ) - print_step(profile, "completed", "AIU inference", aiu_metric_start) + print_step(profile, "completed", "AIU Inference", aiu_metric_start) if test_type == "metrics": failure_rate = evaluate_cross_entropy_metrics( @@ -1348,7 +1348,7 @@ def generate_validation_info_and_test( cpu_validation_info=None, extra_kwargs=valid_prompt.extra_kwargs, ) - print_step(profile, "completed", "AIU inference", aiu_metric_start) + print_step(profile, "completed", "AIU Inference", aiu_metric_start) if local_rank == 0: for sentence_idx, test_sentence in enumerate( From 53e41b697afe21a88ae059c2b3aaaa699720a1ca Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Thu, 5 Mar 2026 14:54:57 -0500 Subject: [PATCH 37/42] Adding flag for reporting resource utilization Signed-off-by: Christian Sarmiento --- .../scripts/drive_paged_programs.py | 22 ++++--- aiu_fms_testing_utils/utils/__init__.py | 5 +- .../utils/resource_collection.py | 60 ++++++++++++------- 3 files changed, 58 insertions(+), 29 deletions(-) diff --git a/aiu_fms_testing_utils/scripts/drive_paged_programs.py b/aiu_fms_testing_utils/scripts/drive_paged_programs.py index 67f2d615..06f80063 100644 --- a/aiu_fms_testing_utils/scripts/drive_paged_programs.py +++ b/aiu_fms_testing_utils/scripts/drive_paged_programs.py @@ -278,6 +278,11 @@ def parse_cli_args() -> argparse.Namespace: action="store_true", help="set to true ensure that all prompts hit the same prompt program for a given test", ) + parser.add_argument( + "--report_resource_utilization", + action="store_true", + help="set to true to report CPU/memory utilization during compilation and inference stages" + ) return parser.parse_args() @@ -1253,6 +1258,7 @@ def generate_validation_info_and_test( timing: str, prefill_chunk_size: int, model_variant: str, + print_utilization: bool = False, profile: Optional[Any] = None ) -> list[Any]: 
"""Generates tokens using AIU and CPU models and validates the results. @@ -1278,7 +1284,7 @@ def generate_validation_info_and_test( if not skip_validation: # Generate or load CPU validation info - cpu_metric_start = print_step(profile, "started", "CPU Inference") + cpu_metric_start = print_step(profile, print_utilization, "started", "CPU Inference") cpu_validation_info = generate_cpu_validation( model_variant=model_variant, max_new_tokens=max_new_tokens, @@ -1293,10 +1299,10 @@ def generate_validation_info_and_test( cpu_dtype=env_config.cpu_dtype, tokenizer=tokenizer, ) - print_step(profile, "completed", "CPU Inference", cpu_metric_start) + print_step(profile, print_utilization, "completed", "CPU Inference", cpu_metric_start) # Generate AIU validation info - aiu_metric_start = print_step(profile, "started", "AIU Inference") + aiu_metric_start = print_step(profile, print_utilization, "started", "AIU Inference") aiu_validation_info = generate_aiu_validation( test_type=test_type, max_new_tokens=max_new_tokens, @@ -1307,7 +1313,7 @@ def generate_validation_info_and_test( cpu_validation_info=cpu_validation_info, extra_kwargs=valid_prompt.extra_kwargs, ) - print_step(profile, "completed", "AIU Inference", aiu_metric_start) + print_step(profile, print_utilization, "completed", "AIU Inference", aiu_metric_start) if test_type == "metrics": failure_rate = evaluate_cross_entropy_metrics( @@ -1337,7 +1343,7 @@ def generate_validation_info_and_test( else: # Generate AIU validation info - aiu_metric_start = print_step(profile, "started", "AIU Inference") + aiu_metric_start = print_step(profile, print_utilization, "started", "AIU Inference") aiu_validation_info = generate_aiu_validation( test_type=test_type, max_new_tokens=max_new_tokens, @@ -1348,7 +1354,7 @@ def generate_validation_info_and_test( cpu_validation_info=None, extra_kwargs=valid_prompt.extra_kwargs, ) - print_step(profile, "completed", "AIU Inference", aiu_metric_start) + print_step(profile, print_utilization, 
"completed", "AIU Inference", aiu_metric_start) if local_rank == 0: for sentence_idx, test_sentence in enumerate( @@ -1407,7 +1413,7 @@ def main() -> None: ) # Instantiate the Prometheus client for resource metric collection - p = instantiate_prometheus() + p = instantiate_prometheus(args.report_resource_utilization) # Model Loading model_kwargs: Dict[str, Any] = _get_model_kwargs(model_variant=args.model_variant) @@ -1465,6 +1471,7 @@ def main() -> None: compile_dynamic_sendnn=True, stagger_update_lazyhandle=args.stagger_update_lazyhandle, prefill_chunk_size=args.prefill_chunk_size, + print_utilization=args.report_resource_utilization, profile=p, **extra_kwargs, ) @@ -1508,6 +1515,7 @@ def main() -> None: timing=args.timing, prefill_chunk_size=args.prefill_chunk_size, model_variant=args.model_variant, + print_utilization=args.report_resource_utilization, profile=p ) diff --git a/aiu_fms_testing_utils/utils/__init__.py b/aiu_fms_testing_utils/utils/__init__.py index 65c3a480..23e1f95e 100644 --- a/aiu_fms_testing_utils/utils/__init__.py +++ b/aiu_fms_testing_utils/utils/__init__.py @@ -55,6 +55,7 @@ def warmup_model( use_cache: bool = True, stagger_update_lazyhandle: int = 0, prefill_chunk_size: int = 0, + print_utilization: bool = False, profile: Optional[Any] = None, **extra_kwargs, ): @@ -78,7 +79,7 @@ def warmup_model( pt_compile_model_time = time.time() ## Report on initial resource usage - metric_start = print_step(profile, "started", "Compilation") + metric_start = print_step(profile, print_utilization, "started", "Compilation") # adjust inputs depending on attn_type and dynamic shapes _warmup_input_ids = input_ids @@ -110,7 +111,7 @@ def warmup_model( pt_compile_model_time = time.time() - pt_compile_model_time # Get completed metric read - print_step(profile, "completed", "Compilation", metric_start) + print_step(profile, print_utilization, "completed", "Compilation", metric_start) dprint(f"PT compile complete, took {pt_compile_model_time:.3f}s") diff --git 
a/aiu_fms_testing_utils/utils/resource_collection.py b/aiu_fms_testing_utils/utils/resource_collection.py index f6f4e002..d99587a9 100644 --- a/aiu_fms_testing_utils/utils/resource_collection.py +++ b/aiu_fms_testing_utils/utils/resource_collection.py @@ -1,15 +1,27 @@ # Imports import os from datetime import datetime, timezone +import subprocess from aiu_fms_testing_utils.utils.aiu_setup import dprint -try: - from prometheus_api_client import PrometheusConnect -except Exception: - print("WARNING: Cannot import `prometheus_api_client`. Make sure the package is installed if you are trying to report resource utilization.") -def instantiate_prometheus(): +def install_prometheus(): + """ + Top-level method that will install Prometheus if needed + """ + + # See if it is installed + run = subprocess.run(["pip", "show", "prometheus_api_client"], + check=False) + + # Install if needed + if run.returncode != 0: + print("prometheus_api_client not found, installing...") + subprocess.run(["pip", "install", "prometheus_api_client"], check=True) + + +def instantiate_prometheus(report_utilization): """ Top-level method that will instantiate the Prometheus Client to collect resource usage metrics. 
@@ -19,18 +31,24 @@ def instantiate_prometheus(): """ client = None - try: - # Get required env variables - connection_url = os.environ["PROMETHEUS_URL"] - api_token = os.environ.get("PROMETHEUS_API_KEY") + if report_utilization: + + # Install and import Prometheus if needed + install_prometheus() + from prometheus_api_client import PrometheusConnect + + try: + # Get required env variables + connection_url = os.environ["PROMETHEUS_URL"] + api_token = os.environ.get("PROMETHEUS_API_KEY") - # Define necessary headers - request_headers = {"Authorization": f"Bearer {api_token}"} if api_token else None + # Define necessary headers + request_headers = {"Authorization": f"Bearer {api_token}"} if api_token else None - client = PrometheusConnect(url=connection_url, headers=request_headers, disable_ssl=True) + client = PrometheusConnect(url=connection_url, headers=request_headers, disable_ssl=True) - except Exception as e: - print(f"WARNING: Cannot instantiate Prometheus. Make sure PROMETHEUS_URL and PROMETHEUS_API_KEY are set in your environment if you are trying to collect resource metrics. Error: {e}") + except Exception as e: + print(f"WARNING: Cannot instantiate Prometheus. Make sure PROMETHEUS_URL and PROMETHEUS_API_KEY are set in your environment if you are trying to collect resource metrics. Error: {e}") return client @@ -155,7 +173,7 @@ def timestamp_print(given_string): print(f"[{timestamp}] {given_string}") -def print_comp_resource_metrics(cpu_val, mem_val, stage, step): +def print_comp_resource_metrics(cpu_val, mem_val, stage, step, print_utilization): """ Helper method that will do a timestamp print for a specific step to report resource usage. @@ -165,25 +183,27 @@ def print_comp_resource_metrics(cpu_val, mem_val, stage, step): - mem_val: the value for memory usage in gigabytes we want to print. - stage: The stage of the step we are in, either "peak" or "started". - step: The step that we performing in the script, either "compilation" or "inference". 
+ - print_utilization: a boolean denoting if we want to print resource utilization metrics. """ if stage != "peak": - if cpu_val is None or mem_val is None: + if not print_utilization and (cpu_val is None or mem_val is None): timestamp_print(f"{step} {stage}") else: timestamp_print(f"{step} {stage} - CPU: {cpu_val:.2f}%, Memory: {mem_val:.2f} GB") - elif cpu_val is not None and mem_val is not None: + elif not print_utilization and (cpu_val is not None and mem_val is not None): dprint(f"Peak Resource Utilization - CPU: {cpu_val:.2f}%, Memory: {mem_val:.2f} GB") -def print_step(p, step, stage, start_time=None): +def print_step(p, report_utilization, step, stage, start_time=None): """ Print function to print out when a specific stage starts and ends, as well as reporting resource usage if enabled. Args: - p: the Prometheus profile client to resource utilization collection. + - report_utilization: a boolean denoting if we want to print resource utilization metrics. - step: string denoting what step we are at ("inference" or "compilation"). - stage: string denoting what stage of the step we are at ("started" or "completed"). - start_time: datetime object that denotes when the step started (optional). 
@@ -196,11 +216,11 @@ def print_step(p, step, stage, start_time=None): ## Get metric read recorded_time = datetime.now(timezone.utc) cpu_usage, mem_usage = get_static_read(p, recorded_time) - print_comp_resource_metrics(cpu_usage, mem_usage, step, stage) + print_comp_resource_metrics(cpu_usage, mem_usage, step, stage, report_utilization) ## Get and print the peak usage if start_time is not None: peak_cpu_inference_cpu, peak_mem_inference_cpu = get_peak_read(p, start_time, recorded_time) - print_comp_resource_metrics(peak_cpu_inference_cpu, peak_mem_inference_cpu, "peak", stage) + print_comp_resource_metrics(peak_cpu_inference_cpu, peak_mem_inference_cpu, "peak", stage, report_utilization) return recorded_time From b80377d7e28c64d09527572b73b40e347f135e24 Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Thu, 5 Mar 2026 15:40:09 -0500 Subject: [PATCH 38/42] Fixing conditional Signed-off-by: Christian Sarmiento --- aiu_fms_testing_utils/utils/resource_collection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiu_fms_testing_utils/utils/resource_collection.py b/aiu_fms_testing_utils/utils/resource_collection.py index d99587a9..7b18a6ff 100644 --- a/aiu_fms_testing_utils/utils/resource_collection.py +++ b/aiu_fms_testing_utils/utils/resource_collection.py @@ -192,7 +192,7 @@ def print_comp_resource_metrics(cpu_val, mem_val, stage, step, print_utilization else: timestamp_print(f"{step} {stage} - CPU: {cpu_val:.2f}%, Memory: {mem_val:.2f} GB") - elif not print_utilization and (cpu_val is not None and mem_val is not None): + elif print_utilization and (cpu_val is not None and mem_val is not None): dprint(f"Peak Resource Utilization - CPU: {cpu_val:.2f}%, Memory: {mem_val:.2f} GB") From ae5724a54a2a9dd54c905ebe9078538044444151 Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Thu, 5 Mar 2026 15:50:07 -0500 Subject: [PATCH 39/42] Making sure we don't see pip show output Signed-off-by: Christian Sarmiento --- 
aiu_fms_testing_utils/utils/resource_collection.py | 1 + 1 file changed, 1 insertion(+) diff --git a/aiu_fms_testing_utils/utils/resource_collection.py b/aiu_fms_testing_utils/utils/resource_collection.py index 7b18a6ff..645ae2cd 100644 --- a/aiu_fms_testing_utils/utils/resource_collection.py +++ b/aiu_fms_testing_utils/utils/resource_collection.py @@ -13,6 +13,7 @@ def install_prometheus(): # See if it is installed run = subprocess.run(["pip", "show", "prometheus_api_client"], + capture_output=True, check=False) # Install if needed From c22760195b3df71311203f0285ce87a13c27eb88 Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Sun, 22 Mar 2026 12:53:54 -0400 Subject: [PATCH 40/42] Adding more graceful error handling Signed-off-by: Christian Sarmiento --- .../utils/resource_collection.py | 56 +++++++++++-------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/aiu_fms_testing_utils/utils/resource_collection.py b/aiu_fms_testing_utils/utils/resource_collection.py index 645ae2cd..5756b951 100644 --- a/aiu_fms_testing_utils/utils/resource_collection.py +++ b/aiu_fms_testing_utils/utils/resource_collection.py @@ -109,16 +109,20 @@ def get_static_read(client, recorded_time): cpu_value = None mem_value = None if client is not None: + try: - # Make the request for CPU and Mem - cpu_query = '100 * (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[2m])))' - mem_query = '(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / 1024 / 1024 / 1024' - cpu_response = client.custom_query(query=cpu_query, params={"time": recorded_time.timestamp()}) - mem_response = client.custom_query(query=mem_query, params={"time": recorded_time.timestamp()}) + # Make the request for CPU and Mem + cpu_query = '100 * (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[2m])))' + mem_query = '(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / 1024 / 1024 / 1024' + cpu_response = client.custom_query(query=cpu_query, params={"time": recorded_time.timestamp()}) + 
mem_response = client.custom_query(query=mem_query, params={"time": recorded_time.timestamp()}) - ## Get the CPU & Mem metrics out of the response - cpu_value = get_value(cpu_response) - mem_value = get_value(mem_response) + ## Get the CPU & Mem metrics out of the response + cpu_value = get_value(cpu_response) + mem_value = get_value(mem_response) + + except Exception as e: + print(f"WARNING: Failed to retrieve utilization values. Ensure PROMETHEUS_API_KEY is set. Error: {e}") return cpu_value, mem_value @@ -143,20 +147,24 @@ def get_peak_read(client, start, end): peak_cpu_value = None peak_mem_value = None if client is not None: + try: - # Make the request for CPU and Mem - cpu_query = '100 * (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[2m])))' - mem_query = '(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / 1024 / 1024 / 1024' - cpu_response = client.custom_query_range( - query=cpu_query, start_time=start, end_time=end, step="3s" - ) - mem_response = client.custom_query_range( - query=mem_query, start_time=start, end_time=end, step="3s" - ) - - ## Get the CPU & Mem metrics out of the response - peak_cpu_value = get_value(cpu_response, "range") - peak_mem_value = get_value(mem_response, "range") + # Make the request for CPU and Mem + cpu_query = '100 * (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[2m])))' + mem_query = '(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / 1024 / 1024 / 1024' + cpu_response = client.custom_query_range( + query=cpu_query, start_time=start, end_time=end, step="3s" + ) + mem_response = client.custom_query_range( + query=mem_query, start_time=start, end_time=end, step="3s" + ) + + ## Get the CPU & Mem metrics out of the response + peak_cpu_value = get_value(cpu_response, "range") + peak_mem_value = get_value(mem_response, "range") + + except Exception as e: + print(f"WARNING: Failed to retrieve utilization values. Ensure PROMETHEUS_API_KEY is set. 
Error: {e}") return peak_cpu_value, peak_mem_value @@ -188,7 +196,7 @@ def print_comp_resource_metrics(cpu_val, mem_val, stage, step, print_utilization """ if stage != "peak": - if not print_utilization and (cpu_val is None or mem_val is None): + if not print_utilization or (cpu_val is None or mem_val is None): timestamp_print(f"{step} {stage}") else: timestamp_print(f"{step} {stage} - CPU: {cpu_val:.2f}%, Memory: {mem_val:.2f} GB") @@ -222,6 +230,8 @@ def print_step(p, report_utilization, step, stage, start_time=None): ## Get and print the peak usage if start_time is not None: peak_cpu_inference_cpu, peak_mem_inference_cpu = get_peak_read(p, start_time, recorded_time) - print_comp_resource_metrics(peak_cpu_inference_cpu, peak_mem_inference_cpu, "peak", stage, report_utilization) + print_comp_resource_metrics( + peak_cpu_inference_cpu, peak_mem_inference_cpu, "peak", stage, report_utilization + ) return recorded_time From dfd9bbeaea7343f21dbd79b5115711ffa380464c Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Sun, 22 Mar 2026 13:10:07 -0400 Subject: [PATCH 41/42] Updating README with reporting instructions Signed-off-by: Christian Sarmiento --- README.md | 25 ------------------- aiu_fms_testing_utils/scripts/README.md | 33 +++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 9e7c4410..6d30905f 100644 --- a/README.md +++ b/README.md @@ -139,31 +139,6 @@ export TORCH_SENDNN_LOG=CRITICAL export DT_DEEPRT_VERBOSE=-1 ``` -### Setup the environment for reporting resource usage - -When running `drive_paged_programs.py` you may want to see how much CPU and memory usage is -happening. This is done using Prometheus, thus if you are running in a container, you want to set up a simple Prometheus server to start collecting these metrics. To do this, do the following: - -1. Run `podman network create promnet` -2. 
Run `podman run -d --name node-exporter --network promnet quay.io/prometheus/node-exporter:latest` -3. Create a file called `prometheus.yml` that has the following contents: - -```yaml -global: -  scrape_interval: 5s - -scrape_configs: -  - job_name: "node" -    static_configs: -      - targets: ["node-exporter:9100"] -``` - -4. Run `podman run -d --name prometheus --network promnet -p 9091:9090   -v "$PWD/prometheus.yml:/etc/prometheus/prometheus.yml:Z"   quay.io/prometheus/prometheus:latest   --config.file=/etc/prometheus/prometheus.yml` -5. Check the status of the server by running `curl -s "http://localhost:9091/api/v1/targets" | python3 -m json.tool | grep health` and ensuring that "health" says "up". -6. When you are about to run DPP, run `export PROMETHEUS_URL="http://localhost:9091"` - -If you are running in OpenShift, you are going to want to set `PROMETHEUS_URL` to an OpenShift route that has Prometheus set up. Additionally, you are going to want to set `PROMETHEUS_API_KEY` to your OpenShift OAuth token if the Prometheus instance on the cluster is protected. You can get this token by running `oc whoami -t`. - ## How to use Foundation Model Stack (FMS) on AIU hardware The [scripts](https://github.com/foundation-model-stack/aiu-fms-testing-utils/tree/main/scripts) directory provides various scripts to use FMS on AIU hardware for many use cases. These scripts provide robust support for passing desired command line options for running encoder and decoder models along with other use cases. Refer to the documentation on [using different scripts](https://github.com/foundation-model-stack/aiu-fms-testing-utils/blob/main/scripts/README.md) for more details. 
diff --git a/aiu_fms_testing_utils/scripts/README.md b/aiu_fms_testing_utils/scripts/README.md index a652fe6a..c196aae5 100644 --- a/aiu_fms_testing_utils/scripts/README.md +++ b/aiu_fms_testing_utils/scripts/README.md @@ -75,3 +75,36 @@ python3 scripts/validation.py --architecture=hf_configured --model_path=/home/de ``` To run a logits-based validation, pass `--validation_level=1` to the validation script. This will check for the logits output to match at every step of the model through cross-entropy loss. You can control the acceptable threshold with `--logits_loss_threshold`. + +## Setup the environment for reporting resource usage + +When running `drive_paged_programs.py` you may want to see how much CPU and memory usage is +happening. This is done using Prometheus, thus if you are running in a container environment (non-OpenShift), you want to set up a simple Prometheus server to start collecting these metrics. To do this, do the following: + +1. Run `podman network create promnet` +2. Run `podman run -d --name node-exporter --network promnet quay.io/prometheus/node-exporter:latest` +3. Create a file called `prometheus.yml` that has the following contents: + +```yaml +global: +  scrape_interval: 5s + +scrape_configs: +  - job_name: "node" +    static_configs: +      - targets: ["node-exporter:9100"] +``` + +4. Run `podman run -d --name prometheus --network promnet -p 9091:9090   -v "$PWD/prometheus.yml:/etc/prometheus/prometheus.yml:Z"   quay.io/prometheus/prometheus:latest   --config.file=/etc/prometheus/prometheus.yml` +5. Check the status of the server by running `curl -s "http://localhost:9091/api/v1/targets" | python3 -m json.tool | grep health` and ensuring that "health" says "up". +6. 
When you are about to run DPP, run `export PROMETHEUS_URL="http://localhost:9091"` + +If you are running in OpenShift, the aforementioned instructions are not necessary and instead, you are going to want to set `PROMETHEUS_URL` to an OpenShift route that already has Prometheus set up. Additionally, you are going to want to set `PROMETHEUS_API_KEY` to your OpenShift OAuth token if the Prometheus instance on the cluster is protected. You can get this token by running `oc whoami -t`. + +When actually running a DPP test, you are going to want to set the `--report_resource_utilization` flag to see outputs. Regardless of whether you have this flag set, or whether you do not have Prometheus installed or any of the environment variables set, DPP should always run. These instructions are simply to enable resource utilization outputs. + +Sample test to run with resource utilization outputs: + +```bash +torchrun --nproc-per-node=4 aiu-fms-testing-utils/scripts/drive_paged_programs.py --model_variant=/ibm-granite/granite-3.3-8b-instruct --program_criteria_json_path=path/to/program_criterion.json --dataset_type=sharegpt --skip_validation --programs "*:0,<8192" --prioritize_large_batch_sizes --enforce_homogeneous_prompt_programs --prefill_chunk_size=1024 --dataset_path=ShareGPT_V3_unfiltered_cleaned_split.json --report_resource_utilization +``` From 0d0f08dc3842557a65cef113142892cc6df6dbbf Mon Sep 17 00:00:00 2001 From: Christian Sarmiento Date: Sun, 22 Mar 2026 13:19:31 -0400 Subject: [PATCH 42/42] Fixing linting errors Signed-off-by: Christian Sarmiento --- .../scripts/drive_paged_programs.py | 50 ++++++++++---- .../utils/resource_collection.py | 67 ++++++++++++------- 2 files changed, 81 insertions(+), 36 deletions(-) diff --git a/aiu_fms_testing_utils/scripts/drive_paged_programs.py b/aiu_fms_testing_utils/scripts/drive_paged_programs.py index 06f80063..eade65b0 100644 --- a/aiu_fms_testing_utils/scripts/drive_paged_programs.py +++ b/aiu_fms_testing_utils/scripts/drive_paged_programs.py
@@ -36,7 +36,7 @@ sample_rag_factoid_requests, sample_sharegpt_requests, stagger_region, - warmup_model + warmup_model, ) from aiu_fms_testing_utils.utils.aiu_setup import aiu_dist_setup, dprint, local_rank from aiu_fms_testing_utils.utils.paged import ( @@ -45,8 +45,10 @@ ) from aiu_fms_testing_utils.testing.utils import format_kwargs_to_string from aiu_fms_testing_utils.utils.resource_collection import ( - instantiate_prometheus, print_step + instantiate_prometheus, + print_step, ) + # Constants PAD_MULTIPLE = 64 @@ -281,7 +283,7 @@ def parse_cli_args() -> argparse.Namespace: parser.add_argument( "--report_resource_utilization", action="store_true", - help="set to true to report CPU/memory utilization during compilation and inference stages" + help="set to true to report CPU/memory utilization during compilation and inference stages", ) return parser.parse_args() @@ -1259,7 +1261,7 @@ def generate_validation_info_and_test( prefill_chunk_size: int, model_variant: str, print_utilization: bool = False, - profile: Optional[Any] = None + profile: Optional[Any] = None, ) -> list[Any]: """Generates tokens using AIU and CPU models and validates the results. 
@@ -1282,9 +1284,10 @@ def generate_validation_info_and_test( # Start inference if not skip_validation: - # Generate or load CPU validation info - cpu_metric_start = print_step(profile, print_utilization, "started", "CPU Inference") + cpu_metric_start = print_step( + profile, print_utilization, "started", "CPU Inference" + ) cpu_validation_info = generate_cpu_validation( model_variant=model_variant, max_new_tokens=max_new_tokens, @@ -1299,10 +1302,18 @@ def generate_validation_info_and_test( cpu_dtype=env_config.cpu_dtype, tokenizer=tokenizer, ) - print_step(profile, print_utilization, "completed", "CPU Inference", cpu_metric_start) + print_step( + profile, + print_utilization, + "completed", + "CPU Inference", + cpu_metric_start, + ) # Generate AIU validation info - aiu_metric_start = print_step(profile, print_utilization, "started", "AIU Inference") + aiu_metric_start = print_step( + profile, print_utilization, "started", "AIU Inference" + ) aiu_validation_info = generate_aiu_validation( test_type=test_type, max_new_tokens=max_new_tokens, @@ -1313,7 +1324,13 @@ def generate_validation_info_and_test( cpu_validation_info=cpu_validation_info, extra_kwargs=valid_prompt.extra_kwargs, ) - print_step(profile, print_utilization, "completed", "AIU Inference", aiu_metric_start) + print_step( + profile, + print_utilization, + "completed", + "AIU Inference", + aiu_metric_start, + ) if test_type == "metrics": failure_rate = evaluate_cross_entropy_metrics( @@ -1341,9 +1358,10 @@ def generate_validation_info_and_test( else: raise ValueError("test type must be one of metrics or tokens") else: - # Generate AIU validation info - aiu_metric_start = print_step(profile, print_utilization, "started", "AIU Inference") + aiu_metric_start = print_step( + profile, print_utilization, "started", "AIU Inference" + ) aiu_validation_info = generate_aiu_validation( test_type=test_type, max_new_tokens=max_new_tokens, @@ -1354,7 +1372,13 @@ def generate_validation_info_and_test( 
cpu_validation_info=None, extra_kwargs=valid_prompt.extra_kwargs, ) - print_step(profile, print_utilization, "completed", "AIU Inference", aiu_metric_start) + print_step( + profile, + print_utilization, + "completed", + "AIU Inference", + aiu_metric_start, + ) if local_rank == 0: for sentence_idx, test_sentence in enumerate( @@ -1516,7 +1540,7 @@ def main() -> None: prefill_chunk_size=args.prefill_chunk_size, model_variant=args.model_variant, print_utilization=args.report_resource_utilization, - profile=p + profile=p, ) if not args.skip_validation and local_rank == 0: diff --git a/aiu_fms_testing_utils/utils/resource_collection.py b/aiu_fms_testing_utils/utils/resource_collection.py index 5756b951..74dfc4e5 100644 --- a/aiu_fms_testing_utils/utils/resource_collection.py +++ b/aiu_fms_testing_utils/utils/resource_collection.py @@ -12,9 +12,9 @@ def install_prometheus(): """ # See if it is installed - run = subprocess.run(["pip", "show", "prometheus_api_client"], - capture_output=True, - check=False) + run = subprocess.run( + ["pip", "show", "prometheus_api_client"], capture_output=True, check=False + ) # Install if needed if run.returncode != 0: @@ -33,7 +33,6 @@ def instantiate_prometheus(report_utilization): client = None if report_utilization: - # Install and import Prometheus if needed install_prometheus() from prometheus_api_client import PrometheusConnect @@ -44,12 +43,18 @@ def instantiate_prometheus(report_utilization): api_token = os.environ.get("PROMETHEUS_API_KEY") # Define necessary headers - request_headers = {"Authorization": f"Bearer {api_token}"} if api_token else None + request_headers = ( + {"Authorization": f"Bearer {api_token}"} if api_token else None + ) - client = PrometheusConnect(url=connection_url, headers=request_headers, disable_ssl=True) + client = PrometheusConnect( + url=connection_url, headers=request_headers, disable_ssl=True + ) except Exception as e: - print(f"WARNING: Cannot instantiate Prometheus. 
Make sure PROMETHEUS_URL and PROMETHEUS_API_KEY are set in your environment if you are trying to collect resource metrics. Error: {e}") + print( + f"WARNING: Cannot instantiate Prometheus. Make sure PROMETHEUS_URL and PROMETHEUS_API_KEY are set in your environment if you are trying to collect resource metrics. Error: {e}" + ) return client @@ -77,7 +82,7 @@ def get_value(given_res, query_type="static"): except Exception: pass value = values[0] if values else None - + else: ## For peak reads for series in given_res or []: for timestamp, val in series.get("values", []): @@ -97,7 +102,7 @@ def get_static_read(client, recorded_time): Args: - client: the Prometheus client to use to get our metrics. - - recorded_time: the time that we want to get the metric read at. + - recorded_time: the time that we want to get the metric read at. Returns: - cpu_value: this is the reported value for percentage of CPU usage at the given @@ -110,19 +115,24 @@ def get_static_read(client, recorded_time): mem_value = None if client is not None: try: - # Make the request for CPU and Mem cpu_query = '100 * (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[2m])))' - mem_query = '(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / 1024 / 1024 / 1024' - cpu_response = client.custom_query(query=cpu_query, params={"time": recorded_time.timestamp()}) - mem_response = client.custom_query(query=mem_query, params={"time": recorded_time.timestamp()}) + mem_query = "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / 1024 / 1024 / 1024" + cpu_response = client.custom_query( + query=cpu_query, params={"time": recorded_time.timestamp()} + ) + mem_response = client.custom_query( + query=mem_query, params={"time": recorded_time.timestamp()} + ) ## Get the CPU & Mem metrics out of the response cpu_value = get_value(cpu_response) mem_value = get_value(mem_response) - + except Exception as e: - print(f"WARNING: Failed to retrieve utilization values. Ensure PROMETHEUS_API_KEY is set. 
Error: {e}") + print( + f"WARNING: Failed to retrieve utilization values. Ensure PROMETHEUS_API_KEY is set. Error: {e}" + ) return cpu_value, mem_value @@ -148,10 +158,9 @@ def get_peak_read(client, start, end): peak_mem_value = None if client is not None: try: - # Make the request for CPU and Mem cpu_query = '100 * (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[2m])))' - mem_query = '(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / 1024 / 1024 / 1024' + mem_query = "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / 1024 / 1024 / 1024" cpu_response = client.custom_query_range( query=cpu_query, start_time=start, end_time=end, step="3s" ) @@ -162,9 +171,11 @@ def get_peak_read(client, start, end): ## Get the CPU & Mem metrics out of the response peak_cpu_value = get_value(cpu_response, "range") peak_mem_value = get_value(mem_response, "range") - + except Exception as e: - print(f"WARNING: Failed to retrieve utilization values. Ensure PROMETHEUS_API_KEY is set. Error: {e}") + print( + f"WARNING: Failed to retrieve utilization values. Ensure PROMETHEUS_API_KEY is set. 
Error: {e}" + ) return peak_cpu_value, peak_mem_value @@ -199,10 +210,14 @@ def print_comp_resource_metrics(cpu_val, mem_val, stage, step, print_utilization if not print_utilization or (cpu_val is None or mem_val is None): timestamp_print(f"{step} {stage}") else: - timestamp_print(f"{step} {stage} - CPU: {cpu_val:.2f}%, Memory: {mem_val:.2f} GB") + timestamp_print( + f"{step} {stage} - CPU: {cpu_val:.2f}%, Memory: {mem_val:.2f} GB" + ) elif print_utilization and (cpu_val is not None and mem_val is not None): - dprint(f"Peak Resource Utilization - CPU: {cpu_val:.2f}%, Memory: {mem_val:.2f} GB") + dprint( + f"Peak Resource Utilization - CPU: {cpu_val:.2f}%, Memory: {mem_val:.2f} GB" + ) def print_step(p, report_utilization, step, stage, start_time=None): @@ -229,9 +244,15 @@ def print_step(p, report_utilization, step, stage, start_time=None): ## Get and print the peak usage if start_time is not None: - peak_cpu_inference_cpu, peak_mem_inference_cpu = get_peak_read(p, start_time, recorded_time) + peak_cpu_inference_cpu, peak_mem_inference_cpu = get_peak_read( + p, start_time, recorded_time + ) print_comp_resource_metrics( - peak_cpu_inference_cpu, peak_mem_inference_cpu, "peak", stage, report_utilization + peak_cpu_inference_cpu, + peak_mem_inference_cpu, + "peak", + stage, + report_utilization, ) return recorded_time