Use print_step helper in the compilation stage

cfsarmiento · cfsarmiento · commit b2fcd8ee9e99 · 2026-02-24T21:44:43.000-05:00
Signed-off-by: Christian Sarmiento &lt;cfsarmiento03@gmail.com&gt;
diff --git a/aiu_fms_testing_utils/scripts/drive_paged_programs.py b/aiu_fms_testing_utils/scripts/drive_paged_programs.py
@@ -44,8 +44,9 @@
     get_programs_prompts,
 )
 from aiu_fms_testing_utils.testing.utils import format_kwargs_to_string
-from aiu_fms_testing_utils.utils.resource_collection import instantiate_prometheus
-
+from aiu_fms_testing_utils.utils.resource_collection import (
+    instantiate_prometheus, print_step
+)
 # Constants
 PAD_MULTIPLE = 64
 
diff --git a/aiu_fms_testing_utils/utils/__init__.py b/aiu_fms_testing_utils/utils/__init__.py
@@ -14,9 +14,7 @@
 from aiu_fms_testing_utils.utils.aiu_setup import dprint, rank, world_size
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 from aiu_fms_testing_utils.testing.utils import format_kwargs_to_string
-from aiu_fms_testing_utils.utils.resource_collection import (
-    get_static_read, get_peak_read
-)
+from aiu_fms_testing_utils.utils.resource_collection import print_step
 from fms.utils.generation import pad_input_ids
 import torch
 import torch.nn as nn
@@ -50,38 +48,6 @@ def stagger_region(limit: int):
             torch.distributed.barrier()
         dprint("Stagger: All Complete")
 
-def timestamp_print(given_string):
-    """
-    Helper method that will add a timestamp before the given string that needs to be
-    printed.
-
-    Args:
-    - given_string: the string that is to be printed with the timestamp.
-    """
-
-    timestamp = datetime.now().strftime("%Y-%m-%d:%H:%M:%S")
-    print(f"[{timestamp}] {given_string}")
-
-def print_comp_resource_metrics(cpu_val, mem_val, stage, step):
-    """
-    Helper method that will do a timestamp print for a specific step to report resource
-    usage.
-
-    Args:
-    - cpu_val: the value for CPU usage as a percentage that we want to print.
-    - mem_val: the value for memory usage in gigabytes we want to print.
-    - stage: The stage of the step we are in, either "peak" or "started".
-    - step: The step that we performing in the script, either "compilation" or "inference".
-    """
-
-    if stage != "peak":
-        if cpu_val is None or mem_val is None:
-            timestamp_print(f"{step} {stage}")
-        else:
-            timestamp_print(f"{step} {stage} - CPU: {cpu_val:.2f}%, Memory: {mem_val:.2f} GB")
-
-    elif cpu_val is not None and mem_val is not None:
-        dprint(f"Peak Resource Utilization - CPU: {cpu_val:.2f}%, Memory: {mem_val:.2f} GB")
 
 def warmup_model(
     model: nn.Module,
@@ -114,9 +80,7 @@ def warmup_model(
     pt_compile_model_time = time.time()
 
     ## Report on initial resource usage
-    metric_start = datetime.now(timezone.utc)
-    initial_cpu, initial_mem = get_static_read(profile, metric_start)
-    print_comp_resource_metrics(initial_cpu, initial_mem, "started", "Compilation")
+    metric_start = print_step(profile, "started", "Compilation")
 
     # adjust inputs depending on attn_type and dynamic shapes
     _warmup_input_ids = input_ids
@@ -148,14 +112,7 @@ def warmup_model(
     pt_compile_model_time = time.time() - pt_compile_model_time
 
     # Get completed metric read
-    metric_end = datetime.now(timezone.utc)
-    end_cpu, end_mem = get_static_read(profile, metric_end)
-    print_comp_resource_metrics(end_cpu, end_mem, "completed", "Compilation")
-
-    # Get the peak usage during compilation
-    peak_cpu, peak_mem = get_peak_read(profile, metric_start, metric_end)
-    print_comp_resource_metrics(peak_cpu, peak_mem, "peak", "Compilation")
-
+    print_step(profile, "completed", "Compilation", metric_start)
     dprint(f"PT compile complete, took {pt_compile_model_time:.3f}s")
 
 
diff --git a/aiu_fms_testing_utils/utils/resource_collection.py b/aiu_fms_testing_utils/utils/resource_collection.py
@@ -2,7 +2,7 @@
 import os
 from datetime import datetime, timezone
 
-from aiu_fms_testing_utils.utils import print_comp_resource_metrics
+from aiu_fms_testing_utils.utils.aiu_setup import dprint
 try:
     from prometheus_api_client import PrometheusConnect
 except Exception as e:
@@ -142,18 +142,65 @@ def get_peak_read(client, start, end):
     return peak_cpu_value, peak_mem_value
 
 
+def timestamp_print(given_string):
+    """
+    Helper method that will add a timestamp before the given string that needs to be
+    printed.
+
+    Args:
+    - given_string: the string that is to be printed with the timestamp.
+    """
+
+    timestamp = datetime.now().strftime("%Y-%m-%d:%H:%M:%S")
+    print(f"[{timestamp}] {given_string}")
+
+
+def print_comp_resource_metrics(cpu_val, mem_val, stage, step):
+    """
+    Helper method that will do a timestamp print for a specific step to report resource
+    usage.
+
+    Args:
+    - cpu_val: the value for CPU usage as a percentage that we want to print.
+    - mem_val: the value for memory usage in gigabytes we want to print.
+    - stage: The stage of the step we are in: "started", "completed", or "peak".
+    - step: The step that we are performing in the script, either "compilation" or "inference".
+    """
+
+    if stage != "peak":
+        if cpu_val is None or mem_val is None:
+            timestamp_print(f"{step} {stage}")
+        else:
+            timestamp_print(f"{step} {stage} - CPU: {cpu_val:.2f}%, Memory: {mem_val:.2f} GB")
+
+    elif cpu_val is not None and mem_val is not None:
+        dprint(f"Peak Resource Utilization - CPU: {cpu_val:.2f}%, Memory: {mem_val:.2f} GB")
+
+
 def print_step(p, step, stage, start_time=None):
     """
+    Print function to print out when a specific stage starts and ends,
+    as well as reporting resource usage if enabled.
+
+    Args:
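+    - p: the Prometheus profile client used for resource utilization collection.
+    - step: string denoting the phase being reported ("started" or "completed"); forwarded as the `stage` of print_comp_resource_metrics.
+    - stage: string denoting the operation being reported ("inference" or "compilation"); forwarded as the `step` of print_comp_resource_metrics.
+    - start_time: datetime object that denotes when the step started (optional); when given, peak usage since then is also reported.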
+
+    Returns:
+    - recorded_time: the UTC time that was recorded when getting a metric read. Returned for
+    scenarios where we need to reuse the recorded time in a later call (e.g. as start_time for completed stages).
     """
 
     ## Get metric read
-    timestep = datetime.now(timezone.utc)
-    cpu_usage, mem_usage = get_static_read(p, timestep)
+    recorded_time = datetime.now(timezone.utc)
+    cpu_usage, mem_usage = get_static_read(p, recorded_time)
     print_comp_resource_metrics(cpu_usage, mem_usage, step, stage)
 
     ## Get and print the peak usage
     if start_time is not None:
-        peak_cpu_inference_cpu, peak_mem_inference_cpu = get_peak_read(p, start_time, timestep)
+        peak_cpu_inference_cpu, peak_mem_inference_cpu = get_peak_read(p, start_time, recorded_time)
         print_comp_resource_metrics(peak_cpu_inference_cpu, peak_mem_inference_cpu, "peak", stage)
 
-    return timestep
+    return recorded_time