Skip to content

Commit 05bc380

Browse files
committed
Update GPU VRAM Usage histogram and make numbers in the ranking bar plot easier to read
1 parent bfb003d commit 05bc380

File tree

4 files changed

+144
-35
lines changed

4 files changed

+144
-35
lines changed

notebooks/analysis/Requested and Used VRAM.ipynb

Lines changed: 58 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,11 @@
2626
" - [Find most inefficient jobs hoarding node RAM based on `ram_hoarding_fraction_diff`](#toc1_5_2_1_) \n",
2727
" - [Find most inefficient jobs hoarding CPU cores based on `core_hoarding_fraction_diff`](#toc1_5_2_2_) \n",
2828
" - [User-Level Analysis](#toc1_6_) \n",
29-
" - [Find Inefficient Users based on `requested_vram_efficiency_score`](#toc1_6_1_) \n",
30-
" - [Generate all hoarding analysis metrics for users:](#toc1_6_2_) \n",
31-
" - [Find most inefficient users hoarding node RAM based on `expected_value_ram_hoarding_fraction_diff`](#toc1_6_2_1_) \n",
32-
" - [Find most inefficient users hoarding CPU cores based on `expected_value_core_hoarding_fraction_diff`](#toc1_6_2_2_) \n",
29+
" - [Find Inefficient Users based on `avg_requested_vram_efficiency_score`](#toc1_6_1_) \n",
30+
" - [Find Inefficient Users based on EV(alloc_vram_efficiency)](#toc1_6_2_) \n",
31+
" - [Generate all hoarding analysis metrics for users:](#toc1_6_3_) \n",
32+
" - [Find most inefficient users hoarding node RAM based on `expected_value_ram_hoarding_fraction_diff`](#toc1_6_3_1_) \n",
33+
" - [Find most inefficient users hoarding CPU cores based on `expected_value_core_hoarding_fraction_diff`](#toc1_6_3_2_) \n",
3334
" - [PI Group Analysis](#toc1_7_) \n",
3435
" - [Find Inefficient PIs based on `avg_requested_vram_efficiency_score`](#toc1_7_1_1_) \n",
3536
"\n",
@@ -325,7 +326,10 @@
325326
" metrics_df_name_enum=ResourceHoardingDataFrameNameEnum.JOBS_WITH_RESOURCE_HOARDING_METRICS,\n",
326327
" sorting_key=\"core_hoarding_fraction_diff\",\n",
327328
" ascending=False, # Sort in descending order\n",
328-
" filter_criteria={\"core_hoarding_fraction_diff\": {\"min\": 0, \"inclusive\": True}},\n",
329+
" filter_criteria={\n",
330+
" \"core_hoarding_fraction_diff\": {\"min\": 0, \"inclusive\": True},\n",
331+
" \"job_count\": {\"min\": 15, \"inclusive\": True},\n",
332+
" },\n",
329333
")\n",
330334
"\n",
331335
"# Plot top inefficient jobs by CPU core hoarding fraction, with CPU core hoarding fraction as labels\n",
@@ -360,7 +364,7 @@
360364
"id": "26",
361365
"metadata": {},
362366
"source": [
363-
"### <a id='toc1_6_1_'></a>[Find Inefficient Users based on `requested_vram_efficiency_score`](#toc0_)"
367+
"### <a id='toc1_6_1_'></a>[Find Inefficient Users based on `avg_requested_vram_efficiency_score`](#toc0_)"
364368
]
365369
},
366370
{
@@ -376,7 +380,7 @@
376380
" ascending=True, # Sort by avg_requested_vram_efficiency_score in ascending order\n",
377381
" filter_criteria={\n",
378382
" \"avg_requested_vram_efficiency_score\": {\"max\": -10, \"inclusive\": True}, # score threshold\n",
379-
" \"job_count\": {\"min\": 5, \"inclusive\": True}, # minimum job count threshold\n",
383+
" \"job_count\": {\"min\": 15, \"inclusive\": True}, # minimum job count threshold\n",
380384
" },\n",
381385
")\n",
382386
"# Plot top inefficient users by Avg Requested VRAM Efficiency Score, with avg_requested_vram_efficiency_score as labels\n",
@@ -402,10 +406,43 @@
402406
")"
403407
]
404408
},
409+
{
410+
"cell_type": "markdown",
411+
"id": "29",
412+
"metadata": {},
413+
"source": [
414+
"### <a id='toc1_6_2_'></a>[Find Inefficient Users based on EV(alloc_vram_efficiency)](#toc0_)"
415+
]
416+
},
405417
{
406418
"cell_type": "code",
407419
"execution_count": null,
408-
"id": "29",
420+
"id": "30",
421+
"metadata": {},
422+
"outputs": [],
423+
"source": [
424+
"inefficient_users_avg_req_vram_eff_score = analyzer.sort_and_filter_records_with_metrics(\n",
425+
" metrics_df_name_enum=ResourceHoardingDataFrameNameEnum.USERS,\n",
426+
" sorting_key=\"expected_value_requested_vram_efficiency\",\n",
427+
" ascending=True, # Sort by expected_value_requested_vram_efficiency in ascending order\n",
428+
" filter_criteria={\n",
429+
" \"job_count\": {\"min\": 15, \"inclusive\": True}, # minimum job count threshold\n",
430+
" },\n",
431+
")\n",
432+
"# Plot top inefficient users by Expected Value Requested VRAM Efficiency\n",
433+
"users_with_metrics_visualizer = UsersWithMetricsVisualizer(inefficient_users_avg_req_vram_eff_score.head(10))\n",
434+
"users_with_metrics_visualizer.visualize(\n",
435+
" column=\"expected_value_requested_vram_efficiency\",\n",
436+
" bar_label_columns=[\"vram_hours\", \"job_count\"],\n",
437+
" figsize=(10, 6),\n",
438+
" anonymize=True,\n",
439+
")"
440+
]
441+
},
442+
{
443+
"cell_type": "code",
444+
"execution_count": null,
445+
"id": "31",
409446
"metadata": {},
410447
"outputs": [],
411448
"source": [
@@ -425,16 +462,16 @@
425462
},
426463
{
427464
"cell_type": "markdown",
428-
"id": "30",
465+
"id": "32",
429466
"metadata": {},
430467
"source": [
431-
"### <a id='toc1_6_2_'></a>[Generate all hoarding analysis metrics for users:](#toc0_)"
468+
"### <a id='toc1_6_3_'></a>[Generate all hoarding analysis metrics for users:](#toc0_)"
432469
]
433470
},
434471
{
435472
"cell_type": "code",
436473
"execution_count": null,
437-
"id": "31",
474+
"id": "33",
438475
"metadata": {},
439476
"outputs": [],
440477
"source": [
@@ -444,16 +481,16 @@
444481
},
445482
{
446483
"cell_type": "markdown",
447-
"id": "32",
484+
"id": "34",
448485
"metadata": {},
449486
"source": [
450-
"#### <a id='toc1_6_2_1_'></a>[Find most inefficient users hoarding node RAM based on `expected_value_ram_hoarding_fraction_diff`](#toc0_)"
487+
"#### <a id='toc1_6_3_1_'></a>[Find most inefficient users hoarding node RAM based on `expected_value_ram_hoarding_fraction_diff`](#toc0_)"
451488
]
452489
},
453490
{
454491
"cell_type": "code",
455492
"execution_count": null,
456-
"id": "33",
493+
"id": "35",
457494
"metadata": {},
458495
"outputs": [],
459496
"source": [
@@ -476,16 +513,16 @@
476513
},
477514
{
478515
"cell_type": "markdown",
479-
"id": "34",
516+
"id": "36",
480517
"metadata": {},
481518
"source": [
482-
"#### <a id='toc1_6_2_2_'></a>[Find most inefficient users hoarding CPU cores based on `expected_value_core_hoarding_fraction_diff`](#toc0_)"
519+
"#### <a id='toc1_6_3_2_'></a>[Find most inefficient users hoarding CPU cores based on `expected_value_core_hoarding_fraction_diff`](#toc0_)"
483520
]
484521
},
485522
{
486523
"cell_type": "code",
487524
"execution_count": null,
488-
"id": "35",
525+
"id": "37",
489526
"metadata": {},
490527
"outputs": [],
491528
"source": [
@@ -508,7 +545,7 @@
508545
},
509546
{
510547
"cell_type": "markdown",
511-
"id": "36",
548+
"id": "38",
512549
"metadata": {},
513550
"source": [
514551
"## <a id='toc1_7_'></a>[PI Group Analysis](#toc0_)"
@@ -517,7 +554,7 @@
517554
{
518555
"cell_type": "code",
519556
"execution_count": null,
520-
"id": "37",
557+
"id": "39",
521558
"metadata": {},
522559
"outputs": [],
523560
"source": [
@@ -526,7 +563,7 @@
526563
},
527564
{
528565
"cell_type": "markdown",
529-
"id": "38",
566+
"id": "40",
530567
"metadata": {},
531568
"source": [
532569
"#### <a id='toc1_7_1_1_'></a>[Find Inefficient PIs based on `avg_requested_vram_efficiency_score`](#toc0_)"
@@ -535,7 +572,7 @@
535572
{
536573
"cell_type": "code",
537574
"execution_count": null,
538-
"id": "39",
575+
"id": "41",
539576
"metadata": {},
540577
"outputs": [],
541578
"source": [

notebooks/module_demos/Attribute Visualization.ipynb

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,8 @@
6464
"metadata": {},
6565
"outputs": [],
6666
"source": [
67-
"project_root = str(Path.cwd().resolve().parent.parent)\n",
68-
"print(f\"Project root: {project_root}\")"
67+
"project_root = Path.cwd().resolve().parent.parent\n",
68+
"print(f\"Project root: {project_root.name}\")"
6969
]
7070
},
7171
{
@@ -75,8 +75,8 @@
7575
"metadata": {},
7676
"outputs": [],
7777
"source": [
78-
"if project_root not in sys.path:\n",
79-
" sys.path.insert(0, project_root)"
78+
"if str(project_root) not in sys.path:\n",
79+
" sys.path.insert(0, str(project_root))"
8080
]
8181
},
8282
{
@@ -101,7 +101,7 @@
101101
"outputs": [],
102102
"source": [
103103
"from src.visualization import ColumnVisualizer\n",
104-
"from src.preprocess import preprocess_data\n",
104+
"from src.preprocess import Preprocess\n",
105105
"from src.database import DatabaseConnection"
106106
]
107107
},
@@ -121,7 +121,7 @@
121121
"outputs": [],
122122
"source": [
123123
"db_path = Path(project_root) / \"data/slurm_data.db\"\n",
124-
"db_connection = DatabaseConnection(str(db_path.resolve()))\n",
124+
"db_connection = DatabaseConnection(str(db_path.resolve()), anonymize=True)\n",
125125
"\n",
126126
"jobs_df = db_connection.fetch_all_jobs()"
127127
]
@@ -133,7 +133,10 @@
133133
"metadata": {},
134134
"outputs": [],
135135
"source": [
136-
"clean_jobs_df = preprocess_data(jobs_df, min_elapsed_seconds=600)\n",
136+
"clean_jobs_df = Preprocess().preprocess_data(\n",
137+
" jobs_df,\n",
138+
" min_elapsed_seconds=600,\n",
139+
" anonymize=True)\n",
137140
"display(clean_jobs_df)\n",
138141
"print(clean_jobs_df.shape)"
139142
]

src/visualization/columns.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1148,9 +1148,10 @@ def _generate_gpu_memory_usage_histogram_categorical_bins(
11481148
vram_labels = [str(v) for v in VRAM_CATEGORIES]
11491149

11501150
# Bin the data by closest category (floor to the largest category <= value)
1151-
bins = [-0.1] + VRAM_CATEGORIES # -0.1 to include 0 exactly
1152-
binned = pd.cut(col_data, bins=bins, labels=vram_labels, right=True, include_lowest=True)
1151+
bins = [-0.1] + VRAM_CATEGORIES # -0.1 to include 0 exactly
1152+
binned = pd.cut(col_data, bins=bins, labels=vram_labels, right=False, include_lowest=True)
11531153
binned[col_data == 0] = "0"
1154+
binned[col_data > max(VRAM_CATEGORIES)] = str(max(VRAM_CATEGORIES))
11541155

11551156
bin_counts = binned.value_counts(sort=False, dropna=False)
11561157
bin_percents = bin_counts / bin_counts.sum() * 100
@@ -1187,8 +1188,8 @@ def _generate_gpu_memory_usage_histogram_categorical_bins(
11871188
ax.set_xticks(x_ticks)
11881189
ax.set_xticklabels(vram_labels)
11891190
ax.set_xlabel("GPU Memory (GiB)")
1190-
ax.set_ylabel("Percent of Jobs")
1191-
ax.set_title(f"Histogram of GPU VRAM Usage ({col})")
1191+
ax.set_ylabel("Percentage of Jobs")
1192+
ax.set_title("Histogram of GPU VRAM Usage")
11921193
plt.grid(axis="y", linestyle="--", alpha=0.5)
11931194

11941195
# --- Bar labels with gap above tallest label ---

src/visualization/efficiency_metrics.py

Lines changed: 71 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""
44

55
from abc import ABC
6+
import math
67
from pathlib import Path
78
from typing import Any, cast
89

@@ -64,6 +65,73 @@ def validate_visualize_kwargs(
6465
self.validate_figsize(col_kwargs.figsize)
6566
return col_kwargs
6667

68+
@staticmethod
69+
def _human_readable_value(val: object) -> str:
70+
"""Format numeric values human-readably.
71+
72+
Rules (assumptions where unspecified):
73+
- "Small" numbers (abs(value) < 1_000) -> always show two decimals, rounding UP (toward +infinity)
74+
e.g. 1.234 -> 1.24, 0.001 -> 0.01, -1.231 -> -1.23 (up toward +inf makes negative less negative)
75+
- Thousands (>= 1_000 and < 1_000_000) -> comma separated with no decimals (123,456)
76+
- Millions and above use suffix with two decimals: 12.35 M, 3.40 B, 1.00 T
77+
- Handles ints, floats, numpy numeric types; returns original repr for non-numerics.
78+
- NA/None -> 'NA'
79+
80+
Args:
81+
val (object): The value to format.
82+
83+
Returns:
84+
str: Human-readable formatted representation.
85+
"""
86+
if val is None or (isinstance(val, float) and math.isnan(val)):
87+
return "NA"
88+
# numpy / pandas NA
89+
try:
90+
import pandas as _pd # local import to avoid circular issues
91+
try:
92+
_tmp_val = val # help type checkers
93+
isna_func = getattr(_pd, "isna", None)
94+
if callable(isna_func) and isna_func(_tmp_val): # type: ignore[call-arg]
95+
return "NA"
96+
except TypeError: # Non-array-like objects may raise
97+
pass
98+
except Exception: # pragma: no cover - defensive
99+
pass
100+
if not isinstance(val, (int, float, np.integer, np.floating)):
101+
return str(val)
102+
# Cast to float for magnitude / operations
103+
fval = float(val)
104+
abs_val = abs(fval)
105+
# Small number branch
106+
if abs_val < 1_000:
107+
if fval >= 0:
108+
up = math.ceil(fval * 100) / 100.0
109+
else:
110+
# Up toward +infinity for negatives makes value less negative
111+
up = -math.ceil(-fval * 100) / 100.0
112+
if abs(up - int(up)) < 1e-9:
113+
return f"{int(up)}"
114+
return f"{up:.2f}".rstrip("0").rstrip(".")
115+
# Large number branches with suffixes
116+
suffixes = [
117+
(1_000_000_000_000, "T"),
118+
(1_000_000_000, "B"),
119+
(1_000_000, "M"),
120+
]
121+
for threshold, suffix in suffixes:
122+
if abs_val >= threshold:
123+
scaled = fval / threshold
124+
formatted = f"{scaled:.2f}"
125+
if formatted.endswith(".00"):
126+
formatted = formatted[:-3]
127+
else:
128+
# Trim a single trailing 0 if present (e.g., 1.50 -> 1.5) but keep at least one decimal
129+
if formatted.endswith("0"):
130+
formatted = formatted[:-1]
131+
return f"{formatted} {suffix}"
132+
# Thousands (no suffix) -> comma separated, no decimals
133+
return f"{int(round(fval)):,}"
134+
67135

68136
class JobsWithMetricsVisualizer(EfficiencyMetricsVisualizer):
69137
"""Visualizer for jobs with efficiency metrics.
@@ -173,7 +241,7 @@ def _format_col(col: str) -> str:
173241
return col
174242

175243
label_lines = [
176-
f"{_format_col(col)}: {val:.2f}"
244+
f"{_format_col(col)}: {EfficiencyMetricsVisualizer._human_readable_value(val)}"
177245
for col, val in zip(
178246
bar_label_columns,
179247
label_values_columns,
@@ -282,7 +350,7 @@ def _format_col(col: str) -> str:
282350
return col
283351

284352
label_lines = [
285-
f"{_format_col(col)}: {val:.2f}"
353+
f"{_format_col(col)}: {EfficiencyMetricsVisualizer._human_readable_value(val)}"
286354
for col, val in zip(
287355
bar_label_columns,
288356
label_values_columns,
@@ -682,7 +750,7 @@ def _format_col(col: str) -> str:
682750
return col
683751

684752
label_lines = [
685-
f"{_format_col(col)}: {val:.2f}"
753+
f"{_format_col(col)}: {EfficiencyMetricsVisualizer._human_readable_value(val)}"
686754
for col, val in zip(
687755
bar_label_columns,
688756
label_values_columns,

0 commit comments

Comments
 (0)