Skip to content

Commit 05bc380

Browse files
committed
Update GPU VRAM Usage histogram and make numbers in the ranking bar plot easier to read
1 parent bfb003d commit 05bc380

File tree

4 files changed

+144
-35
lines changed

4 files changed

+144
-35
lines changed

notebooks/analysis/Requested and Used VRAM.ipynb

Lines changed: 58 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,11 @@
2626
" - [Find most inefficient jobs hoarding node RAM based on `ram_hoarding_fraction_diff`](#toc1_5_2_1_) \n",
2727
" - [Find most inefficient jobs hoarding CPU cores based on `core_hoarding_fraction_diff`](#toc1_5_2_2_) \n",
2828
" - [User-Level Analysis](#toc1_6_) \n",
29-
" - [Find Inefficient Users based on `requested_vram_efficiency_score`](#toc1_6_1_) \n",
30-
" - [Generate all hoarding analysis metrics for users:](#toc1_6_2_) \n",
31-
" - [Find most inefficient users hoarding node RAM based on `expected_value_ram_hoarding_fraction_diff`](#toc1_6_2_1_) \n",
32-
" - [Find most inefficient users hoarding CPU cores based on `expected_value_core_hoarding_fraction_diff`](#toc1_6_2_2_) \n",
29+
" - [Find Inefficient Users based on `avg_requested_vram_efficiency_score`](#toc1_6_1_) \n",
30+
" - [Find Inefficient Users based on EV(alloc_vram_efficiency)](#toc1_6_2_) \n",
31+
" - [Generate all hoarding analysis metrics for users:](#toc1_6_3_) \n",
32+
" - [Find most inefficient users hoarding node RAM based on `expected_value_ram_hoarding_fraction_diff`](#toc1_6_3_1_) \n",
33+
" - [Find most inefficient users hoarding CPU cores based on `expected_value_core_hoarding_fraction_diff`](#toc1_6_3_2_) \n",
3334
" - [PI Group Analysis](#toc1_7_) \n",
3435
" - [Find Inefficient PIs based on `avg_requested_vram_efficiency_score`](#toc1_7_1_1_) \n",
3536
"\n",
@@ -325,7 +326,10 @@
325326
" metrics_df_name_enum=ResourceHoardingDataFrameNameEnum.JOBS_WITH_RESOURCE_HOARDING_METRICS,\n",
326327
" sorting_key=\"core_hoarding_fraction_diff\",\n",
327328
" ascending=False, # Sort in descending order\n",
328-
" filter_criteria={\"core_hoarding_fraction_diff\": {\"min\": 0, \"inclusive\": True}},\n",
329+
" filter_criteria={\n",
330+
" \"core_hoarding_fraction_diff\": {\"min\": 0, \"inclusive\": True},\n",
331+
" \"job_count\": {\"min\": 15, \"inclusive\": True},\n",
332+
" },\n",
329333
")\n",
330334
"\n",
331335
"# Plot top inefficient jobs by CPU core hoarding fraction, with CPU core hoarding fraction as labels\n",
@@ -360,7 +364,7 @@
360364
"id": "26",
361365
"metadata": {},
362366
"source": [
363-
"### <a id='toc1_6_1_'></a>[Find Inefficient Users based on `requested_vram_efficiency_score`](#toc0_)"
367+
"### <a id='toc1_6_1_'></a>[Find Inefficient Users based on `avg_requested_vram_efficiency_score`](#toc0_)"
364368
]
365369
},
366370
{
@@ -376,7 +380,7 @@
376380
" ascending=True, # Sort by avg_requested_vram_efficiency_score in ascending order\n",
377381
" filter_criteria={\n",
378382
" \"avg_requested_vram_efficiency_score\": {\"max\": -10, \"inclusive\": True}, # score threshold\n",
379-
" \"job_count\": {\"min\": 5, \"inclusive\": True}, # minimum job count threshold\n",
383+
" \"job_count\": {\"min\": 15, \"inclusive\": True}, # minimum job count threshold\n",
380384
" },\n",
381385
")\n",
382386
"# Plot top inefficient users by Avg Requested VRAM Efficiency Score, with avg_requested_vram_efficiency_score as labels\n",
@@ -402,10 +406,43 @@
402406
")"
403407
]
404408
},
409+
{
410+
"cell_type": "markdown",
411+
"id": "29",
412+
"metadata": {},
413+
"source": [
414+
"### <a id='toc1_6_2_'></a>[Find Inefficient Users based on EV(alloc_vram_efficiency)](#toc0_)"
415+
]
416+
},
405417
{
406418
"cell_type": "code",
407419
"execution_count": null,
408-
"id": "29",
420+
"id": "30",
421+
"metadata": {},
422+
"outputs": [],
423+
"source": [
424+
"inefficient_users_avg_req_vram_eff_score = analyzer.sort_and_filter_records_with_metrics(\n",
425+
" metrics_df_name_enum=ResourceHoardingDataFrameNameEnum.USERS,\n",
426+
" sorting_key=\"expected_value_requested_vram_efficiency\",\n",
427+
" ascending=True, # Sort by expected_value_requested_vram_efficiency in ascending order\n",
428+
" filter_criteria={\n",
429+
" \"job_count\": {\"min\": 15, \"inclusive\": True}, # minimum job count threshold\n",
430+
" },\n",
431+
")\n",
432+
"# Plot top inefficient users by Expected Value Requested VRAM Efficiency\n",
433+
"users_with_metrics_visualizer = UsersWithMetricsVisualizer(inefficient_users_avg_req_vram_eff_score.head(10))\n",
434+
"users_with_metrics_visualizer.visualize(\n",
435+
" column=\"expected_value_requested_vram_efficiency\",\n",
436+
" bar_label_columns=[\"vram_hours\", \"job_count\"],\n",
437+
" figsize=(10, 6),\n",
438+
" anonymize=True,\n",
439+
")"
440+
]
441+
},
442+
{
443+
"cell_type": "code",
444+
"execution_count": null,
445+
"id": "31",
409446
"metadata": {},
410447
"outputs": [],
411448
"source": [
@@ -425,16 +462,16 @@
425462
},
426463
{
427464
"cell_type": "markdown",
428-
"id": "30",
465+
"id": "32",
429466
"metadata": {},
430467
"source": [
431-
"### <a id='toc1_6_2_'></a>[Generate all hoarding analysis metrics for users:](#toc0_)"
468+
"### <a id='toc1_6_3_'></a>[Generate all hoarding analysis metrics for users:](#toc0_)"
432469
]
433470
},
434471
{
435472
"cell_type": "code",
436473
"execution_count": null,
437-
"id": "31",
474+
"id": "33",
438475
"metadata": {},
439476
"outputs": [],
440477
"source": [
@@ -444,16 +481,16 @@
444481
},
445482
{
446483
"cell_type": "markdown",
447-
"id": "32",
484+
"id": "34",
448485
"metadata": {},
449486
"source": [
450-
"#### <a id='toc1_6_2_1_'></a>[Find most inefficient users hoarding node RAM based on `expected_value_ram_hoarding_fraction_diff`](#toc0_)"
487+
"#### <a id='toc1_6_3_1_'></a>[Find most inefficient users hoarding node RAM based on `expected_value_ram_hoarding_fraction_diff`](#toc0_)"
451488
]
452489
},
453490
{
454491
"cell_type": "code",
455492
"execution_count": null,
456-
"id": "33",
493+
"id": "35",
457494
"metadata": {},
458495
"outputs": [],
459496
"source": [
@@ -476,16 +513,16 @@
476513
},
477514
{
478515
"cell_type": "markdown",
479-
"id": "34",
516+
"id": "36",
480517
"metadata": {},
481518
"source": [
482-
"#### <a id='toc1_6_2_2_'></a>[Find most inefficient users hoarding CPU cores based on `expected_value_core_hoarding_fraction_diff`](#toc0_)"
519+
"#### <a id='toc1_6_3_2_'></a>[Find most inefficient users hoarding CPU cores based on `expected_value_core_hoarding_fraction_diff`](#toc0_)"
483520
]
484521
},
485522
{
486523
"cell_type": "code",
487524
"execution_count": null,
488-
"id": "35",
525+
"id": "37",
489526
"metadata": {},
490527
"outputs": [],
491528
"source": [
@@ -508,7 +545,7 @@
508545
},
509546
{
510547
"cell_type": "markdown",
511-
"id": "36",
548+
"id": "38",
512549
"metadata": {},
513550
"source": [
514551
"## <a id='toc1_7_'></a>[PI Group Analysis](#toc0_)"
@@ -517,7 +554,7 @@
517554
{
518555
"cell_type": "code",
519556
"execution_count": null,
520-
"id": "37",
557+
"id": "39",
521558
"metadata": {},
522559
"outputs": [],
523560
"source": [
@@ -526,7 +563,7 @@
526563
},
527564
{
528565
"cell_type": "markdown",
529-
"id": "38",
566+
"id": "40",
530567
"metadata": {},
531568
"source": [
532569
"#### <a id='toc1_7_1_1_'></a>[Find Inefficient PIs based on `avg_requested_vram_efficiency_score`](#toc0_)"
@@ -535,7 +572,7 @@
535572
{
536573
"cell_type": "code",
537574
"execution_count": null,
538-
"id": "39",
575+
"id": "41",
539576
"metadata": {},
540577
"outputs": [],
541578
"source": [

notebooks/module_demos/Attribute Visualization.ipynb

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,8 @@
6464
"metadata": {},
6565
"outputs": [],
6666
"source": [
67-
"project_root = str(Path.cwd().resolve().parent.parent)\n",
68-
"print(f\"Project root: {project_root}\")"
67+
"project_root = Path.cwd().resolve().parent.parent\n",
68+
"print(f\"Project root: {project_root.name}\")"
6969
]
7070
},
7171
{
@@ -75,8 +75,8 @@
7575
"metadata": {},
7676
"outputs": [],
7777
"source": [
78-
"if project_root not in sys.path:\n",
79-
" sys.path.insert(0, project_root)"
78+
"if str(project_root) not in sys.path:\n",
79+
" sys.path.insert(0, str(project_root))"
8080
]
8181
},
8282
{
@@ -101,7 +101,7 @@
101101
"outputs": [],
102102
"source": [
103103
"from src.visualization import ColumnVisualizer\n",
104-
"from src.preprocess import preprocess_data\n",
104+
"from src.preprocess import Preprocess\n",
105105
"from src.database import DatabaseConnection"
106106
]
107107
},
@@ -121,7 +121,7 @@
121121
"outputs": [],
122122
"source": [
123123
"db_path = Path(project_root) / \"data/slurm_data.db\"\n",
124-
"db_connection = DatabaseConnection(str(db_path.resolve()))\n",
124+
"db_connection = DatabaseConnection(str(db_path.resolve()), anonymize=True)\n",
125125
"\n",
126126
"jobs_df = db_connection.fetch_all_jobs()"
127127
]
@@ -133,7 +133,10 @@
133133
"metadata": {},
134134
"outputs": [],
135135
"source": [
136-
"clean_jobs_df = preprocess_data(jobs_df, min_elapsed_seconds=600)\n",
136+
"clean_jobs_df = Preprocess().preprocess_data(\n",
137+
" jobs_df,\n",
138+
" min_elapsed_seconds=600,\n",
139+
" anonymize=True)\n",
137140
"display(clean_jobs_df)\n",
138141
"print(clean_jobs_df.shape)"
139142
]

src/visualization/columns.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1148,9 +1148,10 @@ def _generate_gpu_memory_usage_histogram_categorical_bins(
11481148
vram_labels = [str(v) for v in VRAM_CATEGORIES]
11491149

11501150
# Bin the data by closest category (floor to the largest category <= value)
1151-
bins = [-0.1] + VRAM_CATEGORIES # -0.1 to include 0 exactly
1152-
binned = pd.cut(col_data, bins=bins, labels=vram_labels, right=True, include_lowest=True)
1151+
bins = [-0.1] + VRAM_CATEGORIES # -0.1 to include 0 exactly
1152+
binned = pd.cut(col_data, bins=bins, labels=vram_labels, right=False, include_lowest=True)
11531153
binned[col_data == 0] = "0"
1154+
binned[col_data > max(VRAM_CATEGORIES)] = str(max(VRAM_CATEGORIES))
11541155

11551156
bin_counts = binned.value_counts(sort=False, dropna=False)
11561157
bin_percents = bin_counts / bin_counts.sum() * 100
@@ -1187,8 +1188,8 @@ def _generate_gpu_memory_usage_histogram_categorical_bins(
11871188
ax.set_xticks(x_ticks)
11881189
ax.set_xticklabels(vram_labels)
11891190
ax.set_xlabel("GPU Memory (GiB)")
1190-
ax.set_ylabel("Percent of Jobs")
1191-
ax.set_title(f"Histogram of GPU VRAM Usage ({col})")
1191+
ax.set_ylabel("Percentage of Jobs")
1192+
ax.set_title("Histogram of GPU VRAM Usage")
11921193
plt.grid(axis="y", linestyle="--", alpha=0.5)
11931194

11941195
# --- Bar labels with gap above tallest label ---

src/visualization/efficiency_metrics.py

Lines changed: 71 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""
44

55
from abc import ABC
6+
import math
67
from pathlib import Path
78
from typing import Any, cast
89

@@ -64,6 +65,73 @@ def validate_visualize_kwargs(
6465
self.validate_figsize(col_kwargs.figsize)
6566
return col_kwargs
6667

68+
@staticmethod
69+
def _human_readable_value(val: object) -> str:
70+
"""Format numeric values human-readably.
71+
72+
Rules (assumptions where unspecified):
73+
- "Small" numbers (abs(value) < 1_000) -> always show two decimals, rounding UP (toward +infinity)
74+
e.g. 1.234 -> 1.24, 0.001 -> 0.01, -1.231 -> -1.23 (up toward +inf makes negative less negative)
75+
- Thousands (>= 1_000 and < 1_000_000) -> comma separated with no decimals (123,456)
76+
- Millions and above use suffix with two decimals: 12.35 M, 3.40 B, 1.00 T
77+
- Handles ints, floats, numpy numeric types; returns original repr for non-numerics.
78+
- NA/None -> 'NA'
79+
80+
Args:
81+
val (object): The value to format.
82+
83+
Returns:
84+
str: Human-readable formatted representation.
85+
"""
86+
if val is None or (isinstance(val, float) and math.isnan(val)):
87+
return "NA"
88+
# numpy / pandas NA
89+
try:
90+
import pandas as _pd # local import to avoid circular issues
91+
try:
92+
_tmp_val = val # help type checkers
93+
isna_func = getattr(_pd, "isna", None)
94+
if callable(isna_func) and isna_func(_tmp_val): # type: ignore[call-arg]
95+
return "NA"
96+
except TypeError: # Non-array-like objects may raise
97+
pass
98+
except Exception: # pragma: no cover - defensive
99+
pass
100+
if not isinstance(val, (int, float, np.integer, np.floating)):
101+
return str(val)
102+
# Cast to float for magnitude / operations
103+
fval = float(val)
104+
abs_val = abs(fval)
105+
# Small number branch
106+
if abs_val < 1_000:
107+
if fval >= 0:
108+
up = math.ceil(fval * 100) / 100.0
109+
else:
110+
# Up toward +infinity for negatives makes value less negative
111+
up = -math.ceil(-fval * 100) / 100.0
112+
if abs(up - int(up)) < 1e-9:
113+
return f"{int(up)}"
114+
return f"{up:.2f}".rstrip("0").rstrip(".")
115+
# Large number branches with suffixes
116+
suffixes = [
117+
(1_000_000_000_000, "T"),
118+
(1_000_000_000, "B"),
119+
(1_000_000, "M"),
120+
]
121+
for threshold, suffix in suffixes:
122+
if abs_val >= threshold:
123+
scaled = fval / threshold
124+
formatted = f"{scaled:.2f}"
125+
if formatted.endswith(".00"):
126+
formatted = formatted[:-3]
127+
else:
128+
# Trim a single trailing 0 if present (e.g., 1.50 -> 1.5) but keep at least one decimal
129+
if formatted.endswith("0"):
130+
formatted = formatted[:-1]
131+
return f"{formatted} {suffix}"
132+
# Thousands (no suffix) -> comma separated, no decimals
133+
return f"{int(round(fval)):,}"
134+
67135

68136
class JobsWithMetricsVisualizer(EfficiencyMetricsVisualizer):
69137
"""Visualizer for jobs with efficiency metrics.
@@ -173,7 +241,7 @@ def _format_col(col: str) -> str:
173241
return col
174242

175243
label_lines = [
176-
f"{_format_col(col)}: {val:.2f}"
244+
f"{_format_col(col)}: {EfficiencyMetricsVisualizer._human_readable_value(val)}"
177245
for col, val in zip(
178246
bar_label_columns,
179247
label_values_columns,
@@ -282,7 +350,7 @@ def _format_col(col: str) -> str:
282350
return col
283351

284352
label_lines = [
285-
f"{_format_col(col)}: {val:.2f}"
353+
f"{_format_col(col)}: {EfficiencyMetricsVisualizer._human_readable_value(val)}"
286354
for col, val in zip(
287355
bar_label_columns,
288356
label_values_columns,
@@ -682,7 +750,7 @@ def _format_col(col: str) -> str:
682750
return col
683751

684752
label_lines = [
685-
f"{_format_col(col)}: {val:.2f}"
753+
f"{_format_col(col)}: {EfficiencyMetricsVisualizer._human_readable_value(val)}"
686754
for col, val in zip(
687755
bar_label_columns,
688756
label_values_columns,

0 commit comments

Comments
 (0)