|
5 | 5 | "id": "0", |
6 | 6 | "metadata": {}, |
7 | 7 | "source": [ |
8 | | - "# <a id='toc1_'></a>[Analysis of Jobs that Requested and Used VRAM](#toc0_)\n", |
| 8 | + "# <a id='toc1_'></a>[Analysis of GPU Jobs that Requested and Used VRAM](#toc0_)\n", |
9 | 9 | "This notebook generates the analysis for jobs that requested some VRAM and run on partitions that their type is GPU and some GPU VRAM is used. It looks at these jobs, corresponding users, and PI groups." |
10 | 10 | ] |
11 | 11 | }, |
|
15 | 15 | "metadata": {}, |
16 | 16 | "source": [ |
17 | 17 | "**Table of contents**<a id='toc0_'></a> \n", |
18 | | - "- [Analysis of Jobs that Requested and Used VRAM](#toc1_) \n", |
| 18 | + "- [Analysis of GPU Jobs that Requested and Used VRAM](#toc1_) \n", |
19 | 19 | " - [Setup](#toc1_1_) \n", |
20 | 20 | " - [Data Digestion and Preprocessing](#toc1_2_) \n", |
21 | 21 | " - [Narrowing Dataset to Relevant Partition](#toc1_3_) \n", |
|
61 | 61 | "# Import required modules\n", |
62 | 62 | "import sys\n", |
63 | 63 | "from pathlib import Path\n", |
64 | | - "import pandas as pd\n", |
65 | | - "import matplotlib.pyplot as plt\n", |
66 | | - "import seaborn as sns" |
| 64 | + "import pandas as pd" |
67 | 65 | ] |
68 | 66 | }, |
69 | 67 | { |
|
102 | 100 | "\n", |
103 | 101 | "from src.analysis import ResourceHoarding\n", |
104 | 102 | "from src.analysis import efficiency_analysis as ea\n", |
105 | | - "from src.visualization import JobsWithMetricsVisualizer, UsersWithMetricsVisualizer, PIGroupsWithMetricsVisualizer\n", |
| 103 | + "from src.visualization import (\n", |
| 104 | + " JobsWithMetricsVisualizer,\n", |
| 105 | + " UsersWithMetricsVisualizer,\n", |
| 106 | + " PIGroupsWithMetricsVisualizer,\n", |
| 107 | + ")\n", |
106 | 108 | "from src.config.enum_constants import ResourceHoardingDataFrameNameEnum\n", |
107 | | - "from src.config.paths import PI_GROUPS_VISUALIZATION_DATA_DIR" |
| 109 | + "from src.config.paths import (\n", |
| 110 | + " JOBS_VISUALIZATION_DATA_DIR,\n", |
| 111 | + " USERS_VISUALIZATION_DATA_DIR,\n", |
| 112 | + " PI_GROUPS_VISUALIZATION_DATA_DIR,\n", |
| 113 | + ")" |
108 | 114 | ] |
109 | 115 | }, |
110 | 116 | { |
|
239 | 245 | "# Plot top inefficient jobs by requested VRAM efficiency score, with VRAM-hours as labels\n", |
240 | 246 | "jobs_with_metrics_visualizer = JobsWithMetricsVisualizer(inefficient_jobs_vram_hours.head(10))\n", |
241 | 247 | "jobs_with_metrics_visualizer.visualize(\n", |
242 | | - " column=\"requested_vram_efficiency_score\", bar_label_columns=[\"vram_hours\", \"allocated_vram\"], figsize=(10, 6)\n", |
| 248 | + " output_dir_path=JOBS_VISUALIZATION_DATA_DIR,\n", |
| 249 | + " column=\"requested_vram_efficiency_score\",\n", |
| 250 | + " bar_label_columns=[\"vram_hours\", \"allocated_vram\"],\n", |
| 251 | + " figsize=(10, 6),\n", |
243 | 252 | ")" |
244 | 253 | ] |
245 | 254 | }, |
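Note on the `visualize` change above: the call now takes an explicit `output_dir_path` (here `JOBS_VISUALIZATION_DATA_DIR`), which suggests the visualizer persists its figures to disk rather than only rendering them inline. A minimal sketch of that pattern, assuming matplotlib under the hood; `save_bar_chart` is a hypothetical helper, not the repo's actual API:

```python
# Minimal sketch of a plot-and-save routine keyed on output_dir_path.
# Assumptions: matplotlib backs the plotting; save_bar_chart is hypothetical.
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd


def save_bar_chart(df: pd.DataFrame, column: str, output_dir_path: Path, figsize=(10, 6)) -> Path:
    """Plot df[column] as a bar chart and write the figure under output_dir_path."""
    output_dir = Path(output_dir_path)
    output_dir.mkdir(parents=True, exist_ok=True)  # create the target directory if needed
    fig, ax = plt.subplots(figsize=figsize)
    df[column].plot.bar(ax=ax)
    ax.set_ylabel(column)
    fig.tight_layout()
    out_file = output_dir / f"{column}.png"
    fig.savefig(out_file, dpi=150)
    plt.close(fig)  # free the figure after saving
    return out_file
```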
|
399 | 408 | "metadata": {}, |
400 | 409 | "outputs": [], |
401 | 410 | "source": [ |
402 | | - "# Distribution of Avg Requested VRAM Efficiency Score (actual values; all are <= 0)\n", |
403 | | - "# We keep scores as-is (negative or zero) and construct bins that respect the skew while\n", |
404 | | - "# still giving higher resolution near zero using log-spaced absolute values mapped back to negatives.\n", |
405 | | - "scores = inefficient_users_avg_req_vram_eff_score[\"avg_requested_vram_efficiency_score\"].dropna()\n", |
406 | | - "print(len(scores), \"scores found for plotting.\")\n", |
407 | | - "if scores.empty:\n", |
408 | | - " print(\"No scores to plot.\")\n", |
409 | | - "else:\n", |
410 | | - " # If all scores are exactly zero, a histogram is not informative\n", |
411 | | - " if (scores != 0).sum() == 0:\n", |
412 | | - " print(\"All scores are zero; histogram not informative.\")\n", |
413 | | - " else:\n", |
414 | | - " import numpy as np\n", |
415 | | - "\n", |
416 | | - " fig, ax = plt.subplots(figsize=(8, 5))\n", |
417 | | - "\n", |
418 | | - " # Separate negatives (expected) from zeros\n", |
419 | | - " neg_scores = scores[scores < 0]\n", |
420 | | - " zero_scores = scores[scores == 0]\n", |
421 | | - "\n", |
422 | | - " min_abs = None # track smallest non-zero absolute value for symlog threshold\n", |
423 | | - "\n", |
424 | | - " # Build bins: if we have negative values, create log-spaced absolute edges then map back\n", |
425 | | - " if not neg_scores.empty:\n", |
426 | | - " n_bins = 100\n", |
427 | | - " min_abs = neg_scores.abs().min()\n", |
428 | | - " max_abs = neg_scores.abs().max()\n", |
429 | | - " if min_abs == max_abs:\n", |
430 | | - " # Degenerate case: all negative values identical -> fall back to linear bins\n", |
431 | | - " bins = np.linspace(neg_scores.min(), 0, 20)\n", |
432 | | - " else:\n", |
433 | | - " abs_edges = np.logspace(np.log10(min_abs), np.log10(max_abs), n_bins)\n", |
434 | | - " # Convert absolute edges to negative edges (descending), then append 0 as the last edge\n", |
435 | | - " neg_edges = -abs_edges[::-1]\n", |
436 | | - " bins = np.unique(np.concatenate([neg_edges, [0]])) # ensure strictly increasing\n", |
437 | | - " else:\n", |
438 | | - " # No negative values (only zeros) already handled earlier; fallback just in case\n", |
439 | | - " bins = 3\n", |
440 | | - "\n", |
441 | | - " sns.histplot(scores, bins=bins, color=\"#1f77b4\", ax=ax)\n", |
442 | | - " ax.set_xlabel(\"Avg Requested VRAM Efficiency Score (<= 0)\")\n", |
443 | | - " ax.set_ylabel(\"Count\")\n", |
444 | | - " ax.set_title(\"Distribution of Avg Requested VRAM Efficiency Scores (Actual Values, Log X)\")\n", |
445 | | - "\n", |
446 | | - " # Apply symmetrical log scale to x-axis to compress the long negative tail while keeping zero.\n", |
447 | | - " # linthresh defines the range around zero that stays linear; choose smallest non-zero magnitude.\n", |
448 | | - " if min_abs is not None and min_abs > 0:\n", |
449 | | - " linthresh = min_abs\n", |
450 | | - " else:\n", |
451 | | - " linthresh = 1e-6 # fallback small threshold\n", |
452 | | - " ax.set_xscale(\"symlog\", linthresh=linthresh, linscale=1.0, base=10)\n", |
453 | | - "\n", |
454 | | - " # Annotation: counts (negative & zero) and total\n", |
455 | | - " neg_count = (scores < 0).sum()\n", |
456 | | - " zero_count = (scores == 0).sum()\n", |
457 | | - " total = len(scores)\n", |
458 | | - " ax.text(\n", |
459 | | - " 0.98,\n", |
460 | | - " 0.95,\n", |
461 | | - " (f\"Counts:\\nNegative: {neg_count}\\nZero: {zero_count}\\n# of Users: {total}\"),\n", |
462 | | - " transform=ax.transAxes,\n", |
463 | | - " ha=\"right\",\n", |
464 | | - " va=\"top\",\n", |
465 | | - " fontsize=9,\n", |
466 | | - " bbox=dict(boxstyle=\"round,pad=0.3\", fc=\"white\", ec=\"gray\", alpha=0.9),\n", |
467 | | - " )\n", |
468 | | - "\n", |
469 | | - " # Cumulative distribution (CDF) over actual score values\n", |
470 | | - " counts, bin_edges = np.histogram(scores, bins=bins)\n", |
471 | | - " cdf = np.cumsum(counts) / counts.sum()\n", |
472 | | - " mids = (bin_edges[1:] + bin_edges[:-1]) / 2\n", |
473 | | - " ax2 = ax.twinx()\n", |
474 | | - " ax2.plot(mids, cdf, color=\"crimson\", marker=\"o\", linestyle=\"-\", linewidth=1, markersize=3)\n", |
475 | | - " ax2.set_ylim(0, 1)\n", |
476 | | - " ax2.set_ylabel(\"Cumulative Fraction\", color=\"crimson\")\n", |
477 | | - " ax2.tick_params(axis=\"y\", colors=\"crimson\")\n", |
478 | | - "\n", |
479 | | - " # Ensure x-axis spans to (slightly) include zero for clarity\n", |
480 | | - " left, right = ax.get_xlim()\n", |
481 | | - " if right < 0:\n", |
482 | | - " ax.set_xlim(left, 0)\n", |
483 | | - "\n", |
484 | | - " plt.tight_layout()\n", |
485 | | - " plt.show()\n", |
486 | | - "\n", |
487 | | - "# Notes:\n", |
488 | | - "# - We plot the actual (negative/zero) scores instead of absolute values.\n", |
489 | | - "# - symlog x-scale provides a log-like compression for large negative magnitudes while keeping zero.\n", |
490 | | - "# - linthresh picks the smallest non-zero magnitude so near-zero structure is visible.\n", |
491 | | - "# - CDF is computed over actual values to show accumulation from most negative toward zero." |
| 411 | + "users_with_metrics_visualizer = UsersWithMetricsVisualizer(inefficient_users_avg_req_vram_eff_score)\n", |
| 412 | + "users_with_metrics_visualizer.visualize_metric_distribution(\n", |
| 413 | + " output_dir_path=USERS_VISUALIZATION_DATA_DIR, column=\"avg_requested_vram_efficiency_score\", figsize=(8, 5)\n", |
| 414 | + ")" |
492 | 415 | ] |
493 | 416 | }, |
494 | 417 | { |
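The deleted cell's symlog histogram with CDF overlay appears to have moved behind `UsersWithMetricsVisualizer.visualize_metric_distribution`. The method name and arguments come from the diff; the body below is only a reconstruction of the removed notebook code, condensed into one self-contained function, not the repo's implementation:

```python
# Sketch reconstructed from the removed cell: histogram of non-positive scores with
# log-spaced negative bins, a symlog x-axis, and a CDF overlay.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


def plot_metric_distribution(scores: pd.Series, figsize=(8, 5)) -> None:
    scores = scores.dropna()
    neg = scores[scores < 0]
    if neg.empty:
        print("No negative scores; histogram not informative.")
        return
    # Log-spaced absolute edges mapped back to negatives keep resolution near zero.
    min_abs, max_abs = neg.abs().min(), neg.abs().max()
    if min_abs == max_abs:
        bins = np.linspace(neg.min(), 0, 20)  # all negatives identical: linear fallback
    else:
        abs_edges = np.logspace(np.log10(min_abs), np.log10(max_abs), 100)
        bins = np.unique(np.concatenate([-abs_edges[::-1], [0.0]]))
    fig, ax = plt.subplots(figsize=figsize)
    sns.histplot(scores, bins=bins, color="#1f77b4", ax=ax)
    # symlog compresses the long negative tail while keeping zero on the axis.
    ax.set_xscale("symlog", linthresh=float(min_abs), linscale=1.0, base=10)
    ax.set_xlabel("Avg Requested VRAM Efficiency Score (<= 0)")
    ax.set_ylabel("Count")
    # CDF over the same bins shows accumulation from most negative toward zero.
    counts, edges = np.histogram(scores, bins=bins)
    ax2 = ax.twinx()
    ax2.plot((edges[1:] + edges[:-1]) / 2, np.cumsum(counts) / counts.sum(), color="crimson")
    ax2.set_ylim(0, 1)
    ax2.set_ylabel("Cumulative Fraction", color="crimson")
    plt.tight_layout()
    plt.show()
```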
|