diff --git a/notebooks/Efficiency Analysis.ipynb b/notebooks/Efficiency Analysis.ipynb
index c308329..3cef967 100644
--- a/notebooks/Efficiency Analysis.ipynb
+++ b/notebooks/Efficiency Analysis.ipynb
@@ -1,618 +1,619 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "0",
- "metadata": {},
- "source": [
- "# [Efficiency Analysis](#toc0_)\n",
- "This notebook demonstrates the use of `EfficiencyAnalysis` class in `src/analysis/efficiency_analysis.py` for analyzing the efficiency of jobs, users, and PI groups."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "1",
- "metadata": {},
- "source": [
- "**Table of contents** \n",
- "- [Efficiency Analysis](#toc1_) \n",
- " - [Setup](#toc1_1_) \n",
- " - [Example: Analyze workload efficiency of GPU users who set no VRAM constraints and used 0 GB of VRAM](#toc1_2_) \n",
- " - [Job Efficiency Metrics](#toc1_2_1_) \n",
- " - [Find most inefficient jobs with no VRAM constraints based on `vram_hours`](#toc1_2_1_1_) \n",
- " - [User Efficiency Metrics](#toc1_2_2_) \n",
- " - [Find Inefficient Users based on `expected_value_alloc_vram_efficiency`](#toc1_2_2_1_) \n",
- " - [Find Inefficient Users based on `vram_hours`](#toc1_2_2_2_) \n",
- " - [PI Group Efficiency Metrics](#toc1_2_3_) \n",
- " - [Find Inefficient PIs based on `vram_hours`](#toc1_2_3_1_) \n",
- " - [Example: Analyze all jobs with no VRAM constraints](#toc1_3_) \n",
- " - [Job Efficiency Metrics](#toc1_3_1_) \n",
- " - [Problem with duplicate JobIDs](#toc1_3_1_1_) \n",
- " - [Top users with most number of jobs that have no VRAM constraints](#toc1_3_1_2_) \n",
- " - [Find inefficient jobs with no VRAM Constraints based on `alloc_vram_efficiency_score`](#toc1_3_1_3_) \n",
- "\n",
- "\n",
- ""
- ]
- },
- {
- "cell_type": "markdown",
- "id": "2",
- "metadata": {},
- "source": [
- "## [Setup](#toc0_)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "3",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Import required modules\n",
- "import sys\n",
- "from pathlib import Path\n",
- "import pandas as pd\n",
- "import matplotlib.pyplot as plt\n",
- "import seaborn as sns"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "4",
- "metadata": {},
- "source": [
- "Jupyter server should be run at the notebook directory, so the output of the following cell would be the project root:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "5",
- "metadata": {},
- "outputs": [],
- "source": [
- "project_root = str(Path.cwd().resolve().parent)\n",
- "print(f\"Project root: {project_root}\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "6",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Add project root to sys.path for module imports\n",
- "if project_root not in sys.path:\n",
- " sys.path.insert(0, project_root)\n",
- "\n",
- "from src.analysis import efficiency_analysis as ea\n",
- "from src.visualization import JobsWithMetricsVisualizer, UsersWithMetricsVisualizer\n",
- "from src.utilities import load_and_preprocess_jobs\n",
- "# Automatically reload modules before executing code\n",
- "# This is useful for development to see changes without restarting the kernel.\n",
- "%load_ext autoreload\n",
- "# Reload all modules imported with %aimport every time before executing the Python code typed.\n",
- "%autoreload 2"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "7",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Load the jobs DataFrame from DuckDB\n",
- "preprocessed_jobs_df = load_and_preprocess_jobs(\n",
- " db_path=\"../data/slurm_data.db\",\n",
- " table_name=\"Jobs\",\n",
- ")\n",
- "display(preprocessed_jobs_df.head(10))\n",
- "print(preprocessed_jobs_df.shape)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "8",
- "metadata": {},
- "source": [
- "## [Example: Analyze workload efficiency of GPU users who set no VRAM constraints and used 0 GB of VRAM](#toc0_)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "9",
- "metadata": {},
- "outputs": [],
- "source": [
- "efficiency_analysis = ea.EfficiencyAnalysis(jobs_df=preprocessed_jobs_df)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "10",
- "metadata": {},
- "outputs": [],
- "source": [
- "filtered_jobs = efficiency_analysis.filter_jobs_for_analysis(\n",
- " vram_constraint_filter=pd.NA, # No VRAM constraints\n",
- " gpu_mem_usage_filter=0, # Used 0 GB of VRAM\n",
- ")\n",
- "filtered_jobs"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "11",
- "metadata": {},
- "source": [
- "Generate all metrics:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "12",
- "metadata": {},
- "outputs": [],
- "source": [
- "metrics_dict = efficiency_analysis.calculate_all_efficiency_metrics(filtered_jobs)\n",
- "\n",
- "jobs_with_metrics = metrics_dict[\"jobs_with_efficiency_metrics\"]\n",
- "users_with_metrics = metrics_dict[\"users_with_efficiency_metrics\"]\n",
- "pi_accounts_with_metrics = metrics_dict[\"pi_accounts_with_efficiency_metrics\"]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "13",
- "metadata": {},
- "source": [
- "### [Job Efficiency Metrics](#toc0_)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "14",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Set option to display all columns\n",
- "pd.set_option(\"display.max_columns\", None)\n",
- "# Display the DataFrame\n",
- "display(jobs_with_metrics.head(10))\n",
- "# To revert to default settings (optional)\n",
- "pd.reset_option(\"display.max_columns\")\n",
- "\n",
- "print(f\"Jobs found: {len(jobs_with_metrics)}\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "15",
- "metadata": {},
- "source": [
- "#### [Find most inefficient jobs with no VRAM constraints based on `vram_hours`](#toc0_)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "16",
- "metadata": {},
- "outputs": [],
- "source": [
- "inefficient_jobs_vram_hours = efficiency_analysis.sort_and_filter_records_with_metrics(\n",
- " metrics_df_name_enum=ea.MetricsDataFrameNameEnum.JOBS,\n",
- " sorting_key=\"vram_hours\",\n",
- " ascending=False, # Sort by vram_hours in descending order\n",
- " filter_criteria={\n",
- " \"vram_hours\": {\"min\": 80 * 24, \"inclusive\": True}, # VRAM-hours threshold for identifying inefficient jobs\n",
- " },\n",
- ")\n",
- "# Display top inefficient users by VRAM-hours\n",
- "print(\"\\nTop inefficient Jobs by VRAM-hours:\")\n",
- "display(inefficient_jobs_vram_hours.head(10))\n",
- "\n",
- "# Plot top inefficient jobs by VRAM-hours, with VRAM-hours as labels\n",
- "jobs_with_metrics_visualizer = JobsWithMetricsVisualizer(inefficient_jobs_vram_hours.head(20))\n",
- "jobs_with_metrics_visualizer.visualize(\n",
- " column=\"vram_hours\",\n",
- " bar_label_columns=[\"vram_hours\", \"job_hours\"],\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "17",
- "metadata": {},
- "source": [
- "### [User Efficiency Metrics](#toc0_)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "18",
- "metadata": {},
- "outputs": [],
- "source": [
- "users_with_metrics"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "19",
- "metadata": {},
- "source": [
- "#### [Find Inefficient Users based on `expected_value_alloc_vram_efficiency`](#toc0_)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "20",
- "metadata": {},
- "outputs": [],
- "source": [
- "inefficient_users_alloc_vram_eff = efficiency_analysis.sort_and_filter_records_with_metrics(\n",
- " metrics_df_name_enum=ea.MetricsDataFrameNameEnum.USERS,\n",
- " sorting_key=\"expected_value_alloc_vram_efficiency\",\n",
- " ascending=True, # we want to find users with low efficiency\n",
- " filter_criteria={\n",
- " \"expected_value_alloc_vram_efficiency\": {\"max\": 0.3, \"inclusive\": True},\n",
- " \"job_count\": {\"min\": 5, \"inclusive\": True}, # Minimum number of jobs to consider a user\n",
- " },\n",
- ")\n",
- "print(\"\\nTop inefficient users by allocated vram efficiency:\")\n",
- "display(inefficient_users_alloc_vram_eff.head(20))\n",
- "\n",
- "# Plot top inefficient users by allocated vram efficiency, with allocated vram efficiency as labels\n",
- "users_with_metrics_visualizer = UsersWithMetricsVisualizer(inefficient_users_alloc_vram_eff.head(20))\n",
- "users_with_metrics_visualizer.visualize(\n",
- " column=\"expected_value_alloc_vram_efficiency\",\n",
- " bar_label_columns=[\"expected_value_alloc_vram_efficiency\", \"user_job_hours\"],\n",
- " figsize=(8, 10),\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "21",
- "metadata": {},
- "outputs": [],
- "source": [
- "inefficient_users = efficiency_analysis.sort_and_filter_records_with_metrics(\n",
- " metrics_df_name_enum=ea.MetricsDataFrameNameEnum.USERS,\n",
- " sorting_key=\"expected_value_alloc_vram_efficiency\",\n",
- " ascending=True, # we want to find users with low efficiency\n",
- " filter_criteria={\n",
- " \"expected_value_alloc_vram_efficiency\": {\"max\": 0.3, \"inclusive\": True},\n",
- " \"job_count\": {\"min\": 5, \"inclusive\": True}, # Minimum number of jobs to consider a user\n",
- " },\n",
- ")\n",
- "\n",
- "# Display top inefficient users by job count\n",
- "print(\"\\nTop inefficient users by allocated vram efficiency:\")\n",
- "display(inefficient_users.head(10))\n",
- "\n",
- "\n",
- "# Plot top inefficient users by GPU hours, with efficiency as labels\n",
- "top_users = inefficient_users.head(10)\n",
- "\n",
- "plt.figure(figsize=(8, 5))\n",
- "barplot = sns.barplot(y=top_users[\"User\"], x=top_users[\"user_job_hours\"], orient=\"h\")\n",
- "plt.xlabel(\"Job Hours\")\n",
- "plt.ylabel(\"User\")\n",
- "plt.title(\"Top 10 Inefficient Users by Allocated VRAM Efficiency Contribution\")\n",
- "\n",
- "# Annotate bars with expected_value_alloc_vram_efficiency, keeping text fully inside the plot's right spine\n",
- "ax = barplot\n",
- "xmax = top_users[\"user_job_hours\"].max()\n",
- "# Add headroom for annotation space (20% extra)\n",
- "xlim = xmax * 1.20 if xmax > 0 else 1\n",
- "ax.set_xlim(0, xlim)\n",
- "\n",
- "# Calculate annotation x-position: place at 98% of xlim or just left of the right spine, whichever is smaller\n",
- "for i, (job_hours, efficiency) in enumerate(\n",
- " zip(\n",
- " top_users[\"user_job_hours\"],\n",
- " top_users[\"expected_value_alloc_vram_efficiency\"],\n",
- " strict=True,\n",
- " )\n",
- "):\n",
- " # Place annotation at min(job_hours + 2% of xlim, 98% of xlim)\n",
- " xpos = min(job_hours + xlim * 0.02, xlim * 0.98)\n",
- " # If bar is very close to right spine, nudge annotation left to avoid overlap\n",
- " if xpos > xlim * 0.96:\n",
- " xpos = xlim * 0.96\n",
- " ax.text(xpos, i, f\"Eff: {efficiency:.2f}\", va=\"center\", ha=\"left\", fontsize=10, color=\"black\", clip_on=True)\n",
- "\n",
- "plt.tight_layout()\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "22",
- "metadata": {},
- "source": [
- "#### [Find Inefficient Users based on `vram_hours`](#toc0_)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "23",
- "metadata": {},
- "outputs": [],
- "source": [
- "inefficient_users_vram_hours = efficiency_analysis.find_inefficient_users_by_vram_hours(\n",
- " vram_hours_filter={\"min\": 200, \"inclusive\": True}, # VRAM-hours threshold for identifying inefficient users\n",
- " min_jobs=5, # Minimum number of jobs to consider a user\n",
- ")\n",
- "# Display top inefficient users by VRAM-hours\n",
- "print(\"\\nTop inefficient users by VRAM-hours:\")\n",
- "display(inefficient_users_vram_hours.head(20))\n",
- "\n",
- "\n",
- "# Plot top inefficient users by VRAM-hours, with VRAM-hours as labels\n",
- "users_with_metrics_visualizer = UsersWithMetricsVisualizer(inefficient_users_vram_hours.head(20))\n",
- "users_with_metrics_visualizer.visualize(\n",
- " column=\"vram_hours\", bar_label_columns=[\"vram_hours\", \"user_job_hours\"], figsize=(8, 10)\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "24",
- "metadata": {},
- "source": [
- "### [PI Group Efficiency Metrics](#toc0_)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "25",
- "metadata": {},
- "outputs": [],
- "source": [
- "pi_accounts_with_metrics"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "26",
- "metadata": {},
- "source": [
- "#### [Find Inefficient PIs based on `vram_hours`](#toc0_)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "27",
- "metadata": {},
- "outputs": [],
- "source": [
- "inefficient_pis_vram_hours = efficiency_analysis.sort_and_filter_records_with_metrics(\n",
- " metrics_df_name_enum=ea.MetricsDataFrameNameEnum.PI_GROUPS,\n",
- " sorting_key=\"pi_acc_vram_hours\",\n",
- " ascending=False,\n",
- " filter_criteria={\n",
- " \"pi_acc_vram_hours\": {\"min\": 200, \"inclusive\": True}, # VRAM-hours threshold for identifying inefficient users\n",
- " \"job_count\": {\"min\": 5, \"inclusive\": True}, # Minimum number of jobs to consider a PI account\n",
- " },\n",
- ")\n",
- "# Display top inefficient users by VRAM-hours\n",
- "print(\"\\nTop inefficient PI Groups by VRAM-hours:\")\n",
- "display(inefficient_pis_vram_hours.head(20))\n",
- "\n",
- "top_pi_accounts = inefficient_pis_vram_hours.head(20)\n",
- "\n",
- "# Plot top inefficient users by VRAM-hours, with VRAM-hours as labels\n",
- "plt.figure(figsize=(8, 8))\n",
- "barplot = sns.barplot(\n",
- " y=top_pi_accounts[\"pi_account\"],\n",
- " x=top_pi_accounts[\"pi_acc_vram_hours\"],\n",
- " order=top_pi_accounts[\"pi_account\"].tolist(), # Only show present values\n",
- " orient=\"h\",\n",
- ")\n",
- "plt.xlabel(\"VRAM-Hours\")\n",
- "plt.ylabel(\"PI Account\")\n",
- "plt.title(\"Top Inefficient PI Accounts by VRAM-Hours\")\n",
- "# Annotate bars with gpu_hours, keeping text fully inside the plot's right spine\n",
- "ax = barplot\n",
- "xmax = top_pi_accounts[\"pi_acc_vram_hours\"].max()\n",
- "# Add headroom for annotation space (20% extra)\n",
- "xlim = xmax * 1.6 if xmax > 0 else 1\n",
- "ax.set_xlim(0, xlim)\n",
- "# Calculate annotation x-position: place at 98% of xlim or just left of the right spine, whichever is smaller\n",
- "for i, (vram_hours, pi_acc_job_hours) in enumerate(\n",
- " zip(\n",
- " top_pi_accounts[\"pi_acc_vram_hours\"],\n",
- " top_pi_accounts[\"pi_acc_job_hours\"],\n",
- " strict=True,\n",
- " )\n",
- "):\n",
- " # Place annotation at min(vram_hours + 2% of xlim, 98% of xlim)\n",
- " xpos = min(vram_hours + xlim * 0.02, xlim * 0.98)\n",
- " ax.text(\n",
- " xpos,\n",
- " i,\n",
- " f\"VRAM-Hours: {vram_hours:.2f}\\n Job Hours: {pi_acc_job_hours:.2f}\",\n",
- " va=\"center\",\n",
- " ha=\"left\",\n",
- " fontsize=10,\n",
- " color=\"black\",\n",
- " clip_on=True,\n",
- " )\n",
- "plt.tight_layout()\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "28",
- "metadata": {},
- "source": [
- "## [Example: Analyze all jobs with no VRAM constraints](#toc0_)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "29",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Filter jobs where no VRAM constraint was set but a GPU was allocated\n",
- "no_vram_constraint_efficiency_analysis = ea.EfficiencyAnalysis(jobs_df=preprocessed_jobs_df)\n",
- "all_no_vram_constraint_jobs = no_vram_constraint_efficiency_analysis.filter_jobs_for_analysis(\n",
- " vram_constraint_filter={\"min\": 0, \"inclusive\": False}, # No VRAM constraints\n",
- " gpu_count_filter={\"min\": 1, \"inclusive\": True}, # At least one GPU allocated\n",
- " gpu_mem_usage_filter={\"min\": 0, \"inclusive\": False}, # Used more than 0 GiB of VRAM\n",
- ")\n",
- "\n",
- "display(all_no_vram_constraint_jobs.head(10))\n",
- "print(all_no_vram_constraint_jobs.shape)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "30",
- "metadata": {},
- "source": [
- "### [Job Efficiency Metrics](#toc0_)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "31",
- "metadata": {},
- "outputs": [],
- "source": [
- "no_vram_constraint_jobs_with_metrics = no_vram_constraint_efficiency_analysis.calculate_job_efficiency_metrics(\n",
- " all_no_vram_constraint_jobs\n",
- ")\n",
- "\n",
- "# Set option to display all columns\n",
- "pd.set_option(\"display.max_columns\", None)\n",
- "# Display the DataFrame\n",
- "display(no_vram_constraint_jobs_with_metrics.head(10))\n",
- "# To revert to default settings (optional)\n",
- "pd.reset_option(\"display.max_columns\")\n",
- "print(f\"Jobs found: {len(no_vram_constraint_jobs_with_metrics)}\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "32",
- "metadata": {},
- "source": [
- "#### [Problem with duplicate JobIDs](#toc0_)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "33",
- "metadata": {},
- "outputs": [],
- "source": [
- "# select jobs with specific job id\n",
- "pd.set_option(\"display.max_columns\", None)\n",
- "# Display the DataFrame\n",
- "display(no_vram_constraint_jobs_with_metrics[no_vram_constraint_jobs_with_metrics[\"JobID\"] == 24374463])\n",
- "pd.reset_option(\"display.max_columns\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "34",
- "metadata": {},
- "source": [
- "#### [Top users with most number of jobs that have no VRAM constraints](#toc0_)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "35",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Plot top users by number of jobs with no VRAM constraints\n",
- "if not all_no_vram_constraint_jobs.empty:\n",
- " plt.figure(figsize=(10, 5))\n",
- " user_counts = all_no_vram_constraint_jobs[\"User\"].value_counts().head(20)\n",
- " sns.barplot(x=user_counts.values, y=user_counts.index, orient=\"h\")\n",
- " plt.xlabel(\"Number of Jobs\")\n",
- " plt.ylabel(\"User\")\n",
- " plt.title(\"Top 20 Users: Jobs with no VRAM Constraints\")\n",
- " plt.tight_layout()\n",
- " plt.show()\n",
- "else:\n",
- " print(\"No jobs found without VRAM constraints.\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "36",
- "metadata": {},
- "source": [
- "#### [Find inefficient jobs with no VRAM Constraints based on `alloc_vram_efficiency_score`](#toc0_)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "37",
- "metadata": {},
- "outputs": [],
- "source": [
- "low_alloc_vram_score_jobs = no_vram_constraint_efficiency_analysis.sort_and_filter_records_with_metrics(\n",
- " metrics_df_name_enum=ea.MetricsDataFrameNameEnum.JOBS,\n",
- " sorting_key=\"alloc_vram_efficiency_score\",\n",
- " ascending=True, # Sort by alloc_vram_efficiency_score in ascending order\n",
- " filter_criteria={\n",
- " \"alloc_vram_efficiency_score\": {\"max\": -10, \"inclusive\": True}, # score threshold\n",
- " },\n",
- ")\n",
- "# Display top inefficient users by alloc_vram_efficiency_score\n",
- "print(\"\\nTop inefficient Jobs by allocated VRAM efficiency score:\")\n",
- "\n",
- "display(low_alloc_vram_score_jobs.head(20))\n",
- "\n",
- "jobs_with_metrics_visualizer = JobsWithMetricsVisualizer(low_alloc_vram_score_jobs.head(20))\n",
- "jobs_with_metrics_visualizer.visualize(\n",
- " column=\"alloc_vram_efficiency_score\",\n",
- " bar_label_columns=[\"alloc_vram_efficiency_score\", \"job_hours\"],\n",
- " figsize=(10, 12),\n",
- ")"
- ]
- }
- ],
- "metadata": {
- "language_info": {
- "name": "python"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "0",
+ "metadata": {},
+ "source": [
+ "# [Efficiency Analysis](#toc0_)\n",
+ "This notebook demonstrates the use of `EfficiencyAnalysis` class in `src/analysis/efficiency_analysis.py` for analyzing the efficiency of jobs, users, and PI groups."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1",
+ "metadata": {},
+ "source": [
+ "**Table of contents** \n",
+ "- [Efficiency Analysis](#toc1_) \n",
+ " - [Setup](#toc1_1_) \n",
+ " - [Example: Analyze workload efficiency of GPU users who set no VRAM constraints and used 0 GB of VRAM](#toc1_2_) \n",
+ " - [Job Efficiency Metrics](#toc1_2_1_) \n",
+ " - [Find most inefficient jobs with no VRAM constraints based on `vram_hours`](#toc1_2_1_1_) \n",
+ " - [User Efficiency Metrics](#toc1_2_2_) \n",
+ " - [Find Inefficient Users based on `expected_value_alloc_vram_efficiency`](#toc1_2_2_1_) \n",
+ " - [Find Inefficient Users based on `vram_hours`](#toc1_2_2_2_) \n",
+ " - [PI Group Efficiency Metrics](#toc1_2_3_) \n",
+ " - [Find Inefficient PIs based on `vram_hours`](#toc1_2_3_1_) \n",
+ " - [Example: Analyze all jobs with no VRAM constraints](#toc1_3_) \n",
+ " - [Job Efficiency Metrics](#toc1_3_1_) \n",
+ " - [Problem with duplicate JobIDs](#toc1_3_1_1_) \n",
+ " - [Top users with most number of jobs that have no VRAM constraints](#toc1_3_1_2_) \n",
+ " - [Find inefficient jobs with no VRAM Constraints based on `alloc_vram_efficiency_score`](#toc1_3_1_3_) \n",
+ "\n",
+ "\n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2",
+ "metadata": {},
+ "source": [
+ "## [Setup](#toc0_)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Import required modules\n",
+ "import sys\n",
+ "from pathlib import Path\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4",
+ "metadata": {},
+ "source": [
+ "Jupyter server should be run at the notebook directory, so the output of the following cell would be the project root:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "project_root = str(Path.cwd().resolve().parent)\n",
+ "print(f\"Project root: {project_root}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Add project root to sys.path for module imports\n",
+ "if project_root not in sys.path:\n",
+ " sys.path.insert(0, project_root)\n",
+ "\n",
+ "from src.analysis import efficiency_analysis as ea\n",
+ "from src.visualization import JobsWithMetricsVisualizer, UsersWithMetricsVisualizer\n",
+ "from src.utilities import load_and_preprocess_jobs\n",
+ "\n",
+ "# Automatically reload modules before executing code\n",
+ "# This is useful for development to see changes without restarting the kernel.\n",
+ "%load_ext autoreload\n",
+ "# Reload all modules imported with %aimport every time before executing the Python code typed.\n",
+ "%autoreload 2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the jobs DataFrame from DuckDB\n",
+ "preprocessed_jobs_df = load_and_preprocess_jobs(\n",
+ " db_path=\"../data/slurm_data.db\",\n",
+ " table_name=\"Jobs\",\n",
+ ")\n",
+ "display(preprocessed_jobs_df.head(10))\n",
+ "print(preprocessed_jobs_df.shape)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8",
+ "metadata": {},
+ "source": [
+ "## [Example: Analyze workload efficiency of GPU users who set no VRAM constraints and used 0 GB of VRAM](#toc0_)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "efficiency_analysis = ea.EfficiencyAnalysis(jobs_df=preprocessed_jobs_df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "10",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "filtered_jobs = efficiency_analysis.filter_jobs_for_analysis(\n",
+ " vram_constraint_filter=pd.NA, # No VRAM constraints\n",
+ " gpu_mem_usage_filter=0, # Used 0 GB of VRAM\n",
+ ")\n",
+ "filtered_jobs"
+ ]
+ },
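+ {
+ "cell_type": "markdown",
+ "id": "10b",
+ "metadata": {},
+ "source": [
+ "A note on the filter arguments, judging by how they are used in this notebook: pass `pd.NA` to match jobs where the value is missing, a plain scalar for an exact match, or a `{\"min\"/\"max\", \"inclusive\"}` dict for a range filter."
+ ]
+ },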
+ {
+ "cell_type": "markdown",
+ "id": "11",
+ "metadata": {},
+ "source": [
+ "Generate all metrics:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "12",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "metrics_dict = efficiency_analysis.calculate_all_efficiency_metrics(filtered_jobs)\n",
+ "\n",
+ "jobs_with_metrics = metrics_dict[\"jobs_with_efficiency_metrics\"]\n",
+ "users_with_metrics = metrics_dict[\"users_with_efficiency_metrics\"]\n",
+ "pi_accounts_with_metrics = metrics_dict[\"pi_accounts_with_efficiency_metrics\"]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "13",
+ "metadata": {},
+ "source": [
+ "### [Job Efficiency Metrics](#toc0_)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "14",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Set option to display all columns\n",
+ "pd.set_option(\"display.max_columns\", None)\n",
+ "# Display the DataFrame\n",
+ "display(jobs_with_metrics.head(10))\n",
+ "# To revert to default settings (optional)\n",
+ "pd.reset_option(\"display.max_columns\")\n",
+ "\n",
+ "print(f\"Jobs found: {len(jobs_with_metrics)}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "15",
+ "metadata": {},
+ "source": [
+ "#### [Find most inefficient jobs with no VRAM constraints based on `vram_hours`](#toc0_)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "16",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "inefficient_jobs_vram_hours = efficiency_analysis.sort_and_filter_records_with_metrics(\n",
+ " metrics_df_name_enum=ea.MetricsDataFrameNameEnum.JOBS,\n",
+ " sorting_key=\"vram_hours\",\n",
+ " ascending=False, # Sort by vram_hours in descending order\n",
+ " filter_criteria={\n",
+ " \"vram_hours\": {\"min\": 80 * 24, \"inclusive\": True}, # VRAM-hours threshold for identifying inefficient jobs\n",
+ " },\n",
+ ")\n",
+ "# Display top inefficient users by VRAM-hours\n",
+ "print(\"\\nTop inefficient Jobs by VRAM-hours:\")\n",
+ "display(inefficient_jobs_vram_hours.head(10))\n",
+ "\n",
+ "# Plot top inefficient jobs by VRAM-hours, with VRAM-hours as labels\n",
+ "jobs_with_metrics_visualizer = JobsWithMetricsVisualizer(inefficient_jobs_vram_hours.head(20))\n",
+ "jobs_with_metrics_visualizer.visualize(\n",
+ " column=\"vram_hours\",\n",
+ " bar_label_columns=[\"vram_hours\", \"job_hours\"],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "17",
+ "metadata": {},
+ "source": [
+ "### [User Efficiency Metrics](#toc0_)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "18",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "users_with_metrics"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "19",
+ "metadata": {},
+ "source": [
+ "#### [Find Inefficient Users based on `expected_value_alloc_vram_efficiency`](#toc0_)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "20",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "inefficient_users_alloc_vram_eff = efficiency_analysis.sort_and_filter_records_with_metrics(\n",
+ " metrics_df_name_enum=ea.MetricsDataFrameNameEnum.USERS,\n",
+ " sorting_key=\"expected_value_alloc_vram_efficiency\",\n",
+ " ascending=True, # we want to find users with low efficiency\n",
+ " filter_criteria={\n",
+ " \"expected_value_alloc_vram_efficiency\": {\"max\": 0.3, \"inclusive\": True},\n",
+ " \"job_count\": {\"min\": 5, \"inclusive\": True}, # Minimum number of jobs to consider a user\n",
+ " },\n",
+ ")\n",
+ "print(\"\\nTop inefficient users by allocated vram efficiency:\")\n",
+ "display(inefficient_users_alloc_vram_eff.head(20))\n",
+ "\n",
+ "# Plot top inefficient users by allocated vram efficiency, with allocated vram efficiency as labels\n",
+ "users_with_metrics_visualizer = UsersWithMetricsVisualizer(inefficient_users_alloc_vram_eff.head(20))\n",
+ "users_with_metrics_visualizer.visualize(\n",
+ " column=\"expected_value_alloc_vram_efficiency\",\n",
+ " bar_label_columns=[\"expected_value_alloc_vram_efficiency\", \"user_job_hours\"],\n",
+ " figsize=(8, 10),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "21",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "inefficient_users = efficiency_analysis.sort_and_filter_records_with_metrics(\n",
+ " metrics_df_name_enum=ea.MetricsDataFrameNameEnum.USERS,\n",
+ " sorting_key=\"expected_value_alloc_vram_efficiency\",\n",
+ " ascending=True, # we want to find users with low efficiency\n",
+ " filter_criteria={\n",
+ " \"expected_value_alloc_vram_efficiency\": {\"max\": 0.3, \"inclusive\": True},\n",
+ " \"job_count\": {\"min\": 5, \"inclusive\": True}, # Minimum number of jobs to consider a user\n",
+ " },\n",
+ ")\n",
+ "\n",
+ "# Display top inefficient users by job count\n",
+ "print(\"\\nTop inefficient users by allocated vram efficiency:\")\n",
+ "display(inefficient_users.head(10))\n",
+ "\n",
+ "\n",
+ "# Plot top inefficient users by GPU hours, with efficiency as labels\n",
+ "top_users = inefficient_users.head(10)\n",
+ "\n",
+ "plt.figure(figsize=(8, 5))\n",
+ "barplot = sns.barplot(y=top_users[\"User\"], x=top_users[\"user_job_hours\"], orient=\"h\")\n",
+ "plt.xlabel(\"Job Hours\")\n",
+ "plt.ylabel(\"User\")\n",
+ "plt.title(\"Top 10 Inefficient Users by Allocated VRAM Efficiency Contribution\")\n",
+ "\n",
+ "# Annotate bars with expected_value_alloc_vram_efficiency, keeping text fully inside the plot's right spine\n",
+ "ax = barplot\n",
+ "xmax = top_users[\"user_job_hours\"].max()\n",
+ "# Add headroom for annotation space (20% extra)\n",
+ "xlim = xmax * 1.20 if xmax > 0 else 1\n",
+ "ax.set_xlim(0, xlim)\n",
+ "\n",
+ "# Calculate annotation x-position: place at 98% of xlim or just left of the right spine, whichever is smaller\n",
+ "for i, (job_hours, efficiency) in enumerate(\n",
+ " zip(\n",
+ " top_users[\"user_job_hours\"],\n",
+ " top_users[\"expected_value_alloc_vram_efficiency\"],\n",
+ " strict=True,\n",
+ " )\n",
+ "):\n",
+ " # Place annotation at min(job_hours + 2% of xlim, 98% of xlim)\n",
+ " xpos = min(job_hours + xlim * 0.02, xlim * 0.98)\n",
+ " # If bar is very close to right spine, nudge annotation left to avoid overlap\n",
+ " if xpos > xlim * 0.96:\n",
+ " xpos = xlim * 0.96\n",
+ " ax.text(xpos, i, f\"Eff: {efficiency:.2f}\", va=\"center\", ha=\"left\", fontsize=10, color=\"black\", clip_on=True)\n",
+ "\n",
+ "plt.tight_layout()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "22",
+ "metadata": {},
+ "source": [
+ "#### [Find Inefficient Users based on `vram_hours`](#toc0_)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "23",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "inefficient_users_vram_hours = efficiency_analysis.find_inefficient_users_by_vram_hours(\n",
+ " vram_hours_filter={\"min\": 200, \"inclusive\": True}, # VRAM-hours threshold for identifying inefficient users\n",
+ " min_jobs=5, # Minimum number of jobs to consider a user\n",
+ ")\n",
+ "# Display top inefficient users by VRAM-hours\n",
+ "print(\"\\nTop inefficient users by VRAM-hours:\")\n",
+ "display(inefficient_users_vram_hours.head(20))\n",
+ "\n",
+ "\n",
+ "# Plot top inefficient users by VRAM-hours, with VRAM-hours as labels\n",
+ "users_with_metrics_visualizer = UsersWithMetricsVisualizer(inefficient_users_vram_hours.head(20))\n",
+ "users_with_metrics_visualizer.visualize(\n",
+ " column=\"vram_hours\", bar_label_columns=[\"vram_hours\", \"user_job_hours\"], figsize=(8, 10)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "24",
+ "metadata": {},
+ "source": [
+ "### [PI Group Efficiency Metrics](#toc0_)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "25",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pi_accounts_with_metrics"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "26",
+ "metadata": {},
+ "source": [
+ "#### [Find Inefficient PIs based on `vram_hours`](#toc0_)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "27",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "inefficient_pis_vram_hours = efficiency_analysis.sort_and_filter_records_with_metrics(\n",
+ " metrics_df_name_enum=ea.MetricsDataFrameNameEnum.PI_GROUPS,\n",
+ " sorting_key=\"pi_acc_vram_hours\",\n",
+ " ascending=False,\n",
+ " filter_criteria={\n",
+ " \"pi_acc_vram_hours\": {\"min\": 200, \"inclusive\": True}, # VRAM-hours threshold for identifying inefficient users\n",
+ " \"job_count\": {\"min\": 5, \"inclusive\": True}, # Minimum number of jobs to consider a PI account\n",
+ " },\n",
+ ")\n",
+ "# Display top inefficient users by VRAM-hours\n",
+ "print(\"\\nTop inefficient PI Groups by VRAM-hours:\")\n",
+ "display(inefficient_pis_vram_hours.head(20))\n",
+ "\n",
+ "top_pi_accounts = inefficient_pis_vram_hours.head(20)\n",
+ "\n",
+ "# Plot top inefficient users by VRAM-hours, with VRAM-hours as labels\n",
+ "plt.figure(figsize=(8, 8))\n",
+ "barplot = sns.barplot(\n",
+ " y=top_pi_accounts[\"pi_account\"],\n",
+ " x=top_pi_accounts[\"pi_acc_vram_hours\"],\n",
+ " order=top_pi_accounts[\"pi_account\"].tolist(), # Only show present values\n",
+ " orient=\"h\",\n",
+ ")\n",
+ "plt.xlabel(\"VRAM-Hours\")\n",
+ "plt.ylabel(\"PI Account\")\n",
+ "plt.title(\"Top Inefficient PI Accounts by VRAM-Hours\")\n",
+ "# Annotate bars with gpu_hours, keeping text fully inside the plot's right spine\n",
+ "ax = barplot\n",
+ "xmax = top_pi_accounts[\"pi_acc_vram_hours\"].max()\n",
+ "# Add headroom for annotation space (20% extra)\n",
+ "xlim = xmax * 1.6 if xmax > 0 else 1\n",
+ "ax.set_xlim(0, xlim)\n",
+ "# Calculate annotation x-position: place at 98% of xlim or just left of the right spine, whichever is smaller\n",
+ "for i, (vram_hours, pi_acc_job_hours) in enumerate(\n",
+ " zip(\n",
+ " top_pi_accounts[\"pi_acc_vram_hours\"],\n",
+ " top_pi_accounts[\"pi_acc_job_hours\"],\n",
+ " strict=True,\n",
+ " )\n",
+ "):\n",
+ " # Place annotation at min(vram_hours + 2% of xlim, 98% of xlim)\n",
+ " xpos = min(vram_hours + xlim * 0.02, xlim * 0.98)\n",
+ " ax.text(\n",
+ " xpos,\n",
+ " i,\n",
+ " f\"VRAM-Hours: {vram_hours:.2f}\\n Job Hours: {pi_acc_job_hours:.2f}\",\n",
+ " va=\"center\",\n",
+ " ha=\"left\",\n",
+ " fontsize=10,\n",
+ " color=\"black\",\n",
+ " clip_on=True,\n",
+ " )\n",
+ "plt.tight_layout()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "28",
+ "metadata": {},
+ "source": [
+ "## [Example: Analyze all jobs with no VRAM constraints](#toc0_)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "29",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Filter jobs where no VRAM constraint was set but a GPU was allocated\n",
+ "no_vram_constraint_efficiency_analysis = ea.EfficiencyAnalysis(jobs_df=preprocessed_jobs_df)\n",
+ "all_no_vram_constraint_jobs = no_vram_constraint_efficiency_analysis.filter_jobs_for_analysis(\n",
+ " vram_constraint_filter={\"min\": 0, \"inclusive\": False}, # No VRAM constraints\n",
+ " gpu_count_filter={\"min\": 1, \"inclusive\": True}, # At least one GPU allocated\n",
+ " gpu_mem_usage_filter={\"min\": 0, \"inclusive\": False}, # Used more than 0 GiB of VRAM\n",
+ ")\n",
+ "\n",
+ "display(all_no_vram_constraint_jobs.head(10))\n",
+ "print(all_no_vram_constraint_jobs.shape)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "30",
+ "metadata": {},
+ "source": [
+ "### [Job Efficiency Metrics](#toc0_)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "31",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "no_vram_constraint_jobs_with_metrics = no_vram_constraint_efficiency_analysis.calculate_job_efficiency_metrics(\n",
+ " all_no_vram_constraint_jobs\n",
+ ")\n",
+ "\n",
+ "# Set option to display all columns\n",
+ "pd.set_option(\"display.max_columns\", None)\n",
+ "# Display the DataFrame\n",
+ "display(no_vram_constraint_jobs_with_metrics.head(10))\n",
+ "# To revert to default settings (optional)\n",
+ "pd.reset_option(\"display.max_columns\")\n",
+ "print(f\"Jobs found: {len(no_vram_constraint_jobs_with_metrics)}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "32",
+ "metadata": {},
+ "source": [
+ "#### [Problem with duplicate JobIDs](#toc0_)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "33",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# select jobs with specific job id\n",
+ "pd.set_option(\"display.max_columns\", None)\n",
+ "# Display the DataFrame\n",
+ "display(no_vram_constraint_jobs_with_metrics[no_vram_constraint_jobs_with_metrics[\"JobID\"] == 24374463])\n",
+ "pd.reset_option(\"display.max_columns\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "34",
+ "metadata": {},
+ "source": [
+ "#### [Top users with most number of jobs that have no VRAM constraints](#toc0_)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "35",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Plot top users by number of jobs with no VRAM constraints\n",
+ "if not all_no_vram_constraint_jobs.empty:\n",
+ " plt.figure(figsize=(10, 5))\n",
+ " user_counts = all_no_vram_constraint_jobs[\"User\"].value_counts().head(20)\n",
+ " sns.barplot(x=user_counts.values, y=user_counts.index, orient=\"h\")\n",
+ " plt.xlabel(\"Number of Jobs\")\n",
+ " plt.ylabel(\"User\")\n",
+ " plt.title(\"Top 20 Users: Jobs with no VRAM Constraints\")\n",
+ " plt.tight_layout()\n",
+ " plt.show()\n",
+ "else:\n",
+ " print(\"No jobs found without VRAM constraints.\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "36",
+ "metadata": {},
+ "source": [
+ "#### [Find inefficient jobs with no VRAM Constraints based on `alloc_vram_efficiency_score`](#toc0_)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "37",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "low_alloc_vram_score_jobs = no_vram_constraint_efficiency_analysis.sort_and_filter_records_with_metrics(\n",
+ " metrics_df_name_enum=ea.MetricsDataFrameNameEnum.JOBS,\n",
+ " sorting_key=\"alloc_vram_efficiency_score\",\n",
+ " ascending=True, # Sort by alloc_vram_efficiency_score in ascending order\n",
+ " filter_criteria={\n",
+ " \"alloc_vram_efficiency_score\": {\"max\": -10, \"inclusive\": True}, # score threshold\n",
+ " },\n",
+ ")\n",
+ "# Display top inefficient users by alloc_vram_efficiency_score\n",
+ "print(\"\\nTop inefficient Jobs by allocated VRAM efficiency score:\")\n",
+ "\n",
+ "display(low_alloc_vram_score_jobs.head(20))\n",
+ "\n",
+ "jobs_with_metrics_visualizer = JobsWithMetricsVisualizer(low_alloc_vram_score_jobs.head(20))\n",
+ "jobs_with_metrics_visualizer.visualize(\n",
+ " column=\"alloc_vram_efficiency_score\",\n",
+ " bar_label_columns=[\"alloc_vram_efficiency_score\", \"job_hours\"],\n",
+ " figsize=(10, 12),\n",
+ ")"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
}
diff --git a/notebooks/Resource Hoarding.ipynb b/notebooks/Resource Hoarding.ipynb
new file mode 100644
index 0000000..4fcd882
--- /dev/null
+++ b/notebooks/Resource Hoarding.ipynb
@@ -0,0 +1,367 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "0",
+ "metadata": {},
+ "source": [
+ "# [Resource Hoarding Analysis](#toc0_)\n",
+ "This notebook demonstrates the use of `ResourceHoarding` class in `src/analysis/hoarding.py` for analyzing the jobs and users that hoard resources by requesting a disproportionate amount of CPU Memory and Cores."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1",
+ "metadata": {},
+ "source": [
+ "**Table of contents** \n",
+ "- [Resource Hoarding Analysis](#toc1_) \n",
+ " - [Setup](#toc1_1_) \n",
+ " - [Filter jobs for resource hoarding analysis](#toc1_1_1_) \n",
+ " - [Analyze Jobs Hoarding Resources:](#toc1_2_) \n",
+ " - [Generate all hoarding analysis metrics for jobs:](#toc1_2_1_1_) \n",
+ " - [Find most inefficient jobs hoarding node RAM based on `ram_hoarding_fraction_diff`](#toc1_2_1_2_) \n",
+ " - [Find most inefficient jobs hoarding CPU cores based on `core_hoarding_fraction_diff`](#toc1_2_1_3_) \n",
+ " - [Analyze Users Hoarding Resources:](#toc1_3_) \n",
+ " - [Generate all hoarding analysis metrics for users:](#toc1_3_1_1_) \n",
+ " - [Find most inefficient users hoarding node RAM based on `expected_value_ram_hoarding_fraction_diff`](#toc1_3_1_2_) \n",
+ " - [Find most inefficient users hoarding CPU cores based on `expected_value_core_hoarding_fraction_diff`](#toc1_3_1_3_) \n",
+ "\n",
+ "\n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2",
+ "metadata": {},
+ "source": [
+ "## [Setup](#toc0_)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Import required modules\n",
+ "import sys\n",
+ "from pathlib import Path\n",
+ "import pandas as pd\n",
+ "\n",
+ "# import matplotlib.pyplot as plt\n",
+ "# import seaborn as sns\n",
+ "import os"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4",
+ "metadata": {},
+ "source": [
+ "Jupyter server should be run at the notebook directory, so the output of the following cell would be the project root:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "project_root = str(Path.cwd().resolve().parent)\n",
+ "print(f\"Project root: {project_root}\")\n",
+ "os.environ[\"OUTPUT_MODE\"] = \"\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Automatically reload modules before executing code (set this up BEFORE imports)\n",
+ "%load_ext autoreload\n",
+ "%autoreload 2\n",
+ "\n",
+ "# Add project root to sys.path for module imports\n",
+ "if project_root not in sys.path:\n",
+ " sys.path.insert(0, project_root)\n",
+ "\n",
+ "from src.analysis import ResourceHoarding as ResourceHoarding\n",
+ "from src.analysis import efficiency_analysis as ea\n",
+ "from src.visualization import JobsWithMetricsVisualizer, UsersWithMetricsVisualizer\n",
+ "from src.config.enum_constants import ResourceHoardingDataFrameNameEnum"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the jobs DataFrame from DuckDB\n",
+ "preprocessed_jobs_df = ea.load_preprocessed_jobs_dataframe_from_duckdb(\n",
+ " db_path=\"../data/slurm_data.db\",\n",
+ " table_name=\"Jobs\",\n",
+ ")\n",
+ "display(preprocessed_jobs_df.head(10))\n",
+ "print(preprocessed_jobs_df.shape)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8",
+ "metadata": {},
+ "source": [
+ "### [Filter jobs for resource hoarding analysis](#toc0_)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "hoarding_analysis = ResourceHoarding(jobs_df=preprocessed_jobs_df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "10",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "filtered_jobs = hoarding_analysis.filter_jobs_for_analysis()\n",
+ "filtered_jobs"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "11",
+ "metadata": {},
+ "source": [
+ "## [Analyze Jobs Hoarding Resources:](#toc0_)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "12",
+ "metadata": {},
+ "source": [
+ "#### [Generate all hoarding analysis metrics for jobs:](#toc0_)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "13",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "memory_hoarding_jobs = hoarding_analysis.calculate_node_resource_hoarding_for_jobs(filtered_jobs)\n",
+ "\n",
+ "# Set option to display all columns\n",
+ "pd.set_option(\"display.max_columns\", None)\n",
+ "# Display the DataFrame\n",
+ "display(memory_hoarding_jobs.head(10))\n",
+ "# To revert to default settings (optional)\n",
+ "pd.reset_option(\"display.max_columns\")\n",
+ "\n",
+ "print(f\"Jobs found: {len(memory_hoarding_jobs)}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "14",
+ "metadata": {},
+ "source": [
+ "#### [Find most inefficient jobs hoarding node RAM based on `ram_hoarding_fraction_diff`](#toc0_)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "15",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "inefficient_jobs_hoarding_ram = hoarding_analysis.sort_and_filter_records_with_metrics(\n",
+ " metrics_df_name_enum=ResourceHoardingDataFrameNameEnum.JOBS_WITH_RESOURCE_HOARDING_METRICS,\n",
+ " sorting_key=\"ram_hoarding_fraction_diff\",\n",
+ " ascending=False, # Sort in descending order\n",
+ " filter_criteria={\"ram_hoarding_fraction_diff\": {\"min\": 0, \"inclusive\": True}},\n",
+ ")\n",
+ "# Display top inefficient users by RAM hoarding fraction\n",
+ "print(\"\\nTop inefficient Jobs by RAM hoarding fraction:\")\n",
+ "display(inefficient_jobs_hoarding_ram.head(10))\n",
+ "\n",
+ "# Plot top inefficient jobs by RAM hoarding fraction, with RAM hoarding fraction as labels\n",
+ "jobs_with_metrics_visualizer = JobsWithMetricsVisualizer(inefficient_jobs_hoarding_ram.head(20))\n",
+ "jobs_with_metrics_visualizer.visualize(\n",
+ " column=\"ram_hoarding_fraction_diff\",\n",
+ " bar_label_columns=[\"ram_hoarding_fraction_diff\", \"cpu_mem_efficiency\", \"alloc_vram_efficiency\"],\n",
+ " figsize=(12, 12),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "16",
+ "metadata": {},
+ "source": [
+ "#### [Find most inefficient jobs hoarding CPU cores based on `core_hoarding_fraction_diff`](#toc0_)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "17",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "inefficient_jobs_hoarding_cpu_cores = hoarding_analysis.sort_and_filter_records_with_metrics(\n",
+ " metrics_df_name_enum=ResourceHoardingDataFrameNameEnum.JOBS_WITH_RESOURCE_HOARDING_METRICS,\n",
+ " sorting_key=\"core_hoarding_fraction_diff\",\n",
+ " ascending=False, # Sort in descending order\n",
+ " filter_criteria={\"core_hoarding_fraction_diff\": {\"min\": 0, \"inclusive\": True}},\n",
+ ")\n",
+ "# Display top inefficient users by CPU core hoarding fraction\n",
+ "print(\"\\nTop inefficient Jobs by CPU core hoarding fraction:\")\n",
+ "display(inefficient_jobs_hoarding_cpu_cores.head(10))\n",
+ "\n",
+ "# Plot top inefficient jobs by CPU core hoarding fraction, with CPU core hoarding fraction as labels\n",
+ "jobs_with_metrics_visualizer = JobsWithMetricsVisualizer(inefficient_jobs_hoarding_cpu_cores.head(20))\n",
+ "jobs_with_metrics_visualizer.visualize(\n",
+ " column=\"core_hoarding_fraction_diff\",\n",
+ " bar_label_columns=[\"core_hoarding_fraction_diff\", \"ram_hoarding_fraction_diff\", \"alloc_vram_efficiency\"],\n",
+ " figsize=(12, 12),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "18",
+ "metadata": {},
+ "source": [
+ "## [Analyze Users Hoarding Resources:](#toc0_)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "19",
+ "metadata": {},
+ "source": [
+ "#### [Generate all hoarding analysis metrics for users:](#toc0_)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "20",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "memory_hoarding_users = hoarding_analysis.calculate_node_resource_hoarding_for_users(filtered_jobs)\n",
+ "display(memory_hoarding_users)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "21",
+ "metadata": {},
+ "source": [
+ "#### [Find most inefficient users hoarding node RAM based on `expected_value_ram_hoarding_fraction_diff`](#toc0_)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "22",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "inefficient_users_hoarding_ram = hoarding_analysis.sort_and_filter_records_with_metrics(\n",
+ " metrics_df_name_enum=ResourceHoardingDataFrameNameEnum.USERS_WITH_RESOURCE_HOARDING_METRICS,\n",
+ " sorting_key=\"expected_value_ram_hoarding_fraction_diff\",\n",
+ " ascending=False, # Sort in descending order\n",
+ " filter_criteria={\"expected_value_ram_hoarding_fraction_diff\": {\"min\": 0, \"inclusive\": True}},\n",
+ ")\n",
+ "# Display top inefficient users by RAM hoarding fraction\n",
+ "\n",
+ "print(\"\\nTop inefficient Users by RAM hoarding fraction:\")\n",
+ "display(inefficient_users_hoarding_ram.head(10))\n",
+ "\n",
+ "# Plot top inefficient users by RAM hoarding fraction, with RAM hoarding fraction as labels\n",
+ "users_with_metrics_visualizer = UsersWithMetricsVisualizer(inefficient_users_hoarding_ram.head(20))\n",
+ "users_with_metrics_visualizer.visualize(\n",
+ " column=\"expected_value_ram_hoarding_fraction_diff\",\n",
+ " bar_label_columns=[\n",
+ " \"expected_value_ram_hoarding_fraction_diff\",\n",
+ " \"expected_value_core_hoarding_fraction_diff\",\n",
+ " \"expected_value_alloc_vram_efficiency\",\n",
+ " ],\n",
+ " figsize=(14, 12),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "23",
+ "metadata": {},
+ "source": [
+ "#### [Find most inefficient users hoarding CPU cores based on `expected_value_core_hoarding_fraction_diff`](#toc0_)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "24",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "inefficient_users_hoarding_cpu_cores = hoarding_analysis.sort_and_filter_records_with_metrics(\n",
+ " metrics_df_name_enum=ResourceHoardingDataFrameNameEnum.USERS_WITH_RESOURCE_HOARDING_METRICS,\n",
+ " sorting_key=\"expected_value_core_hoarding_fraction_diff\",\n",
+ " ascending=False, # Sort in descending order\n",
+ " filter_criteria={\"expected_value_core_hoarding_fraction_diff\": {\"min\": 0, \"inclusive\": True}},\n",
+ ")\n",
+ "# Display top inefficient users by CPU core hoarding fraction\n",
+ "\n",
+ "print(\"\\nTop inefficient Users by CPU core hoarding fraction:\")\n",
+ "display(inefficient_users_hoarding_cpu_cores.head(10))\n",
+ "\n",
+ "# Plot top inefficient users by CPU core hoarding fraction, with CPU core hoarding fraction as labels\n",
+ "users_with_metrics_visualizer = UsersWithMetricsVisualizer(inefficient_users_hoarding_cpu_cores.head(20))\n",
+ "users_with_metrics_visualizer.visualize(\n",
+ " column=\"expected_value_core_hoarding_fraction_diff\",\n",
+ " bar_label_columns=[\n",
+ " \"expected_value_core_hoarding_fraction_diff\",\n",
+ " \"expected_value_ram_hoarding_fraction_diff\",\n",
+ " \"expected_value_alloc_vram_efficiency\",\n",
+ " ],\n",
+ " figsize=(14, 12),\n",
+ ")"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/src/analysis/__init__.py b/src/analysis/__init__.py
index 403d021..3e2ecee 100644
--- a/src/analysis/__init__.py
+++ b/src/analysis/__init__.py
@@ -1 +1,5 @@
-from .efficiency_analysis import EfficiencyAnalysis as EfficiencyAnalysis
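+# The "name as name" re-exports mark these as the package's public API for type
+# checkers (e.g. mypy's no-implicit-reexport mode).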
+from .efficiency_analysis import (
+ EfficiencyAnalysis as EfficiencyAnalysis,
+ load_preprocessed_jobs_dataframe_from_duckdb as load_preprocessed_jobs_dataframe_from_duckdb,
+)
+from .hoarding import ResourceHoarding as ResourceHoarding
diff --git a/src/analysis/efficiency_analysis.py b/src/analysis/efficiency_analysis.py
index 879fde6..bd0f134 100644
--- a/src/analysis/efficiency_analysis.py
+++ b/src/analysis/efficiency_analysis.py
@@ -4,15 +4,80 @@
The aim is to identify potential inefficiencies in GPU usage and notify users or PIs about these issues.
"""
-from typing import cast
-
-import numpy as np
+from typing import Generic, TypeVar, Annotated, cast
import pandas as pd
+import numpy as np
+from pathlib import Path
from src.config.constants import DEFAULT_MIN_ELAPSED_SECONDS
-from src.config.enum_constants import FilterTypeEnum, MetricsDataFrameNameEnum
+from src.config.enum_constants import FilterTypeEnum, MetricsDataFrameNameBase, MetricsDataFrameNameEnum
+from pydantic import validate_call, AfterValidator, SkipValidation
+from src.database import DatabaseConnection
+from src.preprocess.preprocess import preprocess_data
+
+
+def load_preprocessed_jobs_dataframe_from_duckdb(
+ db_path: str | Path,
+ table_name: str = "Jobs",
+ sample_size: int | None = None,
+ random_state: pd._typing.RandomState | None = None,
+) -> pd.DataFrame:
+ """
+ Load jobs DataFrame from a DuckDB database and preprocess it.
+
+ Args:
+ db_path (str or Path): Path to the DuckDB database.
+ table_name (str, optional): Table name to query. Defaults to 'Jobs'.
+ sample_size (int, optional): Number of rows to sample from the DataFrame. Defaults to None (no sampling).
+ random_state (pd._typing.RandomState, optional): Random state for reproducibility. Defaults to None.
+
+ Returns:
+ pd.DataFrame: The preprocessed jobs DataFrame, optionally down-sampled to sample_size rows.
+
+ Raises:
+ RuntimeError: If the jobs DataFrame cannot be loaded from the database.
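+
+ Example (illustrative; the path and sample size are assumptions):
+ jobs = load_preprocessed_jobs_dataframe_from_duckdb("data/slurm_data.db", sample_size=1000)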
+ """
+ if isinstance(db_path, Path):
+ db_path = db_path.resolve()
+ try:
+ db = DatabaseConnection(str(db_path))
+
+ jobs_df = db.fetch_all_jobs(table_name=table_name)
+ processed_data = preprocess_data(
+ jobs_df, min_elapsed_seconds=0, include_failed_cancelled_jobs=False, include_cpu_only_jobs=False
+ )
+ if sample_size is not None:
+ processed_data = processed_data.sample(n=sample_size, random_state=random_state)
+ return processed_data
+ except Exception as e:
+ raise RuntimeError(f"Failed to load jobs DataFrame: {e}") from e
+
+
+# Generic type for metrics enums constrained to our abstract base Enum class
+MetricsDFNameEnumT = TypeVar("MetricsDFNameEnumT", bound=MetricsDataFrameNameBase)
+
+def _ensure_concrete_metrics_enum(
+ cls: type[MetricsDFNameEnumT],
+) -> type[MetricsDFNameEnumT]:
+ """Validate that the provided class is a concrete subclass of MetricsDataFrameNameBase.
-class EfficiencyAnalysis:
+ Used by Pydantic to validate the enum argument to the constructor.
+
+ Raises:
+ TypeError: If the type is not a subclass of the base, or is the abstract base itself.
+
+ Returns:
+ type[MetricsDFNameEnumT]: The validated enum class.
+ """
+ # Ensure it's a subclass of our abstract base (defensive; helps type checkers and runtime safety)
+ if not isinstance(cls, type) or not issubclass(cls, MetricsDataFrameNameBase):
+ raise TypeError("metrics_df_name_enum must be a subclass of MetricsDataFrameNameBase")
+ if cls is MetricsDataFrameNameBase:
+ raise TypeError("metrics_df_name_enum must be a concrete Enum subclass, not the abstract base")
+ return cls
+
+
+class EfficiencyAnalysis(Generic[MetricsDFNameEnumT]):
"""
Class to encapsulate the efficiency analysis of jobs based on various metrics.
@@ -21,15 +86,20 @@ class EfficiencyAnalysis:
The metrics are generated in separate DataFrames for each category in MetricsDataFrameNameEnum.
"""
+ # Apply Pydantic runtime validation for constructor arguments
+ @validate_call(config={"arbitrary_types_allowed": True})
def __init__(
self,
- jobs_df: pd.DataFrame,
+ jobs_df: Annotated[pd.DataFrame, SkipValidation()],
+ metrics_df_name_enum: Annotated[type[MetricsDFNameEnumT], AfterValidator(_ensure_concrete_metrics_enum)],
) -> None:
"""
Initialize the EfficiencyAnalysis class.
Args:
jobs_df (pd.DataFrame): DataFrame containing job data.
+ metrics_df_name_enum (type[MetricsDFNameEnumT]): Enum class whose members'
+ .value names map to attributes on this instance.
Raises:
ValueError: If the jobs DataFrame is empty.
@@ -37,9 +107,10 @@ def __init__(
if jobs_df.empty:
raise ValueError("The jobs DataFrame is empty. Please provide a valid DataFrame with job data.")
self.jobs_df = jobs_df
+ self.metrics_df_name_enum: type[MetricsDFNameEnumT] = metrics_df_name_enum
# Initialize efficiency metric class attributes to None
- for var in MetricsDataFrameNameEnum:
- setattr(self, var.value, None)
+ for name in self.metrics_df_name_enum:
+ setattr(self, name.value, None)
self.analysis_results: dict | None = None
@staticmethod
@@ -56,6 +127,20 @@ def is_numeric_type(val: object) -> bool:
"""
return pd.api.types.is_integer_dtype(type(val)) or pd.api.types.is_float_dtype(type(val))
+ @staticmethod
+ def avg_non_inf(x: pd.Series) -> float | pd.api.typing.NAType:
+ """
+ Helper function to calculate the average of a Series, ignoring -np.inf values.
+
+ Args:
+ x (pd.Series): Series to calculate the average from.
+
+ Returns:
+ float: Average of the Series, ignoring -np.inf values. Returns pd.NA if no valid values.
+ """
+ valid = x[x != -np.inf]
+ return valid.mean() if not valid.empty else pd.NA
+
@staticmethod
def apply_numeric_filter(
col: pd.Series,
@@ -276,13 +361,16 @@ def calculate_job_efficiency_metrics(
filtered_jobs.loc[:, "vram_constraint_efficiency_score"] = score
# Add CPU memory metrics if available
- if "CPUMemUsage" in self.jobs_df.columns and "Memory" in self.jobs_df.columns:
+ if {"CPUMemUsage", "Memory", "CPUs"}.issubset(self.jobs_df.columns):
filtered_jobs.loc[:, "used_cpu_mem_gib"] = filtered_jobs["CPUMemUsage"] / (2**30)
- filtered_jobs.loc[:, "allocated_cpu_mem_gib"] = filtered_jobs["Memory"] / (2**10) # Memory is in MiB
+ filtered_jobs.loc[:, "allocated_cpu_mem_gib"] = (
+ filtered_jobs["Memory"] / (2**10) * filtered_jobs["NodeList"].apply(len)
+ ) # Memory is per node, in MiB; multiplying by node count gives the job's total allocation
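+ # e.g. Memory=65536 MiB per node on 4 nodes -> 64 GiB * 4 = 256 GiB allocated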
filtered_jobs.loc[:, "cpu_mem_efficiency"] = (
filtered_jobs["used_cpu_mem_gib"] / filtered_jobs["allocated_cpu_mem_gib"]
)
- filtered_jobs = filtered_jobs.drop(columns=["CPUMemUsage", "Memory"])
+ filtered_jobs.loc[:, "cpu_core_count"] = filtered_jobs["CPUs"]
+ filtered_jobs = filtered_jobs.drop(columns=["CPUMemUsage", "Memory", "CPUs"])
self.jobs_with_efficiency_metrics = filtered_jobs
return self.jobs_with_efficiency_metrics
@@ -306,27 +394,17 @@ def calculate_user_efficiency_metrics(self) -> pd.DataFrame:
"vram_hours"
].transform("sum")
- def avg_non_inf(x: pd.Series) -> float | pd.api.typing.NAType:
- """
- Helper function to calculate the average of a Series, ignoring -np.inf values.
-
- Args:
- x (pd.Series): Series to calculate the average from.
-
- Returns:
- float: Average of the Series, ignoring -np.inf values. Returns pd.NA if no valid values.
- """
- valid = x[x != -np.inf]
- return valid.mean() if not valid.empty else pd.NA
-
users_w_efficiency_metrics = (
self.jobs_with_efficiency_metrics.groupby("User", observed=True)
.agg(
job_count=("JobID", "count"),
user_job_hours=("job_hours", "sum"),
pi_account=("Account", "first"),
- avg_alloc_vram_efficiency_score=("alloc_vram_efficiency_score", avg_non_inf),
- avg_vram_constraint_efficiency_score=("vram_constraint_efficiency_score", avg_non_inf),
+ avg_alloc_vram_efficiency_score=("alloc_vram_efficiency_score", EfficiencyAnalysis.avg_non_inf),
+ avg_vram_constraint_efficiency_score=(
+ "vram_constraint_efficiency_score",
+ EfficiencyAnalysis.avg_non_inf,
+ ),
)
.reset_index()
)
@@ -481,7 +559,10 @@ def find_inefficient_users_by_vram_hours(
inefficient_users = inefficient_users.sort_values("vram_hours", ascending=False)
return inefficient_users
- def calculate_all_efficiency_metrics(self, filtered_jobs: pd.DataFrame) -> dict:
+ def calculate_all_efficiency_metrics(
+ self,
+ filtered_jobs: pd.DataFrame,
+ ) -> dict:
"""
Calculate all efficiency metrics for jobs, users, and PI accounts.
@@ -501,7 +582,10 @@ def calculate_all_efficiency_metrics(self, filtered_jobs: pd.DataFrame) -> dict:
self.calculate_job_efficiency_metrics(filtered_jobs)
self.calculate_user_efficiency_metrics()
self.calculate_pi_account_efficiency_metrics()
- return {var.value: getattr(self, var.value) for var in MetricsDataFrameNameEnum}
+        return {var.value: getattr(self, var.value) for var in self.metrics_df_name_enum}
except (KeyError, ValueError, TypeError, AttributeError) as e:
raise RuntimeError(f"Failed to calculate all efficiency metrics: {e}") from e
@@ -669,7 +753,7 @@ def sort_and_filter_records_with_metrics(
ValueError: If the sorting key is not valid or if ascending is not a boolean value
ValueError: If the filter criteria are invalid
"""
- if not isinstance(metrics_df_name_enum, MetricsDataFrameNameEnum):
+ if not isinstance(metrics_df_name_enum, self.metrics_df_name_enum):
raise ValueError(
f"Invalid efficiency metric type: {metrics_df_name_enum}. "
f"Must be a member of MetricsDataFrameNameEnum."
@@ -707,7 +791,6 @@ def sort_and_filter_records_with_metrics(
filtered_records = getattr(self, metrics_df_name_enum.value)[mask]
# Sort by the specified key and order
-
filtered_records = filtered_records.sort_values(sorting_key, ascending=ascending)
return filtered_records
diff --git a/src/analysis/hoarding.py b/src/analysis/hoarding.py
new file mode 100644
index 0000000..51ab8c9
--- /dev/null
+++ b/src/analysis/hoarding.py
@@ -0,0 +1,249 @@
+from .efficiency_analysis import EfficiencyAnalysis
+import pandas as pd
+from src.config.remote_config import NodeInfoFetcher
+from src.warnings import NodeNotFoundWarning
+from src.config.enum_constants import (
+ RequiredHoardingAnalysisColumnsEnum,
+ NodeInfoKeyEnum,
+ ResourceHoardingDataFrameNameEnum,
+)
+import warnings
+
+
+class ResourceHoarding(EfficiencyAnalysis[ResourceHoardingDataFrameNameEnum]):
+ """Analyze resource hoarding in jobs."""
+
+ def __init__(self, jobs_df: pd.DataFrame) -> None:
+ # Pass the subclass-specific enum to the base initializer
+ super().__init__(jobs_df, metrics_df_name_enum=ResourceHoardingDataFrameNameEnum)
+
+ def _get_resource_totals_for_job(
+ self, jobs_node_list: list[str], node_info: list[dict]
+ ) -> dict[NodeInfoKeyEnum, int | pd.api.typing.NAType]:
+ """Get total resource values for a list of nodes.
+
+ Args:
+ jobs_node_list (list[str]): List of node names.
+            node_info (list[dict]): List of node information dictionaries obtained from NodeInfoFetcher.
+
+ Returns:
+ dict[NodeInfoKeyEnum, int | pd.api.typing.NAType]:
+ Dictionary containing total RAM, GPU count, and CPU core count.
+ """
+ if len(jobs_node_list) == 0:
+ return {
+ NodeInfoKeyEnum.RAM: pd.NA,
+ NodeInfoKeyEnum.GPU_COUNT: pd.NA,
+ NodeInfoKeyEnum.CORE_COUNT_PER_NODE: pd.NA,
+ }
+
+ ram_values = []
+ gpu_values = []
+ core_values = []
+ for node in jobs_node_list:
+ values = NodeInfoFetcher.get_node_info_values(
+ node,
+ node_info,
+ members={
+ NodeInfoKeyEnum.GPU_COUNT,
+ NodeInfoKeyEnum.RAM,
+ NodeInfoKeyEnum.CORE_COUNT_PER_NODE,
+ },
+ offline=True,
+ )
+ ram_values.append(values[NodeInfoKeyEnum.RAM])
+ gpu_values.append(values[NodeInfoKeyEnum.GPU_COUNT])
+ core_values.append(values[NodeInfoKeyEnum.CORE_COUNT_PER_NODE])
+
+ total_ram = pd.Series(ram_values, dtype=pd.Int64Dtype()).sum(skipna=False)
+ total_gpu = pd.Series(gpu_values, dtype=pd.Int64Dtype()).sum(skipna=False)
+ total_cores = pd.Series(core_values, dtype=pd.Int64Dtype()).sum(skipna=False)
+ return {
+ NodeInfoKeyEnum.RAM: total_ram,
+ NodeInfoKeyEnum.GPU_COUNT: total_gpu,
+ NodeInfoKeyEnum.CORE_COUNT_PER_NODE: total_cores,
+ }
+
+ def _calculate_total_resources_of_nodes_per_job(
+ self, memory_hoarding_jobs: pd.DataFrame
+ ) -> dict[NodeInfoKeyEnum, pd.Series]:
+ """Calculate total available resources for nodes assigned to each job.
+
+ Args:
+ memory_hoarding_jobs (pd.DataFrame): DataFrame containing memory hoarding jobs.
+
+        Returns:
+            dict[NodeInfoKeyEnum, pd.Series]: Mapping from resource key to a Series of
+                per-job totals (RAM, GPU count, and CPU core count).
+ """
+ node_info = NodeInfoFetcher().get_info()
+
+ missing_nodes: set[str] = set()
+ with warnings.catch_warnings(record=True) as node_warnings:
+            warnings.simplefilter("always")
+ resource_totals_per_job = memory_hoarding_jobs["NodeList"].apply(
+ lambda node_list: self._get_resource_totals_for_job(node_list, node_info)
+ )
+ total_available_ram_per_job = resource_totals_per_job.apply(lambda d: d[NodeInfoKeyEnum.RAM]).astype(
+ pd.Int32Dtype()
+ )
+ total_available_gpu_per_job = resource_totals_per_job.apply(lambda d: d[NodeInfoKeyEnum.GPU_COUNT]).astype(
+ pd.Int32Dtype()
+ )
+ total_available_cores_per_job = resource_totals_per_job.apply(
+ lambda d: d[NodeInfoKeyEnum.CORE_COUNT_PER_NODE]
+ ).astype(pd.Int32Dtype())
+
+ # Extract node names from warning messages
+ for warn in node_warnings:
+                if (
+                    "not found in the node configuration file" in str(warn.message)
+                    and issubclass(warn.category, NodeNotFoundWarning)
+                    and hasattr(warn.message, "node_name")
+                ):
+ node_name = warn.message.node_name
+ missing_nodes.add(node_name)
+ if len(missing_nodes) > 0:
+ warnings.warn(
+ f"Missing node information for nodes: {', '.join(missing_nodes)}. "
+ "This may affect the accuracy of memory hoarding analysis.",
+ UserWarning,
+ stacklevel=2,
+ )
+
+ return {
+ NodeInfoKeyEnum.RAM: total_available_ram_per_job,
+ NodeInfoKeyEnum.GPU_COUNT: total_available_gpu_per_job,
+ NodeInfoKeyEnum.CORE_COUNT_PER_NODE: total_available_cores_per_job,
+ }
+
+ def calculate_node_resource_hoarding_for_jobs(self, filtered_jobs: pd.DataFrame) -> pd.DataFrame:
+ """Detect memory hoarding in each job
+
+ Checks if the ratio of requested memory to the available memory in each node is larger than
+ the ratio of GPUs allocated to the number of GPUs available in each node. Raises warnings if
+ any nodes are not found in the configuration.
+
+ Args:
+ filtered_jobs (pd.DataFrame): DataFrame containing jobs to analyze.
+
+ Raises:
+ ValueError: If required memory metrics are missing.
+
+ Returns:
+ pd.DataFrame: DataFrame with hoarding information for each job.
+ """
+ resource_hoarding_jobs = self.calculate_job_efficiency_metrics(filtered_jobs)
+
+ # check if cpu_mem_efficiency and used_cpu_mem_gib and allocated_cpu_mem_gib are present
+ missing_columns = [
+ key.value
+ for key in RequiredHoardingAnalysisColumnsEnum.__members__.values()
+ if key.value not in resource_hoarding_jobs.columns
+ ]
+ if len(missing_columns) > 0:
+ raise ValueError(
+ f"Missing required CPU memory efficiency metrics: "
+ f"{', '.join(missing_columns)}. "
+ "CPU-related metrics are required for analysis."
+ )
+
+ total_node_resources_per_job = self._calculate_total_resources_of_nodes_per_job(resource_hoarding_jobs)
+
+ # Add memory hoarding metrics
+ resource_hoarding_jobs.loc[:, "total_ram_of_nodes_gib"] = total_node_resources_per_job[NodeInfoKeyEnum.RAM]
+ resource_hoarding_jobs.loc[:, "total_gpu_count_of_nodes"] = total_node_resources_per_job[
+ NodeInfoKeyEnum.GPU_COUNT
+ ]
+ resource_hoarding_jobs.loc[:, "gpu_count_fraction"] = (
+ resource_hoarding_jobs.loc[:, "gpu_count"] / resource_hoarding_jobs.loc[:, "total_gpu_count_of_nodes"]
+ )
+
+ resource_hoarding_jobs.loc[:, "allocated_ram_fraction"] = (
+ resource_hoarding_jobs.loc[:, "allocated_cpu_mem_gib"]
+ / resource_hoarding_jobs.loc[:, "total_ram_of_nodes_gib"]
+ )
+
+ resource_hoarding_jobs.loc[:, "ram_hoarding_fraction_diff"] = (
+ resource_hoarding_jobs.loc[:, "allocated_ram_fraction"]
+ - resource_hoarding_jobs.loc[:, "gpu_count_fraction"]
+ )
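+        # Worked example (illustrative): a job allocating 500 GiB of RAM on a node with
+        # 1000 GiB total (fraction 0.5) while holding 1 of its 8 GPUs (fraction 0.125)
+        # gets ram_hoarding_fraction_diff = 0.5 - 0.125 = 0.375.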
+
+ # Add CPU core hoarding metrics
+ resource_hoarding_jobs.loc[:, "total_cores_of_nodes"] = total_node_resources_per_job[
+ NodeInfoKeyEnum.CORE_COUNT_PER_NODE
+ ]
+ resource_hoarding_jobs.loc[:, "allocated_cores_fraction"] = (
+ resource_hoarding_jobs.loc[:, "cpu_core_count"] / resource_hoarding_jobs.loc[:, "total_cores_of_nodes"]
+ )
+
+ resource_hoarding_jobs.loc[:, "core_hoarding_fraction_diff"] = (
+ resource_hoarding_jobs.loc[:, "allocated_cores_fraction"]
+ - resource_hoarding_jobs.loc[:, "gpu_count_fraction"]
+ )
+
+ self.jobs_with_resource_hoarding_metrics = resource_hoarding_jobs
+ return self.jobs_with_resource_hoarding_metrics
+
+ def calculate_node_resource_hoarding_for_users(self, filtered_jobs: pd.DataFrame) -> pd.DataFrame:
+ """Calculate resource hoarding for users based on jobs with resource hoarding metrics.
+
+ Args:
+ filtered_jobs (pd.DataFrame): DataFrame containing jobs to analyze.
+
+ Returns:
+ pd.DataFrame: DataFrame with user-level resource hoarding metrics.
+ """
+ if self.jobs_with_resource_hoarding_metrics is None:
+ self.calculate_node_resource_hoarding_for_jobs(filtered_jobs)
+ print(
+ "Jobs DataFrame with resource hoarding metrics was not available. "
+ "Calculated it using the filtered_jobs DataFrame."
+ )
+
+ if self.users_with_efficiency_metrics is None:
+ self.calculate_user_efficiency_metrics()
+ print(
+ "Users DataFrame with efficiency metrics was not available. "
+ "Calculated it using the filtered_jobs DataFrame."
+ )
+
+ user_vram_hours_per_job = self.jobs_with_resource_hoarding_metrics.groupby("User", observed=True)[
+ "vram_hours"
+ ].transform("sum")
+
+ users_w_resource_hoarding_metrics = self.users_with_efficiency_metrics.copy()
+
+ self.jobs_with_resource_hoarding_metrics.loc[:, "weighted_ram_hoarding_fraction_diff"] = (
+ self.jobs_with_resource_hoarding_metrics["ram_hoarding_fraction_diff"]
+ * self.jobs_with_resource_hoarding_metrics["vram_hours"]
+ / user_vram_hours_per_job
+ )
+
+ users_w_resource_hoarding_metrics.loc[:, "expected_value_ram_hoarding_fraction_diff"] = (
+ self.jobs_with_resource_hoarding_metrics.groupby("User", observed=True)[
+ "weighted_ram_hoarding_fraction_diff"
+ ]
+ .apply(lambda series: series.sum() if not series.isna().all() else pd.NA)
+ .to_numpy()
+ )
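+        # Worked example (illustrative): a user with jobs (diff=0.4, vram_hours=30) and
+        # (diff=0.0, vram_hours=10) gets 0.4 * 30/40 + 0.0 * 10/40 = 0.3.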
+
+ self.jobs_with_resource_hoarding_metrics.loc[:, "weighted_core_hoarding_fraction_diff"] = (
+ self.jobs_with_resource_hoarding_metrics["core_hoarding_fraction_diff"]
+ * self.jobs_with_resource_hoarding_metrics["vram_hours"]
+ / user_vram_hours_per_job
+ )
+
+ users_w_resource_hoarding_metrics.loc[:, "expected_value_core_hoarding_fraction_diff"] = (
+ self.jobs_with_resource_hoarding_metrics.groupby("User", observed=True)[
+ "weighted_core_hoarding_fraction_diff"
+ ]
+ .apply(lambda series: series.sum() if not series.isna().all() else pd.NA)
+ .to_numpy()
+ )
+
+ self.jobs_with_resource_hoarding_metrics = self.jobs_with_resource_hoarding_metrics.drop(
+ columns=["weighted_ram_hoarding_fraction_diff", "weighted_core_hoarding_fraction_diff"]
+ )
+
+ self.users_with_resource_hoarding_metrics = users_w_resource_hoarding_metrics
+ return self.users_with_resource_hoarding_metrics
diff --git a/src/config/constants.py b/src/config/constants.py
index 317e865..65ae098 100644
--- a/src/config/constants.py
+++ b/src/config/constants.py
@@ -44,9 +44,9 @@
MULTIVALENT_GPUS = {"a100": [40, 80], "v100": [16, 32]}
# TODO (Ayush): Refactor to obtain partitions that have one GPUType from `src/config/remote_config.py` instead.
-# Mapping partitions to GPU types for specific constraints to calculate requested VRAM
+# Map each partition that has a single GPU type to its GPU type, used to derive the
+# partition constraint when calculating requested VRAM
# Update this map as new partitions are added or existing ones change
-PARTITION_TO_GPU_MAP = {
+MONO_GPU_PARTITION_GPU_TYPE = {
"superpod-a100": "a100-80g",
"umd-cscdr-gpu": "a100-80g",
"uri-gpu": "a100-80g",
diff --git a/src/config/enum_constants.py b/src/config/enum_constants.py
index de7b886..ea22d8a 100644
--- a/src/config/enum_constants.py
+++ b/src/config/enum_constants.py
@@ -2,7 +2,8 @@
Declaration of some enum class such as constants values of categorical types.
"""
-from enum import Enum, unique, auto
+from enum import Enum, EnumMeta, unique, auto
+from typing import Any
@unique
@@ -142,7 +143,80 @@ class PartitionTypeEnum(Enum):
@unique
-class MetricsDataFrameNameEnum(Enum):
+class RequiredHoardingAnalysisColumnsEnum(Enum):
+ """
+ An enumeration representing required columns for hoarding analysis.
+
+ Attributes:
+ USED_CPU_MEM_GIB: Represents the used CPU memory in GiB.
+ ALLOCATED_CPU_MEM_GIB: Represents the allocated CPU memory in GiB.
+ CPU_MEM_EFFICIENCY: Represents the CPU memory efficiency.
+ """
+
+ USED_CPU_MEM_GIB = "used_cpu_mem_gib"
+ ALLOCATED_CPU_MEM_GIB = "allocated_cpu_mem_gib"
+ CPU_MEM_EFFICIENCY = "cpu_mem_efficiency"
+ CPU_CORE_COUNT = "cpu_core_count"
+
+
+@unique
+class NodeInfoKeyEnum(Enum):
+ """
+ An enumeration representing important keys in node information configuration.
+
+    Attributes:
+        NODES: Represents the list of node names in this node group.
+        RAM: Represents the total RAM available on each node.
+        COUNT: Represents the number of nodes of this type.
+        GPU_COUNT: Represents the number of GPUs available on each node.
+        CORE_COUNT_PER_NODE: Represents the number of CPU cores available on each node.
+    """
+
+ NODES = "nodes"
+ RAM = "ram"
+ COUNT = "count"
+ GPU_COUNT = "gpu_count"
+ CORE_COUNT_PER_NODE = "cores"
+
+
+class MetricsDataFrameNameMeta(EnumMeta):
+ """Metaclass enforcing required members and their values on concrete metrics enums."""
+
+ _required_values: dict[str, str] = {
+ "JOBS": "jobs_with_efficiency_metrics",
+ "USERS": "users_with_efficiency_metrics",
+ "PI_GROUPS": "pi_accounts_with_efficiency_metrics",
+ }
+ _required_members: set[str] = set(_required_values.keys())
+
+    def __init__(cls, name: str, bases: tuple, namespace: dict, **kwargs: Any) -> None:
+ """Finalize Enum subclass creation and enforce required members and values.
+
+ Raises:
+ TypeError: If required members are missing or have unexpected values.
+ """
+ super().__init__(name, bases, namespace, **kwargs)
+ # Skip enforcement for the abstract base itself
+ if name != "MetricsDataFrameNameBase":
+ member_names: set[str] = set(cls.__members__.keys())
+ missing = cls._required_members - member_names
+ if missing:
+ raise TypeError(f"{name} must define members: {cls._required_members}; missing: {sorted(missing)}")
+ # Enforce exact expected string values for required members
+ for req_name, expected_value in cls._required_values.items():
+ actual_value = getattr(cls, req_name).value
+ if actual_value != expected_value:
+ raise TypeError(f"{name}.{req_name} must equal {expected_value!r}, got {actual_value!r}")
+
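+# Illustrative sketch (comment only): enforcement happens at class-creation time, e.g.
+#
+#     class BadMetricsEnum(MetricsDataFrameNameBase):  # hypothetical
+#         JOBS = "jobs_with_efficiency_metrics"
+#
+# raises TypeError because USERS and PI_GROUPS are missing.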
+
+class MetricsDataFrameNameBase(Enum, metaclass=MetricsDataFrameNameMeta):
+ """Base class for all metrics DataFrame name enums."""
+
+ pass
+
+
+@unique
+class MetricsDataFrameNameEnum(MetricsDataFrameNameBase):
"""
An enumeration representing the names of DataFrames containing efficiency metrics.
@@ -157,6 +231,29 @@ class MetricsDataFrameNameEnum(Enum):
PI_GROUPS = "pi_accounts_with_efficiency_metrics"
+@unique
+class ResourceHoardingDataFrameNameEnum(MetricsDataFrameNameBase):
+ """
+ An enumeration representing the names of DataFrames containing resource hoarding metrics.
+
+ Attributes:
+ JOBS: DataFrame name for jobs with efficiency metrics.
+ USERS: DataFrame name for users with efficiency metrics.
+ PI_GROUPS: DataFrame name for PI accounts/groups with efficiency metrics.
+ JOBS_WITH_RESOURCE_HOARDING_METRICS: DataFrame name for jobs with resource hoarding metrics.
+ USERS_WITH_RESOURCE_HOARDING_METRICS: DataFrame name for users with resource hoarding metrics.
+ PI_GROUPS_WITH_RESOURCE_HOARDING_METRICS: DataFrame name for PI accounts/groups with resource hoarding metrics.
+ """
+
+ # Reuse canonical values to avoid duplicate string literals
+ JOBS = MetricsDataFrameNameEnum.JOBS.value
+ USERS = MetricsDataFrameNameEnum.USERS.value
+ PI_GROUPS = MetricsDataFrameNameEnum.PI_GROUPS.value
+ JOBS_WITH_RESOURCE_HOARDING_METRICS = "jobs_with_resource_hoarding_metrics"
+ USERS_WITH_RESOURCE_HOARDING_METRICS = "users_with_resource_hoarding_metrics"
+ PI_GROUPS_WITH_RESOURCE_HOARDING_METRICS = "pi_accounts_with_resource_hoarding_metrics"
+
+
@unique
class PreprocessingErrorTypeEnum(Enum):
"""An enumeration representing different error types that could occur during preprocessing.
diff --git a/src/config/remote_config.py b/src/config/remote_config.py
index 8a0c039..d4afb8a 100644
--- a/src/config/remote_config.py
+++ b/src/config/remote_config.py
@@ -1,8 +1,12 @@
from abc import ABC, abstractmethod
import json
import os
+import warnings
from requests_cache import CachedSession
from pathlib import Path
+from .enum_constants import NodeInfoKeyEnum
+import pandas as pd
+from src.warnings import NodeNotFoundWarning
class RemoteConfigFetcher(ABC):
@@ -28,7 +32,8 @@ def info_name(self) -> str:
"""Type of information being fetched (e.g., 'partition')."""
pass
- def _validate_info(self, info: list[dict]) -> bool:
+ @classmethod
+ def _validate_info(cls, info: list[dict]) -> bool:
"""Validate that the fetched information is a list of dictionaries.
Args:
@@ -64,14 +69,15 @@ def get_info(self, offline: bool = False) -> list[dict]:
response = self.session.get(self.url, timeout=10)
if response.status_code == 200:
remote_info = response.json()
- if not self._validate_info(remote_info):
+ if not RemoteConfigFetcher._validate_info(remote_info):
raise ValueError(f"Invalid {self.info_name} information format.")
if os.getenv("RUN_ENV") != "TEST":
# Ensure directory exists
self.local_path.parent.mkdir(parents=True, exist_ok=True)
with open(self.local_path, "w") as f:
json.dump(remote_info, f, indent=2)
- print(f"Fetched and saved {self.local_path.name} from remote URL.")
+ if os.getenv("OUTPUT_MODE") == "VERBOSE":
+ print(f"Fetched and saved {self.local_path.name} from remote URL.")
return remote_info
else:
print(
@@ -87,9 +93,10 @@ def get_info(self, offline: bool = False) -> list[dict]:
try:
with open(self.local_path) as file:
local_info = json.load(file)
- if not self._validate_info(local_info):
+ if not self.__class__._validate_info(local_info):
raise ValueError(f"Invalid {self.info_name} information format in local file.")
- print(f"Loaded {self.local_path.name} from local file.")
+ if os.getenv("OUTPUT_MODE") == "VERBOSE":
+ print(f"Loaded {self.local_path.name} from local file.")
return local_info
except FileNotFoundError as e:
raise FileNotFoundError(
@@ -115,7 +122,8 @@ def info_name(self) -> str:
"""Type of information being fetched."""
return "partition"
- def _validate_info(self, info: list[dict]) -> bool:
+ @classmethod
+ def _validate_info(cls, info: list[dict]) -> bool:
"""Validate the fetched partition information.
This function checks that each partition info dictionary contains 'name' and 'type' keys.
@@ -132,7 +140,7 @@ def _validate_info(self, info: list[dict]) -> bool:
if not all("name" in p and "type" in p for p in info):
raise ValueError("Each partition info dictionary must contain 'name' and 'type' keys.")
- return super()._validate_info(info)
+ return RemoteConfigFetcher._validate_info(info)
class NodeInfoFetcher(RemoteConfigFetcher):
@@ -152,3 +160,117 @@ def local_path(self) -> Path:
def info_name(self) -> str:
"""Type of information being fetched."""
return "node"
+
+ @classmethod
+ def _validate_info(cls, info: list[dict]) -> bool:
+ """Validate the fetched node information.
+
+ This function checks that each node info dictionary contains the keys listed in NodeInfoKeyEnum.
+ It also applies the base class validation.
+
+ Args:
+ info (list[dict]): The fetched node information to validate.
+
+ Raises:
+ ValueError: If the fetched data does not contain the expected keys.
+
+ Returns:
+            bool: True if the data is valid, False otherwise.
+        """
+
+ for p in info:
+ missing_keys = [key.value for key in NodeInfoKeyEnum.__members__.values() if key.value not in p]
+ if missing_keys:
+ raise ValueError(
+ "Each node info dictionary must contain the keys listed in NodeInfoKeyEnum."
+ f" Missing keys: {missing_keys}"
+ f"{' in ' + p['nodes_folded'] if 'nodes_folded' in p else ''}"
+ )
+ return RemoteConfigFetcher._validate_info(info)
+
+ @classmethod
+ def get_node_memory(
+ cls,
+ node_name: str,
+ node_info: list[dict],
+ offline: bool = False,
+ ) -> int | pd.api.typing.NAType:
+ """Get the total CPU memory of a node in GiB.
+
+ Args:
+ node_name (str): The name of the node.
+ node_info (list[dict]): The list of node information dictionaries obtained by calling get_info().
+            offline (bool): Currently unused; node_info is supplied by the caller.
+
+ Raises:
+ ValueError: If there is an error calculating the node memory.
+
+ Returns:
+ int: Total memory of the node in GiB, or pd.NA if not found.
+ """
+ if not NodeInfoFetcher._validate_info(node_info):
+ raise ValueError(
+ f"Invalid {NodeInfoFetcher.info_name} information format in node_info."
+ f" Call {NodeInfoFetcher.get_info.__name__}() to fetch the latest information."
+ )
+ try:
+ # see if that name exists in the nodes info file
+ for node_batch in node_info:
+ for name in node_batch[NodeInfoKeyEnum.NODES.value]:
+ if name == node_name:
+ return node_batch[NodeInfoKeyEnum.RAM.value]
+            # If the node is not found, issue a warning
+            warnings.warn(
+                NodeNotFoundWarning(
+                    node_name, f"Node '{node_name}' not found in the node configuration file."
+ ),
+ stacklevel=2,
+ )
+ return pd.NA
+ except Exception as e:
+ raise ValueError(f"Error calculating node memory from {NodeInfoFetcher.local_path}: {e}") from e
+
+ @classmethod
+ def get_node_info_values(
+ cls,
+ node_name: str,
+ node_info: list[dict],
+ members: set[NodeInfoKeyEnum],
+ offline: bool = False,
+ ) -> dict[NodeInfoKeyEnum, int | pd.api.typing.NAType]:
+ """
+ Get the values for specified NodeInfoKeyEnum members for a given node.
+
+ Args:
+ node_name (str): The name of the node.
+ node_info (list[dict]): The list of node information dictionaries obtained by calling get_info().
+ members (set[NodeInfoKeyEnum]): Set of NodeInfoKeyEnum members to retrieve values for.
+            offline (bool): Currently unused; node_info is supplied by the caller.
+
+ Raises:
+ ValueError: If there is an error retrieving the node info.
+
+ Returns:
+ dict[NodeInfoKeyEnum, int | pd.api.typing.NAType]:
+ Dictionary of requested member values for the node,
+ or pd.NA if node not found.
+ """
+ if not NodeInfoFetcher._validate_info(node_info):
+ raise ValueError(
+ f"Invalid {NodeInfoFetcher.info_name} information format in node_info."
+ f" Call {NodeInfoFetcher.get_info.__name__}() to fetch the latest information first."
+ )
+ try:
+ for node_batch in node_info:
+ for name in node_batch[NodeInfoKeyEnum.NODES.value]:
+ if name == node_name:
+ return {member: node_batch[member.value] for member in members}
+            # If the node is not found, issue a warning
+            warnings.warn(
+                NodeNotFoundWarning(
+                    node_name, f"Node '{node_name}' not found in the node configuration file."
+ ),
+ stacklevel=2,
+ )
+ return {member: pd.NA for member in members}
+ except Exception as e:
+ raise ValueError(f"Error retrieving node info from {NodeInfoFetcher.local_path}: {e}") from e
diff --git a/src/config/snapshots/node_info.json b/src/config/snapshots/node_info.json
index 082ffca..9bc68d6 100644
--- a/src/config/snapshots/node_info.json
+++ b/src/config/snapshots/node_info.json
@@ -1835,10 +1835,9 @@
"owner_email": ""
},
{
- "nodes_folded": "gypsum-gpu[160-164,166,168,171,173-177,181,190-192]",
- "count": 17,
+ "nodes_folded": "gypsum-gpu[161-164,166,168,171,173-177,181,190-192]",
+ "count": 16,
"nodes": [
- "gypsum-gpu160",
"gypsum-gpu161",
"gypsum-gpu162",
"gypsum-gpu163",
diff --git a/src/database/database_connection.py b/src/database/database_connection.py
index 848493d..dbbe028 100644
--- a/src/database/database_connection.py
+++ b/src/database/database_connection.py
@@ -14,9 +14,18 @@ class DatabaseConnection:
"""
def __init__(self, db_url: str, read_only: bool = True) -> None:
+ """
+ Initialize the DatabaseConnection object.
+
+ Args:
+ db_url (str): The URL of the DuckDB database.
+ read_only (bool): If True, the connection will be read-only. Defaults to True.
+
+ Returns:
+ None
+ """
self.db_url = db_url
- self.connection = None
- self.connection = self._connect(read_only=read_only)
+ self.connection: duckdb.DuckDBPyConnection | None = self._connect(read_only=read_only)
print(f"Connected to {self.db_url}")
def _connect(self, read_only: bool) -> duckdb.DuckDBPyConnection:
diff --git a/src/preprocess/allocated_vram.py b/src/preprocess/allocated_vram.py
new file mode 100644
index 0000000..ed96f11
--- /dev/null
+++ b/src/preprocess/allocated_vram.py
@@ -0,0 +1,400 @@
+import pandas as pd
+import re
+
+from ..config.constants import (
+ VRAM_VALUES,
+ MULTIVALENT_GPUS,
+)
+from ..config.enum_constants import PreprocessingErrorTypeEnum
+from .errors import JobPreprocessingError
+
+
+def _get_multivalent_vram_based_on_node(gpu_type: str, node: str) -> int:
+ """
+ Calculate specific VRAM based on a node name for GPUs with multiple VRAM sizes.
+
+ The function checks if a pairing of a given GPU and a node name exists. If it exists, it determines the amount of
+ VRAM available for that GPU on that node.
+
+ Args:
+ gpu_type (str): Type of GPU (e.g., "a100", "v100").
+ node (str): Name of the node.
+
+ Returns:
+ int: VRAM size in GiB for the given GPU type and node.
+ Returns 0 if the node does not match any of the patterns for the given GPU type.
+
+ Notes:
+ This logic is based on the cluster specifications documented at:
+ https://docs.unity.rc.umass.edu/documentation/cluster_specs/nodes/
+ """
+ gpu_type = gpu_type.lower()
+ vram = 0
+ if gpu_type not in MULTIVALENT_GPUS:
+ # if the GPU is not multivalent we do not need to check the node
+ vram = 0
+
+ else:
+ if gpu_type == "a100":
+ if node.startswith("ece-gpu"):
+ vram = 40 # A100 with 40GB
+ elif re.match("^(gpu0(1[3-9]|2[0-4]))|(gpu042)|(umd-cscdr-gpu00[1-2])|(uri-gpu00[1-8])$", node):
+ vram = 80 # A100 with 80GB
+ else:
+ # if the node does not match any of the patterns, it is not a valid node for this GPU type
+ # so we return 0
+ vram = 0
+ elif gpu_type == "v100":
+ if re.match("^(gpu00[1-7])|(power9-gpu009)|(power9-gpu01[0-6])$", node):
+ vram = 16 # V100 with 16GB
+ elif re.match("^(gpu01[1-2])|(power9-gpu00[1-8])$", node):
+ vram = 32 # V100 with 32GB
+ else:
+ # if the node does not match any of the patterns, it is not a valid node for this GPU type
+ # so we return 0
+ vram = 0
+ return vram
+
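+# Illustrative examples, per the node patterns above:
+#     _get_multivalent_vram_based_on_node("a100", "ece-gpu001")  # -> 40
+#     _get_multivalent_vram_based_on_node("v100", "gpu005")      # -> 16
+#     _get_multivalent_vram_based_on_node("v100", "gpu042")      # -> 0 (no pattern match)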
+
+def _calculate_approx_vram_single_gpu_type(
+ gpu_types: list[str] | dict[str, int], node_list: list[str], gpu_count: int, gpu_mem_usage: int
+) -> int:
+ """
+ Calculate the approximate VRAM for a job with a single GPU type.
+
+ This helper function computes the total VRAM allocated for a job based on the GPU type,
+ the nodes where the job ran, the number of GPUs requested, and the GPU memory usage.
+
+ Args:
+ gpu_types:
+ - list[str]: list containing a single GPU type used in the job.
+ - dict[str, int]: dictionary of GPU types and the count of GPUs of each type used in the job.
+ node_list (list[str]): List of nodes that the job ran on.
+ gpu_count (int): Number of GPUs requested by the job.
+ gpu_mem_usage (int): GPU memory usage in bytes.
+
+ Returns:
+ int: Total allocated VRAM for the job in GiB (gibibyte).
+
+ Raises:
+ JobPreprocessingError: If an unknown GPU type is encountered or if no valid nodes are found for a
+ multivalent GPU.
+ """
+
+ if isinstance(gpu_types, dict):
+ gpu, gpu_count = list(gpu_types.items())[0]
+ else:
+ gpu = gpu_types[0]
+ gpu = gpu.lower()
+
+ if gpu not in MULTIVALENT_GPUS:
+ # if the GPU is not multivalent, return the VRAM value for that GPU
+ if gpu in VRAM_VALUES:
+ return VRAM_VALUES[gpu] * gpu_count
+ else:
+ raise JobPreprocessingError(PreprocessingErrorTypeEnum.UNKNOWN_GPU_TYPE, gpu)
+
+ # calculate VRAM for multivalent GPUs
+ total_vram = 0
+
+ # if all GPUs are on the same node, multiply the VRAM of that node by the number of GPUs
+ if len(node_list) == 1:
+ node = node_list[0]
+ total_vram = _get_multivalent_vram_based_on_node(gpu, node) * gpu_count
+
+ # if all GPUs are on different nodes, sum the VRAM of each node
+ # and return the total VRAM
+ elif len(node_list) == gpu_count:
+ for node in node_list:
+ total_vram += _get_multivalent_vram_based_on_node(gpu, node)
+
+ # if there are multiple nodes, but not all GPUs are on different nodes
+ # we need to calculate the total VRAM based on the minimum VRAM of the nodes
+ else:
+ # calculate available VRAM for all nodes in the node_list
+        vram_values = set()  # to avoid duplicates
+        for node in node_list:
+            node_vram = _get_multivalent_vram_based_on_node(gpu, node)
+            if node_vram != 0:  # only consider nodes with non-zero VRAM
+                vram_values.add(node_vram)
+
+ if not vram_values:
+ # if no valid nodes are found for the multivalent GPU type in the node list, log an error
+ raise JobPreprocessingError(
+ PreprocessingErrorTypeEnum.UNKNOWN_GPU_TYPE,
+ f"No valid nodes found for multivalent GPU type '{gpu}' in node list: {node_list}",
+ )
+
+ sorted_vram_values = sorted(list(vram_values))
+ total_vram = sorted_vram_values.pop(0) * gpu_count # use the node with the minimum VRAM value
+ # if the total VRAM is less than the GPU memory usage, use the VRAM from the GPU in the next larger node
+ while total_vram < (gpu_mem_usage / 2**30) and sorted_vram_values:
+ total_vram = sorted_vram_values.pop(0) * gpu_count
+
+ return total_vram
+
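+# Worked example (illustrative): with gpu_types=["a100"], gpu_count=2, and a node list
+# (neither a single node nor one node per GPU) whose a100 variants are 40 and 80 GiB,
+# the minimum variant gives 40 * 2 = 80 GiB; if gpu_mem_usage exceeds 80 GiB, the
+# estimate escalates to 80 * 2 = 160 GiB.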
+
+def _adjust_vram_for_multivalent_gpus(
+ multivalent: dict,
+ allocated_vram: int,
+ gpu_mem_usage: int | float,
+ gpus_with_exact_values: dict[str, int],
+) -> int:
+ """
+ Adjust the allocated VRAM for multivalent GPUs to meet or exceed the GPU memory usage.
+
+ This function increases the allocated VRAM by adding the minimum VRAM for each multivalent GPU
+ until the total allocated VRAM is at least as large as the required GPU memory usage.
+
+ Args:
+ multivalent (dict): Dictionary of GPU types (str) to counts (int) for multivalent GPUs.
+ allocated_vram (int): Current total allocated VRAM in GiB.
+ gpu_mem_usage (int | float): GPU memory usage in bytes.
+ gpus_with_exact_values (dict[str, int]): Dictionary of GPU types (str) to exact VRAM values (int).
+
+ Returns:
+ int: Adjusted total allocated VRAM in GiB.
+ """
+ # Adjust VRAM for GPUs with exact values first
+ for gpu, exact_vram in gpus_with_exact_values.items():
+ allocated_vram += exact_vram
+ multivalent[gpu] -= 1 # Reduce count for GPUs with exact values
+
+    # Add the minimum VRAM variant for each remaining multivalent GPU until the usage is covered
+ for gpu, gpu_count in multivalent.items():
+ while gpu_count > 0 and allocated_vram < (gpu_mem_usage / 2**30):
+ allocated_vram += min(MULTIVALENT_GPUS[gpu])
+ gpu_count -= 1
+
+ return allocated_vram
+
+
+def _get_possible_vram_values(multivalent_gpu_type: str, node_list: list[str]) -> list[int]:
+ """
+ Return all possible VRAM values for a given multivalent GPU type across a list of nodes.
+
+ Args:
+ multivalent_gpu_type (str): The GPU type (e.g., "a100", "v100").
+ node_list (list[str]): List of node names that the job ran on.
+
+ Returns:
+ list[int]: List of non-zero VRAM values for the given GPU across nodes.
+ """
+
+ multivalent_gpu = multivalent_gpu_type.lower()
+ possible_vrams = []
+ for node in node_list:
+ vram = _get_multivalent_vram_based_on_node(multivalent_gpu, node)
+ if vram in MULTIVALENT_GPUS[multivalent_gpu]: # if it matches a node for the given GPU
+ possible_vrams.append(vram)
+ return possible_vrams
+
+
+def _can_calculate_accurately(multivalent_gpu_type: str, count: int, node_list: list[str]) -> bool:
+ """
+ Determine whether VRAM can be calculated accurately for a multivalent GPU type based on the job's node list.
+
+ Args:
+ multivalent_gpu_type (str): The GPU type (e.g., "a100", "v100").
+ count (int): Number of GPUs of this type.
+ node_list (list[str]): List of node names that the job ran on.
+
+ Returns:
+        bool: True if the number of VRAM-matched nodes equals the GPU count, or if all
+            matched nodes share a single possible VRAM value for this GPU type.
+ """
+ multivalent_gpu = multivalent_gpu_type.lower()
+ possible_vrams = _get_possible_vram_values(multivalent_gpu, node_list)
+ return len(possible_vrams) == count or len(set(possible_vrams)) == 1
+
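+# Illustrative: two a100 GPUs on nodes matching the 40 GiB and 80 GiB variants give
+# possible_vrams == [40, 80]; len(possible_vrams) == count, so the per-node sum
+# 40 + 80 = 120 GiB is exact.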
+
+def _calculate_vram_accurately(multivalent_gpu_type: str, count: int, node_list: list[str]) -> int:
+ """
+ Calculate VRAM for a multivalent GPU type based on the job's node list.
+
+ This can be done when all matched nodes have consistent VRAM configuration or enough distinct nodes exist.
+
+ Args:
+ multivalent_gpu_type (str): The GPU type (e.g., "a100", "v100").
+ count (int): The number of GPUs of this type used in the job.
+ node_list (list[str]): List of node names that the job ran on.
+
+ Returns:
+ int: Total VRAM in GiB for the given GPU type.
+ """
+ multivalent_gpu = multivalent_gpu_type.lower()
+ possible_vrams = _get_possible_vram_values(multivalent_gpu, node_list)
+ # If all possible VRAM values are the same, return that value multiplied by the count
+ if len(set(possible_vrams)) == 1:
+ return possible_vrams[0] * count
+
+ # Otherwise, return the sum of all matching VRAM values
+ return sum(possible_vrams)
+
+
+def _calculate_non_multivalent_vram(non_multivalent: dict) -> int:
+ """
+ Calculate the VRAM allocated for non-multivalent GPUs.
+
+ Args:
+ non_multivalent (dict): Dictionary with non-multivalent GPU types as keys and their counts as values.
+
+ Returns:
+ int: Total allocated VRAM for non-multivalent GPUs in GiB.
+
+ Raises:
+ JobPreprocessingError: If an unknown GPU type is encountered.
+ """
+ allocated_vram = 0
+ for gpu, count in non_multivalent.items():
+ if gpu in VRAM_VALUES:
+ allocated_vram += VRAM_VALUES[gpu] * count
+ else:
+ raise JobPreprocessingError(PreprocessingErrorTypeEnum.UNKNOWN_GPU_TYPE, gpu)
+ return allocated_vram
+
+
+def _calculate_multivalent_vram(multivalent: dict, node_list: list[str], gpu_mem_usage: int) -> int:
+ """
+ Calculate the VRAM allocated for multivalent GPUs.
+
+ Args:
+ multivalent (dict): Dictionary with multivalent GPU types as keys and their counts as values.
+ node_list (list[str]): List of nodes that the job ran on.
+ gpu_mem_usage (int): GPU memory usage in bytes.
+
+ Returns:
+ int: Total allocated VRAM for multivalent GPUs in GiB.
+ """
+ allocated_vram = 0
+ gpus_with_exact_values: dict[str, int] = dict()
+
+ for gpu, count in multivalent.items():
+ if _can_calculate_accurately(gpu, count, node_list):
+ vram_value = _calculate_vram_accurately(gpu, count, node_list)
+ allocated_vram += vram_value
+ gpus_with_exact_values[gpu] = vram_value
+ else:
+ allocated_vram += min(MULTIVALENT_GPUS[gpu]) * count
+
+ if allocated_vram < gpu_mem_usage / 2**30 and len(gpus_with_exact_values) < len(multivalent):
+ allocated_vram = _adjust_vram_for_multivalent_gpus(
+ multivalent, allocated_vram, gpu_mem_usage, gpus_with_exact_values
+ )
+ return allocated_vram
+
+
+def _calculate_alloc_vram_multiple_gpu_types_with_count(
+ gpu_types: dict[str, int], node_list: list[str], gpu_mem_usage: int
+) -> int:
+ """
+ Calculate allocated VRAM for a job with multiple GPU types given a dictionary.
+
+ The dictionary has GPU Types as keys and their respective counts as values.
+
+ Args:
+ gpu_types (dict[str, int]): Dictionary with GPU types as keys and their counts as values.
+ node_list (list[str]): List of nodes that the job ran on.
+ gpu_mem_usage (int): GPU memory usage in bytes.
+
+ Returns:
+ int: Total allocated VRAM for the job in GiB.
+ """
+ multivalent_gpus = {gpu.lower(): count for gpu, count in gpu_types.items() if gpu.lower() in MULTIVALENT_GPUS}
+ non_multivalent_gpus = {
+ gpu.lower(): count for gpu, count in gpu_types.items() if gpu.lower() not in MULTIVALENT_GPUS
+ }
+
+ alloc_vram = 0
+ alloc_vram += _calculate_non_multivalent_vram(non_multivalent_gpus)
+ alloc_vram += _calculate_multivalent_vram(multivalent_gpus, node_list, gpu_mem_usage)
+
+ return alloc_vram
+
+
+def _get_approx_allocated_vram(
+ gpu_types: list[str] | dict[str, int], node_list: list[str], gpu_count: int, gpu_mem_usage: int
+) -> int:
+ """
+ Get the total allocated VRAM for a job based on its GPU type and node list.
+
+ This function estimates the total VRAM allocated for a job based on the GPU types used
+ and the nodes that the job ran on.
+
+ Args:
+ gpu_types:
+ This could be a list of strings (if using the old database format) or a dictionary with GPU types as keys
+ and their counts as values (if using the new database format).
+ - list[str]: List containing the types of GPUs used in the job.
+ - dict[str, int]: Dictionary with the type of GPUs and the exact count used in the job.
+ node_list (list[str]): List of nodes that the job ran on.
+ gpu_count (int): Number of GPUs requested by the job.
+ gpu_mem_usage (int): GPU memory usage in bytes.
+
+ Returns:
+ int: Total allocated (estimate) VRAM for the job in GiB (gibibyte).
+
+ Raises:
+ JobPreprocessingError: If an unknown GPU type is encountered or if the GPU types are malformed.
+
+ Notes:
+ - When `gpu_types` is a dictionary, the function calculates VRAM based on the counts of each GPU type.
+ - For multivalent GPUs, the VRAM is determined based on the nodes where the GPUs are located.
+ - If the exact number of GPUs is not known, the function uses the minimum VRAM value among the available GPUs.
+ """
+
+ if isinstance(gpu_types, (list, dict)):
+ if not gpu_types:
+ return 0
+ elif pd.isna(gpu_types):
+ return 0
+
+ # Case 1: Handle jobs with one type of GPU
+ if len(gpu_types) == 1:
+ return _calculate_approx_vram_single_gpu_type(gpu_types, node_list, gpu_count, gpu_mem_usage)
+
+ # Case 2: Handle jobs with multiple types of GPUs
+ # Case 2.1: Handle jobs using the new GPUType format
+ if isinstance(gpu_types, dict):
+ gpu_types = {gpu.lower(): count for gpu, count in gpu_types.items()}
+ total_vram = _calculate_alloc_vram_multiple_gpu_types_with_count(gpu_types, node_list, gpu_mem_usage)
+ return total_vram
+
+ # Case 2.2: Handle jobs with the old GPUType format (a list)
+
+ # Case 2.2.1: Handle cases where each GPU has a different type.
+ if len(gpu_types) == gpu_count:
+ total_vram = 0
+ for gpu in gpu_types:
+ gpu = gpu.lower()
+ if gpu in MULTIVALENT_GPUS:
+ for node in node_list:
+ total_vram += _get_multivalent_vram_based_on_node(gpu, node)
+ else:
+ if gpu in VRAM_VALUES:
+ total_vram += VRAM_VALUES[gpu]
+ else:
+ raise JobPreprocessingError(PreprocessingErrorTypeEnum.UNKNOWN_GPU_TYPE, gpu)
+ return total_vram
+
+ # Case 2.2.2: Handle cases where the number of GPUs is different from number of GPUTypes.
+ allocated_vrams = set()
+ for gpu in gpu_types:
+ gpu = gpu.lower()
+ if gpu in MULTIVALENT_GPUS:
+ for node in node_list:
+ multivalent_vram = _get_multivalent_vram_based_on_node(gpu, node)
+ if multivalent_vram != 0:
+ allocated_vrams.add(multivalent_vram)
+ else:
+ if gpu in VRAM_VALUES:
+ allocated_vrams.add(VRAM_VALUES[gpu])
+ else:
+ raise JobPreprocessingError(PreprocessingErrorTypeEnum.UNKNOWN_GPU_TYPE, gpu)
+
+ vram_values = sorted(list(allocated_vrams))
+ total_vram = vram_values.pop(0) * gpu_count # use the GPU with the minimum VRAM value
+    # if the total VRAM is less than the GPU memory usage, use the VRAM of the next larger GPU
+ while total_vram < (gpu_mem_usage / 2**30) and vram_values:
+ total_vram = vram_values.pop(0) * gpu_count
+ return total_vram
diff --git a/src/preprocess/constraints.py b/src/preprocess/constraints.py
new file mode 100644
index 0000000..3ae374a
--- /dev/null
+++ b/src/preprocess/constraints.py
@@ -0,0 +1,116 @@
+from ..config.enum_constants import PreprocessingErrorTypeEnum
+from ..config.constants import (
+ VRAM_VALUES,
+ MONO_GPU_PARTITION_GPU_TYPE,
+)
+from pandas.api.typing import NAType
+import pandas as pd
+from .errors import JobPreprocessingError
+
+
+def _get_vram_constraint(constraints: list[str], gpu_count: int) -> int | NAType:
+ """
+ Get the VRAM assigned to a job based on its constraints and GPU usage.
+
+ This function extracts VRAM requests from the job constraints and returns the maximum requested VRAM from the
+ constraints.
+
+ Args:
+ constraints (list[str]): List of constraints from the job, which may include VRAM requests.
+ gpu_count (int): Number of GPUs requested by the job.
+
+ Returns:
+ int | NAType: Maximum VRAM amount in GiB obtained based on the provided constraints, multiplied by the
+            number of GPUs. Returns pd.NA if no VRAM constraints are provided.
+
+ Raises:
+ JobPreprocessingError: If a malformed constraint is encountered or if an unknown GPU type is specified.
+ """
+ vram_constraints = []
+ for constr in constraints:
+ constr = constr.strip("'").lower() # Normalize constraints to lowercase and strip quotes
+ if constr.startswith("vram"):
+ vram_constraints.append(int(constr.replace("vram", "")))
+ elif constr.startswith("gpu"):
+ # if the constraint starts with "gpu", it is expected to be in the format "gpu:type"
+ split_constr = constr.split(":")
+ if len(split_constr) <= 1:
+ # Add error records for malformed constraints and missing GPU types
+ raise JobPreprocessingError(PreprocessingErrorTypeEnum.MALFORMED_CONSTRAINT, constr)
+
+ gpu_type = split_constr[1].lower()
+
+ if gpu_type in VRAM_VALUES:
+ vram_constraints.append(VRAM_VALUES[gpu_type])
+ else:
+ raise JobPreprocessingError(PreprocessingErrorTypeEnum.UNKNOWN_GPU_TYPE, gpu_type)
+ else:
+ # if they enter a GPU name without the prefix
+ if constr in VRAM_VALUES:
+ vram_constraints.append(VRAM_VALUES[constr])
+
+    if not vram_constraints:
+        return pd.NA  # no VRAM constraints were provided
+
+ return max(vram_constraints) * gpu_count
+
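+# Illustrative sketch:
+#     _get_vram_constraint(["vram40", "vram80"], gpu_count=1)  # -> 80 (max of the requests)
+#     _get_vram_constraint([], gpu_count=2)                    # -> pd.NA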
+
+def _get_monogpu_partition_gpu_type(partition: str) -> str | None:
+ """
+ Get the GPU type based on the partition if it only has one type of GPU.
+
+ This function relies on the mapping of mono-GPU partition names to their corresponding GPU types.
+
+ Args:
+ partition (str): The name of the partition (e.g., "superpod-a100", "umd-cscdr-gpu").
+
+ Returns:
+ str | None: The GPU type associated with the partition or None if no specific mapping exists.
+ """
+ return MONO_GPU_PARTITION_GPU_TYPE.get(partition.lower(), None)
+
+
+def _get_partition_constraint(partition: str, gpu_count: int) -> int | NAType:
+ """
+ Get the VRAM size based on the partition name requested.
+
+ This function returns the VRAM size in GiB for a given partition name if it has only one type of GPU.
+ If the partition is not recognized, or if it has multiple types of GPUs, it returns NAType.
+
+ Args:
+ partition (str): The name of the partition (e.g., "superpod-a100", "umd-cscdr-gpu").
+ gpu_count (int): The number of GPUs requested by the job.
+
+ Returns:
+ int | NAType: The requested VRAM in GiB or NAType if the partition is not recognized.
+ """
+ gpu_type = _get_monogpu_partition_gpu_type(partition)
+ if gpu_type is None:
+ # if the GPU Type is not inferrable from the partition, return NAType
+ return pd.NA
+ return VRAM_VALUES[gpu_type] * gpu_count
+
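+# Illustrative sketch, assuming VRAM_VALUES["a100-80g"] == 80:
+#     _get_partition_constraint("superpod-a100", 2)             # -> 160
+#     _get_partition_constraint("some-multi-gpu-partition", 2)  # -> pd.NA (hypothetical name not in the map)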
+
+def _get_requested_vram(vram_constraint: int | NAType, partition_constraint: int | NAType) -> int | NAType:
+ """
+ Get the requested VRAM for a job based on its constraints and partition.
+
+ This function determines the requested VRAM for a job by checking the VRAM constraint and the partition constraint.
+ If both are provided, it returns the partition constraint as that is more accurate.
+ If only one is provided, it returns that value.
+ If neither is provided, it returns NAType.
+
+ Args:
+ vram_constraint (int | NAType): The VRAM constraint from the job's constraints.
+ partition_constraint (int | NAType): The VRAM size based on the partition name.
+
+ Returns:
+ int | NAType: The requested VRAM in GiB or NAType if no constraints are provided.
+ """
+ if pd.isna(vram_constraint) and pd.isna(partition_constraint):
+ return pd.NA
+ if pd.isna(partition_constraint):
+ return vram_constraint
+
+ # if a partition constraint is provided, we use it
+ return partition_constraint
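+
+
+# Precedence sketch (illustrative):
+#     _get_requested_vram(80, 160)         # -> 160 (partition constraint wins)
+#     _get_requested_vram(80, pd.NA)       # -> 80
+#     _get_requested_vram(pd.NA, pd.NA)    # -> pd.NA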
diff --git a/src/preprocess/errors.py b/src/preprocess/errors.py
index 86d8e85..3fc971d 100644
--- a/src/preprocess/errors.py
+++ b/src/preprocess/errors.py
@@ -1,7 +1,7 @@
from ..config.enum_constants import PreprocessingErrorTypeEnum
-class JobProcessingError(ValueError):
+class JobPreprocessingError(ValueError):
"""
Custom exception for errors encountered during job processing.
"""
diff --git a/src/preprocess/preprocess.py b/src/preprocess/preprocess.py
index 7a8f86b..d271179 100644
--- a/src/preprocess/preprocess.py
+++ b/src/preprocess/preprocess.py
@@ -1,4 +1,3 @@
-import re
import warnings
from collections.abc import Callable
@@ -7,11 +6,8 @@
from pandas.api.typing import NAType
from ..config.constants import (
- VRAM_VALUES,
DEFAULT_MIN_ELAPSED_SECONDS,
ATTRIBUTE_CATEGORIES,
- MULTIVALENT_GPUS,
- PARTITION_TO_GPU_MAP,
)
from ..config.enum_constants import (
StatusEnum,
@@ -24,510 +20,14 @@
RequiredColumnsEnum,
ExcludedColumnsEnum,
)
+from .allocated_vram import _get_approx_allocated_vram
+from .constraints import _get_vram_constraint, _get_partition_constraint, _get_requested_vram
from ..config.remote_config import PartitionInfoFetcher
from ..config.paths import PREPROCESSING_ERRORS_LOG_FILE
-from .errors import JobProcessingError
+from .errors import JobPreprocessingError
-processing_error_logs = []
-error_indices = set()
-
-
-def _get_multivalent_vram_based_on_node(gpu_type: str, node: str) -> int:
- """
- Calculate specific VRAM based on a node name for GPUs with multiple VRAM sizes.
-
- The function checks if a pairing of a given GPU and a node name exists. If it exists, it determines the amount of
- VRAM available for that GPU on that node.
-
- Args:
- gpu_type (str): Type of GPU (e.g., "a100", "v100").
- node (str): Name of the node.
-
- Returns:
- int: VRAM size in GiB for the given GPU type and node.
- Returns 0 if the node does not match any of the patterns for the given GPU type.
-
- Notes:
- This logic is based on the cluster specifications documented at:
- https://docs.unity.rc.umass.edu/documentation/cluster_specs/nodes/
- """
- gpu_type = gpu_type.lower()
- vram = 0
- if gpu_type not in MULTIVALENT_GPUS:
- # if the GPU is not multivalent we do not need to check the node
- vram = 0
-
- else:
- if gpu_type == "a100":
- if node.startswith("ece-gpu"):
- vram = 40 # A100 with 40GB
- elif re.match("^(gpu0(1[3-9]|2[0-4]))|(gpu042)|(umd-cscdr-gpu00[1-2])|(uri-gpu00[1-8])$", node):
- vram = 80 # A100 with 80GB
- else:
- # if the node does not match any of the patterns, it is not a valid node for this GPU type
- # so we return 0
- vram = 0
- elif gpu_type == "v100":
- if re.match("^(gpu00[1-7])|(power9-gpu009)|(power9-gpu01[0-6])$", node):
- vram = 16 # V100 with 16GB
- elif re.match("^(gpu01[1-2])|(power9-gpu00[1-8])$", node):
- vram = 32 # V100 with 32GB
- else:
- # if the node does not match any of the patterns, it is not a valid node for this GPU type
- # so we return 0
- vram = 0
- return vram
-
-
-def _get_vram_constraint(constraints: list[str], gpu_count: int) -> int | NAType:
- """
- Get the VRAM assigned to a job based on its constraints and GPU usage.
-
- This function extracts VRAM requests from the job constraints and returns the maximum requested VRAM from the
- constraints.
-
- Args:
- constraints (list[str]): List of constraints from the job, which may include VRAM requests.
- gpu_count (int): Number of GPUs requested by the job.
-
- Returns:
- int | NAType: Maximum VRAM amount in GiB obtained based on the provided constraints, multiplied by the
- number of GPUs. Returns pd.NA if no VRAM constraints are provided or if no GPUs are requested.
-
- Raises:
- JobProcessingError: If a malformed constraint is encountered or if an unknown GPU type is specified.
- """
- vram_constraints = []
- for constr in constraints:
- constr = constr.strip("'").lower() # Normalize constraints to lowercase and strip quotes
- if constr.startswith("vram"):
- vram_constraints.append(int(constr.replace("vram", "")))
- elif constr.startswith("gpu"):
- # if the constraint starts with "gpu", it is expected to be in the format "gpu:type"
- split_constr = constr.split(":")
- if len(split_constr) <= 1:
- # Add error records for malformed constraints and missing GPU types
- raise JobProcessingError(PreprocessingErrorTypeEnum.MALFORMED_CONSTRAINT, constr)
-
- gpu_type = split_constr[1].lower()
-
- if gpu_type in VRAM_VALUES:
- vram_constraints.append(VRAM_VALUES[gpu_type])
- else:
- raise JobProcessingError(PreprocessingErrorTypeEnum.UNKNOWN_GPU_TYPE, gpu_type)
- else:
- # if they enter a GPU name without the prefix
- if constr in VRAM_VALUES:
- vram_constraints.append(VRAM_VALUES[constr])
-
- if not (len(vram_constraints)):
- return pd.NA # if no VRAM constraints are provided or no GPUs are requested return pd.NA
-
- return max(vram_constraints) * gpu_count
-
-
-def _get_partition_gpu(partition: str) -> str | None:
- """
- Get the GPU type based on the partition if it only has one type of GPU.
-
- This function maps specific partition names to their corresponding GPU types.
-
- Args:
- partition (str): The name of the partition (e.g., "superpod-a100", "umd-cscdr-gpu").
-
- Returns:
- str | None: The GPU type associated with the partition or None if no specific mapping exists.
- """
- return PARTITION_TO_GPU_MAP.get(partition.lower(), None)
-
-
-def _get_partition_constraint(partition: str, gpu_count: int) -> int | NAType:
- """
- Get the VRAM size based on the partition name requested.
-
- This function returns the VRAM size in GiB for a given partition name if it has only one type of GPU.
- If the partition is not recognized, or if it has multiple types of GPUs, it returns NAType.
-
- Args:
- partition (str): The name of the partition (e.g., "superpod-a100", "umd-cscdr-gpu").
- gpu_count (int): The number of GPUs requested by the job.
-
- Returns:
- int | NAType: The requested VRAM in GiB or NAType if the partition is not recognized.
- """
- gpu_type = _get_partition_gpu(partition)
- if gpu_type is None:
- # if the GPU Type is not inferrable from the partition, return NAType
- return pd.NA
- return VRAM_VALUES[gpu_type] * gpu_count
-
-
-def _get_requested_vram(vram_constraint: int | NAType, partition_constraint: int | NAType) -> int | NAType:
- """
- Get the requested VRAM for a job based on its constraints and partition.
-
- This function determines the requested VRAM for a job by checking the VRAM constraint and the partition constraint.
- If both are provided, it returns the partition constraint as that is more accurate.
- If only one is provided, it returns that value.
- If neither is provided, it returns NAType.
-
- Args:
- vram_constraint (int | NAType): The VRAM constraint from the job's constraints.
- partition_constraint (int | NAType): The VRAM size based on the partition name.
-
- Returns:
- int | NAType: The requested VRAM in GiB or NAType if no constraints are provided.
- """
- if pd.isna(vram_constraint) and pd.isna(partition_constraint):
- return pd.NA
- if pd.isna(partition_constraint):
- return vram_constraint
-
- # if a partition constraint is provided, we use it
- return partition_constraint
-
-
-def _calculate_approx_vram_single_gpu_type(
- gpu_types: list[str] | dict[str, int], node_list: list[str], gpu_count: int, gpu_mem_usage: int
-) -> int:
- """
- Calculate the approximate VRAM for a job with a single GPU type.
-
- This helper function computes the total VRAM allocated for a job based on the GPU type,
- the nodes where the job ran, the number of GPUs requested, and the GPU memory usage.
-
- Args:
- gpu_types:
- - list[str]: list containing a single GPU type used in the job.
- - dict[str, int]: dictionary of GPU types and the count of GPUs of each type used in the job.
- node_list (list[str]): List of nodes that the job ran on.
- gpu_count (int): Number of GPUs requested by the job.
- gpu_mem_usage (int): GPU memory usage in bytes.
-
- Returns:
- int: Total allocated VRAM for the job in GiB (gibibyte).
-
- Raises:
- JobProcessingError: If an unknown GPU type is encountered or if no valid nodes are found for a multivalent GPU.
- """
-
- if isinstance(gpu_types, dict):
- gpu, gpu_count = list(gpu_types.items())[0]
- else:
- gpu = gpu_types[0]
- gpu = gpu.lower()
-
- if gpu not in MULTIVALENT_GPUS:
- # if the GPU is not multivalent, return the VRAM value for that GPU
- if gpu in VRAM_VALUES:
- return VRAM_VALUES[gpu] * gpu_count
- else:
- raise JobProcessingError(PreprocessingErrorTypeEnum.UNKNOWN_GPU_TYPE, gpu)
-
- # calculate VRAM for multivalent GPUs
- total_vram = 0
-
- # if all GPUs are on the same node, multiply the VRAM of that node by the number of GPUs
- if len(node_list) == 1:
- node = node_list[0]
- total_vram = _get_multivalent_vram_based_on_node(gpu, node) * gpu_count
-
- # if all GPUs are on different nodes, sum the VRAM of each node
- # and return the total VRAM
- elif len(node_list) == gpu_count:
- for node in node_list:
- total_vram += _get_multivalent_vram_based_on_node(gpu, node)
-
- # if there are multiple nodes, but not all GPUs are on different nodes
- # we need to calculate the total VRAM based on the minimum VRAM of the nodes
- else:
- # calculate available VRAM for all nodes in the node_list
- vram_values = set() # to avoid duplicates
- for node in node_list:
- node_vram = _get_multivalent_vram_based_on_node(gpu, node)
- if node_vram != 0: # only consider nodes with non-zero VRAM
- vram_values.add(_get_multivalent_vram_based_on_node(gpu, node))
-
- if not vram_values:
- # if no valid nodes are found for the multivalent GPU type in the node list, log an error
- raise JobProcessingError(
- PreprocessingErrorTypeEnum.UNKNOWN_GPU_TYPE,
- f"No valid nodes found for multivalent GPU type '{gpu}' in node list: {node_list}",
- )
-
- sorted_vram_values = sorted(list(vram_values))
- total_vram = sorted_vram_values.pop(0) * gpu_count # use the node with the minimum VRAM value
- # if the total VRAM is less than the GPU memory usage, use the VRAM from the GPU in the next larger node
- while total_vram < (gpu_mem_usage / 2**30) and sorted_vram_values:
- total_vram = sorted_vram_values.pop(0) * gpu_count
-
- return total_vram
-
-
-def _adjust_vram_for_multivalent_gpus(
- multivalent: dict,
- allocated_vram: int,
- gpu_mem_usage: int | float,
- gpus_with_exact_values: dict[str, int],
-) -> int:
- """
- Adjust the allocated VRAM for multivalent GPUs to meet or exceed the GPU memory usage.
-
- This function increases the allocated VRAM by adding the minimum VRAM for each multivalent GPU
- until the total allocated VRAM is at least as large as the required GPU memory usage.
-
- Args:
- multivalent (dict): Dictionary of GPU types (str) to counts (int) for multivalent GPUs.
- allocated_vram (int): Current total allocated VRAM in GiB.
- gpu_mem_usage (int | float): GPU memory usage in bytes.
- gpus_with_exact_values (dict[str, int]): Dictionary of GPU types (str) to exact VRAM values (int).
-
- Returns:
- int: Adjusted total allocated VRAM in GiB.
- """
- # Adjust VRAM for GPUs with exact values first
- for gpu, exact_vram in gpus_with_exact_values.items():
- allocated_vram += exact_vram
- multivalent[gpu] -= 1 # Reduce count for GPUs with exact values
-
- # Assume they wanted the bigger VRAM variant for each GPU until the condition is satisfied
- for gpu, gpu_count in multivalent.items():
- while gpu_count > 0 and allocated_vram < (gpu_mem_usage / 2**30):
- allocated_vram += min(MULTIVALENT_GPUS[gpu])
- gpu_count -= 1
-
- return allocated_vram
-
-
-def _get_possible_vram_values(multivalent_gpu_type: str, node_list: list[str]) -> list[int]:
- """
- Return all possible VRAM values for a given multivalent GPU type across a list of nodes.
-
- Args:
- multivalent_gpu_type (str): The GPU type (e.g., "a100", "v100").
- node_list (list[str]): List of node names that the job ran on.
-
- Returns:
- list[int]: List of non-zero VRAM values for the given GPU across nodes.
- """
-
- multivalent_gpu = multivalent_gpu_type.lower()
- possible_vrams = []
- for node in node_list:
- vram = _get_multivalent_vram_based_on_node(multivalent_gpu, node)
- if vram in MULTIVALENT_GPUS[multivalent_gpu]: # keep only values that are known VRAM variants of this GPU
- possible_vrams.append(vram)
- return possible_vrams
-
-
-def _can_calculate_accurately(multivalent_gpu_type: str, count: int, node_list: list[str]) -> bool:
- """
- Determine whether VRAM can be calculated accurately for a multivalent GPU type based on the job's node list.
-
- Args:
- multivalent_gpu_type (str): The GPU type (e.g., "a100", "v100").
- count (int): Number of GPUs of this type.
- node_list (list[str]): List of node names that the job ran on.
-
- Returns:
- bool: True if there is exactly one possible VRAM value for this GPU type across the given nodes,
- or if each node corresponds to a distinct possible VRAM value for this GPU type.
- """
- multivalent_gpu = multivalent_gpu_type.lower()
- possible_vrams = _get_possible_vram_values(multivalent_gpu, node_list)
- return len(possible_vrams) == count or len(set(possible_vrams)) == 1
-
-
-def _calculate_vram_accurately(multivalent_gpu_type: str, count: int, node_list: list[str]) -> int:
- """
- Calculate VRAM for a multivalent GPU type based on the job's node list.
-
- This is possible when all matched nodes share the same VRAM configuration, or when the number of matched VRAM values equals the GPU count.
-
- Args:
- multivalent_gpu_type (str): The GPU type (e.g., "a100", "v100").
- count (int): The number of GPUs of this type used in the job.
- node_list (list[str]): List of node names that the job ran on.
-
- Returns:
- int: Total VRAM in GiB for the given GPU type.
- """
- multivalent_gpu = multivalent_gpu_type.lower()
- possible_vrams = _get_possible_vram_values(multivalent_gpu, node_list)
- # If all possible VRAM values are the same, return that value multiplied by the count
- if len(set(possible_vrams)) == 1:
- return possible_vrams[0] * count
-
- # Otherwise, return the sum of all matching VRAM values
- return sum(possible_vrams)
-
-
-def _calculate_non_multivalent_vram(non_multivalent: dict) -> int:
- """
- Calculate the VRAM allocated for non-multivalent GPUs.
-
- Args:
- non_multivalent (dict): Dictionary with non-multivalent GPU types as keys and their counts as values.
-
- Returns:
- int: Total allocated VRAM for non-multivalent GPUs in GiB.
-
- Raises:
- JobProcessingError: If an unknown GPU type is encountered.
- """
- allocated_vram = 0
- for gpu, count in non_multivalent.items():
- if gpu in VRAM_VALUES:
- allocated_vram += VRAM_VALUES[gpu] * count
- else:
- raise JobProcessingError(PreprocessingErrorTypeEnum.UNKNOWN_GPU_TYPE, gpu)
- return allocated_vram
-
-
-def _calculate_multivalent_vram(multivalent: dict, node_list: list[str], gpu_mem_usage: int) -> int:
- """
- Calculate the VRAM allocated for multivalent GPUs.
-
- Args:
- multivalent (dict): Dictionary with multivalent GPU types as keys and their counts as values.
- node_list (list[str]): List of nodes that the job ran on.
- gpu_mem_usage (int): GPU memory usage in bytes.
-
- Returns:
- int: Total allocated VRAM for multivalent GPUs in GiB.
- """
- allocated_vram = 0
- gpus_with_exact_values: dict[str, int] = dict()
-
- for gpu, count in multivalent.items():
- if _can_calculate_accurately(gpu, count, node_list):
- vram_value = _calculate_vram_accurately(gpu, count, node_list)
- allocated_vram += vram_value
- gpus_with_exact_values[gpu] = vram_value
- else:
- allocated_vram += min(MULTIVALENT_GPUS[gpu]) * count
-
- if allocated_vram < gpu_mem_usage / 2**30 and len(gpus_with_exact_values) < len(multivalent):
- allocated_vram = _adjust_vram_for_multivalent_gpus(
- multivalent, allocated_vram, gpu_mem_usage, gpus_with_exact_values
- )
- return allocated_vram
-
-
-def _calculate_alloc_vram_multiple_gpu_types_with_count(
- gpu_types: dict[str, int], node_list: list[str], gpu_mem_usage: int
-) -> int:
- """
- Calculate allocated VRAM for a job with multiple GPU types given a dictionary.
-
- The dictionary has GPU Types as keys and their respective counts as values.
-
- Args:
- gpu_types (dict[str, int]): Dictionary with GPU types as keys and their counts as values.
- node_list (list[str]): List of nodes that the job ran on.
- gpu_mem_usage (int): GPU memory usage in bytes.
-
- Returns:
- int: Total allocated VRAM for the job in GiB.
- """
- multivalent_gpus = {gpu.lower(): count for gpu, count in gpu_types.items() if gpu.lower() in MULTIVALENT_GPUS}
- non_multivalent_gpus = {
- gpu.lower(): count for gpu, count in gpu_types.items() if gpu.lower() not in MULTIVALENT_GPUS
- }
-
- alloc_vram = 0
- alloc_vram += _calculate_non_multivalent_vram(non_multivalent_gpus)
- alloc_vram += _calculate_multivalent_vram(multivalent_gpus, node_list, gpu_mem_usage)
-
- return alloc_vram
-
-
-def _get_approx_allocated_vram(
- gpu_types: list[str] | dict[str, int], node_list: list[str], gpu_count: int, gpu_mem_usage: int
-) -> int:
- """
- Get the total allocated VRAM for a job based on its GPU type and node list.
-
- This function estimates the total VRAM allocated for a job based on the GPU types used
- and the nodes that the job ran on.
-
- Args:
- gpu_types:
- This could be a list of strings (if using the old database format) or a dictionary with GPU types as keys
- and their counts as values (if using the new database format).
- - list[str]: List containing the types of GPUs used in the job.
- - dict[str, int]: Dictionary with the type of GPUs and the exact count used in the job.
- node_list (list[str]): List of nodes that the job ran on.
- gpu_count (int): Number of GPUs requested by the job.
- gpu_mem_usage (int): GPU memory usage in bytes.
-
- Returns:
- int: Estimated total allocated VRAM for the job in GiB (gibibytes).
-
- Raises:
- JobProcessingError: If an unknown GPU type is encountered or if the GPU types are malformed.
-
- Notes:
- - When `gpu_types` is a dictionary, the function calculates VRAM based on the counts of each GPU type.
- - For multivalent GPUs, the VRAM is determined based on the nodes where the GPUs are located.
- - If the exact number of GPUs is not known, the function uses the minimum VRAM value among the available GPUs.
- """
-
- if isinstance(gpu_types, (list, dict)):
- if not gpu_types:
- return 0
- elif pd.isna(gpu_types):
- return 0
-
- # Case 1: Handle jobs with one type of GPU
- if len(gpu_types) == 1:
- return _calculate_approx_vram_single_gpu_type(gpu_types, node_list, gpu_count, gpu_mem_usage)
-
- # Case 2: Handle jobs with multiple types of GPUs
- # Case 2.1: Handle jobs using the new GPUType format
- if isinstance(gpu_types, dict):
- gpu_types = {gpu.lower(): count for gpu, count in gpu_types.items()}
- total_vram = _calculate_alloc_vram_multiple_gpu_types_with_count(gpu_types, node_list, gpu_mem_usage)
- return total_vram
-
- # Case 2.2: Handle jobs with the old GPUType format (a list)
-
- # Calculate allocated VRAM when there are multiple GPU types in a job
- if len(gpu_types) == gpu_count:
- total_vram = 0
- for gpu in gpu_types:
- gpu = gpu.lower()
- if gpu in MULTIVALENT_GPUS:
- for node in node_list:
- total_vram += _get_multivalent_vram_based_on_node(gpu, node)
- else:
- if gpu in VRAM_VALUES:
- total_vram += VRAM_VALUES[gpu]
- else:
- raise JobProcessingError(PreprocessingErrorTypeEnum.UNKNOWN_GPU_TYPE, gpu)
- return total_vram
-
- # Handle cases where the number of GPUs is different from number of GPUTypes.
- allocated_vrams = set()
- for gpu in gpu_types:
- gpu = gpu.lower()
- if gpu in MULTIVALENT_GPUS:
- for node in node_list:
- multivalent_vram = _get_multivalent_vram_based_on_node(gpu, node)
- if multivalent_vram != 0:
- allocated_vrams.add(multivalent_vram)
- else:
- if gpu in VRAM_VALUES:
- allocated_vrams.add(VRAM_VALUES[gpu])
- else:
- raise JobProcessingError(PreprocessingErrorTypeEnum.UNKNOWN_GPU_TYPE, gpu)
-
- vram_values = sorted(list(allocated_vrams))
- total_vram = vram_values.pop(0) * gpu_count # use the GPU with the minimum VRAM value
- # if the total VRAM is less than the GPU memory usage, move up to the next larger VRAM value
- while total_vram < (gpu_mem_usage / 2**30) and vram_values:
- total_vram = vram_values.pop(0) * gpu_count
- return total_vram
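+# Module-level error tracking used by _safe_apply_function below: rows whose
+# processing raises JobPreprocessingError are recorded here (the row index in
+# error_indices for later removal, the error details in processing_error_logs
+# for summary reporting) instead of aborting the whole preprocessing run.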
+processing_error_logs: list[dict] = []
+error_indices: set[int] = set()
def _validate_gpu_type(
@@ -544,7 +44,7 @@ def _validate_gpu_type(
list | dict | NAType: Processed GPU type value for GPU jobs or pd.NA for CPU-only jobs.
Raises:
- JobProcessingError: If GPU type is null and CPU-only jobs are not allowed.
+ JobPreprocessingError: If GPU type is null and CPU-only jobs are not allowed.
"""
# Handle dict and list types first (these are never NA)
@@ -558,7 +58,7 @@ def _validate_gpu_type(
# Handle missing/empty values (now only NAType remains)
elif pd.isna(gpu_type_value):
if not include_cpu_only_jobs:
- raise JobProcessingError(
+ raise JobPreprocessingError(
error_type=PreprocessingErrorTypeEnum.GPU_TYPE_NULL,
info="GPU Type is null but include_cpu_only_jobs is False",
)
@@ -569,9 +69,9 @@ def _safe_apply_function(
func: Callable, *args: object, job_id: int | None = None, idx: int | None = None
) -> int | NAType:
"""
- Safely apply calculation functions, catching JobProcessingError and logging it.
+ Safely apply calculation functions, catching JobPreprocessingError and logging it.
- This function wraps calculation functions to catch JobProcessingError exceptions
+ This function wraps calculation functions to catch JobPreprocessingError exceptions
that may occur during column or metric processing, logs the error details for
later review, and returns pd.NA instead of allowing the error to propagate.
@@ -585,17 +85,17 @@ def _safe_apply_function(
Returns:
int | NAType: The result of calling func(*args) if successful, or pd.NA if a
- JobProcessingError occurs.
+ JobPreprocessingError occurs.
Note:
- When a JobProcessingError is caught, the function:
+ When a JobPreprocessingError is caught, the function:
- Adds the index to error_indices for later row removal
- Logs error details to processing_error_logs for summary reporting
- Returns pd.NA to maintain DataFrame structure
"""
try:
return func(*args)
- except JobProcessingError as e:
+ except JobPreprocessingError as e:
if idx is not None:
error_indices.add(idx)
processing_error_logs.append({
@@ -626,6 +126,7 @@ def _fill_missing(res: pd.DataFrame, include_cpu_only_jobs: bool) -> None:
"Interactive": lambda col: col.fillna("non-interactive"),
"Constraints": lambda col: col.fillna("").apply(lambda x: [] if isinstance(x, str) and x == "" else list(x)),
"GPUs": lambda col: col.fillna(0),
+ "NodeList": lambda col: col.fillna("").apply(lambda x: [] if isinstance(x, str) and x == "" else list(x)),
}
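+ # Like "Constraints", the new "NodeList" rule normalizes missing values to empty
+ # lists (pd.NA -> []) and materializes stored sequences as lists (e.g. a tuple
+ # ("gpu001", "gpu002") -> ["gpu001", "gpu002"]), so downstream code can always
+ # iterate over the column.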
res.loc[:, "GPUType"] = res.apply(
diff --git a/src/visualization/columns.py b/src/visualization/columns.py
index 48f3e54..85186b9 100644
--- a/src/visualization/columns.py
+++ b/src/visualization/columns.py
@@ -113,7 +113,7 @@ def pct(n: int, total_count: int = total_count) -> str:
return f"{(n / total_count * 100):.1f}%" if total_count > 0 else "0.0%"
# Helper to reduce xticks and rotate if width is small
- def choose_ticks_and_rotation(ticks: list[int], width: float, max_labels: int = 3) -> tuple[list[int], int]:
+ def choose_ticks_and_rotation(ticks: list[Any], width: float, max_labels: int = 3) -> tuple[list[int], int]:
# Always include first and last tick, and always have at least 4 ticks
if width < min_width and len(ticks) > max_labels:
# Always include first and last, and evenly space the rest
@@ -376,7 +376,7 @@ def _generate_interactive_pie_chart(
# Define threshold for small slices
threshold_pct = 5
total = counts.sum()
- pct_values = counts.div(total).multiply(100)
+ pct_values = counts.div(total) * 100
# Explode small slices to separate them visually
explode = [max(0.15 - p / 100 * 4, 0.1) if p < threshold_pct else 0 for p in pct_values]
@@ -463,7 +463,7 @@ def _generate_status_pie_chart(self, jobs_df: pd.DataFrame, col: str, output_dir
# Prepare labels and explode small slices
threshold_pct = 5
- pct_values = exit_code_counts.div(total_count).multiply(100)
+ pct_values = exit_code_counts.div(total_count) * 100
explode = [max(0.15 - p / 100 * 4, 0.1) if p < threshold_pct else 0 for p in pct_values]
# Prepare labels: only show label on pie if above threshold
@@ -632,7 +632,7 @@ def _generate_qos_pie_chart(self, jobs_df: pd.DataFrame, col: str, output_dir_pa
# Prepare labels and explode small slices
threshold_pct = 5
- pct_values = qos_counts.div(total_count).multiply(100)
+ pct_values = qos_counts.div(total_count) * 100
explode = [max(0.15 - p / 100 * 4, 0.1) if p < threshold_pct else 0 for p in pct_values]
# Prepare labels: only show label on pie if above threshold
@@ -726,7 +726,7 @@ def _generate_gpu_count_pie_chart(
raise ValueError(f"No valid GPU counts in {col}. Skipping visualization.")
threshold_pct = 5
- pct_values = gpu_counts.div(total_count).multiply(100)
+ pct_values = gpu_counts.div(total_count) * 100
# Explode values increase with index: first group (0), second (small), ..., last (largest)
explode = [i * 0.04 for i in range(len(gpu_counts))]
@@ -1307,7 +1307,7 @@ def _generate_exit_code_pie_chart(
# Prepare labels and explode small slices
threshold_pct = 5
- pct_values = exit_code_counts.div(total_count).multiply(100)
+ pct_values = exit_code_counts.div(total_count) * 100
explode = [max(0.15 - p / 100 * 4, 0.2) if p < threshold_pct else 0 for p in pct_values]
# Prepare labels: only show label on pie if above threshold
diff --git a/src/warnings/__init__.py b/src/warnings/__init__.py
new file mode 100644
index 0000000..8e570ef
--- /dev/null
+++ b/src/warnings/__init__.py
@@ -0,0 +1 @@
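+# The redundant "as" alias below is deliberate: it marks NodeNotFoundWarning as a
+# public re-export for type checkers such as mypy (--no-implicit-reexport) and pyright.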
+from .analysis import NodeNotFoundWarning as NodeNotFoundWarning
diff --git a/src/warnings/analysis.py b/src/warnings/analysis.py
new file mode 100644
index 0000000..8dc31cc
--- /dev/null
+++ b/src/warnings/analysis.py
@@ -0,0 +1,9 @@
+class NodeNotFoundWarning(UserWarning):
+ """Warning raised when a node is not found in the provided configuration file."""
+
+ def __init__(self, node_name: str, message: str) -> None:
+ """
+ Initialize the warning with the missing node's name and message.
+ """
+ super().__init__(message)
+ self.node_name = node_name
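+
+
+# Illustrative usage (a sketch, not part of this module): callers emit the warning
+# through the standard warnings machinery, e.g.
+#
+#     import warnings
+#     warnings.warn(NodeNotFoundWarning("gpu099", "Node 'gpu099' not found"))
+#
+# which is the behavior tests/config/test_remote_config.py asserts via pytest.warns().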
diff --git a/tests/config/test_enum_meta.py b/tests/config/test_enum_meta.py
new file mode 100644
index 0000000..4fde431
--- /dev/null
+++ b/tests/config/test_enum_meta.py
@@ -0,0 +1,48 @@
+import pytest
+
+from src.config.enum_constants import (
+ MetricsDataFrameNameBase,
+ MetricsDataFrameNameEnum,
+ ResourceHoardingDataFrameNameEnum,
+)
+
+
+def test_valid_subclass_creation_passes() -> None:
+ class ValidMetricsEnum(MetricsDataFrameNameBase):
+ JOBS = "jobs_with_efficiency_metrics"
+ USERS = "users_with_efficiency_metrics"
+ PI_GROUPS = "pi_accounts_with_efficiency_metrics"
+
+ # Sanity checks
+ assert ValidMetricsEnum.JOBS.value == "jobs_with_efficiency_metrics"
+ assert ValidMetricsEnum.USERS.value == "users_with_efficiency_metrics"
+ assert ValidMetricsEnum.PI_GROUPS.value == "pi_accounts_with_efficiency_metrics"
+
+
+def test_missing_required_members_raises() -> None:
+ with pytest.raises(TypeError, match="must define members"):
+
+ class MissingMembersEnum(MetricsDataFrameNameBase):
+ USERS = "users_with_efficiency_metrics"
+ PI_GROUPS = "pi_accounts_with_efficiency_metrics"
+
+
+def test_wrong_value_for_required_member_raises() -> None:
+ with pytest.raises(TypeError, match=r"JOBS must equal 'jobs_with_efficiency_metrics'"):
+
+ class WrongValueEnum(MetricsDataFrameNameBase):
+ JOBS = "wrong_value"
+ USERS = "users_with_efficiency_metrics"
+ PI_GROUPS = "pi_accounts_with_efficiency_metrics"
+
+
+def test_existing_enums_satisfy_contract() -> None:
+ # The canonical enum should satisfy the metaclass contract exactly
+ assert MetricsDataFrameNameEnum.JOBS.value == "jobs_with_efficiency_metrics"
+ assert MetricsDataFrameNameEnum.USERS.value == "users_with_efficiency_metrics"
+ assert MetricsDataFrameNameEnum.PI_GROUPS.value == "pi_accounts_with_efficiency_metrics"
+
+ # The ResourceHoarding enum reuses canonical values and should also satisfy the contract
+ assert ResourceHoardingDataFrameNameEnum.JOBS.value == "jobs_with_efficiency_metrics"
+ assert ResourceHoardingDataFrameNameEnum.USERS.value == "users_with_efficiency_metrics"
+ assert ResourceHoardingDataFrameNameEnum.PI_GROUPS.value == "pi_accounts_with_efficiency_metrics"
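+
+
+# For reference, the "must define members ... must equal ..." contract exercised
+# above could be enforced by an enum.EnumMeta subclass roughly like this sketch
+# (the real implementation lives in src.config.enum_constants and may differ):
+#
+#     import enum
+#
+#     class _RequiredMembersMeta(enum.EnumMeta):
+#         REQUIRED = {
+#             "JOBS": "jobs_with_efficiency_metrics",
+#             "USERS": "users_with_efficiency_metrics",
+#             "PI_GROUPS": "pi_accounts_with_efficiency_metrics",
+#         }
+#
+#         def __new__(mcls, name, bases, classdict, **kwargs):
+#             cls = super().__new__(mcls, name, bases, classdict, **kwargs)
+#             if cls.__members__:  # skip the (empty) abstract base itself
+#                 for member, value in mcls.REQUIRED.items():
+#                     if member not in cls.__members__:
+#                         raise TypeError(f"{name} must define members: {member}")
+#                     if cls.__members__[member].value != value:
+#                         raise TypeError(f"{name}.{member} must equal '{value}'")
+#             return cls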
diff --git a/tests/config/test_remote_config.py b/tests/config/test_remote_config.py
new file mode 100644
index 0000000..dcb9507
--- /dev/null
+++ b/tests/config/test_remote_config.py
@@ -0,0 +1,134 @@
+import pytest
+from src.config.remote_config import NodeInfoFetcher
+from src.config.enum_constants import NodeInfoKeyEnum
+from src.warnings import NodeNotFoundWarning
+from pathlib import Path
+import json
+import tempfile
+import shutil
+
+
+class DummyNodeInfoFetcher(NodeInfoFetcher):
+ """Dummy fetcher for testing NodeInfoFetcher with static mock data."""
+
+ @property
+ def local_path(self) -> Path:
+ """Return path to static mock node info file."""
+ return Path("tests/mock_data/mock_remote_configs/node_info.json")
+
+
+@pytest.fixture
+def dummy_node_info_fetcher() -> DummyNodeInfoFetcher:
+ return DummyNodeInfoFetcher()
+
+
+def test_get_node_memory_valid(dummy_node_info_fetcher: DummyNodeInfoFetcher) -> None:
+ node_info = dummy_node_info_fetcher.get_info()
+ found = False
+ for batch in node_info:
+ nodes = batch.get(NodeInfoKeyEnum.NODES.value)
+ ram = batch.get(NodeInfoKeyEnum.RAM.value)
+ if nodes and ram is not None:
+ node_name = nodes[0]
+ node_memory = NodeInfoFetcher.get_node_memory(node_name, node_info)
+ assert node_memory == ram, (
+ f"Memory for node '{node_name}' with a value of {node_memory} does not match expected value of {ram}."
+ )
+ found = True
+ break
+ assert found, f"No valid node with '{NodeInfoKeyEnum.RAM.value}' memory found in mock data."
+
+
+def test_get_node_memory_node_not_found(dummy_node_info_fetcher: DummyNodeInfoFetcher) -> None:
+ node_info = dummy_node_info_fetcher.get_info()
+ all_nodes: set[str] = set()
+ for batch in node_info:
+ nodes = batch.get(NodeInfoKeyEnum.NODES.value)
+ if nodes:
+ all_nodes.update(nodes)
+ missing_node = "definitely_not_a_real_node"
+ while missing_node in all_nodes:
+ missing_node += "_x"
+ with pytest.warns(NodeNotFoundWarning, match=f"Node '{missing_node}' not found"):
+ NodeInfoFetcher.get_node_memory(missing_node, node_info)
+
+
+def test_get_node_memory_missing_ram(dummy_node_info_fetcher: DummyNodeInfoFetcher) -> None:
+ node_info = dummy_node_info_fetcher.get_info()
+ for batch in node_info:
+ nodes = batch.get(NodeInfoKeyEnum.NODES.value)
+ if nodes and NodeInfoKeyEnum.RAM.value not in batch:
+ node_name = nodes[0]
+ with pytest.raises(
+ ValueError, match="Each node info dictionary must contain the keys listed in NodeInfoKeyEnum."
+ ):
+ NodeInfoFetcher.get_node_memory(node_name, node_info)
+ break
+
+
+def test_get_node_info_values_return_one_key(dummy_node_info_fetcher: DummyNodeInfoFetcher) -> None:
+ node_info = dummy_node_info_fetcher.get_info()
+ found = False
+ for batch in node_info:
+ nodes = batch.get(NodeInfoKeyEnum.NODES.value)
+ if nodes:
+ node_name = nodes[0]
+ values = NodeInfoFetcher.get_node_info_values(node_name, node_info, members={NodeInfoKeyEnum.RAM})
+ assert NodeInfoKeyEnum.RAM in values, f"RAM value not found for node '{node_name}'."
+ found = True
+ break
+ assert found, "No valid node with RAM information found in mock data."
+ assert values[NodeInfoKeyEnum.RAM] == batch[NodeInfoKeyEnum.RAM.value], (
+ f"Unexpected RAM value for node '{node_name}': {values[NodeInfoKeyEnum.RAM]}"
+ )
+
+
+def test_get_node_info_values_return_multiple_keys(dummy_node_info_fetcher: DummyNodeInfoFetcher) -> None:
+ node_info = dummy_node_info_fetcher.get_info()
+ found = False
+ for batch in node_info:
+ nodes = batch.get(NodeInfoKeyEnum.NODES.value)
+ if nodes:
+ node_name = nodes[0]
+ values = NodeInfoFetcher.get_node_info_values(
+ node_name, node_info, members={NodeInfoKeyEnum.RAM, NodeInfoKeyEnum.GPU_COUNT}
+ )
+ assert NodeInfoKeyEnum.RAM in values, f"RAM value not found for node '{node_name}'."
+ assert NodeInfoKeyEnum.GPU_COUNT in values, f"GPU_COUNT value not found for node '{node_name}'."
+ found = True
+ break
+ assert found, "No valid node with RAM and GPU_COUNT information found in mock data."
+ assert values[NodeInfoKeyEnum.RAM] == batch[NodeInfoKeyEnum.RAM.value], (
+ f"Unexpected RAM value for node '{node_name}': {values[NodeInfoKeyEnum.RAM]}"
+ )
+ assert values[NodeInfoKeyEnum.GPU_COUNT] == batch[NodeInfoKeyEnum.GPU_COUNT.value], (
+ f"Unexpected GPU_COUNT value for node '{node_name}': {values[NodeInfoKeyEnum.GPU_COUNT]}"
+ )
+
+
+@pytest.mark.parametrize("missing_key", [e.value for e in NodeInfoKeyEnum])
+def test_get_node_memory_missing_any_key(
+ dummy_node_info_fetcher: DummyNodeInfoFetcher,
+ missing_key: str,
+ monkeypatch: pytest.MonkeyPatch,
+) -> None:
+ node_info = dummy_node_info_fetcher.get_info()
+ temp_dir = tempfile.mkdtemp()
+ temp_path = f"{temp_dir}/node_info_missing_key.json"
+ for i, batch in enumerate(node_info):
+ nodes = batch.get(NodeInfoKeyEnum.NODES.value)
+ if nodes and missing_key in batch:
+ node_info[i].pop(missing_key)
+ node_name = nodes[0]
+ with open(temp_path, "w") as tf:
+ json.dump(node_info, tf)
+ monkeypatch.setattr(type(dummy_node_info_fetcher), "local_path", property(lambda self: Path(temp_path)))
+ with pytest.raises(
+ ValueError, match="Each node info dictionary must contain the keys listed in NodeInfoKeyEnum."
+ ):
+ NodeInfoFetcher.get_node_memory(node_name, node_info, offline=True)
+ temp_path_obj = Path(temp_path)
+ if temp_path_obj.exists():
+ temp_path_obj.unlink()
+ break
+ shutil.rmtree(temp_dir)
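+
+
+# A sketch of an equivalent setup using pytest's built-in tmp_path fixture, which
+# removes the manual tempfile/shutil bookkeeping (hypothetical, not used above):
+#
+#     def test_missing_key_tmp_path(dummy_node_info_fetcher, missing_key, monkeypatch, tmp_path):
+#         node_info = dummy_node_info_fetcher.get_info()
+#         ...  # drop missing_key from the relevant batch, as above
+#         target = tmp_path / "node_info_missing_key.json"
+#         target.write_text(json.dumps(node_info))
+#         monkeypatch.setattr(type(dummy_node_info_fetcher), "local_path", property(lambda self: target))
+#
+# tmp_path is unique per test and cleaned up by pytest automatically.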
diff --git a/tests/test_database_connection.py b/tests/database/test_database_connection.py
similarity index 100%
rename from tests/test_database_connection.py
rename to tests/database/test_database_connection.py
diff --git a/tests/mock_data/mock_remote_configs/node_info.json b/tests/mock_data/mock_remote_configs/node_info.json
new file mode 100644
index 0000000..082ffca
--- /dev/null
+++ b/tests/mock_data/mock_remote_configs/node_info.json
@@ -0,0 +1,3498 @@
+[
+ {
+ "nodes_folded": "arm-gpu[001-002]",
+ "count": 2,
+ "nodes": [
+ "arm-gpu001",
+ "arm-gpu002"
+ ],
+ "node_group": "arm-gpu",
+ "partitions": [
+ "arm-gpu"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "aarch64",
+ "arm64",
+ "bf16",
+ "fp64",
+ "gh200",
+ "gracehopper",
+ "ib",
+ "localscratch",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "sm_75",
+ "sm_80",
+ "sm_86",
+ "sm_87",
+ "sm_89",
+ "sm_90",
+ "vram102",
+ "vram11",
+ "vram12",
+ "vram16",
+ "vram23",
+ "vram32",
+ "vram40",
+ "vram48",
+ "vram8",
+ "vram80"
+ ],
+ "cores": 72,
+ "ram": 560,
+ "cpus": "1x NVIDIA Grace Hopper Superchip (72 core)",
+ "general_use": true,
+ "gpu_model": "NVIDIA Grace Hopper Superchip",
+ "gpu_count": 1,
+ "vram": 80,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "astroth-cpu[001-008]",
+ "count": 8,
+ "nodes": [
+ "astroth-cpu001",
+ "astroth-cpu002",
+ "astroth-cpu003",
+ "astroth-cpu004",
+ "astroth-cpu005",
+ "astroth-cpu006",
+ "astroth-cpu007",
+ "astroth-cpu008"
+ ],
+ "node_group": "astroth-cpu",
+ "partitions": [
+ "astroth-cpu"
+ ],
+ "features": [
+ "10gbps",
+ "avx512",
+ "cascadelake",
+ "intel",
+ "intel4215r",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 16,
+ "ram": 180,
+ "cpus": "2x Intel(R) Xeon(R) Silver 4215R (16 core)",
+ "general_use": false,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": true,
+ "owner": "Martin Weinberg",
+ "owner_email": "weinberg@astro.umass.edu"
+ },
+ {
+ "nodes_folded": "astroth-gpu[001-003]",
+ "count": 3,
+ "nodes": [
+ "astroth-gpu001",
+ "astroth-gpu002",
+ "astroth-gpu003"
+ ],
+ "node_group": "astroth-gpu",
+ "partitions": [
+ "astroth-gpu"
+ ],
+ "features": [
+ "2080",
+ "amd",
+ "amd1900x",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "sm_75",
+ "vram8",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "zen"
+ ],
+ "cores": 8,
+ "ram": 30,
+ "cpus": "1x AMD Ryzen Threadripper 1900X 8-Core Processor (8 core)",
+ "general_use": false,
+ "gpu_model": "NVIDIA RTX 2080",
+ "gpu_count": 2,
+ "vram": 8,
+ "open_hardware": true,
+ "owner": "Martin Weinberg",
+ "owner_email": "weinberg@astro.umass.edu"
+ },
+ {
+ "nodes_folded": "ceewater-cpu[001-007]",
+ "count": 7,
+ "nodes": [
+ "ceewater-cpu001",
+ "ceewater-cpu002",
+ "ceewater-cpu003",
+ "ceewater-cpu004",
+ "ceewater-cpu005",
+ "ceewater-cpu006",
+ "ceewater-cpu007"
+ ],
+ "node_group": "ceewater-cpu",
+ "partitions": [
+ "ceewater_casey-cpu",
+ "ceewater_cjgleason-cpu",
+ "ceewater_kandread-cpu",
+ "cpu-preempt"
+ ],
+ "features": [
+ "10gbps",
+ "amd",
+ "amd7402",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "zen2"
+ ],
+ "cores": 24,
+ "ram": 120,
+ "cpus": "1x AMD EPYC 7402 24-Core Processor (24 core)",
+ "general_use": false,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": true,
+ "owner": "CEEWater",
+ "owner_email": "cjgleason@umass.edu"
+ },
+ {
+ "nodes_folded": "ceewater-cpu[008-010]",
+ "count": 3,
+ "nodes": [
+ "ceewater-cpu008",
+ "ceewater-cpu009",
+ "ceewater-cpu010"
+ ],
+ "node_group": "ceewater-cpu",
+ "partitions": [
+ "ceewater_casey-cpu",
+ "ceewater_cjgleason-cpu",
+ "ceewater_kandread-cpu",
+ "cpu-preempt"
+ ],
+ "features": [
+ "10gbps",
+ "amd",
+ "amd7702",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "zen2"
+ ],
+ "cores": 128,
+ "ram": 500,
+ "cpus": "2x AMD EPYC 7702 64-Core Processor (128 core)",
+ "general_use": false,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": true,
+ "owner": "CEEWater",
+ "owner_email": "cjgleason@umass.edu"
+ },
+ {
+ "nodes_folded": "cpu001",
+ "count": 1,
+ "nodes": [
+ "cpu001"
+ ],
+ "node_group": "cpu",
+ "partitions": [
+ "cpu"
+ ],
+ "features": [
+ "10gbps",
+ "avx512",
+ "intel",
+ "intel6126",
+ "skylake_avx512",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 24,
+ "ram": 180,
+ "cpus": "2x Intel(R) Xeon(R) Gold 6126 (24 core)",
+ "general_use": true,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "cpu[002-004]",
+ "count": 3,
+ "nodes": [
+ "cpu002",
+ "cpu003",
+ "cpu004"
+ ],
+ "node_group": "cpu",
+ "partitions": [
+ "cpu"
+ ],
+ "features": [
+ "10gbps",
+ "avx512",
+ "intel",
+ "intel6126",
+ "skylake_avx512",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 24,
+ "ram": 370,
+ "cpus": "2x Intel(R) Xeon(R) Gold 6126 (24 core)",
+ "general_use": true,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "cpu[005-008]",
+ "count": 4,
+ "nodes": [
+ "cpu005",
+ "cpu006",
+ "cpu007",
+ "cpu008"
+ ],
+ "node_group": "cpu",
+ "partitions": [
+ "cpu"
+ ],
+ "features": [
+ "10gbps",
+ "avx512",
+ "intel",
+ "intel5118",
+ "skylake_avx512",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 24,
+ "ram": 180,
+ "cpus": "2x Intel(R) Xeon(R) Gold 5118 (24 core)",
+ "general_use": true,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "cpu[009-021]",
+ "count": 13,
+ "nodes": [
+ "cpu009",
+ "cpu010",
+ "cpu011",
+ "cpu012",
+ "cpu013",
+ "cpu014",
+ "cpu015",
+ "cpu016",
+ "cpu017",
+ "cpu018",
+ "cpu019",
+ "cpu020",
+ "cpu021"
+ ],
+ "node_group": "cpu",
+ "partitions": [
+ "cpu"
+ ],
+ "features": [
+ "10gbps",
+ "avx512",
+ "intel",
+ "intel6148",
+ "skylake_avx512",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 40,
+ "ram": 180,
+ "cpus": "2x Intel(R) Xeon(R) Gold 6148 (40 core)",
+ "general_use": true,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "cpu[022-029]",
+ "count": 8,
+ "nodes": [
+ "cpu022",
+ "cpu023",
+ "cpu024",
+ "cpu025",
+ "cpu026",
+ "cpu027",
+ "cpu028",
+ "cpu029"
+ ],
+ "node_group": "cpu",
+ "partitions": [
+ "cpu"
+ ],
+ "features": [
+ "10gbps",
+ "amd",
+ "amd7763",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "zen3"
+ ],
+ "cores": 128,
+ "ram": 1000,
+ "cpus": "2x AMD EPYC 7763 64-Core Processor (128 core)",
+ "general_use": true,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "cpu030",
+ "count": 1,
+ "nodes": [
+ "cpu030"
+ ],
+ "node_group": "cpu",
+ "partitions": [
+ "astroth-cpu"
+ ],
+ "features": [
+ "10gbps",
+ "amd",
+ "amd7763",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "zen3"
+ ],
+ "cores": 128,
+ "ram": 1000,
+ "cpus": "2x AMD EPYC 7763 64-Core Processor (128 core)",
+ "general_use": false,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": true,
+ "owner": "Martin Weinberg",
+ "owner_email": "weinberg@astro.umass.edu"
+ },
+ {
+ "nodes_folded": "cpu[031-044]",
+ "count": 14,
+ "nodes": [
+ "cpu031",
+ "cpu032",
+ "cpu033",
+ "cpu034",
+ "cpu035",
+ "cpu036",
+ "cpu037",
+ "cpu038",
+ "cpu039",
+ "cpu040",
+ "cpu041",
+ "cpu042",
+ "cpu043",
+ "cpu044"
+ ],
+ "node_group": "cpu",
+ "partitions": [
+ "cpu"
+ ],
+ "features": [
+ "10gbps",
+ "amd",
+ "amd7543",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "zen3"
+ ],
+ "cores": 64,
+ "ram": 500,
+ "cpus": "2x AMD EPYC 7543 32-Core Processor (64 core)",
+ "general_use": true,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "cpu[045-048]",
+ "count": 4,
+ "nodes": [
+ "cpu045",
+ "cpu046",
+ "cpu047",
+ "cpu048"
+ ],
+ "node_group": "cpu",
+ "partitions": [
+ "cpu"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "amd",
+ "amd7502",
+ "ib",
+ "localscratch",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "zen2"
+ ],
+ "cores": 64,
+ "ram": 500,
+ "cpus": "2x AMD EPYC 7502 32-Core Processor (64 core)",
+ "general_use": true,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "cpu[049-068]",
+ "count": 20,
+ "nodes": [
+ "cpu049",
+ "cpu050",
+ "cpu051",
+ "cpu052",
+ "cpu053",
+ "cpu054",
+ "cpu055",
+ "cpu056",
+ "cpu057",
+ "cpu058",
+ "cpu059",
+ "cpu060",
+ "cpu061",
+ "cpu062",
+ "cpu063",
+ "cpu064",
+ "cpu065",
+ "cpu066",
+ "cpu067",
+ "cpu068"
+ ],
+ "node_group": "cpu",
+ "partitions": [
+ "cpu"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "amd",
+ "amd7763",
+ "ib",
+ "localscratch",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "zen3"
+ ],
+ "cores": 128,
+ "ram": 1000,
+ "cpus": "2x AMD EPYC 7763 64-Core Processor (128 core)",
+ "general_use": true,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "cpu[069-078]",
+ "count": 10,
+ "nodes": [
+ "cpu069",
+ "cpu070",
+ "cpu071",
+ "cpu072",
+ "cpu073",
+ "cpu074",
+ "cpu075",
+ "cpu076",
+ "cpu077",
+ "cpu078"
+ ],
+ "node_group": "cpu",
+ "partitions": [
+ "cpu"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "amd",
+ "amd9654",
+ "avx512",
+ "ib",
+ "localscratch",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4",
+ "zen4"
+ ],
+ "cores": 192,
+ "ram": 1510,
+ "cpus": "2x AMD EPYC 9654 96-Core Processor (192 core)",
+ "general_use": true,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "cpu079",
+ "count": 1,
+ "nodes": [
+ "cpu079"
+ ],
+ "node_group": "cpu",
+ "partitions": [
+ "cpu-preempt",
+ "fsi-lab"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "amd",
+ "amd9654",
+ "avx512",
+ "ib",
+ "localscratch",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4",
+ "zen4"
+ ],
+ "cores": 192,
+ "ram": 1510,
+ "cpus": "2x AMD EPYC 9654 96-Core Processor (192 core)",
+ "general_use": false,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "cpu[080-081]",
+ "count": 2,
+ "nodes": [
+ "cpu080",
+ "cpu081"
+ ],
+ "node_group": "cpu",
+ "partitions": [
+ "cbio-cpu",
+ "cpu-preempt"
+ ],
+ "features": [
+ "10gbps",
+ "amd",
+ "amd7h12",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "zen2"
+ ],
+ "cores": 128,
+ "ram": 500,
+ "cpus": "2x AMD EPYC 7H12 64-Core Processor (128 core)",
+ "general_use": false,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "cpu[082-084]",
+ "count": 3,
+ "nodes": [
+ "cpu082",
+ "cpu083",
+ "cpu084"
+ ],
+ "node_group": "cpu",
+ "partitions": [
+ "cbio-cpu",
+ "cpu-preempt"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "amd",
+ "amd7h12",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "zen2"
+ ],
+ "cores": 128,
+ "ram": 500,
+ "cpus": "2x AMD EPYC 7H12 64-Core Processor (128 core)",
+ "general_use": false,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "cpu085",
+ "count": 1,
+ "nodes": [
+ "cpu085"
+ ],
+ "node_group": "cpu",
+ "partitions": [
+ "cpu-preempt",
+ "jdelhommelle"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "amd",
+ "amd9734",
+ "avx512",
+ "localscratch",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4",
+ "zen4"
+ ],
+ "cores": 224,
+ "ram": 1510,
+ "cpus": "2x AMD EPYC 9734 112-Core Processor (224 core)",
+ "general_use": false,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "ece-gpu[001-002]",
+ "count": 2,
+ "nodes": [
+ "ece-gpu001",
+ "ece-gpu002"
+ ],
+ "node_group": "ece-gpu",
+ "partitions": [
+ "ece-gpu",
+ "gpu-preempt"
+ ],
+ "features": [
+ "10gbps",
+ "a100",
+ "a100-40g",
+ "avx512",
+ "bf16",
+ "cascadelake",
+ "fp64",
+ "intel",
+ "intel6226r",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "sm_75",
+ "sm_80",
+ "vram11",
+ "vram12",
+ "vram16",
+ "vram23",
+ "vram32",
+ "vram40",
+ "vram8",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 32,
+ "ram": 370,
+ "cpus": "2x Intel(R) Xeon(R) Gold 6226R (32 core)",
+ "general_use": false,
+ "gpu_model": "NVIDIA Tesla A100",
+ "gpu_count": 4,
+ "vram": 40,
+ "open_hardware": true,
+ "owner": "Mario Parente",
+ "owner_email": "mparente@ecs.umass.edu"
+ },
+ {
+ "nodes_folded": "gaoseismolab-cpu[001-005]",
+ "count": 5,
+ "nodes": [
+ "gaoseismolab-cpu001",
+ "gaoseismolab-cpu002",
+ "gaoseismolab-cpu003",
+ "gaoseismolab-cpu004",
+ "gaoseismolab-cpu005"
+ ],
+ "node_group": "gaoseismolab-cpu",
+ "partitions": [
+ "cpu-preempt",
+ "gaoseismolab-cpu"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "avx512",
+ "icelake",
+ "intel",
+ "intel8358",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 64,
+ "ram": 500,
+ "cpus": "2x Intel(R) Xeon(R) Platinum 8358 (64 core)",
+ "general_use": false,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": true,
+ "owner": "Haiying Gao",
+ "owner_email": "haiyinggao@umass.edu"
+ },
+ {
+ "nodes_folded": "gpu[001-002]",
+ "count": 2,
+ "nodes": [
+ "gpu001",
+ "gpu002"
+ ],
+ "node_group": "gpu",
+ "partitions": [
+ "gpu"
+ ],
+ "features": [
+ "10gbps",
+ "avx512",
+ "fp64",
+ "intel",
+ "intel4110",
+ "skylake_avx512",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "v100",
+ "v100-16g",
+ "vram11",
+ "vram12",
+ "vram16",
+ "vram8",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 16,
+ "ram": 180,
+ "cpus": "2x Intel(R) Xeon(R) Silver 4110 (16 core)",
+ "general_use": true,
+ "gpu_model": "NVIDIA Tesla V100",
+ "gpu_count": 2,
+ "vram": 16,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "gpu[003-004]",
+ "count": 2,
+ "nodes": [
+ "gpu003",
+ "gpu004"
+ ],
+ "node_group": "gpu",
+ "partitions": [
+ "gpu"
+ ],
+ "features": [
+ "10gbps",
+ "avx512",
+ "fp64",
+ "intel",
+ "intel6140",
+ "skylake_avx512",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "v100",
+ "v100-16g",
+ "vram11",
+ "vram12",
+ "vram16",
+ "vram8",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 36,
+ "ram": 180,
+ "cpus": "2x Intel(R) Xeon(R) Gold 6140 (36 core)",
+ "general_use": true,
+ "gpu_model": "NVIDIA Tesla V100",
+ "gpu_count": 2,
+ "vram": 16,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "gpu[005-007]",
+ "count": 3,
+ "nodes": [
+ "gpu005",
+ "gpu006",
+ "gpu007"
+ ],
+ "node_group": "gpu",
+ "partitions": [
+ "gpu"
+ ],
+ "features": [
+ "10gbps",
+ "avx512",
+ "fp64",
+ "ib",
+ "intel",
+ "intel6130",
+ "skylake_avx512",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "v100",
+ "v100-16g",
+ "vram11",
+ "vram12",
+ "vram16",
+ "vram8",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 32,
+ "ram": 180,
+ "cpus": "2x Intel(R) Xeon(R) Gold 6130 (32 core)",
+ "general_use": true,
+ "gpu_model": "NVIDIA Tesla V100",
+ "gpu_count": 3,
+ "vram": 16,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "gpu[008-009]",
+ "count": 2,
+ "nodes": [
+ "gpu008",
+ "gpu009"
+ ],
+ "node_group": "gpu",
+ "partitions": [
+ "gpu"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "a40",
+ "avx512",
+ "bf16",
+ "ib",
+ "icelake",
+ "intel",
+ "intel6326",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "sm_75",
+ "sm_80",
+ "sm_86",
+ "vram11",
+ "vram12",
+ "vram16",
+ "vram23",
+ "vram32",
+ "vram40",
+ "vram48",
+ "vram8",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 32,
+ "ram": 500,
+ "cpus": "2x Intel(R) Xeon(R) Gold 6326 (32 core)",
+ "general_use": true,
+ "gpu_model": "NVIDIA A40",
+ "gpu_count": 4,
+ "vram": 48,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "gpu010",
+ "count": 1,
+ "nodes": [
+ "gpu010"
+ ],
+ "node_group": "gpu",
+ "partitions": [
+ "gpu-preempt",
+ "lan"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "a40",
+ "avx512",
+ "bf16",
+ "ib",
+ "icelake",
+ "intel",
+ "intel6326",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "sm_75",
+ "sm_80",
+ "sm_86",
+ "vram11",
+ "vram12",
+ "vram16",
+ "vram23",
+ "vram32",
+ "vram40",
+ "vram48",
+ "vram8",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 32,
+ "ram": 500,
+ "cpus": "2x Intel(R) Xeon(R) Gold 6326 (32 core)",
+ "general_use": false,
+ "gpu_model": "NVIDIA A40",
+ "gpu_count": 4,
+ "vram": 48,
+ "open_hardware": true,
+ "owner": "Andrew Lan",
+ "owner_email": "andrewlan@umass.edu"
+ },
+ {
+ "nodes_folded": "gpu[011-012]",
+ "count": 2,
+ "nodes": [
+ "gpu011",
+ "gpu012"
+ ],
+ "node_group": "gpu",
+ "partitions": [
+ "gpu"
+ ],
+ "features": [
+ "10gbps",
+ "avx512",
+ "cascadelake",
+ "fp64",
+ "intel",
+ "intel6240",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "v100",
+ "v100-32g",
+ "vram11",
+ "vram12",
+ "vram16",
+ "vram23",
+ "vram32",
+ "vram8",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 36,
+ "ram": 370,
+ "cpus": "2x Intel(R) Xeon(R) Gold 6240 Processor (36 core)",
+ "general_use": true,
+ "gpu_model": "NVIDIA Tesla V100",
+ "gpu_count": 4,
+ "vram": 32,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "gpu[013-021]",
+ "count": 9,
+ "nodes": [
+ "gpu013",
+ "gpu014",
+ "gpu015",
+ "gpu016",
+ "gpu017",
+ "gpu018",
+ "gpu019",
+ "gpu020",
+ "gpu021"
+ ],
+ "node_group": "gpu",
+ "partitions": [
+ "gpu-preempt",
+ "superpod-a100"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "a100",
+ "a100-80g",
+ "avx512",
+ "bf16",
+ "fp64",
+ "ib",
+ "intel",
+ "intel8480",
+ "localscratch",
+ "sapphirerapids",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "sm_75",
+ "sm_80",
+ "vram11",
+ "vram12",
+ "vram16",
+ "vram23",
+ "vram32",
+ "vram40",
+ "vram48",
+ "vram8",
+ "vram80",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 112,
+ "ram": 2010,
+ "cpus": "2x Intel(R) Xeon(R) Platinum 8480+ (112 core)",
+ "general_use": false,
+ "gpu_model": "NVIDIA Tesla A100",
+ "gpu_count": 8,
+ "vram": 80,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "gpu[022-024]",
+ "count": 3,
+ "nodes": [
+ "gpu022",
+ "gpu023",
+ "gpu024"
+ ],
+ "node_group": "gpu",
+ "partitions": [
+ "gpu",
+ "gpu-preempt",
+ "superpod-a100"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "a100",
+ "a100-80g",
+ "avx512",
+ "bf16",
+ "fp64",
+ "ib",
+ "intel",
+ "intel8480",
+ "localscratch",
+ "sapphirerapids",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "sm_75",
+ "sm_80",
+ "vram11",
+ "vram12",
+ "vram16",
+ "vram23",
+ "vram32",
+ "vram40",
+ "vram48",
+ "vram8",
+ "vram80",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 112,
+ "ram": 2010,
+ "cpus": "2x Intel(R) Xeon(R) Platinum 8480+ (112 core)",
+ "general_use": true,
+ "gpu_model": "NVIDIA Tesla A100",
+ "gpu_count": 8,
+ "vram": 80,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "gpu[025-034]",
+ "count": 10,
+ "nodes": [
+ "gpu025",
+ "gpu026",
+ "gpu027",
+ "gpu028",
+ "gpu029",
+ "gpu030",
+ "gpu031",
+ "gpu032",
+ "gpu033",
+ "gpu034"
+ ],
+ "node_group": "gpu",
+ "partitions": [
+ "gpu"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "avx512",
+ "bf16",
+ "ib",
+ "icelake",
+ "intel",
+ "intel6526y",
+ "l40s",
+ "localscratch",
+ "sapphirerapids",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "sm_75",
+ "sm_80",
+ "sm_86",
+ "sm_87",
+ "sm_89",
+ "vram11",
+ "vram12",
+ "vram16",
+ "vram23",
+ "vram32",
+ "vram40",
+ "vram48",
+ "vram8",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 32,
+ "ram": 500,
+ "cpus": "2x Intel(R) Xeon(R) Gold 6526Y (32 core)",
+ "general_use": true,
+ "gpu_model": "NVIDIA Ada Lovelace L40S",
+ "gpu_count": 4,
+ "vram": 48,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "gpu[035-041]",
+ "count": 7,
+ "nodes": [
+ "gpu035",
+ "gpu036",
+ "gpu037",
+ "gpu038",
+ "gpu039",
+ "gpu040",
+ "gpu041"
+ ],
+ "node_group": "gpu",
+ "partitions": [
+ "gpu-preempt",
+ "gpupod-l40s"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "avx512",
+ "bf16",
+ "ib",
+ "icelake",
+ "intel",
+ "intel6526y",
+ "l40s",
+ "localscratch",
+ "sapphirerapids",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "sm_75",
+ "sm_80",
+ "sm_86",
+ "sm_87",
+ "sm_89",
+ "vram11",
+ "vram12",
+ "vram16",
+ "vram23",
+ "vram32",
+ "vram40",
+ "vram48",
+ "vram8",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 32,
+ "ram": 500,
+ "cpus": "2x Intel(R) Xeon(R) Gold 6526Y (32 core)",
+ "general_use": false,
+ "gpu_model": "NVIDIA Ada Lovelace L40S",
+ "gpu_count": 4,
+ "vram": 48,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "gpu042",
+ "count": 1,
+ "nodes": [
+ "gpu042"
+ ],
+ "node_group": "gpu",
+ "partitions": [
+ "cbio-gpu",
+ "gpu-preempt"
+ ],
+ "features": [
+ "10gbps",
+ "a100",
+ "a100-80g",
+ "amd",
+ "amd7h12",
+ "bf16",
+ "fp64",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "sm_75",
+ "sm_80",
+ "vram11",
+ "vram12",
+ "vram16",
+ "vram23",
+ "vram32",
+ "vram40",
+ "vram48",
+ "vram8",
+ "vram80",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "zen2"
+ ],
+ "cores": 128,
+ "ram": 500,
+ "cpus": "2x AMD EPYC 7H12 64-Core Processor (128 core)",
+ "general_use": false,
+ "gpu_model": "NVIDIA Tesla A100",
+ "gpu_count": 1,
+ "vram": 80,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "gpu043",
+ "count": 1,
+ "nodes": [
+ "gpu043"
+ ],
+ "node_group": "gpu",
+ "partitions": [
+ "gpu"
+ ],
+ "features": [
+ "10gbps",
+ "avx512",
+ "bf16",
+ "intel",
+ "intel4116",
+ "l4",
+ "skylake_avx512",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "sm_75",
+ "sm_80",
+ "sm_86",
+ "sm_87",
+ "sm_89",
+ "vram11",
+ "vram12",
+ "vram16",
+ "vram23",
+ "vram8",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 24,
+ "ram": 370,
+ "cpus": "2x Intel(R) Xeon(R) Silver 4116 (24 core)",
+ "general_use": true,
+ "gpu_model": "NVIDIA L4",
+ "gpu_count": 8,
+ "vram": 24,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "gpu[044-046]",
+ "count": 3,
+ "nodes": [
+ "gpu044",
+ "gpu045",
+ "gpu046"
+ ],
+ "node_group": "gpu",
+ "partitions": [
+ "gpu-preempt",
+ "gpupod-l40s"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "avx512",
+ "bf16",
+ "ib",
+ "intel",
+ "intel6526y",
+ "l40s",
+ "localscratch",
+ "sapphirerapids",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "sm_75",
+ "sm_80",
+ "sm_86",
+ "sm_87",
+ "sm_89",
+ "vram11",
+ "vram12",
+ "vram16",
+ "vram23",
+ "vram32",
+ "vram40",
+ "vram48",
+ "vram8",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 32,
+ "ram": 500,
+ "cpus": "2x Intel(R) Xeon(R) Gold 6526Y (32 core)",
+ "general_use": false,
+ "gpu_model": "NVIDIA Ada Lovelace L40S",
+ "gpu_count": 4,
+ "vram": 48,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "gpu047",
+ "count": 1,
+ "nodes": [
+ "gpu047"
+ ],
+ "node_group": "gpu",
+ "partitions": [
+ "gpu"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "a16",
+ "avx512",
+ "intel",
+ "intel4116",
+ "skylake_avx512",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "sm_75",
+ "sm_80",
+ "sm_86",
+ "vram11",
+ "vram12",
+ "vram16",
+ "vram8",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 24,
+ "ram": 370,
+ "cpus": "2x Intel(R) Xeon(R) Silver 4116 (24 core)",
+ "general_use": true,
+ "gpu_model": "NVIDIA A16",
+ "gpu_count": 4,
+ "vram": 16,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "gpu[048-055]",
+ "count": 8,
+ "nodes": [
+ "gpu048",
+ "gpu049",
+ "gpu050",
+ "gpu051",
+ "gpu052",
+ "gpu053",
+ "gpu054",
+ "gpu055"
+ ],
+ "node_group": "gpu",
+ "partitions": [
+ "gpu-preempt",
+ "umd-cscdr-gpu"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "a16",
+ "amd",
+ "amd9354",
+ "avx512",
+ "localscratch",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "sm_75",
+ "sm_80",
+ "sm_86",
+ "vram11",
+ "vram12",
+ "vram16",
+ "vram8",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4",
+ "zen4"
+ ],
+ "cores": 64,
+ "ram": 750,
+ "cpus": "2x AMD EPYC 9354 32-Core Processor (64 core)",
+ "general_use": false,
+ "gpu_model": "NVIDIA A16",
+ "gpu_count": 8,
+ "vram": 16,
+ "open_hardware": true,
+ "owner": "UMassD CSCDR",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "gypsum-gpu[001-005,007-013,015-021]",
+ "count": 19,
+ "nodes": [
+ "gypsum-gpu001",
+ "gypsum-gpu002",
+ "gypsum-gpu003",
+ "gypsum-gpu004",
+ "gypsum-gpu005",
+ "gypsum-gpu007",
+ "gypsum-gpu008",
+ "gypsum-gpu009",
+ "gypsum-gpu010",
+ "gypsum-gpu011",
+ "gypsum-gpu012",
+ "gypsum-gpu013",
+ "gypsum-gpu015",
+ "gypsum-gpu016",
+ "gypsum-gpu017",
+ "gypsum-gpu018",
+ "gypsum-gpu019",
+ "gypsum-gpu020",
+ "gypsum-gpu021"
+ ],
+ "node_group": "gypsum-gpu",
+ "partitions": [
+ "gpu"
+ ],
+ "features": [
+ "haswell",
+ "intel",
+ "intel2620v3",
+ "m40",
+ "sm_52",
+ "vram11",
+ "vram12",
+ "vram16",
+ "vram23",
+ "vram8",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3"
+ ],
+ "cores": 12,
+ "ram": 250,
+ "cpus": "2x Intel(R) Xeon(R) E5-2620 v3 (12 core)",
+ "general_use": true,
+ "gpu_model": "NVIDIA Tesla M40",
+ "gpu_count": 4,
+ "vram": 24,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "gypsum-gpu[044-046,048-050,053-063,065-078,080-086,088-092,094-096,099]",
+ "count": 47,
+ "nodes": [
+ "gypsum-gpu044",
+ "gypsum-gpu045",
+ "gypsum-gpu046",
+ "gypsum-gpu048",
+ "gypsum-gpu049",
+ "gypsum-gpu050",
+ "gypsum-gpu053",
+ "gypsum-gpu054",
+ "gypsum-gpu055",
+ "gypsum-gpu056",
+ "gypsum-gpu057",
+ "gypsum-gpu058",
+ "gypsum-gpu059",
+ "gypsum-gpu060",
+ "gypsum-gpu061",
+ "gypsum-gpu062",
+ "gypsum-gpu063",
+ "gypsum-gpu065",
+ "gypsum-gpu066",
+ "gypsum-gpu067",
+ "gypsum-gpu068",
+ "gypsum-gpu069",
+ "gypsum-gpu070",
+ "gypsum-gpu071",
+ "gypsum-gpu072",
+ "gypsum-gpu073",
+ "gypsum-gpu074",
+ "gypsum-gpu075",
+ "gypsum-gpu076",
+ "gypsum-gpu077",
+ "gypsum-gpu078",
+ "gypsum-gpu080",
+ "gypsum-gpu081",
+ "gypsum-gpu082",
+ "gypsum-gpu083",
+ "gypsum-gpu084",
+ "gypsum-gpu085",
+ "gypsum-gpu086",
+ "gypsum-gpu088",
+ "gypsum-gpu089",
+ "gypsum-gpu090",
+ "gypsum-gpu091",
+ "gypsum-gpu092",
+ "gypsum-gpu094",
+ "gypsum-gpu095",
+ "gypsum-gpu096",
+ "gypsum-gpu099"
+ ],
+ "node_group": "gypsum-gpu",
+ "partitions": [
+ "cpu",
+ "gpu"
+ ],
+ "features": [
+ "haswell",
+ "intel",
+ "intel2620v3",
+ "sm_52",
+ "titan_x",
+ "titanx",
+ "vram11",
+ "vram12",
+ "vram8",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3"
+ ],
+ "cores": 12,
+ "ram": 250,
+ "cpus": "2x Intel(R) Xeon(R) E5-2620 v3 (12 core)",
+ "general_use": true,
+ "gpu_model": "NVIDIA GeForce GTX TITAN X",
+ "gpu_count": 4,
+ "vram": 12,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "gypsum-gpu[104,116-117,126-127,129-132,134,136,138,140,143-155]",
+ "count": 26,
+ "nodes": [
+ "gypsum-gpu104",
+ "gypsum-gpu116",
+ "gypsum-gpu117",
+ "gypsum-gpu126",
+ "gypsum-gpu127",
+ "gypsum-gpu129",
+ "gypsum-gpu130",
+ "gypsum-gpu131",
+ "gypsum-gpu132",
+ "gypsum-gpu134",
+ "gypsum-gpu136",
+ "gypsum-gpu138",
+ "gypsum-gpu140",
+ "gypsum-gpu143",
+ "gypsum-gpu144",
+ "gypsum-gpu145",
+ "gypsum-gpu146",
+ "gypsum-gpu147",
+ "gypsum-gpu148",
+ "gypsum-gpu149",
+ "gypsum-gpu150",
+ "gypsum-gpu151",
+ "gypsum-gpu152",
+ "gypsum-gpu153",
+ "gypsum-gpu154",
+ "gypsum-gpu155"
+ ],
+ "node_group": "gypsum-gpu",
+ "partitions": [
+ "gpu"
+ ],
+ "features": [
+ "1080_ti",
+ "1080ti",
+ "10gbps",
+ "avx512",
+ "intel",
+ "intel4116",
+ "skylake_avx512",
+ "sm_52",
+ "sm_61",
+ "vram11",
+ "vram8",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 24,
+ "ram": 370,
+ "cpus": "2x Intel(R) Xeon(R) Silver 4116 (24 core)",
+ "general_use": true,
+ "gpu_model": "NVIDIA GeForce GTX 1080 Ti",
+ "gpu_count": 8,
+ "vram": 11,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "gypsum-gpu[106,108-115,118-119,121,124]",
+ "count": 13,
+ "nodes": [
+ "gypsum-gpu106",
+ "gypsum-gpu108",
+ "gypsum-gpu109",
+ "gypsum-gpu110",
+ "gypsum-gpu111",
+ "gypsum-gpu112",
+ "gypsum-gpu113",
+ "gypsum-gpu114",
+ "gypsum-gpu115",
+ "gypsum-gpu118",
+ "gypsum-gpu119",
+ "gypsum-gpu121",
+ "gypsum-gpu124"
+ ],
+ "node_group": "gypsum-gpu",
+ "partitions": [
+ "gpu"
+ ],
+ "features": [
+ "1080_ti",
+ "1080ti",
+ "10gbps",
+ "25gbps",
+ "avx512",
+ "intel",
+ "intel4116",
+ "skylake_avx512",
+ "sm_52",
+ "sm_61",
+ "vram11",
+ "vram8",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 24,
+ "ram": 370,
+ "cpus": "2x Intel(R) Xeon(R) Silver 4116 (24 core)",
+ "general_use": true,
+ "gpu_model": "NVIDIA GeForce GTX 1080 Ti",
+ "gpu_count": 8,
+ "vram": 11,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "gypsum-gpu[160-164,166,168,171,173-177,181,190-192]",
+ "count": 17,
+ "nodes": [
+ "gypsum-gpu160",
+ "gypsum-gpu161",
+ "gypsum-gpu162",
+ "gypsum-gpu163",
+ "gypsum-gpu164",
+ "gypsum-gpu166",
+ "gypsum-gpu168",
+ "gypsum-gpu171",
+ "gypsum-gpu173",
+ "gypsum-gpu174",
+ "gypsum-gpu175",
+ "gypsum-gpu176",
+ "gypsum-gpu177",
+ "gypsum-gpu181",
+ "gypsum-gpu190",
+ "gypsum-gpu191",
+ "gypsum-gpu192"
+ ],
+ "node_group": "gypsum-gpu",
+ "partitions": [
+ "gpu"
+ ],
+ "features": [
+ "10gbps",
+ "2080_ti",
+ "2080ti",
+ "avx512",
+ "intel",
+ "intel4116",
+ "skylake_avx512",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "sm_75",
+ "vram11",
+ "vram8",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 24,
+ "ram": 370,
+ "cpus": "2x Intel(R) Xeon(R) Silver 4116 (24 core)",
+ "general_use": true,
+ "gpu_model": "NVIDIA RTX 2080ti",
+ "gpu_count": 8,
+ "vram": 11,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "gypsum-gpu[182,186,188]",
+ "count": 3,
+ "nodes": [
+ "gypsum-gpu182",
+ "gypsum-gpu186",
+ "gypsum-gpu188"
+ ],
+ "node_group": "gypsum-gpu",
+ "partitions": [
+ "gpu"
+ ],
+ "features": [
+ "10gbps",
+ "avx512",
+ "intel",
+ "intel4116",
+ "rtx8000",
+ "rtx_8000",
+ "skylake_avx512",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "sm_75",
+ "vram11",
+ "vram12",
+ "vram16",
+ "vram23",
+ "vram32",
+ "vram40",
+ "vram48",
+ "vram8",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 24,
+ "ram": 370,
+ "cpus": "2x Intel(R) Xeon(R) Silver 4116 (24 core)",
+ "general_use": true,
+ "gpu_model": "NVIDIA Quadro RTX 8000",
+ "gpu_count": 6,
+ "vram": 48,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "ials-gpu[001-013,017-018,020-026,029-033]",
+ "count": 27,
+ "nodes": [
+ "ials-gpu001",
+ "ials-gpu002",
+ "ials-gpu003",
+ "ials-gpu004",
+ "ials-gpu005",
+ "ials-gpu006",
+ "ials-gpu007",
+ "ials-gpu008",
+ "ials-gpu009",
+ "ials-gpu010",
+ "ials-gpu011",
+ "ials-gpu012",
+ "ials-gpu013",
+ "ials-gpu017",
+ "ials-gpu018",
+ "ials-gpu020",
+ "ials-gpu021",
+ "ials-gpu022",
+ "ials-gpu023",
+ "ials-gpu024",
+ "ials-gpu025",
+ "ials-gpu026",
+ "ials-gpu029",
+ "ials-gpu030",
+ "ials-gpu031",
+ "ials-gpu032",
+ "ials-gpu033"
+ ],
+ "node_group": "ials-gpu",
+ "partitions": [
+ "gpu-preempt",
+ "ials-gpu"
+ ],
+ "features": [
+ "10gbps",
+ "2080_ti",
+ "2080ti",
+ "avx512",
+ "cascadelake",
+ "intel",
+ "intel4214r",
+ "localscratch",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "sm_75",
+ "vram11",
+ "vram8",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 24,
+ "ram": 180,
+ "cpus": "2x Intel(R) Xeon(R) Silver 4214R (24 core)",
+ "general_use": false,
+ "gpu_model": "NVIDIA RTX 2080ti",
+ "gpu_count": 8,
+ "vram": 11,
+ "open_hardware": true,
+ "owner": "IALS",
+ "owner_email": "jianhanc@umass.edu"
+ },
+ {
+ "nodes_folded": "ials-gpu014",
+ "count": 1,
+ "nodes": [
+ "ials-gpu014"
+ ],
+ "node_group": "ials-gpu",
+ "partitions": [
+ "gpu-preempt",
+ "ials-gpu"
+ ],
+ "features": [
+ "10gbps",
+ "2080_ti",
+ "2080ti",
+ "avx512",
+ "cascadelake",
+ "intel",
+ "intel4214r",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "sm_75",
+ "vram11",
+ "vram8",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 24,
+ "ram": 180,
+ "cpus": "2x Intel(R) Xeon(R) Silver 4214R (24 core)",
+ "general_use": false,
+ "gpu_model": "NVIDIA RTX 2080ti",
+ "gpu_count": 8,
+ "vram": 11,
+ "open_hardware": true,
+ "owner": "IALS",
+ "owner_email": "jianhanc@umass.edu"
+ },
+ {
+ "nodes_folded": "ials-gpu[034-036]",
+ "count": 3,
+ "nodes": [
+ "ials-gpu034",
+ "ials-gpu035",
+ "ials-gpu036"
+ ],
+ "node_group": "ials-gpu",
+ "partitions": [
+ "gpu-preempt",
+ "ials-gpu"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "avx512",
+ "bf16",
+ "intel",
+ "intel6526y",
+ "l40s",
+ "localscratch",
+ "sapphirerapids",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "sm_75",
+ "sm_80",
+ "sm_86",
+ "sm_87",
+ "sm_89",
+ "vram11",
+ "vram12",
+ "vram16",
+ "vram23",
+ "vram32",
+ "vram40",
+ "vram48",
+ "vram8",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 32,
+ "ram": 500,
+ "cpus": "2x Intel(R) Xeon(R) Gold 6526Y (32 core)",
+ "general_use": false,
+ "gpu_model": "NVIDIA Ada Lovelace L40S",
+ "gpu_count": 4,
+ "vram": 48,
+ "open_hardware": true,
+ "owner": "IALS",
+ "owner_email": "jianhanc@umass.edu"
+ },
+ {
+ "nodes_folded": "power9-gpu[001-006]",
+ "count": 6,
+ "nodes": [
+ "power9-gpu001",
+ "power9-gpu002",
+ "power9-gpu003",
+ "power9-gpu004",
+ "power9-gpu005",
+ "power9-gpu006"
+ ],
+ "node_group": "power",
+ "partitions": [
+ "power9-gpu",
+ "power9-gpu-preempt"
+ ],
+ "features": [
+ "10gbps",
+ "altivec",
+ "fp64",
+ "p923",
+ "power9le",
+ "ppc64le",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "v100",
+ "v100-32g",
+ "vram11",
+ "vram12",
+ "vram16",
+ "vram23",
+ "vram32",
+ "vram8"
+ ],
+ "cores": 32,
+ "ram": 250,
+ "cpus": "2x POWER9, altivec supported (32 core)",
+ "general_use": true,
+ "gpu_model": "NVIDIA Tesla V100",
+ "gpu_count": 2,
+ "vram": 32,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "power9-gpu[007-008]",
+ "count": 2,
+ "nodes": [
+ "power9-gpu007",
+ "power9-gpu008"
+ ],
+ "node_group": "power",
+ "partitions": [
+ "power9-gpu",
+ "power9-gpu-osg",
+ "power9-gpu-preempt"
+ ],
+ "features": [
+ "10gbps",
+ "altivec",
+ "fp64",
+ "p923",
+ "power9le",
+ "ppc64le",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "v100",
+ "v100-32g",
+ "vram11",
+ "vram12",
+ "vram16",
+ "vram23",
+ "vram32",
+ "vram8"
+ ],
+ "cores": 32,
+ "ram": 250,
+ "cpus": "2x POWER9, altivec supported (32 core)",
+ "general_use": true,
+ "gpu_model": "NVIDIA Tesla V100",
+ "gpu_count": 2,
+ "vram": 32,
+ "open_hardware": true,
+ "owner": "Mike Zink",
+ "owner_email": "zink@ecs.umass.edu"
+ },
+ {
+ "nodes_folded": "power9-gpu[009-016]",
+ "count": 8,
+ "nodes": [
+ "power9-gpu009",
+ "power9-gpu010",
+ "power9-gpu011",
+ "power9-gpu012",
+ "power9-gpu013",
+ "power9-gpu014",
+ "power9-gpu015",
+ "power9-gpu016"
+ ],
+ "node_group": "power",
+ "partitions": [
+ "power9-gpu",
+ "power9-gpu-preempt"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "40gbps",
+ "altivec",
+ "fp64",
+ "ib",
+ "p922",
+ "power9le",
+ "ppc64le",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "v100",
+ "v100-16g",
+ "vram11",
+ "vram12",
+ "vram16",
+ "vram8"
+ ],
+ "cores": 40,
+ "ram": 510,
+ "cpus": "2x POWER9, altivec supported (40 core)",
+ "general_use": true,
+ "gpu_model": "NVIDIA Tesla V100",
+ "gpu_count": 4,
+ "vram": 16,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "toltec-cpu001",
+ "count": 1,
+ "nodes": [
+ "toltec-cpu001"
+ ],
+ "node_group": "toltec-cpu",
+ "partitions": [
+ "cpu-preempt",
+ "toltec-cpu"
+ ],
+ "features": [
+ "100gbps",
+ "10gbps",
+ "25gbps",
+ "40gbps",
+ "avx512",
+ "cascadelake",
+ "intel",
+ "intel5218",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 32,
+ "ram": 370,
+ "cpus": "2x Intel(R) Xeon(R) Gold 5218 (32 core)",
+ "general_use": false,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": true,
+ "owner": "Grant Wilson",
+ "owner_email": "wilson@astro.umass.edu"
+ },
+ {
+ "nodes_folded": "toltec-cpu[002-007]",
+ "count": 6,
+ "nodes": [
+ "toltec-cpu002",
+ "toltec-cpu003",
+ "toltec-cpu004",
+ "toltec-cpu005",
+ "toltec-cpu006",
+ "toltec-cpu007"
+ ],
+ "node_group": "toltec-cpu",
+ "partitions": [
+ "cpu-preempt",
+ "toltec-cpu"
+ ],
+ "features": [
+ "100gbps",
+ "10gbps",
+ "25gbps",
+ "40gbps",
+ "avx512",
+ "cascadelake",
+ "intel",
+ "intel5218",
+ "localscratch",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 32,
+ "ram": 370,
+ "cpus": "2x Intel(R) Xeon(R) Gold 5218 (32 core)",
+ "general_use": false,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": true,
+ "owner": "Grant Wilson",
+ "owner_email": "wilson@astro.umass.edu"
+ },
+ {
+ "nodes_folded": "umd-cscdr-arm[001-003]",
+ "count": 3,
+ "nodes": [
+ "umd-cscdr-arm001",
+ "umd-cscdr-arm002",
+ "umd-cscdr-arm003"
+ ],
+ "node_group": "umd-cscdr-arm",
+ "partitions": [
+ "arm-preempt",
+ "umd-cscdr-arm"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "aarch64",
+ "arm64",
+ "armn1"
+ ],
+ "cores": 80,
+ "ram": 250,
+ "cpus": "1x Neoverse-N1 (80 core)",
+ "general_use": false,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": true,
+ "owner": "UMassD CSCDR",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "umd-cscdr-cpu[001-035]",
+ "count": 35,
+ "nodes": [
+ "umd-cscdr-cpu001",
+ "umd-cscdr-cpu002",
+ "umd-cscdr-cpu003",
+ "umd-cscdr-cpu004",
+ "umd-cscdr-cpu005",
+ "umd-cscdr-cpu006",
+ "umd-cscdr-cpu007",
+ "umd-cscdr-cpu008",
+ "umd-cscdr-cpu009",
+ "umd-cscdr-cpu010",
+ "umd-cscdr-cpu011",
+ "umd-cscdr-cpu012",
+ "umd-cscdr-cpu013",
+ "umd-cscdr-cpu014",
+ "umd-cscdr-cpu015",
+ "umd-cscdr-cpu016",
+ "umd-cscdr-cpu017",
+ "umd-cscdr-cpu018",
+ "umd-cscdr-cpu019",
+ "umd-cscdr-cpu020",
+ "umd-cscdr-cpu021",
+ "umd-cscdr-cpu022",
+ "umd-cscdr-cpu023",
+ "umd-cscdr-cpu024",
+ "umd-cscdr-cpu025",
+ "umd-cscdr-cpu026",
+ "umd-cscdr-cpu027",
+ "umd-cscdr-cpu028",
+ "umd-cscdr-cpu029",
+ "umd-cscdr-cpu030",
+ "umd-cscdr-cpu031",
+ "umd-cscdr-cpu032",
+ "umd-cscdr-cpu033",
+ "umd-cscdr-cpu034",
+ "umd-cscdr-cpu035"
+ ],
+ "node_group": "umd-cscdr-cpu",
+ "partitions": [
+ "cpu-preempt",
+ "umd-cscdr-cpu"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "avx512",
+ "ib",
+ "icelake",
+ "intel",
+ "intel8352y",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 64,
+ "ram": 250,
+ "cpus": "2x Intel(R) Xeon(R) Platinum 8352Y (64 core)",
+ "general_use": false,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": true,
+ "owner": "UMassD CSCDR",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "umd-cscdr-cpu[036-048]",
+ "count": 13,
+ "nodes": [
+ "umd-cscdr-cpu036",
+ "umd-cscdr-cpu037",
+ "umd-cscdr-cpu038",
+ "umd-cscdr-cpu039",
+ "umd-cscdr-cpu040",
+ "umd-cscdr-cpu041",
+ "umd-cscdr-cpu042",
+ "umd-cscdr-cpu043",
+ "umd-cscdr-cpu044",
+ "umd-cscdr-cpu045",
+ "umd-cscdr-cpu046",
+ "umd-cscdr-cpu047",
+ "umd-cscdr-cpu048"
+ ],
+ "node_group": "umd-cscdr-cpu",
+ "partitions": [
+ "cpu",
+ "cpu-preempt"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "avx512",
+ "ib",
+ "icelake",
+ "intel",
+ "intel8352y",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 64,
+ "ram": 250,
+ "cpus": "2x Intel(R) Xeon(R) Platinum 8352Y (64 core)",
+ "general_use": true,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": true,
+ "owner": "UMassD CSCDR",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "umd-cscdr-cpu[049-053]",
+ "count": 5,
+ "nodes": [
+ "umd-cscdr-cpu049",
+ "umd-cscdr-cpu050",
+ "umd-cscdr-cpu051",
+ "umd-cscdr-cpu052",
+ "umd-cscdr-cpu053"
+ ],
+ "node_group": "umd-cscdr-cpu",
+ "partitions": [
+ "cpu-preempt",
+ "mpi"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "avx512",
+ "ib",
+ "icelake",
+ "intel",
+ "intel8352y",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 64,
+ "ram": 250,
+ "cpus": "2x Intel(R) Xeon(R) Platinum 8352Y (64 core)",
+ "general_use": false,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": true,
+ "owner": "UMassD CSCDR",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "umd-cscdr-gpu001",
+ "count": 1,
+ "nodes": [
+ "umd-cscdr-gpu001"
+ ],
+ "node_group": "umd-cscdr-gpu",
+ "partitions": [
+ "gpu-preempt",
+ "umd-cscdr-gpu"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "a100",
+ "a100-80g",
+ "avx512",
+ "bf16",
+ "fp64",
+ "ib",
+ "icelake",
+ "intel",
+ "intel8352y",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "sm_75",
+ "sm_80",
+ "vram11",
+ "vram12",
+ "vram16",
+ "vram23",
+ "vram32",
+ "vram40",
+ "vram48",
+ "vram8",
+ "vram80",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 64,
+ "ram": 500,
+ "cpus": "2x Intel(R) Xeon(R) Platinum 8352Y (64 core)",
+ "general_use": false,
+ "gpu_model": "NVIDIA Tesla A100",
+ "gpu_count": 4,
+ "vram": 80,
+ "open_hardware": true,
+ "owner": "UMassD CSCDR",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "umd-cscdr-gpu002",
+ "count": 1,
+ "nodes": [
+ "umd-cscdr-gpu002"
+ ],
+ "node_group": "umd-cscdr-gpu",
+ "partitions": [
+ "gpu-preempt",
+ "umd-cscdr-gpu"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "a100",
+ "a100-80g",
+ "amd",
+ "amd7763",
+ "bf16",
+ "fp64",
+ "ib",
+ "localscratch",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "sm_75",
+ "sm_80",
+ "vram11",
+ "vram12",
+ "vram16",
+ "vram23",
+ "vram32",
+ "vram40",
+ "vram48",
+ "vram8",
+ "vram80",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "zen3"
+ ],
+ "cores": 128,
+ "ram": 2000,
+ "cpus": "2x AMD EPYC 7763 64-Core Processor (128 core)",
+ "general_use": false,
+ "gpu_model": "NVIDIA Tesla A100",
+ "gpu_count": 8,
+ "vram": 80,
+ "open_hardware": true,
+ "owner": "UMassD CSCDR",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "uri-arm-gpu001",
+ "count": 1,
+ "nodes": [
+ "uri-arm-gpu001"
+ ],
+ "node_group": "uri-arm-gpu",
+ "partitions": [
+ "arm-gpu"
+ ],
+ "features": [
+ "100gbps",
+ "10gbps",
+ "25gbps",
+ "40gbps",
+ "aarch64",
+ "arm64",
+ "bf16",
+ "fp64",
+ "gh200",
+ "gracehopper",
+ "localscratch",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "sm_75",
+ "sm_80",
+ "sm_86",
+ "sm_87",
+ "sm_89",
+ "sm_90",
+ "vram102",
+ "vram11",
+ "vram12",
+ "vram16",
+ "vram23",
+ "vram32",
+ "vram40",
+ "vram48",
+ "vram8",
+ "vram80"
+ ],
+ "cores": 72,
+ "ram": 560,
+ "cpus": "1x NVIDIA Grace Hopper Superchip (72 core)",
+ "general_use": true,
+ "gpu_model": "NVIDIA Grace Hopper Superchip",
+ "gpu_count": 1,
+ "vram": 80,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "uri-cpu[001-005]",
+ "count": 5,
+ "nodes": [
+ "uri-cpu001",
+ "uri-cpu002",
+ "uri-cpu003",
+ "uri-cpu004",
+ "uri-cpu005"
+ ],
+ "node_group": "uri-cpu",
+ "partitions": [
+ "cpu"
+ ],
+ "features": [
+ "10gbps",
+ "avx512",
+ "cascadelake",
+ "intel",
+ "intel6238r",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 56,
+ "ram": 500,
+ "cpus": "2x Intel(R) Xeon(R) Gold 6238R (56 core)",
+ "general_use": true,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": true,
+ "owner": "URI",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "uri-cpu[006-008]",
+ "count": 3,
+ "nodes": [
+ "uri-cpu006",
+ "uri-cpu007",
+ "uri-cpu008"
+ ],
+ "node_group": "uri-cpu",
+ "partitions": [
+ "cpu"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "avx512",
+ "ib",
+ "icelake",
+ "intel",
+ "intel8352y",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 64,
+ "ram": 120,
+ "cpus": "2x Intel(R) Xeon(R) Platinum 8352Y (64 core)",
+ "general_use": true,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": true,
+ "owner": "URI",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "uri-cpu[009-013]",
+ "count": 5,
+ "nodes": [
+ "uri-cpu009",
+ "uri-cpu010",
+ "uri-cpu011",
+ "uri-cpu012",
+ "uri-cpu013"
+ ],
+ "node_group": "uri-cpu",
+ "partitions": [
+ "uri-cpu",
+ "uri-richamp"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "avx512",
+ "ib",
+ "icelake",
+ "intel",
+ "intel8352y",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 64,
+ "ram": 120,
+ "cpus": "2x Intel(R) Xeon(R) Platinum 8352Y (64 core)",
+ "general_use": false,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": true,
+ "owner": "URI",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "uri-cpu[014-033]",
+ "count": 20,
+ "nodes": [
+ "uri-cpu014",
+ "uri-cpu015",
+ "uri-cpu016",
+ "uri-cpu017",
+ "uri-cpu018",
+ "uri-cpu019",
+ "uri-cpu020",
+ "uri-cpu021",
+ "uri-cpu022",
+ "uri-cpu023",
+ "uri-cpu024",
+ "uri-cpu025",
+ "uri-cpu026",
+ "uri-cpu027",
+ "uri-cpu028",
+ "uri-cpu029",
+ "uri-cpu030",
+ "uri-cpu031",
+ "uri-cpu032",
+ "uri-cpu033"
+ ],
+ "node_group": "uri-cpu",
+ "partitions": [
+ "cpu-preempt",
+ "uri-cpu"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "avx512",
+ "ib",
+ "icelake",
+ "intel",
+ "intel8352y",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 64,
+ "ram": 120,
+ "cpus": "2x Intel(R) Xeon(R) Platinum 8352Y (64 core)",
+ "general_use": false,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": true,
+ "owner": "URI",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "uri-cpu[034-037]",
+ "count": 4,
+ "nodes": [
+ "uri-cpu034",
+ "uri-cpu035",
+ "uri-cpu036",
+ "uri-cpu037"
+ ],
+ "node_group": "uri-cpu",
+ "partitions": [
+ "cpu-preempt",
+ "uri-cpu"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "avx512",
+ "ib",
+ "icelake",
+ "intel",
+ "intel8352y",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 64,
+ "ram": 500,
+ "cpus": "2x Intel(R) Xeon(R) Platinum 8352Y (64 core)",
+ "general_use": false,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": true,
+ "owner": "URI",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "uri-cpu[038-043]",
+ "count": 6,
+ "nodes": [
+ "uri-cpu038",
+ "uri-cpu039",
+ "uri-cpu040",
+ "uri-cpu041",
+ "uri-cpu042",
+ "uri-cpu043"
+ ],
+ "node_group": "uri-cpu",
+ "partitions": [
+ "cpu-preempt",
+ "uri-cpu"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "avx512",
+ "ib",
+ "icelake",
+ "intel",
+ "intel8352y",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 64,
+ "ram": 1000,
+ "cpus": "2x Intel(R) Xeon(R) Platinum 8352Y (64 core)",
+ "general_use": false,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": true,
+ "owner": "URI",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "uri-cpu[044-045]",
+ "count": 2,
+ "nodes": [
+ "uri-cpu044",
+ "uri-cpu045"
+ ],
+ "node_group": "uri-cpu",
+ "partitions": [
+ "cpu"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "avx512",
+ "ib",
+ "icelake",
+ "intel",
+ "intel8352y",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 64,
+ "ram": 1000,
+ "cpus": "2x Intel(R) Xeon(R) Platinum 8352Y (64 core)",
+ "general_use": true,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": true,
+ "owner": "URI",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "uri-cpu[046-049]",
+ "count": 4,
+ "nodes": [
+ "uri-cpu046",
+ "uri-cpu047",
+ "uri-cpu048",
+ "uri-cpu049"
+ ],
+ "node_group": "uri-cpu",
+ "partitions": [
+ "mpi"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "avx512",
+ "ib",
+ "icelake",
+ "intel",
+ "intel8352y",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 64,
+ "ram": 250,
+ "cpus": "2x Intel(R) Xeon(R) Platinum 8352Y (64 core)",
+ "general_use": false,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": true,
+ "owner": "URI",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "uri-cpu[050-053]",
+ "count": 4,
+ "nodes": [
+ "uri-cpu050",
+ "uri-cpu051",
+ "uri-cpu052",
+ "uri-cpu053"
+ ],
+ "node_group": "uri-cpu",
+ "partitions": [
+ "cpu-preempt",
+ "mpi"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "avx512",
+ "ib",
+ "icelake",
+ "intel",
+ "intel8352y",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 64,
+ "ram": 250,
+ "cpus": "2x Intel(R) Xeon(R) Platinum 8352Y (64 core)",
+ "general_use": false,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": true,
+ "owner": "URI",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "uri-cpu[054-061]",
+ "count": 8,
+ "nodes": [
+ "uri-cpu054",
+ "uri-cpu055",
+ "uri-cpu056",
+ "uri-cpu057",
+ "uri-cpu058",
+ "uri-cpu059",
+ "uri-cpu060",
+ "uri-cpu061"
+ ],
+ "node_group": "uri-cpu",
+ "partitions": [
+ "cpu-preempt",
+ "uri-cpu"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "avx512",
+ "ib",
+ "icelake",
+ "intel",
+ "intel8352y",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 64,
+ "ram": 250,
+ "cpus": "2x Intel(R) Xeon(R) Platinum 8352Y (64 core)",
+ "general_use": false,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": true,
+ "owner": "URI",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "uri-gpu[001-002]",
+ "count": 2,
+ "nodes": [
+ "uri-gpu001",
+ "uri-gpu002"
+ ],
+ "node_group": "uri-gpu",
+ "partitions": [
+ "gpu"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "a100",
+ "a100-80g",
+ "avx512",
+ "bf16",
+ "fp64",
+ "ib",
+ "icelake",
+ "intel",
+ "intel8352y",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "sm_75",
+ "sm_80",
+ "vram11",
+ "vram12",
+ "vram16",
+ "vram23",
+ "vram32",
+ "vram40",
+ "vram48",
+ "vram8",
+ "vram80",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 64,
+ "ram": 500,
+ "cpus": "2x Intel(R) Xeon(R) Platinum 8352Y (64 core)",
+ "general_use": true,
+ "gpu_model": "NVIDIA Tesla A100",
+ "gpu_count": 4,
+ "vram": 80,
+ "open_hardware": true,
+ "owner": "URI",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "uri-gpu[003-004]",
+ "count": 2,
+ "nodes": [
+ "uri-gpu003",
+ "uri-gpu004"
+ ],
+ "node_group": "uri-gpu",
+ "partitions": [
+ "gpu-preempt",
+ "uri-gpu"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "a100",
+ "a100-80g",
+ "avx512",
+ "bf16",
+ "fp64",
+ "ib",
+ "icelake",
+ "intel",
+ "intel8352y",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "sm_75",
+ "sm_80",
+ "vram11",
+ "vram12",
+ "vram16",
+ "vram23",
+ "vram32",
+ "vram40",
+ "vram48",
+ "vram8",
+ "vram80",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 64,
+ "ram": 500,
+ "cpus": "2x Intel(R) Xeon(R) Platinum 8352Y (64 core)",
+ "general_use": false,
+ "gpu_model": "NVIDIA Tesla A100",
+ "gpu_count": 4,
+ "vram": 80,
+ "open_hardware": true,
+ "owner": "URI",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "uri-gpu[005-008]",
+ "count": 4,
+ "nodes": [
+ "uri-gpu005",
+ "uri-gpu006",
+ "uri-gpu007",
+ "uri-gpu008"
+ ],
+ "node_group": "uri-gpu",
+ "partitions": [
+ "gpu-preempt",
+ "uri-gpu"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "a100",
+ "a100-80g",
+ "avx512",
+ "bf16",
+ "fp64",
+ "ib",
+ "icelake",
+ "intel",
+ "intel8352y",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "sm_75",
+ "sm_80",
+ "vram11",
+ "vram12",
+ "vram16",
+ "vram23",
+ "vram32",
+ "vram40",
+ "vram48",
+ "vram8",
+ "vram80",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 64,
+ "ram": 1000,
+ "cpus": "2x Intel(R) Xeon(R) Platinum 8352Y (64 core)",
+ "general_use": false,
+ "gpu_model": "NVIDIA Tesla A100",
+ "gpu_count": 4,
+ "vram": 80,
+ "open_hardware": true,
+ "owner": "URI",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "uri-gpu009",
+ "count": 1,
+ "nodes": [
+ "uri-gpu009"
+ ],
+ "node_group": "uri-gpu",
+ "partitions": [
+ "gpu"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "avx512",
+ "bf16",
+ "fp64",
+ "h100",
+ "ib",
+ "intel",
+ "intel6448y",
+ "sapphirerapids",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "sm_75",
+ "sm_80",
+ "sm_86",
+ "sm_87",
+ "sm_89",
+ "sm_90",
+ "vram11",
+ "vram12",
+ "vram16",
+ "vram23",
+ "vram32",
+ "vram40",
+ "vram48",
+ "vram8",
+ "vram80",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 64,
+ "ram": 1000,
+ "cpus": "2x Intel(R) Xeon(R) Gold 6448Y (64 core)",
+ "general_use": true,
+ "gpu_model": "NVIDIA H100 80GB HBM3",
+ "gpu_count": 4,
+ "vram": 80,
+ "open_hardware": true,
+ "owner": "URI",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "uri-gpu[010-011,013-014]",
+ "count": 4,
+ "nodes": [
+ "uri-gpu010",
+ "uri-gpu011",
+ "uri-gpu013",
+ "uri-gpu014"
+ ],
+ "node_group": "uri-gpu",
+ "partitions": [
+ "gpu-preempt",
+ "uri-gpu"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "avx512",
+ "bf16",
+ "ib",
+ "intel",
+ "intel6526y",
+ "l40s",
+ "sapphirerapids",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "sm_75",
+ "sm_80",
+ "sm_86",
+ "sm_87",
+ "sm_89",
+ "vram11",
+ "vram12",
+ "vram16",
+ "vram23",
+ "vram32",
+ "vram40",
+ "vram48",
+ "vram8",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 32,
+ "ram": 500,
+ "cpus": "2x Intel(R) Xeon(R) Gold 6526Y (32 core)",
+ "general_use": false,
+ "gpu_model": "NVIDIA Ada Lovelace L40S",
+ "gpu_count": 4,
+ "vram": 48,
+ "open_hardware": true,
+ "owner": "URI",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "uri-gpu012",
+ "count": 1,
+ "nodes": [
+ "uri-gpu012"
+ ],
+ "node_group": "uri-gpu",
+ "partitions": [
+ "gpu-preempt",
+ "uri-gpu"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "avx512",
+ "bf16",
+ "fp64",
+ "h100",
+ "ib",
+ "intel",
+ "intel6448y",
+ "sapphirerapids",
+ "sm_52",
+ "sm_61",
+ "sm_70",
+ "sm_75",
+ "sm_80",
+ "sm_86",
+ "sm_87",
+ "sm_89",
+ "sm_90",
+ "vram11",
+ "vram12",
+ "vram16",
+ "vram23",
+ "vram32",
+ "vram40",
+ "vram48",
+ "vram8",
+ "vram80",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 64,
+ "ram": 1000,
+ "cpus": "2x Intel(R) Xeon(R) Gold 6448Y (64 core)",
+ "general_use": false,
+ "gpu_model": "NVIDIA H100 80GB HBM3",
+ "gpu_count": 4,
+ "vram": 80,
+ "open_hardware": true,
+ "owner": "URI",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "uri-power9-cpu[001-002]",
+ "count": 2,
+ "nodes": [
+ "uri-power9-cpu001",
+ "uri-power9-cpu002"
+ ],
+ "node_group": "uri-power",
+ "partitions": [
+ "power9"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "altivec",
+ "p923",
+ "power9le",
+ "ppc64le"
+ ],
+ "cores": 32,
+ "ram": 250,
+ "cpus": "2x POWER9, altivec supported (32 core)",
+ "general_use": true,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": null,
+ "owner": "",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "visterra-cpu[001-008]",
+ "count": 8,
+ "nodes": [
+ "visterra-cpu001",
+ "visterra-cpu002",
+ "visterra-cpu003",
+ "visterra-cpu004",
+ "visterra-cpu005",
+ "visterra-cpu006",
+ "visterra-cpu007",
+ "visterra-cpu008"
+ ],
+ "node_group": "visterra-cpu",
+ "partitions": [
+ "cpu-preempt",
+ "visterra"
+ ],
+ "features": [
+ "10gbps",
+ "avx512",
+ "cascadelake",
+ "ib",
+ "intel",
+ "intel6248r",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "x86_64_v4"
+ ],
+ "cores": 48,
+ "ram": 370,
+ "cpus": "2x Intel(R) Xeon(R) Gold 6248R (48 core)",
+ "general_use": false,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": true,
+ "owner": "Visterra",
+ "owner_email": ""
+ },
+ {
+ "nodes_folded": "zhoulin-cpu[001-006]",
+ "count": 6,
+ "nodes": [
+ "zhoulin-cpu001",
+ "zhoulin-cpu002",
+ "zhoulin-cpu003",
+ "zhoulin-cpu004",
+ "zhoulin-cpu005",
+ "zhoulin-cpu006"
+ ],
+ "node_group": "zhoulin-cpu",
+ "partitions": [
+ "cpu-preempt",
+ "zhoulin-cpu"
+ ],
+ "features": [
+ "10gbps",
+ "25gbps",
+ "amd",
+ "amd7702",
+ "x86_64",
+ "x86_64_v2",
+ "x86_64_v3",
+ "zen2"
+ ],
+ "cores": 128,
+ "ram": 500,
+ "cpus": "2x AMD EPYC 7702 64-Core Processor (128 core)",
+ "general_use": false,
+ "gpu_model": null,
+ "gpu_count": 0,
+ "vram": null,
+ "open_hardware": true,
+ "owner": "Zhou Lin",
+ "owner_email": "zhoulin@umass.edu"
+ }
+]
\ No newline at end of file
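[Reviewer note — not part of the patch.] The node-inventory fixture above is easy to break when edited by hand, since each entry duplicates its node list three ways (`nodes_folded`, `count`, `nodes`). A minimal sanity-check sketch is below; the filename `node_info.json` is an assumption inferred from the sibling `partition_info.json` introduced next, not something this diff confirms.

# Sketch only: sanity-check the node-inventory fixture. The path is an
# assumption (inferred from the sibling partition_info.json fixture).
import json
from pathlib import Path

node_info_path = Path("tests/mock_data/mock_remote_configs/node_info.json")  # assumed name
node_groups = json.loads(node_info_path.read_text())

for entry in node_groups:
    # Each folded group's declared count should match its expanded node list.
    assert entry["count"] == len(entry["nodes"]), entry["nodes_folded"]
    # Entries without GPUs should carry no per-GPU VRAM figure.
    if entry["gpu_count"] == 0:
        assert entry["vram"] is None, entry["nodes_folded"]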
diff --git a/tests/mock_data/mock_remote_configs/partition_info.json b/tests/mock_data/mock_remote_configs/partition_info.json
new file mode 100644
index 0000000..edfff1d
--- /dev/null
+++ b/tests/mock_data/mock_remote_configs/partition_info.json
@@ -0,0 +1,317 @@
+[
+ {
+ "name": "arm-gpu",
+ "type": "gpu",
+ "node_count": 3,
+ "maxtime": "14 days",
+ "deftime": "1 hour",
+ "max_ram": 560,
+ "max_cpus": 72
+ },
+ {
+ "name": "arm-preempt",
+ "type": "cpu",
+ "node_count": 3,
+ "maxtime": "14 days",
+ "deftime": "1 hour",
+ "max_ram": 250,
+ "max_cpus": 80
+ },
+ {
+ "name": "astroth-cpu",
+ "type": "cpu",
+ "node_count": 9,
+ "maxtime": "30 days",
+ "deftime": "1 hour",
+ "max_ram": 1000,
+ "max_cpus": 128
+ },
+ {
+ "name": "astroth-gpu",
+ "type": "gpu",
+ "node_count": 3,
+ "maxtime": "30 days",
+ "deftime": "1 hour",
+ "max_ram": 30,
+ "max_cpus": 8
+ },
+ {
+ "name": "cbio-cpu",
+ "type": "cpu",
+ "node_count": 5,
+ "maxtime": "30 days",
+ "deftime": "1 hour",
+ "max_ram": 500,
+ "max_cpus": 128
+ },
+ {
+ "name": "cbio-gpu",
+ "type": "gpu",
+ "node_count": 1,
+ "maxtime": "30 days",
+ "deftime": "1 hour",
+ "max_ram": 500,
+ "max_cpus": 128
+ },
+ {
+ "name": "ceewater_casey-cpu",
+ "type": "cpu",
+ "node_count": 10,
+ "maxtime": "30 days",
+ "deftime": "1 hour",
+ "max_ram": 500,
+ "max_cpus": 128
+ },
+ {
+ "name": "ceewater_cjgleason-cpu",
+ "type": "cpu",
+ "node_count": 10,
+ "maxtime": "30 days",
+ "deftime": "1 hour",
+ "max_ram": 500,
+ "max_cpus": 128
+ },
+ {
+ "name": "ceewater_kandread-cpu",
+ "type": "cpu",
+ "node_count": 10,
+ "maxtime": "30 days",
+ "deftime": "1 hour",
+ "max_ram": 500,
+ "max_cpus": 128
+ },
+ {
+ "name": "cpu",
+ "type": "cpu",
+ "node_count": 147,
+ "maxtime": "2 days",
+ "deftime": "1 hour",
+ "max_ram": 1510,
+ "max_cpus": 192
+ },
+ {
+ "name": "cpu-preempt",
+ "type": "cpu",
+ "node_count": 138,
+ "maxtime": "2 days",
+ "deftime": "1 hour",
+ "max_ram": 1510,
+ "max_cpus": 224
+ },
+ {
+ "name": "ece-gpu",
+ "type": "gpu",
+ "node_count": 2,
+ "maxtime": "5 days",
+ "deftime": "1 hour",
+ "max_ram": 370,
+ "max_cpus": 32
+ },
+ {
+ "name": "fsi-lab",
+ "type": "cpu",
+ "node_count": 1,
+ "maxtime": "30 days",
+ "deftime": "1 hour",
+ "max_ram": 1510,
+ "max_cpus": 192
+ },
+ {
+ "name": "gaoseismolab-cpu",
+ "type": "cpu",
+ "node_count": 5,
+ "maxtime": "30 days",
+ "deftime": "1 hour",
+ "max_ram": 500,
+ "max_cpus": 64
+ },
+ {
+ "name": "gpu",
+ "type": "gpu",
+ "node_count": 153,
+ "maxtime": "2 days",
+ "deftime": "1 hour",
+ "max_ram": 2010,
+ "max_cpus": 112
+ },
+ {
+ "name": "gpu-preempt",
+ "type": "gpu",
+ "node_count": 78,
+ "maxtime": "2 days",
+ "deftime": "1 hour",
+ "max_ram": 2010,
+ "max_cpus": 128
+ },
+ {
+ "name": "superpod-a100",
+ "type": "gpu",
+ "node_count": 12,
+ "maxtime": "30 days",
+ "deftime": "1 hour",
+ "max_ram": 2010,
+ "max_cpus": 112
+ },
+ {
+ "name": "gpupod-l40s",
+ "type": "gpu",
+ "node_count": 10,
+ "maxtime": "14 days",
+ "deftime": "1 hour",
+ "max_ram": 500,
+ "max_cpus": 32
+ },
+ {
+ "name": "ials-gpu",
+ "type": "gpu",
+ "node_count": 31,
+ "maxtime": "14 days",
+ "deftime": "1 hour",
+ "max_ram": 500,
+ "max_cpus": 32
+ },
+ {
+ "name": "jdelhommelle",
+ "type": "cpu",
+ "node_count": 1,
+ "maxtime": "30 days",
+ "deftime": "1 hour",
+ "max_ram": 1510,
+ "max_cpus": 224
+ },
+ {
+ "name": "lan",
+ "type": "gpu",
+ "node_count": 1,
+ "maxtime": "30 days",
+ "deftime": "1 hour",
+ "max_ram": 500,
+ "max_cpus": 32
+ },
+ {
+ "name": "mpi",
+ "type": "cpu",
+ "node_count": 13,
+ "maxtime": "2 days",
+ "deftime": "1 hour",
+ "max_ram": 250,
+ "max_cpus": 64
+ },
+ {
+ "name": "power9",
+ "type": "cpu",
+ "node_count": 2,
+ "maxtime": "30 days",
+ "deftime": "1 hour",
+ "max_ram": 250,
+ "max_cpus": 32
+ },
+ {
+ "name": "power9-gpu",
+ "type": "gpu",
+ "node_count": 16,
+ "maxtime": "30 days",
+ "deftime": "1 hour",
+ "max_ram": 510,
+ "max_cpus": 40
+ },
+ {
+ "name": "power9-gpu-osg",
+ "type": "gpu",
+ "node_count": 2,
+ "maxtime": "30 days",
+ "deftime": "1 hour",
+ "max_ram": 250,
+ "max_cpus": 32
+ },
+ {
+ "name": "power9-gpu-preempt",
+ "type": "gpu",
+ "node_count": 16,
+ "maxtime": "2 days",
+ "deftime": "1 hour",
+ "max_ram": 510,
+ "max_cpus": 40
+ },
+ {
+ "name": "toltec-cpu",
+ "type": "cpu",
+ "node_count": 7,
+ "maxtime": "30 days",
+ "deftime": "1 hour",
+ "max_ram": 370,
+ "max_cpus": 32
+ },
+ {
+ "name": "umd-cscdr-arm",
+ "type": "cpu",
+ "node_count": 3,
+ "maxtime": "30 days",
+ "deftime": "1 hour",
+ "max_ram": 250,
+ "max_cpus": 80
+ },
+ {
+ "name": "umd-cscdr-cpu",
+ "type": "cpu",
+ "node_count": 35,
+ "maxtime": "30 days",
+ "deftime": "1 hour",
+ "max_ram": 250,
+ "max_cpus": 64
+ },
+ {
+ "name": "umd-cscdr-gpu",
+ "type": "gpu",
+ "node_count": 10,
+ "maxtime": "30 days",
+ "deftime": "1 hour",
+ "max_ram": 2000,
+ "max_cpus": 128
+ },
+ {
+ "name": "uri-cpu",
+ "type": "cpu",
+ "node_count": 43,
+ "maxtime": "30 days",
+ "deftime": "1 hour",
+ "max_ram": 1000,
+ "max_cpus": 64
+ },
+ {
+ "name": "uri-gpu",
+ "type": "gpu",
+ "node_count": 11,
+ "maxtime": "30 days",
+ "deftime": "1 hour",
+ "max_ram": 1000,
+ "max_cpus": 64
+ },
+ {
+ "name": "uri-richamp",
+ "type": "cpu",
+ "node_count": 5,
+ "maxtime": "Unlimited",
+ "deftime": "1 hour",
+ "max_ram": 120,
+ "max_cpus": 64
+ },
+ {
+ "name": "visterra",
+ "type": "cpu",
+ "node_count": 8,
+ "maxtime": "30 days",
+ "deftime": "1 hour",
+ "max_ram": 370,
+ "max_cpus": 48
+ },
+ {
+ "name": "zhoulin-cpu",
+ "type": "cpu",
+ "node_count": 6,
+ "maxtime": "30 days",
+ "deftime": "1 hour",
+ "max_ram": 500,
+ "max_cpus": 128
+ }
+]
\ No newline at end of file
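[Reviewer note — not part of the patch.] With both fixtures in place, a test or the notebook can inspect partition limits directly. The sketch below shows one plausible way to load `partition_info.json` into a pandas DataFrame and list the GPU partitions by RAM ceiling; it is illustrative only, not the fixture-loading code this PR adds (that lives in the relocated tests below).

# Illustrative only: load the partition fixture into a DataFrame, mirroring
# the pandas style used elsewhere in the notebook.
import json
from pathlib import Path
import pandas as pd

partition_path = Path("tests/mock_data/mock_remote_configs/partition_info.json")
partitions_df = pd.DataFrame(json.loads(partition_path.read_text()))

# GPU partitions, largest RAM ceiling first.
gpu_partitions = (
    partitions_df.loc[partitions_df["type"] == "gpu"]
    .sort_values("max_ram", ascending=False)
    .loc[:, ["name", "node_count", "maxtime", "max_ram", "max_cpus"]]
)
print(gpu_partitions.head())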
diff --git a/tests/test_approx_allocated_vram.py b/tests/preprocess/test_approx_allocated_vram.py
similarity index 98%
rename from tests/test_approx_allocated_vram.py
rename to tests/preprocess/test_approx_allocated_vram.py
index ccf1b1a..41d5f8f 100644
--- a/tests/test_approx_allocated_vram.py
+++ b/tests/preprocess/test_approx_allocated_vram.py
@@ -1,6 +1,6 @@
import pandas as pd
-from src.preprocess.preprocess import _get_approx_allocated_vram
+from src.preprocess.allocated_vram import _get_approx_allocated_vram
# Test for single node VRAM allocation
diff --git a/tests/test_preprocess.py b/tests/preprocess/test_preprocess.py
similarity index 99%
rename from tests/test_preprocess.py
rename to tests/preprocess/test_preprocess.py
index 6cb51d9..18368e8 100644
--- a/tests/test_preprocess.py
+++ b/tests/preprocess/test_preprocess.py
@@ -16,7 +16,7 @@
)
from src.preprocess import preprocess_data
from src.preprocess.preprocess import _get_partition_constraint, _get_requested_vram, _get_vram_constraint
-from .conftest import preprocess_mock_data
+from ..conftest import preprocess_mock_data
@pytest.mark.parametrize("mock_data_path", [False, True], ids=["false_case", "true_case"], indirect=True)