Skip to content

Commit 4e45a93

Browse files
authored
Merge pull request #22 from UnityHPC/feature/high-cpu-mem-analysis
Feature/high cpu mem analysis
2 parents 80d42bd + 829025c commit 4e45a93

24 files changed

+6139
-1184
lines changed

notebooks/Efficiency Analysis.ipynb

Lines changed: 617 additions & 616 deletions
Large diffs are not rendered by default.

notebooks/Resource Hoarding.ipynb

Lines changed: 367 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,367 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"id": "0",
6+
"metadata": {},
7+
"source": [
8+
"# <a id='toc1_'></a>[Resource Hoarding Analysis](#toc0_)\n",
9+
"This notebook demonstrates how to use the `ResourceHoarding` class in `src/analysis/hoarding.py` to analyze jobs and users that hoard resources by requesting a disproportionate amount of CPU memory and CPU cores."
10+
]
11+
},
12+
{
13+
"cell_type": "markdown",
14+
"id": "1",
15+
"metadata": {},
16+
"source": [
17+
"**Table of contents**<a id='toc0_'></a> \n",
18+
"- [Resource Hoarding Analysis](#toc1_) \n",
19+
" - [Setup](#toc1_1_) \n",
20+
" - [Filter jobs for resource hoarding analysis](#toc1_1_1_) \n",
21+
" - [Analyze Jobs Hoarding Resources:](#toc1_2_) \n",
22+
" - [Generate all hoarding analysis metrics for jobs:](#toc1_2_1_1_) \n",
23+
" - [Find most inefficient jobs hoarding node RAM based on `ram_hoarding_fraction_diff`](#toc1_2_1_2_) \n",
24+
" - [Find most inefficient jobs hoarding CPU cores based on `core_hoarding_fraction_diff`](#toc1_2_1_3_) \n",
25+
" - [Analyze Users Hoarding Resources:](#toc1_3_) \n",
26+
" - [Generate all hoarding analysis metrics for users:](#toc1_3_1_1_) \n",
27+
" - [Find most inefficient users hoarding node RAM based on `expected_value_ram_hoarding_fraction_diff`](#toc1_3_1_2_) \n",
28+
" - [Find most inefficient users hoarding CPU cores based on `expected_value_core_hoarding_fraction_diff`](#toc1_3_1_3_) \n",
29+
"\n",
30+
"<!-- vscode-jupyter-toc-config\n",
31+
"\tnumbering=false\n",
32+
"\tanchor=true\n",
33+
"\tflat=false\n",
34+
"\tminLevel=1\n",
35+
"\tmaxLevel=6\n",
36+
"\t/vscode-jupyter-toc-config -->\n",
37+
"<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->"
38+
]
39+
},
40+
{
41+
"cell_type": "markdown",
42+
"id": "2",
43+
"metadata": {},
44+
"source": [
45+
"## <a id='toc1_1_'></a>[Setup](#toc0_)"
46+
]
47+
},
48+
{
49+
"cell_type": "code",
50+
"execution_count": null,
51+
"id": "3",
52+
"metadata": {},
53+
"outputs": [],
54+
"source": [
55+
"# Import required modules\n",
56+
"import sys\n",
57+
"from pathlib import Path\n",
58+
"import pandas as pd\n",
59+
"\n",
60+
"# import matplotlib.pyplot as plt\n",
61+
"# import seaborn as sns\n",
62+
"import os"
63+
]
64+
},
65+
{
66+
"cell_type": "markdown",
67+
"id": "4",
68+
"metadata": {},
69+
"source": [
70+
"The Jupyter server should be started from the `notebooks/` directory so that the following cell prints the project root:"
71+
]
72+
},
73+
{
74+
"cell_type": "code",
75+
"execution_count": null,
76+
"id": "5",
77+
"metadata": {},
78+
"outputs": [],
79+
"source": [
80+
"project_root = str(Path.cwd().resolve().parent)\n",
81+
"print(f\"Project root: {project_root}\")\n",
82+
"os.environ[\"OUTPUT_MODE\"] = \"\""
83+
]
84+
},
85+
{
86+
"cell_type": "code",
87+
"execution_count": null,
88+
"id": "6",
89+
"metadata": {},
90+
"outputs": [],
91+
"source": [
92+
"# Automatically reload modules before executing code (set this up BEFORE imports)\n",
93+
"%load_ext autoreload\n",
94+
"%autoreload 2\n",
95+
"\n",
96+
"# Add project root to sys.path for module imports\n",
97+
"if project_root not in sys.path:\n",
98+
" sys.path.insert(0, project_root)\n",
99+
"\n",
100+
"from src.analysis import ResourceHoarding as ResourceHoarding\n",
101+
"from src.analysis import efficiency_analysis as ea\n",
102+
"from src.visualization import JobsWithMetricsVisualizer, UsersWithMetricsVisualizer\n",
103+
"from src.config.enum_constants import ResourceHoardingDataFrameNameEnum"
104+
]
105+
},
106+
{
107+
"cell_type": "code",
108+
"execution_count": null,
109+
"id": "7",
110+
"metadata": {},
111+
"outputs": [],
112+
"source": [
113+
"# Load the jobs DataFrame from DuckDB\n",
114+
"preprocessed_jobs_df = ea.load_preprocessed_jobs_dataframe_from_duckdb(\n",
115+
" db_path=\"../data/slurm_data.db\",\n",
116+
" table_name=\"Jobs\",\n",
117+
")\n",
118+
"display(preprocessed_jobs_df.head(10))\n",
119+
"print(preprocessed_jobs_df.shape)"
120+
]
121+
},
122+
{
123+
"cell_type": "markdown",
124+
"id": "8",
125+
"metadata": {},
126+
"source": [
127+
"### <a id='toc1_1_1_'></a>[Filter jobs for resource hoarding analysis](#toc0_)"
128+
]
129+
},
130+
{
131+
"cell_type": "code",
132+
"execution_count": null,
133+
"id": "9",
134+
"metadata": {},
135+
"outputs": [],
136+
"source": [
137+
"hoarding_analysis = ResourceHoarding(jobs_df=preprocessed_jobs_df)"
138+
]
139+
},
140+
{
141+
"cell_type": "code",
142+
"execution_count": null,
143+
"id": "10",
144+
"metadata": {},
145+
"outputs": [],
146+
"source": [
147+
"filtered_jobs = hoarding_analysis.filter_jobs_for_analysis()\n",
148+
"filtered_jobs"
149+
]
150+
},
151+
{
152+
"cell_type": "markdown",
153+
"id": "11",
154+
"metadata": {},
155+
"source": [
156+
"## <a id='toc1_2_'></a>[Analyze Jobs Hoarding Resources:](#toc0_)\n"
157+
]
158+
},
159+
{
160+
"cell_type": "markdown",
161+
"id": "12",
162+
"metadata": {},
163+
"source": [
164+
"#### <a id='toc1_2_1_1_'></a>[Generate all hoarding analysis metrics for jobs:](#toc0_)"
165+
]
166+
},
167+
{
168+
"cell_type": "code",
169+
"execution_count": null,
170+
"id": "13",
171+
"metadata": {},
172+
"outputs": [],
173+
"source": [
174+
"memory_hoarding_jobs = hoarding_analysis.calculate_node_resource_hoarding_for_jobs(filtered_jobs)\n",
175+
"\n",
176+
"# Set option to display all columns\n",
177+
"pd.set_option(\"display.max_columns\", None)\n",
178+
"# Display the DataFrame\n",
179+
"display(memory_hoarding_jobs.head(10))\n",
180+
"# To revert to default settings (optional)\n",
181+
"pd.reset_option(\"display.max_columns\")\n",
182+
"\n",
183+
"print(f\"Jobs found: {len(memory_hoarding_jobs)}\")"
184+
]
185+
},
186+
{
187+
"cell_type": "markdown",
188+
"id": "14",
189+
"metadata": {},
190+
"source": [
191+
"#### <a id='toc1_2_1_2_'></a>[Find most inefficient jobs hoarding node RAM based on `ram_hoarding_fraction_diff`](#toc0_)"
192+
]
193+
},
194+
{
195+
"cell_type": "code",
196+
"execution_count": null,
197+
"id": "15",
198+
"metadata": {},
199+
"outputs": [],
200+
"source": [
201+
"inefficient_jobs_hoarding_ram = hoarding_analysis.sort_and_filter_records_with_metrics(\n",
202+
" metrics_df_name_enum=ResourceHoardingDataFrameNameEnum.JOBS_WITH_RESOURCE_HOARDING_METRICS,\n",
203+
" sorting_key=\"ram_hoarding_fraction_diff\",\n",
204+
" ascending=False, # Sort in descending order\n",
205+
" filter_criteria={\"ram_hoarding_fraction_diff\": {\"min\": 0, \"inclusive\": True}},\n",
206+
")\n",
207+
"# Display top inefficient jobs by RAM hoarding fraction\n",
208+
"print(\"\\nTop inefficient Jobs by RAM hoarding fraction:\")\n",
209+
"display(inefficient_jobs_hoarding_ram.head(10))\n",
210+
"\n",
211+
"# Plot top inefficient jobs by RAM hoarding fraction, with RAM hoarding fraction as labels\n",
212+
"jobs_with_metrics_visualizer = JobsWithMetricsVisualizer(inefficient_jobs_hoarding_ram.head(20))\n",
213+
"jobs_with_metrics_visualizer.visualize(\n",
214+
" column=\"ram_hoarding_fraction_diff\",\n",
215+
" bar_label_columns=[\"ram_hoarding_fraction_diff\", \"cpu_mem_efficiency\", \"alloc_vram_efficiency\"],\n",
216+
" figsize=(12, 12),\n",
217+
")"
218+
]
219+
},
220+
{
221+
"cell_type": "markdown",
222+
"id": "16",
223+
"metadata": {},
224+
"source": [
225+
"#### <a id='toc1_2_1_3_'></a>[Find most inefficient jobs hoarding CPU cores based on `core_hoarding_fraction_diff`](#toc0_)"
226+
]
227+
},
228+
{
229+
"cell_type": "code",
230+
"execution_count": null,
231+
"id": "17",
232+
"metadata": {},
233+
"outputs": [],
234+
"source": [
235+
"inefficient_jobs_hoarding_cpu_cores = hoarding_analysis.sort_and_filter_records_with_metrics(\n",
236+
" metrics_df_name_enum=ResourceHoardingDataFrameNameEnum.JOBS_WITH_RESOURCE_HOARDING_METRICS,\n",
237+
" sorting_key=\"core_hoarding_fraction_diff\",\n",
238+
" ascending=False, # Sort in descending order\n",
239+
" filter_criteria={\"core_hoarding_fraction_diff\": {\"min\": 0, \"inclusive\": True}},\n",
240+
")\n",
241+
"# Display top inefficient jobs by CPU core hoarding fraction\n",
242+
"print(\"\\nTop inefficient Jobs by CPU core hoarding fraction:\")\n",
243+
"display(inefficient_jobs_hoarding_cpu_cores.head(10))\n",
244+
"\n",
245+
"# Plot top inefficient jobs by CPU core hoarding fraction, with CPU core hoarding fraction as labels\n",
246+
"jobs_with_metrics_visualizer = JobsWithMetricsVisualizer(inefficient_jobs_hoarding_cpu_cores.head(20))\n",
247+
"jobs_with_metrics_visualizer.visualize(\n",
248+
" column=\"core_hoarding_fraction_diff\",\n",
249+
" bar_label_columns=[\"core_hoarding_fraction_diff\", \"ram_hoarding_fraction_diff\", \"alloc_vram_efficiency\"],\n",
250+
" figsize=(12, 12),\n",
251+
")"
252+
]
253+
},
254+
{
255+
"cell_type": "markdown",
256+
"id": "18",
257+
"metadata": {},
258+
"source": [
259+
"## <a id='toc1_3_'></a>[Analyze Users Hoarding Resources:](#toc0_)\n"
260+
]
261+
},
262+
{
263+
"cell_type": "markdown",
264+
"id": "19",
265+
"metadata": {},
266+
"source": [
267+
"#### <a id='toc1_3_1_1_'></a>[Generate all hoarding analysis metrics for users:](#toc0_)"
268+
]
269+
},
270+
{
271+
"cell_type": "code",
272+
"execution_count": null,
273+
"id": "20",
274+
"metadata": {},
275+
"outputs": [],
276+
"source": [
277+
"memory_hoarding_users = hoarding_analysis.calculate_node_resource_hoarding_for_users(filtered_jobs)\n",
278+
"display(memory_hoarding_users)"
279+
]
280+
},
281+
{
282+
"cell_type": "markdown",
283+
"id": "21",
284+
"metadata": {},
285+
"source": [
286+
"#### <a id='toc1_3_1_2_'></a>[Find most inefficient users hoarding node RAM based on `expected_value_ram_hoarding_fraction_diff`](#toc0_)"
287+
]
288+
},
289+
{
290+
"cell_type": "code",
291+
"execution_count": null,
292+
"id": "22",
293+
"metadata": {},
294+
"outputs": [],
295+
"source": [
296+
"inefficient_users_hoarding_ram = hoarding_analysis.sort_and_filter_records_with_metrics(\n",
297+
" metrics_df_name_enum=ResourceHoardingDataFrameNameEnum.USERS_WITH_RESOURCE_HOARDING_METRICS,\n",
298+
" sorting_key=\"expected_value_ram_hoarding_fraction_diff\",\n",
299+
" ascending=False, # Sort in descending order\n",
300+
" filter_criteria={\"expected_value_ram_hoarding_fraction_diff\": {\"min\": 0, \"inclusive\": True}},\n",
301+
")\n",
302+
"# Display top inefficient users by RAM hoarding fraction\n",
303+
"\n",
304+
"print(\"\\nTop inefficient Users by RAM hoarding fraction:\")\n",
305+
"display(inefficient_users_hoarding_ram.head(10))\n",
306+
"\n",
307+
"# Plot top inefficient users by RAM hoarding fraction, with RAM hoarding fraction as labels\n",
308+
"users_with_metrics_visualizer = UsersWithMetricsVisualizer(inefficient_users_hoarding_ram.head(20))\n",
309+
"users_with_metrics_visualizer.visualize(\n",
310+
" column=\"expected_value_ram_hoarding_fraction_diff\",\n",
311+
" bar_label_columns=[\n",
312+
" \"expected_value_ram_hoarding_fraction_diff\",\n",
313+
" \"expected_value_core_hoarding_fraction_diff\",\n",
314+
" \"expected_value_alloc_vram_efficiency\",\n",
315+
" ],\n",
316+
" figsize=(14, 12),\n",
317+
")"
318+
]
319+
},
320+
{
321+
"cell_type": "markdown",
322+
"id": "23",
323+
"metadata": {},
324+
"source": [
325+
"#### <a id='toc1_3_1_3_'></a>[Find most inefficient users hoarding CPU cores based on `expected_value_core_hoarding_fraction_diff`](#toc0_)"
326+
]
327+
},
328+
{
329+
"cell_type": "code",
330+
"execution_count": null,
331+
"id": "24",
332+
"metadata": {},
333+
"outputs": [],
334+
"source": [
335+
"inefficient_users_hoarding_cpu_cores = hoarding_analysis.sort_and_filter_records_with_metrics(\n",
336+
" metrics_df_name_enum=ResourceHoardingDataFrameNameEnum.USERS_WITH_RESOURCE_HOARDING_METRICS,\n",
337+
" sorting_key=\"expected_value_core_hoarding_fraction_diff\",\n",
338+
" ascending=False, # Sort in descending order\n",
339+
" filter_criteria={\"expected_value_core_hoarding_fraction_diff\": {\"min\": 0, \"inclusive\": True}},\n",
340+
")\n",
341+
"# Display top inefficient users by CPU core hoarding fraction\n",
342+
"\n",
343+
"print(\"\\nTop inefficient Users by CPU core hoarding fraction:\")\n",
344+
"display(inefficient_users_hoarding_cpu_cores.head(10))\n",
345+
"\n",
346+
"# Plot top inefficient users by CPU core hoarding fraction, with CPU core hoarding fraction as labels\n",
347+
"users_with_metrics_visualizer = UsersWithMetricsVisualizer(inefficient_users_hoarding_cpu_cores.head(20))\n",
348+
"users_with_metrics_visualizer.visualize(\n",
349+
" column=\"expected_value_core_hoarding_fraction_diff\",\n",
350+
" bar_label_columns=[\n",
351+
" \"expected_value_core_hoarding_fraction_diff\",\n",
352+
" \"expected_value_ram_hoarding_fraction_diff\",\n",
353+
" \"expected_value_alloc_vram_efficiency\",\n",
354+
" ],\n",
355+
" figsize=(14, 12),\n",
356+
")"
357+
]
358+
}
359+
],
360+
"metadata": {
361+
"language_info": {
362+
"name": "python"
363+
}
364+
},
365+
"nbformat": 4,
366+
"nbformat_minor": 5
367+
}

src/analysis/__init__.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,5 @@
1-
from .efficiency_analysis import EfficiencyAnalysis as EfficiencyAnalysis
1+
from .efficiency_analysis import (
2+
EfficiencyAnalysis as EfficiencyAnalysis,
3+
load_preprocessed_jobs_dataframe_from_duckdb as load_preprocessed_jobs_dataframe_from_duckdb
4+
)
5+
from .hoarding import ResourceHoarding as ResourceHoarding

0 commit comments

Comments
 (0)