Skip to content

Commit f7a9425

Browse files
committed
Remove dead-code methods in efficiency analysis that were replaced by sort_and_filter_records_with_metrics
1 parent cb119d8 commit f7a9425

File tree

5 files changed

+13
-160
lines changed

5 files changed

+13
-160
lines changed

notebooks/analysis/No VRAM Use Analysis.ipynb

Lines changed: 9 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -108,7 +108,7 @@
108108
"source": [
109109
"# Load the jobs DataFrame from DuckDB\n",
110110
"preprocessed_jobs_df = ea.load_preprocessed_jobs_dataframe_from_duckdb(\n",
111-
" db_path=Path(project_root) / \"data/slurm_data_small.db\", table_name=\"Jobs\", anonymize=True\n",
111+
" db_path=Path(project_root) / \"data/slurm_data.db\", table_name=\"Jobs\", anonymize=True\n",
112112
")\n",
113113
"display(preprocessed_jobs_df.head(10))\n",
114114
"print(preprocessed_jobs_df.shape)"
@@ -261,9 +261,14 @@
261261
"metadata": {},
262262
"outputs": [],
263263
"source": [
264-
"inefficient_users_vram_hours = efficiency_analysis.find_inefficient_users_by_vram_hours(\n",
265-
" vram_hours_filter={\"min\": 200, \"inclusive\": True}, # VRAM-hours threshold for identifying inefficient users\n",
266-
" min_jobs=5, # Minimum number of jobs to consider a user\n",
264+
"inefficient_users_vram_hours = efficiency_analysis.sort_and_filter_records_with_metrics(\n",
265+
" metrics_df_name_enum=MetricsDataFrameNameEnum.USERS,\n",
266+
" sorting_key=\"vram_hours\",\n",
267+
" ascending=False, # Sort by vram_hours in descending order\n",
268+
" filter_criteria={\n",
269+
" \"vram_hours\": {\"min\": 200, \"inclusive\": True}, # VRAM-hours threshold for identifying inefficient users\n",
270+
" \"job_count\": {\"min\": 5, \"inclusive\": True}, # Job count threshold for identifying inefficient users\n",
271+
" },\n",
267272
")\n",
268273
"# Display top inefficient users by VRAM-hours\n",
269274
"print(\"\\nTop inefficient users by VRAM-hours:\")\n",

notebooks/analysis/Requested and Used VRAM.ipynb

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -231,7 +231,7 @@
231231
"metadata": {},
232232
"outputs": [],
233233
"source": [
234-
"inefficient_jobs_vram_hours = analyzer.sort_and_filter_records_with_metrics(\n",
234+
"inefficient_jobs_req_vram_score = analyzer.sort_and_filter_records_with_metrics(\n",
235235
" metrics_df_name_enum=ResourceHoardingDataFrameNameEnum.JOBS,\n",
236236
" sorting_key=\"requested_vram_efficiency_score\",\n",
237237
" ascending=True, # Sort by requested_vram_efficiency_score in ascending order\n",
@@ -240,7 +240,7 @@
240240
" },\n",
241241
")\n",
242242
"# Plot top inefficient jobs by requested VRAM efficiency score, with VRAM-hours as labels\n",
243-
"jobs_with_metrics_visualizer = JobsWithMetricsVisualizer(inefficient_jobs_vram_hours.head(10))\n",
243+
"jobs_with_metrics_visualizer = JobsWithMetricsVisualizer(inefficient_jobs_req_vram_score.head(10))\n",
244244
"jobs_with_metrics_visualizer.visualize(\n",
245245
" output_dir_path=JOBS_VISUALIZATION_DATA_DIR,\n",
246246
" column=\"requested_vram_efficiency_score\",\n",

notebooks/module_demos/Attribute Visualization.ipynb

Lines changed: 1 addition & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -133,10 +133,7 @@
133133
"metadata": {},
134134
"outputs": [],
135135
"source": [
136-
"clean_jobs_df = Preprocess().preprocess_data(\n",
137-
" jobs_df,\n",
138-
" min_elapsed_seconds=600,\n",
139-
" anonymize=True)\n",
136+
"clean_jobs_df = Preprocess().preprocess_data(jobs_df, min_elapsed_seconds=600, anonymize=True)\n",
140137
"display(clean_jobs_df)\n",
141138
"print(clean_jobs_df.shape)"
142139
]

src/analysis/efficiency_analysis.py

Lines changed: 0 additions & 150 deletions
Original file line number | Diff line number | Diff line change
@@ -507,105 +507,6 @@ def calculate_user_efficiency_metrics(self) -> pd.DataFrame:
507507
self.users_with_efficiency_metrics = users_w_efficiency_metrics
508508
return self.users_with_efficiency_metrics
509509

510-
def find_inefficient_users_by_alloc_vram_efficiency(
511-
self, alloc_vram_efficiency_filter: int | float | dict | None, min_jobs: int = 5
512-
) -> pd.DataFrame:
513-
"""
514-
Identify users with low expected allocated VRAM efficiency across their jobs compared to others
515-
516-
Args:
517-
alloc_vram_efficiency_filter:
518-
- int | float : select rows where expected_value_alloc_vram_efficiency == value
519-
- dict with 'min'/'max' and required 'inclusive' (bool): select rows in the range
520-
min_jobs (int): Minimum number of jobs a user must have to be included in the analysis
521-
522-
Returns:
523-
pd.DataFrame: DataFrame with users and their average VRAM efficiency
524-
525-
Raises:
526-
ValueError: If the filter for expected_value_alloc_vram_efficiency is invalid.
527-
"""
528-
if self.users_with_efficiency_metrics is None:
529-
self.calculate_user_efficiency_metrics()
530-
print(
531-
"Users DataFrame with efficiency metrics was not available. "
532-
"Calculated it using the DataFrame of jobs with efficiency metrics."
533-
)
534-
535-
mask = pd.Series(
536-
[True] * len(self.users_with_efficiency_metrics), index=self.users_with_efficiency_metrics.index
537-
)
538-
539-
if alloc_vram_efficiency_filter is not None:
540-
try:
541-
mask &= EfficiencyAnalysis.apply_numeric_filter(
542-
self.users_with_efficiency_metrics["expected_value_alloc_vram_efficiency"],
543-
alloc_vram_efficiency_filter,
544-
{FilterTypeEnum.NUMERIC_SCALAR, FilterTypeEnum.DICTIONARY},
545-
filter_name="expected_value_alloc_vram_efficiency",
546-
)
547-
except ValueError as e:
548-
raise ValueError("Invalid filter for expected_value_alloc_vram_efficiency.") from e
549-
550-
col = self.users_with_efficiency_metrics["job_count"]
551-
mask &= col.ge(min_jobs)
552-
553-
inefficient_users = self.users_with_efficiency_metrics[mask]
554-
555-
# Sort by the metric ascending (lower is worse)
556-
inefficient_users = inefficient_users.sort_values("expected_value_alloc_vram_efficiency", ascending=True)
557-
return inefficient_users
558-
559-
def find_inefficient_users_by_vram_hours(
560-
self, vram_hours_filter: int | float | dict = 200, min_jobs: int = 5
561-
) -> pd.DataFrame:
562-
"""
563-
Identify users with high VRAM-hours across their jobs compared to others.
564-
565-
Args:
566-
vram_hours_filter:
567-
- None: no filtering on vram_hours
568-
- int | float: select rows where vram_hours == value
569-
- dict with 'min'/'max' and required 'inclusive' (bool): select rows in the range
570-
min_jobs (int): Minimum number of jobs a user must have to be included in the analysis
571-
572-
Returns:
573-
pd.DataFrame: DataFrame with users and their total VRAM hours
574-
575-
Raises:
576-
ValueError: If the filter is invalid
577-
"""
578-
if self.users_with_efficiency_metrics is None:
579-
self.calculate_user_efficiency_metrics()
580-
print(
581-
"Users DataFrame with efficiency metrics was not available. "
582-
"Calculated it using the DataFrame of jobs with efficiency metrics."
583-
)
584-
585-
mask = pd.Series(
586-
[True] * len(self.users_with_efficiency_metrics), index=self.users_with_efficiency_metrics.index
587-
)
588-
589-
if vram_hours_filter is not None:
590-
try:
591-
mask &= EfficiencyAnalysis.apply_numeric_filter(
592-
self.users_with_efficiency_metrics["vram_hours"],
593-
vram_hours_filter,
594-
{FilterTypeEnum.NUMERIC_SCALAR, FilterTypeEnum.DICTIONARY},
595-
filter_name="vram_hours_filter",
596-
)
597-
except ValueError as e:
598-
raise ValueError("Invalid filter for vram_hours.") from e
599-
600-
col = self.users_with_efficiency_metrics["job_count"]
601-
mask &= col.ge(min_jobs)
602-
603-
inefficient_users = self.users_with_efficiency_metrics[mask]
604-
605-
# Sort by the metric descending (higher is worse)
606-
inefficient_users = inefficient_users.sort_values("vram_hours", ascending=False)
607-
return inefficient_users
608-
609510
def calculate_all_efficiency_metrics(
610511
self,
611512
filtered_jobs: pd.DataFrame,
@@ -722,57 +623,6 @@ def calculate_pi_account_efficiency_metrics(self) -> pd.DataFrame:
722623
self.pi_accounts_with_efficiency_metrics = pi_efficiency_metrics
723624
return self.pi_accounts_with_efficiency_metrics
724625

725-
def find_inefficient_pis_by_vram_hours(
726-
self, vram_hours_filter: int | float | dict = 200, min_jobs: int = 5
727-
) -> pd.DataFrame:
728-
"""
729-
Identify inefficient PI accounts based on VRAM hours.
730-
731-
Args:
732-
vram_hours_filter:
733-
- None: no filtering on vram_hours
734-
- int | float: select rows where pi_acc_vram_hours == value
735-
- dict with 'min'/'max' and required 'inclusive' (bool): select rows in the range
736-
min_jobs (int): Minimum number of jobs a PI account must have to be included in the analysis
737-
738-
Returns:
739-
pd.DataFrame: DataFrame with PI accounts and their VRAM hours
740-
741-
Raises:
742-
ValueError: If the filter is invalid
743-
"""
744-
if self.pi_accounts_with_efficiency_metrics is None:
745-
self.calculate_pi_account_efficiency_metrics()
746-
print(
747-
"PI accounts with efficiency metrics DataFrame was not available. "
748-
"Calculated it using the DataFrame of users with efficiency metrics."
749-
)
750-
751-
mask = pd.Series(
752-
[True] * len(self.pi_accounts_with_efficiency_metrics),
753-
index=self.pi_accounts_with_efficiency_metrics.index,
754-
)
755-
756-
if vram_hours_filter is not None:
757-
try:
758-
mask &= EfficiencyAnalysis.apply_numeric_filter(
759-
self.pi_accounts_with_efficiency_metrics["pi_acc_vram_hours"],
760-
vram_hours_filter,
761-
{FilterTypeEnum.NUMERIC_SCALAR, FilterTypeEnum.DICTIONARY},
762-
filter_name="pi_acc_vram_hours_filter",
763-
)
764-
except ValueError as e:
765-
raise ValueError("Invalid filter for pi_acc_vram_hours.") from e
766-
767-
col = self.pi_accounts_with_efficiency_metrics["job_count"]
768-
mask &= col.ge(min_jobs)
769-
770-
inefficient_pi_accounts = self.pi_accounts_with_efficiency_metrics[mask]
771-
772-
# Sort by the metric descending (higher is worse)
773-
inefficient_pi_accounts = inefficient_pi_accounts.sort_values("pi_acc_vram_hours", ascending=False)
774-
return inefficient_pi_accounts
775-
776626
def sort_and_filter_records_with_metrics(
777627
self,
778628
metrics_df_name_enum: MetricsDFNameEnumT,

src/visualization/efficiency_metrics.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -88,6 +88,7 @@ def _human_readable_value(val: object) -> str:
8888
# numpy / pandas NA
8989
try:
9090
import pandas as _pd # local import to avoid circular issues
91+
9192
try:
9293
_tmp_val = val # help type checkers
9394
isna_func = getattr(_pd, "isna", None)

0 commit comments

Comments
 (0)