
Commit 8b11e87

Author: Benjamin Pachev (committed)
Added missing requirements. Removed incorrect duplicate handling in preprocessing. Removed breaking pydantic changes in EfficiencyAnalysis. Fixed a KeyError in report generation and a check for Quarto output that looked in the wrong place.
1 parent: b482425

6 files changed: 25 additions, 50 deletions

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@ seaborn
 jupyterlab
 jq
 pydantic
+papermill
 requests
 requests-cache
 ipykernel
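
papermill joins the notebook tooling already pinned here (jupyterlab, ipykernel); presumably the report scripts execute parameterized notebooks with it. A minimal sketch of that kind of call, with hypothetical paths and parameters:

import papermill as pm

# Execute a parameterized notebook; the paths and parameter names below are illustrative only.
pm.execute_notebook(
    "templates/user_report.ipynb",        # hypothetical input template
    "reports/user_report_executed.ipynb", # hypothetical executed copy
    parameters={"username": "example_user"},
)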

scripts/generate_user_reports.py

Lines changed: 3 additions & 3 deletions
@@ -218,7 +218,7 @@ def generate_user_report(
         text=True,
         cwd=reports_dir,
     )
-
+    output_path = os.path.join(reports_dir, output_file)
     # Check if command was successful
     if result.returncode != 0:
         print(f" ❌ Quarto rendering failed (code {result.returncode})")
@@ -229,7 +229,7 @@ def generate_user_report(
         return None

     # Check if output file was created
-    if not os.path.exists(output_file_abs):
+    if not os.path.exists(output_path):
         print(" ❌ Output file not created")
         print(f" Expected at: {output_file_abs}")
         return None
@@ -552,4 +552,4 @@ def generate_all_reports(

     print("\n🎉 ALL TASKS COMPLETED!")
     print(f" Output directory: {args.output_dir}")
-    print(" Check the individual function outputs above for details.")
+    print(" Check the individual function outputs above for details.")

src/analysis/efficiency_analysis.py

Lines changed: 2 additions & 27 deletions
@@ -22,7 +22,6 @@
     generate_recommendations,
 )
 from src.config.enum_constants import FilterTypeEnum, MetricsDataFrameNameBase, MetricsDataFrameNameEnum
-from pydantic import validate_call, AfterValidator, SkipValidation
 from src.database import DatabaseConnection
 from src.preprocess.preprocess import preprocess_data

@@ -67,28 +66,6 @@ def load_preprocessed_jobs_dataframe_from_duckdb(
 # Generic type for metrics enums constrained to our abstract base Enum class
 MetricsDFNameEnumT = TypeVar("MetricsDFNameEnumT", bound=MetricsDataFrameNameBase)

-
-def _ensure_concrete_metrics_enum(
-    cls: type[MetricsDFNameEnumT],
-) -> type[MetricsDFNameEnumT]:
-    """Validate that the provided class is a concrete subclass of MetricsDataFrameNameBase.
-
-    Used by Pydantic to validate the enum argument to the constructor.
-
-    Raises:
-        TypeError: If the type is not a subclass of the base, or is the abstract base itself.
-
-    Returns:
-        type[MetricsDFNameEnumT]: The validated enum class.
-    """
-    # Ensure it's a subclass of our abstract base (defensive; helps type checkers and runtime safety)
-    if not isinstance(cls, type) or not issubclass(cls, MetricsDataFrameNameBase):
-        raise TypeError("metrics_df_name_enum must be a subclass of MetricsDataFrameNameBase")
-    if cls is MetricsDataFrameNameBase:
-        raise TypeError("metrics_df_name_enum must be a concrete Enum subclass, not the abstract base")
-    return cls
-
-
 class EfficiencyAnalysis(Generic[MetricsDFNameEnumT]):
     """
     Class to encapsulate the efficiency analysis of jobs based on various metrics.
@@ -98,12 +75,10 @@ class EfficiencyAnalysis(Generic[MetricsDFNameEnumT]):
     The metrics are generated in separate DataFrames for each category in MetricsDataFrameNameEnum.
     """

-    # Apply Pydantic runtime validation for constructor arguments
-    @validate_call(config={"arbitrary_types_allowed": True})
     def __init__(
         self,
-        jobs_df: Annotated[pd.DataFrame, SkipValidation()],
-        metrics_df_name_enum: Annotated[type[MetricsDFNameEnumT], AfterValidator(_ensure_concrete_metrics_enum)],
+        jobs_df: pd.DataFrame,
+        metrics_df_name_enum: type[MetricsDFNameEnumT] = MetricsDataFrameNameEnum
     ) -> None:
         """
         Initialize the EfficiencyAnalysis class.
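
Dropping @validate_call leaves the constructor with plain type hints, and metrics_df_name_enum now defaults to MetricsDataFrameNameEnum. If runtime validation is still wanted without pydantic, a plain issubclass guard covers the same cases; the sketch below uses stand-in enum classes and is illustrative only, not the repository's code:

from enum import Enum

class MetricsDataFrameNameBase(Enum):                      # stand-in for the project's abstract base
    pass

class MetricsDataFrameNameEnum(MetricsDataFrameNameBase):  # stand-in concrete enum
    CPU_METRICS = "cpu_metrics"

def ensure_concrete_metrics_enum(cls: type) -> type:
    # Plain-Python version of the checks the removed pydantic validator performed.
    if not (isinstance(cls, type) and issubclass(cls, MetricsDataFrameNameBase)):
        raise TypeError("metrics_df_name_enum must be a subclass of MetricsDataFrameNameBase")
    if cls is MetricsDataFrameNameBase:
        raise TypeError("metrics_df_name_enum must be a concrete Enum subclass, not the abstract base")
    return cls

ensure_concrete_metrics_enum(MetricsDataFrameNameEnum)     # passes; the abstract base would raise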

src/config/snapshots/partition_info.json

Lines changed: 17 additions & 8 deletions
@@ -1,8 +1,17 @@
 [
+  {
+    "name": "arm",
+    "type": "cpu",
+    "node_count": 9,
+    "maxtime": "14 days",
+    "deftime": "1 hour",
+    "max_ram": 470,
+    "max_cpus": 144
+  },
   {
     "name": "arm-gpu",
     "type": "gpu",
-    "node_count": 3,
+    "node_count": 4,
     "maxtime": "14 days",
     "deftime": "1 hour",
     "max_ram": 560,
@@ -83,7 +92,7 @@
   {
     "name": "cpu",
     "type": "cpu",
-    "node_count": 147,
+    "node_count": 153,
     "maxtime": "2 days",
     "deftime": "1 hour",
     "max_ram": 1510,
@@ -92,7 +101,7 @@
   {
     "name": "cpu-preempt",
     "type": "cpu",
-    "node_count": 138,
+    "node_count": 144,
     "maxtime": "2 days",
     "deftime": "1 hour",
     "max_ram": 1510,
@@ -155,7 +164,7 @@
   {
     "name": "gpupod-l40s",
     "type": "gpu",
-    "node_count": 10,
+    "node_count": 13,
     "maxtime": "14 days",
     "deftime": "1 hour",
     "max_ram": 500,
@@ -164,11 +173,11 @@
   {
     "name": "ials-gpu",
     "type": "gpu",
-    "node_count": 31,
+    "node_count": 28,
    "maxtime": "14 days",
    "deftime": "1 hour",
-    "max_ram": 500,
-    "max_cpus": 32
+    "max_ram": 180,
+    "max_cpus": 24
   },
   {
     "name": "jdelhommelle",
@@ -272,7 +281,7 @@
   {
     "name": "uri-cpu",
     "type": "cpu",
-    "node_count": 43,
+    "node_count": 49,
     "maxtime": "30 days",
     "deftime": "1 hour",
     "max_ram": 1000,

src/preprocess/preprocess.py

Lines changed: 0 additions & 11 deletions
@@ -466,17 +466,6 @@ def preprocess_data(
     # Check for infinity values in memory usage columns
     _check_for_infinity_values(data)

-    # Identify and handle duplicate JobIDs
-    duplicate_rows = data[data["JobID"].duplicated(keep=False)]
-    if not duplicate_rows.empty:
-        duplicate_message = (
-            f"{len(duplicate_rows['JobID'].unique().tolist())} duplicate JobIDs detected. "
-            "Keeping only the latest entry for each JobID."
-        )
-        warnings.warn(message=duplicate_message, stacklevel=2, category=UserWarning)
-        data_sorted = data.sort_values(by="SubmitTime", ascending=False)  # Sort by SubmitTime to keep the latest entry
-        data = data_sorted.drop_duplicates(subset=["JobID"], keep="first")  # Keep the latest entry for each JobID
-
     # Save preprocessing error logs to a file.
     _write_preprocessing_error_logs(processing_error_logs)
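
The removed block silently kept only the newest row per JobID, which the commit message calls incorrect; preprocessing now leaves all rows intact. If duplicates should still be surfaced without mutating the data, a non-destructive check could look like this sketch (illustrative, not part of this commit):

import warnings

import pandas as pd

def warn_on_duplicate_job_ids(data: pd.DataFrame) -> None:
    # Report duplicate JobIDs without dropping or reordering any rows.
    dup_ids = data.loc[data["JobID"].duplicated(keep=False), "JobID"].unique()
    if len(dup_ids) > 0:
        warnings.warn(
            f"{len(dup_ids)} duplicate JobIDs detected; keeping all rows.",
            category=UserWarning,
            stacklevel=2,
        )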

src/utilities/report_generation.py

Lines changed: 2 additions & 1 deletion
@@ -448,7 +448,8 @@ def generate_recommendations(user_jobs: pd.DataFrame, user_data: pd.Series = Non
     )

     # Requested VRAM efficiency (if user_data is provided)
-    eff = user_data["requested_vram_efficiency"]
+    eff = user_data['expected_value_requested_vram_efficiency']
+    #eff = user_data["requested_vram_efficiency"]
     if user_data is not None and (eff.mean() if hasattr(eff, 'mean') else float(eff)) < 0.5:
         recommendations.append(
             "📊 **Resource Planning**: You consistently request more VRAM than you use. "
