
Commit 8b11e87

Author: Benjamin Pachev (committed)
Added missing requirements. Removed incorrect duplicate handling in preprocessing. Removed breaking pydantic changes in EfficiencyAnalysis. Fixed a KeyError in report generation and a check for Quarto output that looked in the wrong place.
1 parent: b482425

6 files changed: 25 additions, 50 deletions

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@ seaborn
 jupyterlab
 jq
 pydantic
+papermill
 requests
 requests-cache
 ipykernel
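
papermill joins the notebook tooling already pinned here (jupyterlab, ipykernel); presumably the report scripts execute parameterized notebooks with it. A minimal sketch of that kind of call, with hypothetical paths and parameters:

import papermill as pm

# Execute a parameterized notebook; the paths and parameter names below are illustrative only.
pm.execute_notebook(
    "templates/user_report.ipynb",        # hypothetical input template
    "reports/user_report_executed.ipynb", # hypothetical executed copy
    parameters={"username": "example_user"},
)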

scripts/generate_user_reports.py

Lines changed: 3 additions & 3 deletions
@@ -218,7 +218,7 @@ def generate_user_report(
         text=True,
         cwd=reports_dir,
     )
-
+    output_path = os.path.join(reports_dir, output_file)
     # Check if command was successful
     if result.returncode != 0:
         print(f" ❌ Quarto rendering failed (code {result.returncode})")
@@ -229,7 +229,7 @@ def generate_user_report(
         return None

     # Check if output file was created
-    if not os.path.exists(output_file_abs):
+    if not os.path.exists(output_path):
         print(" ❌ Output file not created")
         print(f" Expected at: {output_file_abs}")
         return None
@@ -552,4 +552,4 @@ def generate_all_reports(

     print("\n🎉 ALL TASKS COMPLETED!")
     print(f" Output directory: {args.output_dir}")
-    print(" Check the individual function outputs above for details.")
+    print(" Check the individual function outputs above for details.")

src/analysis/efficiency_analysis.py

Lines changed: 2 additions & 27 deletions
@@ -22,7 +22,6 @@
     generate_recommendations,
 )
 from src.config.enum_constants import FilterTypeEnum, MetricsDataFrameNameBase, MetricsDataFrameNameEnum
-from pydantic import validate_call, AfterValidator, SkipValidation
 from src.database import DatabaseConnection
 from src.preprocess.preprocess import preprocess_data

@@ -67,28 +66,6 @@ def load_preprocessed_jobs_dataframe_from_duckdb(
 # Generic type for metrics enums constrained to our abstract base Enum class
 MetricsDFNameEnumT = TypeVar("MetricsDFNameEnumT", bound=MetricsDataFrameNameBase)

-
-def _ensure_concrete_metrics_enum(
-    cls: type[MetricsDFNameEnumT],
-) -> type[MetricsDFNameEnumT]:
-    """Validate that the provided class is a concrete subclass of MetricsDataFrameNameBase.
-
-    Used by Pydantic to validate the enum argument to the constructor.
-
-    Raises:
-        TypeError: If the type is not a subclass of the base, or is the abstract base itself.
-
-    Returns:
-        type[MetricsDFNameEnumT]: The validated enum class.
-    """
-    # Ensure it's a subclass of our abstract base (defensive; helps type checkers and runtime safety)
-    if not isinstance(cls, type) or not issubclass(cls, MetricsDataFrameNameBase):
-        raise TypeError("metrics_df_name_enum must be a subclass of MetricsDataFrameNameBase")
-    if cls is MetricsDataFrameNameBase:
-        raise TypeError("metrics_df_name_enum must be a concrete Enum subclass, not the abstract base")
-    return cls
-
-
 class EfficiencyAnalysis(Generic[MetricsDFNameEnumT]):
     """
     Class to encapsulate the efficiency analysis of jobs based on various metrics.
@@ -98,12 +75,10 @@ class EfficiencyAnalysis(Generic[MetricsDFNameEnumT]):
     The metrics are generated in separate DataFrames for each category in MetricsDataFrameNameEnum.
     """

-    # Apply Pydantic runtime validation for constructor arguments
-    @validate_call(config={"arbitrary_types_allowed": True})
     def __init__(
         self,
-        jobs_df: Annotated[pd.DataFrame, SkipValidation()],
-        metrics_df_name_enum: Annotated[type[MetricsDFNameEnumT], AfterValidator(_ensure_concrete_metrics_enum)],
+        jobs_df: pd.DataFrame,
+        metrics_df_name_enum: type[MetricsDFNameEnumT] = MetricsDataFrameNameEnum
     ) -> None:
         """
         Initialize the EfficiencyAnalysis class.
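
Dropping @validate_call leaves the constructor with plain type hints, and metrics_df_name_enum now defaults to MetricsDataFrameNameEnum. If runtime validation is still wanted without pydantic, a plain issubclass guard covers the same cases; the sketch below uses stand-in enum classes and is illustrative only, not the repository's code:

from enum import Enum

class MetricsDataFrameNameBase(Enum):                      # stand-in for the project's abstract base
    pass

class MetricsDataFrameNameEnum(MetricsDataFrameNameBase):  # stand-in concrete enum
    CPU_METRICS = "cpu_metrics"

def ensure_concrete_metrics_enum(cls: type) -> type:
    # Plain-Python version of the checks the removed pydantic validator performed.
    if not (isinstance(cls, type) and issubclass(cls, MetricsDataFrameNameBase)):
        raise TypeError("metrics_df_name_enum must be a subclass of MetricsDataFrameNameBase")
    if cls is MetricsDataFrameNameBase:
        raise TypeError("metrics_df_name_enum must be a concrete Enum subclass, not the abstract base")
    return cls

ensure_concrete_metrics_enum(MetricsDataFrameNameEnum)     # passes; the abstract base would raise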

src/config/snapshots/partition_info.json

Lines changed: 17 additions & 8 deletions
@@ -1,8 +1,17 @@
 [
+  {
+    "name": "arm",
+    "type": "cpu",
+    "node_count": 9,
+    "maxtime": "14 days",
+    "deftime": "1 hour",
+    "max_ram": 470,
+    "max_cpus": 144
+  },
   {
     "name": "arm-gpu",
     "type": "gpu",
-    "node_count": 3,
+    "node_count": 4,
     "maxtime": "14 days",
     "deftime": "1 hour",
     "max_ram": 560,
@@ -83,7 +92,7 @@
   {
     "name": "cpu",
     "type": "cpu",
-    "node_count": 147,
+    "node_count": 153,
     "maxtime": "2 days",
     "deftime": "1 hour",
     "max_ram": 1510,
@@ -92,7 +101,7 @@
   {
     "name": "cpu-preempt",
     "type": "cpu",
-    "node_count": 138,
+    "node_count": 144,
     "maxtime": "2 days",
     "deftime": "1 hour",
     "max_ram": 1510,
@@ -155,7 +164,7 @@
   {
     "name": "gpupod-l40s",
     "type": "gpu",
-    "node_count": 10,
+    "node_count": 13,
     "maxtime": "14 days",
     "deftime": "1 hour",
     "max_ram": 500,
@@ -164,11 +173,11 @@
   {
     "name": "ials-gpu",
     "type": "gpu",
-    "node_count": 31,
+    "node_count": 28,
    "maxtime": "14 days",
    "deftime": "1 hour",
-    "max_ram": 500,
-    "max_cpus": 32
+    "max_ram": 180,
+    "max_cpus": 24
   },
   {
     "name": "jdelhommelle",
@@ -272,7 +281,7 @@
   {
     "name": "uri-cpu",
     "type": "cpu",
-    "node_count": 43,
+    "node_count": 49,
     "maxtime": "30 days",
     "deftime": "1 hour",
     "max_ram": 1000,

src/preprocess/preprocess.py

Lines changed: 0 additions & 11 deletions
@@ -466,17 +466,6 @@ def preprocess_data(
     # Check for infinity values in memory usage columns
     _check_for_infinity_values(data)

-    # Identify and handle duplicate JobIDs
-    duplicate_rows = data[data["JobID"].duplicated(keep=False)]
-    if not duplicate_rows.empty:
-        duplicate_message = (
-            f"{len(duplicate_rows['JobID'].unique().tolist())} duplicate JobIDs detected. "
-            "Keeping only the latest entry for each JobID."
-        )
-        warnings.warn(message=duplicate_message, stacklevel=2, category=UserWarning)
-        data_sorted = data.sort_values(by="SubmitTime", ascending=False)  # Sort by SubmitTime to keep the latest entry
-        data = data_sorted.drop_duplicates(subset=["JobID"], keep="first")  # Keep the latest entry for each JobID
-
     # Save preprocessing error logs to a file.
     _write_preprocessing_error_logs(processing_error_logs)
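
The removed block silently kept only the newest row per JobID, which the commit message calls incorrect; preprocessing now leaves all rows intact. If duplicates should still be surfaced without mutating the data, a non-destructive check could look like this sketch (illustrative, not part of this commit):

import warnings

import pandas as pd

def warn_on_duplicate_job_ids(data: pd.DataFrame) -> None:
    # Report duplicate JobIDs without dropping or reordering any rows.
    dup_ids = data.loc[data["JobID"].duplicated(keep=False), "JobID"].unique()
    if len(dup_ids) > 0:
        warnings.warn(
            f"{len(dup_ids)} duplicate JobIDs detected; keeping all rows.",
            category=UserWarning,
            stacklevel=2,
        )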

src/utilities/report_generation.py

Lines changed: 2 additions & 1 deletion
@@ -448,7 +448,8 @@ def generate_recommendations(user_jobs: pd.DataFrame, user_data: pd.Series = Non
     )

     # Requested VRAM efficiency (if user_data is provided)
-    eff = user_data["requested_vram_efficiency"]
+    eff = user_data['expected_value_requested_vram_efficiency']
+    #eff = user_data["requested_vram_efficiency"]
     if user_data is not None and (eff.mean() if hasattr(eff, 'mean') else float(eff)) < 0.5:
         recommendations.append(
             "📊 **Resource Planning**: You consistently request more VRAM than you use. "
