Skip to content

Commit 4de683c

Browse files
authored
Remove duplicate job ids (#21)
* update notebooks
* fix gitattributes syntax
* update scripts and run notebooks
* remove filters
* fix: detect duplicate Job IDs and keep latest one
* fix ruff error
* update function args
* Re-add code to handle duplicate IDs and keep latest entry
* fix ruff error
* revert gitattributes
1 parent 9aab59f commit 4de683c

File tree

1 file changed

+11
-0
lines changed

1 file changed

+11
-0
lines changed

src/preprocess/preprocess.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -803,6 +803,17 @@ def preprocess_data(
803803
message = f"Some entries in {col_name} having infinity values. This may be caused by an overflow."
804804
warnings.warn(message=message, stacklevel=2, category=UserWarning)
805805

806+
# Identify and handle duplicate JobIDs
807+
duplicate_rows = data[data["JobID"].duplicated(keep=False)]
808+
if not duplicate_rows.empty:
809+
duplicate_message = (
810+
f"{len(duplicate_rows['JobID'].unique().tolist())} duplicate JobIDs detected. "
811+
"Keeping only the latest entry for each JobID."
812+
)
813+
warnings.warn(message=duplicate_message, stacklevel=2, category=UserWarning)
814+
data_sorted = data.sort_values(by="SubmitTime", ascending=False) # Sort by SubmitTime to keep the latest entry
815+
data = data_sorted.drop_duplicates(subset=["JobID"], keep="first") # Keep the latest entry for each JobID
816+
806817
# Save preprocessing error logs to a file.
807818
_write_preprocessing_error_logs(processing_error_logs)
808819

0 commit comments

Comments (0)