Skip to content

Commit 4de683c

Browse files
authored
Remove duplicate job ids (#21)
* update notebooks
* fix gitattributes syntax
* update scripts and run notebooks
* remove filters
* fix: detect duplicate Job IDs and keep latest one
* fix ruff error
* update function args
* Re-add code to handle duplicate IDs and keep latest entry
* fix ruff error
* revert gitattributes
1 parent 9aab59f commit 4de683c

File tree

1 file changed

+11
-0
lines changed

1 file changed

+11
-0
lines changed

src/preprocess/preprocess.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -803,6 +803,17 @@ def preprocess_data(
803803
message = f"Some entries in {col_name} having infinity values. This may be caused by an overflow."
804804
warnings.warn(message=message, stacklevel=2, category=UserWarning)
805805

806+
# Identify and handle duplicate JobIDs
807+
duplicate_rows = data[data["JobID"].duplicated(keep=False)]
808+
if not duplicate_rows.empty:
809+
duplicate_message = (
810+
f"{len(duplicate_rows['JobID'].unique().tolist())} duplicate JobIDs detected. "
811+
"Keeping only the latest entry for each JobID."
812+
)
813+
warnings.warn(message=duplicate_message, stacklevel=2, category=UserWarning)
814+
data_sorted = data.sort_values(by="SubmitTime", ascending=False) # Sort by SubmitTime to keep the latest entry
815+
data = data_sorted.drop_duplicates(subset=["JobID"], keep="first") # Keep the latest entry for each JobID
816+
806817
# Save preprocessing error logs to a file.
807818
_write_preprocessing_error_logs(processing_error_logs)
808819

0 commit comments

Comments (0)