@@ -44,6 +44,24 @@ def add_snapshot_date_columns(data_frame):
    return data_frame


+def deduplicate_by_id_and_last_updated(df):
+    """
+    Deduplicate rows with the same (id, last_updated) combination by keeping the one with the latest import_date.
+    Added to resolve duplicated rows with the same id and last_updated timestamp spotted in some incremental tables (e.g. documents).
+    """
+    window_spec = Window.partitionBy("id", "last_updated").orderBy(
+        F.col("import_date").desc()
+    )
+
+    deduplicated_df = (
+        df.withColumn("row_num", F.row_number().over(window_spec))
+        .filter(F.col("row_num") == 1)
+        .drop("row_num")
+    )
+
+    return deduplicated_df
+
+
def prepare_increments(increment_df):
    # In case there are several days worth of increments: only keep the latest version of a record
    id_partition = Window.partitionBy("id")
@@ -62,6 +80,15 @@ def prepare_increments(increment_df):
        .where(F.col("last_updated_nonull") == F.col("latest"))
        .drop("latest", "last_updated_nonull")
    )
+
+    # Check for residual duplicates - print and further de-duplicate
+    duplicate_ids = increment_df.groupBy("id").count().filter("count > 1")
+    if duplicate_ids.count() > 0:
+        duplicate_ids.join(increment_df, "id").show(truncate=False)
+        increment_df = deduplicate_by_id_and_last_updated(increment_df)
+    else:
+        logger.info("No duplicated rows after initial deduplication.")
+
    return increment_df


@@ -356,7 +383,7 @@ def purge_today_partition(
        job.commit()
    finally:
        if len(dq_errors) > 0:
-            logger.error(f"DQ Errors: {dq_errors}")
-            raise Exception(f"Data quality check failed: {'; '.join(dq_errors)}")
+            logger.error(f"Errors: {dq_errors}")
+            raise Exception(f"Job Failed: {'; '.join(dq_errors)}")
        spark.sparkContext._gateway.close()
        spark.stop()
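
For reference, here is a minimal local sketch (not part of the patch) of how the new `deduplicate_by_id_and_last_updated` helper behaves on a toy DataFrame. The window logic mirrors the helper added above; the sample rows, the `local[1]` master, and the `__main__` wrapper are illustrative assumptions only.

```python
# Standalone sketch, assuming pyspark is installed and a local SparkSession can be created.
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F


def deduplicate_by_id_and_last_updated(df):
    # Same logic as the patched helper: for each (id, last_updated) pair,
    # keep only the row with the most recent import_date.
    window_spec = Window.partitionBy("id", "last_updated").orderBy(
        F.col("import_date").desc()
    )
    return (
        df.withColumn("row_num", F.row_number().over(window_spec))
        .filter(F.col("row_num") == 1)
        .drop("row_num")
    )


if __name__ == "__main__":
    spark = SparkSession.builder.master("local[1]").getOrCreate()
    rows = [
        ("doc-1", "2024-01-01T00:00:00", "2024-01-02"),  # older import
        ("doc-1", "2024-01-01T00:00:00", "2024-01-03"),  # duplicate id/last_updated, newer import -> kept
        ("doc-2", "2024-01-05T00:00:00", "2024-01-06"),
    ]
    df = spark.createDataFrame(rows, ["id", "last_updated", "import_date"])
    deduplicate_by_id_and_last_updated(df).show(truncate=False)  # two rows remain
    spark.stop()
```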