@@ -67,7 +67,7 @@ def get_contacts_mapping(cls):
67
67
]
68
68
69
69
70
- def dedup_consecutive (table , id , order_by , dedup_on ):
70
+ def dedup_consecutive (table , unique_id , id , order_by , dedup_on ):
71
71
# Many of our raw data tables have a similar structure: a contact id column,
72
72
# an insert time column, and several other pieces of raw data. If someone
73
73
# inserts a "new" record for a certain id, but none of the raw data is
@@ -81,15 +81,16 @@ def dedup_consecutive(table, id, order_by, dedup_on):
81
81
# not work well on null values.
82
82
83
83
sq = select (
84
+ unique_id ,
84
85
id ,
85
86
order_by ,
86
87
dedup_on .bool_op ("IS NOT DISTINCT FROM" )(
87
88
func .lag (dedup_on ).over (partition_by = id , order_by = order_by )
88
89
).label ("is_dupe" ),
89
90
).subquery ()
90
91
91
- to_delete = select (sq .c [0 ], sq . c [ 1 ] ).where (sq .c [2 ]).subquery ()
92
- return delete (table ).where (( id == to_delete .c [0 ]) & ( order_by == to_delete . c [ 1 ]) )
92
+ to_delete = select (sq .c [0 ]).where (sq .c [3 ]).subquery ()
93
+ return delete (table ).where (unique_id == to_delete .c [0 ])
93
94
94
95
95
96
def normalize_phone_number (number ):
@@ -181,6 +182,7 @@ def insert_from_file_df(cls, df, conn):
181
182
conn .execute (
182
183
dedup_consecutive (
183
184
cls .__table__ ,
185
+ unique_id = cls ._id ,
184
186
id = cls .contact_id ,
185
187
order_by = cls .created_date ,
186
188
dedup_on = tuple_ (* dedup_on ),
@@ -249,6 +251,7 @@ def insert_from_df(cls, df, conn):
249
251
conn .execute (
250
252
dedup_consecutive (
251
253
cls .__table__ ,
254
+ unique_id = cls ._id ,
252
255
id = cls .internal_id ,
253
256
order_by = cls .created_date ,
254
257
dedup_on = tuple_ (* dedup_on ),
@@ -315,6 +318,7 @@ def insert_from_file(cls, xl_file, conn):
315
318
conn .execute (
316
319
dedup_consecutive (
317
320
cls .__table__ ,
321
+ unique_id = cls ._id ,
318
322
id = cls .number ,
319
323
order_by = cls .created_date ,
320
324
dedup_on = tuple_ (* dedup_on ),
0 commit comments