@@ -802,27 +802,33 @@ def deduplicate_final_csv(csv_path, subset=None):
802802 -------
803803 None (file is overwritten in-place)
804804 """
805-    # Clear everything from memory except what is needed here
806-    globals_to_clear = [k for k in globals() if not k.startswith("__")]
807-    for k in globals_to_clear:
808-        if k not in ("deduplicate_final_csv", "tempfile", "pl", "gc", "os"):
809-            del globals()[k]
810-    gc.collect()
811-
812-    is_gz = csv_path.endswith(".gz")
813-    fd, tmp = tempfile.mkstemp(suffix=".csv.gz" if is_gz else ".csv")
814-    os.close(fd)
805+    # 1. build a temp file with .csv.gz suffix
806+    fd, tmp = tempfile.mkstemp(suffix=".csv.gz")
807+    os.close(fd)
815808
809+    # 2. lazy-scan original CSV, drop dupes, write compressed
816810    (
817811        pl.scan_csv(csv_path)
818812        .unique(subset=subset, maintain_order=True)
819-        .sink_csv(tmp, has_header=True,
820-                  compression="gzip" if is_gz else None,
813+        .sink_csv(tmp,
814+                  has_header=True,
815+                  compression="gzip",
821816                  separator=",")
822817    )
823818
824-    os.replace(tmp, csv_path)
825-    print(f"De-duplicated rows written back to {csv_path}")
819+    # 3. define the final output name
820+    out_path = csv_path + ".gz"
821+
822+    # 4. atomically move temp → final
823+    os.replace(tmp, out_path)
824+
825+    # 5. remove the old uncompressed CSV
826+    os.remove(csv_path)
827+
828+    print(f"De-duplicated and gzipped file written to: {out_path}")
829+
830+    # 6. free memory
831+    gc.collect()
826832
827833
828834
0 commit comments