Commit 57b9fe0

Apparently pl.scan_csv can't handle gzipped files. Fixed HCMI stream.
1 parent f2535f8 commit 57b9fe0
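For context on the commit message: the lazy pl.scan_csv reader works on plain-text CSV files and, per this commit, fails on gzip-compressed input, while the eager pl.read_csv can typically decompress gzip on its own. A minimal sketch of the distinction, using placeholder file names that are not paths from this repository:

    import polars as pl

    # Eager read: read_csv decompresses a gzip-compressed CSV in memory.
    df = pl.read_csv("samples.csv.gz")

    # Lazy scan: scan_csv expects an uncompressed file, which is why the
    # HCMI step below scans the plain CSV and gzips only the final output.
    deduped = pl.scan_csv("samples.csv").unique(maintain_order=True).collect()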

1 file changed: +20 −14 lines changed


coderbuild/hcmi/02-getHCMIData.py

Lines changed: 20 additions & 14 deletions
@@ -802,27 +802,33 @@ def deduplicate_final_csv(csv_path, subset=None):
     -------
     None (file is overwritten in-place)
     """
-    # Clear everything from memory except what is needed here
-    globals_to_clear = [k for k in globals() if not k.startswith("__")]
-    for k in globals_to_clear:
-        if k not in ("deduplicate_final_csv", "tempfile", "pl", "gc", "os"):
-            del globals()[k]
-    gc.collect()
-
-    is_gz = csv_path.endswith(".gz")
-    fd, tmp = tempfile.mkstemp(suffix=".csv.gz" if is_gz else ".csv")
-    os.close(fd)
+    # 1. build a temp file with .csv.gz suffix
+    fd, tmp = tempfile.mkstemp(suffix=".csv.gz")
+    os.close(fd)
 
+    # 2. lazy-scan original CSV, drop dupes, write compressed
     (
         pl.scan_csv(csv_path)
         .unique(subset=subset, maintain_order=True)
-        .sink_csv(tmp, has_header=True,
-                  compression="gzip" if is_gz else None,
+        .sink_csv(tmp,
+                  has_header=True,
+                  compression="gzip",
                   separator=",")
     )
 
-    os.replace(tmp, csv_path)
-    print(f"De-duplicated rows written back to {csv_path}")
+    # 3. define the final output name
+    out_path = csv_path + ".gz"
+
+    # 4. atomically move temp → final
+    os.replace(tmp, out_path)
+
+    # 5. remove the old uncompressed CSV
+    os.remove(csv_path)
+
+    print(f"De-duplicated and gzipped file written to: {out_path}")
+
+    # 6. free memory
+    gc.collect()
 
 
 
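A minimal usage sketch of the updated function, assuming it is invoked once on the uncompressed CSV produced earlier in the build; the path below is a placeholder, and the real call site is elsewhere in 02-getHCMIData.py:

    # Placeholder path for illustration only.
    csv_path = "hcmi_omics.csv"

    # Drops fully duplicated rows (subset=None), writes the de-duplicated
    # data to csv_path + ".gz", then removes the original uncompressed file.
    deduplicate_final_csv(csv_path, subset=None)

    # Result: hcmi_omics.csv is gone; hcmi_omics.csv.gz holds the output.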