|
19 | 19 | import hashlib |
20 | 20 | from pathlib import Path |
21 | 21 | import tempfile |
| 22 | +import shlex |
| 23 | + |
22 | 24 |
|
23 | 25 | def download_tool(url): |
24 | 26 | """ |
@@ -784,41 +786,68 @@ def write_dataframe_to_csv(dataframe, outname): |
784 | 786 | return |
785 | 787 |
|
786 | 788 |
|
787 | | - |
788 | | -def deduplicate_final_csv(csv_path, subset=None): |
def deduplicate_final_csv(csv_path: str, subset=None):
    """
    Very low-RAM de-duplication of a finished CSV via external ``sort -u``.

    The file is streamed through the system ``sort`` (which spills to disk
    when its buffer fills), so memory use stays small regardless of file
    size.

    Parameters
    ----------
    csv_path : str
        Path to the written CSV; gzipped input is detected by a ``.gz``
        extension.
    subset : list[str] | None
        Ignored — this implementation only performs full-line
        de-duplication.  A warning is printed when a subset is requested.

    Side effects
    ------------
    Writes a gzipped, de-duplicated copy.  If *csv_path* already ends in
    ``.gz`` it is replaced in place; otherwise ``<csv_path>.gz`` is created
    and the uncompressed original is removed.  The header line is kept as
    the first output line; body rows come out in sorted order.

    Raises
    ------
    RuntimeError
        If any stage of the shell pipeline exits non-zero.
    """
    if subset is not None:
        print("Warning: 'subset' is ignored by sort/uniq dedup (full-line dedupe).")

    is_gz = csv_path.endswith(".gz")
    out_path = csv_path if is_gz else f"{csv_path}.gz"

    # Create the temp file in the destination directory so the final
    # os.replace() is an atomic same-filesystem rename (mkstemp's default
    # temp dir may live on a different filesystem).
    out_dir = os.path.dirname(os.path.abspath(out_path)) or "."
    tmp_fd, tmp_path = tempfile.mkstemp(suffix=".csv.gz", dir=out_dir)
    os.close(tmp_fd)

    # Decompressor / passthrough for the source file.
    src_cmd = f"gzip -cd {shlex.quote(csv_path)}" if is_gz else f"cat {shlex.quote(csv_path)}"

    # Detect GNU sort so memory/tmpdir tuning options can be passed safely.
    def _has_gnu_sort():
        try:
            out = subprocess.run(["sort", "--version"], capture_output=True, text=True)
            return out.returncode == 0 and "GNU coreutils" in (out.stdout + out.stderr)
        except Exception:
            return False

    sort_opts = []
    if _has_gnu_sort():
        # Cap in-core memory at 1G and spill temporary runs into the CWD.
        sort_opts += ["-S", "1G", "--temporary-directory", "."]
    # POSIX 'sort -u' == 'sort | uniq' for full-line dedupe, one fewer process.
    sort_cmd = " ".join(["sort", "-u"] + [shlex.quote(o) for o in sort_opts])

    try:
        # Grab the header line first.  No 'pipefail' here on purpose:
        # 'head' closing the pipe early makes the decompressor die of
        # SIGPIPE, which pipefail would misreport as a failure.
        header = subprocess.check_output(
            f"{src_cmd} | head -n 1",
            shell=True,
            executable="/bin/bash",
            text=True,
        ).rstrip("\n")

        # Emit header, then the sorted+deduped body, gzip the whole stream.
        # - LC_ALL=C is *exported* so that sort itself uses byte-wise
        #   collation (a 'LC_ALL=C gzip ...' prefix would only affect the
        #   decompressor, not sort).
        # - pipefail makes a failure in any stage (e.g. a truncated .gz)
        #   surface as a non-zero exit instead of silently producing a
        #   corrupt output file.
        cmd = (
            "set -o pipefail; export LC_ALL=C; "
            "{ "
            f"printf '%s\\n' {shlex.quote(header)}; "
            f"{src_cmd} | tail -n +2 | {sort_cmd}; "
            f"}} | gzip -c > {shlex.quote(tmp_path)}"
        )
        subprocess.run(cmd, shell=True, check=True, executable="/bin/bash")

        os.replace(tmp_path, out_path)
        if not is_gz and os.path.exists(csv_path):
            os.remove(csv_path)
        print(f"De-duplicated and gzipped file written to: {out_path}")
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"Deduplication failed with exit code {e.returncode}") from e
    finally:
        # Remove the temp file on *any* failure path (success renames it
        # away first, so this is a no-op then).
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        gc.collect()
822 | 851 |
|
823 | 852 |
|
824 | 853 |
|
@@ -929,7 +958,7 @@ def main(): |
929 | 958 | print(f"Done. Dataset written to {args.outname}") |
930 | 959 |
|
931 | 960 | print("Running global de-duplication pass …") |
932 | | - deduplicate_final_csv(args.outname) # subset=None ⇒ all columns |
| 961 | + deduplicate_final_csv(args.outname) |
933 | 962 | print("All done.") |
934 | 963 |
|
935 | 964 |
|
|
0 commit comments