Skip to content

Commit 74c14a6

Browse files
committed
Chunking added to the HCMI and pancreatic datasets. Added an extra check for a rare bug where an empty MAF file caused a crash. Moved HCMI to the end of the build.
1 parent 759cef3 commit 74c14a6

File tree

4 files changed

+68
-28
lines changed

4 files changed

+68
-28
lines changed

coderbuild/bladder/02_createBladderDrugsFile.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def create_bladder_drugs_file(synObject, prevDrugFilepath, outputPath):
2525
print("No bladder drug names extracted; exiting.")
2626
return
2727

28-
print(f"bladder raw drug names: {raw_names}")
28+
# print(f"bladder raw drug names: {raw_names}")
2929

3030
#New pubchem call
3131
update_dataframe_and_write_tsv(

coderbuild/build_all.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def main():
4040
parser.add_argument('--figshare', action='store_true', help="Upload all local data to Figshare. FIGSHARE_TOKEN must be set in local environment.")
4141
parser.add_argument('--all',dest='all',default=False,action='store_true', help="Run all data build commands. This includes docker, samples, omics, drugs, exp arguments. This does not run the validate or figshare commands")
4242
parser.add_argument('--high_mem',dest='high_mem',default=False,action='store_true',help = "If you have 32 or more CPUs, this option is recommended. It will run many code portions in parallel. If you don't have enough memory, this will cause a run failure.")
43-
parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,pancreatic,bladder,sarcoma,liver,novartis,colorectal,mpnst',help='Datasets to process. Defaults to all available.')
43+
parser.add_argument('--dataset',dest='datasets',default='broad_sanger,beataml,pancreatic,bladder,sarcoma,liver,novartis,colorectal,mpnst,hcmi',help='Datasets to process. Defaults to all available.')
4444
parser.add_argument('--version', type=str, required=False, help='Version number for the Figshare upload title (e.g., "0.1.29"). This is required for Figshare upload. This must be a higher version than previously published versions.')
4545
parser.add_argument('--github-username', type=str, required=False, help='GitHub username for the repository.')
4646
parser.add_argument('--github-email', type=str, required=False, help='GitHub email for the repository.')

coderbuild/hcmi/02-getHCMIData.py

Lines changed: 38 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,14 @@ def download_tool(url):
6868

6969
return gdc_client_path
7070

71+
def _df_chunks(df, size):
72+
"""
73+
Helper function to yield chunks of a DataFrame.
74+
This is so the GDC tool can process smaller batches of data.
75+
"""
76+
for i in range(0, len(df), size):
77+
yield df.iloc[i:i+size]
78+
7179

7280
def _append_to_csv(df: pl.DataFrame, out_path: str, header_written: bool) -> bool:
7381
"""
@@ -210,8 +218,7 @@ def use_gdc_tool(manifest_data, data_type, download_data):
210218

211219
# Read the list of expected file IDs, their MD5 checksums, and filenames
212220
expected_files = newfm[['id', 'md5', 'filename']].values.tolist()
213-
total_files = len(expected_files)
214-
print(f"Total files to download: {total_files}")
221+
print(f"Total files to download: {len(expected_files)}")
215222

216223
# Initialize retry variables
217224
retries = 0
@@ -243,9 +250,19 @@ def _verify_md5(file_id, expected_md5, expected_filename):
243250
return False
244251

245252
# Initial download attempt
246-
print("Starting secondary download...")
247-
subprocess.run(['./gdc-client', 'download', '-d', manifest_loc, '-m', 'new_manifest.txt'])
248-
print("Secondary download complete.")
253+
print("Starting to download batches of files through the GDC Client...")
254+
255+
batch_size = 150
256+
print(f"Starting batched download ({batch_size} per batch)...")
257+
for bi, batch_df in enumerate(_df_chunks(newfm, batch_size), start=1):
258+
tmp_manifest = f"manifest_batch_{bi:04d}.txt"
259+
batch_df.to_csv(tmp_manifest, sep="\t", index=False)
260+
print(f" Batch {bi}: downloading {len(batch_df)} files …")
261+
subprocess.run(['./gdc-client', 'download', '-d', manifest_loc, '-m', tmp_manifest])
262+
os.remove(tmp_manifest)
263+
print("Batched download complete.")
264+
265+
print("All download batches complete.")
249266

250267
# Check for missing or corrupt files and retry if necessary
251268
while retries <= max_retries:
@@ -287,15 +304,17 @@ def _verify_md5(file_id, expected_md5, expected_filename):
287304
shutil.rmtree(file_dir, ignore_errors=True)
288305
print(f" Removed corrupt file: {file_id}")
289306

290-
# Create a new manifest with missing or corrupt IDs
291-
retry_manifest = newfm[newfm['id'].isin(missing_or_corrupt_ids)]
292-
retry_manifest.to_csv('retry_manifest.txt', sep='\t', index=False)
293-
294-
# Retry download
295-
print(f"Starting retry {retries} download...")
296-
subprocess.run(['./gdc-client', 'download', '-d', manifest_loc, '-m', 'retry_manifest.txt'])
307+
retry_df = newfm[newfm['id'].isin(missing_or_corrupt_ids)]
308+
print(f"Starting retry {retries} in batches of {batch_size} …")
309+
for bi, batch_df in enumerate(_df_chunks(retry_df, batch_size), start=1):
310+
tmp_manifest = f"retry_{retries:02d}_batch_{bi:04d}.txt"
311+
batch_df.to_csv(tmp_manifest, sep="\t", index=False)
312+
print(f" Retry {retries} · Batch {bi}: {len(batch_df)} files")
313+
subprocess.run(['./gdc-client', 'download', '-d', manifest_loc, '-m', tmp_manifest])
314+
os.remove(tmp_manifest)
297315
print(f"Retry {retries} complete.")
298316

317+
299318
if missing_or_corrupt_ids:
300319
print(f"\nFailed to download or verify {len(missing_or_corrupt_ids)} files after {max_retries} retries.")
301320
print("Proceeding with available files.")
@@ -395,13 +414,16 @@ def stream_clean_files(data_type: str):
395414
fpath = os.path.join(manifest, folder, fname)
396415

397416
# ---- read single file ------------------------------
398-
if fpath.endswith(".gz"): # mutation data
399-
with gzip.open(fpath, "rt") as fh:
400-
df = pl.read_csv(fh, separator="\t", skip_rows=7)
417+
if fpath.endswith(".gz"): # mutation data is always gzipped
418+
try:
419+
df = pl.read_csv(fpath, separator="\t", skip_rows=7)
420+
except Exception as e:
421+
print(f"[warn] skipping MAF due to read error: {fpath} ({type(e).__name__}: {e})")
422+
continue
401423
else: # copy-number / tx
402424
skip = 1 if data_type == "transcriptomics" else 0
403425
df = pl.read_csv(fpath, separator="\t", skip_rows=skip)
404-
426+
405427
df = df.with_columns(pl.lit(folder).alias("file_id"))
406428

407429
if data_type == "transcriptomics":

coderbuild/pancreatic/02-getPancreaticData.py

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,14 @@ def ensure_gdc_client():
9898
else:
9999
print("gdc-client already installed")
100100

101+
def _df_chunks(df, size):
102+
"""
103+
Helper function to yield chunks of a DataFrame.
104+
This is so the GDC tool can process smaller batches of data.
105+
"""
106+
for i in range(0, len(df), size):
107+
yield df.iloc[i:i+size]
108+
101109

102110

103111
def extract_uuids_from_manifest(manifest_data):
@@ -183,7 +191,7 @@ def use_gdc_tool(manifest_data, data_type, download_data):
183191

184192
# Initialize retry variables
185193
retries = 0
186-
max_retries = 1
194+
max_retries = 5
187195

188196
# Function to get downloaded file IDs
189197
def get_downloaded_ids(manifest_loc):
@@ -211,9 +219,17 @@ def _verify_md5(file_id, expected_md5, expected_filename):
211219
return False
212220

213221
# Initial download attempt
214-
print("Starting secondary download...")
215-
subprocess.run(['./gdc-client', 'download', '-d', manifest_loc, '-m', 'new_manifest.txt'])
216-
print("Secondary download complete.")
222+
print("Starting download...")
223+
# subprocess.run(['./gdc-client', 'download', '-d', manifest_loc, '-m', 'new_manifest.txt'])
224+
batch_size = 150
225+
print(f"Starting batched download ({batch_size} per batch)...")
226+
for bi, batch_df in enumerate(_df_chunks(newfm, batch_size), start=1):
227+
tmp_manifest = f"manifest_batch_{bi:04d}.txt"
228+
batch_df.to_csv(tmp_manifest, sep="\t", index=False)
229+
print(f" Batch {bi}: downloading {len(batch_df)} files …")
230+
subprocess.run(['./gdc-client', 'download', '-d', manifest_loc, '-m', tmp_manifest])
231+
os.remove(tmp_manifest)
232+
print("Batched download complete.")
217233

218234
# Check for missing or corrupt files and retry if necessary
219235
while retries <= max_retries:
@@ -256,12 +272,14 @@ def _verify_md5(file_id, expected_md5, expected_filename):
256272
print(f" Removed corrupt file: {file_id}")
257273

258274
# Create a new manifest with missing or corrupt IDs
259-
retry_manifest = newfm[newfm['id'].isin(missing_or_corrupt_ids)]
260-
retry_manifest.to_csv('retry_manifest.txt', sep='\t', index=False)
261-
262-
# Retry download
263-
print(f"Starting retry {retries} download...")
264-
subprocess.run(['./gdc-client', 'download', '-d', manifest_loc, '-m', 'retry_manifest.txt'])
275+
retry_df = newfm[newfm['id'].isin(missing_or_corrupt_ids)]
276+
print(f"Starting retry {retries} in batches of {batch_size} …")
277+
for bi, batch_df in enumerate(_df_chunks(retry_df, batch_size), start=1):
278+
tmp_manifest = f"retry_{retries:02d}_batch_{bi:04d}.txt"
279+
batch_df.to_csv(tmp_manifest, sep="\t", index=False)
280+
print(f" Retry {retries} · Batch {bi}: {len(batch_df)} files")
281+
subprocess.run(['./gdc-client', 'download', '-d', manifest_loc, '-m', tmp_manifest], check=True)
282+
os.remove(tmp_manifest)
265283
print(f"Retry {retries} complete.")
266284

267285
if missing_or_corrupt_ids:

0 commit comments

Comments (0)