Skip to content

Commit 74c14a6

Browse files
committed
Chunking added to the HCMI and pancreatic datasets. Added an extra check for a rare bug where an empty MAF file caused a crash. Moved HCMI to the end of the build.
1 parent 759cef3 commit 74c14a6

File tree

4 files changed

+68
-28
lines changed

4 files changed

+68
-28
lines changed

coderbuild/bladder/02_createBladderDrugsFile.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def create_bladder_drugs_file(synObject, prevDrugFilepath, outputPath):
2525
print("No bladder drug names extracted; exiting.")
2626
return
2727

28-
print(f"bladder raw drug names: {raw_names}")
28+
# print(f"bladder raw drug names: {raw_names}")
2929

3030
#New pubchem call
3131
update_dataframe_and_write_tsv(

coderbuild/build_all.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def main():
4040
parser.add_argument('--figshare', action='store_true', help="Upload all local data to Figshare. FIGSHARE_TOKEN must be set in local environment.")
4141
parser.add_argument('--all',dest='all',default=False,action='store_true', help="Run all data build commands. This includes docker, samples, omics, drugs, exp arguments. This does not run the validate or figshare commands")
4242
parser.add_argument('--high_mem',dest='high_mem',default=False,action='store_true',help = "If you have 32 or more CPUs, this option is recommended. It will run many code portions in parallel. If you don't have enough memory, this will cause a run failure.")
43-
parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,pancreatic,bladder,sarcoma,liver,novartis,colorectal,mpnst',help='Datasets to process. Defaults to all available.')
43+
parser.add_argument('--dataset',dest='datasets',default='broad_sanger,beataml,pancreatic,bladder,sarcoma,liver,novartis,colorectal,mpnst,hcmi',help='Datasets to process. Defaults to all available.')
4444
parser.add_argument('--version', type=str, required=False, help='Version number for the Figshare upload title (e.g., "0.1.29"). This is required for Figshare upload. This must be a higher version than previously published versions.')
4545
parser.add_argument('--github-username', type=str, required=False, help='GitHub username for the repository.')
4646
parser.add_argument('--github-email', type=str, required=False, help='GitHub email for the repository.')

coderbuild/hcmi/02-getHCMIData.py

Lines changed: 38 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,14 @@ def download_tool(url):
6868

6969
return gdc_client_path
7070

71+
def _df_chunks(df, size):
72+
"""
73+
Helper function to yield chunks of a DataFrame.
74+
This is so the GDC tool can process smaller batches of data.
75+
"""
76+
for i in range(0, len(df), size):
77+
yield df.iloc[i:i+size]
78+
7179

7280
def _append_to_csv(df: pl.DataFrame, out_path: str, header_written: bool) -> bool:
7381
"""
@@ -210,8 +218,7 @@ def use_gdc_tool(manifest_data, data_type, download_data):
210218

211219
# Read the list of expected file IDs, their MD5 checksums, and filenames
212220
expected_files = newfm[['id', 'md5', 'filename']].values.tolist()
213-
total_files = len(expected_files)
214-
print(f"Total files to download: {total_files}")
221+
print(f"Total files to download: {len(expected_files)}")
215222

216223
# Initialize retry variables
217224
retries = 0
@@ -243,9 +250,19 @@ def _verify_md5(file_id, expected_md5, expected_filename):
243250
return False
244251

245252
# Initial download attempt
246-
print("Starting secondary download...")
247-
subprocess.run(['./gdc-client', 'download', '-d', manifest_loc, '-m', 'new_manifest.txt'])
248-
print("Secondary download complete.")
253+
print("Starting to download batches of files through the GDC Client...")
254+
255+
batch_size = 150
256+
print(f"Starting batched download ({batch_size} per batch)...")
257+
for bi, batch_df in enumerate(_df_chunks(newfm, batch_size), start=1):
258+
tmp_manifest = f"manifest_batch_{bi:04d}.txt"
259+
batch_df.to_csv(tmp_manifest, sep="\t", index=False)
260+
print(f" Batch {bi}: downloading {len(batch_df)} files …")
261+
subprocess.run(['./gdc-client', 'download', '-d', manifest_loc, '-m', tmp_manifest])
262+
os.remove(tmp_manifest)
263+
print("Batched download complete.")
264+
265+
print("All download batches complete.")
249266

250267
# Check for missing or corrupt files and retry if necessary
251268
while retries <= max_retries:
@@ -287,15 +304,17 @@ def _verify_md5(file_id, expected_md5, expected_filename):
287304
shutil.rmtree(file_dir, ignore_errors=True)
288305
print(f" Removed corrupt file: {file_id}")
289306

290-
# Create a new manifest with missing or corrupt IDs
291-
retry_manifest = newfm[newfm['id'].isin(missing_or_corrupt_ids)]
292-
retry_manifest.to_csv('retry_manifest.txt', sep='\t', index=False)
293-
294-
# Retry download
295-
print(f"Starting retry {retries} download...")
296-
subprocess.run(['./gdc-client', 'download', '-d', manifest_loc, '-m', 'retry_manifest.txt'])
307+
retry_df = newfm[newfm['id'].isin(missing_or_corrupt_ids)]
308+
print(f"Starting retry {retries} in batches of {batch_size} …")
309+
for bi, batch_df in enumerate(_df_chunks(retry_df, batch_size), start=1):
310+
tmp_manifest = f"retry_{retries:02d}_batch_{bi:04d}.txt"
311+
batch_df.to_csv(tmp_manifest, sep="\t", index=False)
312+
print(f" Retry {retries} · Batch {bi}: {len(batch_df)} files")
313+
subprocess.run(['./gdc-client', 'download', '-d', manifest_loc, '-m', tmp_manifest])
314+
os.remove(tmp_manifest)
297315
print(f"Retry {retries} complete.")
298316

317+
299318
if missing_or_corrupt_ids:
300319
print(f"\nFailed to download or verify {len(missing_or_corrupt_ids)} files after {max_retries} retries.")
301320
print("Proceeding with available files.")
@@ -395,13 +414,16 @@ def stream_clean_files(data_type: str):
395414
fpath = os.path.join(manifest, folder, fname)
396415

397416
# ---- read single file ------------------------------
398-
if fpath.endswith(".gz"): # mutation data
399-
with gzip.open(fpath, "rt") as fh:
400-
df = pl.read_csv(fh, separator="\t", skip_rows=7)
417+
if fpath.endswith(".gz"): # mutation data is always gzipped
418+
try:
419+
df = pl.read_csv(fpath, separator="\t", skip_rows=7)
420+
except Exception as e:
421+
print(f"[warn] skipping MAF due to read error: {fpath} ({type(e).__name__}: {e})")
422+
continue
401423
else: # copy-number / tx
402424
skip = 1 if data_type == "transcriptomics" else 0
403425
df = pl.read_csv(fpath, separator="\t", skip_rows=skip)
404-
426+
405427
df = df.with_columns(pl.lit(folder).alias("file_id"))
406428

407429
if data_type == "transcriptomics":

coderbuild/pancreatic/02-getPancreaticData.py

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,14 @@ def ensure_gdc_client():
9898
else:
9999
print("gdc-client already installed")
100100

101+
def _df_chunks(df, size):
102+
"""
103+
Helper function to yield chunks of a DataFrame.
104+
This is so the GDC tool can process smaller batches of data.
105+
"""
106+
for i in range(0, len(df), size):
107+
yield df.iloc[i:i+size]
108+
101109

102110

103111
def extract_uuids_from_manifest(manifest_data):
@@ -183,7 +191,7 @@ def use_gdc_tool(manifest_data, data_type, download_data):
183191

184192
# Initialize retry variables
185193
retries = 0
186-
max_retries = 1
194+
max_retries = 5
187195

188196
# Function to get downloaded file IDs
189197
def get_downloaded_ids(manifest_loc):
@@ -211,9 +219,17 @@ def _verify_md5(file_id, expected_md5, expected_filename):
211219
return False
212220

213221
# Initial download attempt
214-
print("Starting secondary download...")
215-
subprocess.run(['./gdc-client', 'download', '-d', manifest_loc, '-m', 'new_manifest.txt'])
216-
print("Secondary download complete.")
222+
print("Starting download...")
223+
# subprocess.run(['./gdc-client', 'download', '-d', manifest_loc, '-m', 'new_manifest.txt'])
224+
batch_size = 150
225+
print(f"Starting batched download ({batch_size} per batch)...")
226+
for bi, batch_df in enumerate(_df_chunks(newfm, batch_size), start=1):
227+
tmp_manifest = f"manifest_batch_{bi:04d}.txt"
228+
batch_df.to_csv(tmp_manifest, sep="\t", index=False)
229+
print(f" Batch {bi}: downloading {len(batch_df)} files …")
230+
subprocess.run(['./gdc-client', 'download', '-d', manifest_loc, '-m', tmp_manifest])
231+
os.remove(tmp_manifest)
232+
print("Batched download complete.")
217233

218234
# Check for missing or corrupt files and retry if necessary
219235
while retries <= max_retries:
@@ -256,12 +272,14 @@ def _verify_md5(file_id, expected_md5, expected_filename):
256272
print(f" Removed corrupt file: {file_id}")
257273

258274
# Create a new manifest with missing or corrupt IDs
259-
retry_manifest = newfm[newfm['id'].isin(missing_or_corrupt_ids)]
260-
retry_manifest.to_csv('retry_manifest.txt', sep='\t', index=False)
261-
262-
# Retry download
263-
print(f"Starting retry {retries} download...")
264-
subprocess.run(['./gdc-client', 'download', '-d', manifest_loc, '-m', 'retry_manifest.txt'])
275+
retry_df = newfm[newfm['id'].isin(missing_or_corrupt_ids)]
276+
print(f"Starting retry {retries} in batches of {batch_size} …")
277+
for bi, batch_df in enumerate(_df_chunks(retry_df, batch_size), start=1):
278+
tmp_manifest = f"retry_{retries:02d}_batch_{bi:04d}.txt"
279+
batch_df.to_csv(tmp_manifest, sep="\t", index=False)
280+
print(f" Retry {retries} · Batch {bi}: {len(batch_df)} files")
281+
subprocess.run(['./gdc-client', 'download', '-d', manifest_loc, '-m', tmp_manifest], check=True)
282+
os.remove(tmp_manifest)
265283
print(f"Retry {retries} complete.")
266284

267285
if missing_or_corrupt_ids:

0 commit comments

Comments (0)