Skip to content

Commit 119bc8f

Browse files
authored
Allow missing age values in mt merge pipeline (#1785)
* tolerate missing ages
1 parent fa3bc0f commit 119bc8f

File tree

2 files changed

+17
-15
lines changed

2 files changed

+17
-15
lines changed

all_of_us/mitochondria/mtSwirl_refactor/add_annotations.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,9 @@ def add_age_and_pop(input_mt: hl.MatrixTable, participant_data: str) -> hl.Matri
177177
:return: MatrixTable with select age and pop annotations added
178178
"""
179179
ht = hl.import_table(
180-
participant_data, types={"age": hl.tint32, "pop": hl.tstr},
180+
participant_data,
181+
types={"age": hl.tint32, "pop": hl.tstr},
182+
missing=["", "NA", "NaN", "nan"],
181183
).key_by("s")
182184

183185
ht = ht.select("age", "pop")

all_of_us/mitochondria/mt_coverage_merge.wdl

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -400,7 +400,7 @@ task build_vcf_shard_mt {
400400
}
401401

402402
runtime {
403-
docker: "us.gcr.io/broad-gotc-prod/aou-mitochondrial-combine-vcfs-covdb:1.0.0"
403+
docker: "us.gcr.io/broad-gotc-prod/aou-mitochondrial-combine-vcfs-covdb:1.0.1"
404404
memory: memory_gb + " GB"
405405
cpu: cpu
406406
disks: "local-disk " + disk_gb + " " + disk_type
@@ -599,7 +599,7 @@ task merge_mt_shards {
599599
}
600600

601601
runtime {
602-
docker: "us.gcr.io/broad-gotc-prod/aou-mitochondrial-combine-vcfs-covdb:1.0.0"
602+
docker: "us.gcr.io/broad-gotc-prod/aou-mitochondrial-combine-vcfs-covdb:1.0.1"
603603
memory: memory_gb + " GB"
604604
cpu: cpu
605605
disks: "local-disk " + disk_gb + " " + disk_type
@@ -699,7 +699,7 @@ task finalize_mt_with_covdb {
699699
}
700700

701701
runtime {
702-
docker: "us.gcr.io/broad-gotc-prod/aou-mitochondrial-combine-vcfs-covdb:1.0.0"
702+
docker: "us.gcr.io/broad-gotc-prod/aou-mitochondrial-combine-vcfs-covdb:1.0.1"
703703
memory: memory_gb + " GB"
704704
cpu: cpu
705705
disks: "local-disk " + disk_gb + " " + disk_type
@@ -843,17 +843,17 @@ task process_tsv_files {
843843
if filtered_df.shape[0] != df.shape[0]:
844844
raise ValueError("Filtered DataFrame does not have the same number of samples as the original.")
845845
846-
# Calculate age
847-
filtered_df['date_of_birth'] = pd.to_datetime(filtered_df['date_of_birth'])
848-
filtered_df['biosample_collection_date'] = pd.to_datetime(filtered_df['biosample_collection_date'])
846+
# Calculate age (missing/invalid dates are coerced to NaT, which yields a missing age)
847+
filtered_df['date_of_birth'] = pd.to_datetime(filtered_df['date_of_birth'], errors="coerce")
848+
filtered_df['biosample_collection_date'] = pd.to_datetime(
849+
filtered_df['biosample_collection_date'], errors="coerce"
850+
)
849851
filtered_df['age'] = pd.to_numeric(
850-
np.floor((filtered_df['biosample_collection_date'] - filtered_df['date_of_birth']).dt.days / 365)
852+
np.floor((filtered_df['biosample_collection_date'] - filtered_df['date_of_birth']).dt.days / 365),
853+
errors="coerce"
851854
)
852-
853-
# Age must be an int and must be present
854-
filtered_df['age'] = filtered_df['age'].astype(int)
855-
if filtered_df['age'].isna().any():
856-
raise ValueError("Unexpected missing ages detected.")
855+
# Use pandas nullable integer dtype so missing ages are kept as <NA> instead of failing the integer cast.
856+
filtered_df['age'] = filtered_df['age'].astype("Int64")
857857
858858
# Rename columns for compatibility
859859
filtered_df.rename(columns={"mean_coverage": "wgs_mean_coverage"}, inplace=True)
@@ -1013,7 +1013,7 @@ task combine_vcfs_and_homref_from_covdb {
10131013

10141014
runtime {
10151015
# NOTE: This must be a Hail-capable image with mtSwirl code baked in at /opt/mtSwirl.
1016-
docker: "us.gcr.io/broad-gotc-prod/aou-mitochondrial-combine-vcfs-covdb:1.0.0"
1016+
docker: "us.gcr.io/broad-gotc-prod/aou-mitochondrial-combine-vcfs-covdb:1.0.1"
10171017
memory: memory_gb + " GB"
10181018
cpu: cpu
10191019
disks: "local-disk " + disk_gb + " " + disk_type
@@ -1104,7 +1104,7 @@ task add_annotations {
11041104
}
11051105

11061106
runtime {
1107-
docker: "us.gcr.io/broad-gotc-prod/aou-mitochondrial-combine-vcfs-covdb:1.0.0"
1107+
docker: "us.gcr.io/broad-gotc-prod/aou-mitochondrial-combine-vcfs-covdb:1.0.1"
11081108
memory: memory_gb + " GB"
11091109
cpu: cpu
11101110
disks: "local-disk " + disk_gb + " " + disk_type

0 commit comments

Comments
 (0)