Skip to content

Commit 4a0ff4c

Browse files
committed
Bug fix for cases where CSV type inference can be led astray in the variant table collection script
1 parent 65dca28 commit 4a0ff4c

File tree

1 file changed

+31
-47
lines changed

1 file changed

+31
-47
lines changed

bin/collect_full_variant_table.py

Lines changed: 31 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -27,25 +27,34 @@
2727
DEFAULT_CONSENSUS_THRESHOLD = 0.8
2828
VARIANT_EFFECTS_SUFFIX = "_variant_effects.tsv"
2929

30-
SNPSIFT_COLUMNS = [
31-
"chrom",
32-
"ref",
33-
"pos",
34-
"alt",
35-
"af",
36-
"ac",
37-
"dp",
38-
"ref_dp",
39-
"alt_dp",
40-
"alt_freq",
41-
"mq",
42-
"gene",
43-
"effect",
44-
"hgvs_p",
45-
"cds_pos",
46-
"aa_pos",
30+
# Single source of truth for SnpSift column definitions.
31+
# Each tuple: (original_header_name, final_column_name, polars_dtype)
32+
# Passing these dtypes as an explicit schema replaces per-file type inference,
33+
# avoiding errors when columns are empty in some files but numeric in others.
34+
SNPSIFT_COLUMN_DEFS = [
35+
("CHROM", "chrom", pl.String),
36+
("REF", "ref", pl.String),
37+
("POS", "pos", pl.Int64),
38+
("ALT", "alt", pl.String),
39+
("AF", "af", pl.Float64),
40+
("AC", "ac", pl.Int64),
41+
("DP", "dp", pl.Int64),
42+
("GEN[0].REF_DP", "ref_dp", pl.Int64),
43+
("GEN[0].ALT_DP", "alt_dp", pl.Int64),
44+
("GEN[0].ALT_FREQ", "alt_freq", pl.Float64),
45+
("MQ", "mq", pl.Float64),
46+
("ANN[0].GENE", "gene", pl.String),
47+
("ANN[0].EFFECT", "effect", pl.String),
48+
("ANN[0].HGVS_P", "hgvs_p", pl.String),
49+
("ANN[0].CDS_POS", "cds_pos", pl.Int64),
50+
("ANN[0].AA_POS", "aa_pos", pl.Int64),
4751
]
4852

53+
# Derived constants from the single source of truth
54+
SNPSIFT_SCHEMA = {orig: dtype for orig, _, dtype in SNPSIFT_COLUMN_DEFS}
55+
SNPSIFT_RENAME_MAP = {orig: final for orig, final, _ in SNPSIFT_COLUMN_DEFS}
56+
SNPSIFT_COLUMNS = [final for _, final, _ in SNPSIFT_COLUMN_DEFS]
57+
4958
FINAL_COLUMNS = [
5059
"sample_id",
5160
"chrom",
@@ -160,37 +169,12 @@ def load_single_file(sample_id: str, file_path: Path) -> pl.LazyFrame:
160169
pl.scan_csv(
161170
file_path,
162171
separator="\t",
163-
has_header=True,
172+
has_header=False,
173+
skip_rows=1,
174+
schema=SNPSIFT_SCHEMA,
164175
null_values=["", "."],
165176
)
166-
.select(
167-
pl.all().name.map(
168-
lambda c: c.lower()
169-
.replace("[", "_")
170-
.replace("]", "")
171-
.replace(".", "_"),
172-
),
173-
)
174-
.rename(
175-
{
176-
"chrom": "chrom",
177-
"ref": "ref",
178-
"pos": "pos",
179-
"alt": "alt",
180-
"af": "af",
181-
"ac": "ac",
182-
"dp": "dp",
183-
"gen_0_ref_dp": "ref_dp",
184-
"gen_0_alt_dp": "alt_dp",
185-
"gen_0_alt_freq": "alt_freq",
186-
"mq": "mq",
187-
"ann_0_gene": "gene",
188-
"ann_0_effect": "effect",
189-
"ann_0_hgvs_p": "hgvs_p",
190-
"ann_0_cds_pos": "cds_pos",
191-
"ann_0_aa_pos": "aa_pos",
192-
},
193-
)
177+
.rename(SNPSIFT_RENAME_MAP)
194178
.select(SNPSIFT_COLUMNS)
195179
.with_columns(pl.lit(sample_id).alias("sample_id"))
196180
)
@@ -208,7 +192,7 @@ def variant_id_expr() -> pl.Expr:
208192
[
209193
pl.col("chrom"),
210194
pl.lit(":"),
211-
pl.col("pos").cast(pl.Utf8),
195+
pl.col("pos").cast(pl.String),
212196
pl.lit(":"),
213197
pl.col("ref"),
214198
pl.lit(">"),

0 commit comments

Comments
 (0)