2727DEFAULT_CONSENSUS_THRESHOLD = 0.8
2828VARIANT_EFFECTS_SUFFIX = "_variant_effects.tsv"
2929
30- SNPSIFT_COLUMNS = [
31- "chrom" ,
32- "ref" ,
33- "pos" ,
34- "alt" ,
35- "af" ,
36- "ac" ,
37- "dp" ,
38- "ref_dp" ,
39- "alt_dp" ,
40- "alt_freq" ,
41- "mq" ,
42- "gene" ,
43- "effect" ,
44- "hgvs_p" ,
45- "cds_pos" ,
46- "aa_pos" ,
30+ # Single source of truth for SnpSift column definitions.
31+ # Each tuple: (original_header_name, final_column_name, polars_dtype).
32+ # Declaring every dtype explicitly gives all input files the same schema instead of relying on per-file type inference,
33+ # which can disagree when a column is empty in some files but numeric in others. NOTE(review): the scan_csv call in this diff reads with has_header=False and the schema derived from this list, so the tuple order presumably must match the column order of the SnpSift TSV - confirm.
34+ SNPSIFT_COLUMN_DEFS = [
35+ ("CHROM" , "chrom" , pl .String ),
36+ ("REF" , "ref" , pl .String ),
37+ ("POS" , "pos" , pl .Int64 ),
38+ ("ALT" , "alt" , pl .String ),
39+ ("AF" , "af" , pl .Float64 ),
40+ ("AC" , "ac" , pl .Int64 ),
41+ ("DP" , "dp" , pl .Int64 ),
42+ ("GEN[0].REF_DP" , "ref_dp" , pl .Int64 ),
43+ ("GEN[0].ALT_DP" , "alt_dp" , pl .Int64 ),
44+ ("GEN[0].ALT_FREQ" , "alt_freq" , pl .Float64 ),
45+ ("MQ" , "mq" , pl .Float64 ),
46+ ("ANN[0].GENE" , "gene" , pl .String ),
47+ ("ANN[0].EFFECT" , "effect" , pl .String ),
48+ ("ANN[0].HGVS_P" , "hgvs_p" , pl .String ),
49+ ("ANN[0].CDS_POS" , "cds_pos" , pl .Int64 ),
50+ ("ANN[0].AA_POS" , "aa_pos" , pl .Int64 ),
4751]
4852
53+ # Constants derived from SNPSIFT_COLUMN_DEFS, all in definition order: original header -> dtype (schema), original header -> final name (rename map), and the list of final column names.
54+ SNPSIFT_SCHEMA = {orig : dtype for orig , _ , dtype in SNPSIFT_COLUMN_DEFS }
55+ SNPSIFT_RENAME_MAP = {orig : final for orig , final , _ in SNPSIFT_COLUMN_DEFS }
56+ SNPSIFT_COLUMNS = [final for _ , final , _ in SNPSIFT_COLUMN_DEFS ]
57+
4958FINAL_COLUMNS = [
5059 "sample_id" ,
5160 "chrom" ,
@@ -160,37 +169,12 @@ def load_single_file(sample_id: str, file_path: Path) -> pl.LazyFrame:
160169 pl .scan_csv (
161170 file_path ,
162171 separator = "\t " ,
163- has_header = True ,
172+ has_header = False ,
173+ skip_rows = 1 ,
174+ schema = SNPSIFT_SCHEMA ,
164175 null_values = ["" , "." ],
165176 )
166- .select (
167- pl .all ().name .map (
168- lambda c : c .lower ()
169- .replace ("[" , "_" )
170- .replace ("]" , "" )
171- .replace ("." , "_" ),
172- ),
173- )
174- .rename (
175- {
176- "chrom" : "chrom" ,
177- "ref" : "ref" ,
178- "pos" : "pos" ,
179- "alt" : "alt" ,
180- "af" : "af" ,
181- "ac" : "ac" ,
182- "dp" : "dp" ,
183- "gen_0_ref_dp" : "ref_dp" ,
184- "gen_0_alt_dp" : "alt_dp" ,
185- "gen_0_alt_freq" : "alt_freq" ,
186- "mq" : "mq" ,
187- "ann_0_gene" : "gene" ,
188- "ann_0_effect" : "effect" ,
189- "ann_0_hgvs_p" : "hgvs_p" ,
190- "ann_0_cds_pos" : "cds_pos" ,
191- "ann_0_aa_pos" : "aa_pos" ,
192- },
193- )
177+ .rename (SNPSIFT_RENAME_MAP )
194178 .select (SNPSIFT_COLUMNS )
195179 .with_columns (pl .lit (sample_id ).alias ("sample_id" ))
196180 )
@@ -208,7 +192,7 @@ def variant_id_expr() -> pl.Expr:
208192 [
209193 pl .col ("chrom" ),
210194 pl .lit (":" ),
211- pl .col ("pos" ).cast (pl .Utf8 ),
195+ pl .col ("pos" ).cast (pl .String ),
212196 pl .lit (":" ),
213197 pl .col ("ref" ),
214198 pl .lit (">" ),
0 commit comments