2020for validation. Features a beautiful command-line interface built with Typer and Rich.
2121"""
2222
23+ import gzip
2324from enum import Enum
2425from pathlib import Path
2526from typing import Annotated , cast
@@ -97,9 +98,17 @@ class IvarVariant(BaseModel):
9798 @field_validator ("ref_rv" , "alt_rv" )
9899 @classmethod
99100 def validate_reverse_depth (cls , v : int , info : ValidationInfo ) -> int :
100- """Ensure reverse depth doesn't exceed total depth."""
101- if "ref_dp" in info .data and v > info .data ["ref_dp" ]:
102- msg = "Reverse depth cannot exceed total depth"
101+ """Ensure reverse depth doesn't exceed total depth for that allele."""
102+ if info .field_name == "ref_rv" :
103+ depth_field = "ref_dp"
104+ elif info .field_name == "alt_rv" :
105+ depth_field = "alt_dp"
106+ else :
107+ msg = f"Unexpected field in reverse depth validator: { info .field_name } "
108+ raise ValueError (msg )
109+
110+ if depth_field in info .data and v > info .data [depth_field ]:
111+ msg = f"Reverse depth ({ info .field_name } ) cannot exceed total depth ({ depth_field } )"
103112 raise ValueError (msg )
104113 return v
105114
@@ -178,7 +187,10 @@ def validate_output_dir(cls, v: Path) -> Path:
178187
179188
180189def calculate_strand_bias_pvalue (
181- ref_dp : int , ref_rv : int , alt_dp : int , alt_rv : int
190+ ref_dp : int ,
191+ ref_rv : int ,
192+ alt_dp : int ,
193+ alt_rv : int ,
182194) -> float :
183195 """Calculate p-value for strand bias using Fisher's exact test.
184196
@@ -199,7 +211,7 @@ def calculate_strand_bias_pvalue(
199211 )
200212 _odds_ratio , pvalue = cast (
201213 "tuple[float, float]" ,
202- fisher_exact (contingency_table , alternative = "greater " ),
214+ fisher_exact (contingency_table , alternative = "two-sided " ),
203215 )
204216 return pvalue
205217
@@ -289,9 +301,7 @@ def create_filter_expr(config: ConversionConfig) -> pl.Expr:
289301
290302 # iVar PASS filter
291303 filters .append (
292- pl .when (pl .col ("PASS" ))
293- .then (pl .lit ("" ))
294- .otherwise (pl .lit (FilterType .FAIL_TEST .value )),
304+ pl .when (pl .col ("PASS" )).then (pl .lit ("" )).otherwise (pl .lit (FilterType .FAIL_TEST .value )),
295305 )
296306
297307 # Quality filter
@@ -317,7 +327,8 @@ def create_filter_expr(config: ConversionConfig) -> pl.Expr:
317327 .list .join (";" )
318328 .fill_null ("" )
319329 .map_elements (
320- lambda x : FilterType .PASS .value if x == "" else x , return_dtype = pl .Utf8
330+ lambda x : FilterType .PASS .value if x == "" else x ,
331+ return_dtype = pl .Utf8 ,
321332 )
322333 )
323334
@@ -345,7 +356,8 @@ def create_sample_info_expr() -> pl.Expr:
345356
346357
347358def transform_ivar_to_vcf (
348- ivar_lf : pl .LazyFrame , config : ConversionConfig
359+ ivar_lf : pl .LazyFrame ,
360+ config : ConversionConfig ,
349361) -> pl .LazyFrame :
350362 """Transform iVar data to VCF format using pure expressions.
351363
@@ -382,7 +394,7 @@ def transform_ivar_to_vcf(
382394 ],
383395 ).alias ("INFO" ),
384396 pl .lit (
385- "GT:DP:REF_DP:REF_RV:REF_QUAL:ALT_DP:ALT_RV:ALT_QUAL:ALT_FREQ"
397+ "GT:DP:REF_DP:REF_RV:REF_QUAL:ALT_DP:ALT_RV:ALT_QUAL:ALT_FREQ" ,
386398 ).alias ("FORMAT" ),
387399 create_sample_info_expr ().alias ("SAMPLE" ),
388400 ],
@@ -400,7 +412,8 @@ def find_consecutive_variants_expr() -> pl.Expr:
400412
401413
402414def process_consecutive_snps (
403- ivar_lf : pl .LazyFrame , config : ConversionConfig
415+ ivar_lf : pl .LazyFrame ,
416+ config : ConversionConfig ,
404417) -> pl .LazyFrame :
405418 """Process consecutive SNPs for potential merging.
406419
@@ -509,7 +522,6 @@ def write_vcf_file(
509522 # Write file (handle gzipped output)
510523 if str (filepath ).endswith (".gz" ):
511524 # For gzip, we need to write everything as text to the same handle
512- import gzip
513525
514526 with gzip .open (filepath , "wt" ) as f :
515527 f .write (header_text )
@@ -539,6 +551,52 @@ def process_ivar_file(config: ConversionConfig) -> None:
539551 task = progress .add_task ("[cyan]Loading iVar data..." , total = None )
540552 ivar_df = pl .scan_csv (str (config .file_in ), separator = "\t " )
541553
554+ # Check if input has any data rows (collect schema to check)
555+ progress .update (task , description = "[yellow]Checking input data..." )
556+ try :
557+ row_count = ivar_df .select (pl .len ()).collect ().item ()
558+ except pl .exceptions .NoDataError :
559+ row_count = 0
560+
561+ # Generate headers (needed regardless of data)
562+ progress .update (task , description = "[yellow]Generating VCF headers..." )
563+ headers = generate_vcf_header (config )
564+ sample_name = config .file_in .stem
565+
566+ if row_count == 0 :
567+ # Handle empty input: write VCF with headers only
568+ progress .update (
569+ task ,
570+ description = "[yellow]No variants found, writing empty VCF..." ,
571+ )
572+ empty_df = pl .DataFrame (
573+ schema = {
574+ "CHROM" : pl .Utf8 ,
575+ "POS" : pl .Int64 ,
576+ "ID" : pl .Utf8 ,
577+ "REF" : pl .Utf8 ,
578+ "ALT" : pl .Utf8 ,
579+ "QUAL" : pl .Utf8 ,
580+ "FILTER" : pl .Utf8 ,
581+ "INFO" : pl .Utf8 ,
582+ "FORMAT" : pl .Utf8 ,
583+ "SAMPLE" : pl .Utf8 ,
584+ },
585+ )
586+ write_vcf_file (empty_df , config .file_out , headers , sample_name )
587+
588+ all_hap_path = (
589+ config .file_out .parent / f"{ config .file_out .stem } _all_hap{ config .file_out .suffix } "
590+ )
591+ write_vcf_file (empty_df , all_hap_path , headers , sample_name )
592+
593+ progress .update (
594+ task ,
595+ description = "[bold yellow]✓ No variants found, empty VCF written" ,
596+ completed = True ,
597+ )
598+ return
599+
542600 # Transform to VCF format
543601 progress .update (task , description = "[yellow]Transforming to VCF format..." )
544602 vcf_df = transform_ivar_to_vcf (ivar_df , config )
@@ -551,27 +609,21 @@ def process_ivar_file(config: ConversionConfig) -> None:
551609 progress .update (task , description = "[yellow]Collecting results..." )
552610 result_df = processed_df .collect ()
553611
554- # Generate headers
555- progress .update (task , description = "[yellow]Generating VCF headers..." )
556- headers = generate_vcf_header (config )
557-
558- # Get sample name from input file
559- sample_name = config .file_in .stem
560-
561612 # Write consensus output
562613 progress .update (task , description = "[green]Writing consensus VCF..." )
563614 write_vcf_file (result_df , config .file_out , headers , sample_name )
564615
565616 # Write all haplotypes output
566617 progress .update (task , description = "[green]Writing all haplotypes VCF..." )
567618 all_hap_path = (
568- config .file_out .parent
569- / f"{ config .file_out .stem } _all_hap{ config .file_out .suffix } "
619+ config .file_out .parent / f"{ config .file_out .stem } _all_hap{ config .file_out .suffix } "
570620 )
571621 write_vcf_file (result_df , all_hap_path , headers , sample_name )
572622
573623 progress .update (
574- task , description = "[bold green]✓ Conversion complete!" , completed = True
624+ task ,
625+ description = "[bold green]✓ Conversion complete!" ,
626+ completed = True ,
575627 )
576628
577629
@@ -758,14 +810,13 @@ def convert( # noqa: PLR0913
758810
759811 # Success message
760812 console .print (
761- f"\n [bold green]✓[/bold green] Successfully converted to { config .file_out } "
813+ f"\n [bold green]✓[/bold green] Successfully converted to { config .file_out } " ,
762814 )
763815 all_hap_path = (
764- config .file_out .parent
765- / f"{ config .file_out .stem } _all_hap{ config .file_out .suffix } "
816+ config .file_out .parent / f"{ config .file_out .stem } _all_hap{ config .file_out .suffix } "
766817 )
767818 console .print (
768- f"[bold green]✓[/bold green] All haplotypes written to { all_hap_path } "
819+ f"[bold green]✓[/bold green] All haplotypes written to { all_hap_path } " ,
769820 )
770821
771822 except Exception as e : # noqa: BLE001
@@ -774,7 +825,7 @@ def convert( # noqa: PLR0913
774825
775826
776827@app .command ()
777- def validate (
828+ def validate ( # noqa: C901, PLR0912
778829 file_path : Annotated [
779830 Path ,
780831 typer .Argument (
@@ -792,8 +843,6 @@ def validate(
792843 console .print (f"[cyan]Validating VCF file:[/cyan] { file_path } " )
793844
794845 try :
795- import gzip
796-
797846 # Count header lines (handle gzipped files)
798847 header_count = 0
799848 if str (file_path ).endswith (".gz" ):
@@ -841,7 +890,7 @@ def validate(
841890
842891 if missing_cols :
843892 console .print (
844- f"\n [red]✗ Missing required columns:[/red] { ', ' .join (missing_cols )} "
893+ f"\n [red]✗ Missing required columns:[/red] { ', ' .join (missing_cols )} " ,
845894 )
846895 else :
847896 console .print ("\n [green]✓ All required VCF columns present[/green]" )
@@ -905,8 +954,7 @@ def stats(
905954 console .print ("\n [bold]Variant Types:[/bold]" )
906955 snp_count = len (
907956 ivar_df .filter (
908- ~ pl .col ("ALT" ).str .starts_with ("+" )
909- & ~ pl .col ("ALT" ).str .starts_with ("-" ),
957+ ~ pl .col ("ALT" ).str .starts_with ("+" ) & ~ pl .col ("ALT" ).str .starts_with ("-" ),
910958 ),
911959 )
912960 ins_count = len (ivar_df .filter (pl .col ("ALT" ).str .starts_with ("+" )))
@@ -929,8 +977,8 @@ def stats(
929977 for low , high in freq_bins :
930978 count = len (
931979 ivar_df .filter (
932- (pl .col ("ALT_FREQ" ) >= low ) & (pl .col ("ALT_FREQ" ) < high )
933- )
980+ (pl .col ("ALT_FREQ" ) >= low ) & (pl .col ("ALT_FREQ" ) < high ),
981+ ),
934982 )
935983 if count > 0 :
936984 console .print (f" • { low :.0%} -{ high :.0%} : { count :,} variants" )
0 commit comments