@@ -130,6 +130,7 @@ def smallest_dtype(self):
130
130
class VcfPartition :
131
131
vcf_path : str
132
132
region : str
133
+ num_records : int = - 1
133
134
134
135
135
136
@dataclasses .dataclass
@@ -241,10 +242,10 @@ def scan_vcfs(paths, show_progress, target_num_partitions):
241
242
region = region ,
242
243
)
243
244
)
244
- # TODO figure out if this is correct. It should be fine because of VCF
245
- # sorting, but need to verify there isn't some loophole with BCF, where
246
- # contigs are sorted by their index in the list rather than string value.
247
- partitions .sort (key = lambda x : (x .region .contig , x .region .start ))
245
+ # Sort by contig (in the order they appear in the header) first,
246
+ # then by start coordinate
247
+ contig_index_map = { contig : j for j , contig in enumerate ( vcf . seqnames )}
248
+ partitions .sort (key = lambda x : (contig_index_map [ x .region .contig ] , x .region .start ))
248
249
vcf_metadata .partitions = partitions
249
250
return vcf_metadata , header
250
251
@@ -864,9 +865,10 @@ def variants():
864
865
865
866
logger .info (
866
867
f"Finish p{ partition_index } { partition .vcf_path } __{ partition .region } ="
867
- f"{ num_records } records" )
868
+ f"{ num_records } records"
869
+ )
868
870
869
- return tcw .field_summaries , num_records
871
+ return partition_index , tcw .field_summaries , num_records
870
872
871
873
@staticmethod
872
874
def convert (
@@ -904,11 +906,14 @@ def convert(
904
906
)
905
907
num_records = 0
906
908
partition_summaries = []
907
- for partition_summary , partition_num_records in pwm .results_as_completed ():
908
- num_records += partition_num_records
909
- partition_summaries . append ( partition_summary )
909
+ for index , summary , num_records in pwm .results_as_completed ():
910
+ partition_summaries . append ( summary )
911
+ vcf_metadata . partitions [ index ]. num_records = num_records
910
912
911
- assert num_records == pcvcf .num_records
913
+ total_records = sum (
914
+ partition .num_records for partition in vcf_metadata .partitions
915
+ )
916
+ assert total_records == pcvcf .num_records
912
917
913
918
for field in vcf_metadata .fields :
914
919
# Clear the summary to avoid problems when running in debug
0 commit comments