Skip to content

Commit e899295

Browse files
Fix region sorting problem
1 parent f3c11d8 commit e899295

File tree

1 file changed

+15
-10
lines changed

1 file changed

+15
-10
lines changed

bio2zarr/vcf.py

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ def smallest_dtype(self):
130130
class VcfPartition:
131131
vcf_path: str
132132
region: str
133+
num_records: int = -1
133134

134135

135136
@dataclasses.dataclass
@@ -241,10 +242,10 @@ def scan_vcfs(paths, show_progress, target_num_partitions):
241242
region=region,
242243
)
243244
)
244-
# TODO figure out if this is correct. It should be fine because of VCF
245-
# sorting, but need to verify there isn't some loophole with BCF, where
246-
# contigs are sorted by their index in the list rather than string value.
247-
partitions.sort(key=lambda x: (x.region.contig, x.region.start))
245+
# Sort by contig (in the order they appear in the header) first,
246+
# then by start coordinate
247+
contig_index_map = {contig: j for j, contig in enumerate(vcf.seqnames)}
248+
partitions.sort(key=lambda x: (contig_index_map[x.region.contig], x.region.start))
248249
vcf_metadata.partitions = partitions
249250
return vcf_metadata, header
250251

@@ -864,9 +865,10 @@ def variants():
864865

865866
logger.info(
866867
f"Finish p{partition_index} {partition.vcf_path}__{partition.region}="
867-
f"{num_records} records")
868+
f"{num_records} records"
869+
)
868870

869-
return tcw.field_summaries, num_records
871+
return partition_index, tcw.field_summaries, num_records
870872

871873
@staticmethod
872874
def convert(
@@ -904,11 +906,14 @@ def convert(
904906
)
905907
num_records = 0
906908
partition_summaries = []
907-
for partition_summary, partition_num_records in pwm.results_as_completed():
908-
num_records += partition_num_records
909-
partition_summaries.append(partition_summary)
909+
for index, summary, num_records in pwm.results_as_completed():
910+
partition_summaries.append(summary)
911+
vcf_metadata.partitions[index].num_records = num_records
910912

911-
assert num_records == pcvcf.num_records
913+
total_records = sum(
914+
partition.num_records for partition in vcf_metadata.partitions
915+
)
916+
assert total_records == pcvcf.num_records
912917

913918
for field in vcf_metadata.fields:
914919
# Clear the summary to avoid problems when running in debug

0 commit comments

Comments
 (0)