25
25
26
26
from . import core
27
27
from . import provenance
28
- from .vcf_partition import partition_into_regions , region_filter
28
+ from . import vcf_utils
29
29
30
30
logger = logging .getLogger (__name__ )
31
31
@@ -229,8 +229,11 @@ def scan_vcfs(paths, show_progress, target_num_partitions):
229
229
raise ValueError ("Incompatible VCF chunks" )
230
230
vcf_metadata .num_records += vcf .num_records
231
231
232
- # https://github.com/pystatgen/sgkit/issues/1200
233
- regions = partition_into_regions (path , num_parts = target_num_partitions )
232
+ # TODO: Move all our usage of the VCF class behind IndexedVcf
233
+ # so that we open the VCF once, and we explicitly set the index.
234
+ # Otherwise cyvcf2 will do things behind our backs.
235
+ indexed_vcf = vcf_utils .IndexedVcf (path )
236
+ regions = indexed_vcf .partition_into_regions (num_parts = target_num_partitions )
234
237
for region in regions :
235
238
partitions .append (
236
239
# Requires cyvcf2>=0.30.27
@@ -241,8 +244,7 @@ def scan_vcfs(paths, show_progress, target_num_partitions):
241
244
)
242
245
# TODO figure out if this is safe when we have multiple chrs
243
246
# in the file
244
- # FIXME this isn't working because region strings don't sort correctly
245
- partitions .sort (key = lambda x : x .region )
247
+ partitions .sort (key = lambda x : (x .region .contig , x .region .start ))
246
248
vcf_metadata .partitions = partitions
247
249
return vcf_metadata , header
248
250
@@ -805,14 +807,20 @@ def convert_partition(
805
807
format_fields .append (field )
806
808
807
809
def variants ():
808
- with warnings .catch_warnings ():
809
- # TODO cyvcf2 emits a warning for empty regions; either make the
810
- # warning more specific, or remove the need for querying empty
811
- # regions.
812
- # FIXME this also absorbs any warnings emitted within the loop,
813
- # so definitely need to do this a different way.
814
- warnings .simplefilter ("ignore" )
815
- for var in region_filter (vcf (partition .region ), partition .region ):
810
+ # with warnings.catch_warnings():
811
+ # # TODO cyvcf2 emits a warning for empty regions; either make the
812
+ # # warning more specific, or remove the need for querying empty
813
+ # # regions.
814
+ # # FIXME this also absorbs any warnings emitted within the loop,
815
+ # # so definitely need to do this a different way.
816
+ # warnings.simplefilter("ignore")
817
+ # for var in region_filter(vcf(partition.region), partition.region):
818
+ # yield var
819
+
820
+ # TODO move this into the IndexedVcf class
821
+ start = 1 if partition .region .start is None else partition .region .start
822
+ for var in vcf (str (partition .region )):
823
+ if var .POS >= start :
816
824
yield var
817
825
818
826
# FIXME it looks like this is actually a bit pointless now that we
0 commit comments