Skip to content

Commit 999a70f

Browse files
Minimal changes to get things working again
1 parent 6c58c9e commit 999a70f

File tree

2 files changed

+43
-26
lines changed

2 files changed

+43
-26
lines changed

bio2zarr/vcf.py

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525

2626
from . import core
2727
from . import provenance
28-
from .vcf_partition import partition_into_regions, region_filter
28+
from . import vcf_utils
2929

3030
logger = logging.getLogger(__name__)
3131

@@ -229,8 +229,11 @@ def scan_vcfs(paths, show_progress, target_num_partitions):
229229
raise ValueError("Incompatible VCF chunks")
230230
vcf_metadata.num_records += vcf.num_records
231231

232-
# https://github.com/pystatgen/sgkit/issues/1200
233-
regions = partition_into_regions(path, num_parts=target_num_partitions)
232+
# TODO: Move all our usage of the VCF class behind the IndexedVCF
233+
# so that we open the VCF once, and we explicitly set the index.
234+
# Otherwise cyvcf2 will do things behind our backs.
235+
indexed_vcf = vcf_utils.IndexedVcf(path)
236+
regions = indexed_vcf.partition_into_regions(num_parts=target_num_partitions)
234237
for region in regions:
235238
partitions.append(
236239
# Requires cyvcf2>=0.30.27
@@ -241,8 +244,7 @@ def scan_vcfs(paths, show_progress, target_num_partitions):
241244
)
242245
# TODO figure out if this is safe when we have multiple chrs
243246
# in the file
244-
# FIXME this isn't working because region strings don't sort correctly
245-
partitions.sort(key=lambda x: x.region)
247+
partitions.sort(key=lambda x: (x.region.contig, x.region.start))
246248
vcf_metadata.partitions = partitions
247249
return vcf_metadata, header
248250

@@ -805,14 +807,20 @@ def convert_partition(
805807
format_fields.append(field)
806808

807809
def variants():
808-
with warnings.catch_warnings():
809-
# TODO cyvcf2 emits a warning for empty regions; either make the
810-
# warning more specific, or remove the need for querying empty
811-
# regions.
812-
# FIXME this also absorbs any warnings emitted within the loop,
813-
# so definitely need to do this a different way.
814-
warnings.simplefilter("ignore")
815-
for var in region_filter(vcf(partition.region), partition.region):
810+
# with warnings.catch_warnings():
811+
# # TODO cyvcf2 emits a warning for empty regions; either make the
812+
# # warning more specific, or remove the need for querying empty
813+
# # regions.
814+
# # FIXME this also absorbs any warnings emitted within the loop,
815+
# # so definitely need to do this a different way.
816+
# warnings.simplefilter("ignore")
817+
# for var in region_filter(vcf(partition.region), partition.region):
818+
# yield var
819+
820+
# TODO move this into the IndexedVCF class
821+
start = 1 if partition.region.start is None else partition.region.start
822+
for var in vcf(str(partition.region)):
823+
if var.POS >= start:
816824
yield var
817825

818826
# FIXME it looks like this is actually a bit pointless now that we

bio2zarr/vcf_utils.py

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -52,11 +52,18 @@ def region_string(contig: str, start: int, end: Optional[int] = None) -> str:
5252
else:
5353
return f"{contig}:{start}-"
5454

55+
5556
@dataclass
5657
class Region:
5758
contig: str
5859
start: Optional[int] = None
59-
end: Optional[int]=None
60+
end: Optional[int] = None
61+
62+
def __post_init__(self):
63+
if self.start is not None:
64+
self.start = int(self.start)
65+
if self.end is not None:
66+
self.end = int(self.end)
6067

6168
def __str__(self):
6269
s = f"{self.contig}"
@@ -68,6 +75,7 @@ def __str__(self):
6875

6976
# TODO add "parse" class method
7077

78+
7179
def get_tabix_path(
7280
vcf_path: PathType, storage_options: Optional[Dict[str, str]] = None
7381
) -> Optional[str]:
@@ -530,18 +538,19 @@ def read_tabix(
530538
)
531539

532540

533-
534541
class IndexedVcf:
535-
def __init__(self, path, index_path=None):
536-
# for h in vcf.header_iter():
537-
# print(h)
538-
# if index_path is None:
539-
# index_path = get_tabix_path(vcf_path, storage_options=storage_options)
540-
# if index_path is None:
541-
# index_path = get_csi_path(vcf_path, storage_options=storage_options)
542-
# if index_path is None:
543-
# raise ValueError("Cannot find .tbi or .csi file.")
544-
self.vcf_path = path
542+
def __init__(self, vcf_path, index_path=None):
543+
vcf_path = pathlib.Path(vcf_path)
544+
if index_path is None:
545+
index_path = vcf_path.with_suffix(vcf_path.suffix + ".tbi")
546+
if not index_path.exists():
547+
index_path = vcf_path.with_suffix(vcf_path.suffix + ".csi")
548+
if not index_path.exists():
549+
raise ValueError("Cannot find .tbi or .csi file.")
550+
else:
551+
index_path = pathlib.Path(index_path)
552+
553+
self.vcf_path = vcf_path
545554
self.index_path = index_path
546555
self.file_type = None
547556
self.index_type = None
@@ -561,7 +570,7 @@ def __init__(self, path, index_path=None):
561570
self.file_type = "vcf"
562571
self.sequence_names = self.index.parse_vcf_aux()
563572
else:
564-
with contextlib.closing(cyvcf2.VCF(path)) as vcf:
573+
with contextlib.closing(cyvcf2.VCF(vcf_path)) as vcf:
565574
self.sequence_names = vcf.seqnames
566575
else:
567576
self.sequence_names = self.index.sequence_names

0 commit comments

Comments
 (0)