Skip to content

Commit e741038

Browse files
Quick hacks to see if uncompressed VCF is possible
1 parent 1f9274b commit e741038

File tree

3 files changed

+39
-24
lines changed

3 files changed

+39
-24
lines changed

bio2zarr/vcf2zarr/icf.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -354,13 +354,13 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
354354
# Note: this will be infinity here if any of the chunks has an index
355355
# that doesn't keep track of the number of records per-contig
356356
icf_metadata.num_records = total_records
357-
358-
# Sort by contig (in the order they appear in the header) first,
359-
# then by start coordinate
360-
contig_index_map = {contig.id: j for j, contig in enumerate(metadata.contigs)}
361-
all_partitions.sort(
362-
key=lambda x: (contig_index_map[x.region.contig], x.region.start)
363-
)
357+
if len(all_partitions) > 1:
358+
# Sort by contig (in the order they appear in the header) first,
359+
# then by start coordinate
360+
contig_index_map = {contig.id: j for j, contig in enumerate(metadata.contigs)}
361+
all_partitions.sort(
362+
key=lambda x: (contig_index_map[x.region.contig], x.region.start)
363+
)
364364
icf_metadata.partitions = all_partitions
365365
logger.info(f"Scan complete, resulting in {len(all_partitions)} partitions.")
366366
return icf_metadata, header

bio2zarr/vcf_utils.py

Lines changed: 28 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -84,11 +84,14 @@ class Region:
8484
A htslib style region, where coordinates are 1-based and inclusive.
8585
"""
8686

87-
contig: str
87+
contig: Optional[str] = None
8888
start: Optional[int] = None
8989
end: Optional[int] = None
9090

9191
def __post_init__(self):
92+
if self.contig is None:
93+
return
94+
9295
if self.start is not None:
9396
self.start = int(self.start)
9497
assert self.start > 0
@@ -408,9 +411,8 @@ def __init__(self, vcf_path, index_path=None):
408411
vcf_path.suffix + VcfIndexType.CSI.value
409412
)
410413
if not index_path.exists():
411-
raise FileNotFoundError(
412-
f"Cannot find .tbi or .csi file for {vcf_path}"
413-
)
414+
# Use this as a proxy for "no index"
415+
index_path = vcf_path
414416
else:
415417
index_path = pathlib.Path(index_path)
416418

@@ -424,14 +426,18 @@ def __init__(self, vcf_path, index_path=None):
424426
elif index_path.suffix == VcfIndexType.TABIX.value:
425427
self.index_type = VcfIndexType.TABIX
426428
self.file_type = VcfFileType.VCF
427-
else:
428-
raise ValueError("Only .tbi or .csi indexes are supported.")
429+
# else:
430+
431+
# raise ValueError("Only .tbi or .csi indexes are supported.")
429432

430433
self.vcf = cyvcf2.VCF(vcf_path)
431-
self.vcf.set_index(str(self.index_path))
434+
if self.index_type is not None:
435+
self.vcf.set_index(str(self.index_path))
436+
432437
logger.debug(f"Loaded {vcf_path} with index {self.index_path}")
433438
self.sequence_names = None
434439

440+
self.index = None
435441
if self.index_type == VcfIndexType.CSI:
436442
# Determine the file-type based on the "aux" field.
437443
self.index = read_csi(self.index_path)
@@ -441,7 +447,7 @@ def __init__(self, vcf_path, index_path=None):
441447
self.sequence_names = self.index.parse_vcf_aux()
442448
else:
443449
self.sequence_names = self.vcf.seqnames
444-
else:
450+
elif self.index_type == VcfIndexType.TABIX:
445451
self.index = read_tabix(self.index_path)
446452
self.sequence_names = self.index.sequence_names
447453

@@ -452,6 +458,8 @@ def __exit__(self, exc_type, exc_val, exc_tb):
452458
return False
453459

454460
def contig_record_counts(self):
461+
if self.index is None:
462+
return {None: np.inf}
455463
d = dict(zip(self.sequence_names, self.index.record_counts))
456464
if self.file_type == VcfFileType.BCF:
457465
d = {k: v for k, v in d.items() if v > 0}
@@ -461,11 +469,15 @@ def count_variants(self, region):
461469
return sum(1 for _ in self.variants(region))
462470

463471
def variants(self, region):
464-
start = 1 if region.start is None else region.start
465-
for var in self.vcf(str(region)):
466-
# Need to filter because of indels overlapping the region
467-
if var.POS >= start:
468-
yield var
472+
if self.index is None:
473+
assert region.contig is None
474+
yield from self.vcf
475+
else:
476+
start = 1 if region.start is None else region.start
477+
for var in self.vcf(str(region)):
478+
# Need to filter because of indels overlapping the region
479+
if var.POS >= start:
480+
yield var
469481

470482
def _filter_empty_and_refine(self, regions):
471483
"""
@@ -486,6 +498,9 @@ def partition_into_regions(
486498
num_parts: Optional[int] = None,
487499
target_part_size: Union[None, int, str] = None,
488500
):
501+
if self.index is None:
502+
return [Region()]
503+
489504
if num_parts is None and target_part_size is None:
490505
raise ValueError("One of num_parts or target_part_size must be specified")
491506

tests/test_simulated_data.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import sys
22

33
import numpy.testing as nt
4-
import pysam
54
import pytest
65
import sgkit as sg
76

@@ -45,9 +44,10 @@ def assert_ts_ds_equal(ts, ds, ploidy=1):
4544
def write_vcf(ts, vcf_path, contig_id="1"):
4645
with open(vcf_path, "w") as f:
4746
ts.write_vcf(f, contig_id=contig_id)
48-
# This also compresses the input file
49-
pysam.tabix_index(str(vcf_path), preset="vcf")
50-
return vcf_path.with_suffix(vcf_path.suffix + ".gz")
47+
# # This also compresses the input file
48+
# pysam.tabix_index(str(vcf_path), preset="vcf")
49+
# return vcf_path.with_suffix(vcf_path.suffix + ".gz")
50+
return vcf_path
5151

5252

5353
# https://github.com/sgkit-dev/bio2zarr/issues/336

0 commit comments

Comments
 (0)