Skip to content

Commit 9eb9d30

Browse files
Merge pull request #211 from jeromekelleher/final-vcfpartition-version
Final vcfpartition version
2 parents a1ddde0 + e363bf0 commit 9eb9d30

File tree

5 files changed

+102
-28
lines changed

5 files changed

+102
-28
lines changed

bio2zarr/cli.py

Lines changed: 39 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -518,10 +518,42 @@ def plink2zarr():
518518
@click.command
519519
@version
520520
@click.argument("vcf_path", type=click.Path())
521-
@click.option("-i", "--index", type=click.Path(), default=None)
522-
@click.option("-n", "--num-parts", type=int, default=None)
523-
# @click.option("-s", "--part-size", type=int, default=None)
524-
def vcfpartition(vcf_path, index, num_parts):
525-
indexed_vcf = vcf_utils.IndexedVcf(vcf_path, index)
526-
regions = indexed_vcf.partition_into_regions(num_parts=num_parts)
527-
click.echo("\n".join(map(str, regions)))
521+
@verbose
522+
@click.option(
523+
"-n",
524+
"--num-parts",
525+
type=int,
526+
default=None,
527+
help="Target number of partitions to split the VCF into",
528+
)
529+
@click.option(
530+
"-s",
531+
"--part-size",
532+
type=str,
533+
default=None,
534+
help="Target (compressed) size of VCF partitions, e.g. 100KB, 10MiB, 1G.",
535+
)
536+
def vcfpartition(vcf_path, verbose, num_parts, part_size):
537+
"""
538+
Output bcftools region strings that partition an indexed VCF/BCF file
539+
into either an approximate number of parts (-n), or parts of approximately
540+
a given size (-s). One of -n or -s must be supplied.
541+
542+
Note that both the number of partitions and sizes are a target, and the
543+
returned number of partitions may not exactly correspond. In particular,
544+
there is a maximum level of granularity determined by the associated index
545+
which cannot be exceeded.
546+
547+
Note also that the partitions returned may vary considerably in the number
548+
of records that they contain.
549+
"""
550+
setup_logging(verbose)
551+
if num_parts is None and part_size is None:
552+
raise click.UsageError("Either --num-parts or --part-size must be specified")
553+
554+
indexed_vcf = vcf_utils.IndexedVcf(vcf_path)
555+
regions = indexed_vcf.partition_into_regions(
556+
num_parts=num_parts, target_part_size=part_size
557+
)
558+
for region in regions:
559+
click.echo(f"{region}\t{vcf_path}")

bio2zarr/vcf2zarr/icf.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -263,9 +263,6 @@ def scan_vcf(path, target_num_partitions):
263263
)
264264

265265
regions = indexed_vcf.partition_into_regions(num_parts=target_num_partitions)
266-
logger.info(
267-
f"Split {path} into {len(regions)} regions (target={target_num_partitions})"
268-
)
269266
for region in regions:
270267
metadata.partitions.append(
271268
VcfPartition(
@@ -275,6 +272,10 @@ def scan_vcf(path, target_num_partitions):
275272
region=region,
276273
)
277274
)
275+
logger.info(
276+
f"Split {path} into {len(metadata.partitions)} "
277+
f"partitions target={target_num_partitions})"
278+
)
278279
core.update_progress(1)
279280
return metadata, vcf.raw_header
280281

bio2zarr/vcf_utils.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import contextlib
22
import gzip
3+
import logging
34
import os
45
import pathlib
56
import struct
@@ -13,6 +14,8 @@
1314

1415
from bio2zarr.typing import PathType
1516

17+
logger = logging.getLogger(__name__)
18+
1619
CSI_EXTENSION = ".csi"
1720
TABIX_EXTENSION = ".tbi"
1821
TABIX_LINEAR_INDEX_INTERVAL_SIZE = 1 << 14 # 16kb interval size
@@ -411,6 +414,7 @@ def __init__(self, vcf_path, index_path=None):
411414
raise ValueError("Only .tbi or .csi indexes are supported.")
412415
self.vcf = cyvcf2.VCF(vcf_path)
413416
self.vcf.set_index(str(self.index_path))
417+
logger.debug(f"Loaded {vcf_path} with index {self.index_path}")
414418
self.sequence_names = None
415419
if self.index_type == "csi":
416420
# Determine the file-type based on the "aux" field.
@@ -450,15 +454,16 @@ def variants(self, region):
450454
def _filter_empty_and_refine(self, regions):
451455
"""
452456
Return all regions in the specified list that have one or more records,
453-
and refine the start coordinate of the region to be the actual first coord
457+
and refine the start coordinate of the region to be the actual first coord.
458+
459+
Because this is a relatively expensive operation requiring seeking around
460+
the file, we return the results as an iterator.
454461
"""
455-
ret = []
456462
for region in regions:
457463
var = next(self.variants(region), None)
458464
if var is not None:
459465
region.start = var.POS
460-
ret.append(region)
461-
return ret
466+
yield region
462467

463468
def partition_into_regions(
464469
self,
@@ -490,7 +495,7 @@ def partition_into_regions(
490495
target_part_size_bytes = file_length // num_parts
491496
elif target_part_size_bytes is not None:
492497
num_parts = ceildiv(file_length, target_part_size_bytes)
493-
part_lengths = np.array([i * target_part_size_bytes for i in range(num_parts)])
498+
part_lengths = target_part_size_bytes * np.arange(num_parts, dtype=int)
494499
file_offsets, region_contig_indexes, region_positions = self.index.offsets()
495500

496501
# Search the file offsets to find which indexes the part lengths fall at

tests/test_cli.py

Lines changed: 41 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -697,21 +697,55 @@ def test_convert(self, tmp_path):
697697

698698

699699
class TestVcfPartition:
700+
path = "tests/data/vcf/NA12878.prod.chr20snippet.g.vcf.gz"
701+
700702
def test_num_parts(self):
701-
path = "tests/data/vcf/NA12878.prod.chr20snippet.g.vcf.gz"
703+
runner = ct.CliRunner(mix_stderr=False)
704+
result = runner.invoke(
705+
cli.vcfpartition, [self.path, "-n", "5"], catch_exceptions=False
706+
)
707+
assert result.stderr == ""
708+
assert result.exit_code == 0
709+
assert list(result.stdout.splitlines()) == [
710+
"20:60001-278528\ttests/data/vcf/NA12878.prod.chr20snippet.g.vcf.gz",
711+
"20:278529-442368\ttests/data/vcf/NA12878.prod.chr20snippet.g.vcf.gz",
712+
"20:442381-638976\ttests/data/vcf/NA12878.prod.chr20snippet.g.vcf.gz",
713+
"20:638982-819200\ttests/data/vcf/NA12878.prod.chr20snippet.g.vcf.gz",
714+
"20:819201-\ttests/data/vcf/NA12878.prod.chr20snippet.g.vcf.gz",
715+
]
702716

717+
def test_part_size(self):
703718
runner = ct.CliRunner(mix_stderr=False)
704719
result = runner.invoke(
705-
cli.vcfpartition, [path, "-n", "5"], catch_exceptions=False
720+
cli.vcfpartition, [self.path, "-s", "512K"], catch_exceptions=False
706721
)
722+
assert result.stderr == ""
723+
assert result.exit_code == 0
707724
assert list(result.stdout.splitlines()) == [
708-
"20:60001-278528",
709-
"20:278529-442368",
710-
"20:442381-638976",
711-
"20:638982-819200",
712-
"20:819201-",
725+
"20:60001-212992\ttests/data/vcf/NA12878.prod.chr20snippet.g.vcf.gz",
726+
"20:213070-327680\ttests/data/vcf/NA12878.prod.chr20snippet.g.vcf.gz",
727+
"20:327695-442368\ttests/data/vcf/NA12878.prod.chr20snippet.g.vcf.gz",
728+
"20:442381-557056\ttests/data/vcf/NA12878.prod.chr20snippet.g.vcf.gz",
729+
"20:557078-688128\ttests/data/vcf/NA12878.prod.chr20snippet.g.vcf.gz",
730+
"20:688129-802816\ttests/data/vcf/NA12878.prod.chr20snippet.g.vcf.gz",
731+
"20:802817-933888\ttests/data/vcf/NA12878.prod.chr20snippet.g.vcf.gz",
732+
"20:933890-\ttests/data/vcf/NA12878.prod.chr20snippet.g.vcf.gz",
713733
]
714734

735+
def test_no_part_spec(self):
736+
runner = ct.CliRunner(mix_stderr=False)
737+
result = runner.invoke(cli.vcfpartition, [self.path], catch_exceptions=False)
738+
assert result.exit_code != 0
739+
assert result.stdout == ""
740+
assert len(result.stderr) > 0
741+
742+
def test_no_args(self):
743+
runner = ct.CliRunner(mix_stderr=False)
744+
result = runner.invoke(cli.vcfpartition, [], catch_exceptions=False)
745+
assert result.exit_code != 0
746+
assert result.stdout == ""
747+
assert len(result.stderr) > 0
748+
715749

716750
@pytest.mark.parametrize(
717751
"cmd", [main.bio2zarr, cli.vcf2zarr_main, cli.plink2zarr, cli.vcfpartition]

tests/test_vcf_utils.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ def test_contig_record_counts(self, index_file, expected):
9595
)
9696
def test_partition_into_one_part(self, index_file, expected):
9797
indexed_vcf = self.get_instance(index_file)
98-
regions = indexed_vcf.partition_into_regions(num_parts=1)
98+
regions = list(indexed_vcf.partition_into_regions(num_parts=1))
9999
assert all(isinstance(r, vcf_utils.Region) for r in regions)
100100
assert [str(r) for r in regions] == expected
101101

@@ -120,7 +120,7 @@ def test_partition_into_one_part(self, index_file, expected):
120120
)
121121
def test_partition_into_max_parts(self, index_file, num_expected, total_records):
122122
indexed_vcf = self.get_instance(index_file)
123-
regions = indexed_vcf.partition_into_regions(num_parts=1000)
123+
regions = list(indexed_vcf.partition_into_regions(num_parts=1000))
124124
assert all(isinstance(r, vcf_utils.Region) for r in regions)
125125
# print(regions)
126126
assert len(regions) == num_expected
@@ -151,7 +151,7 @@ def test_partition_into_max_parts(self, index_file, num_expected, total_records)
151151
@pytest.mark.parametrize("num_parts", [2, 3, 4, 5, 16, 33])
152152
def test_partition_into_n_parts(self, index_file, total_records, num_parts):
153153
indexed_vcf = self.get_instance(index_file)
154-
regions = indexed_vcf.partition_into_regions(num_parts=num_parts)
154+
regions = list(indexed_vcf.partition_into_regions(num_parts=num_parts))
155155
assert all(isinstance(r, vcf_utils.Region) for r in regions)
156156
part_variant_counts = np.array(
157157
[indexed_vcf.count_variants(region) for region in regions]
@@ -161,7 +161,7 @@ def test_partition_into_n_parts(self, index_file, total_records, num_parts):
161161

162162
def test_tabix_multi_chrom_bug(self):
163163
indexed_vcf = self.get_instance("multi_contig.vcf.gz.tbi")
164-
regions = indexed_vcf.partition_into_regions(num_parts=10)
164+
regions = list(indexed_vcf.partition_into_regions(num_parts=10))
165165
# An earlier version of the code returned this, i.e. with a duplicate
166166
# for 4 with end coord of 0
167167
# ["0:1-", "1", "2", "3", "4:1-0", "4:1-"]
@@ -185,7 +185,9 @@ def test_tabix_multi_chrom_bug(self):
185185
)
186186
def test_target_part_size(self, target_part_size, filename):
187187
indexed_vcf = self.get_instance(filename)
188-
regions = indexed_vcf.partition_into_regions(target_part_size=target_part_size)
188+
regions = list(
189+
indexed_vcf.partition_into_regions(target_part_size=target_part_size)
190+
)
189191
assert len(regions) == 5
190192
part_variant_counts = [indexed_vcf.count_variants(region) for region in regions]
191193
assert part_variant_counts == [3450, 3869, 4525, 7041, 1025]
@@ -197,7 +199,7 @@ def test_partition_invalid_arguments(self):
197199
with pytest.raises(
198200
ValueError, match=r"One of num_parts or target_part_size must be specified"
199201
):
200-
indexed_vcf.partition_into_regions()
202+
list(indexed_vcf.partition_into_regions())
201203

202204
with pytest.raises(
203205
ValueError,

0 commit comments

Comments
 (0)