Skip to content

Commit e363bf0

Browse files
Tidy up vcfpartition interface
1 parent 9ad0cf6 commit e363bf0

File tree

3 files changed

+83
-13
lines changed

3 files changed

+83
-13
lines changed

bio2zarr/cli.py

Lines changed: 37 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -518,10 +518,42 @@ def plink2zarr():
518518
@click.command
519519
@version
520520
@click.argument("vcf_path", type=click.Path())
521-
@click.option("-n", "--num-parts", type=int, default=None)
522-
# @click.option("-s", "--part-size", type=int, default=None)
523-
def vcfpartition(vcf_path, num_parts):
521+
@verbose
522+
@click.option(
523+
"-n",
524+
"--num-parts",
525+
type=int,
526+
default=None,
527+
help="Target number of partitions to split the VCF into",
528+
)
529+
@click.option(
530+
"-s",
531+
"--part-size",
532+
type=str,
533+
default=None,
534+
help="Target (compressed) size of VCF partitions, e.g. 100KB, 10MiB, 1G.",
535+
)
536+
def vcfpartition(vcf_path, verbose, num_parts, part_size):
537+
"""
538+
Output bcftools region strings that partition an indexed VCF/BCF file
539+
into either an approximate number of parts (-n), or parts of approximately
540+
a given size (-s). One of -n or -s must be supplied.
541+
542+
Note that both the number of partitions and sizes are a target, and the
543+
returned number of partitions may not exactly correspond. In particular,
544+
there is a maximum level of granularity determined by the associated index
545+
which cannot be exceeded.
546+
547+
Note also that the partitions returned may vary considerably in the number
548+
of records that they contain.
549+
"""
550+
setup_logging(verbose)
551+
if num_parts is None and part_size is None:
552+
raise click.UsageError("Either --num-parts or --part-size must be specified")
553+
524554
indexed_vcf = vcf_utils.IndexedVcf(vcf_path)
525-
regions = indexed_vcf.partition_into_regions(num_parts=num_parts)
555+
regions = indexed_vcf.partition_into_regions(
556+
num_parts=num_parts, target_part_size=part_size
557+
)
526558
for region in regions:
527-
click.echo(region)
559+
click.echo(f"{region}\t{vcf_path}")

bio2zarr/vcf_utils.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import contextlib
22
import gzip
3+
import logging
34
import os
45
import pathlib
56
import struct
@@ -13,6 +14,8 @@
1314

1415
from bio2zarr.typing import PathType
1516

17+
logger = logging.getLogger(__name__)
18+
1619
CSI_EXTENSION = ".csi"
1720
TABIX_EXTENSION = ".tbi"
1821
TABIX_LINEAR_INDEX_INTERVAL_SIZE = 1 << 14 # 16kb interval size
@@ -411,6 +414,7 @@ def __init__(self, vcf_path, index_path=None):
411414
raise ValueError("Only .tbi or .csi indexes are supported.")
412415
self.vcf = cyvcf2.VCF(vcf_path)
413416
self.vcf.set_index(str(self.index_path))
417+
logger.debug(f"Loaded {vcf_path} with index {self.index_path}")
414418
self.sequence_names = None
415419
if self.index_type == "csi":
416420
# Determine the file-type based on the "aux" field.
@@ -491,7 +495,7 @@ def partition_into_regions(
491495
target_part_size_bytes = file_length // num_parts
492496
elif target_part_size_bytes is not None:
493497
num_parts = ceildiv(file_length, target_part_size_bytes)
494-
part_lengths = np.array([i * target_part_size_bytes for i in range(num_parts)])
498+
part_lengths = target_part_size_bytes * np.arange(num_parts, dtype=int)
495499
file_offsets, region_contig_indexes, region_positions = self.index.offsets()
496500

497501
# Search the file offsets to find which indexes the part lengths fall at

tests/test_cli.py

Lines changed: 41 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -697,21 +697,55 @@ def test_convert(self, tmp_path):
697697

698698

699699
class TestVcfPartition:
700+
path = "tests/data/vcf/NA12878.prod.chr20snippet.g.vcf.gz"
701+
700702
def test_num_parts(self):
701-
path = "tests/data/vcf/NA12878.prod.chr20snippet.g.vcf.gz"
703+
runner = ct.CliRunner(mix_stderr=False)
704+
result = runner.invoke(
705+
cli.vcfpartition, [self.path, "-n", "5"], catch_exceptions=False
706+
)
707+
assert result.stderr == ""
708+
assert result.exit_code == 0
709+
assert list(result.stdout.splitlines()) == [
710+
"20:60001-278528\ttests/data/vcf/NA12878.prod.chr20snippet.g.vcf.gz",
711+
"20:278529-442368\ttests/data/vcf/NA12878.prod.chr20snippet.g.vcf.gz",
712+
"20:442381-638976\ttests/data/vcf/NA12878.prod.chr20snippet.g.vcf.gz",
713+
"20:638982-819200\ttests/data/vcf/NA12878.prod.chr20snippet.g.vcf.gz",
714+
"20:819201-\ttests/data/vcf/NA12878.prod.chr20snippet.g.vcf.gz",
715+
]
702716

717+
def test_part_size(self):
703718
runner = ct.CliRunner(mix_stderr=False)
704719
result = runner.invoke(
705-
cli.vcfpartition, [path, "-n", "5"], catch_exceptions=False
720+
cli.vcfpartition, [self.path, "-s", "512K"], catch_exceptions=False
706721
)
722+
assert result.stderr == ""
723+
assert result.exit_code == 0
707724
assert list(result.stdout.splitlines()) == [
708-
"20:60001-278528",
709-
"20:278529-442368",
710-
"20:442381-638976",
711-
"20:638982-819200",
712-
"20:819201-",
725+
"20:60001-212992\ttests/data/vcf/NA12878.prod.chr20snippet.g.vcf.gz",
726+
"20:213070-327680\ttests/data/vcf/NA12878.prod.chr20snippet.g.vcf.gz",
727+
"20:327695-442368\ttests/data/vcf/NA12878.prod.chr20snippet.g.vcf.gz",
728+
"20:442381-557056\ttests/data/vcf/NA12878.prod.chr20snippet.g.vcf.gz",
729+
"20:557078-688128\ttests/data/vcf/NA12878.prod.chr20snippet.g.vcf.gz",
730+
"20:688129-802816\ttests/data/vcf/NA12878.prod.chr20snippet.g.vcf.gz",
731+
"20:802817-933888\ttests/data/vcf/NA12878.prod.chr20snippet.g.vcf.gz",
732+
"20:933890-\ttests/data/vcf/NA12878.prod.chr20snippet.g.vcf.gz",
713733
]
714734

735+
def test_no_part_spec(self):
736+
runner = ct.CliRunner(mix_stderr=False)
737+
result = runner.invoke(cli.vcfpartition, [self.path], catch_exceptions=False)
738+
assert result.exit_code != 0
739+
assert result.stdout == ""
740+
assert len(result.stderr) > 0
741+
742+
def test_no_args(self):
743+
runner = ct.CliRunner(mix_stderr=False)
744+
result = runner.invoke(cli.vcfpartition, [], catch_exceptions=False)
745+
assert result.exit_code != 0
746+
assert result.stdout == ""
747+
assert len(result.stderr) > 0
748+
715749

716750
@pytest.mark.parametrize(
717751
"cmd", [main.bio2zarr, cli.vcf2zarr_main, cli.plink2zarr, cli.vcfpartition]

0 commit comments

Comments
 (0)