Skip to content

Commit a2aefac

Browse files
Add vcf-partition command and isolate Tabix issue
1 parent 999a70f commit a2aefac

File tree

5 files changed

+55
-9
lines changed

5 files changed

+55
-9
lines changed

bio2zarr/__main__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ def bio2zarr():
1414
# up in the right way.
1515
bio2zarr.add_command(cli.vcf2zarr)
1616
bio2zarr.add_command(cli.plink2zarr)
17+
bio2zarr.add_command(cli.vcf_partition)
1718

1819
if __name__ == "__main__":
1920
bio2zarr()

bio2zarr/cli.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import coloredlogs
44

55
from . import vcf
6+
from . import vcf_utils
67
from . import plink
78
from . import provenance
89

@@ -15,6 +16,7 @@
1516

1617
version = click.version_option(version=provenance.__version__)
1718

19+
1820
# Note: logging hasn't been implemented in the code at all, this is just
1921
# a first pass to try out some ways of doing things to see what works.
2022
def setup_logging(verbosity):
@@ -155,3 +157,15 @@ def plink2zarr():
155157

156158

157159
plink2zarr.add_command(convert_plink)
160+
161+
162+
@click.command
163+
@version
164+
@click.argument("vcf_path", type=click.Path())
165+
@click.option("-i", "--index", type=click.Path(), default=None)
166+
@click.option("-n", "--num-parts", type=int, default=None)
167+
# @click.option("-s", "--part-size", type=int, default=None)
168+
def vcf_partition(vcf_path, index, num_parts):
169+
indexed_vcf = vcf_utils.IndexedVcf(vcf_path, index)
170+
regions = indexed_vcf.partition_into_regions(num_parts=num_parts)
171+
click.echo("\n".join(map(str, regions)))

bio2zarr/vcf.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -236,14 +236,14 @@ def scan_vcfs(paths, show_progress, target_num_partitions):
236236
regions = indexed_vcf.partition_into_regions(num_parts=target_num_partitions)
237237
for region in regions:
238238
partitions.append(
239-
# Requires cyvcf2>=0.30.27
240239
VcfPartition(
241240
vcf_path=str(path),
242241
region=region,
243242
)
244243
)
245-
# TODO figure out if this is safe when we have multiple chrs
246-
# in the file
244+
# TODO figure out if this is correct. It should be fine because of VCF
245+
# sorting, but need to verify there isn't some loophole with BCF, where
246+
# contigs are sorted by their index in the list rather than string value.
247247
partitions.sort(key=lambda x: (x.region.contig, x.region.start))
248248
vcf_metadata.partitions = partitions
249249
return vcf_metadata, header
@@ -791,7 +791,7 @@ def convert_partition(
791791
partition = vcf_metadata.partitions[partition_index]
792792
vcf = cyvcf2.VCF(partition.vcf_path)
793793
logger.info(
794-
f"Start partition {partition_index} {partition.vcf_path}: {partition.region}"
794+
f"Start p{partition_index} {partition.vcf_path}__{partition.region}"
795795
)
796796

797797
info_fields = []
@@ -835,6 +835,7 @@ def variants():
835835
encoder_threads=0,
836836
chunk_size=column_chunk_size,
837837
) as tcw:
838+
num_records = 0
838839
for variant in variants():
839840
tcw.append("CHROM", variant.CHROM)
840841
tcw.append("POS", variant.POS)
@@ -859,8 +860,13 @@ def variants():
859860
# is that we get a significant pause at the end of the counter while
860861
# all the "small" fields get flushed. Possibly not much to be done about it.
861862
core.update_progress(1)
863+
num_records += 1
862864

863-
return tcw.field_summaries
865+
logger.info(
866+
f"Finish p{partition_index} {partition.vcf_path}__{partition.region}="
867+
f"{num_records} records")
868+
869+
return tcw.field_summaries, num_records
864870

865871
@staticmethod
866872
def convert(
@@ -896,7 +902,13 @@ def convert(
896902
out_path,
897903
column_chunk_size=column_chunk_size,
898904
)
899-
partition_summaries = list(pwm.results_as_completed())
905+
num_records = 0
906+
partition_summaries = []
907+
for partition_summary, partition_num_records in pwm.results_as_completed():
908+
num_records += partition_num_records
909+
partition_summaries.append(partition_summary)
910+
911+
assert num_records == pcvcf.num_records
900912

901913
for field in vcf_metadata.fields:
902914
# Clear the summary to avoid problems when running in debug

bio2zarr/vcf_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,6 @@
2323
)
2424

2525

26-
# TODO create a Region dataclass that will sort correctly, and has
27-
# a str method that does the correct thing
2826

2927

3028
def region_filter(variants, region=None):
@@ -62,8 +60,10 @@ class Region:
6260
def __post_init__(self):
6361
if self.start is not None:
6462
self.start = int(self.start)
63+
assert self.start > 0
6564
if self.end is not None:
6665
self.end = int(self.end)
66+
assert self.end > 0
6767

6868
def __str__(self):
6969
s = f"{self.contig}"

tests/test_cli.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,26 @@ def test_convert_plink(self):
122122
)
123123

124124

125-
@pytest.mark.parametrize("cmd", [main.bio2zarr, cli.vcf2zarr, cli.plink2zarr])
125+
class TestVcfPartition:
126+
def test_num_parts(self):
127+
path = "tests/data/vcf/NA12878.prod.chr20snippet.g.vcf.gz"
128+
129+
runner = ct.CliRunner(mix_stderr=False)
130+
result = runner.invoke(
131+
cli.vcf_partition, [path, "-n", "5"], catch_exceptions=False
132+
)
133+
assert list(result.stdout.splitlines()) == [
134+
"20:1-278528",
135+
"20:278529-442368",
136+
"20:442369-638976",
137+
"20:638977-819200",
138+
"20:819201-",
139+
]
140+
141+
142+
@pytest.mark.parametrize(
143+
"cmd", [main.bio2zarr, cli.vcf2zarr, cli.plink2zarr, cli.vcf_partition]
144+
)
126145
def test_version(cmd):
127146
runner = ct.CliRunner(mix_stderr=False)
128147
result = runner.invoke(cmd, ["--version"], catch_exceptions=False)

0 commit comments

Comments
 (0)