Skip to content

Commit 3448f1c

Browse files
Add infrastructure for validating split files
1 parent 1ae0762 commit 3448f1c

File tree

4 files changed

+44
-6
lines changed

4 files changed

+44
-6
lines changed

bio2zarr/vcf.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,6 @@ def make_field_def(name, vcf_type, vcf_number):
199199

200200

201201
def scan_vcf(path, target_num_partitions):
202-
logger.debug(f"Scanning {path}")
203202
with vcf_utils.IndexedVcf(path) as indexed_vcf:
204203
vcf = indexed_vcf.vcf
205204
filters = [
@@ -238,6 +237,8 @@ def scan_vcf(path, target_num_partitions):
238237
pass
239238

240239
regions = indexed_vcf.partition_into_regions(num_parts=target_num_partitions)
240+
logger.info(
241+
f"Split {path} into {len(regions)} regions (target={target_num_partitions})")
241242
for region in regions:
242243
metadata.partitions.append(
243244
VcfPartition(

validation-data/Makefile

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,14 @@ all: 1kg_2020_chr20.bcf.csi \
1515
1kg_p3_all_chr1.vcf.gz.csi\
1616
1kg_p1_all_chr6.bcf.csi\
1717
1kg_p1_all_chr6.vcf.gz.tbi \
18-
1kg_p1_all_chr6.vcf.gz.csi
18+
1kg_p1_all_chr6.vcf.gz.csi\
19+
1kg_2020_chr20.vcf.gz.2.split\
20+
1kg_2020_others.vcf.gz.2.split \
21+
1kg_2020_others.vcf.gz.5.split \
22+
1kg_2020_freebayes.vcf.gz.5.split \
1923

2024
.PRECIOUS: %.bcf
25+
.PRECIOUS: %.split
2126
.PRECIOUS: %.vcf.gz
2227

2328
# Modern 1000 genomes data
@@ -38,6 +43,12 @@ all: 1kg_2020_chr20.bcf.csi \
3843
%.vcf.gz.tbi: %.vcf.gz
3944
tabix $<
4045

46+
%.2.split: %
47+
./split.sh $< 2
48+
49+
%.5.split: %
50+
./split.sh $< 5
51+
4152
%.csi: %
4253
bcftools index $<
4354

validation-data/split.sh

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#!/bin/bash
2+
VCF=$1
3+
NUM_PARTS=$2
4+
5+
regions=$( PYTHONPATH="../" python3 -m bio2zarr vcf-partition $VCF -n $NUM_PARTS )
6+
dest=$VCF.$NUM_PARTS.split
7+
echo $dest
8+
rm -fR $dest
9+
mkdir $dest
10+
# TODO figure out how to do this with xargs
11+
for r in $regions; do
12+
f=$dest/$r.vcf.gz
13+
bcftools view $VCF -r $r -Oz > $f
14+
bcftools index $f
15+
done

validation.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
# the original unsplit vs split files in there following some
1313
# naming conventions.
1414

15+
1516
@click.command
1617
@click.argument("vcfs", nargs=-1)
1718
@click.option("-p", "--worker-processes", type=int, default=1)
@@ -21,17 +22,27 @@
2122
def cli(vcfs, worker_processes, force):
2223
data_path = pathlib.Path("validation-data")
2324
if len(vcfs) == 0:
24-
vcfs = list(data_path.glob("*.vcf.gz")) + list(data_path.glob("*.bcf"))
25+
vcfs = (
26+
list(data_path.glob("*.vcf.gz"))
27+
+ list(data_path.glob("*.bcf"))
28+
+ list(data_path.glob("*.split"))
29+
)
2530
else:
2631
vcfs = [pathlib.Path(f) for f in vcfs]
2732
tmp_path = pathlib.Path("validation-tmp")
2833
tmp_path.mkdir(exist_ok=True)
2934
for f in vcfs:
30-
print(f)
35+
print("Validate", f)
36+
if f.is_dir():
37+
files = list(f.glob("*.vcf.gz")) + list(f.glob("*.bcf"))
38+
source_file = f.with_suffix("").with_suffix("")
39+
else:
40+
files = [f]
41+
source_file = f
3142
exploded = tmp_path / (f.name + ".exploded")
3243
if force or not exploded.exists():
3344
vcf.explode(
34-
[f],
45+
files,
3546
exploded,
3647
worker_processes=worker_processes,
3748
show_progress=True,
@@ -51,7 +62,7 @@ def cli(vcfs, worker_processes, force):
5162
show_progress=True,
5263
)
5364

54-
vcf.validate(f, zarr, show_progress=True)
65+
vcf.validate(source_file, zarr, show_progress=True)
5566

5667

5768
if __name__ == "__main__":

0 commit comments

Comments
 (0)