Skip to content

Commit ad7cd2b

Browse files
authored
Merge pull request #24 from RacimoLab/vcf
more vcf updates
2 parents 306bde2 + 215a9db commit ad7cd2b

File tree

6 files changed

+130
-41
lines changed

6 files changed

+130
-41
lines changed

dinf/vcf.py

Lines changed: 58 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,16 @@
11
from __future__ import annotations
22
from typing import Dict, Iterable, List, Tuple
33
import collections
4-
import contextlib
54
import logging
6-
import os
75
import pathlib
8-
import sys
6+
import warnings
97

108
import numpy as np
119
import cyvcf2
1210

1311
logger = logging.getLogger(__name__)
1412

1513

16-
@contextlib.contextmanager
17-
def redirect_stderr(fp):
18-
"""Redirect stderr to fp."""
19-
olderr = sys.stderr
20-
try:
21-
sys.stderr = fp
22-
yield
23-
finally:
24-
sys.stderr = olderr
25-
26-
2714
def get_genotype_matrix(
2815
vcf: cyvcf2.VCF,
2916
*,
@@ -123,7 +110,8 @@ def __init__(
123110
self,
124111
files: Iterable[str | pathlib.Path],
125112
*,
126-
individuals: List[str] | None = None,
113+
individuals: Iterable[str] | None = None,
114+
contigs: Iterable[str] | None = None,
127115
):
128116
"""
129117
:param files:
@@ -132,17 +120,34 @@ def __init__(
132120
:param individuals:
133121
An iterable of individual names corresponding to the VCF columns
134122
for which genotypes will be sampled.
123+
:param contigs:
124+
An iterable of contig names to use. If not specified, all contigs
125+
identified in the VCFs/BCFs will be used.
135126
"""
136-
with open(os.devnull, "w") as dev_null:
137-
self._fill_bag(files=files, individuals=individuals, dev_null=dev_null)
127+
# Silence some warnings from cyvcf2.
128+
with warnings.catch_warnings():
129+
# We check if a contig is usable for a given vcf by querying
130+
# that contig for variants. If there are no variants (e.g. vcfs are
131+
# split by chromosome so each vcf has data for only one contig),
132+
# then cyvcf2 warns us.
133+
warnings.filterwarnings(
134+
"ignore", message="no intervals found", category=UserWarning
135+
)
136+
# We detect this condition and explicitly raise an error.
137+
warnings.filterwarnings(
138+
"ignore",
139+
message="not all requested samples found",
140+
category=UserWarning,
141+
)
142+
self._fill_bag(files=files, individuals=individuals, contigs=contigs)
138143
self._regions: List[Tuple[str, int, int]] = []
139144

140145
def _fill_bag(
141146
self,
142147
*,
143148
files: Iterable[str | pathlib.Path],
144-
individuals: List[str] | None = None,
145-
dev_null,
149+
individuals: Iterable[str] | None = None,
150+
contigs: Iterable[str] | None = None,
146151
) -> None:
147152
"""
148153
Construct a mapping from contig id to cyvcf2.VCF object.
@@ -156,6 +161,19 @@ def _fill_bag(
156161
if len(set(files)) != len(files):
157162
raise ValueError("File list contains duplicates.")
158163

164+
if individuals is not None:
165+
individuals = list(individuals)
166+
if len(set(individuals)) != len(individuals):
167+
raise ValueError("Individuals list contains duplicates.")
168+
169+
if contigs is not None:
170+
contigs = list(contigs)
171+
if len(set(contigs)) != len(contigs):
172+
raise ValueError("Contigs list contains duplicates.")
173+
contigs = set(contigs)
174+
175+
contigs_seen = set()
176+
159177
for file in files:
160178
vcf = cyvcf2.VCF(file, samples=individuals, lazy=True, threads=1)
161179
if "GT" not in vcf:
@@ -165,15 +183,23 @@ def _fill_bag(
165183
or pathlib.Path(f"{file}.csi").exists()
166184
):
167185
raise ValueError(f"No index found for {file}.")
186+
if individuals is not None:
187+
individuals_not_found = set(individuals) - set(vcf.samples)
188+
if len(individuals_not_found) > 0:
189+
raise ValueError(
190+
f"Requested individuals not found in {file}: "
191+
f"{', '.join(individuals_not_found)}"
192+
)
168193
for contig_id, contig_length in zip(vcf.seqnames, vcf.seqlens):
194+
contigs_seen.add(contig_id)
195+
if contigs is not None and contig_id not in contigs:
196+
continue
169197
# Check there's at least one variant. If present, we assume
170198
# there exist usable variants (i.e. SNPs) for the contig.
171-
# We redirect stderr to silence cyvcf2 "no intervals" warnings.
172-
with redirect_stderr(dev_null):
173-
try:
174-
next(vcf(contig_id))
175-
except StopIteration:
176-
continue
199+
try:
200+
next(vcf(contig_id))
201+
except StopIteration:
202+
continue
177203
if contig_id in contig2file:
178204
first_file = contig2file[contig_id]
179205
raise ValueError(
@@ -184,6 +210,13 @@ def _fill_bag(
184210
contig_lengths.append(contig_length)
185211
contig2vcf[contig_id] = vcf
186212

213+
if contigs is not None:
214+
contigs_not_found = contigs - contigs_seen
215+
if len(contigs_not_found) > 0:
216+
raise ValueError(
217+
f"Requested contigs not found: {', '.join(contigs_not_found)}"
218+
)
219+
187220
if len(contig2file) == 0:
188221
raise ValueError("No usable vcf/bcf files in the list.")
189222

environment-gpu.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,10 @@ dependencies:
1717
- optax
1818
# The rest.
1919
- arviz
20-
- cyvcf2
20+
- cyvcf2 >= 0.30.14
2121
- emcee
22-
- demes
23-
- msprime
22+
- demes >= 0.2.1
23+
- msprime >= 1.0.4
2424
- pip
2525
- pip:
2626
# Last jaxlib version to support CUDA 10.1

environment.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ dependencies:
1313
- optax
1414
# The rest.
1515
- arviz
16-
- cyvcf2
16+
- cyvcf2 >= 0.30.14
1717
- emcee
18-
- demes
19-
- msprime
18+
- demes >= 0.2.1
19+
- msprime >= 1.0.4

requirements/minimal.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
arviz==0.11.4
2-
cyvcf2==0.30.12
2+
cyvcf2==0.30.14
33
emcee==3.1.1
44
numpy==1.21.4
55
jax==0.2.25
66
jaxlib==0.1.74
77
flax==0.3.6
88
optax==0.1.0
9-
msprime==1.0.3
10-
demes==0.1.2
9+
msprime==1.0.4
10+
demes==0.2.1

setup.cfg

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,14 +30,14 @@ include_package_data = True
3030
python_requires = >=3.7
3131
install_requires =
3232
arviz
33-
cyvcf2
33+
cyvcf2 >= 0.30.14
3434
emcee
3535
numpy
3636
jax
3737
flax
3838
optax
3939
msprime >= 1.0.4
40-
demes
40+
demes >= 0.2.1
4141
setup_requires =
4242
setuptools
4343
setuptools_scm

tests/test_vcf.py

Lines changed: 61 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -487,8 +487,17 @@ def test_unindexed_bcf(self, tmp_path):
487487
with pytest.raises(ValueError, match="No index"):
488488
dinf.BagOfVcf(tmp_path.glob("*.bcf"))
489489

490+
def test_missing_file(self):
491+
missing_file = "nonexistent.vcf.gz"
492+
with pytest.raises(OSError, match=missing_file):
493+
dinf.BagOfVcf([missing_file])
494+
495+
def test_no_files(self):
496+
with pytest.raises(ValueError, match="No usable vcf/bcf files"):
497+
dinf.BagOfVcf([])
498+
490499
@pytest.mark.usefixtures("tmp_path")
491-
def test_file_list_duplicates(self, tmp_path):
500+
def test_duplicate_files(self, tmp_path):
492501
create_vcf_dataset(tmp_path, contig_lengths=[100_000])
493502
files = 2 * list(tmp_path.glob("*.vcf.gz"))
494503
with pytest.raises(ValueError, match="File list contains duplicates"):
@@ -504,10 +513,6 @@ def test_multiple_files_claim_a_contig(self, tmp_path):
504513
with pytest.raises(ValueError, match="Both .* contain records for sequence"):
505514
dinf.BagOfVcf(files)
506515

507-
def test_no_files(self):
508-
with pytest.raises(ValueError, match="No usable vcf/bcf files"):
509-
dinf.BagOfVcf([])
510-
511516
@pytest.mark.usefixtures("tmp_path")
512517
def test_no_GT_field(self, tmp_path):
513518
def remove_GT_header(filename):
@@ -523,6 +528,57 @@ def remove_GT_header(filename):
523528
with pytest.raises(ValueError, match="doesn't contain GT field"):
524529
dinf.BagOfVcf(tmp_path.glob("*.vcf.gz"))
525530

531+
@pytest.mark.parametrize("num_individuals", [20, 97])
532+
@pytest.mark.usefixtures("tmp_path")
533+
def test_individuals(self, tmp_path, num_individuals):
534+
samples = create_vcf_dataset(tmp_path, contig_lengths=[100_000])
535+
individuals = samples["A"][:num_individuals]
536+
vb = dinf.BagOfVcf(tmp_path.glob("*.vcf.gz"), individuals=individuals)
537+
assert vb["1"].samples == individuals
538+
539+
@pytest.mark.filterwarnings("ignore:not all requested samples found:UserWarning")
540+
@pytest.mark.usefixtures("tmp_path")
541+
def test_missing_individuals(self, tmp_path):
542+
samples = create_vcf_dataset(tmp_path, contig_lengths=[100_000])
543+
individuals = ["nonexistent_1"] + samples["A"] + ["nonexistent_2"]
544+
with pytest.raises(ValueError, match="individuals not found") as err:
545+
dinf.BagOfVcf(tmp_path.glob("*.vcf.gz"), individuals=individuals)
546+
assert "nonexistent_1" in err.value.args[0]
547+
assert "nonexistent_2" in err.value.args[0]
548+
for ind in samples["A"]:
549+
assert ind not in err.value.args[0]
550+
551+
@pytest.mark.usefixtures("tmp_path")
552+
def test_duplicate_individuals(self, tmp_path):
553+
samples = create_vcf_dataset(tmp_path, contig_lengths=[100_000])
554+
individuals = samples["A"] + [samples["A"][0]]
555+
with pytest.raises(ValueError, match="Individuals list contains duplicates"):
556+
dinf.BagOfVcf(tmp_path.glob("*.vcf.gz"), individuals=individuals)
557+
558+
@pytest.mark.usefixtures("tmp_path")
559+
def test_contigs(self, tmp_path):
560+
create_vcf_dataset(tmp_path, contig_lengths=[100_000, 200_000, 300_000])
561+
contigs = ["1", "3"]
562+
vb = dinf.BagOfVcf(tmp_path.glob("*.vcf.gz"), contigs=contigs)
563+
assert set(vb) == set(contigs)
564+
565+
@pytest.mark.usefixtures("tmp_path")
566+
def test_missing_contigs(self, tmp_path):
567+
create_vcf_dataset(tmp_path, contig_lengths=[100_000, 200_000])
568+
contigs = ["nonexistent_a", "1", "2", "nonexistent_b"]
569+
with pytest.raises(ValueError, match="contigs not found") as err:
570+
dinf.BagOfVcf(tmp_path.glob("*.vcf.gz"), contigs=contigs)
571+
assert "nonexistent_a" in err.value.args[0]
572+
assert "nonexistent_b" in err.value.args[0]
573+
assert "1" not in err.value.args[0]
574+
assert "2" not in err.value.args[0]
575+
576+
@pytest.mark.usefixtures("tmp_path")
577+
def test_duplicate_contigs(self, tmp_path):
578+
create_vcf_dataset(tmp_path, contig_lengths=[100_000])
579+
with pytest.raises(ValueError, match="Contigs list contains duplicates"):
580+
dinf.BagOfVcf(tmp_path.glob("*.vcf.gz"), contigs=["1", "1"])
581+
526582
@pytest.mark.usefixtures("tmp_path")
527583
def test_unused_contigs(self, tmp_path):
528584
# Contigs in the header should be ignored if they have no variants.

0 commit comments

Comments
 (0)