Skip to content

Commit 215a9db

Browse files
committed
Support for specifying a list of contigs to use.
1 parent 57ff6c4 commit 215a9db

File tree

2 files changed

+72
-10
lines changed

2 files changed

+72
-10
lines changed

dinf/vcf.py

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ def __init__(
111111
files: Iterable[str | pathlib.Path],
112112
*,
113113
individuals: Iterable[str] | None = None,
114+
contigs: Iterable[str] | None = None,
114115
):
115116
"""
116117
:param files:
@@ -119,6 +120,9 @@ def __init__(
119120
:param individuals:
120121
An iterable of individual names corresponding to the VCF columns
121122
for which genotypes will be sampled.
123+
:param contigs:
124+
An iterable of contig names to use. If not specified, all contigs
125+
identified in the VCFs/BCFs will be used.
122126
"""
123127
# Silence some warnings from cyvcf2.
124128
with warnings.catch_warnings():
@@ -135,14 +139,15 @@ def __init__(
135139
message="not all requested samples found",
136140
category=UserWarning,
137141
)
138-
self._fill_bag(files=files, individuals=individuals)
142+
self._fill_bag(files=files, individuals=individuals, contigs=contigs)
139143
self._regions: List[Tuple[str, int, int]] = []
140144

141145
def _fill_bag(
142146
self,
143147
*,
144148
files: Iterable[str | pathlib.Path],
145149
individuals: Iterable[str] | None = None,
150+
contigs: Iterable[str] | None = None,
146151
) -> None:
147152
"""
148153
Construct a mapping from contig id to cyvcf2.VCF object.
@@ -155,8 +160,19 @@ def _fill_bag(
155160
files = list(files)
156161
if len(set(files)) != len(files):
157162
raise ValueError("File list contains duplicates.")
163+
158164
if individuals is not None:
159165
individuals = list(individuals)
166+
if len(set(individuals)) != len(individuals):
167+
raise ValueError("Individuals list contains duplicates.")
168+
169+
if contigs is not None:
170+
contigs = list(contigs)
171+
if len(set(contigs)) != len(contigs):
172+
raise ValueError("Contigs list contains duplicates.")
173+
contigs = set(contigs)
174+
175+
contigs_seen = set()
160176

161177
for file in files:
162178
vcf = cyvcf2.VCF(file, samples=individuals, lazy=True, threads=1)
@@ -168,13 +184,16 @@ def _fill_bag(
168184
):
169185
raise ValueError(f"No index found for {file}.")
170186
if individuals is not None:
171-
not_found = set(individuals) - set(vcf.samples)
172-
if len(not_found) > 0:
187+
individuals_not_found = set(individuals) - set(vcf.samples)
188+
if len(individuals_not_found) > 0:
173189
raise ValueError(
174190
f"Requested individuals not found in {file}: "
175-
f"{', '.join(not_found)}"
191+
f"{', '.join(individuals_not_found)}"
176192
)
177193
for contig_id, contig_length in zip(vcf.seqnames, vcf.seqlens):
194+
contigs_seen.add(contig_id)
195+
if contigs is not None and contig_id not in contigs:
196+
continue
178197
# Check there's at least one variant. If present, we assume
179198
# there exist usable variants (i.e. SNPs) for the contig.
180199
try:
@@ -191,6 +210,13 @@ def _fill_bag(
191210
contig_lengths.append(contig_length)
192211
contig2vcf[contig_id] = vcf
193212

213+
if contigs is not None:
214+
contigs_not_found = contigs - contigs_seen
215+
if len(contigs_not_found) > 0:
216+
raise ValueError(
217+
f"Requested contigs not found: {', '.join(contigs_not_found)}"
218+
)
219+
194220
if len(contig2file) == 0:
195221
raise ValueError("No usable vcf/bcf files in the list.")
196222

tests/test_vcf.py

Lines changed: 42 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -487,8 +487,17 @@ def test_unindexed_bcf(self, tmp_path):
487487
with pytest.raises(ValueError, match="No index"):
488488
dinf.BagOfVcf(tmp_path.glob("*.bcf"))
489489

490+
def test_missing_file(self):
491+
missing_file = "nonexistent.vcf.gz"
492+
with pytest.raises(OSError, match=missing_file):
493+
dinf.BagOfVcf([missing_file])
494+
495+
def test_no_files(self):
496+
with pytest.raises(ValueError, match="No usable vcf/bcf files"):
497+
dinf.BagOfVcf([])
498+
490499
@pytest.mark.usefixtures("tmp_path")
491-
def test_file_list_duplicates(self, tmp_path):
500+
def test_duplicate_files(self, tmp_path):
492501
create_vcf_dataset(tmp_path, contig_lengths=[100_000])
493502
files = 2 * list(tmp_path.glob("*.vcf.gz"))
494503
with pytest.raises(ValueError, match="File list contains duplicates"):
@@ -504,10 +513,6 @@ def test_multiple_files_claim_a_contig(self, tmp_path):
504513
with pytest.raises(ValueError, match="Both .* contain records for sequence"):
505514
dinf.BagOfVcf(files)
506515

507-
def test_no_files(self):
508-
with pytest.raises(ValueError, match="No usable vcf/bcf files"):
509-
dinf.BagOfVcf([])
510-
511516
@pytest.mark.usefixtures("tmp_path")
512517
def test_no_GT_field(self, tmp_path):
513518
def remove_GT_header(filename):
@@ -533,7 +538,7 @@ def test_individuals(self, tmp_path, num_individuals):
533538

534539
@pytest.mark.filterwarnings("ignore:not all requested samples found:UserWarning")
535540
@pytest.mark.usefixtures("tmp_path")
536-
def test_bad_individuals(self, tmp_path):
541+
def test_missing_individuals(self, tmp_path):
537542
samples = create_vcf_dataset(tmp_path, contig_lengths=[100_000])
538543
individuals = ["nonexistent_1"] + samples["A"] + ["nonexistent_2"]
539544
with pytest.raises(ValueError, match="individuals not found") as err:
@@ -543,6 +548,37 @@ def test_bad_individuals(self, tmp_path):
543548
for ind in samples["A"]:
544549
assert ind not in err.value.args[0]
545550

551+
@pytest.mark.usefixtures("tmp_path")
552+
def test_duplicate_individuals(self, tmp_path):
553+
samples = create_vcf_dataset(tmp_path, contig_lengths=[100_000])
554+
individuals = samples["A"] + [samples["A"][0]]
555+
with pytest.raises(ValueError, match="Individuals list contains duplicates"):
556+
dinf.BagOfVcf(tmp_path.glob("*.vcf.gz"), individuals=individuals)
557+
558+
@pytest.mark.usefixtures("tmp_path")
559+
def test_contigs(self, tmp_path):
560+
create_vcf_dataset(tmp_path, contig_lengths=[100_000, 200_000, 300_000])
561+
contigs = ["1", "3"]
562+
vb = dinf.BagOfVcf(tmp_path.glob("*.vcf.gz"), contigs=contigs)
563+
assert set(vb) == set(contigs)
564+
565+
@pytest.mark.usefixtures("tmp_path")
566+
def test_missing_contigs(self, tmp_path):
567+
create_vcf_dataset(tmp_path, contig_lengths=[100_000, 200_000])
568+
contigs = ["nonexistent_a", "1", "2", "nonexistent_b"]
569+
with pytest.raises(ValueError, match="contigs not found") as err:
570+
dinf.BagOfVcf(tmp_path.glob("*.vcf.gz"), contigs=contigs)
571+
assert "nonexistent_a" in err.value.args[0]
572+
assert "nonexistent_b" in err.value.args[0]
573+
assert "1" not in err.value.args[0]
574+
assert "2" not in err.value.args[0]
575+
576+
@pytest.mark.usefixtures("tmp_path")
577+
def test_duplicate_contigs(self, tmp_path):
578+
create_vcf_dataset(tmp_path, contig_lengths=[100_000])
579+
with pytest.raises(ValueError, match="Contigs list contains duplicates"):
580+
dinf.BagOfVcf(tmp_path.glob("*.vcf.gz"), contigs=["1", "1"])
581+
546582
@pytest.mark.usefixtures("tmp_path")
547583
def test_unused_contigs(self, tmp_path):
548584
# Contigs in the header should be ignored if they have no variants.

0 commit comments

Comments
 (0)