Skip to content

Commit 258d83f

Browse files
Merge pull request #35 from jeromekelleher/fix-string-bugs
Fix string bugs
2 parents fb35fee + d4b9686 commit 258d83f

11 files changed

+954
-244
lines changed

bio2zarr/vcf.py

Lines changed: 364 additions & 241 deletions
Large diffs are not rendered by default.
8.45 KB
Binary file not shown.
101 Bytes
Binary file not shown.
4.15 KB
Binary file not shown.
116 Bytes
Binary file not shown.
2.53 KB
Binary file not shown.
133 Bytes
Binary file not shown.

tests/test_cli.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
import click.testing as ct
2+
3+
from bio2zarr import cli
4+
5+
# NOTE just putting things together here to see what works.
6+
# Probably want to mock the module functions here to
7+
# avoid testing any real functionality.
8+
def test_vcf_summarise():
9+
runner = ct.CliRunner()
10+
result = runner.invoke(cli.vcf2zarr, "summarise", "filename")
11+
# FIXME not testing anything!

tests/test_pcvcf.py

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
import pytest
2+
import numpy as np
3+
import numpy.testing as nt
4+
5+
from bio2zarr import vcf
6+
7+
8+
class TestSmallExample:
9+
data_path = "tests/data/vcf/sample.vcf.gz"
10+
11+
# fmt: off
12+
columns = [
13+
'ALT', 'CHROM', 'FILTERS', 'FORMAT/DP', 'FORMAT/GQ',
14+
'FORMAT/GT', 'FORMAT/HQ', 'ID', 'INFO/AA', 'INFO/AC',
15+
'INFO/AF', 'INFO/AN', 'INFO/DB', 'INFO/DP', 'INFO/H2',
16+
'INFO/NS', 'POS', 'QUAL', 'REF'
17+
]
18+
# fmt: on
19+
20+
@pytest.fixture(scope="class")
21+
def pcvcf(self, tmp_path_factory):
22+
out = tmp_path_factory.mktemp("data") / "example.exploded"
23+
return vcf.explode([self.data_path], out)
24+
25+
def test_summary_table(self, pcvcf):
26+
data = pcvcf.summary_table()
27+
cols = [d["name"] for d in data]
28+
assert sorted(cols) == self.columns
29+
30+
def test_mapping_methods(self, pcvcf):
31+
assert len(pcvcf) == len(self.columns)
32+
assert pcvcf["ALT"] is pcvcf.columns["ALT"]
33+
assert list(iter(pcvcf)) == list(iter(pcvcf))
34+
35+
def test_num_partitions(self, pcvcf):
36+
assert pcvcf.num_partitions == 1
37+
38+
def test_num_records(self, pcvcf):
39+
assert pcvcf.num_records == 9
40+
41+
def test_POS(self, pcvcf):
42+
nt.assert_array_equal(
43+
[v[0] for v in pcvcf["POS"].values],
44+
[111, 112, 14370, 17330, 1110696, 1230237, 1234567, 1235237, 10],
45+
)
46+
47+
def test_REF(self, pcvcf):
48+
ref = ["A", "A", "G", "T", "A", "T", "G", "T", "AC"]
49+
assert pcvcf["REF"].values == ref
50+
51+
def test_ALT(self, pcvcf):
52+
alt = [
53+
["C"],
54+
["G"],
55+
["A"],
56+
["A"],
57+
["G", "T"],
58+
[],
59+
["GA", "GAC"],
60+
[],
61+
["A", "ATG", "C"],
62+
]
63+
assert [list(v) for v in pcvcf["ALT"].values] == alt
64+
65+
def test_INFO_NS(self, pcvcf):
66+
assert pcvcf["INFO/NS"].values == [None, None, 3, 3, 2, 3, 3, None, None]
67+
68+
69+
class TestGeneratedFieldsExample:
70+
data_path = "tests/data/vcf/field_type_combos.vcf.gz"
71+
72+
@pytest.fixture(scope="class")
73+
def pcvcf(self, tmp_path_factory):
74+
out = tmp_path_factory.mktemp("data") / "example.exploded"
75+
# import sgkit
76+
# from sgkit.io.vcf import vcf_to_zarr
77+
# vcf_to_zarr(self.data_path, "tmp/fields.vcf.sg", fields=
78+
# ["INFO/IS1", "INFO/IC2", "INFO/IS2", "INFO/ISR", "FORMAT/FS2"])
79+
# df = sgkit.load_dataset("tmp/fields.vcf.sg")
80+
# print(df["variant_IC2"])
81+
# print(df["variant_IC2"].values)
82+
return vcf.explode([self.data_path], out)
83+
84+
@pytest.fixture(scope="class")
85+
def schema(self, pcvcf):
86+
return vcf.ZarrConversionSpec.generate(pcvcf)
87+
88+
@pytest.mark.parametrize(
89+
("name", "dtype", "shape"),
90+
[
91+
("variant_II1", "i1", (208,)),
92+
("variant_II2", "i2", (208, 2)),
93+
("variant_IIA", "i2", (208, 2)),
94+
("variant_IIR", "i2", (208, 3)),
95+
("variant_IID", "i2", (208, 7)),
96+
("variant_IF1", "f4", (208,)),
97+
("variant_IF2", "f4", (208, 2)),
98+
("variant_IFA", "f4", (208, 2)),
99+
("variant_IFR", "f4", (208, 3)),
100+
("variant_IFD", "f4", (208, 9)),
101+
("variant_IC1", "U1", (208,)),
102+
("variant_IC2", "U1", (208, 2)),
103+
("variant_IS1", "O", (208,)),
104+
("variant_IS2", "O", (208, 2)),
105+
("call_FS2", "O", (208, 2, 2)),
106+
("call_FC2", "U1", (208, 2, 2)),
107+
],
108+
)
109+
def test_info_schemas(self, schema, name, dtype, shape):
110+
variables = [v for v in schema.variables if v.name == name]
111+
v = variables[0]
112+
assert v.dtype == dtype
113+
assert tuple(v.shape) == shape
114+
115+
def test_info_string1(self, pcvcf):
116+
non_missing = [v for v in pcvcf["INFO/IS1"].values if v is not None]
117+
assert non_missing[0] == "bc"
118+
assert non_missing[1] == "."
119+
120+
def test_info_char1(self, pcvcf):
121+
non_missing = [v for v in pcvcf["INFO/IC1"].values if v is not None]
122+
assert non_missing[0] == "f"
123+
assert non_missing[1] == "."
124+
125+
def test_info_string2(self, pcvcf):
126+
non_missing = [v for v in pcvcf["INFO/IS2"].values if v is not None]
127+
nt.assert_array_equal(non_missing[0], ["hij", "d"])
128+
nt.assert_array_equal(non_missing[1], [".", "d"])
129+
nt.assert_array_equal(non_missing[2], ["hij", "."])
130+
nt.assert_array_equal(non_missing[3], [".", "."])
131+
132+
def test_format_string1(self, pcvcf):
133+
non_missing = [v for v in pcvcf["FORMAT/FS1"].values if v is not None]
134+
nt.assert_array_equal(non_missing[0], [["bc"], ["."]])
135+
136+
def test_format_string2(self, pcvcf):
137+
non_missing = [v for v in pcvcf["FORMAT/FS2"].values if v is not None]
138+
nt.assert_array_equal(non_missing[0], [["bc", "op"], [".", "op"]])
139+
nt.assert_array_equal(non_missing[1], [["bc", "."], [".", "."]])

tests/test_simulated_data.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import pytest
2+
import msprime
3+
import pysam
4+
import sgkit as sg
5+
import numpy.testing as nt
6+
7+
from bio2zarr import vcf
8+
9+
10+
class TestTskitRoundTripVcf:
11+
@pytest.mark.parametrize("ploidy", [1, 2, 3, 4])
12+
def test_ploidy(self, ploidy, tmp_path):
13+
ts = msprime.sim_ancestry(
14+
2,
15+
population_size=10**4,
16+
ploidy=ploidy,
17+
sequence_length=100_000,
18+
random_seed=42,
19+
)
20+
tables = ts.dump_tables()
21+
for u in ts.samples():
22+
site = tables.sites.add_row(u + 1, "A")
23+
tables.mutations.add_row(site, derived_state="T", node=u)
24+
ts = tables.tree_sequence()
25+
vcf_file = tmp_path / "sim.vcf"
26+
with open(vcf_file, "w") as f:
27+
ts.write_vcf(f)
28+
# This also compresses the input file
29+
pysam.tabix_index(str(vcf_file), preset="vcf")
30+
out = tmp_path / "example.vcf.zarr"
31+
vcf.convert_vcf([tmp_path / "sim.vcf.gz"], out)
32+
ds = sg.load_dataset(out)
33+
assert ds.sizes["ploidy"] == ploidy
34+
assert ds.sizes["variants"] == ts.num_sites
35+
assert ds.sizes["samples"] == ts.num_individuals
36+
# Msprime guarantees that this will be true.
37+
nt.assert_array_equal(
38+
ts.genotype_matrix().reshape((ts.num_sites, ts.num_individuals, ploidy)),
39+
ds.call_genotype.values,
40+
)
41+
nt.assert_equal(ds.variant_allele[:, 0].values, "A")
42+
nt.assert_equal(ds.variant_allele[:, 1].values, "T")
43+
nt.assert_equal(ds.variant_position, ts.sites_position)
44+
45+
46+
# TODO add a plink equivalant if we can find a way of programatically
47+
# generating plink data?

0 commit comments

Comments
 (0)