Skip to content

Commit 62458c5

Browse files
Merge pull request #248 from jeromekelleher/test_alignments_updates
Test alignments updates
2 parents 02ee0b8 + b93c800 commit 62458c5

File tree

3 files changed

+194
-148
lines changed

3 files changed

+194
-148
lines changed

sc2ts/alignments.py

Lines changed: 1 addition & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ class AlignmentStore(collections.abc.Mapping):
8080
def __init__(self, path, mode="r"):
8181
map_size = 1024**4
8282
self.env = lmdb.Environment(
83-
path, subdir=False, readonly=mode == "r", map_size=map_size
83+
str(path), subdir=False, readonly=mode == "r", map_size=map_size
8484
)
8585

8686
def __enter__(self):
@@ -95,21 +95,6 @@ def close(self):
9595
def __str__(self):
9696
return f"AlignmentStore at {self.env.path()} contains {len(self)} alignments"
9797

98-
@staticmethod
99-
def initialise(path):
100-
"""
101-
Create a new store at this path.
102-
"""
103-
db_path = pathlib.Path(path)
104-
if db_path.exists():
105-
db_path.unlink()
106-
107-
reference = core.get_reference_sequence()
108-
with lmdb.Environment(str(db_path), subdir=False) as env:
109-
with env.begin(write=True) as txn:
110-
txn.put("MN908947".encode(), compress_alignment(reference))
111-
return AlignmentStore(path, "a")
112-
11398
def _flush(self, chunk):
11499
logger.debug(f"Flushing {len(chunk)} sequences")
115100
with self.env.begin(write=True) as txn:
@@ -157,20 +142,6 @@ def __len__(self):
157142
with self.env.begin() as txn:
158143
return txn.stat()["entries"]
159144

160-
def get_all(self, strains, sequence_length):
161-
A = np.zeros((len(strains), sequence_length), dtype=np.int8)
162-
with self.env.begin() as txn:
163-
for j, strain in enumerate(strains):
164-
val = txn.get(strain.encode())
165-
if val is None:
166-
raise KeyError(f"{strain} not found")
167-
a = decompress_alignment(val)
168-
if len(a) != sequence_length:
169-
raise ValueError(
170-
f"Alignment for {strain} not of length {sequence_length}"
171-
)
172-
return A
173-
174145

175146
@dataclasses.dataclass
176147
class MaskedAlignment:

tests/test_alignments.py

Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
import pathlib
2+
import shutil
3+
import gzip
4+
5+
import numpy as np
6+
import pytest
7+
from numpy.testing import assert_array_equal
8+
9+
from sc2ts import alignments as sa
10+
from sc2ts import core
11+
12+
13+
@pytest.fixture
def data_cache():
    """
    Return the directory used to cache derived test data, creating it
    if it does not exist yet.
    """
    cache_path = pathlib.Path("tests/data/cache")
    # exist_ok avoids the check-then-create race when several test
    # processes start at the same time; parents makes creation work even
    # if an intermediate directory is missing.
    cache_path.mkdir(parents=True, exist_ok=True)
    return cache_path
19+
20+
21+
@pytest.fixture
def alignments_fasta(data_cache):
    """
    Return the path of the decompressed alignments FASTA file, unpacking
    the gzipped test data into the cache directory on first use.
    """
    fasta_path = data_cache / "alignments.fasta"
    if fasta_path.exists():
        # Already unpacked by an earlier test; reuse it.
        return fasta_path
    with gzip.open("tests/data/alignments.fasta.gz") as compressed, open(
        fasta_path, "wb"
    ) as unpacked:
        shutil.copyfileobj(compressed, unpacked)
    return fasta_path
29+
30+
31+
@pytest.fixture
def alignments_store(data_cache, alignments_fasta):
    """
    Yield an AlignmentStore opened in the default ("r") mode on the cached
    alignments database, building the database from the FASTA file on
    first use. The store is closed again when the test finishes.
    """
    cache_path = data_cache / "alignments.db"
    if not cache_path.exists():
        with sa.AlignmentStore(cache_path, "a") as a:
            fasta = core.FastaReader(alignments_fasta)
            a.append(fasta, show_progress=False)
    # Use the store as a context manager so the underlying environment is
    # released after each test instead of being leaked until garbage
    # collection.
    with sa.AlignmentStore(cache_path) as store:
        yield store
39+
40+
41+
class TestAlignmentsStore:
    """Checks of the Mapping-style interface of the alignment store."""

    def test_info(self, alignments_store):
        """The string form of the store reports its contents."""
        description = str(alignments_store)
        assert "contains" in description

    def test_len(self, alignments_store):
        """The cached test database holds the expected number of entries."""
        num_alignments = len(alignments_store)
        assert num_alignments == 55

    def test_fetch_known(self, alignments_store):
        """A known strain is returned as a full-length array of bases."""
        alignment = alignments_store["SRR11772659"]
        assert alignment.shape == (core.REFERENCE_SEQUENCE_LENGTH,)
        assert alignment[0] == "X"
        assert alignment[1] == "N"
        assert alignment[-1] == "N"

    def test_keys(self, alignments_store):
        """keys() lists one entry per stored alignment."""
        strain_ids = list(alignments_store.keys())
        assert len(strain_ids) == len(alignments_store)
        assert "SRR11772659" in strain_ids

    def test_in(self, alignments_store):
        """Membership tests distinguish stored from unknown strains."""
        assert "SRR11772659" in alignments_store
        assert "NOT_IN_STORE" not in alignments_store
63+
64+
65+
def test_get_gene_coordinates():
    """The gene coordinate table has 11 entries, including the S gene."""
    gene_coordinates = core.get_gene_coordinates()
    assert len(gene_coordinates) == 11
    assert gene_coordinates["S"] == (21563, 25384)
69+
70+
71+
class TestEncodeAligment:
    """
    Tests for encoding character alignments as integer arrays and decoding
    them back again.
    """

    @pytest.mark.parametrize(
        ["hap", "expected"],
        [
            ("A", [0]),
            ("C", [1]),
            ("G", [2]),
            ("T", [3]),
            ("-", [4]),
            ("N", [-1]),
            ("ACGT-N", [0, 1, 2, 3, 4, -1]),
            ("N-TGCA", [-1, 4, 3, 2, 1, 0]),
            ("ACAGTAC-N", [0, 1, 0, 2, 3, 0, 1, 4, -1]),
        ],
    )
    def test_examples(self, hap, expected):
        """Known haplotypes encode to the expected codes and round-trip."""
        h = np.array(list(hap), dtype="U1")
        a = sa.encode_alignment(h)
        assert_array_equal(a, expected)
        assert_array_equal(h, sa.decode_alignment(a))

    @pytest.mark.parametrize("hap", "RYSWKMDHVN.")
    def test_iupac_uncertain_missing(self, hap):
        """IUPAC ambiguity codes are encoded as missing data (-1)."""
        h = np.array(list(hap), dtype="U1")
        a = sa.encode_alignment(h)
        assert_array_equal(a, [-1])

    @pytest.mark.parametrize("hap", "XZxz")
    def test_other_missing(self, hap):
        """Other non-nucleotide letters are encoded as missing data (-1)."""
        h = np.array(list(hap), dtype="U1")
        a = sa.encode_alignment(h)
        assert_array_equal(a, [-1])

    @pytest.mark.parametrize("hap", "acgt")
    def test_lowercase_nucleotide_missing(self, hap):
        """Lowercase nucleotides are encoded as missing data (-1)."""
        h = np.array(list(hap), dtype="U1")
        a = sa.encode_alignment(h)
        assert_array_equal(a, [-1])

    @pytest.mark.parametrize(
        "a",
        [
            [-2],
            [-3],
            [5],
            [6],
            [0, -2],
        ],
    )
    # Previously also named ``test_examples``, which rebound the class
    # attribute and silently stopped the round-trip examples above from
    # being collected. A distinct name lets both tests run.
    def test_decode_bad_values(self, a):
        """Decoding codes outside the valid range raises ValueError."""
        with pytest.raises(ValueError):
            sa.decode_alignment(np.array(a))

    def test_encode_real(self, alignments_store):
        """Both ends of a real stored alignment encode as missing."""
        h = alignments_store["SRR11772659"]
        a = sa.encode_alignment(h)
        assert a[0] == -1
        assert a[-1] == -1
129+
130+
131+
class TestMasking:
    """Tests for masking sites of an encoded alignment with a given window."""

    @staticmethod
    def _check_masking(hap, expected, masked, window_size):
        """
        Encode hap, mask it with the given window size, then check the
        number of masked sites reported and the decoded result.
        """
        encoded = sa.encode_alignment(np.array(list(hap), dtype="U1"))
        expected_bases = np.array(list(expected), dtype="U1")
        masked_sites = sa.mask_alignment(encoded, window_size=window_size)
        assert len(masked_sites) == masked
        # The expectation is compared against the array that was passed to
        # mask_alignment, not against its return value.
        assert_array_equal(expected_bases, sa.decode_alignment(encoded))

    # Window size of 1 is weird because we have to have two or more
    # ambiguous characters. That means we only filter if something is
    # surrounded.
    @pytest.mark.parametrize(
        ["hap", "expected", "masked"],
        [
            ("A", "A", 0),
            ("-", "-", 0),
            ("-A-", "-N-", 1),
            ("NAN", "NNN", 1),
            ("---AAC---", "-N-AAC-N-", 2),
        ],
    )
    def test_examples_w1(self, hap, expected, masked):
        """Worked examples with a window size of 1."""
        self._check_masking(hap, expected, masked, 1)

    @pytest.mark.parametrize(
        ["hap", "expected", "masked"],
        [
            ("A", "A", 0),
            ("-", "-", 0),
            ("--A--", "-NNN-", 3),
            ("---AAAA---", "NNNNAANNNN", 8),
            ("NNNAAAANNN", "NNNNAANNNN", 8),
            ("-N-AAAA-N-", "NNNNAANNNN", 8),
        ],
    )
    def test_examples_w2(self, hap, expected, masked):
        """Worked examples with a window size of 2."""
        self._check_masking(hap, expected, masked, 2)

    @pytest.mark.parametrize("w", [0, -1, -2])
    def test_bad_window_size(self, w):
        """Window sizes below 1 are rejected with ValueError."""
        encoded = np.zeros(2, dtype=np.int8)
        with pytest.raises(ValueError):
            sa.mask_alignment(encoded, window_size=w)
177+
178+
179+
class TestEncodeAndMask:
    """Regression test for encode_and_mask on a known stored alignment."""

    def test_known(self, alignments_store):
        """Check recorded summary values for strain SRR11772659."""
        alignment = alignments_store["SRR11772659"]
        result = sa.encode_and_mask(alignment)

        expected_composition = {
            "T": 9566,
            "A": 8894,
            "G": 5850,
            "C": 5472,
            "N": 121,
        }
        assert result.original_base_composition == expected_composition
        assert result.original_md5 == "e96feaa72c4f4baba73c2e147ede7502"

        sites = result.masked_sites
        assert len(sites) == 133
        assert sites[0] == 1
        assert sites[-1] == 29903

tests/test_convert.py

Lines changed: 0 additions & 118 deletions
This file was deleted.

0 commit comments

Comments
 (0)