Commit 239f081

Rename decode_alignment to decode_alleles
1 parent: f011ea4

4 files changed: +117 −7 lines changed

docs/alignments_analysis.md

Lines changed: 66 additions & 1 deletion
@@ -110,11 +110,76 @@ for j, a in enumerate(ds.alignment.values()):
 np.mean(gap_count)
 ```
 
+:::{warning}
+The arrays returned by the ``alignment`` mapping are **zero-based**, and you
+must compensate when using **one-based** coordinates.
+:::
+
+If you want to access specific slices of the array using **one-based**
+coordinates, it's important to take this zero-based indexing into account.
+Suppose we wanted to access the first 10 bases of Spike for a given sample.
+The first base of Spike is 21563 in standard one-based coordinates. While we
+could do some arithmetic to compensate, the simplest way to translate is to
+prepend a placeholder value to the alignment array:
+
+```{code-cell}
+a = np.append([-1], ds.alignment["SRR11597146"])
+spike_start = 21_563
+a[spike_start: spike_start + 10]
+```
 
 (sec_alignments_analysis_data_encoding)=
 
 ## Alignment data encoding
 
-Stuff
+A key element of processing data efficiently in [tskit](https://tskit.dev) and
+VCF Zarr is to use numpy arrays of integers to represent allelic states,
+instead of the classical approach of using strings. In sc2ts, alleles are
+given fixed integer representations, such that A=0, C=1, G=2, and T=3. So, to
+represent the DNA string "AACTG" we would use the numpy array [0, 0, 1, 3, 2]
+instead. This has many advantages and makes it much easier to write efficient
+code.
+
+The drawback is that integer-encoded data is not as easy to inspect and debug,
+and we must always be aware of the translation required.
+
+Sc2ts provides some utilities for translating between the two. The easiest way
+to get the string values is to use the {func}`decode_alleles` function:
+
+```{code-cell}
+a = sc2ts.decode_alleles(ds.alignment["SRR11597146"])
+a
+```
+
+This is a numpy string array, which can still be processed quite efficiently.
+However, it is best to stay in the native integer encoding where possible, as
+it is much more efficient.
+
+Sc2ts uses the [IUPAC](https://www.bioinformatics.org/sms/iupac.html)
+uncertainty codes to encode ambiguous bases, and the {attr}`sc2ts.IUPAC_ALLELES`
+variable stores the mapping from these values to their integer indexes.
+
+```{code-cell}
+sc2ts.IUPAC_ALLELES
+```
+
+Thus, "A" corresponds to 0, "-" to 4, and so on.
 
 
+### Missing data
+
+Missing data is an important element of the data model. Usually, missing data
+is encoded as an "N" character in the alignments. However, there is no "N" in
+the ``IUPAC_ALLELES`` list above. This is because missing data is handled
+specially in VCF Zarr by mapping it to the reserved ``-1`` value. Missing data
+can therefore be flagged easily and handled correctly by downstream utilities.
+
+:::{warning}
+It is important to take this into account when translating the integer-encoded
+data into strings, because -1 is interpreted as the last element of a list in
+Python. Please use the {func}`decode_alleles` function, which handles missing
+data correctly.
+:::
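As a concrete illustration of the encoding described in the documentation changes above, here is a minimal, self-contained sketch of the round trip between DNA strings and integer codes. The ``ALLELES`` string and the ``encode``/``decode`` helpers are hypothetical stand-ins (the real mapping is ``sc2ts.IUPAC_ALLELES``, which also includes the IUPAC ambiguity codes); the trailing ``"N"`` trick for missing data mirrors what ``decode_alleles`` does in the dataset.py diff below.

```python
import numpy as np

# Hypothetical, simplified stand-in for sc2ts.IUPAC_ALLELES, covering only the
# unambiguous bases and the gap character: A=0, C=1, G=2, T=3, "-"=4.
ALLELES = "ACGT-"


def encode(seq):
    # Map each character to its integer index; anything unknown (e.g. "N")
    # becomes -1, matching the VCF Zarr convention for missing data.
    lookup = {c: i for i, c in enumerate(ALLELES)}
    return np.array([lookup.get(c, -1) for c in seq], dtype=np.int8)


def decode(a):
    # Appending "N" means the missing-data value -1 indexes the last element.
    table = np.array(tuple(ALLELES + "N"), dtype=str)
    return table[a]


h = encode("AACTGN")
print(h)          # [ 0  0  1  3  2 -1]
print(decode(h))  # ['A' 'A' 'C' 'T' 'G' 'N']
```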

sc2ts/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -2,5 +2,5 @@
 
 # star imports are fine here as it's just a bunch of constants
 from .core import *
-from .dataset import mask_ambiguous, mask_flanking_deletions, decode_alignment, Dataset
+from .dataset import mask_ambiguous, mask_flanking_deletions, decode_alleles, Dataset
 from .stats import node_data, mutation_data
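After this change the function is re-exported from the package under its new name, so any downstream code importing the old name needs a one-line update (a usage sketch, not part of the diff):

```python
# Before this commit:
#     from sc2ts import decode_alignment
# After this commit:
from sc2ts import decode_alleles
```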

sc2ts/dataset.py

Lines changed: 16 additions & 4 deletions
@@ -24,18 +24,22 @@
 DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7, shuffle=0)
 
 
-def decode_alignment(a):
+def decode_alleles(a):
     """
-    Decode an array of integer-encoded alleles into their IUPAC string values.
+    Decode an array of integer-encoded alleles into their IUPAC string values,
+    returned as a numpy array.
 
     The input should use the encoding defined by ``core.IUPAC_ALLELES``,
-    with ``-1`` representing missing data; a trailing ``"N"`` allele is
-    added here for convenience when working with masked arrays.
+    with ``-1`` representing missing data.
 
     :param numpy.ndarray a: Integer-encoded alignment array.
     :return: Array of single-character IUPAC allele codes.
     :rtype: numpy.ndarray
     """
+    if np.any(a < -1):
+        raise ValueError("Negative values < -1 not supported")
+    if np.any(a >= len(core.IUPAC_ALLELES)):
+        raise ValueError("Unknown allele value")
     alleles = np.array(tuple(core.IUPAC_ALLELES + "N"), dtype=str)
     return alleles[a]
 
@@ -288,13 +292,21 @@ def __len__(self):
     def alignment(self):
         """
         Efficient mapping of sample ID strings to integer encoded alignment data.
+
+        The returned object is dictionary-like, implementing the Mapping protocol.
+        Access to the underlying Zarr store is mediated by a chunk cache, so that
+        chunks are not repeatedly decompressed.
         """
         return self._alignment
 
     @property
     def metadata(self):
         """
         Efficient mapping of sample ID strings to metadata dictionaries.
+
+        The returned object is dictionary-like, implementing the Mapping protocol.
+        Access to the underlying Zarr store is mediated by a chunk cache, so that
+        chunks are not repeatedly decompressed.
         """
         return self._metadata
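The new docstrings for ``alignment`` and ``metadata`` describe a dictionary-like view whose Zarr access goes through a chunk cache. The ``Dataset`` internals are not part of this diff, so the following is only an illustrative sketch of that pattern, assuming a 2D Zarr array of shape ``(num_sites, num_samples)`` and a list of sample IDs; none of these names come from sc2ts itself.

```python
import functools


class ChunkCachedAlignments:
    """
    Illustrative sketch (not the sc2ts implementation) of a partial Mapping
    over a Zarr array of integer-encoded alignments, shaped (sites, samples).
    Whole sample-chunks are cached so that repeated lookups of nearby samples
    do not trigger repeated decompression.
    """

    def __init__(self, zarr_array, sample_ids, cached_chunks=8):
        self.array = zarr_array
        self.index = {sid: j for j, sid in enumerate(sample_ids)}
        self._chunk_width = zarr_array.chunks[1]  # samples per chunk
        self._get_chunk = functools.lru_cache(maxsize=cached_chunks)(self._read_chunk)

    def _read_chunk(self, chunk_index):
        # Decompress one full chunk of samples and keep it in the LRU cache.
        start = chunk_index * self._chunk_width
        return self.array[:, start : start + self._chunk_width]

    def __getitem__(self, sample_id):
        j = self.index[sample_id]
        chunk = self._get_chunk(j // self._chunk_width)
        return chunk[:, j % self._chunk_width]

    def __len__(self):
        return len(self.index)
```

Caching whole chunks amortises the cost of decompression across the samples stored in the same chunk, which is the behaviour the docstrings allude to.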

tests/test_dataset.py

Lines changed: 34 additions & 1 deletion
@@ -409,7 +409,7 @@ def test_compare_fasta(self, fx_dataset, fx_alignments_fasta):
         fr = data_import.FastaReader(fx_alignments_fasta)
         for k, a1 in fr.items():
             h = fx_dataset.alignment[k]
-            a2 = sc2ts.decode_alignment(h)
+            a2 = sc2ts.decode_alleles(h)
             nt.assert_array_equal(a1[1:], a2)
 
     def test_len(self, fx_dataset):
@@ -560,6 +560,39 @@ def test_other_error(self, hap):
             jit.encode_alignment(h)
 
 
+class TestDecodeAlleles:
+    @pytest.mark.parametrize(
+        ["a", "expected"],
+        [
+            ([], []),
+            ([0], ["A"]),
+            ([1], ["C"]),
+            ([2], ["G"]),
+            ([3], ["T"]),
+            ([4], ["-"]),
+            ([15], ["."]),
+            ([-1], ["N"]),
+            ([0, 1, 2, 3, 4, -1], list("ACGT-N")),
+            ([-1, 4, 3, 2, 1, 0], list("N-TGCA")),
+            ([0, 1, 0, 2, 3, 0, 1, 4, -1], list("ACAGTAC-N")),
+            (range(len(sc2ts.IUPAC_ALLELES)), list(sc2ts.IUPAC_ALLELES)),
+        ],
+    )
+    def test_examples(self, a, expected):
+        h = sc2ts.decode_alleles(np.array(a, dtype=int))
+        nt.assert_array_equal(h, expected)
+
+    @pytest.mark.parametrize("a", [-2, -3, -100])
+    def test_too_negative(self, a):
+        with pytest.raises(ValueError, match="Negative values"):
+            sc2ts.decode_alleles(np.array(a, dtype=int))
+
+    @pytest.mark.parametrize("a", [16, 17, 100])
+    def test_too_large(self, a):
+        with pytest.raises(ValueError, match="Unknown allele"):
+            sc2ts.decode_alleles(np.array(a, dtype=int))
+
+
 class TestMaskFlankingDeletions:
 
     @pytest.mark.parametrize(
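The ``([-1], ["N"])`` case works because ``decode_alleles`` builds its lookup table as ``IUPAC_ALLELES + "N"`` and NumPy treats ``-1`` as the last index. A quick check of that mechanism (assuming ``sc2ts`` is importable):

```python
import numpy as np
import sc2ts

# -1 (missing data) selects the trailing "N" via negative indexing.
table = np.array(tuple(sc2ts.IUPAC_ALLELES + "N"), dtype=str)
print(table[np.array([0, 4, -1])])  # ['A' '-' 'N']
```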

0 commit comments
