Commit 239f081

Rename decode_alignment to decode_alleles
1 parent: f011ea4

4 files changed: +117 −7 lines changed

docs/alignments_analysis.md

Lines changed: 66 additions & 1 deletion
@@ -110,11 +110,76 @@ for j, a in enumerate(ds.alignment.values()):
 np.mean(gap_count)
 ```
 
+:::{warning}
+The arrays returned by the ``alignment`` mapping are **zero-based**, and you
+must compensate when using **one-based** coordinates.
+:::
+
+If you want to access specific slices of the array using **one-based**
+coordinates, it's important to take this zero-based indexing into account.
+Suppose we wanted to access the first 10 bases of Spike for a given sample.
+The first base of Spike is 21563 in standard one-based coordinates. While we
+could do some arithmetic to compensate, the simplest way to translate is to
+prepend a placeholder value to the alignment array:
+
+```{code-cell}
+a = np.append([-1], ds.alignment["SRR11597146"])
+spike_start = 21_563
+a[spike_start: spike_start + 10]
+```
 
 (sec_alignments_analysis_data_encoding)=
 
 ## Alignment data encoding
 
-Stuff
+A key element of processing data efficiently in [tskit](https://tskit.dev) and
+VCF Zarr is to use numpy arrays of integers to represent allelic states,
+instead of the classical approach of using strings. In sc2ts, alleles are
+given fixed integer representations, such that A=0, C=1, G=2, and T=3. So, to
+represent the DNA string "AACTG" we would use the numpy array [0, 0, 1, 3, 2]
+instead. This has many advantages and makes it much easier to write efficient
+code.
+
+The drawback is that integer-encoded data is not as easy to inspect and debug,
+and we must always be aware of the translation required.
+
+Sc2ts provides some utilities for translating between the two. The easiest way
+to get the string values is to use the {func}`decode_alleles` function:
+
+```{code-cell}
+a = sc2ts.decode_alleles(ds.alignment["SRR11597146"])
+a
+```
+
+This is a numpy string array, which can still be processed quite efficiently.
+However, it is best to stay in the native integer encoding where possible, as
+it is much more efficient.
+
+Sc2ts uses the [IUPAC](https://www.bioinformatics.org/sms/iupac.html)
+uncertainty codes to encode ambiguous bases, and the {attr}`sc2ts.IUPAC_ALLELES`
+variable stores the mapping from these values to their integer indexes.
+
+```{code-cell}
+sc2ts.IUPAC_ALLELES
+```
+
+Thus, "A" corresponds to 0, "-" to 4, and so on.
 
 
+### Missing data
+
+Missing data is an important element of the data model. Usually, missing data
+is encoded as an "N" character in the alignments. However, there is no "N" in
+the ``IUPAC_ALLELES`` list above. This is because missing data is handled
+specially in VCF Zarr by mapping it to the reserved ``-1`` value. Missing data
+can therefore be flagged easily and handled correctly by downstream utilities.
+
+:::{warning}
+It is important to take this into account when translating the integer-encoded
+data into strings, because -1 is interpreted as the last element of a list in
+Python. Please use the {func}`decode_alleles` function, which handles missing
+data correctly.
+:::
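As a concrete illustration of the encoding described in the documentation changes above, here is a minimal, self-contained sketch of the round trip between DNA strings and integer codes. The ``ALLELES`` string and the ``encode``/``decode`` helpers are hypothetical stand-ins (the real mapping is ``sc2ts.IUPAC_ALLELES``, which also includes the IUPAC ambiguity codes); the trailing ``"N"`` trick for missing data mirrors what ``decode_alleles`` does in the dataset.py diff below.

```python
import numpy as np

# Hypothetical, simplified stand-in for sc2ts.IUPAC_ALLELES, covering only the
# unambiguous bases and the gap character: A=0, C=1, G=2, T=3, "-"=4.
ALLELES = "ACGT-"


def encode(seq):
    # Map each character to its integer index; anything unknown (e.g. "N")
    # becomes -1, matching the VCF Zarr convention for missing data.
    lookup = {c: i for i, c in enumerate(ALLELES)}
    return np.array([lookup.get(c, -1) for c in seq], dtype=np.int8)


def decode(a):
    # Appending "N" means the missing-data value -1 indexes the last element.
    table = np.array(tuple(ALLELES + "N"), dtype=str)
    return table[a]


h = encode("AACTGN")
print(h)          # [ 0  0  1  3  2 -1]
print(decode(h))  # ['A' 'A' 'C' 'T' 'G' 'N']
```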

sc2ts/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -2,5 +2,5 @@
 
 # star imports are fine here as it's just a bunch of constants
 from .core import *
-from .dataset import mask_ambiguous, mask_flanking_deletions, decode_alignment, Dataset
+from .dataset import mask_ambiguous, mask_flanking_deletions, decode_alleles, Dataset
 from .stats import node_data, mutation_data
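After this change the function is re-exported from the package under its new name, so any downstream code importing the old name needs a one-line update (a usage sketch, not part of the diff):

```python
# Before this commit:
#     from sc2ts import decode_alignment
# After this commit:
from sc2ts import decode_alleles
```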

sc2ts/dataset.py

Lines changed: 16 additions & 4 deletions
@@ -24,18 +24,22 @@
 DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7, shuffle=0)
 
 
-def decode_alignment(a):
+def decode_alleles(a):
     """
-    Decode an array of integer-encoded alleles into their IUPAC string values.
+    Decode an array of integer-encoded alleles into their IUPAC string values,
+    returned as a numpy array.
 
     The input should use the encoding defined by ``core.IUPAC_ALLELES``,
-    with ``-1`` representing missing data; a trailing ``"N"`` allele is
-    added here for convenience when working with masked arrays.
+    with ``-1`` representing missing data.
 
     :param numpy.ndarray a: Integer-encoded alignment array.
     :return: Array of single-character IUPAC allele codes.
     :rtype: numpy.ndarray
     """
+    if np.any(a < -1):
+        raise ValueError("Negative values < -1 not supported")
+    if np.any(a >= len(core.IUPAC_ALLELES)):
+        raise ValueError("Unknown allele value")
     alleles = np.array(tuple(core.IUPAC_ALLELES + "N"), dtype=str)
     return alleles[a]
 
@@ -288,13 +292,21 @@ def __len__(self):
     def alignment(self):
         """
         Efficient mapping of sample ID strings to integer encoded alignment data.
+
+        The returned object is dictionary-like, implementing the Mapping protocol.
+        Access to the underlying Zarr store is mediated by a chunk cache, so that
+        chunks are not repeatedly decompressed.
         """
         return self._alignment
 
     @property
     def metadata(self):
         """
         Efficient mapping of sample ID strings to metadata dictionaries.
+
+        The returned object is dictionary-like, implementing the Mapping protocol.
+        Access to the underlying Zarr store is mediated by a chunk cache, so that
+        chunks are not repeatedly decompressed.
         """
         return self._metadata
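The new docstrings for ``alignment`` and ``metadata`` describe a dictionary-like view whose Zarr access goes through a chunk cache. The ``Dataset`` internals are not part of this diff, so the following is only an illustrative sketch of that pattern, assuming a 2D Zarr array of shape ``(num_sites, num_samples)`` and a list of sample IDs; none of these names come from sc2ts itself.

```python
import functools


class ChunkCachedAlignments:
    """
    Illustrative sketch (not the sc2ts implementation) of a partial Mapping
    over a Zarr array of integer-encoded alignments, shaped (sites, samples).
    Whole sample-chunks are cached so that repeated lookups of nearby samples
    do not trigger repeated decompression.
    """

    def __init__(self, zarr_array, sample_ids, cached_chunks=8):
        self.array = zarr_array
        self.index = {sid: j for j, sid in enumerate(sample_ids)}
        self._chunk_width = zarr_array.chunks[1]  # samples per chunk
        self._get_chunk = functools.lru_cache(maxsize=cached_chunks)(self._read_chunk)

    def _read_chunk(self, chunk_index):
        # Decompress one full chunk of samples and keep it in the LRU cache.
        start = chunk_index * self._chunk_width
        return self.array[:, start : start + self._chunk_width]

    def __getitem__(self, sample_id):
        j = self.index[sample_id]
        chunk = self._get_chunk(j // self._chunk_width)
        return chunk[:, j % self._chunk_width]

    def __len__(self):
        return len(self.index)
```

Caching whole chunks amortises the cost of decompression across the samples stored in the same chunk, which is the behaviour the docstrings allude to.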

tests/test_dataset.py

Lines changed: 34 additions & 1 deletion
@@ -409,7 +409,7 @@ def test_compare_fasta(self, fx_dataset, fx_alignments_fasta):
         fr = data_import.FastaReader(fx_alignments_fasta)
         for k, a1 in fr.items():
             h = fx_dataset.alignment[k]
-            a2 = sc2ts.decode_alignment(h)
+            a2 = sc2ts.decode_alleles(h)
             nt.assert_array_equal(a1[1:], a2)
 
     def test_len(self, fx_dataset):
@@ -560,6 +560,39 @@ def test_other_error(self, hap):
             jit.encode_alignment(h)
 
 
+class TestDecodeAlleles:
+    @pytest.mark.parametrize(
+        ["a", "expected"],
+        [
+            ([], []),
+            ([0], ["A"]),
+            ([1], ["C"]),
+            ([2], ["G"]),
+            ([3], ["T"]),
+            ([4], ["-"]),
+            ([15], ["."]),
+            ([-1], ["N"]),
+            ([0, 1, 2, 3, 4, -1], list("ACGT-N")),
+            ([-1, 4, 3, 2, 1, 0], list("N-TGCA")),
+            ([0, 1, 0, 2, 3, 0, 1, 4, -1], list("ACAGTAC-N")),
+            (range(len(sc2ts.IUPAC_ALLELES)), list(sc2ts.IUPAC_ALLELES)),
+        ],
+    )
+    def test_examples(self, a, expected):
+        h = sc2ts.decode_alleles(np.array(a, dtype=int))
+        nt.assert_array_equal(h, expected)
+
+    @pytest.mark.parametrize("a", [-2, -3, -100])
+    def test_too_negative(self, a):
+        with pytest.raises(ValueError, match="Negative values"):
+            sc2ts.decode_alleles(np.array(a, dtype=int))
+
+    @pytest.mark.parametrize("a", [16, 17, 100])
+    def test_too_large(self, a):
+        with pytest.raises(ValueError, match="Unknown allele"):
+            sc2ts.decode_alleles(np.array(a, dtype=int))
+
+
 class TestMaskFlankingDeletions:
 
     @pytest.mark.parametrize(
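The ``([-1], ["N"])`` case works because ``decode_alleles`` builds its lookup table as ``IUPAC_ALLELES + "N"`` and NumPy treats ``-1`` as the last index. A quick check of that mechanism (assuming ``sc2ts`` is importable):

```python
import numpy as np
import sc2ts

# -1 (missing data) selects the trailing "N" via negative indexing.
table = np.array(tuple(sc2ts.IUPAC_ALLELES + "N"), dtype=str)
print(table[np.array([0, 4, -1])])  # ['A' '-' 'N']
```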

0 commit comments
