Merge pull request #581 from tskit-dev/more-docs-2

jeromekelleher · web-flow · commit c56546c492e2 · 2025-11-21T14:44:37.000Z
Work on document Dataset interface
diff --git a/docs/alignments_analysis.md b/docs/alignments_analysis.md
@@ -53,9 +53,144 @@ and {attr}`Dataset.num_variants` attributes.
 
 To get information on the metadata fields that are present, we can use
 
-
 ```{code-cell}
 ds.metadata.field_descriptors()
 ```
+:::{warning}
+The ``description`` column is currently empty because of a bug in the
+data ingest pipeline for the Virian data. Later versions will include
+this information so that the dataset is self-describing.
+See [GitHub issue](https://github.com/tskit-dev/sc2ts/issues/579).
+:::
+
+
+
+## Accessing per-sample information
+
+The easiest way to get information about a single sample is through the
+the ``.metadata`` and ``.haplotypes`` interfaces. First, let's get
+the sample IDs for the first 10 samples:
+
+```{code-cell}
+ds.sample_id[:10]
+```
+Then, we can get the metadata for a given sample as a dictionary using
+the {attr}`Dataset.metadata` interface:
+
+```{code-cell}
+ds.metadata["SRR11597146"]
+```
+
+Similarly, we can get the integer encoded alignment for a sample using
+the {attr}`Dataset.alignment` interface:
+
+```{code-cell}
+ds.alignment["SRR11597146"]
+```
+
+:::{seealso}
+See the section {ref}`sec_alignments_analysis_data_encoding` for
+details on the integer encoding for alignment data used here.
+:::
+
+Both the ``.metadata`` and ``.aligments`` interfaces are **cached**
+(avoiding repeated decompression of the same underlying Zarr chunks)
+and support iteration, and so provide an efficient way of accessing
+data in bulk. For example, here we compute the mean number of
+gap ("-") characters per sample:
+
+```{code-cell}
+import numpy as np
+
+GAP = sc2ts.IUPAC_ALLELES.index("-")
+
+gap_count = np.zeros(ds.num_samples)
+for j, a in enumerate(ds.alignment.values()):
+    gap_count[j] = np.sum(a == GAP)
+np.mean(gap_count)
+```
+
+:::{warning}
+The arrays returned by the ``alignment`` interface are **zero based** and you
+must compensate to use **one-based** coordinates.
+:::
+
+If you want to access
+specific slices of the array based on **one-based** coordinates, it's important
+to take the zero-based nature of this into account. Suppose we wanted to
+access the first 10 bases of Spike for a given sample. The first
+base of Spike is 21563 in standard one-based coordinates. While we could do
+some arithmetic to compensate, the simplest way to translate is to simply
+prepend some value to the alignment array:
+
+```{code-cell}
+a = np.append([-1], ds.alignment["SRR11597146"])
+spike_start = 21_563
+a[spike_start: spike_start + 10]
+```
+
+(sec_alignments_analysis_data_encoding)=
+
+## Alignment data encoding
+
+A key element of processing data efficiently in [tskit](https://tskit.dev) and VCF
+Zarr is to use numpy
+arrays of integers to represent allelic states, instead of the classical
+approach of using strings. In sc2ts, alleles are given fixed integer
+representations, such that A=0, C=1, G=2, and T=3. So, to represent the DNA
+string "AACTG" we would use the numpy array [0, 0, 1, 3, 2] instead. This has
+many advantages and makes it much easier to write efficient code.
+
+The drawback of this is that it's not as easy to inspect and debug, and we must
+always be aware of the translation required.
+
+Sc2ts provides some utilities for doing this. The easiest way to get the string
+values is to use {func}`decode_alleles` function:
+
+```{code-cell}
+a = sc2ts.decode_alleles(ds.alignment["SRR11597146"])
+a
+```
+This is a numpy string array, which can still be processed quite efficiently.
+However, it is best to stay in native integer encoding where possible, as it
+is much more efficient.
+
+
+Sc2ts uses the [IUPAC](https://www.bioinformatics.org/sms/iupac.html)
+uncertainty codes to encode ambiguous bases, and the {attr}`sc2ts.IUPAC_ALLELES`
+variable stores the mapping from these values to their integer indexes.
+
+```{code-cell}
+sc2ts.IUPAC_ALLELES
+```
+
+Thus, "A" corresponds to 0, "-" to 4 and so on.
+
+
+### Missing data
+
+Missing data is an important element of the data model. Usually, missing data is
+encoded as an "N" character in the alignments. Howevever, there is no "N"
+in the ``IUPAC_ALLELES`` list above. This is because missing data is handled specially
+in VCF Zarr by mapping to the reserved ``-1`` value. Missing data can therefore be flagged
+easily and handled correctly by downstream utilities.
+
+:::{warning}
+It is important to take this into account when translating the integer encoded data into
+strings, because -1 is interpreted as the last element of the list in Python. Please
+use the {func}`decode_alleles` function to avoid this tripwire.
+:::
+
+
+## Accessing by variant
+
+A unique feature of the VCF Zarr encoding used here is that we can efficiently access
+the alignment data by sample **and** by site. The best way to access data by site
+is to use the {meth}`Dataset.variants` method.
 
+:::{note}
+The {meth}`Dataset.variants` method is deliberately designed to mirror the API
+of the corresponding [tskit](https://tskit.dev) function
+({meth}`tskit.TreeSequence.variants`).
+:::
 
diff --git a/docs/api.md b/docs/api.md
@@ -35,7 +35,7 @@ notebooks.
 ```{eval-rst}
 .. autosummary::
    Dataset
-   decode_alignment
+   decode_alleles
    mask_ambiguous
    mask_flanking_deletions
 ```
@@ -44,7 +44,7 @@ notebooks.
 .. autoclass:: Dataset
    :members:
 
-.. autofunction:: decode_alignment
+.. autofunction:: decode_alleles
 
 .. autofunction:: mask_ambiguous
 
diff --git a/sc2ts/__init__.py b/sc2ts/__init__.py
@@ -2,5 +2,5 @@
 
 # star imports are fine here as it's just a bunch of constants
 from .core import *
-from .dataset import mask_ambiguous, mask_flanking_deletions, decode_alignment, Dataset
+from .dataset import mask_ambiguous, mask_flanking_deletions, decode_alleles, Dataset
 from .stats import node_data, mutation_data
diff --git a/sc2ts/cli.py b/sc2ts/cli.py
@@ -157,7 +157,7 @@ def import_alignments(dataset, fastas, initialise, progress, verbose):
             position=1,
         )
         for k, v in a_bar:
-            alignments[k] = jit.encode_alignment(v)
+            alignments[k] = jit.encode_alleles(v)
         sc2ts.Dataset.append_alignments(dataset, alignments)
 
 
diff --git a/sc2ts/dataset.py b/sc2ts/dataset.py
@@ -24,18 +24,22 @@
 DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7, shuffle=0)
 
 
-def decode_alignment(a):
+def decode_alleles(a):
     """
-    Decode an array of integer-encoded alleles into their IUPAC string values.
+    Decode an array of integer-encoded alleles into their IUPAC string values
+    returned as a numpy array.
 
     The input should use the encoding defined by ``core.IUPAC_ALLELES``,
-    with ``-1`` representing missing data; a trailing ``\"N\"`` allele is
-    added here for convenience when working with masked arrays.
+    with ``-1`` representing missing data.
 
     :param numpy.ndarray a: Integer-encoded alignment array.
     :return: Array of single-character IUPAC allele codes.
     :rtype: numpy.ndarray
     """
+    if np.any(a < -1):
+        raise ValueError("Negative values < -1 not supported")
+    if np.any(a >= len(core.IUPAC_ALLELES)):
+        raise ValueError("Unknown allele value")
     alleles = np.array(tuple(core.IUPAC_ALLELES + "N"), dtype=str)
     return alleles[a]
 
@@ -246,7 +250,7 @@ def __init__(self, path, chunk_cache_size=1, date_field=None):
 
         :param str path: Path to a directory or ``.zip`` Zarr store.
         :param int chunk_cache_size: Maximum number of chunks to cache for
-            haplotypes and metadata. Defaults to 1.
+            alignments and metadata. Defaults to 1.
         :param str date_field: Name of the metadata field to use as the
             sample date, or ``None`` to disable date handling. Defaults
             to ``None``.
@@ -259,16 +263,16 @@ def __init__(self, path, chunk_cache_size=1, date_field=None):
         else:
             self.store = zarr.DirectoryStore(path)
         self.root = zarr.open(self.store, mode="r")
-        self.sample_id = self.root["sample_id"][:].astype(str)
+        self._sample_id = self.root["sample_id"][:].astype(str)
 
         # TODO we should be storing this mapping in the Zarr somehow.
         self.sample_id_map = {
-            sample_id: k for k, sample_id in enumerate(self.sample_id)
+            sample_id: k for k, sample_id in enumerate(self._sample_id)
         }
-        self.haplotypes = CachedHaplotypeMapping(
+        self._alignment = CachedHaplotypeMapping(
             self.root, self.sample_id_map, chunk_cache_size
         )
-        self.metadata = CachedMetadataMapping(
+        self._metadata = CachedMetadataMapping(
             self.root,
             self.sample_id_map,
             date_field,
@@ -284,6 +288,35 @@ def __iter__(self):
     def __len__(self):
         return len(self.root)
 
+    @property
+    def alignment(self):
+        """
+        Efficient mapping of sample ID strings to integer encoded alignment data.
+
+        The returned object is dictionary-like implemening the Mapping protocol.
+        Access to the underlying Zarr store is mediated by a chunk cache, so that
+        chunks are not repeatedly decompressed.
+        """
+        return self._alignment
+
+    @property
+    def metadata(self):
+        """
+        Efficient mapping of sample ID strings to metadata dictionaries.
+
+        The returned object is dictionary-like implemening the Mapping protocol.
+        Access to the underlying Zarr store is mediated by a chunk cache, so that
+        chunks are not repeatedly decompressed.
+        """
+        return self._metadata
+
+    @property
+    def sample_id(self):
+        """
+        The sample IDs as a numpy string array.
+        """
+        return self._sample_id
+
     @property
     def samples_chunk_size(self):
         return self.root["call_genotype"].chunks[1]
@@ -380,8 +413,8 @@ def write_fasta(self, out, sample_id=None):
             sample_id = self.sample_id
 
         for sid in sample_id:
-            h = self.haplotypes[sid]
-            a = decode_alignment(h)
+            h = self.alignment[sid]
+            a = decode_alleles(h)
             print(f">{sid}", file=out)
             # FIXME this is probably a terrible way to write a large numpy string to
             # a file
@@ -409,7 +442,7 @@ def copy(
         alignments = {}
         bar = tqdm.tqdm(sample_id, desc="Samples", disable=not show_progress)
         for s in bar:
-            alignments[s] = self.haplotypes[s]
+            alignments[s] = self.alignment[s]
             if len(alignments) == samples_chunk_size:
                 Dataset.append_alignments(path, alignments)
                 alignments = {}
diff --git a/sc2ts/inference.py b/sc2ts/inference.py
@@ -489,7 +489,7 @@ def preprocess(
     samples = []
     bar = get_progress(strains, progress_title, "preprocess", show_progress)
     for strain in bar:
-        alignment = dataset.haplotypes[strain]
+        alignment = dataset.alignment[strain]
         alignment = _dataset.mask_flanking_deletions(alignment)
         sample = Sample(strain)
         # No padding zero site in the alignment
@@ -1227,10 +1227,10 @@ def make_tsb(ts, num_alleles, mirror_coordinates=False):
     ts = tree_ops.insert_vestigial_root_edge(ts)
 
     # Convert arrays for numba compatibility
-    ancestral_state = jit.encode_alignment(
+    ancestral_state = jit.encode_alleles(
         np.asarray(ts.sites_ancestral_state, dtype="U1")
     )
-    derived_state = jit.encode_alignment(
+    derived_state = jit.encode_alleles(
         np.asarray(ts.mutations_derived_state, dtype="U1")
     )
 
diff --git a/sc2ts/jit.py b/sc2ts/jit.py
@@ -203,7 +203,7 @@ def count(ts):
 
 # FIXME make cache optional.
 @numba.njit(cache=True)
-def encode_alignment(h):
+def encode_alleles(h):
     # Just so numba knows this is a constant string.
     alleles = "ACGT-RYSWKMBDHV."
     n = h.shape[0]
diff --git a/tests/sc2ts_fixtures.py b/tests/sc2ts_fixtures.py
@@ -47,7 +47,7 @@ def encoded_alignments(path):
     fr = data_import.FastaReader(path)
     alignments = {}
     for k, v in fr.items():
-        alignments[k] = jit.encode_alignment(v[1:])
+        alignments[k] = jit.encode_alleles(v[1:])
     return alignments
 
 
@@ -205,8 +205,8 @@ def recombinant_alignments(dataset):
     Generate some recombinant alignments from existing haplotypes
     """
     strains = ["SRR11597188", "SRR11597163"]
-    left_a = dataset.haplotypes[strains[0]]
-    right_a = dataset.haplotypes[strains[1]]
+    left_a = dataset.alignment[strains[0]]
+    right_a = dataset.alignment[strains[1]]
     # Recombine in the middle
     bp = 9_999
     h = left_a.copy()
@@ -243,7 +243,7 @@ def recombinant_example_2(tmp_path, fx_ts_map, fx_dataset, ds_path):
     # Pick a distinct strain to be the root of our two new haplotypes added
     # on the first day.
     root_strain = "SRR11597116"
-    a = fx_dataset.haplotypes[root_strain]
+    a = fx_dataset.alignment[root_strain]
     base_ts = fx_ts_map["2020-02-13"]
     # This sequence has a bunch of Ns at the start, so we have to go inwards
     # from them to make sure we're not masking them out.
@@ -310,7 +310,7 @@ def recombinant_example_3(tmp_path, fx_ts_map, fx_dataset, ds_path):
     # Pick a distinct strain to be the root of our three new haplotypes added
     # on the first day.
     root_strain = "SRR11597116"
-    a = fx_dataset.haplotypes[root_strain]
+    a = fx_dataset.alignment[root_strain]
     base_ts = fx_ts_map["2020-02-13"]
     # This sequence has a bunch of Ns at the start, so we have to go inwards
     # from them to make sure we're not masking them out.
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
diff --git a/tests/test_inference.py b/tests/test_inference.py

Original file line number	Diff line number	Diff line change
`@@ -157,7 +157,7 @@ def import_alignments(dataset, fastas, initialise, progress, verbose):`
`157`	`157`	`position=1,`
`158`	`158`	`)`
`159`	`159`	`for k, v in a_bar:`
`160`		`- alignments[k] = jit.encode_alignment(v)`
	`160`	`+ alignments[k] = jit.encode_alleles(v)`
`161`	`161`	`sc2ts.Dataset.append_alignments(dataset, alignments)`
`162`	`162`
`163`	`163`