
Commit f011ea4

Switch haplotypes to alignment and document
1 parent 7c5adbe · commit f011ea4

6 files changed, +100 −27 lines changed

docs/alignments_analysis.md

Lines changed: 60 additions & 1 deletion
````diff
@@ -53,9 +53,68 @@ and {attr}`Dataset.num_variants` attributes.
 
 To get information on the metadata fields that are present, we can use
 
-
 ```{code-cell}
 ds.metadata.field_descriptors()
 ```
+:::{warning}
+The ``description`` column is currently empty because of a bug in the
+data ingest pipeline for the Viridian data. Later versions will include
+this information so that the dataset is self-describing.
+See [GitHub issue](https://github.com/tskit-dev/sc2ts/issues/579).
+:::
+
+
+
+## Accessing per-sample information
+
+The easiest way to get information about a single sample is through the
+``.metadata`` and ``.alignment`` interfaces. First, let's get
+the sample IDs for the first 10 samples:
+
+```{code-cell}
+ds.sample_id[:10]
+```
+Then, we can get the metadata for a given sample as a dictionary using
+the {attr}`Dataset.metadata` interface:
+
+```{code-cell}
+ds.metadata["SRR11597146"]
+```
+
+Similarly, we can get the integer encoded alignment for a sample using
+the {attr}`Dataset.alignment` interface:
+
+```{code-cell}
+ds.alignment["SRR11597146"]
+```
+
+:::{seealso}
+See the section {ref}`sec_alignments_analysis_data_encoding` for
+details on the integer encoding for alignment data used here.
+:::
+
+Both the ``.metadata`` and ``.alignment`` interfaces are **cached**
+(avoiding repeated decompression of the same underlying Zarr chunks)
+and support iteration, so they provide an efficient way of accessing
+data in bulk. For example, here we compute the mean number of
+gap ("-") characters per sample:
+
+```{code-cell}
+import numpy as np
+
+GAP = sc2ts.IUPAC_ALLELES.index("-")
+
+gap_count = np.zeros(ds.num_samples)
+for j, a in enumerate(ds.alignment.values()):
+    gap_count[j] = np.sum(a == GAP)
+np.mean(gap_count)
+```
+
+
+(sec_alignments_analysis_data_encoding)=
+
+## Alignment data encoding
+
+Stuff
 
 
````
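The new ``Alignment data encoding`` section above is still a stub ("Stuff"), but the surrounding examples already hint at how the encoding works: non-negative codes appear to index into ``sc2ts.IUPAC_ALLELES`` and ``-1`` marks missing data (the tests further down check exactly this). A minimal sketch along those lines, with a hypothetical store path:

```python
# Hedged sketch, not part of this commit: how the integer encoding appears
# to work, based on the names used in the docs and tests in this commit.
import sc2ts

ds = sc2ts.Dataset("dataset.vcz")        # hypothetical path

a = ds.alignment["SRR11597146"]          # integer encoded alignment
print(sc2ts.IUPAC_ALLELES)               # allele characters; code i appears to mean IUPAC_ALLELES[i]
print(sc2ts.IUPAC_ALLELES.index("-"))    # the code used for gap characters
print(sc2ts.decode_alignment(a)[:10])    # map integer codes back to characters
```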

sc2ts/dataset.py

Lines changed: 19 additions & 5 deletions
```diff
@@ -246,7 +246,7 @@ def __init__(self, path, chunk_cache_size=1, date_field=None):
 
         :param str path: Path to a directory or ``.zip`` Zarr store.
         :param int chunk_cache_size: Maximum number of chunks to cache for
-            haplotypes and metadata. Defaults to 1.
+            alignments and metadata. Defaults to 1.
         :param str date_field: Name of the metadata field to use as the
             sample date, or ``None`` to disable date handling. Defaults
             to ``None``.
@@ -265,10 +265,10 @@ def __init__(self, path, chunk_cache_size=1, date_field=None):
         self.sample_id_map = {
             sample_id: k for k, sample_id in enumerate(self._sample_id)
         }
-        self.haplotypes = CachedHaplotypeMapping(
+        self._alignment = CachedHaplotypeMapping(
             self.root, self.sample_id_map, chunk_cache_size
         )
-        self.metadata = CachedMetadataMapping(
+        self._metadata = CachedMetadataMapping(
             self.root,
             self.sample_id_map,
             date_field,
@@ -284,6 +284,20 @@ def __iter__(self):
     def __len__(self):
         return len(self.root)
 
+    @property
+    def alignment(self):
+        """
+        Efficient mapping of sample ID strings to integer encoded alignment data.
+        """
+        return self._alignment
+
+    @property
+    def metadata(self):
+        """
+        Efficient mapping of sample ID strings to metadata dictionaries.
+        """
+        return self._metadata
+
     @property
     def sample_id(self):
         """
@@ -387,7 +401,7 @@ def write_fasta(self, out, sample_id=None):
             sample_id = self.sample_id
 
         for sid in sample_id:
-            h = self.haplotypes[sid]
+            h = self.alignment[sid]
             a = decode_alignment(h)
             print(f">{sid}", file=out)
             # FIXME this is probably a terrible way to write a large numpy string to
@@ -416,7 +430,7 @@ def copy(
         alignments = {}
         bar = tqdm.tqdm(sample_id, desc="Samples", disable=not show_progress)
         for s in bar:
-            alignments[s] = self.haplotypes[s]
+            alignments[s] = self.alignment[s]
             if len(alignments) == samples_chunk_size:
                 Dataset.append_alignments(path, alignments)
                 alignments = {}
```
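The net effect of these changes is that the dict-like views are now exposed as read-only ``alignment`` and ``metadata`` properties backed by the private cached mappings. A minimal usage sketch (the store path is hypothetical; ``chunk_cache_size``, the property names, and ``write_fasta`` come from the hunks above):

```python
import sys
import sc2ts

# Hypothetical store path; chunk_cache_size is the constructor argument whose
# docstring this commit updates (chunks cached for alignments and metadata).
ds = sc2ts.Dataset("dataset.vcz.zip", chunk_cache_size=4)

a = ds.alignment["SRR11597146"]    # integer encoded alignment for one sample
md = ds.metadata["SRR11597146"]    # metadata dictionary for the same sample

# write_fasta() now reads through the renamed alignment mapping internally.
ds.write_fasta(sys.stdout, sample_id=["SRR11597146"])
```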

sc2ts/inference.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -489,7 +489,7 @@ def preprocess(
     samples = []
     bar = get_progress(strains, progress_title, "preprocess", show_progress)
     for strain in bar:
-        alignment = dataset.haplotypes[strain]
+        alignment = dataset.alignment[strain]
         alignment = _dataset.mask_flanking_deletions(alignment)
         sample = Sample(strain)
         # No padding zero site in the alignment
```
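The same access pattern, restated as a standalone sketch: fetch each strain's alignment through the renamed mapping and mask the flanking deletions before building a sample. This assumes an open ``Dataset`` and that ``mask_flanking_deletions`` lives in the ``sc2ts.dataset`` module (it is called via the ``_dataset`` alias in the hunk above):

```python
# Hedged sketch of the per-strain loop in preprocess(); assumes
# mask_flanking_deletions is importable from sc2ts.dataset.
from sc2ts import dataset as _dataset

def encoded_alignments(ds, strains):
    for strain in strains:
        a = ds.alignment[strain]                    # renamed mapping access
        a = _dataset.mask_flanking_deletions(a)     # mask the flanking deletions
        yield strain, a
```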

tests/sc2ts_fixtures.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -205,8 +205,8 @@ def recombinant_alignments(dataset):
     Generate some recombinant alignments from existing haplotypes
     """
     strains = ["SRR11597188", "SRR11597163"]
-    left_a = dataset.haplotypes[strains[0]]
-    right_a = dataset.haplotypes[strains[1]]
+    left_a = dataset.alignment[strains[0]]
+    right_a = dataset.alignment[strains[1]]
     # Recombine in the middle
     bp = 9_999
     h = left_a.copy()
@@ -243,7 +243,7 @@ def recombinant_example_2(tmp_path, fx_ts_map, fx_dataset, ds_path):
     # Pick a distinct strain to be the root of our two new haplotypes added
     # on the first day.
     root_strain = "SRR11597116"
-    a = fx_dataset.haplotypes[root_strain]
+    a = fx_dataset.alignment[root_strain]
     base_ts = fx_ts_map["2020-02-13"]
     # This sequence has a bunch of Ns at the start, so we have to go inwards
     # from them to make sure we're not masking them out.
@@ -310,7 +310,7 @@ def recombinant_example_3(tmp_path, fx_ts_map, fx_dataset, ds_path):
     # Pick a distinct strain to be the root of our three new haplotypes added
     # on the first day.
     root_strain = "SRR11597116"
-    a = fx_dataset.haplotypes[root_strain]
+    a = fx_dataset.alignment[root_strain]
     base_ts = fx_ts_map["2020-02-13"]
     # This sequence has a bunch of Ns at the start, so we have to go inwards
     # from them to make sure we're not masking them out.
```
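The first fixture builds a synthetic recombinant from two real alignments, but only the setup lines appear in the hunk. The sketch below restates the idea and fills in the combination step, which is an assumption about what follows ``h = left_a.copy()``; it assumes an open ``Dataset`` bound to ``dataset``:

```python
# Hedged sketch: splice two sample alignments at a breakpoint, mirroring the
# fixture setup above. The final assignment is an assumption; the hunk only
# shows the lines up to h = left_a.copy().
left_a = dataset.alignment["SRR11597188"]
right_a = dataset.alignment["SRR11597163"]

bp = 9_999                  # breakpoint used by the fixture
h = left_a.copy()
h[bp:] = right_a[bp:]       # assumed: take the segment right of the breakpoint from the other sample
```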

tests/test_dataset.py

Lines changed: 13 additions & 13 deletions
```diff
@@ -167,8 +167,8 @@ def test_create_zip(self, tmp_path, fx_encoded_alignments, fx_metadata_df):
 
         ds1 = sc2ts.Dataset(path)
         ds2 = sc2ts.Dataset(zip_path)
-        alignments1 = dict(ds1.haplotypes)
-        alignments2 = dict(ds2.haplotypes)
+        alignments1 = dict(ds1.alignment)
+        alignments2 = dict(ds2.alignment)
         assert alignments1.keys() == alignments2.keys()
         for k in alignments1.keys():
             nt.assert_array_equal(alignments1[k], alignments2[k])
@@ -388,9 +388,9 @@ def test_import(self, tmp_path, fx_encoded_alignments_mafft):
         sc2ts.Dataset.new(path)
         sc2ts.Dataset.append_alignments(path, fx_encoded_alignments_mafft)
         ds = sc2ts.Dataset(path)
-        assert len(ds.haplotypes) == 19
+        assert len(ds.alignment) == 19
         for k, v in fx_encoded_alignments_mafft.items():
-            h = ds.haplotypes[k]
+            h = ds.alignment[k]
             nt.assert_array_equal(v, h)
             # The flanks are marked as deletions
             assert h[0] == 4
@@ -400,29 +400,29 @@
 class TestDatasetAlignments:
 
     def test_fetch_known(self, fx_dataset):
-        a = fx_dataset.haplotypes["SRR11772659"]
+        a = fx_dataset.alignment["SRR11772659"]
         assert a.shape == (sc2ts.REFERENCE_SEQUENCE_LENGTH - 1,)
         assert a[0] == -1
         assert a[-1] == -1
 
     def test_compare_fasta(self, fx_dataset, fx_alignments_fasta):
         fr = data_import.FastaReader(fx_alignments_fasta)
         for k, a1 in fr.items():
-            h = fx_dataset.haplotypes[k]
+            h = fx_dataset.alignment[k]
             a2 = sc2ts.decode_alignment(h)
             nt.assert_array_equal(a1[1:], a2)
 
     def test_len(self, fx_dataset):
-        assert len(fx_dataset.haplotypes) == 55
+        assert len(fx_dataset.alignment) == 55
 
     def test_keys(self, fx_dataset):
-        keys = list(fx_dataset.haplotypes.keys())
-        assert len(keys) == len(fx_dataset.haplotypes)
+        keys = list(fx_dataset.alignment.keys())
+        assert len(keys) == len(fx_dataset.alignment)
         assert "SRR11772659" in keys
 
     def test_in(self, fx_dataset):
-        assert "SRR11772659" in fx_dataset.haplotypes
-        assert "NOT_IN_STORE" not in fx_dataset.haplotypes
+        assert "SRR11772659" in fx_dataset.alignment
+        assert "NOT_IN_STORE" not in fx_dataset.alignment
 
     @pytest.mark.parametrize(
         ["chunk_size", "cache_size"],
@@ -445,7 +445,7 @@ def test_chunk_size_cache_size(
         sc2ts.Dataset.add_metadata(path, fx_metadata_df)
         ds = sc2ts.Dataset(path, chunk_cache_size=cache_size)
         for k, v in fx_encoded_alignments.items():
-            nt.assert_array_equal(v, ds.haplotypes[k])
+            nt.assert_array_equal(v, ds.alignment[k])
 
 
 class TestDatasetMetadata:
@@ -454,7 +454,7 @@ def test_len(self, fx_dataset):
         assert len(fx_dataset.metadata) == 55
 
     def test_keys(self, fx_dataset):
-        assert fx_dataset.metadata.keys() == fx_dataset.haplotypes.keys()
+        assert fx_dataset.metadata.keys() == fx_dataset.alignment.keys()
 
     def test_known(self, fx_dataset):
         d = fx_dataset.metadata["SRR11772659"]
```
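Taken together, these tests pin down the mapping protocol that both properties expose: membership by sample ID, ``len``, ``keys()``, and ``dict()`` conversion, with ``metadata`` and ``alignment`` sharing the same key set. A compact restatement, assuming an open dataset with a hypothetical path:

```python
# Hedged sketch of the dict-like behaviour exercised in the tests above.
import sc2ts

ds = sc2ts.Dataset("dataset.vcz")                   # hypothetical path

assert "SRR11772659" in ds.alignment                # membership by sample ID
assert len(ds.alignment) == len(ds.metadata)        # one entry per sample
assert ds.metadata.keys() == ds.alignment.keys()    # identical key sets
alignments = dict(ds.alignment)                     # materialise all alignments at once
```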

tests/test_inference.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -597,7 +597,7 @@ def test_2020_02_02_missing_sample(
         strain,
         num_missing,
     ):
-        a = fx_dataset.haplotypes[strain]
+        a = fx_dataset.alignment[strain]
         a = sc2ts.mask_ambiguous(a)
 
         missing_positions = np.where(a == -1)[0] + 1
@@ -986,7 +986,7 @@ def test_exact_match(self, tmp_path, fx_ts_map, fx_dataset):
         strains = ["SRR11597218", "ERR4204459"]
         fake_strains = ["fake" + s for s in strains]
         alignments = {
-            name: fx_dataset.haplotypes[s] for name, s in zip(fake_strains, strains)
+            name: fx_dataset.alignment[s] for name, s in zip(fake_strains, strains)
         }
         date = "2020-03-01"
         ds = sc2ts.dataset.tmp_dataset(tmp_path / "tmp.zarr", alignments, date=date)
@@ -1110,7 +1110,7 @@ def test_recombinant_example_2(self, fx_ts_map, fx_recombinant_example_2):
     def test_all_As(self, tmp_path, fx_ts_map, fx_dataset):
         # Same as the recombinant_example_1() function above
         # Just to get something that looks like an alignment easily
-        a = fx_dataset.haplotypes["SRR11597188"]
+        a = fx_dataset.alignment["SRR11597188"]
         a[1:] = 0
         alignments = {"crazytype": a}
         date = "2020-03-01"
```
