Commit 25cf173

Merge pull request #582 from tskit-dev/more-docs-3
More docs 3
2 parents c56546c + 606d278 commit 25cf173

6 files changed: +113 −37 lines


docs/alignments_analysis.md

Lines changed: 70 additions & 2 deletions

````diff
@@ -189,8 +189,76 @@ the alignment data by sample **and** by site. The best way to access data by site
 is to use the {meth}`Dataset.variants` method.
 
 :::{note}
-The {meth}`Dataset.variants` method is deliberately designed to mirror the API
+The {meth}`Dataset.variants` method is deliberately designed to follow the semantics
 of the corresponding [tskit](https://tskit.dev) function
-({meth}`tskit.TreeSequence.variants`).
+({meth}`tskit.TreeSequence.variants`), enabling straightforward joint analysis of the
+ARG and alignments.
 :::
 
+Here we use this interface to count the number of samples that carry the gap character
+at each site along the genome:
+
+```{code-cell}
+GAP = sc2ts.IUPAC_ALLELES.index("-")
+
+gap_count = np.zeros(ds.num_variants)
+for j, var in enumerate(ds.variants()):
+    gap_count[j] = np.sum(var.genotypes == GAP)
+
+gap_count
+```
+
+Here, we can see that all 1000 samples in our small subset have flanking deletions called.
+
+We can use the ``position`` argument to supply a list of (**one-based**) site positions
+of interest:
+
+```{code-cell}
+spike_pos = np.arange(21_563, 25_385)
+gap_count = np.zeros_like(spike_pos)
+for j, var in enumerate(ds.variants(position=spike_pos)):
+    gap_count[j] = np.sum(var.genotypes == GAP)
+
+gap_count
+```
+
+We can also use the ``sample_id`` argument to specify subsets of samples.
+
+## Bulk metadata analysis
+
+Accessing the metadata row-by-row using the ``.metadata`` mapping above is
+inefficient when we want to look at large numbers of samples. In this case,
+it is much more convenient to export the metadata to a Pandas dataframe
+using the {meth}`Dataset.metadata_dataframe` method and then work with this.
+
+```{code-cell}
+df = ds.metadata_dataframe()
+df
+```
+
+Then, suppose we want to find all samples from the USA:
+
+```{code-cell}
+usa_samples = df[df["Country"] == "USA"].index
+usa_samples
+```
+
+:::{important}
+For performance reasons it's a good idea to use the ``fields`` parameter to
+{meth}`Dataset.metadata_dataframe` to limit the amount of metadata decoded
+to what you actually need.
+:::
+
+
+## Getting FASTA output
+
+Getting FASTA output is straightforward using the {meth}`Dataset.write_fasta`
+method. Here, we use the ``sample_id`` argument to write the FASTA alignments
+of the USA samples found in the last example:
+
+```{code-cell}
+with open("/tmp/usa.fa", "w") as f:
+    ds.write_fasta(f, sample_id=usa_samples)
+```
+
+
````
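The per-site counting loop in the docs above depends on the sc2ts ``Dataset`` object; the same pattern can be sketched with plain NumPy on a synthetic genotype matrix (the ``genotypes`` array and its values are made up for illustration):

```python
import numpy as np

# The sc2ts allele-integer encoding (the value of sc2ts.IUPAC_ALLELES);
# the gap character "-" encodes to integer 4.
IUPAC_ALLELES = "ACGT-RYSWKMBDHV."
GAP = IUPAC_ALLELES.index("-")

# Synthetic stand-in for iterating ds.variants(): rows are sites,
# columns are samples, values are integer-encoded alleles.
genotypes = np.array(
    [
        [0, 4, 4],  # A, -, -
        [1, 1, 4],  # C, C, -
        [4, 4, 4],  # -, -, -
    ]
)

# Count how many samples carry a gap at each site (one value per row).
gap_count = (genotypes == GAP).sum(axis=1)
print(gap_count)  # [2 1 3]
```

Vectorising with a boolean comparison and a row-wise sum gives the same result as the per-variant loop, one count per site.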

docs/api.md

Lines changed: 5 additions & 18 deletions

````diff
@@ -8,9 +8,6 @@ Inference is driven via the command line interface (see the
 listed here are intended for working with tree sequences and datasets
 that have already been generated.
 
-The reference documentation is concise and exhaustive; for higher level
-discussion and worked examples, see the project README and example
-notebooks.
 
 ```{eval-rst}
 .. currentmodule:: sc2ts
@@ -30,7 +27,7 @@ notebooks.
 .. autofunction:: mutation_data
 ```
 
-## Dataset access
+## Alignment and metadata analysis
 
 ```{eval-rst}
 .. autosummary::
@@ -44,6 +41,9 @@ notebooks.
 .. autoclass:: Dataset
    :members:
 
+.. autoclass:: Variant
+   :members:
+
 .. autofunction:: decode_alleles
 
 .. autofunction:: mask_ambiguous
@@ -55,25 +55,12 @@ notebooks.
 
 ```{eval-rst}
 .. autosummary::
-   REFERENCE_STRAIN
-   REFERENCE_DATE
-   REFERENCE_GENBANK
-   REFERENCE_SEQUENCE_LENGTH
-   IUPAC_ALLELES
    decode_flags
   flags_summary
 ```
 
 ```{eval-rst}
-.. autodata:: REFERENCE_STRAIN
-
-.. autodata:: REFERENCE_DATE
-
-.. autodata:: REFERENCE_GENBANK
-
-.. autodata:: REFERENCE_SEQUENCE_LENGTH
-
-.. autodata:: IUPAC_ALLELES
+.. data:: IUPAC_ALLELES
 
 .. autofunction:: decode_flags
````
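The last hunk swaps ``autodata`` (which imports the module and pulls in the docstring automatically) for a hand-written ``data`` directive. As a sketch, assuming Sphinx's Python domain, the manual form carries its description inline:

```rst
.. currentmodule:: sc2ts

.. data:: IUPAC_ALLELES

   The allele-integer encoding used by sc2ts.
```

This avoids importing the package at documentation build time for that one constant.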

sc2ts/__init__.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -2,5 +2,5 @@
 
 # star imports are fine here as it's just a bunch of constants
 from .core import *
-from .dataset import mask_ambiguous, mask_flanking_deletions, decode_alleles, Dataset
+from .dataset import mask_ambiguous, mask_flanking_deletions, decode_alleles, Dataset, Variant
 from .stats import node_data, mutation_data
```

sc2ts/core.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -22,7 +22,11 @@
 # NOTE!! This string is also used in the jit module where it's
 # hard-coded into a numba function, so if this ever changes
 # it needs to be updated there also!
+
 IUPAC_ALLELES = "ACGT-RYSWKMBDHV."
+"""
+The allele-integer encoding used by sc2ts.
+"""
 
 NODE_IS_MUTATION_OVERLAP = 1 << 21
 NODE_IS_REVERSION_PUSH = 1 << 22
```
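The docstring added above documents ``IUPAC_ALLELES`` as the allele-integer encoding: an allele's integer code is its index in the string. A minimal round-trip sketch (the ``encode``/``decode`` helpers are illustrative, not part of sc2ts):

```python
import numpy as np

# Copy of the constant from sc2ts/core.py above.
IUPAC_ALLELES = "ACGT-RYSWKMBDHV."

def encode(seq):
    # Map each allele character to its index in IUPAC_ALLELES.
    lookup = {c: i for i, c in enumerate(IUPAC_ALLELES)}
    return np.array([lookup[c] for c in seq], dtype=np.int8)

def decode(genotypes):
    # Map integer codes back to allele characters.
    alleles = np.array(list(IUPAC_ALLELES))
    return "".join(alleles[genotypes])

codes = encode("ACGT-R.")
print(codes.tolist())  # [0, 1, 2, 3, 4, 5, 15]
print(decode(codes))   # ACGT-R.
```

The index-based scheme is why the docs example can write ``sc2ts.IUPAC_ALLELES.index("-")`` to get the integer code for a gap.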

sc2ts/dataset.py

Lines changed: 25 additions & 16 deletions

```diff
@@ -232,29 +232,27 @@ def field_descriptors(self):
 
 @dataclasses.dataclass
 class Variant:
+    """
+    Represents a single variant, including the genomic position and the integer encoded
+    genotypes.
+    """
     position: int
     genotypes: np.ndarray
     alleles: list
 
 
 class Dataset(collections.abc.Mapping):
+    """
+    Open a sc2ts VCF Zarr dataset for convenient access to alignments and metadata.
 
+    The dataset is opened read-only from ``path``, which may be either a
+    directory store or a consolidated ``.zip`` file.
+
+    :param str path: Path to a directory or ``.zip`` Zarr store.
+    :param int chunk_cache_size: Maximum number of chunks to cache for
+        alignments and metadata. Defaults to 1.
+    """
     def __init__(self, path, chunk_cache_size=1, date_field=None):
-        """
-        Open a sc2ts VCF Zarr dataset for convenient access to alignments and metadata.
-
-        The dataset is opened read-only from ``path``, which may be either a
-        directory store or a consolidated ``.zip`` file. The ``date_field``
-        argument specifies which metadata field should be interpreted as the
-        sample date when constructing :attr:`metadata`.
-
-        :param str path: Path to a directory or ``.zip`` Zarr store.
-        :param int chunk_cache_size: Maximum number of chunks to cache for
-            alignments and metadata. Defaults to 1.
-        :param str date_field: Name of the metadata field to use as the
-            sample date, or ``None`` to disable date handling. Defaults
-            to ``None``.
-        """
         logger.info(f"Loading dataset @{path} using {date_field} as date field")
         self.date_field = date_field
         self.path = pathlib.Path(path)
@@ -310,6 +308,17 @@ def metadata(self):
         """
         return self._metadata
 
+    def metadata_dataframe(self, fields=None):
+        """
+        Returns the metadata in this dataset as a Pandas dataframe,
+        indexed by sample_id.
+
+        :param fields: List of metadata fields to include; if ``None``,
+            all fields are used.
+        :return: Pandas dataframe
+        """
+        return self.metadata.as_dataframe(fields)
+
     @property
     def sample_id(self):
         """
@@ -356,7 +365,7 @@ def variants(self, sample_id=None, position=None):
         """
         Iterate over variants at the specified positions for the given samples.
 
-        Yields Variant objects containing the genomic position, encoded
+        Yields :class:`.Variant` objects containing the genomic position, encoded
         genotypes, and allele labels for each requested site.
 
         :param sample_id: Iterable of sample IDs to include; if ``None``,
```
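The ``Variant`` dataclass above is a plain container; how ``Dataset.variants`` populates it can be sketched with an in-memory generator (the ``variants`` function and the example data here are hypothetical stand-ins, not the real implementation, which streams genotypes from Zarr chunks):

```python
import dataclasses
import numpy as np

@dataclasses.dataclass
class Variant:
    # Same fields as the dataclass committed in sc2ts/dataset.py.
    position: int
    genotypes: np.ndarray
    alleles: list

def variants(positions, genotype_matrix, alleles):
    # Yield one Variant per site, row by row from a genotype matrix.
    for pos, g in zip(positions, genotype_matrix):
        yield Variant(position=pos, genotypes=g, alleles=alleles)

ALLELES = list("ACGT-RYSWKMBDHV.")
G = np.array([[0, 0, 4], [2, 3, 3]])
for var in variants([21_563, 21_564], G, ALLELES):
    print(var.position, var.genotypes.tolist())
```

A frozen dataclass of (position, genotypes, alleles) is deliberately shaped like tskit's own ``Variant``, so downstream code can treat the two uniformly.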

tests/test_dataset.py

Lines changed: 8 additions & 0 deletions

```diff
@@ -507,6 +507,14 @@ def test_samples_for_date(self, fx_dataset):
         samples = fx_dataset.metadata.samples_for_date("2020-01-19")
         assert samples == ["SRR11772659"]
 
+    def test_metadata_dataframe(self, fx_dataset):
+        df1 = fx_dataset.metadata.as_dataframe()
+        df2 = fx_dataset.metadata_dataframe()
+        assert df1.shape[0] == df2.shape[0]
+        for col, data1 in df2.items():
+            data2 = df1[col]
+            nt.assert_array_equal(data1.to_numpy(), data2.to_numpy())
+
     def test_as_dataframe(self, fx_dataset, fx_metadata_df):
         df1 = fx_dataset.metadata.as_dataframe()
         df2 = fx_metadata_df.loc[df1.index]
```

(Note: inside the loop, each column of ``df2`` must be compared against the corresponding column of ``df1``; comparing ``df2`` against itself would make the test vacuous.)