Merge pull request #586 from tskit-dev/final-docs-fixes-pre1

jeromekelleher · web-flow · commit e0ee253829dc · 2025-11-23T14:42:21.000Z
Final docs fixes pre1
diff --git a/docs/cli.md b/docs/cli.md
@@ -33,19 +33,84 @@ following order:
 
 ## CLI reference
 
-<!-- Below we list all subcommands and options provided by the CLI. This -->
-<!-- output is generated directly from the Click definitions in -->
-<!-- ``sc2ts.cli`` using the ``sphinx-click`` extension, and so stays in -->
-<!-- sync with the implementation. -->
+% A note on cross references... There's some weird long-standing problem with
+% cross referencing program values in Sphinx, which means that we can't use
+% the built-in labels generated by sphinx-click. We can make our own explicit
+% targets, but these have to have slightly weird names to avoid conflicting
+% with what sphinx-click is doing. Hence the cmd- prefix.
+% Based on: https://github.com/skypilot-org/skypilot/pull/2834
 
-:::{todo}
-Add the sphinx-click output here somehow.
-:::
+### Data import
+
+```{eval-rst}
+.. _cmd-sc2ts-import-alignments:
+.. click:: sc2ts.cli:import_alignments
+   :prog: sc2ts import-alignments
+```
+
+```{eval-rst}
+.. _cmd-sc2ts-import-metadata:
+.. click:: sc2ts.cli:import_metadata
+   :prog: sc2ts import-metadata
+```
+
+### Inference
+
+```{eval-rst}
+.. _cmd-sc2ts-infer:
+.. click:: sc2ts.cli:infer
+   :prog: sc2ts infer
+```
+
+### Inspection
+
+```{eval-rst}
+.. _cmd-sc2ts-info-dataset:
+.. click:: sc2ts.cli:info_dataset
+   :prog: sc2ts info-dataset
+```
+
+```{eval-rst}
+.. _cmd-sc2ts-info-matches:
+.. click:: sc2ts.cli:info_matches
+   :prog: sc2ts info-matches
+```
+
+### Postprocessing
+
+```{eval-rst}
+.. _cmd-sc2ts-postprocess:
+.. click:: sc2ts.cli:postprocess
+   :prog: sc2ts postprocess
+```
+
+```{eval-rst}
+.. _cmd-sc2ts-map-parsimony:
+.. click:: sc2ts.cli:map_parsimony
+   :prog: sc2ts map-parsimony
+```
+
+```{eval-rst}
+.. _cmd-sc2ts-minimise-metadata:
+.. click:: sc2ts.cli:minimise_metadata
+   :prog: sc2ts minimise-metadata
+```
+
+### Miscellaneous
+
+% For some reason this one isn't working. Not worth worrying about.
 
 <!-- ```{eval-rst} -->
-<!-- .. click:: sc2ts.cli:cli -->
-<!--    :prog: sc2ts infer -->
-<!--    :nested: full -->
+<!-- .. _cmd-sc2ts-validate: -->
+<!-- .. click:: sc2ts.cli:validate -->
+<!--    :prog: sc2ts validate -->
 <!-- ``` -->
 
 
+```{eval-rst}
+.. _cmd-sc2ts-run-hmm:
+.. click:: sc2ts.cli:run_hmm
+   :prog: sc2ts run-hmm
+```
+
+
diff --git a/docs/example_config.toml b/docs/example_config.toml
@@ -1,6 +1,13 @@
+
+# This is a path to the dataset, in VCZ format.
 dataset="viridian_mafft_2024-10-14_v1.vcz.zip"
+# The metadata field used for dates. For the Viridian dataset, this is 
+# "Date_tree" (which means, "date used to partition samples when building 
+# the Viridian tree")
 date_field="Date_tree"
 
+# The run_id is a prefix added to all output files. This is useful when 
+# running lots of different parameter combinations.
 run_id="ex1"
 # Configure where the result files are stored. For simplicity
 # we put them all in the "example_inference" directory.
@@ -13,8 +20,9 @@ matches_dir= "example_inference/"
 # This is full debug output, which is verbose (but useful!)
 log_level = 2
 
-# Dates to exclude from inference. This one is a large outlier in terms of the 
-# numbers of samples, and enriched for incorrectly assigned dates.
+# Dates to exclude from inference. This one is a large outlier in the 
+# Viridian data in terms of the numbers of samples, and enriched for 
+# incorrectly assigned dates.
 exclude_dates = ["2020-12-31"]
 
 # The set of site positions to mask during inference (list of integers).
@@ -23,24 +31,49 @@ exclude_dates = ["2020-12-31"]
 exclude_sites = []
 
 [extend_parameters]
+# The recombination penalty "k" parameter
 num_mismatches=4
+# Any samples with a HMM cost <= this value are included in the ARG
 hmm_cost_threshold=7
+# The maximum number of missing sites for a sample to be considered
 max_missing_sites=500
+# Do we mask deletions as missing data?
 deletions_as_missing=true
+# The maximum number of samples to consider, per day
 # max_daily_samples=1000
 
-# Knobs for tuning retro group insertion
+## Various knobs for tuning retro group insertion:
+
+# The minimum number of samples in a retro group
 min_group_size=10
+# The minimum number of mutations shared by all samples
 min_root_mutations=2
+# The maximum number of recurrent mutations in the group tree
 max_recurrent_mutations=2
+# The maximum number of mutations per sample, overall
 max_mutations_per_sample=5
+# The size of the window in which to consider samples for retrospective
+# inclusion, in days.
 retrospective_window=7
 
+## Performance parameters.
+
+# The number of matching threads to use. -1 means use all available cores.
+# Note that this will likely not make much difference until large numbers
+# of samples per day are involved.
 num_threads=-1
+# An approximate ceiling on the total amount of memory used (in GiB) by HMM
+# matching. Once the memory used goes above this value, new HMM match jobs are
+# held back until it goes under it again. If many memory intensive match jobs
+# are run at once however, this will not prevent them from exceeding this
+# limit.
 memory_limit=32
 
+# A list of sample IDs (strings) for unconditional inclusion (e.g., to 
+# help seed major saltation events).
 include_samples=[]
 
+# Override specific parameter values over a time period.
 [[override]]
 start = "2020-01-01"
 stop = "2020-03-01"
diff --git a/docs/inference.md b/docs/inference.md
@@ -8,7 +8,7 @@ on a local machine using an example config file, using the Viridian data downloa
 from Zenodo.
 
 Inference is performed using the CLI, which is composed of number of subcommands.
-See {ref}`sc2ts_sec_cli` section for more information
+See the {ref}`sc2ts_sec_cli` section for more information.
 
 ## Prerequisites
 
@@ -94,9 +94,17 @@ debugging metadata included (see the section on the Debug utilities below)
 Primary inference can be stopped and picked up again at any point using
 the ``--start`` option.
 
-:::{todo}
-Add documentation for the toml config file
-:::
+<!-- :::{todo} -->
+<!-- Add documentation for the toml config file -->
+<!-- ::: -->
+### Config file format
+
+All parameters for primary inference are specified using the [toml](https://toml.io/en/)
+config file. These are documented in the example config file used here:
+
+```{literalinclude} example_config.toml
+:language: toml
+```
 
 ## Postprocessing
 
diff --git a/docs/intro.md b/docs/intro.md
@@ -11,12 +11,20 @@ It consists of:
 3. A lightweight wrapper around [Zarr](https://zarr.dev) for convenient access to the
    Viridian dataset (alignments and metadata) in VCF Zarr format.
 
-The underlying methods are described in the sc2ts [preprint](
+The methods are described in the sc2ts [preprint](
 <https://www.biorxiv.org/content/10.1101/2023.06.08.544212v2>).
 
-Most users will use the {ref}`sec_python_api` to perform {ref}`sec_arg_analysis`
-on the sc2ts inferred ARG or {ref}`sec_alignments_analysis` on the
-Zarr-formatted Viridian dataset distributed on Zenodo.
 
-Uses who wish to perform {ref}`sec_inference` use the
-{ref}`sc2ts_sec_cli`.
+## Quickstart
+
+- See the {ref}`sec_inference` section for an example of running
+primary inference using the {ref}`sc2ts_sec_cli`.
+
+- See the {ref}`sec_arg_analysis` section for examples of using the
+{ref}`sec_python_api` to analyse the sc2ts Viridian ARG.
+
+- See the {ref}`sec_alignments_analysis` section for examples
+of using the {ref}`sec_python_api` to analyse the Viridian
+alignments and metadata in
+[VCF Zarr format](https://doi.org/10.1093/gigascience/giaf049).
+
diff --git a/pyproject.toml b/pyproject.toml
@@ -59,6 +59,13 @@ docs = [
   "sphinx-argparse==0.5.2",
   "sphinx-issues==5.0.1",
   "IPython",
+  # Building the docs requires running the CLI, which means we need the full
+  # inference requirements also
+  "scipy",
+  "biotite",
+  "tsinfer>=0.5",
+  "pyfaidx",
+  "numba",
 ]
 
 [build-system]
diff --git a/sc2ts/cli.py b/sc2ts/cli.py
@@ -16,7 +16,6 @@
 import tqdm
 import tskit
 import tszip
-import tsinfer
 import click
 import humanize
 import pandas as pd
@@ -130,7 +129,7 @@ def setup_logging(verbosity, log_file=None, date=None):
     is_flag=True,
     flag_value=True,
     help=(
-        "If true, initialise a new dataset. WARNING! This will erase and existing "
+        "If true, initialise a new dataset. WARNING! This will erase an existing "
         "store"
     ),
 )