RacimoLab
diff --git a/‎dinf/__init__.py‎
Lines changed: 23 additions & 62 deletions b/‎dinf/__init__.py‎
Lines changed: 23 additions & 62 deletions
diff --git a/‎dinf/dinf.py‎
Lines changed: 12 additions & 9 deletions b/‎dinf/dinf.py‎
Lines changed: 12 additions & 9 deletions
diff --git a/‎dinf/dinf_model.py‎
Lines changed: 21 additions & 7 deletions b/‎dinf/dinf_model.py‎
Lines changed: 21 additions & 7 deletions
diff --git a/‎dinf/misc.py‎
Lines changed: 11 additions & 5 deletions b/‎dinf/misc.py‎
Lines changed: 11 additions & 5 deletions
diff --git a/‎dinf/parameters.py‎
Lines changed: 10 additions & 2 deletions b/‎dinf/parameters.py‎
Lines changed: 10 additions & 2 deletions
diff --git a/‎dinf/plot.py‎
Lines changed: 3 additions & 4 deletions b/‎dinf/plot.py‎
Lines changed: 3 additions & 4 deletions
@@ -6,23 +6,28 @@
 except ImportError:
     pass
 
-import os
-
-if "TF_CPP_MIN_LOG_LEVEL" not in os.environ:
-    # Mute tensorflow/xla.
-    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
-
-if "KMP_AFFINITY" not in os.environ:
-    # Pin threads to cpus. This can improve blas performance.
-    os.environ["KMP_AFFINITY"] = "granularity=fine,noverbose,compact,1,0"
-
-# https://jax.readthedocs.io/en/latest/gpu_memory_allocation.html
-# if "XLA_PYTHON_CLIENT_PREALLOCATE" not in os.environ:
-#    os.environ["XLA_PYTHON_CLIENT_PREALLOCATE"] = "false"
-# if "XLA_PYTHON_CLIENT_ALLOCATOR" not in os.environ:
-#    os.environ["XLA_PYTHON_CLIENT_ALLOCATOR"] = "platform"
-
-
+from .misc import ts_individuals
+from .store import Store as Store
+from .parameters import Param, Parameters
+from .dinf_model import DinfModel
+from .vcf import (
+    BagOfVcf,
+    get_contig_lengths,
+    get_samples_from_1kgp_metadata,
+)
+from .feature_extractor import (
+    HaplotypeMatrix,
+    MultipleHaplotypeMatrices,
+    BinnedHaplotypeMatrix,
+    MultipleBinnedHaplotypeMatrices,
+)
+from .discriminator import (
+    Discriminator,
+    Surrogate,
+    ExchangeableCNN,
+    ExchangeablePGGAN,
+    Symmetric,
+)
 from .dinf import (
     abc_gan,
     alfi_mcmc_gan,
@@ -32,49 +37,5 @@
     train,
     save_results,
     load_results,
+    sample_smooth,
 )
-from .discriminator import (
-    Discriminator,
-    Surrogate,
-    ExchangeableCNN,
-    ExchangeablePGGAN,
-    Symmetric,
-)
-from .feature_extractor import (
-    HaplotypeMatrix,
-    MultipleHaplotypeMatrices,
-    BinnedHaplotypeMatrix,
-    MultipleBinnedHaplotypeMatrices,
-)
-from .dinf_model import DinfModel
-from .parameters import Param, Parameters
-from .store import Store
-from .vcf import BagOfVcf, get_contig_lengths, get_samples_from_1kgp_metadata
-
-__all__ = [
-    "__version__",
-    "BagOfVcf",
-    "BinnedHaplotypeMatrix",
-    "Discriminator",
-    "ExchangeableCNN",
-    "ExchangeablePGGAN",
-    "MultipleBinnedHaplotypeMatrices",
-    "MultipleHaplotypeMatrices",
-    "DinfModel",
-    "HaplotypeMatrix",
-    "Param",
-    "Parameters",
-    "Surrogate",
-    "Store",
-    "Symmetric",
-    "get_contig_lengths",
-    "get_samples_from_1kgp_metadata",
-    "abc_gan",
-    "alfi_mcmc_gan",
-    "mcmc_gan",
-    "load_results",
-    "pg_gan",
-    "predict",
-    "save_results",
-    "train",
-]
@@ -51,7 +51,7 @@ def _process_pool_init(parallelism, dinf_model):
     _pool = ctx.Pool(
         processes=parallelism,
         initializer=_initializer,
-        initargs=(dinf_model._filename,),
+        initargs=(dinf_model.filename,),
     )
 
 
@@ -408,7 +408,7 @@ def _train_discriminator(
     )
     train_x, train_y, train_x_generator = _generate_training_data(
         target=dinf_model.target_func,
-        generator=dinf_model.generator_func,
+        generator=dinf_model.generator_func_v,
         thetas=training_thetas,
         parallelism=parallelism,
         ss=ss_train,
@@ -418,7 +418,7 @@ def _train_discriminator(
     if test_thetas is not None and len(test_thetas) > 0:
         val_x, val_y, val_x_generator = _generate_training_data(
             target=dinf_model.target_func,
-            generator=dinf_model.generator_func,
+            generator=dinf_model.generator_func_v,
             thetas=test_thetas,
             parallelism=parallelism,
             ss=ss_val,
@@ -593,7 +593,7 @@ def predict(
             replicates, rng=np.random.default_rng(ss_thetas)
         )
         x = _generate_data(
-            generator=dinf_model.generator_func,
+            generator=dinf_model.generator_func_v,
             thetas=thetas,
             parallelism=parallelism,
             rng=np.random.default_rng(ss_generator),
@@ -734,7 +734,7 @@ def mcmc_gan(
     log_prob_func = functools.partial(
         _log_prob,
         discriminator=discriminator,
-        generator=dinf_model.generator_func,
+        generator=dinf_model.generator_func_v,
         parameters=parameters,
         parallelism=parallelism,
         num_replicates=Dx_replicates,
@@ -815,10 +815,10 @@ def sample_smooth(
     """
     Sample from a smoothed set of weighted observations.
 
-    Samples are drawn from the thetas, weighted by their probability.
+    Samples are drawn from ``thetas``, weighted by their probability.
     New points are drawn within a neighbourhood of the sampled thetas
     using a mulivariate normal whose covariance is calculated from the
-    thetas. This is effectively sampling from a Gaussian KDE, but
+    thetas. This is equivalent to sampling from a Gaussian KDE, but
     avoids doing an explicit density estimation.
     Scott's rule of thumb is used for bandwidth selection.
 
@@ -843,11 +843,14 @@ def sample_smooth(
          * "transform": thetas are transformed before sampling, and
            the sampled values are inverse-transformed before being
            returned.
+           See :meth:`Parameters.transform` and :meth:`Parameters.itransform`.
          * "truncate": sampled values are truncated at the parameter limits.
+           See :meth:`Parameters.truncate`.
          * "reflect": sample values that are out of bounds are reflected
            inside the parameter limits by the same magnitude that they were
            out of bounds. Values that are too far out of bounds to be
            reflected are truncated at the parameter limits.
+           See :meth:`Parameters.reflect`.
 
     :return:
         The sampled values.
@@ -1172,7 +1175,7 @@ def pretraining_dinf(
     lp = _log_prob(
         thetas,
         discriminator=discriminator,
-        generator=dinf_model.generator_func,
+        generator=dinf_model.generator_func_v,
         parameters=parameters,
         num_replicates=1,
         parallelism=parallelism,
@@ -1476,7 +1479,7 @@ def pg_gan(
         lp = _log_prob(
             proposal_thetas,
             discriminator=discriminator,
-            generator=dinf_model.generator_func,
+            generator=dinf_model.generator_func_v,
             parameters=parameters,
             num_replicates=Dx_replicates,
             parallelism=parallelism,
 
@@ -166,6 +166,14 @@ def generator_func3(
     Function that simulates features using concrete parameter values.
     """
 
+    generator_func_v: Callable = dataclasses.field(init=False)
+    """
+    Wrapper for ``generator_func`` that accepts a single argument containing
+    the seed and a vector of parameter values (as opposed to keyword arguments).
+    The signature is ``generator_func_v(a: Tuple[int, np.ndarray])``,
+    where the argument is a 2-tuple of ``(seed, vector)``.
+    """
+
     target_func: Callable | None
     """
     Function that samples features from the target distribution.
@@ -182,6 +190,12 @@ def generator_func3(
     A :doc:`flax <flax:index>` neural network. May be ``None``.
     """
 
+    filename: pathlib.Path | None = dataclasses.field(init=False, default=None)
+    """
+    Path to the file from which the model was loaded (if any).
+    May be ``None``.
+    """
+
     def __post_init__(self):
         if len(self.parameters) == 0:
             raise ValueError("Must define one or more parameters")
@@ -203,12 +217,12 @@ def __post_init__(self):
         # Transform generator_func from a function accepting arbitrary kwargs
         # (which limits user error) into a function accepting a sequence of
         # args (which is easier to pass to the mcmc).
-        f = self.generator_func
-        self.generator_func = functools.update_wrapper(
-            functools.partial(_sim_shim, func=f, keys=tuple(self.parameters)), f
+        self.generator_func_v = functools.update_wrapper(
+            functools.partial(
+                _sim_shim, func=self.generator_func, keys=tuple(self.parameters)
+            ),
+            self.generator_func,
         )
-        self._orig_generator_func = f
-        self._filename = None
 
     def check(self, seed=None):
         """
@@ -229,7 +243,7 @@ def check(self, seed=None):
                 f"{thetas.shape}, expected shape {(5, len(self.parameters))}."
             )
 
-        x_g = self.generator_func((rng.integers(low=0, high=2**31), thetas[0]))
+        x_g = self.generator_func_v((rng.integers(low=0, high=2**31), thetas[0]))
         if not tree_equal(tree_shape(x_g), self.feature_shape):
             raise ValueError(
                 f"generator_func produced feature shape {tree_shape(x_g)}, "
@@ -269,5 +283,5 @@ def from_file(filename: str | pathlib.Path) -> DinfModel:
             raise AttributeError(f"{filename}: variable 'dinf_model' not found")
         if not isinstance(dinf_model, DinfModel):
             raise TypeError(f"{filename}: dinf_model is not a dinf.DinfModel object")
-        dinf_model._filename = filename
+        dinf_model.filename = pathlib.Path(filename)
         return dinf_model
@@ -14,7 +14,9 @@
 
 
 def ts_individuals(
-    ts: tskit.TreeSequence, population: str | int | None = None
+    ts: tskit.TreeSequence,
+    /,
+    population: str | int | None = None,
 ) -> npt.NDArray[np.integer]:
     """
     Get the individuals corresponding to the tree sequence's samples.
@@ -41,7 +43,9 @@ def ts_individuals(
 
 
 def ts_nodes_of_individuals(
-    ts: tskit.TreeSequence, individuals: npt.NDArray[np.integer]
+    ts: tskit.TreeSequence,
+    /,
+    individuals: npt.NDArray[np.integer],
 ) -> npt.NDArray[np.integer]:
     """
     Get the nodes for the individuals.
@@ -57,7 +61,9 @@ def ts_nodes_of_individuals(
 
 
 def ts_ploidy_of_individuals(
-    ts: tskit.TreeSequence, individuals: npt.NDArray[np.integer]
+    ts: tskit.TreeSequence,
+    /,
+    individuals: npt.NDArray[np.integer],
 ) -> npt.NDArray[np.integer]:
     """
     Get the ploidy of the individuals.
@@ -148,7 +154,7 @@ def cache(path: str | pathlib.Path, /, *, split: int = 1000):
     """
     A decorator to cache the output of generator and/or target functions.
 
-    This is analogous to {func}`functools.cache`, except each function's
+    This is analogous to :func:`functools.cache`, except each function's
     result is stored in a file under the given directory. Caching can create
     a large number of small files, so the files are split into subdirectories
     to mitigate possible problems.
@@ -201,7 +207,7 @@ def sqlite_cache(db_file: str | pathlib.Path, shape, /):
     """
     A decorator for generator or target functions that caches features to disk.
 
-    This is analogous to {func}`functools.cache`, except the cache is
+    This is analogous to :func:`functools.cache`, except the cache is
     persisted to disk in an sqlite database.
 
     .. warning::
 
@@ -124,7 +124,11 @@ def reflect(self, x: np.ndarray, /) -> np.ndarray:
         """
         Reflect values that are out of bounds by the amount they are out.
 
-        Values that are too far out of bounds to be reflected are truncated.
+        As reflecting does not guarantee values will be within the bounds,
+        values are first truncated to (2*low - high, 2*high - low),
+        then reflected. For example, with bounds low=0, high=10,
+        a value of -11 will be truncated to -10, then reflected to attain
+        a final value of 10.
 
         :param x:
             The values to be reflected.
@@ -288,7 +292,11 @@ def reflect(self, xs: np.ndarray, /) -> np.ndarray:
         """
         Reflect values that are out of bounds by the amount they are out.
 
-        Values that are too far out of bounds to be reflected are truncated.
+        As reflecting does not guarantee values will be within the bounds,
+        values are first truncated to (2*low - high, 2*high - low),
+        then reflected. For example, with bounds low=0, high=10,
+        a value of -11 will be truncated to -10, then reflected to attain
+        a final value of 10.
 
         :param xs:
             The values to be reflected.
 
@@ -15,7 +15,6 @@
 
 from .cli import ADRDFormatter, _DINF_MODEL_HELP
 import dinf
-from .dinf import sample_smooth
 
 
 class MultiPage:
@@ -694,7 +693,7 @@ def __call__(self, args: argparse.Namespace):
 
 class _Features(_SubCommand):
     """
-    Plot a feature matrix or matrices as heatmaps.
+    Plot feature matrices as heatmaps.
 
     By default, one simulation will be performed with the generator to obtain
     a set of features for plotting. To instead extract features from the
@@ -721,7 +720,7 @@ def __call__(self, args: argparse.Namespace):
         else:
             rng = np.random.default_rng(args.seed)
             thetas = dinf_model.parameters.draw_prior(1, rng=rng)
-            mats = dinf_model.generator_func(
+            mats = dinf_model.generator_func_v(
                 (rng.integers(low=0, high=2**31), thetas[0])
             )
 
@@ -1042,7 +1041,7 @@ def __call__(self, args: argparse.Namespace):
                 names = list(data.dtype.names)
                 probs = data["_Pr"]
                 thetas = structured_to_unstructured(data[names[1:]])
-                X = sample_smooth(
+                X = dinf.sample_smooth(
                     thetas=thetas,
                     probs=probs,
                     size=1_000_000,