.

petrelharp · petrelharp · commit 80f0cf71f6c2 · 2025-09-16T08:56:07.000-07:00
diff --git a/c/tskit/trees.c b/c/tskit/trees.c
@@ -3551,6 +3551,7 @@ tsk_treeseq_update_branch_afs(const tsk_treeseq_t *self, tsk_id_t u, double righ
     tsk_size_t k;
     tsk_size_t time_window_index;
     double *afs;
+    // note: moving this malloc outside this function doesn't speed things up
     tsk_size_t *coordinate = tsk_malloc(num_sample_sets * sizeof(*coordinate));
     bool polarised = !!(options & TSK_STAT_POLARISED);
     const double *count_row = GET_2D_ROW(counts, num_sample_sets + 1, u);
diff --git a/python/tests/test_tree_stats.py b/python/tests/test_tree_stats.py
@@ -45,6 +45,24 @@
 
 np.random.seed(5)
 
+# Notes for refactoring:
+#
+# Things we need to test here are:
+# 1. general_stat: correctly uses summary functions
+# 2. general_stat: branch mode, correctness
+# 3. general_stat: site mode, correctness
+# 4. general_stat: node mode, correctness
+# 5. sample sets: correctness
+# 6. indexes: correctness
+# 7. genome windowing: correctness
+# 8. time windowing: correctness
+# 9. dropping dimensions, output
+# 10. span normalise
+# 11. sample_count_stat: correctly uses summary functions
+# 12. each statistic: a single tree suffices, with edge cases
+#   a. agrees with naive version, polarised and not;
+#   b. agrees with python version, polarised and not;
+#   c. stat-specific options (e.g. centre)
 
 def cached_np(func):
     """
@@ -724,7 +742,9 @@ def ts_10_recomb_fixture():
 @pytest.fixture(scope="session")
 def ts_10_mut_fixture():
     """10-sample tree sequence with mutations (used 10 times)."""
-    return msprime.simulate(10, mutation_rate=1, random_seed=1)
+    ts = msprime.simulate(10, mutation_rate=1, random_seed=1)
+    assert ts.num_mutations > 0
+    return ts
 
 
 @pytest.fixture(scope="session")
@@ -6810,36 +6830,31 @@ class TestOutputDimensions(StatsTestCase):
     Tests for the dimension stripping behaviour of the stats functions.
     """
 
-    def get_example_ts(self, ts_10_mut_fixture):
-        ts = ts_10_mut_fixture
-        assert ts.num_sites > 1
-        return ts
-
     def test_one_way_no_window_scalar_stat(self, ts_10_mut_fixture):
-        ts = self.get_example_ts(ts_10_mut_fixture)
+        ts = ts_10_mut_fixture
         x = ts.diversity()
         assert isinstance(x, np.floating)
 
     def test_one_way_one_list_scalar_stat(self, ts_10_mut_fixture):
-        ts = self.get_example_ts(ts_10_mut_fixture)
+        ts = ts_10_mut_fixture
         x = ts.diversity(sample_sets=list(ts.samples()))
         assert isinstance(x, np.floating)
 
     def test_one_way_nested_list_not_scalar_stat(self, ts_10_mut_fixture):
-        ts = self.get_example_ts(ts_10_mut_fixture)
+        ts = ts_10_mut_fixture
         x = ts.diversity(sample_sets=[list(ts.samples())])
         assert x.shape == (1,)
 
     def test_one_way_one_window_scalar_stat(self, ts_10_mut_fixture):
-        ts = self.get_example_ts(ts_10_mut_fixture)
+        ts = ts_10_mut_fixture
         x = ts.diversity(windows=[0, ts.sequence_length])
         assert x.shape == (1,)
         for samples in (None, list(ts.samples())):
             x = ts.diversity(sample_sets=samples, windows=[0, ts.sequence_length])
             assert x.shape == (1,)
 
     def test_multi_way_no_window_scalar_stat(self, ts_10_mut_fixture):
-        ts = self.get_example_ts(ts_10_mut_fixture)
+        ts = ts_10_mut_fixture
         n = ts.num_samples
         x = ts.f2(
             sample_sets=[
@@ -6850,7 +6865,7 @@ def test_multi_way_no_window_scalar_stat(self, ts_10_mut_fixture):
         assert isinstance(x, np.floating)
 
     def test_multi_way_one_window_not_scalar_stat(self, ts_10_mut_fixture):
-        ts = self.get_example_ts(ts_10_mut_fixture)
+        ts = ts_10_mut_fixture
         n = ts.num_samples
         x = ts.f2(
             sample_sets=[
@@ -6862,7 +6877,7 @@ def test_multi_way_one_window_not_scalar_stat(self, ts_10_mut_fixture):
         assert x.shape == (1,)
 
     def test_multi_way_no_indexes_scalar_stat(self, ts_10_mut_fixture):
-        ts = self.get_example_ts(ts_10_mut_fixture)
+        ts = ts_10_mut_fixture
         n = ts.num_samples
         x = ts.f2(
             sample_sets=[
@@ -6873,7 +6888,7 @@ def test_multi_way_no_indexes_scalar_stat(self, ts_10_mut_fixture):
         assert isinstance(x, np.floating)
 
     def test_multi_way_indexes_not_scalar_stat(self, ts_10_mut_fixture):
-        ts = self.get_example_ts(ts_10_mut_fixture)
+        ts = ts_10_mut_fixture
         n = ts.num_samples
         x = ts.f2(
             sample_sets=[
@@ -6885,7 +6900,7 @@ def test_multi_way_indexes_not_scalar_stat(self, ts_10_mut_fixture):
         assert x.shape == (1,)
 
     def test_afs_default_windows(self, ts_10_mut_fixture):
-        ts = self.get_example_ts(ts_10_mut_fixture)
+        ts = ts_10_mut_fixture
         n = ts.num_samples
         A = ts.samples()[:4]
         B = ts.samples()[6:]
@@ -6900,7 +6915,7 @@ def test_afs_default_windows(self, ts_10_mut_fixture):
             assert x.shape == (len(A) + 1, len(B) + 1)
 
     def test_afs_windows(self, ts_10_mut_fixture):
-        ts = self.get_example_ts(ts_10_mut_fixture)
+        ts = ts_10_mut_fixture
         L = ts.sequence_length
 
         windows = [0, L / 4, L / 2, L]
@@ -6920,7 +6935,7 @@ def test_afs_windows(self, ts_10_mut_fixture):
             self.assertArrayEqual(x, y)
 
     def test_one_way_stat_default_windows(self, ts_10_mut_fixture):
-        ts = self.get_example_ts(ts_10_mut_fixture)
+        ts = ts_10_mut_fixture
         # Use diversity as the example one-way stat.
         for mode in ["site", "branch"]:
             x = ts.diversity(mode=mode)
@@ -6989,19 +7004,19 @@ def verify_one_way_stat_windows(self, ts, method):
         self.assertArrayEqual(x[0], x[2])
 
     def test_diversity_windows(self, ts_10_mut_fixture):
-        ts = self.get_example_ts(ts_10_mut_fixture)
+        ts = ts_10_mut_fixture
         self.verify_one_way_stat_windows(ts, ts.diversity)
 
     def test_Tajimas_D_windows(self, ts_10_mut_fixture):
-        ts = self.get_example_ts(ts_10_mut_fixture)
+        ts = ts_10_mut_fixture
         self.verify_one_way_stat_windows(ts, ts.Tajimas_D)
 
     def test_segregating_sites_windows(self, ts_10_mut_fixture):
-        ts = self.get_example_ts(ts_10_mut_fixture)
+        ts = ts_10_mut_fixture
         self.verify_one_way_stat_windows(ts, ts.segregating_sites)
 
     def test_two_way_stat_default_windows(self, ts_10_mut_fixture):
-        ts = self.get_example_ts(ts_10_mut_fixture)
+        ts = ts_10_mut_fixture
         # Use divergence as the example one-way stat.
         A = ts.samples()[:6]
         B = ts.samples()[6:]
@@ -7072,15 +7087,15 @@ def verify_two_way_stat_windows(self, ts, method):
         self.assertArrayEqual(x[0], x[2])
 
     def test_divergence_windows(self, ts_10_mut_fixture):
-        ts = self.get_example_ts(ts_10_mut_fixture)
+        ts = ts_10_mut_fixture
         self.verify_two_way_stat_windows(ts, ts.divergence)
 
     def test_Fst_windows(self, ts_10_mut_fixture):
-        ts = self.get_example_ts(ts_10_mut_fixture)
+        ts = ts_10_mut_fixture
         self.verify_two_way_stat_windows(ts, ts.Fst)
 
     def test_f2_windows(self, ts_10_mut_fixture):
-        ts = self.get_example_ts(ts_10_mut_fixture)
+        ts = ts_10_mut_fixture
         self.verify_two_way_stat_windows(ts, ts.f2)
 
     def verify_three_way_stat_windows(self, ts, method):
@@ -7136,11 +7151,11 @@ def verify_three_way_stat_windows(self, ts, method):
         self.assertArrayEqual(x[0], x[2])
 
     def test_Y3_windows(self, ts_10_mut_fixture):
-        ts = self.get_example_ts(ts_10_mut_fixture)
+        ts = ts_10_mut_fixture
         self.verify_three_way_stat_windows(ts, ts.Y3)
 
     def test_f3_windows(self, ts_10_mut_fixture):
-        ts = self.get_example_ts(ts_10_mut_fixture)
+        ts = ts_10_mut_fixture
         self.verify_three_way_stat_windows(ts, ts.f3)
 
 
diff --git a/python/tskit/trees.py b/python/tskit/trees.py
@@ -7994,13 +7994,13 @@ def __one_way_sample_set_stat(
         ll_method,
         sample_sets,
         windows=None,
+        time_windows=None,
         mode=None,
         span_normalise=True,
         polarised=False,
     ):
         if sample_sets is None:
             sample_sets = self.samples()
-
         # First try to convert to a 1D numpy array. If it is, then we strip off
         # the corresponding dimension from the output.
         drop_dimension = False
@@ -8013,82 +8013,43 @@ def __one_way_sample_set_stat(
             # of integers then drop the dimension
             if len(sample_sets.shape) == 1:
                 sample_sets = [sample_sets]
-                drop_dimension = True
-
+                if ll_method.__name__ != "allele_frequency_spectrum":
+                    drop_dimension = True
         sample_set_sizes = np.array(
             [len(sample_set) for sample_set in sample_sets], dtype=np.uint32
         )
         if np.any(sample_set_sizes == 0):
             raise ValueError("Sample sets must contain at least one element")
 
         flattened = util.safe_np_int_cast(np.hstack(sample_sets), np.int32)
-        stat = self.__run_windowed_stat(
-            windows,
-            ll_method,
-            sample_set_sizes,
-            flattened,
-            mode=mode,
-            span_normalise=span_normalise,
-            polarised=polarised,
-        )
+        use_tw = (ll_method.__name__ == "allele_frequency_spectrum")
+        if use_tw:
+            stat = self.__run_windowed_stat_tw(
+                windows,
+                time_windows,
+                ll_method,
+                sample_set_sizes,
+                flattened,
+                mode=mode,
+                span_normalise=span_normalise,
+                polarised=polarised,
+            )
+        else:
+            stat = self.__run_windowed_stat(
+                windows,
+                ll_method,
+                sample_set_sizes,
+                flattened,
+                mode=mode,
+                span_normalise=span_normalise,
+                polarised=polarised,
+            )
         if drop_dimension:
             stat = stat.reshape(stat.shape[:-1])
-            if stat.shape == () and windows is None:
+            if stat.shape == () and windows is None and time_windows is None:
                 stat = stat[()]
         return stat
 
-    # only for temporary tw version
-    def __one_way_sample_set_stat_tw(
-        self,
-        ll_method,
-        sample_sets,
-        windows=None,
-        time_windows=None,
-        mode=None,
-        span_normalise=True,
-        polarised=False,
-    ):
-        if sample_sets is None:
-            sample_sets = self.samples()
-        # First try to convert to a 1D numpy array. If it is, then we strip off
-        # the corresponding dimension from the output.
-        drop_dimension = False
-        try:
-            sample_sets = np.array(sample_sets, dtype=np.uint64)
-        except ValueError:
-            pass
-        else:
-            # If we've successfully converted sample_sets to a 1D numpy array
-            # of integers then drop the dimension
-            if len(sample_sets.shape) == 1:
-                sample_sets = [sample_sets]
-                drop_dimension = True
-        sample_set_sizes = np.array(
-            [len(sample_set) for sample_set in sample_sets], dtype=np.uint32
-        )
-        if np.any(sample_set_sizes == 0):
-            raise ValueError("Sample sets must contain at least one element")
-
-        flattened = util.safe_np_int_cast(np.hstack(sample_sets), np.int32)
-        stat = self.__run_windowed_stat_tw(
-            windows,
-            time_windows,
-            ll_method,
-            sample_set_sizes,
-            flattened,
-            mode=mode,
-            span_normalise=span_normalise,
-            polarised=polarised,
-        )
-        if drop_dimension:
-            # not applicable for AFS
-            if not ll_method.__name__ == "allele_frequency_spectrum":
-                stat = stat.reshape(stat.shape[:-1])
-            # We'll need this for non-AFS functions; but can't test it with AFS:
-            # if stat.shape == () and windows is None and time_windows is None:
-            #     stat = stat[()]
-        return stat
-
     def parse_sites(self, sites):
         row_sites, col_sites = None, None
         if sites is not None:
@@ -9781,7 +9742,7 @@ def allele_frequency_spectrum(
         """
         if sample_sets is None:
             sample_sets = [self.samples()]
-        return self.__one_way_sample_set_stat_tw(
+        return self.__one_way_sample_set_stat(
             self._ll_tree_sequence.allele_frequency_spectrum,
             sample_sets,
             windows=windows,