Merge pull request #741 from stan-dev/feat/738-pathfinder-threads

WardBrian · web-flow · commit de2e73ca5b9a · 2024-03-25T16:51:04.000-04:00
Add a num_threads helper argument to pathfinder()
diff --git a/cmdstanpy/cmdstan_args.py b/cmdstanpy/cmdstan_args.py
@@ -930,6 +930,7 @@ def validate(self) -> None:
                 if not (
                     isinstance(self.method_args, SamplerArgs)
                     and self.method_args.num_chains > 1
+                    or isinstance(self.method_args, PathfinderArgs)
                 ):
                     if not os.path.exists(self.inits):
                         raise ValueError('no such file {}'.format(self.inits))
diff --git a/cmdstanpy/model.py b/cmdstanpy/model.py
@@ -1587,6 +1587,7 @@ def pathfinder(
         refresh: Optional[int] = None,
         time_fmt: str = "%Y%m%d%H%M%S",
         timeout: Optional[float] = None,
+        num_threads: Optional[int] = None,
     ) -> CmdStanPathfinder:
         """
         Run CmdStan's Pathfinder variational inference algorithm.
@@ -1689,6 +1690,10 @@ def pathfinder(
         :param timeout: Duration at which Pathfinder times
             out in seconds. Defaults to None.
 
+        :param num_threads: Number of threads to request for parallel execution.
+            A number other than ``1`` requires the model to have been compiled
+            with STAN_THREADS=True.
+
         :return: A :class:`CmdStanPathfinder` object
 
         References
@@ -1715,6 +1720,17 @@ def pathfinder(
                 "available for CmdStan versions 2.34 and later"
             )
 
+        if num_threads is not None:
+            if (
+                num_threads != 1
+                and exe_info.get('STAN_THREADS', '').lower() != 'true'
+            ):
+                raise ValueError(
+                    "Model must be compiled with 'STAN_THREADS=true' to use"
+                    " 'num_threads' argument"
+                )
+            os.environ['STAN_NUM_THREADS'] = str(num_threads)
+
         if num_paths == 1:
             if num_single_draws is None:
                 num_single_draws = draws
diff --git a/test/test_pathfinder.py b/test/test_pathfinder.py
@@ -2,6 +2,8 @@
     Tests for the Pathfinder method.
 """
 
+import contextlib
+from io import StringIO
 from pathlib import Path
 
 import numpy as np
@@ -129,6 +131,26 @@ def test_pathfinder_init_sampling():
     assert fit.draws().shape == (1000, 4, 9)
 
 
+def test_inits_for_pathfinder():
+    stan = DATAFILES_PATH / 'bernoulli.stan'
+    bern_model = cmdstanpy.CmdStanModel(stan_file=stan)
+    jdata = str(DATAFILES_PATH / 'bernoulli.data.json')
+    bern_model.pathfinder(
+        jdata, inits=[{"theta": 0.1}, {"theta": 0.9}], num_paths=2
+    )
+
+    # second path's init is too large (theta=1.1); see the assert below
+    with contextlib.redirect_stdout(StringIO()) as captured:
+        bern_model.pathfinder(
+            jdata,
+            inits=[{"theta": 0.1}, {"theta": 1.1}],
+            num_paths=2,
+            show_console=True,
+        )
+    # NOTE(review): assumes show_console=True echoes CmdStan output to stdout
+    assert "Bounded variable is 1.1" in captured.getvalue()
+
+
 def test_pathfinder_no_psis():
     stan = DATAFILES_PATH / 'bernoulli.stan'
     bern_model = cmdstanpy.CmdStanModel(stan_file=stan)
@@ -152,3 +174,20 @@ def test_pathfinder_no_lp_calc():
     n_lp_nan = np.sum(np.isnan(pathfinder.method_variables()['lp__']))
     assert n_lp_nan < 4000  # some lp still calculated during pathfinder
     assert n_lp_nan > 3000  # but most are not
+
+
+def test_pathfinder_threads():
+    stan = DATAFILES_PATH / 'bernoulli.stan'
+    bern_model = cmdstanpy.CmdStanModel(stan_file=stan)
+    jdata = str(DATAFILES_PATH / 'bernoulli.data.json')
+    # num_threads=1 needs no STAN_THREADS build, so this call should not raise
+    bern_model.pathfinder(data=jdata, num_threads=1)
+    # num_threads=4 must raise here: the model was built without STAN_THREADS
+    with pytest.raises(ValueError, match="STAN_THREADS"):
+        bern_model.pathfinder(data=jdata, num_threads=4)
+    # rebuild with STAN_THREADS enabled; num_threads=4 is now accepted
+    bern_model = cmdstanpy.CmdStanModel(
+        stan_file=stan, cpp_options={'STAN_THREADS': True}, force_compile=True
+    )
+    pathfinder = bern_model.pathfinder(data=jdata, num_threads=4)
+    assert pathfinder.draws().shape == (1000, 3)
diff --git a/test/test_sample.py b/test/test_sample.py
@@ -55,7 +55,7 @@
 )
 def test_bernoulli_good(stanfile: str):
     stan = os.path.join(DATAFILES_PATH, stanfile)
-    bern_model = CmdStanModel(stan_file=stan)
+    bern_model = CmdStanModel(stan_file=stan, force_compile=True)
 
     jdata = os.path.join(DATAFILES_PATH, 'bernoulli.data.json')
     bern_fit = bern_model.sample(
@@ -74,6 +74,8 @@ def test_bernoulli_good(stanfile: str):
 
     for i in range(bern_fit.runset.chains):
         csv_file = bern_fit.runset.csv_files[i]
+        # NB: This will fail if STAN_THREADS is enabled
+        # due to sampling only producing 1 stdout file in that case
         stdout_file = bern_fit.runset.stdout_files[i]
         assert os.path.exists(csv_file)
         assert os.path.exists(stdout_file)