Minimal log_prob functionality

WardBrian · WardBrian · commit e9c4695c4891 · 2022-09-15T16:42:04.000-04:00
Relies on PR-1108
diff --git a/cmdstanpy/model.py b/cmdstanpy/model.py
@@ -7,6 +7,7 @@
 import shutil
 import subprocess
 import sys
+import tempfile
 from collections import OrderedDict
 from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime
@@ -15,10 +16,16 @@
 from pathlib import Path
 from typing import Any, Callable, Dict, Iterable, List, Mapping, Optional, Union
 
+import pandas as pd
 import ujson as json
 from tqdm.auto import tqdm
 
-from cmdstanpy import _CMDSTAN_REFRESH, _CMDSTAN_SAMPLING, _CMDSTAN_WARMUP
+from cmdstanpy import (
+    _CMDSTAN_REFRESH,
+    _CMDSTAN_SAMPLING,
+    _CMDSTAN_WARMUP,
+    _TMPDIR,
+)
 from cmdstanpy.cmdstan_args import (
     CmdStanArgs,
     GenerateQuantitiesArgs,
@@ -1536,6 +1543,77 @@ def variational(
         vb = CmdStanVB(runset)
         return vb
 
+    def log_prob(
+        self,
+        params: Union[Dict[str, Any], str, os.PathLike],
+        data: Union[Mapping[str, Any], str, os.PathLike, None] = None,
+    ) -> pd.DataFrame:
+        """
+        Calculate the log probability and gradient at the given parameter
+        values.
+
+        NOTE: This function is **NOT** an efficient way to evaluate the log
+        density of the model. It should be used for diagnostics ONLY.
+        Please, do not use this for other purposes such as testing new
+        sampling algorithms!
+
+        Parameters
+        ----------
+        :param data: Values for all parameters in the model, specified
+            either as a dictionary with entries matching the parameter
+            variables, or as the path of a data file in JSON or Rdump format.
+
+            These should be given on the constrained (natural) scale.
+        :param data: Values for all data variables in the model, specified
+            either as a dictionary with entries matching the data variables,
+            or as the path of a data file in JSON or Rdump format.
+
+        :return: A pandas.DataFrame containing columns "lp_" and additional
+            columns for the gradient values. These gradients will be for the
+            unconstrained parameters of the model.
+        """
+
+        if cmdstan_version_before(2, 31, self.exe_info()):
+            raise ValueError(
+                "Method 'log_prob' not available for CmdStan versions "
+                "before 2.31"
+            )
+        with MaybeDictToFilePath(data, params) as (_data, _params):
+            cmd = [
+                str(self.exe_file),
+                "log_prob",
+                f"constrained_params={_params}",
+            ]
+            if _data is not None:
+                cmd += ["data", f"file={_data}"]
+
+            output_dir = tempfile.mkdtemp(prefix=self.name, dir=_TMPDIR)
+
+            output = os.path.join(output_dir, "output.csv")
+            cmd += ["output", f"file={output}"]
+
+            log_p = os.path.join(output_dir, "log_p.csv")
+            cmd += [f"log_prob_output_file={log_p}"]
+
+            get_logger().debug("Cmd: %s", str(cmd))
+
+            proc = subprocess.run(
+                cmd, capture_output=True, check=False, text=True
+            )
+            if proc.returncode:
+                get_logger().error(
+                    "'log_prob' command failed!\nstdout:%s\nstderr:%s",
+                    proc.stdout,
+                    proc.stderr,
+                )
+                raise RuntimeError(
+                    "Method 'log_prob' failed with return code "
+                    + str(proc.returncode)
+                )
+
+            result = pd.read_csv(log_p)
+            return result
+
     def _run_cmdstan(
         self,
         runset: RunSet,
diff --git a/test/test_log_prob.py b/test/test_log_prob.py
@@ -0,0 +1,40 @@
+"""Tests for the `log_prob` method new in CmdStan 2.31.0"""
+
+import logging
+import os
+from test import CustomTestCase
+
+from testfixtures import LogCapture, StringComparison
+
+from cmdstanpy.model import CmdStanModel
+from cmdstanpy.utils import EXTENSION
+
+HERE = os.path.dirname(os.path.abspath(__file__))
+DATAFILES_PATH = os.path.join(HERE, 'data')
+
+BERN_STAN = os.path.join(DATAFILES_PATH, 'bernoulli.stan')
+BERN_DATA = os.path.join(DATAFILES_PATH, 'bernoulli.data.json')
+BERN_EXE = os.path.join(DATAFILES_PATH, 'bernoulli' + EXTENSION)
+BERN_BASENAME = 'bernoulli'
+
+
+class CmdStanLogProb(CustomTestCase):
+    def test_lp_good(self):
+        model = CmdStanModel(stan_file=BERN_STAN)
+        x = model.log_prob({"theta": 0.1}, data=BERN_DATA)
+        assert "lp_" in x.columns
+
+    def test_lp_bad(self):
+        model = CmdStanModel(stan_file=BERN_STAN)
+
+        with LogCapture(level=logging.ERROR) as log:
+            with self.assertRaisesRegex(RuntimeError, "failed with returncode"):
+                model.log_prob({"not_here": 0.1}, data=BERN_DATA)
+
+        log.check_present(
+            (
+                'cmdstanpy',
+                'ERROR',
+                StringComparison(r"(?s).*parameter theta not found.*"),
+            )
+        )