Merge pull request #799 from amas0/faster-mcmc-csv-parsing

WardBrian · web-flow · commit 538342474b12 · 2025-07-31T09:40:06.000-04:00
Implementation of faster MCMC CSV parsing and Stan CSV utilities
diff --git a/cmdstanpy/stanfit/mcmc.py b/cmdstanpy/stanfit/mcmc.py
@@ -39,6 +39,7 @@
     do_command,
     flatten_chains,
     get_logger,
+    stancsv,
 )
 
 from .metadata import InferenceMetadata
@@ -429,83 +430,42 @@ def _assemble_draws(self) -> None:
         """
         if self._draws.shape != (0,):
             return
+
         num_draws = self.num_draws_sampling
-        sampling_iter_start = 0
         if self._save_warmup:
             num_draws += self.num_draws_warmup
-            sampling_iter_start = self.num_draws_warmup
         self._draws = np.empty(
             (num_draws, self.chains, len(self.column_names)),
-            dtype=float,
+            dtype=np.float64,
             order='F',
         )
-        self._step_size = np.empty(self.chains, dtype=float)
+        self._step_size = np.empty(self.chains, dtype=np.float64)
+
+        mass_matrix_per_chain = []
         for chain in range(self.chains):
-            with open(self.runset.csv_files[chain], 'r') as fd:
-                line = fd.readline().strip()
-                # read initial comments, CSV header row
-                while len(line) > 0 and line.startswith('#'):
-                    line = fd.readline().strip()
+            try:
+                comments, draws = stancsv.parse_stan_csv_comments_and_draws(
+                    self.runset.csv_files[chain]
+                )
+
+                self._draws[:, chain, :] = stancsv.csv_bytes_list_to_numpy(
+                    draws
+                )
+
                 if not self._is_fixed_param:
-                    # handle warmup draws, if any
-                    if self._save_warmup:
-                        for i in range(self.num_draws_warmup):
-                            line = fd.readline().strip()
-                            xs = line.split(',')
-                            self._draws[i, chain, :] = [float(x) for x in xs]
-                    line = fd.readline().strip()
-                    if line != '# Adaptation terminated':  # shouldn't happen?
-                        while line != '# Adaptation terminated':
-                            line = fd.readline().strip()
-                    # step_size, metric (diag_e and dense_e only)
-                    line = fd.readline().strip()
-                    _, step_size = line.split('=')
-                    self._step_size[chain] = float(step_size.strip())
-                    if self._metadata.cmdstan_config['metric'] != 'unit_e':
-                        line = fd.readline().strip()  # metric type
-                        line = fd.readline().lstrip(' #\t').rstrip()
-                        num_unconstrained_params = len(line.split(','))
-                        if chain == 0:  # can't allocate w/o num params
-                            if self.metric_type == 'diag_e':
-                                self._metric = np.empty(
-                                    (self.chains, num_unconstrained_params),
-                                    dtype=float,
-                                )
-                            else:
-                                self._metric = np.empty(
-                                    (
-                                        self.chains,
-                                        num_unconstrained_params,
-                                        num_unconstrained_params,
-                                    ),
-                                    dtype=float,
-                                )
-                        if line:
-                            if self.metric_type == 'diag_e':
-                                xs = line.split(',')
-                                self._metric[chain, :] = [float(x) for x in xs]
-                            else:
-                                xs = line.strip().split(',')
-                                self._metric[chain, 0, :] = [
-                                    float(x) for x in xs
-                                ]
-                                for i in range(1, num_unconstrained_params):
-                                    line = fd.readline().lstrip(' #\t').rstrip()
-                                    xs = line.split(',')
-                                    self._metric[chain, i, :] = [
-                                        float(x) for x in xs
-                                    ]
-                    else:  # unit_e changed in 2.34 to have an extra line
-                        pos = fd.tell()
-                        line = fd.readline().strip()
-                        if not line.startswith('#'):
-                            fd.seek(pos)
-
-                # process draws
-                for i in range(sampling_iter_start, num_draws):
-                    line = fd.readline().strip()
-                    xs = line.split(',')
-                    self._draws[i, chain, :] = [float(x) for x in xs]
+                    (
+                        self._step_size[chain],
+                        mass_matrix,
+                    ) = stancsv.parse_hmc_adaptation_lines(comments)
+                    mass_matrix_per_chain.append(mass_matrix)
+            except Exception as exc:
+                raise ValueError(
+                    f"Parsing output from {self.runset.csv_files[chain]} failed"
+                ) from exc
+
+        if all(mm is not None for mm in mass_matrix_per_chain):
+            self._metric = np.array(mass_matrix_per_chain)
+
         assert self._draws is not None
 
     def summary(
diff --git a/cmdstanpy/utils/stancsv.py b/cmdstanpy/utils/stancsv.py
@@ -1,17 +1,146 @@
 """
 Utility functions for reading the Stan CSV format
 """
+
+import io
 import json
 import math
+import os
 import re
-from typing import Any, Dict, List, MutableMapping, Optional, TextIO, Union
+import warnings
+from typing import (
+    Any,
+    Dict,
+    Iterator,
+    List,
+    MutableMapping,
+    Optional,
+    TextIO,
+    Tuple,
+    Union,
+)
 
 import numpy as np
+import numpy.typing as npt
 import pandas as pd
 
 from cmdstanpy import _CMDSTAN_SAMPLING, _CMDSTAN_THIN, _CMDSTAN_WARMUP
 
 
+def parse_stan_csv_comments_and_draws(
+    stan_csv: Union[str, os.PathLike, Iterator[bytes]],
+) -> Tuple[List[bytes], List[bytes]]:
+    """Parses lines of a Stan CSV file into comment lines and draws lines, where
+    a draws line is just a non-commented line.
+
+    Returns a (comment_lines, draws_lines) tuple.
+    """
+
+    def split_comments_and_draws(
+        lines: Iterator[bytes],
+    ) -> Tuple[List[bytes], List[bytes]]:
+        comment_lines, draws_lines = [], []
+        for line in lines:
+            if line.startswith(b"#"):  # is comment line
+                comment_lines.append(line)
+            else:
+                draws_lines.append(line)
+        return comment_lines, draws_lines
+
+    if isinstance(stan_csv, (str, os.PathLike)):
+        with open(stan_csv, "rb") as f:
+            return split_comments_and_draws(f)
+    else:
+        return split_comments_and_draws(stan_csv)
+
+
+def csv_bytes_list_to_numpy(
+    csv_bytes_list: List[bytes], includes_header: bool = True
+) -> npt.NDArray[np.float64]:
+    """Efficiently converts a list of bytes objects whose concatenation
+    represents a CSV file into a numpy array. `includes_header` specifies
+    whether the bytes contain an initial header line."""
+    try:
+        import polars as pl
+
+        try:
+            if not csv_bytes_list:
+                raise ValueError("No data found to parse")
+            num_cols = csv_bytes_list[0].count(b",") + 1
+            out: npt.NDArray[np.float64] = (
+                pl.read_csv(
+                    io.BytesIO(b"".join(csv_bytes_list)),
+                    has_header=includes_header,
+                    schema_overrides=[pl.Float64] * num_cols,
+                    infer_schema=False,
+                )
+                .to_numpy()
+                .astype(np.float64)
+            )
+            if out.shape[0] == 0:
+                raise ValueError("No data found to parse")
+        except pl.exceptions.NoDataError as exc:
+            raise ValueError("No data found to parse") from exc
+    except ImportError:
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore")
+            out = np.loadtxt(
+                csv_bytes_list,
+                skiprows=int(includes_header),
+                delimiter=",",
+                dtype=np.float64,
+                ndmin=1,
+            )
+        if out.shape == (0,):
+            raise ValueError("No data found to parse")  # pylint: disable=W0707
+        if len(out.shape) == 1:
+            out = out.reshape(1, -1)
+
+    return out
+
+
+def parse_hmc_adaptation_lines(
+    comment_lines: List[bytes],
+) -> Tuple[float, Optional[npt.NDArray[np.float64]]]:
+    """Extracts step size/mass matrix information from the Stan CSV comment
+    lines by parsing the adaptation section. If the diag_e metric is used,
+    the returned mass matrix will be a 1D array of the diagonal elements,
+    if the dense_e metric is used, it will be a 2D array representing the
+    entire matrix, and if unit_e is used then None will be returned.
+
+    Returns a (step_size, mass_matrix) tuple"""
+    step_size, mass_matrix = None, None
+
+    cleaned_lines = (ln.lstrip(b"# ") for ln in comment_lines)
+    in_matrix_block = False
+    diag_e_metric = False
+    matrix_lines = []
+    for line in cleaned_lines:
+        if in_matrix_block and line.strip():
+            # Stop when we get to timing block
+            if line.startswith(b"Elapsed Time"):
+                break
+            matrix_lines.append(line)
+        elif line.startswith(b"Step size"):
+            _, ss_str = line.split(b" = ")
+            step_size = float(ss_str)
+        elif line.startswith(b"Diagonal") or line.startswith(b"Elements"):
+            in_matrix_block = True
+        elif line.startswith(b"No free"):
+            break
+        elif b"diag_e" in line:
+            diag_e_metric = True
+    if step_size is None:
+        raise ValueError("Unable to parse adapated step size")
+    if matrix_lines:
+        mass_matrix = csv_bytes_list_to_numpy(
+            matrix_lines, includes_header=False
+        )
+        if diag_e_metric and mass_matrix.shape[0] == 1:
+            mass_matrix = mass_matrix[0]
+    return step_size, mass_matrix
+
+
 def check_sampler_csv(
     path: str,
     is_fixed_param: bool = False,
diff --git a/pyproject.toml b/pyproject.toml
@@ -40,7 +40,7 @@ packages = ["cmdstanpy", "cmdstanpy.stanfit", "cmdstanpy.utils"]
 "cmdstanpy" = ["py.typed"]
 
 [project.optional-dependencies]
-all = ["xarray"]
+all = ["xarray", "polars>=1.8.2"]
 test = [
     "flake8",
     "pylint",
@@ -49,6 +49,7 @@ test = [
     "pytest-order",
     "mypy",
     "xarray",
+    "polars>=1.8.2"
 ]
 docs = [
     "sphinx>5,<6",
diff --git a/test/test_sample.py b/test/test_sample.py
@@ -204,6 +204,7 @@ def test_bernoulli_unit_e(
         show_progress=False,
     )
     assert bern_fit.metric_type == 'unit_e'
+    assert bern_fit.metric is None
     assert bern_fit.step_size.shape == (2,)
     with caplog.at_level(logging.INFO):
         logging.getLogger()
@@ -2127,3 +2128,13 @@ def test_mcmc_init_sampling():
 
     assert fit.chains == 4
     assert fit.draws().shape == (1000, 4, 9)
+
+
+def test_sample_dense_mass_matrix():
+    stan = os.path.join(DATAFILES_PATH, 'linear_regression.stan')
+    jdata = os.path.join(DATAFILES_PATH, 'linear_regression.data.json')
+    linear_model = CmdStanModel(stan_file=stan)
+
+    fit = linear_model.sample(data=jdata, metric="dense_e", chains=2)
+    assert fit.metric is not None
+    assert fit.metric.shape == (2, 3, 3)
diff --git a/test/test_stancsv.py b/test/test_stancsv.py