diff --git a/conda-envs/environment-alternative-backends.yml b/conda-envs/environment-alternative-backends.yml
index d7cb0fe4fc..6ccd5983da 100644
--- a/conda-envs/environment-alternative-backends.yml
+++ b/conda-envs/environment-alternative-backends.yml
@@ -18,6 +18,7 @@ dependencies:
 - jaxlib>=0.4.28
 - libblas=*=*mkl
 - mkl-service
+- narwhals>=2.11.0
 - numpy>=1.25.0
 - numpyro>=0.8.0
 - pandas>=0.24.0
diff --git a/conda-envs/environment-dev.yml b/conda-envs/environment-dev.yml
index 231dfa05cf..e2b0c8d1db 100644
--- a/conda-envs/environment-dev.yml
+++ b/conda-envs/environment-dev.yml
@@ -9,6 +9,7 @@ dependencies:
 - blas
 - cachetools>=4.2.1
 - cloudpickle
+- narwhals>=2.11.0
 - numpy>=1.25.0
 - pandas>=0.24.0
 - pip
diff --git a/conda-envs/environment-docs.yml b/conda-envs/environment-docs.yml
index f85f8fc55b..324505de1d 100644
--- a/conda-envs/environment-docs.yml
+++ b/conda-envs/environment-docs.yml
@@ -8,6 +8,7 @@ dependencies:
 - arviz>=0.13.0
 - cachetools>=4.2.1
 - cloudpickle
+- narwhals>=2.11.0
 - numpy>=1.25.0
 - pandas>=0.24.0
 - pip
diff --git a/conda-envs/environment-test.yml b/conda-envs/environment-test.yml
index b6fd3f36e0..0c138bb15b 100644
--- a/conda-envs/environment-test.yml
+++ b/conda-envs/environment-test.yml
@@ -10,6 +10,7 @@ dependencies:
 - cachetools>=4.2.1
 - cloudpickle
 - jax
+- narwhals>=2.11.0
 - numpy>=1.25.0
 - pandas>=0.24.0
 - pip
diff --git a/conda-envs/windows-environment-dev.yml b/conda-envs/windows-environment-dev.yml
index 0c2ae00ce2..7e37d78054 100644
--- a/conda-envs/windows-environment-dev.yml
+++ b/conda-envs/windows-environment-dev.yml
@@ -9,6 +9,7 @@ dependencies:
 - blas
 - cachetools>=4.2.1
 - cloudpickle
+- narwhals>=2.11.0
 - numpy>=1.25.0
 - pandas>=0.24.0
 - pip
diff --git a/conda-envs/windows-environment-test.yml b/conda-envs/windows-environment-test.yml
index ee711e3a23..a84ed0995f 100644
--- a/conda-envs/windows-environment-test.yml
+++ b/conda-envs/windows-environment-test.yml
@@ -11,6 +11,7 @@ dependencies:
 - cloudpickle
 - libpython
 - mkl-service>=2.3.0
+- narwhals>=2.11.0
 - numpy>=1.25.0
 - pandas>=0.24.0
 - pip
diff --git a/pymc/data.py b/pymc/data.py
index cfade37910..c945eacd5f 100644
--- a/pymc/data.py
+++ b/pymc/data.py
@@ -11,23 +11,25 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import importlib
 import io
 import typing
 import urllib.request
 
 from collections.abc import Sequence
 from copy import copy
+from functools import singledispatch
 from typing import Union, cast
 
+import narwhals as nw
 import numpy as np
-import pandas as pd
 import pytensor
 import pytensor.tensor as pt
 import xarray as xr
 
+from narwhals.typing import IntoFrameT, IntoLazyFrameT, IntoSeriesT
+from pytensor.compile import SharedVariable
 from pytensor.compile.builders import OpFromGraph
-from pytensor.compile.sharedvalue import SharedVariable
 from pytensor.graph.basic import Variable
 from pytensor.raise_op import Assert
 from pytensor.tensor.random.basic import IntegersRV
@@ -161,65 +163,178 @@ def Minibatch(variable: TensorVariable, *variables: TensorVariable, batch_size:
     return mb_tensors if len(variables) else mb_tensors[0]
 
 
+def _handle_none_dims(
+    dims: Sequence[str | None] | None, ndim: int
+) -> Sequence[str | None] | Sequence[None]:
+    if dims is None:
+        return [None] * ndim
+    else:
+        return dims
+
+
+@singledispatch
 def determine_coords(
-    model,
-    value: pd.DataFrame | pd.Series | xr.DataArray,
-    dims: Sequence[str] | None = None,
+    value,
+    model: "Model",
+    dims: Sequence[str | None] | None = None,
     coords: dict[str, Sequence | np.ndarray] | None = None,
-) -> tuple[dict[str, Sequence | np.ndarray], Sequence[str] | Sequence[None]]:
+) -> tuple[dict[str, Sequence | np.ndarray], Sequence[str | None] | Sequence[None]]:
     """Determine coordinate values from data or the model (via ``dims``)."""
+    raise NotImplementedError(
+        f"Cannot determine coordinates for data of type {type(value)}. Please provide `coords` explicitly "
+        "or convert the data to a supported type."
+    )
+
+
+@determine_coords.register(np.ndarray)
+def determine_array_coords(
+    value: np.ndarray,
+    model: "Model",
+    dims: Sequence[str] | None = None,
+    coords: dict[str, Sequence | np.ndarray] | None = None,
+) -> tuple[dict[str, Sequence | np.ndarray], Sequence[str | None] | Sequence[None]]:
     if coords is None:
         coords = {}
 
-    dim_name = None
-    # If value is a df or a series, we interpret the index as coords:
-    if hasattr(value, "index"):
-        if dims is not None:
-            dim_name = dims[0]
-        if dim_name is None and value.index.name is not None:
-            dim_name = value.index.name
-        if dim_name is not None:
-            coords[dim_name] = value.index
-
-    # If value is a df, we also interpret the columns as coords:
-    if hasattr(value, "columns"):
-        if dims is not None:
-            dim_name = dims[1]
-        if dim_name is None and value.columns.name is not None:
-            dim_name = value.columns.name
-        if dim_name is not None:
-            coords[dim_name] = value.columns
-
-    if isinstance(value, xr.DataArray):
-        if dims is not None:
-            for dim in dims:
-                dim_name = dim
-                # str is applied because dim entries may be None
-                coords[str(dim_name)] = cast(xr.DataArray, value[dim]).to_numpy()
-
-    if isinstance(value, np.ndarray) and dims is not None:
-        if len(dims) != value.ndim:
-            raise ShapeError(
-                "Invalid data shape. The rank of the dataset must match the length of `dims`.",
-                actual=value.shape,
-                expected=value.ndim,
-            )
-        for size, dim in zip(value.shape, dims):
-            coord = model.coords.get(dim, None)
-            if coord is None and dim is not None:
-                coords[dim] = range(size)
+    if dims is None:
+        return coords, _handle_none_dims(dims, value.ndim)
+
+    if len(dims) != value.ndim:
+        raise ShapeError(
+            "Invalid data shape. The rank of the dataset must match the length of `dims`.",
+            actual=value.shape,
+            expected=len(dims),
+        )
+
+    for size, dim in zip(value.shape, dims):
+        coord = model.coords.get(dim, None)
+        if coord is None and dim is not None:
+            coords[dim] = range(size)
+
+    return coords, _handle_none_dims(dims, value.ndim)
+
+
+@determine_coords.register(xr.DataArray)
+def determine_xarray_coords(
+    value: xr.DataArray,
+    model: "Model",
+    dims: Sequence[str | None] | None = None,
+    coords: dict[str, Sequence | np.ndarray] | None = None,
+) -> tuple[dict[str, Sequence | np.ndarray], Sequence[str | None] | Sequence[None]]:
+    if coords is None:
+        coords = {}
 
     if dims is None:
-        # TODO: Also determine dim names from the index
-        new_dims: Sequence[str] | Sequence[None] = [None] * np.ndim(value)
-    else:
-        new_dims = dims
-    return coords, new_dims
+        return coords, _handle_none_dims(dims, value.ndim)
+
+    for dim in dims:
+        dim_name = dim
+        # str is applied because dim entries may be None
+        coords[str(dim_name)] = cast(xr.DataArray, value[dim]).to_numpy()
+
+    return coords, _handle_none_dims(dims, value.ndim)
+
+
+def _dataframe_agnostic_coords(
+    value: IntoFrameT | IntoLazyFrameT | nw.DataFrame | nw.LazyFrame,
+    model: "Model",
+    ndim_in: int = 2,
+    dims: Sequence[str | None] | None = None,
+    coords: dict[str, Sequence | np.ndarray] | None = None,
+) -> tuple[dict[str, Sequence | np.ndarray], Sequence[str | None] | Sequence[None]]:
+    if coords is None:
+        coords = {}
+
+    value = cast(nw.DataFrame | nw.LazyFrame, nw.from_native(value, allow_series=False))  # type: ignore[type-var]
+    if isinstance(value, nw.LazyFrame):
+        value = value.collect()
+
+    index = nw.maybe_get_index(value)
+    if index is not None:
+        value = value.with_columns(**{index.name: index.to_numpy()})
+
+    if dims is None:
+        return coords, _handle_none_dims(dims, ndim_in)
+
+    if len(dims) != ndim_in:
+        raise ShapeError(
+            "Invalid data shape. The rank of the dataset must match the length of `dims`.",
+            actual=value.shape,
+            expected=len(dims),
+        )
+
+    index_dim = dims[0]
+    if index_dim is not None:
+        if index_dim in value.columns:
+            coords[index_dim] = tuple(value.select(nw.col(index_dim)).to_numpy().flatten())
+        elif index_dim in model.coords:
+            coords[index_dim] = model.coords[index_dim]  # type: ignore[assignment]
+        else:
+            raise ValueError(
+                f"Dimension '{index_dim}' not found in DataFrame columns or model coordinates. Cannot infer "
+                "index coordinates."
+            )
+
+    if len(dims) > 1:
+        column_dim = dims[1]
+        if column_dim is not None:
+            select_expr = nw.exclude(index_dim) if index_dim is not None else nw.all()
+            coords[column_dim] = value.select(select_expr).columns
+
+    return coords, _handle_none_dims(dims, ndim_in)
+
+
+def _series_agnostic_coords(
+    value: IntoSeriesT,
+    model: "Model",
+    dims: Sequence[str | None] | None = None,
+    coords: dict[str, Sequence | np.ndarray] | None = None,
+) -> tuple[dict[str, Sequence | np.ndarray], Sequence[str | None] | Sequence[None]]:
+    value = cast(nw.Series, nw.from_native(value, series_only=True))  # type: ignore[assignment]
+    return _dataframe_agnostic_coords(
+        cast(nw.DataFrame | nw.LazyFrame, value.to_frame()),  # type: ignore[attr-defined]
+        ndim_in=1,
+        model=model,
+        dims=dims,
+        coords=coords,
+    )  # type: ignore[arg-type]
+
+
+def _register_dataframe_backend(library_name: str):
+    try:
+        library = importlib.import_module(library_name)
+
+        @determine_coords.register(library.Series)
+        def determine_series_coords(
+            value: IntoSeriesT,
+            model: "Model",
+            dims: Sequence[str] | None = None,
+            coords: dict[str, Sequence | np.ndarray] | None = None,
+        ) -> tuple[dict[str, Sequence | np.ndarray], Sequence[str | None] | Sequence[None]]:
+            return _series_agnostic_coords(value, model=model, dims=dims, coords=coords)
+
+        @determine_coords.register(library.DataFrame)
+        def determine_dataframe_coords(
+            value: IntoFrameT,
+            model: "Model",
+            dims: Sequence[str] | None = None,
+            coords: dict[str, Sequence | np.ndarray] | None = None,
+        ) -> tuple[dict[str, Sequence | np.ndarray], Sequence[str | None] | Sequence[None]]:
+            return _dataframe_agnostic_coords(value, model=model, dims=dims, coords=coords)
+
+    except ImportError:
+        # Dataframe backends are optional
+        pass
+
+
+_register_dataframe_backend("pandas")
+_register_dataframe_backend("polars")
+_register_dataframe_backend("dask.dataframe")
 
 
 def Data(
     name: str,
-    value,
+    value: IntoFrameT | IntoSeriesT | xr.DataArray | np.ndarray,
     *,
     dims: Sequence[str] | None = None,
     coords: dict[str, Sequence | np.ndarray] | None = None,
@@ -248,11 +363,11 @@ def Data(
     ----------
     name : str
         The name for this variable.
-    value : array_like or pandas.Series, pandas.Dataframe
+    value : array_like or Narwhals-compatible Series or DataFrame
        A value to associate with this variable.
     dims : str, tuple of str or tuple of None, optional
        Dimension names of the random variables (as opposed to the shapes of these
-        random variables). Use this when ``value`` is a pandas Series or DataFrame. The
+        random variables). Use this when ``value`` is a Series or DataFrame. The
        ``dims`` will then be the name of the Series / DataFrame's columns. See ArviZ
        documentation for more information about dimensions and coordinates:
        :ref:`arviz:quickstart`.
@@ -265,6 +380,9 @@ def Data(
     infer_dims_and_coords : bool, default=False
        If True, the ``Data`` container will try to infer what the coordinates
        and dimension names should be if there is an index in ``value``.
+    model : pymc.Model, optional
+        Model to which to add the data variable. If not specified, the data variable
+        will be added to the model on the context stack.
     **kwargs : dict, optional
        Extra arguments passed to :func:`pytensor.shared`.
 
@@ -333,9 +451,9 @@ def Data(
                 expected=x.ndim,
             )
 
-    new_dims: Sequence[str] | Sequence[None] | None
+    new_dims: Sequence[str | None] | Sequence[None] | None
     if infer_dims_and_coords:
-        coords, new_dims = determine_coords(model, value, dims)
+        coords, new_dims = determine_coords(value, model, dims)
     else:
         new_dims = dims
 
diff --git a/pymc/pytensorf.py b/pymc/pytensorf.py
index d7e097f6dc..713a5bb72f 100644
--- a/pymc/pytensorf.py
+++ b/pymc/pytensorf.py
@@ -11,13 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import importlib
 import warnings
 
 from collections.abc import Iterable, Sequence
 from typing import cast
 
+import narwhals as nw
 import numpy as np
-import pandas as pd
 import pytensor
 import pytensor.tensor as pt
 import scipy.sparse as sps
@@ -128,11 +129,33 @@ def convert_data(data) -> np.ndarray | Variable:
     return smarttypeX(ret)
 
 
-@_as_tensor_variable.register(pd.Series)
-@_as_tensor_variable.register(pd.DataFrame)
-def dataframe_to_tensor_variable(df: pd.DataFrame, *args, **kwargs) -> TensorVariable:
-    return pt.as_tensor_variable(df.to_numpy(), *args, **kwargs)
-
+# Optional registrations for DataFrame packages
+def _register_dataframe_backend(library_name: str):
+    try:
+        library = importlib.import_module(library_name)
+
+        @_as_tensor_variable.register(library.Series)
+        def series_to_tensor_variable(s: library.Series, *args, **kwargs) -> TensorVariable:
+            s = nw.from_native(s, allow_series=True)
+            if isinstance(s, nw.LazyFrame):
+                s = s.collect()
+            return pt.as_tensor_variable(s.to_numpy(), *args, **kwargs)
+
+        @_as_tensor_variable.register(library.DataFrame)
+        def dataframe_to_tensor_variable(df: library.DataFrame, *args, **kwargs) -> TensorVariable:
+            df = nw.from_native(df, allow_series=False)
+            if isinstance(df, nw.LazyFrame):
+                df = df.collect()
+            return pt.as_tensor_variable(df.to_numpy(), *args, **kwargs)
+
+    except ImportError:
+        # Data backends are optional. Take no action if not installed.
+        pass
+
+
+_register_dataframe_backend("pandas")
+_register_dataframe_backend("polars")
+_register_dataframe_backend("dask.dataframe")
 
 
 _cheap_eval_mode = Mode(linker="py", optimizer="minimum_compile")
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 22bcdaf9ea..ad721036e1 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -9,6 +9,7 @@ jupyter-sphinx
 mcbackend>=0.4.0
 mypy==1.15.0
 myst-nb<=1.0.0
+narwhals>=2.11.0
 numdifftools>=0.9.40
 numpy>=1.25.0
 numpydoc
diff --git a/requirements.txt b/requirements.txt
index 8401b78a15..7aeb3d945f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 arviz>=0.13.0
 cachetools>=4.2.1
 cloudpickle
+narwhals>=2.11.0
 numpy>=1.25.0
 pandas>=0.24.0
 pytensor>=2.35.0,<2.36
diff --git a/tests/test_data.py b/tests/test_data.py
index afca1831a7..5e71158b6b 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -401,6 +401,42 @@ def test_implicit_coords_dataframe(self, seeded_test):
         assert "columns" in pmodel.coords
         assert pmodel.named_vars_to_dims == {"observations": ("rows", "columns")}
 
+    def test_implicit_coords_polars_series(self):
+        pl = pytest.importorskip("polars")
+
+        ser_sales = pl.Series(
+            "sales",
+            np.random.randint(low=0, high=30, size=22),
+        )
+
+        with pm.Model(coords={"date": range(22)}) as pmodel:
+            pm.Data("sales", ser_sales, dims=["date"], infer_dims_and_coords=True)
+
+            with pytest.raises(
+                ValueError,
+                match="Dimension 'date2' not found in DataFrame columns or model coordinates",
+            ):
+                pm.Data("sales_invalid", ser_sales, dims=["date2"], infer_dims_and_coords=True)
+
+        assert "date" in pmodel.coords
+        assert len(pmodel.coords["date"]) == 22
+
+    def test_implicit_coords_polars_dataframe(self):
+        pl = pytest.importorskip("polars")
+
+        size = (5, 7)
+        df_data = pl.DataFrame(
+            np.random.normal(size=size),
+            schema={f"Column {c + 1}": pl.Float64 for c in range(size[1])},
+        ).with_row_count("rows")
+
+        with pm.Model() as pmodel:
+            pm.Data("observations", df_data, dims=("rows", "columns"), infer_dims_and_coords=True)
+
+        assert "rows" in pmodel.coords
+        assert "columns" in pmodel.coords
+        assert pmodel.named_vars_to_dims == {"observations": ("rows", "columns")}
+
     def test_implicit_coords_xarray(self):
         xr = pytest.importorskip("xarray")
         data = xr.DataArray([[1, 2, 3], [4, 5, 6]], dims=("y", "x"))
diff --git a/tests/test_pytensorf.py b/tests/test_pytensorf.py
index d172c61a4d..9010dd14d0 100644
--- a/tests/test_pytensorf.py
+++ b/tests/test_pytensorf.py
@@ -58,8 +58,20 @@
         np.ones(shape=(10, 1)),
     ],
 )
-def test_pd_dataframe_as_tensor_variable(np_array: np.ndarray) -> None:
-    df = pd.DataFrame(np_array)
+@pytest.mark.parametrize("library", ["pandas", "polars", "dask.dataframe"])
+def test_dataframe_as_tensor_variable(np_array: np.ndarray, library) -> None:
+    lib = pytest.importorskip(library)
+    col_names = [f"col_{i}" for i in range(np_array.shape[1])]
+    match library:
+        case "polars":
+            df = lib.DataFrame(np_array, schema=dict.fromkeys(col_names, float))
+        case "dask.dataframe":
+            df = lib.DataFrame.from_dict({col: np_array[:, i] for i, col in enumerate(col_names)})
+        case "pandas":
+            df = lib.DataFrame(np_array, columns=col_names)
+        case _:
+            raise ValueError(f"Unsupported library: {library}")
+
     np.testing.assert_array_equal(pt.as_tensor_variable(df).eval(), np_array)
 
 
@@ -67,8 +79,10 @@
     argnames="np_array",
     argvalues=[np.array([1.0, 2.0, -1.0]), np.ones(shape=4), np.zeros(shape=10), [1, 2, 3, 4]],
 )
-def test_pd_series_as_tensor_variable(np_array: np.ndarray) -> None:
-    df = pd.Series(np_array)
+@pytest.mark.parametrize("library", ["pandas", "polars"])
+def test_series_as_tensor_variable(np_array: np.ndarray, library) -> None:
+    lib = pytest.importorskip(library)
+    df = lib.Series(np_array)
     np.testing.assert_array_equal(pt.as_tensor_variable(df).eval(), np_array)
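
---

Reviewer note, not part of the patch: with the narwhals-based `determine_coords`
registrations in pymc/data.py, `pm.Data` should infer dims and coords from a polars
DataFrame the same way it previously did for pandas. A minimal usage sketch, modeled
on the new `test_implicit_coords_polars_dataframe` test (requires polars to be
installed; the variable and dim names are illustrative):

    import numpy as np
    import polars as pl
    import pymc as pm

    # A polars DataFrame with an explicit index column named "rows".
    df = pl.DataFrame(
        np.random.normal(size=(5, 3)),
        schema={f"Column {i + 1}": pl.Float64 for i in range(3)},
    ).with_row_count("rows")

    with pm.Model() as model:
        # "rows" is read from the matching column; "columns" from the remaining columns.
        pm.Data("observations", df, dims=("rows", "columns"), infer_dims_and_coords=True)

    print(model.coords["rows"])     # 0..4, taken from the "rows" column
    print(model.coords["columns"])  # the three "Column i" names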
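Likewise, the `_as_tensor_variable` registrations in pymc/pytensorf.py make direct
tensor conversion work for whichever backends are installed. A sketch mirroring the
updated `test_series_as_tensor_variable` (importing pymc is what triggers the
registrations; the series name is illustrative):

    import numpy as np
    import polars as pl
    import pymc  # noqa: F401  # side effect: registers the narwhals-backed converters
    import pytensor.tensor as pt

    s = pl.Series("sales", [1.0, 2.0, -1.0])
    # Dispatches to series_to_tensor_variable, which round-trips via .to_numpy().
    x = pt.as_tensor_variable(s)
    np.testing.assert_array_equal(x.eval(), np.array([1.0, 2.0, -1.0]))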