Skip to content
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion pymc/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@

import pymc as pm

from pymc.pytensorf import convert_observed_data
from pymc.pytensorf import convert_observed_data, unmask_masked_data

__all__ = [
"get_data",
Expand Down Expand Up @@ -419,10 +419,20 @@ def Data(
)
name = model.name_for(name)

if isinstance(value, np.ma.MaskedArray):
warnings.warn(
"If possible, masked arrays will be converted to standard numpy arrays with np.nan values for compatibility with PyTensor."
)

# `convert_observed_data` takes care of parameter `value` and
# transforms it to something digestible for PyTensor.
arr = convert_observed_data(value)

# Because `convert_observed_data()` is also used outside PyTensor, we need an extra step to ensure that any masked arrays
# produced by it are converted back to np.ndarray with np.nan values.
# This is not very efficient and will not be necessary once PyTensor implements MaskedArray support.
arr = unmask_masked_data(arr)

if mutable is None:
warnings.warn(
"The `mutable` kwarg was not specified. Before v4.1.0 it defaulted to `pm.Data(mutable=True)`,"
Expand Down
13 changes: 13 additions & 0 deletions pymc/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@
hessian,
inputvars,
replace_rvs_by_values,
unmask_masked_data,
)
from pymc.util import (
UNSET,
Expand Down Expand Up @@ -1184,7 +1185,19 @@ def set_data(

if isinstance(values, list):
values = np.array(values)

if isinstance(values, np.ma.MaskedArray):
warnings.warn(
"If possible, masked arrays will be converted to standard numpy arrays with np.nan values for compatibility with PyTensor."
)

values = convert_observed_data(values)

# Because `convert_observed_data()` is also used outside PyTensor, we need an extra step to ensure that any masked arrays
# produced by it are converted back to np.ndarray with np.nan values.
# This is not very efficient and will not be necessary once PyTensor implements MaskedArray support.
values = unmask_masked_data(values)

dims = self.named_vars_to_dims.get(name, None) or ()
coords = coords or {}

Expand Down
22 changes: 22 additions & 0 deletions pymc/pytensorf.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,11 +84,33 @@
"make_shared_replacements",
"generator",
"convert_observed_data",
"unmask_masked_data",
"compile_pymc",
"constant_fold",
]


def unmask_masked_data(data):
    """Unmask masked numpy arrays for usage within PyTensor.

    PyTensor currently does not support masked arrays, so a masked array is
    converted to a standard numpy array in which the masked entries are
    replaced by ``np.nan``.

    Parameters
    ----------
    data
        Any object. Only instances of ``np.ma.MaskedArray`` are converted;
        everything else is returned unchanged.

    Returns
    -------
    The result of ``data.filled(fill_value=np.nan)`` when ``data`` is a
    non-integer masked array, otherwise ``data`` itself.

    Raises
    ------
    TypeError
        If ``data`` is a masked array with an integer dtype, because
        ``np.nan`` is a float concept and cannot be stored in an integer array.
    """
    if not isinstance(data, np.ma.MaskedArray):
        return data

    # Inspect the dtype itself instead of searching its string form for "int":
    # the substring test also matches unrelated dtypes whose text happens to
    # contain "int" (e.g. a structured dtype with a field named "point").
    if np.issubdtype(data.dtype, np.integer):
        raise TypeError(
            "Masked integer arrays (integer type datasets with missing values) are not supported by pm.Data() / pm.Model.set_data() at this time. \n"
            "Consider if using a float type fits your use case. \n"
            "Alternatively, if you want to benefit from automatic imputation in pyMC, pass a masked array directly to `observed=` parameter when defining a distribution."
        )

    return data.filled(fill_value=np.nan)


def convert_observed_data(data):
"""Convert user provided dataset to accepted formats."""

Expand Down
27 changes: 27 additions & 0 deletions tests/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,33 @@ def test_get_data():
assert type(data) == io.BytesIO


def test_masked_data_mutable():
    """A masked float array given to ``pm.MutableData`` warns and yields NaN for the masked entry."""
    masked_input = np.ma.MaskedArray([1.0, 2.0, 3], [0, 0, 1])
    nan_filled = np.array([1, 2, np.nan])

    with pm.Model():
        # The conversion is expected to announce itself with a UserWarning.
        with pytest.warns(UserWarning, match="masked arrays"):
            stored = pm.MutableData("test", masked_input).get_value()

    np.testing.assert_array_equal(stored, nan_filled)


def test_masked_data_constant():
    """A masked float array given to ``pm.ConstantData`` warns and yields NaN for the masked entry."""
    masked_input = np.ma.MaskedArray([1.0, 2.0, 3], [0, 0, 1])
    nan_filled = np.array([1, 2, np.nan])

    with pm.Model():
        # The conversion is expected to announce itself with a UserWarning.
        with pytest.warns(UserWarning, match="masked arrays"):
            stored = pm.ConstantData("test", masked_input).data

    np.testing.assert_array_equal(stored, nan_filled)


def test_masked_integer_data():
with pm.Model():
data = np.ma.MaskedArray([1, 2, 3], [0, 0, 1])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Integers should be fine, otherwise we can't input discrete variables?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately, we can't - you cannot have an integer NumPy array with nan values, i.e. this throws an error: `np.array([1, 2, 3, np.nan], dtype=int)`. That's because nan is strictly a float concept. So yes, we would not be able to allow users to pass an integer masked array into pm.Data(). If they want to benefit from automatic imputation, they can today (and will still be able to after this PR) pass a masked integer array directly into the `observed` parameter of an RV. I have an error message in the code that explains the options:

28d15f8#diff-823b37f218229d363550b4cc387cfffa180c5c6e0e5ad0e174f2f0be7aa4692aR102

if isinstance(data, np.ma.MaskedArray):
        if "int" in str(data.dtype):
            raise TypeError(
                "Masked integer arrays (integer type datasets with missing values) are not supported by pm.Data() / pm.Model.set_data() at this time. \n"
                "Consider if using a float type fits your use case. \n"
                "Alternatively, if you want to benefit from automatic imputation in pyMC, pass a masked array directly to `observed=` parameter when defining a distribution."
            )
        else:
            ret = data.filled(fill_value=np.nan)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wasn't clear. Is any error raised if a user passes float observed values to discrete variables? I think it works just fine.

with pytest.raises(TypeError, match="Masked integer"):
pm.ConstantData("test", data)
with pytest.raises(TypeError, match="Masked integer"):
pm.MutableData("test", data)


class _DataSampler:
"""
Not for users
Expand Down
21 changes: 21 additions & 0 deletions tests/test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -967,6 +967,27 @@ def test_set_data_constant_shape_error():
pmodel.set_data("y", np.arange(10))


def test_set_data_masked_array():
    """``Model.set_data`` warns on a masked float array and stores NaN for the masked entry."""
    masked_input = np.ma.MaskedArray([1.0, 2.0, 3], [0, 0, 1])
    nan_filled = np.array([1.0, 2.0, np.nan])

    with pm.Model() as pmodel:
        shared = pm.MutableData("test", np.zeros(4))

    # Replacing the data with a masked array should emit the conversion warning.
    with pytest.warns(UserWarning, match="masked arrays"):
        pmodel.set_data("test", masked_input)

    np.testing.assert_array_equal(shared.get_value(), nan_filled)


def test_set_data_masked_integer_array():
    """``Model.set_data`` warns and then raises ``TypeError`` for a masked integer array."""
    masked_ints = np.ma.MaskedArray([1, 2, 3], [0, 0, 1])

    with pm.Model() as pmodel:
        pm.MutableData("test", np.zeros(4))
        # Both the conversion warning and the integer rejection are expected.
        with pytest.warns(UserWarning, match="masked arrays"), pytest.raises(
            TypeError, match="Masked integer"
        ):
            pmodel.set_data("test", masked_ints)


def test_model_deprecation_warning():
with pm.Model() as m:
x = pm.Normal("x", 0, 1, size=2)
Expand Down
20 changes: 20 additions & 0 deletions tests/test_pytensorf.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
replace_rvs_by_values,
reseed_rngs,
rvs_to_value_vars,
unmask_masked_data,
walk_model,
)
from pymc.testing import assert_no_rvs
Expand Down Expand Up @@ -269,6 +270,25 @@ def test_convert_observed_data(input_dtype):
assert isinstance(wrapped, TensorVariable)


def test_unmask_masked_data():
    """Exercise ``unmask_masked_data`` on plain, masked-float and masked-integer input."""
    # Non-masked data comes back with the same contents.
    plain = np.array([1, 2, 3])
    np.testing.assert_array_equal(unmask_masked_data(plain), np.array([1, 2, 3]))

    # Masked float data: the masked entry is replaced by NaN.
    masked_floats = np.ma.MaskedArray([1.0, 2.0, 3.0], [0, 0, 1])
    np.testing.assert_array_equal(
        unmask_masked_data(masked_floats),
        np.array([1.0, 2.0, np.nan]),
    )

    # Masked integer data is rejected.
    masked_ints = np.ma.MaskedArray([1, 2, 3], [0, 0, 1])
    with pytest.raises(TypeError, match="Masked integer"):
        unmask_masked_data(masked_ints)


def test_pandas_to_array_pandas_index():
data = pd.Index([1, 2, 3])
result = convert_observed_data(data)
Expand Down