diff --git a/ibis/expr/datatypes/core.py b/ibis/expr/datatypes/core.py index 4264b00168f6..2514060eed37 100644 --- a/ibis/expr/datatypes/core.py +++ b/ibis/expr/datatypes/core.py @@ -41,52 +41,58 @@ @overload -def dtype(value: type[int] | Literal["int"], nullable: bool = True) -> Int64: ... +def dtype(value: type[int] | Literal["int"], nullable: bool | None = None) -> Int64: ... @overload def dtype( - value: type[str] | Literal["str", "string"], nullable: bool = True + value: type[str] | Literal["str", "string"], nullable: bool | None = None ) -> String: ... @overload def dtype( - value: type[bool] | Literal["bool", "boolean"], nullable: bool = True + value: type[bool] | Literal["bool", "boolean"], nullable: bool | None = None ) -> Boolean: ... @overload -def dtype(value: type[bytes] | Literal["bytes"], nullable: bool = True) -> Binary: ... +def dtype( + value: type[bytes] | Literal["bytes"], nullable: bool | None = None +) -> Binary: ... @overload -def dtype(value: type[Real] | Literal["float"], nullable: bool = True) -> Float64: ... +def dtype( + value: type[Real] | Literal["float"], nullable: bool | None = None +) -> Float64: ... @overload def dtype( - value: type[pydecimal.Decimal] | Literal["decimal"], nullable: bool = True + value: type[pydecimal.Decimal] | Literal["decimal"], nullable: bool | None = None ) -> Decimal: ... @overload def dtype( - value: type[pydatetime.datetime] | Literal["timestamp"], nullable: bool = True + value: type[pydatetime.datetime] | Literal["timestamp"], + nullable: bool | None = None, ) -> Timestamp: ... @overload def dtype( - value: type[pydatetime.date] | Literal["date"], nullable: bool = True + value: type[pydatetime.date] | Literal["date"], nullable: bool | None = None ) -> Date: ... @overload def dtype( - value: type[pydatetime.time] | Literal["time"], nullable: bool = True + value: type[pydatetime.time] | Literal["time"], nullable: bool | None = None ) -> Time: ... 
@overload def dtype( - value: type[pydatetime.timedelta] | Literal["interval"], nullable: bool = True + value: type[pydatetime.timedelta] | Literal["interval"], + nullable: bool | None = None, ) -> Interval: ... @overload def dtype( - value: type[pyuuid.UUID] | Literal["uuid"], nullable: bool = True + value: type[pyuuid.UUID] | Literal["uuid"], nullable: bool | None = None ) -> UUID: ... @overload def dtype( value: DataType | str | np.dtype | ExtensionDtype | pl.DataType | pa.DataType, - nullable: bool = True, + nullable: bool | None = None, ) -> DataType: ... @lazy_singledispatch -def dtype(value, nullable=True) -> DataType: +def dtype(value, nullable: bool | None = None) -> DataType: """Create a DataType object. Parameters @@ -96,21 +102,42 @@ def dtype(value, nullable=True) -> DataType: strings, python type annotations, numpy dtypes, pandas dtypes, and pyarrow types. nullable - Whether the type should be nullable. Defaults to True. - If `value` is a string prefixed by "!", the type is always non-nullable. + Whether the resulting type should be nullable. + If `None`, we try to infer nullability from the input value. + For example, if `value` is a string starting with '!', the resulting type + will be non-nullable. + For inputs without an explicit nullability (like the python type `int` or + numpy dtype of `np.int32`), we default to `nullable=True`. Examples -------- >>> import ibis >>> ibis.dtype("int32") Int32(nullable=True) + + Prefixing the type with "!" 
makes it non-nullable: + >>> ibis.dtype("!int32") Int32(nullable=False) - >>> ibis.dtype("array") - Array(value_type=Float64(nullable=True), length=None, nullable=True) + + We support a rich string syntax for nested and parametric types: + + >>> ibis.dtype("array") + Array(value_type=Float64(nullable=False), length=None, nullable=True) + >>> ibis.dtype("!struct") + Struct([('a', Interval(unit=, nullable=True)), ('b', Boolean(nullable=False))], nullable=False) + >>> ibis.dtype("map") + Map(key_type=Timestamp(timezone='America/Anchorage', scale=6, nullable=True), value_type=Boolean(nullable=True), nullable=True) + + The function is idempotent (AKA is a no-op when passed a DataType): + >>> t = ibis.dtype("int32") + >>> ibis.dtype(t) is t + True DataType objects may also be created from Python types: + >>> ibis.dtype(int) + Int64(nullable=True) >>> ibis.dtype(int, nullable=False) Int64(nullable=False) >>> ibis.dtype(list[float]) @@ -121,36 +148,52 @@ def dtype(value, nullable=True) -> DataType: >>> import pyarrow as pa >>> ibis.dtype(pa.int32()) Int32(nullable=True) + >>> ibis.dtype(pa.int32(), nullable=False) + Int32(nullable=False) + + The `nullable` parameter may be used to override the nullability: + + >>> ibis.dtype("!int32", nullable=True) + Int32(nullable=True) + >>> i = ibis.dtype("int32") + >>> i + Int32(nullable=True) + >>> ibis.dtype(i, nullable=False) + Int32(nullable=False) """ if isinstance(value, DataType): - return value + if nullable is None: + return value + return value.copy(nullable=nullable) else: + if nullable is None: + nullable = True return DataType.from_typehint(value, nullable) @dtype.register(str) -def from_string(value, nullable: bool = True): +def from_string(value, nullable=None): return DataType.from_string(value, nullable) @dtype.register("numpy.dtype") -def from_numpy_dtype(value, nullable=True): +def from_numpy_dtype(value, nullable=None): return DataType.from_numpy(value, nullable) 
@dtype.register("pandas.core.dtypes.base.ExtensionDtype") -def from_pandas_extension_dtype(value, nullable=True): +def from_pandas_extension_dtype(value, nullable=None): return DataType.from_pandas(value, nullable) @dtype.register("pyarrow.lib.DataType") -def from_pyarrow(value, nullable=True): +def from_pyarrow(value, nullable=None): return DataType.from_pyarrow(value, nullable) @dtype.register("polars.datatypes.classes.DataTypeClass") -def from_polars(value, nullable=True): +def from_polars(value, nullable=None): return DataType.from_polars(value, nullable) @@ -228,7 +271,7 @@ def castable(self, to: DataType, **kwargs) -> bool: return castable(self, to, **kwargs) @classmethod - def from_string(cls, value: str, nullable: bool = True) -> Self: + def from_string(cls, value: str, nullable: bool | None = None) -> Self: from ibis.expr.datatypes.parse import parse try: @@ -236,7 +279,7 @@ def from_string(cls, value: str, nullable: bool = True) -> Self: except SyntaxError: raise TypeError(f"{value!r} cannot be parsed as a datatype") - if not nullable: + if nullable is not None: return typ.copy(nullable=nullable) return typ @@ -309,7 +352,7 @@ def from_typehint(cls, typ, nullable=True) -> Self: raise TypeError(f"Value {typ!r} is not a valid datatype") @classmethod - def from_numpy(cls, numpy_type: np.dtype, nullable: bool = True) -> Self: + def from_numpy(cls, numpy_type: np.dtype, nullable: bool | None = None) -> Self: """Return the equivalent ibis datatype.""" from ibis.formats.numpy import NumpyType @@ -317,7 +360,7 @@ def from_numpy(cls, numpy_type: np.dtype, nullable: bool = True) -> Self: @classmethod def from_pandas( - cls, pandas_type: np.dtype | ExtensionDtype, nullable: bool = True + cls, pandas_type: np.dtype | ExtensionDtype, nullable: bool | None = None ) -> Self: """Return the equivalent ibis datatype.""" from ibis.formats.pandas import PandasType @@ -325,7 +368,9 @@ def from_pandas( return PandasType.to_ibis(pandas_type, nullable=nullable) @classmethod - 
def from_pyarrow(cls, arrow_type: pa.DataType, nullable: bool = True) -> Self: + def from_pyarrow( + cls, arrow_type: pa.DataType, nullable: bool | None = None + ) -> Self: """Return the equivalent ibis datatype.""" from ibis.formats.pyarrow import PyArrowType diff --git a/ibis/expr/datatypes/tests/test_core.py b/ibis/expr/datatypes/tests/test_core.py index 900f93065002..959404b8ae46 100644 --- a/ibis/expr/datatypes/tests/test_core.py +++ b/ibis/expr/datatypes/tests/test_core.py @@ -74,7 +74,9 @@ def test_dtype(spec, expected): marks=pytest.mark.xfail(sys.version_info < (3, 10), reason="python 3.9"), ), (lambda: ("!int",), dt.Int64(nullable=False)), - (lambda: ("!int", True), dt.Int64(nullable=False)), # "!" overrides `nullable` + (lambda: ("!int", None), dt.Int64(nullable=False)), + (lambda: ("!int", False), dt.Int64(nullable=False)), + (lambda: ("!int", True), dt.Int64(nullable=True)), ], ) def test_nullable_dtype(args, expected): @@ -105,8 +107,73 @@ def test_bogus_union(): (dt.Time, dt.time), ], ) -def test_dtype_from_classes(klass, expected): - assert dt.dtype(klass) == expected +@pytest.mark.parametrize( + ("nullable", "expected_nullable"), + [ + (True, True), + (False, False), + (None, True), + ], +) +def test_dtype_from_classes(klass, expected, nullable, expected_nullable): + assert dt.dtype(klass, nullable=nullable) == expected.copy( + nullable=expected_nullable + ) + + +@pytest.mark.parametrize( + ("inp", "nullable", "expected"), + [ + (dt.Null(nullable=True), True, dt.Null(nullable=True)), + (dt.Null(nullable=True), False, dt.Null(nullable=False)), + (dt.Null(nullable=True), None, dt.Null(nullable=True)), + (dt.Null(nullable=False), True, dt.Null(nullable=True)), + (dt.Null(nullable=False), False, dt.Null(nullable=False)), + (dt.Null(nullable=False), None, dt.Null(nullable=False)), + (dt.Int16(nullable=True), True, dt.Int16(nullable=True)), + (dt.Int16(nullable=True), False, dt.Int16(nullable=False)), + (dt.Int16(nullable=True), None, 
dt.Int16(nullable=True)), + (dt.Int16(nullable=False), True, dt.Int16(nullable=True)), + (dt.Int16(nullable=False), False, dt.Int16(nullable=False)), + (dt.Int16(nullable=False), None, dt.Int16(nullable=False)), + # The nullability of the element type is NEVER changed, + # only the outer nullability can be changed. + ( + dt.Array(dt.Int16(nullable=True), nullable=True), + True, + dt.Array(dt.Int16(nullable=True), nullable=True), + ), + ( + dt.Array(dt.Int16(nullable=True), nullable=True), + False, + dt.Array(dt.Int16(nullable=True), nullable=False), + ), + ( + dt.Array(dt.Int16(nullable=True), nullable=True), + None, + dt.Array(dt.Int16(nullable=True), nullable=True), + ), + ( + dt.Array(dt.Int16(nullable=False), nullable=True), + True, + dt.Array(dt.Int16(nullable=False), nullable=True), + ), + ( + dt.Array(dt.Int16(nullable=False), nullable=True), + False, + dt.Array(dt.Int16(nullable=False), nullable=False), + ), + ( + dt.Array(dt.Int16(nullable=False), nullable=True), + None, + dt.Array(dt.Int16(nullable=False), nullable=True), + ), + ], +) +def test_dtype_from_datatype_instance( + inp: dt.DataType, nullable: bool | None, expected: dt.DataType +): + assert dt.dtype(inp, nullable=nullable) == expected @pytest.mark.parametrize( diff --git a/ibis/expr/datatypes/tests/test_parse.py b/ibis/expr/datatypes/tests/test_parse.py index 4eced5513750..62c74906fc45 100644 --- a/ibis/expr/datatypes/tests/test_parse.py +++ b/ibis/expr/datatypes/tests/test_parse.py @@ -46,6 +46,11 @@ ) def test_primitive_from_string(nullable, spec, expected): assert dt.dtype(spec, nullable=nullable) == expected(nullable=nullable) + assert dt.dtype(spec, nullable=None) == expected(nullable=True) + assert dt.dtype(spec) == expected(nullable=True) + assert dt.dtype("!" + spec, nullable=nullable) == expected(nullable=nullable) + assert dt.dtype("!" + spec, nullable=None) == expected(nullable=False) + assert dt.dtype("!" 
+ spec) == expected(nullable=False) @pytest.mark.parametrize( diff --git a/ibis/formats/__init__.py b/ibis/formats/__init__.py index 0e76b1db735e..503c08aa6cea 100644 --- a/ibis/formats/__init__.py +++ b/ibis/formats/__init__.py @@ -38,7 +38,7 @@ def from_ibis(cls, dtype: DataType) -> T: raise NotImplementedError @classmethod - def to_ibis(cls, typ: T, nullable: bool = True) -> DataType: + def to_ibis(cls, typ: T, nullable: bool | None = None) -> DataType: """Convert a format-specific type object to an Ibis DataType. Parameters @@ -47,6 +47,8 @@ def to_ibis(cls, typ: T, nullable: bool = True) -> DataType: The format-specific type object to convert. nullable Whether the Ibis DataType should be nullable. + If `None`, the nullability will be inferred from `typ` if possible. + If inference is not possible, we assume `nullable=True`. Returns ------- @@ -56,7 +58,7 @@ def to_ibis(cls, typ: T, nullable: bool = True) -> DataType: raise NotImplementedError @classmethod - def from_string(cls, text: str, nullable: bool = True) -> DataType: + def from_string(cls, text: str, nullable: bool | None = None) -> DataType: """Convert a backend-specific string representation into an Ibis DataType. Parameters @@ -65,6 +67,7 @@ def from_string(cls, text: str, nullable: bool = True) -> DataType: The backend-specific string representation to convert. nullable Whether the Ibis DataType should be nullable. + If `None`, the specific type mapper will choose a default. Returns ------- diff --git a/ibis/formats/numpy.py b/ibis/formats/numpy.py index f04b15809dbd..5f18efca89f7 100644 --- a/ibis/formats/numpy.py +++ b/ibis/formats/numpy.py @@ -36,7 +36,13 @@ class NumpyType(TypeMapper[np.dtype]): @classmethod - def to_ibis(cls, typ: np.dtype, nullable: bool = True) -> dt.DataType: + def to_ibis(cls, typ: np.dtype, nullable: bool | None = None) -> dt.DataType: + # numpy's type system doesn't keep track of nullability. 
+ # We accept nullable=None to be compatible with the rest of TypeMapper.to_ibis() + # implementations, but we treat None as True, since we can't infer nullability + # from a numpy dtype. + if nullable is None: + nullable = True if np.issubdtype(typ, np.datetime64): # TODO(kszucs): the following code provedes proper timestamp roundtrips # between ibis and numpy/pandas but breaks the test suite at several diff --git a/ibis/formats/pandas.py b/ibis/formats/pandas.py index 68423a5d2f4c..e43182318845 100644 --- a/ibis/formats/pandas.py +++ b/ibis/formats/pandas.py @@ -31,7 +31,13 @@ class PandasType(NumpyType): @classmethod - def to_ibis(cls, typ, nullable=True): + def to_ibis(cls, typ, nullable: bool | None = None): + # pandas's type system doesn't keep track of nullability. + # We accept nullable=None to be compatible with the rest of TypeMapper.to_ibis() + # implementations, but we treat None as True, since we can't infer nullability + # from a pandas dtype. + if nullable is None: + nullable = True if isinstance(typ, pdt.DatetimeTZDtype): return dt.Timestamp(timezone=str(typ.tz), nullable=nullable) elif pdt.is_datetime64_dtype(typ): diff --git a/ibis/formats/polars.py b/ibis/formats/polars.py index b49ceb7e6797..ebc2885fe941 100644 --- a/ibis/formats/polars.py +++ b/ibis/formats/polars.py @@ -39,9 +39,14 @@ class PolarsType(TypeMapper): @classmethod - def to_ibis(cls, typ: pl.DataType, nullable=True) -> dt.DataType: + def to_ibis(cls, typ: pl.DataType, nullable: bool | None = None) -> dt.DataType: """Convert a polars type to an ibis type.""" - + # polars's type system doesn't keep track of nullability. + # We accept nullable=None to be compatible with the rest of TypeMapper.to_ibis() + # implementations, but we treat None as True, since we can't infer nullability + # from a polars dtype. 
+ if nullable is None: + nullable = True base_type = typ.base_type() if base_type in (pl.Categorical, pl.Enum): return dt.String(nullable=nullable) diff --git a/ibis/formats/pyarrow.py b/ibis/formats/pyarrow.py index 5ce9205f15f2..c2b59588389c 100644 --- a/ibis/formats/pyarrow.py +++ b/ibis/formats/pyarrow.py @@ -76,10 +76,16 @@ class PyArrowType(TypeMapper): @classmethod - def to_ibis(cls, typ: pa.DataType, nullable=True) -> dt.DataType: + def to_ibis(cls, typ: pa.DataType, nullable: bool | None = None) -> dt.DataType: """Convert a pyarrow type to an ibis type.""" + # arrow's type system doesn't keep track of nullability. + # We accept nullable=None to be compatible with the rest of TypeMapper.to_ibis() + # implementations, but we treat None as True, since we can't infer nullability + # from a pyarrow dtype. + if nullable is None: + nullable = True if pa.types.is_null(typ): - return dt.null + return dt.null(nullable=nullable) elif pa.types.is_decimal(typ): return dt.Decimal(typ.precision, typ.scale, nullable=nullable) elif pa.types.is_timestamp(typ): diff --git a/ibis/formats/tests/test_numpy.py b/ibis/formats/tests/test_numpy.py index 1d29c80dbe3b..da9af9dc0ea2 100644 --- a/ibis/formats/tests/test_numpy.py +++ b/ibis/formats/tests/test_numpy.py @@ -115,6 +115,7 @@ def test_schema_from_numpy(numpy_schema): assert NumpyType.from_ibis(ibis_schema[name]) == numpy_type +@pytest.mark.parametrize("nullable", [True, False, None]) @pytest.mark.parametrize( ("numpy_dtype", "ibis_dtype"), [ @@ -135,8 +136,10 @@ def test_schema_from_numpy(numpy_schema): (np.datetime64, dt.timestamp), ], ) -def test_dtype_from_numpy(numpy_dtype, ibis_dtype): - assert NumpyType.to_ibis(np.dtype(numpy_dtype)) == ibis_dtype +def test_dtype_from_numpy(numpy_dtype, ibis_dtype, nullable): + if nullable is False: + ibis_dtype = ibis_dtype.copy(nullable=False) + assert NumpyType.to_ibis(np.dtype(numpy_dtype), nullable=nullable) == ibis_dtype def test_dtype_from_numpy_dtype_timedelta(): diff --git 
a/ibis/formats/tests/test_pandas.py b/ibis/formats/tests/test_pandas.py index 13c33da6bcfe..76eac8221f92 100644 --- a/ibis/formats/tests/test_pandas.py +++ b/ibis/formats/tests/test_pandas.py @@ -48,6 +48,7 @@ def test_dtype_to_pandas(pandas_type, ibis_type): assert PandasType.from_ibis(ibis_type) == pandas_type +@pytest.mark.parametrize("nullable", [True, False, None]) @pytest.mark.parametrize( ("pandas_type", "ibis_type"), [ @@ -72,9 +73,11 @@ def test_dtype_to_pandas(pandas_type, ibis_type): ], ids=str, ) -def test_dtype_from_pandas_arrow_dtype(pandas_type, ibis_type): +def test_dtype_from_pandas_arrow_dtype(pandas_type, ibis_type, nullable): + if nullable is False: + ibis_type = ibis_type.copy(nullable=False) series = pd.Series([], dtype=f"{pandas_type}[pyarrow]") - assert PandasType.to_ibis(series.dtype) == ibis_type + assert PandasType.to_ibis(series.dtype, nullable=nullable) == ibis_type def test_dtype_from_pandas_arrow_string_dtype(): diff --git a/ibis/formats/tests/test_polars.py b/ibis/formats/tests/test_polars.py index b8eafb488b22..e42b328d1ce0 100644 --- a/ibis/formats/tests/test_polars.py +++ b/ibis/formats/tests/test_polars.py @@ -66,7 +66,10 @@ ) def test_to_from_ibis_type(ibis_dtype, polars_type): assert PolarsType.from_ibis(ibis_dtype) == polars_type - assert PolarsType.to_ibis(polars_type) == ibis_dtype + assert PolarsType.from_ibis(ibis_dtype.copy(nullable=False)) == polars_type + assert PolarsType.to_ibis(polars_type) == ibis_dtype(nullable=True) + assert PolarsType.to_ibis(polars_type, nullable=None) == ibis_dtype(nullable=True) + assert PolarsType.to_ibis(polars_type, nullable=True) == ibis_dtype(nullable=True) assert PolarsType.to_ibis(polars_type, nullable=False) == ibis_dtype(nullable=False) diff --git a/ibis/formats/tests/test_pyarrow.py b/ibis/formats/tests/test_pyarrow.py index 1d3efd04babb..1311a1d4540f 100644 --- a/ibis/formats/tests/test_pyarrow.py +++ b/ibis/formats/tests/test_pyarrow.py @@ -18,18 +18,6 @@ PyArrowSchema = 
ipa.PyArrowSchema PyArrowType = ipa.PyArrowType - -def assert_dtype_roundtrip(arrow_type, ibis_type=None, restored_type=None): - dtype = PyArrowType.to_ibis(arrow_type, nullable=False) - if ibis_type is not None: - assert dtype == ibis_type - - patyp = PyArrowType.from_ibis(dtype) - if restored_type is None: - restored_type = arrow_type - assert patyp == restored_type - - roundtripable_types = st.deferred( lambda: ( past.null_type @@ -50,9 +38,16 @@ def assert_dtype_roundtrip(arrow_type, ibis_type=None, restored_type=None): ) -@h.given(roundtripable_types) -def test_roundtripable_types(arrow_type): - assert_dtype_roundtrip(arrow_type) +@h.given( + arrow_type=roundtripable_types, + nullable_pair=st.sampled_from([(True, True), (False, False), (None, True)]), +) +def test_roundtripable_types(arrow_type, nullable_pair): + nullable, expected_nullable = nullable_pair + dtype = PyArrowType.to_ibis(arrow_type, nullable=nullable) + assert dtype.nullable is expected_nullable + restored = PyArrowType.from_ibis(dtype) + assert restored == arrow_type @pytest.mark.parametrize( @@ -81,7 +76,11 @@ def test_roundtripable_types(arrow_type): ], ) def test_non_roundtripable_types(arrow_type, ibis_type, restored_type): - assert_dtype_roundtrip(arrow_type, ibis_type, restored_type) + dtype = PyArrowType.to_ibis(arrow_type, nullable=False) + assert dtype == ibis_type + + patyp = PyArrowType.from_ibis(dtype) + assert patyp == restored_type @pytest.mark.parametrize("timezone", [None, "UTC"])