Skip to content

concat_str raises TypeError on pandas>=3 #3498

@dangotbanned

Description

Discovered a concat_str bug when this altair test started failing while looking at #3013 (per #3497 (comment))

Related

Repro

Adapted from #2097 (comment)

import narwhals as nw
import pandas as pd
import pyarrow as pa

native_pa = pa.table(
    {"store": ["foo", "bar"], "item": ["axe", "saw"]},
    schema=pa.schema([("store", pa.large_string()), ("item", pa.large_string())]),
)
native_pd = native_pa.to_pandas(types_mapper=pd.ArrowDtype)

expr = nw.concat_str("store", "item", separator="-").alias("store_item")

result = nw.from_native(native_pa).with_columns(expr)
print(result)
Output

┌───────────────────────────────────┐
|        Narwhals DataFrame         |
|-----------------------------------|
|pyarrow.Table                      |
|store: large_string                |
|item: large_string                 |
|store_item: large_string           |
|----                               |
|store: [["foo","bar"]]             |
|item: [["axe","saw"]]              |
|store_item: [["foo-axe","bar-saw"]]|
└───────────────────────────────────┘

result = nw.from_native(native_pd).with_columns(expr)
print(result)
TypeError: operation 'add' not supported for dtype 'large_string[pyarrow]' with object of type <class 'str'>

Traceback

This is the full test-run failure, included to show that it only failed on the pandas[pyarrow] backend.

Show 207 lines (sorry)

=========================================================== FAILURES ============================================================ 
_________________________________________ test_reader_cache_exhaustive[pandas[pyarrow]] _________________________________________ 
[gw16] win32 -- Python 3.12.8 C:/Users/danie/Documents/GitHub/altair/.venv/Scripts/python.exe

self = <ArrowExtensionArray>
['608ba6d51fa70584c3fa1d31eb94533302553838',
 '719e73406cfc08f16dda651513ae1113edd75845',
 '11ae...6f2713c94c0c284039506ca2d4f3dee',
 'd3df33e12be0d0544c95f1bd47005add4b7010be']
Length: 73, dtype: large_string[pyarrow]
other = <pyarrow.StringScalar: ''>, op = <built-in function add>
arrow_funcs = {'add': <function add_checked at 0x0000026D46242C00>, 'divmod': NotImplemented, 'floordiv': <function <lambda> at 0x0000026D4640DF80>, 'mod': NotImplemented, ...}

    def _evaluate_op_method(self, other, op, arrow_funcs) -> Self:
        pa_type = self._pa_array.type
        other_original = other       
        other = self._box_pa(other)

        if (
            pa.types.is_string(pa_type)
            or pa.types.is_large_string(pa_type)
            or pa.types.is_binary(pa_type)
        ):
            if op in [operator.add, roperator.radd]:
                sep = pa.scalar("", type=pa_type)
                try:
                    if op is operator.add:
>                       result = pc.binary_join_element_wise(self._pa_array, other, sep)
                                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.venv/Lib/site-packages/pandas/core/arrays/arrow/array.py:986:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
.venv/Lib/site-packages/pyarrow/compute.py:271: in wrapper
    return func.call(args, options, memory_pool)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
pyarrow/_compute.pyx:399: in pyarrow._compute.Function.call
    ???
pyarrow/error.pxi:155: in pyarrow.lib.pyarrow_internal_check_status
    ???
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

>   ???
E   pyarrow.lib.ArrowNotImplementedError: Function 'binary_join_element_wise' has no kernel matching input types (large_string, string, large_string)

pyarrow/error.pxi:92: ArrowNotImplementedError

The above exception was the direct cause of the following exception:

backend = 'pandas[pyarrow]', monkeypatch = <_pytest.monkeypatch.MonkeyPatch object at 0x0000026D8A187590>
tmp_path = WindowsPath('C:/Users/danie/AppData/Local/Temp/pytest-of-danie/pytest-12169/popen-gw16/test_reader_cache_exhaustive_p1')
polars_loader = Loader[polars]

    @datasets_debug
    @backends
    def test_reader_cache_exhaustive(
        backend: _Backend,
        monkeypatch: pytest.MonkeyPatch,
        tmp_path: Path,
        polars_loader: PolarsLoader,
    ) -> None:
        """
        Fully populate and then purge the cache for all backends.

        Notes
        -----
        - Does not attempt to read the files
        - Checking we can support pre-downloading and safely deleting
            - Requests work the same for all backends
            - The logic for detecting the cache contents uses ``narwhals``
            - Here, we're testing that these ``narwhals`` operations are consistent
        - `DatasetCache.download_all` is expensive for CI, so aiming for it to run **at most once**
            - 34-45s per call (4x backends)
        """
        polars_loader.cache.download_all()
        CLONED: Path = tmp_path / "clone"
        CLONED.mkdir(exist_ok=True)

        # Copy the cache contents
        import shutil

        shutil.copytree(polars_loader.cache.path, CLONED, dirs_exist_ok=True)

        monkeypatch.setenv(CACHE_ENV_VAR, str(tmp_path))
        load = Loader.from_backend(backend)
        assert load.cache.is_active()
        cache_dir = load.cache.path
        assert cache_dir == tmp_path
        assert tuple(load.cache) == (CLONED,)
        load.cache.path = CLONED
        cached_paths = tuple(load.cache)
        assert cached_paths != ()

        # NOTE: Approximating all datasets downloaded (minimum expected count)
        assert len(cached_paths) >= 70
        assert all(bool(fp.exists() and fp.stat().st_size) for fp in load.cache)
        # NOTE: Confirm this is a no-op (already downloaded)
        load.cache.download_all()
        assert len(cached_paths) == len(tuple(load.cache))

        # NOTE: Ensure unrelated files in the directory are not removed during cache clearing
        test_file: Path = tmp_path / "test_file.json"
        test_file.touch(exist_ok=False)
>       load.cache.clear()

tests/test_datasets.py:493:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
altair/datasets/_cache.py:324: in clear
    .select(nw.concat_str("sha", "suffix").alias("sha_suffix"))
     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.venv/Lib/site-packages/narwhals/dataframe.py:2726: in select
    return super().select(*exprs, **named_exprs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.venv/Lib/site-packages/narwhals/dataframe.py:240: in select
    return self._with_compliant(self._compliant_frame.select(*compliant_exprs))
                                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.venv/Lib/site-packages/narwhals/_pandas_like/dataframe.py:448: in select
    new_series = self._evaluate_exprs(*exprs)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.venv/Lib/site-packages/narwhals/_compliant/dataframe.py:367: in _evaluate_exprs
    return tuple(chain.from_iterable(self._evaluate_expr(expr) for expr in exprs))  # pyright: ignore[reportArgumentType]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.venv/Lib/site-packages/narwhals/_compliant/dataframe.py:367: in <genexpr>
    return tuple(chain.from_iterable(self._evaluate_expr(expr) for expr in exprs))  # pyright: ignore[reportArgumentType]
                                     ^^^^^^^^^^^^^^^^^^^^^^^^^
.venv/Lib/site-packages/narwhals/_compliant/dataframe.py:380: in _evaluate_expr
    result = expr(self)
             ^^^^^^^^^^
.venv/Lib/site-packages/narwhals/_compliant/expr.py:247: in __call__
    return self._call(df)
           ^^^^^^^^^^^^^^
.venv/Lib/site-packages/narwhals/_compliant/expr.py:651: in <lambda>
    lambda df: [series.alias(name) for series in self(df)],
                                                 ^^^^^^^^
.venv/Lib/site-packages/narwhals/_compliant/expr.py:247: in __call__
    return self._call(df)
           ^^^^^^^^^^^^^^
.venv/Lib/site-packages/narwhals/_pandas_like/namespace.py:333: in func
    result = reduce(lambda x, y: x + separator + y, series).zip_with(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.venv/Lib/site-packages/narwhals/_pandas_like/namespace.py:333: in <lambda>
    result = reduce(lambda x, y: x + separator + y, series).zip_with(
                                 ^^^^^^^^^^^^^
.venv/Lib/site-packages/narwhals/_pandas_like/series.py:443: in __add__
    return self._with_binary(operator.add, other)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.venv/Lib/site-packages/narwhals/_pandas_like/series.py:406: in _with_binary
    op(ser, other_native), preserve_broadcast=preserve_broadcast
    ^^^^^^^^^^^^^^^^^^^^^
.venv/Lib/site-packages/pandas/core/ops/common.py:85: in new_method
    return method(self, other)
           ^^^^^^^^^^^^^^^^^^^
.venv/Lib/site-packages/pandas/core/arraylike.py:190: in __add__
    return self._arith_method(other, operator.add)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.venv/Lib/site-packages/pandas/core/series.py:6751: in _arith_method
    return base.IndexOpsMixin._arith_method(self, other, op)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.venv/Lib/site-packages/pandas/core/base.py:1644: in _arith_method
    result = ops.arithmetic_op(lvalues, rvalues, op)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.venv/Lib/site-packages/pandas/core/ops/array_ops.py:279: in arithmetic_op
    res_values = op(left, right)
                 ^^^^^^^^^^^^^^^
.venv/Lib/site-packages/pandas/core/ops/common.py:85: in new_method
    return method(self, other)
           ^^^^^^^^^^^^^^^^^^^
.venv/Lib/site-packages/pandas/core/arraylike.py:190: in __add__
    return self._arith_method(other, operator.add)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.venv/Lib/site-packages/pandas/core/arrays/arrow/array.py:1079: in _arith_method
    result = self._evaluate_op_method(other, op, ARROW_ARITHMETIC_FUNCS)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <ArrowExtensionArray>
['608ba6d51fa70584c3fa1d31eb94533302553838',
 '719e73406cfc08f16dda651513ae1113edd75845',
 '11ae...6f2713c94c0c284039506ca2d4f3dee',
 'd3df33e12be0d0544c95f1bd47005add4b7010be']
Length: 73, dtype: large_string[pyarrow]
other = <pyarrow.StringScalar: ''>, op = <built-in function add>
arrow_funcs = {'add': <function add_checked at 0x0000026D46242C00>, 'divmod': NotImplemented, 'floordiv': <function <lambda> at 0x0000026D4640DF80>, 'mod': NotImplemented, ...}

    def _evaluate_op_method(self, other, op, arrow_funcs) -> Self:
        pa_type = self._pa_array.type
        other_original = other
        other = self._box_pa(other)

        if (
            pa.types.is_string(pa_type)
            or pa.types.is_large_string(pa_type)
            or pa.types.is_binary(pa_type)
        ):
            if op in [operator.add, roperator.radd]:
                sep = pa.scalar("", type=pa_type)
                try:
                    if op is operator.add:
                        result = pc.binary_join_element_wise(self._pa_array, other, sep)
                    elif op is roperator.radd:
                        result = pc.binary_join_element_wise(other, self._pa_array, sep)
                except pa.ArrowNotImplementedError as err:
>                   raise TypeError(
                        self._op_method_error_message(other_original, op)
                    ) from err
E                   TypeError: operation 'add' not supported for dtype 'large_string[pyarrow]' with object of type <class 'str'>  

.venv/Lib/site-packages/pandas/core/arrays/arrow/array.py:990: TypeError

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions