-
Notifications
You must be signed in to change notification settings - Fork 182
Open
Labels
bug: it raises an error but shouldn't · pandas-like: Issue is related to pandas-like backends
Description
Discovered a concat_str bug when this altair test started failing while looking at #3013 (per #3497 (comment))
Related
- api: use `large_string`/`large_list` when converting Narwhals types -> PyArrow? #2097 — Think it is the same issue
- still reproducing with
pandas(dtype_backend="pyarrow")
`test_reader_cache_exhaustive[pandas[pyarrow]]` - Test that failed doesn't run in CI
- Requires (
uv run task test-datasets)
Repro
Adapted from (#2097 (comment))
import narwhals as nw
import pandas as pd
import pyarrow as pa
native_pa = pa.table(
{"store": ["foo", "bar"], "item": ["axe", "saw"]},
schema=pa.schema([("store", pa.large_string()), ("item", pa.large_string())]),
)
native_pd = native_pa.to_pandas(types_mapper=pd.ArrowDtype)
expr = nw.concat_str("store", "item", separator="-").alias("store_item")
result = nw.from_native(native_pa).with_columns(expr)
print(result)

Output:
┌───────────────────────────────────┐
| Narwhals DataFrame |
|-----------------------------------|
|pyarrow.Table |
|store: large_string |
|item: large_string |
|store_item: large_string |
|---- |
|store: [["foo","bar"]] |
|item: [["axe","saw"]] |
|store_item: [["foo-axe","bar-saw"]]|
└───────────────────────────────────┘
result = nw.from_native(native_pd).with_columns(expr)
print(result)

TypeError: operation 'add' not supported for dtype 'large_string[pyarrow]' with object of type <class 'str'>

Traceback
This is the full test run failure, just to show it only failed on pandas[pyarrow]
Show 207 lines (sorry)
=========================================================== FAILURES ============================================================
_________________________________________ test_reader_cache_exhaustive[pandas[pyarrow]] _________________________________________
[gw16] win32 -- Python 3.12.8 C:/Users/danie/Documents/GitHub/altair/.venv/Scripts/python.exe
self = <ArrowExtensionArray>
['608ba6d51fa70584c3fa1d31eb94533302553838',
'719e73406cfc08f16dda651513ae1113edd75845',
'11ae...6f2713c94c0c284039506ca2d4f3dee',
'd3df33e12be0d0544c95f1bd47005add4b7010be']
Length: 73, dtype: large_string[pyarrow]
other = <pyarrow.StringScalar: ''>, op = <built-in function add>
arrow_funcs = {'add': <function add_checked at 0x0000026D46242C00>, 'divmod': NotImplemented, 'floordiv': <function <lambda> at 0x0000026D4640DF80>, 'mod': NotImplemented, ...}
def _evaluate_op_method(self, other, op, arrow_funcs) -> Self:
pa_type = self._pa_array.type
other_original = other
other = self._box_pa(other)
if (
pa.types.is_string(pa_type)
or pa.types.is_large_string(pa_type)
or pa.types.is_binary(pa_type)
):
if op in [operator.add, roperator.radd]:
sep = pa.scalar("", type=pa_type)
try:
if op is operator.add:
> result = pc.binary_join_element_wise(self._pa_array, other, sep)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.venv/Lib/site-packages/pandas/core/arrays/arrow/array.py:986:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
.venv/Lib/site-packages/pyarrow/compute.py:271: in wrapper
return func.call(args, options, memory_pool)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
pyarrow/_compute.pyx:399: in pyarrow._compute.Function.call
???
pyarrow/error.pxi:155: in pyarrow.lib.pyarrow_internal_check_status
???
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
> ???
E pyarrow.lib.ArrowNotImplementedError: Function 'binary_join_element_wise' has no kernel matching input types (large_string, string, large_string)
pyarrow/error.pxi:92: ArrowNotImplementedError
The above exception was the direct cause of the following exception:
backend = 'pandas[pyarrow]', monkeypatch = <_pytest.monkeypatch.MonkeyPatch object at 0x0000026D8A187590>
tmp_path = WindowsPath('C:/Users/danie/AppData/Local/Temp/pytest-of-danie/pytest-12169/popen-gw16/test_reader_cache_exhaustive_p1')
polars_loader = Loader[polars]
@datasets_debug
@backends
def test_reader_cache_exhaustive(
backend: _Backend,
monkeypatch: pytest.MonkeyPatch,
tmp_path: Path,
polars_loader: PolarsLoader,
) -> None:
"""
Fully populate and then purge the cache for all backends.
Notes
-----
- Does not attempt to read the files
- Checking we can support pre-downloading and safely deleting
- Requests work the same for all backends
- The logic for detecting the cache contents uses ``narwhals``
- Here, we're testing that these ``narwhals`` operations are consistent
- `DatasetCache.download_all` is expensive for CI, so aiming for it to run **at most once**
- 34-45s per call (4x backends)
"""
polars_loader.cache.download_all()
CLONED: Path = tmp_path / "clone"
CLONED.mkdir(exist_ok=True)
# Copy the cache contents
import shutil
shutil.copytree(polars_loader.cache.path, CLONED, dirs_exist_ok=True)
monkeypatch.setenv(CACHE_ENV_VAR, str(tmp_path))
load = Loader.from_backend(backend)
assert load.cache.is_active()
cache_dir = load.cache.path
assert cache_dir == tmp_path
assert tuple(load.cache) == (CLONED,)
load.cache.path = CLONED
cached_paths = tuple(load.cache)
assert cached_paths != ()
# NOTE: Approximating all datasets downloaded (minimum expected count)
assert len(cached_paths) >= 70
assert all(bool(fp.exists() and fp.stat().st_size) for fp in load.cache)
# NOTE: Confirm this is a no-op (already downloaded)
load.cache.download_all()
assert len(cached_paths) == len(tuple(load.cache))
# NOTE: Ensure unrelated files in the directory are not removed during cache clearing
test_file: Path = tmp_path / "test_file.json"
test_file.touch(exist_ok=False)
> load.cache.clear()
tests/test_datasets.py:493:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
altair/datasets/_cache.py:324: in clear
.select(nw.concat_str("sha", "suffix").alias("sha_suffix"))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.venv/Lib/site-packages/narwhals/dataframe.py:2726: in select
return super().select(*exprs, **named_exprs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.venv/Lib/site-packages/narwhals/dataframe.py:240: in select
return self._with_compliant(self._compliant_frame.select(*compliant_exprs))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.venv/Lib/site-packages/narwhals/_pandas_like/dataframe.py:448: in select
new_series = self._evaluate_exprs(*exprs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.venv/Lib/site-packages/narwhals/_compliant/dataframe.py:367: in _evaluate_exprs
return tuple(chain.from_iterable(self._evaluate_expr(expr) for expr in exprs)) # pyright: ignore[reportArgumentType]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.venv/Lib/site-packages/narwhals/_compliant/dataframe.py:367: in <genexpr>
return tuple(chain.from_iterable(self._evaluate_expr(expr) for expr in exprs)) # pyright: ignore[reportArgumentType]
^^^^^^^^^^^^^^^^^^^^^^^^^
.venv/Lib/site-packages/narwhals/_compliant/dataframe.py:380: in _evaluate_expr
result = expr(self)
^^^^^^^^^^
.venv/Lib/site-packages/narwhals/_compliant/expr.py:247: in __call__
return self._call(df)
^^^^^^^^^^^^^^
.venv/Lib/site-packages/narwhals/_compliant/expr.py:651: in <lambda>
lambda df: [series.alias(name) for series in self(df)],
^^^^^^^^
.venv/Lib/site-packages/narwhals/_compliant/expr.py:247: in __call__
return self._call(df)
^^^^^^^^^^^^^^
.venv/Lib/site-packages/narwhals/_pandas_like/namespace.py:333: in func
result = reduce(lambda x, y: x + separator + y, series).zip_with(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.venv/Lib/site-packages/narwhals/_pandas_like/namespace.py:333: in <lambda>
result = reduce(lambda x, y: x + separator + y, series).zip_with(
^^^^^^^^^^^^^
.venv/Lib/site-packages/narwhals/_pandas_like/series.py:443: in __add__
return self._with_binary(operator.add, other)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.venv/Lib/site-packages/narwhals/_pandas_like/series.py:406: in _with_binary
op(ser, other_native), preserve_broadcast=preserve_broadcast
^^^^^^^^^^^^^^^^^^^^^
.venv/Lib/site-packages/pandas/core/ops/common.py:85: in new_method
return method(self, other)
^^^^^^^^^^^^^^^^^^^
.venv/Lib/site-packages/pandas/core/arraylike.py:190: in __add__
return self._arith_method(other, operator.add)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.venv/Lib/site-packages/pandas/core/series.py:6751: in _arith_method
return base.IndexOpsMixin._arith_method(self, other, op)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.venv/Lib/site-packages/pandas/core/base.py:1644: in _arith_method
result = ops.arithmetic_op(lvalues, rvalues, op)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.venv/Lib/site-packages/pandas/core/ops/array_ops.py:279: in arithmetic_op
res_values = op(left, right)
^^^^^^^^^^^^^^^
.venv/Lib/site-packages/pandas/core/ops/common.py:85: in new_method
return method(self, other)
^^^^^^^^^^^^^^^^^^^
.venv/Lib/site-packages/pandas/core/arraylike.py:190: in __add__
return self._arith_method(other, operator.add)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.venv/Lib/site-packages/pandas/core/arrays/arrow/array.py:1079: in _arith_method
result = self._evaluate_op_method(other, op, ARROW_ARITHMETIC_FUNCS)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <ArrowExtensionArray>
['608ba6d51fa70584c3fa1d31eb94533302553838',
'719e73406cfc08f16dda651513ae1113edd75845',
'11ae...6f2713c94c0c284039506ca2d4f3dee',
'd3df33e12be0d0544c95f1bd47005add4b7010be']
Length: 73, dtype: large_string[pyarrow]
other = <pyarrow.StringScalar: ''>, op = <built-in function add>
arrow_funcs = {'add': <function add_checked at 0x0000026D46242C00>, 'divmod': NotImplemented, 'floordiv': <function <lambda> at 0x0000026D4640DF80>, 'mod': NotImplemented, ...}
def _evaluate_op_method(self, other, op, arrow_funcs) -> Self:
pa_type = self._pa_array.type
other_original = other
other = self._box_pa(other)
if (
pa.types.is_string(pa_type)
or pa.types.is_large_string(pa_type)
or pa.types.is_binary(pa_type)
):
if op in [operator.add, roperator.radd]:
sep = pa.scalar("", type=pa_type)
try:
if op is operator.add:
result = pc.binary_join_element_wise(self._pa_array, other, sep)
elif op is roperator.radd:
result = pc.binary_join_element_wise(other, self._pa_array, sep)
except pa.ArrowNotImplementedError as err:
> raise TypeError(
self._op_method_error_message(other_original, op)
) from err
E TypeError: operation 'add' not supported for dtype 'large_string[pyarrow]' with object of type <class 'str'>
.venv/Lib/site-packages/pandas/core/arrays/arrow/array.py:990: TypeError

Reactions are currently unavailable
Metadata
Assignees
Labels
bug: it raises an error but shouldn't · pandas-like: Issue is related to pandas-like backends