chore: bump pyarrow-stubs==19.2 (#2458)

dangotbanned · web-flow · commit b1c7e8e18a18 · 2025-04-30T14:48:27.000+01:00
diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py
@@ -255,7 +255,7 @@ def _gather(self, rows: SizedMultiIndexSelector[ChunkedArrayAny]) -> Self:
             return self._with_native(self.native.slice(0, 0))
         if self._backend_version < (18,) and isinstance(rows, tuple):
             rows = list(rows)
-        return self._with_native(self.native.take(rows))  # pyright: ignore[reportArgumentType]
+        return self._with_native(self.native.take(rows))
 
     def _gather_slice(self, rows: _SliceIndex | range) -> Self:
         start = rows.start or 0
@@ -302,8 +302,7 @@ def _select_multi_name(
             selector = cast("Sequence[str]", columns.to_pylist())
         else:
             selector = columns
-        # TODO @dangotbanned: Fix upstream `pa.Table.select` https://github.com/zen-xu/pyarrow-stubs/blob/f899bb35e10b36f7906a728e9f8acf3e0a1f9f64/pyarrow-stubs/__lib_pxi/table.pyi#L597
-        # NOTE: Investigate what `cython` actually checks
+        # NOTE: Fixed in https://github.com/zen-xu/pyarrow-stubs/pull/221
         return self._with_native(self.native.select(selector))  # pyright: ignore[reportArgumentType]
 
     @property
@@ -370,13 +369,9 @@ def with_columns(self: ArrowDataFrame, *exprs: ArrowExpr) -> ArrowDataFrame:
             col_name = col_value.name
             column = self._extract_comparand(col_value)
             native_frame = (
-                native_frame.set_column(
-                    columns.index(col_name),
-                    field_=col_name,
-                    column=column,  # type: ignore[arg-type]
-                )
+                native_frame.set_column(columns.index(col_name), col_name, column=column)
                 if col_name in columns
-                else native_frame.append_column(field_=col_name, column=column)
+                else native_frame.append_column(col_name, column=column)
             )
 
         return self._with_native(native_frame, validate_column_names=False)
@@ -708,9 +703,9 @@ def unique(
         subset = list(subset or self.columns)
 
         if keep in {"any", "first", "last"}:
-            agg_func_map = {"any": "min", "first": "min", "last": "max"}
+            from narwhals._arrow.group_by import ArrowGroupBy
 
-            agg_func = agg_func_map[keep]
+            agg_func = ArrowGroupBy._REMAP_UNIQUE[keep]
             col_token = generate_temporary_column_name(n_bytes=8, columns=self.columns)
             keep_idx_native = (
                 self.native.append_column(col_token, pa.array(np.arange(len(self))))
diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py
@@ -161,7 +161,7 @@ def func(df: ArrowDataFrame) -> Sequence[ArrowSeries]:
                 # TODO(marco): is there a way to do this efficiently without
                 # doing 2 sorts? Here we're sorting the dataframe and then
                 # again calling `sort_indices`. `ArrowSeries.scatter` would also sort.
-                sorting_indices = pc.sort_indices(df.get_column(token).native)  # type: ignore[call-overload]
+                sorting_indices = pc.sort_indices(df.get_column(token).native)
                 return [s._with_native(s.native.take(sorting_indices)) for s in result]
         else:
 
diff --git a/narwhals/_arrow/group_by.py b/narwhals/_arrow/group_by.py
@@ -20,12 +20,15 @@
 if TYPE_CHECKING:
     from narwhals._arrow.dataframe import ArrowDataFrame
     from narwhals._arrow.expr import ArrowExpr
+    from narwhals._arrow.typing import AggregateOptions  # type: ignore[attr-defined]
+    from narwhals._arrow.typing import Aggregation  # type: ignore[attr-defined]
     from narwhals._arrow.typing import Incomplete
     from narwhals._compliant.group_by import NarwhalsAggregation
+    from narwhals.typing import UniqueKeepStrategy
 
 
-class ArrowGroupBy(EagerGroupBy["ArrowDataFrame", "ArrowExpr"]):
-    _REMAP_AGGS: ClassVar[Mapping[NarwhalsAggregation, Any]] = {
+class ArrowGroupBy(EagerGroupBy["ArrowDataFrame", "ArrowExpr", "Aggregation"]):
+    _REMAP_AGGS: ClassVar[Mapping[NarwhalsAggregation, Aggregation]] = {
         "sum": "sum",
         "mean": "mean",
         "median": "approximate_median",
@@ -37,6 +40,11 @@ class ArrowGroupBy(EagerGroupBy["ArrowDataFrame", "ArrowExpr"]):
         "n_unique": "count_distinct",
         "count": "count",
     }
+    _REMAP_UNIQUE: ClassVar[Mapping[UniqueKeepStrategy, Aggregation]] = {
+        "any": "min",
+        "first": "min",
+        "last": "max",
+    }
 
     def __init__(
         self,
@@ -54,7 +62,7 @@ def __init__(
 
     def agg(self, *exprs: ArrowExpr) -> ArrowDataFrame:
         self._ensure_all_simple(exprs)
-        aggs: list[tuple[str, str, Any]] = []
+        aggs: list[tuple[str, Aggregation, AggregateOptions | None]] = []
         expected_pyarrow_column_names: list[str] = self._keys.copy()
         new_column_names: list[str] = self._keys.copy()
         exclude = (*self._keys, *self._output_key_names)
diff --git a/narwhals/_arrow/namespace.py b/narwhals/_arrow/namespace.py
@@ -203,11 +203,10 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]:
             context=self,
         )
 
-    # NOTE: Stub issue fixed in https://github.com/zen-xu/pyarrow-stubs/pull/203
     def _concat_diagonal(self, dfs: Sequence[pa.Table], /) -> pa.Table:
         if self._backend_version >= (14,):
-            return pa.concat_tables(dfs, promote_options="default")  # type: ignore[arg-type]
-        return pa.concat_tables(dfs, promote=True)  # type: ignore[arg-type] # pragma: no cover
+            return pa.concat_tables(dfs, promote_options="default")
+        return pa.concat_tables(dfs, promote=True)  # pragma: no cover
 
     def _concat_horizontal(self, dfs: Sequence[pa.Table], /) -> pa.Table:
         names = list(chain.from_iterable(df.column_names for df in dfs))
@@ -225,7 +224,7 @@ def _concat_vertical(self, dfs: Sequence[pa.Table], /) -> pa.Table:
                     f"   - dataframe {i}: {cols_current}\n"
                 )
                 raise TypeError(msg)
-        return pa.concat_tables(dfs)  # type: ignore[arg-type]
+        return pa.concat_tables(dfs)
 
     @property
     def selectors(self) -> ArrowSelectorNamespace:
diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py
@@ -298,7 +298,7 @@ def __rmod__(self, other: Any) -> Self:
         return self._with_native(res)
 
     def __invert__(self) -> Self:
-        return self._with_native(pc.invert(self.native))  # type: ignore[call-overload]
+        return self._with_native(pc.invert(self.native))
 
     @property
     def _type(self) -> pa.DataType:
@@ -426,6 +426,7 @@ def _gather_slice(self, rows: _SliceIndex | range) -> Self:
     def scatter(self, indices: int | Sequence[int], values: Any) -> Self:
         import numpy as np  # ignore-banned-import
 
+        values_native: ArrayAny
         if isinstance(indices, int):
             indices_native = pa.array([indices])
             values_native = pa.array([values])
@@ -436,20 +437,25 @@ def scatter(self, indices: int | Sequence[int], values: Any) -> Self:
             if isinstance(values, self.__class__):
                 values_native = values.native.combine_chunks()
             else:
-                values_native = pa.array(values)
+                # NOTE: Requires fixes in https://github.com/zen-xu/pyarrow-stubs/pull/209
+                pa_array: Incomplete = pa.array
+                values_native = pa_array(values)
 
-        sorting_indices = pc.sort_indices(indices_native)  # type: ignore[call-overload]
-        indices_native = pc.take(indices_native, sorting_indices)
-        values_native = pc.take(values_native, sorting_indices)
+        sorting_indices = pc.sort_indices(indices_native)
+        indices_native = indices_native.take(sorting_indices)
+        values_native = values_native.take(sorting_indices)
 
         mask: _1DArray = np.zeros(self.len(), dtype=bool)
         mask[indices_native] = True
-        result = pc.replace_with_mask(
-            self.native,
-            cast("list[bool]", mask),
-            values_native.take(indices_native),
+        # NOTE: Multiple issues
+        # - Missing `values` type
+        # - `mask` accepts a `np.ndarray`, but not mentioned in stubs
+        # - Missing `replacements` type
+        # - Missing return type
+        pc_replace_with_mask: Incomplete = pc.replace_with_mask
+        return self._with_native(
+            pc_replace_with_mask(self.native, mask, values_native.take(indices_native))
         )
-        return self._with_native(result)
 
     def to_list(self) -> list[Any]:
         return self.native.to_pylist()
diff --git a/narwhals/_arrow/typing.py b/narwhals/_arrow/typing.py
@@ -15,6 +15,8 @@
         from typing_extensions import TypeAlias
 
     import pyarrow as pa
+    from pyarrow.__lib_pxi.table import AggregateOptions  # noqa: F401
+    from pyarrow.__lib_pxi.table import Aggregation  # noqa: F401
     from pyarrow._stubs_typing import (  # pyright: ignore[reportMissingModuleSource]
         Indices,  # noqa: F401
     )
diff --git a/narwhals/_arrow/utils.py b/narwhals/_arrow/utils.py
@@ -186,7 +186,8 @@ def narwhals_to_native_dtype(dtype: DType | type[DType], version: Version) -> pa
     if isinstance_or_issubclass(dtype, dtypes.Categorical):
         return pa.dictionary(pa.uint32(), pa.string())
     if isinstance_or_issubclass(dtype, dtypes.Datetime):
-        return pa.timestamp(dtype.time_unit, tz=dtype.time_zone)  # pyright: ignore[reportArgumentType]
+        unit = dtype.time_unit
+        return pa.timestamp(unit, tz) if (tz := dtype.time_zone) else pa.timestamp(unit)
     if isinstance_or_issubclass(dtype, dtypes.Duration):
         return pa.duration(dtype.time_unit)
     if isinstance_or_issubclass(dtype, dtypes.Date):
@@ -278,15 +279,18 @@ def floordiv_compat(left: ArrayOrScalar, right: ArrayOrScalar) -> Any:
 
     if pa.types.is_integer(left.type) and pa.types.is_integer(right.type):
         divided = pc.divide_checked(left, right)
+        # TODO @dangotbanned: Use a `TypeVar` in guards
+        # Narrowing to a `Union` isn't interacting well with the rest of the stubs
+        # https://github.com/zen-xu/pyarrow-stubs/pull/215
         if pa.types.is_signed_integer(divided.type):
-            # GH 56676
+            div_type = cast("pa._lib.Int64Type", divided.type)
             has_remainder = pc.not_equal(pc.multiply(divided, right), left)
             has_one_negative_operand = pc.less(
-                pc.bit_wise_xor(left, right), lit(0, type=divided.type)
+                pc.bit_wise_xor(left, right), lit(0, div_type)
             )
             result = pc.if_else(
                 pc.and_(has_remainder, has_one_negative_operand),
-                pc.subtract(divided, lit(1, type=divided.type)),
+                pc.subtract(divided, lit(1, div_type)),
                 divided,
             )
         else:
diff --git a/narwhals/_compliant/group_by.py b/narwhals/_compliant/group_by.py
@@ -195,9 +195,9 @@ def _leaf_name(cls, expr: DepthTrackingExprAny, /) -> NarwhalsAggregation | Any:
 
 
 class EagerGroupBy(
-    DepthTrackingGroupBy[CompliantDataFrameT, EagerExprT_contra, str],
+    DepthTrackingGroupBy[CompliantDataFrameT, EagerExprT_contra, NativeAggregationT_co],
     DataFrameGroupBy[CompliantDataFrameT, EagerExprT_contra],
-    Protocol38[CompliantDataFrameT, EagerExprT_contra],
+    Protocol38[CompliantDataFrameT, EagerExprT_contra, NativeAggregationT_co],
 ): ...
 
 
diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py
@@ -21,7 +21,7 @@
     from narwhals._pandas_like.expr import PandasLikeExpr
 
 
-class PandasLikeGroupBy(EagerGroupBy["PandasLikeDataFrame", "PandasLikeExpr"]):
+class PandasLikeGroupBy(EagerGroupBy["PandasLikeDataFrame", "PandasLikeExpr", str]):
     _REMAP_AGGS: ClassVar[Mapping[NarwhalsAggregation, Any]] = {
         "sum": "sum",
         "mean": "mean",
diff --git a/pyproject.toml b/pyproject.toml
@@ -68,7 +68,7 @@ typing = [
   "typing_extensions",
   "mypy~=1.15.0",
   "pyright",
-  "pyarrow-stubs==19.1",
+  "pyarrow-stubs==19.2",
   "sqlframe",
   "polars==1.25.2",
   "uv",

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -15,6 +15,8 @@` |
| `15` | `15` | `from typing_extensions import TypeAlias` |
| `16` | `16` |  |
| `17` | `17` | `import pyarrow as pa` |
|  | `18` | `+ from pyarrow.__lib_pxi.table import AggregateOptions # noqa: F401` |
|  | `19` | `+ from pyarrow.__lib_pxi.table import Aggregation # noqa: F401` |
| `18` | `20` | `from pyarrow._stubs_typing import ( # pyright: ignore[reportMissingModuleSource]` |
| `19` | `21` | `Indices, # noqa: F401` |
| `20` | `22` | `)` |