chore(typing): Kinda type `pandas_like.utils.select_columns_by_name`

dangotbanned · dangotbanned · commit c743de39db15 · 2025-06-21T12:14:01.000+01:00
- Somewhat of a resurrection of #2227, but this time building on #2693
diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py
@@ -24,7 +24,7 @@
     from types import ModuleType
 
     import dask.dataframe.dask_expr as dx
-    from typing_extensions import Self, TypeIs
+    from typing_extensions import Self, TypeAlias, TypeIs
 
     from narwhals._compliant.typing import CompliantDataFrameAny
     from narwhals._dask.expr import DaskExpr
@@ -35,6 +35,13 @@
     from narwhals.dtypes import DType
     from narwhals.typing import AsofJoinStrategy, JoinStrategy, LazyUniqueKeepStrategy
 
+Incomplete: TypeAlias = "Any"
+"""Using `_pandas_like` utils with `_dask`.
+
+Typing this correctly will complicate the `_pandas_like`-side.
+Very low priority until `dask` adds typing.
+"""
+
 
 class DaskLazyFrame(
     CompliantLazyFrame["DaskExpr", "dd.DataFrame", "LazyFrame[dd.DataFrame]"]
@@ -158,8 +165,9 @@ def filter(self, predicate: DaskExpr) -> Self:
         return self._with_native(self.native.loc[mask])
 
     def simple_select(self, *column_names: str) -> Self:
+        df: Incomplete = self.native
         native = select_columns_by_name(
-            self.native, list(column_names), self._backend_version, self._implementation
+            df, list(column_names), self._backend_version, self._implementation
         )
         return self._with_native(native)
 
@@ -170,8 +178,9 @@ def aggregate(self, *exprs: DaskExpr) -> Self:
 
     def select(self, *exprs: DaskExpr) -> Self:
         new_series = evaluate_exprs(self, *exprs)
+        df: Incomplete = self.native
         df = select_columns_by_name(
-            self.native.assign(**dict(new_series)),
+            df.assign(**dict(new_series)),
             [s[0] for s in new_series],
             self._backend_version,
             self._implementation,
@@ -269,6 +278,7 @@ def join(  # noqa: C901
                 )
                 .drop(columns=key_token)
             )
+        other_native: Incomplete = other.native
 
         if how == "anti":
             indicator_token = generate_temporary_column_name(
@@ -280,7 +290,7 @@ def join(  # noqa: C901
                 raise TypeError(msg)
             other_native = (
                 select_columns_by_name(
-                    other.native,
+                    other_native,
                     list(right_on),
                     self._backend_version,
                     self._implementation,
@@ -307,7 +317,7 @@ def join(  # noqa: C901
                 raise TypeError(msg)
             other_native = (
                 select_columns_by_name(
-                    other.native,
+                    other_native,
                     list(right_on),
                     self._backend_version,
                     self._implementation,
diff --git a/narwhals/_namespace.py b/narwhals/_namespace.py
@@ -101,7 +101,14 @@
     class _NativeDask(Protocol):
         _partition_type: type[pd.DataFrame]
 
-    class _CuDFDataFrame(NativeFrame, Protocol):
+    class _BasePandasLikeFrame(NativeFrame, Protocol):
+        @property
+        def shape(self) -> tuple[int, int]: ...
+        def __getitem__(self, key: Any, /) -> Any: ...
+        @property
+        def loc(self) -> Any: ...
+
+    class _CuDFDataFrame(_BasePandasLikeFrame, Protocol):
         def to_pylibcudf(self, *args: Any, **kwds: Any) -> Any: ...
 
     class _CuDFSeries(NativeSeries, Protocol):
@@ -114,7 +121,7 @@ def __pyarrow_result__(self, *args: Any, **kwds: Any) -> Any: ...
         def __pandas_result__(self, *args: Any, **kwds: Any) -> Any: ...
         def __polars_result__(self, *args: Any, **kwds: Any) -> Any: ...
 
-    class _ModinDataFrame(NativeFrame, Protocol):
+    class _ModinDataFrame(_BasePandasLikeFrame, Protocol):
         _pandas_class: type[pd.DataFrame]
 
     class _ModinSeries(NativeSeries, Protocol):
diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py
@@ -25,6 +25,7 @@
 
     from narwhals._pandas_like.expr import PandasLikeExpr
     from narwhals._pandas_like.series import PandasLikeSeries
+    from narwhals._pandas_like.typing import NativeDataFrameT
     from narwhals.dtypes import DType
     from narwhals.typing import DTypeBackend, IntoDType, TimeUnit, _1DArray
 
@@ -558,35 +559,37 @@ def calculate_timestamp_date(s: pd.Series[int], time_unit: str) -> pd.Series[int
 
 
 def select_columns_by_name(
-    df: T,
+    df: NativeDataFrameT,
     column_names: list[str] | _1DArray,  # NOTE: Cannot be a tuple!
     backend_version: tuple[int, ...],
     implementation: Implementation,
-) -> T:
+) -> NativeDataFrameT | Any:
     """Select columns by name.
 
     Prefer this over `df.loc[:, column_names]` as it's
     generally more performant.
     """
-    if len(column_names) == df.shape[1] and all(column_names == df.columns):  # type: ignore[attr-defined]
-        return df
-    if (df.columns.dtype.kind == "b") or (  # type: ignore[attr-defined]
+    if len(column_names) == df.shape[1]:  # noqa: SIM102
+        # NOTE: I'm pretty unsure on how this doesn't trigger a runtime error
+        if all(column_names == df.columns):  # type: ignore[arg-type]
+            return df
+    if (df.columns.dtype.kind == "b") or (
         implementation is Implementation.PANDAS and backend_version < (1, 5)
     ):
         # See https://github.com/narwhals-dev/narwhals/issues/1349#issuecomment-2470118122
         # for why we need this
         if error := check_columns_exist(
             column_names,  # type: ignore[arg-type]
-            available=df.columns.tolist(),  # type: ignore[attr-defined]
+            available=df.columns.tolist(),
         ):
             raise error
-        return df.loc[:, column_names]  # type: ignore[attr-defined]
+        return df.loc[:, column_names]
     try:
-        return df[column_names]  # type: ignore[index]
+        return df[column_names]
     except KeyError as e:
         if error := check_columns_exist(
             column_names,  # type: ignore[arg-type]
-            available=df.columns.tolist(),  # type: ignore[attr-defined]
+            available=df.columns.tolist(),
         ):
             raise error from e
         raise