Skip to content

Commit 667d9b4

Browse files
cpsievert and claude committed
feat(pkg-py): Replace pandas dependency with narwhals abstraction layer
Remove pandas as a required dependency in favor of narwhals, which provides a unified DataFrame interface supporting both pandas and polars backends. Changes: - Add _df_compat.py module with read_csv, read_sql, and duckdb_result_to_nw helpers - Update DataSource classes to return narwhals DataFrames - Update df_to_html to generate HTML without pandas dependency - Make pandas and polars optional dependencies - Add comprehensive tests for DataFrameSource and df_compat module Users can now install with either `pip install querychat[pandas]` or `pip install querychat[polars]`. Use `.to_native()` on returned DataFrames to get the underlying pandas or polars DataFrame. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <[email protected]>
1 parent c5e63ed commit 667d9b4

File tree

14 files changed

+637
-105
lines changed

14 files changed

+637
-105
lines changed

pkg-py/CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [UNRELEASED]
99

10+
### Breaking Changes
11+
12+
* Methods like `execute_query()`, `get_data()`, and `df()` now return a `narwhals.DataFrame` instead of a `pandas.DataFrame`. This allows querychat to drop its `pandas` dependency, and lets you use any `narwhals`-compatible dataframe of your choosing.
13+
* If this breaks existing code, note you can call `.to_native()` on the new dataframe value to get your `pandas` dataframe back.
14+
* Note that `polars` or `pandas` will be needed to realize a `sqlalchemy` connection query as a dataframe. Install with `pip install querychat[pandas]` or `pip install querychat[polars]`.
15+
1016
### New features
1117

1218
* `QueryChat.sidebar()`, `QueryChat.ui()`, and `QueryChat.server()` now support an optional `id` parameter to create multiple chat instances from a single `QueryChat` object. (#172)

pkg-py/src/querychat/_datasource.py

Lines changed: 32 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,12 @@
55

66
import duckdb
77
import narwhals.stable.v1 as nw
8-
import pandas as pd
98
from sqlalchemy import inspect, text
109
from sqlalchemy.sql import sqltypes
1110

11+
from ._df_compat import duckdb_result_to_nw, read_sql
12+
1213
if TYPE_CHECKING:
13-
from narwhals.stable.v1.typing import IntoFrame
1414
from sqlalchemy.engine import Connection, Engine
1515

1616

@@ -53,7 +53,7 @@ def get_schema(self, *, categorical_threshold: int) -> str:
5353
...
5454

5555
@abstractmethod
56-
def execute_query(self, query: str) -> pd.DataFrame:
56+
def execute_query(self, query: str) -> nw.DataFrame:
5757
"""
5858
Execute SQL query and return results as DataFrame.
5959
@@ -65,20 +65,20 @@ def execute_query(self, query: str) -> pd.DataFrame:
6565
Returns
6666
-------
6767
:
68-
Query results as a pandas DataFrame
68+
Query results as a narwhals DataFrame
6969
7070
"""
7171
...
7272

7373
@abstractmethod
74-
def get_data(self) -> pd.DataFrame:
74+
def get_data(self) -> nw.DataFrame:
7575
"""
7676
Return the unfiltered data as a DataFrame.
7777
7878
Returns
7979
-------
8080
:
81-
The complete dataset as a pandas DataFrame
81+
The complete dataset as a narwhals DataFrame
8282
8383
"""
8484
...
@@ -99,27 +99,26 @@ def cleanup(self) -> None:
9999

100100

101101
class DataFrameSource(DataSource):
102-
"""A DataSource implementation that wraps a pandas DataFrame using DuckDB."""
102+
"""A DataSource implementation that wraps a DataFrame using DuckDB."""
103103

104-
_df: nw.DataFrame | nw.LazyFrame
104+
_df: nw.DataFrame
105105

106-
def __init__(self, df: IntoFrame, table_name: str):
106+
def __init__(self, df: nw.DataFrame, table_name: str):
107107
"""
108-
Initialize with a pandas DataFrame.
108+
Initialize with a DataFrame.
109109
110110
Parameters
111111
----------
112112
df
113-
The DataFrame to wrap
113+
The DataFrame to wrap (pandas, polars, or any narwhals-compatible frame)
114114
table_name
115115
Name of the table in SQL queries
116116
117117
"""
118118
self._conn = duckdb.connect(database=":memory:")
119-
self._df = nw.from_native(df)
119+
self._df = nw.from_native(df) if not isinstance(df, nw.DataFrame) else df
120120
self.table_name = table_name
121-
# TODO(@gadenbuie): If the data frame is already SQL-backed, maybe we shouldn't be making a new copy here.
122-
self._conn.register(table_name, self._df.lazy().collect().to_pandas())
121+
self._conn.register(table_name, self._df.to_native())
123122

124123
def get_db_type(self) -> str:
125124
"""
@@ -151,16 +150,8 @@ def get_schema(self, *, categorical_threshold: int) -> str:
151150
"""
152151
schema = [f"Table: {self.table_name}", "Columns:"]
153152

154-
# Ensure we're working with a DataFrame, not a LazyFrame
155-
ndf = (
156-
self._df.head(10).collect()
157-
if isinstance(self._df, nw.LazyFrame)
158-
else self._df
159-
)
160-
161-
for column in ndf.columns:
162-
# Map pandas dtypes to SQL-like types
163-
dtype = ndf[column].dtype
153+
for column in self._df.columns:
154+
dtype = self._df[column].dtype
164155
if dtype.is_integer():
165156
sql_type = "INTEGER"
166157
elif dtype.is_float():
@@ -176,17 +167,14 @@ def get_schema(self, *, categorical_threshold: int) -> str:
176167

177168
column_info = [f"- {column} ({sql_type})"]
178169

179-
# For TEXT columns, check if they're categorical
180170
if sql_type == "TEXT":
181-
unique_values = ndf[column].drop_nulls().unique()
171+
unique_values = self._df[column].drop_nulls().unique()
182172
if unique_values.len() <= categorical_threshold:
183173
categories = unique_values.to_list()
184174
categories_str = ", ".join([f"'{c}'" for c in categories])
185175
column_info.append(f" Categorical values: {categories_str}")
186-
187-
# For numeric columns, include range
188176
elif sql_type in ["INTEGER", "FLOAT", "DATE", "TIME"]:
189-
rng = ndf[column].min(), ndf[column].max()
177+
rng = self._df[column].min(), self._df[column].max()
190178
if rng[0] is None and rng[1] is None:
191179
column_info.append(" Range: NULL to NULL")
192180
else:
@@ -196,10 +184,12 @@ def get_schema(self, *, categorical_threshold: int) -> str:
196184

197185
return "\n".join(schema)
198186

199-
def execute_query(self, query: str) -> pd.DataFrame:
187+
def execute_query(self, query: str) -> nw.DataFrame:
200188
"""
201189
Execute query using DuckDB.
202190
191+
Uses polars if available, otherwise falls back to pandas.
192+
203193
Parameters
204194
----------
205195
query
@@ -208,23 +198,22 @@ def execute_query(self, query: str) -> pd.DataFrame:
208198
Returns
209199
-------
210200
:
211-
Query results as pandas DataFrame
201+
Query results as narwhals DataFrame
212202
213203
"""
214-
return self._conn.execute(query).df()
204+
return duckdb_result_to_nw(self._conn.execute(query))
215205

216-
def get_data(self) -> pd.DataFrame:
206+
def get_data(self) -> nw.DataFrame:
217207
"""
218208
Return the unfiltered data as a DataFrame.
219209
220210
Returns
221211
-------
222212
:
223-
The complete dataset as a pandas DataFrame
213+
The complete dataset as a narwhals DataFrame
224214
225215
"""
226-
# TODO(@gadenbuie): This should just return `self._df` and not a pandas DataFrame
227-
return self._df.lazy().collect().to_pandas()
216+
return self._df
228217

229218
def cleanup(self) -> None:
230219
"""
@@ -412,10 +401,12 @@ def get_schema(self, *, categorical_threshold: int) -> str: # noqa: PLR0912
412401

413402
return "\n".join(schema)
414403

415-
def execute_query(self, query: str) -> pd.DataFrame:
404+
def execute_query(self, query: str) -> nw.DataFrame:
416405
"""
417406
Execute SQL query and return results as DataFrame.
418407
408+
Uses polars if available, otherwise falls back to pandas.
409+
419410
Parameters
420411
----------
421412
query
@@ -424,20 +415,20 @@ def execute_query(self, query: str) -> pd.DataFrame:
424415
Returns
425416
-------
426417
:
427-
Query results as pandas DataFrame
418+
Query results as narwhals DataFrame
428419
429420
"""
430421
with self._get_connection() as conn:
431-
return pd.read_sql_query(text(query), conn)
422+
return read_sql(text(query), conn)
432423

433-
def get_data(self) -> pd.DataFrame:
424+
def get_data(self) -> nw.DataFrame:
434425
"""
435426
Return the unfiltered data as a DataFrame.
436427
437428
Returns
438429
-------
439430
:
440-
The complete dataset as a pandas DataFrame
431+
The complete dataset as a narwhals DataFrame
441432
442433
"""
443434
return self.execute_query(f"SELECT * FROM {self.table_name}")

pkg-py/src/querychat/_df_compat.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
"""
2+
DataFrame compatibility: try polars first, fall back to pandas.
3+
"""
4+
5+
from __future__ import annotations
6+
7+
from typing import TYPE_CHECKING
8+
9+
import duckdb
10+
import narwhals.stable.v1 as nw
11+
12+
if TYPE_CHECKING:
13+
from sqlalchemy.engine import Connection
14+
from sqlalchemy.sql.elements import TextClause
15+
16+
_INSTALL_MSG = "Install one with: pip install polars OR pip install pandas"
17+
18+
19+
def read_sql(query: TextClause, conn: Connection) -> nw.DataFrame:
20+
try:
21+
import polars as pl # noqa: PLC0415 # pyright: ignore[reportMissingImports]
22+
23+
return nw.from_native(pl.read_database(query, connection=conn))
24+
except ImportError:
25+
pass
26+
27+
try:
28+
import pandas as pd # noqa: PLC0415 # pyright: ignore[reportMissingImports]
29+
30+
return nw.from_native(pd.read_sql_query(query, conn))
31+
except ImportError:
32+
pass
33+
34+
raise ImportError(f"SQLAlchemySource requires 'polars' or 'pandas'. {_INSTALL_MSG}")
35+
36+
37+
def duckdb_result_to_nw(result: duckdb.DuckDBPyResult) -> nw.DataFrame:
38+
try:
39+
import polars # noqa: PLC0415 # pyright: ignore[reportMissingImports,reportUnusedImport]
40+
41+
return nw.from_native(result.pl())
42+
except ImportError:
43+
pass
44+
45+
try:
46+
import pandas # noqa: PLC0415 # pyright: ignore[reportMissingImports,reportUnusedImport]
47+
48+
return nw.from_native(result.df())
49+
except ImportError:
50+
pass
51+
52+
raise ImportError(f"DataFrameSource requires 'polars' or 'pandas'. {_INSTALL_MSG}")
53+
54+
55+
def read_csv(path: str) -> nw.DataFrame:
56+
try:
57+
import polars as pl # noqa: PLC0415 # pyright: ignore[reportMissingImports]
58+
59+
return nw.from_native(pl.read_csv(path))
60+
except ImportError:
61+
pass
62+
63+
try:
64+
import pandas as pd # noqa: PLC0415 # pyright: ignore[reportMissingImports]
65+
66+
return nw.from_native(pd.read_csv(path, compression="gzip"))
67+
except ImportError:
68+
pass
69+
70+
raise ImportError(f"Loading data requires 'polars' or 'pandas'. {_INSTALL_MSG}")

pkg-py/src/querychat/_querychat.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
import chatlas
1010
import chevron
11+
import narwhals.stable.v1 as nw
1112
import sqlalchemy
1213
from shiny import App, Inputs, Outputs, Session, reactive, render, req, ui
1314
from shiny.express._stub_session import ExpressStubSession
@@ -29,8 +30,7 @@
2930
if TYPE_CHECKING:
3031
from collections.abc import Callable
3132

32-
import pandas as pd
33-
from narwhals.stable.v1.typing import IntoFrame
33+
from narwhals.typing import IntoDataFrame, IntoFrame, IntoLazyFrame
3434

3535
TOOL_GROUPS = Literal["update", "query"]
3636

@@ -797,14 +797,14 @@ def __init__(
797797
enable_bookmarking=enable,
798798
)
799799

800-
def df(self) -> pd.DataFrame:
800+
def df(self) -> nw.DataFrame:
801801
"""
802802
Reactively read the current filtered data frame that is in effect.
803803
804804
Returns
805805
-------
806806
:
807-
The current filtered data frame as a pandas DataFrame. If no query
807+
The current filtered data frame as a narwhals DataFrame. If no query
808808
has been set, this will return the unfiltered data frame from the
809809
data source.
810810
@@ -883,7 +883,16 @@ def normalize_data_source(
883883
return data_source
884884
if isinstance(data_source, sqlalchemy.Engine):
885885
return SQLAlchemySource(data_source, table_name)
886-
return DataFrameSource(data_source, table_name)
886+
src = nw.from_native(data_source, pass_through=True)
887+
if isinstance(src, nw.DataFrame):
888+
return DataFrameSource(src, table_name)
889+
if isinstance(src, nw.LazyFrame):
890+
raise NotImplementedError("LazyFrame data sources are not yet supported (they will be soon).")
891+
raise TypeError(
892+
f"Unsupported data source type: {type(data_source)}."
893+
"If you believe this type should be supported, please open an issue at"
894+
"https://github.com/posit-dev/querychat/issues"
895+
)
887896

888897

889898
def as_querychat_client(client: str | chatlas.Chat | None) -> chatlas.Chat:

pkg-py/src/querychat/_querychat_module.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
if TYPE_CHECKING:
1616
from collections.abc import Callable
1717

18-
import pandas as pd
18+
import narwhals.stable.v1 as nw
1919
from shiny import Inputs, Outputs, Session
2020
from shiny.bookmark import BookmarkState, RestoreState
2121

@@ -78,7 +78,7 @@ class ServerValues:
7878
7979
"""
8080

81-
df: Callable[[], pd.DataFrame]
81+
df: Callable[[], nw.DataFrame]
8282
sql: ReactiveStringOrNone
8383
title: ReactiveStringOrNone
8484
client: chatlas.Chat
@@ -182,14 +182,14 @@ def _():
182182

183183
@session.bookmark.on_bookmark
184184
def _on_bookmark(x: BookmarkState) -> None:
185-
vals = x.values # noqa: PD011
185+
vals = x.values
186186
vals["querychat_sql"] = sql.get()
187187
vals["querychat_title"] = title.get()
188188
vals["querychat_has_greeted"] = has_greeted.get()
189189

190190
@session.bookmark.on_restore
191191
def _on_restore(x: RestoreState) -> None:
192-
vals = x.values # noqa: PD011
192+
vals = x.values
193193
if "querychat_sql" in vals:
194194
sql.set(vals["querychat_sql"])
195195
if "querychat_title" in vals:

0 commit comments

Comments
 (0)