diff --git a/pkg-py/CHANGELOG.md b/pkg-py/CHANGELOG.md index 2c33cdf5..6600542b 100644 --- a/pkg-py/CHANGELOG.md +++ b/pkg-py/CHANGELOG.md @@ -19,6 +19,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * The current SQL query and title can now be programmatically set through the `.sql()` and `.title()` methods of `QueryChat()`. (#98, #101) +* New `querychat.data` module provides sample datasets (`titanic()` and `tips()`) to make it easier to get started without external dependencies. (#118) + * Added a `.generate_greeting()` method to help you create a greeting message for your querychat bot. (#87) * Added `querychat_reset_dashboard()` tool for easily resetting the dashboard filters when asked by the user. (#81) diff --git a/pkg-py/docs/CHANGELOG.md b/pkg-py/docs/CHANGELOG.md index 2c33cdf5..6600542b 100644 --- a/pkg-py/docs/CHANGELOG.md +++ b/pkg-py/docs/CHANGELOG.md @@ -19,6 +19,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * The current SQL query and title can now be programmatically set through the `.sql()` and `.title()` methods of `QueryChat()`. (#98, #101) +* New `querychat.data` module provides sample datasets (`titanic()` and `tips()`) to make it easier to get started without external dependencies. (#118) + * Added a `.generate_greeting()` method to help you create a greeting message for your querychat bot. (#87) * Added `querychat_reset_dashboard()` tool for easily resetting the dashboard filters when asked by the user. 
(#81) diff --git a/pkg-py/docs/build.qmd b/pkg-py/docs/build.qmd index 00c92290..71971b24 100644 --- a/pkg-py/docs/build.qmd +++ b/pkg-py/docs/build.qmd @@ -181,14 +181,13 @@ Thanks to Shiny's support for [Jupyter Widgets](https://shiny.posit.co/py/docs/j ```python import plotly.express as px -from seaborn import load_dataset from shiny.express import render, ui from shinywidgets import render_plotly from querychat.express import QueryChat +from querychat.data import titanic -titanic = load_dataset("titanic") -qc = QueryChat(titanic, "titanic") +qc = QueryChat(titanic(), "titanic") qc.sidebar() with ui.layout_columns(): @@ -223,12 +222,11 @@ A more useful, but slightly more involved example like the one below might incor from shiny.express import render, ui from shinywidgets import render_plotly from querychat.express import QueryChat -from seaborn import load_dataset +from querychat.data import titanic from faicons import icon_svg import plotly.express as px -titanic = load_dataset("titanic") -qc = QueryChat(titanic, "titanic") +qc = QueryChat(titanic(), "titanic") qc.sidebar() with ui.layout_column_wrap(fill=False): @@ -356,11 +354,11 @@ You can use multiple QueryChat instances in a single app to explore different da from seaborn import load_dataset from shiny.express import render, ui from querychat.express import QueryChat +from querychat.data import titanic -titanic = load_dataset("titanic") penguins = load_dataset("penguins") -qc_titanic = QueryChat(titanic, "titanic") +qc_titanic = QueryChat(titanic(), "titanic") qc_penguins = QueryChat(penguins, "penguins") with ui.sidebar(): @@ -396,15 +394,12 @@ Here's a complete example bringing together multiple concepts - a Titanic surviv ```python from shiny.express import render, ui from querychat.express import QueryChat -from seaborn import load_dataset +from querychat.data import titanic import plotly.express as px -# Load data -titanic = load_dataset("titanic") - # Create QueryChat qc = QueryChat( - titanic, + 
titanic(), "titanic", data_description="Titanic passenger data with survival outcomes", ) diff --git a/pkg-py/docs/context.qmd b/pkg-py/docs/context.qmd index 24156ebf..840a233c 100644 --- a/pkg-py/docs/context.qmd +++ b/pkg-py/docs/context.qmd @@ -12,11 +12,9 @@ For full visibility into the full system prompt that Querychat generates for the ```python from querychat import QueryChat -from seaborn import load_dataset +from querychat.data import titanic -titanic = load_dataset("titanic") - -qc = QueryChat(titanic, "titanic") +qc = QueryChat(titanic(), "titanic") print(qc.system_prompt) ``` diff --git a/pkg-py/docs/data-sources.qmd b/pkg-py/docs/data-sources.qmd index 9d040645..5ac97e27 100644 --- a/pkg-py/docs/data-sources.qmd +++ b/pkg-py/docs/data-sources.qmd @@ -160,12 +160,10 @@ Or, if you have a pandas DataFrame, you can create the DuckDB database like so: ```{.python filename="create-duckdb-from-pandas.py"} import duckdb import pandas as pd - -from seaborn import load_dataset -titanic = load_dataset("titanic") +from querychat.data import titanic conn = duckdb.connect("my_database.duckdb") -conn.register('titanic_df', titanic) +conn.register('titanic_df', titanic()) conn.execute(""" CREATE TABLE titanic AS SELECT * FROM titanic_df diff --git a/pkg-py/docs/index.qmd b/pkg-py/docs/index.qmd index 97ba65d8..3a569ddf 100644 --- a/pkg-py/docs/index.qmd +++ b/pkg-py/docs/index.qmd @@ -40,11 +40,10 @@ The quickest way to start chatting is to call the `.app()` method, which returns ```{.python filename="titanic-app.py"} -from seaborn import load_dataset from querychat import QueryChat +from querychat.data import titanic -titanic = load_dataset("titanic") -qc = QueryChat(titanic, "titanic", client="openai/gpt-4.1") +qc = QueryChat(titanic(), "titanic", client="openai/gpt-4.1") app = qc.app() ``` diff --git a/pkg-py/docs/models.qmd b/pkg-py/docs/models.qmd index 3d19e133..83268ebd 100644 --- a/pkg-py/docs/models.qmd +++ b/pkg-py/docs/models.qmd @@ -10,11 +10,10 @@ To use 
a particular model, pass a `"{provider}/{model}"` string to the `client` ```python from querychat import QueryChat -from seaborn import load_dataset -titanic = load_dataset("titanic") +from querychat.data import titanic qc = QueryChat( - titanic, + titanic(), "titanic", client="anthropic/claude-sonnet-4-5" ) diff --git a/pkg-py/docs/tools.qmd b/pkg-py/docs/tools.qmd index cd348eba..c4f68c53 100644 --- a/pkg-py/docs/tools.qmd +++ b/pkg-py/docs/tools.qmd @@ -23,10 +23,9 @@ Here's a basic example of this tool in action with the `.app()` method. Notice h ```{.python filename="titanic-app.py"} from querychat import QueryChat -from seaborn import load_dataset +from querychat.data import titanic -titanic = load_dataset("titanic") -qc = QueryChat(titanic, "titanic") +qc = QueryChat(titanic(), "titanic") app = qc.app() ``` @@ -46,10 +45,9 @@ Here's an example of it in action: ```{.python filename="titanic-app.py"} from querychat import QueryChat -from seaborn import load_dataset +from querychat.data import titanic -titanic = load_dataset("titanic") -qc = QueryChat(titanic, "titanic") +qc = QueryChat(titanic(), "titanic") app = qc.app() ``` diff --git a/pkg-py/examples/01-hello-app.py b/pkg-py/examples/01-hello-app.py index 856c93bd..ac109a8f 100644 --- a/pkg-py/examples/01-hello-app.py +++ b/pkg-py/examples/01-hello-app.py @@ -1,6 +1,5 @@ -from seaborn import load_dataset from querychat import QueryChat +from querychat.data import titanic -titanic = load_dataset("titanic") -qc = QueryChat(titanic, "titanic") +qc = QueryChat(titanic(), "titanic") app = qc.app() diff --git a/pkg-py/examples/02-prompt-app.py b/pkg-py/examples/02-prompt-app.py index 832059f1..ff738025 100644 --- a/pkg-py/examples/02-prompt-app.py +++ b/pkg-py/examples/02-prompt-app.py @@ -1,15 +1,13 @@ from pathlib import Path -from seaborn import load_dataset from querychat import QueryChat - -titanic = load_dataset("titanic") +from querychat.data import titanic greeting = Path(__file__).parent / "greeting.md" 
data_desc = Path(__file__).parent / "data_description.md"
 
 qc = QueryChat(
-    titanic,
+    titanic(),
     "titanic",
     greeting=greeting,
     data_description=data_desc,
diff --git a/pkg-py/examples/03-sidebar-core-app.py b/pkg-py/examples/03-sidebar-core-app.py
index b37dc750..ef6212e4 100644
--- a/pkg-py/examples/03-sidebar-core-app.py
+++ b/pkg-py/examples/03-sidebar-core-app.py
@@ -1,11 +1,9 @@
-from seaborn import load_dataset
 from shiny import App, render, ui
 from querychat import QueryChat
-
-titanic = load_dataset("titanic")
+from querychat.data import titanic
 
 # 1. Provide data source to QueryChat
-qc = QueryChat(titanic, "titanic")
+qc = QueryChat(titanic(), "titanic")
 
 app_ui = ui.page_sidebar(
     # 2. Create sidebar chat control
diff --git a/pkg-py/examples/03-sidebar-express-app.py b/pkg-py/examples/03-sidebar-express-app.py
index 8e0b7cac..78d8e632 100644
--- a/pkg-py/examples/03-sidebar-express-app.py
+++ b/pkg-py/examples/03-sidebar-express-app.py
@@ -1,11 +1,9 @@
-from seaborn import load_dataset
 from shiny.express import render, ui
 from querychat.express import QueryChat
-
-titanic = load_dataset("titanic")
+from querychat.data import titanic
 
 # 1. Provide data source to QueryChat
-qc = QueryChat(titanic, "titanic")
+qc = QueryChat(titanic(), "titanic")
 
 # 2. Add sidebar chat control
 qc.sidebar()
diff --git a/pkg-py/src/querychat/data/__init__.py b/pkg-py/src/querychat/data/__init__.py
new file mode 100644
index 00000000..fe9bec96
--- /dev/null
+++ b/pkg-py/src/querychat/data/__init__.py
@@ -0,0 +1,78 @@
+"""
+Sample datasets for getting started with querychat.
+
+This module provides easy access to sample datasets that can be used with QueryChat
+to quickly get started without needing to install additional dependencies.
+"""
+
+from __future__ import annotations
+
+from importlib.resources import files
+
+import pandas as pd
+
+
+def _read_packaged_csv(filename: str) -> pd.DataFrame:
+    """
+    Read a gzipped CSV file bundled with the ``querychat.data`` package.
+
+    The resource is opened through the ``importlib.resources`` Traversable API
+    instead of being turned into a path with ``str()``, because a packaged
+    resource is not guaranteed to exist as a regular file on disk (for example,
+    when the package is imported from a zip archive).
+    """
+    resource = files("querychat.data") / filename
+    with resource.open("rb") as fh:
+        return pd.read_csv(fh, compression="gzip")
+
+
+def titanic() -> pd.DataFrame:
+    """
+    Load the Titanic dataset.
+
+    This dataset contains information about passengers on the Titanic, including
+    whether they survived, their class, age, sex, and other demographic information.
+
+    Returns
+    -------
+    pandas.DataFrame
+        A DataFrame with 891 rows and 15 columns containing Titanic passenger data.
+
+    Examples
+    --------
+    >>> from querychat.data import titanic
+    >>> from querychat import QueryChat
+    >>> df = titanic()
+    >>> qc = QueryChat(df, "titanic")
+    >>> app = qc.app()
+
+    """
+    return _read_packaged_csv("titanic.csv.gz")
+
+
+def tips() -> pd.DataFrame:
+    """
+    Load the tips dataset.
+
+    This dataset contains information about restaurant tips, including the total
+    bill, tip amount, and information about the party (sex, smoker status, day,
+    time, and party size).
+
+    Returns
+    -------
+    pandas.DataFrame
+        A DataFrame with 244 rows and 7 columns containing restaurant tip data.
+
+    Examples
+    --------
+    >>> from querychat.data import tips
+    >>> from querychat import QueryChat
+    >>> df = tips()
+    >>> qc = QueryChat(df, "tips")
+    >>> app = qc.app()
+
+    """
+    return _read_packaged_csv("tips.csv.gz")
+
+
+__all__ = ["tips", "titanic"]
diff --git a/pkg-py/src/querychat/data/tips.csv.gz b/pkg-py/src/querychat/data/tips.csv.gz
new file mode 100644
index 00000000..4d0d18f3
Binary files /dev/null and b/pkg-py/src/querychat/data/tips.csv.gz differ
diff --git a/pkg-py/src/querychat/data/titanic.csv.gz b/pkg-py/src/querychat/data/titanic.csv.gz
new file mode 100644
index 00000000..30bb696a
Binary files /dev/null and b/pkg-py/src/querychat/data/titanic.csv.gz differ
diff --git a/pkg-py/tests/test_data.py b/pkg-py/tests/test_data.py
new file mode 100644
index 00000000..0ee2f8f8
--- /dev/null
+++ b/pkg-py/tests/test_data.py
@@ -0,0 +1,127 @@
+"""Tests for the querychat.data module."""
+
+import pandas as pd
+from querychat.data import tips, titanic
+
+
+def test_titanic_returns_dataframe():
+    """Test that titanic() returns a pandas DataFrame."""
+    df = titanic()
+    assert isinstance(df, pd.DataFrame)
+
+
+def test_titanic_has_expected_shape():
+    """Test that the Titanic dataset has the expected number of rows and columns."""
+    df = titanic()
+    assert df.shape == (891, 15), f"Expected (891, 15) but got {df.shape}"
+
+
+def test_titanic_has_expected_columns():
+    """Test that the Titanic dataset has the expected column names."""
+    df = titanic()
+    expected_columns = [
+        "survived",
+        "pclass",
+        "sex",
+        "age",
+        "sibsp",
+        "parch",
+        "fare",
+        "embarked",
+        "class",
+        "who",
+        "adult_male",
+        "deck",
+        "embark_town",
+        "alive",
+        "alone",
+    ]
+    assert list(df.columns) == expected_columns
+
+
+def test_titanic_data_integrity():
+    """Test basic data integrity of the Titanic dataset."""
+    df = titanic()
+
+    # Check that survived column has only 0 and 1 values
+    assert set(df["survived"].dropna().unique()) <= {0, 1}
+
+    # Check that pclass has only 1, 2, 3
+    assert set(df["pclass"].dropna().unique()) <= {1, 2, 3}
+
+    # Check that sex has only 'male' and 'female'
+    assert set(df["sex"].dropna().unique()) <= {"male", "female"}
+
+    # Check that fare is non-negative
+    assert (df["fare"].dropna() >= 0).all()
+
+
+def test_titanic_creates_new_copy():
+    """Test that titanic() returns a new copy each time it's called."""
+    df1 = titanic()
+    df2 = titanic()
+
+    # They should not be the same object
+    assert df1 is not df2
+
+    # But they should have the same data
+    assert df1.equals(df2)
+
+
+def test_tips_returns_dataframe():
+    """Test that tips() returns a pandas DataFrame."""
+    df = tips()
+    assert isinstance(df, pd.DataFrame)
+
+
+def test_tips_has_expected_shape():
+    """Test that the tips dataset has the expected number of rows and columns."""
+    df = tips()
+    assert df.shape == (244, 7), f"Expected (244, 7) but got {df.shape}"
+
+
+def test_tips_has_expected_columns():
+    """Test that the tips dataset has the expected column names."""
+    df = tips()
+    expected_columns = [
+        "total_bill",
+        "tip",
+        "sex",
+        "smoker",
+        "day",
+        "time",
+        "size",
+    ]
+    assert list(df.columns) == expected_columns
+
+
+def test_tips_data_integrity():
+    """Test basic data integrity of the tips dataset."""
+    df = tips()
+
+    # Check that total_bill is positive
+    assert (df["total_bill"] > 0).all()
+
+    # Check that tip is non-negative
+    assert (df["tip"] >= 0).all()
+
+    # Check that sex has only expected values
+    assert set(df["sex"].dropna().unique()) <= {"Male", "Female"}
+
+    # Check that smoker has only expected values
+    assert set(df["smoker"].dropna().unique()) <= {"Yes", "No"}
+
+    # Check that size is positive
+    assert (df["size"] > 0).all()
+
+
+def test_tips_creates_new_copy():
+    """Test that tips() returns a new copy each time it's called."""
+    df1 = tips()
+    df2 = tips()
+
+    # They should not be the same object
+    assert df1 is not df2
+
+    # But they should have the same data
+    assert df1.equals(df2)
diff --git a/pkg-py/tests/test_querychat.py b/pkg-py/tests/test_querychat.py
index 893ff824..ebf5ec57 100644
--- a/pkg-py/tests/test_querychat.py
+++ b/pkg-py/tests/test_querychat.py
@@ -60,4 +60,3 @@ def test_querychat_custom_id(sample_df):
     )
 
     assert qc.id == "custom_id"
-