Skip to content

Commit c0e9ee2

Browse files
authored
Merge pull request #216 from posit-dev/feat-data-path-strings
feat: data path strings
2 parents e617965 + 7e80c56 commit c0e9ee2

File tree

4 files changed

+713
-5
lines changed

4 files changed

+713
-5
lines changed

docs/_quarto.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,7 @@ quartodoc:
211211
- name: assistant
212212
- name: load_dataset
213213
- name: get_data_path
214+
- name: connect_to_table
214215
- title: Utility Functions
215216
desc: >
216217
The *Utility Functions* group contains functions that are useful for accessing metadata about

pointblank/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
from pointblank.validate import (
3131
Validate,
3232
config,
33+
connect_to_table,
3334
get_action_metadata,
3435
get_column_count,
3536
get_data_path,
@@ -63,6 +64,7 @@
6364
"load_dataset",
6465
"get_data_path",
6566
"config",
67+
"connect_to_table",
6668
"preview",
6769
"missing_vals_tbl",
6870
"get_action_metadata",

pointblank/validate.py

Lines changed: 308 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@
104104
"Validate",
105105
"load_dataset",
106106
"config",
107+
"connect_to_table",
107108
"preview",
108109
"missing_vals_tbl",
109110
"get_action_metadata",
@@ -2110,6 +2111,236 @@ def get_val_info(self) -> dict[str, any]:
21102111
return self.val_info
21112112

21122113

2114+
# Maps a backend keyword that may appear in an Ibis error message to the pip command that
# installs the matching Ibis backend extra. Insertion order matters: the first keyword found
# in the error text wins.
_IBIS_BACKEND_INSTALL_CMDS = {
    "duckdb": "pip install 'ibis-framework[duckdb]'",
    "postgresql": "pip install 'ibis-framework[postgres]'",
    "postgres": "pip install 'ibis-framework[postgres]'",
    "mysql": "pip install 'ibis-framework[mysql]'",
    "sqlite": "pip install 'ibis-framework[sqlite]'",
    "bigquery": "pip install 'ibis-framework[bigquery]'",
    "snowflake": "pip install 'ibis-framework[snowflake]'",
}


def _find_missing_ibis_backend(error: Exception) -> tuple[str, str] | None:
    """Return `(backend, install_cmd)` if `error` reads like a missing backend, else `None`."""
    error_str = str(error).lower()

    # The "missing dependency" wording does not depend on the backend, so test it only once
    if "not found" not in error_str and "no module" not in error_str:
        return None

    for backend, install_cmd in _IBIS_BACKEND_INSTALL_CMDS.items():
        if backend in error_str:
            return backend, install_cmd

    return None


def connect_to_table(connection_string: str) -> Any:
    """
    Connect to a database table using a connection string.

    This utility function tests whether a connection string leads to a valid table and returns
    the table object if successful. It provides helpful error messages when no table is specified
    or when backend dependencies are missing.

    Parameters
    ----------
    connection_string
        A database connection string with a required table specification using the `::table_name`
        suffix. Supported formats are outlined in the *Supported Connection String Formats* section.

    Returns
    -------
    Any
        An Ibis table object for the specified database table.

    Raises
    ------
    ImportError
        If the Ibis library is not installed.
    ValueError
        If the connection string has no `::table_name` suffix (the message lists the tables that
        could be found), or if the requested table is not found in the database.
    ConnectionError
        If a required Ibis backend appears to be missing or the connection fails for any other
        reason.

    Supported Connection String Formats
    -----------------------------------
    The `connection_string` parameter must include a valid connection string with a table name
    specified using the `::` syntax. For example:

    - DuckDB: `"duckdb:///path/to/database.ddb::table_name"`
    - SQLite: `"sqlite:///path/to/database.db::table_name"`
    - PostgreSQL: `"postgresql://user:password@localhost:5432/database::table_name"`
    - MySQL: `"mysql://user:password@localhost:3306/database::table_name"`
    - BigQuery: `"bigquery://project/dataset::table_name"`
    - Snowflake: `"snowflake://user:password@account/database/schema::table_name"`

    If the connection string does not include a table name, the function will attempt to connect to
    the database and list available tables, providing guidance on how to specify a table.

    Examples
    --------
    Connect to a DuckDB table:

    ```{python}
    import pointblank as pb

    # Get path to a DuckDB database file from package data
    duckdb_path = pb.get_data_path("game_revenue", "duckdb")

    # Connect to the `game_revenue` table in the DuckDB database
    game_revenue = pb.connect_to_table(f"duckdb:///{duckdb_path}::game_revenue")

    # Use with the `preview()` function
    pb.preview(game_revenue)
    ```

    Here are some backend-specific connection examples:

    ```python
    # PostgreSQL
    pg_table = pb.connect_to_table(
        "postgresql://user:password@localhost:5432/warehouse::customer_data"
    )

    # SQLite
    sqlite_table = pb.connect_to_table("sqlite:///local_data.db::products")

    # BigQuery
    bq_table = pb.connect_to_table("bigquery://my-project/analytics::daily_metrics")
    ```

    This function requires the Ibis library with appropriate backend drivers:

    ```bash
    # You can install a set of common backends:
    pip install 'ibis-framework[duckdb,postgres,mysql,sqlite]'

    # ...or specific backends as needed:
    pip install 'ibis-framework[duckdb]' # for DuckDB
    pip install 'ibis-framework[postgres]' # for PostgreSQL
    ```
    """
    # Check if Ibis is available
    if not _is_lib_present(lib_name="ibis"):
        raise ImportError(
            "The Ibis library is not installed but is required for database connection strings.\n"
            "Install it with: pip install 'ibis-framework[duckdb]' (or other backend as needed)"
        )

    import ibis

    # Without a table specification this function always raises; the connection is only opened
    # so that the error message can list the tables the user may have meant
    if "::" not in connection_string:
        try:
            conn = ibis.connect(connection_string)

            # Listing tables is best-effort: a failure here just yields the shorter message
            try:
                available_tables = conn.list_tables()
            except Exception:
                available_tables = []

            conn.disconnect()

        except ValueError:
            # A ValueError coming out of the connection attempt is passed through untouched
            raise

        except Exception as e:
            missing_backend = _find_missing_ibis_backend(e)
            if missing_backend is not None:
                backend, install_cmd = missing_backend
                raise ConnectionError(
                    f"Missing {backend.upper()} backend for Ibis. Install it with:\n"
                    f" {install_cmd}\n\n"
                    f"Original error: {e}\n\n"
                    "Supported connection string formats:\n"
                    "- DuckDB: 'duckdb:///path/to/file.ddb::table_name'\n"
                    "- SQLite: 'sqlite:///path/to/file.db::table_name'\n"
                    "- PostgreSQL: 'postgresql://user:pass@host:port/db::table_name'\n"
                    "- MySQL: 'mysql://user:pass@host:port/db::table_name'\n"
                    "- BigQuery: 'bigquery://project/dataset::table_name'\n"
                    "- Snowflake: 'snowflake://user:pass@account/db/schema::table_name'\n"
                    "\nNote: Use '::table_name' to specify the table within the database."
                ) from e

            # Generic connection error
            raise ConnectionError(
                f"Failed to connect to database using connection string: {connection_string}\n"
                f"Error: {e}\n\n"
                f"No table specified. Use the format: {connection_string}::TABLE_NAME"
            ) from e

        if available_tables:
            table_list = "\n".join(f" - {table}" for table in available_tables)
            # Show ready-to-use connection strings for the first few tables
            examples = "".join(
                f" {connection_string}::{table}\n" for table in available_tables[:3]
            )
            error_msg = (
                f"No table specified in connection string: {connection_string}\n\n"
                f"Available tables in the database:\n{table_list}\n\n"
                f"To access a specific table, use the format:\n"
                f" {connection_string}::TABLE_NAME\n\n"
                f"Examples:\n"
                f"{examples}"
            )
        else:
            error_msg = (
                f"No table specified in connection string: {connection_string}\n\n"
                f"No tables found in the database or unable to list tables.\n\n"
                f"To access a specific table, use the format:\n"
                f" {connection_string}::TABLE_NAME"
            )

        raise ValueError(error_msg)

    # "::" is known to be present at this point, so the split always yields exactly two parts
    base_connection, table_name = connection_string.rsplit("::", 1)

    # Stays `None` when the connection itself fails, so the handler below knows not to use it
    conn = None

    try:
        conn = ibis.connect(base_connection)
        return conn.table(table_name)

    except Exception as e:
        missing_backend = _find_missing_ibis_backend(e)
        if missing_backend is not None:
            backend, install_cmd = missing_backend
            raise ConnectionError(
                f"Missing {backend.upper()} backend for Ibis. Install it with:\n"
                f" {install_cmd}\n\n"
                f"Original error: {e}"
            ) from e

        error_str = str(e).lower()

        # Check if table doesn't exist
        if "table" in error_str and ("not found" in error_str or "does not exist" in error_str):
            # Only the table listing is guarded here. The ValueErrors are raised outside of this
            # `try` so that they are not caught and replaced by the generic fallback message.
            available_tables = None
            if conn is not None:
                try:
                    available_tables = conn.list_tables()
                except Exception:
                    available_tables = None

            if available_tables is None:
                raise ValueError(
                    f"Table '{table_name}' not found in database. "
                    f"Check the table name and connection string."
                ) from e

            if available_tables:
                table_list = "\n".join(f" - {table}" for table in available_tables)
                raise ValueError(
                    f"Table '{table_name}' not found in database.\n\n"
                    f"Available tables:\n{table_list}\n\n"
                    f"Check the table name and try again with:\n"
                    f" {base_connection}::CORRECT_TABLE_NAME"
                ) from e

            raise ValueError(
                f"Table '{table_name}' not found and no tables available in database."
            ) from e

        # Generic connection error
        raise ConnectionError(
            f"Failed to connect to table '{table_name}' using: {base_connection}\nError: {e}"
        ) from e
2342+
2343+
21132344
@dataclass
21142345
class Validate:
21152346
"""
@@ -2143,11 +2374,13 @@ class Validate:
21432374
----------
21442375
data
21452376
The table to validate, which could be a DataFrame object, an Ibis table object, a CSV
2146-
file path, or a Parquet file path. When providing a CSV or Parquet file path (as a string
2147-
or `pathlib.Path` object), the file will be automatically loaded using an available
2148-
DataFrame library (Polars or Pandas). Parquet input also supports glob patterns,
2149-
directories containing .parquet files, and Spark-style partitioned datasets. Read the
2150-
*Supported Input Table Types* section for details on the supported table types.
2377+
file path, a Parquet file path, or a database connection string. When providing a CSV or
2378+
Parquet file path (as a string or `pathlib.Path` object), the file will be automatically
2379+
loaded using an available DataFrame library (Polars or Pandas). Parquet input also supports
2380+
glob patterns, directories containing .parquet files, and Spark-style partitioned datasets.
2381+
Connection strings enable direct database access via Ibis with a required table specification
2382+
using the `::table_name` suffix. Read the *Supported Input Table Types* section for details
2383+
on the supported table types.
21512384
tbl_name
21522385
An optional name to assign to the input table object. If no value is provided, a name will
21532386
be generated based on whatever information is available. This table name will be displayed
@@ -2220,6 +2453,7 @@ class Validate:
22202453
- CSV files (string path or `pathlib.Path` object with `.csv` extension)
22212454
- Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
22222455
extension, or partitioned dataset)
2456+
- Database connection strings (RFC 3986 URI format with a required `::table_name` table specification)
22232457

22242458
The table types marked with an asterisk need to be prepared as Ibis tables (with type of
22252459
`ibis.expr.types.relations.Table`). Furthermore, the use of `Validate` with such tables requires
@@ -2230,6 +2464,18 @@ class Validate:
22302464
provided. The file will be automatically detected and loaded using the best available DataFrame
22312465
library. The loading preference is Polars first, then Pandas as a fallback.
22322466

2467+
Connection strings follow database URL formats and must also specify a table using the
2468+
`::table_name` suffix. Examples include:
2469+
2470+
- `"duckdb:///path/to/database.ddb::table_name"`
2471+
- `"sqlite:///path/to/database.db::table_name"`
2472+
- `"postgresql://user:password@localhost:5432/database::table_name"`
2473+
- `"mysql://user:password@localhost:3306/database::table_name"`
2474+
- `"bigquery://project/dataset::table_name"`
2475+
- `"snowflake://user:password@account/database/schema::table_name"`
2476+
2477+
When using connection strings, the Ibis library with the appropriate backend driver is required.
2478+
22332479
Thresholds
22342480
----------
22352481
The `thresholds=` parameter is used to set the failure-condition levels for all validation
@@ -2612,6 +2858,33 @@ def send_report():
26122858

26132859
Both Polars and Pandas handle partitioned datasets natively, so this works seamlessly with
26142860
either DataFrame library. The loading preference is Polars first, then Pandas as a fallback.
2861+
2862+
### Working with Database Connection Strings
2863+
2864+
The `Validate` class supports database connection strings for direct validation of database
2865+
tables. Connection strings must specify a table using the `::table_name` suffix:
2866+
2867+
```{python}
2868+
# Get path to a DuckDB database file from package data
2869+
duckdb_path = pb.get_data_path("game_revenue", "duckdb")
2870+
2871+
validation_9 = (
2872+
pb.Validate(
2873+
data=f"duckdb:///{duckdb_path}::game_revenue",
2874+
label="DuckDB Game Revenue Validation"
2875+
)
2876+
.col_exists(["player_id", "session_id", "item_revenue"])
2877+
.col_vals_gt(columns="item_revenue", value=0)
2878+
.interrogate()
2879+
)
2880+
2881+
validation_9
2882+
```
2883+
2884+
For comprehensive documentation on supported connection string formats, error handling, and
2885+
installation requirements, see the [`connect_to_table()`](`pointblank.connect_to_table`)
2886+
function. This function handles all the connection logic and provides helpful error messages
2887+
when table specifications are missing or backend dependencies are not installed.
26152888
"""
26162889

26172890
data: FrameT | Any
@@ -2625,6 +2898,9 @@ def send_report():
26252898
locale: str | None = None
26262899

26272900
def __post_init__(self):
2901+
# Handle connection string input for the data parameter
2902+
self.data = self._process_connection_string_input(self.data)
2903+
26282904
# Handle CSV file input for the data parameter
26292905
self.data = self._process_csv_input(self.data)
26302906

@@ -2675,6 +2951,33 @@ def __post_init__(self):
26752951

26762952
self.validation_info = []
26772953

2954+
def _process_connection_string_input(self, data: FrameT | Any) -> FrameT | Any:
2955+
"""
2956+
Process data parameter to handle database connection strings.
2957+
2958+
Uses the `connect_to_table()` utility function to handle RFC 3986 URI-formatted
2959+
connection strings with table specifications. Returns the original data if it's
2960+
not a connection string.
2961+
2962+
For more details on supported connection string formats, see the documentation
2963+
for `connect_to_table()`.
2964+
"""
2965+
# Check if data is a string that looks like a connection string
2966+
if not isinstance(data, str):
2967+
return data
2968+
2969+
# Basic connection string patterns
2970+
connection_patterns = [
2971+
"://", # General URL-like pattern
2972+
]
2973+
2974+
# Check if it looks like a connection string
2975+
if not any(pattern in data for pattern in connection_patterns):
2976+
return data
2977+
2978+
# Use the utility function to connect to the table
2979+
return connect_to_table(data)
2980+
26782981
def _process_csv_input(self, data: FrameT | Any) -> FrameT | Any:
26792982
"""
26802983
Process data parameter to handle CSV file inputs.

0 commit comments

Comments
 (0)