104
104
"Validate",
105
105
"load_dataset",
106
106
"config",
107
+ "connect_to_table",
107
108
"preview",
108
109
"missing_vals_tbl",
109
110
"get_action_metadata",
@@ -2110,6 +2111,236 @@ def get_val_info(self) -> dict[str, any]:
2110
2111
return self.val_info
2111
2112
2112
2113
2114
def connect_to_table(connection_string: str) -> Any:
    """
    Connect to a database table using a connection string.

    This utility function tests whether a connection string leads to a valid table and returns
    the table object if successful. It provides helpful error messages when no table is specified
    or when backend dependencies are missing.

    Parameters
    ----------
    connection_string
        A database connection string with a required table specification using the `::table_name`
        suffix. Supported formats are outlined in the *Supported Connection String Formats* section.

    Returns
    -------
    Any
        An Ibis table object for the specified database table.

    Raises
    ------
    ImportError
        If the Ibis library is not installed.
    ValueError
        If no `::table_name` suffix is given (the message lists any tables found in the database)
        or if the requested table cannot be found. A `ValueError` raised by `ibis.connect()`
        itself for a string without a table suffix is propagated unchanged.
    ConnectionError
        If the required Ibis backend appears to be missing or the connection fails for any other
        reason.

    Supported Connection String Formats
    -----------------------------------
    The `connection_string` parameter must include a valid connection string with a table name
    specified using the `::` syntax. For example:

    - DuckDB: `"duckdb:///path/to/database.ddb::table_name"`
    - SQLite: `"sqlite:///path/to/database.db::table_name"`
    - PostgreSQL: `"postgresql://user:password@localhost:5432/database::table_name"`
    - MySQL: `"mysql://user:password@localhost:3306/database::table_name"`
    - BigQuery: `"bigquery://project/dataset::table_name"`
    - Snowflake: `"snowflake://user:password@account/database/schema::table_name"`

    If the connection string does not include a table name, the function will attempt to connect to
    the database and list available tables, then raise a `ValueError` whose message provides
    guidance on how to specify a table.

    Examples
    --------
    Connect to a DuckDB table:

    ```{python}
    import pointblank as pb

    # Get path to a DuckDB database file from package data
    duckdb_path = pb.get_data_path("game_revenue", "duckdb")

    # Connect to the `game_revenue` table in the DuckDB database
    game_revenue = pb.connect_to_table(f"duckdb:///{duckdb_path}::game_revenue")

    # Use with the `preview()` function
    pb.preview(game_revenue)
    ```

    Here are some backend-specific connection examples:

    ```python
    # PostgreSQL
    pg_table = pb.connect_to_table(
        "postgresql://user:password@localhost:5432/warehouse::customer_data"
    )

    # SQLite
    sqlite_table = pb.connect_to_table("sqlite:///local_data.db::products")

    # BigQuery
    bq_table = pb.connect_to_table("bigquery://my-project/analytics::daily_metrics")
    ```

    This function requires the Ibis library with appropriate backend drivers:

    ```bash
    # You can install a set of common backends:
    pip install 'ibis-framework[duckdb,postgres,mysql,sqlite]'

    # ...or specific backends as needed:
    pip install 'ibis-framework[duckdb]' # for DuckDB
    pip install 'ibis-framework[postgres]' # for PostgreSQL
    ```
    """
    # Check if Ibis is available
    if not _is_lib_present(lib_name="ibis"):
        raise ImportError(
            "The Ibis library is not installed but is required for database connection strings.\n"
            "Install it with: pip install 'ibis-framework[duckdb]' (or other backend as needed)"
        )

    import ibis

    if "::" not in connection_string:
        # No table was specified. This branch always raises; connecting is attempted only so
        # that the error message can list the tables the caller could have asked for.
        try:
            conn = ibis.connect(connection_string)
        except ValueError:
            # Kept for backward compatibility: a ValueError coming out of `ibis.connect()` has
            # always been propagated as-is rather than wrapped in a ConnectionError.
            raise
        except Exception as e:
            missing_backend_msg = _missing_backend_message(e)
            if missing_backend_msg is not None:
                raise ConnectionError(
                    f"{missing_backend_msg}\n\n{_CONNECTION_STRING_FORMATS_HELP}"
                ) from e

            # Generic connection error
            raise ConnectionError(
                f"Failed to connect to database using connection string: {connection_string}\n"
                f"Error: {e}\n\n"
                f"No table specified. Use the format: {connection_string}::TABLE_NAME"
            ) from e

        try:
            available_tables = conn.list_tables()
        except Exception:
            # Listing tables is only a convenience for the error message
            available_tables = []
        finally:
            _disconnect_quietly(conn)

        raise ValueError(_no_table_specified_message(connection_string, available_tables))

    # Split on the last `::` so that any earlier `::` stays part of the connection URL.
    # The membership check above guarantees this yields exactly two parts.
    base_connection, table_name = connection_string.rsplit("::", 1)

    # `conn` stays None if `ibis.connect()` itself fails, so the error handling below can tell
    # "could not connect" apart from "connected but the table lookup failed"
    conn = None
    try:
        conn = ibis.connect(base_connection)
        # The connection is intentionally left open on success: the returned table expression
        # needs it when it is later executed.
        return conn.table(table_name)

    except Exception as e:
        error_str = str(e).lower()
        missing_backend_msg = _missing_backend_message(e)

        error: Exception
        if missing_backend_msg is not None:
            error = ConnectionError(missing_backend_msg)
        elif "table" in error_str and ("not found" in error_str or "does not exist" in error_str):
            error = ValueError(_table_not_found_message(conn, base_connection, table_name))
        else:
            # Generic connection error
            error = ConnectionError(
                f"Failed to connect to table '{table_name}' using: {base_connection}\nError: {e}"
            )

        # Nothing will use this connection once we raise, so release it
        _disconnect_quietly(conn)
        raise error from e


# Keyword that may appear in a driver error message -> command that installs that Ibis backend.
# Insertion order matters: the first keyword found in the error text wins.
_IBIS_BACKEND_INSTALL_COMMANDS = {
    "duckdb": "pip install 'ibis-framework[duckdb]'",
    "postgresql": "pip install 'ibis-framework[postgres]'",
    "postgres": "pip install 'ibis-framework[postgres]'",
    "mysql": "pip install 'ibis-framework[mysql]'",
    "sqlite": "pip install 'ibis-framework[sqlite]'",
    "bigquery": "pip install 'ibis-framework[bigquery]'",
    "snowflake": "pip install 'ibis-framework[snowflake]'",
}

# Reference block appended to the missing-backend error when no table was specified
_CONNECTION_STRING_FORMATS_HELP = (
    "Supported connection string formats:\n"
    "- DuckDB: 'duckdb:///path/to/file.ddb::table_name'\n"
    "- SQLite: 'sqlite:///path/to/file.db::table_name'\n"
    "- PostgreSQL: 'postgresql://user:pass@host:port/db::table_name'\n"
    "- MySQL: 'mysql://user:pass@host:port/db::table_name'\n"
    "- BigQuery: 'bigquery://project/dataset::table_name'\n"
    "- Snowflake: 'snowflake://user:pass@account/db/schema::table_name'\n"
    "\nNote: Use '::table_name' to specify the table within the database."
)


def _missing_backend_message(error: Exception) -> str | None:
    """Return install guidance if `error` looks like a missing Ibis backend, otherwise `None`."""
    error_str = str(error).lower()
    if "not found" not in error_str and "no module" not in error_str:
        return None

    for backend, install_cmd in _IBIS_BACKEND_INSTALL_COMMANDS.items():
        if backend in error_str:
            return (
                f"Missing {backend.upper()} backend for Ibis. Install it with:\n"
                f" {install_cmd}\n\n"
                f"Original error: {error}"
            )
    return None


def _disconnect_quietly(conn: Any) -> None:
    """Close `conn` (if there is one) without letting a cleanup failure mask the real error."""
    if conn is None:
        return
    try:
        conn.disconnect()
    except Exception:
        # Best-effort cleanup only: the caller is about to raise a more informative error
        pass


def _no_table_specified_message(connection_string: str, available_tables: Any) -> str:
    """Build the error text for a connection string that lacks the `::table_name` suffix."""
    if not available_tables:
        return (
            f"No table specified in connection string: {connection_string}\n\n"
            f"No tables found in the database or unable to list tables.\n\n"
            f"To access a specific table, use the format:\n"
            f" {connection_string}::TABLE_NAME"
        )

    table_list = "\n".join(f" - {table}" for table in available_tables)
    # Show ready-to-use connection strings for the first few tables
    examples = "".join(f" {connection_string}::{table}\n" for table in available_tables[:3])
    return (
        f"No table specified in connection string: {connection_string}\n\n"
        f"Available tables in the database:\n{table_list}\n\n"
        f"To access a specific table, use the format:\n"
        f" {connection_string}::TABLE_NAME\n\n"
        f"Examples:\n"
        f"{examples}"
    )


def _table_not_found_message(conn: Any, base_connection: str, table_name: str) -> str:
    """Build the error text for a missing table, listing the available tables when possible."""
    generic_msg = (
        f"Table '{table_name}' not found in database. "
        f"Check the table name and connection string."
    )
    if conn is None:
        # The connection itself failed, so there is nothing to list
        return generic_msg

    # Only the listing call is guarded; the message is built outside the `try` so that it can
    # never be swallowed by this handler.
    try:
        available_tables = conn.list_tables()
    except Exception:
        return generic_msg

    if not available_tables:
        return f"Table '{table_name}' not found and no tables available in database."

    table_list = "\n".join(f" - {table}" for table in available_tables)
    return (
        f"Table '{table_name}' not found in database.\n\n"
        f"Available tables:\n{table_list}\n\n"
        f"Check the table name and try again with:\n"
        f" {base_connection}::CORRECT_TABLE_NAME"
    )
2342
+
2343
+
2113
2344
@dataclass
2114
2345
class Validate:
2115
2346
"""
@@ -2143,11 +2374,13 @@ class Validate:
2143
2374
----------
2144
2375
data
2145
2376
The table to validate, which could be a DataFrame object, an Ibis table object, a CSV
2146
- file path, or a Parquet file path. When providing a CSV or Parquet file path (as a string
2147
- or `pathlib.Path` object), the file will be automatically loaded using an available
2148
- DataFrame library (Polars or Pandas). Parquet input also supports glob patterns,
2149
- directories containing .parquet files, and Spark-style partitioned datasets. Read the
2150
- *Supported Input Table Types* section for details on the supported table types.
2377
+ file path, a Parquet file path, or a database connection string. When providing a CSV or
2378
+ Parquet file path (as a string or `pathlib.Path` object), the file will be automatically
2379
+ loaded using an available DataFrame library (Polars or Pandas). Parquet input also supports
2380
+ glob patterns, directories containing .parquet files, and Spark-style partitioned datasets.
2381
+ Connection strings enable direct database access via Ibis with a required table specification
2382
+ using the `::table_name` suffix. Read the *Supported Input Table Types* section for details
2383
+ on the supported table types.
2151
2384
tbl_name
2152
2385
An optional name to assign to the input table object. If no value is provided, a name will
2153
2386
be generated based on whatever information is available. This table name will be displayed
@@ -2220,6 +2453,7 @@ class Validate:
2220
2453
- CSV files (string path or `pathlib.Path` object with `.csv` extension)
2221
2454
- Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
2222
2455
extension, or partitioned dataset)
2456
+ - Database connection strings (RFC 3986 URI format with a required `::table_name` suffix)
2223
2457
2224
2458
The table types marked with an asterisk need to be prepared as Ibis tables (with type of
2225
2459
`ibis.expr.types.relations.Table`). Furthermore, the use of `Validate` with such tables requires
@@ -2230,6 +2464,18 @@ class Validate:
2230
2464
provided. The file will be automatically detected and loaded using the best available DataFrame
2231
2465
library. The loading preference is Polars first, then Pandas as a fallback.
2232
2466
2467
+ Connection strings follow database URL formats and must also specify a table using the
2468
+ `::table_name` suffix. Examples include:
2469
+
2470
+ - `"duckdb:///path/to/database.ddb::table_name"`
2471
+ - `"sqlite:///path/to/database.db::table_name"`
2472
+ - `"postgresql://user:password@localhost:5432/database::table_name"`
2473
+ - `"mysql://user:password@localhost:3306/database::table_name"`
2474
+ - `"bigquery://project/dataset::table_name"`
2475
+ - `"snowflake://user:password@account/database/schema::table_name"`
2476
+
2477
+ When using connection strings, the Ibis library with the appropriate backend driver is required.
2478
+
2233
2479
Thresholds
2234
2480
----------
2235
2481
The `thresholds=` parameter is used to set the failure-condition levels for all validation
@@ -2612,6 +2858,33 @@ def send_report():
2612
2858
2613
2859
Both Polars and Pandas handle partitioned datasets natively, so this works seamlessly with
2614
2860
either DataFrame library. The loading preference is Polars first, then Pandas as a fallback.
2861
+
2862
+ ### Working with Database Connection Strings
2863
+
2864
+ The `Validate` class supports database connection strings for direct validation of database
2865
+ tables. Connection strings must specify a table using the `::table_name` suffix:
2866
+
2867
+ ```{python}
2868
+ # Get path to a DuckDB database file from package data
2869
+ duckdb_path = pb.get_data_path("game_revenue", "duckdb")
2870
+
2871
+ validation_9 = (
2872
+ pb.Validate(
2873
+ data=f"duckdb:///{duckdb_path}::game_revenue",
2874
+ label="DuckDB Game Revenue Validation"
2875
+ )
2876
+ .col_exists(["player_id", "session_id", "item_revenue"])
2877
+ .col_vals_gt(columns="item_revenue", value=0)
2878
+ .interrogate()
2879
+ )
2880
+
2881
+ validation_9
2882
+ ```
2883
+
2884
+ For comprehensive documentation on supported connection string formats, error handling, and
2885
+ installation requirements, see the [`connect_to_table()`](`pointblank.connect_to_table`)
2886
+ function. This function handles all the connection logic and provides helpful error messages
2887
+ when table specifications are missing or backend dependencies are not installed.
2615
2888
"""
2616
2889
2617
2890
data: FrameT | Any
@@ -2625,6 +2898,9 @@ def send_report():
2625
2898
locale: str | None = None
2626
2899
2627
2900
def __post_init__(self):
2901
+ # Handle connection string input for the data parameter
2902
+ self.data = self._process_connection_string_input(self.data)
2903
+
2628
2904
# Handle CSV file input for the data parameter
2629
2905
self.data = self._process_csv_input(self.data)
2630
2906
@@ -2675,6 +2951,33 @@ def __post_init__(self):
2675
2951
2676
2952
self.validation_info = []
2677
2953
2954
+ def _process_connection_string_input(self, data: FrameT | Any) -> FrameT | Any:
2955
+ """
2956
+ Process data parameter to handle database connection strings.
2957
+
2958
+ Uses the `connect_to_table()` utility function to handle RFC 3986 URI-formatted
2959
+ connection strings with table specifications. Returns the original data if it's
2960
+ not a connection string.
2961
+
2962
+ For more details on supported connection string formats, see the documentation
2963
+ for `connect_to_table()`.
2964
+ """
2965
+ # Check if data is a string that looks like a connection string
2966
+ if not isinstance(data, str):
2967
+ return data
2968
+
2969
+ # Basic connection string patterns
2970
+ connection_patterns = [
2971
+ "://", # General URL-like pattern
2972
+ ]
2973
+
2974
+ # Check if it looks like a connection string
2975
+ if not any(pattern in data for pattern in connection_patterns):
2976
+ return data
2977
+
2978
+ # Use the utility function to connect to the table
2979
+ return connect_to_table(data)
2980
+
2678
2981
def _process_csv_input(self, data: FrameT | Any) -> FrameT | Any:
2679
2982
"""
2680
2983
Process data parameter to handle CSV file inputs.
0 commit comments