|
| 1 | +import re |
1 | 2 | import enum |
2 | 3 | import hashlib |
3 | 4 | import importlib |
|
8 | 9 | from datetime import datetime, timedelta, timezone |
9 | 10 | from types import ModuleType |
10 | 11 | from typing import Any, Dict, List, Literal, Optional, Protocol, Tuple, Union, Callable |
11 | | -from toolz import compose, valfilter # type: ignore |
| 12 | +from toolz import compose, valfilter, first # type: ignore |
12 | 13 | from toolz.curried import map as map_curried |
13 | 14 |
|
14 | 15 | logger = logging.getLogger("cuallee") |
15 | | -__version__ = "0.13.2" |
16 | | -# Verify Libraries Available |
17 | | -# ========================== |
18 | | -try: |
19 | | - from pandas import DataFrame as pandas_dataframe # type: ignore |
20 | | -except (ModuleNotFoundError, ImportError): |
21 | | - logger.debug("KO: Pandas") |
22 | | - |
23 | | -try: |
24 | | - from polars.dataframe.frame import DataFrame as polars_dataframe # type: ignore |
25 | | -except (ModuleNotFoundError, ImportError): |
26 | | - logger.debug("KO: Polars") |
27 | | - |
28 | | -try: |
29 | | - from pyspark.sql import DataFrame as pyspark_dataframe |
30 | | -except (ModuleNotFoundError, ImportError): |
31 | | - logger.debug("KO: PySpark") |
32 | | - |
33 | | -try: |
34 | | - from pyspark.sql.connect.dataframe import DataFrame as pyspark_connect_dataframe |
35 | | -except (ModuleNotFoundError, ImportError): |
36 | | - logger.debug("KO: PySpark Connect") |
37 | | - |
38 | | -try: |
39 | | - from snowflake.snowpark import DataFrame as snowpark_dataframe # type: ignore |
40 | | -except (ModuleNotFoundError, ImportError): |
41 | | - logger.debug("KO: Snowpark") |
42 | | - |
43 | | -try: |
44 | | - from duckdb import DuckDBPyConnection as duckdb_dataframe # type: ignore |
45 | | -except (ModuleNotFoundError, ImportError): |
46 | | - logger.debug("KO: DuckDB") |
47 | | - |
48 | | -try: |
49 | | - from google.cloud import bigquery |
50 | | -except (ModuleNotFoundError, ImportError): |
51 | | - logger.debug("KO: BigQuery") |
52 | | - |
53 | | -try: |
54 | | - from daft import DataFrame as daft_dataframe |
55 | | -except (ModuleNotFoundError, ImportError): |
56 | | - logger.debug("KO: BigQuery") |
| 16 | +__version__ = "0.14.0" |
57 | 17 |
|
58 | 18 |
|
59 | 19 | class CustomComputeException(Exception): |
@@ -252,6 +212,7 @@ def __init__( |
252 | 212 | self.rows = -1 |
253 | 213 | self.config: Dict[str, str] = {} |
254 | 214 | self.table_name = table_name |
| 215 | + self.dtype = "cuallee.dataframe" |
255 | 216 | try: |
256 | 217 | from .iso.checks import ISO |
257 | 218 | from .bio.checks import BioChecks |
@@ -1293,49 +1254,26 @@ def validate(self, dataframe: Any): |
1293 | 1254 |         # Stop execution if there are no rules in the check |
1294 | 1255 | assert not self.empty, "Check is empty. Try adding some rules?" |
1295 | 1256 |
|
1296 | | - # When dataframe is PySpark DataFrame API |
1297 | | - if "pyspark_dataframe" in globals() and isinstance( |
1298 | | - dataframe, pyspark_dataframe |
1299 | | - ): |
1300 | | - self.compute_engine = importlib.import_module("cuallee.pyspark_validation") |
1301 | | - |
1302 | | - elif "pyspark_connect_dataframe" in globals() and isinstance( |
1303 | | - dataframe, pyspark_connect_dataframe |
1304 | | - ): |
1305 | | - self.compute_engine = importlib.import_module("cuallee.pyspark_validation") |
1306 | | - |
1307 | | - # When dataframe is Pandas DataFrame API |
1308 | | - elif "pandas_dataframe" in globals() and isinstance( |
1309 | | - dataframe, pandas_dataframe |
1310 | | - ): |
1311 | | - self.compute_engine = importlib.import_module("cuallee.pandas_validation") |
1312 | | - |
1313 | | - # When dataframe is Snowpark DataFrame API |
1314 | | - elif "snowpark_dataframe" in globals() and isinstance( |
1315 | | - dataframe, snowpark_dataframe |
1316 | | - ): |
1317 | | - self.compute_engine = importlib.import_module("cuallee.snowpark_validation") |
1318 | | - |
1319 | | - elif "duckdb_dataframe" in globals() and isinstance( |
1320 | | - dataframe, duckdb_dataframe |
1321 | | - ): |
1322 | | - self.compute_engine = importlib.import_module("cuallee.duckdb_validation") |
1323 | | - |
1324 | | - elif "bigquery" in globals() and isinstance(dataframe, bigquery.table.Table): |
1325 | | - self.compute_engine = importlib.import_module("cuallee.bigquery_validation") |
1326 | | - |
1327 | | - elif "polars_dataframe" in globals() and isinstance( |
1328 | | - dataframe, polars_dataframe |
1329 | | - ): |
1330 | | - self.compute_engine = importlib.import_module("cuallee.polars_validation") |
1331 | | - |
1332 | | - elif "daft_dataframe" in globals() and isinstance(dataframe, daft_dataframe): |
1333 | | - self.compute_engine = importlib.import_module("cuallee.daft_validation") |
1334 | | - |
1335 | | - else: |
1336 | | - raise Exception( |
1337 | | - "Cuallee is not ready for this data structure. You can log a Feature Request in Github." |
1338 | | - ) |
| 1257 | + self.dtype = first(re.match(r".*'(.*)'", str(type(dataframe))).groups()) |
| 1258 | + match self.dtype: |
| 1259 | + case self.dtype if "pyspark" in self.dtype: |
| 1260 | + self.compute_engine = importlib.import_module("cuallee.pyspark_validation") |
| 1261 | + case self.dtype if "pandas" in self.dtype: |
| 1262 | + self.compute_engine = importlib.import_module("cuallee.pandas_validation") |
| 1263 | + case self.dtype if "snowpark" in self.dtype: |
| 1264 | + self.compute_engine = importlib.import_module("cuallee.snowpark_validation") |
| 1265 | + case self.dtype if "polars" in self.dtype: |
| 1266 | + self.compute_engine = importlib.import_module("cuallee.polars_validation") |
| 1267 | + case self.dtype if "duckdb" in self.dtype: |
| 1268 | + self.compute_engine = importlib.import_module("cuallee.duckdb_validation") |
| 1269 | + case self.dtype if "bigquery" in self.dtype: |
| 1270 | + self.compute_engine = importlib.import_module("cuallee.bigquery_validation") |
| 1271 | + case self.dtype if "daft" in self.dtype: |
| 1272 | + self.compute_engine = importlib.import_module("cuallee.daft_validation") |
| 1273 | + case _: |
| 1274 | + raise NotImplementedError(f"{self.dtype} is not yet implemented in cuallee") |
| 1275 | + |
| 1276 | + |
1339 | 1277 |
|
1340 | 1278 | assert self.compute_engine.validate_data_types( |
1341 | 1279 | self.rules, dataframe |
|
0 commit comments