def to_duckdb_soda_configuration(server):
    """Build the Soda data source configuration for a DuckDB server as YAML text.

    Args:
        server: Server entry of the data contract. The attributes ``database``,
            ``read_only``, ``schema_`` and ``type`` are read from it.

    Returns:
        A YAML string containing one ``data_source <server.type>`` mapping with
        the keys ``type``, ``path``, ``read_only`` and, when a schema is set,
        ``schema_``.

    Raises:
        DataContractException: If ``database`` is missing or empty, or if
            ``read_only`` has not been set at all.
    """
    database = getattr(server, "database", None)
    if not database:
        raise DataContractException(
            type="duckdb-connection",
            name="missing_database",
            reason="Database is required for DuckDB connection. Specify the database file in which your tables exist.",
            engine="datacontract",
        )

    read_only = getattr(server, "read_only", None)
    # Only "not set" is an error. An explicit False is a legitimate answer to
    # "should the database be opened in read-only mode" and must not be rejected.
    if read_only is None:
        raise DataContractException(
            type="duckdb-connection",
            name="missing_read_only",
            reason="read_only is required for DuckDB connection. Specify if the database should be opened in read-only mode.",
            engine="datacontract",
        )

    data_source = {
        "type": "duckdb",
        "path": database,
        "read_only": read_only,
    }

    # Guard on the same attribute whose value is written, so the key is only
    # emitted when a schema is actually configured.
    # NOTE(review): the "schema_" key name is kept unchanged; confirm it against
    # the option name Soda's DuckDB data source expects.
    schema = getattr(server, "schema_", None)
    if schema:
        data_source["schema_"] = schema

    soda_configuration = {f"data_source {server.type}": data_source}
    return yaml.dump(soda_configuration)
def test_test_duckdb():
    """Run the data contract test against a freshly rebuilt DuckDB fixture database."""
    # Rebuild the database file first so the run does not depend on leftovers
    # from an earlier execution.
    _init_sql("fixtures/duckdb/data/data.sql")

    datacontract_file = "fixtures/duckdb/datacontract.yaml"
    data_contract_str = _setup_datacontract(datacontract_file)
    data_contract = DataContract(data_contract_str=data_contract_str)

    run: Run = data_contract.test()

    assert run.result == "passed"
    assert all(check.result == ResultEnum.passed for check in run.checks)


def _setup_datacontract(file):
    """Return the text content of the data contract file at ``file``."""
    with open(file) as data_contract_file:
        return data_contract_file.read()


def _init_sql(file_path):
    """Recreate ``fixtures/duckdb/db.duckdb`` and run the SQL script at ``file_path`` in it."""
    database_file = "fixtures/duckdb/db.duckdb"

    # Start from an empty database: remove the file left behind by a previous run.
    database_path = Path(database_file)
    if database_path.exists():
        database_path.unlink()

    # Read the script before connecting so an unreadable script does not leave
    # an open connection behind.
    with open(file_path, "r") as sql_file:
        sql_commands = sql_file.read()

    connection = duckdb.connect(database=database_file, read_only=False)
    try:
        connection.sql(sql_commands)
    finally:
        # Close even when the script fails, so the database file is released.
        connection.close()