Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Restrict DuckDB dependency to < 1.4.0 (#972)
- Fixed schema evolution support for optional fields in CSV and Parquet formats. Optional fields marked with `required: false` are no longer incorrectly treated as required during validation, enabling proper schema evolution where optional fields can be added to contracts without breaking validation of historical data files (#977)
- Fixed SQL syntax errors when `validValues` or `missingValues` in Soda quality checks contain single quotes. Single quotes in string values are now escaped by doubling them (`'` becomes `''`, per the SQL standard) before the SodaCL YAML is generated

## [0.11.2] - 2025-12-15

Expand Down
26 changes: 18 additions & 8 deletions datacontract/engines/data_contract_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,6 @@ def to_schema_name(schema_object: SchemaObject, server_type: str) -> str:
return schema_object.name



def check_property_is_present(model_name, field_name, quoting_config: QuotingConfig = QuotingConfig()) -> Check:
check_type = "field_is_present"
check_key = f"{model_name}__{field_name}__{check_type}"
Expand Down Expand Up @@ -492,7 +491,9 @@ def check_property_enum(model_name: str, field_name: str, enum: list, quoting_co
)


def check_property_regex(model_name: str, field_name: str, pattern: str, quoting_config: QuotingConfig = QuotingConfig()):
def check_property_regex(
model_name: str, field_name: str, pattern: str, quoting_config: QuotingConfig = QuotingConfig()
):
if quoting_config.quote_field_name:
field_name_for_soda = f'"{field_name}"'
else:
Expand Down Expand Up @@ -661,7 +662,9 @@ def check_property_invalid_values(
}

if valid_values is not None:
sodacl_check_config["valid values"] = valid_values
# Escape single quotes for SQL by doubling them
escaped_values = [v.replace("'", "''") if isinstance(v, str) else v for v in valid_values]
sodacl_check_config["valid values"] = escaped_values

sodacl_check_dict = {
checks_for(model_name, quoting_config, check_type): [
Expand Down Expand Up @@ -706,7 +709,9 @@ def check_property_missing_values(
if missing_values is not None:
filtered_missing_values = [v for v in missing_values if v is not None]
if filtered_missing_values:
sodacl_check_config["missing values"] = filtered_missing_values
# Escape single quotes for SQL by doubling them
escaped_values = [v.replace("'", "''") if isinstance(v, str) else v for v in filtered_missing_values]
sodacl_check_config["missing values"] = escaped_values

sodacl_check_dict = {
checks_for(model_name, quoting_config, check_type): [
Expand Down Expand Up @@ -819,7 +824,9 @@ def check_quality_list(
)
)
else:
checks.append(check_property_duplicate_values(schema_name, property_name, threshold, quoting_config))
checks.append(
check_property_duplicate_values(schema_name, property_name, threshold, quoting_config)
)
elif quality.metric == "nullValues":
if property_name is not None:
checks.append(check_property_null_values(schema_name, property_name, threshold, quoting_config))
Expand All @@ -829,15 +836,19 @@ def check_quality_list(
if property_name is not None:
valid_values = quality.arguments.get("validValues") if quality.arguments else None
checks.append(
check_property_invalid_values(schema_name, property_name, threshold, valid_values, quoting_config)
check_property_invalid_values(
schema_name, property_name, threshold, valid_values, quoting_config
)
)
else:
logger.warning("Quality check invalidValues is only supported at field level")
elif quality.metric == "missingValues":
if property_name is not None:
missing_values = quality.arguments.get("missingValues") if quality.arguments else None
checks.append(
check_property_missing_values(schema_name, property_name, threshold, missing_values, quoting_config)
check_property_missing_values(
schema_name, property_name, threshold, missing_values, quoting_config
)
)
else:
logger.warning("Quality check missingValues is only supported at field level")
Expand Down Expand Up @@ -1125,4 +1136,3 @@ def _parse_iso8601_to_seconds(duration: str) -> int | None:
return int(match.group(1))

return None

40 changes: 38 additions & 2 deletions tests/test_data_contract_checks.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
import yaml
from open_data_contract_standard.model import DataQuality, Server

from datacontract.engines.data_contract_checks import QuotingConfig, prepare_query
from datacontract.engines.data_contract_checks import (
QuotingConfig,
check_property_invalid_values,
check_property_missing_values,
prepare_query,
)


def test_prepare_query_schema_placeholder():
Expand Down Expand Up @@ -71,4 +77,34 @@ def test_prepare_query_all_placeholders_with_dollar():

result = prepare_query(quality, "my_table", "my_field", QuotingConfig(), server)

assert result == "SELECT my_field FROM my_schema.my_table"
assert result == "SELECT my_field FROM my_schema.my_table"


def test_check_property_invalid_values_escapes_single_quotes():
    """Single quotes inside validValues must come out doubled in the check's YAML."""
    raw_values = ["peter's", "john's"]
    result = check_property_invalid_values(
        model_name="test_model",
        field_name="test_field",
        threshold="= 0",
        valid_values=raw_values,
    )

    # The check implementation is a YAML document; walk down to the
    # configuration of the single invalid_count check it contains.
    parsed = yaml.safe_load(result.implementation)
    model_checks = parsed["checks for test_model"]
    check_config = model_checks[0]["invalid_count(test_field) = 0"]

    assert check_config["valid values"] == ["peter''s", "john''s"]


def test_check_property_missing_values_escapes_single_quotes():
    """Single quotes inside missingValues must come out doubled in the check's YAML."""
    raw_values = ["N/A", "peter's"]
    result = check_property_missing_values(
        model_name="test_model",
        field_name="test_field",
        threshold="= 0",
        missing_values=raw_values,
    )

    # The check implementation is a YAML document; walk down to the
    # configuration of the single missing_count check it contains.
    parsed = yaml.safe_load(result.implementation)
    model_checks = parsed["checks for test_model"]
    check_config = model_checks[0]["missing_count(test_field) = 0"]

    # Values without a single quote ("N/A") are expected to pass through unchanged.
    assert check_config["missing values"] == ["N/A", "peter''s"]