diff --git a/CHANGELOG.md b/CHANGELOG.md index 80b53908..d902471e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Restrict DuckDB dependency to < 1.4.0 (#972) - Fixed schema evolution support for optional fields in CSV and Parquet formats. Optional fields marked with `required: false` are no longer incorrectly treated as required during validation, enabling proper schema evolution where optional fields can be added to contracts without breaking validation of historical data files (#977) +- Fixed SQL syntax errors when using single quotes in `validValues` or `missingValues` for Soda quality checks. Single quotes are now properly escaped by doubling them (SQL standard) before generating SodaCL YAML ## [0.11.2] - 2025-12-15 diff --git a/datacontract/engines/data_contract_checks.py b/datacontract/engines/data_contract_checks.py index f42ded43..29dedb6a 100644 --- a/datacontract/engines/data_contract_checks.py +++ b/datacontract/engines/data_contract_checks.py @@ -165,7 +165,6 @@ def to_schema_name(schema_object: SchemaObject, server_type: str) -> str: return schema_object.name - def check_property_is_present(model_name, field_name, quoting_config: QuotingConfig = QuotingConfig()) -> Check: check_type = "field_is_present" check_key = f"{model_name}__{field_name}__{check_type}" @@ -492,7 +491,9 @@ def check_property_enum(model_name: str, field_name: str, enum: list, quoting_co ) -def check_property_regex(model_name: str, field_name: str, pattern: str, quoting_config: QuotingConfig = QuotingConfig()): +def check_property_regex( + model_name: str, field_name: str, pattern: str, quoting_config: QuotingConfig = QuotingConfig() +): if quoting_config.quote_field_name: field_name_for_soda = f'"{field_name}"' else: @@ -661,7 +662,9 @@ def check_property_invalid_values( } if valid_values is not None: - sodacl_check_config["valid values"] = valid_values + # Escape single quotes for SQL by doubling them + escaped_values = [v.replace("'", "''") if isinstance(v, str) else v for v in valid_values] + sodacl_check_config["valid values"] = escaped_values sodacl_check_dict = { checks_for(model_name, quoting_config, check_type): [ @@ -706,7 +709,9 @@ def check_property_missing_values( if missing_values is not None: filtered_missing_values = [v for v in missing_values if v is not None] if filtered_missing_values: - sodacl_check_config["missing values"] = filtered_missing_values + # Escape single quotes for SQL by doubling them + escaped_values = [v.replace("'", "''") if isinstance(v, str) else v for v in filtered_missing_values] + sodacl_check_config["missing values"] = escaped_values sodacl_check_dict = { checks_for(model_name, quoting_config, check_type): [ @@ -819,7 +824,9 @@ def check_quality_list( ) ) else: - checks.append(check_property_duplicate_values(schema_name, property_name, threshold, quoting_config)) + checks.append( + check_property_duplicate_values(schema_name, property_name, threshold, quoting_config) + ) elif quality.metric == "nullValues": if property_name is not None: checks.append(check_property_null_values(schema_name, property_name, threshold, quoting_config)) @@ -829,7 +836,9 @@ def check_quality_list( if property_name is not None: valid_values = quality.arguments.get("validValues") if quality.arguments else None checks.append( - check_property_invalid_values(schema_name, property_name, threshold, valid_values, quoting_config) + check_property_invalid_values( + schema_name, property_name, threshold, valid_values, quoting_config + ) ) else: logger.warning("Quality check invalidValues is only supported at field level") @@ -837,7 +846,9 @@ def check_quality_list( if property_name is not None: missing_values = quality.arguments.get("missingValues") if quality.arguments else None checks.append( - check_property_missing_values(schema_name, property_name, threshold, missing_values, quoting_config) + check_property_missing_values( + schema_name, property_name, threshold, missing_values, quoting_config + ) ) else: logger.warning("Quality check missingValues is only supported at field level") @@ -1125,4 +1136,3 @@ def _parse_iso8601_to_seconds(duration: str) -> int | None: return int(match.group(1)) return None - diff --git a/tests/test_data_contract_checks.py b/tests/test_data_contract_checks.py index 2d21126b..37d98c94 100644 --- a/tests/test_data_contract_checks.py +++ b/tests/test_data_contract_checks.py @@ -1,6 +1,12 @@ +import yaml from open_data_contract_standard.model import DataQuality, Server -from datacontract.engines.data_contract_checks import QuotingConfig, prepare_query +from datacontract.engines.data_contract_checks import ( + QuotingConfig, + check_property_invalid_values, + check_property_missing_values, + prepare_query, +) def test_prepare_query_schema_placeholder(): @@ -71,4 +77,34 @@ def test_prepare_query_all_placeholders_with_dollar(): result = prepare_query(quality, "my_table", "my_field", QuotingConfig(), server) - assert result == "SELECT my_field FROM my_schema.my_table" \ No newline at end of file + assert result == "SELECT my_field FROM my_schema.my_table" + + +def test_check_property_invalid_values_escapes_single_quotes(): + """Test that single quotes in validValues are properly escaped for SQL.""" + check = check_property_invalid_values( + model_name="test_model", + field_name="test_field", + threshold="= 0", + valid_values=["peter's", "john's"], + ) + + yaml_dict = yaml.safe_load(check.implementation) + valid_values = yaml_dict["checks for test_model"][0]["invalid_count(test_field) = 0"]["valid values"] + + assert valid_values == ["peter''s", "john''s"] + + +def test_check_property_missing_values_escapes_single_quotes(): + """Test that single quotes in missingValues are properly escaped for SQL.""" + check = check_property_missing_values( + model_name="test_model", + field_name="test_field", + threshold="= 0", + missing_values=["N/A", "peter's"], + ) + + yaml_dict = yaml.safe_load(check.implementation) + missing_values = yaml_dict["checks for test_model"][0]["missing_count(test_field) = 0"]["missing values"] + + assert missing_values == ["N/A", "peter''s"]