From 34033421188c09c42ba6cd9590e775028c71b987 Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Tue, 6 Jan 2026 17:57:26 -0300 Subject: [PATCH 1/5] fix: Decimals in structured generation lead to errors --- .../processing/gsonschema/validators.py | 36 +++++++++++++++++++ .../processing/gsonschema/test_validators.py | 21 +++++++++++ 2 files changed, 57 insertions(+) diff --git a/src/data_designer/engine/processing/gsonschema/validators.py b/src/data_designer/engine/processing/gsonschema/validators.py index d16ad69b..c44f1980 100644 --- a/src/data_designer/engine/processing/gsonschema/validators.py +++ b/src/data_designer/engine/processing/gsonschema/validators.py @@ -70,6 +70,39 @@ def extend_jsonschema_validator_with_pruning(validator): return validators.extend(validator, {"additionalProperties": prune_additional_properties}) +def _has_number_string_anyof(schema: dict) -> bool: + """Check if schema has anyOf with both number and string (Pydantic Decimal pattern).""" + any_of = schema.get("anyOf") + if not isinstance(any_of, list): + return False + types = {item.get("type") for item in any_of} + return "number" in types and "string" in types + + +def normalize_decimal_fields(obj: DataObjectT, schema: JSONSchemaT) -> DataObjectT: + """Convert numeric values to strings for Decimal-like anyOf fields.""" + if not isinstance(obj, dict): + return obj + + defs = schema.get("$defs", {}) + obj_schema = defs.get(schema.get("$ref", "")[len("#/$defs/"):], schema) + props = obj_schema.get("properties", {}) + + for key, value in obj.items(): + field_schema = props.get(key, {}) + if "$ref" in field_schema: + field_schema = defs.get(field_schema["$ref"][len("#/$defs/"):], {}) + + if isinstance(value, dict): + obj[key] = normalize_decimal_fields(value, schema) + elif isinstance(value, list): + obj[key] = [normalize_decimal_fields(v, schema) if isinstance(v, dict) else v for v in value] + elif isinstance(value, (int, float)) and not isinstance(value, bool) and _has_number_string_anyof(field_schema): + obj[key] = str(value) + + return obj + + ## We don't expect the outer data type (e.g. dict, list, or const) to be ## modified by the pruning action. @overload @@ -140,4 +173,7 @@ def validate( except ValidationError as exc: raise JSONSchemaValidationError(str(exc)) from exc + # Normalize Decimal-like fields to ensure consistent string output + final_object = normalize_decimal_fields(final_object, schema) + return final_object diff --git a/tests/engine/processing/gsonschema/test_validators.py b/tests/engine/processing/gsonschema/test_validators.py index f2ca70f0..84508869 100644 --- a/tests/engine/processing/gsonschema/test_validators.py +++ b/tests/engine/processing/gsonschema/test_validators.py @@ -196,3 +196,24 @@ def test_invalid_data_type(): data = {"num": "not a number", "extra": "should be removed"} with pytest.raises(JSONSchemaValidationError): validate(data, schema, pruning=True, no_extra_properties=True) + + +def test_normalize_decimal_anyof_fields() -> None: + """Test that Decimal-like anyOf fields (number|string) are normalized to strings.""" + schema = { + "type": "object", + "properties": { + "name": {"type": "string"}, + "price": {"anyOf": [{"type": "number"}, {"type": "string"}]}, + }, + } + + # Numeric value should be converted to string + result1 = validate({"name": "Widget", "price": 189.99}, schema) + assert result1["price"] == "189.99" + assert isinstance(result1["price"], str) + + # String value should remain a string + result2 = validate({"name": "Gadget", "price": "249.99"}, schema) + assert result2["price"] == "249.99" + assert isinstance(result2["price"], str) From 010df8cd7c1ea2ddff21d863515df65883612b0c Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Tue, 6 Jan 2026 18:05:39 -0300 Subject: [PATCH 2/5] lint --- .../2-structured-outputs-and-jinja-expressions.py | 4 +++- src/data_designer/engine/processing/gsonschema/validators.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/notebook_source/2-structured-outputs-and-jinja-expressions.py b/docs/notebook_source/2-structured-outputs-and-jinja-expressions.py index b64a507d..5a6fb924 100644 --- a/docs/notebook_source/2-structured-outputs-and-jinja-expressions.py +++ b/docs/notebook_source/2-structured-outputs-and-jinja-expressions.py @@ -358,7 +358,7 @@ class ProductReview(BaseModel): # # %% -results = data_designer.create(config_builder, num_records=10, dataset_name="tutorial-2") +results = data_designer.create(config_builder, num_records=100, dataset_name="tutorial-2") # %% # Load the generated dataset as a pandas DataFrame. @@ -381,3 +381,5 @@ class ProductReview(BaseModel): # # - [Providing images as context](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/4-providing-images-as-context/)A # + +# %% diff --git a/src/data_designer/engine/processing/gsonschema/validators.py b/src/data_designer/engine/processing/gsonschema/validators.py index c44f1980..3ab8adc4 100644 --- a/src/data_designer/engine/processing/gsonschema/validators.py +++ b/src/data_designer/engine/processing/gsonschema/validators.py @@ -85,13 +85,13 @@ def normalize_decimal_fields(obj: DataObjectT, schema: JSONSchemaT) -> DataObjec return obj defs = schema.get("$defs", {}) - obj_schema = defs.get(schema.get("$ref", "")[len("#/$defs/"):], schema) + obj_schema = defs.get(schema.get("$ref", "")[len("#/$defs/") :], schema) props = obj_schema.get("properties", {}) for key, value in obj.items(): field_schema = props.get(key, {}) if "$ref" in field_schema: - field_schema = defs.get(field_schema["$ref"][len("#/$defs/"):], {}) + field_schema = defs.get(field_schema["$ref"][len("#/$defs/") :], {}) if isinstance(value, dict): obj[key] = normalize_decimal_fields(value, schema) From e190f4176dadac638a152b3c744d32d68efb41d2 Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Tue, 6 Jan 2026 18:07:48 -0300 Subject: [PATCH 3/5] committed something by mistake --- .../2-structured-outputs-and-jinja-expressions.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/notebook_source/2-structured-outputs-and-jinja-expressions.py b/docs/notebook_source/2-structured-outputs-and-jinja-expressions.py index 5a6fb924..b64a507d 100644 --- a/docs/notebook_source/2-structured-outputs-and-jinja-expressions.py +++ b/docs/notebook_source/2-structured-outputs-and-jinja-expressions.py @@ -358,7 +358,7 @@ class ProductReview(BaseModel): # # %% -results = data_designer.create(config_builder, num_records=100, dataset_name="tutorial-2") +results = data_designer.create(config_builder, num_records=10, dataset_name="tutorial-2") # %% # Load the generated dataset as a pandas DataFrame. @@ -381,5 +381,3 @@ class ProductReview(BaseModel): # # - [Providing images as context](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/4-providing-images-as-context/)A # - -# %% From eca3a19e22e4dcb3196790d86b435060e99ca76d Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Tue, 6 Jan 2026 18:09:20 -0300 Subject: [PATCH 4/5] removing comment --- src/data_designer/engine/processing/gsonschema/validators.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/data_designer/engine/processing/gsonschema/validators.py b/src/data_designer/engine/processing/gsonschema/validators.py index 3ab8adc4..e89a2408 100644 --- a/src/data_designer/engine/processing/gsonschema/validators.py +++ b/src/data_designer/engine/processing/gsonschema/validators.py @@ -173,7 +173,6 @@ def validate( except ValidationError as exc: raise JSONSchemaValidationError(str(exc)) from exc - # Normalize Decimal-like fields to ensure consistent string output final_object = normalize_decimal_fields(final_object, schema) return final_object From 8939a3f8b6baa819d7e78888d4249af02df54132 Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Wed, 7 Jan 2026 12:52:34 -0300 Subject: [PATCH 5/5] convert to Decimal with proper precision instead --- .../processing/gsonschema/validators.py | 36 ++++++++++++++----- .../processing/gsonschema/test_validators.py | 30 ++++++++++------ 2 files changed, 48 insertions(+), 18 deletions(-) diff --git a/src/data_designer/engine/processing/gsonschema/validators.py b/src/data_designer/engine/processing/gsonschema/validators.py index e89a2408..52ca337d 100644 --- a/src/data_designer/engine/processing/gsonschema/validators.py +++ b/src/data_designer/engine/processing/gsonschema/validators.py @@ -2,7 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 import logging +import re from copy import deepcopy +from decimal import ROUND_HALF_UP, Decimal from typing import Any, overload from jsonschema import Draft202012Validator, ValidationError, validators @@ -70,17 +72,30 @@ def extend_jsonschema_validator_with_pruning(validator): return validators.extend(validator, {"additionalProperties": prune_additional_properties}) -def _has_number_string_anyof(schema: dict) -> bool: - """Check if schema has anyOf with both number and string (Pydantic Decimal pattern).""" +def _get_decimal_info_from_anyof(schema: dict) -> tuple[bool, int | None]: + """Check if schema is a Decimal anyOf and extract decimal places. + + Returns (is_decimal, decimal_places) where decimal_places is None if no constraint. + """ any_of = schema.get("anyOf") if not isinstance(any_of, list): - return False - types = {item.get("type") for item in any_of} - return "number" in types and "string" in types + return False, None + + has_number = any(item.get("type") == "number" for item in any_of) + if not has_number: + return False, None + + for item in any_of: + if item.get("type") == "string" and "pattern" in item: + match = re.search(r"\\d\{0,(\d+)\}", item["pattern"]) + if match: + return True, int(match.group(1)) + return True, None # Decimal without precision constraint + return False, None def normalize_decimal_fields(obj: DataObjectT, schema: JSONSchemaT) -> DataObjectT: - """Convert numeric values to strings for Decimal-like anyOf fields.""" + """Normalize Decimal-like anyOf fields to floats with proper precision.""" if not isinstance(obj, dict): return obj @@ -97,8 +112,13 @@ def normalize_decimal_fields(obj: DataObjectT, schema: JSONSchemaT) -> DataObjec obj[key] = normalize_decimal_fields(value, schema) elif isinstance(value, list): obj[key] = [normalize_decimal_fields(v, schema) if isinstance(v, dict) else v for v in value] - elif isinstance(value, (int, float)) and not isinstance(value, bool) and _has_number_string_anyof(field_schema): - obj[key] = str(value) + elif isinstance(value, (int, float, str)) and not isinstance(value, bool): + is_decimal, decimal_places = _get_decimal_info_from_anyof(field_schema) + if is_decimal: + d = Decimal(str(value)) + if decimal_places is not None: + d = d.quantize(Decimal(f"0.{'0' * decimal_places}"), rounding=ROUND_HALF_UP) + obj[key] = float(d) return obj diff --git a/tests/engine/processing/gsonschema/test_validators.py b/tests/engine/processing/gsonschema/test_validators.py index 84508869..b746d3b3 100644 --- a/tests/engine/processing/gsonschema/test_validators.py +++ b/tests/engine/processing/gsonschema/test_validators.py @@ -199,21 +199,31 @@ def test_invalid_data_type(): def test_normalize_decimal_anyof_fields() -> None: - """Test that Decimal-like anyOf fields (number|string) are normalized to strings.""" + """Test that Decimal-like anyOf fields are normalized to floats with proper precision.""" schema = { "type": "object", "properties": { "name": {"type": "string"}, - "price": {"anyOf": [{"type": "number"}, {"type": "string"}]}, + "price": { + "anyOf": [ + {"type": "number"}, + {"type": "string", "pattern": r"^(?!^[-+.]*$)[+-]?0*\d*\.?\d{0,2}0*$"}, + ] + }, }, } - # Numeric value should be converted to string - result1 = validate({"name": "Widget", "price": 189.99}, schema) - assert result1["price"] == "189.99" - assert isinstance(result1["price"], str) + # Numeric value with extra precision should be rounded to 2 decimal places + result1 = validate({"name": "Widget", "price": 189.999}, schema) + assert result1["price"] == 190.0 + assert isinstance(result1["price"], float) + + # Numeric value should be converted to float + result2 = validate({"name": "Gadget", "price": 50.5}, schema) + assert result2["price"] == 50.5 + assert isinstance(result2["price"], float) - # String value should remain a string - result2 = validate({"name": "Gadget", "price": "249.99"}, schema) - assert result2["price"] == "249.99" - assert isinstance(result2["price"], str) + # String value should be converted to float + result3 = validate({"name": "Gizmo", "price": "249.99"}, schema) + assert result3["price"] == 249.99 + assert isinstance(result3["price"], float)