Skip to content

Commit d978208

Browse files
bledden and claude authored
Fix additionalProperties in auto-generated KG schema models (#20768)
* fix: clean additionalProperties in auto-generated KG schema models

  Pydantic generates `additionalProperties: true` for `Dict[str, Any]` fields in the Entity/Relation models created by SchemaLLMPathExtractor. This breaks OpenAI structured outputs (which require `false`) and Google Gemini (which rejects `true` entirely).

  Added a `_clean_additional_properties` helper that recursively sets `additionalProperties: true` to `false`, applied via ConfigDict on the auto-generated models only when they include a properties dict field. User-provided kg_schema_cls is not affected.

  Fixes #20629

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* refactor: make additionalProperties cleanup opt-in via allow_additional_properties

  Address review feedback from AstraBert: the additionalProperties fix is now opt-in rather than unconditional. Users pass allow_additional_properties=False to SchemaLLMPathExtractor when they need strict schemas (OpenAI structured outputs, Google Gemini). Default is True, preserving existing behavior.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix: pass __config__ as keyword arg to satisfy mypy overload resolution

  The dict unpacking pattern (**config_kwargs) caused mypy to see a dict[str, ConfigDict] positional argument that doesn't match any create_model overload. Passing __config__ directly as a keyword argument with a conditional expression resolves the type error.

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent bb5f5d7 commit d978208

File tree

3 files changed

+234
-3
lines changed

3 files changed

+234
-3
lines changed

llama-index-core/llama_index/core/indices/property_graph/transformations/schema_llm.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,12 @@ class SchemaLLMPathExtractor(TransformComponent):
116116
The maximum number of triplets to extract per chunk. Defaults to 10.
117117
num_workers (int, optional):
118118
The number of workers to use. Defaults to 4.
119+
allow_additional_properties (bool, optional):
120+
Whether to allow ``additionalProperties: true`` in auto-generated
121+
JSON schemas for entity/relation models with Dict properties.
122+
Set to ``False`` when using LLM providers that require strict
123+
schemas (e.g. OpenAI structured outputs, Google Gemini).
124+
Defaults to True (preserving existing behavior).
119125
120126
"""
121127

@@ -144,6 +150,7 @@ def __init__(
144150
kg_validation_schema: Optional[Union[Dict[str, str], List[Triple]]] = None,
145151
max_triplets_per_chunk: int = 10,
146152
num_workers: int = 4,
153+
allow_additional_properties: bool = True,
147154
) -> None:
148155
"""Init params."""
149156
if isinstance(extract_prompt, str):
@@ -159,7 +166,12 @@ def __init__(
159166
]
160167
else:
161168
entity_props = possible_entity_props # type: ignore
162-
entity_cls = get_entity_class(possible_entities, entity_props, strict)
169+
entity_cls = get_entity_class(
170+
possible_entities,
171+
entity_props,
172+
strict,
173+
clean_additional_properties=not allow_additional_properties,
174+
)
163175

164176
possible_relations = possible_relations or DEFAULT_RELATIONS # type: ignore
165177
if possible_relation_props and isinstance(
@@ -172,7 +184,10 @@ def __init__(
172184
else:
173185
relation_props = possible_relation_props # type: ignore
174186
relation_cls = get_relation_class(
175-
possible_relations, relation_props, strict
187+
possible_relations,
188+
relation_props,
189+
strict,
190+
clean_additional_properties=not allow_additional_properties,
176191
)
177192

178193
triplet_cls = create_model(

llama-index-core/llama_index/core/indices/property_graph/transformations/utils.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,33 @@
66
# python 3.8 and 3.9 compatibility
77
from typing import Any as TypeAlias # type: ignore
88

9-
from llama_index.core.bridge.pydantic import create_model, Field
9+
from llama_index.core.bridge.pydantic import ConfigDict, create_model, Field
10+
11+
12+
def _clean_additional_properties(schema: Dict[str, Any]) -> None:
13+
"""
14+
Recursively set ``additionalProperties: true`` to ``false`` in a JSON schema.
15+
16+
Pydantic generates ``additionalProperties: true`` for ``Dict[str, Any]``
17+
fields. This is incompatible with OpenAI structured outputs (which require
18+
``false``) and Google Gemini (which rejects the field entirely when set to
19+
``true``). Setting it to ``false`` satisfies both APIs.
20+
"""
21+
if isinstance(schema, dict):
22+
if schema.get("additionalProperties") is True:
23+
schema["additionalProperties"] = False
24+
for value in schema.values():
25+
_clean_additional_properties(value)
26+
elif isinstance(schema, list):
27+
for item in schema:
28+
_clean_additional_properties(item)
1029

1130

1231
def get_entity_class(
1332
possible_entities: TypeAlias,
1433
possible_entity_props: Optional[List[str]],
1534
strict: bool,
35+
clean_additional_properties: bool = False,
1636
) -> Any:
1737
"""Get entity class."""
1838
if not possible_entity_props:
@@ -31,8 +51,14 @@ def get_entity_class(
3151
name=(str, ...),
3252
)
3353
else:
54+
config = (
55+
ConfigDict(json_schema_extra=_clean_additional_properties)
56+
if clean_additional_properties
57+
else None
58+
)
3459
return create_model(
3560
"Entity",
61+
__config__=config,
3662
type=(
3763
possible_entities if strict else str,
3864
Field(
@@ -61,6 +87,7 @@ def get_relation_class(
6187
possible_relations: TypeAlias,
6288
possible_relation_props: Optional[List[str]],
6389
strict: bool,
90+
clean_additional_properties: bool = False,
6491
) -> Any:
6592
"""Get relation class."""
6693
if not possible_relation_props:
@@ -78,8 +105,14 @@ def get_relation_class(
78105
),
79106
)
80107
else:
108+
config = (
109+
ConfigDict(json_schema_extra=_clean_additional_properties)
110+
if clean_additional_properties
111+
else None
112+
)
81113
return create_model(
82114
"Relation",
115+
__config__=config,
83116
type=(
84117
possible_relations if strict else str,
85118
Field(
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
"""
2+
Tests for property graph schema utility functions.
3+
4+
Validates that auto-generated Entity/Relation Pydantic models produce
5+
JSON schemas compatible with OpenAI structured outputs and Google Gemini
6+
when ``clean_additional_properties=True`` is passed.
7+
"""
8+
9+
import json
10+
from typing import Literal
11+
12+
from llama_index.core.indices.property_graph.transformations.utils import (
13+
_clean_additional_properties,
14+
get_entity_class,
15+
get_relation_class,
16+
)
17+
18+
19+
def _schema_contains(schema: dict, key: str, value: object) -> bool:
20+
"""Recursively check whether *schema* contains *key* mapped to *value*."""
21+
if isinstance(schema, dict):
22+
if schema.get(key) is value:
23+
return True
24+
return any(_schema_contains(v, key, value) for v in schema.values())
25+
if isinstance(schema, list):
26+
return any(_schema_contains(item, key, value) for item in schema)
27+
return False
28+
29+
30+
# -- _clean_additional_properties ------------------------------------------
31+
32+
33+
def test_clean_additional_properties_sets_true_to_false():
    """A top-level ``additionalProperties: true`` is rewritten to ``false``."""
    doc = dict(additionalProperties=True, properties={"x": {"type": "string"}})
    _clean_additional_properties(doc)
    assert doc["additionalProperties"] is False
37+
38+
39+
def test_clean_additional_properties_nested():
    """The flag is also flipped on a schema nested under ``properties``."""
    nested = {"additionalProperties": True, "type": "object"}
    doc = {"properties": {"inner": nested}}
    _clean_additional_properties(doc)
    assert doc["properties"]["inner"]["additionalProperties"] is False
50+
51+
52+
def test_clean_additional_properties_ignores_false():
    """An existing ``additionalProperties: false`` stays ``false``."""
    doc = dict(additionalProperties=False)
    _clean_additional_properties(doc)
    assert doc["additionalProperties"] is False
56+
57+
58+
def test_clean_additional_properties_ignores_absent():
    """The helper does not add ``additionalProperties`` where it was missing."""
    doc = dict(properties={"x": {"type": "string"}})
    _clean_additional_properties(doc)
    assert "additionalProperties" not in doc
62+
63+
64+
def test_clean_additional_properties_handles_list():
    """Schemas held inside a list (here ``anyOf``) are cleaned as well."""
    first_option = {"additionalProperties": True}
    doc = {"anyOf": [first_option, {"type": "null"}]}
    _clean_additional_properties(doc)
    assert doc["anyOf"][0]["additionalProperties"] is False
68+
69+
70+
# -- get_entity_class (no props → no additionalProperties issue) -----------
71+
72+
73+
def test_entity_class_without_props_has_no_additional_properties_true():
    """With no entity props, the schema has no ``additionalProperties: true``."""
    entity_model = get_entity_class(Literal["PERSON", "LOCATION"], None, strict=True)
    generated = entity_model.model_json_schema()
    found_true = _schema_contains(generated, "additionalProperties", True)
    assert not found_true
78+
79+
80+
# -- get_entity_class (default: additionalProperties preserved) ------------
81+
82+
83+
def test_entity_class_with_props_default_preserves_additional_properties():
    """Without opting in, the Entity schema keeps ``additionalProperties: true``."""
    entity_model = get_entity_class(
        Literal["PERSON", "LOCATION"],
        ["age", "occupation"],
        strict=True,
    )
    generated = entity_model.model_json_schema()
    found_true = _schema_contains(generated, "additionalProperties", True)
    assert found_true, (
        f"Expected additionalProperties: true in default mode:\n"
        f"{json.dumps(generated, indent=2)}"
    )
92+
93+
94+
# -- get_entity_class (opt-in: fix applied) --------------------------------
95+
96+
97+
def test_entity_class_with_props_clean_removes_additional_properties():
    """With cleanup enabled, no ``additionalProperties: true`` is left in the schema."""
    entity_model = get_entity_class(
        Literal["PERSON", "LOCATION"],
        ["age", "occupation"],
        strict=True,
        clean_additional_properties=True,
    )
    generated = entity_model.model_json_schema()
    found_true = _schema_contains(generated, "additionalProperties", True)
    assert not found_true, (
        f"Schema still contains additionalProperties: true:\n"
        f"{json.dumps(generated, indent=2)}"
    )
107+
108+
109+
def test_entity_class_with_props_non_strict_clean():
    """Cleanup also applies when the Entity model is built with ``strict=False``."""
    entity_model = get_entity_class(
        str,
        ["age"],
        strict=False,
        clean_additional_properties=True,
    )
    generated = entity_model.model_json_schema()
    found_true = _schema_contains(generated, "additionalProperties", True)
    assert not found_true
113+
114+
115+
# -- get_relation_class (no props → no issue) ------------------------------
116+
117+
118+
def test_relation_class_without_props_has_no_additional_properties_true():
    """With no relation props, the schema has no ``additionalProperties: true``."""
    relation_model = get_relation_class(
        Literal["USED_BY", "PART_OF"], None, strict=True
    )
    generated = relation_model.model_json_schema()
    found_true = _schema_contains(generated, "additionalProperties", True)
    assert not found_true
123+
124+
125+
# -- get_relation_class (default: additionalProperties preserved) ----------
126+
127+
128+
def test_relation_class_with_props_default_preserves_additional_properties():
    """Without opting in, the Relation schema keeps ``additionalProperties: true``."""
    relation_model = get_relation_class(
        Literal["USED_BY", "PART_OF"],
        ["weight", "source"],
        strict=True,
    )
    generated = relation_model.model_json_schema()
    found_true = _schema_contains(generated, "additionalProperties", True)
    assert found_true, (
        f"Expected additionalProperties: true in default mode:\n"
        f"{json.dumps(generated, indent=2)}"
    )
137+
138+
139+
# -- get_relation_class (opt-in: fix applied) ------------------------------
140+
141+
142+
def test_relation_class_with_props_clean_removes_additional_properties():
    """With cleanup enabled, no ``additionalProperties: true`` is left in the schema."""
    relation_model = get_relation_class(
        Literal["USED_BY", "PART_OF"],
        ["weight", "source"],
        strict=True,
        clean_additional_properties=True,
    )
    generated = relation_model.model_json_schema()
    found_true = _schema_contains(generated, "additionalProperties", True)
    assert not found_true, (
        f"Schema still contains additionalProperties: true:\n"
        f"{json.dumps(generated, indent=2)}"
    )
152+
153+
154+
def test_relation_class_with_props_non_strict_clean():
    """Cleanup also applies when the Relation model is built with ``strict=False``."""
    relation_model = get_relation_class(
        str,
        ["weight"],
        strict=False,
        clean_additional_properties=True,
    )
    generated = relation_model.model_json_schema()
    found_true = _schema_contains(generated, "additionalProperties", True)
    assert not found_true
160+
161+
162+
# -- Models still validate correctly after the fix -------------------------
163+
164+
165+
def test_entity_model_with_props_roundtrips():
    """An Entity model built with cleanup enabled still stores the values it is given."""
    entity_model = get_entity_class(
        Literal["PERSON", "LOCATION"],
        ["age", "occupation"],
        strict=True,
        clean_additional_properties=True,
    )
    alice = entity_model(type="PERSON", name="Alice", properties={"age": 30})
    assert alice.type == "PERSON"
    assert alice.name == "Alice"
    assert alice.properties == {"age": 30}
174+
175+
176+
def test_relation_model_with_props_roundtrips():
    """A Relation model built with cleanup enabled still stores the values it is given."""
    relation_model = get_relation_class(
        Literal["USED_BY", "PART_OF"],
        ["weight"],
        strict=True,
        clean_additional_properties=True,
    )
    link = relation_model(type="USED_BY", properties={"weight": 0.9})
    assert link.type == "USED_BY"
    assert link.properties == {"weight": 0.9}

0 commit comments

Comments
 (0)