Skip to content

Commit c9563bc

Browse files
thomasnormal, thomasahle, claude
authored
feat(partial): completeness-based streaming validation (#1999)
Co-authored-by: Thomas Dybdahl Ahle <thomas@ahle.dk> Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
1 parent b7c39e8 commit c9563bc

File tree

3 files changed

+563
-178
lines changed

3 files changed

+563
-178
lines changed

instructor/dsl/json_tracker.py

Lines changed: 302 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,302 @@
1+
"""
2+
JSON Completeness Tracker for Partial Streaming.
3+
4+
Tracks which parts of accumulated JSON are "closed" (complete) vs "open" (incomplete).
5+
A closed object/array has matching braces/brackets; an open one is still being streamed.
6+
7+
This enables validation to run only on complete sub-objects, avoiding validation
8+
errors on incomplete data during streaming.
9+
"""
10+
11+
from __future__ import annotations
12+
13+
14+
class JsonCompleteness:
    """
    Track completeness of JSON structures during streaming.

    A JSON structure is "complete" if:
    - Objects: start with { and end with }
    - Arrays: start with [ and end with ]
    - Scalars (strings, numbers, booleans, null): always complete once parsed

    Paths are built from the *decoded* object keys joined with "." and from
    array indices written as "[i]" (e.g. "user.address.city", "items[0].name",
    "[0]" for the first element of a root array). The empty path "" always
    denotes the root value.

    Example:
        tracker = JsonCompleteness()

        # Incomplete - missing closing brace
        tracker.analyze('{"name": "Alice", "address": {"city": "NY')
        tracker.is_path_complete("")  # False - root object incomplete
        tracker.is_path_complete("name")  # True - string is complete
        tracker.is_path_complete("address")  # False - nested object incomplete

        # Complete
        tracker.analyze('{"name": "Alice"}')
        tracker.is_path_complete("")  # True - root object complete
    """

    # Whitespace allowed between JSON tokens.
    _WHITESPACE = " \t\n\r"
    _DIGITS = "0123456789"
    # Characters that may directly follow a number.
    _NUMBER_TERMINATORS = " \t\n\r,}]"
    _LITERALS = ("true", "false", "null")

    def __init__(self) -> None:
        self._json_str: str = ""
        self._complete_paths: set[str] = set()
        self._path_positions: dict[str, tuple[int, int]] = {}  # path -> (start, end)

    def analyze(self, json_str: str) -> None:
        """
        Analyze a JSON string and determine completeness of each sub-structure.

        Any result of a previous call is discarded first.

        Args:
            json_str: The accumulated JSON string (may be incomplete)
        """
        self._json_str = json_str
        self._complete_paths = set()
        self._path_positions = {}

        if not json_str.strip():
            return

        # Parse and track completeness
        root_end = self._analyze_structure(json_str, "", 0)

        if root_end == -1:
            # A root-level member with an empty key ({"": ...}) is recorded
            # under the path "", which is also the root path. If the root
            # itself did not close, make sure "" is not reported as complete.
            self._complete_paths.discard("")
            self._path_positions.pop("", None)

    def _skip_whitespace(self, json_str: str, pos: int) -> int:
        """Return the index of the first non-JSON-whitespace char at or after pos."""
        end = len(json_str)
        while pos < end and json_str[pos] in self._WHITESPACE:
            pos += 1
        return pos

    @staticmethod
    def _decode_key(quoted: str) -> str:
        """
        Turn a quoted JSON key token (including both quotes) into its text.

        Escape sequences are decoded so the resulting path segment matches the
        key a JSON parser would produce. If the token cannot be decoded, the
        raw text between the quotes is returned.
        """
        raw = quoted[1:-1]
        if "\\" not in raw:
            return raw  # Fast path: nothing to decode

        import json  # Only needed for the rare escaped-key case

        try:
            return json.loads(quoted)
        except ValueError:
            return raw

    def _analyze_structure(self, json_str: str, path: str, start_pos: int) -> int:
        """
        Recursively analyze JSON structure and track completeness.

        Dispatches on the first non-whitespace character at or after start_pos.

        Returns the position after the current structure, or -1 if incomplete.
        """
        end = len(json_str)

        # Scan by index instead of slicing the remaining input, so each call
        # costs only the whitespace it skips.
        # NOTE: any Unicode whitespace (str.isspace) is accepted before a
        # value, which is more lenient than the JSON grammar.
        pos = start_pos
        while pos < end and json_str[pos].isspace():
            pos += 1
        if pos >= end:
            return -1

        ch = json_str[pos]
        if ch == "{":
            return self._analyze_object(json_str, path, pos)
        if ch == "[":
            return self._analyze_array(json_str, path, pos)
        if ch == '"':
            return self._analyze_string(json_str, path, pos)
        if ch in "-0123456789":
            return self._analyze_number(json_str, path, pos)

        for literal in self._LITERALS:
            if json_str.startswith(literal, pos):
                literal_end = pos + len(literal)
                self._mark_complete(path, pos, literal_end)
                return literal_end

        return -1  # Invalid or incomplete

    def _analyze_object(self, json_str: str, path: str, start_pos: int) -> int:
        """Analyze a JSON object. Returns end position or -1 if incomplete."""
        end = len(json_str)
        pos = start_pos + 1  # Skip opening {
        first = True

        while pos < end:
            pos = self._skip_whitespace(json_str, pos)
            if pos >= end:
                return -1  # Incomplete

            if json_str[pos] == "}":
                # Object is complete
                self._mark_complete(path, start_pos, pos + 1)
                return pos + 1

            if not first:
                if json_str[pos] != ",":
                    return -1  # Invalid
                pos = self._skip_whitespace(json_str, pos + 1)
                if pos >= end:
                    return -1

            first = False

            # Parse key
            if json_str[pos] != '"':
                return -1  # Invalid or incomplete

            key_start = pos
            pos = self._skip_string(json_str, pos)
            if pos == -1:
                return -1  # Incomplete string

            # Decode escapes so the path uses the same key text a parser yields.
            key = self._decode_key(json_str[key_start:pos])

            # Skip whitespace and colon
            pos = self._skip_whitespace(json_str, pos)
            if pos >= end or json_str[pos] != ":":
                return -1
            pos += 1

            # Parse value
            child_path = f"{path}.{key}" if path else key
            pos = self._analyze_structure(json_str, child_path, pos)
            if pos == -1:
                return -1  # Incomplete value

        return -1  # Incomplete (no closing brace)

    def _analyze_array(self, json_str: str, path: str, start_pos: int) -> int:
        """Analyze a JSON array. Returns end position or -1 if incomplete."""
        end = len(json_str)
        pos = start_pos + 1  # Skip opening [
        index = 0
        first = True

        while pos < end:
            pos = self._skip_whitespace(json_str, pos)
            if pos >= end:
                return -1  # Incomplete

            if json_str[pos] == "]":
                # Array is complete
                self._mark_complete(path, start_pos, pos + 1)
                return pos + 1

            if not first:
                if json_str[pos] != ",":
                    return -1  # Invalid
                pos = self._skip_whitespace(json_str, pos + 1)
                if pos >= end:
                    return -1

            first = False

            # Parse element
            child_path = f"{path}[{index}]"
            pos = self._analyze_structure(json_str, child_path, pos)
            if pos == -1:
                return -1  # Incomplete element

            index += 1

        return -1  # Incomplete (no closing bracket)

    def _analyze_string(self, json_str: str, path: str, start_pos: int) -> int:
        """Analyze a JSON string. Returns end position or -1 if incomplete."""
        pos = self._skip_string(json_str, start_pos)
        if pos != -1:
            self._mark_complete(path, start_pos, pos)
        return pos

    def _skip_string(self, json_str: str, start_pos: int) -> int:
        """Skip a JSON string, handling escapes. Returns position after closing quote or -1."""
        end = len(json_str)
        pos = start_pos + 1  # Skip opening quote
        while pos < end:
            c = json_str[pos]
            if c == "\\":
                # Skip the backslash and the escaped character; a trailing
                # backslash overshoots the end and falls through to -1.
                pos += 2
            elif c == '"':
                return pos + 1  # Found closing quote
            else:
                pos += 1
        return -1  # Incomplete string

    def _analyze_number(self, json_str: str, path: str, start_pos: int) -> int:
        """
        Analyze a JSON number. Returns end position or -1 if incomplete.

        NOTE: a number that runs up to the very end of the input is marked
        complete, even though further digits could still arrive in a stream.
        """
        end = len(json_str)
        digits = self._DIGITS
        pos = start_pos

        # Optional minus
        if pos < end and json_str[pos] == "-":
            pos += 1

        # Integer part
        if pos >= end:
            return -1
        if json_str[pos] == "0":
            pos += 1
        elif json_str[pos] in "123456789":
            pos += 1
            while pos < end and json_str[pos] in digits:
                pos += 1
        else:
            return -1

        # Fractional part
        if pos < end and json_str[pos] == ".":
            pos += 1
            if pos >= end or json_str[pos] not in digits:
                return -1  # Incomplete fraction
            while pos < end and json_str[pos] in digits:
                pos += 1

        # Exponent part
        if pos < end and json_str[pos] in "eE":
            pos += 1
            if pos < end and json_str[pos] in "+-":
                pos += 1
            if pos >= end or json_str[pos] not in digits:
                return -1  # Incomplete exponent
            while pos < end and json_str[pos] in digits:
                pos += 1

        # Check if we're at a valid terminator (or end of partial JSON)
        if pos < end and json_str[pos] not in self._NUMBER_TERMINATORS:
            return -1  # Number continues or is invalid

        self._mark_complete(path, start_pos, pos)
        return pos

    def _mark_complete(self, path: str, start_pos: int, end_pos: int) -> None:
        """Mark a path as complete and remember its (start, end) span in the input."""
        self._complete_paths.add(path)
        self._path_positions[path] = (start_pos, end_pos)

    def is_path_complete(self, path: str) -> bool:
        """
        Check if the sub-structure at the given path is complete.

        Args:
            path: Dot-separated path (e.g., "user.address.city", "items[0]")
                  Use "" for root object.

        Returns:
            True if the structure at path is complete (closed), False otherwise.
        """
        return path in self._complete_paths

    def get_complete_paths(self) -> set[str]:
        """Return all paths that are complete (as a copy of the internal set)."""
        return self._complete_paths.copy()

    def is_root_complete(self) -> bool:
        """Check if the root JSON structure is complete."""
        return "" in self._complete_paths
280+
281+
282+
def is_json_complete(json_str: str) -> bool:
    """
    Report whether a JSON string parses as a complete document.

    The text is handed to jiter without a partial mode, so parsing is strict
    and any failure is treated as "not complete".

    Args:
        json_str: The JSON string to check

    Returns:
        True if jiter parses the string without error; False if the string
        is empty, whitespace-only, or parsing fails.
    """
    from jiter import from_json

    has_content = bool(json_str) and bool(json_str.strip())
    if not has_content:
        return False

    try:
        from_json(json_str.encode())  # No partial_mode = strict parsing
    except Exception:
        return False
    else:
        return True

0 commit comments

Comments
 (0)