Changes from 4 commits
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -241,7 +241,7 @@ Documentation improvements are always welcome! Follow these guidelines:
 
 We encourage contributions to our evaluation tests:
 
-1. Explore existing evals in the [evals directory](https://github.com/instructor-ai/instructor/tree/main/tests/llm/test_openai/evals)
+1. Explore existing evals in the [evals directory](https://github.com/instructor-ai/instructor/tree/main/tests/llm)
 2. Contribute new evals as pytest tests
 3. Evals should test specific capabilities or edge cases of the library or models
 4. Follow the existing patterns for structuring eval tests
38 changes: 31 additions & 7 deletions instructor/dsl/partial.py
@@ -242,9 +242,16 @@ def model_from_chunks(
         partial_mode = (
             "on" if issubclass(cls, PartialLiteralMixin) else "trailing-strings"
         )
-        chunk_buffer = []
+        chunk_buffer: list[str] = []
         for chunk in json_chunks:
-            chunk_buffer += chunk
+            if chunk is None:
+                continue
+            if not isinstance(chunk, str):
+                try:
+                    chunk = str(chunk)
+                except Exception:
+                    continue
+            chunk_buffer.append(chunk)
             if len(chunk_buffer) < 2:
                 continue
             potential_object += remove_control_chars("".join(chunk_buffer))
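
Reviewer note: as a minimal standalone sketch of the chunk-normalization rules this hunk introduces (the `normalize_chunks` helper is hypothetical, not part of this PR):

```python
# Hypothetical helper illustrating the guards above; not part of this PR.
from typing import Any
from collections.abc import Iterable, Iterator


def normalize_chunks(chunks: Iterable[Any]) -> Iterator[str]:
    """Yield chunks as strings: drop None, coerce non-str, skip failures."""
    for chunk in chunks:
        if chunk is None:
            continue  # None chunks are ignored entirely
        if not isinstance(chunk, str):
            try:
                chunk = str(chunk)  # best-effort coercion, as in the diff
            except Exception:
                continue  # unconvertible chunks are dropped
        yield chunk


# Mixed input: whitespace, None, an int, and real JSON fragments.
assert list(normalize_chunks(["\n", None, 1, '{"b":', " 1}"])) == ["\n", "1", '{"b":', " 1}"]
```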
@@ -254,7 +261,7 @@
             )
             yield obj
         if chunk_buffer:
-            potential_object += remove_control_chars(chunk_buffer[0])
+            potential_object += remove_control_chars("".join(chunk_buffer))
             obj = process_potential_object(
                 potential_object, partial_mode, partial_model, **kwargs
             )
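
`process_potential_object` itself is not shown in this diff; judging from the `from_json`/`model_validate` calls it replaces in the async path below, it presumably looks roughly like the following sketch (an assumption, not the repository's actual code):

```python
# Assumed shape of process_potential_object, inferred from the calls it
# replaces in this diff -- NOT the repository's actual implementation.
from jiter import from_json


def process_potential_object(potential_object, partial_mode, partial_model, **kwargs):
    # Parse the accumulated (possibly incomplete) JSON, defaulting to "{}".
    obj = from_json(
        (potential_object.strip() or "{}").encode(), partial_mode=partial_mode
    )
    # Validate into the partial pydantic model, as the old inline code did.
    return partial_model.model_validate(obj, strict=None, **kwargs)
```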
@@ -269,12 +276,29 @@ async def model_from_chunks_async(
         partial_mode = (
             "on" if issubclass(cls, PartialLiteralMixin) else "trailing-strings"
         )
+        chunk_buffer: list[str] = []
         async for chunk in json_chunks:
-            potential_object += chunk
-            obj = from_json(
-                (potential_object.strip() or "{}").encode(), partial_mode=partial_mode
+            if chunk is None:
+                continue
+            if not isinstance(chunk, str):
+                try:
+                    chunk = str(chunk)
+                except Exception:
+                    continue
+            chunk_buffer.append(chunk)
+            if len(chunk_buffer) < 2:
+                continue
+            potential_object += remove_control_chars("".join(chunk_buffer))
+            chunk_buffer = []
+            obj = process_potential_object(
+                potential_object, partial_mode, partial_model, **kwargs
             )
+            yield obj
+        if chunk_buffer:
+            potential_object += remove_control_chars("".join(chunk_buffer))
+            obj = process_potential_object(
+                potential_object, partial_mode, partial_model, **kwargs
+            )
             obj = partial_model.model_validate(obj, strict=None, **kwargs)
             yield obj
 
     @staticmethod
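
For context, a small consumer-side example of the async path with the new guards (a sketch: `Sample`/`Inner` are stand-in models assumed to have roughly the same shape as `SamplePartial` in the tests below):

```python
# Sketch only: Sample/Inner are stand-in models, not taken from the test suite.
import asyncio
from typing import Optional

from pydantic import BaseModel

from instructor.dsl.partial import Partial


class Inner(BaseModel):
    b: Optional[int] = None


class Sample(BaseModel):
    a: Optional[int] = None
    b: Optional[Inner] = None


async def main() -> None:
    async def chunks():
        # Leading whitespace and a None chunk, then the actual JSON.
        for c in ["\n", None, '{"b": {"b": 1}}']:
            yield c

    # Each yielded object is a partial model built from the buffered chunks.
    async for partial in Partial[Sample].model_from_chunks_async(chunks()):
        print(partial.model_dump())


asyncio.run(main())
```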
43 changes: 17 additions & 26 deletions tests/dsl/test_partial.py
@@ -92,26 +92,25 @@ def test_partial():
}, "Partial model JSON schema has changed"


partial_chunks = ["\n", "\t", " ", "\x00", '{"b": {"b": 1}}']
expected_model_from_chunks = [
# First model has default values
{"a": None, "b": {}},
# Second model has default values, unaffected by control characters
{"a": None, "b": {}},
# Last model has b populated from JSON (from the JSON chunk)
{"a": None, "b": {"b": 1}},
]


def test_partial_with_whitespace():
partial = Partial[SamplePartial]

# Get the actual models from chunks
models = list(partial.model_from_chunks(["\n", "\t", " ", '{"b": {"b": 1}}']))

# Print actual values for debugging
print(f"Number of models: {len(models)}")
models = list(partial.model_from_chunks(partial_chunks))
for i, model in enumerate(models):
print(f"Model {i}: {model.model_dump()}")

# Actual behavior: When whitespace chunks are processed, we may get models
# First model has default values
assert models[0].model_dump() == {"a": None, "b": {}}

# Last model has b populated from JSON (from the JSON chunk)
assert models[-1].model_dump() == {"a": None, "b": {"b": 1}}

# Check we have the expected number of models (2 instead of 4)
assert len(models) == 2
# Expected behavior: When whitespace chunks are processed, we should always get a model
assert model.model_dump() == expected_model_from_chunks[i]


@pytest.mark.asyncio
@@ -120,23 +119,15 @@ async def test_async_partial_with_whitespace():
 
     # Handle any leading whitespace from the model
     async def async_generator():
-        for chunk in ["\n", "\t", " ", '{"b": {"b": 1}}']:
+        for chunk in partial_chunks:
             yield chunk
 
-    expected_model_dicts = [
-        {"a": None, "b": {}},
-        {"a": None, "b": {}},
-        {"a": None, "b": {}},
-        {"a": None, "b": {"b": 1}},
-    ]
-
     i = 0
     async for model in partial.model_from_chunks_async(async_generator()):
-        assert model.model_dump() == expected_model_dicts[i]
+        # Expected behavior: When whitespace chunks are processed, we should always get a model
+        assert model.model_dump() == expected_model_from_chunks[i]
         i += 1
 
-    assert model.model_dump() == {"a": None, "b": {"b": 1}}
-
 
 @pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="OPENAI_API_KEY not set")
 def test_summary_extraction():
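
To exercise just these cases locally, running `pytest tests/dsl/test_partial.py -k whitespace` against this branch should cover both the sync and async variants.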