feat: pass sqlmodel objects as example

Telsho · Telsho · commit ff6b628ca081 · 2025-12-08T12:23:29.000+01:00
diff --git a/docs/getting_started.rst b/docs/getting_started.rst
@@ -115,6 +115,19 @@ Now, we can feed unstructured text to the orchestrator. The `synthesize_and_save
     if __name__ == "__main__":
         asyncio.run(main())
 
+.. tip::
+
+    **Improving Results with Examples**: If you have existing data (e.g., a "Product" object fetched from your database), you can pass it to the orchestrator as a few-shot example to help the LLM understand the output format. If ``extraction_example_json`` is also supplied, the JSON string takes precedence and the object is ignored. Use the ``extraction_example_object`` parameter in ``synthesize_and_save``:
+
+    .. code-block:: python
+
+       # existing_product is a SQLModel instance
+       await orchestrator.synthesize_and_save(
+           input_strings=[text],
+           db_session=session,
+           extraction_example_object=existing_product
+       )
+
 Step 5: See the Results
 -----------------------
 
diff --git a/docs/workflow_orchestrator.rst b/docs/workflow_orchestrator.rst
@@ -163,6 +163,7 @@ Once the orchestrator is configured, you can start processing documents using on
    *   ``input_strings`` (``List[str]``): A list of strings, where each string is a document to be processed.
    *   ``db_session_for_hydration`` (``Optional[Session]``): An optional SQLAlchemy session. If provided, the hydrator will use it to resolve relationships. If not, a temporary in-memory session is created.
    *   ``extraction_example_json`` (``str``, optional): A JSON string that provides a few-shot example to the LLM, guiding it to produce a better-structured output. If not provided, the orchestrator will attempt to auto-generate one.
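+   *   ``extraction_example_object`` (``Optional[Union[SQLModel, List[SQLModel]]]``, optional): An existing SQLModel object or a list of them to be used as the few-shot example. This is an alternative to providing the example as a raw JSON string; if ``extraction_example_json`` is also provided, the JSON string takes precedence and this parameter is ignored. Items that are not SQLModel instances are skipped with a logged warning.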
    *   ``custom_extraction_process`` (``str``, optional): Custom, step-by-step instructions for the LLM on how to perform the extraction.
    *   ``custom_extraction_guidelines`` (``str``, optional): A list of rules or guidelines for the LLM to follow.
    *   ``custom_final_checklist`` (``str``, optional): A final checklist for the LLM to review before finalizing its output.
diff --git a/src/extrai/core/workflow_orchestrator.py b/src/extrai/core/workflow_orchestrator.py
@@ -256,6 +256,7 @@ async def synthesize(
         input_strings: List[str],
         db_session_for_hydration: Optional[Session],
         extraction_example_json: str = "",
+        extraction_example_object: Optional[Union[SQLModel, List[SQLModel]]] = None,
         custom_extraction_process: str = "",
         custom_extraction_guidelines: str = "",
         custom_final_checklist: str = "",
@@ -267,6 +268,7 @@ async def synthesize(
             input_strings: A list of input strings for data extraction.
             db_session_for_hydration: SQLAlchemy session for the hydrator.
             extraction_example_json: Optional JSON string for few-shot prompting.
+            extraction_example_object: Optional SQLModel object or list of objects to use as example.
             custom_extraction_process: Optional custom instructions for LLM extraction process.
             custom_extraction_guidelines: Optional custom guidelines for LLM extraction.
             custom_final_checklist: Optional custom final checklist for LLM.
@@ -281,6 +283,25 @@ async def synthesize(
         if not input_strings:
             raise ValueError("Input strings list cannot be empty.")
 
+        if extraction_example_object and not extraction_example_json:
+            objects_to_process = (
+                extraction_example_object
+                if isinstance(extraction_example_object, list)
+                else [extraction_example_object]
+            )
+            processed_objects = []
+            for obj in objects_to_process:
+                if isinstance(obj, SQLModel):
+                    processed_objects.append(obj.model_dump(mode="json"))
+                else:
+                    self.logger.warning(
+                        f"Skipping unsupported object type in extraction_example_object: {type(obj)}"
+                    )
+            if processed_objects:
+                extraction_example_json = json.dumps(
+                    processed_objects, default=str, indent=2
+                )
+
         self.logger.info(
             f"Starting synthesis for {self.root_sqlmodel_class.__name__}..."
         )
@@ -524,6 +545,7 @@ async def synthesize_and_save(
         input_strings: List[str],
         db_session: Session,
         extraction_example_json: str = "",
+        extraction_example_object: Optional[Union[SQLModel, List[SQLModel]]] = None,
         custom_extraction_process: str = "",
         custom_extraction_guidelines: str = "",
         custom_final_checklist: str = "",
@@ -536,6 +558,7 @@ async def synthesize_and_save(
             input_strings=input_strings,
             db_session_for_hydration=db_session,
             extraction_example_json=extraction_example_json,
+            extraction_example_object=extraction_example_object,
             custom_extraction_process=custom_extraction_process,
             custom_extraction_guidelines=custom_extraction_guidelines,
             custom_final_checklist=custom_final_checklist,
diff --git a/tests/core/workflow_orchestrator/test_workflow_orchestrator_execution.py b/tests/core/workflow_orchestrator/test_workflow_orchestrator_execution.py
@@ -772,5 +772,91 @@ async def test_synthesize_and_save_persist_raises_generic_exception(
         mock_rollback.assert_called_once()
 
 
+    async def test_synthesize_with_extraction_example_parameters(self):
+        """Check which example JSON synthesize() hands to _prepare_extraction_example for object, list, JSON-override and unsupported-type inputs."""
+        dept1 = DepartmentModel(name="Dept 1")
+        dept2 = DepartmentModel(name="Dept 2")
+
+        test_cases = [  # each case: inputs, a predicate over the JSON seen by _prepare_extraction_example, and whether a warning is expected
+            {
+                "name": "single_object",
+                "object": dept1,
+                "json_arg": "",
+                "expected_json_in_prepare": lambda j: len(json.loads(j)) == 1
+                and json.loads(j)[0]["name"] == "Dept 1",
+                "expect_warning": False,
+            },
+            {
+                "name": "list_of_objects",
+                "object": [dept1, dept2],
+                "json_arg": "",
+                "expected_json_in_prepare": lambda j: len(json.loads(j)) == 2
+                and json.loads(j)[1]["name"] == "Dept 2",
+                "expect_warning": False,
+            },
+            {
+                "name": "priority_json_over_object",
+                "object": dept1,
+                "json_arg": '[{"name": "Override"}]',
+                "expected_json_in_prepare": lambda j: j == '[{"name": "Override"}]',
+                "expect_warning": False,
+            },
+            {
+                "name": "unsupported_type",
+                "object": ["unsupported"],
+                "json_arg": "",
+                "expected_json_in_prepare": lambda j: j == "",
+                "expect_warning": True,
+            },
+        ]
+
+        for case in test_cases:
+            with self.subTest(case=case["name"]):
+                with (
+                    mock.patch.object(
+                        self.orchestrator,
+                        "_prepare_extraction_example",
+                        new_callable=AsyncMock,
+                    ) as mock_prepare,
+                    mock.patch.object(
+                        self.orchestrator,
+                        "_execute_standard_extraction",
+                        AsyncMock(return_value=[]),
+                    ),
+                    mock.patch.object(
+                        self.orchestrator,
+                        "_hydrate_results",
+                        mock.MagicMock(return_value=[]),
+                    ),
+                    mock.patch.object(
+                        self.orchestrator.logger, "warning"
+                    ) as mock_logger_warning,
+                ):
+                    mock_prepare.return_value = "{}"
+
+                    await self.orchestrator.synthesize(
+                        input_strings=["test"],
+                        db_session_for_hydration=self.db_session,
+                        extraction_example_object=case["object"],
+                        extraction_example_json=case["json_arg"],
+                    )
+
+                    # A warning must be logged when the case expects one, and never otherwise
+                    if case["expect_warning"]:
+                        mock_logger_warning.assert_called()
+                        args, _ = mock_logger_warning.call_args
+                        self.assertIn("Skipping unsupported object type", args[0])
+                    else:
+                        mock_logger_warning.assert_not_called()
+
+                    # Verify the first positional argument passed to _prepare_extraction_example
+                    args, _ = mock_prepare.call_args
+                    actual_json = args[0]
+                    self.assertTrue(
+                        case["expected_json_in_prepare"](actual_json),
+                        f"Failed for case {case['name']}: actual json {actual_json}",
+                    )
+
+
 if __name__ == "__main__":
     unittest.main()