Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@

- Updated examples, default values, and documentation to use `gpt-4.1` / `gpt-4.1-mini` instead of deprecated GPT-4* models (e.g. `gpt-4o`, `gpt-4`).

### Added

- Exposed `use_structured_output` parameter in `SimpleKGPipeline` constructor for enabling structured output in entity extraction and automatic schema extraction with supported LLMs (OpenAI, VertexAI).

## 1.13.1

- Fixed invalid lexical graph relationships causing "Relationship references unknown start node" errors during parquet import when nodes are pruned.
Expand Down
24 changes: 23 additions & 1 deletion docs/source/user_guide_kg_builder.rst
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,26 @@ They are also accessible via the `SimpleKGPipeline` interface.
# ...
)

Structured Output
-----------------

When using an LLM that supports structured output (such as OpenAI or VertexAI),
you can set ``use_structured_output=True`` to improve the reliability of entity
extraction and automatic schema extraction:

.. code:: python

kg_builder = SimpleKGPipeline(
# ...
use_structured_output=True,
# ...
)

.. note::

Structured output is only supported by LLMs that have ``supports_structured_output=True``
(currently ``OpenAILLM`` and ``VertexAILLM``). Setting ``use_structured_output=True`` with an
unsupported LLM raises a ``ValueError`` when the pipeline is created.

Skip Entity Resolution
----------------------

Expand Down Expand Up @@ -479,7 +499,8 @@ within the configuration file.
},
"lexical_graph_config": {
"chunk_node_label": "TextPart"
}
},
"use_structured_output": false
}


Expand Down Expand Up @@ -520,6 +541,7 @@ or in YAML:
- ["House", "RULES", "Planet"]
lexical_graph_config:
chunk_node_label: TextPart
use_structured_output: false


It is also possible to further customize components, with a syntax similar to the one
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ class SimpleKGPipelineConfig(TemplatePipelineConfig):
perform_entity_resolution: bool = True
lexical_graph_config: Optional[LexicalGraphConfig] = None
neo4j_database: Optional[str] = None
use_structured_output: bool = False

pdf_loader: Optional[ComponentType] = None
kg_writer: Optional[ComponentType] = None
Expand Down Expand Up @@ -186,7 +187,10 @@ def _get_schema(self) -> BaseSchemaBuilder:
Return SchemaFromTextExtractor for automatic extraction or SchemaBuilder for manual schema.
"""
if not self.has_user_provided_schema():
return SchemaFromTextExtractor(llm=self.get_default_llm())
return SchemaFromTextExtractor(
llm=self.get_default_llm(),
use_structured_output=self.use_structured_output,
)
return SchemaBuilder()

def _process_schema_with_precedence(self) -> dict[str, Any]:
Expand Down Expand Up @@ -222,6 +226,7 @@ def _get_extractor(self) -> EntityRelationExtractor:
llm=self.get_default_llm(),
prompt_template=self.prompt_template,
on_error=self.on_error,
use_structured_output=self.use_structured_output,
)

def _get_pruner(self) -> GraphPruning:
Expand Down
3 changes: 3 additions & 0 deletions src/neo4j_graphrag/experimental/pipeline/kg_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ class SimpleKGPipeline:
perform_entity_resolution (bool): Merge entities with same label and name. Default: True
prompt_template (str): A custom prompt template to use for extraction.
lexical_graph_config (Optional[LexicalGraphConfig], optional): Lexical graph configuration to customize node labels and relationship types in the lexical graph.
use_structured_output (bool): Whether to use structured output (LLMInterfaceV2) for entity extraction and automatic schema extraction. Only supported for OpenAILLM and VertexAILLM. Defaults to False.
"""

def __init__(
Expand All @@ -115,6 +116,7 @@ def __init__(
perform_entity_resolution: bool = True,
lexical_graph_config: Optional[LexicalGraphConfig] = None,
neo4j_database: Optional[str] = None,
use_structured_output: bool = False,
):
try:
config = SimpleKGPipelineConfig.model_validate(
Expand All @@ -137,6 +139,7 @@ def __init__(
perform_entity_resolution=perform_entity_resolution,
lexical_graph_config=lexical_graph_config,
neo4j_database=neo4j_database,
use_structured_output=use_structured_output,
)
)
except (ValidationError, ValueError) as e:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,39 @@ def test_simple_kg_pipeline_config_extractor(mock_llm: Mock, llm: LLMInterface)
assert extractor.prompt_template.template == "my template {text}"


@patch(
    "neo4j_graphrag.experimental.pipeline.config.template_pipeline.simple_kg_builder.SimpleKGPipelineConfig.get_default_llm"
)
def test_simple_kg_pipeline_config_extractor_with_structured_output(
    mock_llm: Mock, llm: LLMInterface
) -> None:
    """The extractor built by the config carries ``use_structured_output=True``."""
    # Serve the injected ``llm`` as the config's default LLM and flag it as
    # supporting structured output before the extractor is built.
    mock_llm.return_value = llm
    llm.supports_structured_output = True

    pipeline_config = SimpleKGPipelineConfig(
        on_error="IGNORE",  # type: ignore
        use_structured_output=True,
    )
    built_extractor = pipeline_config._get_extractor()

    assert isinstance(built_extractor, LLMEntityRelationExtractor)
    assert built_extractor.use_structured_output is True


@patch(
    "neo4j_graphrag.experimental.pipeline.config.template_pipeline.simple_kg_builder.SimpleKGPipelineConfig.get_default_llm"
)
def test_simple_kg_pipeline_config_schema_with_structured_output(
    mock_llm: Mock, llm: LLMInterface
) -> None:
    """The schema component built by the config carries ``use_structured_output=True``."""
    # Serve the injected ``llm`` as the config's default LLM and flag it as
    # supporting structured output before the schema component is built.
    mock_llm.return_value = llm
    llm.supports_structured_output = True

    pipeline_config = SimpleKGPipelineConfig(use_structured_output=True)
    schema_component = pipeline_config._get_schema()

    assert isinstance(schema_component, SchemaFromTextExtractor)
    assert schema_component.use_structured_output is True


@patch(
"neo4j_graphrag.experimental.components.kg_writer.get_version",
return_value=((5, 23, 0), False, False),
Expand Down
36 changes: 36 additions & 0 deletions tests/unit/experimental/pipeline/test_kg_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,3 +193,39 @@ async def test_knowledge_graph_builder_with_lexical_graph_config(_: Mock) -> Non
assert pipe_inputs["extractor"]["lexical_graph_config"] == lexical_graph_config
assert pipe_inputs["extractor"]["document_info"] is not None
assert pipe_inputs["extractor"]["document_info"]["path"] == "document.txt"


@mock.patch(
    "neo4j_graphrag.experimental.components.kg_writer.get_version",
    return_value=((5, 23, 0), False, False),
)
def test_simple_kg_pipeline_accepts_use_structured_output(_: Mock) -> None:
    """Building a ``SimpleKGPipeline`` with ``use_structured_output=True`` succeeds."""
    capable_llm = MagicMock(spec=LLMInterface)
    capable_llm.supports_structured_output = True

    # Construction itself is the check: it must complete without raising.
    pipeline = SimpleKGPipeline(
        llm=capable_llm,
        driver=MagicMock(spec=neo4j.Driver),
        embedder=MagicMock(spec=Embedder),
        from_pdf=False,
        use_structured_output=True,
    )
    assert pipeline is not None


def test_simple_kg_pipeline_use_structured_output_unsupported_llm() -> None:
    """Requesting structured output with an LLM that lacks support raises ``ValueError``."""
    unsupported_llm = MagicMock(spec=LLMInterface)
    unsupported_llm.supports_structured_output = False

    pipeline_kwargs = {
        "llm": unsupported_llm,
        "driver": MagicMock(spec=neo4j.Driver),
        "embedder": MagicMock(spec=Embedder),
        "use_structured_output": True,
    }

    # The error is expected while the pipeline is being constructed.
    with pytest.raises(ValueError):
        SimpleKGPipeline(**pipeline_kwargs)
Loading