diff --git a/CHANGELOG.md b/CHANGELOG.md index 1f22d4cd8..ab1334904 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ - Updated examples, default values, and documentation to use `gpt-4.1` / `gpt-4.1-mini` instead of deprecated GPT-4* models (e.g. `gpt-4o`, `gpt-4`). +### Added + +- Exposed `use_structured_output` parameter in `SimpleKGPipeline` constructor for enabling structured output in entity extraction and automatic schema extraction with supported LLMs (OpenAI, VertexAI). + ## 1.13.1 - Fixed invalid lexical graph relationships causing "Relationship references unknown start node" errors during parquet import when nodes are pruned. diff --git a/docs/source/user_guide_kg_builder.rst b/docs/source/user_guide_kg_builder.rst index 911b8f6d0..7d1a02568 100644 --- a/docs/source/user_guide_kg_builder.rst +++ b/docs/source/user_guide_kg_builder.rst @@ -162,6 +162,26 @@ They are also accessible via the `SimpleKGPipeline` interface. # ... ) +Structured Output +----------------- + +When using an LLM that supports structured output (such as OpenAI or VertexAI), +you can enable it to improve the reliability of entity extraction and automatic +schema extraction: + +.. code:: python + + kg_builder = SimpleKGPipeline( + # ... + use_structured_output=True, + # ... + ) + +.. note:: + + Structured output is only supported by LLMs that have ``supports_structured_output=True`` + (currently ``OpenAILLM`` and ``VertexAILLM``). Enabling it with an unsupported LLM raises a ``ValueError`` when the pipeline is constructed. + Skip Entity Resolution ---------------------- @@ -479,7 +499,8 @@ within the configuration file. 
}, "lexical_graph_config": { "chunk_node_label": "TextPart" - } + }, + "use_structured_output": false } @@ -520,6 +541,7 @@ or in YAML: - ["House", "RULES", "Planet"] lexical_graph_config: chunk_node_label: TextPart + use_structured_output: false It is also possible to further customize components, with a syntax similar to the one diff --git a/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py b/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py index 929111898..71e2ea81f 100644 --- a/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py +++ b/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py @@ -94,6 +94,7 @@ class SimpleKGPipelineConfig(TemplatePipelineConfig): perform_entity_resolution: bool = True lexical_graph_config: Optional[LexicalGraphConfig] = None neo4j_database: Optional[str] = None + use_structured_output: bool = False pdf_loader: Optional[ComponentType] = None kg_writer: Optional[ComponentType] = None @@ -186,7 +187,10 @@ def _get_schema(self) -> BaseSchemaBuilder: Return SchemaFromTextExtractor for automatic extraction or SchemaBuilder for manual schema. 
""" if not self.has_user_provided_schema(): - return SchemaFromTextExtractor(llm=self.get_default_llm()) + return SchemaFromTextExtractor( + llm=self.get_default_llm(), + use_structured_output=self.use_structured_output, + ) return SchemaBuilder() def _process_schema_with_precedence(self) -> dict[str, Any]: @@ -222,6 +226,7 @@ def _get_extractor(self) -> EntityRelationExtractor: llm=self.get_default_llm(), prompt_template=self.prompt_template, on_error=self.on_error, + use_structured_output=self.use_structured_output, ) def _get_pruner(self) -> GraphPruning: diff --git a/src/neo4j_graphrag/experimental/pipeline/kg_builder.py b/src/neo4j_graphrag/experimental/pipeline/kg_builder.py index b7313b3b0..1ce0f41e7 100644 --- a/src/neo4j_graphrag/experimental/pipeline/kg_builder.py +++ b/src/neo4j_graphrag/experimental/pipeline/kg_builder.py @@ -89,6 +89,7 @@ class SimpleKGPipeline: perform_entity_resolution (bool): Merge entities with same label and name. Default: True prompt_template (str): A custom prompt template to use for extraction. lexical_graph_config (Optional[LexicalGraphConfig], optional): Lexical graph configuration to customize node labels and relationship types in the lexical graph. + use_structured_output (bool): Whether to use structured output (LLMInterfaceV2) for entity extraction and automatic schema extraction. Only supported for OpenAILLM and VertexAILLM. Defaults to False. 
""" def __init__( @@ -115,6 +116,7 @@ def __init__( perform_entity_resolution: bool = True, lexical_graph_config: Optional[LexicalGraphConfig] = None, neo4j_database: Optional[str] = None, + use_structured_output: bool = False, ): try: config = SimpleKGPipelineConfig.model_validate( @@ -137,6 +139,7 @@ def __init__( perform_entity_resolution=perform_entity_resolution, lexical_graph_config=lexical_graph_config, neo4j_database=neo4j_database, + use_structured_output=use_structured_output, ) ) except (ValidationError, ValueError) as e: diff --git a/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py b/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py index bedccf0c3..5328bdbd1 100644 --- a/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py +++ b/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py @@ -178,6 +178,39 @@ def test_simple_kg_pipeline_config_extractor(mock_llm: Mock, llm: LLMInterface) assert extractor.prompt_template.template == "my template {text}" +@patch( + "neo4j_graphrag.experimental.pipeline.config.template_pipeline.simple_kg_builder.SimpleKGPipelineConfig.get_default_llm" +) +def test_simple_kg_pipeline_config_extractor_with_structured_output( + mock_llm: Mock, llm: LLMInterface +) -> None: + llm.supports_structured_output = True + mock_llm.return_value = llm + config = SimpleKGPipelineConfig( + on_error="IGNORE", # type: ignore + use_structured_output=True, + ) + extractor = config._get_extractor() + assert isinstance(extractor, LLMEntityRelationExtractor) + assert extractor.use_structured_output is True + + +@patch( + "neo4j_graphrag.experimental.pipeline.config.template_pipeline.simple_kg_builder.SimpleKGPipelineConfig.get_default_llm" +) +def test_simple_kg_pipeline_config_schema_with_structured_output( + mock_llm: Mock, llm: LLMInterface +) -> None: + llm.supports_structured_output = True + mock_llm.return_value = 
llm + config = SimpleKGPipelineConfig( + use_structured_output=True, + ) + schema = config._get_schema() + assert isinstance(schema, SchemaFromTextExtractor) + assert schema.use_structured_output is True + + @patch( "neo4j_graphrag.experimental.components.kg_writer.get_version", return_value=((5, 23, 0), False, False), diff --git a/tests/unit/experimental/pipeline/test_kg_builder.py b/tests/unit/experimental/pipeline/test_kg_builder.py index 8634a1e4e..2caf4e650 100644 --- a/tests/unit/experimental/pipeline/test_kg_builder.py +++ b/tests/unit/experimental/pipeline/test_kg_builder.py @@ -193,3 +193,39 @@ async def test_knowledge_graph_builder_with_lexical_graph_config(_: Mock) -> Non assert pipe_inputs["extractor"]["lexical_graph_config"] == lexical_graph_config assert pipe_inputs["extractor"]["document_info"] is not None assert pipe_inputs["extractor"]["document_info"]["path"] == "document.txt" + + +@mock.patch( + "neo4j_graphrag.experimental.components.kg_writer.get_version", + return_value=((5, 23, 0), False, False), +) +def test_simple_kg_pipeline_accepts_use_structured_output(_: Mock) -> None: + llm = MagicMock(spec=LLMInterface) + llm.supports_structured_output = True + driver = MagicMock(spec=neo4j.Driver) + embedder = MagicMock(spec=Embedder) + + # Should not raise + kg_builder = SimpleKGPipeline( + llm=llm, + driver=driver, + embedder=embedder, + from_pdf=False, + use_structured_output=True, + ) + assert kg_builder is not None + + +def test_simple_kg_pipeline_use_structured_output_unsupported_llm() -> None: + llm = MagicMock(spec=LLMInterface) + llm.supports_structured_output = False + driver = MagicMock(spec=neo4j.Driver) + embedder = MagicMock(spec=Embedder) + + with pytest.raises(ValueError): + SimpleKGPipeline( + llm=llm, + driver=driver, + embedder=embedder, + use_structured_output=True, + )