Skip to content

Commit dc1101d

Browse files
committed
Use json-repair package to fix LLM generated json
1 parent bc8540e commit dc1101d

File tree

9 files changed

+1504
-1663
lines changed

9 files changed

+1504
-1663
lines changed

docs/source/api.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -489,3 +489,10 @@ PipelineStatusUpdateError
489489

490490
.. autoclass:: neo4j_graphrag.experimental.pipeline.exceptions.PipelineStatusUpdateError
491491
:show-inheritance:
492+
493+
494+
JSONRepairError
495+
===============
496+
497+
.. autoclass:: neo4j_graphrag.experimental.pipeline.exceptions.JSONRepairError
498+
:show-inheritance:
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
"""This example illustrates how to get started easily with the SimpleKGPipeline
2+
and ingest PDF into a Neo4j Knowledge Graph.
3+
4+
This example assumes a Neo4j db is up and running. Update the credentials below
5+
if needed.
6+
7+
It's assumed Ollama is used to run a model locally.
8+
"""
9+
10+
import asyncio
11+
import ollama
12+
from pathlib import Path
13+
14+
import neo4j
15+
from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline
16+
from neo4j_graphrag.experimental.pipeline.pipeline import PipelineResult
17+
from neo4j_graphrag.llm import LLMInterface, LLMResponse
18+
19+
from llama_index.embeddings.ollama import OllamaEmbedding
20+
from neo4j_graphrag.embeddings.base import Embedder
21+
22+
23+
class OllamaEmbedder(Embedder):
    """Adapter exposing a llama-index ``OllamaEmbedding`` through the
    neo4j-graphrag ``Embedder`` interface."""

    def __init__(self, ollama_embedding: OllamaEmbedding) -> None:
        self.embedder = ollama_embedding

    def embed_query(self, text: str) -> list[float]:
        """Embed a single text and return its vector."""
        # The underlying API is batch-oriented: send a one-item batch and
        # unwrap the single resulting embedding.
        batch: list[list[float]] = self.embedder.get_text_embedding_batch(
            [text], show_progress=True
        )
        return batch[0]
32+
33+
34+
# Embedder backed by a locally running Ollama model (served by llama-index).
ollama_embedding = OllamaEmbedding(
    model_name="qwen2",
    base_url="http://localhost:11434",
    ollama_additional_kwargs={"mirostat": 0},
)
embedder = OllamaEmbedder(ollama_embedding)

# Neo4j db infos
URI = "neo4j://localhost:7687"
AUTH = ("neo4j", "password")
DATABASE = "neo4j"

# NOTE(review): root_dir is currently unused — file_path is a plain
# repo-relative string; confirm whether it should be joined with root_dir.
root_dir = Path(__file__).parents[4]
file_path = "examples/data/Harry Potter and the Chamber of Secrets Summary.pdf"

# Instantiate Entity and Relation objects. This defines the
# entities and relations the LLM will be looking for in the text.
ENTITIES = ["Person", "Organization", "Location"]
RELATIONS = ["SITUATED_AT", "INTERACTS", "LED_BY"]
POTENTIAL_SCHEMA = [
    ("Person", "SITUATED_AT", "Location"),
    ("Person", "INTERACTS", "Person"),
    ("Organization", "LED_BY", "Person"),
]
60+
61+
62+
async def define_and_run_pipeline(
    neo4j_driver: neo4j.Driver,
    llm: LLMInterface,
) -> PipelineResult:
    """Build a SimpleKGPipeline from the module-level schema and run it
    asynchronously on the example PDF, returning the pipeline result."""
    pipeline = SimpleKGPipeline(
        llm=llm,
        driver=neo4j_driver,
        embedder=embedder,
        entities=ENTITIES,
        relations=RELATIONS,
        potential_schema=POTENTIAL_SCHEMA,
    )
    return await pipeline.run_async(file_path=str(file_path))
76+
77+
78+
async def main() -> PipelineResult:
    """Run the KG-building pipeline against a local Ollama LLM."""

    class OllamaLLM(LLMInterface):
        """Minimal LLMInterface adapter over the ollama chat API."""

        def invoke(self, input: str) -> LLMResponse:
            # Temperature 0 for deterministic, extraction-friendly output.
            chat_result = ollama.chat(
                model=self.model_name,
                messages=[{"role": "user", "content": input}],
                options={"temperature": 0.0},
            )
            return LLMResponse(content=chat_result["message"]["content"])

        async def ainvoke(self, input: str) -> LLMResponse:
            return self.invoke(input)  # TODO: implement async with ollama.AsyncClient

    llm = OllamaLLM("llama3.1")
    with neo4j.GraphDatabase.driver(URI, auth=AUTH, database=DATABASE) as driver:
        res = await define_and_run_pipeline(driver, llm)
    return res
101+
102+
103+
if __name__ == "__main__":
    # Drive the async pipeline from the synchronous script entry point.
    pipeline_result = asyncio.run(main())
    print(pipeline_result)

poetry.lock

Lines changed: 1323 additions & 1609 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,10 +44,13 @@ google-cloud-aiplatform = {version = "^1.66.0", optional = true }
4444
cohere = {version = "^5.9.0", optional = true}
4545
mistralai = {version = "^1.0.3", optional = true}
4646
qdrant-client = {version = "^1.11.3", optional = true}
47-
llama-index = {version = "^0.10.55", optional = true }
4847
openai = {version = "^1.51.1", optional = true }
4948
anthropic = { version = "^0.36.0", optional = true}
5049
sentence-transformers = {version = "^3.0.0", optional = true }
50+
ollama = "^0"
51+
setuptools = "^75.6.0"
52+
llama-index-embeddings-ollama = "^0.4.0"
53+
json-repair = "^0.30.2"
5154

5255
[tool.poetry.group.dev.dependencies]
5356
urllib3 = "<2"

src/neo4j_graphrag/experimental/components/entity_relation_extractor.py

Lines changed: 26 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,11 @@
1919
import enum
2020
import json
2121
import logging
22-
import re
2322
from datetime import datetime
2423
from typing import Any, List, Optional, Union
2524

25+
import json_repair
26+
2627
from pydantic import ValidationError, validate_call
2728

2829
from neo4j_graphrag.exceptions import LLMGenerationError
@@ -36,6 +37,7 @@
3637
TextChunks,
3738
)
3839
from neo4j_graphrag.experimental.pipeline.component import Component
40+
from neo4j_graphrag.experimental.pipeline.exceptions import JSONRepairError
3941
from neo4j_graphrag.generation.prompts import ERExtractionTemplate, PromptTemplate
4042
from neo4j_graphrag.llm import LLMInterface
4143

@@ -100,28 +102,19 @@ def balance_curly_braces(json_string: str) -> str:
100102
return "".join(fixed_json)
101103

102104

103-
def fix_invalid_json(invalid_json_string: str) -> str:
104-
# Fix missing quotes around field names
105-
invalid_json_string = re.sub(
106-
r"([{,]\s*)(\w+)(\s*:)", r'\1"\2"\3', invalid_json_string
107-
)
108-
109-
# Fix missing quotes around string values, correctly ignoring null, true, false, and numeric values
110-
invalid_json_string = re.sub(
111-
r"(?<=:\s)(?!(null|true|false|\d+\.?\d*))([a-zA-Z_][a-zA-Z0-9_]*)\s*(?=[,}])",
112-
r'"\2"',
113-
invalid_json_string,
114-
)
115-
116-
# Correct the specific issue: remove trailing commas within arrays or objects before closing braces or brackets
117-
invalid_json_string = re.sub(r",\s*(?=[}\]])", "", invalid_json_string)
105+
def fix_invalid_json(raw_json: str) -> str:
    """Repair a potentially malformed LLM-generated JSON string.

    Delegates to the ``json-repair`` package, then normalizes the result.

    Args:
        raw_json: The raw (possibly invalid) JSON text from the LLM.

    Returns:
        The repaired JSON string, stripped of surrounding whitespace.

    Raises:
        JSONRepairError: If repair yields an empty string or the empty
            JSON string literal ``""`` (i.e. nothing usable was recovered).
    """
    repaired_json = json_repair.repair_json(raw_json)

    if isinstance(repaired_json, str):
        repaired_json = repaired_json.strip()
    else:
        # Defensive: treat any non-string return as unrecoverable.
        repaired_json = ""

    # repaired_json is already stripped here, so no further .strip() needed.
    if repaired_json == '""':
        raise JSONRepairError("JSON repair resulted in an empty or invalid JSON.")
    if not repaired_json:
        raise JSONRepairError("JSON repair resulted in an empty string.")
    return repaired_json
125118

126119

127120
class EntityRelationExtractor(Component, abc.ABC):
@@ -223,24 +216,18 @@ async def extract_for_chunk(
223216
)
224217
llm_result = await self.llm.ainvoke(prompt)
225218
try:
226-
result = json.loads(llm_result.content)
227-
except json.JSONDecodeError:
228-
logger.info(
229-
f"LLM response is not valid JSON {llm_result.content} for chunk_index={chunk.index}. Trying to fix it."
230-
)
231-
fixed_content = fix_invalid_json(llm_result.content)
232-
try:
233-
result = json.loads(fixed_content)
234-
except json.JSONDecodeError as e:
235-
if self.on_error == OnError.RAISE:
236-
raise LLMGenerationError(
237-
f"LLM response is not valid JSON {fixed_content}: {e}"
238-
)
239-
else:
240-
logger.error(
241-
f"LLM response is not valid JSON {llm_result.content} for chunk_index={chunk.index}"
242-
)
243-
result = {"nodes": [], "relationships": []}
219+
llm_generated_json = fix_invalid_json(llm_result.content)
220+
result = json.loads(llm_generated_json)
221+
except (json.JSONDecodeError, JSONRepairError) as e:
222+
if self.on_error == OnError.RAISE:
223+
raise LLMGenerationError(
224+
f"LLM response is not valid JSON {llm_result.content}: {e}"
225+
)
226+
else:
227+
logger.error(
228+
f"LLM response is not valid JSON {llm_result.content} for chunk_index={chunk.index}"
229+
)
230+
result = {"nodes": [], "relationships": []}
244231
try:
245232
chunk_graph = Neo4jGraph(**result)
246233
except ValidationError as e:

src/neo4j_graphrag/experimental/pipeline/exceptions.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,3 +31,9 @@ class PipelineStatusUpdateError(Neo4jGraphRagError):
3131
"""Raises when trying an invalid change of state (e.g. DONE => DOING)"""
3232

3333
pass
34+
35+
36+
class JSONRepairError(Neo4jGraphRagError):
    """Raised when JSON repair fails to produce valid JSON."""

src/neo4j_graphrag/generation/prompts.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,11 @@ class ERExtractionTemplate(PromptTemplate):
174174
Do respect the source and target node types for relationship and
175175
the relationship direction.
176176
177-
Do not return any additional information other than the JSON in it.
177+
Make sure you adhere to the following rules to produce valid JSON objects:
178+
- Do not return any additional information other than the JSON in it.
179+
- Omit any backticks around the JSON - simply output the JSON on its own.
180+
- The JSON object must not wrapped into a list - it is its own JSON object.
181+
- Property names must be enclosed in double quotes
178182
179183
Examples:
180184
{examples}

src/neo4j_graphrag/llm/mistralai_llm.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -84,10 +84,11 @@ def invoke(self, input: str) -> LLMResponse:
8484
messages=self.get_messages(input),
8585
**self.model_params,
8686
)
87-
if response is None or response.choices is None or not response.choices:
88-
content = ""
89-
else:
90-
content = response.choices[0].message.content or ""
87+
content: str = ""
88+
if response and response.choices:
89+
possible_content = response.choices[0].message.content
90+
if isinstance(possible_content, str):
91+
content = possible_content
9192
return LLMResponse(content=content)
9293
except SDKError as e:
9394
raise LLMGenerationError(e)
@@ -111,10 +112,11 @@ async def ainvoke(self, input: str) -> LLMResponse:
111112
messages=self.get_messages(input),
112113
**self.model_params,
113114
)
114-
if response is None or response.choices is None or not response.choices:
115-
content = ""
116-
else:
117-
content = response.choices[0].message.content or ""
115+
content: str = ""
116+
if response and response.choices:
117+
possible_content = response.choices[0].message.content
118+
if isinstance(possible_content, str):
119+
content = possible_content
118120
return LLMResponse(content=content)
119121
except SDKError as e:
120122
raise LLMGenerationError(e)

tests/unit/experimental/components/test_entity_relation_extractor.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from __future__ import annotations
1616

1717
import json
18-
from unittest.mock import MagicMock
18+
from unittest.mock import MagicMock, patch
1919

2020
import pytest
2121
from neo4j_graphrag.exceptions import LLMGenerationError
@@ -31,6 +31,7 @@
3131
TextChunk,
3232
TextChunks,
3333
)
34+
from neo4j_graphrag.experimental.pipeline.exceptions import JSONRepairError
3435
from neo4j_graphrag.llm import LLMInterface, LLMResponse
3536

3637

@@ -154,8 +155,8 @@ async def test_extractor_llm_badly_formatted_json() -> None:
154155
llm=llm,
155156
)
156157
chunks = TextChunks(chunks=[TextChunk(text="some text", index=0)])
157-
with pytest.raises(LLMGenerationError):
158-
await extractor.run(chunks=chunks)
158+
159+
await extractor.run(chunks=chunks)
159160

160161

161162
@pytest.mark.asyncio
@@ -177,7 +178,7 @@ async def test_extractor_llm_invalid_json() -> None:
177178

178179

179180
@pytest.mark.asyncio
180-
async def test_extractor_llm_badly_formatted_json_do_not_raise() -> None:
181+
async def test_extractor_llm_badly_formatted_json_gets_fixed() -> None:
181182
llm = MagicMock(spec=LLMInterface)
182183
llm.ainvoke.return_value = LLMResponse(
183184
content='{"nodes": [{"id": "0", "label": "Person", "properties": {}}], "relationships": [}'
@@ -190,7 +191,11 @@ async def test_extractor_llm_badly_formatted_json_do_not_raise() -> None:
190191
)
191192
chunks = TextChunks(chunks=[TextChunk(text="some text", index=0)])
192193
res = await extractor.run(chunks=chunks)
193-
assert res.nodes == []
194+
print("res.nodes", res.nodes)
195+
assert len(res.nodes) == 1
196+
assert res.nodes[0].label == "Person"
197+
assert res.nodes[0].properties == {"chunk_index": 0}
198+
assert res.nodes[0].embedding_properties is None
194199
assert res.relationships == []
195200

196201

@@ -205,6 +210,14 @@ async def test_extractor_custom_prompt() -> None:
205210
llm.ainvoke.assert_called_once_with("this is my prompt")
206211

207212

213+
def test_fix_invalid_json_empty_result() -> None:
214+
json_string = "invalid json"
215+
216+
with patch("json_repair.repair_json", return_value=""):
217+
with pytest.raises(JSONRepairError):
218+
fix_invalid_json(json_string)
219+
220+
208221
def test_fix_unquoted_keys() -> None:
209222
json_string = '{name: "John", age: "30"}'
210223
expected_result = '{"name": "John", "age": "30"}'

0 commit comments

Comments
 (0)