feat: Neo4j Destination Connector (#212)

ds-filipknefel · Filip Knefel · rbiseck3 · web-flow · commit 4e628590e22d · 2024-12-16T17:23:46.000+01:00
Implements Neo4j destination connector.

Connector takes Unstructured Elements and builds a lexical graph representing relationships between them. It consists of the following entities.

Nodes
- Document - represents the source file the elements come from
- UnstructuredElement - represents the Unstructured Element prior to chunking
- Chunk - represents the Unstructured Element post chunking

Edges (Relationships)
- UnstructuredElement/Chunk - PART_OF_DOCUMENT -> Document - relationship of belonging to the source file
- UnstructuredElement - PART_OF_CHUNK -> Chunk - relationship between origin elements making up a chunk
- UnstructuredElement - NEXT_ELEMENT -> UnstructuredElement - order of occurrence in the document
- Chunk - NEXT_CHUNK -> Chunk - order of occurrence in the document

---------

Co-authored-by: Filip Knefel <filip@unstructured.io>
Co-authored-by: Roman Isecke <roman@unstructured.io>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,8 +1,9 @@
-## 0.3.9-dev2
+## 0.3.9-dev3
 
 ### Enhancements
 
 * **Support ndjson files in stagers**
+* **Add Neo4j destination connector**
 
 ### Fixes
 
diff --git a/requirements/connectors/neo4j.in b/requirements/connectors/neo4j.in
@@ -0,0 +1,2 @@
+neo4j
+cymple
diff --git a/requirements/connectors/neo4j.txt b/requirements/connectors/neo4j.txt
@@ -0,0 +1,8 @@
+# This file was autogenerated by uv via the following command:
+#    uv pip compile ./connectors/neo4j.in --output-file ./connectors/neo4j.txt --no-strip-extras --python-version 3.9
+cymple==0.11.0
+    # via -r ./connectors/neo4j.in
+neo4j==5.25.0
+    # via -r ./connectors/neo4j.in
+pytz==2024.2
+    # via neo4j
diff --git a/requirements/test.in b/requirements/test.in
@@ -3,6 +3,8 @@
 pytest
 pytest-cov
 pytest-mock
+pytest-check
+unstructured
 pytest-asyncio
 pytest_tagging
 pytest-json-report
diff --git a/requirements/test.txt b/requirements/test.txt
@@ -124,13 +124,16 @@ pytest==8.3.4
     # via
     #   -r test.in
     #   pytest-asyncio
+    #   pytest-check
     #   pytest-cov
     #   pytest-json-report
     #   pytest-metadata
     #   pytest-mock
     #   pytest-tagging
 pytest-asyncio==0.25.0
     # via -r test.in
+pytest-check==2.4.1
+    # via -r test.in
 pytest-cov==6.0.0
     # via -r test.in
 pytest-json-report==1.5.0
diff --git a/setup.py b/setup.py
@@ -108,6 +108,7 @@ def load_requirements(file: Union[str, Path]) -> List[str]:
     "lancedb": load_requirements("requirements/connectors/lancedb.in"),
     "milvus": load_requirements("requirements/connectors/milvus.in"),
     "mongodb": load_requirements("requirements/connectors/mongodb.in"),
+    "neo4j": load_requirements("requirements/connectors/neo4j.in"),
     "notion": load_requirements("requirements/connectors/notion.in"),
     "onedrive": load_requirements("requirements/connectors/onedrive.in"),
     "opensearch": load_requirements("requirements/connectors/opensearch.in"),
diff --git a/test/integration/connectors/test_neo4j.py b/test/integration/connectors/test_neo4j.py
@@ -0,0 +1,236 @@
+import json
+import time
+import uuid
+from datetime import datetime
+from pathlib import Path
+
+import pytest
+from neo4j import AsyncGraphDatabase, Driver, GraphDatabase
+from neo4j.exceptions import ServiceUnavailable
+from pytest_check import check
+
+from test.integration.connectors.utils.constants import DESTINATION_TAG
+from test.integration.connectors.utils.docker import container_context
+from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.utils.chunking import elements_from_base64_gzipped_json
+from unstructured_ingest.v2.interfaces.file_data import (
+    FileData,
+    FileDataSourceMetadata,
+    SourceIdentifiers,
+)
+from unstructured_ingest.v2.processes.connectors.neo4j import (
+    CONNECTOR_TYPE,
+    Label,
+    Neo4jAccessConfig,
+    Neo4jConnectionConfig,
+    Neo4jUploader,
+    Neo4jUploaderConfig,
+    Neo4jUploadStager,
+    Relationship,
+)
+
+USERNAME = "neo4j"
+PASSWORD = "password"
+URI = "neo4j://localhost:7687"
+DATABASE = "neo4j"
+
+EXPECTED_DOCUMENT_COUNT = 1
+
+
+# NOTE: Precheck tests are read-only so we utilize the same container for all tests.
+# If new tests require clean neo4j container, this fixture's scope should be adjusted.
+@pytest.fixture(autouse=True, scope="module")
+def _neo4j_server():
+    with container_context(
+        image="neo4j:latest", environment={"NEO4J_AUTH": "neo4j/password"}, ports={"7687": "7687"}
+    ):
+        driver = GraphDatabase.driver(uri=URI, auth=(USERNAME, PASSWORD))
+        wait_for_connection(driver)
+        driver.close()
+        yield
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(DESTINATION_TAG, CONNECTOR_TYPE)
+async def test_neo4j_destination(upload_file: Path, tmp_path: Path):
+    stager = Neo4jUploadStager()
+    uploader = Neo4jUploader(
+        connection_config=Neo4jConnectionConfig(
+            access_config=Neo4jAccessConfig(password=PASSWORD),  # type: ignore
+            username=USERNAME,
+            uri=URI,
+            database=DATABASE,
+        ),
+        upload_config=Neo4jUploaderConfig(),
+    )
+    file_data = FileData(
+        identifier="mock-file-data",
+        connector_type="neo4j",
+        source_identifiers=SourceIdentifiers(
+            filename=upload_file.name,
+            fullpath=upload_file.name,
+        ),
+        metadata=FileDataSourceMetadata(
+            date_created=str(datetime(2022, 1, 1).timestamp()),
+            date_modified=str(datetime(2022, 1, 2).timestamp()),
+        ),
+    )
+    staged_filepath = stager.run(
+        upload_file,
+        file_data=file_data,
+        output_dir=tmp_path,
+        output_filename=upload_file.name,
+    )
+
+    await uploader.run_async(staged_filepath, file_data)
+    await validate_uploaded_graph(upload_file)
+
+    modified_upload_file = tmp_path / f"modified-{upload_file.name}"
+    with open(upload_file) as file:
+        elements = json.load(file)
+        for element in elements:
+            element["element_id"] = str(uuid.uuid4())
+
+    with open(modified_upload_file, "w") as file:
+        json.dump(elements, file, indent=4)
+
+    staged_filepath = stager.run(
+        modified_upload_file,
+        file_data=file_data,
+        output_dir=tmp_path,
+        output_filename=modified_upload_file.name,
+    )
+    await uploader.run_async(staged_filepath, file_data)
+    await validate_uploaded_graph(modified_upload_file)
+
+
+@pytest.mark.tags(DESTINATION_TAG, CONNECTOR_TYPE)
+class TestPrecheck:
+    @pytest.fixture
+    def configured_uploader(self) -> Neo4jUploader:
+        return Neo4jUploader(
+            connection_config=Neo4jConnectionConfig(
+                access_config=Neo4jAccessConfig(password=PASSWORD),  # type: ignore
+                username=USERNAME,
+                uri=URI,
+                database=DATABASE,
+            ),
+            upload_config=Neo4jUploaderConfig(),
+        )
+
+    def test_succeeds(self, configured_uploader: Neo4jUploader):
+        configured_uploader.precheck()
+
+    def test_fails_on_invalid_password(self, configured_uploader: Neo4jUploader):
+        configured_uploader.connection_config.access_config.get_secret_value().password = (
+            "invalid-password"
+        )
+        with pytest.raises(
+            DestinationConnectionError,
+            match="{code: Neo.ClientError.Security.Unauthorized}",
+        ):
+            configured_uploader.precheck()
+
+    def test_fails_on_invalid_username(self, configured_uploader: Neo4jUploader):
+        configured_uploader.connection_config.username = "invalid-username"
+        with pytest.raises(
+            DestinationConnectionError, match="{code: Neo.ClientError.Security.Unauthorized}"
+        ):
+            configured_uploader.precheck()
+
+    @pytest.mark.parametrize(
+        ("uri", "expected_error_msg"),
+        [
+            ("neo4j://localhst:7687", "Cannot resolve address"),
+            ("neo4j://localhost:7777", "Unable to retrieve routing information"),
+        ],
+    )
+    def test_fails_on_invalid_uri(
+        self, configured_uploader: Neo4jUploader, uri: str, expected_error_msg: str
+    ):
+        configured_uploader.connection_config.uri = uri
+        with pytest.raises(DestinationConnectionError, match=expected_error_msg):
+            configured_uploader.precheck()
+
+    def test_fails_on_invalid_database(self, configured_uploader: Neo4jUploader):
+        configured_uploader.connection_config.database = "invalid-database"
+        with pytest.raises(
+            DestinationConnectionError, match="{code: Neo.ClientError.Database.DatabaseNotFound}"
+        ):
+            configured_uploader.precheck()
+
+
+def wait_for_connection(driver: Driver, retries: int = 10, delay_seconds: int = 2):
+    attempts = 0
+    while attempts < retries:
+        try:
+            driver.verify_connectivity()
+            return
+        except ServiceUnavailable:
+            time.sleep(delay_seconds)
+            attempts += 1
+
+    pytest.fail("Failed to connect with Neo4j server.")
+
+
+async def validate_uploaded_graph(upload_file: Path):
+    with open(upload_file) as file:
+        elements = json.load(file)
+
+    for element in elements:
+        if "orig_elements" in element["metadata"]:
+            element["metadata"]["orig_elements"] = elements_from_base64_gzipped_json(
+                element["metadata"]["orig_elements"]
+            )
+        else:
+            element["metadata"]["orig_elements"] = []
+
+    expected_chunks_count = len(elements)
+    expected_element_count = len(
+        {
+            origin_element["element_id"]
+            for chunk in elements
+            for origin_element in chunk["metadata"]["orig_elements"]
+        }
+    )
+    expected_nodes_count = expected_chunks_count + expected_element_count + EXPECTED_DOCUMENT_COUNT
+
+    driver = AsyncGraphDatabase.driver(uri=URI, auth=(USERNAME, PASSWORD))
+    try:
+        nodes_count = len((await driver.execute_query("MATCH (n) RETURN n"))[0])
+        chunk_nodes_count = len(
+            (await driver.execute_query(f"MATCH (n: {Label.CHUNK}) RETURN n"))[0]
+        )
+        document_nodes_count = len(
+            (await driver.execute_query(f"MATCH (n: {Label.DOCUMENT}) RETURN n"))[0]
+        )
+        element_nodes_count = len(
+            (await driver.execute_query(f"MATCH (n: {Label.UNSTRUCTURED_ELEMENT}) RETURN n"))[0]
+        )
+        with check:
+            assert nodes_count == expected_nodes_count
+        with check:
+            assert document_nodes_count == EXPECTED_DOCUMENT_COUNT
+        with check:
+            assert chunk_nodes_count == expected_chunks_count
+        with check:
+            assert element_nodes_count == expected_element_count
+
+        records, _, _ = await driver.execute_query(
+            f"MATCH ()-[r:{Relationship.PART_OF_DOCUMENT}]->(:{Label.DOCUMENT}) RETURN r"
+        )
+        part_of_document_count = len(records)
+
+        records, _, _ = await driver.execute_query(
+            f"MATCH (:{Label.CHUNK})-[r:{Relationship.NEXT_CHUNK}]->(:{Label.CHUNK}) RETURN r"
+        )
+        next_chunk_count = len(records)
+
+        if not check.any_failures():
+            with check:
+                assert part_of_document_count == expected_chunks_count + expected_element_count
+            with check:
+                assert next_chunk_count == expected_chunks_count - 1
+
+    finally:
+        await driver.close()
diff --git a/unstructured_ingest/__version__.py b/unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.3.9-dev2"  # pragma: no cover
+__version__ = "0.3.9-dev3"  # pragma: no cover
diff --git a/unstructured_ingest/utils/chunking.py b/unstructured_ingest/utils/chunking.py
@@ -1,4 +1,7 @@
+import base64
 import hashlib
+import json
+import zlib
 from itertools import groupby
 
 
@@ -43,3 +46,11 @@ def assign_and_map_hash_ids(elements: list[dict]) -> list[dict]:
         e["metadata"]["parent_id"] = old_to_new_mapping[parent_id]
 
     return elements
+
+
+def elements_from_base64_gzipped_json(raw_s: str) -> list[dict]:
+    decoded_b64_bytes = base64.b64decode(raw_s)
+    elements_json_bytes = zlib.decompress(decoded_b64_bytes)
+    elements_json_str = elements_json_bytes.decode("utf-8")
+    element_dicts = json.loads(elements_json_str)
+    return element_dicts
diff --git a/unstructured_ingest/v2/processes/connectors/neo4j.py b/unstructured_ingest/v2/processes/connectors/neo4j.py

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.3.9-dev2" # pragma: no cover`
	`1`	`+__version__ = "0.3.9-dev3" # pragma: no cover`