Skip to content

Commit a488821

Browse files
tomasonjo and a-s-g93 authored
Sanitize embedding-like values from cypher read tool (#154)
* sanitize embedding-like values from cypher read tool * update changelog, docstring, move to utils --------- Co-authored-by: alex <[email protected]>
1 parent 04e36b7 commit a488821

File tree

3 files changed

+59
-3
lines changed

3 files changed

+59
-3
lines changed

servers/mcp-neo4j-cypher/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
### Changed
66

77
### Added
8+
* Added a Cypher result sanitization function, adapted from Neo4j GraphRAG, that removes embedding-like values (oversized lists) from read query results
89

910
## v0.3.1
1011

servers/mcp-neo4j-cypher/src/mcp_neo4j_cypher/server.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from neo4j import AsyncDriver, AsyncGraphDatabase, RoutingControl
1111
from neo4j.exceptions import ClientError, Neo4jError
1212
from pydantic import Field
13+
from .utils import _value_sanitize
1314

1415
logger = logging.getLogger("mcp_neo4j_cypher")
1516

@@ -180,8 +181,8 @@ async def read_neo4j_cypher(
180181
database_=database,
181182
result_transformer_=lambda r: r.data(),
182183
)
183-
184-
results_json_str = json.dumps(results, default=str)
184+
sanitized_results = [_value_sanitize(el) for el in results]
185+
results_json_str = json.dumps(sanitized_results, default=str)
185186

186187
logger.debug(f"Read query returned {len(results_json_str)} rows")
187188

servers/mcp-neo4j-cypher/src/mcp_neo4j_cypher/utils.py

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import argparse
22
import logging
33
import os
4-
from typing import Union
4+
from typing import Any, Union
55

66
logger = logging.getLogger("mcp_neo4j_cypher")
77
logger.setLevel(logging.INFO)
@@ -169,3 +169,57 @@ def process_config(args: argparse.Namespace) -> dict[str, Union[str, int, None]]
169169
config["path"] = None
170170

171171
return config
172+
173+
def _value_sanitize(d: Any, list_limit: int = 128) -> Any:
174+
"""
175+
Sanitize the input dictionary or list.
176+
177+
Sanitizes the input by removing embedding-like values,
178+
lists with more than 128 elements, that are mostly irrelevant for
179+
generating answers in a LLM context. These properties, if left in
180+
results, can occupy significant context space and detract from
181+
the LLM's performance by introducing unnecessary noise and cost.
182+
183+
Sourced from: https://github.com/neo4j/neo4j-graphrag-python/blob/main/src/neo4j_graphrag/schema.py#L88
184+
185+
Parameters
186+
----------
187+
d : Any
188+
The input dictionary or list to sanitize.
189+
list_limit : int
190+
The limit for the number of elements in a list.
191+
192+
Returns
193+
-------
194+
Any
195+
The sanitized dictionary or list.
196+
"""
197+
if isinstance(d, dict):
198+
new_dict = {}
199+
for key, value in d.items():
200+
if isinstance(value, dict):
201+
sanitized_value = _value_sanitize(value)
202+
if (
203+
sanitized_value is not None
204+
): # Check if the sanitized value is not None
205+
new_dict[key] = sanitized_value
206+
elif isinstance(value, list):
207+
if len(value) < list_limit:
208+
sanitized_value = _value_sanitize(value)
209+
if (
210+
sanitized_value is not None
211+
): # Check if the sanitized value is not None
212+
new_dict[key] = sanitized_value
213+
# Do not include the key if the list is oversized
214+
else:
215+
new_dict[key] = value
216+
return new_dict
217+
elif isinstance(d, list):
218+
if len(d) < list_limit:
219+
return [
220+
_value_sanitize(item) for item in d if _value_sanitize(item) is not None
221+
]
222+
else:
223+
return None
224+
else:
225+
return d

0 commit comments

Comments
 (0)