diff --git a/servers/mcp-neo4j-cypher/CHANGELOG.md b/servers/mcp-neo4j-cypher/CHANGELOG.md index 5e583ae..a8d2568 100644 --- a/servers/mcp-neo4j-cypher/CHANGELOG.md +++ b/servers/mcp-neo4j-cypher/CHANGELOG.md @@ -5,6 +5,7 @@ ### Changed ### Added +* Add `NEO4J_SCHEMA_SAMPLE_SIZE` env variable and `schema-sample-size` cli argument to configure the `get_neo4j_schema` sample size * Update write query detection to include `INSERT` in regex check ## v0.4.1 diff --git a/servers/mcp-neo4j-cypher/README.md b/servers/mcp-neo4j-cypher/README.md index 03cfe32..0f11512 100644 --- a/servers/mcp-neo4j-cypher/README.md +++ b/servers/mcp-neo4j-cypher/README.md @@ -44,8 +44,10 @@ The server offers these core tools: - `get_neo4j_schema` - Get a list of all nodes types in the graph database, their attributes with name, type and relationships to other node types - - No input required + - Input: + - `sample_size` (integer, optional): Number of nodes to sample for schema analysis. Overrides server default if provided. - Returns: JSON serialized list of node labels with two dictionaries: one for attributes and one for relationships + - **Performance**: Uses sampling by default (1000 nodes per label). Reduce the number for faster analysis on large databases. To stop sampling, set to -1. ### 🏷️ Namespacing @@ -105,6 +107,62 @@ When a response exceeds the token limit, it will be automatically truncated to f **Note**: Token limits only apply to `read_neo4j_cypher` responses. Schema queries and write operations return summary information and are not affected. 
+#### 🔍 Schema Sampling + +Control the performance and scope of schema inspection with the `sample_size` parameter for the `get_neo4j_schema` tool: + +**Command Line:** +```bash +mcp-neo4j-cypher --schema-sample-size 1000 # Sample 1000 nodes per label +``` + +**Environment Variable:** +```bash +export NEO4J_SCHEMA_SAMPLE_SIZE=1000 +``` + +**Docker:** +```bash +docker run -e NEO4J_SCHEMA_SAMPLE_SIZE=1000 mcp-neo4j-cypher:latest +``` + +The sample size controls how many nodes are examined when generating the database schema: + +- **Default**: `1000` nodes per label are sampled for schema analysis +- **Performance**: Lower values (`100`, `500`) provide faster schema inspection on large databases +- **Accuracy**: Higher values (`5000`, `10000`) provide more comprehensive schema coverage +- **Full Scan**: Set to `-1` to examine all nodes (can be very slow on large databases) +- **Per-Call Override**: The `get_neo4j_schema` tool accepts a `sample_size` parameter to override the server default + +**How Sampling Works** (via [APOC's apoc.meta.schema](https://neo4j.com/docs/apoc/current/overview/apoc.meta/apoc.meta.schema/)): + +- For each node label, a skip count is calculated: `totalNodesForLabel / sample ± 10%` +- Every Nth node is examined based on the skip count +- Higher sample numbers result in more nodes being examined +- Results may vary between runs due to random sampling + +**Example Scenarios:** + +```bash +# Fast schema inspection for large databases +export NEO4J_SCHEMA_SAMPLE_SIZE=100 + +# Balanced performance and accuracy (default) +export NEO4J_SCHEMA_SAMPLE_SIZE=1000 + +# Comprehensive schema analysis +export NEO4J_SCHEMA_SAMPLE_SIZE=5000 + +# Full database scan (use with caution on large databases) +export NEO4J_SCHEMA_SAMPLE_SIZE=-1 +``` + +**Performance Considerations:** + +- **Large Databases**: Use lower sample values (`100-500`) to prevent timeouts +- **Development**: Higher sample values (`1000-5000`) for thorough schema understanding +- **Production**: 
Balance between performance and schema completeness based on your use case + ## 🏗️ Local Development & Deployment ### 🐳 Local Docker Development @@ -407,6 +465,7 @@ docker run --rm -p 8000:8000 \ | `NEO4J_RESPONSE_TOKEN_LIMIT` | _(none)_ | Maximum tokens for read query responses | | `NEO4J_READ_TIMEOUT` | `30` | Timeout in seconds for read queries | | `NEO4J_READ_ONLY` | `false` | Allow only read-only queries (true/false) | +| `NEO4J_SCHEMA_SAMPLE_SIZE` | `1000` | Number of nodes to sample for schema inspection (set to -1 for full scan) | ### 🌐 SSE Transport for Legacy Web Access diff --git a/servers/mcp-neo4j-cypher/src/mcp_neo4j_cypher/__init__.py b/servers/mcp-neo4j-cypher/src/mcp_neo4j_cypher/__init__.py index d0d6371..996b4cf 100644 --- a/servers/mcp-neo4j-cypher/src/mcp_neo4j_cypher/__init__.py +++ b/servers/mcp-neo4j-cypher/src/mcp_neo4j_cypher/__init__.py @@ -43,6 +43,12 @@ def main(): help="Allow only read-only queries (default: False)", ) parser.add_argument("--token-limit", default=None, help="Response token limit") + parser.add_argument( + "--schema-sample-size", + type=int, + default=None, + help="Default sample size for schema operations (default: 1000)", + ) args = parser.parse_args() config = process_config(args) diff --git a/servers/mcp-neo4j-cypher/src/mcp_neo4j_cypher/server.py b/servers/mcp-neo4j-cypher/src/mcp_neo4j_cypher/server.py index f14cb3d..2d6ae89 100644 --- a/servers/mcp-neo4j-cypher/src/mcp_neo4j_cypher/server.py +++ b/servers/mcp-neo4j-cypher/src/mcp_neo4j_cypher/server.py @@ -44,6 +44,7 @@ def create_mcp_server( read_timeout: int = 30, token_limit: Optional[int] = None, read_only: bool = False, + config_sample_size: int = 1000, ) -> FastMCP: mcp: FastMCP = FastMCP( "mcp-neo4j-cypher", dependencies=["neo4j", "pydantic"], stateless_http=True @@ -62,16 +63,26 @@ def create_mcp_server( openWorldHint=True, ), ) - async def get_neo4j_schema() -> list[ToolResult]: - """ - List all nodes, their attributes and their relationships to other 
nodes in the neo4j database. - This requires that the APOC plugin is installed and enabled. + async def get_neo4j_schema(sample_size: int = Field(default=config_sample_size, description="The sample size used to infer the graph schema. Larger samples are slower, but more accurate. Smaller samples are faster, but might miss information.")) -> list[ToolResult]: """ + Returns nodes, their properties (with types and indexed flags), and relationships + using APOC's schema inspection. + + You should only provide a `sample_size` value if requested by the user, or tuning the retrieval performance. - get_schema_query = """ - CALL apoc.meta.schema(); + Performance Notes: + - If `sample_size` is not provided, uses the server's default sample setting defined in the server configuration. + - If retrieving the schema times out, try lowering the sample size, e.g. `sample_size=100`. + - To sample the entire graph use `sample_size=-1`. """ + # Use provided sample_size, otherwise fall back to server default - 1000 + effective_sample_size = sample_size if sample_size else config_sample_size + + logger.info(f"Running `get_neo4j_schema` with sample size {effective_sample_size}.") + + get_schema_query = f"CALL apoc.meta.schema({{sample: {effective_sample_size}}}) YIELD value RETURN value" + def clean_schema(schema: dict) -> dict: cleaned = {} @@ -132,16 +143,16 @@ def clean_schema(schema: dict) -> dict: return cleaned try: - results_json_str = await neo4j_driver.execute_query( + results_json = await neo4j_driver.execute_query( get_schema_query, routing_control=RoutingControl.READ, database_=database, result_transformer_=lambda r: r.data(), ) + + logger.debug(f"Read query returned {len(results_json)} rows") - logger.debug(f"Read query returned {len(results_json_str)} rows") - - schema_clean = clean_schema(results_json_str[0].get("value")) + schema_clean = clean_schema(results_json[0].get("value")) schema_clean_str = json.dumps(schema_clean, default=str) @@ -275,6 +286,7 @@ async def main( 
read_timeout: int = 30, token_limit: Optional[int] = None, read_only: bool = False, + schema_sample_size: Optional[int] = None, # this is known as the config_sample_size in the create_mcp_server function ) -> None: logger.info("Starting MCP neo4j Server") @@ -296,7 +308,7 @@ async def main( ] mcp = create_mcp_server( - neo4j_driver, database, namespace, read_timeout, token_limit, read_only + neo4j_driver, database, namespace, read_timeout, token_limit, read_only, schema_sample_size ) # Run the server with the specified transport diff --git a/servers/mcp-neo4j-cypher/src/mcp_neo4j_cypher/utils.py b/servers/mcp-neo4j-cypher/src/mcp_neo4j_cypher/utils.py index d813471..a177307 100644 --- a/servers/mcp-neo4j-cypher/src/mcp_neo4j_cypher/utils.py +++ b/servers/mcp-neo4j-cypher/src/mcp_neo4j_cypher/utils.py @@ -296,6 +296,30 @@ def process_config(args: argparse.Namespace) -> dict[str, Union[str, int, None]] ) config["read_only"] = False + # parse schema sample size + if args.schema_sample_size is not None: + config["schema_sample_size"] = args.schema_sample_size + logger.info( + f"Info: Default sample size set to {config['schema_sample_size']} via command line argument." + ) + else: + if os.getenv("NEO4J_SCHEMA_SAMPLE_SIZE") is not None: + try: + config["schema_sample_size"] = int(os.getenv("NEO4J_SCHEMA_SAMPLE_SIZE")) + logger.info( + f"Info: Default sample size set to {config['schema_sample_size']} via environment variable." + ) + except ValueError: + logger.warning( + "Warning: Invalid sample size provided in NEO4J_SCHEMA_SAMPLE_SIZE environment variable. No default sample will be used." + ) + config["schema_sample_size"] = 1000 + else: + logger.info( + "Info: No default sample size provided. Schema operations will scan entire graph unless explicitly specified." 
+ ) + config["schema_sample_size"] = 1000 + return config diff --git a/servers/mcp-neo4j-cypher/tests/unit/test_utils.py b/servers/mcp-neo4j-cypher/tests/unit/test_utils.py index 57ea70b..f634294 100644 --- a/servers/mcp-neo4j-cypher/tests/unit/test_utils.py +++ b/servers/mcp-neo4j-cypher/tests/unit/test_utils.py @@ -31,6 +31,7 @@ def clean_env(): "NEO4J_READ_TIMEOUT", "NEO4J_RESPONSE_TOKEN_LIMIT", "NEO4J_READ_ONLY", + "NEO4J_SCHEMA_SAMPLE_SIZE", ] # Store original values original_values = {} @@ -66,6 +67,7 @@ def _create_args(**kwargs): "read_timeout": None, "token_limit": None, "read_only": None, + "schema_sample_size": None, } defaults.update(kwargs) return argparse.Namespace(**defaults) @@ -741,4 +743,43 @@ def test_read_only_defaults_and_precedence(clean_env, args_factory): # When CLI flag is absent (False), env var is used os.environ["NEO4J_READ_ONLY"] = "true" - assert process_config(args_factory(read_only=False))["read_only"] is True + assert process_config(args_factory())["read_only"] is True + + +def test_sample_cli_args(clean_env, args_factory): + """Test sample configuration via CLI arguments.""" + assert process_config(args_factory(sample=1000))["schema_sample_size"] == 1000 + assert process_config(args_factory(sample=500))["schema_sample_size"] == 500 + assert process_config(args_factory(sample=0))["schema_sample_size"] == 0 + + +def test_sample_env_vars(clean_env, args_factory): + """Test sample configuration via environment variables.""" + os.environ["NEO4J_SCHEMA_SAMPLE_SIZE"] = "2000" + assert process_config(args_factory())["schema_sample_size"] == 2000 + + os.environ["NEO4J_SCHEMA_SAMPLE_SIZE"] = "100" + assert process_config(args_factory())["schema_sample_size"] == 100 + + +def test_sample_defaults(clean_env, args_factory): + """Test sample defaults when not provided.""" + assert process_config(args_factory())["schema_sample_size"] is None + + +def test_sample_cli_overrides_env(clean_env, args_factory): + """Test that CLI arguments override 
environment variables for sample.""" + os.environ["NEO4J_SCHEMA_SAMPLE_SIZE"] = "1000" + assert process_config(args_factory(sample=500))["schema_sample_size"] == 500 + + +def test_sample_invalid_env_var(clean_env, args_factory, mock_logger): + """Test sample with invalid environment variable value.""" + os.environ["NEO4J_SCHEMA_SAMPLE_SIZE"] = "not_a_number" + config = process_config(args_factory()) + + # Should default to None and log warning + assert config["sa"] is None + mock_logger.warning.assert_called_with( + "Warning: Invalid sample size provided in NEO4J_SCHEMA_SAMPLE_SIZE environment variable. No default sample will be used." + )