neo4j-contrib · a-s-g93 · Nov 3, 2025 · Oct 29, 2025 · Oct 29, 2025 · Oct 30, 2025
diff --git a/servers/mcp-neo4j-cypher/CHANGELOG.md b/servers/mcp-neo4j-cypher/CHANGELOG.md
@@ -5,6 +5,7 @@
 ### Changed
 
 ### Added
+* Add `NEO4J_SCHEMA_SAMPLE_SIZE` env variable and `schema-sample-size` cli argument to configure the `get_neo4j_schema` sample size
 * Update write query detection to include `INSERT` in regex check
 
 ## v0.4.1

diff --git a/servers/mcp-neo4j-cypher/README.md b/servers/mcp-neo4j-cypher/README.md
@@ -44,8 +44,10 @@ The server offers these core tools:
 
 - `get_neo4j_schema`
   - Get a list of all nodes types in the graph database, their attributes with name, type and relationships to other node types
-  - No input required
+  - Input:
+    - `sample_size` (integer, optional): Number of nodes to sample for schema analysis. Overrides server default if provided.
   - Returns: JSON serialized list of node labels with two dictionaries: one for attributes and one for relationships
+  - **Performance**: Uses sampling by default (1000 nodes per label). Lower the value for faster analysis on large databases; set it to -1 to disable sampling and scan all nodes.
 
 ### 🏷️ Namespacing
 
@@ -105,6 +107,62 @@ When a response exceeds the token limit, it will be automatically truncated to f
 
 **Note**: Token limits only apply to `read_neo4j_cypher` responses. Schema queries and write operations return summary information and are not affected.
 
+#### 🔍 Schema Sampling
+
+Control the performance and scope of schema inspection by configuring the sample size used by the `get_neo4j_schema` tool:
+
+**Command Line:**
+```bash
+mcp-neo4j-cypher --schema-sample-size 1000  # Sample 1000 nodes per label
+```
+
+**Environment Variable:**
+```bash
+export NEO4J_SCHEMA_SAMPLE_SIZE=1000
+```
+
+**Docker:**
+```bash
+docker run -e NEO4J_SCHEMA_SAMPLE_SIZE=1000 mcp-neo4j-cypher:latest
+```
+
+The sample size controls how many nodes are examined when generating the database schema:
+
+- **Default**: `1000` nodes per label are sampled for schema analysis
+- **Performance**: Lower values (`100`, `500`) provide faster schema inspection on large databases
+- **Accuracy**: Higher values (`5000`, `10000`) provide more comprehensive schema coverage
+- **Full Scan**: Set to `-1` to examine all nodes (can be very slow on large databases)
+- **Per-Call Override**: The `get_neo4j_schema` tool accepts a `sample_size` parameter to override the server default
+
+**How Sampling Works** (via [APOC's apoc.meta.schema](https://neo4j.com/docs/apoc/current/overview/apoc.meta/apoc.meta.schema/)):
+
+- For each node label, a skip count is calculated: `totalNodesForLabel / sample ± 10%`
+- Every Nth node is examined based on the skip count
+- Higher sample numbers result in more nodes being examined
+- Results may vary between runs due to random sampling
+
+**Example Scenarios:**
+
+```bash
+# Fast schema inspection for large databases
+export NEO4J_SCHEMA_SAMPLE_SIZE=100
+
+# Balanced performance and accuracy (default)
+export NEO4J_SCHEMA_SAMPLE_SIZE=1000
+
+# Comprehensive schema analysis
+export NEO4J_SCHEMA_SAMPLE_SIZE=5000
+
+# Full database scan (use with caution on large databases)
+export NEO4J_SCHEMA_SAMPLE_SIZE=-1
+```
+
+**Performance Considerations:**
+
+- **Large Databases**: Use lower sample values (`100-500`) to prevent timeouts
+- **Development**: Higher sample values (`1000-5000`) for thorough schema understanding
+- **Production**: Balance between performance and schema completeness based on your use case
+
 ## 🏗️ Local Development & Deployment
 
 ### 🐳 Local Docker Development
@@ -407,6 +465,7 @@ docker run --rm -p 8000:8000 \
 | `NEO4J_RESPONSE_TOKEN_LIMIT`       | _(none)_                                | Maximum tokens for read query responses            |
 | `NEO4J_READ_TIMEOUT`               | `30`                                    | Timeout in seconds for read queries                |
 | `NEO4J_READ_ONLY`                  | `false`                                 | Allow only read-only queries (true/false)          |
+| `NEO4J_SCHEMA_SAMPLE_SIZE`                     | `1000`                                  | Number of nodes to sample for schema inspection (set to -1 for full scan) |
 
 ### 🌐 SSE Transport for Legacy Web Access
 

diff --git a/servers/mcp-neo4j-cypher/src/mcp_neo4j_cypher/__init__.py b/servers/mcp-neo4j-cypher/src/mcp_neo4j_cypher/__init__.py
@@ -43,6 +43,12 @@ def main():
         help="Allow only read-only queries (default: False)",
     )
     parser.add_argument("--token-limit", default=None, help="Response token limit")
+    parser.add_argument(
+        "--schema-sample-size",
+        type=int,
+        default=None,
+        help="Default sample size for schema operations (default: 1000)",
+    )
 
     args = parser.parse_args()
     config = process_config(args)

diff --git a/servers/mcp-neo4j-cypher/src/mcp_neo4j_cypher/server.py b/servers/mcp-neo4j-cypher/src/mcp_neo4j_cypher/server.py
@@ -44,6 +44,7 @@ def create_mcp_server(
     read_timeout: int = 30,
     token_limit: Optional[int] = None,
     read_only: bool = False,
+    config_sample_size: int = 1000,
 ) -> FastMCP:
     mcp: FastMCP = FastMCP(
         "mcp-neo4j-cypher", dependencies=["neo4j", "pydantic"], stateless_http=True
@@ -62,16 +63,26 @@ def create_mcp_server(
             openWorldHint=True,
         ),
     )
-    async def get_neo4j_schema() -> list[ToolResult]:
-        """
-        List all nodes, their attributes and their relationships to other nodes in the neo4j database.
-        This requires that the APOC plugin is installed and enabled.
+    async def get_neo4j_schema(sample_size: int = Field(default=config_sample_size, description="The sample size used to infer the graph schema. Larger samples are slower, but more accurate. Smaller samples are faster, but might miss information.")) -> list[ToolResult]:
         """
+        Returns nodes, their properties (with types and indexed flags), and relationships
+        using APOC's schema inspection.
+
+        You should only provide a `sample_size` value if requested by the user, or tuning the retrieval performance.
 
-        get_schema_query = """
-        CALL apoc.meta.schema();
+        Performance Notes:
+            - If `sample_size` is not provided, uses the server's default sample setting defined in the server configuration.
+            - If retrieving the schema times out, try lowering the sample size, e.g. `sample_size=100`.
+            - To sample the entire graph use `sample_size=-1`.
         """
 
+        # Use provided sample_size, otherwise fall back to server default - 1000
+        effective_sample_size = sample_size if sample_size else config_sample_size
+
+        logger.info(f"Running `get_neo4j_schema` with sample size {effective_sample_size}.")
+
+        get_schema_query = f"CALL apoc.meta.schema({{sample: {effective_sample_size}}}) YIELD value RETURN value"
+
         def clean_schema(schema: dict) -> dict:
             cleaned = {}
 
@@ -132,16 +143,16 @@ def clean_schema(schema: dict) -> dict:
             return cleaned
 
         try:
-            results_json_str = await neo4j_driver.execute_query(
+            results_json = await neo4j_driver.execute_query(
                 get_schema_query,
                 routing_control=RoutingControl.READ,
                 database_=database,
                 result_transformer_=lambda r: r.data(),
             )
+
+            logger.debug(f"Read query returned {len(results_json)} rows")
 
-            logger.debug(f"Read query returned {len(results_json_str)} rows")
-
-            schema_clean = clean_schema(results_json_str[0].get("value"))
+            schema_clean = clean_schema(results_json[0].get("value"))
 
             schema_clean_str = json.dumps(schema_clean, default=str)
 
@@ -275,6 +286,7 @@ async def main(
     read_timeout: int = 30,
     token_limit: Optional[int] = None,
     read_only: bool = False,
+    schema_sample_size: Optional[int] = None, # this is known as the config_sample_size in the create_mcp_server function
 ) -> None:
     logger.info("Starting MCP neo4j Server")
 
@@ -296,7 +308,7 @@ async def main(
     ]
 
     mcp = create_mcp_server(
-        neo4j_driver, database, namespace, read_timeout, token_limit, read_only
+        neo4j_driver, database, namespace, read_timeout, token_limit, read_only, schema_sample_size
     )
 
     # Run the server with the specified transport

diff --git a/servers/mcp-neo4j-cypher/src/mcp_neo4j_cypher/utils.py b/servers/mcp-neo4j-cypher/src/mcp_neo4j_cypher/utils.py
@@ -296,6 +296,30 @@ def process_config(args: argparse.Namespace) -> dict[str, Union[str, int, None]]
         )
         config["read_only"] = False
 
+    # Resolve schema sample size. Precedence: CLI argument > NEO4J_SCHEMA_SAMPLE_SIZE env variable > default of 1000.
+    if args.schema_sample_size is not None:
+        config["schema_sample_size"] = args.schema_sample_size
+        logger.info(
+            f"Info: Default sample size set to {config['schema_sample_size']} via command line argument."
+        )
+    else:
+        if os.getenv("NEO4J_SCHEMA_SAMPLE_SIZE") is not None:
+            try:
+                config["schema_sample_size"] = int(os.getenv("NEO4J_SCHEMA_SAMPLE_SIZE"))
+                logger.info(
+                    f"Info: Default sample size set to {config['schema_sample_size']} via environment variable."
+                )
+            except ValueError:
+                logger.warning(
+                    "Warning: Invalid sample size provided in NEO4J_SCHEMA_SAMPLE_SIZE environment variable. No default sample will be used."
+                )
+                config["schema_sample_size"] = 1000
+        else:
+            logger.info(
+                "Info: No default sample size provided. Schema operations will use the default sample size of 1000 unless explicitly specified."
+            )
+            config["schema_sample_size"] = 1000
+
+
     return config
 
 

diff --git a/servers/mcp-neo4j-cypher/tests/unit/test_utils.py b/servers/mcp-neo4j-cypher/tests/unit/test_utils.py
@@ -31,6 +31,7 @@ def clean_env():
         "NEO4J_READ_TIMEOUT",
         "NEO4J_RESPONSE_TOKEN_LIMIT",
         "NEO4J_READ_ONLY",
+        "NEO4J_SCHEMA_SAMPLE_SIZE",
     ]
     # Store original values
     original_values = {}
@@ -66,6 +67,7 @@ def _create_args(**kwargs):
             "read_timeout": None,
             "token_limit": None,
             "read_only": None,
+            "schema_sample_size": None,
         }
         defaults.update(kwargs)
         return argparse.Namespace(**defaults)
@@ -741,4 +743,43 @@ def test_read_only_defaults_and_precedence(clean_env, args_factory):
 
     # When CLI flag is absent (False), env var is used
     os.environ["NEO4J_READ_ONLY"] = "true"
-    assert process_config(args_factory(read_only=False))["read_only"] is True
+    assert process_config(args_factory())["read_only"] is True
+
+
+def test_sample_cli_args(clean_env, args_factory):
+    """Test that a `schema_sample_size` CLI value is passed through to the config unchanged, including 0."""
+    assert process_config(args_factory(schema_sample_size=1000))["schema_sample_size"] == 1000
+    assert process_config(args_factory(schema_sample_size=500))["schema_sample_size"] == 500
+    assert process_config(args_factory(schema_sample_size=0))["schema_sample_size"] == 0
+
+
+def test_sample_env_vars(clean_env, args_factory):
+    """Check that NEO4J_SCHEMA_SAMPLE_SIZE is read and converted to an int when no CLI value is given."""
+    env_cases = (("2000", 2000), ("100", 100))
+
+    for raw_value, expected in env_cases:
+        os.environ["NEO4J_SCHEMA_SAMPLE_SIZE"] = raw_value
+        assert process_config(args_factory())["schema_sample_size"] == expected
+
+
+def test_sample_defaults(clean_env, args_factory):
+    """Test that the sample size falls back to 1000 when neither CLI nor env provides a value."""
+    assert process_config(args_factory())["schema_sample_size"] == 1000
+
+
+def test_sample_cli_overrides_env(clean_env, args_factory):
+    """Test that the `schema_sample_size` CLI argument overrides NEO4J_SCHEMA_SAMPLE_SIZE."""
+    os.environ["NEO4J_SCHEMA_SAMPLE_SIZE"] = "1000"
+    assert process_config(args_factory(schema_sample_size=500))["schema_sample_size"] == 500
+
+
+def test_sample_invalid_env_var(clean_env, args_factory, mock_logger):
+    """Test that a non-integer NEO4J_SCHEMA_SAMPLE_SIZE logs a warning and falls back to 1000."""
+    os.environ["NEO4J_SCHEMA_SAMPLE_SIZE"] = "not_a_number"
+    config = process_config(args_factory())
+
+    # process_config falls back to the built-in default of 1000 and logs a warning
+    assert config["schema_sample_size"] == 1000
+    mock_logger.warning.assert_called_with(
+        "Warning: Invalid sample size provided in NEO4J_SCHEMA_SAMPLE_SIZE environment variable. No default sample will be used."
+    )