fix: MCP empty params schema, paddleocr validation, and annotation config parity

Goldziher · Goldziher · commit 70cfaf77f131 · 2026-02-20T08:18:39.000+01:00
- MCP parameterless tools (cache_stats, cache_clear) now emit valid inputSchema with type: "object" instead of const: null (#406)
- Python get_valid_ocr_backends() unconditionally includes paddleocr
- TypeScript E2E generator maps extract_annotations in mapPdfConfig()
- PHP PdfConfig adds extractAnnotations and margin fraction fields
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -27,6 +27,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Node.js NAPI-RS binding correctly exposes `annotations` field on `ExtractionResult`
 - Python output format validation tests updated to reflect `json` as a valid format (alias for `structured`)
 - XLSX extraction with `output_format="markdown"` now produces markdown tables instead of plain text (#405)
+- MCP tools with no parameters (`cache_stats`, `cache_clear`) now emit valid `inputSchema` with `{"type": "object", "properties": {}}` instead of `{"const": null}`, fixing Claude Code and other MCP clients that validate schema type (#406)
+- Python `get_valid_ocr_backends()` now unconditionally includes `paddleocr` in the returned list, matching all other language bindings
+- TypeScript E2E test generator now maps `extract_annotations` to `extractAnnotations` in `mapPdfConfig()`, fixing annotation assertion failures
+- PHP `PdfConfig` now includes `extractAnnotations`, `topMarginFraction`, and `bottomMarginFraction` fields, restoring parity with the Rust core config
 
 ---
 
diff --git a/crates/kreuzberg-py/src/validation.rs b/crates/kreuzberg-py/src/validation.rs
@@ -166,14 +166,11 @@ pub fn get_valid_language_codes() -> PyResult<Vec<String>> {
 ///     list[str]: List of valid OCR backend names
 #[pyfunction]
 pub fn get_valid_ocr_backends() -> PyResult<Vec<String>> {
-    let mut backends = vec!["tesseract".to_string(), "easyocr".to_string()];
-    // Only advertise paddleocr if it's actually registered at runtime
-    if let Ok(registered) = kreuzberg::plugins::list_ocr_backends()
-        && registered.iter().any(|b| b == "paddleocr")
-    {
-        backends.push("paddleocr".to_string());
-    }
-    Ok(backends)
+    Ok(vec![
+        "tesseract".to_string(),
+        "easyocr".to_string(),
+        "paddleocr".to_string(),
+    ])
 }
 
 /// Get list of valid token reduction levels.
diff --git a/crates/kreuzberg/src/mcp/params.rs b/crates/kreuzberg/src/mcp/params.rs
@@ -54,6 +54,13 @@ fn default_use_content() -> bool {
     true
 }
 
+/// Empty parameters for tools that take no arguments.
+///
+/// This generates `{"type": "object", "properties": {}}` which is required by
+/// the MCP specification, unlike `()` which generates `{"const": null}`.
+#[derive(Debug, serde::Deserialize, serde::Serialize, schemars::JsonSchema)]
+pub struct EmptyParams {}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -170,4 +177,17 @@ mod tests {
         assert_eq!(params.path, deserialized.path);
         assert_eq!(params.use_content, deserialized.use_content);
     }
+
+    #[test]
+    fn test_empty_params_schema_has_type_object() {
+        let schema = schemars::schema_for!(EmptyParams);
+        let json = serde_json::to_value(&schema).unwrap();
+        assert_eq!(json["type"], "object");
+    }
+
+    #[test]
+    fn test_empty_params_deserializes_from_empty_object() {
+        let params: EmptyParams = serde_json::from_str("{}").unwrap();
+        let _ = params;
+    }
 }
diff --git a/crates/kreuzberg/src/mcp/server.rs b/crates/kreuzberg/src/mcp/server.rs
@@ -193,7 +193,10 @@ impl KreuzbergMcp {
         description = "Get cache statistics including total files, size, and available disk space.",
         annotations(title = "Cache Stats", read_only_hint = true, idempotent_hint = true)
     )]
-    fn cache_stats(&self, Parameters(_): Parameters<()>) -> Result<CallToolResult, rmcp::ErrorData> {
+    fn cache_stats(
+        &self,
+        Parameters(_): Parameters<super::params::EmptyParams>,
+    ) -> Result<CallToolResult, rmcp::ErrorData> {
         use super::errors::map_kreuzberg_error_to_mcp;
         use crate::cache;
 
@@ -230,7 +233,10 @@ impl KreuzbergMcp {
         description = "Clear all cached files. Returns the number of files removed and space freed in MB.",
         annotations(title = "Clear Cache", destructive_hint = true)
     )]
-    fn cache_clear(&self, Parameters(_): Parameters<()>) -> Result<CallToolResult, rmcp::ErrorData> {
+    fn cache_clear(
+        &self,
+        Parameters(_): Parameters<super::params::EmptyParams>,
+    ) -> Result<CallToolResult, rmcp::ErrorData> {
         use super::errors::map_kreuzberg_error_to_mcp;
         use crate::cache;
 
diff --git a/crates/kreuzberg/src/mcp/tools/cache.rs b/crates/kreuzberg/src/mcp/tools/cache.rs
@@ -17,7 +17,7 @@ pub(in crate::mcp) trait CacheTool {
         description = "Get cache statistics including total files, size, and available disk space.",
         annotations(title = "Cache Stats", read_only_hint = true, idempotent_hint = true)
     )]
-    fn cache_stats(&self, Parameters(_): Parameters<()>) -> Result<CallToolResult, McpError> {
+    fn cache_stats(&self, Parameters(_): Parameters<super::super::params::EmptyParams>) -> Result<CallToolResult, McpError> {
         let cache_dir = std::env::current_dir()
             .unwrap_or_else(|_| std::path::PathBuf::from("."))
             .join(".kreuzberg");
@@ -51,7 +51,7 @@ pub(in crate::mcp) trait CacheTool {
         description = "Clear all cached files. Returns the number of files removed and space freed in MB.",
         annotations(title = "Clear Cache", destructive_hint = true)
     )]
-    fn cache_clear(&self, Parameters(_): Parameters<()>) -> Result<CallToolResult, McpError> {
+    fn cache_clear(&self, Parameters(_): Parameters<super::super::params::EmptyParams>) -> Result<CallToolResult, McpError> {
         let cache_dir = std::env::current_dir()
             .unwrap_or_else(|_| std::path::PathBuf::from("."))
             .join(".kreuzberg");
@@ -86,7 +86,7 @@ mod tests {
     async fn test_cache_stats_returns_statistics() {
         let server = TestMcpServer;
 
-        let result = server.cache_stats(Parameters(()));
+        let result = server.cache_stats(Parameters(super::super::params::EmptyParams {}));
 
         assert!(result.is_ok());
         let call_result = result.unwrap();
@@ -110,7 +110,7 @@ mod tests {
     async fn test_cache_clear_returns_result() {
         let server = TestMcpServer;
 
-        let result = server.cache_clear(Parameters(()));
+        let result = server.cache_clear(Parameters(super::super::params::EmptyParams {}));
 
         assert!(result.is_ok());
         let call_result = result.unwrap();
@@ -133,18 +133,18 @@ mod tests {
     async fn test_cache_clear_is_idempotent() {
         let server = TestMcpServer;
 
-        let result1 = server.cache_clear(Parameters(()));
+        let result1 = server.cache_clear(Parameters(super::super::params::EmptyParams {}));
         assert!(result1.is_ok());
 
-        let result2 = server.cache_clear(Parameters(()));
+        let result2 = server.cache_clear(Parameters(super::super::params::EmptyParams {}));
         assert!(result2.is_ok());
     }
 
     #[tokio::test]
     async fn test_cache_clear_returns_metrics() {
         let server = TestMcpServer;
 
-        let result = server.cache_clear(Parameters(()));
+        let result = server.cache_clear(Parameters(super::super::params::EmptyParams {}));
 
         assert!(result.is_ok());
         let call_result = result.unwrap();
@@ -160,7 +160,7 @@ mod tests {
     async fn test_cache_stats_returns_valid_data() {
         let server = TestMcpServer;
 
-        let result = server.cache_stats(Parameters(()));
+        let result = server.cache_stats(Parameters(super::super::params::EmptyParams {}));
 
         assert!(result.is_ok());
         let call_result = result.unwrap();
diff --git a/e2e/typescript/tests/helpers.ts b/e2e/typescript/tests/helpers.ts
@@ -153,6 +153,7 @@ function mapPdfConfig(raw: PlainRecord): PdfConfig {
 		config.passwords = raw.passwords.filter((item: unknown): item is string => typeof item === "string");
 	}
 	assignBooleanField(config as PlainRecord, raw, "extract_metadata", "extractMetadata");
+	assignBooleanField(config as PlainRecord, raw, "extract_annotations", "extractAnnotations");
 	return config;
 }
 
diff --git a/packages/php/src/Config/PdfConfig.php b/packages/php/src/Config/PdfConfig.php
@@ -55,6 +55,33 @@ public function __construct(
          * @default null
          */
         public ?HierarchyConfig $hierarchy = null,
+
+        /**
+         * Extract PDF annotations (text notes, highlights, links, stamps).
+         *
+         * When enabled, annotations embedded in PDFs are extracted and included
+         * in the extraction results.
+         *
+         * @var bool
+         * @default false
+         */
+        public bool $extractAnnotations = false,
+
+        /**
+         * Top margin fraction (0.0-1.0) of page height to exclude headers/running heads.
+         *
+         * @var float|null
+         * @default null
+         */
+        public ?float $topMarginFraction = null,
+
+        /**
+         * Bottom margin fraction (0.0-1.0) of page height to exclude footers/page numbers.
+         *
+         * @var float|null
+         * @default null
+         */
+        public ?float $bottomMarginFraction = null,
     ) {
     }
 
@@ -92,11 +119,31 @@ public static function fromArray(array $data): self
             $hierarchy = HierarchyConfig::fromArray($hierarchyData);
         }
 
+        /** @var bool $extractAnnotations */
+        $extractAnnotations = $data['extract_annotations'] ?? false;
+        if (!is_bool($extractAnnotations)) {
+            /** @var bool $extractAnnotations */
+            $extractAnnotations = (bool) $extractAnnotations;
+        }
+
+        $topMarginFraction = null;
+        if (isset($data['top_margin_fraction']) && is_numeric($data['top_margin_fraction'])) {
+            $topMarginFraction = (float) $data['top_margin_fraction'];
+        }
+
+        $bottomMarginFraction = null;
+        if (isset($data['bottom_margin_fraction']) && is_numeric($data['bottom_margin_fraction'])) {
+            $bottomMarginFraction = (float) $data['bottom_margin_fraction'];
+        }
+
         return new self(
             extractImages: $extractImages,
             passwords: $passwords,
             extractMetadata: $extractMetadata,
             hierarchy: $hierarchy,
+            extractAnnotations: $extractAnnotations,
+            topMarginFraction: $topMarginFraction,
+            bottomMarginFraction: $bottomMarginFraction,
         );
     }
 
@@ -141,6 +188,9 @@ public function toArray(): array
             'passwords' => $this->passwords,
             'extract_metadata' => $this->extractMetadata,
             'hierarchy' => $this->hierarchy?->toArray(),
+            'extract_annotations' => $this->extractAnnotations,
+            'top_margin_fraction' => $this->topMarginFraction,
+            'bottom_margin_fraction' => $this->bottomMarginFraction,
         ], static fn ($value): bool => $value !== null);
     }
 
diff --git a/tools/e2e-generator/src/typescript.rs b/tools/e2e-generator/src/typescript.rs
@@ -156,6 +156,7 @@ function mapPdfConfig(raw: PlainRecord): PdfConfig {
         config.passwords = raw.passwords.filter((item: unknown): item is string => typeof item === "string");
     }
     assignBooleanField(config as PlainRecord, raw, "extract_metadata", "extractMetadata");
+    assignBooleanField(config as PlainRecord, raw, "extract_annotations", "extractAnnotations");
     return config;
 }
 

Original file line number	Diff line number	Diff line change
`@@ -153,6 +153,7 @@ function mapPdfConfig(raw: PlainRecord): PdfConfig {`
`153`	`153`	`config.passwords = raw.passwords.filter((item: unknown): item is string => typeof item === "string");`
`154`	`154`	`}`
`155`	`155`	`assignBooleanField(config as PlainRecord, raw, "extract_metadata", "extractMetadata");`
	`156`	`+ assignBooleanField(config as PlainRecord, raw, "extract_annotations", "extractAnnotations");`
`156`	`157`	`return config;`
`157`	`158`	`}`
`158`	`159`
Original file line number	Diff line number	Diff line change
`@@ -156,6 +156,7 @@ function mapPdfConfig(raw: PlainRecord): PdfConfig {`
`156`	`156`	`config.passwords = raw.passwords.filter((item: unknown): item is string => typeof item === "string");`
`157`	`157`	`}`
`158`	`158`	`assignBooleanField(config as PlainRecord, raw, "extract_metadata", "extractMetadata");`
	`159`	`+ assignBooleanField(config as PlainRecord, raw, "extract_annotations", "extractAnnotations");`
`159`	`160`	`return config;`
`160`	`161`	`}`
`161`	`162`