Skip to content

Commit 70cfaf7

Browse files
committed
fix: MCP empty params schema, paddleocr validation, and annotation config parity
- MCP parameterless tools (cache_stats, cache_clear) now emit valid inputSchema with type: "object" instead of const: null (#406)
- Python get_valid_ocr_backends() unconditionally includes paddleocr
- TypeScript E2E generator maps extract_annotations in mapPdfConfig()
- PHP PdfConfig adds extractAnnotations and margin fraction fields
1 parent cee1717 commit 70cfaf7

File tree

8 files changed

+97
-18
lines changed

8 files changed

+97
-18
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
2727
- Node.js NAPI-RS binding correctly exposes `annotations` field on `ExtractionResult`
2828
- Python output format validation tests updated to reflect `json` as a valid format (alias for `structured`)
2929
- XLSX extraction with `output_format="markdown"` now produces markdown tables instead of plain text (#405)
30+
- MCP tools with no parameters (`cache_stats`, `cache_clear`) now emit valid `inputSchema` with `{"type": "object", "properties": {}}` instead of `{"const": null}`, fixing Claude Code and other MCP clients that validate schema type (#406)
31+
- Python `get_valid_ocr_backends()` now unconditionally includes `paddleocr` in the returned list, matching all other language bindings
32+
- TypeScript E2E test generator now maps `extract_annotations` to `extractAnnotations` in `mapPdfConfig()`, fixing annotation assertion failures
33+
- PHP `PdfConfig` now includes `extractAnnotations`, `topMarginFraction`, and `bottomMarginFraction` fields, restoring parity with the Rust core config
3034

3135
---
3236

crates/kreuzberg-py/src/validation.rs

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -166,14 +166,11 @@ pub fn get_valid_language_codes() -> PyResult<Vec<String>> {
166166
/// list[str]: List of valid OCR backend names
167167
#[pyfunction]
168168
pub fn get_valid_ocr_backends() -> PyResult<Vec<String>> {
169-
let mut backends = vec!["tesseract".to_string(), "easyocr".to_string()];
170-
// Only advertise paddleocr if it's actually registered at runtime
171-
if let Ok(registered) = kreuzberg::plugins::list_ocr_backends()
172-
&& registered.iter().any(|b| b == "paddleocr")
173-
{
174-
backends.push("paddleocr".to_string());
175-
}
176-
Ok(backends)
169+
Ok(vec![
170+
"tesseract".to_string(),
171+
"easyocr".to_string(),
172+
"paddleocr".to_string(),
173+
])
177174
}
178175

179176
/// Get list of valid token reduction levels.

crates/kreuzberg/src/mcp/params.rs

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,13 @@ fn default_use_content() -> bool {
5454
true
5555
}
5656

57+
/// Empty parameters for tools that take no arguments.
58+
///
59+
/// This generates `{"type": "object", "properties": {}}` which is required by
60+
/// the MCP specification, unlike `()` which generates `{"const": null}`.
61+
#[derive(Debug, serde::Deserialize, serde::Serialize, schemars::JsonSchema)]
62+
pub struct EmptyParams {}
63+
5764
#[cfg(test)]
5865
mod tests {
5966
use super::*;
@@ -170,4 +177,17 @@ mod tests {
170177
assert_eq!(params.path, deserialized.path);
171178
assert_eq!(params.use_content, deserialized.use_content);
172179
}
180+
181+
#[test]
182+
fn test_empty_params_schema_has_type_object() {
183+
let schema = schemars::schema_for!(EmptyParams);
184+
let json = serde_json::to_value(&schema).unwrap();
185+
assert_eq!(json["type"], "object");
186+
}
187+
188+
#[test]
189+
fn test_empty_params_deserializes_from_empty_object() {
190+
let params: EmptyParams = serde_json::from_str("{}").unwrap();
191+
let _ = params;
192+
}
173193
}

crates/kreuzberg/src/mcp/server.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,10 @@ impl KreuzbergMcp {
193193
description = "Get cache statistics including total files, size, and available disk space.",
194194
annotations(title = "Cache Stats", read_only_hint = true, idempotent_hint = true)
195195
)]
196-
fn cache_stats(&self, Parameters(_): Parameters<()>) -> Result<CallToolResult, rmcp::ErrorData> {
196+
fn cache_stats(
197+
&self,
198+
Parameters(_): Parameters<super::params::EmptyParams>,
199+
) -> Result<CallToolResult, rmcp::ErrorData> {
197200
use super::errors::map_kreuzberg_error_to_mcp;
198201
use crate::cache;
199202

@@ -230,7 +233,10 @@ impl KreuzbergMcp {
230233
description = "Clear all cached files. Returns the number of files removed and space freed in MB.",
231234
annotations(title = "Clear Cache", destructive_hint = true)
232235
)]
233-
fn cache_clear(&self, Parameters(_): Parameters<()>) -> Result<CallToolResult, rmcp::ErrorData> {
236+
fn cache_clear(
237+
&self,
238+
Parameters(_): Parameters<super::params::EmptyParams>,
239+
) -> Result<CallToolResult, rmcp::ErrorData> {
234240
use super::errors::map_kreuzberg_error_to_mcp;
235241
use crate::cache;
236242

crates/kreuzberg/src/mcp/tools/cache.rs

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ pub(in crate::mcp) trait CacheTool {
1717
description = "Get cache statistics including total files, size, and available disk space.",
1818
annotations(title = "Cache Stats", read_only_hint = true, idempotent_hint = true)
1919
)]
20-
fn cache_stats(&self, Parameters(_): Parameters<()>) -> Result<CallToolResult, McpError> {
20+
fn cache_stats(&self, Parameters(_): Parameters<super::super::params::EmptyParams>) -> Result<CallToolResult, McpError> {
2121
let cache_dir = std::env::current_dir()
2222
.unwrap_or_else(|_| std::path::PathBuf::from("."))
2323
.join(".kreuzberg");
@@ -51,7 +51,7 @@ pub(in crate::mcp) trait CacheTool {
5151
description = "Clear all cached files. Returns the number of files removed and space freed in MB.",
5252
annotations(title = "Clear Cache", destructive_hint = true)
5353
)]
54-
fn cache_clear(&self, Parameters(_): Parameters<()>) -> Result<CallToolResult, McpError> {
54+
fn cache_clear(&self, Parameters(_): Parameters<super::super::params::EmptyParams>) -> Result<CallToolResult, McpError> {
5555
let cache_dir = std::env::current_dir()
5656
.unwrap_or_else(|_| std::path::PathBuf::from("."))
5757
.join(".kreuzberg");
@@ -86,7 +86,7 @@ mod tests {
8686
async fn test_cache_stats_returns_statistics() {
8787
let server = TestMcpServer;
8888

89-
let result = server.cache_stats(Parameters(()));
89+
let result = server.cache_stats(Parameters(super::super::params::EmptyParams {}));
9090

9191
assert!(result.is_ok());
9292
let call_result = result.unwrap();
@@ -110,7 +110,7 @@ mod tests {
110110
async fn test_cache_clear_returns_result() {
111111
let server = TestMcpServer;
112112

113-
let result = server.cache_clear(Parameters(()));
113+
let result = server.cache_clear(Parameters(super::super::params::EmptyParams {}));
114114

115115
assert!(result.is_ok());
116116
let call_result = result.unwrap();
@@ -133,18 +133,18 @@ mod tests {
133133
async fn test_cache_clear_is_idempotent() {
134134
let server = TestMcpServer;
135135

136-
let result1 = server.cache_clear(Parameters(()));
136+
let result1 = server.cache_clear(Parameters(super::super::params::EmptyParams {}));
137137
assert!(result1.is_ok());
138138

139-
let result2 = server.cache_clear(Parameters(()));
139+
let result2 = server.cache_clear(Parameters(super::super::params::EmptyParams {}));
140140
assert!(result2.is_ok());
141141
}
142142

143143
#[tokio::test]
144144
async fn test_cache_clear_returns_metrics() {
145145
let server = TestMcpServer;
146146

147-
let result = server.cache_clear(Parameters(()));
147+
let result = server.cache_clear(Parameters(super::super::params::EmptyParams {}));
148148

149149
assert!(result.is_ok());
150150
let call_result = result.unwrap();
@@ -160,7 +160,7 @@ mod tests {
160160
async fn test_cache_stats_returns_valid_data() {
161161
let server = TestMcpServer;
162162

163-
let result = server.cache_stats(Parameters(()));
163+
let result = server.cache_stats(Parameters(super::super::params::EmptyParams {}));
164164

165165
assert!(result.is_ok());
166166
let call_result = result.unwrap();

e2e/typescript/tests/helpers.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ function mapPdfConfig(raw: PlainRecord): PdfConfig {
153153
config.passwords = raw.passwords.filter((item: unknown): item is string => typeof item === "string");
154154
}
155155
assignBooleanField(config as PlainRecord, raw, "extract_metadata", "extractMetadata");
156+
assignBooleanField(config as PlainRecord, raw, "extract_annotations", "extractAnnotations");
156157
return config;
157158
}
158159

packages/php/src/Config/PdfConfig.php

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,33 @@ public function __construct(
5555
* @default null
5656
*/
5757
public ?HierarchyConfig $hierarchy = null,
58+
59+
/**
60+
* Extract PDF annotations (text notes, highlights, links, stamps).
61+
*
62+
* When enabled, annotations embedded in PDFs are extracted and included
63+
* in the extraction results.
64+
*
65+
* @var bool
66+
* @default false
67+
*/
68+
public bool $extractAnnotations = false,
69+
70+
/**
71+
* Top margin fraction (0.0-1.0) of page height to exclude headers/running heads.
72+
*
73+
* @var float|null
74+
* @default null
75+
*/
76+
public ?float $topMarginFraction = null,
77+
78+
/**
79+
* Bottom margin fraction (0.0-1.0) of page height to exclude footers/page numbers.
80+
*
81+
* @var float|null
82+
* @default null
83+
*/
84+
public ?float $bottomMarginFraction = null,
5885
) {
5986
}
6087

@@ -92,11 +119,31 @@ public static function fromArray(array $data): self
92119
$hierarchy = HierarchyConfig::fromArray($hierarchyData);
93120
}
94121

122+
/** @var bool $extractAnnotations */
123+
$extractAnnotations = $data['extract_annotations'] ?? false;
124+
if (!is_bool($extractAnnotations)) {
125+
/** @var bool $extractAnnotations */
126+
$extractAnnotations = (bool) $extractAnnotations;
127+
}
128+
129+
$topMarginFraction = null;
130+
if (isset($data['top_margin_fraction']) && is_numeric($data['top_margin_fraction'])) {
131+
$topMarginFraction = (float) $data['top_margin_fraction'];
132+
}
133+
134+
$bottomMarginFraction = null;
135+
if (isset($data['bottom_margin_fraction']) && is_numeric($data['bottom_margin_fraction'])) {
136+
$bottomMarginFraction = (float) $data['bottom_margin_fraction'];
137+
}
138+
95139
return new self(
96140
extractImages: $extractImages,
97141
passwords: $passwords,
98142
extractMetadata: $extractMetadata,
99143
hierarchy: $hierarchy,
144+
extractAnnotations: $extractAnnotations,
145+
topMarginFraction: $topMarginFraction,
146+
bottomMarginFraction: $bottomMarginFraction,
100147
);
101148
}
102149

@@ -141,6 +188,9 @@ public function toArray(): array
141188
'passwords' => $this->passwords,
142189
'extract_metadata' => $this->extractMetadata,
143190
'hierarchy' => $this->hierarchy?->toArray(),
191+
'extract_annotations' => $this->extractAnnotations,
192+
'top_margin_fraction' => $this->topMarginFraction,
193+
'bottom_margin_fraction' => $this->bottomMarginFraction,
144194
], static fn ($value): bool => $value !== null);
145195
}
146196

tools/e2e-generator/src/typescript.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,7 @@ function mapPdfConfig(raw: PlainRecord): PdfConfig {
156156
config.passwords = raw.passwords.filter((item: unknown): item is string => typeof item === "string");
157157
}
158158
assignBooleanField(config as PlainRecord, raw, "extract_metadata", "extractMetadata");
159+
assignBooleanField(config as PlainRecord, raw, "extract_annotations", "extractAnnotations");
159160
return config;
160161
}
161162

0 commit comments

Comments
 (0)