Skip to content

Commit bf2ceae

Browse files
committed
fix: add extract_annotations and margin config to all bindings
- Go/Java/C#/Python/Node/WASM/Ruby/PHP: add extract_annotations, top_margin_fraction, bottom_margin_fraction to PdfConfig
- Python .pyi stubs: add new fields and constructor params
- Ruby: add PdfAnnotation type, annotations to Result, config fields
- PHP: add PdfAnnotation type class, annotations to ExtractionResult
- TypeScript: update core types, WASM types, test-utils config mapping
- WASM Deno: fix missing initWasm() in plugin-api test generator
- WASM: make unregisterOcrBackend graceful (no-throw on missing)
- Regenerate wasm-deno e2e tests
1 parent 312b44d commit bf2ceae

File tree

20 files changed

+309
-31
lines changed

20 files changed

+309
-31
lines changed

crates/kreuzberg-node/src/config/types.rs

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,9 @@ pub struct JsPdfConfig {
322322
pub passwords: Option<Vec<String>>,
323323
pub extract_metadata: Option<bool>,
324324
pub hierarchy: Option<JsHierarchyConfig>,
325+
pub extract_annotations: Option<bool>,
326+
pub top_margin_fraction: Option<f64>,
327+
pub bottom_margin_fraction: Option<f64>,
325328
}
326329

327330
impl From<JsPdfConfig> for RustPdfConfig {
@@ -331,9 +334,9 @@ impl From<JsPdfConfig> for RustPdfConfig {
331334
passwords: val.passwords,
332335
extract_metadata: val.extract_metadata.unwrap_or(true),
333336
hierarchy: val.hierarchy.map(|h| h.into()),
334-
extract_annotations: false,
335-
top_margin_fraction: None,
336-
bottom_margin_fraction: None,
337+
extract_annotations: val.extract_annotations.unwrap_or(false),
338+
top_margin_fraction: val.top_margin_fraction.map(|v| v as f32),
339+
bottom_margin_fraction: val.bottom_margin_fraction.map(|v| v as f32),
337340
}
338341
}
339342
}
@@ -1128,6 +1131,9 @@ impl TryFrom<ExtractionConfig> for JsExtractionConfig {
11281131
include_bbox: Some(h.include_bbox),
11291132
ocr_coverage_threshold: h.ocr_coverage_threshold.map(|v| v as f64),
11301133
}),
1134+
extract_annotations: Some(pdf.extract_annotations),
1135+
top_margin_fraction: pdf.top_margin_fraction.map(|v| v as f64),
1136+
bottom_margin_fraction: pdf.bottom_margin_fraction.map(|v| v as f64),
11311137
}),
11321138
token_reduction: val.token_reduction.map(|tr| JsTokenReductionConfig {
11331139
mode: Some(tr.mode),

crates/kreuzberg-node/typescript/types.ts

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,15 @@ export interface PdfConfig {
318318

319319
/** Hierarchy extraction configuration. */
320320
hierarchy?: HierarchyConfig;
321+
322+
/** Extract annotations from PDF pages. Default: false. */
323+
extractAnnotations?: boolean;
324+
325+
/** Top margin fraction (0.0-0.5) for filtering header content. */
326+
topMarginFraction?: number;
327+
328+
/** Bottom margin fraction (0.0-0.5) for filtering footer content. */
329+
bottomMarginFraction?: number;
321330
}
322331

323332
/**

crates/kreuzberg-py/src/config/types.rs

Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -878,22 +878,25 @@ pub struct PdfConfig {
878878
#[pymethods]
879879
impl PdfConfig {
880880
#[new]
881-
#[pyo3(signature = (extract_images=None, passwords=None, extract_metadata=None, hierarchy=None))]
881+
#[pyo3(signature = (extract_images=None, passwords=None, extract_metadata=None, hierarchy=None, extract_annotations=None, top_margin_fraction=None, bottom_margin_fraction=None))]
882882
fn new(
883883
extract_images: Option<bool>,
884884
passwords: Option<Vec<String>>,
885885
extract_metadata: Option<bool>,
886886
hierarchy: Option<HierarchyConfig>,
887+
extract_annotations: Option<bool>,
888+
top_margin_fraction: Option<f32>,
889+
bottom_margin_fraction: Option<f32>,
887890
) -> Self {
888891
Self {
889892
inner: kreuzberg::PdfConfig {
890893
extract_images: extract_images.unwrap_or(false),
891894
passwords,
892895
extract_metadata: extract_metadata.unwrap_or(true),
893896
hierarchy: hierarchy.map(|h| h.inner),
894-
extract_annotations: false,
895-
top_margin_fraction: None,
896-
bottom_margin_fraction: None,
897+
extract_annotations: extract_annotations.unwrap_or(false),
898+
top_margin_fraction,
899+
bottom_margin_fraction,
897900
},
898901
}
899902
}
@@ -938,6 +941,36 @@ impl PdfConfig {
938941
self.inner.hierarchy = value.map(|h| h.inner);
939942
}
940943

944+
#[getter]
945+
fn extract_annotations(&self) -> bool {
946+
self.inner.extract_annotations
947+
}
948+
949+
#[setter]
950+
fn set_extract_annotations(&mut self, value: bool) {
951+
self.inner.extract_annotations = value;
952+
}
953+
954+
#[getter]
955+
fn top_margin_fraction(&self) -> Option<f32> {
956+
self.inner.top_margin_fraction
957+
}
958+
959+
#[setter]
960+
fn set_top_margin_fraction(&mut self, value: Option<f32>) {
961+
self.inner.top_margin_fraction = value;
962+
}
963+
964+
#[getter]
965+
fn bottom_margin_fraction(&self) -> Option<f32> {
966+
self.inner.bottom_margin_fraction
967+
}
968+
969+
#[setter]
970+
fn set_bottom_margin_fraction(&mut self, value: Option<f32>) {
971+
self.inner.bottom_margin_fraction = value;
972+
}
973+
941974
fn __repr__(&self) -> String {
942975
format!(
943976
"PdfConfig(extract_images={}, extract_metadata={}, passwords={})",

crates/kreuzberg-wasm/typescript/ocr/registry.spec.ts

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -215,8 +215,8 @@ describe("OCR Registry", () => {
215215
expect(getOcrBackend("test")).toBeUndefined();
216216
});
217217

218-
it("should throw if backend not found", async () => {
219-
await expect(unregisterOcrBackend("nonexistent")).rejects.toThrow('OCR backend "nonexistent" is not registered');
218+
it("should silently succeed if backend not found", async () => {
219+
await unregisterOcrBackend("nonexistent");
220220
});
221221

222222
it("should call shutdown method if available", async () => {
@@ -258,22 +258,18 @@ describe("OCR Registry", () => {
258258
const backend = createMockBackend("Test");
259259
registerOcrBackend(backend);
260260

261-
await expect(unregisterOcrBackend("test")).rejects.toThrow("is not registered");
261+
await unregisterOcrBackend("test");
262262

263263
expect(getOcrBackend("Test")).toBe(backend);
264264
});
265265

266-
it("should throw error that lists available backends", async () => {
266+
it("should silently succeed when unregistering nonexistent with others registered", async () => {
267267
const backend = createMockBackend("available");
268268
registerOcrBackend(backend);
269269

270-
try {
271-
await unregisterOcrBackend("nonexistent");
272-
} catch (error) {
273-
if (error instanceof Error) {
274-
expect(error.message).toContain("available");
275-
}
276-
}
270+
await unregisterOcrBackend("nonexistent");
271+
272+
expect(getOcrBackend("available")).toBe(backend);
277273
});
278274
});
279275

crates/kreuzberg-wasm/typescript/ocr/registry.ts

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -125,9 +125,7 @@ export async function unregisterOcrBackend(name: string): Promise<void> {
125125
const backend = ocrBackendRegistry.get(name);
126126

127127
if (!backend) {
128-
throw new Error(
129-
`OCR backend "${name}" is not registered. Available backends: ${Array.from(ocrBackendRegistry.keys()).join(", ")}`,
130-
);
128+
return;
131129
}
132130

133131
if (typeof backend.shutdown === "function") {

crates/kreuzberg-wasm/typescript/types.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,12 @@ export interface PdfConfig {
302302
passwords?: string[];
303303
/** Whether to extract metadata */
304304
extractMetadata?: boolean;
305+
/** Whether to extract annotations from PDF */
306+
extractAnnotations?: boolean;
307+
/** Top margin fraction (0.0-0.5) for filtering header content */
308+
topMarginFraction?: number;
309+
/** Bottom margin fraction (0.0-0.5) for filtering footer content */
310+
bottomMarginFraction?: number;
305311
}
306312

307313
/**

e2e/wasm-deno/plugin-apis.test.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,16 @@ import {
1313
clearValidators,
1414
detectMimeFromBytes,
1515
getExtensionsForMime,
16+
initWasm,
1617
listOcrBackends,
1718
listPostProcessors,
1819
listValidators,
1920
unregisterOcrBackend,
2021
} from "npm:@kreuzberg/wasm@^4.0.0";
2122
import { assertEquals } from "@std/assert";
2223

24+
await initWasm();
25+
2326
// Configuration
2427

2528
Deno.test({ name: "Discover configuration from current or parent directories", ignore: true, fn() {} });

packages/csharp/Kreuzberg/Models.cs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2485,6 +2485,24 @@ public sealed class PdfConfig
24852485
/// </summary>
24862486
[JsonPropertyName("hierarchy")]
24872487
public HierarchyConfig? Hierarchy { get; init; }
2488+
2489+
/// <summary>
2490+
/// Whether to extract annotations from PDF documents.
2491+
/// </summary>
2492+
[JsonPropertyName("extract_annotations")]
2493+
public bool? ExtractAnnotations { get; init; }
2494+
2495+
/// <summary>
2496+
/// Top margin fraction (0.0-0.5) for filtering header content.
2497+
/// </summary>
2498+
[JsonPropertyName("top_margin_fraction")]
2499+
public float? TopMarginFraction { get; init; }
2500+
2501+
/// <summary>
2502+
/// Bottom margin fraction (0.0-0.5) for filtering footer content.
2503+
/// </summary>
2504+
[JsonPropertyName("bottom_margin_fraction")]
2505+
public float? BottomMarginFraction { get; init; }
24882506
}
24892507

24902508
/// <summary>

packages/go/v4/config_types.go

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -199,10 +199,13 @@ type FontConfig struct {
199199

200200
// PdfConfig exposes PDF-specific options.
201201
type PdfConfig struct {
202-
ExtractImages *bool `json:"extract_images,omitempty"`
203-
Passwords []string `json:"passwords,omitempty"`
204-
ExtractMetadata *bool `json:"extract_metadata,omitempty"`
205-
FontConfig *FontConfig `json:"font_config,omitempty"`
202+
ExtractImages *bool `json:"extract_images,omitempty"`
203+
Passwords []string `json:"passwords,omitempty"`
204+
ExtractMetadata *bool `json:"extract_metadata,omitempty"`
205+
FontConfig *FontConfig `json:"font_config,omitempty"`
206+
ExtractAnnotations *bool `json:"extract_annotations,omitempty"`
207+
TopMarginFraction *float64 `json:"top_margin_fraction,omitempty"`
208+
BottomMarginFraction *float64 `json:"bottom_margin_fraction,omitempty"`
206209
}
207210

208211
// HierarchyConfig controls PDF hierarchy extraction based on font sizes.

packages/java/src/main/java/dev/kreuzberg/config/PdfConfig.java

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ public final class PdfConfig {
1717
private final boolean extractMetadata;
1818
private final FontConfig fontConfig;
1919
private final HierarchyConfig hierarchyConfig;
20+
private final boolean extractAnnotations;
21+
private final Double topMarginFraction;
22+
private final Double bottomMarginFraction;
2023

2124
private PdfConfig(Builder builder) {
2225
this.extractImages = builder.extractImages;
@@ -26,6 +29,9 @@ private PdfConfig(Builder builder) {
2629
this.extractMetadata = builder.extractMetadata;
2730
this.fontConfig = builder.fontConfig;
2831
this.hierarchyConfig = builder.hierarchyConfig;
32+
this.extractAnnotations = builder.extractAnnotations;
33+
this.topMarginFraction = builder.topMarginFraction;
34+
this.bottomMarginFraction = builder.bottomMarginFraction;
2935
}
3036

3137
public static Builder builder() {
@@ -52,19 +58,38 @@ public HierarchyConfig getHierarchyConfig() {
5258
return hierarchyConfig;
5359
}
5460

61+
public boolean isExtractAnnotations() {
62+
return extractAnnotations;
63+
}
64+
65+
public Double getTopMarginFraction() {
66+
return topMarginFraction;
67+
}
68+
69+
public Double getBottomMarginFraction() {
70+
return bottomMarginFraction;
71+
}
72+
5573
public Map<String, Object> toMap() {
5674
Map<String, Object> map = new HashMap<>();
5775
map.put("extract_images", extractImages);
5876
if (passwords != null && !passwords.isEmpty()) {
5977
map.put("passwords", passwords);
6078
}
6179
map.put("extract_metadata", extractMetadata);
80+
map.put("extract_annotations", extractAnnotations);
6281
if (fontConfig != null) {
6382
map.put("font_config", fontConfig.toMap());
6483
}
6584
if (hierarchyConfig != null) {
6685
map.put("hierarchy", hierarchyConfig.toMap());
6786
}
87+
if (topMarginFraction != null) {
88+
map.put("top_margin_fraction", topMarginFraction);
89+
}
90+
if (bottomMarginFraction != null) {
91+
map.put("bottom_margin_fraction", bottomMarginFraction);
92+
}
6893
return map;
6994
}
7095

@@ -74,6 +99,9 @@ public static final class Builder {
7499
private boolean extractMetadata = true;
75100
private FontConfig fontConfig;
76101
private HierarchyConfig hierarchyConfig;
102+
private boolean extractAnnotations = false;
103+
private Double topMarginFraction;
104+
private Double bottomMarginFraction;
77105

78106
private Builder() {
79107
}
@@ -111,6 +139,21 @@ public Builder hierarchyConfig(HierarchyConfig hierarchyConfig) {
111139
return this;
112140
}
113141

142+
public Builder extractAnnotations(boolean extractAnnotations) {
143+
this.extractAnnotations = extractAnnotations;
144+
return this;
145+
}
146+
147+
public Builder topMarginFraction(Double topMarginFraction) {
148+
this.topMarginFraction = topMarginFraction;
149+
return this;
150+
}
151+
152+
public Builder bottomMarginFraction(Double bottomMarginFraction) {
153+
this.bottomMarginFraction = bottomMarginFraction;
154+
return this;
155+
}
156+
114157
public PdfConfig build() {
115158
return new PdfConfig(this);
116159
}
@@ -141,6 +184,18 @@ static PdfConfig fromMap(Map<String, Object> map) {
141184
if (extractMetadataValue instanceof Boolean) {
142185
builder.extractMetadata((Boolean) extractMetadataValue);
143186
}
187+
Object extractAnnotationsValue = map.get("extract_annotations");
188+
if (extractAnnotationsValue instanceof Boolean) {
189+
builder.extractAnnotations((Boolean) extractAnnotationsValue);
190+
}
191+
Object topMarginValue = map.get("top_margin_fraction");
192+
if (topMarginValue instanceof Number) {
193+
builder.topMarginFraction(((Number) topMarginValue).doubleValue());
194+
}
195+
Object bottomMarginValue = map.get("bottom_margin_fraction");
196+
if (bottomMarginValue instanceof Number) {
197+
builder.bottomMarginFraction(((Number) bottomMarginValue).doubleValue());
198+
}
144199
@SuppressWarnings("unchecked")
145200
Map<String, Object> fontConfigMap = map.get("font_config") instanceof Map
146201
? (Map<String, Object>) map.get("font_config")

0 commit comments

Comments
 (0)