Skip to content

Commit 0e8a989

Browse files
committed
fix: resolve CI failures across Ruby, PHP, Rust, Validate, and WASM
- Skip bindgen when no PDFium headers found, preserving pre-generated bindings
- Update WASM PDFium fallback version from 7442b to 7623
- Add x86_64-linux platform back to Ruby Gemfile.lock
- Fix PHP e2e generator to transform model objects to string format
- Make EmbeddingConfig::fromArray robust against array model input
- Fix Biome import ordering in TypeScript test-utils
1 parent 597242a commit 0e8a989

File tree

7 files changed

+59
-98
lines changed

7 files changed

+59
-98
lines changed

crates/kreuzberg-pdfium-render/build.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,15 @@ fn build_bindings_for_one_pdfium_release(release: &str) -> Result<(), BuildError
134134
}
135135
}
136136

137+
// If no header files found, skip binding generation and keep pre-generated file.
138+
if included_header_files.is_empty() {
139+
eprintln!(
140+
"cargo:warning=No header files found in include/{}/; skipping bindgen, using pre-generated bindings",
141+
release
142+
);
143+
return Ok(());
144+
}
145+
137146
let wrapper = included_header_files
138147
.iter()
139148
.map(|file_name| format!("#include \"{}\"", file_name))

crates/kreuzberg/build.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -192,9 +192,9 @@ fn get_latest_version(repo: &str) -> String {
192192

193193
if repo.contains("paulocoutinhox") {
194194
eprintln!(
195-
"cargo:warning=Failed to fetch latest PDFium WASM version from GitHub API, using fallback version 7442b"
195+
"cargo:warning=Failed to fetch latest PDFium WASM version from GitHub API, using fallback version 7623"
196196
);
197-
"7442b".to_string()
197+
"7623".to_string()
198198
} else if repo.contains("bblanchon") {
199199
eprintln!(
200200
"cargo:warning=Failed to fetch latest PDFium binaries version from GitHub API, using fallback version 7568"

e2e/php/tests/Helpers.php

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,18 @@ public static function buildConfig(?array $config): ?ExtractionConfig
4141
return null;
4242
}
4343

44+
// Transform embedding model from Rust object format to PHP string format.
45+
// Fixtures define model as {"type": "preset", "name": "balanced"} but
46+
// PHP's EmbeddingConfig expects just the preset name string "balanced".
47+
if (isset($config['chunking']['embedding']['model'])
48+
&& is_array($config['chunking']['embedding']['model'])
49+
) {
50+
$model = $config['chunking']['embedding']['model'];
51+
if (isset($model['type']) && $model['type'] === 'preset' && isset($model['name'])) {
52+
$config['chunking']['embedding']['model'] = $model['name'];
53+
}
54+
}
55+
4456
return ExtractionConfig::fromArray($config);
4557
}
4658

packages/php/src/Config/EmbeddingConfig.php

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,13 @@ public function __construct(
6868
*/
6969
public static function fromArray(array $data): self
7070
{
71-
/** @var string $model */
71+
/** @var string|array<string, string> $model */
7272
$model = $data['model'] ?? 'balanced';
73-
if (!is_string($model)) {
73+
if (is_array($model)) {
74+
// Handle Rust-format model: {"type": "preset", "name": "balanced"}
75+
/** @var string $model */
76+
$model = $model['name'] ?? 'balanced';
77+
} elseif (!is_string($model)) {
7478
/** @var string $model */
7579
$model = (string) $model;
7680
}

packages/ruby/Gemfile.lock

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,7 @@ PLATFORMS
163163
x86-linux-gnu
164164
x86-linux-musl
165165
x86_64-darwin
166+
x86_64-linux
166167
x86_64-linux-gnu
167168
x86_64-linux-musl
168169

packages/typescript/test-utils/src/config-mapping/build-config.ts

Lines changed: 17 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,5 @@
1-
import {
2-
assignBooleanField,
3-
assignNumberField,
4-
assignStringField,
5-
assignStringArrayField,
6-
} from "./field-mappers.js";
7-
import { type PlainRecord, isPlainRecord } from "./types.js";
1+
import { assignBooleanField, assignNumberField, assignStringArrayField, assignStringField } from "./field-mappers.js";
2+
import { isPlainRecord, type PlainRecord } from "./types.js";
83

94
/**
105
* Config types - these should match the types from @kreuzberg/node
@@ -79,18 +74,8 @@ export interface ExtractionConfig {
7974
function mapTesseractConfig(raw: PlainRecord): TesseractConfig {
8075
const config: TesseractConfig = {};
8176
assignNumberField(config as PlainRecord, raw, "psm", "psm");
82-
assignBooleanField(
83-
config as PlainRecord,
84-
raw,
85-
"enable_table_detection",
86-
"enableTableDetection",
87-
);
88-
assignStringField(
89-
config as PlainRecord,
90-
raw,
91-
"tessedit_char_whitelist",
92-
"tesseditCharWhitelist",
93-
);
77+
assignBooleanField(config as PlainRecord, raw, "enable_table_detection", "enableTableDetection");
78+
assignStringField(config as PlainRecord, raw, "tessedit_char_whitelist", "tesseditCharWhitelist");
9479
return config;
9580
}
9681

@@ -128,25 +113,10 @@ function mapChunkingConfig(raw: PlainRecord): ChunkingConfig {
128113
*/
129114
function mapImageExtractionConfig(raw: PlainRecord): ImageExtractionConfig {
130115
const config: ImageExtractionConfig = {};
131-
assignBooleanField(
132-
config as PlainRecord,
133-
raw,
134-
"extract_images",
135-
"extractImages",
136-
);
116+
assignBooleanField(config as PlainRecord, raw, "extract_images", "extractImages");
137117
assignNumberField(config as PlainRecord, raw, "target_dpi", "targetDpi");
138-
assignNumberField(
139-
config as PlainRecord,
140-
raw,
141-
"max_image_dimension",
142-
"maxImageDimension",
143-
);
144-
assignBooleanField(
145-
config as PlainRecord,
146-
raw,
147-
"auto_adjust_dpi",
148-
"autoAdjustDpi",
149-
);
118+
assignNumberField(config as PlainRecord, raw, "max_image_dimension", "maxImageDimension");
119+
assignBooleanField(config as PlainRecord, raw, "auto_adjust_dpi", "autoAdjustDpi");
150120
assignNumberField(config as PlainRecord, raw, "min_dpi", "minDpi");
151121
assignNumberField(config as PlainRecord, raw, "max_dpi", "maxDpi");
152122
return config;
@@ -157,19 +127,9 @@ function mapImageExtractionConfig(raw: PlainRecord): ImageExtractionConfig {
157127
*/
158128
function mapPdfConfig(raw: PlainRecord): PdfConfig {
159129
const config: PdfConfig = {};
160-
assignBooleanField(
161-
config as PlainRecord,
162-
raw,
163-
"extract_images",
164-
"extractImages",
165-
);
130+
assignBooleanField(config as PlainRecord, raw, "extract_images", "extractImages");
166131
assignStringArrayField(config as PlainRecord, raw, "passwords", "passwords");
167-
assignBooleanField(
168-
config as PlainRecord,
169-
raw,
170-
"extract_metadata",
171-
"extractMetadata",
172-
);
132+
assignBooleanField(config as PlainRecord, raw, "extract_metadata", "extractMetadata");
173133
return config;
174134
}
175135

@@ -179,12 +139,7 @@ function mapPdfConfig(raw: PlainRecord): PdfConfig {
179139
function mapTokenReductionConfig(raw: PlainRecord): TokenReductionConfig {
180140
const config: TokenReductionConfig = {};
181141
assignStringField(config as PlainRecord, raw, "mode", "mode");
182-
assignBooleanField(
183-
config as PlainRecord,
184-
raw,
185-
"preserve_important_words",
186-
"preserveImportantWords",
187-
);
142+
assignBooleanField(config as PlainRecord, raw, "preserve_important_words", "preserveImportantWords");
188143
return config;
189144
}
190145

@@ -194,18 +149,8 @@ function mapTokenReductionConfig(raw: PlainRecord): TokenReductionConfig {
194149
function mapLanguageDetectionConfig(raw: PlainRecord): LanguageDetectionConfig {
195150
const config: LanguageDetectionConfig = {};
196151
assignBooleanField(config as PlainRecord, raw, "enabled", "enabled");
197-
assignNumberField(
198-
config as PlainRecord,
199-
raw,
200-
"min_confidence",
201-
"minConfidence",
202-
);
203-
assignBooleanField(
204-
config as PlainRecord,
205-
raw,
206-
"detect_multiple",
207-
"detectMultiple",
208-
);
152+
assignNumberField(config as PlainRecord, raw, "min_confidence", "minConfidence");
153+
assignBooleanField(config as PlainRecord, raw, "detect_multiple", "detectMultiple");
209154
return config;
210155
}
211156

@@ -215,18 +160,8 @@ function mapLanguageDetectionConfig(raw: PlainRecord): LanguageDetectionConfig {
215160
function mapPostProcessorConfig(raw: PlainRecord): PostProcessorConfig {
216161
const config: PostProcessorConfig = {};
217162
assignBooleanField(config as PlainRecord, raw, "enabled", "enabled");
218-
assignStringArrayField(
219-
config as PlainRecord,
220-
raw,
221-
"enabled_processors",
222-
"enabledProcessors",
223-
);
224-
assignStringArrayField(
225-
config as PlainRecord,
226-
raw,
227-
"disabled_processors",
228-
"disabledProcessors",
229-
);
163+
assignStringArrayField(config as PlainRecord, raw, "enabled_processors", "enabledProcessors");
164+
assignStringArrayField(config as PlainRecord, raw, "disabled_processors", "disabledProcessors");
230165
return config;
231166
}
232167

@@ -244,19 +179,9 @@ export function buildConfig(raw: unknown): ExtractionConfig {
244179
const target = result as PlainRecord;
245180

246181
assignBooleanField(target, source, "use_cache", "useCache");
247-
assignBooleanField(
248-
target,
249-
source,
250-
"enable_quality_processing",
251-
"enableQualityProcessing",
252-
);
182+
assignBooleanField(target, source, "enable_quality_processing", "enableQualityProcessing");
253183
assignBooleanField(target, source, "force_ocr", "forceOcr");
254-
assignNumberField(
255-
target,
256-
source,
257-
"max_concurrent_extractions",
258-
"maxConcurrentExtractions",
259-
);
184+
assignNumberField(target, source, "max_concurrent_extractions", "maxConcurrentExtractions");
260185

261186
if (isPlainRecord(source["ocr"])) {
262187
const mapped = mapOcrConfig(source["ocr"]);
@@ -282,9 +207,7 @@ export function buildConfig(raw: unknown): ExtractionConfig {
282207
}
283208

284209
if (isPlainRecord(source["language_detection"])) {
285-
result.languageDetection = mapLanguageDetectionConfig(
286-
source["language_detection"],
287-
);
210+
result.languageDetection = mapLanguageDetectionConfig(source["language_detection"]);
288211
}
289212

290213
if (isPlainRecord(source["postprocessor"])) {

tools/e2e-generator/src/php.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,18 @@ class Helpers
5050
return null;
5151
}
5252
53+
// Transform embedding model from Rust object format to PHP string format.
54+
// Fixtures define model as {"type": "preset", "name": "balanced"} but
55+
// PHP's EmbeddingConfig expects just the preset name string "balanced".
56+
if (isset($config['chunking']['embedding']['model'])
57+
&& is_array($config['chunking']['embedding']['model'])
58+
) {
59+
$model = $config['chunking']['embedding']['model'];
60+
if (isset($model['type']) && $model['type'] === 'preset' && isset($model['name'])) {
61+
$config['chunking']['embedding']['model'] = $model['name'];
62+
}
63+
}
64+
5365
return ExtractionConfig::fromArray($config);
5466
}
5567

0 commit comments

Comments
 (0)