Skip to content

Commit c2ba038

Browse files
committed
fix(tests): serialize pipeline tests with #[serial] to fix race conditions
Replace manual REGISTRY_TEST_GUARD mutex with serial_test's #[serial] attribute across all 19 pipeline tests. Fixes flaky failures caused by global plugin registry state pollution between parallel tests. Also re-enables the previously #[ignore]d keyword extraction test by clearing the processor cache after re-registration.
1 parent 0a5608c commit c2ba038

File tree

2 files changed

+29
-26
lines changed

2 files changed

+29
-26
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1616
- **Ruby gem missing `sorbet-runtime` at runtime (#400)**: `sorbet-runtime` was listed as a development dependency in the gemspec but is required at runtime for `T::Struct` types. Promoted to a runtime dependency.
1717
- **E2e generator Ruby rubocop warnings**: The Ruby e2e generator emitted redundant `RSpec/DescribeClass` and `RSpec/ExampleLength` inline disable directives that rubocop autocorrect mangled into invalid syntax. Simplified to only disable `Metrics/BlockLength`.
1818
- **E2e generator TypeScript npm warnings**: Replaced `npx` with `pnpm exec` for running biome in the e2e generator, eliminating spurious warnings from pnpm-specific `.npmrc` settings.
19+
- **Tesseract TSV level mapping off-by-one**: OCR element hierarchy levels were incorrectly mapped — levels are 1=Page, 2=Block, 3=Paragraph, 4=Line, 5=Word. Fixed `parse_tsv_to_elements` to include word-level entries.
20+
- **OCR elements dropped in image OCR path**: `image_ocr.rs` hardcoded `ocr_elements` to `None` instead of passing through the elements parsed from Tesseract TSV output.
21+
- **DOCX extractor panic on multi-byte UTF-8 page boundaries (#401)**: Page break insertion used byte-index slicing on multi-byte UTF-8 content, causing panics. Fixed with char-boundary-safe insertion.
22+
- **Node.js `djot_content` field missing**: `JsExtractionResult` in kreuzberg-node was not mapping the `djot_content` field from Rust results, always returning `undefined`.
23+
- **E2e generator missing `mapPageConfig` and `mapHtmlOptions`**: TypeScript e2e test generator did not map page extraction or HTML formatting options from fixture configs, causing tests with those options to use defaults.
24+
- **Pipeline test race conditions**: Replaced manual `REGISTRY_TEST_GUARD` mutex with `#[serial]` from `serial_test`, fixing flaky failures in `test_pipeline_with_quality_processing`, `test_pipeline_with_all_features`, and `test_postprocessor_runs_before_validator` caused by global registry state pollution between parallel tests.
25+
- **`test_pipeline_with_keyword_extraction` permanently ignored**: Test was marked `#[ignore]` due to test isolation issues. Fixed the underlying problem — `Lazy` static prevented re-registration after `shutdown_all()` — by clearing the processor cache after re-registration.
1926

2027
---
2128

crates/kreuzberg/src/core/pipeline/tests.rs

Lines changed: 22 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
use super::*;
44
use crate::core::config::OutputFormat;
55
use crate::types::Metadata;
6-
use lazy_static::lazy_static;
6+
use serial_test::serial;
77
use std::borrow::Cow;
88

99
const VALIDATION_MARKER_KEY: &str = "registry_validation_marker";
@@ -12,11 +12,8 @@ const QUALITY_VALIDATION_MARKER: &str = "quality_validation_test";
1212
const POSTPROCESSOR_VALIDATION_MARKER: &str = "postprocessor_validation_test";
1313
const ORDER_VALIDATION_MARKER: &str = "order_validation_test";
1414

15-
lazy_static! {
16-
static ref REGISTRY_TEST_GUARD: std::sync::Mutex<()> = std::sync::Mutex::new(());
17-
}
18-
1915
#[tokio::test]
16+
#[serial]
2017
async fn test_run_pipeline_basic() {
2118
let mut result = ExtractionResult {
2219
content: "test".to_string(),
@@ -53,6 +50,7 @@ async fn test_run_pipeline_basic() {
5350
}
5451

5552
#[tokio::test]
53+
#[serial]
5654
#[cfg(feature = "quality")]
5755
async fn test_pipeline_with_quality_processing() {
5856
let result = ExtractionResult {
@@ -83,6 +81,7 @@ async fn test_pipeline_with_quality_processing() {
8381
}
8482

8583
#[tokio::test]
84+
#[serial]
8685
async fn test_pipeline_without_quality_processing() {
8786
let result = ExtractionResult {
8887
content: "test".to_string(),
@@ -116,6 +115,7 @@ async fn test_pipeline_without_quality_processing() {
116115
}
117116

118117
#[tokio::test]
118+
#[serial]
119119
#[cfg(feature = "chunking")]
120120
async fn test_pipeline_with_chunking() {
121121
let result = ExtractionResult {
@@ -155,6 +155,7 @@ async fn test_pipeline_with_chunking() {
155155
}
156156

157157
#[tokio::test]
158+
#[serial]
158159
async fn test_pipeline_without_chunking() {
159160
let result = ExtractionResult {
160161
content: "test".to_string(),
@@ -188,6 +189,7 @@ async fn test_pipeline_without_chunking() {
188189
}
189190

190191
#[tokio::test]
192+
#[serial]
191193
async fn test_pipeline_preserves_metadata() {
192194
use ahash::AHashMap;
193195
let mut additional = AHashMap::new();
@@ -235,6 +237,7 @@ async fn test_pipeline_preserves_metadata() {
235237
}
236238

237239
#[tokio::test]
240+
#[serial]
238241
async fn test_pipeline_preserves_tables() {
239242
use crate::types::Table;
240243

@@ -277,9 +280,8 @@ async fn test_pipeline_preserves_tables() {
277280
}
278281

279282
#[tokio::test]
283+
#[serial]
280284
async fn test_pipeline_empty_content() {
281-
let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
282-
283285
{
284286
let registry = crate::plugins::registry::get_post_processor_registry();
285287
registry.write().unwrap().shutdown_all().unwrap();
@@ -309,13 +311,12 @@ async fn test_pipeline_empty_content() {
309311
};
310312
let config = ExtractionConfig::default();
311313

312-
drop(_guard);
313-
314314
let processed = run_pipeline(result, &config).await.unwrap();
315315
assert_eq!(processed.content, "");
316316
}
317317

318318
#[tokio::test]
319+
#[serial]
319320
#[cfg(feature = "chunking")]
320321
async fn test_pipeline_with_all_features() {
321322
let result = ExtractionResult {
@@ -355,11 +356,9 @@ async fn test_pipeline_with_all_features() {
355356
}
356357

357358
#[tokio::test]
359+
#[serial]
358360
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
359-
#[ignore = "Requires test isolation - run with --test-threads=1 or individually with --include-ignored"]
360-
#[allow(clippy::await_holding_lock)]
361361
async fn test_pipeline_with_keyword_extraction() {
362-
let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
363362
crate::plugins::registry::get_validator_registry()
364363
.write()
365364
.unwrap()
@@ -371,7 +370,9 @@ async fn test_pipeline_with_keyword_extraction() {
371370
.shutdown_all()
372371
.unwrap();
373372

373+
// Register keyword processor directly (bypasses Lazy which only runs once per process)
374374
let _ = crate::keywords::register_keyword_processor();
375+
clear_processor_cache().unwrap();
375376

376377
let result = ExtractionResult {
377378
content: r#"
@@ -427,11 +428,9 @@ Natural language processing enables computers to understand human language.
427428
}
428429

429430
#[tokio::test]
431+
#[serial]
430432
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
431433
async fn test_pipeline_without_keyword_config() {
432-
{
433-
let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
434-
}
435434
let result = ExtractionResult {
436435
content: "Machine learning and artificial intelligence.".to_string(),
437436
mime_type: Cow::Borrowed("text/plain"),
@@ -462,9 +461,9 @@ async fn test_pipeline_without_keyword_config() {
462461
}
463462

464463
#[tokio::test]
464+
#[serial]
465465
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
466466
async fn test_pipeline_keyword_extraction_short_content() {
467-
let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
468467
crate::plugins::registry::get_validator_registry()
469468
.write()
470469
.unwrap()
@@ -506,14 +505,13 @@ async fn test_pipeline_keyword_extraction_short_content() {
506505
..Default::default()
507506
};
508507

509-
drop(_guard);
510-
511508
let processed = run_pipeline(result, &config).await.unwrap();
512509

513510
assert!(!processed.metadata.additional.contains_key("keywords"));
514511
}
515512

516513
#[tokio::test]
514+
#[serial]
517515
async fn test_postprocessor_runs_before_validator() {
518516
use crate::plugins::{Plugin, PostProcessor, ProcessingStage, Validator};
519517
use async_trait::async_trait;
@@ -600,7 +598,6 @@ async fn test_postprocessor_runs_before_validator() {
600598
let pp_registry = crate::plugins::registry::get_post_processor_registry();
601599
let val_registry = crate::plugins::registry::get_validator_registry();
602600

603-
let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
604601
clear_processor_cache().unwrap();
605602
pp_registry.write().unwrap().shutdown_all().unwrap();
606603
val_registry.write().unwrap().shutdown_all().unwrap();
@@ -616,7 +613,6 @@ async fn test_postprocessor_runs_before_validator() {
616613
registry.register(Arc::new(TestValidator)).unwrap();
617614
}
618615

619-
// Clear the cache after registering new processors so it rebuilds with the test processors
620616
clear_processor_cache().unwrap();
621617

622618
let mut result = ExtractionResult {
@@ -652,7 +648,6 @@ async fn test_postprocessor_runs_before_validator() {
652648
}),
653649
..Default::default()
654650
};
655-
drop(_guard);
656651

657652
let processed = run_pipeline(result, &config).await;
658653

@@ -669,9 +664,9 @@ async fn test_postprocessor_runs_before_validator() {
669664
}
670665

671666
#[tokio::test]
667+
#[serial]
672668
#[cfg(feature = "quality")]
673669
async fn test_quality_processing_runs_before_validator() {
674-
let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
675670
use crate::plugins::{Plugin, Validator};
676671
use async_trait::async_trait;
677672
use std::sync::Arc;
@@ -750,8 +745,6 @@ async fn test_quality_processing_runs_before_validator() {
750745
..Default::default()
751746
};
752747

753-
drop(_guard);
754-
755748
let processed = run_pipeline(result, &config).await;
756749

757750
{
@@ -763,6 +756,7 @@ async fn test_quality_processing_runs_before_validator() {
763756
}
764757

765758
#[tokio::test]
759+
#[serial]
766760
async fn test_multiple_postprocessors_run_before_validator() {
767761
use crate::plugins::{Plugin, PostProcessor, ProcessingStage, Validator};
768762
use async_trait::async_trait;
@@ -906,7 +900,6 @@ async fn test_multiple_postprocessors_run_before_validator() {
906900

907901
let pp_registry = crate::plugins::registry::get_post_processor_registry();
908902
let val_registry = crate::plugins::registry::get_validator_registry();
909-
let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
910903

911904
pp_registry.write().unwrap().shutdown_all().unwrap();
912905
val_registry.write().unwrap().shutdown_all().unwrap();
@@ -946,7 +939,6 @@ async fn test_multiple_postprocessors_run_before_validator() {
946939
};
947940

948941
let config = ExtractionConfig::default();
949-
drop(_guard);
950942

951943
let processed = run_pipeline(result, &config).await;
952944

@@ -958,6 +950,7 @@ async fn test_multiple_postprocessors_run_before_validator() {
958950
}
959951

960952
#[tokio::test]
953+
#[serial]
961954
async fn test_run_pipeline_with_output_format_plain() {
962955
let result = ExtractionResult {
963956
content: "test content".to_string(),
@@ -976,6 +969,7 @@ async fn test_run_pipeline_with_output_format_plain() {
976969
}
977970

978971
#[tokio::test]
972+
#[serial]
979973
async fn test_run_pipeline_with_output_format_djot() {
980974
use crate::types::{BlockType, DjotContent, FormattedBlock, InlineElement, InlineType};
981975

@@ -1020,6 +1014,7 @@ async fn test_run_pipeline_with_output_format_djot() {
10201014
}
10211015

10221016
#[tokio::test]
1017+
#[serial]
10231018
async fn test_run_pipeline_with_output_format_html() {
10241019
let result = ExtractionResult {
10251020
content: "test content".to_string(),
@@ -1041,6 +1036,7 @@ async fn test_run_pipeline_with_output_format_html() {
10411036
}
10421037

10431038
#[tokio::test]
1039+
#[serial]
10441040
async fn test_run_pipeline_applies_output_format_last() {
10451041
// This test verifies that output format is applied after all other processing
10461042
use crate::types::DjotContent;

0 commit comments

Comments
 (0)