Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions docling/pipeline/standard_pdf_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -506,10 +506,14 @@ def _init_models(self) -> None:
self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())

# --- optional enrichment ------------------------------------------------
# Update code_formula_options to match the boolean flags
code_formula_opts = self.pipeline_options.code_formula_options
code_formula_opts.extract_code = self.pipeline_options.do_code_enrichment
code_formula_opts.extract_formulas = self.pipeline_options.do_formula_enrichment
# Create a copy to avoid mutating pipeline_options in-place,
# which would change its hash and break pipeline caching (#3109).
code_formula_opts = self.pipeline_options.code_formula_options.model_copy(
update={
"extract_code": self.pipeline_options.do_code_enrichment,
"extract_formulas": self.pipeline_options.do_formula_enrichment,
}
)

self.enrichment_pipe = [
# Code Formula Enrichment Model (using new VLM runtime system)
Expand Down
28 changes: 28 additions & 0 deletions tests/test_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,34 @@ def test_parser_backends(test_doc_path):
assert doc_result.status == ConversionStatus.SUCCESS


def test_pipeline_cache_after_initialize(test_doc_path):
    """Verify that a pre-initialized pipeline is reused by a later convert().

    Guards against the regression reported in #3109: pipeline initialization
    used to mutate code_formula_options in place, which altered the options
    hash, so the convert() call that followed missed the pipeline cache.
    """
    # Start from the default PDF options, with OCR and table structure off.
    opts = PdfPipelineOptions()
    opts.do_ocr = False
    opts.do_table_structure = False

    pdf_format_option = PdfFormatOption(pipeline_options=opts)
    converter = DocumentConverter(
        format_options={InputFormat.PDF: pdf_format_option},
    )

    # Explicit initialization should leave exactly one pipeline in the cache.
    converter.initialize_pipeline(InputFormat.PDF)
    assert len(converter._get_initialized_pipelines()) == 1

    # Converting afterwards must not add a second pipeline to the cache.
    converter.convert(test_doc_path)
    cached_after_convert = len(converter._get_initialized_pipelines())
    assert cached_after_convert == 1, (
        "Pipeline should be reused from cache, not re-initialized"
    )


def test_confidence(test_doc_path):
converter = DocumentConverter()
doc_result: ConversionResult = converter.convert(test_doc_path, page_range=(6, 9))
Expand Down
Loading