Skip to content

Commit d7f713d

Browse files
authored
fix: use new pdf_backend (#96)
* change default

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* adopt new docling-parse

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix and improve test

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
1 parent 2835f8c commit d7f713d

File tree

3 files changed

+18
-23
lines changed

3 files changed

+18
-23
lines changed

docling_jobkit/convert/manager.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,6 @@
1313
from pydantic import BaseModel, Field
1414

1515
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
16-
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
17-
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
1816
from docling.backend.pdf_backend import PdfDocumentBackend
1917
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
2018
from docling.datamodel import vlm_model_specs
@@ -32,6 +30,7 @@
3230
TableStructureOptions,
3331
VlmConvertOptions,
3432
VlmPipelineOptions,
33+
normalize_pdf_backend,
3534
)
3635
from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, InlineVlmOptions
3736
from docling.datamodel.vlm_engine_options import (
@@ -1012,13 +1011,10 @@ def _parse_standard_pdf_opts(
10121011
def _parse_backend(
10131012
self, request: ConvertDocumentsOptions
10141013
) -> type[PdfDocumentBackend]:
1015-
if request.pdf_backend == PdfBackend.DLPARSE_V1:
1014+
pdf_backend = normalize_pdf_backend(request.pdf_backend)
1015+
if pdf_backend == PdfBackend.DOCLING_PARSE:
10161016
backend: type[PdfDocumentBackend] = DoclingParseDocumentBackend
1017-
elif request.pdf_backend == PdfBackend.DLPARSE_V2:
1018-
backend = DoclingParseV2DocumentBackend
1019-
elif request.pdf_backend == PdfBackend.DLPARSE_V4:
1020-
backend = DoclingParseV4DocumentBackend
1021-
elif request.pdf_backend == PdfBackend.PYPDFIUM2:
1017+
elif pdf_backend == PdfBackend.PYPDFIUM2:
10221018
backend = PyPdfiumDocumentBackend
10231019
else:
10241020
raise RuntimeError(f"Unexpected PDF backend type {request.pdf_backend}")

docling_jobkit/datamodel/convert.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -344,11 +344,11 @@ class ConvertDocumentsOptions(BaseModel):
344344
description=(
345345
"The PDF backend to use. String. "
346346
f"Allowed values: {', '.join([v.value for v in PdfBackend])}. "
347-
f"Optional, defaults to {PdfBackend.DLPARSE_V4.value}."
347+
f"Optional, defaults to {PdfBackend.DOCLING_PARSE.value}."
348348
),
349-
examples=[PdfBackend.DLPARSE_V4],
349+
examples=[PdfBackend.DOCLING_PARSE],
350350
),
351-
] = PdfBackend.DLPARSE_V4
351+
] = PdfBackend.DOCLING_PARSE
352352

353353
table_mode: Annotated[
354354
TableFormerMode,

tests/test_options.py

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33
import pytest
44

55
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
6-
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
7-
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
86
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
97
from docling.datamodel import vlm_model_specs
108
from docling.datamodel.base_models import InputFormat
@@ -56,7 +54,7 @@ def test_options_validator():
5654
pipeline_opts = m.get_pdf_pipeline_opts(opts)
5755
assert pipeline_opts.pipeline_options is not None
5856
assert isinstance(pipeline_opts.pipeline_options, PdfPipelineOptions)
59-
assert pipeline_opts.backend == DoclingParseV4DocumentBackend
57+
assert pipeline_opts.backend == DoclingParseDocumentBackend
6058
assert pipeline_opts.pipeline_options.generate_page_images is True
6159

6260
opts = ConvertDocumentsOptions(
@@ -70,15 +68,16 @@ def test_options_validator():
7068
assert pipeline_opts.pipeline_options.generate_page_images is True
7169
assert pipeline_opts.pipeline_options.generate_picture_images is True
7270

73-
opts = ConvertDocumentsOptions(pdf_backend=PdfBackend.DLPARSE_V2)
74-
pipeline_opts = m.get_pdf_pipeline_opts(opts)
75-
assert pipeline_opts.pipeline_options is not None
76-
assert pipeline_opts.backend == DoclingParseV2DocumentBackend
77-
78-
opts = ConvertDocumentsOptions(pdf_backend=PdfBackend.DLPARSE_V1)
79-
pipeline_opts = m.get_pdf_pipeline_opts(opts)
80-
assert pipeline_opts.pipeline_options is not None
81-
assert pipeline_opts.backend == DoclingParseDocumentBackend
71+
for pdf_backend in (
72+
PdfBackend.DLPARSE_V4,
73+
PdfBackend.DLPARSE_V2,
74+
PdfBackend.DLPARSE_V1,
75+
PdfBackend.DOCLING_PARSE,
76+
):
77+
opts = ConvertDocumentsOptions(pdf_backend=pdf_backend)
78+
pipeline_opts = m.get_pdf_pipeline_opts(opts)
79+
assert pipeline_opts.pipeline_options is not None
80+
assert pipeline_opts.backend == DoclingParseDocumentBackend
8281

8382
opts = ConvertDocumentsOptions(
8483
pipeline=ProcessingPipeline.VLM,

0 commit comments

Comments
 (0)