Skip to content

Commit bbe82a6

Browse files
authored
feat(pdf): Support for password-protected PDF documents (#2499)
* add test and example for PDF with password
  Signed-off-by: Michele Dolfi <[email protected]>
* use docling-parse with new password feature
  Signed-off-by: Michele Dolfi <[email protected]>
* add pdfbackendoptions
  Signed-off-by: Michele Dolfi <[email protected]>
* generalize backend_options and add PdfBackendOptions
  Signed-off-by: Michele Dolfi <[email protected]>
* add pdf-password option
  Signed-off-by: Michele Dolfi <[email protected]>
* update exception test
  Signed-off-by: Michele Dolfi <[email protected]>
* fix docs description
  Signed-off-by: Michele Dolfi <[email protected]>
---------
Signed-off-by: Michele Dolfi <[email protected]>
1 parent 89820d0 commit bbe82a6

16 files changed: +202 -114 lines changed

docling/backend/abstract_backend.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,11 @@
55

66
from docling_core.types.doc import DoclingDocument
77

8-
from docling.datamodel.backend_options import BackendOptions, DeclarativeBackendOptions
8+
from docling.datamodel.backend_options import (
9+
BackendOptions,
10+
BaseBackendOptions,
11+
DeclarativeBackendOptions,
12+
)
913

1014
if TYPE_CHECKING:
1115
from docling.datamodel.base_models import InputFormat
@@ -14,11 +18,17 @@
1418

1519
class AbstractDocumentBackend(ABC):
1620
@abstractmethod
17-
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
21+
def __init__(
22+
self,
23+
in_doc: "InputDocument",
24+
path_or_stream: Union[BytesIO, Path],
25+
options: BaseBackendOptions = BaseBackendOptions(),
26+
):
1827
self.file = in_doc.file
1928
self.path_or_stream = path_or_stream
2029
self.document_hash = in_doc.document_hash
2130
self.input_format = in_doc.format
31+
self.options = options
2232

2333
@abstractmethod
2434
def is_valid(self) -> bool:
@@ -67,13 +77,8 @@ def __init__(
6777
path_or_stream: Union[BytesIO, Path],
6878
options: BackendOptions = DeclarativeBackendOptions(),
6979
) -> None:
70-
super().__init__(in_doc, path_or_stream)
71-
self.options: BackendOptions = options
80+
super().__init__(in_doc, path_or_stream, options)
7281

7382
@abstractmethod
7483
def convert(self) -> DoclingDocument:
7584
pass
76-
77-
@classmethod
78-
def get_default_options(cls) -> BackendOptions:
79-
return DeclarativeBackendOptions()

docling/backend/docling_parse_v4_backend.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from pypdfium2 import PdfPage
1313

1414
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
15+
from docling.datamodel.backend_options import PdfBackendOptions
1516
from docling.datamodel.base_models import Size
1617
from docling.utils.locks import pypdfium2_lock
1718

@@ -189,13 +190,23 @@ def unload(self):
189190

190191

191192
class DoclingParseV4DocumentBackend(PdfDocumentBackend):
192-
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
193-
super().__init__(in_doc, path_or_stream)
193+
def __init__(
194+
self,
195+
in_doc: "InputDocument",
196+
path_or_stream: Union[BytesIO, Path],
197+
options: PdfBackendOptions = PdfBackendOptions(),
198+
):
199+
super().__init__(in_doc, path_or_stream, options)
194200

201+
password = (
202+
self.options.password.get_secret_value() if self.options.password else None
203+
)
195204
with pypdfium2_lock:
196-
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
205+
self._pdoc = pdfium.PdfDocument(self.path_or_stream, password=password)
197206
self.parser = DoclingPdfParser(loglevel="fatal")
198-
self.dp_doc: PdfDocument = self.parser.load(path_or_stream=self.path_or_stream)
207+
self.dp_doc: PdfDocument = self.parser.load(
208+
path_or_stream=self.path_or_stream, password=password
209+
)
199210
success = self.dp_doc is not None
200211

201212
if not success:

docling/backend/html_backend.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -246,11 +246,6 @@ def unload(self):
246246
def supported_formats(cls) -> set[InputFormat]:
247247
return {InputFormat.HTML}
248248

249-
@classmethod
250-
@override
251-
def get_default_options(cls) -> HTMLBackendOptions:
252-
return HTMLBackendOptions()
253-
254249
@override
255250
def convert(self) -> DoclingDocument:
256251
_log.debug("Starting HTML conversion...")

docling/backend/md_backend.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -536,11 +536,6 @@ def supports_pagination(cls) -> bool:
536536
def supported_formats(cls) -> set[InputFormat]:
537537
return {InputFormat.MD}
538538

539-
@classmethod
540-
@override
541-
def get_default_options(cls) -> MarkdownBackendOptions:
542-
return MarkdownBackendOptions()
543-
544539
def convert(self) -> DoclingDocument:
545540
_log.debug("converting Markdown...")
546541

docling/backend/pdf_backend.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from PIL import Image
1010

1111
from docling.backend.abstract_backend import PaginatedDocumentBackend
12+
from docling.datamodel.backend_options import PdfBackendOptions
1213
from docling.datamodel.base_models import InputFormat
1314
from docling.datamodel.document import InputDocument
1415

@@ -50,8 +51,14 @@ def unload(self):
5051

5152

5253
class PdfDocumentBackend(PaginatedDocumentBackend):
53-
def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
54-
super().__init__(in_doc, path_or_stream)
54+
def __init__(
55+
self,
56+
in_doc: InputDocument,
57+
path_or_stream: Union[BytesIO, Path],
58+
options: PdfBackendOptions = PdfBackendOptions(),
59+
):
60+
super().__init__(in_doc, path_or_stream, options)
61+
self.options: PdfBackendOptions
5562

5663
if self.input_format is not InputFormat.PDF:
5764
if self.input_format is InputFormat.IMAGE:

docling/backend/pypdfium2_backend.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from pypdfium2._helpers.misc import PdfiumError
2121

2222
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
23+
from docling.datamodel.backend_options import PdfBackendOptions
2324
from docling.utils.locks import pypdfium2_lock
2425

2526

@@ -370,12 +371,20 @@ def unload(self):
370371

371372

372373
class PyPdfiumDocumentBackend(PdfDocumentBackend):
373-
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
374-
super().__init__(in_doc, path_or_stream)
374+
def __init__(
375+
self,
376+
in_doc: "InputDocument",
377+
path_or_stream: Union[BytesIO, Path],
378+
options: PdfBackendOptions = PdfBackendOptions(),
379+
):
380+
super().__init__(in_doc, path_or_stream, options)
375381

382+
password = (
383+
self.options.password.get_secret_value() if self.options.password else None
384+
)
376385
try:
377386
with pypdfium2_lock:
378-
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
387+
self._pdoc = pdfium.PdfDocument(self.path_or_stream, password=password)
379388
except PdfiumError as e:
380389
raise RuntimeError(
381390
f"pypdfium could not load document with hash {self.document_hash}"

docling/cli/main.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
WHISPER_TURBO_NATIVE,
5252
AsrModelType,
5353
)
54+
from docling.datamodel.backend_options import PdfBackendOptions
5455
from docling.datamodel.base_models import (
5556
ConversionStatus,
5657
FormatToExtensions,
@@ -404,6 +405,9 @@ def convert( # noqa: C901
404405
pdf_backend: Annotated[
405406
PdfBackend, typer.Option(..., help="The PDF backend to use.")
406407
] = PdfBackend.DLPARSE_V4,
408+
pdf_password: Annotated[
409+
Optional[str], typer.Option(..., help="Password for protected PDF documents")
410+
] = None,
407411
table_mode: Annotated[
408412
TableFormerMode,
409413
typer.Option(..., help="The mode to use in the table structure model."),
@@ -628,6 +632,9 @@ def convert( # noqa: C901
628632
pipeline_options: PipelineOptions
629633

630634
format_options: Dict[InputFormat, FormatOption] = {}
635+
pdf_backend_options: Optional[PdfBackendOptions] = PdfBackendOptions(
636+
password=pdf_password
637+
)
631638

632639
if pipeline == ProcessingPipeline.STANDARD:
633640
pipeline_options = PdfPipelineOptions(
@@ -658,8 +665,10 @@ def convert( # noqa: C901
658665
backend: Type[PdfDocumentBackend]
659666
if pdf_backend == PdfBackend.DLPARSE_V1:
660667
backend = DoclingParseDocumentBackend
668+
pdf_backend_options = None
661669
elif pdf_backend == PdfBackend.DLPARSE_V2:
662670
backend = DoclingParseV2DocumentBackend
671+
pdf_backend_options = None
663672
elif pdf_backend == PdfBackend.DLPARSE_V4:
664673
backend = DoclingParseV4DocumentBackend # type: ignore
665674
elif pdf_backend == PdfBackend.PYPDFIUM2:
@@ -670,6 +679,7 @@ def convert( # noqa: C901
670679
pdf_format_option = PdfFormatOption(
671680
pipeline_options=pipeline_options,
672681
backend=backend, # pdf_backend
682+
backend_options=pdf_backend_options,
673683
)
674684

675685
# METS GBS options
@@ -816,7 +826,7 @@ def convert( # noqa: C901
816826
_log.error(f"{asr_model} is not known")
817827
raise ValueError(f"{asr_model} is not known")
818828

819-
_log.info(f"ASR pipeline_options: {asr_pipeline_options}")
829+
_log.debug(f"ASR pipeline_options: {asr_pipeline_options}")
820830

821831
audio_format_option = AudioFormatOption(
822832
pipeline_cls=AsrPipeline,

docling/datamodel/backend_options.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from pathlib import PurePath
22
from typing import Annotated, Literal, Optional, Union
33

4-
from pydantic import AnyUrl, BaseModel, Field
4+
from pydantic import AnyUrl, BaseModel, Field, SecretStr
55

66

77
class BaseBackendOptions(BaseModel):
@@ -64,7 +64,19 @@ class MarkdownBackendOptions(BaseBackendOptions):
6464
)
6565

6666

67+
class PdfBackendOptions(BaseBackendOptions):
68+
"""Backend options for pdf document backends."""
69+
70+
kind: Literal["pdf"] = Field("pdf", exclude=True, repr=False)
71+
password: Optional[SecretStr] = None
72+
73+
6774
BackendOptions = Annotated[
68-
Union[DeclarativeBackendOptions, HTMLBackendOptions, MarkdownBackendOptions],
75+
Union[
76+
DeclarativeBackendOptions,
77+
HTMLBackendOptions,
78+
MarkdownBackendOptions,
79+
PdfBackendOptions,
80+
],
6981
Field(discriminator="kind"),
7082
]

docling/datamodel/document.py

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ class InputDocument(BaseModel):
114114
]
115115
valid: bool = Field(True, description="Whether this is is a valid input document.")
116116
backend_options: Optional[BackendOptions] = Field(
117-
None, description="Custom options for declarative backends."
117+
None, description="Custom options for backends."
118118
)
119119
limits: DocumentLimits = Field(
120120
DocumentLimits(), description="Limits in the input document for the conversion."
@@ -146,15 +146,6 @@ def __init__(
146146
self.limits = limits or DocumentLimits()
147147
self.format = format
148148

149-
# check for backend incompatibilities
150-
if issubclass(backend, DeclarativeDocumentBackend) and backend_options:
151-
if not issubclass(
152-
type(backend_options), type(backend.get_default_options())
153-
):
154-
raise ValueError(
155-
"Incompatible types between backend and backend_options arguments."
156-
)
157-
158149
try:
159150
if isinstance(path_or_stream, Path):
160151
self.file = path_or_stream
@@ -214,7 +205,7 @@ def _init_doc(
214205
backend: Type[AbstractDocumentBackend],
215206
path_or_stream: Union[BytesIO, Path],
216207
) -> None:
217-
if issubclass(backend, DeclarativeDocumentBackend) and self.backend_options:
208+
if self.backend_options:
218209
self._backend = backend(
219210
self,
220211
path_or_stream=path_or_stream,

docling/document_converter.py

Lines changed: 21 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,12 @@
3131
from docling.backend.webvtt_backend import WebVTTDocumentBackend
3232
from docling.backend.xml.jats_backend import JatsDocumentBackend
3333
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
34-
from docling.datamodel.backend_options import BackendOptions, HTMLBackendOptions
34+
from docling.datamodel.backend_options import (
35+
BackendOptions,
36+
HTMLBackendOptions,
37+
MarkdownBackendOptions,
38+
PdfBackendOptions,
39+
)
3540
from docling.datamodel.base_models import (
3641
BaseFormatOption,
3742
ConversionStatus,
@@ -98,7 +103,7 @@ class PowerpointFormatOption(FormatOption):
98103
class MarkdownFormatOption(FormatOption):
99104
pipeline_cls: Type = SimplePipeline
100105
backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
101-
backend_options: HTMLBackendOptions = HTMLBackendOptions()
106+
backend_options: Optional[MarkdownBackendOptions] = None
102107

103108

104109
class AsciiDocFormatOption(FormatOption):
@@ -109,7 +114,7 @@ class AsciiDocFormatOption(FormatOption):
109114
class HTMLFormatOption(FormatOption):
110115
pipeline_cls: Type = SimplePipeline
111116
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
112-
backend_options: HTMLBackendOptions = HTMLBackendOptions()
117+
backend_options: Optional[HTMLBackendOptions] = None
113118

114119

115120
class PatentUsptoFormatOption(FormatOption):
@@ -130,6 +135,7 @@ class ImageFormatOption(FormatOption):
130135
class PdfFormatOption(FormatOption):
131136
pipeline_cls: Type = StandardPdfPipeline
132137
backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
138+
backend_options: Optional[PdfBackendOptions] = None
133139

134140

135141
class AudioFormatOption(FormatOption):
@@ -139,48 +145,24 @@ class AudioFormatOption(FormatOption):
139145

140146
def _get_default_option(format: InputFormat) -> FormatOption:
141147
format_to_default_options = {
142-
InputFormat.CSV: FormatOption(
143-
pipeline_cls=SimplePipeline, backend=CsvDocumentBackend
144-
),
145-
InputFormat.XLSX: FormatOption(
146-
pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
147-
),
148-
InputFormat.DOCX: FormatOption(
149-
pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
150-
),
151-
InputFormat.PPTX: FormatOption(
152-
pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
153-
),
154-
InputFormat.MD: FormatOption(
155-
pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
156-
),
157-
InputFormat.ASCIIDOC: FormatOption(
158-
pipeline_cls=SimplePipeline, backend=AsciiDocBackend
159-
),
160-
InputFormat.HTML: FormatOption(
161-
pipeline_cls=SimplePipeline,
162-
backend=HTMLDocumentBackend,
163-
backend_options=HTMLBackendOptions(),
164-
),
165-
InputFormat.XML_USPTO: FormatOption(
166-
pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
167-
),
168-
InputFormat.XML_JATS: FormatOption(
169-
pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
170-
),
148+
InputFormat.CSV: CsvFormatOption(),
149+
InputFormat.XLSX: ExcelFormatOption(),
150+
InputFormat.DOCX: WordFormatOption(),
151+
InputFormat.PPTX: PowerpointFormatOption(),
152+
InputFormat.MD: MarkdownFormatOption(),
153+
InputFormat.ASCIIDOC: AsciiDocFormatOption(),
154+
InputFormat.HTML: HTMLFormatOption(),
155+
InputFormat.XML_USPTO: PatentUsptoFormatOption(),
156+
InputFormat.XML_JATS: XMLJatsFormatOption(),
171157
InputFormat.METS_GBS: FormatOption(
172158
pipeline_cls=StandardPdfPipeline, backend=MetsGbsDocumentBackend
173159
),
174-
InputFormat.IMAGE: FormatOption(
175-
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
176-
),
177-
InputFormat.PDF: FormatOption(
178-
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
179-
),
160+
InputFormat.IMAGE: ImageFormatOption(),
161+
InputFormat.PDF: PdfFormatOption(),
180162
InputFormat.JSON_DOCLING: FormatOption(
181163
pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
182164
),
183-
InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=NoOpBackend),
165+
InputFormat.AUDIO: AudioFormatOption(),
184166
InputFormat.VTT: FormatOption(
185167
pipeline_cls=SimplePipeline, backend=WebVTTDocumentBackend
186168
),

0 commit comments

Comments
 (0)