Skip to content

Commit 0c64402

Browse files
Mark1626 and dolfim-ibm authored
feat: Support reading password protected PDF (#169)
Signed-off-by: Nimalan <[email protected]> Signed-off-by: Michele Dolfi <[email protected]> Co-authored-by: Michele Dolfi <[email protected]>
1 parent 1c21ce0 commit 0c64402

File tree

8 files changed

+92
-24
lines changed

8 files changed

+92
-24
lines changed

app/parse_v2.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ int main(int argc, char* argv[]) {
8282
("c,config", "Config file", cxxopts::value<std::string>())
8383
("create-config", "Create config file", cxxopts::value<std::string>())
8484
("p,page", "Pages to process (default: -1 for all)", cxxopts::value<int>()->default_value("-1"))
85+
("password", "Password for accessing encrypted, password-protected files", cxxopts::value<std::string>())
8586
("o,output", "Output file", cxxopts::value<std::string>())
8687
("l,loglevel", "loglevel [error;warning;success;info]", cxxopts::value<std::string>())
8788
("h,help", "Print usage");
@@ -158,6 +159,9 @@ int main(int argc, char* argv[]) {
158159

159160
auto config = create_config(ifile, ofile, page);
160161
LOG_S(INFO) << "config: \n" << config.dump(2);
162+
if (result.count("password")) {
163+
config["password"] = result["password"].as<std::string>();
164+
}
161165

162166
utils::timer timer;
163167

app/pybind_parse.cpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
//-*-C++-*-
22

3+
#include <optional>
34
#include <pybind11/pybind11.h>
45
#include <pybind11/stl.h>
56
#include <pybind11/buffer_info.h>
@@ -247,17 +248,24 @@ PYBIND11_MODULE(pdf_parsers, m) {
247248
List[str]: A list of keys for the currently loaded documents.)")
248249

249250
.def("load_document",
250-
[](docling::docling_parser_v2 &self, const std::string &key, const std::string &filename) -> bool {
251-
return self.load_document(key, filename);
251+
[](
252+
docling::docling_parser_v2 &self,
253+
const std::string &key,
254+
const std::string &filename,
255+
std::optional<std::string>& password
256+
) -> bool {
257+
return self.load_document(key, filename, password);
252258
},
253259
pybind11::arg("key"),
254260
pybind11::arg("filename"),
261+
pybind11::arg("password") = pybind11::none(),
255262
R"(
256263
Load a document by key and filename.
257264
258265
Parameters:
259266
key (str): The unique key to identify the document.
260267
filename (str): The path to the document file to load.
268+
password (str, optional): Optional password for password-protected files
261269
262270
Returns:
263271
bool: True if the document was successfully loaded, False otherwise.)")

docling_parse/pdf_parser.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -601,14 +601,17 @@ def load(
601601
path_or_stream: Union[str, Path, BytesIO],
602602
lazy: bool = True,
603603
boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX,
604+
password: Optional[str] = None,
604605
) -> PdfDocument:
605606

606607
if isinstance(path_or_stream, str):
607608
path_or_stream = Path(path_or_stream)
608609

609610
if isinstance(path_or_stream, Path):
610611
key = f"key={str(path_or_stream)}" # use filepath as internal handle
611-
success = self._load_document(key=key, filename=str(path_or_stream))
612+
success = self._load_document(
613+
key=key, filename=str(path_or_stream), password=password
614+
)
612615

613616
elif isinstance(path_or_stream, BytesIO):
614617
hasher = hashlib.sha256(usedforsecurity=False)
@@ -632,17 +635,22 @@ def load(
632635
else:
633636
raise RuntimeError(f"Failed to load document with key {key}")
634637

635-
def _load_document(self, key: str, filename: str) -> bool:
638+
def _load_document(
639+
self, key: str, filename: str, password: Optional[str] = None
640+
) -> bool:
636641
"""Load a document by key and filename.
637642
638643
Parameters:
639644
key (str): The unique key to identify the document.
640645
filename (str): The path to the document file to load.
646+
password (str, optional): Optional password for password-protected files
641647
642648
Returns:
643649
bool: True if the document was successfully loaded, False otherwise.
644650
"""
645-
return self.parser.load_document(key=key, filename=filename.encode("utf8"))
651+
return self.parser.load_document(
652+
key=key, filename=filename.encode("utf8"), password=password
653+
)
646654

647655
def _load_document_from_bytesio(self, key: str, data: BytesIO) -> bool:
648656
"""Load a document by key from a BytesIO-like object.

docling_parse/visualize.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import logging
33
import os
44
from pathlib import Path
5+
from typing import Optional
56

67
from docling_core.types.doc.page import SegmentedPdfPage, TextCellUnit
78

@@ -103,6 +104,14 @@ def parse_args():
103104
help="page to be displayed (default: -1 -> all)",
104105
)
105106

107+
# Add an optional argument for the password of encrypted, password-protected files
108+
parser.add_argument(
109+
"--password",
110+
type=str,
111+
required=False,
112+
help="Optional password for password-protected files",
113+
)
114+
106115
# Parse the command-line arguments
107116
args = parser.parse_args()
108117

@@ -125,6 +134,7 @@ def parse_args():
125134
args.enforce_same_font,
126135
args.page_boundary,
127136
args.category,
137+
args.password,
128138
)
129139

130140

@@ -140,10 +150,13 @@ def visualise_py(
140150
page_boundary: str = "crop_box", # media_box
141151
category: str = "char", # "both", "sanitized", "original"
142152
page_num: int = -1,
153+
password: Optional[str] = None,
143154
):
144155
parser = DoclingPdfParser(loglevel=log_level)
145156

146-
pdf_doc: PdfDocument = parser.load(path_or_stream=pdf_path, lazy=True)
157+
pdf_doc: PdfDocument = parser.load(
158+
path_or_stream=pdf_path, lazy=True, password=password
159+
)
147160

148161
page_nos = [page_num]
149162
if page_num == -1:
@@ -254,6 +267,7 @@ def main():
254267
enforce_same_font,
255268
page_boundary,
256269
category,
270+
password,
257271
) = parse_args()
258272

259273
logging.info(f"page_boundary: {page_boundary}")
@@ -269,6 +283,7 @@ def main():
269283
page_boundary=page_boundary,
270284
category=category,
271285
page_num=page_num,
286+
password=password,
272287
)
273288

274289

src/pybind/docling_parser_v2.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#ifndef PYBIND_PDF_PARSER_V2_H
44
#define PYBIND_PDF_PARSER_V2_H
55

6+
#include <optional>
67
#ifdef _WIN32
78
#include <locale>
89
#include <codecvt>
@@ -31,7 +32,7 @@ namespace docling
3132
bool is_loaded(std::string key);
3233
std::vector<std::string> list_loaded_keys();
3334

34-
bool load_document(std::string key, std::string filename);
35+
bool load_document(std::string key, std::string filename, std::optional<std::string> password);
3536
bool load_document_from_bytesio(std::string key, pybind11::object bytes_io);
3637

3738
bool unload_document(std::string key);
@@ -188,7 +189,7 @@ namespace docling
188189
return (key2doc.count(key)==1);
189190
}
190191

191-
bool docling_parser_v2::load_document(std::string key, std::string filename)
192+
bool docling_parser_v2::load_document(std::string key, std::string filename, std::optional<std::string> password)
192193
{
193194
#ifdef _WIN32
194195
// Convert UTF-8 string to UTF-16 wstring
@@ -203,7 +204,7 @@ namespace docling
203204
{
204205
//key2doc[key] = std::filesystem::path(filename);
205206
key2doc[key] = std::make_shared<decoder_type>();
206-
key2doc.at(key)->process_document_from_file(filename);
207+
key2doc.at(key)->process_document_from_file(filename, password);
207208
return true;
208209
}
209210

src/v2/parser.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,16 @@ namespace plib
196196
return false;
197197
}
198198

199-
if(not document_decoder.process_document_from_file(inp_filename))
199+
std::optional<std::string> password;
200+
if (input_file["password"].is_null())
201+
{
202+
password = std::nullopt;
203+
}
204+
else
205+
{
206+
password = input_file["password"];
207+
}
208+
if(not document_decoder.process_document_from_file(inp_filename, password))
200209
{
201210
LOG_S(ERROR) << "aborting the parse of file "<< inp_filename;
202211
return false;

src/v2/pdf_decoders/document.h

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#ifndef PDF_DOCUMENT_DECODER_H
44
#define PDF_DOCUMENT_DECODER_H
55

6+
#include <optional>
67
#include <qpdf/QPDF.hh>
78
//#include <qpdf/QPDFPageObjectHelper.hh>
89

@@ -27,7 +28,7 @@ namespace pdflib
2728
nlohmann::json get_meta_xml() { return json_annots["meta_xml"]; }
2829
nlohmann::json get_table_of_contents() { return json_annots["table_of_contents"]; }
2930

30-
bool process_document_from_file(std::string& _filename);
31+
bool process_document_from_file(std::string& _filename, std::optional<std::string>& password);
3132
bool process_document_from_bytesio(std::string& _buffer);
3233

3334
void decode_document(std::string page_boundary, bool do_sanitization);
@@ -150,7 +151,7 @@ namespace pdflib
150151
return json_document;
151152
}
152153

153-
bool pdf_decoder<DOCUMENT>::process_document_from_file(std::string& _filename)
154+
bool pdf_decoder<DOCUMENT>::process_document_from_file(std::string& _filename, std::optional<std::string>& password)
154155
{
155156
filename = _filename; // save it
156157
LOG_S(INFO) << "start processing '" << filename << "' by qpdf ...";
@@ -159,7 +160,11 @@ namespace pdflib
159160

160161
try
161162
{
162-
qpdf_document.processFile(filename.c_str());
163+
if (password.has_value()) {
164+
qpdf_document.processFile(filename.c_str(), password.value().c_str());
165+
} else {
166+
qpdf_document.processFile(filename.c_str());
167+
}
163168
LOG_S(INFO) << "filename: " << filename << " processed by qpdf!";
164169

165170
qpdf_root = qpdf_document.getRoot();

uv.lock

Lines changed: 29 additions & 11 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)