Skip to content

Commit ec6556b

Browse files
authored
fix: filenames with unicode chars on Windows (#124)
Signed-off-by: Michele Dolfi <[email protected]>
1 parent 185c924 commit ec6556b

File tree

2 files changed

+16
-2
lines changed

2 files changed

+16
-2
lines changed

docling_parse/pdf_parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -486,7 +486,7 @@ def _load_document(self, key: str, filename: str) -> bool:
486486
Returns:
487487
bool: True if the document was successfully loaded, False otherwise.
488488
"""
489-
return self.parser.load_document(key=key, filename=filename)
489+
return self.parser.load_document(key=key, filename=filename.encode("utf8"))
490490

491491
def _load_document_from_bytesio(self, key: str, data: BytesIO) -> bool:
492492
"""Load a document by key from a BytesIO-like object.

src/pybind/docling_parser_v2.h

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,11 @@
33
#ifndef PYBIND_PDF_PARSER_V2_H
44
#define PYBIND_PDF_PARSER_V2_H
55

6+
#ifdef _WIN32
7+
#include <locale>
8+
#include <codecvt>
9+
#endif
10+
611
#include <pybind/docling_resources.h>
712

813
#include <v2.h>
@@ -178,7 +183,16 @@ namespace docling
178183

179184
bool docling_parser_v2::load_document(std::string key, std::string filename)
180185
{
181-
if (std::filesystem::exists(filename))
186+
#ifdef _WIN32
187+
// Convert UTF-8 string to UTF-16 wstring
188+
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
189+
std::wstring wide_filename = converter.from_bytes(filename);
190+
std::filesystem::path path_filename(wide_filename);
191+
#else
192+
std::filesystem::path path_filename(filename);
193+
#endif
194+
195+
if (std::filesystem::exists(path_filename))
182196
{
183197
//key2doc[key] = std::filesystem::path(filename);
184198
key2doc[key] = std::make_shared<decoder_type>();

0 commit comments

Comments
 (0)