66import json
77import logging
88import mimetypes
9- import os
109import re
1110import sys
1211import typing
1312import warnings
1413from enum import Enum
15- from io import BytesIO
14+ from io import BytesIO , StringIO
1615from pathlib import Path
1716from typing import (
1817 Any ,
6564 PictureClassificationLabel ,
6665)
6766from docling_core .types .doc .tokens import DocumentToken , TableToken
68- from docling_core .types .doc .utils import parse_otsl_table_content , relative_path
67+ from docling_core .types .doc .utils import (
68+ is_remote_path ,
69+ parse_otsl_table_content ,
70+ relative_path ,
71+ )
6972
7073_logger = logging .getLogger (__name__ )
7174
@@ -4762,38 +4765,50 @@ def _with_pictures_refs(
47624765 img_count = 0
47634766 image_dir .mkdir (parents = True , exist_ok = True )
47644767
4765- if image_dir .is_dir ():
4766- for item , level in result .iterate_items (page_no = page_no , with_groups = False ):
4767- if isinstance (item , PictureItem ):
4768- img = item .get_image (doc = self )
4769- if img is not None :
4770-
4771- hexhash = PictureItem ._image_to_hexhash (img )
4772-
4773- # loc_path = image_dir / f"image_{img_count:06}.png"
4774- if hexhash is not None :
4775- loc_path = image_dir / f"image_{ img_count :06} _{ hexhash } .png"
4776-
4777- img .save (loc_path )
4778- if reference_path is not None :
4779- obj_path = relative_path (
4780- reference_path .resolve (),
4781- loc_path .resolve (),
4782- )
4783- else :
4784- obj_path = loc_path
4768+ # Note: Skip is_dir() check for remote paths since S3/cloud storage
4769+ # doesn't have real directories - mkdir() is a no-op for remote paths
4770+ for item , level in result .iterate_items (page_no = page_no , with_groups = False ):
4771+ if isinstance (item , PictureItem ):
4772+ img = item .get_image (doc = self )
4773+ if img is not None :
4774+
4775+ hexhash = PictureItem ._image_to_hexhash (img )
4776+
4777+ # loc_path = image_dir / f"image_{img_count:06}.png"
4778+ if hexhash is not None :
4779+ loc_path = image_dir / f"image_{ img_count :06} _{ hexhash } .png"
4780+
4781+ # Use BytesIO + write_bytes for UPath compatibility
4782+ buf = BytesIO ()
4783+ img .save (buf , format = "PNG" )
4784+ loc_path .write_bytes (buf .getvalue ())
4785+
4786+ # For remote paths, use absolute URI string; for local, compute relative
4787+ obj_path : Union [str , Path ]
4788+ if is_remote_path (loc_path ) or is_remote_path (reference_path ):
4789+ # Convert to string URI for remote paths (Pydantic can't serialize UPath)
4790+ obj_path = str (loc_path )
4791+ elif reference_path is not None :
4792+ obj_path = relative_path (
4793+ reference_path .resolve (),
4794+ loc_path .resolve (),
4795+ )
4796+ else :
4797+ obj_path = loc_path
47854798
4786- if item .image is None :
4787- scale = img .size [0 ] / item .prov [0 ].bbox .width
4788- item .image = ImageRef .from_pil (
4789- image = img , dpi = round (72 * scale )
4790- )
4791- item .image .uri = Path (obj_path )
4799+ if item .image is None :
4800+ scale = img .size [0 ] / item .prov [0 ].bbox .width
4801+ item .image = ImageRef .from_pil (
4802+ image = img , dpi = round (72 * scale )
4803+ )
4804+ # For remote paths, store as string URI; for local, store as Path
4805+ # Pydantic coerces str to AnyUrl at runtime
4806+ item .image .uri = obj_path # type: ignore[assignment]
47924807
4793- # if item.image._pil is not None:
4794- # item.image._pil.close()
4808+ # if item.image._pil is not None:
4809+ # item.image._pil.close()
47954810
4796- img_count += 1
4811+ img_count += 1
47974812
47984813 return result
47994814
@@ -4859,7 +4874,7 @@ def save_as_json(
48594874 artifacts_dir , reference_path = self ._get_output_paths (filename , artifacts_dir )
48604875
48614876 if image_mode == ImageRefMode .REFERENCED :
4862- os . makedirs ( artifacts_dir , exist_ok = True )
4877+ artifacts_dir . mkdir ( parents = True , exist_ok = True )
48634878
48644879 new_doc = self ._make_copy_with_refmode (
48654880 artifacts_dir , image_mode , page_no = None , reference_path = reference_path
@@ -4868,8 +4883,7 @@ def save_as_json(
48684883 out = new_doc .export_to_dict (
48694884 coord_precision = coord_precision , confid_precision = confid_precision
48704885 )
4871- with open (filename , "w" , encoding = "utf-8" ) as fw :
4872- json .dump (out , fw , indent = indent )
4886+ filename .write_text (json .dumps (out , indent = indent ), encoding = "utf-8" )
48734887
48744888 @classmethod
48754889 def load_from_json (cls , filename : Union [str , Path ]) -> "DoclingDocument" :
@@ -4884,8 +4898,7 @@ def load_from_json(cls, filename: Union[str, Path]) -> "DoclingDocument":
48844898 """
48854899 if isinstance (filename , str ):
48864900 filename = Path (filename )
4887- with open (filename , "r" , encoding = "utf-8" ) as f :
4888- return cls .model_validate_json (f .read ())
4901+ return cls .model_validate_json (filename .read_text (encoding = "utf-8" ))
48894902
48904903 def save_as_yaml (
48914904 self ,
@@ -4902,7 +4915,7 @@ def save_as_yaml(
49024915 artifacts_dir , reference_path = self ._get_output_paths (filename , artifacts_dir )
49034916
49044917 if image_mode == ImageRefMode .REFERENCED :
4905- os . makedirs ( artifacts_dir , exist_ok = True )
4918+ artifacts_dir . mkdir ( parents = True , exist_ok = True )
49064919
49074920 new_doc = self ._make_copy_with_refmode (
49084921 artifacts_dir , image_mode , page_no = None , reference_path = reference_path
@@ -4911,8 +4924,9 @@ def save_as_yaml(
49114924 out = new_doc .export_to_dict (
49124925 coord_precision = coord_precision , confid_precision = confid_precision
49134926 )
4914- with open (filename , "w" , encoding = "utf-8" ) as fw :
4915- yaml .dump (out , fw , default_flow_style = default_flow_style )
4927+ stream = StringIO ()
4928+ yaml .dump (out , stream , default_flow_style = default_flow_style )
4929+ filename .write_text (stream .getvalue (), encoding = "utf-8" )
49164930
49174931 @classmethod
49184932 def load_from_yaml (cls , filename : Union [str , Path ]) -> "DoclingDocument" :
@@ -4926,8 +4940,7 @@ def load_from_yaml(cls, filename: Union[str, Path]) -> "DoclingDocument":
49264940 """
49274941 if isinstance (filename , str ):
49284942 filename = Path (filename )
4929- with open (filename , encoding = "utf-8" ) as f :
4930- data = yaml .load (f , Loader = yaml .SafeLoader )
4943+ data = yaml .load (filename .read_text (encoding = "utf-8" ), Loader = yaml .SafeLoader )
49314944 return DoclingDocument .model_validate (data )
49324945
49334946 def export_to_dict (
@@ -4979,7 +4992,7 @@ def save_as_markdown(
49794992 artifacts_dir , reference_path = self ._get_output_paths (filename , artifacts_dir )
49804993
49814994 if image_mode == ImageRefMode .REFERENCED :
4982- os . makedirs ( artifacts_dir , exist_ok = True )
4995+ artifacts_dir . mkdir ( parents = True , exist_ok = True )
49834996
49844997 new_doc = self ._make_copy_with_refmode (
49854998 artifacts_dir , image_mode , page_no , reference_path = reference_path
@@ -5005,8 +5018,7 @@ def save_as_markdown(
50055018 mark_meta = mark_meta ,
50065019 )
50075020
5008- with open (filename , "w" , encoding = "utf-8" ) as fw :
5009- fw .write (md_out )
5021+ filename .write_text (md_out , encoding = "utf-8" )
50105022
50115023 def export_to_markdown ( # noqa: C901
50125024 self ,
@@ -5185,7 +5197,7 @@ def save_as_html(
51855197 artifacts_dir , reference_path = self ._get_output_paths (filename , artifacts_dir )
51865198
51875199 if image_mode == ImageRefMode .REFERENCED :
5188- os . makedirs ( artifacts_dir , exist_ok = True )
5200+ artifacts_dir . mkdir ( parents = True , exist_ok = True )
51895201
51905202 new_doc = self ._make_copy_with_refmode (
51915203 artifacts_dir , image_mode , page_no , reference_path = reference_path
@@ -5205,8 +5217,7 @@ def save_as_html(
52055217 include_annotations = include_annotations ,
52065218 )
52075219
5208- with open (filename , "w" , encoding = "utf-8" ) as fw :
5209- fw .write (html_out )
5220+ filename .write_text (html_out , encoding = "utf-8" )
52105221
52115222 def _get_output_paths (
52125223 self , filename : Union [str , Path ], artifacts_dir : Optional [Path ] = None
@@ -5850,8 +5861,7 @@ def save_as_doctags(
58505861 minified = minified ,
58515862 )
58525863
5853- with open (filename , "w" , encoding = "utf-8" ) as fw :
5854- fw .write (out )
5864+ filename .write_text (out , encoding = "utf-8" )
58555865
58565866 @deprecated ("Use export_to_doctags() instead." )
58575867 def export_to_document_tokens (self , * args , ** kwargs ):
0 commit comments