66from datetime import datetime
77from io import BufferedReader , BytesIO , TextIOWrapper
88from tempfile import SpooledTemporaryFile
9- from typing import IO , TYPE_CHECKING , Any , BinaryIO , List , Optional
9+ from typing import IO , TYPE_CHECKING , Any , Optional , TypeVar , cast
1010
1111import emoji
1212from tabulate import tabulate
@@ -191,14 +191,14 @@ def layout_list_to_list_items(
191191 coordinate_system : Optional [CoordinateSystem ],
192192 metadata : Optional [ElementMetadata ],
193193 detection_origin : Optional [str ],
194- ) -> List [Element ]:
194+ ) -> list [Element ]:
195195 """Converts a list LayoutElement to a list of ListItem elements."""
196196 split_items = ENUMERATED_BULLETS_RE .split (text ) if text else []
197197 # NOTE(robinson) - this means there wasn't a match for the enumerated bullets
198198 if len (split_items ) == 1 :
199199 split_items = UNICODE_BULLETS_RE .split (text ) if text else []
200200
201- list_items : List [Element ] = []
201+ list_items : list [Element ] = []
202202 for text_segment in split_items :
203203 if len (text_segment .strip ()) > 0 :
204204 # Both `coordinates` and `coordinate_system` must be present
@@ -216,13 +216,13 @@ def layout_list_to_list_items(
216216
217217
218218def set_element_hierarchy (
219- elements : List [Element ], ruleset : dict [str , list [str ]] = HIERARCHY_RULE_SET
219+ elements : list [Element ], ruleset : dict [str , list [str ]] = HIERARCHY_RULE_SET
220220) -> list [Element ]:
221221 """Sets the parent_id for each element in the list of elements
222222 based on the element's category, depth and a ruleset
223223
224224 """
225- stack : List [Element ] = []
225+ stack : list [Element ] = []
226226 for element in elements :
227227 if element .metadata .parent_id is not None :
228228 continue
@@ -274,7 +274,7 @@ def add_element_metadata(
274274 coordinate_system : Optional [CoordinateSystem ] = None ,
275275 image_path : Optional [str ] = None ,
276276 detection_origin : Optional [str ] = None ,
277- languages : Optional [List [str ]] = None ,
277+ languages : Optional [list [str ]] = None ,
278278 ** kwargs : Any ,
279279) -> Element :
280280 """Adds document metadata to the document element.
@@ -338,7 +338,7 @@ def remove_element_metadata(layout_elements) -> list[Element]:
338338
339339 Document metadata includes information like the filename, source url, and page number.
340340 """
341- elements : List [Element ] = []
341+ elements : list [Element ] = []
342342 metadata = ElementMetadata ()
343343 for layout_element in layout_elements :
344344 element = normalize_layout_element (layout_element )
@@ -431,16 +431,25 @@ def exactly_one(**kwargs: Any) -> None:
431431 raise ValueError (message )
432432
433433
434- def spooled_to_bytes_io_if_needed (
435- file_obj : bytes | BinaryIO | SpooledTemporaryFile [bytes ] | None ,
436- ) -> bytes | BinaryIO | None :
437- if isinstance (file_obj , SpooledTemporaryFile ):
438- file_obj .seek (0 )
439- contents = file_obj .read ()
440- return BytesIO (contents )
441- else :
442- # Return the original file object if it's not a SpooledTemporaryFile
443- return file_obj
434+ _T = TypeVar ("_T" )
435+
436+
437+ def spooled_to_bytes_io_if_needed (file : _T | SpooledTemporaryFile [bytes ]) -> _T | BytesIO :
438+ """Convert `file` to `BytesIO` when it is a `SpooledTemporaryFile`.
439+
440+ Note that `file` does not need to be IO[bytes]. It can be `None` or `bytes` and this function
441+ will not complain.
442+
443+ In Python <3.11, `SpooledTemporaryFile` does not implement `.readable()` or `.seekable()` which
444+ triggers an exception when the file is loaded by certain packages. In particular, the stdlib
445+ `zipfile.ZipFile` raises on opening a `SpooledTemporaryFile` as does `pandas.read_csv()`.
446+ """
447+ if isinstance (file , SpooledTemporaryFile ):
448+ file .seek (0 )
449+ return BytesIO (cast (bytes , file .read ()))
450+
451+ # -- return `file` unchanged otherwise --
452+ return file
444453
445454
446455def convert_to_bytes (file : bytes | IO [bytes ]) -> bytes :
@@ -537,16 +546,16 @@ def document_to_element_list(
537546 source_format : Optional [str ] = None ,
538547 detection_origin : Optional [str ] = None ,
539548 sort_mode : str = SORT_MODE_XY_CUT ,
540- languages : Optional [List [str ]] = None ,
549+ languages : Optional [list [str ]] = None ,
541550 starting_page_number : int = 1 ,
542551 ** kwargs : Any ,
543- ) -> List [Element ]:
552+ ) -> list [Element ]:
544553 """Converts a DocumentLayout object to a list of unstructured elements."""
545- elements : List [Element ] = []
554+ elements : list [Element ] = []
546555
547556 num_pages = len (document .pages )
548557 for page_number , page in enumerate (document .pages , start = starting_page_number ):
549- page_elements : List [Element ] = []
558+ page_elements : list [Element ] = []
550559
551560 page_image_metadata = _get_page_image_metadata (page )
552561 image_format = page_image_metadata .get ("format" )
@@ -566,7 +575,7 @@ def document_to_element_list(
566575 infer_list_items = infer_list_items ,
567576 source_format = source_format if source_format else "html" ,
568577 )
569- if isinstance (element , List ):
578+ if isinstance (element , list ):
570579 for el in element :
571580 if last_modification_date :
572581 el .metadata .last_modified = last_modification_date
@@ -628,7 +637,7 @@ def document_to_element_list(
628637
629638
630639def ocr_data_to_elements (
631- ocr_data : List ["LayoutElement" ],
640+ ocr_data : list ["LayoutElement" ],
632641 image_size : tuple [int | float , int | float ],
633642 common_metadata : Optional [ElementMetadata ] = None ,
634643 infer_list_items : bool = True ,
0 commit comments