Unstructured-IO
diff --git a/CHANGELOG.md b/CHANGELOG.md
Lines changed: 1 addition & 1 deletion
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
Lines changed: 1 addition & 1 deletion
diff --git a/unstructured/partition/api.py b/unstructured/partition/api.py
Lines changed: 12 additions & 20 deletions
diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py
Lines changed: 23 additions & 20 deletions
diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py
Lines changed: 32 additions & 23 deletions
diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py
Lines changed: 9 additions & 10 deletions
@@ -1,4 +1,4 @@
-## 0.13.7-dev4
+## 0.13.7-dev5
 
 ### Enhancements
 
 
@@ -1 +1 @@
-__version__ = "0.13.7-dev4"  # pragma: no cover
+__version__ = "0.13.7-dev5"  # pragma: no cover
@@ -1,10 +1,8 @@
+from __future__ import annotations
+
 import contextlib
 import json
-from typing import (
-    IO,
-    List,
-    Optional,
-)
+from typing import IO, Optional
 
 import requests
 from unstructured_client import UnstructuredClient
@@ -25,7 +23,7 @@ def partition_via_api(
     api_key: str = "",
     metadata_filename: Optional[str] = None,
     **request_kwargs,
-) -> List[Element]:
+) -> list[Element]:
     """Partitions a document using the Unstructured REST API. This is equivalent to
     running the document through partition.
 
@@ -84,10 +82,7 @@ def partition_via_api(
                 "If file is specified in partition_via_api, "
                 "metadata_filename must be specified as well.",
             )
-        files = shared.Files(
-            content=file,
-            file_name=metadata_filename,
-        )
+        files = shared.Files(content=file, file_name=metadata_filename)
 
     # NOTE(christine): Converts all list type parameters to JSON formatted strings
     # (e.g. ["image", "table"] -> '["image", "table"]')
@@ -96,10 +91,7 @@ def partition_via_api(
         if isinstance(v, list):
             request_kwargs[k] = json.dumps(v)
 
-    req = shared.PartitionParameters(
-        files=files,
-        **request_kwargs,
-    )
+    req = shared.PartitionParameters(files=files, **request_kwargs)
     response = sdk.general.partition(req)
 
     if response.status_code == 200:
@@ -111,15 +103,15 @@ def partition_via_api(
 
 
 def partition_multiple_via_api(
-    filenames: Optional[List[str]] = None,
-    content_types: Optional[List[str]] = None,
-    files: Optional[List[str]] = None,
-    file_filenames: Optional[List[str]] = None,
+    filenames: Optional[list[str]] = None,
+    content_types: Optional[list[str]] = None,
+    files: Optional[list[str]] = None,
+    file_filenames: Optional[list[str]] = None,
     api_url: str = "https://api.unstructured.io/general/v0/general",
     api_key: str = "",
-    metadata_filenames: Optional[List[str]] = None,
+    metadata_filenames: Optional[list[str]] = None,
     **request_kwargs,
-) -> List[List[Element]]:
+) -> list[list[Element]]:
     """Partitions multiple documents using the Unstructured REST API by batching
     the documents into a single HTTP request.
 
 
@@ -1,9 +1,13 @@
+"""Provides partitioning with automatic file-type detection."""
+
+from __future__ import annotations
+
 import io
-from typing import IO, Callable, Dict, List, Optional, Tuple
+from typing import IO, Any, Callable, Optional
 
 import requests
 
-from unstructured.documents.elements import DataSourceMetadata
+from unstructured.documents.elements import DataSourceMetadata, Element
 from unstructured.file_utils.filetype import (
     FILETYPE_TO_MIMETYPE,
     STR_TO_FILETYPE,
@@ -16,15 +20,13 @@
 from unstructured.partition.email import partition_email
 from unstructured.partition.html import partition_html
 from unstructured.partition.json import partition_json
-from unstructured.partition.lang import (
-    check_language_args,
-)
+from unstructured.partition.lang import check_language_args
 from unstructured.partition.text import partition_text
 from unstructured.partition.utils.constants import PartitionStrategy
 from unstructured.partition.xml import partition_xml
 from unstructured.utils import dependency_exists
 
-PARTITION_WITH_EXTRAS_MAP: Dict[str, Callable] = {}
+PARTITION_WITH_EXTRAS_MAP: dict[str, Callable[..., list[Element]]] = {}
 
 if dependency_exists("pandas"):
     from unstructured.partition.csv import partition_csv
@@ -114,7 +116,7 @@
 
 def _get_partition_with_extras(
     doc_type: str,
-    partition_with_extras_map: Optional[Dict[str, Callable]] = None,
+    partition_with_extras_map: Optional[dict[str, Callable[..., list[Element]]]] = None,
 ):
     if partition_with_extras_map is None:
         partition_with_extras_map = PARTITION_WITH_EXTRAS_MAP
@@ -138,15 +140,15 @@ def partition(
     strategy: str = PartitionStrategy.AUTO,
     encoding: Optional[str] = None,
     paragraph_grouper: Optional[Callable[[str], str]] = None,
-    headers: Dict[str, str] = {},
-    skip_infer_table_types: List[str] = [],
+    headers: dict[str, str] = {},
+    skip_infer_table_types: list[str] = [],
     ssl_verify: bool = True,
     ocr_languages: Optional[str] = None,  # changing to optional for deprecation
-    languages: Optional[List[str]] = None,
+    languages: Optional[list[str]] = None,
     detect_language_per_element: bool = False,
     pdf_infer_table_structure: bool = True,
     extract_images_in_pdf: bool = False,
-    extract_image_block_types: Optional[List[str]] = None,
+    extract_image_block_types: Optional[list[str]] = None,
     extract_image_block_output_dir: Optional[str] = None,
     extract_image_block_to_payload: bool = False,
     xml_keep_tags: bool = False,
@@ -157,7 +159,7 @@ def partition(
     model_name: Optional[str] = None,  # to be deprecated
     date_from_file_object: bool = False,
     starting_page_number: int = 1,
-    **kwargs,
+    **kwargs: Any,
 ):
     """Partitions a document into its constituent elements. Will use libmagic to determine
     the file's type and route it to the appropriate partitioning function. Applies the default
@@ -422,8 +424,8 @@ def partition(
     elif filetype == FileType.PDF:
         _partition_pdf = _get_partition_with_extras("pdf")
         elements = _partition_pdf(
-            filename=filename,  # type: ignore
-            file=file,  # type: ignore
+            filename=filename,
+            file=file,
             url=None,
             include_page_breaks=include_page_breaks,
             infer_table_structure=infer_table_structure,
@@ -438,9 +440,10 @@ def partition(
             **kwargs,
         )
     elif filetype in IMAGE_FILETYPES:
-        elements = partition_image(
-            filename=filename,  # type: ignore
-            file=file,  # type: ignore
+        _partition_image = _get_partition_with_extras("image")
+        elements = _partition_image(
+            filename=filename,
+            file=file,
             url=None,
             include_page_breaks=include_page_breaks,
             infer_table_structure=infer_table_structure,
@@ -557,10 +560,10 @@ def partition(
 def file_and_type_from_url(
     url: str,
     content_type: Optional[str] = None,
-    headers: Dict[str, str] = {},
+    headers: dict[str, str] = {},
     ssl_verify: bool = True,
     request_timeout: Optional[int] = None,
-) -> Tuple[io.BytesIO, Optional[FileType]]:
+) -> tuple[io.BytesIO, Optional[FileType]]:
     response = requests.get(url, headers=headers, verify=ssl_verify, timeout=request_timeout)
     file = io.BytesIO(response.content)
 
@@ -575,7 +578,7 @@ def file_and_type_from_url(
 
 def decide_table_extraction(
     filetype: Optional[FileType],
-    skip_infer_table_types: List[str],
+    skip_infer_table_types: list[str],
     pdf_infer_table_structure: bool,
 ) -> bool:
     doc_type = filetype.name.lower() if filetype else None
 
@@ -6,7 +6,7 @@
 from datetime import datetime
 from io import BufferedReader, BytesIO, TextIOWrapper
 from tempfile import SpooledTemporaryFile
-from typing import IO, TYPE_CHECKING, Any, BinaryIO, List, Optional
+from typing import IO, TYPE_CHECKING, Any, Optional, TypeVar, cast
 
 import emoji
 from tabulate import tabulate
@@ -191,14 +191,14 @@ def layout_list_to_list_items(
     coordinate_system: Optional[CoordinateSystem],
     metadata: Optional[ElementMetadata],
     detection_origin: Optional[str],
-) -> List[Element]:
+) -> list[Element]:
     """Converts a list LayoutElement to a list of ListItem elements."""
     split_items = ENUMERATED_BULLETS_RE.split(text) if text else []
     # NOTE(robinson) - this means there wasn't a match for the enumerated bullets
     if len(split_items) == 1:
         split_items = UNICODE_BULLETS_RE.split(text) if text else []
 
-    list_items: List[Element] = []
+    list_items: list[Element] = []
     for text_segment in split_items:
         if len(text_segment.strip()) > 0:
             # Both `coordinates` and `coordinate_system` must be present
@@ -216,13 +216,13 @@ def layout_list_to_list_items(
 
 
 def set_element_hierarchy(
-    elements: List[Element], ruleset: dict[str, list[str]] = HIERARCHY_RULE_SET
+    elements: list[Element], ruleset: dict[str, list[str]] = HIERARCHY_RULE_SET
 ) -> list[Element]:
     """Sets the parent_id for each element in the list of elements
     based on the element's category, depth and a ruleset
 
     """
-    stack: List[Element] = []
+    stack: list[Element] = []
     for element in elements:
         if element.metadata.parent_id is not None:
             continue
@@ -274,7 +274,7 @@ def add_element_metadata(
     coordinate_system: Optional[CoordinateSystem] = None,
     image_path: Optional[str] = None,
     detection_origin: Optional[str] = None,
-    languages: Optional[List[str]] = None,
+    languages: Optional[list[str]] = None,
     **kwargs: Any,
 ) -> Element:
     """Adds document metadata to the document element.
@@ -338,7 +338,7 @@ def remove_element_metadata(layout_elements) -> list[Element]:
 
     Document metadata includes information like the filename, source url, and page number.
     """
-    elements: List[Element] = []
+    elements: list[Element] = []
     metadata = ElementMetadata()
     for layout_element in layout_elements:
         element = normalize_layout_element(layout_element)
@@ -431,16 +431,25 @@ def exactly_one(**kwargs: Any) -> None:
         raise ValueError(message)
 
 
-def spooled_to_bytes_io_if_needed(
-    file_obj: bytes | BinaryIO | SpooledTemporaryFile[bytes] | None,
-) -> bytes | BinaryIO | None:
-    if isinstance(file_obj, SpooledTemporaryFile):
-        file_obj.seek(0)
-        contents = file_obj.read()
-        return BytesIO(contents)
-    else:
-        # Return the original file object if it's not a SpooledTemporaryFile
-        return file_obj
+_T = TypeVar("_T")
+
+
+def spooled_to_bytes_io_if_needed(file: _T | SpooledTemporaryFile[bytes]) -> _T | BytesIO:
+    """Convert `file` to `BytesIO` when it is a `SpooledTemporaryFile`.
+
+    Note that `file` does not need to be IO[bytes]. It can be `None` or `bytes` and this function
+    will not complain.
+
+    In Python <3.11, `SpooledTemporaryFile` does not implement `.readable()` or `.seekable()` which
+    triggers an exception when the file is loaded by certain packages. In particular, the stdlib
+    `zipfile.ZipFile` raises on opening a `SpooledTemporaryFile` as does `pandas.read_csv()`.
+    """
+    if isinstance(file, SpooledTemporaryFile):
+        file.seek(0)
+        return BytesIO(cast(bytes, file.read()))
+
+    # -- return `file` unchanged otherwise --
+    return file
 
 
 def convert_to_bytes(file: bytes | IO[bytes]) -> bytes:
@@ -537,16 +546,16 @@ def document_to_element_list(
     source_format: Optional[str] = None,
     detection_origin: Optional[str] = None,
     sort_mode: str = SORT_MODE_XY_CUT,
-    languages: Optional[List[str]] = None,
+    languages: Optional[list[str]] = None,
     starting_page_number: int = 1,
     **kwargs: Any,
-) -> List[Element]:
+) -> list[Element]:
     """Converts a DocumentLayout object to a list of unstructured elements."""
-    elements: List[Element] = []
+    elements: list[Element] = []
 
     num_pages = len(document.pages)
     for page_number, page in enumerate(document.pages, start=starting_page_number):
-        page_elements: List[Element] = []
+        page_elements: list[Element] = []
 
         page_image_metadata = _get_page_image_metadata(page)
         image_format = page_image_metadata.get("format")
@@ -566,7 +575,7 @@ def document_to_element_list(
                 infer_list_items=infer_list_items,
                 source_format=source_format if source_format else "html",
             )
-            if isinstance(element, List):
+            if isinstance(element, list):
                 for el in element:
                     if last_modification_date:
                         el.metadata.last_modified = last_modification_date
@@ -628,7 +637,7 @@ def document_to_element_list(
 
 
 def ocr_data_to_elements(
-    ocr_data: List["LayoutElement"],
+    ocr_data: list["LayoutElement"],
     image_size: tuple[int | float, int | float],
     common_metadata: Optional[ElementMetadata] = None,
     infer_list_items: bool = True,
 
@@ -1,6 +1,7 @@
+from __future__ import annotations
+
 import csv
-from tempfile import SpooledTemporaryFile
-from typing import IO, BinaryIO, List, Optional, Union, cast
+from typing import IO, Any, Optional, cast
 
 import pandas as pd
 from lxml.html.soupparser import fromstring as soupparser_fromstring
@@ -29,18 +30,18 @@
 @add_chunking_strategy
 def partition_csv(
     filename: Optional[str] = None,
-    file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None,
+    file: Optional[IO[bytes]] = None,
     metadata_filename: Optional[str] = None,
     metadata_last_modified: Optional[str] = None,
     include_header: bool = False,
     include_metadata: bool = True,
     infer_table_structure: bool = True,
-    languages: Optional[List[str]] = ["auto"],
+    languages: Optional[list[str]] = ["auto"],
     # NOTE (jennings) partition_csv generates a single TableElement
     # so detect_language_per_element is not included as a param
     date_from_file_object: bool = False,
-    **kwargs,
-) -> List[Element]:
+    **kwargs: Any,
+) -> list[Element]:
     """Partitions Microsoft Excel Documents in .csv format into its document elements.
 
     Parameters
@@ -84,14 +85,12 @@ def partition_csv(
         last_modification_date = (
             get_last_modified_date_from_file(file) if date_from_file_object else None
         )
-        f = spooled_to_bytes_io_if_needed(
-            cast(Union[BinaryIO, SpooledTemporaryFile], file),
-        )
+        f = spooled_to_bytes_io_if_needed(file)
         delimiter = get_delimiter(file=f)
         table = pd.read_csv(f, header=header, sep=delimiter)
 
     html_text = table.to_html(index=False, header=include_header, na_rep="")
-    text = soupparser_fromstring(html_text).text_content()
+    text = cast(str, soupparser_fromstring(html_text).text_content())
 
     if include_metadata:
         metadata = ElementMetadata(
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-## 0.13.7-dev4`
	`1`	`+## 0.13.7-dev5`
`2`	`2`
`3`	`3`	`### Enhancements`
`4`	`4`
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.13.7-dev4" # pragma: no cover`
	`1`	`+__version__ = "0.13.7-dev5" # pragma: no cover`