Skip to content

Commit 58626a0

Browse files
committed
update multipart
1 parent a0e0f04 commit 58626a0

File tree

1 file changed

+7
-7
lines changed

1 file changed

+7
-7
lines changed

src/unstructured_client/_hooks/custom/pdf_utils.py

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -4,8 +4,9 @@
44
import logging
55
from typing import cast, Optional, BinaryIO, Union
66

7+
from email.message import Message
78
from email.parser import BytesParser
8-
from email.policy import default
9+
from email.policy import (default, Policy)
910
from pypdf import PdfReader
1011
from pypdf.errors import FileNotDecryptedError, PdfReadError
1112

@@ -45,23 +46,22 @@ def read_pdf(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
4546
pdf_file.seek(0)
4647
raw = pdf_file.read()
4748
except Exception as e:
48-
raise IOError(f"Failed to read file stream: {e}")
49+
raise IOError(f"Failed to read file stream: {e}") from e
4950
else:
50-
raise TypeError("Expected bytes or a file-like object with 'read()' method")
51+
raise IOError("Expected bytes or a file-like object with 'read()' method")
5152

5253
# This looks for multipart extraction
5354
try:
54-
msg = BytesParser(policy=default).parsebytes(raw)
55+
msg = BytesParser(policy=cast(Policy[Message], default)).parsebytes(raw)
5556
for part in msg.walk():
5657
if part.get_content_type() == "application/pdf":
5758
pdf_bytes = part.get_payload(decode=True)
5859
if not isinstance(pdf_bytes, bytes):
5960
continue
60-
pdf_bytes = cast(bytes, pdf_bytes)
6161
pdf = PdfReader(io.BytesIO(pdf_bytes), strict=False)
6262
return check_pdf(pdf)
6363
except Exception as e:
64-
pdf_logger.debug(f"Multipart extraction failed: {e}")
64+
pdf_logger.debug("Multipart extraction failed: %s", e)
6565

6666
# This looks for %PDF-
6767
try:
@@ -71,7 +71,7 @@ def read_pdf(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
7171
pdf = PdfReader(io.BytesIO(sliced), strict=False)
7272
return check_pdf(pdf)
7373
except Exception as e:
74-
pdf_logger.debug(f"%PDF- slicing fallback failed: {e}")
74+
pdf_logger.debug("%%PDF- slicing fallback failed: %s", e)
7575

7676
return None
7777

0 commit comments

Comments
 (0)