Skip to content

Commit e474d96

Browse files
committed
chore(dev): fixes + improvements to content utils
1 parent 7d03a46 commit e474d96

File tree

1 file changed

+75
-36
lines changed

1 file changed

+75
-36
lines changed

weave/type_wrappers/Content/utils.py

Lines changed: 75 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
import base64
4+
import functools
45
import logging
56
import re
67
from pathlib import Path
@@ -34,6 +35,10 @@
3435
# A global variable to hold the lazily imported mimetypes module.
3536
_mimetypes_module: ModuleType | None = None
3637

38+
# Cached polyfile matcher -- initialised once on first use.
39+
_polyfile_matcher: MagicMatcher | None = None
40+
_polyfile_unavailable: bool = False
41+
3742
DATA_URL_PATTERN = re.compile(
3843
r"^data:(?P<media_type>[\w\/\-\+\.]+(?:;[\w\-]+\=[\w\-\.]+)*)?(?P<base64>;base64)?,(?P<data>.*)$"
3944
)
@@ -106,38 +111,62 @@ def default_filename(
106111

107112

108113
def get_extension_from_mimetype(mimetype: str) -> str | None:
    """Look up the file extension associated with ``mimetype``.

    The lookup is delegated to ``guess_extension`` on the module returned by
    ``_get_mimetypes_module()``.

    Args:
        mimetype: The MIME type string to resolve.

    Returns:
        The extension reported by ``guess_extension``, or its falsy result
        (logged at debug level) when no extension could be resolved.
    """
    resolved = _get_mimetypes_module().guess_extension(mimetype)
    if resolved:
        return resolved

    # A failed lookup is not an error for callers, so it is only logged at
    # debug level; the falsy lookup result is passed through unchanged.
    logger.debug("Got mime-type %s but failed to resolve a valid extension", mimetype)
    return resolved
116122

117123

118-
def guess_from_buffer(buffer: bytes) -> str | None:
119-
"""Guess the mimetype from a byte buffer using polyfile."""
120-
if len(buffer) == 0:
121-
return None
124+
def _get_polyfile_matcher() -> MagicMatcher | None:
    """Return the cached polyfile ``MagicMatcher``, or ``None`` if unusable.

    The outcome is memoised in the module globals ``_polyfile_matcher`` and
    ``_polyfile_unavailable`` so that the import is attempted, and the
    warning logged, at most once per process.

    Returns:
        The object read from ``MagicMatcher.DEFAULT_INSTANCE``, or ``None``
        when polyfile cannot be imported or its default matcher cannot be
        obtained.
    """
    global _polyfile_matcher, _polyfile_unavailable
    if _polyfile_matcher is not None:
        return _polyfile_matcher
    if _polyfile_unavailable:
        return None

    try:
        from polyfile.magic import MagicMatcher
    except ImportError:
        # ModuleNotFoundError is a subclass of ImportError, so catching
        # ImportError alone covers both cases.
        _polyfile_unavailable = True
        logger.warning(
            "MIME type detection from raw data requires the polyfile library. "
            "Install it by running: `pip install polyfile`. "
            "See: https://pypi.org/project/polyfile for detailed instructions"
        )
        return None

    try:
        # NOTE(review): DEFAULT_INSTANCE appears to be built by polyfile on
        # access -- confirm. Callers invoke this helper outside their own
        # try/except, so a failure here is contained and remembered rather
        # than propagated (and retried) on every call.
        matcher = cast("MagicMatcher", MagicMatcher.DEFAULT_INSTANCE)
    except Exception:
        _polyfile_unavailable = True
        logger.warning(
            "polyfile is installed but its default matcher could not be "
            "loaded; MIME type detection from raw data is disabled",
            exc_info=True,
        )
        return None

    _polyfile_matcher = matcher
    return matcher
134148

135-
try:
136-
matcher = cast("MagicMatcher", MagicMatcher.DEFAULT_INSTANCE)
137-
return next(matcher.match(buffer)).mimetypes[0]
138-
except IndexError:
139-
# This occurs if polyfile is installed but finds no match.
149+
150+
@functools.lru_cache(maxsize=256)
def _guess_from_bytes_cached(data: bytes) -> str | None:
    """Run the polyfile magic match on ``data``; results are LRU-cached."""
    matcher = _get_polyfile_matcher()
    if matcher is None:
        return None
    try:
        match = next(matcher.match(data), None)
        if match is not None and match.mimetypes:
            return match.mimetypes[0]
    except Exception:
        # Broad guard so a polyfile bug can never crash a caller.
        logger.debug("polyfile match raised an unexpected error", exc_info=True)
    return None


def guess_from_buffer(buffer: bytes) -> str | None:
    """Guess the mimetype from a byte buffer using polyfile.

    Results are LRU-cached (keyed on the buffer's bytes) so that repeated
    calls with identical content skip the expensive magic-match pass.

    Args:
        buffer: The leading bytes of the content. Other bytes-like objects
            (e.g. ``bytearray``) are accepted and copied into ``bytes``
            before the cache lookup, because cache keys must be hashable.

    Returns:
        The first mimetype of the first polyfile match, or ``None`` when the
        buffer is empty, polyfile is unavailable, nothing matches, or the
        match raises.
    """
    if not buffer:
        return None
    # Checked before touching the cache so that, when polyfile is missing,
    # the cache does not fill up with buffers just to remember ``None``.
    if _get_polyfile_matcher() is None:
        return None
    return _guess_from_bytes_cached(bytes(buffer))


# Keep the cache-management helpers reachable from the public function.
guess_from_buffer.cache_info = _guess_from_bytes_cached.cache_info  # type: ignore[attr-defined]
guess_from_buffer.cache_clear = _guess_from_bytes_cached.cache_clear  # type: ignore[attr-defined]
141170

142171

143172
def guess_from_filename(filename: str) -> str | None:
@@ -171,22 +200,32 @@ def get_mime_and_extension(
171200

172201
if extension is not None and len(extension) > 0:
173202
extension = f".{extension.lstrip('.')}"
174-
if mimetype and extension:
175-
return mimetype, extension
176203

177-
elif (
178-
mimetype
179-
and not extension
180-
and (guessed_ext := get_extension_from_mimetype(mimetype))
181-
):
182-
return mimetype, guessed_ext
204+
# Track whether we already tried (and failed) to derive an extension
205+
# from the current mimetype, so we don't call get_extension_from_mimetype
206+
# twice with the same value.
207+
extension_lookup_failed_for: str | None = None
183208

184-
elif (
185-
extension and not mimetype and (guessed_type := guess_from_extension(extension))
186-
):
187-
return guessed_type, extension
209+
# --- Fast paths: both known, or one can be derived cheaply ----------
210+
if mimetype and extension:
211+
return mimetype, extension
188212

189-
if filename is not None:
213+
if mimetype and not extension:
214+
guessed_ext = get_extension_from_mimetype(mimetype)
215+
if guessed_ext:
216+
return mimetype, guessed_ext
217+
# mimetype is valid but has no known extension — keep it, keep looking
218+
# for an extension below instead of discarding it.
219+
extension_lookup_failed_for = mimetype
220+
221+
if extension and not mimetype:
222+
guessed_type = guess_from_extension(extension)
223+
if guessed_type:
224+
return guessed_type, extension
225+
226+
# --- Slower guessing: filename, then buffer -------------------------
227+
# Only guess mimetype from filename/buffer when we don't already have one.
228+
if not mimetype and filename is not None:
190229
mimetype = guess_from_filename(filename)
191230

192231
if not mimetype and extension is not None:
@@ -195,17 +234,17 @@ def get_mime_and_extension(
195234
if not mimetype and buffer is not None:
196235
mimetype = guess_from_buffer(buffer[:MIME_DETECTION_BUFFER_SIZE])
197236

237+
# --- Derive the missing half from whichever half we now have --------
198238
if mimetype and extension:
199239
return mimetype, extension
200240

201-
elif (
202-
mimetype
203-
and not extension
204-
and (extension := get_extension_from_mimetype(mimetype))
205-
):
206-
return mimetype, extension
241+
if mimetype and not extension and mimetype != extension_lookup_failed_for:
242+
extension = get_extension_from_mimetype(mimetype)
243+
if extension:
244+
return mimetype, extension
207245

208-
if filename is not None:
246+
# Last resort: pull an extension directly from the filename string.
247+
if not extension and filename is not None:
209248
idx = filename.rfind(".")
210249
if idx != -1:
211250
extension = filename[idx:]
@@ -281,7 +320,7 @@ def try_parse_data_url(url: str) -> DataUrl | None:
281320
encoding=encoding,
282321
)
283322
)
284-
elif base64:
323+
elif is_base64:
285324
return DataUrl(
286325
params=DataUrlBase64(
287326
mimetype=base_media_type,

0 commit comments

Comments
 (0)