11from __future__ import annotations
22
33import base64
4+ import functools
45import logging
56import re
67from pathlib import Path
# Holds the mimetypes module once it has been lazily imported; None until then.
_mimetypes_module: ModuleType | None = None

# Process-wide polyfile state: the matcher is stored after its first
# successful lookup, and the flag records a failed import so that the import
# is not attempted again.
_polyfile_matcher: MagicMatcher | None = None
_polyfile_unavailable: bool = False

# Splits a ``data:`` URL into an optional media type (with its ``key=value``
# parameters), an optional ``;base64`` marker, and the payload after the comma.
DATA_URL_PATTERN = re.compile(
    r"^data:(?P<media_type>[\w\/\-\+\.]+(?:;[\w\-]+\=[\w\-\.]+)*)?(?P<base64>;base64)?,(?P<data>.*)$"
)
@@ -106,38 +111,62 @@ def default_filename(
106111
107112
def get_extension_from_mimetype(mimetype: str) -> str | None:
    """Map a MIME type to a file extension via the stdlib mimetypes db.

    Parameters such as ``; charset=utf-8`` are ignored for the lookup, because
    the mimetypes database is keyed on the bare ``type/subtype`` only and
    would otherwise never match a parameterised value.

    Args:
        mimetype: MIME type, optionally followed by ``;``-separated parameters.

    Returns:
        The extension reported by ``mimetypes.guess_extension`` (including
        its leading dot), or ``None`` when no extension is known for the type.
    """
    mimetypes = _get_mimetypes_module()
    # Drop any parameters and surrounding whitespace before the lookup.
    bare_type = mimetype.split(";", 1)[0].strip()
    extension = mimetypes.guess_extension(bare_type)
    if not extension:
        # Log the value exactly as received so the caller's input is traceable.
        logger.debug(
            "Got mime-type %s but failed to resolve a valid extension", mimetype
        )
    return extension
116122
117123
118- def guess_from_buffer (buffer : bytes ) -> str | None :
119- """Guess the mimetype from a byte buffer using polyfile."""
120- if len (buffer ) == 0 :
121- return None
def _get_polyfile_matcher() -> MagicMatcher | None:
    """Fetch polyfile's default MagicMatcher, or None when polyfile is missing.

    The outcome is remembered in module-level state: once a matcher has been
    stored, or the import has failed, later calls return immediately without
    importing polyfile again and without repeating the warning.
    """
    global _polyfile_matcher, _polyfile_unavailable

    if _polyfile_matcher is None and not _polyfile_unavailable:
        try:
            from polyfile.magic import MagicMatcher
        except ImportError:
            # ModuleNotFoundError is a subclass of ImportError, so this one
            # clause covers both.
            _polyfile_unavailable = True
            logger.warning(
                "MIME type detection from raw data requires the polyfile library. "
                "Install it by running: `pip install polyfile`. "
                "See: https://pypi.org/project/polyfile for detailed instructions"
            )
        else:
            _polyfile_matcher = cast("MagicMatcher", MagicMatcher.DEFAULT_INSTANCE)

    # Still None here when polyfile is unavailable.
    return _polyfile_matcher
134148
135- try :
136- matcher = cast ("MagicMatcher" , MagicMatcher .DEFAULT_INSTANCE )
137- return next (matcher .match (buffer )).mimetypes [0 ]
138- except IndexError :
139- # This occurs if polyfile is installed but finds no match.
149+
@functools.lru_cache(maxsize=256)
def _match_mimetype_cached(data: bytes) -> str | None:
    """Return the first MIME type polyfile reports for *data*, or None.

    Kept separate from ``guess_from_buffer`` so that only real match results
    are memoised: ``lru_cache`` does not store a call that raises, and the
    public wrapper filters out empty input and the polyfile-unavailable case
    before the cache is consulted.
    """
    matcher = _get_polyfile_matcher()
    if matcher is None:
        return None
    match = next(matcher.match(data), None)
    if match is not None and match.mimetypes:
        return match.mimetypes[0]
    return None


def guess_from_buffer(buffer: bytes) -> str | None:
    """Guess the mimetype from a byte buffer using polyfile.

    Match results (including "no match") are LRU-cached, keyed on the buffer
    contents, so repeated calls with identical content skip the magic-match
    pass. Failures are not cached, so a later call retries the match.

    Args:
        buffer: Leading bytes of the content. Other bytes-like objects such
            as ``bytearray`` are accepted and copied to ``bytes`` so they can
            serve as a cache key.

    Returns:
        The first MIME type of the first polyfile match, or ``None`` when the
        buffer is empty, polyfile is unavailable, nothing matches, or the
        match raises.
    """
    if not buffer:
        return None
    # Checked before the cache so that, without polyfile, no buffers are
    # retained as cache keys.
    if _get_polyfile_matcher() is None:
        return None
    try:
        # lru_cache needs a hashable key; bytearray is not hashable.
        data = buffer if isinstance(buffer, bytes) else bytes(buffer)
        return _match_mimetype_cached(data)
    except Exception:
        # Broad guard so a polyfile bug can never crash a caller.
        logger.debug("polyfile match raised an unexpected error", exc_info=True)
        return None


# Expose the cache-management hooks on the public name so callers can inspect
# or reset the cache.
guess_from_buffer.cache_info = _match_mimetype_cached.cache_info  # type: ignore[attr-defined]
guess_from_buffer.cache_clear = _match_mimetype_cached.cache_clear  # type: ignore[attr-defined]
141170
142171
143172def guess_from_filename (filename : str ) -> str | None :
@@ -171,22 +200,32 @@ def get_mime_and_extension(
171200
172201 if extension is not None and len (extension ) > 0 :
173202 extension = f".{ extension .lstrip ('.' )} "
174- if mimetype and extension :
175- return mimetype , extension
176203
177- elif (
178- mimetype
179- and not extension
180- and (guessed_ext := get_extension_from_mimetype (mimetype ))
181- ):
182- return mimetype , guessed_ext
204+ # Track whether we already tried (and failed) to derive an extension
205+ # from the current mimetype, so we don't call get_extension_from_mimetype
206+ # twice with the same value.
207+ extension_lookup_failed_for : str | None = None
183208
184- elif (
185- extension and not mimetype and (guessed_type := guess_from_extension (extension ))
186- ):
187- return guessed_type , extension
209+ # --- Fast paths: both known, or one can be derived cheaply ----------
210+ if mimetype and extension :
211+ return mimetype , extension
188212
189- if filename is not None :
213+ if mimetype and not extension :
214+ guessed_ext = get_extension_from_mimetype (mimetype )
215+ if guessed_ext :
216+ return mimetype , guessed_ext
217+ # mimetype is valid but has no known extension — keep it, keep looking
218+ # for an extension below instead of discarding it.
219+ extension_lookup_failed_for = mimetype
220+
221+ if extension and not mimetype :
222+ guessed_type = guess_from_extension (extension )
223+ if guessed_type :
224+ return guessed_type , extension
225+
226+ # --- Slower guessing: filename, then buffer -------------------------
227+ # Only guess mimetype from filename/buffer when we don't already have one.
228+ if not mimetype and filename is not None :
190229 mimetype = guess_from_filename (filename )
191230
192231 if not mimetype and extension is not None :
@@ -195,17 +234,17 @@ def get_mime_and_extension(
195234 if not mimetype and buffer is not None :
196235 mimetype = guess_from_buffer (buffer [:MIME_DETECTION_BUFFER_SIZE ])
197236
237+ # --- Derive the missing half from whichever half we now have --------
198238 if mimetype and extension :
199239 return mimetype , extension
200240
201- elif (
202- mimetype
203- and not extension
204- and (extension := get_extension_from_mimetype (mimetype ))
205- ):
206- return mimetype , extension
241+ if mimetype and not extension and mimetype != extension_lookup_failed_for :
242+ extension = get_extension_from_mimetype (mimetype )
243+ if extension :
244+ return mimetype , extension
207245
208- if filename is not None :
246+ # Last resort: pull an extension directly from the filename string.
247+ if not extension and filename is not None :
209248 idx = filename .rfind ("." )
210249 if idx != - 1 :
211250 extension = filename [idx :]
@@ -281,7 +320,7 @@ def try_parse_data_url(url: str) -> DataUrl | None:
281320 encoding = encoding ,
282321 )
283322 )
284- elif base64 :
323+ elif is_base64 :
285324 return DataUrl (
286325 params = DataUrlBase64 (
287326 mimetype = base_media_type ,
0 commit comments