11#!/usr/bin/env python
22"""
33puremagic is a pure python module that will identify a file based off its
4- magic numbers. It is designed to be minimalistic and inherently cross platform
4+ magic numbers. It is designed to be minimalistic and inherently cross- platform
55compatible, with no imports when used as a module.
66
77© 2013-2025 Chris Griffith - License: MIT (see LICENSE)
1919from binascii import unhexlify
2020from collections import namedtuple
2121from itertools import chain
22+ from pathlib import Path
23+
24+ if os .getenv ("PUREMAGIC_DEEPSCAN" ) != "0" :
25+ from puremagic .scanners import zip_scanner , pdf_scanner , text_scanner
2226
2327__author__ = "Chris Griffith"
24- __version__ = "1.29 "
28+ __version__ = "2.0.0b1 "
2529__all__ = [
2630 "magic_file" ,
2731 "magic_string" ,
@@ -133,9 +137,6 @@ def _confidence(matches, ext=None) -> list[PureMagicWithConfidence]:
133137 if ext == magic_row .extension
134138 ]
135139
136- if not results :
137- raise PureError ("Could not identify file" )
138-
139140 return sorted (results , key = lambda x : (x .confidence , len (x .byte_match )), reverse = True )
140141
141142
@@ -196,11 +197,22 @@ def _identify_all(header: bytes, footer: bytes, ext=None) -> list[PureMagicWithC
196197 return _confidence (matches , ext )
197198
198199
def _magic(header: bytes, footer: bytes, mime: bool, ext=None, filename=None) -> str:
    """Discover what type of file it is based on the incoming header and footer bytes.

    :param header: bytes taken from the start of the data; must not be empty
    :param footer: bytes taken from the end of the data
    :param mime: when true return the MIME type, otherwise the extension
    :param ext: optional extension hint passed through to ``_identify_all``
    :param filename: optional path of the file being identified; when it is
        given and the ``PUREMAGIC_DEEPSCAN`` environment variable is not "0",
        the signature matches are handed to ``_run_deep_scan`` first
    :return: the MIME type or extension of the best match
    :raises ValueError: if ``header`` is empty
    :raises PureError: if nothing matched, or the deep scan's best result has
        an empty extension
    """
    if not header:
        raise ValueError("Input was empty")
    infos = _identify_all(header, footer, ext)
    if filename and os.getenv("PUREMAGIC_DEEPSCAN") != "0":
        results = _run_deep_scan(infos, filename, header, footer, raise_on_none=True)
        if results:
            best = results[0]
            if best.extension == "":
                raise PureError("Could not identify file")
            if mime:
                return best.mime_type
            return best.extension
    if not infos:
        raise PureError("Could not identify file")
    info = infos[0]
    if mime:
        return info.mime_type
    if isinstance(info.extension, list):
        # The row is a namedtuple, so indexing the row itself (info[0]) yields
        # its first field rather than another row; take the first entry of the
        # extension list instead.
        return info.extension[0]
    return info.extension
@@ -268,7 +280,7 @@ def from_file(filename: os.PathLike | str, mime: bool = False) -> str:
268280 """
269281
270282 head , foot = _file_details (filename )
271- return _magic (head , foot , mime , ext_from_filename (filename ))
283+ return _magic (head , foot , mime , ext_from_filename (filename ), filename = filename )
272284
273285
274286def from_string (string : str | bytes , mime : bool = False , filename : os .PathLike | str | None = None ) -> str :
@@ -321,6 +333,8 @@ def magic_file(filename: os.PathLike | str) -> list[PureMagicWithConfidence]:
321333 except PureError :
322334 info = []
323335 info .sort (key = lambda x : x .confidence , reverse = True )
336+ if os .getenv ("PUREMAGIC_DEEPSCAN" ) != "0" :
337+ return _run_deep_scan (info , filename , head , foot , raise_on_none = False )
324338 return info
325339
326340
@@ -343,7 +357,10 @@ def magic_string(string, filename: os.PathLike | str | None = None) -> list[Pure
343357 return info
344358
345359
346- def magic_stream (stream , filename : os .PathLike | str | None = None ) -> list [PureMagicWithConfidence ]:
360+ def magic_stream (
361+ stream ,
362+ filename : os .PathLike | None = None ,
363+ ) -> list [PureMagicWithConfidence ]:
347364 """Returns tuple of (num_of_matches, array_of_matches)
348365 arranged highest confidence match first
349366 If filename is provided it will be used in the computation.
@@ -361,6 +378,75 @@ def magic_stream(stream, filename: os.PathLike | str | None = None) -> list[Pure
361378 return info
362379
363380
381+ def _single_deep_scan (
382+ bytes_match : bytes | bytearray | None ,
383+ filename : os .PathLike | str ,
384+ head = None ,
385+ foot = None ,
386+ ):
387+ if os .getenv ("PUREMAGIC_DEEPSCAN" ) == "0" :
388+ return None
389+ if not isinstance (filename , os .PathLike ):
390+ filename = Path (filename )
391+ match bytes_match :
392+ case zip_scanner .match_bytes :
393+ return zip_scanner .main (filename , head , foot )
394+ case pdf_scanner .match_bytes :
395+ return pdf_scanner .main (filename , head , foot )
396+ case None | b"" :
397+ for scanner in (text_scanner , pdf_scanner ):
398+ result = scanner .main (filename , head , foot )
399+ if result :
400+ return result
401+ return None
402+
403+
def _run_deep_scan(
    matches: list[PureMagicWithConfidence],
    filename: os.PathLike | str,
    head=None,
    foot=None,
    raise_on_none=True,
):
    """Refine signature matches by running the deep scanners over the file.

    :param matches: signature matches, best first (may be empty)
    :param filename: path of the file to scan
    :param head: leading bytes of the file, forwarded to the scanners
    :param foot: trailing bytes of the file, forwarded to the scanners
    :param raise_on_none: when there is no usable signature match and the
        signature-less scan finds nothing, raise instead of falling through
    :return: a one-element list holding the first deep scan result, or
        ``matches`` unchanged when no scanner produced a result
    :raises PureError: if ``raise_on_none`` is true, ``matches`` is empty or
        its first entry has an empty ``byte_match``, and the signature-less
        scan found nothing
    """

    def _as_match_list(scan_result, matched_bytes, matched_offset):
        # Wrap a scanner result in the list-of-matches shape callers expect.
        return [
            PureMagicWithConfidence(
                confidence=scan_result.confidence,
                byte_match=matched_bytes,
                offset=matched_offset,
                extension=scan_result.extension,
                mime_type=scan_result.mime_type,
                name=scan_result.name,
            )
        ]

    lacks_signature = not matches or matches[0].byte_match == b""
    if lacks_signature:
        try:
            fallback = _single_deep_scan(None, filename, head, foot)
        except Exception:
            # Scanner failures are deliberately ignored: deep scanning is best effort.
            fallback = None
        if fallback:
            return _as_match_list(fallback, None, None)
        if raise_on_none:
            raise PureError("Could not identify file")

    for candidate in matches:
        try:
            refined = _single_deep_scan(candidate.byte_match, filename, head, foot)
        except Exception:
            # A failing scanner must not hide the remaining candidates.
            continue
        if refined:
            return _as_match_list(refined, candidate.byte_match, candidate.offset)
    return matches
448+
449+
364450def command_line_entry (* args ):
365451 import sys
366452 from argparse import ArgumentParser
0 commit comments