diff --git a/pytest/test_load_file.py b/pytest/test_load_file.py index 5ef87746..1eae29e5 100644 --- a/pytest/test_load_file.py +++ b/pytest/test_load_file.py @@ -23,12 +23,13 @@ def test_load_file() -> None: obj_path = check_object_path(load_py) ( version_tuple, - timestamp, - magic_int, + _timestamp, + _magic_int, co_module, pypy, source_size, sip_hash, + _file_offsets, ) = load_module(obj_path) if (3, 3) <= version_tuple <= (3, 7): statinfo = os.stat(load_py) diff --git a/xdis/bin/pydisasm.py b/xdis/bin/pydisasm.py index 183c43d2..919e6222 100644 --- a/xdis/bin/pydisasm.py +++ b/xdis/bin/pydisasm.py @@ -43,19 +43,25 @@ metavar="FUNCTION-OR-METHOD", multiple=True, type=str, - help=("Specify which specific methods or functions to show. " - "If omitted all, functions are shown. " - "Can be given multiple times.") + help=( + "Specify which specific methods or functions to show. " + "If omitted all, functions are shown. " + "Can be given multiple times." + ), ) - @click.option( "--show-source/--no-show-source", "-S", help="Intersperse Python source text from linecache if available.", ) +@click.option( + "--show-file-offsets/--no-show-file-offsets", + "-x", + help="Show bytecode file hex addresses for the start of each code object.", +) @click.version_option(version=__version__) @click.argument("files", nargs=-1, type=click.Path(readable=True), required=True) -def main(format: list[str], method: tuple, show_source: bool, files): +def main(format: list[str], method: tuple, show_source: bool, show_file_offsets: bool, files): """Disassembles a Python bytecode file. We handle bytecode for virtually every release of Python and some releases of PyPy.
@@ -91,7 +97,14 @@ def main(format: list[str], method: tuple, show_source: bool, files): continue try: - disassemble_file(path, sys.stdout, format, show_source=show_source, methods=method) + disassemble_file( + path, + sys.stdout, + format, + show_source=show_source, + methods=method, + save_file_offsets=show_file_offsets, + ) except (ImportError, NotImplementedError, ValueError) as e: print(e) rc = 3 diff --git a/xdis/bytecode.py b/xdis/bytecode.py index de238bcc..cde84872 100644 --- a/xdis/bytecode.py +++ b/xdis/bytecode.py @@ -497,7 +497,7 @@ def get_instructions_bytes( linestarts=linestarts, line_offset=0, exception_entries=exception_entries, - labels=labels + labels=labels, ) ) @@ -515,7 +515,9 @@ class Bytecode: Iterating over these yields the bytecode operations as Instruction instances. """ - def __init__(self, x, opc, first_line=None, current_offset=None, dup_lines: bool=True) -> None: + def __init__( + self, x, opc, first_line=None, current_offset=None, dup_lines: bool = True + ) -> None: self.codeobj = co = get_code_object(x) self._line_offset = 0 self._cell_names = () @@ -536,7 +538,11 @@ def __init__(self, x, opc, first_line=None, current_offset=None, dup_lines: bool self.opnames = opc.opname self.current_offset = current_offset - if opc.version_tuple >= (3, 11) and not opc.is_pypy and hasattr(co, "co_exceptiontable"): + if ( + opc.version_tuple >= (3, 11) + and not opc.is_pypy + and hasattr(co, "co_exceptiontable") + ): self.exception_entries = parse_exception_table(co.co_exceptiontable) else: self.exception_entries = None @@ -573,7 +579,11 @@ def info(self) -> str: """Return formatted information about the code object.""" return format_code_info(self.codeobj, self.opc.version_tuple) - def dis(self, asm_format: str="classic", show_source: bool=False) -> str: + def dis( + self, + asm_format: str = "classic", + show_source: bool = False, + ) -> str: """Return a formatted view of the bytecode operations.""" co = self.codeobj filename = co.co_filename @@ 
-839,7 +849,9 @@ def get_instructions(self, x, first_line=None): ) -def list2bytecode(inst_list: Iterable, opc, varnames: str, consts: Tuple[None, int]) -> bytes: +def list2bytecode( + inst_list: Iterable, opc, varnames: str, consts: Tuple[None, int] +) -> bytes: """Convert list/tuple of list/tuples to bytecode _names_ contains a list of name objects """ diff --git a/xdis/cross_dis.py b/xdis/cross_dis.py index 2c69c454..374aff05 100644 --- a/xdis/cross_dis.py +++ b/xdis/cross_dis.py @@ -19,7 +19,7 @@ # earlier versions of xdis (and without attribution). from types import CodeType -from typing import List +from typing import List, Optional from xdis.util import ( COMPILER_FLAG_NAMES, @@ -272,7 +272,7 @@ def pretty_flags(flags, is_pypy=False) -> str: def format_code_info( - co, version_tuple: tuple, name=None, is_pypy=False, is_graal=False + co, version_tuple: tuple, name=None, is_pypy=False, is_graal=False, file_offset: Optional[int] = None ) -> str: if not name: name = co.co_name @@ -285,6 +285,9 @@ def format_code_info( # Later versions use "<module>" lines.append("# Filename: %s" % co.co_filename) + if file_offset is not None: + lines.append("# Offset in file: 0x%x" % file_offset) + if not is_graal: if version_tuple >= (1, 3): lines.append("# Argument count: %s" % co.co_argcount) diff --git a/xdis/disasm.py b/xdis/disasm.py index 6399e1a8..213bda14 100644 --- a/xdis/disasm.py +++ b/xdis/disasm.py @@ -29,7 +29,7 @@ import sys import types from collections import deque -from typing import Tuple +from typing import Optional, Tuple import xdis from xdis.bytecode import Bytecode @@ -72,6 +72,7 @@ def show_module_header( header=True, show_filename=True, is_graal=False, + file_offset: Optional[int] = None, ) -> None: bytecode_version = ".".join((str(i) for i in version_tuple)) real_out = out or sys.stdout @@ -121,6 +122,8 @@ def show_module_header( real_out.write("# SipHash: 0x%x\n" % sip_hash) if show_filename: real_out.write("# Embedded file name: %s\n" % co.co_filename) + if file_offset is not None:
+ real_out.write("# Position in bytecode file: 0x%x\n" % file_offset) def disco( @@ -128,15 +131,16 @@ def disco( co, timestamp, out=sys.stdout, - is_pypy: bool=False, + is_pypy: bool = False, magic_int=None, source_size=None, sip_hash=None, - asm_format: str="classic", + asm_format: str = "classic", alternate_opmap=None, - show_source: bool=False, - is_graal: bool=False, + show_source: bool = False, + is_graal: bool = False, methods=tuple(), + file_offsets: Optional[dict] = None, ) -> None: """ disassembles and deparses a given code block 'co' @@ -163,7 +167,15 @@ def disco( if co.co_filename and asm_format != "xasm": if not_filtered(co, methods): - real_out.write(format_code_info(co, version_tuple, is_graal=is_graal) + "\n") + real_out.write( + format_code_info( + co, + version_tuple, + is_graal=is_graal, + file_offset=(file_offsets or {}).get(co), + ) + + "\n" + ) pass opc = get_opcode(version_tuple, is_pypy, alternate_opmap) @@ -184,6 +196,7 @@ def disco( dup_lines=True, show_source=show_source, methods=methods, + file_offsets=file_offsets, ) @@ -196,6 +209,7 @@ def disco_loop( asm_format="classic", show_source=False, methods=tuple(), + file_offsets: Optional[dict] = None, ) -> None: """Disassembles a queue of code objects.
If we discover another code object which will be found in co_consts, we add @@ -211,7 +225,13 @@ def disco_loop( co = queue.popleft() if not_filtered(co, methods): if co.co_name not in ("", "?"): - real_out.write("\n" + format_code_info(co, version_tuple) + "\n") + real_out.write( + "\n" + + format_code_info( + co, version_tuple, file_offset=(file_offsets or {}).get(co) + ) + + "\n" + ) if asm_format == "dis": assert version_tuple[:2] == PYTHON_VERSION_TRIPLE[:2], ( @@ -222,12 +242,18 @@ def disco_loop( else: bytecode = Bytecode(co, opc, dup_lines=dup_lines) real_out.write( - bytecode.dis(asm_format=asm_format, show_source=show_source) + "\n" + bytecode.dis( + asm_format=asm_format, + show_source=show_source, + ) + + "\n" ) if version_tuple >= (3, 11): if bytecode.exception_entries not in (None, []): - exception_table = format_exception_table(bytecode, version_tuple) + exception_table = format_exception_table( + bytecode, version_tuple + ) real_out.write(exception_table + "\n") for c in co.co_consts: @@ -242,7 +268,9 @@ def code_uniquify(basename, co_code) -> str: return "%s_0x%x" % (basename, id(co_code)) -def disco_loop_asm_format(opc, version_tuple, co, real_out, fn_name_map, all_fns) -> None: +def disco_loop_asm_format( + opc, version_tuple, co, real_out, fn_name_map, all_fns +) -> None: """Produces disassembly in a format more conducive to automatic assembly by producing inner modules before they are used by outer ones. Since this is recursive, we'll @@ -318,7 +346,8 @@ def disassemble_file( asm_format="classic", alternate_opmap=None, show_source=False, - methods: Tuple[str] = tuple() + methods: Tuple[str] = tuple(), + save_file_offsets: bool = False, ): """ Disassemble Python byte-code file (.pyc). @@ -329,6 +358,7 @@ def disassemble_file( If that fails, we'll compile internally for the Python version currently running.
""" pyc_filename = None + file_offsets = {} try: # FIXME: add whether we want PyPy pyc_filename = check_object_path(filename) @@ -340,7 +370,8 @@ def disassemble_file( is_pypy, source_size, sip_hash, - ) = load_module(pyc_filename) + file_offsets, + ) = load_module(pyc_filename, save_file_offsets=save_file_offsets) except (ImportError, NotImplementedError, ValueError): raise except Exception: @@ -391,6 +422,7 @@ def disassemble_file( show_source=show_source, is_graal=is_graal, methods=methods, + file_offsets=file_offsets, ) # print co.co_filename return ( @@ -404,9 +436,11 @@ def disassemble_file( sip_hash, ) + def not_filtered(co: types.CodeType, methods: tuple) -> bool: return len(methods) == 0 or co.co_name in methods + def _test() -> None: """Simple test program to disassemble a file.""" argc = len(sys.argv) diff --git a/xdis/dropbox/decrypt25.py b/xdis/dropbox/decrypt25.py index af683eb0..3b90961b 100644 --- a/xdis/dropbox/decrypt25.py +++ b/xdis/dropbox/decrypt25.py @@ -294,7 +294,7 @@ def fix_dropbox_pyc(fp): timestamp = struct.unpack("I", ts)[0] b = fp.read() co = loads(b) - return (2, 5, "0dropbox"), timestamp, 62131, co, False, source_size, None + return (2, 5, "0dropbox"), timestamp, 62131, co, False, source_size, None, {} def fix_dir(path) -> None: diff --git a/xdis/load.py b/xdis/load.py index 811a602e..f20558c8 100644 --- a/xdis/load.py +++ b/xdis/load.py @@ -142,7 +142,11 @@ def load_file(filename: str, out=sys.stdout) -> CodeType: def load_module( - filename: str, code_objects=None, fast_load: bool = False, get_code: bool = True + filename: str, + code_objects=None, + fast_load: bool = False, + get_code: bool = True, + save_file_offsets: bool = False, ): """load a module without importing it. 
Parameters: @@ -197,11 +201,17 @@ def load_module( code_objects=code_objects, fast_load=fast_load, get_code=get_code, + save_file_offsets=save_file_offsets, ) def load_module_from_file_object( - fp, filename="", code_objects=None, fast_load=False, get_code=True + fp, + filename="", + code_objects=None, + fast_load=False, + get_code=True, + save_file_offsets=False, ): """load a module from a file object without importing it. @@ -212,6 +222,7 @@ def load_module_from_file_object( code_objects = {} timestamp = 0 + file_offsets = {} try: magic = fp.read(4) magic_int = magic2int(magic) @@ -233,7 +244,9 @@ def load_module_from_file_object( else: raise ImportError(f"Bad magic number: '{magic}'") - if magic_int in [2657, 22138] + list(GRAAL3_MAGICS) + list(RUSTPYTHON_MAGICS) + list(JYTHON_MAGICS): + if magic_int in [2657, 22138] + list(GRAAL3_MAGICS) + list( + RUSTPYTHON_MAGICS + ) + list(JYTHON_MAGICS): version = magicint2version.get(magic_int, "") raise ImportError(f"Magic int {magic_int} ({version}) is not supported.") @@ -323,14 +336,18 @@ def load_module_from_file_object( source_size = unpack(" None: """Write bytecode file _bytecode_path_, with code for having Python magic_int (i.e. 
bytecode associated with some version of Python) @@ -399,7 +417,7 @@ def write_bytecode_file( if __name__ == "__main__": co = load_file(__file__) obj_path = check_object_path(__file__) - version, timestamp, magic_int, co2, pypy, source_size, sip_hash = load_module( + version, timestamp, magic_int, co2, pypy, source_size, sip_hash, file_offsets = load_module( obj_path ) print("version", version, "magic int", magic_int, "is_pypy", pypy) diff --git a/xdis/unmarshal.py b/xdis/unmarshal.py index b6ef31bc..375d85c2 100644 --- a/xdis/unmarshal.py +++ b/xdis/unmarshal.py @@ -146,6 +146,10 @@ def __init__(self, fp, magic_int, bytes_for_s, code_objects={}) -> None: self.magic_int = magic_int self.code_objects = code_objects + # Map each code object to the offset in the bytecode file + # where that code object starts. + self.code_to_file_offsets = {} + self.bytes_for_s = bytes_for_s version = magic_int2tuple(self.magic_int) if version >= (3, 4): @@ -457,6 +461,10 @@ def t_code(self, save_ref, bytes_for_s: bool = False): # FIXME: use tables to simplify this? # FIXME: Python 1.0 .. 1.3 isn't well known + # Go back one byte to the type byte: TYPE_CODE "c", + # possibly with FLAG_REF set.
+ code_offset_in_file = self.fp.tell() - 1 + ret, i = self.r_ref_reserve(None, save_ref) self.version_tuple = magic_int2tuple(self.magic_int) @@ -627,8 +635,11 @@ def t_code(self, save_ref, bytes_for_s: bool = False): version_triple=self.version_tuple, ) + self.code_to_file_offsets[code] = code_offset_in_file + self.code_objects[str(code)] = code ret = code + return self.r_ref_insert(ret, i) # Since Python 3.4 @@ -653,3 +664,19 @@ def load_code(fp, magic_int, bytes_for_s: bool = False, code_objects={}): fp, magic_int, bytes_for_s, code_objects=code_objects ) return um_gen.load() + + +def load_code_and_get_file_offsets( + fp, magic_int, bytes_for_s: bool = False, code_objects=None +) -> tuple: + """Unmarshal code from `fp` like load_code(), and also return the dict mapping each code object to its offset in the marshalled input.""" + # Use a fresh dict per call: the unmarshaller stores into code_objects, + # so a shared {} default would leak entries from one call into the next. + if code_objects is None: + code_objects = {} + if isinstance(fp, bytes): + fp = io.BytesIO(fp) + um_gen = _VersionIndependentUnmarshaller( + fp, magic_int, bytes_for_s, code_objects=code_objects + ) + return um_gen.load(), um_gen.code_to_file_offsets