|
16 | 16 | import argparse |
17 | 17 | import hashlib |
18 | 18 | import struct |
| 19 | +from collections import defaultdict |
19 | 20 | from enum import Enum |
20 | 21 |
|
21 | 22 | try: |
@@ -52,12 +53,62 @@ def section_in_binary(section): |
52 | 53 | sh_flags = section["sh_flags"] |
53 | 54 | return sh_flags & SH_FLAGS.SHF_ALLOC != 0 |
54 | 55 |
|
@staticmethod
def build_symbol_by_name_cache(symtab, little_endian):
    """Build a ``symbol name -> [symbol indices]`` map for a symbol table.

    An optimized implementation for building a cache for quick symbol info
    lookups, replacing the implementation here:
    https://github.com/eliben/pyelftools/blob/49ffaf4/elftools/elf/sections.py#L198-L210

    Two main performance optimizations:
      1) The "struct_parse" utility pyelftools relies on for decoding
         structures is extremely slow. Python's ``struct`` module is used
         here instead.
      2) pyelftools passes around a file stream object while doing
         deserialization, which means there are a ton of disk seeks that get
         kicked off. Here the symbol table and string table are each read
         once via ``data()`` and then decoded purely in memory.

    Empirically, seeing about 10x performance improvement.

    Args:
        symtab: Symbol table section. This function uses ``symtab.data()``,
            ``symtab["sh_entsize"]``, ``symtab.num_symbols()`` and
            ``symtab.stringtable.data()``.
        little_endian: True to decode ``st_name`` as little-endian, False
            for big-endian.

    Returns:
        A ``defaultdict(list)`` mapping each decoded symbol name to the list
        of symbol indices carrying that name, in symbol table order. A symbol
        whose name has no terminating NUL byte in the string table is filed
        under the key ``None``.

    Raises:
        struct.error: If the symbol table data is too short to hold the
            4-byte ``st_name`` field of one of the entries.
    """
    symtab_data = symtab.data()
    symtab_entry_size = symtab["sh_entsize"]
    stringtable_data = symtab.stringtable.data()

    # The byte order is the same for every entry, so compile the format once
    # instead of rebuilding the format string on each loop iteration.
    # The first word of a "Symbol Table Entry" is "st_name". For more
    # details, see the "Executable and Linking Format" specification.
    unpack_st_name = struct.Struct("<I" if little_endian else ">I").unpack_from

    symbol_name_map = defaultdict(list)

    for idx in range(symtab.num_symbols()):
        # unpack_from reads in place, avoiding a 4-byte slice copy per symbol.
        (st_name,) = unpack_st_name(symtab_data, idx * symtab_entry_size)

        # st_name is an offset into the string table; the name runs up to
        # (but not including) the next NUL byte.
        end_offset = stringtable_data.find(b"\x00", st_name)
        if end_offset == -1:
            name = None
        else:
            name = stringtable_data[st_name:end_offset].decode(
                "utf-8", errors="replace"
            )
        symbol_name_map[name].append(idx)

    return symbol_name_map
| 96 | + |
@property
def symtab(self):
    """Return the ".symtab" section of ``self.elf``, looking it up at most once.

    The section is stored in ``self._symtab`` so later accesses avoid
    re-parsing. On the first successful lookup, the section's
    ``_symbol_name_map`` is replaced with the map produced by
    ``build_symbol_by_name_cache``: pyelftools maintains that symbol-name to
    index cache itself, but building it is extremely slow when an ELF has
    many symbols.

    Returns:
        Whatever ``self.elf.get_section_by_name(".symtab")`` returned (stored
        in ``self._symtab``); a falsy result is returned unchanged and no
        name map is attached.
    """
    already_loaded = self._symtab
    if already_loaded:
        return already_loaded

    section = self.elf.get_section_by_name(".symtab")
    # Store the result before building the name map, so the section stays
    # cached even if the map construction raises.
    self._symtab = section
    if not section:
        return section

    section._symbol_name_map = self.build_symbol_by_name_cache(
        section, little_endian=self.elf.little_endian
    )
    return section
62 | 113 |
|
63 | 114 | def find_symbol_and_section(self, symbol_name): |
|
0 commit comments