|
import os

from future.utils import viewitems, viewvalues
from miasm.analysis.binary import Container
from miasm.analysis.data_analysis import inter_block_flow  # intra_block_flow_raw is redefined in this file
from miasm.analysis.data_flow import DeadRemoval, ReachingDefinitions, DiGraphDefUse
from miasm.analysis.machine import Machine
from miasm.core.bin_stream_ida import bin_stream_ida
from miasm.core.graph import DiGraph
from miasm.core.locationdb import LocationDB
from miasm.expression.expression import get_expr_mem
from miasm.ir.symbexec import SymbolicExecutionEngine

from utils import guess_machine
| 14 | + |
def intra_block_flow_symb(lifter, _, flow_graph, irblock, in_nodes, out_nodes):
    """
    Create data flow for an irblock using symbolic execution.

    The block is executed symbolically, starting from a copy of
    `lifter.arch.regs.regs_init`, and the modified (non-memory) destinations
    are turned into nodes and edges of `flow_graph`. The block is also
    printed to stdout.

    Args:
        lifter: lifter handed to SymbolicExecutionEngine; also provides
            `arch.regs.regs_init`.
        _: unused; keeps the signature parallel to intra_block_flow_raw.
        flow_graph: graph updated in place via add_node / add_uniq_edge.
        irblock: IR block to analyse; its `loc_key` is the first field of
            every node created here.
        in_nodes: dict updated in place, expression read -> node. An
            existing entry is never overwritten.
        out_nodes: dict updated in place, expression written -> node.
    """
    symbols_init = lifter.arch.regs.regs_init.copy()
    sb = SymbolicExecutionEngine(lifter, symbols_init)
    sb.eval_updt_irblock(irblock)
    print('*' * 40)
    print(irblock)

    # The (dst, value) pairs are walked twice below. Materialise them so the
    # second pass still sees them if sb.modified() is a one-shot iterator
    # (NOTE(review): it looks like a generator in miasm -- confirm).
    modified = list(sb.modified(mems=False))

    # Gen mem arg to mem node links
    for dst, _value in modified:
        src = sb.eval_expr(dst)
        # Only a memory access that *is* the evaluated source gets linked to
        # the expressions its pointer reads.
        for mem in set(get_expr_mem(src)):
            if not mem == src:
                continue
            node_n_w = (irblock.loc_key, 0, mem)
            ptr_reads = mem.ptr.get_r(mem_read=False, cst_read=True)
            for i, n_r in enumerate(ptr_reads):
                node_n_r = (irblock.loc_key, i, n_r)
                if n_r not in in_nodes:
                    in_nodes[n_r] = node_n_r
                flow_graph.add_uniq_edge(node_n_r, node_n_w)

    # Gen data flow links
    for dst, _value in modified:
        src = sb.eval_expr(dst)
        node_n_w = (irblock.loc_key, 1, dst)
        for n_r in src.get_r(mem_read=False, cst_read=True):
            node_n_r = (irblock.loc_key, 0, n_r)
            if n_r not in in_nodes:
                in_nodes[n_r] = node_n_r

            flow_graph.add_node(node_n_r)
            # Registered only when the source reads at least one expression.
            out_nodes[dst] = node_n_w
            flow_graph.add_node(node_n_w)
            flow_graph.add_uniq_edge(node_n_r, node_n_w)
| 67 | + |
| 68 | + |
| 69 | + |
def intra_block_flow_raw(lifter, ircfg, flow_graph, irb, in_nodes, out_nodes):
    """
    Create data flow for an irbloc using raw IR expressions

    Each node is a tuple (hex instruction offset, index, expression). Read
    nodes use the index of the assignment block inside `irb`; written nodes
    use that index + 1.

    Args:
        lifter: not used by this implementation.
        ircfg: not used by this implementation.
        flow_graph: graph updated in place via add_node / add_uniq_edge.
        irb: IR block, iterated as a sequence of assignment blocks.
        in_nodes: dict updated in place, expression -> node, filled for
            expressions read before any node was known for them.
        out_nodes: dict updated in place, expression -> latest node that
            wrote it.
    """
    def make_node(assignblk, index, expr):
        # Evaluated lazily, only when a node is really needed.
        return (hex(assignblk.instr.offset), index, expr)

    current_nodes = {}
    for i, assignblk in enumerate(irb):
        dict_rw = assignblk.get_rw(cst_read=True)
        # Writes of the previous assignment blocks are now readable
        current_nodes.update(out_nodes)

        # gen mem arg to mem node links
        # (all_mems keeps accumulating across the destinations of this
        # assignment block; it is only reset per assignment block)
        all_mems = set()
        for node_w, nodes_r in viewitems(dict_rw):
            for n in nodes_r.union([node_w]):
                all_mems.update(get_expr_mem(n))

            for mem in all_mems:
                # Only memory that is read by this destination is linked
                if mem not in nodes_r:
                    continue
                node_n_w = make_node(assignblk, i, mem)
                for n_r in mem.ptr.get_r(mem_read=False, cst_read=True):
                    if n_r in current_nodes:
                        node_n_r = current_nodes[n_r]
                    else:
                        node_n_r = make_node(assignblk, i, n_r)
                        current_nodes[n_r] = node_n_r
                        in_nodes[n_r] = node_n_r
                    flow_graph.add_uniq_edge(node_n_r, node_n_w)

        # gen data flow links
        for node_w, nodes_r in viewitems(dict_rw):
            for n_r in nodes_r:
                if n_r in current_nodes:
                    node_n_r = current_nodes[n_r]
                else:
                    node_n_r = make_node(assignblk, i, n_r)
                    current_nodes[n_r] = node_n_r
                    in_nodes[n_r] = node_n_r

                flow_graph.add_node(node_n_r)

                node_n_w = make_node(assignblk, i + 1, node_w)
                out_nodes[node_w] = node_n_w

                flow_graph.add_node(node_n_w)
                flow_graph.add_uniq_edge(node_n_r, node_n_w)
| 118 | + |
| 119 | + |
| 120 | + |
| 121 | +def node2str(node): |
| 122 | + out = "%s,%s\\l\\\n%s" % node |
| 123 | + return out |
| 124 | + |
| 125 | + |
def gen_function_data_flow_graph(lifter, ircfg, ad, block_flow_cb) -> DiGraph:
    '''
    generate data flow graph for a given function

    Args:
        lifter: forwarded unchanged to `block_flow_cb` and inter_block_flow.
        ircfg: IR control flow graph; `ircfg.blocks` maps a label to its IR
            block and `ircfg.loc_db` resolves a loc_key to an offset.
        ad: offset of the block the inter-block analysis starts from.
        block_flow_cb: callable invoked once per block as
            block_flow_cb(lifter, ircfg, flow_graph, irblock, in_nodes, out_nodes).

    Returns:
        The DiGraph filled by `block_flow_cb` and inter_block_flow.

    Raises:
        AssertionError: no block of `ircfg` is located at `ad`.
    '''
    entry_block = None
    for irblock in viewvalues(ircfg.blocks):
        if ircfg.loc_db.get_location_offset(irblock.loc_key) == ad:
            entry_block = irblock
            break
    if entry_block is None:
        # Raised explicitly (same exception type as the former `assert`) so
        # the check survives `python -O`.
        raise AssertionError('no IR block found at offset {!r}'.format(ad))

    flow_graph = DiGraph()
    flow_graph.node2str = node2str

    # One private in/out node table per block
    irb_in_nodes = {label: {} for label in ircfg.blocks}
    irb_out_nodes = {label: {} for label in ircfg.blocks}

    for label, irblock in viewitems(ircfg.blocks):
        block_flow_cb(lifter, ircfg, flow_graph, irblock, irb_in_nodes[label], irb_out_nodes[label])

    inter_block_flow(lifter, ircfg, flow_graph, entry_block.loc_key, irb_in_nodes, irb_out_nodes)

    return flow_graph
| 161 | + |
| 162 | + |
def _export_function_dataflow(machine, bin_stream, loc_db, function_addr,
                              function_name, output_dir, defuse_only):
    """
    Disassemble one function, lift it to IR and write the analysis files.

    Files written into `output_dir` (all prefixed with `function_name`):
      * `.asm2ir`: one line per IR assignment (offset, instruction, src -> dst)
      * `_dfg.dot`: data flow graph, skipped when `defuse_only` is true
      * `_defuse.dot`: def-use graph built from the reaching definitions
      * `_LocKeyIdx2InstrOffset.map`: "<loc_key>_<index>:<hex offset>" lines

    Args:
        machine: object providing `dis_engine` and `lifter_model_call`.
        bin_stream: binary stream handed to the disassembly engine.
        loc_db: location database shared by the disassembler and the lifter.
        function_addr: address the disassembly starts from.
        function_name: used in the progress message and the file names.
        output_dir: existing directory receiving the files.
        defuse_only: when true, the data flow graph is not generated.
    """
    mdis = machine.dis_engine(bin_stream, loc_db=loc_db, dont_dis_nulstart_bloc=True)
    mdis.follow_call = True
    lifter = machine.lifter_model_call(loc_db=loc_db)

    print('disassembling function: {}:{}'.format(hex(function_addr), function_name))
    asmcfg = mdis.dis_multiblock(function_addr)

    print('generating IR...')
    ircfg = lifter.new_ircfg_from_asmcfg(asmcfg)
    # TODO: DeadRemoval(lifter)(ircfg) is intentionally not applied here: it
    # deletes part of the IR, which needs to be investigated first.

    with open(os.path.join(output_dir, '{}.asm2ir'.format(function_name)), 'w') as f:
        f.write('\tOFFSET\t| ASM\t| SRC -> DST\n')
        for irblock in ircfg.blocks.values():
            for assignblk in irblock:
                for dst, src in assignblk.iteritems():
                    f.write('\t{}\t| {}\t| {} -> {}\n'.format(
                        hex(assignblk.instr.offset), assignblk.instr, src, dst))

    if not defuse_only:
        dfg = gen_function_data_flow_graph(lifter, ircfg, function_addr, intra_block_flow_raw)
        with open(os.path.join(output_dir, '{}_dfg.dot'.format(function_name)), 'w') as f:
            f.write(dfg.dot())

    reaching_defs = ReachingDefinitions(ircfg)
    defuse = DiGraphDefUse(reaching_defs)
    with open(os.path.join(output_dir, '{}_defuse.dot'.format(function_name)), 'w') as f:
        f.write(defuse.dot())

    # A block loc_key plus an assignment-block index identifies an
    # instruction offset, so the def-use graph can be mapped back to
    # instructions with this table.
    loc_key_idx_to_offset = {}
    for block in viewvalues(reaching_defs.ircfg.blocks):
        for index, assignblk in enumerate(block):
            key = '{}_{}'.format(block.loc_key, index)
            loc_key_idx_to_offset[key] = hex(assignblk.instr.offset)

    map_path = os.path.join(output_dir, '{}_LocKeyIdx2InstrOffset.map'.format(function_name))
    with open(map_path, 'w') as f:
        f.write('\n'.join(
            '{}:{}'.format(key, offset) for key, offset in loc_key_idx_to_offset.items()))


def ida_dataflow_analysis(function_addr:int, function_name:str, output_dir:str, defuse_only: bool = False) -> None:
    """
    Run the data flow export on the database currently opened in IDA.

    The machine is obtained from guess_machine(), the bytes from
    bin_stream_ida(), and every name returned by idautils.Names() is
    registered in the location database.

    Args:
        function_addr: address of the function to analyse.
        function_name: name used for the output files.
        output_dir: existing directory receiving the output files.
        defuse_only: when true, only the def-use outputs are produced (the
            data flow graph is skipped).
    """
    loc_db = LocationDB()

    ###################### IDA specific #######################
    machine = guess_machine()
    bin_stream = bin_stream_ida()

    # Populate symbols with ida names
    import idautils  # imported lazily, only when the IDA entry point runs
    for ad, name in idautils.Names():
        if name is None:
            continue
        loc_db.add_location(name, ad)

    ###################### Reverse-tool-independent ######################
    _export_function_dataflow(machine, bin_stream, loc_db, function_addr,
                              function_name, output_dir, defuse_only)


def miasm_dataflow_analysis(function_addr:int, function_name:str, output_dir:str, filepath:str, arch:str = "X86_64", defuse_only: bool = False) -> None:
    """
    Run the data flow export on a binary file, without IDA.

    Args:
        function_addr: address of the function to analyse.
        function_name: name used for the output files.
        output_dir: existing directory receiving the output files.
        filepath: path of the binary, parsed with Container.from_stream.
        arch: machine name given to Machine(); if it is rejected with
            ValueError, its lowercase form is tried.
        defuse_only: when true, only the def-use outputs are produced (the
            data flow graph is skipped).
    """
    # The location database must exist before the container is parsed.
    loc_db = LocationDB()
    # NOTE(review): assumes Container.from_stream reads the whole stream
    # eagerly, so the file can be closed right after -- confirm.
    with open(filepath, 'rb') as stream:
        bin_stream = Container.from_stream(stream, loc_db).bin_stream

    try:
        machine = Machine(arch)
    except ValueError:
        # NOTE(review): miasm machine names appear to be lowercase
        # ("x86_64"); retry so the documented default "X86_64" is usable.
        machine = Machine(arch.lower())

    _export_function_dataflow(machine, bin_stream, loc_db, function_addr,
                              function_name, output_dir, defuse_only)
| 275 | + |
| 276 | + |
0 commit comments