Skip to content

Commit ee0e91a

Browse files
committed
init commit
0 parents  commit ee0e91a

File tree

10 files changed

+1291
-0
lines changed

10 files changed

+1291
-0
lines changed

IDAGenDFG.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
import idaapi
2+
import idautils
3+
import idc
4+
import ida_pro
5+
import ida_auto
6+
import os, sys
7+
from libdataflow import ida_dataflow_analysis
8+
from argparse import ArgumentParser
9+
10+
def main(OUTPUT_DIR:str) -> None:
    """
    Run the def-use analysis on every non-library function of the open database.

    Functions are enumerated from the segment named ``.text`` when there is
    one. If no such segment exists, every function of the database is
    processed instead of silently producing no output. Functions flagged
    ``FUNC_LIB`` are reported and skipped, and an analysis failure in one
    function is printed without aborting the remaining ones.

    :param OUTPUT_DIR: directory receiving the generated files; created if missing
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # ``None`` bounds make idautils.Functions() use the database limits.
    text_start = None
    text_end = None
    for seg in idautils.Segments():
        if idc.get_segm_name(seg) == ".text":
            text_start = idc.get_segm_start(seg)
            text_end = idc.get_segm_end(seg)
            break
    else:
        # A 0/0 range would match no function at all, so fall back to
        # the whole database and say so.
        print('No ".text" segment found, analysing all functions of the database')

    for func in idautils.Functions(text_start, text_end):
        func_name = idc.get_func_name(func)

        # Ignore Library Code
        flags = idc.get_func_attr(func, idc.FUNCATTR_FLAGS)
        if flags & idc.FUNC_LIB:
            print(hex(func), "FUNC_LIB", func_name)
            continue

        try:
            ida_dataflow_analysis(func, func_name, OUTPUT_DIR, defuse_only=True)
        except Exception as e:
            # Deliberate best effort: one function that fails to analyse
            # must not stop the batch run.
            print('Skip function {} due to dataflow analysis error: {}'.format(func_name, e))
31+
32+
if __name__ == '__main__':
    # Script entry point when launched through IDA's -S switch.
    if len(idc.ARGV) < 2:
        # idc.ARGV can be empty when the script is not started through -S,
        # so do not index it unconditionally.
        script_name = idc.ARGV[0] if idc.ARGV else 'IDAGenDFG.py'
        print('\n\nGenerating DFG & Def-Use Graph with IDA Pro and MIASM')
        print('\tNeed to specify the output dir with -o option')
        print('\tUsage: /path/to/ida -A -Lida.log -S"{} -o <output_dir>" /path/to/binary\n\n'.format(script_name))
        ida_pro.qexit(1)

    parser = ArgumentParser(description="IDAPython script for generating dataflow graph of each function in the given binary")
    # ``const`` keeps a bare "-o" (no value) from producing None.
    parser.add_argument("-o", "--output_dir", help="Output dir", default='./outputs', nargs='?', const='./outputs')
    # NOTE: a "-s/--symb" switch (symbolic execution mode) is not wired up yet.

    try:
        # The -S arguments live in idc.ARGV; parse them explicitly instead of
        # relying on whatever sys.argv holds inside IDA.
        args = parser.parse_args(idc.ARGV[1:])
    except SystemExit as exc:
        # argparse exits by raising SystemExit; turn it into a real IDA exit so
        # a batch (-A) run does not stay open after a usage error.
        ida_pro.qexit(exc.code if isinstance(exc.code, int) else 1)

    # Wait for auto-analysis to finish before enumerating functions.
    ida_auto.auto_wait()

    main(args.output_dir)

    ida_pro.qexit(0)

LICENSE

Lines changed: 339 additions & 0 deletions
Large diffs are not rendered by default.

README.md

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# ida-dfg
2+
3+
IDA Pro data-flow graph generator
4+
5+
Tested with IDA Pro 7.6 and miasm 7ee593d
6+
7+
## libdataflow.py
8+
9+
封装了两个核心接口给其他脚本用
10+
- `ida_dataflow_analysis`: 面向IDA + MIASM的场景
11+
- `miasm_dataflow_analysis`: 单独使用,不需要IDA Pro
12+
13+
## IDAGenDFG.py
14+
15+
IDAPython调用的脚本
16+
17+
`/path/to/ida -A -Lida.log -S"path/to/IDAGenDFG.py -o <output_dir>" /path/to/binary`
18+
19+
## deprecated/graph_dataflow.py
20+
21+
新版miasm支持的DFG/ReachingDefinitions/DefUse分析
22+
23+
## deprecated/libdfg.py
24+
25+
代码升级 & debug工作停止,因为新版miasm自身支持dfg生成。
26+
27+
但是这部分代码的价值在于学习如何将miasm用到IDAPython里,详见`dataflow_analysis`函数。
28+
29+
30+
31+
## miasm的一些核心概念:
32+
- machine类: 定义架构、反汇编引擎、lifter
33+
- LocationDB类:各类数据结构的loc_key(unique id),例如AsmBlock, IRBlock的loc_key;以及定义了offset和loc_key相互转换的函数
34+
- Instruction类:可以在miasm.core.cpu内查看其成员函数、变量
35+
- AsmCFG类、AsmBlock类:汇编控制流图、基本块
36+
- IRBlock类、AssignBlock类:AsmBlock经Lifter翻译得到IRBlock,每一个IRBlock有若干个AssignBlock
37+
* 每个AssignBlock对应一条IR赋值语句(src -> dst),同时也可以对应回一条汇编指令(assignblk.instr)
38+
39+
## miasm的局限性
40+
41+
- 反汇编较慢
42+
- 无法处理80bit浮点数

libdataflow.py

Lines changed: 276 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,276 @@
1+
import os
2+
from future.utils import viewitems, viewvalues
3+
from utils import guess_machine
4+
5+
from miasm.analysis.binary import Container
6+
from miasm.analysis.machine import Machine
7+
from miasm.expression.expression import get_expr_mem
8+
from miasm.analysis.data_analysis import inter_block_flow #, intra_block_flow_raw
9+
from miasm.core.graph import DiGraph
10+
from miasm.ir.symbexec import SymbolicExecutionEngine
11+
from miasm.analysis.data_flow import DeadRemoval, ReachingDefinitions, DiGraphDefUse
12+
from miasm.core.locationdb import LocationDB
13+
from miasm.core.bin_stream_ida import bin_stream_ida
14+
15+
def intra_block_flow_symb(lifter, _, flow_graph, irblock, in_nodes, out_nodes):
    """
    Create data flow for one IR block using symbolic execution.

    The block is executed symbolically starting from the architecture's
    initial register symbols, then graph nodes and edges are derived from
    the modified destinations. Nodes are ``(irblock.loc_key, index, expr)``
    tuples. The block is also printed to stdout.

    :param lifter: miasm lifter; gives access to ``arch.regs.regs_init``
    :param _: unused (slot taken by the ircfg in the callback signature)
    :param flow_graph: graph receiving the nodes and edges
    :param irblock: IR block to analyse
    :param in_nodes: dict updated in place: read expression -> its node
    :param out_nodes: dict updated in place: written expression -> its node

    NOTE(review): the nesting below was restored from a listing whose
    indentation had been stripped -- confirm against the repository.
    """
    symbols_init = lifter.arch.regs.regs_init.copy()
    sb = SymbolicExecutionEngine(lifter, symbols_init)
    sb.eval_updt_irblock(irblock)
    # Debug output: separator line followed by the block itself
    print('*' * 40)
    print(irblock)


    out = sb.modified(mems=False)
    # Never filled in this function, so the "in current_nodes" branches
    # below are never taken.
    current_nodes = {}
    # Gen mem arg to mem node links
    for dst, src in out:
        # The unpacked src is immediately replaced by the evaluated value
        src = sb.eval_expr(dst)
        for n in [dst, src]:

            # NOTE(review): the set is re-created on each iteration, so only
            # the memory expressions of the last item (src) remain -- confirm
            # that this is intended.
            all_mems = set()
            all_mems.update(get_expr_mem(n))

        for n in all_mems:
            node_n_w = (irblock.loc_key, 0, n)
            # Only a memory expression equal to the whole source is linked
            if not n == src:
                continue
            o_r = n.ptr.get_r(mem_read=False, cst_read=True)
            for i, n_r in enumerate(o_r):
                if n_r in current_nodes:
                    node_n_r = current_nodes[n_r]
                else:
                    node_n_r = (irblock.loc_key, i, n_r)
                if not n_r in in_nodes:
                    in_nodes[n_r] = node_n_r
                flow_graph.add_uniq_edge(node_n_r, node_n_w)

    # Gen data flow links
    # NOTE(review): `out` was already iterated above and is unpacked there as
    # (dst, src) pairs; if sb.modified() returns a one-shot iterator this loop
    # sees nothing, and otherwise `dst` is bound to a whole pair -- confirm.
    for dst in out:
        src = sb.eval_expr(dst)
        nodes_r = src.get_r(mem_read=False, cst_read=True)
        nodes_w = set([dst])
        for n_r in nodes_r:
            if n_r in current_nodes:
                node_n_r = current_nodes[n_r]
            else:
                node_n_r = (irblock.loc_key, 0, n_r)
            if not n_r in in_nodes:
                in_nodes[n_r] = node_n_r

            flow_graph.add_node(node_n_r)
            for n_w in nodes_w:
                node_n_w = (irblock.loc_key, 1, n_w)
                out_nodes[n_w] = node_n_w

                flow_graph.add_node(node_n_w)
                flow_graph.add_uniq_edge(node_n_r, node_n_w)
67+
68+
69+
70+
def intra_block_flow_raw(lifter, ircfg, flow_graph, irb, in_nodes, out_nodes):
    """
    Create data flow for an irbloc using raw IR expressions

    Each assignment block of ``irb`` is visited in order. Graph nodes are
    ``(hex(instruction offset), index, expr)`` tuples, where ``index`` is the
    position of the assignment block for reads and that position plus one for
    writes.

    :param lifter: unused in this function (part of the callback signature)
    :param ircfg: unused in this function (part of the callback signature)
    :param flow_graph: graph receiving the nodes and edges
    :param irb: IR block to analyse
    :param in_nodes: dict updated in place: expression read before being
        defined in this block -> its node
    :param out_nodes: dict updated in place: written expression -> node of
        its latest definition in this block
    """
    # expression -> node currently holding its value inside this block
    current_nodes = {}
    for i, assignblk in enumerate(irb):
        dict_rw = assignblk.get_rw(cst_read=True)
        # Definitions made by earlier assignment blocks become visible here
        current_nodes.update(out_nodes)

        # gen mem arg to mem node links
        # Not reset per written expression: it accumulates over dict_rw
        all_mems = set()
        for node_w, nodes_r in viewitems(dict_rw):
            for n in nodes_r.union([node_w]):
                all_mems.update(get_expr_mem(n))
            if not all_mems:
                continue

            for n in all_mems:
                node_n_w = (hex(assignblk.instr.offset), i, n)
                # Only memory expressions read by this assignment are linked
                if not n in nodes_r:
                    continue
                # Expressions read by the memory pointer
                o_r = n.ptr.get_r(mem_read=False, cst_read=True)
                for n_r in o_r:
                    if n_r in current_nodes:
                        node_n_r = current_nodes[n_r]
                    else:
                        # First use with no known definition: block input
                        node_n_r = (hex(assignblk.instr.offset), i, n_r)
                        current_nodes[n_r] = node_n_r
                        in_nodes[n_r] = node_n_r
                    flow_graph.add_uniq_edge(node_n_r, node_n_w)

        # gen data flow links
        for node_w, nodes_r in viewitems(dict_rw):
            for n_r in nodes_r:
                if n_r in current_nodes:
                    node_n_r = current_nodes[n_r]
                else:
                    # First use with no known definition: block input
                    node_n_r = (hex(assignblk.instr.offset), i, n_r)
                    current_nodes[n_r] = node_n_r
                    in_nodes[n_r] = node_n_r

                flow_graph.add_node(node_n_r)

                # Written value gets index i + 1 to tell it apart from reads
                node_n_w = (hex(assignblk.instr.offset), i + 1, node_w)
                out_nodes[node_w] = node_n_w

                flow_graph.add_node(node_n_w)
                flow_graph.add_uniq_edge(node_n_r, node_n_w)
118+
119+
120+
121+
def node2str(node):
    """
    Render a 3-item flow-graph node as its dot label text.

    The first two items are joined by a comma, followed by a literal
    ``\\l\\`` and a newline, then the third item.
    """
    return "%s,%s\\l\\\n%s" % node
124+
125+
126+
def gen_function_data_flow_graph(lifter, ircfg, ad, block_flow_cb) -> DiGraph:
    '''
    Build the data flow graph of one function.

    ``block_flow_cb`` is called once per IR block to produce the intra-block
    flow, then ``inter_block_flow`` links the blocks starting from the block
    located at offset ``ad``.

    :param lifter: miasm lifter, forwarded to the callbacks
    :param ircfg: IR control flow graph of the function
    :param ad: offset of the function entry block
    :param block_flow_cb: callable(lifter, ircfg, graph, irblock, in_nodes, out_nodes)
    :return: the populated DiGraph, with ``node2str`` as its node formatter
    :raises AssertionError: if no block of ``ircfg`` is located at ``ad``
    '''
    # Find the block whose location offset is the function address
    entry_block = next(
        (
            candidate
            for candidate in viewvalues(ircfg.blocks)
            if ircfg.loc_db.get_location_offset(candidate.loc_key) == ad
        ),
        None,
    )
    assert entry_block is not None

    graph = DiGraph()
    graph.node2str = node2str

    # One private in/out node table per block, filled by the callback
    inputs_by_block = {key: {} for key in ircfg.blocks}
    outputs_by_block = {key: {} for key in ircfg.blocks}

    for key, block in viewitems(ircfg.blocks):
        block_flow_cb(lifter, ircfg, graph, block, inputs_by_block[key], outputs_by_block[key])

    inter_block_flow(lifter, ircfg, graph, entry_block.loc_key, inputs_by_block, outputs_by_block)

    return graph
161+
162+
163+
def _write_text(path: str, text: str) -> None:
    """Write ``text`` to ``path`` and close the file before returning."""
    with open(path, 'w') as f:
        f.write(text)


def _dataflow_analysis(machine, bin_stream, loc_db, function_addr: int, function_name: str, output_dir: str, defuse_only: bool) -> None:
    """
    Tool-independent part shared by the IDA and the standalone entry points.

    Disassembles the function at ``function_addr``, lifts it to IR and writes
    into ``output_dir``:

    - ``<name>.asm2ir``: one line per IR assignment (offset, asm, src -> dst)
    - ``<name>_dfg.dot``: data flow graph, skipped when ``defuse_only`` is true
    - ``<name>_defuse.dot``: def-use graph
    - ``<name>_LocKeyIdx2InstrOffset.map``: ``<loc_key>_<index>:<offset>`` lines
    """
    mdis = machine.dis_engine(bin_stream, loc_db=loc_db, dont_dis_nulstart_bloc=True)
    mdis.follow_call = True
    lifter = machine.lifter_model_call(loc_db=loc_db)

    print('disassembling function: {}:{}'.format(hex(function_addr), function_name))
    asmcfg = mdis.dis_multiblock(function_addr)

    print('generating IR...')
    ircfg = lifter.new_ircfg_from_asmcfg(asmcfg)
    # TODO: running DeadRemoval(lifter) on ircfg removes part of the IR; this
    # needs to be investigated before it is enabled.

    with open(os.path.join(output_dir, '{}.asm2ir'.format(function_name)), 'w') as f:
        f.write('\tOFFSET\t| ASM\t| SRC -> DST\n')
        for irblock in viewvalues(ircfg.blocks):
            for assignblk in irblock:
                for dst, src in assignblk.iteritems():
                    f.write('\t{}\t| {}\t| {} -> {}\n'.format(hex(assignblk.instr.offset), assignblk.instr, src, dst))

    if not defuse_only:
        # Only the raw-IR flavour is wired up; intra_block_flow_symb is the
        # symbolic-execution alternative.
        dfg = gen_function_data_flow_graph(lifter, ircfg, function_addr, intra_block_flow_raw)
        _write_text(os.path.join(output_dir, '{}_dfg.dot'.format(function_name)), dfg.dot())

    reaching_defs = ReachingDefinitions(ircfg)
    defuse = DiGraphDefUse(reaching_defs)
    _write_text(os.path.join(output_dir, '{}_defuse.dot'.format(function_name)), defuse.dot())

    # The instruction offset can be derived from block loc_key + assignblk
    # index, so the def-use graph can be mapped back to instructions.
    mapping_lines = [
        '{}_{}:{}'.format(block.loc_key, index, hex(assignblk.instr.offset))
        for block in viewvalues(reaching_defs.ircfg.blocks)
        for index, assignblk in enumerate(block)
    ]
    _write_text(
        os.path.join(output_dir, '{}_LocKeyIdx2InstrOffset.map'.format(function_name)),
        '\n'.join(mapping_lines))


def ida_dataflow_analysis(function_addr:int, function_name:str, output_dir:str, defuse_only: bool = False) -> None:
    """
    Analyse one function of the database currently open in IDA Pro.

    :param function_addr: address of the function entry
    :param function_name: name used as prefix of the generated files
    :param output_dir: existing directory receiving the generated files
    :param defuse_only: when true, skip the data flow graph and only emit the
        def-use graph and the loc_key/offset map
    """
    loc_db = LocationDB()

    ###################### IDA specific #######################
    machine = guess_machine()
    bin_stream = bin_stream_ida()

    # Populate symbols with ida names
    # Imported here so the module can still be imported outside IDA.
    import idautils
    for ad, name in idautils.Names():
        if name is None:
            continue
        loc_db.add_location(name, ad)

    ###################### Reverse-tool-independent ######################
    _dataflow_analysis(machine, bin_stream, loc_db, function_addr, function_name, output_dir, defuse_only)


def miasm_dataflow_analysis(function_addr:int, function_name:str, output_dir:str, filepath:str, arch:str = "X86_64", defuse_only: bool = False) -> None:
    """
    Analyse one function of a binary file with miasm alone (no IDA Pro needed).

    :param function_addr: address of the function entry
    :param function_name: name used as prefix of the generated files
    :param output_dir: existing directory receiving the generated files
    :param filepath: path of the binary to load
    :param arch: miasm machine name; matched case-insensitively
    :param defuse_only: when true, skip the data flow graph and only emit the
        def-use graph and the loc_key/offset map
    """
    # The location database must exist before the container is built, and the
    # same instance is shared with the disassembler and the lifter.
    loc_db = LocationDB()
    with open(filepath, 'rb') as stream:
        bin_stream = Container.from_stream(stream, loc_db).bin_stream

    # miasm machine names are lowercase (e.g. "x86_64"), so normalise the
    # value to keep the "X86_64" default usable.
    machine = Machine(arch.lower())

    _dataflow_analysis(machine, bin_stream, loc_db, function_addr, function_name, output_dir, defuse_only)
275+
276+

testcase/test

85 KB
Binary file not shown.

testcase/test-idapython.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import idc
2+
import idautils
3+
import idaapi
4+
import ida_pro
5+
import ida_auto
6+
# Block until IDA's auto-analysis is done.
ida_auto.auto_wait()

# Print "<address> : <name>" for every function of the database.
for ea in idautils.Functions():
    print(hex(ea), ':', idc.get_func_name(ea))

# Leave IDA with exit status 0.
ida_pro.qexit(0)

0 commit comments

Comments
 (0)