Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
ee0e245
Update test binary to include more data reference situations
NicolaasWeideman Sep 2, 2022
2799723
Add some test cases for data references
NicolaasWeideman Sep 2, 2022
e733c87
Merge branch 'main' into testing/data_references
NicolaasWeideman Sep 2, 2022
2bd9bf2
Add code to compute communities
NicolaasWeideman Sep 2, 2022
32a587d
Update driver
NicolaasWeideman Sep 2, 2022
ee6da83
add function data references
SimaArasteh Sep 8, 2022
6ce1d2d
create matrix dissimilarity
SimaArasteh Sep 10, 2022
fff27d3
Merge branch 'testing/data_references' into df
NicolaasWeideman Sep 10, 2022
309a73f
Fix issue with CFGEmulated including callee function graphs
NicolaasWeideman Sep 10, 2022
adba150
Update tests to use new API
NicolaasWeideman Sep 10, 2022
c920102
Remove whitespace
NicolaasWeideman Sep 10, 2022
bb9498a
Merged data references and dissimilarity matrix into community comput…
NicolaasWeideman Sep 10, 2022
b5ec9e1
Fix graph node equality assertion
NicolaasWeideman Sep 10, 2022
331ba8a
Fix bug in matrix Mc computation
NicolaasWeideman Sep 13, 2022
2c0b4d4
Add option to supply BCDangr with proj, CFG and ELF file
NicolaasWeideman Sep 13, 2022
85f7e7b
Remove print statements
NicolaasWeideman Sep 13, 2022
e300136
Move data ref extraction into its own class and implement cache.
NicolaasWeideman Sep 14, 2022
e1ffc4b
Update Data ref pairs to use new data ref class
NicolaasWeideman Sep 14, 2022
2097397
Improve visualization of results in driver
NicolaasWeideman Sep 14, 2022
13a79f4
Merge branch 'dev/merge/communities_computation/data_refs/base' into …
NicolaasWeideman Sep 14, 2022
c574c2c
Update data reference extraction: do not sort data references, includ…
NicolaasWeideman Sep 19, 2022
8d690e1
Update data reference function pair calculation to remove duplicate d…
NicolaasWeideman Sep 19, 2022
92844bc
Update BCD to work with new data reference API
NicolaasWeideman Sep 19, 2022
b50a909
Update most valuable edge calculation for communities calculation
NicolaasWeideman Sep 19, 2022
43bbabd
Add missing import
NicolaasWeideman Sep 19, 2022
7be2c82
Add option to driver to limit number of iterations
NicolaasWeideman Sep 19, 2022
255aaab
Merge pull request #4 from usc-isi-bass/dev/merge/communities_computa…
NicolaasWeideman Sep 19, 2022
f072f03
Merge pull request #5 from usc-isi-bass/dev/merge/communities_computa…
NicolaasWeideman Oct 3, 2022
409bc17
Add code to filter out PLT functions, alignment bytes and simprocedur…
NicolaasWeideman Oct 3, 2022
5490d8d
Fix bug in setting the weight of decomposition graph edges
NicolaasWeideman Oct 3, 2022
e0af509
Switch to nx.greedy_modularity_communities algorithm
NicolaasWeideman Oct 3, 2022
a6aa7ff
Add code to draw community graph
NicolaasWeideman Oct 17, 2022
23a287a
validate angrBCD on ardupilot
SimaArasteh Oct 31, 2022
90f5319
fix a bug in computing compilation unit
SimaArasteh Nov 7, 2022
e51d581
Relabel nodes in community graph to match community index in output.
NicolaasWeideman Nov 7, 2022
c101dde
Merge branch 'dev/communities_computation/alternative_community_detec…
NicolaasWeideman Nov 7, 2022
0970c20
fix a bug in bcd datareference
SimaArasteh Nov 19, 2022
840a52e
fix a bug in data references
SimaArasteh Nov 20, 2022
5480083
fix a bug in data_ref_extraction
SimaArasteh Nov 21, 2022
f4277d3
add module for computing binary compilation unit
SimaArasteh Nov 21, 2022
6ce1884
add module for bcd modularization evaluation
SimaArasteh Nov 21, 2022
e481501
add module to apply and evaluate ardupilot
SimaArasteh Nov 21, 2022
8f889a2
fix a bug in evaluation
SimaArasteh Nov 30, 2022
0af0faa
module to evaluate autocps binaries
SimaArasteh Dec 4, 2022
bc85f88
update scoring function
SimaArasteh Dec 13, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 98 additions & 20 deletions bcd/bcd_angr.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,46 @@
import angr
import textdistance
import networkx as nx
import numpy as np
from operator import itemgetter

import itertools
from elftools.elf.elffile import ELFFile
from bcd.data_ref_function_pair_property_calculator import DataRefFunctionPairPropertyCalulator
from bcd.call_function_pair_property_calculator import CallFunctionPairPropertyCalulator
from bcd.sections import Section

import itertools

class BCDangr:

def __init__(self, bin_path):
def __init__(self, bin_path, proj=None, cfg=None, elffile=None):
self._bin_path = bin_path
self._proj = angr.Project(bin_path, auto_load_libs=False)

self._cfg = self._proj.analyses.CFGFast(normalize=True)

self._func_list = sorted(self._cfg.functions.keys())
if proj is None:
self._proj = angr.Project(bin_path, auto_load_libs=False)
else:
self._proj = proj
if cfg is None:
self._cfg = self._proj.analyses.CFGFast(normalize=True)
else:
self._cfg = cfg
if elffile is None:
self.elffile = ELFFile(open(bin_path, 'rb'))
else:
self.elffile = elffile


func_list = []
for func_addr, func in self._cfg.functions.items():
if not (func.alignment or func.is_plt or func.is_simprocedure):
func_list.append(func_addr)

self._func_list = sorted(func_list)
self._num_funcs = len(self._func_list)

self._drfpp = DataRefFunctionPairPropertyCalulator(self._proj, self._cfg, self._func_list)
self._cfpp = CallFunctionPairPropertyCalulator(self._proj, self._cfg, self._func_list)
self.sections = self.elffile.iter_sections()
self.section_offsets = [Section(sec).compute_section_offsets() for sec in self.sections]
self._drfpp = DataRefFunctionPairPropertyCalulator(self._bin_path, self._proj, self._cfg, self._func_list, self.section_offsets)
self._cfpp = CallFunctionPairPropertyCalulator(self._proj, self._cfg, self._func_list, self.section_offsets)

self._sequence_graph = self._compute_sequence_graph()
self._data_reference_graph = self._compute_data_reference_graph()
Expand All @@ -33,10 +54,51 @@ def __init__(self, bin_path):

self._matrix_penalty = self._compute_penalty_matrix()

# TODO Compute final edge weight matrix
def get_communities(self, alpha, beta, gamma):
communities_iter = self._get_communities(alpha, beta, gamma)

# Translate community back to function addresses
for community_tup in communities_iter:
yield tuple(set(self._func_list[i] for i in community_set) for community_set in community_tup)

def _get_communities(self, alpha, beta, gamma):
assert np.isclose(alpha + beta + gamma, 1.0), "Sum of alpha, beta and gamma should be 1, but instead it is: {}".format(alpha + beta + gamma)
H, W = self._calculate_decomposition_graph(alpha, beta, gamma)
nx.set_edge_attributes(H, {(u, v): {'weight':W[u][v]} for u, v in H.edges()})
def find_mve(G):
u, v, w = max(G.edges(data="weight"), key=itemgetter(2))
mve_nodes = (u, v)
return mve_nodes

#communities = nx.algorithms.community.girvan_newman(H, most_valuable_edge=find_mve)
communities = iter([nx.algorithms.community.greedy_modularity_communities(H, weight='weight')])

return communities


def _calculate_decomposition_graph(self, alpha, beta, gamma):
G_s = self._sequence_graph
G_d = self._data_reference_graph
G_c = self._call_graph

assert set(G_s.nodes()) == set(G_d.nodes()) == set(G_c.nodes()), "Gs: {} Gd: {} Gc: {}".format(G_s.nodes(), G_d.nodes(), G_c.nodes())
H = nx.compose(G_s, nx.compose(G_d, G_c))
assert set(H.nodes()) == set(G_s.nodes())

W = self._calculate_final_weight_matrix(alpha, beta, gamma)
return H, W

def _calculate_final_weight_matrix(self, alpha, beta, gamma):
N = np.array(self._matrix_penalty)
M_s = np.array(self._matrix_sequence)
M_c = np.array(self._matrix_call)
M_d = np.array(self._matrix_data_reference)
rho_d = np.array(self._matrix_dissimilarity_score)

W = np.multiply(N, alpha * M_s + beta * M_c + gamma * (np.multiply(rho_d, M_d)))

return W

# TODO: use Newman's algorithm to compute components
self._components = None

# Graph Calculation

Expand All @@ -53,11 +115,18 @@ def _compute_sequence_graph(self):

def _compute_data_reference_graph(self):
drg = nx.DiGraph()

drg.add_nodes_from(range(self._num_funcs))

for i in range(self._num_funcs):
for j in range(i + 1, self._num_funcs):
if self._drfpp.get_property(i, j) > 0:
#print(i)
#print(j)
#dfi = self._drfpp.compute_function_data_references(self._func_list[i])
#dfj = self._drfpp.compute_function_data_references(self._func_list[j])
#drg.nodes[i]['df'] = dfi
#drg.nodes[j]['df'] = dfj
if len(self._drfpp.get_property(i, j)) > 0:
drg.add_edge(i, j)
drg.add_edge(j, i)
return drg
Expand Down Expand Up @@ -88,16 +157,17 @@ def _compute_matrix_data_reference(self):
m = [[None for i in range(self._num_funcs)] for j in range(self._num_funcs)]
for (i, j) in itertools.product(range(self._num_funcs), repeat=2):
if self._data_reference_graph.has_edge(i, j):
m[i][j] = self._drfpp.get_property(i, j)
m[i][j] = len(self._drfpp.get_property(i, j))
else:
m[i][j] = 0

assert all([c is not None for r in m for c in r])
return m
return m

def _compute_matrix_call(self):
m = [[None for i in range(self._num_funcs)] for j in range(self._num_funcs)]
for (i, j) in itertools.product(range(self._num_funcs), repeat=2):
if self._data_reference_graph.has_edge(i, j):
if self._call_graph.has_edge(i, j):
m[i][j] = self._cfpp.get_property(i, j)
else:
m[i][j] = 0
Expand All @@ -107,14 +177,19 @@ def _compute_matrix_call(self):
def _compute_matrix_dissimilarity_score(self):
rho = [[None for i in range(self._num_funcs)] for j in range(self._num_funcs)]
for (i, j) in itertools.product(range(self._num_funcs), repeat=2):
if self._data_reference_graph.has_edge(i, j): # TODO Check Di or Dj have length > 0
# TODO
rho[i][j] = -1
Di = self._drfpp.compute_function_data_references(self._func_list[i])
Dj = self._drfpp.compute_function_data_references(self._func_list[j])
p = len(Di)
q = len(Dj)
#print(self._data_reference_graph.has_edge(i, j))
if self._data_reference_graph.has_edge(i, j) and max(p,q) > 0:
rho[i][j] = 1 - (self.levenshtein_distance(Di,Dj)/max(p,q))
#print(rho[i][j])
else:
rho[i][j] = 0
assert all([c is not None for r in rho for c in r])
return rho

def _compute_penalty_matrix(self):
N = [[None for i in range(self._num_funcs)] for j in range(self._num_funcs)]
for (i, j) in itertools.product(range(self._num_funcs), repeat=2):
Expand All @@ -124,3 +199,6 @@ def _compute_penalty_matrix(self):
N[i][j] = 1
assert all([c is not None for r in N for c in r])
return N
def levenshtein_distance(self,arr1, arr2):
return textdistance.levenshtein.distance(arr1,arr2)

12 changes: 7 additions & 5 deletions bcd/call_function_pair_property_calculator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@

from bcd.function_pair_property_calculator import FunctionPairPropertyCalculator


class CallFunctionPairPropertyCalulator(FunctionPairPropertyCalculator):

def _get_property(self, i, j):
Expand All @@ -10,11 +11,12 @@ def _get_property(self, i, j):
func1 = self._cfg.functions.function(addr=func1_addr)

num_calls_to_func2 = 0
for call_site_addr in func1.get_call_sites():
call_target = func1.get_call_target(call_site_addr)
assert call_target is not None
if call_target == func2_addr:
num_calls_to_func2 += 1
if func1 is not None:
for call_site_addr in func1.get_call_sites():
call_target = func1.get_call_target(call_site_addr)
assert call_target is not None
if call_target == func2_addr:
num_calls_to_func2 += 1

return num_calls_to_func2

Expand Down
137 changes: 137 additions & 0 deletions bcd/data_ref_extraction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@

from elftools.elf.elffile import ELFFile

class DataRefExtraction:
    """Extract data references (addresses into .bss, .rodata or .data) for
    functions in a binary, with per-function caching.

    A "data reference" here is a PC/RIP-relative memory operand resolved to an
    offset from the binary's load base and validated against the section
    bounds supplied at construction time.
    """

    def __init__(self, bin_path, proj, cfg, func_list, section_offsets):
        # Path to the binary on disk; re-opened to read the ELF machine arch.
        self._bin_path = bin_path
        self._proj = proj
        self._cfg = cfg
        self.func_list = func_list
        # Iterable of (section_name, bounds) entries, where bounds is indexable
        # as (low_address, high_address) — see Section.compute_section_offsets.
        self.section_offsets = section_offsets

        # func_address -> list of data-reference offsets, filled lazily.
        self._cache = {}

    def compute_function_data_references(self, func_address):
        """Return the (cached) list of data references made by the function
        starting at ``func_address``."""
        if func_address not in self._cache:
            self._cache[func_address] = self._compute_function_data_references(func_address)
        return self._cache[func_address]

    def _compute_function_data_references(self, func_address):
        """Disassemble the function at ``func_address`` and collect its
        PC/RIP-relative data references that fall inside a known section.

        Returns an empty list for architectures other than x64 and ARM.
        """
        sec_offsets = self.dic_section_offsets()
        base_address = self._proj.loader.main_object.min_addr

        func = self._cfg.functions.function(addr=func_address)
        # Blocks are not guaranteed to be sorted by address by default, and
        # the extraction below relies on instruction order to find the address
        # of the *next* instruction (the value of PC/RIP).
        func_blocks = sorted(func.blocks, key=lambda b: b.addr)
        instructions = [ins for block in func_blocks for ins in block.capstone.insns]

        with open(self._bin_path, 'rb') as f:
            arch = ELFFile(f).get_machine_arch()

        if arch == 'x64':
            return self.function_references_for_amd(instructions, base_address, sec_offsets)
        if arch == 'ARM':
            return self.function_references_for_arm(instructions, base_address, sec_offsets)
        return []

    def function_references_for_amd(self, instructions, base_adrs, sec_offsets):
        """Collect RIP-relative data references from x86-64 instructions.

        An operand like ``[rip + 0x1234]`` resolves to the address of the
        following instruction plus the displacement, rebased against the load
        address ``base_adrs``. Only positive displacements are handled.
        """
        function_references = []
        for idx, instruct in enumerate(instructions):
            if 'rip' not in instruct.op_str or '[' not in instruct.op_str:
                continue
            for part in instruct.op_str.split(","):
                if 'rip' not in part:
                    continue
                whole_address = part.split("[")[-1][:-1]
                # A following instruction is needed to know the value of RIP.
                if '+' in whole_address and idx + 1 < len(instructions):
                    offset = int(whole_address.split("+")[-1].strip(), 16)
                    rip = instructions[idx + 1].address
                    data_reference = rip + offset - base_adrs
                    # BUG FIX: the original passed hex(data_reference) — a str —
                    # which raised TypeError when compared against the integer
                    # section bounds. Pass the integer address instead.
                    if self.check_validity_data_references(data_reference, sec_offsets):
                        function_references.append(data_reference)
        return function_references

    def function_references_for_arm(self, instrucs, base_adr, sec_offsets):
        """Collect PC-relative data references from ARM instructions.

        Handles operands of the form ``[pc, #0x...]``: the literal offset is
        added to the address of the following instruction and rebased against
        the load address ``base_adr``.
        """
        func_refs = []
        for idx, instruct in enumerate(instrucs):
            if 'pc' not in instruct.op_str or '[' not in instruct.op_str:
                continue
            for part in instruct.op_str.split("["):
                if 'pc' not in part:
                    continue
                mnemo_parts = part.replace("]", "").split(",")
                if len(mnemo_parts) > 1:
                    offset = mnemo_parts[-1].strip().replace("#", '')
                    # Only hex literal offsets are handled; register offsets
                    # (e.g. [pc, r3]) are skipped.
                    if idx + 1 < len(instrucs) and offset.startswith('0x'):
                        pc = instrucs[idx + 1].address
                        data_reference = pc + int(offset, 16) - base_adr
                        # BUG FIX: pass the integer address, not hex(...),
                        # so the section-bounds comparison is int vs int.
                        if self.check_validity_data_references(data_reference, sec_offsets):
                            func_refs.append(data_reference)
        return func_refs

    def dic_section_offsets(self):
        """Return a dict mapping section name -> offset bounds, built from the
        (name, bounds) entries in ``self.section_offsets``."""
        return {entry[0]: entry[1] for entry in self.section_offsets}

    def check_validity_data_references(self, data_ref, dictionary_sections):
        """Return True iff the integer address ``data_ref`` lies within the
        (inclusive) bounds of .bss, .rodata or .data per
        ``dictionary_sections``; sections absent from the dict are skipped."""
        for name in ('.bss', '.rodata', '.data'):
            if name in dictionary_sections:
                low = dictionary_sections[name][0]
                high = dictionary_sections[name][1]
                if low <= data_ref <= high:
                    return True
        return False
25 changes: 21 additions & 4 deletions bcd/data_ref_function_pair_property_calculator.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,28 @@

import angr
import itertools
from bcd.function_pair_property_calculator import SymmetricFunctionPairPropertyCalculator
from bcd.data_ref_extraction import DataRefExtraction
from elftools.elf.elffile import ELFFile

class DataRefFunctionPairPropertyCalulator(SymmetricFunctionPairPropertyCalculator, DataRefExtraction):

class DataRefFunctionPairPropertyCalulator(SymmetricFunctionPairPropertyCalculator):
def __init__(self, bin_path, proj, cfg, func_list, section_offsets):
SymmetricFunctionPairPropertyCalculator.__init__(self, proj, cfg, func_list, section_offsets)
DataRefExtraction.__init__(self, bin_path, proj, cfg, func_list, section_offsets)

def _get_property(self, i, j):
func1 = self._func_list[i]
func2 = self._func_list[j]
# TODO: return data references common to func1 and func2
func1_df = set(self.compute_function_data_references(func1))
func2_df = set(self.compute_function_data_references(func2))

return self.common_elements(func1_df, func2_df)

return 0
def common_elements(self, l1, l2):
l1_set = set(l1)
l2_set = set(l2)

if (l1_set & l2_set):
return list(l1_set & l2_set)
else:
return []
Loading