Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
ee0e245
Update test binary to include more data reference situations
NicolaasWeideman Sep 2, 2022
2799723
Add some test cases for data references
NicolaasWeideman Sep 2, 2022
e733c87
Merge branch 'main' into testing/data_references
NicolaasWeideman Sep 2, 2022
2bd9bf2
Add code to compute communities
NicolaasWeideman Sep 2, 2022
32a587d
Update driver
NicolaasWeideman Sep 2, 2022
ee6da83
add function data references
SimaArasteh Sep 8, 2022
6ce1d2d
create matrix dissimilarity
SimaArasteh Sep 10, 2022
fff27d3
Merge branch 'testing/data_references' into df
NicolaasWeideman Sep 10, 2022
309a73f
Fix issue with CFGEmulated including callee function graphs
NicolaasWeideman Sep 10, 2022
adba150
Update tests to use new API
NicolaasWeideman Sep 10, 2022
c920102
Remove whitespace
NicolaasWeideman Sep 10, 2022
bb9498a
Merged data references and dissimilarity matrix into community comput…
NicolaasWeideman Sep 10, 2022
b5ec9e1
Fix graph node equality assertion
NicolaasWeideman Sep 10, 2022
331ba8a
Fix bug in matrix Mc computation
NicolaasWeideman Sep 13, 2022
2c0b4d4
Add option to supply BCDangr with proj, CFG and ELF file
NicolaasWeideman Sep 13, 2022
85f7e7b
Remove print statements
NicolaasWeideman Sep 13, 2022
e300136
Move data ref extraction into its own class and implement cache.
NicolaasWeideman Sep 14, 2022
e1ffc4b
Update Data ref pairs to use new data ref class
NicolaasWeideman Sep 14, 2022
2097397
Improve visualization of results in driver
NicolaasWeideman Sep 14, 2022
13a79f4
Merge branch 'dev/merge/communities_computation/data_refs/base' into …
NicolaasWeideman Sep 14, 2022
c574c2c
Update data reference extraction: do not sort data references, includ…
NicolaasWeideman Sep 19, 2022
8d690e1
Update data reference function pair calculation to remove duplicate d…
NicolaasWeideman Sep 19, 2022
92844bc
Update BCD to work with new data reference API
NicolaasWeideman Sep 19, 2022
b50a909
Update most valuable edge calculation for communities calculation
NicolaasWeideman Sep 19, 2022
43bbabd
Add missing import
NicolaasWeideman Sep 19, 2022
7be2c82
Add option to driver to limit number of iterations
NicolaasWeideman Sep 19, 2022
255aaab
Merge pull request #4 from usc-isi-bass/dev/merge/communities_computa…
NicolaasWeideman Sep 19, 2022
f072f03
Merge pull request #5 from usc-isi-bass/dev/merge/communities_computa…
NicolaasWeideman Oct 3, 2022
409bc17
Add code to filter out PLT functions, alignment bytes and simprocedur…
NicolaasWeideman Oct 3, 2022
5490d8d
Fix bug in setting the weight of decomposition graph edges
NicolaasWeideman Oct 3, 2022
e0af509
Switch to nx.greedy_modularity_communities algorithm
NicolaasWeideman Oct 3, 2022
a6aa7ff
Add code to draw community graph
NicolaasWeideman Oct 17, 2022
23a287a
validate angrBCD on ardupilot
SimaArasteh Oct 31, 2022
90f5319
fix a bug in computing compilation unit
SimaArasteh Nov 7, 2022
e51d581
Relabel nodes in community graph to match community index in output.
NicolaasWeideman Nov 7, 2022
c101dde
Merge branch 'dev/communities_computation/alternative_community_detec…
NicolaasWeideman Nov 7, 2022
0970c20
fix a bug in bcd datareference
SimaArasteh Nov 19, 2022
840a52e
fix a bug in data references
SimaArasteh Nov 20, 2022
5480083
fix a bug in data_ref_extraction
SimaArasteh Nov 21, 2022
f4277d3
add module for computing binary compilation unit
SimaArasteh Nov 21, 2022
6ce1884
add module for bcd modularization evaluation
SimaArasteh Nov 21, 2022
e481501
add module to apply and evaluate ardupilot
SimaArasteh Nov 21, 2022
8f889a2
fix a bug in evaluation
SimaArasteh Nov 30, 2022
0af0faa
module to evaluate autocps binaries
SimaArasteh Dec 4, 2022
bc85f88
update scoring function
SimaArasteh Dec 13, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 98 additions & 20 deletions bcd/bcd_angr.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,46 @@
import angr
import textdistance
import networkx as nx
import numpy as np
from operator import itemgetter

import itertools
from elftools.elf.elffile import ELFFile
from bcd.data_ref_function_pair_property_calculator import DataRefFunctionPairPropertyCalulator
from bcd.call_function_pair_property_calculator import CallFunctionPairPropertyCalulator
from bcd.sections import Section

import itertools

class BCDangr:

def __init__(self, bin_path):
def __init__(self, bin_path, proj=None, cfg=None, elffile=None):
self._bin_path = bin_path
self._proj = angr.Project(bin_path, auto_load_libs=False)

self._cfg = self._proj.analyses.CFGFast(normalize=True)

self._func_list = sorted(self._cfg.functions.keys())
if proj is None:
self._proj = angr.Project(bin_path, auto_load_libs=False)
else:
self._proj = proj
if cfg is None:
self._cfg = self._proj.analyses.CFGFast(normalize=True)
else:
self._cfg = cfg
if elffile is None:
self.elffile = ELFFile(open(bin_path, 'rb'))
else:
self.elffile = elffile


func_list = []
for func_addr, func in self._cfg.functions.items():
if not (func.alignment or func.is_plt or func.is_simprocedure):
func_list.append(func_addr)

self._func_list = sorted(func_list)
self._num_funcs = len(self._func_list)

self._drfpp = DataRefFunctionPairPropertyCalulator(self._proj, self._cfg, self._func_list)
self._cfpp = CallFunctionPairPropertyCalulator(self._proj, self._cfg, self._func_list)
self.sections = self.elffile.iter_sections()
self.section_offsets = [Section(sec).compute_section_offsets() for sec in self.sections]
self._drfpp = DataRefFunctionPairPropertyCalulator(self._bin_path, self._proj, self._cfg, self._func_list, self.section_offsets)
self._cfpp = CallFunctionPairPropertyCalulator(self._proj, self._cfg, self._func_list, self.section_offsets)

self._sequence_graph = self._compute_sequence_graph()
self._data_reference_graph = self._compute_data_reference_graph()
Expand All @@ -33,10 +54,51 @@ def __init__(self, bin_path):

self._matrix_penalty = self._compute_penalty_matrix()

# TODO Compute final edge weight matrix
def get_communities(self, alpha, beta, gamma):
communities_iter = self._get_communities(alpha, beta, gamma)

# Translate community back to function addresses
for community_tup in communities_iter:
yield tuple(set(self._func_list[i] for i in community_set) for community_set in community_tup)

def _get_communities(self, alpha, beta, gamma):
assert np.isclose(alpha + beta + gamma, 1.0), "Sum of alpha, beta and gamma should be 1, but instead it is: {}".format(alpha + beta + gamma)
H, W = self._calculate_decomposition_graph(alpha, beta, gamma)
nx.set_edge_attributes(H, {(u, v): {'weight':W[u][v]} for u, v in H.edges()})
def find_mve(G):
u, v, w = max(G.edges(data="weight"), key=itemgetter(2))
mve_nodes = (u, v)
return mve_nodes

#communities = nx.algorithms.community.girvan_newman(H, most_valuable_edge=find_mve)
communities = iter([nx.algorithms.community.greedy_modularity_communities(H, weight='weight')])

return communities


def _calculate_decomposition_graph(self, alpha, beta, gamma):
G_s = self._sequence_graph
G_d = self._data_reference_graph
G_c = self._call_graph

assert set(G_s.nodes()) == set(G_d.nodes()) == set(G_c.nodes()), "Gs: {} Gd: {} Gc: {}".format(G_s.nodes(), G_d.nodes(), G_c.nodes())
H = nx.compose(G_s, nx.compose(G_d, G_c))
assert set(H.nodes()) == set(G_s.nodes())

W = self._calculate_final_weight_matrix(alpha, beta, gamma)
return H, W

def _calculate_final_weight_matrix(self, alpha, beta, gamma):
N = np.array(self._matrix_penalty)
M_s = np.array(self._matrix_sequence)
M_c = np.array(self._matrix_call)
M_d = np.array(self._matrix_data_reference)
rho_d = np.array(self._matrix_dissimilarity_score)

W = np.multiply(N, alpha * M_s + beta * M_c + gamma * (np.multiply(rho_d, M_d)))

return W

# TODO: use Newman's algorithm to compute components
self._components = None

# Graph Calculation

Expand All @@ -53,11 +115,18 @@ def _compute_sequence_graph(self):

def _compute_data_reference_graph(self):
drg = nx.DiGraph()

drg.add_nodes_from(range(self._num_funcs))

for i in range(self._num_funcs):
for j in range(i + 1, self._num_funcs):
if self._drfpp.get_property(i, j) > 0:
#print(i)
#print(j)
#dfi = self._drfpp.compute_function_data_references(self._func_list[i])
#dfj = self._drfpp.compute_function_data_references(self._func_list[j])
#drg.nodes[i]['df'] = dfi
#drg.nodes[j]['df'] = dfj
if len(self._drfpp.get_property(i, j)) > 0:
drg.add_edge(i, j)
drg.add_edge(j, i)
return drg
Expand Down Expand Up @@ -88,16 +157,17 @@ def _compute_matrix_data_reference(self):
m = [[None for i in range(self._num_funcs)] for j in range(self._num_funcs)]
for (i, j) in itertools.product(range(self._num_funcs), repeat=2):
if self._data_reference_graph.has_edge(i, j):
m[i][j] = self._drfpp.get_property(i, j)
m[i][j] = len(self._drfpp.get_property(i, j))
else:
m[i][j] = 0

assert all([c is not None for r in m for c in r])
return m
return m

def _compute_matrix_call(self):
m = [[None for i in range(self._num_funcs)] for j in range(self._num_funcs)]
for (i, j) in itertools.product(range(self._num_funcs), repeat=2):
if self._data_reference_graph.has_edge(i, j):
if self._call_graph.has_edge(i, j):
m[i][j] = self._cfpp.get_property(i, j)
else:
m[i][j] = 0
Expand All @@ -107,14 +177,19 @@ def _compute_matrix_call(self):
def _compute_matrix_dissimilarity_score(self):
rho = [[None for i in range(self._num_funcs)] for j in range(self._num_funcs)]
for (i, j) in itertools.product(range(self._num_funcs), repeat=2):
if self._data_reference_graph.has_edge(i, j): # TODO Check Di or Dj have length > 0
# TODO
rho[i][j] = -1
Di = self._drfpp.compute_function_data_references(self._func_list[i])
Dj = self._drfpp.compute_function_data_references(self._func_list[j])
p = len(Di)
q = len(Dj)
#print(self._data_reference_graph.has_edge(i, j))
if self._data_reference_graph.has_edge(i, j) and max(p,q) > 0:
rho[i][j] = 1 - (self.levenshtein_distance(Di,Dj)/max(p,q))
#print(rho[i][j])
else:
rho[i][j] = 0
assert all([c is not None for r in rho for c in r])
return rho

def _compute_penalty_matrix(self):
N = [[None for i in range(self._num_funcs)] for j in range(self._num_funcs)]
for (i, j) in itertools.product(range(self._num_funcs), repeat=2):
Expand All @@ -124,3 +199,6 @@ def _compute_penalty_matrix(self):
N[i][j] = 1
assert all([c is not None for r in N for c in r])
return N
def levenshtein_distance(self,arr1, arr2):
return textdistance.levenshtein.distance(arr1,arr2)

12 changes: 7 additions & 5 deletions bcd/call_function_pair_property_calculator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@

from bcd.function_pair_property_calculator import FunctionPairPropertyCalculator


class CallFunctionPairPropertyCalulator(FunctionPairPropertyCalculator):

def _get_property(self, i, j):
Expand All @@ -10,11 +11,12 @@ def _get_property(self, i, j):
func1 = self._cfg.functions.function(addr=func1_addr)

num_calls_to_func2 = 0
for call_site_addr in func1.get_call_sites():
call_target = func1.get_call_target(call_site_addr)
assert call_target is not None
if call_target == func2_addr:
num_calls_to_func2 += 1
if func1 is not None:
for call_site_addr in func1.get_call_sites():
call_target = func1.get_call_target(call_site_addr)
assert call_target is not None
if call_target == func2_addr:
num_calls_to_func2 += 1

return num_calls_to_func2

Expand Down
137 changes: 137 additions & 0 deletions bcd/data_ref_extraction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@

from elftools.elf.elffile import ELFFile

class DataRefExtraction:
    """Extract data references (addresses into .bss, .rodata or .data) for
    functions in a binary, with per-function caching.

    A "data reference" here is a PC/RIP-relative memory operand resolved to an
    offset from the binary's load base and validated against the section
    bounds supplied at construction time.
    """

    def __init__(self, bin_path, proj, cfg, func_list, section_offsets):
        # Path to the binary on disk; re-opened to read the ELF machine arch.
        self._bin_path = bin_path
        self._proj = proj
        self._cfg = cfg
        self.func_list = func_list
        # Iterable of (section_name, bounds) entries, where bounds is indexable
        # as (low_address, high_address) — see Section.compute_section_offsets.
        self.section_offsets = section_offsets

        # func_address -> list of data-reference offsets, filled lazily.
        self._cache = {}

    def compute_function_data_references(self, func_address):
        """Return the (cached) list of data references made by the function
        starting at ``func_address``."""
        if func_address not in self._cache:
            self._cache[func_address] = self._compute_function_data_references(func_address)
        return self._cache[func_address]

    def _compute_function_data_references(self, func_address):
        """Disassemble the function at ``func_address`` and collect its
        PC/RIP-relative data references that fall inside a known section.

        Returns an empty list for architectures other than x64 and ARM.
        """
        sec_offsets = self.dic_section_offsets()
        base_address = self._proj.loader.main_object.min_addr

        func = self._cfg.functions.function(addr=func_address)
        # Blocks are not guaranteed to be sorted by address by default, and
        # the extraction below relies on instruction order to find the address
        # of the *next* instruction (the value of PC/RIP).
        func_blocks = sorted(func.blocks, key=lambda b: b.addr)
        instructions = [ins for block in func_blocks for ins in block.capstone.insns]

        with open(self._bin_path, 'rb') as f:
            arch = ELFFile(f).get_machine_arch()

        if arch == 'x64':
            return self.function_references_for_amd(instructions, base_address, sec_offsets)
        if arch == 'ARM':
            return self.function_references_for_arm(instructions, base_address, sec_offsets)
        return []

    def function_references_for_amd(self, instructions, base_adrs, sec_offsets):
        """Collect RIP-relative data references from x86-64 instructions.

        An operand like ``[rip + 0x1234]`` resolves to the address of the
        following instruction plus the displacement, rebased against the load
        address ``base_adrs``. Only positive displacements are handled.
        """
        function_references = []
        for idx, instruct in enumerate(instructions):
            if 'rip' not in instruct.op_str or '[' not in instruct.op_str:
                continue
            for part in instruct.op_str.split(","):
                if 'rip' not in part:
                    continue
                whole_address = part.split("[")[-1][:-1]
                # A following instruction is needed to know the value of RIP.
                if '+' in whole_address and idx + 1 < len(instructions):
                    offset = int(whole_address.split("+")[-1].strip(), 16)
                    rip = instructions[idx + 1].address
                    data_reference = rip + offset - base_adrs
                    # BUG FIX: the original passed hex(data_reference) — a str —
                    # which raised TypeError when compared against the integer
                    # section bounds. Pass the integer address instead.
                    if self.check_validity_data_references(data_reference, sec_offsets):
                        function_references.append(data_reference)
        return function_references

    def function_references_for_arm(self, instrucs, base_adr, sec_offsets):
        """Collect PC-relative data references from ARM instructions.

        Handles operands of the form ``[pc, #0x...]``: the literal offset is
        added to the address of the following instruction and rebased against
        the load address ``base_adr``.
        """
        func_refs = []
        for idx, instruct in enumerate(instrucs):
            if 'pc' not in instruct.op_str or '[' not in instruct.op_str:
                continue
            for part in instruct.op_str.split("["):
                if 'pc' not in part:
                    continue
                mnemo_parts = part.replace("]", "").split(",")
                if len(mnemo_parts) > 1:
                    offset = mnemo_parts[-1].strip().replace("#", '')
                    # Only hex literal offsets are handled; register offsets
                    # (e.g. [pc, r3]) are skipped.
                    if idx + 1 < len(instrucs) and offset.startswith('0x'):
                        pc = instrucs[idx + 1].address
                        data_reference = pc + int(offset, 16) - base_adr
                        # BUG FIX: pass the integer address, not hex(...),
                        # so the section-bounds comparison is int vs int.
                        if self.check_validity_data_references(data_reference, sec_offsets):
                            func_refs.append(data_reference)
        return func_refs

    def dic_section_offsets(self):
        """Return a dict mapping section name -> offset bounds, built from the
        (name, bounds) entries in ``self.section_offsets``."""
        return {entry[0]: entry[1] for entry in self.section_offsets}

    def check_validity_data_references(self, data_ref, dictionary_sections):
        """Return True iff the integer address ``data_ref`` lies within the
        (inclusive) bounds of .bss, .rodata or .data per
        ``dictionary_sections``; sections absent from the dict are skipped."""
        for name in ('.bss', '.rodata', '.data'):
            if name in dictionary_sections:
                low = dictionary_sections[name][0]
                high = dictionary_sections[name][1]
                if low <= data_ref <= high:
                    return True
        return False
25 changes: 21 additions & 4 deletions bcd/data_ref_function_pair_property_calculator.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,28 @@

import angr
import itertools
from bcd.function_pair_property_calculator import SymmetricFunctionPairPropertyCalculator
from bcd.data_ref_extraction import DataRefExtraction
from elftools.elf.elffile import ELFFile

class DataRefFunctionPairPropertyCalulator(SymmetricFunctionPairPropertyCalculator, DataRefExtraction):

class DataRefFunctionPairPropertyCalulator(SymmetricFunctionPairPropertyCalculator):
def __init__(self, bin_path, proj, cfg, func_list, section_offsets):
SymmetricFunctionPairPropertyCalculator.__init__(self, proj, cfg, func_list, section_offsets)
DataRefExtraction.__init__(self, bin_path, proj, cfg, func_list, section_offsets)

def _get_property(self, i, j):
func1 = self._func_list[i]
func2 = self._func_list[j]
# TODO: return data references common to func1 and func2
func1_df = set(self.compute_function_data_references(func1))
func2_df = set(self.compute_function_data_references(func2))

return self.common_elements(func1_df, func2_df)

return 0
def common_elements(self, l1, l2):
l1_set = set(l1)
l2_set = set(l2)

if (l1_set & l2_set):
return list(l1_set & l2_set)
else:
return []
Loading