Skip to content

Commit b8be098

Browse files
committed
decomp: generate structured ast from the generated one and remove gotos
decomp: update the ci pipeline to verify the function calls decomp: add unit tests for decompilation decomp: update option flags to disable structuring passes decomp: update switch handling and remove fallback code entirely and add new tests
1 parent 8887030 commit b8be098

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

56 files changed

+6844
-232
lines changed
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
/*
2+
* Copyright (c) 2024, Trail of Bits, Inc.
3+
*
4+
* This source code is licensed in accordance with the terms specified in
5+
* the LICENSE file found in the root directory of this source tree.
6+
*/
7+
8+
#pragma once
9+
10+
#include <cstdint>
11+
#include <string>
12+
#include <vector>
13+
14+
#include <clang/AST/Decl.h>
15+
#include <clang/AST/Stmt.h>
16+
17+
namespace patchestry::ghidra {
18+
struct Function;
19+
} // namespace patchestry::ghidra
20+
21+
namespace patchestry::ast {
22+
23+
/// A case entry for switch blocks -- carries Ghidra case metadata.
///
/// Every member carries a default so that a default-constructed entry never
/// holds indeterminate values. The type remains an aggregate, so positional
/// brace-initialization (`{value, succ_index, has_exit}`) keeps working.
struct SwitchCaseEntry {
    /// Case constant value.
    int64_t value = 0;
    /// Index into CfgBlock::succs[] for this case target.
    size_t succ_index = 0;
    /// Whether Ghidra marked this case as having a break/exit.
    bool has_exit = false;
};
29+
30+
// A basic block in the CFG
31+
struct CfgBlock {
32+
std::string label; // empty if unlabeled entry block
33+
std::vector< clang::Stmt * > stmts; // statements in the block
34+
std::vector< size_t > succs; // successor block indices
35+
bool is_conditional = false; // true if block ends with if(cond) goto
36+
clang::Expr *branch_cond = nullptr; // condition expr if conditional
37+
size_t taken_succ = 0; // index of "then" successor
38+
size_t fallthrough_succ = 0; // index of "else" / fallthrough successor
39+
std::vector< SwitchCaseEntry > switch_cases; // non-empty iff this is a switch block
40+
};
41+
42+
// Per-function CFG
43+
struct Cfg {
44+
const clang::FunctionDecl *function = nullptr;
45+
std::vector< CfgBlock > blocks;
46+
size_t entry = 0;
47+
48+
size_t BlockCount() const { return blocks.size(); }
49+
};
50+
51+
// Build CFGs for all functions in the translation unit.
52+
// Splits at label boundaries, extracts goto edges, reorders in RPO.
53+
std::vector< Cfg > BuildCfgs(clang::ASTContext &ctx);
54+
55+
// Build CFG for a single function.
56+
Cfg BuildCfg(const clang::FunctionDecl *fn);
57+
58+
// Reorder blocks in reverse post-order.
59+
void ReorderBlocksRPO(Cfg &cfg);
60+
61+
// Populate CfgBlock::switch_cases from Ghidra switch metadata.
62+
// Must be called AFTER BuildCfg + ReorderBlocksRPO so that CfgBlock labels
63+
// and succ indices are final. Matches CfgBlock labels to ghidra block keys
64+
// (via labelNameFromKey) and maps each ghidra::SwitchCase target to the
65+
// corresponding succs[] index.
66+
void PopulateSwitchMetadata(Cfg &cfg, const ghidra::Function &func);
67+
68+
} // namespace patchestry::ast
Lines changed: 326 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,326 @@
1+
/*
2+
* Copyright (c) 2024, Trail of Bits, Inc.
3+
*
4+
* This source code is licensed in accordance with the terms specified in
5+
* the LICENSE file found in the root directory of this source tree.
6+
*/
7+
8+
#pragma once
9+
10+
#include <patchestry/AST/CfgBuilder.hpp>
11+
#include <patchestry/AST/SNode.hpp>
12+
13+
#include <algorithm>
14+
#include <cstdint>
15+
#include <limits>
16+
#include <list>
17+
#include <string>
18+
#include <unordered_set>
19+
#include <utility>
20+
#include <vector>
21+
22+
#include <clang/AST/Expr.h>
23+
24+
namespace patchestry::ast {
25+
26+
// -----------------------------------------------------------------------
27+
// detail:: namespace — internal but testable collapse graph types.
28+
// -----------------------------------------------------------------------
29+
namespace detail {
30+
31+
/// A node in the collapsing graph. Starts as a 1:1 mirror of CfgBlock.
32+
/// As rules fire, nodes get absorbed into StructuredNode wrappers and
33+
/// removed from the active set.
34+
struct CNode {
35+
static constexpr size_t NONE = std::numeric_limits<size_t>::max();
36+
37+
size_t id; // original CfgBlock index
38+
std::vector<size_t> succs; // outgoing edges (by CNode id)
39+
std::vector<size_t> preds; // incoming edges (by CNode id)
40+
41+
// Edge properties (indexed same as succs)
42+
std::vector<uint32_t> edge_flags;
43+
44+
// The SNode produced when this node is collapsed (null = leaf)
45+
SNode *structured = nullptr;
46+
47+
// Leaf payload: statements from the original CfgBlock
48+
std::string label; // original CfgBlock label (for SLabel wrapping)
49+
std::vector<clang::Stmt *> stmts;
50+
clang::Expr *branch_cond = nullptr;
51+
bool is_conditional = false;
52+
53+
// Switch case metadata (copied from CfgBlock; non-empty for switch blocks)
54+
std::vector< SwitchCaseEntry > switch_cases;
55+
56+
// Set when absorbed into a parent structured node
57+
bool collapsed = false;
58+
size_t collapsed_into = NONE; // representative node after collapse
59+
60+
// Flags for the collapse algorithm
61+
bool mark = false;
62+
int visit_count = 0;
63+
64+
enum EdgeFlag : uint32_t {
65+
F_GOTO = 1u << 0,
66+
F_BACK = 1u << 1,
67+
F_LOOP_EXIT = 1u << 2,
68+
};
69+
70+
bool isGotoOut(size_t i) const {
71+
return i < edge_flags.size() && (edge_flags[i] & F_GOTO);
72+
}
73+
bool isBackEdge(size_t i) const {
74+
return i < edge_flags.size() && (edge_flags[i] & F_BACK);
75+
}
76+
bool isLoopExit(size_t i) const {
77+
return i < edge_flags.size() && (edge_flags[i] & F_LOOP_EXIT);
78+
}
79+
bool isDecisionOut(size_t i) const {
80+
return !isGotoOut(i) && !isBackEdge(i);
81+
}
82+
/// Edges to trace in TraceDAG: not back-edges, not loop-exit edges.
83+
bool isLoopDAGOut(size_t i) const {
84+
return !isBackEdge(i) && !isLoopExit(i);
85+
}
86+
bool isSwitchOut() const { return succs.size() > 2; }
87+
88+
size_t sizeIn() const { return preds.size(); }
89+
size_t sizeOut() const { return succs.size(); }
90+
91+
void setGoto(size_t i) {
92+
if (i < edge_flags.size()) edge_flags[i] |= F_GOTO;
93+
}
94+
void setLoopExit(size_t i) {
95+
if (i < edge_flags.size()) edge_flags[i] |= F_LOOP_EXIT;
96+
}
97+
void clearLoopExit(size_t i) {
98+
if (i < edge_flags.size()) edge_flags[i] &= ~static_cast<uint32_t>(F_LOOP_EXIT);
99+
}
100+
};
101+
102+
/// The collapsing graph — lightweight mirror of the Cfg that supports
103+
/// collapsing blocks into structured nodes.
104+
struct CGraph {
105+
std::vector<CNode> nodes;
106+
size_t entry = 0;
107+
108+
/// Active (uncollapsed) node ids
109+
std::vector<size_t> activeIds() const {
110+
std::vector<size_t> result;
111+
for (auto &n : nodes) {
112+
if (!n.collapsed) result.push_back(n.id);
113+
}
114+
return result;
115+
}
116+
117+
size_t activeCount() const {
118+
size_t c = 0;
119+
for (auto &n : nodes) {
120+
if (!n.collapsed) ++c;
121+
}
122+
return c;
123+
}
124+
125+
CNode &node(size_t id) { return nodes[id]; }
126+
const CNode &node(size_t id) const { return nodes[id]; }
127+
128+
/// Remove an edge from the active graph
129+
void removeEdge(size_t from, size_t to) {
130+
auto &s = nodes[from].succs;
131+
auto &f = nodes[from].edge_flags;
132+
for (size_t i = 0; i < s.size(); ++i) {
133+
if (s[i] == to) {
134+
s.erase(s.begin() + static_cast<ptrdiff_t>(i));
135+
f.erase(f.begin() + static_cast<ptrdiff_t>(i));
136+
break;
137+
}
138+
}
139+
auto &p = nodes[to].preds;
140+
p.erase(std::remove(p.begin(), p.end(), from), p.end());
141+
}
142+
143+
/// Replace a set of nodes with a single new structured node.
144+
/// Returns the representative node id.
145+
size_t collapseNodes(const std::vector<size_t> &ids, SNode *snode);
146+
};
147+
148+
/// Build the collapse graph from a Cfg.
149+
CGraph buildCGraph(const Cfg &cfg);
150+
151+
/// Detect back-edges using DFS and mark them in the graph.
152+
void markBackEdges(CGraph &g);
153+
154+
/// A detected natural loop: header, back-edge tails, nesting info.
155+
struct LoopBody {
156+
size_t head; // loop header CNode id
157+
std::vector<size_t> tails; // CNode ids with back-edges to head
158+
int depth = 0; // nesting depth (deeper = higher number)
159+
int unique_count = 0; // count of head+tail nodes before reachability
160+
size_t exit_block = NONE; // official exit CNode id, or NONE
161+
LoopBody *immed_container = nullptr; // immediately containing loop
162+
163+
static constexpr size_t NONE = std::numeric_limits<size_t>::max();
164+
165+
explicit LoopBody(size_t h) : head(h) {}
166+
167+
void addTail(size_t t) { tails.push_back(t); }
168+
169+
/// Core body computation: backward reachability from tails to head.
170+
/// Populates `body` with all CNode ids in the loop. Sets CNode::mark.
171+
/// Caller MUST call clearMarks() after using the body.
172+
void findBase(CGraph &g, std::vector<size_t> &body) const;
173+
174+
/// Set immed_container based on containment within other loops.
175+
void labelContainments(const CGraph &g, const std::vector<size_t> &body,
176+
const std::vector<LoopBody *> &looporder);
177+
178+
/// Exit detection, tail ordering, body extension, exit edge labeling.
179+
/// (Declared here, implemented in Plan 02)
180+
void findExit(const CGraph &g, const std::vector<size_t> &body);
181+
void orderTails(const CGraph &g);
182+
void extend(CGraph &g, std::vector<size_t> &body) const;
183+
void labelExitEdges(CGraph &g, const std::vector<size_t> &body) const;
184+
185+
/// Merge LoopBody records that share the same head.
186+
static void mergeIdenticalHeads(std::vector<LoopBody *> &looporder,
187+
std::list<LoopBody> &storage);
188+
189+
/// Mark edges leaving the loop body as F_LOOP_EXIT.
190+
void setExitMarks(CGraph &g, const std::vector<size_t> &body) const;
191+
192+
/// Clear F_LOOP_EXIT marks set by setExitMarks.
193+
void clearExitMarks(CGraph &g, const std::vector<size_t> &body) const;
194+
195+
/// Check if this loop's head is still active (not collapsed).
196+
bool update(const CGraph &g) const;
197+
198+
/// Sort innermost-first (higher depth = processed first).
199+
bool operator<(const LoopBody &other) const {
200+
return depth > other.depth;
201+
}
202+
};
203+
204+
/// A lazily-resolved edge reference that survives collapse operations.
205+
struct FloatingEdge {
206+
size_t top_id; // source CNode id
207+
size_t bottom_id; // destination CNode id
208+
209+
FloatingEdge(size_t t, size_t b) : top_id(t), bottom_id(b) {}
210+
211+
/// Resolve to current edge in graph. Returns {source_id, edge_index},
212+
/// or {CNode::NONE, 0} if the edge no longer exists.
213+
std::pair<size_t, size_t> getCurrentEdge(const CGraph &g) const;
214+
};
215+
216+
/// TraceDAG: traces DAG paths through the CGraph to identify
217+
/// the least-disruptive edges to mark as gotos.
218+
class TraceDAG {
219+
struct BlockTrace;
220+
221+
struct BranchPoint {
222+
BranchPoint *parent = nullptr;
223+
int pathout = -1;
224+
size_t top_id; // CNode id of branch point
225+
std::vector<BlockTrace *> paths;
226+
int depth = 0;
227+
bool ismark = false;
228+
229+
void markPath();
230+
int distance(BranchPoint *op2);
231+
};
232+
233+
struct BlockTrace {
234+
enum : uint32_t { f_active = 1, f_terminal = 2 };
235+
uint32_t flags = 0;
236+
BranchPoint *top;
237+
int pathout;
238+
size_t bottom_id; // CNode id (or NONE for root traces)
239+
size_t dest_id; // destination CNode id
240+
int edgelump = 1;
241+
std::list<BlockTrace *>::iterator activeiter;
242+
BranchPoint *derivedbp = nullptr;
243+
244+
bool isActive() const { return flags & f_active; }
245+
bool isTerminal() const { return flags & f_terminal; }
246+
};
247+
248+
struct BadEdgeScore {
249+
size_t exitproto_id;
250+
BlockTrace *trace;
251+
int distance = -1;
252+
int terminal = 0;
253+
int siblingedge = 0;
254+
255+
bool compareFinal(const BadEdgeScore &op2) const;
256+
bool operator<(const BadEdgeScore &op2) const;
257+
};
258+
259+
std::list<FloatingEdge> &likelygoto;
260+
std::vector<size_t> rootlist;
261+
std::vector<BranchPoint *> branchlist;
262+
int activecount = 0;
263+
std::list<BlockTrace *> activetrace;
264+
std::list<BlockTrace *>::iterator current_activeiter;
265+
size_t finishblock_id = CNode::NONE;
266+
267+
void removeTrace(BlockTrace *trace);
268+
void processExitConflict(std::list<BadEdgeScore>::iterator start,
269+
std::list<BadEdgeScore>::iterator end);
270+
BlockTrace *selectBadEdge();
271+
void insertActive(BlockTrace *trace);
272+
void removeActive(BlockTrace *trace);
273+
bool checkOpen(const CGraph &g, BlockTrace *trace);
274+
std::list<BlockTrace *>::iterator openBranch(CGraph &g,
275+
BlockTrace *parent);
276+
bool checkRetirement(BlockTrace *trace, size_t &exitblock_id);
277+
std::list<BlockTrace *>::iterator retireBranch(BranchPoint *bp,
278+
size_t exitblock_id);
279+
void clearVisitCount(CGraph &g);
280+
281+
public:
282+
TraceDAG(std::list<FloatingEdge> &lg) : likelygoto(lg) {}
283+
~TraceDAG();
284+
285+
void addRoot(size_t root_id) { rootlist.push_back(root_id); }
286+
void setFinishBlock(size_t id) { finishblock_id = id; }
287+
void initialize();
288+
void pushBranches(CGraph &g);
289+
};
290+
291+
/// Clear CNode::mark for all nodes in body vector.
292+
void clearMarks(CGraph &g, const std::vector<size_t> &body);
293+
294+
/// Scan back-edges and create LoopBody records.
295+
void labelLoops(CGraph &g, std::list<LoopBody> &loopbody,
296+
std::vector<LoopBody *> &looporder);
297+
298+
/// Discover all loops, compute bodies/nesting/exits, and order innermost-first.
299+
/// Called once from CfgFoldStructure() after markBackEdges().
300+
void orderLoopBodies(CGraph &g, std::list<LoopBody> &loopbody);
301+
302+
} // namespace detail
303+
304+
/// Build a structured SNode tree from a Cfg using iterative
305+
/// pattern-matching and graph collapse.
306+
///
307+
/// The algorithm structures the CFG *before* generating the SNode tree,
308+
/// producing better results for complex control flow than a flat
309+
/// goto-heavy tree followed by goto elimination.
310+
///
311+
/// Pipeline:
312+
/// 1. Detect loops via back-edges (Tarjan-based).
313+
/// 2. Resolve logical AND/OR condition chains.
314+
/// 3. Iteratively fold graph patterns:
315+
/// - Sequential blocks → SSeq
316+
/// - If / if-else → SIfThenElse
317+
/// - While-do / do-while → SWhile / SDoWhile
318+
/// - Infinite loops → SWhile(true, body)
319+
/// - Switch statements → SSwitch
320+
/// 4. When stuck, resolve by selecting an edge to mark as goto (DAG heuristic).
321+
/// 5. Repeat until fully collapsed.
322+
/// 6. Refine: while→for, break/continue insertion, dead label removal.
323+
SNode *CfgFoldStructure(const Cfg &cfg, SNodeFactory &factory,
324+
clang::ASTContext &ctx);
325+
326+
} // namespace patchestry::ast

0 commit comments

Comments
 (0)