Fix the postorder traversal in the DominatorTree (#5821)

elliottt · jameysharp · web-flow · commit a139ed6d56c3 · 2023-02-17T20:39:04.000Z
Fix the postorder traversal computed by the `DominatorTree`. It was
recording nodes in the wrong order depending on the order child nodes
were visited. Consider the following program:

```
function %foo2(i8) -> i8 {
block0(v0: i8):
    brif v0, block1, block2

block1:
    return v0

block2:
    jump block1
}
```

The postorder produced by the previous implementation was:

```
block2
block1
block0
```

This order is incorrect: `block2` jumps to `block1`, so `block1` must
come before `block2` in any postorder. Changing the branch order in the
function would also change the postorder result, yielding the expected
order with `block1` emitted first.

The problem was that when pushing successor nodes onto the stack, the
old implementation would also mark them SEEN. This would then prevent
them from being pushed on the stack again in the future, which is
incorrect as they might be visited by other nodes that have not yet been
pushed. This causes nodes to potentially show up later in the postorder
traversal than they should.

This PR reworks the implementation of `DominatorTree::compute` to
produce an order where `block1` is always returned first, regardless of
the branch order in the original program.

Co-authored-by: Jamey Sharp <jsharp@fastly.com>
diff --git a/cranelift/codegen/src/dominator_tree.rs b/cranelift/codegen/src/dominator_tree.rs
@@ -2,7 +2,6 @@
 
 use crate::entity::SecondaryMap;
 use crate::flowgraph::{BlockPredecessor, ControlFlowGraph};
-use crate::inst_predicates;
 use crate::ir::{Block, ExpandedProgramPoint, Function, Inst, Layout, ProgramOrder, Value};
 use crate::packed_option::PackedOption;
 use crate::timing;
@@ -16,8 +15,7 @@ use core::mem;
 const STRIDE: u32 = 4;
 
 /// Special RPO numbers used during `compute_postorder`.
-const DONE: u32 = 1;
-const SEEN: u32 = 2;
+const SEEN: u32 = 1;
 
 /// Dominator tree node. We keep one of these per block.
 #[derive(Clone, Default)]
@@ -36,6 +34,12 @@ struct DomNode {
     idom: PackedOption<Inst>,
 }
 
+/// DFT stack state marker for computing the cfg postorder.
+enum Visit {
+    First,
+    Last,
+}
+
 /// The dominator tree for a single function.
 pub struct DominatorTree {
     nodes: SecondaryMap<Block, DomNode>,
@@ -44,7 +48,7 @@ pub struct DominatorTree {
     postorder: Vec<Block>,
 
     /// Scratch memory used by `compute_postorder()`.
-    stack: Vec<Block>,
+    stack: Vec<(Visit, Block)>,
 
     valid: bool,
 }
@@ -275,93 +279,64 @@ impl DominatorTree {
 
         // This algorithm is a depth first traversal (DFT) of the control flow graph, computing a
         // post-order of the blocks that are reachable from the entry block. A DFT post-order is not
-        // unique. The specific order we get is controlled by two factors:
-        //
-        // 1. The order each node's children are visited, and
-        // 2. The method used for pruning graph edges to get a tree.
-        //
-        // There are two ways of viewing the CFG as a graph:
-        //
-        // 1. Each block is a node, with outgoing edges for all the branches in the block.
-        // 2. Each basic block is a node, with outgoing edges for the single branch at the end of
-        //    the BB. (A block is a linear sequence of basic blocks).
-        //
-        // The first graph is a contraction of the second one. We want to compute a block post-order
-        // that is compatible both graph interpretations. That is, if you compute a BB post-order
-        // and then remove those BBs that do not correspond to block headers, you get a post-order of
-        // the block graph.
-        //
-        // Node child order:
-        //
-        //     In the BB graph, we always go down the fall-through path first and follow the branch
-        //     destination second.
-        //
-        //     In the block graph, this is equivalent to visiting block successors in a bottom-up
-        //     order, starting from the destination of the block's terminating jump, ending at the
-        //     destination of the first branch in the block.
-        //
-        // Edge pruning:
+        // unique. The specific order we get is controlled by the order each node's children are
+        // visited.
         //
-        //     In the BB graph, we keep an edge to a block the first time we visit the *source* side
-        //     of the edge. Any subsequent edges to the same block are pruned.
+        // We view the CFG as a graph where each `BlockCall` value of a terminating branch
+        // instruction is an edge. A consequence of this is that we visit successor nodes in the
+        // reverse order specified by the branch instruction that terminates the basic block.
+        // (Reversed because we are using a stack to control traversal, and push the successors in
+        // the order the branch instruction specifies -- there's no good reason for this particular
+        // order.)
         //
-        //     The equivalent tree is reached in the block graph by keeping the first edge to a block
-        //     in a top-down traversal of the successors. (And then visiting edges in a bottom-up
-        //     order).
-        //
-        // This pruning method makes it possible to compute the DFT without storing lots of
-        // information about the progress through a block.
-
         // During this algorithm only, use `rpo_number` to hold the following state:
         //
-        //   0:    block has not yet been reached in the pre-order.
-        //   SEEN: block has been pushed on the stack but successors not yet pushed.
-        //   DONE: Successors pushed.
+        //   0:    block has not yet had its first visit
+        //   SEEN: block has had its first visit, so every successor that was still unvisited at
+        //         that point has been pushed on the stack
 
         match func.layout.entry_block() {
             Some(block) => {
-                self.stack.push(block);
-                self.nodes[block].rpo_number = SEEN;
+                self.stack.push((Visit::First, block));
             }
             None => return,
         }
 
-        while let Some(block) = self.stack.pop() {
-            match self.nodes[block].rpo_number {
-                SEEN => {
-                    // This is the first time we pop the block, so we need to scan its successors and
-                    // then revisit it.
-                    self.nodes[block].rpo_number = DONE;
-                    self.stack.push(block);
-                    self.push_successors(func, block);
+        while let Some((visit, block)) = self.stack.pop() {
+            match visit {
+                Visit::First => {
+                    if self.nodes[block].rpo_number == 0 {
+                        // This is the first time we pop the block, so we need to scan its
+                        // successors and then revisit it.
+                        self.nodes[block].rpo_number = SEEN;
+                        self.stack.push((Visit::Last, block));
+                        if let Some(inst) = func.stencil.layout.last_inst(block) {
+                            for block in func.stencil.dfg.insts[inst]
+                                .branch_destination(&func.stencil.dfg.jump_tables)
+                                .iter()
+                            {
+                                let succ = block.block(&func.stencil.dfg.value_lists);
+
+                                // This check is an optimization only and is not needed for
+                                // correctness: it inlines the `rpo_number == 0` test from the
+                                // enclosing `Visit::First` arm, so successors that were already
+                                // visited are not pushed just to be popped and skipped.
+                                if self.nodes[succ].rpo_number == 0 {
+                                    self.stack.push((Visit::First, succ))
+                                }
+                            }
+                        }
+                    }
                 }
-                DONE => {
-                    // This is the second time we pop the block, so all successors have been
-                    // processed.
+
+                Visit::Last => {
+                    // We've finished all this node's successors.
                     self.postorder.push(block);
                 }
-                _ => unreachable!(),
             }
         }
     }
 
-    /// Push `block` successors onto `self.stack`, filtering out those that have already been seen.
-    ///
-    /// The successors are pushed in program order which is important to get a split-invariant
-    /// post-order. Split-invariant means that if a block is split in two, we get the same
-    /// post-order except for the insertion of the new block header at the split point.
-    fn push_successors(&mut self, func: &Function, block: Block) {
-        inst_predicates::visit_block_succs(func, block, |_, succ, _| self.push_if_unseen(succ))
-    }
-
-    /// Push `block` onto `self.stack` if it has not already been seen.
-    fn push_if_unseen(&mut self, block: Block) {
-        if self.nodes[block].rpo_number == 0 {
-            self.nodes[block].rpo_number = SEEN;
-            self.stack.push(block);
-        }
-    }
-
     /// Build a dominator tree from a control flow graph using Keith D. Cooper's
     /// "Simple, Fast Dominator Algorithm."
     fn compute_domtree(&mut self, func: &Function, cfg: &ControlFlowGraph) {
@@ -728,7 +703,7 @@ mod tests {
         //   } block3:jump block1
         // } block3
 
-        assert_eq!(dt.cfg_postorder(), &[block2, block0, block1, block3]);
+        assert_eq!(dt.cfg_postorder(), &[block0, block2, block1, block3]);
 
         assert_eq!(cur.func.layout.entry_block().unwrap(), block3);
         assert_eq!(dt.idom(block3), None);