diff --git a/libs/@local/hashql/mir/package.json b/libs/@local/hashql/mir/package.json index 1a0eee7fc9a..c4f30023df0 100644 --- a/libs/@local/hashql/mir/package.json +++ b/libs/@local/hashql/mir/package.json @@ -9,6 +9,7 @@ "fix:clippy": "just clippy --fix", "lint:clippy": "just clippy", "test:codspeed": "cargo codspeed run -p hashql-mir", + "test:miri": "cargo miri nextest run -- changed_bitor", "test:unit": "mise run test:unit @rust/hashql-mir" }, "dependencies": { diff --git a/libs/@local/hashql/mir/src/pass/mod.rs b/libs/@local/hashql/mir/src/pass/mod.rs index 00492cea510..f22590b7598 100644 --- a/libs/@local/hashql/mir/src/pass/mod.rs +++ b/libs/@local/hashql/mir/src/pass/mod.rs @@ -111,12 +111,21 @@ impl Changed { } } - const fn from_u8(value: u8) -> Self { + /// Convert from a `u8` value. + /// + /// # Safety + /// + /// The caller must ensure that the value is either `0`, `1`, or `3`. + #[expect(unsafe_code)] + const unsafe fn from_u8_unchecked(value: u8) -> Self { + debug_assert!(value == 0 || value == 1 || value == 3); + match value { 0 => Self::No, 1 => Self::Unknown, 3 => Self::Yes, - _ => unreachable!(), + // SAFETY: caller guarantees that the value is valid. + _ => unsafe { core::hint::unreachable_unchecked() }, } } @@ -128,12 +137,21 @@ impl Changed { impl BitOr for Changed { type Output = Self; + #[inline] + #[expect(unsafe_code)] fn bitor(self, rhs: Self) -> Self::Output { - Self::from_u8(self.into_u8() | rhs.into_u8()) + let result = self.into_u8() | rhs.into_u8(); + + // We use `from_u8_unchecked` here because the safe version prevents LLVM from vectorizing + // loops that use `|=` on slices of `Changed` values. + // SAFETY: Both operands have valid discriminants (0, 1, or 3). The bitwise OR of any + // combination of these values produces only 0, 1, or 3, which are all valid discriminants. 
+ unsafe { Self::from_u8_unchecked(result) } } } impl BitOrAssign for Changed { + #[inline] fn bitor_assign(&mut self, rhs: Self) { *self = *self | rhs; } @@ -294,6 +312,22 @@ impl<'ctx> GlobalTransformState<'ctx> { pub fn mark(&mut self, id: DefId, changed: Changed) { self.changed[id] |= changed; } + + /// Overlays the per-body [`Changed`] values from `other` onto this state, OR-ing each entry. + /// + /// This is useful when you want to combine the results of multiple passes into a single + /// state. + /// + /// # Panics + /// + /// Panics if the lengths of the two states are not equal. + pub fn overlay(&mut self, other: &DefIdSlice) { + assert_eq!(self.changed.len(), other.len()); + + for (target, &value) in self.changed.iter_mut().zip(other) { + *target |= value; + } + } } /// A global transformation pass over MIR. diff --git a/libs/@local/hashql/mir/src/pass/transform/canonicalization.rs b/libs/@local/hashql/mir/src/pass/transform/canonicalization.rs new file mode 100644 index 00000000000..72ea7b45d83 --- /dev/null +++ b/libs/@local/hashql/mir/src/pass/transform/canonicalization.rs @@ -0,0 +1,295 @@ +//! MIR canonicalization pass. +//! +//! This module contains the [`Canonicalization`] pass, which runs a fixpoint loop of local and +//! global transformations to simplify MIR bodies into a canonical form.
+ +use core::alloc::Allocator; + +use hashql_core::{heap::BumpAllocator, id::bit_vec::DenseBitSet}; + +use super::{ + AdministrativeReduction, CfgSimplify, DeadStoreElimination, ForwardSubstitution, InstSimplify, +}; +use crate::{ + body::Body, + context::MirContext, + def::{DefId, DefIdSlice}, + pass::{ + Changed, GlobalTransformPass, GlobalTransformState, TransformPass, + transform::CopyPropagation, + }, +}; + +#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct CanonicalizationConfig { + pub max_iterations: usize, +} + +impl Default for CanonicalizationConfig { + fn default() -> Self { + Self { max_iterations: 16 } + } +} + +/// MIR canonicalization driver. +/// +/// This pass orchestrates a sequence of local and global transformations in a fixpoint loop, +/// simplifying MIR bodies into a canonical form. Canonicalization reduces redundancy, propagates +/// values, and eliminates dead code to produce cleaner, more uniform MIR. +/// +/// # Pass Ordering +/// +/// The pass ordering is carefully chosen so each pass feeds the next with new opportunities: +/// +/// 1. **Administrative reduction** - Removes structural clutter and normalizes shape +/// 2. **Instruction simplification** - Constant folding and algebraic simplification +/// 3. **Value propagation** (FS/CP alternating) - Propagates values through the code +/// 4. **Dead store elimination** - Removes stores made dead by propagation +/// 5. **CFG simplification** - Cleans up control flow after local changes +pub struct Canonicalization { + alloc: A, + config: CanonicalizationConfig, +} + +impl Canonicalization { + /// Creates a new canonicalization pass with the given allocator. + /// + /// The allocator is used for temporary data structures within sub-passes and is reset + /// between pass invocations. + pub const fn new_in(config: CanonicalizationConfig, alloc: A) -> Self { + Self { alloc, config } + } + + /// Runs a local transform pass on all unstable bodies. 
+ /// + /// Only bodies in the `unstable` set are processed. The `state` slice is updated to track + /// which bodies were modified. + fn run_local_pass<'env, 'heap>( + context: &mut MirContext<'env, 'heap>, + bodies: &mut DefIdSlice>, + mut pass: impl TransformPass<'env, 'heap>, + unstable: &DenseBitSet, + state: &mut DefIdSlice, + ) -> Changed { + let mut changed = Changed::No; + + for (id, body) in bodies.iter_enumerated_mut() { + if !unstable.contains(id) { + continue; + } + + let result = pass.run(context, body); + changed |= result; + state[id] |= result; + } + + changed + } + + /// Runs a global transform pass on all bodies. + /// + /// Unlike local passes, global passes have access to all bodies and can perform + /// inter-procedural transformations. The `state` slice is updated by the pass to track + /// which bodies were modified. + fn run_global_pass<'env, 'heap>( + context: &mut MirContext<'env, 'heap>, + bodies: &mut DefIdSlice>, + mut pass: impl GlobalTransformPass<'env, 'heap>, + + state: &mut DefIdSlice, + ) -> Changed { + pass.run(context, &mut GlobalTransformState::new(state), bodies) + } + + fn copy_propagation<'heap>( + &mut self, + context: &mut MirContext<'_, 'heap>, + bodies: &mut DefIdSlice>, + unstable: &DenseBitSet, + state: &mut DefIdSlice, + ) -> Changed { + self.alloc.scoped(|alloc| { + let pass = CopyPropagation::new_in(alloc); + Self::run_local_pass(context, bodies, pass, unstable, state) + }) + } + + fn cfg_simplify<'heap>( + &mut self, + context: &mut MirContext<'_, 'heap>, + bodies: &mut DefIdSlice>, + unstable: &DenseBitSet, + state: &mut DefIdSlice, + ) -> Changed { + self.alloc.scoped(|alloc| { + let pass = CfgSimplify::new_in(alloc); + Self::run_local_pass(context, bodies, pass, unstable, state) + }) + } + + fn inst_simplify<'heap>( + &mut self, + context: &mut MirContext<'_, 'heap>, + bodies: &mut DefIdSlice>, + unstable: &DenseBitSet, + state: &mut DefIdSlice, + ) -> Changed { + self.alloc.scoped(|alloc| { + let pass = 
InstSimplify::new_in(alloc); + Self::run_local_pass(context, bodies, pass, unstable, state) + }) + } + + fn forward_substitution<'heap>( + &mut self, + context: &mut MirContext<'_, 'heap>, + bodies: &mut DefIdSlice>, + unstable: &DenseBitSet, + state: &mut DefIdSlice, + ) -> Changed { + self.alloc.scoped(|alloc| { + let pass = ForwardSubstitution::new_in(alloc); + Self::run_local_pass(context, bodies, pass, unstable, state) + }) + } + + fn administrative_reduction<'heap>( + &mut self, + context: &mut MirContext<'_, 'heap>, + bodies: &mut DefIdSlice>, + unstable: &mut DenseBitSet, + state: &mut DefIdSlice, + ) -> Changed { + let changed: Changed = self.alloc.scoped(|alloc| { + let pass = AdministrativeReduction::new_in(alloc); + Self::run_global_pass(context, bodies, pass, state) + }); + + if changed != Changed::No { + // If we've changed, re-queue any that have changed. This allows us to propagate changes + // earlier and potentially skip redundant iterations. + for (id, &changed) in state.iter_enumerated() { + if changed != Changed::No { + unstable.insert(id); + } + } + } + + changed + } + + fn dse<'heap>( + &mut self, + context: &mut MirContext<'_, 'heap>, + bodies: &mut DefIdSlice>, + unstable: &DenseBitSet, + state: &mut DefIdSlice, + ) -> Changed { + self.alloc.scoped(|alloc| { + let pass = DeadStoreElimination::new_in(alloc); + Self::run_local_pass(context, bodies, pass, unstable, state) + }) + } +} + +impl<'env, 'heap, A: BumpAllocator> GlobalTransformPass<'env, 'heap> for Canonicalization { + #[expect(clippy::integer_division_remainder_used)] + fn run( + &mut self, + context: &mut MirContext<'env, 'heap>, + state: &mut GlobalTransformState<'_>, + bodies: &mut DefIdSlice>, + ) -> Changed { + let global = state; + + // We allocate state on the heap rather than scratch because bump scopes require + // `&mut` access across iterations, and our generic allocator can't express the + // necessary lifetime bounds cleanly (limitation of the underlying bump-scope 
crate). + // Acceptable since this meta-pass runs once and the data is a single byte per body. + let state = { + let uninit = context.heap.allocate_slice_uninit(bodies.len()); + let init = uninit.write_filled(Changed::No); + + DefIdSlice::from_raw_mut(init) + }; + let mut unstable = DenseBitSet::new_filled(bodies.len()); + + // Pre-pass: run CP + CFG once before the fixpoint loop. + // + // Both passes are cheap and effective on obvious cases (e.g., `if true { ... } else { ... + // }`). CP exposes constant conditions; CFG then prunes unreachable blocks and + // merges straight-line code. This shrinks the MIR upfront so more expensive passes + // run on smaller, cleaner bodies. + let mut global_changed = Changed::No; + global_changed |= self.copy_propagation(context, bodies, &unstable, state); + global_changed |= self.cfg_simplify(context, bodies, &unstable, state); + + let mut iter = 0; + loop { + if iter >= self.config.max_iterations { + break; + } + + global.overlay(state); + + // Reset per-iteration state to track which bodies change in this iteration only. + state.as_raw_mut().fill(Changed::No); + + // The pass ordering is chosen so each pass feeds the next with new opportunities: + // + // 1. AR: Removes structural clutter (unnecessary wrappers, trivial blocks/calls) and + // normalizes shape, exposing simpler instructions for later passes. + // 2. IS: Simplifies individual instructions (constant folding, algebraic + // simplification) given the cleaner structure, producing canonical RHS values ideal + // for propagation. + // 3. FS / CP: Propagates values through the code, eliminating temporaries. After + // propagation, many stores become unused. + // 4. DSE: Removes stores made dead by propagation. Dropping these often empties blocks. + // 5. CS: Cleans up CFG after local changes (empty blocks, unconditional edges), + // producing a minimal CFG that maximizes the next iteration's effectiveness. 
+ + let mut changed = Changed::No; + changed |= self.administrative_reduction(context, bodies, &mut unstable, state); + changed |= self.inst_simplify(context, bodies, &unstable, state); + + // FS vs CP strategy: ForwardSubstitution is more powerful but expensive; + // CopyPropagation is cheaper but weaker. We start with FS (iter=0) to + // aggressively expose the biggest opportunities early when there's most + // redundancy. Subsequent iterations alternate: CP maintains propagation + // cheaply, while periodic FS picks up deeper opportunities. + changed |= if iter % 2 == 0 { + self.forward_substitution(context, bodies, &unstable, state) + } else { + self.copy_propagation(context, bodies, &unstable, state) + }; + + changed |= self.dse(context, bodies, &unstable, state); + changed |= self.cfg_simplify(context, bodies, &unstable, state); + + global_changed |= changed; + if changed == Changed::No { + break; + } + + // Update the unstable set based on this iteration's results. Bodies that had no changes + // are removed (monotonically decreasing), but global passes may re-add bodies by + // creating new optimization opportunities in previously stable functions. 
+ for (id, &changed) in state.iter_enumerated() { + if changed == Changed::No { + unstable.remove(id); + } else { + unstable.insert(id); + } + } + + if unstable.is_empty() { + break; + } + + iter += 1; + } + + global.overlay(state); + global_changed + } +} diff --git a/libs/@local/hashql/mir/src/pass/transform/mod.rs b/libs/@local/hashql/mir/src/pass/transform/mod.rs index 656bbc9ce06..463d2f85a5b 100644 --- a/libs/@local/hashql/mir/src/pass/transform/mod.rs +++ b/libs/@local/hashql/mir/src/pass/transform/mod.rs @@ -1,4 +1,5 @@ mod administrative_reduction; +mod canonicalization; mod cfg_simplify; mod copy_propagation; mod dbe; @@ -13,6 +14,7 @@ mod ssa_repair; pub use self::{ administrative_reduction::AdministrativeReduction, + canonicalization::{Canonicalization, CanonicalizationConfig}, cfg_simplify::CfgSimplify, copy_propagation::CopyPropagation, dbe::DeadBlockElimination, diff --git a/libs/@local/hashql/mir/src/pass/transform/pre_inline.rs b/libs/@local/hashql/mir/src/pass/transform/pre_inline.rs index 3802df1e24e..f750063cbad 100644 --- a/libs/@local/hashql/mir/src/pass/transform/pre_inline.rs +++ b/libs/@local/hashql/mir/src/pass/transform/pre_inline.rs @@ -1,52 +1,32 @@ //! Pre-inlining optimization pass. //! -//! This module contains the [`PreInline`] pass, which runs a fixpoint loop of local and global -//! transformations to optimize MIR bodies before inlining occurs. +//! This module contains the [`PreInline`] pass, a thin wrapper around [`Canonicalization`] that +//! runs with settings tuned for pre-inlining optimization. 
use core::alloc::Allocator; -use hashql_core::{heap::BumpAllocator, id::bit_vec::DenseBitSet}; +use hashql_core::heap::BumpAllocator; -use super::{ - AdministrativeReduction, CfgSimplify, DeadStoreElimination, ForwardSubstitution, InstSimplify, -}; +use super::{Canonicalization, CanonicalizationConfig}; use crate::{ body::Body, context::MirContext, - def::{DefId, DefIdSlice}, - pass::{ - Changed, GlobalTransformPass, GlobalTransformState, TransformPass, - transform::CopyPropagation, - }, + def::DefIdSlice, + pass::{Changed, GlobalTransformPass, GlobalTransformState}, }; /// Pre-inlining optimization driver. /// -/// This pass orchestrates a sequence of local and global transformations in a fixpoint loop, -/// preparing MIR bodies for inlining. By running these optimizations before inlining, we ensure -/// that: +/// A thin wrapper around [`Canonicalization`] configured for pre-inlining optimization. By running +/// canonicalization before inlining, we ensure that: /// /// - Inlined code is already simplified, reducing work after inlining /// - Call sites see optimized callees, enabling better inlining decisions /// - The overall MIR size is reduced before the potential code explosion from inlining /// -/// # Pass Ordering -/// -/// The pass ordering is carefully chosen so each pass feeds the next with new opportunities: -/// -/// 1. **Administrative reduction** - Removes structural clutter and normalizes shape -/// 2. **Instruction simplification** - Constant folding and algebraic simplification -/// 3. **Value propagation** (FS/CP alternating) - Propagates values through the code -/// 4. **Dead store elimination** - Removes stores made dead by propagation -/// 5. **CFG simplification** - Cleans up control flow after local changes -/// -/// # Implementation Notes -/// -/// This pass manages its own per-body change tracking and does not populate the caller-provided -/// [`GlobalTransformState`]. 
Callers receive a combined [`Changed`] result indicating whether any -/// body was modified. +/// See [`Canonicalization`] for details on the pass ordering and implementation. pub struct PreInline { - alloc: A, + canonicalization: Canonicalization, } impl PreInline { @@ -55,224 +35,22 @@ impl PreInline { /// The allocator is used for temporary data structures within sub-passes and is reset /// between pass invocations. pub const fn new_in(alloc: A) -> Self { - Self { alloc } - } - - /// Runs a local transform pass on all unstable bodies. - /// - /// Only bodies in the `unstable` set are processed. The `state` slice is updated to track - /// which bodies were modified. - fn run_local_pass<'env, 'heap>( - context: &mut MirContext<'env, 'heap>, - bodies: &mut DefIdSlice>, - mut pass: impl TransformPass<'env, 'heap>, - unstable: &DenseBitSet, - state: &mut DefIdSlice, - ) -> Changed { - let mut changed = Changed::No; - - for (id, body) in bodies.iter_enumerated_mut() { - if !unstable.contains(id) { - continue; - } - - let result = pass.run(context, body); - changed |= result; - state[id] |= result; + Self { + canonicalization: Canonicalization::new_in( + CanonicalizationConfig { max_iterations: 8 }, + alloc, + ), } - - changed - } - - /// Runs a global transform pass on all bodies. - /// - /// Unlike local passes, global passes have access to all bodies and can perform - /// inter-procedural transformations. The `state` slice is updated by the pass to track - /// which bodies were modified. 
- fn run_global_pass<'env, 'heap>( - context: &mut MirContext<'env, 'heap>, - bodies: &mut DefIdSlice>, - mut pass: impl GlobalTransformPass<'env, 'heap>, - - state: &mut DefIdSlice, - ) -> Changed { - pass.run(context, &mut GlobalTransformState::new(state), bodies) - } - - fn copy_propagation<'heap>( - &mut self, - context: &mut MirContext<'_, 'heap>, - bodies: &mut DefIdSlice>, - unstable: &DenseBitSet, - state: &mut DefIdSlice, - ) -> Changed { - self.alloc.scoped(|alloc| { - let pass = CopyPropagation::new_in(alloc); - Self::run_local_pass(context, bodies, pass, unstable, state) - }) - } - - fn cfg_simplify<'heap>( - &mut self, - context: &mut MirContext<'_, 'heap>, - bodies: &mut DefIdSlice>, - unstable: &DenseBitSet, - state: &mut DefIdSlice, - ) -> Changed { - self.alloc.scoped(|alloc| { - let pass = CfgSimplify::new_in(alloc); - Self::run_local_pass(context, bodies, pass, unstable, state) - }) - } - - fn inst_simplify<'heap>( - &mut self, - context: &mut MirContext<'_, 'heap>, - bodies: &mut DefIdSlice>, - unstable: &DenseBitSet, - state: &mut DefIdSlice, - ) -> Changed { - self.alloc.scoped(|alloc| { - let pass = InstSimplify::new_in(alloc); - Self::run_local_pass(context, bodies, pass, unstable, state) - }) - } - - fn forward_substitution<'heap>( - &mut self, - context: &mut MirContext<'_, 'heap>, - bodies: &mut DefIdSlice>, - unstable: &DenseBitSet, - state: &mut DefIdSlice, - ) -> Changed { - self.alloc.scoped(|alloc| { - let pass = ForwardSubstitution::new_in(alloc); - Self::run_local_pass(context, bodies, pass, unstable, state) - }) - } - - fn administrative_reduction<'heap>( - &mut self, - context: &mut MirContext<'_, 'heap>, - bodies: &mut DefIdSlice>, - - state: &mut DefIdSlice, - ) -> Changed { - self.alloc.scoped(|alloc| { - let pass = AdministrativeReduction::new_in(alloc); - Self::run_global_pass(context, bodies, pass, state) - }) - } - - fn dse<'heap>( - &mut self, - context: &mut MirContext<'_, 'heap>, - bodies: &mut DefIdSlice>, - unstable: 
&DenseBitSet, - state: &mut DefIdSlice, - ) -> Changed { - self.alloc.scoped(|alloc| { - let pass = DeadStoreElimination::new_in(alloc); - Self::run_local_pass(context, bodies, pass, unstable, state) - }) } } -const MAX_ITERATIONS: usize = 16; - impl<'env, 'heap, A: BumpAllocator> GlobalTransformPass<'env, 'heap> for PreInline { - #[expect(clippy::integer_division_remainder_used)] fn run( &mut self, context: &mut MirContext<'env, 'heap>, - _: &mut GlobalTransformState<'_>, + state: &mut GlobalTransformState<'_>, bodies: &mut DefIdSlice>, ) -> Changed { - // We allocate state on the heap rather than scratch because bump scopes require - // `&mut` access across iterations, and our generic allocator can't express the - // necessary lifetime bounds cleanly (limitation of the underlying bump-scope crate). - // Acceptable since this meta-pass runs once and the data is a single byte per body. - let state = { - let uninit = context.heap.allocate_slice_uninit(bodies.len()); - let init = uninit.write_filled(Changed::No); - - DefIdSlice::from_raw_mut(init) - }; - let mut unstable = DenseBitSet::new_filled(bodies.len()); - - // Pre-pass: run CP + CFG once before the fixpoint loop. - // - // Both passes are cheap and effective on obvious cases (e.g., `if true { ... } else { ... - // }`). CP exposes constant conditions; CFG then prunes unreachable blocks and - // merges straight-line code. This shrinks the MIR upfront so more expensive passes - // run on smaller, cleaner bodies. - let mut global_changed = Changed::No; - global_changed |= self.copy_propagation(context, bodies, &unstable, state); - global_changed |= self.cfg_simplify(context, bodies, &unstable, state); - - let mut iter = 0; - loop { - if iter >= MAX_ITERATIONS { - break; - } - - // Reset per-iteration state to track which bodies change in this iteration only. - state.as_raw_mut().fill(Changed::No); - - // The pass ordering is chosen so each pass feeds the next with new opportunities: - // - // 1. 
AR: Removes structural clutter (unnecessary wrappers, trivial blocks/calls) and - // normalizes shape, exposing simpler instructions for later passes. - // 2. IS: Simplifies individual instructions (constant folding, algebraic - // simplification) given the cleaner structure, producing canonical RHS values ideal - // for propagation. - // 3. FS / CP: Propagates values through the code, eliminating temporaries. After - // propagation, many stores become unused. - // 4. DSE: Removes stores made dead by propagation. Dropping these often empties blocks. - // 5. CS: Cleans up CFG after local changes (empty blocks, unconditional edges), - // producing a minimal CFG that maximizes the next iteration's effectiveness. - - let mut changed = Changed::No; - changed |= self.administrative_reduction(context, bodies, state); - changed |= self.inst_simplify(context, bodies, &unstable, state); - - // FS vs CP strategy: ForwardSubstitution is more powerful but expensive; - // CopyPropagation is cheaper but weaker. We start with FS (iter=0) to - // aggressively expose the biggest opportunities early when there's most - // redundancy. Subsequent iterations alternate: CP maintains propagation - // cheaply, while periodic FS picks up deeper opportunities. - changed |= if iter % 2 == 0 { - self.forward_substitution(context, bodies, &unstable, state) - } else { - self.copy_propagation(context, bodies, &unstable, state) - }; - - changed |= self.dse(context, bodies, &unstable, state); - changed |= self.cfg_simplify(context, bodies, &unstable, state); - - global_changed |= changed; - if changed == Changed::No { - break; - } - - // Update the unstable set based on this iteration's results. Bodies that had no changes - // are removed (monotonically decreasing), but global passes may re-add bodies by - // creating new optimization opportunities in previously stable functions. 
- for (id, &changed) in state.iter_enumerated() { - if changed == Changed::No { - unstable.remove(id); - } else { - unstable.insert(id); - } - } - - if unstable.is_empty() { - break; - } - - iter += 1; - } - - global_changed + self.canonicalization.run(context, state, bodies) } }