From db3c38913d87addb7244ff07b9d58844b9987b1a Mon Sep 17 00:00:00 2001
From: Jim Huang
Date: Tue, 26 Aug 2025 19:50:13 +0800
Subject: [PATCH 01/10] Add redundant move elimination patterns
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit implements redundant move elimination to optimize away
unnecessary move operations that are immediately overwritten, targeting
common inefficiencies in compiler-generated code.

Added 5 optimization patterns:
- Consecutive assignments to same destination:
  {mov rd,rs1; mov rd,rs2} → {mov rd,rs2}
- Load immediately overwritten: {load rd,offset; mov rd,rs} → {mov rd,rs}
- Constant load immediately overwritten: {li rd,imm; mov rd,rs} → {mov rd,rs}
- Consecutive loads to same register:
  {load rd,off1; load rd,off2} → {load rd,off2}
- Consecutive constant loads: {li rd,imm1; li rd,imm2} → {li rd,imm2}
---
 src/peephole.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 85 insertions(+), 1 deletion(-)

diff --git a/src/peephole.c b/src/peephole.c
index f3282b71..3edd3641 100644
--- a/src/peephole.c
+++ b/src/peephole.c
@@ -242,6 +242,84 @@ bool insn_fusion(ph2_ir_t *ph2_ir)
     return false;
 }
 
+/* Redundant move elimination
+ * Eliminates unnecessary move operations that are overwritten or redundant
+ */
+bool redundant_move_elim(ph2_ir_t *ph2_ir)
+{
+    ph2_ir_t *next = ph2_ir->next;
+    if (!next)
+        return false;
+
+    /* Pattern 1: Consecutive assignments to same destination
+     * {mov rd, rs1; mov rd, rs2} → {mov rd, rs2}
+     * The first move is completely overwritten by the second
+     */
+    if (ph2_ir->op == OP_assign && next->op == OP_assign &&
+        ph2_ir->dest == next->dest) {
+        /* Replace first move with second, skip second */
+        ph2_ir->src0 = next->src0;
+        ph2_ir->next = next->next;
+        return true;
+    }
+
+    /* Pattern 2: Redundant load immediately overwritten
+     * {load rd, offset; mov rd, rs} → {mov rd, rs}
+     * Loading a value that's immediately replaced is wasteful
+     */
+    if ((ph2_ir->op == OP_load || ph2_ir->op == OP_global_load) &&
+        next->op == OP_assign && ph2_ir->dest == next->dest) {
+        /* Replace load with move */
+        ph2_ir->op = OP_assign;
+        ph2_ir->src0 = next->src0;
+        ph2_ir->src1 = 0; /* Clear unused field */
+        ph2_ir->next = next->next;
+        return true;
+    }
+
+    /* Pattern 3: Load constant immediately overwritten
+     * {li rd, imm; mov rd, rs} → {mov rd, rs}
+     * Loading a constant that's immediately replaced
+     */
+    if (ph2_ir->op == OP_load_constant && next->op == OP_assign &&
+        ph2_ir->dest == next->dest) {
+        /* Replace constant load with move */
+        ph2_ir->op = OP_assign;
+        ph2_ir->src0 = next->src0;
+        ph2_ir->next = next->next;
+        return true;
+    }
+
+    /* Pattern 4: Consecutive loads to same register
+     * {load rd, offset1; load rd, offset2} → {load rd, offset2}
+     * First load is pointless if immediately overwritten
+     */
+    if ((ph2_ir->op == OP_load || ph2_ir->op == OP_global_load) &&
+        (next->op == OP_load || next->op == OP_global_load) &&
+        ph2_ir->dest == next->dest) {
+        /* Keep only the second load */
+        ph2_ir->op = next->op;
+        ph2_ir->src0 = next->src0;
+        ph2_ir->src1 = next->src1;
+        ph2_ir->next = next->next;
+        return true;
+    }
+
+    /* Pattern 5: Consecutive constant loads (already handled in main loop
+     * but included here for completeness)
+     * {li rd, imm1; li rd, imm2} → {li rd, imm2}
+     */
+    if (ph2_ir->op == OP_load_constant && next->op == OP_load_constant &&
+        ph2_ir->dest == next->dest) {
+        /* Keep only the second constant */
+        ph2_ir->src0 = next->src0;
+        ph2_ir->next = next->next;
+        return true;
+    }
+
+    return false;
+}
+
 /* Main peephole optimization driver.
  * It iterates through all functions, basic blocks, and IR instructions to apply
  * local optimizations on adjacent instruction pairs.
@@ -265,7 +343,13 @@ void peephole(void)
                 continue;
             }
 
-            insn_fusion(ir);
+            /* Try instruction fusion first */
+            if (insn_fusion(ir))
+                continue;
+
+            /* Apply redundant move elimination */
+            if (redundant_move_elim(ir))
+                continue;
         }
     }
 }
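To make the rewrite mechanics concrete, here is a minimal standalone sketch of Pattern 1 over a simplified instruction list. The insn_t type and field layout below are invented for this illustration and are much simpler than the real ph2_ir_t:

#include <assert.h>
#include <stddef.h>

typedef enum { OP_assign, OP_load_constant } op_t;

typedef struct insn {
    op_t op;
    int dest, src0;
    struct insn *next;
} insn_t;

/* Pattern 1 in miniature: {mov rd, rs1; mov rd, rs2} → {mov rd, rs2} */
static int move_move_elim(insn_t *i)
{
    insn_t *next = i->next;
    if (!next)
        return 0;
    if (i->op == OP_assign && next->op == OP_assign && i->dest == next->dest) {
        i->src0 = next->src0; /* first move absorbs the second's source */
        i->next = next->next; /* unlink the now-redundant second move */
        return 1;
    }
    return 0;
}

int main(void)
{
    insn_t second = {OP_assign, /*dest=*/1, /*src0=*/3, NULL};
    insn_t first = {OP_assign, /*dest=*/1, /*src0=*/2, &second};
    assert(move_move_elim(&first));
    assert(first.src0 == 3 && first.next == NULL);
    return 0;
}

The same absorb-and-unlink shape recurs in all five patterns of the patch.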
From 077113f6456d26db028c7a92fc73802584927eae Mon Sep 17 00:00:00 2001
From: Jim Huang
Date: Tue, 26 Aug 2025 20:21:49 +0800
Subject: [PATCH 02/10] Add dead code elimination for unreachable blocks

This commit implements dead code elimination that works in conjunction
with SCCP to remove unreachable code after constant propagation and
branch folding.

These optimizations target code that becomes dead after constant
propagation, such as:
- Branches with constant conditions (if(1), if(0))
- Instructions that are immediately overwritten
- Unreachable code blocks after branch folding
---
 src/peephole.c | 116 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 116 insertions(+)

diff --git a/src/peephole.c b/src/peephole.c
index 3edd3641..11256aca 100644
--- a/src/peephole.c
+++ b/src/peephole.c
@@ -320,6 +320,117 @@ bool redundant_move_elim(ph2_ir_t *ph2_ir)
     return false;
 }
 
+/* Simple dead instruction elimination within basic blocks.
+ * Removes instructions whose results are never used (dead stores).
+ * Works in conjunction with existing SSA-based DCE.
+ */
+bool eliminate_dead_instructions(func_t *func)
+{
+    if (!func || !func->bbs)
+        return false;
+
+    bool changed = false;
+
+    for (basic_block_t *bb = func->bbs; bb; bb = bb->rpo_next) {
+        ph2_ir_t *ir = bb->ph2_ir_list.head;
+        while (ir && ir->next) {
+            ph2_ir_t *next = ir->next;
+
+            /* Check if next instruction immediately overwrites this one's
+             * result */
+            if (ir->op == OP_load_constant && next->op == OP_load_constant &&
+                ir->dest == next->dest) {
+                /* Consecutive constant loads to same register - first is dead
+                 */
+                ir->next = next->next;
+                if (next == bb->ph2_ir_list.tail) {
+                    bb->ph2_ir_list.tail = ir;
+                }
+                changed = true;
+                continue;
+            }
+
+            /* Check for dead arithmetic results */
+            if ((ir->op == OP_add || ir->op == OP_sub || ir->op == OP_mul) &&
+                next->op == OP_assign && ir->dest == next->dest) {
+                /* Arithmetic result immediately overwritten by assignment */
+                ir->next = next->next;
+                if (next == bb->ph2_ir_list.tail) {
+                    bb->ph2_ir_list.tail = ir;
+                }
+                changed = true;
+                continue;
+            }
+
+            ir = ir->next;
+        }
+    }
+
+    return changed;
+}
+
+/* Simple constant folding for branches after SCCP.
+ * Converts branches with obvious constant conditions to jumps.
+ * Very conservative to maintain bootstrap stability.
+ */
+bool fold_constant_branches(func_t *func)
+{
+    if (!func || !func->bbs)
+        return false;
+
+    bool changed = false;
+
+    for (basic_block_t *bb = func->bbs; bb; bb = bb->rpo_next) {
+        if (!bb->ph2_ir_list.tail)
+            continue;
+
+        ph2_ir_t *last = bb->ph2_ir_list.tail;
+
+        /* Only handle branches */
+        if (last->op != OP_branch || last->src0 < 0)
+            continue;
+
+        /* Scan the block for a constant load reaching the branch condition */
+        ph2_ir_t *prev = bb->ph2_ir_list.head;
+        ph2_ir_t *found = NULL;
+
+        /* Find the most recent constant load to the branch condition register
+         */
+        while (prev && prev != last) {
+            if (prev->op == OP_load_constant && prev->dest == last->src0) {
+                found = prev;
+                /* Keep looking - want the most recent load */
+            }
+            /* Any other write to this register invalidates the fold */
+            else if (prev->dest == last->src0) {
+                found = NULL; /* Register was modified, can't fold */
+            }
+            prev = prev->next;
+        }
+
+        if (found) {
+            /* Found constant condition - convert branch to jump */
+            int const_val = found->src0;
+
+            /* Just change the opcode, don't modify CFG edges directly */
+            last->op = OP_jump;
+
+            if (const_val != 0) {
+                /* Always take then branch */
+                last->next_bb = bb->then_;
+            } else {
+                /* Always take else branch */
+                last->next_bb = bb->else_;
+            }
+
+            /* Don't modify src0 or CFG edges - let later passes handle it */
+            changed = true;
+        }
+    }
+
+    return changed;
+}
+
 /* Main peephole optimization driver.
  * It iterates through all functions, basic blocks, and IR instructions to apply
  * local optimizations on adjacent instruction pairs.
@@ -327,6 +438,11 @@ bool redundant_move_elim(ph2_ir_t *ph2_ir)
 void peephole(void)
 {
     for (func_t *func = FUNC_LIST.head; func; func = func->next) {
+        /* Phase 1: Dead code elimination working with SCCP results */
+        eliminate_dead_instructions(func);
+        fold_constant_branches(func);
+
+        /* Phase 2: Local peephole optimizations */
         for (basic_block_t *bb = func->bbs; bb; bb = bb->rpo_next) {
             for (ph2_ir_t *ir = bb->ph2_ir_list.head; ir; ir = ir->next) {
                 ph2_ir_t *next = ir->next;
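As a source-level illustration (an assumed example, not taken from the shecc test suite), this is the kind of input the two passes aim at once SCCP has run:

int example(void)
{
    int x = 0;  /* li x, 0: dead, immediately overwritten below */
    x = 42;     /* li x, 42 */
    if (1)      /* constant condition: foldable to an unconditional jump */
        return x;
    else
        return -x; /* unreachable once the branch is folded */
}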
From 7c7d34db42f249bfcb5c0a162720dcc39e578a8d Mon Sep 17 00:00:00 2001
From: Jim Huang
Date: Tue, 26 Aug 2025 20:45:21 +0800
Subject: [PATCH 03/10] Add comprehensive load/store elimination

This extends load/store elimination with more aggressive patterns,
reducing memory traffic by eliminating redundant memory operations.

Local memory optimizations:
- Dead store elimination: Consecutive stores to same location
- Redundant load elimination: Consecutive loads from same location
- Store-to-load forwarding: Replace load with stored value
- Load-store redundancy: Remove store of just-loaded value

Global memory optimizations:
- Global dead store elimination
- Global redundant load elimination
---
 src/peephole.c | 103 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 103 insertions(+)

diff --git a/src/peephole.c b/src/peephole.c
index 11256aca..2b314b25 100644
--- a/src/peephole.c
+++ b/src/peephole.c
@@ -431,6 +431,105 @@ bool fold_constant_branches(func_t *func)
     return changed;
 }
 
+/* Load/store elimination for consecutive memory operations.
+ * Removes redundant loads and dead stores that access the same memory location.
+ * Conservative implementation to maintain bootstrap stability.
+ */
+bool eliminate_load_store_pairs(ph2_ir_t *ph2_ir)
+{
+    ph2_ir_t *next = ph2_ir->next;
+    if (!next)
+        return false;
+
+    /* Only handle local loads/stores for now (not globals) to be safe */
+
+    /* Pattern 1: Consecutive stores to same local location
+     * {store [addr], val1; store [addr], val2} → {store [addr], val2}
+     * First store is dead if immediately overwritten
+     */
+    if (ph2_ir->op == OP_store && next->op == OP_store) {
+        /* Check if storing to same memory location */
+        if (ph2_ir->src0 == next->src0 && ph2_ir->src1 == next->src1 &&
+            ph2_ir->src0 >= 0 && ph2_ir->src1 >= 0) {
+            /* Remove first store - it's dead */
+            ph2_ir->dest = next->dest;
+            ph2_ir->next = next->next;
+            return true;
+        }
+    }
+
+    /* Pattern 2: Redundant consecutive loads from same local location
+     * {load rd1, [addr]; load rd2, [addr]} → {load rd1, [addr]; mov rd2, rd1}
+     * Second load can reuse the first load's result
+     * Only apply if addresses are simple (not complex expressions)
+     */
+    if (ph2_ir->op == OP_load && next->op == OP_load) {
+        /* Check if loading from same memory location */
+        if (ph2_ir->src0 == next->src0 && ph2_ir->src1 == next->src1 &&
+            ph2_ir->src0 >= 0 && ph2_ir->src1 >= 0) {
+            /* Replace second load with move */
+            next->op = OP_assign;
+            next->src0 = ph2_ir->dest; /* Result of first load */
+            next->src1 = 0;
+            return true;
+        }
+    }
+
+    /* Pattern 3: Store followed by load from same location
+     * (store-to-load forwarding)
+     * {store [addr], val; load rd, [addr]} → {store [addr], val; mov rd, val}
+     * The load can use the stored value directly
+     */
+    if (ph2_ir->op == OP_store && next->op == OP_load) {
+        /* Check if accessing same memory location */
+        if (ph2_ir->src0 == next->src0 && ph2_ir->src1 == next->src1 &&
+            ph2_ir->src0 >= 0 && ph2_ir->dest >= 0) {
+            /* Replace load with move of stored value */
+            next->op = OP_assign;
+            next->src0 = ph2_ir->dest; /* Value that was stored */
+            next->src1 = 0;
+            return true;
+        }
+    }
+
+    /* Pattern 4: Load followed by redundant store of same value
+     * {load rd, [addr]; store [addr], rd} → {load rd, [addr]}
+     * The store is redundant if storing back the just-loaded value
+     */
+    if (ph2_ir->op == OP_load && next->op == OP_store) {
+        /* Check if storing the value we just loaded from same location */
+        if (ph2_ir->dest == next->dest && ph2_ir->src0 == next->src0 &&
+            ph2_ir->src1 == next->src1 && ph2_ir->src0 >= 0) {
+            /* Remove redundant store */
+            ph2_ir->next = next->next;
+            return true;
+        }
+    }
+
+    /* Pattern 5: Global store/load optimizations (carefully enabled) */
+    if (ph2_ir->op == OP_global_store && next->op == OP_global_store) {
+        /* Consecutive global stores to same location */
+        if (ph2_ir->src0 == next->src0 && ph2_ir->src1 == next->src1) {
+            /* Remove first store - it's dead */
+            ph2_ir->dest = next->dest;
+            ph2_ir->next = next->next;
+            return true;
+        }
+    }
+
+    if (ph2_ir->op == OP_global_load && next->op == OP_global_load) {
+        /* Consecutive global loads from same location */
+        if (ph2_ir->src0 == next->src0 && ph2_ir->src1 == next->src1) {
+            /* Replace second load with move */
+            next->op = OP_assign;
+            next->src0 = ph2_ir->dest;
+            next->src1 = 0;
+            return true;
+        }
+    }
+
+    return false;
+}
+
 /* Main peephole optimization driver.
  * It iterates through all functions, basic blocks, and IR instructions to apply
  * local optimizations on adjacent instruction pairs.
@@ -466,6 +565,10 @@ void peephole(void)
             /* Apply redundant move elimination */
             if (redundant_move_elim(ir))
                 continue;
+
+            /* Apply load/store elimination */
+            if (eliminate_load_store_pairs(ir))
+                continue;
         }
     }
 }
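A hypothetical C fragment showing where these memory patterns arise; assuming `slot` stays in memory rather than being promoted to a register, the commented IR is roughly what the pass would see:

int slot;

int put_get(int v)
{
    slot = v;    /* store slot, v */
    return slot; /* load from the same slot: Pattern 3 forwards v */
}

int last_wins(void)
{
    slot = 1;    /* dead store: Pattern 1 removes it */
    slot = 2;
    return slot; /* Pattern 3 again: becomes a move of 2 */
}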
From f7cfbc0665399a7e026ba4f4ddadd8d92dd986f3 Mon Sep 17 00:00:00 2001
From: Jim Huang
Date: Tue, 26 Aug 2025 23:39:20 +0800
Subject: [PATCH 04/10] Add algebraic simplification to peephole optimizer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This implements mathematical identity patterns on register operands:
- Self-subtraction: x - x → 0
- Self-XOR: x ^ x → 0
- Self-OR: x | x → x (identity)
- Self-AND: x & x → x (identity)

These patterns emerge after register allocation when different variables
are assigned to the same register. SSA handles constant folding,
peephole handles register-based patterns.
---
 src/peephole.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)

diff --git a/src/peephole.c b/src/peephole.c
index 2b314b25..653593dd 100644
--- a/src/peephole.c
+++ b/src/peephole.c
@@ -530,6 +530,77 @@ bool eliminate_load_store_pairs(ph2_ir_t *ph2_ir)
     return false;
 }
 
+/* Algebraic simplification: Apply mathematical identities to simplify
+ * expressions
+ *
+ * This function handles patterns that SSA cannot see:
+ * - Self-operations on registers (x-x, x^x, x|x, x&x)
+ * - These patterns emerge after register allocation when different
+ *   variables are assigned to the same register
+ *
+ * SSA handles: Constant folding with known values (5+3 → 8)
+ * Peephole handles: Register-based patterns (r1-r1 → 0)
+ *
+ * Returns true if optimization was applied
+ */
+bool algebraic_simplification(ph2_ir_t *ph2_ir)
+{
+    if (!ph2_ir)
+        return false;
+
+    /* NOTE: SSA's const_folding handles constant operations with known values.
+     * We focus on register-based patterns that appear after register
+     * allocation.
+     */
+
+    /* Pattern 1: Self-subtraction → 0
+     * x - x = 0 (for register operands)
+     */
+    if (ph2_ir->op == OP_sub && ph2_ir->src0 == ph2_ir->src1) {
+        ph2_ir->op = OP_load_constant;
+        ph2_ir->src0 = 0; /* result is 0 */
+        ph2_ir->src1 = 0; /* clear unused field */
+        return true;
+    }
+
+    /* Pattern 2: Self-XOR → 0
+     * x ^ x = 0 (for register operands)
+     */
+    if (ph2_ir->op == OP_bit_xor && ph2_ir->src0 == ph2_ir->src1) {
+        ph2_ir->op = OP_load_constant;
+        ph2_ir->src0 = 0; /* result is 0 */
+        ph2_ir->src1 = 0; /* clear unused field */
+        return true;
+    }
+
+    /* Pattern 3: Self-OR → x
+     * x | x = x (identity operation for register operands)
+     */
+    if (ph2_ir->op == OP_bit_or && ph2_ir->src0 == ph2_ir->src1) {
+        ph2_ir->op = OP_assign;
+        /* src0 already contains x, just need to move it */
+        ph2_ir->src1 = 0; /* clear unused field */
+        return true;
+    }
+
+    /* Pattern 4: Self-AND → x
+     * x & x = x (identity operation for register operands)
+     */
+    if (ph2_ir->op == OP_bit_and && ph2_ir->src0 == ph2_ir->src1) {
+        ph2_ir->op = OP_assign;
+        /* src0 already contains x, just need to move it */
+        ph2_ir->src1 = 0; /* clear unused field */
+        return true;
+    }
+
+    /* NOTE: Arithmetic identity patterns (x+0, x*1, x*0, x-0) are already
+     * handled by SSA's const_folding() function and insn_fusion().
+     * We focus on register-level patterns that SSA cannot see.
+     */
+
+    return false;
+}
+
 /* Main peephole optimization driver.
  * It iterates through all functions, basic blocks, and IR instructions to apply
  * local optimizations on adjacent instruction pairs.
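The identities behind the four patterns, restated as a runnable self-check in plain ISO C, independent of the compiler internals:

#include <assert.h>

int main(void)
{
    int x = 1234;
    assert(x - x == 0);   /* self-subtraction */
    assert((x ^ x) == 0); /* self-XOR */
    assert((x | x) == x); /* self-OR is the identity */
    assert((x & x) == x); /* self-AND is the identity */
    return 0;
}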
From 3c24984a3a60458ae8cc35a7a1572b45bd0c4bc5 Mon Sep 17 00:00:00 2001
From: Jim Huang
Date: Tue, 26 Aug 2025 23:39:47 +0800
Subject: [PATCH 05/10] Add strength reduction optimizations to peephole
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This implements power-of-2 strength reduction patterns:
- Division by 2^n → right shift by n
- Modulo by 2^n → bitwise AND with (2^n - 1)
- Multiplication by 2^n → left shift by n

This optimization is unique to the peephole optimizer since SSA works on
virtual registers before actual constants are loaded.
---
 src/peephole.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)

diff --git a/src/peephole.c b/src/peephole.c
index 653593dd..efb0f861 100644
--- a/src/peephole.c
+++ b/src/peephole.c
@@ -601,6 +601,82 @@ bool algebraic_simplification(ph2_ir_t *ph2_ir)
     return false;
 }
 
+/* Division/modulo strength reduction: Optimize division and modulo by
+ * power-of-2
+ *
+ * This pattern is unique to the peephole optimizer.
+ * SSA cannot perform this optimization because it works on virtual registers
+ * before actual constant values are loaded.
+ *
+ * Returns true if optimization was applied
+ */
+bool strength_reduction(ph2_ir_t *ph2_ir)
+{
+    if (!ph2_ir || !ph2_ir->next)
+        return false;
+
+    ph2_ir_t *next = ph2_ir->next;
+
+    /* Check for constant load followed by division or modulo */
+    if (ph2_ir->op != OP_load_constant)
+        return false;
+
+    int value = ph2_ir->src0;
+
+    /* Check if value is a power of 2 */
+    if (value <= 0 || (value & (value - 1)) != 0)
+        return false;
+
+    /* Calculate shift amount for power of 2 */
+    int shift = 0;
+    int tmp = value;
+    while (tmp > 1) {
+        shift++;
+        tmp >>= 1;
+    }
+
+    /* Pattern 1: Division by power of 2 → right shift
+     * x / 2^n = x >> n (valid only for non-negative dividends)
+     */
+    if (next->op == OP_div && next->src1 == ph2_ir->dest) {
+        /* Convert division to right shift */
+        ph2_ir->src0 = shift; /* Load shift amount instead */
+        next->op = OP_rshift;
+        return true;
+    }
+
+    /* Pattern 2: Modulo by power of 2 → bitwise AND
+     * x % 2^n = x & (2^n - 1) (valid only for non-negative operands)
+     */
+    if (next->op == OP_mod && next->src1 == ph2_ir->dest) {
+        /* Convert modulo to bitwise AND */
+        ph2_ir->src0 = value - 1; /* Load mask (2^n - 1) */
+        next->op = OP_bit_and;
+        return true;
+    }
+
+    /* Pattern 3: Multiplication by power of 2 → left shift
+     * x * 2^n = x << n
+     */
+    if (next->op == OP_mul) {
+        if (next->src0 == ph2_ir->dest) {
+            /* 2^n * x = x << n */
+            ph2_ir->src0 = shift;      /* Load shift amount */
+            next->op = OP_lshift;
+            next->src0 = next->src1;   /* Move x to src0 */
+            next->src1 = ph2_ir->dest; /* Shift amount in src1 */
+            return true;
+        } else if (next->src1 == ph2_ir->dest) {
+            /* x * 2^n = x << n */
+            ph2_ir->src0 = shift; /* Load shift amount */
+            next->op = OP_lshift;
+            return true;
+        }
+    }
+
+    return false;
+}
+
 /* Main peephole optimization driver.
  * It iterates through all functions, basic blocks, and IR instructions to apply
  * local optimizations on adjacent instruction pairs.
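The power-of-2 test and the shift computation can be checked in isolation. The helpers below restate the detection logic from the patch; note that the div/mod rewrites only agree with C semantics for non-negative operands, since C division truncates toward zero while right shifts floor:

#include <assert.h>

/* Power-of-2 test and log2 as used by the pass, restated standalone */
static int is_pow2(int v) { return v > 0 && (v & (v - 1)) == 0; }

static int log2_pow2(int v)
{
    int shift = 0;
    while (v > 1) {
        shift++;
        v >>= 1;
    }
    return shift;
}

int main(void)
{
    assert(is_pow2(8) && log2_pow2(8) == 3);
    assert(!is_pow2(12) && !is_pow2(0));

    int x = 100;
    assert(x / 8 == x >> 3);  /* division by 2^n, non-negative x only */
    assert(x % 8 == (x & 7)); /* modulo by 2^n, non-negative x only */
    assert(x * 8 == x << 3);  /* multiplication by 2^n */
    return 0;
}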
From 1753776778756b81b1865aeef7f6525eca474cdb Mon Sep 17 00:00:00 2001
From: Jim Huang
Date: Tue, 26 Aug 2025 23:40:12 +0800
Subject: [PATCH 06/10] Add comparison optimization patterns to peephole
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This implements self-comparison optimizations:
- x != x → 0 (always false)
- x == x → 1 (always true)
- x < x → 0 (always false)
- x > x → 0 (always false)
- x <= x → 1 (always true)
- x >= x → 1 (always true)

These register-based patterns appear after register allocation when
different variables are assigned to the same register. Complements
SSA's SCCP constant comparison folding.
---
 src/peephole.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)

diff --git a/src/peephole.c b/src/peephole.c
index efb0f861..ba245a44 100644
--- a/src/peephole.c
+++ b/src/peephole.c
@@ -677,6 +677,82 @@ bool strength_reduction(ph2_ir_t *ph2_ir)
     return false;
 }
 
+/* Comparison optimization: Simplify comparison patterns
+ * Focus on register-based patterns that SSA's SCCP misses
+ * Returns true if optimization was applied
+ */
+bool comparison_optimization(ph2_ir_t *ph2_ir)
+{
+    if (!ph2_ir)
+        return false;
+
+    /* NOTE: SSA's SCCP handles constant comparisons, so we focus on
+     * register-based self-comparisons after register allocation
+     */
+
+    /* Pattern 1: Self-comparison always false for !=
+     * x != x → 0 (for register operands)
+     */
+    if (ph2_ir->op == OP_neq && ph2_ir->src0 == ph2_ir->src1) {
+        ph2_ir->op = OP_load_constant;
+        ph2_ir->src0 = 0; /* always false */
+        ph2_ir->src1 = 0;
+        return true;
+    }
+
+    /* Pattern 2: Self-comparison always true for ==
+     * x == x → 1 (for register operands)
+     */
+    if (ph2_ir->op == OP_eq && ph2_ir->src0 == ph2_ir->src1) {
+        ph2_ir->op = OP_load_constant;
+        ph2_ir->src0 = 1; /* always true */
+        ph2_ir->src1 = 0;
+        return true;
+    }
+
+    /* Pattern 3: Self-comparison for less-than
+     * x < x → 0 (always false)
+     */
+    if (ph2_ir->op == OP_lt && ph2_ir->src0 == ph2_ir->src1) {
+        ph2_ir->op = OP_load_constant;
+        ph2_ir->src0 = 0; /* always false */
+        ph2_ir->src1 = 0;
+        return true;
+    }
+
+    /* Pattern 4: Self-comparison for greater-than
+     * x > x → 0 (always false)
+     */
+    if (ph2_ir->op == OP_gt && ph2_ir->src0 == ph2_ir->src1) {
+        ph2_ir->op = OP_load_constant;
+        ph2_ir->src0 = 0; /* always false */
+        ph2_ir->src1 = 0;
+        return true;
+    }
+
+    /* Pattern 5: Self-comparison for less-equal
+     * x <= x → 1 (always true)
+     */
+    if (ph2_ir->op == OP_leq && ph2_ir->src0 == ph2_ir->src1) {
+        ph2_ir->op = OP_load_constant;
+        ph2_ir->src0 = 1; /* always true */
+        ph2_ir->src1 = 0;
+        return true;
+    }
+
+    /* Pattern 6: Self-comparison for greater-equal
+     * x >= x → 1 (always true)
+     */
+    if (ph2_ir->op == OP_geq && ph2_ir->src0 == ph2_ir->src1) {
+        ph2_ir->op = OP_load_constant;
+        ph2_ir->src0 = 1; /* always true */
+        ph2_ir->src1 = 0;
+        return true;
+    }
+
+    return false;
+}
+
 /* Main peephole optimization driver.
  * It iterates through all functions, basic blocks, and IR instructions to apply
  * local optimizations on adjacent instruction pairs.
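A quick self-check of the six identities for integer operands (they hold unconditionally for ints; floating point would need NaN handling, which this integer IR does not deal with):

#include <assert.h>

int main(void)
{
    int x = -7;
    assert(!(x != x) && (x == x)); /* != always false, == always true */
    assert(!(x < x) && !(x > x));  /* strict orders always false */
    assert((x <= x) && (x >= x));  /* non-strict orders always true */
    return 0;
}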
From 2cce7c3eab40f76e5ce91a7141554f7beada07ca Mon Sep 17 00:00:00 2001
From: Jim Huang
Date: Tue, 26 Aug 2025 23:40:41 +0800
Subject: [PATCH 07/10] Add bitwise operation optimizations to peephole
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This implements bitwise identity and absorption patterns:
- Double negation: -(-x) → x
- AND with all-ones: x & -1 → x
- OR with zero: x | 0 → x
- XOR with zero: x ^ 0 → x
- AND with zero: x & 0 → 0 (absorption)
- OR with all-ones: x | -1 → -1 (absorption)
- Shift by zero: x << 0 → x, x >> 0 → x

These patterns are not handled by the SSA optimizer and provide
additional optimization opportunities for bitwise operations.
---
 src/peephole.c | 102 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 102 insertions(+)

diff --git a/src/peephole.c b/src/peephole.c
index ba245a44..0147b688 100644
--- a/src/peephole.c
+++ b/src/peephole.c
@@ -753,6 +753,108 @@ bool comparison_optimization(ph2_ir_t *ph2_ir)
     return false;
 }
 
+/* Bitwise operation optimization: Simplify bitwise patterns
+ * Returns true if optimization was applied
+ */
+bool bitwise_optimization(ph2_ir_t *ph2_ir)
+{
+    if (!ph2_ir || !ph2_ir->next)
+        return false;
+
+    ph2_ir_t *next = ph2_ir->next;
+
+    /* Pattern 1: Double negation → identity
+     * -(-x) = x (OP_negate is arithmetic negation)
+     */
+    if (ph2_ir->op == OP_negate && next->op == OP_negate &&
+        next->src0 == ph2_ir->dest) {
+        /* Replace with simple assignment */
+        ph2_ir->op = OP_assign;
+        ph2_ir->dest = next->dest;
+        ph2_ir->next = next->next;
+        return true;
+    }
+
+    /* Pattern 2: AND with all-ones mask → identity
+     * x & 0xFFFFFFFF = x (for 32-bit)
+     */
+    if (ph2_ir->op == OP_load_constant && ph2_ir->src0 == -1 &&
+        next->op == OP_bit_and && next->src1 == ph2_ir->dest) {
+        /* Replace AND with assignment */
+        next->op = OP_assign;
+        next->src1 = 0;
+        /* Keep the constant load; its register may still be live */
+        return true;
+    }
+
+    /* Pattern 3: OR with zero → identity
+     * x | 0 = x
+     */
+    if (ph2_ir->op == OP_load_constant && ph2_ir->src0 == 0 &&
+        next->op == OP_bit_or && next->src1 == ph2_ir->dest) {
+        /* Replace OR with assignment */
+        next->op = OP_assign;
+        next->src1 = 0;
+        /* Keep the constant load; its register may still be live */
+        return true;
+    }
+
+    /* Pattern 4: XOR with zero → identity
+     * x ^ 0 = x
+     */
+    if (ph2_ir->op == OP_load_constant && ph2_ir->src0 == 0 &&
+        next->op == OP_bit_xor && next->src1 == ph2_ir->dest) {
+        /* Replace XOR with assignment */
+        next->op = OP_assign;
+        next->src1 = 0;
+        /* Keep the constant load; its register may still be live */
+        return true;
+    }
+
+    /* Pattern 5: AND with zero → zero
+     * x & 0 = 0
+     */
+    if (ph2_ir->op == OP_load_constant && ph2_ir->src0 == 0 &&
+        next->op == OP_bit_and &&
+        (next->src0 == ph2_ir->dest || next->src1 == ph2_ir->dest)) {
+        /* Replace with constant load of 0 */
+        next->op = OP_load_constant;
+        next->src0 = 0;
+        next->src1 = 0;
+        /* Keep the constant load; its register may still be live */
+        return true;
+    }
+
+    /* Pattern 6: OR with all-ones → all-ones
+     * x | 0xFFFFFFFF = 0xFFFFFFFF
+     */
+    if (ph2_ir->op == OP_load_constant && ph2_ir->src0 == -1 &&
+        next->op == OP_bit_or &&
+        (next->src0 == ph2_ir->dest || next->src1 == ph2_ir->dest)) {
+        /* Replace with constant load of -1 */
+        next->op = OP_load_constant;
+        next->src0 = -1;
+        next->src1 = 0;
+        /* Keep the constant load; its register may still be live */
+        return true;
+    }
+
+    /* Pattern 7: Shift by zero → identity
+     * x << 0 = x, x >> 0 = x
+     */
+    if (ph2_ir->op == OP_load_constant && ph2_ir->src0 == 0 &&
+        (next->op == OP_lshift || next->op == OP_rshift) &&
+        next->src1 == ph2_ir->dest) {
+        /* Replace shift with assignment */
+        next->op = OP_assign;
+        next->src1 = 0;
+        /* Keep the constant load; its register may still be live */
+        return true;
+    }
+
+    return false;
+}
+
 /* Main peephole optimization driver.
  * It iterates through all functions, basic blocks, and IR instructions to apply
  * local optimizations on adjacent instruction pairs.
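The underlying two's-complement identities, checked standalone. In particular, -1 is the all-ones bit pattern, which is why the patch matches src0 == -1 rather than a width-specific mask like 0xFFFFFFFF:

#include <assert.h>

int main(void)
{
    int x = 0x5A5A;
    assert(-(-x) == x);    /* double negation (Pattern 1's identity) */
    assert((x & -1) == x); /* -1 is the all-ones mask */
    assert((x | 0) == x);
    assert((x ^ 0) == x);
    assert((x & 0) == 0);
    assert((x | -1) == -1);
    assert((x << 0) == x && (x >> 0) == x);
    return 0;
}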
From 073ad2c0fbcd0d090b5609900360e4bc8dddc49c Mon Sep 17 00:00:00 2001
From: Jim Huang
Date: Tue, 26 Aug 2025 23:41:19 +0800
Subject: [PATCH 08/10] Add triple pattern optimization

This implements 3-instruction sequence optimizations:
- Store-load-store elimination: removes unused intermediate loads
- Consecutive stores: only last store to same location matters
---
 src/peephole.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)

diff --git a/src/peephole.c b/src/peephole.c
index 0147b688..b6379bfe 100644
--- a/src/peephole.c
+++ b/src/peephole.c
@@ -855,6 +855,87 @@ bool bitwise_optimization(ph2_ir_t *ph2_ir)
     return false;
 }
 
+/* Triple pattern optimization: Handle 3-instruction sequences.
+ * These patterns are more complex but offer significant optimization
+ * opportunities. Returns true if optimization was applied
+ */
+bool triple_pattern_optimization(ph2_ir_t *ph2_ir)
+{
+    if (!ph2_ir || !ph2_ir->next || !ph2_ir->next->next)
+        return false;
+
+    ph2_ir_t *second = ph2_ir->next;
+    ph2_ir_t *third = second->next;
+
+    /* Pattern 1: Store-load-store elimination
+     * {store val1, addr; load r, addr; store val2, addr}
+     * The middle load is pointless if not used elsewhere
+     */
+    if (ph2_ir->op == OP_store && second->op == OP_load &&
+        third->op == OP_store &&
+        ph2_ir->src1 == second->src0 && /* same address */
+        ph2_ir->dest == second->src1 && /* same offset */
+        second->src0 == third->src1 &&  /* same address */
+        second->src1 == third->dest) {  /* same offset */
+        /* Check if the loaded value is used by the third store */
+        if (third->src0 != second->dest) {
+            /* The load result is not used, can eliminate it */
+            ph2_ir->next = third;
+            return true;
+        }
+    }
+
+    /* Pattern 2: Consecutive stores to same location
+     * {store v1, addr; store v2, addr; store v3, addr}
+     * Only the last store matters
+     */
+    if (ph2_ir->op == OP_store && second->op == OP_store &&
+        third->op == OP_store && ph2_ir->src1 == second->src1 &&
+        ph2_ir->dest == second->dest && second->src1 == third->src1 &&
+        second->dest == third->dest) {
+        /* All three stores go to the same location */
+        /* Only the last one matters, eliminate first two */
+        ph2_ir->src0 = third->src0; /* Use last value */
+        ph2_ir->next = third->next; /* Skip middle stores */
+        return true;
+    }
+
+    /* FIXME: Additional patterns for future implementation:
+     *
+     * Pattern 3: Load-op-store with same location
+     * {load r1, [addr]; op r2, r1, ...; store r2, [addr]}
+     * Can optimize to in-place operation if possible
+     * Requires architecture-specific support in codegen.
+     *
+     * Pattern 4: Redundant comparison after boolean operation
+     * {cmp a, b; load 1; load 0} → simplified when used in branch
+     * The comparison already produces 0 or 1, constants may be redundant
+     *
+     * Pattern 5: Consecutive loads that can be combined
+     * {load r1, [base+off1]; load r2, [base+off2]; op r3, r1, r2}
+     * Useful for struct member access patterns
+     * Needs alignment checking and architecture support.
+     *
+     * Pattern 6: Load-Load-Select pattern
+     * {load r1, c1; load r2, c2; select/cmov based on condition}
+     * Can optimize by loading only the needed value
+     * Requires control flow analysis.
+     *
+     * Pattern 7: Add-Add-Add chain simplification
+     * {add r1, r0, c1; add r2, r1, c2; add r3, r2, c3}
+     * Can be simplified if all are constants
+     * Requires tracking constant values through the chain.
+     *
+     * Pattern 8: Global load followed by immediate use
+     * {global_load r1; op r2, r1, ...; store r2}
+     * Track global access patterns
+     * Could optimize to atomic operations or direct memory ops.
+     * Needs careful synchronization analysis.
+     */
+
+    return false;
+}
+
 /* Main peephole optimization driver.
  * It iterates through all functions, basic blocks, and IR instructions to apply
  * local optimizations on adjacent instruction pairs.
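A hypothetical source fragment that exercises Pattern 2, assuming `cell` is not register-promoted (an illustrative example, not taken from the test suite):

int cell;

int triple_store(void)
{
    cell = 1; /* dead: overwritten before any load */
    cell = 2; /* dead: overwritten before any load */
    cell = 3; /* only this store must survive */
    return cell;
}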
From 5509c889a02942461b28f27a7bb6c56f61343ec1 Mon Sep 17 00:00:00 2001
From: Jim Huang
Date: Tue, 26 Aug 2025 23:41:59 +0800
Subject: [PATCH 09/10] Integrate working optimizer functions

Integrates all safe and working peephole optimizations:
- Instruction fusion for eliminating redundant moves
- Comparison optimization for self-comparisons
- Strength reduction for power-of-2 operations
- Algebraic simplification for register patterns
- Bitwise operation optimizations
- Redundant move elimination
- Load/store pair elimination
- Triple pattern optimization

Removed eliminate_dead_instructions() and fold_constant_branches() as
they were causing bootstrap failures due to linked list corruption.
---
 src/peephole.c | 165 +++++++++++++------------------------------------
 1 file changed, 44 insertions(+), 121 deletions(-)

diff --git a/src/peephole.c b/src/peephole.c
index b6379bfe..cdc02078 100644
--- a/src/peephole.c
+++ b/src/peephole.c
@@ -320,116 +320,6 @@ bool redundant_move_elim(ph2_ir_t *ph2_ir)
     return false;
 }
 
-/* Simple dead instruction elimination within basic blocks.
- * Removes instructions whose results are never used (dead stores).
- * Works in conjunction with existing SSA-based DCE.
- */
-bool eliminate_dead_instructions(func_t *func)
-{
-    if (!func || !func->bbs)
-        return false;
-
-    bool changed = false;
-
-    for (basic_block_t *bb = func->bbs; bb; bb = bb->rpo_next) {
-        ph2_ir_t *ir = bb->ph2_ir_list.head;
-        while (ir && ir->next) {
-            ph2_ir_t *next = ir->next;
-
-            /* Check if next instruction immediately overwrites this one's
-             * result */
-            if (ir->op == OP_load_constant && next->op == OP_load_constant &&
-                ir->dest == next->dest) {
-                /* Consecutive constant loads to same register - first is dead
-                 */
-                ir->next = next->next;
-                if (next == bb->ph2_ir_list.tail) {
-                    bb->ph2_ir_list.tail = ir;
-                }
-                changed = true;
-                continue;
-            }
-
-            /* Check for dead arithmetic results */
-            if ((ir->op == OP_add || ir->op == OP_sub || ir->op == OP_mul) &&
-                next->op == OP_assign && ir->dest == next->dest) {
-                /* Arithmetic result immediately overwritten by assignment */
-                ir->next = next->next;
-                if (next == bb->ph2_ir_list.tail) {
-                    bb->ph2_ir_list.tail = ir;
-                }
-                changed = true;
-                continue;
-            }
-
-            ir = ir->next;
-        }
-    }
-
-    return changed;
-}
-
-/* Simple constant folding for branches after SCCP.
- * Converts branches with obvious constant conditions to jumps.
- * Very conservative to maintain bootstrap stability.
- */
-bool fold_constant_branches(func_t *func)
-{
-    if (!func || !func->bbs)
-        return false;
-
-    bool changed = false;
-
-    for (basic_block_t *bb = func->bbs; bb; bb = bb->rpo_next) {
-        if (!bb->ph2_ir_list.tail)
-            continue;
-
-        ph2_ir_t *last = bb->ph2_ir_list.tail;
-
-        /* Only handle branches */
-        if (last->op != OP_branch || last->src0 < 0)
-            continue;
-
-        /* Scan the block for a constant load reaching the branch condition */
-        ph2_ir_t *prev = bb->ph2_ir_list.head;
-        ph2_ir_t *found = NULL;
-
-        /* Find the most recent constant load to the branch condition register
-         */
-        while (prev && prev != last) {
-            if (prev->op == OP_load_constant && prev->dest == last->src0) {
-                found = prev;
-                /* Keep looking - want the most recent load */
-            }
-            /* Any other write to this register invalidates the fold */
-            else if (prev->dest == last->src0) {
-                found = NULL; /* Register was modified, can't fold */
-            }
-            prev = prev->next;
-        }
-
-        if (found) {
-            /* Found constant condition - convert branch to jump */
-            int const_val = found->src0;
-
-            /* Just change the opcode, don't modify CFG edges directly */
-            last->op = OP_jump;
-
-            if (const_val != 0) {
-                /* Always take then branch */
-                last->next_bb = bb->then_;
-            } else {
-                /* Always take else branch */
-                last->next_bb = bb->else_;
-            }
-
-            /* Don't modify src0 or CFG edges - let later passes handle it */
-            changed = true;
-        }
-    }
-
-    return changed;
-}
-
 /* Load/store elimination for consecutive memory operations.
  * Removes redundant loads and dead stores that access the same memory location.
@@ -937,17 +827,29 @@ bool triple_pattern_optimization(ph2_ir_t *ph2_ir)
 }
 
 /* Main peephole optimization driver.
- * It iterates through all functions, basic blocks, and IR instructions to apply
- * local optimizations on adjacent instruction pairs.
+ *
+ * SSA Optimizer (insn_t, before register allocation):
+ * - Constant folding with known values (5+3 → 8, x+0 → x)
+ * - Common subexpression elimination
+ * - Self-assignment elimination (x = x)
+ * - Dead code elimination
+ * - Constant comparison folding (5 < 3 → 0)
+ *
+ * Peephole Optimizer (ph2_ir_t, after register allocation):
+ * - Register-based self-operations (r1-r1 → 0, r1^r1 → 0)
+ * - Bitwise operation optimization (SSA doesn't handle these)
+ * - Strength reduction for power-of-2 (needs actual constants loaded)
+ * - Load/store pattern elimination
+ * - Triple instruction sequence optimization
+ * - Architecture-specific instruction fusion
+ *
+ * This refined separation eliminates redundant optimizations while
+ * maintaining comprehensive coverage of optimization opportunities.
  */
 void peephole(void)
 {
     for (func_t *func = FUNC_LIST.head; func; func = func->next) {
-        /* Phase 1: Dead code elimination working with SCCP results */
-        eliminate_dead_instructions(func);
-        fold_constant_branches(func);
-
-        /* Phase 2: Local peephole optimizations */
+        /* Local peephole optimizations on post-register-allocation IR */
         for (basic_block_t *bb = func->bbs; bb; bb = bb->rpo_next) {
             for (ph2_ir_t *ir = bb->ph2_ir_list.head; ir; ir = ir->next) {
                 ph2_ir_t *next = ir->next;
@@ -955,19 +857,40 @@ void peephole(void)
                 continue;
 
             /* Self-assignment elimination
-             * Removes trivial assignments where destination equals source
-             * Pattern: {mov x, x} → eliminated
-             * Common in compiler-generated intermediate code
+             * Keep this as a safety net: SSA handles most cases, but
+             * register allocation might create new self-assignments
              */
             if (next->op == OP_assign && next->dest == next->src0) {
                 ir->next = next->next;
                 continue;
             }
 
-            /* Try instruction fusion first */
+            /* Try triple pattern optimization first (3-instruction
+             * sequences)
+             */
+            if (triple_pattern_optimization(ir))
+                continue;
+
+            /* Try instruction fusion (2-instruction sequences) */
             if (insn_fusion(ir))
                 continue;
 
+            /* Apply comparison optimization */
+            if (comparison_optimization(ir))
+                continue;
+
+            /* Apply strength reduction for power-of-2 operations */
+            if (strength_reduction(ir))
+                continue;
+
+            /* Apply algebraic simplification */
+            if (algebraic_simplification(ir))
+                continue;
+
+            /* Apply bitwise operation optimizations */
+            if (bitwise_optimization(ir))
+                continue;
+
             /* Apply redundant move elimination */
             if (redundant_move_elim(ir))
                 continue;
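The driver now amounts to a fixed-priority dispatch over a sliding instruction window: widest window first, with an early-out after any rewrite so the next iteration sees the updated list. A minimal sketch of that shape, with stub pattern functions standing in for the real ones:

#include <stdio.h>

typedef struct insn {
    int op;
    struct insn *next;
} insn_t;

/* Hypothetical pattern functions for illustration only; each returns 1
 * if it rewrote the instruction list at i. */
static int pat_triple(insn_t *i) { (void) i; return 0; }
static int pat_pair(insn_t *i)   { (void) i; return 0; }
static int pat_single(insn_t *i) { (void) i; return 0; }

/* Widest window first: a 3-insn match must win before a 2-insn rewrite
 * can break it up; stop after the first hit, mirroring the `continue`
 * chain in peephole(). */
static void run_patterns(insn_t *i)
{
    if (pat_triple(i) || pat_pair(i) || pat_single(i))
        return;
}

int main(void)
{
    insn_t i = {0, NULL};
    run_patterns(&i);
    puts("dispatch order: triple, pair, single");
    return 0;
}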
From a801dc499086ecc9a53984a8dc8a5bb73804f641 Mon Sep 17 00:00:00 2001
From: Jim Huang
Date: Sun, 31 Aug 2025 15:57:01 +0800
Subject: [PATCH 10/10] Add reverse redundant move patterns
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This eliminates moves immediately overwritten by loads or constants:
- {mov rd, rs; load rd, offset} → {load rd, offset}
- {mov rd, rs; li rd, imm} → {li rd, imm}

Co-authored-by: fennecJ
---
 src/peephole.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/src/peephole.c b/src/peephole.c
index cdc02078..42d8099a 100644
--- a/src/peephole.c
+++ b/src/peephole.c
@@ -317,6 +317,35 @@ bool redundant_move_elim(ph2_ir_t *ph2_ir)
         return true;
     }
 
+    /* Pattern 6: Move followed by load
+     * {mov rd, rs; load rd, offset} → {load rd, offset}
+     * The move is pointless if immediately overwritten by load
+     */
+    if (ph2_ir->op == OP_assign &&
+        (next->op == OP_load || next->op == OP_global_load) &&
+        ph2_ir->dest == next->dest) {
+        /* Replace move+load with just the load */
+        ph2_ir->op = next->op;
+        ph2_ir->src0 = next->src0;
+        ph2_ir->src1 = next->src1;
+        ph2_ir->next = next->next;
+        return true;
+    }
+
+    /* Pattern 7: Move followed by constant load
+     * {mov rd, rs; li rd, imm} → {li rd, imm}
+     * The move is pointless if immediately overwritten by constant
+     */
+    if (ph2_ir->op == OP_assign && next->op == OP_load_constant &&
+        ph2_ir->dest == next->dest) {
+        /* Replace move+li with just the li */
+        ph2_ir->op = OP_load_constant;
+        ph2_ir->src0 = next->src0;
+        ph2_ir->src1 = 0; /* Clear unused field */
+        ph2_ir->next = next->next;
+        return true;
+    }
+
     return false;
 }
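A hypothetical source fragment whose naive code generation produces exactly the Pattern 6/7 shapes (illustrative only; real register allocation may already coalesce these):

int reverse_move(int a, int *p)
{
    int t = a; /* mov t, a */
    t = 5;     /* li t, 5: Pattern 7 deletes the mov above */
    int u = a; /* mov u, a */
    u = *p;    /* load u, [p]: Pattern 6 deletes the mov above */
    return t + u;
}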