optimization: peel align-head and unroll body to 64 bytes

l0rinc · hodlinator · l0rinc · commit 248b6a27c351 · 2025-07-16T14:37:19.000-07:00
Benchmarks indicated that obfuscating multiple bytes already gives an order of magnitude speed-up, but:
* GCC still emitted scalar code;
* Clang’s auto-vectorized loop ran on the slow unaligned-load path.

Fix contains:
* peeling the misaligned head so that the hot loop starts at an 8-byte-aligned address;
* `std::assume_aligned<8>` tells the optimizer that the alignment promise holds - required to keep Apple Clang happy;
* manually unrolling the body to 64 bytes enabled GCC to auto-vectorize.

Note that the `target.size() > KEY_SIZE` condition is just an optimization; the aligned and unaligned loops work without it as well, which is why the alignment calculation still contains `std::min`.

> C++ compiler .......................... GNU 14.2.0

|             ns/byte |              byte/s |    err% |        ins/byte |        cyc/byte |    IPC |       bra/byte |   miss% |     total | benchmark
|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
|                0.03 |   32,464,658,919.11 |    0.0% |            0.50 |            0.11 |  4.474 |           0.08 |    0.0% |      5.29 | `ObfuscationBench`

> C++ compiler .......................... Clang 20.1.7

|             ns/byte |              byte/s |    err% |        ins/byte |        cyc/byte |    IPC |       bra/byte |   miss% |     total | benchmark
|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
|                0.02 |   41,231,547,045.17 |    0.0% |            0.30 |            0.09 |  3.463 |           0.02 |    0.0% |      5.47 | `ObfuscationBench`

Co-authored-by: Hodlinator <172445034+hodlinator@users.noreply.github.com>
diff --git a/src/util/obfuscation.h b/src/util/obfuscation.h
@@ -14,6 +14,7 @@
 #include <bit>
 #include <climits>
 #include <ios>
+#include <memory>
 
 class Obfuscation
 {
@@ -33,9 +34,26 @@ class Obfuscation
     {
         if (!*this) return;
 
-        const KeyType rot_key{m_rotations[key_offset % KEY_SIZE]}; // Continue obfuscation from where we left off
-        for (; target.size() >= KEY_SIZE; target = target.subspan(KEY_SIZE)) {
-            XorWord(target.first<KEY_SIZE>(), rot_key);
+        KeyType rot_key{m_rotations[key_offset % KEY_SIZE]}; // Continue obfuscation from where we left off
+        if (target.size() > KEY_SIZE) {
+            // Obfuscate until 64-bit alignment boundary
+            if (const auto misalign{std::bit_cast<uintptr_t>(target.data()) % KEY_SIZE}) {
+                const size_t alignment{std::min(KEY_SIZE - misalign, target.size())};
+                XorWord(target.first(alignment), rot_key);
+
+                target = {std::assume_aligned<KEY_SIZE>(target.data() + alignment), target.size() - alignment};
+                rot_key = m_rotations[(key_offset + alignment) % KEY_SIZE];
+            }
+            // Aligned obfuscation in 64-byte chunks
+            for (constexpr auto unroll{8}; target.size() >= KEY_SIZE * unroll; target = target.subspan(KEY_SIZE * unroll)) {
+                for (size_t i{0}; i < unroll; ++i) {
+                    XorWord(target.subspan(i * KEY_SIZE, KEY_SIZE), rot_key);
+                }
+            }
+            // Aligned obfuscation in 64-bit chunks
+            for (; target.size() >= KEY_SIZE; target = target.subspan(KEY_SIZE)) {
+                XorWord(target.first<KEY_SIZE>(), rot_key);
+            }
         }
         XorWord(target, rot_key);
     }