Milerius
diff --git a/‎Cargo.lock‎
Lines changed: 409 additions & 9 deletions b/‎Cargo.lock‎
Lines changed: 409 additions & 9 deletions
diff --git a/‎crates/bench/benches/spsc.rs‎
Lines changed: 452 additions & 70 deletions b/‎crates/bench/benches/spsc.rs‎
Lines changed: 452 additions & 70 deletions
diff --git a/‎crates/platform/Cargo.toml‎
Lines changed: 1 addition & 0 deletions b/‎crates/platform/Cargo.toml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎crates/platform/src/intrinsics/compiler_hints.rs‎
Lines changed: 56 additions & 8 deletions b/‎crates/platform/src/intrinsics/compiler_hints.rs‎
Lines changed: 56 additions & 8 deletions
diff --git a/‎crates/queue/Cargo.toml‎
Lines changed: 4 additions & 0 deletions b/‎crates/queue/Cargo.toml‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎crates/queue/examples/asm_shim.rs‎
Lines changed: 15 additions & 0 deletions b/‎crates/queue/examples/asm_shim.rs‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎crates/queue/src/copy_ring/engine.rs‎
Lines changed: 182 additions & 10 deletions b/‎crates/queue/src/copy_ring/engine.rs‎
Lines changed: 182 additions & 10 deletions
@@ -12,6 +12,7 @@ default = []
 std = []
 asm = []
 nightly = []
+prefetch = []
 perf-counters = ["dep:perf-event2"]
 
 [dependencies]
 
@@ -37,22 +37,55 @@ pub fn prefetch<T>(ptr: *const T, rw: PrefetchRW, locality: PrefetchLocality) {
     #[cfg(target_arch = "x86_64")]
     {
         use core::arch::x86_64::{
-            _MM_HINT_NTA, _MM_HINT_T0, _MM_HINT_T1, _MM_HINT_T2, _mm_prefetch,
+            _MM_HINT_ET0, _MM_HINT_NTA, _MM_HINT_T0, _MM_HINT_T1, _MM_HINT_T2, _mm_prefetch,
         };
-        let _ = rw;
         let hint = ptr.cast::<i8>();
         // SAFETY: prefetch is a hint and never faults, even on invalid addresses.
         // The locality must be a compile-time constant for _mm_prefetch.
         unsafe {
-            match locality {
-                PrefetchLocality::NoTemporal => _mm_prefetch(hint, _MM_HINT_NTA),
-                PrefetchLocality::Low => _mm_prefetch(hint, _MM_HINT_T2),
-                PrefetchLocality::Moderate => _mm_prefetch(hint, _MM_HINT_T1),
-                PrefetchLocality::High => _mm_prefetch(hint, _MM_HINT_T0),
+            match (rw, locality) {
+                // Write prefetch: ET0 (exclusive) fetches the line with intent to write, so it
+                // arrives in a writable state, avoiding the subsequent RFO on the actual store.
+                (PrefetchRW::Write, PrefetchLocality::High) => {
+                    _mm_prefetch(hint, _MM_HINT_ET0);
+                }
+                // Non-High write localities fall through to read hints — ET0
+                // only exists as a single locality level on x86; for other
+                // localities we use the read hint as a reasonable fallback.
+                (_, PrefetchLocality::NoTemporal) => _mm_prefetch(hint, _MM_HINT_NTA),
+                (_, PrefetchLocality::Low) => _mm_prefetch(hint, _MM_HINT_T2),
+                (_, PrefetchLocality::Moderate) => _mm_prefetch(hint, _MM_HINT_T1),
+                (_, PrefetchLocality::High) => _mm_prefetch(hint, _MM_HINT_T0),
             }
         }
     }
-    #[cfg(not(target_arch = "x86_64"))]
+    #[cfg(target_arch = "aarch64")]
+    {
+        let addr = ptr.cast::<u8>();
+        // SAFETY: PRFM is a hint instruction — it never faults and has no
+        // side effects beyond cache management. options(nostack, preserves_flags)
+        // tells LLVM it doesn't touch the stack or condition flags.
+        unsafe {
+            match rw {
+                PrefetchRW::Read => {
+                    core::arch::asm!(
+                        "prfm pldl1keep, [{ptr}]",
+                        ptr = in(reg) addr,
+                        options(nostack, preserves_flags),
+                    );
+                }
+                PrefetchRW::Write => {
+                    core::arch::asm!(
+                        "prfm pstl1keep, [{ptr}]",
+                        ptr = in(reg) addr,
+                        options(nostack, preserves_flags),
+                    );
+                }
+            }
+        }
+        let _ = locality; // `locality` is ignored here: both PRFM forms hard-code L1KEEP.
+    }
+    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
     {
         let _ = (ptr, rw, locality);
     }
@@ -109,6 +142,21 @@ mod tests {
         prefetch_large(big.as_ptr(), PrefetchRW::Read, PrefetchLocality::Low, 2);
     }
 
+    #[test] // Smoke test: a write-intent prefetch of a live stack value must not fault.
+    fn prefetch_write_does_not_crash() {
+        let mut value: u64 = 42;
+        prefetch(&raw const value, PrefetchRW::Write, PrefetchLocality::High);
+        // The prefetch is only a hint: a store to the same location must still succeed.
+        value = 99;
+        assert_eq!(value, 99);
+    }
+
+    #[test] // Smoke test: read-prefetching a 128-byte stack array must not fault.
+    fn prefetch_read_does_not_crash_stack_array() {
+        let arr = [0u8; 128];
+        prefetch(arr.as_ptr(), PrefetchRW::Read, PrefetchLocality::High);
+    }
+
     #[test]
     fn enum_values() {
         assert_eq!(PrefetchRW::Read as i32, 0);
 
@@ -13,11 +13,15 @@ std = ["alloc", "mantis-platform/std"]
 alloc = []
 asm = ["mantis-platform/asm"]
 nightly = ["mantis-platform/nightly"]
+prefetch = ["mantis-platform/prefetch"]
 
 [dependencies]
 mantis-core = { workspace = true }
 mantis-platform = { workspace = true }
 mantis-types = { workspace = true }
 
+[dev-dependencies]
+proptest = "1.11.0"
+
 [lints]
 workspace = true
@@ -39,6 +39,16 @@ pub fn spsc_copy_pop_u64(ring: &mut SpscRingCopy<u64, 1024>, out: &mut u64) -> b
     ring.pop(out)
 }
 
+#[inline(never)] // NOTE(review): presumably out-of-line so the asm can be inspected — confirm
+pub fn spsc_copy_push_batch_u64(ring: &mut SpscRingCopy<u64, 1024>, src: &[u64]) -> usize {
+    ring.push_batch(src) // thin shim: forwards `src` and returns the ring's result unchanged
+}
+
+#[inline(never)] // NOTE(review): presumably out-of-line so the asm can be inspected — confirm
+pub fn spsc_copy_pop_batch_u64(ring: &mut SpscRingCopy<u64, 1024>, dst: &mut [u64]) -> usize {
+    ring.pop_batch(dst) // thin shim: forwards `dst` and returns the ring's result unchanged
+}
+
 fn main() {
     let mut ring = SpscRing::<u64, 1024>::new();
     std::hint::black_box(spsc_push_u64(&mut ring, 42));
@@ -52,4 +62,9 @@ fn main() {
     std::hint::black_box(spsc_copy_push_u64(&mut copy_ring, &42));
     let mut copy_out = 0u64;
     std::hint::black_box(spsc_copy_pop_u64(&mut copy_ring, &mut copy_out));
+
+    let batch_src = [0u64; 8];
+    std::hint::black_box(spsc_copy_push_batch_u64(&mut copy_ring, &batch_src));
+    let mut batch_dst = [0u64; 8];
+    std::hint::black_box(spsc_copy_pop_batch_u64(&mut copy_ring, &mut batch_dst));
 }
@@ -63,6 +63,9 @@ where
         let head = self.head.load(Ordering::Relaxed);
         let next_head = I::wrap(head + 1, self.storage.capacity());
 
+        #[cfg(feature = "prefetch")]
+        crate::raw::prefetch_slot_write(&self.storage, next_head);
+
         if next_head == self.tail_cached.get() {
             let tail = self.tail.load(Ordering::Acquire);
             self.tail_cached.set(tail);
@@ -83,6 +86,9 @@ where
     pub(crate) fn pop(&self, out: &mut T) -> bool {
         let tail = self.tail.load(Ordering::Relaxed);
 
+        #[cfg(feature = "prefetch")]
+        crate::raw::prefetch_slot_read(&self.storage, tail);
+
         if tail == self.head_cached.get() {
             let head = self.head.load(Ordering::Acquire);
             self.head_cached.set(head);
@@ -133,13 +139,29 @@ where
         }
 
         let n = src.len().min(free);
-        let mut idx = head;
-        for item in &src[..n] {
-            crate::copy_ring::raw::write_slot_copy::<T, S, CP>(&self.storage, idx, item);
-            idx = I::wrap(idx + 1, cap);
+
+        // Two-chunk contiguous copy bypassing per-element CopyPolicy dispatch.
+        // `memcpy` auto-vectorizes for bulk transfers; per-element SIMD
+        // dispatch adds call overhead that dominates for large batches.
+        //
+        // `first_chunk`: slots from head to end of backing array (no wrap).
+        // `second_chunk`: remaining slots written from index 0 (wrap).
+        let first_chunk = n.min(cap - head);
+        let second_chunk = n - first_chunk;
+
+        // First chunk: slots head..head+first_chunk (no wrap).
+        // `head < cap` (ring invariant) and `first_chunk <= cap - head`,
+        // so `head + first_chunk <= cap`. Producer owns this range.
+        crate::copy_ring::raw::write_batch_copy::<T, S>(&self.storage, head, &src[..first_chunk]);
+
+        if second_chunk > 0 {
+            // Second chunk: wraps to slots 0..second_chunk.
+            // `second_chunk = n - first_chunk <= n <= free < cap`,
+            // so `second_chunk < cap`. Producer owns this range.
+            crate::copy_ring::raw::write_batch_copy::<T, S>(&self.storage, 0, &src[first_chunk..n]);
         }
 
-        self.head.store(idx, Ordering::Release);
+        self.head.store(I::wrap(head + n, cap), Ordering::Release);
         n
     }
 
@@ -173,13 +195,37 @@ where
         }
 
         let n = dst.len().min(avail);
-        let mut idx = tail;
-        for out in &mut dst[..n] {
-            crate::copy_ring::raw::read_slot_copy::<T, S, CP>(&self.storage, idx, out);
-            idx = I::wrap(idx + 1, cap);
+
+        // Two-chunk contiguous copy symmetric to push_batch.
+        // `memcpy` auto-vectorizes for bulk transfers; per-element dispatch
+        // adds call overhead that dominates for large batches.
+        //
+        // `first_chunk`: slots from tail to end of backing array (no wrap).
+        // `second_chunk`: remaining slots read from index 0 (wrap).
+        let first_chunk = n.min(cap - tail);
+        let second_chunk = n - first_chunk;
+
+        // First chunk: slots tail..tail+first_chunk (no wrap).
+        // `tail < cap` (ring invariant) and `first_chunk <= cap - tail`,
+        // so `tail + first_chunk <= cap`. Consumer owns this range.
+        crate::copy_ring::raw::read_batch_copy::<T, S>(
+            &self.storage,
+            tail,
+            &mut dst[..first_chunk],
+        );
+
+        if second_chunk > 0 {
+            // Second chunk: wraps to slots 0..second_chunk.
+            // `second_chunk = n - first_chunk <= n <= avail < cap`,
+            // so `second_chunk < cap`. Consumer owns this range.
+            crate::copy_ring::raw::read_batch_copy::<T, S>(
+                &self.storage,
+                0,
+                &mut dst[first_chunk..n],
+            );
         }
 
-        self.tail.store(idx, Ordering::Release);
+        self.tail.store(I::wrap(tail + n, cap), Ordering::Release);
         n
     }
 
@@ -212,6 +258,7 @@ where
 }
 
 #[cfg(test)]
+#[expect(clippy::cast_sign_loss, reason = "test-only usize→u64 conversions")]
 mod tests {
     extern crate std;
     use std::vec;
@@ -361,6 +408,46 @@ mod tests {
         assert_eq!(out, vec![100, 101, 102, 103, 104]);
     }
 
+    #[test] // Regression test: `pop_batch` must keep FIFO order when its read range wraps.
+    fn pop_batch_wraparound_contiguous() {
+        let engine = new_engine(); // capacity=8, usable=7
+        // Fill 6, drain 6 so the next batch straddles the end of the backing array.
+        let fill: Vec<u64> = (0..6).collect();
+        assert_eq!(engine.push_batch(&fill), 6); // setup must fully succeed
+        let mut drain = vec![0u64; 6];
+        assert_eq!(engine.pop_batch(&mut drain), 6); // ring is empty again
+
+        // Push 5 (wraps around), then batch-pop all 5
+        let wrap_src: Vec<u64> = (300..305).collect();
+        assert_eq!(engine.push_batch(&wrap_src), 5); // all 5 fit in the 7 usable slots
+
+        let mut out = vec![0u64; 5];
+        let popped = engine.pop_batch(&mut out);
+        assert_eq!(popped, 5);
+        assert_eq!(out, vec![300, 301, 302, 303, 304]);
+    }
+
+    #[test] // Regression test: `push_batch` must keep FIFO order when its write range wraps.
+    fn push_batch_wraparound_contiguous() {
+        // Advance head to near end of buffer, then batch-push across wrap
+        let engine = new_engine(); // capacity=8, usable=7
+        // Fill 6, drain 6 — head and tail now at index 6
+        let fill: Vec<u64> = (0..6).collect();
+        assert_eq!(engine.push_batch(&fill), 6); // setup must fully succeed
+        let mut drain = vec![0u64; 6];
+        assert_eq!(engine.pop_batch(&mut drain), 6); // ring is empty again
+
+        // Now push 5 elements starting at index 6: wraps at index 8 -> 0
+        let wrap_src: Vec<u64> = (200..205).collect();
+        let pushed = engine.push_batch(&wrap_src);
+        assert_eq!(pushed, 5);
+
+        let mut out = vec![0u64; 5];
+        let popped = engine.pop_batch(&mut out);
+        assert_eq!(popped, 5);
+        assert_eq!(out, vec![200, 201, 202, 203, 204]);
+    }
+
     #[test]
     fn batch_fifo_ordering() {
         let engine = new_engine();
@@ -391,4 +478,89 @@ mod tests {
         engine.pop(&mut out);
         assert_eq!(engine.len(), 0);
     }
+
+    #[test]
+    fn push_pop_batch_differential() {
+        // Round-trip a batch through push_batch/pop_batch from every start offset
+        // (0..7) and batch size (1..=7), checking the count and the FIFO order.
+        for fill_first in 0..7 {
+            for batch_size in 1..=7 {
+                let engine = new_engine(); // capacity=8, usable=7
+
+                // Advance head/tail by fill_first positions (push, then drain them all)
+                for i in 0..fill_first {
+                    assert!(engine.push(&(i as u64)));
+                }
+                let mut drain = vec![0u64; fill_first];
+                engine.pop_batch(&mut drain); // drained values are discarded on purpose
+
+                // Batch push of the values 100, 101, ... (batch_size of them)
+                let src: Vec<u64> = (100_u64..100 + batch_size as u64).collect();
+                let pushed = engine.push_batch(&src);
+
+                // Batch pop into a buffer sized to exactly what was accepted
+                let mut dst = vec![0u64; pushed];
+                let popped = engine.pop_batch(&mut dst);
+
+                assert_eq!(popped, pushed, "fill={fill_first} batch={batch_size}");
+                assert_eq!(
+                    dst,
+                    src[..pushed].to_vec(),
+                    "FIFO violated: fill={fill_first} batch={batch_size}"
+                );
+            }
+        }
+    }
+}
+
+// proptest uses `getcwd` for failure persistence, which Miri's isolation blocks.
+#[cfg(all(test, not(miri)))]
+mod proptest_tests {
+    use super::*;
+    use crate::storage::InlineStorage;
+    use mantis_core::{ImmediatePush, NoInstr, Pow2Masked};
+    use proptest::prelude::*;
+
+    extern crate std;
+    use std::vec;
+    use std::vec::Vec;
+
+    type TestEngine = CopyRingEngine<
+        u64,
+        InlineStorage<u64, 8>,
+        Pow2Masked,
+        ImmediatePush,
+        NoInstr,
+        mantis_platform::DefaultCopyPolicy,
+    >;
+
+    fn new_engine() -> TestEngine {
+        CopyRingEngine::new(InlineStorage::new(), NoInstr)
+    }
+
+    proptest! {
+        #[test]
+        fn batch_fifo_preserved(
+            fill_level in 0usize..7,
+            batch_size in 1usize..8,
+        ) {
+            let engine = new_engine();
+
+            // Advance to fill_level; a failed setup must fail the case, not skew it.
+            for i in 0..fill_level {
+                prop_assert!(engine.push(&(i as u64)));
+            }
+            let mut drain = vec![0u64; fill_level];
+            prop_assert_eq!(engine.pop_batch(&mut drain), fill_level);
+
+            let src: Vec<u64> = (0..batch_size as u64).collect();
+            let pushed = engine.push_batch(&src);
+
+            let mut dst = vec![0u64; pushed];
+            let popped = engine.pop_batch(&mut dst);
+
+            prop_assert_eq!(popped, pushed); // everything accepted comes back out
+            prop_assert_eq!(dst, src[..pushed].to_vec()); // and in FIFO order
+        }
+    }
 }