Skip to content

Commit 6679687

Browse files
committed
feat(msgpack): add multi-architecture prefetch
- Add ARM64 PRFM prefetch instruction support with locality hints
- Implement x86/x64 prefetch levels (T0/T1/T2/NTA) based on locality
- Add PREFETCHW support for write operations on compatible CPUs
- Document platform support matrix and architecture optimizations
1 parent 91c44c0 commit 6679687

File tree

2 files changed

+76
-29
lines changed

2 files changed

+76
-29
lines changed

README.md

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# zig-msgpack
22

3-
[![CI](https://github.com/zigcc/zig-msgpack/actions/workflows/test.yml/badge.svg)](https://github.com/zigcc/zig-msgpack/actions/workflows/test.yml)
3+
[![CI](https://github.com/zigcc/zig-msgpack/actions/workflows/ci.yml/badge.svg)](https://github.com/zigcc/zig-msgpack/actions/workflows/ci.yml)
44

55
A MessagePack implementation for the Zig programming language. This library provides a simple and efficient way to serialize and deserialize data using the MessagePack format.
66

@@ -17,6 +17,26 @@ An article introducing it: [Zig Msgpack](https://blog.nvimer.org/2025/09/20/zig-
1717
- **Simple API:** Offers a straightforward and easy-to-use API for encoding and decoding.
1818
- **Performance Optimized:** Advanced optimizations including CPU cache prefetching, branch prediction hints, and SIMD operations for maximum throughput.
1919

20+
## Platform Support
21+
22+
This library is tested and optimized for all major platforms and architectures:
23+
24+
| Platform | Architecture | CI Status | SIMD Optimizations |
25+
|----------|--------------|-----------|-------------------|
26+
| **Windows** | x86_64 | ✅ Tested | SSE2/AVX2 prefetch |
27+
| **macOS** | x86_64 (Intel) | ✅ Tested | SSE2/AVX2 prefetch |
28+
| **macOS** | ARM64 (Apple Silicon) | ✅ Tested | ARM NEON + PRFM |
29+
| **Linux** | x86_64 | ✅ Tested | SSE2/AVX2 prefetch |
30+
| **Linux** | ARM64/aarch64 | ✅ Tested | ARM NEON + PRFM |
31+
| **Other** | RISC-V, MIPS, etc. | ✅ Tested | Graceful fallback |
32+
33+
### Architecture-Specific Optimizations
34+
35+
- **x86/x64**: Utilizes SSE/AVX prefetch instructions (`PREFETCHT0/1/2`, `PREFETCHNTA`) for cache-aware memory access
36+
- **ARM64**: Uses ARM PRFM (Prefetch Memory) instructions for optimal performance on Apple Silicon and ARM servers
37+
- **Cross-platform**: Automatically detects CPU features at compile-time with zero runtime overhead
38+
- **Safe fallback**: Gracefully degrades to standard operations on unsupported architectures
39+
2040
## Installation
2141

2242
### Version Compatibility

src/msgpack.zig

Lines changed: 55 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -19,42 +19,69 @@ const CACHE_LINE_SIZE: usize = 64;
1919
/// Hint the CPU to prefetch the cache line at `ptr` ahead of a read.
/// `locality`: 0 = no temporal locality (streaming/non-temporal),
/// 1 = low (outer cache only), 2 = medium, 3 = high (keep in all levels).
/// This is purely a performance hint: it never faults and never changes
/// program state.
///
/// Implemented with Zig's `@prefetch` builtin, which lowers to the native
/// instruction on each target (x86 PREFETCHT0/T1/T2/PREFETCHNTA, AArch64
/// PRFM PLDL*KEEP/PLDL1STRM) and compiles to a no-op on architectures
/// without a prefetch instruction — no runtime feature detection or
/// per-arch inline assembly required.
inline fn prefetchRead(ptr: [*]const u8, comptime locality: u2) void {
    @prefetch(ptr, .{ .rw = .read, .locality = locality, .cache = .data });
}
4153

4254
/// Hint the CPU to prefetch the cache line at `ptr` ahead of a write,
/// acquiring it in an exclusive/modified-ready state where the ISA supports
/// that (x86 PREFETCHW on CPUs with PRFCHW/3DNow!, AArch64 PRFM PSTL*).
/// `locality`: 0 = no temporal locality (streaming), 3 = high temporal
/// locality (keep in all cache levels).
/// This is purely a performance hint: it never faults, never writes, and
/// never changes program state.
///
/// Implemented with Zig's `@prefetch` builtin (`.rw = .write`), which picks
/// the best available write-prefetch instruction for the compile target and
/// degrades to a read prefetch or a no-op where none exists — replacing the
/// previous hand-rolled feature probing and inline assembly.
inline fn prefetchWrite(ptr: [*]u8, comptime locality: u2) void {
    @prefetch(ptr, .{ .rw = .write, .locality = locality, .cache = .data });
}

0 commit comments

Comments
 (0)