
Commit 0ab6347

style(msgpack): reformat asm and fix whitespace
- Remove trailing whitespace throughout module
- Reformat inline assembly blocks for consistency
- Standardize slice notation spacing ([0 .. len])
- Clean up test file formatting
1 parent 6679687 · commit 0ab6347

File tree

2 files changed (+121, -70 lines)


src/msgpack.zig

Lines changed: 102 additions & 51 deletions
@@ -15,23 +15,35 @@ const little_endian = std.builtin.Endian.little;
 /// Cache line size for prefetch optimization
 const CACHE_LINE_SIZE: usize = 64;

-/// Prefetch hint for read-ahead optimization
+/// Prefetch hint for read-ahead optimization
 /// Uses compiler intrinsics to hint CPU to prefetch data
 /// This is a performance hint and may be a no-op on some architectures
 inline fn prefetchRead(ptr: [*]const u8, comptime locality: u2) void {
     // locality: 0=no temporal locality (NTA), 1=low (T2), 2=medium (T1), 3=high (T0)
     const arch = comptime builtin.cpu.arch;
-
+
     // x86/x64: Check for SSE support (required for PREFETCH instructions)
     if (comptime arch.isX86()) {
         const has_sse = comptime std.Target.x86.featureSetHas(builtin.cpu.features, .sse);
         if (has_sse) {
             // Use different prefetch instructions based on locality
             switch (locality) {
-                3 => asm volatile ("prefetcht0 %[ptr]" : : [ptr] "m" (@as(*const u8, ptr))), // High locality -> L1+L2+L3
-                2 => asm volatile ("prefetcht1 %[ptr]" : : [ptr] "m" (@as(*const u8, ptr))), // Medium -> L2+L3
-                1 => asm volatile ("prefetcht2 %[ptr]" : : [ptr] "m" (@as(*const u8, ptr))), // Low -> L3 only
-                0 => asm volatile ("prefetchnta %[ptr]" : : [ptr] "m" (@as(*const u8, ptr))), // Non-temporal
+                3 => asm volatile ("prefetcht0 %[ptr]"
+                    :
+                    : [ptr] "m" (@as(*const u8, ptr)),
+                ), // High locality -> L1+L2+L3
+                2 => asm volatile ("prefetcht1 %[ptr]"
+                    :
+                    : [ptr] "m" (@as(*const u8, ptr)),
+                ), // Medium -> L2+L3
+                1 => asm volatile ("prefetcht2 %[ptr]"
+                    :
+                    : [ptr] "m" (@as(*const u8, ptr)),
+                ), // Low -> L3 only
+                0 => asm volatile ("prefetchnta %[ptr]"
+                    :
+                    : [ptr] "m" (@as(*const u8, ptr)),
+                ), // Non-temporal
             }
         }
     }
@@ -41,10 +53,22 @@ inline fn prefetchRead(ptr: [*]const u8, comptime locality: u2) void {
         // Syntax: prfm <prfop>, [<Xn|SP>{, #<pimm>}]
         // prfop encoding: PLD (prefetch for load) + locality hint
         switch (locality) {
-            3 => asm volatile ("prfm pldl1keep, [%[ptr]]" : : [ptr] "r" (ptr)), // Keep in L1
-            2 => asm volatile ("prfm pldl2keep, [%[ptr]]" : : [ptr] "r" (ptr)), // Keep in L2
-            1 => asm volatile ("prfm pldl3keep, [%[ptr]]" : : [ptr] "r" (ptr)), // Keep in L3
-            0 => asm volatile ("prfm pldl1strm, [%[ptr]]" : : [ptr] "r" (ptr)), // Streaming (non-temporal)
+            3 => asm volatile ("prfm pldl1keep, [%[ptr]]"
+                :
+                : [ptr] "r" (ptr),
+            ), // Keep in L1
+            2 => asm volatile ("prfm pldl2keep, [%[ptr]]"
+                :
+                : [ptr] "r" (ptr),
+            ), // Keep in L2
+            1 => asm volatile ("prfm pldl3keep, [%[ptr]]"
+                :
+                : [ptr] "r" (ptr),
+            ), // Keep in L3
+            0 => asm volatile ("prfm pldl1strm, [%[ptr]]"
+                :
+                : [ptr] "r" (ptr),
+            ), // Streaming (non-temporal)
         }
     }
     // Other architectures: no-op (compiler optimizes away)
@@ -54,34 +78,61 @@ inline fn prefetchRead(ptr: [*]const u8, comptime locality: u2) void {
 /// Prefetch data for write operations
 inline fn prefetchWrite(ptr: [*]u8, comptime locality: u2) void {
     const arch = comptime builtin.cpu.arch;
-
+
     // x86/x64: Use PREFETCHW if available (3DNow!/SSE), fallback to read prefetch
     if (comptime arch.isX86()) {
         // PREFETCHW is part of 3DNow! (AMD) or PRFCHW feature (Intel Broadwell+)
         const has_prefetchw = comptime std.Target.x86.featureSetHas(builtin.cpu.features, .prfchw) or
-            std.Target.x86.featureSetHas(builtin.cpu.features, .@"3dnow");
+            std.Target.x86.featureSetHas(builtin.cpu.features, .@"3dnow");
         const has_sse = comptime std.Target.x86.featureSetHas(builtin.cpu.features, .sse);
-
+
         if (has_prefetchw) {
             // Use write-specific prefetch (ignores locality for simplicity)
-            asm volatile ("prefetchw %[ptr]" : : [ptr] "m" (@as(*u8, ptr)));
+            asm volatile ("prefetchw %[ptr]"
+                :
+                : [ptr] "m" (@as(*u8, ptr)),
+            );
         } else if (has_sse) {
             // Fallback to read prefetch with specified locality
             switch (locality) {
-                3 => asm volatile ("prefetcht0 %[ptr]" : : [ptr] "m" (@as(*u8, ptr))),
-                2 => asm volatile ("prefetcht1 %[ptr]" : : [ptr] "m" (@as(*u8, ptr))),
-                1 => asm volatile ("prefetcht2 %[ptr]" : : [ptr] "m" (@as(*u8, ptr))),
-                0 => asm volatile ("prefetchnta %[ptr]" : : [ptr] "m" (@as(*u8, ptr))),
+                3 => asm volatile ("prefetcht0 %[ptr]"
+                    :
+                    : [ptr] "m" (@as(*u8, ptr)),
+                ),
+                2 => asm volatile ("prefetcht1 %[ptr]"
+                    :
+                    : [ptr] "m" (@as(*u8, ptr)),
+                ),
+                1 => asm volatile ("prefetcht2 %[ptr]"
+                    :
+                    : [ptr] "m" (@as(*u8, ptr)),
+                ),
+                0 => asm volatile ("prefetchnta %[ptr]"
+                    :
+                    : [ptr] "m" (@as(*u8, ptr)),
+                ),
             }
         }
     }
     // ARM64: Use PST (prefetch for store)
     else if (comptime arch.isAARCH64()) {
         switch (locality) {
-            3 => asm volatile ("prfm pstl1keep, [%[ptr]]" : : [ptr] "r" (ptr)),
-            2 => asm volatile ("prfm pstl2keep, [%[ptr]]" : : [ptr] "r" (ptr)),
-            1 => asm volatile ("prfm pstl3keep, [%[ptr]]" : : [ptr] "r" (ptr)),
-            0 => asm volatile ("prfm pstl1strm, [%[ptr]]" : : [ptr] "r" (ptr)),
+            3 => asm volatile ("prfm pstl1keep, [%[ptr]]"
+                :
+                : [ptr] "r" (ptr),
+            ),
+            2 => asm volatile ("prfm pstl2keep, [%[ptr]]"
+                :
+                : [ptr] "r" (ptr),
+            ),
+            1 => asm volatile ("prfm pstl3keep, [%[ptr]]"
+                :
+                : [ptr] "r" (ptr),
+            ),
+            0 => asm volatile ("prfm pstl1strm, [%[ptr]]"
+                :
+                : [ptr] "r" (ptr),
+            ),
         }
     }
 }
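
Note: the prefetch helpers above are module-internal (not pub); the intended call pattern is to hint the next cache line while scanning a buffer sequentially. Below is a minimal, self-contained sketch of that pattern. It uses Zig's portable @prefetch builtin in place of the module's per-architecture inline asm, and the function name sumWithPrefetch plus the summing loop are illustrative assumptions, not code from this commit.

const std = @import("std");

// Illustrative only: walk a buffer and hint the CPU to pull the next cache
// line while the current bytes are processed, mirroring what prefetchRead is for.
fn sumWithPrefetch(buf: []const u8) u64 {
    const cache_line: usize = 64; // same value as CACHE_LINE_SIZE in the diff
    var total: u64 = 0;
    var i: usize = 0;
    while (i < buf.len) : (i += 1) {
        if (i % cache_line == 0 and i + cache_line < buf.len) {
            // locality 3 ~ "keep in L1" (prefetcht0 / prfm pldl1keep above)
            @prefetch(buf.ptr + i + cache_line, .{ .rw = .read, .locality = 3, .cache = .data });
        }
        total += buf[i];
    }
    return total;
}

test "prefetch is only a hint; results are unchanged" {
    const data = [_]u8{1} ** 256;
    try std.testing.expectEqual(@as(u64, 256), sumWithPrefetch(&data));
}
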
@@ -473,7 +524,7 @@ fn memcpySIMD(dest: []u8, src: []const u8) void {
     // Unaligned loads/stores can be 2-3x slower on some architectures
     const dest_addr = @intFromPtr(dest.ptr);
    const alignment_offset = dest_addr & (chunk_size - 1); // Modulo chunk_size
-
+
     if (alignment_offset != 0 and len >= chunk_size) {
         // Calculate bytes needed to reach alignment
         const bytes_to_align = chunk_size - alignment_offset;
@@ -483,7 +534,7 @@ fn memcpySIMD(dest: []u8, src: []const u8) void {
             i = bytes_to_align;
         }
     }
-
+
     // Process chunks with SIMD
     while (i + chunk_size <= len) : (i += chunk_size) {
         const vec: VecType = src[i..][0..chunk_size].*;
@@ -541,9 +592,9 @@ inline fn writeU64Aligned(ptr: *align(@alignOf(u64)) [8]u8, val: u64) void {
 /// Useful for copying strings, binary data >= 64 bytes
 inline fn memcpyLarge(dest: []u8, src: []const u8) void {
     std.debug.assert(dest.len >= src.len);
-
+
     const len = src.len;
-
+
     // For very large copies (>= 64 bytes), use SIMD-optimized copy
     if (len >= 64) {
         memcpySIMD(dest[0..len], src);
@@ -566,7 +617,7 @@ inline fn byteSwapU32SIMD(val: u32) u32 {
     }

     const chunk_size = comptime detectSIMDChunkSize();
-
+
     // Use SIMD if available (SSE2+ or NEON)
     if (chunk_size >= 16) {
         // Zig's @byteSwap is optimized to use BSWAP on x86 or REV on ARM
@@ -584,7 +635,7 @@ inline fn byteSwapU64SIMD(val: u64) u64 {
     }

     const chunk_size = comptime detectSIMDChunkSize();
-
+
     if (chunk_size >= 16) {
         return @byteSwap(val);
     } else {
@@ -621,34 +672,34 @@ inline fn readU64Fast(buffer: *const [8]u8) u64 {

 /// Batch convert u32 array to big-endian (optimized for array serialization)
 /// This is useful when writing arrays of integers with known format
-/// Returns the number of bytes written
+/// Returns the number of bytes written
 /// Optimized with alignment-aware fast paths
 pub fn batchU32ToBigEndian(values: []const u32, output: []u8) usize {
     std.debug.assert(output.len >= values.len * 4);
-
+
     if (!needsByteSwap()) {
         // Already big-endian, direct copy
-        @memcpy(output[0..values.len * 4], std.mem.sliceAsBytes(values));
+        @memcpy(output[0 .. values.len * 4], std.mem.sliceAsBytes(values));
         return values.len * 4;
     }

     const chunk_size = comptime detectSIMDChunkSize();
-
+
     // SIMD optimization for batch conversion
     if (chunk_size >= 16) {
         // Check if output is aligned for faster writes
         const output_aligned = isAligned(output.ptr, @alignOf(u32));
-
+
         // Process 4 u32s at a time (16 bytes = 128 bits)
         const VecType = @Vector(4, u32);
         var i: usize = 0;
-
+
         while (i + 4 <= values.len) : (i += 4) {
             const vec: VecType = values[i..][0..4].*;
             const swapped = @byteSwap(vec);
-
+
             const out_offset = i * 4;
-
+
             if (output_aligned and isAligned(output.ptr + out_offset, 16)) {
                 // Fast path: aligned write (can be faster on some CPUs)
                 const dest_ptr: *align(16) [16]u8 = @ptrCast(@alignCast(output[out_offset..].ptr));
@@ -660,21 +711,21 @@ pub fn batchU32ToBigEndian(values: []const u32, output: []u8) usize {
                 @memcpy(output[out_offset..][0..16], swapped_bytes);
             }
         }
-
+
         // Handle remaining elements
         while (i < values.len) : (i += 1) {
             var buffer: [4]u8 = undefined;
             writeU32Fast(&buffer, values[i]);
-            @memcpy(output[i * 4..][0..4], &buffer);
+            @memcpy(output[i * 4 ..][0..4], &buffer);
         }
-
+
         return values.len * 4;
     } else {
         // Scalar fallback
         for (values, 0..) |val, i| {
             var buffer: [4]u8 = undefined;
             writeU32Fast(&buffer, val);
-            @memcpy(output[i * 4..][0..4], &buffer);
+            @memcpy(output[i * 4 ..][0..4], &buffer);
         }
         return values.len * 4;
     }
@@ -684,28 +735,28 @@ pub fn batchU32ToBigEndian(values: []const u32, output: []u8) usize {
 /// Optimized with alignment-aware fast paths
 pub fn batchU64ToBigEndian(values: []const u64, output: []u8) usize {
     std.debug.assert(output.len >= values.len * 8);
-
+
     if (!needsByteSwap()) {
-        @memcpy(output[0..values.len * 8], std.mem.sliceAsBytes(values));
+        @memcpy(output[0 .. values.len * 8], std.mem.sliceAsBytes(values));
         return values.len * 8;
     }

     const chunk_size = comptime detectSIMDChunkSize();
-
+
     if (chunk_size >= 16) {
         // Check if output is aligned for faster writes
         const output_aligned = isAligned(output.ptr, @alignOf(u64));
-
+
         // Process 2 u64s at a time (16 bytes)
         const VecType = @Vector(2, u64);
         var i: usize = 0;
-
+
         while (i + 2 <= values.len) : (i += 2) {
             const vec: VecType = values[i..][0..2].*;
             const swapped = @byteSwap(vec);
-
+
             const out_offset = i * 8;
-
+
             if (output_aligned and isAligned(output.ptr + out_offset, 16)) {
                 // Fast path: aligned write
                 const dest_ptr: *align(16) [16]u8 = @ptrCast(@alignCast(output[out_offset..].ptr));
@@ -717,20 +768,20 @@ pub fn batchU64ToBigEndian(values: []const u64, output: []u8) usize {
                 @memcpy(output[out_offset..][0..16], swapped_bytes);
             }
         }
-
+
         // Handle remaining element
         if (i < values.len) {
             var buffer: [8]u8 = undefined;
             writeU64Fast(&buffer, values[i]);
-            @memcpy(output[i * 8..][0..8], &buffer);
+            @memcpy(output[i * 8 ..][0..8], &buffer);
         }
-
+
         return values.len * 8;
     } else {
        for (values, 0..) |val, i| {
            var buffer: [8]u8 = undefined;
            writeU64Fast(&buffer, val);
-            @memcpy(output[i * 8..][0..8], &buffer);
+            @memcpy(output[i * 8 ..][0..8], &buffer);
        }
        return values.len * 8;
    }
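
Note: the reformat does not change the contract of the public batch converters: each writes values.len * 4 (or * 8) big-endian bytes into output and returns that byte count. A small test sketch for the u32 variant follows; the @import path is an assumption, adjust it to however the build exposes src/msgpack.zig.

const std = @import("std");
const msgpack = @import("msgpack.zig"); // assumed relative import path

test "batchU32ToBigEndian writes big-endian words and returns byte count" {
    const values = [_]u32{ 0x01020304, 0xAABBCCDD };
    var out: [values.len * 4]u8 = undefined;

    const written = msgpack.batchU32ToBigEndian(&values, &out);

    try std.testing.expectEqual(@as(usize, 8), written);
    try std.testing.expectEqualSlices(
        u8,
        &[_]u8{ 0x01, 0x02, 0x03, 0x04, 0xAA, 0xBB, 0xCC, 0xDD },
        out[0..written],
    );
}
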

0 commit comments
