@@ -15,23 +15,35 @@ const little_endian = std.builtin.Endian.little;
 /// Cache line size for prefetch optimization
 const CACHE_LINE_SIZE: usize = 64;
 
-/// Prefetch hint for read-ahead optimization
+/// Prefetch hint for read-ahead optimization
 /// Uses compiler intrinsics to hint CPU to prefetch data
 /// This is a performance hint and may be a no-op on some architectures
 inline fn prefetchRead(ptr: [*]const u8, comptime locality: u2) void {
     // locality: 0=no temporal locality (NTA), 1=low (T2), 2=medium (T1), 3=high (T0)
     const arch = comptime builtin.cpu.arch;
-
+
     // x86/x64: Check for SSE support (required for PREFETCH instructions)
     if (comptime arch.isX86()) {
         const has_sse = comptime std.Target.x86.featureSetHas(builtin.cpu.features, .sse);
         if (has_sse) {
             // Use different prefetch instructions based on locality
             switch (locality) {
-                3 => asm volatile ("prefetcht0 %[ptr]" : : [ptr] "m" (@as(*const u8, ptr))), // High locality -> L1+L2+L3
-                2 => asm volatile ("prefetcht1 %[ptr]" : : [ptr] "m" (@as(*const u8, ptr))), // Medium -> L2+L3
-                1 => asm volatile ("prefetcht2 %[ptr]" : : [ptr] "m" (@as(*const u8, ptr))), // Low -> L3 only
-                0 => asm volatile ("prefetchnta %[ptr]" : : [ptr] "m" (@as(*const u8, ptr))), // Non-temporal
+                3 => asm volatile ("prefetcht0 %[ptr]"
+                    :
+                    : [ptr] "m" (@as(*const u8, ptr)),
+                ), // High locality -> L1+L2+L3
+                2 => asm volatile ("prefetcht1 %[ptr]"
+                    :
+                    : [ptr] "m" (@as(*const u8, ptr)),
+                ), // Medium -> L2+L3
+                1 => asm volatile ("prefetcht2 %[ptr]"
+                    :
+                    : [ptr] "m" (@as(*const u8, ptr)),
+                ), // Low -> L3 only
+                0 => asm volatile ("prefetchnta %[ptr]"
+                    :
+                    : [ptr] "m" (@as(*const u8, ptr)),
+                ), // Non-temporal
             }
         }
     }
@@ -41,10 +53,22 @@ inline fn prefetchRead(ptr: [*]const u8, comptime locality: u2) void {
         // Syntax: prfm <prfop>, [<Xn|SP>{, #<pimm>}]
         // prfop encoding: PLD (prefetch for load) + locality hint
         switch (locality) {
-            3 => asm volatile ("prfm pldl1keep, [%[ptr]]" : : [ptr] "r" (ptr)), // Keep in L1
-            2 => asm volatile ("prfm pldl2keep, [%[ptr]]" : : [ptr] "r" (ptr)), // Keep in L2
-            1 => asm volatile ("prfm pldl3keep, [%[ptr]]" : : [ptr] "r" (ptr)), // Keep in L3
-            0 => asm volatile ("prfm pldl1strm, [%[ptr]]" : : [ptr] "r" (ptr)), // Streaming (non-temporal)
+            3 => asm volatile ("prfm pldl1keep, [%[ptr]]"
+                :
+                : [ptr] "r" (ptr),
+            ), // Keep in L1
+            2 => asm volatile ("prfm pldl2keep, [%[ptr]]"
+                :
+                : [ptr] "r" (ptr),
+            ), // Keep in L2
+            1 => asm volatile ("prfm pldl3keep, [%[ptr]]"
+                :
+                : [ptr] "r" (ptr),
+            ), // Keep in L3
+            0 => asm volatile ("prfm pldl1strm, [%[ptr]]"
+                :
+                : [ptr] "r" (ptr),
+            ), // Streaming (non-temporal)
         }
     }
     // Other architectures: no-op (compiler optimizes away)
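
For comparison only (not part of this change): recent Zig releases expose a portable @prefetch builtin that lowers to the same PREFETCHTx/PRFM hints where the target supports them. A minimal sketch, assuming the std.builtin.PrefetchOptions shape of Zig 0.11+:

inline fn prefetchReadPortable(ptr: [*]const u8, comptime locality: u2) void {
    // .rw = .read requests a load prefetch; locality uses the same 0..3 scale as above
    // (0 = streaming/non-temporal, 3 = keep in L1). It is a no-op on targets without prefetch.
    @prefetch(ptr, .{ .rw = .read, .locality = locality, .cache = .data });
}
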
@@ -54,34 +78,61 @@ inline fn prefetchRead(ptr: [*]const u8, comptime locality: u2) void {
 /// Prefetch data for write operations
 inline fn prefetchWrite(ptr: [*]u8, comptime locality: u2) void {
     const arch = comptime builtin.cpu.arch;
-
+
     // x86/x64: Use PREFETCHW if available (3DNow!/SSE), fallback to read prefetch
     if (comptime arch.isX86()) {
         // PREFETCHW is part of 3DNow! (AMD) or PRFCHW feature (Intel Broadwell+)
         const has_prefetchw = comptime std.Target.x86.featureSetHas(builtin.cpu.features, .prfchw) or
-            std.Target.x86.featureSetHas(builtin.cpu.features, .@"3dnow");
+            std.Target.x86.featureSetHas(builtin.cpu.features, .@"3dnow");
         const has_sse = comptime std.Target.x86.featureSetHas(builtin.cpu.features, .sse);
-
+
         if (has_prefetchw) {
             // Use write-specific prefetch (ignores locality for simplicity)
-            asm volatile ("prefetchw %[ptr]" : : [ptr] "m" (@as(*u8, ptr)));
+            asm volatile ("prefetchw %[ptr]"
+                :
+                : [ptr] "m" (@as(*u8, ptr)),
+            );
         } else if (has_sse) {
             // Fallback to read prefetch with specified locality
             switch (locality) {
-                3 => asm volatile ("prefetcht0 %[ptr]" : : [ptr] "m" (@as(*u8, ptr))),
-                2 => asm volatile ("prefetcht1 %[ptr]" : : [ptr] "m" (@as(*u8, ptr))),
-                1 => asm volatile ("prefetcht2 %[ptr]" : : [ptr] "m" (@as(*u8, ptr))),
-                0 => asm volatile ("prefetchnta %[ptr]" : : [ptr] "m" (@as(*u8, ptr))),
+                3 => asm volatile ("prefetcht0 %[ptr]"
+                    :
+                    : [ptr] "m" (@as(*u8, ptr)),
+                ),
+                2 => asm volatile ("prefetcht1 %[ptr]"
+                    :
+                    : [ptr] "m" (@as(*u8, ptr)),
+                ),
+                1 => asm volatile ("prefetcht2 %[ptr]"
+                    :
+                    : [ptr] "m" (@as(*u8, ptr)),
+                ),
+                0 => asm volatile ("prefetchnta %[ptr]"
+                    :
+                    : [ptr] "m" (@as(*u8, ptr)),
+                ),
             }
         }
     }
     // ARM64: Use PST (prefetch for store)
     else if (comptime arch.isAARCH64()) {
         switch (locality) {
-            3 => asm volatile ("prfm pstl1keep, [%[ptr]]" : : [ptr] "r" (ptr)),
-            2 => asm volatile ("prfm pstl2keep, [%[ptr]]" : : [ptr] "r" (ptr)),
-            1 => asm volatile ("prfm pstl3keep, [%[ptr]]" : : [ptr] "r" (ptr)),
-            0 => asm volatile ("prfm pstl1strm, [%[ptr]]" : : [ptr] "r" (ptr)),
+            3 => asm volatile ("prfm pstl1keep, [%[ptr]]"
+                :
+                : [ptr] "r" (ptr),
+            ),
+            2 => asm volatile ("prfm pstl2keep, [%[ptr]]"
+                :
+                : [ptr] "r" (ptr),
+            ),
+            1 => asm volatile ("prfm pstl3keep, [%[ptr]]"
+                :
+                : [ptr] "r" (ptr),
+            ),
+            0 => asm volatile ("prfm pstl1strm, [%[ptr]]"
+                :
+                : [ptr] "r" (ptr),
+            ),
         }
     }
 }
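
A usage sketch for these hints (illustrative only; sumBytesWithPrefetch is a hypothetical caller, while prefetchRead and CACHE_LINE_SIZE come from this file): issue the hint one cache line ahead of a sequential scan so the data arrives before it is needed.

fn sumBytesWithPrefetch(data: []const u8) u64 {
    var total: u64 = 0;
    var i: usize = 0;
    while (i < data.len) : (i += CACHE_LINE_SIZE) {
        // Hint the line we will touch on the next iteration; locality 3 keeps it
        // in L1 because we read it almost immediately.
        if (i + CACHE_LINE_SIZE < data.len) {
            prefetchRead(data.ptr + i + CACHE_LINE_SIZE, 3);
        }
        const end = @min(i + CACHE_LINE_SIZE, data.len);
        for (data[i..end]) |b| total += b;
    }
    return total;
}
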
@@ -473,7 +524,7 @@ fn memcpySIMD(dest: []u8, src: []const u8) void {
     // Unaligned loads/stores can be 2-3x slower on some architectures
     const dest_addr = @intFromPtr(dest.ptr);
     const alignment_offset = dest_addr & (chunk_size - 1); // Modulo chunk_size
-
+
     if (alignment_offset != 0 and len >= chunk_size) {
         // Calculate bytes needed to reach alignment
         const bytes_to_align = chunk_size - alignment_offset;
@@ -483,7 +534,7 @@ fn memcpySIMD(dest: []u8, src: []const u8) void {
             i = bytes_to_align;
         }
     }
-
+
     // Process chunks with SIMD
     while (i + chunk_size <= len) : (i += chunk_size) {
         const vec: VecType = src[i..][0..chunk_size].*;
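
The alignment handling above follows the usual head/body/tail pattern; as a standalone sketch of just the head computation (assuming chunk_size is a power of two, which the mask arithmetic requires):

fn alignmentHead(dest_addr: usize, chunk_size: usize) usize {
    // Offset of dest within its chunk_size-sized block; zero means already aligned.
    const offset = dest_addr & (chunk_size - 1);
    // Bytes to copy scalar-wise before vector stores can be issued aligned.
    return if (offset == 0) 0 else chunk_size - offset;
}
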
@@ -541,9 +592,9 @@ inline fn writeU64Aligned(ptr: *align(@alignOf(u64)) [8]u8, val: u64) void {
 /// Useful for copying strings, binary data >= 64 bytes
 inline fn memcpyLarge(dest: []u8, src: []const u8) void {
     std.debug.assert(dest.len >= src.len);
-
+
     const len = src.len;
-
+
     // For very large copies (>= 64 bytes), use SIMD-optimized copy
     if (len >= 64) {
         memcpySIMD(dest[0..len], src);
@@ -566,7 +617,7 @@ inline fn byteSwapU32SIMD(val: u32) u32 {
     }
 
     const chunk_size = comptime detectSIMDChunkSize();
-
+
     // Use SIMD if available (SSE2+ or NEON)
     if (chunk_size >= 16) {
         // Zig's @byteSwap is optimized to use BSWAP on x86 or REV on ARM
@@ -584,7 +635,7 @@ inline fn byteSwapU64SIMD(val: u64) u64 {
     }
 
     const chunk_size = comptime detectSIMDChunkSize();
-
+
     if (chunk_size >= 16) {
         return @byteSwap(val);
     } else {
@@ -621,34 +672,34 @@ inline fn readU64Fast(buffer: *const [8]u8) u64 {
 
 /// Batch convert u32 array to big-endian (optimized for array serialization)
 /// This is useful when writing arrays of integers with known format
-/// Returns the number of bytes written
+/// Returns the number of bytes written
 /// Optimized with alignment-aware fast paths
 pub fn batchU32ToBigEndian(values: []const u32, output: []u8) usize {
     std.debug.assert(output.len >= values.len * 4);
-
+
     if (!needsByteSwap()) {
         // Already big-endian, direct copy
-        @memcpy(output[0..values.len * 4], std.mem.sliceAsBytes(values));
+        @memcpy(output[0 .. values.len * 4], std.mem.sliceAsBytes(values));
         return values.len * 4;
     }
 
     const chunk_size = comptime detectSIMDChunkSize();
-
+
     // SIMD optimization for batch conversion
     if (chunk_size >= 16) {
         // Check if output is aligned for faster writes
         const output_aligned = isAligned(output.ptr, @alignOf(u32));
-
+
         // Process 4 u32s at a time (16 bytes = 128 bits)
         const VecType = @Vector(4, u32);
         var i: usize = 0;
-
+
         while (i + 4 <= values.len) : (i += 4) {
             const vec: VecType = values[i..][0..4].*;
             const swapped = @byteSwap(vec);
-
+
             const out_offset = i * 4;
-
+
             if (output_aligned and isAligned(output.ptr + out_offset, 16)) {
                 // Fast path: aligned write (can be faster on some CPUs)
                 const dest_ptr: *align(16) [16]u8 = @ptrCast(@alignCast(output[out_offset..].ptr));
@@ -660,21 +711,21 @@ pub fn batchU32ToBigEndian(values: []const u32, output: []u8) usize {
                 @memcpy(output[out_offset..][0..16], swapped_bytes);
             }
         }
-
+
         // Handle remaining elements
         while (i < values.len) : (i += 1) {
             var buffer: [4]u8 = undefined;
             writeU32Fast(&buffer, values[i]);
-            @memcpy(output[i * 4..][0..4], &buffer);
+            @memcpy(output[i * 4 ..][0..4], &buffer);
         }
-
+
         return values.len * 4;
     } else {
         // Scalar fallback
         for (values, 0..) |val, i| {
             var buffer: [4]u8 = undefined;
             writeU32Fast(&buffer, val);
-            @memcpy(output[i * 4..][0..4], &buffer);
+            @memcpy(output[i * 4 ..][0..4], &buffer);
         }
         return values.len * 4;
     }
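
A minimal test-style usage sketch (not part of the commit) showing the expected output layout, which is big-endian regardless of host endianness:

test "batchU32ToBigEndian example (sketch)" {
    const values = [_]u32{ 0x11223344, 0xAABBCCDD };
    var out: [8]u8 = undefined;
    const written = batchU32ToBigEndian(&values, &out);
    try std.testing.expectEqual(@as(usize, 8), written);
    try std.testing.expectEqualSlices(
        u8,
        &[_]u8{ 0x11, 0x22, 0x33, 0x44, 0xAA, 0xBB, 0xCC, 0xDD },
        &out,
    );
}
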
@@ -684,28 +735,28 @@ pub fn batchU32ToBigEndian(values: []const u32, output: []u8) usize {
 /// Optimized with alignment-aware fast paths
 pub fn batchU64ToBigEndian(values: []const u64, output: []u8) usize {
     std.debug.assert(output.len >= values.len * 8);
-
+
     if (!needsByteSwap()) {
-        @memcpy(output[0..values.len * 8], std.mem.sliceAsBytes(values));
+        @memcpy(output[0 .. values.len * 8], std.mem.sliceAsBytes(values));
         return values.len * 8;
     }
 
     const chunk_size = comptime detectSIMDChunkSize();
-
+
     if (chunk_size >= 16) {
         // Check if output is aligned for faster writes
         const output_aligned = isAligned(output.ptr, @alignOf(u64));
-
+
         // Process 2 u64s at a time (16 bytes)
         const VecType = @Vector(2, u64);
         var i: usize = 0;
-
+
         while (i + 2 <= values.len) : (i += 2) {
             const vec: VecType = values[i..][0..2].*;
             const swapped = @byteSwap(vec);
-
+
             const out_offset = i * 8;
-
+
             if (output_aligned and isAligned(output.ptr + out_offset, 16)) {
                 // Fast path: aligned write
                 const dest_ptr: *align(16) [16]u8 = @ptrCast(@alignCast(output[out_offset..].ptr));
@@ -717,20 +768,20 @@ pub fn batchU64ToBigEndian(values: []const u64, output: []u8) usize {
                 @memcpy(output[out_offset..][0..16], swapped_bytes);
             }
         }
-
+
         // Handle remaining element
         if (i < values.len) {
             var buffer: [8]u8 = undefined;
             writeU64Fast(&buffer, values[i]);
-            @memcpy(output[i * 8..][0..8], &buffer);
+            @memcpy(output[i * 8 ..][0..8], &buffer);
         }
-
+
         return values.len * 8;
     } else {
         for (values, 0..) |val, i| {
             var buffer: [8]u8 = undefined;
             writeU64Fast(&buffer, val);
-            @memcpy(output[i * 8..][0..8], &buffer);
+            @memcpy(output[i * 8 ..][0..8], &buffer);
         }
         return values.len * 8;
     }
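
For reference, the scalar baseline these batch helpers aim to beat is a plain per-element loop over std.mem.writeInt; a sketch of that baseline (the four-argument writeInt taking an endianness value is assumed, as in recent Zig releases):

fn batchU64ToBigEndianScalar(values: []const u64, output: []u8) usize {
    std.debug.assert(output.len >= values.len * 8);
    for (values, 0..) |val, i| {
        // One bounded 8-byte big-endian store per element; no SIMD, no alignment checks.
        std.mem.writeInt(u64, output[i * 8 ..][0..8], val, .big);
    }
    return values.len * 8;
}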