Skip to content

Commit cef72c7

Browse files
committed
Kernel: Use word-sized writes in the generic memset implementation
This decreases the boot time on my x86-64 host system from 15 s to 13 s for AArch64 QEMU TCG and 33 s to 30 s for RISC-V QEMU TCG! I additionally measured the performance of this new implementation with this simple benchmark: https://gist.github.com/spholz/b06ea737b435ecc181069cf0d911faa4 Based on this benchmark, an unroll level of 8 seems like a good choice for all tested systems. Here are the speedups for n=0x10000: - Raspberry Pi 5: 7.6 ( 81984 ns -> 10748 ns) - Raspberry Pi 4: 3.3 (131197 ns -> 39704 ns) - StarFive VisionFive 2: 5.5 (279107 ns -> 50650 ns) - AArch64 QEMU TCG: 6.8 (374287 ns -> 54847 ns) - RISC-V QEMU TCG: 6.7 (354195 ns -> 52615 ns) - x86-64 QEMU KVM: 3.8 ( 32443 ns -> 8542 ns)
1 parent 79f06a1 commit cef72c7

File tree

1 file changed

+30
-3
lines changed

1 file changed

+30
-3
lines changed

Kernel/Library/MiniStdLib.cpp

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -70,10 +70,37 @@ void* memset(void* dest_ptr, int c, size_t n)
7070
: "a"(c)
7171
: "memory");
7272
#else
73-
u8* pd = (u8*)dest_ptr;
74-
for (; n--;)
75-
*pd++ = c;
73+
auto* dest = static_cast<u8*>(dest_ptr);
74+
75+
auto is_word_aligned = [](auto ptr) {
76+
return reinterpret_cast<FlatPtr>(ptr) % sizeof(FlatPtr) == 0;
77+
};
78+
79+
// Set bytes until destination is word-aligned.
80+
while (n > 0 && !is_word_aligned(dest)) {
81+
*dest++ = static_cast<u8>(c);
82+
n--;
83+
}
84+
85+
// Set in word-sized chunks.
86+
FlatPtr exploded = explode_byte(c);
87+
88+
auto* dest_word = reinterpret_cast<FlatPtr*>(dest);
89+
90+
# pragma GCC unroll 8
91+
while (n >= sizeof(FlatPtr)) {
92+
*dest_word++ = exploded;
93+
n -= sizeof(FlatPtr);
94+
}
95+
dest = reinterpret_cast<u8*>(dest_word);
96+
97+
// Set remaining tail bytes.
98+
while (n > 0) {
99+
*dest++ = static_cast<u8>(c);
100+
n--;
101+
}
76102
#endif
103+
77104
return dest_ptr;
78105
}
79106

0 commit comments

Comments
 (0)