Skip to content

Commit b1a3e75

Browse files
terrelln authored and torvalds committed
lz4: fix kernel decompression speed
This patch replaces all memcpy() calls with LZ4_memcpy() which calls
__builtin_memcpy() so the compiler can inline it.

LZ4 relies heavily on memcpy() with a constant size being inlined. In x86
and i386 pre-boot environments memcpy() cannot be inlined because memcpy()
doesn't get defined as __builtin_memcpy().

An equivalent patch has been applied upstream so that the next import
won't lose this change [1].

I've measured the kernel decompression speed using QEMU before and after
this patch for the x86_64 and i386 architectures. The speed-up is about
10x as shown below.

    Code    Arch     Kernel Size   Time    Speed
    v5.8    x86_64   11504832 B    148 ms   79 MB/s
    patch   x86_64   11503872 B     13 ms  885 MB/s
    v5.8    i386      9621216 B     91 ms  106 MB/s
    patch   i386      9620224 B     10 ms  962 MB/s

I also measured the time to decompress the initramfs on x86_64, i386, and
arm. All three show the same decompression speed before and after, as
expected.

[1] lz4/lz4#890

Signed-off-by: Nick Terrell <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Cc: Yann Collet <[email protected]>
Cc: Gao Xiang <[email protected]>
Cc: Sven Schmidt <[email protected]>
Cc: Greg Kroah-Hartman <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Arvind Sankar <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Linus Torvalds <[email protected]>
1 parent a8a4b7a commit b1a3e75

File tree

4 files changed

+22
-12
lines changed

4 files changed

+22
-12
lines changed

lib/lz4/lz4_compress.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -446,7 +446,7 @@ static FORCE_INLINE int LZ4_compress_generic(
446446
*op++ = (BYTE)(lastRun << ML_BITS);
447447
}
448448

449-
memcpy(op, anchor, lastRun);
449+
LZ4_memcpy(op, anchor, lastRun);
450450

451451
op += lastRun;
452452
}
@@ -708,7 +708,7 @@ static int LZ4_compress_destSize_generic(
708708
} else {
709709
*op++ = (BYTE)(lastRunSize<<ML_BITS);
710710
}
711-
memcpy(op, anchor, lastRunSize);
711+
LZ4_memcpy(op, anchor, lastRunSize);
712712
op += lastRunSize;
713713
}
714714

lib/lz4/lz4_decompress.c

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
153153
&& likely((endOnInput ? ip < shortiend : 1) &
154154
(op <= shortoend))) {
155155
/* Copy the literals */
156-
memcpy(op, ip, endOnInput ? 16 : 8);
156+
LZ4_memcpy(op, ip, endOnInput ? 16 : 8);
157157
op += length; ip += length;
158158

159159
/*
@@ -172,9 +172,9 @@ static FORCE_INLINE int LZ4_decompress_generic(
172172
(offset >= 8) &&
173173
(dict == withPrefix64k || match >= lowPrefix)) {
174174
/* Copy the match. */
175-
memcpy(op + 0, match + 0, 8);
176-
memcpy(op + 8, match + 8, 8);
177-
memcpy(op + 16, match + 16, 2);
175+
LZ4_memcpy(op + 0, match + 0, 8);
176+
LZ4_memcpy(op + 8, match + 8, 8);
177+
LZ4_memcpy(op + 16, match + 16, 2);
178178
op += length + MINMATCH;
179179
/* Both stages worked, load the next token. */
180180
continue;
@@ -263,7 +263,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
263263
}
264264
}
265265

266-
memcpy(op, ip, length);
266+
LZ4_memcpy(op, ip, length);
267267
ip += length;
268268
op += length;
269269

@@ -350,7 +350,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
350350
size_t const copySize = (size_t)(lowPrefix - match);
351351
size_t const restSize = length - copySize;
352352

353-
memcpy(op, dictEnd - copySize, copySize);
353+
LZ4_memcpy(op, dictEnd - copySize, copySize);
354354
op += copySize;
355355
if (restSize > (size_t)(op - lowPrefix)) {
356356
/* overlap copy */
@@ -360,7 +360,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
360360
while (op < endOfMatch)
361361
*op++ = *copyFrom++;
362362
} else {
363-
memcpy(op, lowPrefix, restSize);
363+
LZ4_memcpy(op, lowPrefix, restSize);
364364
op += restSize;
365365
}
366366
}
@@ -386,7 +386,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
386386
while (op < copyEnd)
387387
*op++ = *match++;
388388
} else {
389-
memcpy(op, match, mlen);
389+
LZ4_memcpy(op, match, mlen);
390390
}
391391
op = copyEnd;
392392
if (op == oend)
@@ -400,7 +400,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
400400
op[2] = match[2];
401401
op[3] = match[3];
402402
match += inc32table[offset];
403-
memcpy(op + 4, match, 4);
403+
LZ4_memcpy(op + 4, match, 4);
404404
match -= dec64table[offset];
405405
} else {
406406
LZ4_copy8(op, match);

lib/lz4/lz4defs.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,16 @@ static FORCE_INLINE void LZ4_writeLE16(void *memPtr, U16 value)
137137
return put_unaligned_le16(value, memPtr);
138138
}
139139

140+
/*
141+
* LZ4 relies on memcpy with a constant size being inlined. In freestanding
142+
* environments, the compiler can't assume the implementation of memcpy() is
143+
* standard compliant, so apply its specialized memcpy() inlining logic. When
144+
* possible, use __builtin_memcpy() to tell the compiler to analyze memcpy()
145+
* as-if it were standard compliant, so it can inline it in freestanding
146+
* environments. This is needed when decompressing the Linux Kernel, for example.
147+
*/
148+
#define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size)
149+
140150
static FORCE_INLINE void LZ4_copy8(void *dst, const void *src)
141151
{
142152
#if LZ4_ARCH64

lib/lz4/lz4hc_compress.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -570,7 +570,7 @@ static int LZ4HC_compress_generic(
570570
*op++ = (BYTE) lastRun;
571571
} else
572572
*op++ = (BYTE)(lastRun<<ML_BITS);
573-
memcpy(op, anchor, iend - anchor);
573+
LZ4_memcpy(op, anchor, iend - anchor);
574574
op += iend - anchor;
575575
}
576576

0 commit comments

Comments
 (0)