Skip to content

Commit 192dd3f

Browse files
committed
optimize ZSTD_copy16 on x86
this change removes one (1) instruction, but gives a ~2% performance improvement on the benchmarks.
1 parent 1130bd5 commit 192dd3f

File tree

1 file changed

+22
-4
lines changed

1 file changed

+22
-4
lines changed

lib/common/zstd_internal.rs

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
use core::mem::MaybeUninit;
2+
13
use libc::size_t;
24

35
const fn const_max(a: usize, b: usize) -> usize {
@@ -68,12 +70,28 @@ pub(crate) static OF_defaultNorm: [i16; 29] = [
6870
pub(crate) const OF_DEFAULTNORMLOG: u32 = 5;
6971
pub(crate) static OF_defaultNormLog: u32 = OF_DEFAULTNORMLOG;
7072

73+
#[inline]
7174
pub(crate) unsafe fn ZSTD_copy16(dst: *mut u8, src: *const u8) {
7275
// We use `copy` instead of `copy_nonoverlapping` here because the literal buffer can now
7376
// be located within the dst buffer. In circumstances where the op "catches up" to where the
7477
// literal buffer is, there can be partial overlaps in this call on the final
7578
// copy if the literal is being shifted by less than 16 bytes.
76-
core::ptr::copy(src, dst, 16)
79+
crate::cfg_select!(
80+
target_arch = "x86_64" => {
81+
#[repr(C)]
82+
struct V {
83+
x: i64,
84+
y: i64,
85+
}
86+
87+
// This generates the same instructions, but surrounding code optimizes better.
88+
let v = core::ptr::read_unaligned(src as *const MaybeUninit<V>);
89+
core::ptr::write_unaligned(dst as *mut MaybeUninit<V>, v);
90+
}
91+
_ => {
92+
core::ptr::copy(src, dst, 16);
93+
}
94+
);
7795
}
7896

7997
pub(crate) const WILDCOPY_OVERLENGTH: usize = 32;
@@ -126,7 +144,7 @@ pub(crate) unsafe fn ZSTD_wildcopy(
126144
// probabilities. Since it is almost certain to be short, only do
127145
// one 16-byte copy in the first call. Then, do two calls per loop since
128146
// at that point it is more likely to have a high trip count.
129-
core::ptr::copy(ip, op, 16);
147+
ZSTD_copy16(op, ip);
130148

131149
if 16 >= length {
132150
return;
@@ -136,11 +154,11 @@ pub(crate) unsafe fn ZSTD_wildcopy(
136154
ip = ip.add(16);
137155

138156
loop {
139-
core::ptr::copy(ip, op, 16);
157+
ZSTD_copy16(op, ip);
140158
op = op.add(16);
141159
ip = ip.add(16);
142160

143-
core::ptr::copy(ip, op, 16);
161+
ZSTD_copy16(op, ip);
144162
op = op.add(16);
145163
ip = ip.add(16);
146164

0 commit comments

Comments
 (0)