-
Notifications
You must be signed in to change notification settings - Fork 15.2k
Closed
Description
Reference code: Zig Godbolt
Some opportunities for producing optimized sum of absolute differences (SAD) calculations are being missed. It looks like prior support for this was overly restrictive.
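Essentially, the absolute difference is being computed explicitly (`vpminub` / `vpmaxub` / `vpsubb`) and then summed with `vpsadbw` against a zeroed register, when the whole calculation should just be handled by a single dedicated SAD instruction (`vpsadbw`) applied directly to the two inputs.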
Here's the code inline:
// Number of lanes processed per load (one `block_width`-element row segment).
const block_width = 8;
// Element type of the source and reference samples.
const T = u8;
// Vector of `block_width` lanes of `T`.
const VT = @Vector(block_width, T);
/// Reproducer: sum of absolute differences (SAD) between the first
/// `block_width` bytes of `srcp` and the first `block_width` bytes of `refp`.
///
/// Only row 0 is read (the row offset is `0*stride`); `height` and `stride`
/// are used solely to bound the slices taken from the two pointers.
/// Returns the accumulated sum as a `u32`.
export fn sad(noalias srcp: [*]const u8, noalias refp: [*]const u8, height: usize, stride: usize) u32 {
// Bound both many-item pointers to `height * stride` elements.
const src = srcp[0..height * stride];
const ref = refp[0..height * stride];
var sum: u32 = 0;
// Load `block_width` bytes from the start of row 0 of each buffer.
const s: VT = src[0*stride..][0..block_width].*;
const r: VT = ref[0*stride..][0..block_width].*;
// Should work, but doesn't.
// Variant 1 (active): per-lane |s - r| written as max - min, staying in u8 lanes.
// NOTE(review): the reduction below appears to be carried out in the u8 element
// type before being added to `sum`, so a total above 255 would not fit —
// confirm against the `@reduce` documentation.
const absdiff = @max(s,r) - @min(s,r);
sum += @reduce(.Add, absdiff);
// Should work, but doesn't
// Variant 2 (disabled): widen both inputs to i16 lanes, subtract, take @abs, then reduce.
//const VTI = @Vector(block_width, i16);
//sum += @reduce(.Add, @abs(@as(VTI, s) - @as(VTI, r)));
// Does work
// Variant 3 (disabled): same as variant 2 but with i32 lanes.
//const VTI = @Vector(block_width, i32);
//sum += @reduce(.Add, @abs(@as(VTI, s) - @as(VTI, r)));
return sum;
}
Which produces:
sad:
push rbp
mov rbp, rsp
vmovq xmm0, qword ptr [rdi]
vmovq xmm1, qword ptr [rsi]
vpminub xmm2, xmm0, xmm1
vpmaxub xmm0, xmm0, xmm1
vpxor xmm1, xmm1, xmm1
vpsubb xmm0, xmm0, xmm2
vpsadbw xmm0, xmm0, xmm1
vpextrb eax, xmm0, 0
pop rbp
ret

But it should be:
sad:
push rbp
mov rbp, rsp
vmovq xmm0, qword ptr [rdi]
vmovq xmm1, qword ptr [rsi]
vpsadbw xmm0, xmm0, xmm1
vmovd eax, xmm0
pop rbp
ret