Skip to content

Commit 3586dc9

Browse files
committed
reduce interleave to 2
1 parent a1dab99 commit 3586dc9

File tree

3 files changed

+65
-99
lines changed

3 files changed

+65
-99
lines changed

internal/cosine/cosine_avx.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@ void f32_cosine_distance(const float *x, const float *y, double *result, const u
1010
float sum_xx = 0.0f;
1111
float sum_yy = 0.0f;
1212

13-
#pragma clang loop vectorize(enable) interleave_count(4)
13+
#pragma clang loop vectorize(enable) interleave_count(2)
1414
for (uint64_t i = 0; i < size; i++) {
1515
sum_xy += x[i] * y[i]; // Sum of x * y
1616
sum_xx += x[i] * x[i]; // Sum of x * x

internal/cosine/simd/cosine_avx.s

Lines changed: 62 additions & 96 deletions
Original file line number | Diff line number | Diff line change
@@ -8,132 +8,98 @@ TEXT ·f32_cosine_distance(SB), $0-32
88
MOVQ size+24(FP), CX
99
BYTE $0x55 // push rbp
1010
WORD $0x8948; BYTE $0xe5 // mov rbp, rsp
11-
LONG $0xe0e48348 // and rsp, -32
12-
LONG $0x60ec8348 // sub rsp, 96
13-
LONG $0xed57d0c5 // vxorps xmm5, xmm5, xmm5
14-
LONG $0xe457d8c5 // vxorps xmm4, xmm4, xmm4
11+
LONG $0xf8e48348 // and rsp, -8
1512
LONG $0xc957f0c5 // vxorps xmm1, xmm1, xmm1
16-
LONG $0xdb57e0c5 // vxorps xmm3, xmm3, xmm3
13+
LONG $0xc057f8c5 // vxorps xmm0, xmm0, xmm0
14+
LONG $0xd257e8c5 // vxorps xmm2, xmm2, xmm2
15+
LONG $0xe457d8c5 // vxorps xmm4, xmm4, xmm4
1716
WORD $0x8548; BYTE $0xc9 // test rcx, rcx
1817
JE LBB0_8
19-
LONG $0x20f98348 // cmp rcx, 32
18+
LONG $0x10f98348 // cmp rcx, 16
2019
JAE LBB0_4
21-
LONG $0xd257e8c5 // vxorps xmm2, xmm2, xmm2
20+
LONG $0xdb57e0c5 // vxorps xmm3, xmm3, xmm3
2221
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
23-
LONG $0xf657c8c5 // vxorps xmm6, xmm6, xmm6
24-
LONG $0xc957f0c5 // vxorps xmm1, xmm1, xmm1
22+
LONG $0xe457d8c5 // vxorps xmm4, xmm4, xmm4
23+
LONG $0xd257e8c5 // vxorps xmm2, xmm2, xmm2
2524
JMP LBB0_3
2625

2726
LBB0_4:
28-
WORD $0x8949; BYTE $0xc8 // mov r8, rcx
29-
LONG $0xe0e08349 // and r8, -32
30-
LONG $0xc057f8c5 // vxorps xmm0, xmm0, xmm0
31-
LONG $0x0429fcc5; BYTE $0x24 // vmovaps ymmword ptr [rsp], ymm0
32-
WORD $0xc031 // xor eax, eax
33-
LONG $0xdb57e0c5 // vxorps xmm3, xmm3, xmm3
34-
LONG $0xe457d8c5 // vxorps xmm4, xmm4, xmm4
35-
LONG $0xed57d0c5 // vxorps xmm5, xmm5, xmm5
36-
LONG $0xf657c8c5 // vxorps xmm6, xmm6, xmm6
37-
LONG $0xff57c0c5 // vxorps xmm7, xmm7, xmm7
38-
LONG $0x573841c4; BYTE $0xc0 // vxorps xmm8, xmm8, xmm8
39-
LONG $0x573041c4; BYTE $0xc9 // vxorps xmm9, xmm9, xmm9
40-
LONG $0xc957f0c5 // vxorps xmm1, xmm1, xmm1
41-
LONG $0x572041c4; BYTE $0xdb // vxorps xmm11, xmm11, xmm11
42-
LONG $0x571841c4; BYTE $0xe4 // vxorps xmm12, xmm12, xmm12
43-
LONG $0x571041c4; BYTE $0xed // vxorps xmm13, xmm13, xmm13
27+
WORD $0x8949; BYTE $0xc8 // mov r8, rcx
28+
LONG $0xf0e08349 // and r8, -16
29+
LONG $0xdb57e0c5 // vxorps xmm3, xmm3, xmm3
30+
WORD $0xc031 // xor eax, eax
31+
LONG $0xed57d0c5 // vxorps xmm5, xmm5, xmm5
32+
LONG $0xe457d8c5 // vxorps xmm4, xmm4, xmm4
33+
LONG $0xf657c8c5 // vxorps xmm6, xmm6, xmm6
34+
LONG $0xd257e8c5 // vxorps xmm2, xmm2, xmm2
35+
LONG $0xff57c0c5 // vxorps xmm7, xmm7, xmm7
4436

4537
LBB0_5:
46-
LONG $0x6c29fcc5; WORD $0x2024 // vmovaps ymmword ptr [rsp + 32], ymm5
47-
LONG $0x34107cc5; BYTE $0x87 // vmovups ymm14, ymmword ptr [rdi + 4*rax]
48-
LONG $0x7c107cc5; WORD $0x2087 // vmovups ymm15, ymmword ptr [rdi + 4*rax + 32]
49-
LONG $0x54107cc5; WORD $0x4087 // vmovups ymm10, ymmword ptr [rdi + 4*rax + 64]
50-
LONG $0x4410fcc5; WORD $0x6087 // vmovups ymm0, ymmword ptr [rdi + 4*rax + 96]
51-
LONG $0x1410fcc5; BYTE $0x86 // vmovups ymm2, ymmword ptr [rsi + 4*rax]
52-
LONG $0xec28fcc5 // vmovaps ymm5, ymm4
53-
LONG $0xe328fcc5 // vmovaps ymm4, ymm3
54-
LONG $0x1c28fcc5; BYTE $0x24 // vmovaps ymm3, ymmword ptr [rsp]
55-
LONG $0xb86dc2c4; BYTE $0xde // vfmadd231ps ymm3, ymm2, ymm14
56-
LONG $0x1c29fcc5; BYTE $0x24 // vmovaps ymmword ptr [rsp], ymm3
57-
LONG $0xdc28fcc5 // vmovaps ymm3, ymm4
58-
LONG $0xe528fcc5 // vmovaps ymm4, ymm5
59-
LONG $0x6c28fcc5; WORD $0x2024 // vmovaps ymm5, ymmword ptr [rsp + 32]
60-
LONG $0xb80dc2c4; BYTE $0xf6 // vfmadd231ps ymm6, ymm14, ymm14
61-
LONG $0x74107cc5; WORD $0x2086 // vmovups ymm14, ymmword ptr [rsi + 4*rax + 32]
62-
LONG $0xb80dc2c4; BYTE $0xdf // vfmadd231ps ymm3, ymm14, ymm15
63-
LONG $0xb805c2c4; BYTE $0xff // vfmadd231ps ymm7, ymm15, ymm15
64-
LONG $0x7c107cc5; WORD $0x4086 // vmovups ymm15, ymmword ptr [rsi + 4*rax + 64]
65-
LONG $0xb805c2c4; BYTE $0xe2 // vfmadd231ps ymm4, ymm15, ymm10
66-
LONG $0xb82d42c4; BYTE $0xc2 // vfmadd231ps ymm8, ymm10, ymm10
67-
LONG $0x54107cc5; WORD $0x6086 // vmovups ymm10, ymmword ptr [rsi + 4*rax + 96]
68-
LONG $0xb82de2c4; BYTE $0xe8 // vfmadd231ps ymm5, ymm10, ymm0
69-
LONG $0xb87d62c4; BYTE $0xc8 // vfmadd231ps ymm9, ymm0, ymm0
70-
LONG $0xb86de2c4; BYTE $0xca // vfmadd231ps ymm1, ymm2, ymm2
71-
LONG $0xb80d42c4; BYTE $0xde // vfmadd231ps ymm11, ymm14, ymm14
72-
LONG $0xb80542c4; BYTE $0xe7 // vfmadd231ps ymm12, ymm15, ymm15
73-
LONG $0xb82d42c4; BYTE $0xea // vfmadd231ps ymm13, ymm10, ymm10
74-
LONG $0x20c08348 // add rax, 32
38+
LONG $0x04107cc5; BYTE $0x87 // vmovups ymm8, ymmword ptr [rdi + 4*rax]
39+
LONG $0x4c107cc5; WORD $0x2087 // vmovups ymm9, ymmword ptr [rdi + 4*rax + 32]
40+
LONG $0x14107cc5; BYTE $0x86 // vmovups ymm10, ymmword ptr [rsi + 4*rax]
41+
LONG $0x5c107cc5; WORD $0x2086 // vmovups ymm11, ymmword ptr [rsi + 4*rax + 32]
42+
LONG $0xb82dc2c4; BYTE $0xd8 // vfmadd231ps ymm3, ymm10, ymm8
43+
LONG $0xb825c2c4; BYTE $0xe9 // vfmadd231ps ymm5, ymm11, ymm9
44+
LONG $0xb83dc2c4; BYTE $0xe0 // vfmadd231ps ymm4, ymm8, ymm8
45+
LONG $0xb835c2c4; BYTE $0xf1 // vfmadd231ps ymm6, ymm9, ymm9
46+
LONG $0xb82dc2c4; BYTE $0xd2 // vfmadd231ps ymm2, ymm10, ymm10
47+
LONG $0xb825c2c4; BYTE $0xfb // vfmadd231ps ymm7, ymm11, ymm11
48+
LONG $0x10c08348 // add rax, 16
7549
WORD $0x3949; BYTE $0xc0 // cmp r8, rax
7650
JNE LBB0_5
77-
LONG $0xc158a4c5 // vaddps ymm0, ymm11, ymm1
78-
LONG $0xc0589cc5 // vaddps ymm0, ymm12, ymm0
79-
LONG $0xc05894c5 // vaddps ymm0, ymm13, ymm0
80-
LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1
81-
LONG $0xc158f8c5 // vaddps xmm0, xmm0, xmm1
82-
LONG $0x0579e3c4; WORD $0x01c8 // vpermilpd xmm1, xmm0, 1
83-
LONG $0xc158f8c5 // vaddps xmm0, xmm0, xmm1
84-
LONG $0xc816fac5 // vmovshdup xmm1, xmm0
85-
LONG $0xc958fac5 // vaddss xmm1, xmm0, xmm1
86-
LONG $0xc658c4c5 // vaddps ymm0, ymm7, ymm6
87-
LONG $0xc058bcc5 // vaddps ymm0, ymm8, ymm0
88-
LONG $0xc058b4c5 // vaddps ymm0, ymm9, ymm0
89-
LONG $0x197de3c4; WORD $0x01c2 // vextractf128 xmm2, ymm0, 1
90-
LONG $0xc258f8c5 // vaddps xmm0, xmm0, xmm2
91-
LONG $0x0579e3c4; WORD $0x01d0 // vpermilpd xmm2, xmm0, 1
92-
LONG $0xc258f8c5 // vaddps xmm0, xmm0, xmm2
93-
LONG $0xd016fac5 // vmovshdup xmm2, xmm0
94-
LONG $0xf258fac5 // vaddss xmm6, xmm0, xmm2
95-
LONG $0x0458e4c5; BYTE $0x24 // vaddps ymm0, ymm3, ymmword ptr [rsp]
96-
LONG $0xc058dcc5 // vaddps ymm0, ymm4, ymm0
97-
LONG $0xc058d4c5 // vaddps ymm0, ymm5, ymm0
98-
LONG $0x197de3c4; WORD $0x01c2 // vextractf128 xmm2, ymm0, 1
99-
LONG $0xc258f8c5 // vaddps xmm0, xmm0, xmm2
100-
LONG $0x0579e3c4; WORD $0x01d0 // vpermilpd xmm2, xmm0, 1
101-
LONG $0xc258f8c5 // vaddps xmm0, xmm0, xmm2
102-
LONG $0xd016fac5 // vmovshdup xmm2, xmm0
103-
LONG $0xd258fac5 // vaddss xmm2, xmm0, xmm2
51+
LONG $0xd258c4c5 // vaddps ymm2, ymm7, ymm2
52+
LONG $0x197de3c4; WORD $0x01d7 // vextractf128 xmm7, ymm2, 1
53+
LONG $0xd758e8c5 // vaddps xmm2, xmm2, xmm7
54+
LONG $0x0579e3c4; WORD $0x01fa // vpermilpd xmm7, xmm2, 1
55+
LONG $0xd758e8c5 // vaddps xmm2, xmm2, xmm7
56+
LONG $0xfa16fac5 // vmovshdup xmm7, xmm2
57+
LONG $0xd758eac5 // vaddss xmm2, xmm2, xmm7
58+
LONG $0xe458ccc5 // vaddps ymm4, ymm6, ymm4
59+
LONG $0x197de3c4; WORD $0x01e6 // vextractf128 xmm6, ymm4, 1
60+
LONG $0xe658d8c5 // vaddps xmm4, xmm4, xmm6
61+
LONG $0x0579e3c4; WORD $0x01f4 // vpermilpd xmm6, xmm4, 1
62+
LONG $0xe658d8c5 // vaddps xmm4, xmm4, xmm6
63+
LONG $0xf416fac5 // vmovshdup xmm6, xmm4
64+
LONG $0xe658dac5 // vaddss xmm4, xmm4, xmm6
65+
LONG $0xdb58d4c5 // vaddps ymm3, ymm5, ymm3
66+
LONG $0x197de3c4; WORD $0x01dd // vextractf128 xmm5, ymm3, 1
67+
LONG $0xdd58e0c5 // vaddps xmm3, xmm3, xmm5
68+
LONG $0x0579e3c4; WORD $0x01eb // vpermilpd xmm5, xmm3, 1
69+
LONG $0xdd58e0c5 // vaddps xmm3, xmm3, xmm5
70+
LONG $0xeb16fac5 // vmovshdup xmm5, xmm3
71+
LONG $0xdd58e2c5 // vaddss xmm3, xmm3, xmm5
10472
WORD $0x3949; BYTE $0xc8 // cmp r8, rcx
105-
LONG $0xe457d8c5 // vxorps xmm4, xmm4, xmm4
106-
LONG $0xed57d0c5 // vxorps xmm5, xmm5, xmm5
10773
JE LBB0_7
10874

10975
LBB0_3:
110-
LONG $0x107aa1c4; WORD $0x8704 // vmovss xmm0, dword ptr [rdi + 4*r8]
111-
LONG $0x107aa1c4; WORD $0x861c // vmovss xmm3, dword ptr [rsi + 4*r8]
112-
LONG $0xb961e2c4; BYTE $0xd0 // vfmadd231ss xmm2, xmm3, xmm0
113-
LONG $0xb979e2c4; BYTE $0xf0 // vfmadd231ss xmm6, xmm0, xmm0
114-
LONG $0xb961e2c4; BYTE $0xcb // vfmadd231ss xmm1, xmm3, xmm3
76+
LONG $0x107aa1c4; WORD $0x872c // vmovss xmm5, dword ptr [rdi + 4*r8]
77+
LONG $0x107aa1c4; WORD $0x8634 // vmovss xmm6, dword ptr [rsi + 4*r8]
78+
LONG $0xb949e2c4; BYTE $0xdd // vfmadd231ss xmm3, xmm6, xmm5
79+
LONG $0xb951e2c4; BYTE $0xe5 // vfmadd231ss xmm4, xmm5, xmm5
80+
LONG $0xb949e2c4; BYTE $0xd6 // vfmadd231ss xmm2, xmm6, xmm6
11581
WORD $0xff49; BYTE $0xc0 // inc r8
11682
WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
11783
JNE LBB0_3
11884

11985
LBB0_7:
120-
LONG $0xd959cac5 // vmulss xmm3, xmm6, xmm1
121-
LONG $0xca5aeac5 // vcvtss2sd xmm1, xmm2, xmm2
86+
LONG $0xe259dac5 // vmulss xmm4, xmm4, xmm2
87+
LONG $0xd35ae2c5 // vcvtss2sd xmm2, xmm3, xmm3
12288

12389
LBB0_8:
124-
LONG $0xd351e2c5 // vsqrtss xmm2, xmm3, xmm3
125-
LONG $0xd52ef8c5 // vucomiss xmm2, xmm5
90+
LONG $0xdc51dac5 // vsqrtss xmm3, xmm4, xmm4
91+
LONG $0xd92ef8c5 // vucomiss xmm3, xmm1
12692
JNE LBB0_9
127-
LONG $0x2211fbc5 // vmovsd qword ptr [rdx], xmm4
93+
LONG $0x0211fbc5 // vmovsd qword ptr [rdx], xmm0
12894
WORD $0x8948; BYTE $0xec // mov rsp, rbp
12995
BYTE $0x5d // pop rbp
13096
WORD $0xf8c5; BYTE $0x77 // vzeroupper
13197
BYTE $0xc3 // ret
13298

13399
LBB0_9:
134-
LONG $0xc25aeac5 // vcvtss2sd xmm0, xmm2, xmm2
135-
LONG $0xe05ef3c5 // vdivsd xmm4, xmm1, xmm0
136-
LONG $0x2211fbc5 // vmovsd qword ptr [rdx], xmm4
100+
LONG $0xc35ae2c5 // vcvtss2sd xmm0, xmm3, xmm3
101+
LONG $0xc05eebc5 // vdivsd xmm0, xmm2, xmm0
102+
LONG $0x0211fbc5 // vmovsd qword ptr [rdx], xmm0
137103
WORD $0x8948; BYTE $0xec // mov rsp, rbp
138104
BYTE $0x5d // pop rbp
139105
WORD $0xf8c5; BYTE $0x77 // vzeroupper

internal/cosine/simd/simd_test.go

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9,8 +9,8 @@ import (
99

1010
/*
1111
cpu: 13th Gen Intel(R) Core(TM) i7-13700K
12-
BenchmarkCosine/std-24 15045380 80.61 ns/op 0 B/op 0 allocs/op
13-
BenchmarkCosine/our-24 55741100 20.85 ns/op 0 B/op 0 allocs/op
12+
BenchmarkCosine/std-24 14911036 80.46 ns/op 0 B/op 0 allocs/op
13+
BenchmarkCosine/our-24 61780514 18.11 ns/op 0 B/op 0 allocs/op
1414
*/
1515
func BenchmarkCosine(b *testing.B) {
1616
x := randVec()

0 commit comments

Comments
 (0)