Skip to content

Commit e10454f

Browse files
committed
chore: better cpu os detection, parallel hashing on scalar
1 parent 6c29ab6 commit e10454f

File tree

13 files changed

+103
-90
lines changed

13 files changed

+103
-90
lines changed

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ require (
100100
github.com/ipfs/go-log/v2 v2.6.0 // indirect
101101
github.com/jackpal/go-nat-pmp v1.0.2 // indirect
102102
github.com/jbenet/go-temp-err-catcher v0.1.0 // indirect
103-
github.com/klauspost/cpuid/v2 v2.2.10 // indirect
103+
github.com/klauspost/cpuid/v2 v2.3.0
104104
github.com/koron/go-ssdp v0.0.6 // indirect
105105
github.com/leodido/go-urn v1.4.0 // indirect
106106
github.com/libdns/libdns v0.2.2 // indirect

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -527,8 +527,8 @@ github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYW
527527
github.com/klauspost/cpuid v0.0.0-20170728055534-ae7887de9fa5/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
528528
github.com/klauspost/cpuid/v2 v2.0.4/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
529529
github.com/klauspost/cpuid/v2 v2.0.6/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
530-
github.com/klauspost/cpuid/v2 v2.2.10 h1:tBs3QSyvjDyFTq3uoc/9xFpCuOsJQFNPiAhYdw2skhE=
531-
github.com/klauspost/cpuid/v2 v2.2.10/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0=
530+
github.com/klauspost/cpuid/v2 v2.3.0 h1:S4CRMLnYUhGeDFDqkGriYKdfoFlDnMtqTiI/sFzhA9Y=
531+
github.com/klauspost/cpuid/v2 v2.3.0/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0=
532532
github.com/klauspost/crc32 v0.0.0-20161016154125-cb6bfca970f6/go.mod h1:+ZoRqAPRLkC4NPOvfYeR5KNOrY6TD+/sAC3HXPZgDYg=
533533
github.com/klauspost/pgzip v1.0.2-0.20170402124221-0bf5dcad4ada/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs=
534534
github.com/klauspost/reedsolomon v1.11.8 h1:s8RpUW5TK4hjr+djiOpbZJB4ksx+TdYbRH7vHQpwPOY=

pkg/bmt/bmt_simd.go

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,12 @@ func (h *Hasher) hashLeavesBatch(start, end, bw, secsize, prefixLen int) {
5454
for j := batch; j < 8; j++ {
5555
inputs[j] = nil
5656
}
57-
outputs := keccak.Sum256x8(inputs)
57+
var outputs [8]keccak.Hash256
58+
if h.useSIMD {
59+
outputs = keccak.Sum256x8(inputs)
60+
} else {
61+
outputs = keccak.Sum256x8Scalar(inputs)
62+
}
5863
for j := 0; j < batch; j++ {
5964
leaf := h.bmt.levels[0][i+j]
6065
if leaf.isLeft {
@@ -83,7 +88,12 @@ func (h *Hasher) hashLeavesBatch(start, end, bw, secsize, prefixLen int) {
8388
for j := batch; j < 4; j++ {
8489
inputs[j] = nil
8590
}
86-
outputs := keccak.Sum256x4(inputs)
91+
var outputs [4]keccak.Hash256
92+
if h.useSIMD {
93+
outputs = keccak.Sum256x4(inputs)
94+
} else {
95+
outputs = keccak.Sum256x4Scalar(inputs)
96+
}
8797
for j := 0; j < batch; j++ {
8898
leaf := h.bmt.levels[0][i+j]
8999
if leaf.isLeft {
@@ -119,7 +129,12 @@ func (h *Hasher) hashNodesBatch(nodes []*node, bw, prefixLen int) {
119129
for j := batch; j < 8; j++ {
120130
inputs[j] = nil
121131
}
122-
outputs := keccak.Sum256x8(inputs)
132+
var outputs [8]keccak.Hash256
133+
if h.useSIMD {
134+
outputs = keccak.Sum256x8(inputs)
135+
} else {
136+
outputs = keccak.Sum256x8Scalar(inputs)
137+
}
123138
for j := 0; j < batch; j++ {
124139
n := nodes[i+j]
125140
if n.isLeft {
@@ -145,7 +160,12 @@ func (h *Hasher) hashNodesBatch(nodes []*node, bw, prefixLen int) {
145160
for j := batch; j < 4; j++ {
146161
inputs[j] = nil
147162
}
148-
outputs := keccak.Sum256x4(inputs)
163+
var outputs [4]keccak.Hash256
164+
if h.useSIMD {
165+
outputs = keccak.Sum256x4(inputs)
166+
} else {
167+
outputs = keccak.Sum256x4Scalar(inputs)
168+
}
149169
for j := 0; j < batch; j++ {
150170
n := nodes[i+j]
151171
if n.isLeft {

pkg/bmt/export_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,6 @@ var Sha3hash = sha3hash
1010
func NewConfNoSIMD(segmentCount, capacity int) *Conf {
1111
c := NewConf(segmentCount, capacity)
1212
c.useSIMD = false
13-
c.batchWidth = 4 // use 4-wide batching with scalar fallback
13+
c.batchWidth = 8 // use 8-wide batching with scalar fallback
1414
return c
1515
}

pkg/bmt/pool.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ func newConf(prefix []byte, segmentCount, capacity int) *Conf {
6666

6767
bw := keccak.BatchWidth()
6868
if bw == 0 {
69-
bw = 4 // use 4-wide batching with scalar fallback
69+
bw = 8 // use 8-wide batching with scalar fallback
7070
}
7171
c.batchWidth = bw
7272

pkg/keccak/cpuid_amd64.s

Lines changed: 0 additions & 22 deletions
This file was deleted.

pkg/keccak/keccak.go

Lines changed: 42 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ package keccak
1111

1212
import (
1313
"encoding/hex"
14+
"sync"
1415

1516
"golang.org/x/crypto/sha3"
1617
)
@@ -66,6 +67,12 @@ func Sum256x4(inputs [4][]byte) [4]Hash256 {
6667
return outputs
6768
}
6869

70+
// Sum256x4Scalar computes 4 Keccak-256 hashes using the scalar path,
71+
// bypassing SIMD detection. Used by callers that explicitly want non-SIMD.
72+
func Sum256x4Scalar(inputs [4][]byte) [4]Hash256 {
73+
return sum256x4Scalar(inputs)
74+
}
75+
6976
// Sum256x8 computes 8 Keccak-256 hashes in parallel using AVX-512.
7077
// Falls back to scalar if AVX-512 is not available.
7178
func Sum256x8(inputs [8][]byte) [8]Hash256 {
@@ -79,6 +86,12 @@ func Sum256x8(inputs [8][]byte) [8]Hash256 {
7986
return outputs
8087
}
8188

89+
// Sum256x8Scalar computes 8 Keccak-256 hashes using the scalar path,
90+
// bypassing SIMD detection. Used by callers that explicitly want non-SIMD.
91+
func Sum256x8Scalar(inputs [8][]byte) [8]Hash256 {
92+
return sum256x8Scalar(inputs)
93+
}
94+
8295
func sum256Scalar(data []byte) Hash256 {
8396
var out Hash256
8497
h := sha3.NewLegacyKeccak256()
@@ -89,26 +102,49 @@ func sum256Scalar(data []byte) Hash256 {
89102

90103
func sum256x4Scalar(inputs [4][]byte) [4]Hash256 {
91104
var outputs [4]Hash256
105+
var wg sync.WaitGroup
106+
var mu sync.Mutex
92107
for i := 0; i < 4; i++ {
93108
if inputs[i] == nil {
94109
continue
95110
}
96-
h := sha3.NewLegacyKeccak256()
97-
h.Write(inputs[i])
98-
copy(outputs[i][:], h.Sum(nil))
111+
wg.Add(1)
112+
113+
panic(2)
114+
go func() {
115+
defer wg.Done()
116+
h := sha3.NewLegacyKeccak256()
117+
h.Write(inputs[i])
118+
result := h.Sum(nil)
119+
mu.Lock()
120+
copy(outputs[i][:], result)
121+
mu.Unlock()
122+
}()
99123
}
124+
wg.Wait()
100125
return outputs
101126
}
102127

103128
func sum256x8Scalar(inputs [8][]byte) [8]Hash256 {
104129
var outputs [8]Hash256
130+
var wg sync.WaitGroup
131+
var mu sync.Mutex
132+
105133
for i := 0; i < 8; i++ {
106134
if inputs[i] == nil {
107135
continue
108136
}
109-
h := sha3.NewLegacyKeccak256()
110-
h.Write(inputs[i])
111-
copy(outputs[i][:], h.Sum(nil))
137+
wg.Add(1)
138+
go func() {
139+
defer wg.Done()
140+
h := sha3.NewLegacyKeccak256()
141+
h.Write(inputs[i])
142+
result := h.Sum(nil)
143+
mu.Lock()
144+
copy(outputs[i][:], result)
145+
mu.Unlock()
146+
}()
112147
}
148+
wg.Wait()
113149
return outputs
114150
}

pkg/keccak/keccak_amd64.go

Lines changed: 1 addition & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,55 +1,9 @@
1-
//go:build amd64 && !purego && !windows
1+
//go:build linux && amd64 && !purego
22

33
package keccak
44

5-
var (
6-
hasAVX512 = detectAVX512()
7-
hasAVX2 = detectAVX2()
8-
)
9-
105
//go:noescape
116
func keccak256x4(inputs *[4][]byte, outputs *[4]Hash256)
127

138
//go:noescape
149
func keccak256x8(inputs *[8][]byte, outputs *[8]Hash256)
15-
16-
func osSupportsAVX() bool {
17-
// Check OSXSAVE: leaf 1, ECX bit 27
18-
_, _, ecx, _ := cpuid(1, 0) //nolint:errcheck,dogsled
19-
if ecx&(1<<27) == 0 {
20-
return false
21-
}
22-
// XCR0 bits 1,2 = SSE, AVX state saving enabled by OS
23-
eax, _ := xgetbv(0)
24-
return eax&0x6 == 0x6
25-
}
26-
27-
func osSupportsAVX512() bool {
28-
if !osSupportsAVX() {
29-
return false
30-
}
31-
// XCR0 bits 5,6,7 = opmask, ZMM_Hi256, Hi16_ZMM state saving
32-
eax, _ := xgetbv(0)
33-
return eax&0xE0 == 0xE0
34-
}
35-
36-
func detectAVX512() bool {
37-
if !osSupportsAVX512() {
38-
return false
39-
}
40-
// CPUID leaf 7, sub-leaf 0: EBX bit 16 = AVX-512F, bit 31 = AVX-512VL
41-
_, ebx, _, _ := cpuid(7, 0) //nolint:errcheck,dogsled
42-
return (ebx&(1<<16)) != 0 && (ebx&(1<<31)) != 0
43-
}
44-
45-
func detectAVX2() bool {
46-
if !osSupportsAVX() {
47-
return false
48-
}
49-
// CPUID leaf 7, sub-leaf 0: EBX bit 5 = AVX2
50-
_, ebx, _, _ := cpuid(7, 0) //nolint:errcheck,dogsled
51-
return (ebx & (1 << 5)) != 0
52-
}
53-
54-
func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)
55-
func xgetbv(index uint32) (eax, edx uint32)

pkg/keccak/keccak_cpu.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
// Copyright 2024 The Swarm Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
//go:build linux && amd64 && !purego
6+
7+
package keccak
8+
9+
import cpuid "github.com/klauspost/cpuid/v2"
10+
11+
var (
12+
hasAVX2 = cpuid.CPU.Supports(cpuid.AVX2)
13+
hasAVX512 = cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512VL)
14+
)

pkg/keccak/keccak_cpu_other.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
// Copyright 2024 The Swarm Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
//go:build !linux || !amd64 || purego
6+
7+
package keccak
8+
9+
// No SIMD Keccak implementations are available on this platform;
10+
// Sum256x4/Sum256x8 will fall back to the scalar goroutine path.
11+
var (
12+
hasAVX2 = false
13+
hasAVX512 = false
14+
)

0 commit comments

Comments
 (0)