Skip to content

Commit 6a988f6

Browse files
Ola Liljedahlnsz-arm
authored andcommitted
networking: New subproject.
Add scalar and NEON ones' complement checksumming implementations for AArch64 and Armv7-A.
1 parent 6f41cff commit 6a988f6

File tree

10 files changed

+991
-2
lines changed

10 files changed

+991
-2
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ libdir = $(prefix)/lib
1010
includedir = $(prefix)/include
1111

1212
# Configure these in config.mk, do not make changes in this file.
13-
SUBS = math string
13+
SUBS = math string networking
1414
HOST_CC = cc
1515
HOST_CFLAGS = -std=c99 -O2
1616
HOST_LDFLAGS =

README

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ math/ - math subproject sources.
1717
math/include/ - math library public headers.
1818
math/test/ - math test and benchmark related sources.
1919
math/tools/ - tools used for designing the algorithms.
20+
networking/ - networking subproject sources.
21+
networking/include/ - networking library public headers.
22+
networking/test/ - networking test and benchmark related sources.
2023
string/ - string routines subproject sources.
2124
string/include/ - string library public headers.
2225
string/test/ - string test and benchmark related sources.

config.mk.dist

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
# SPDX-License-Identifier: MIT
55

66
# Subprojects to build
7-
SUBS = math string
7+
SUBS = math string networking
88

99
# Target architecture: aarch64, arm or x86_64
1010
ARCH = aarch64
@@ -40,6 +40,7 @@ math-ldlibs =
4040
math-ulpflags =
4141
math-testflags =
4242
string-cflags =
43+
networking-cflags =
4344

4445
# Use if mpfr is available on the target for ulp error checking.
4546
#math-ldlibs += -lmpfr -lgmp
@@ -58,3 +59,9 @@ math-cflags += -ffp-contract=fast -fno-math-errno
5859
# Disable fenv checks
5960
#math-ulpflags = -q -f
6061
#math-testflags = -nostatus
62+
63+
# Enable assertion checks.
64+
#networking-cflags += -DWANT_ASSERT
65+
66+
# Avoid auto-vectorization of scalar code and unroll loops
67+
networking-cflags += -O2 -fno-tree-vectorize -funroll-loops

networking/Dir.mk

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# Makefile fragment - requires GNU make
2+
#
3+
# Copyright (c) 2019-2020, Arm Limited.
4+
# SPDX-License-Identifier: MIT
5+
6+
S := $(srcdir)/networking
7+
B := build/networking
8+
9+
ifeq ($(ARCH),)
10+
all-networking check-networking install-networking clean-networking:
11+
@echo "*** Please set ARCH in config.mk. ***"
12+
@exit 1
13+
else
14+
15+
networking-lib-srcs := $(wildcard $(S)/*.[cS]) $(wildcard $(S)/$(ARCH)/*.[cS])
16+
networking-test-srcs := $(wildcard $(S)/test/*.c)
17+
18+
networking-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h))
19+
20+
networking-libs := \
21+
build/lib/libnetworking.so \
22+
build/lib/libnetworking.a \
23+
24+
networking-tools := \
25+
build/bin/test/chksum
26+
27+
networking-lib-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(networking-lib-srcs)))
28+
networking-test-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(networking-test-srcs)))
29+
30+
networking-objs := \
31+
$(networking-lib-objs) \
32+
$(networking-lib-objs:%.o=%.os) \
33+
$(networking-test-objs) \
34+
35+
networking-files := \
36+
$(networking-objs) \
37+
$(networking-libs) \
38+
$(networking-tools) \
39+
$(networking-includes) \
40+
41+
all-networking: $(networking-libs) $(networking-tools) $(networking-includes)
42+
43+
$(networking-objs): $(networking-includes)
44+
$(networking-objs): CFLAGS_ALL += $(networking-cflags)
45+
46+
build/lib/libnetworking.so: $(networking-lib-objs:%.o=%.os)
47+
$(CC) $(CFLAGS_ALL) $(LDFLAGS) -shared -o $@ $^
48+
49+
build/lib/libnetworkinglib.a: $(networking-lib-objs)
50+
rm -f $@
51+
$(AR) rc $@ $^
52+
$(RANLIB) $@
53+
54+
build/bin/test/%: $(B)/test/%.o build/lib/libnetworkinglib.a
55+
$(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
56+
57+
build/include/%.h: $(S)/include/%.h
58+
cp $< $@
59+
60+
build/bin/%.sh: $(S)/test/%.sh
61+
cp $< $@
62+
63+
check-networking: $(networking-tools)
64+
$(EMULATOR) build/bin/test/chksum -i simple
65+
$(EMULATOR) build/bin/test/chksum -i scalar
66+
$(EMULATOR) build/bin/test/chksum -i simd || true # simd is not always available
67+
68+
install-networking: \
69+
$(networking-libs:build/lib/%=$(DESTDIR)$(libdir)/%) \
70+
$(networking-includes:build/include/%=$(DESTDIR)$(includedir)/%)
71+
72+
clean-networking:
73+
rm -f $(networking-files)
74+
endif
75+
76+
.PHONY: all-networking check-networking install-networking clean-networking

networking/aarch64/chksum_simd.c

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
/*
2+
* AArch64-specific checksum implementation using NEON
3+
*
4+
* Copyright (c) 2020, Arm Limited.
5+
* SPDX-License-Identifier: MIT
6+
*/
7+
8+
#include "networking.h"
9+
#include "../chksum_common.h"
10+
11+
#ifndef __ARM_NEON
12+
#pragma GCC target("+simd")
13+
#endif
14+
15+
#include <arm_neon.h>
16+
17+
always_inline
18+
static inline uint64_t
19+
slurp_head64(const void **pptr, uint32_t *nbytes)
20+
{
21+
Assert(*nbytes >= 8);
22+
uint64_t sum = 0;
23+
uint32_t off = (uintptr_t) *pptr % 8;
24+
if (likely(off != 0))
25+
{
26+
/* Get rid of bytes 0..off-1 */
27+
const unsigned char *ptr64 = align_ptr(*pptr, 8);
28+
uint64_t mask = ALL_ONES << (CHAR_BIT * off);
29+
uint64_t val = load64(ptr64) & mask;
30+
/* Fold 64-bit sum to 33 bits */
31+
sum = val >> 32;
32+
sum += (uint32_t) val;
33+
*pptr = ptr64 + 8;
34+
*nbytes -= 8 - off;
35+
}
36+
return sum;
37+
}
38+
39+
always_inline
40+
static inline uint64_t
41+
slurp_tail64(uint64_t sum, const void *ptr, uint32_t nbytes)
42+
{
43+
Assert(nbytes < 8);
44+
if (likely(nbytes != 0))
45+
{
46+
/* Get rid of bytes 7..nbytes */
47+
uint64_t mask = ALL_ONES >> (CHAR_BIT * (8 - nbytes));
48+
Assert(__builtin_popcountl(mask) / CHAR_BIT == nbytes);
49+
uint64_t val = load64(ptr) & mask;
50+
sum += val >> 32;
51+
sum += (uint32_t) val;
52+
nbytes = 0;
53+
}
54+
Assert(nbytes == 0);
55+
return sum;
56+
}
57+
58+
unsigned short
59+
__chksum_aarch64_simd(const void *ptr, unsigned int nbytes)
60+
{
61+
bool swap = (uintptr_t) ptr & 1;
62+
uint64_t sum;
63+
64+
if (unlikely(nbytes < 50))
65+
{
66+
sum = slurp_small(ptr, nbytes);
67+
swap = false;
68+
goto fold;
69+
}
70+
71+
/* 8-byte align pointer */
72+
Assert(nbytes >= 8);
73+
sum = slurp_head64(&ptr, &nbytes);
74+
Assert(((uintptr_t) ptr & 7) == 0);
75+
76+
const uint32_t *may_alias ptr32 = ptr;
77+
78+
uint64x2_t vsum0 = { 0, 0 };
79+
uint64x2_t vsum1 = { 0, 0 };
80+
uint64x2_t vsum2 = { 0, 0 };
81+
uint64x2_t vsum3 = { 0, 0 };
82+
83+
/* Sum groups of 64 bytes */
84+
for (uint32_t i = 0; i < nbytes / 64; i++)
85+
{
86+
uint32x4_t vtmp0 = vld1q_u32(ptr32);
87+
uint32x4_t vtmp1 = vld1q_u32(ptr32 + 4);
88+
uint32x4_t vtmp2 = vld1q_u32(ptr32 + 8);
89+
uint32x4_t vtmp3 = vld1q_u32(ptr32 + 12);
90+
vsum0 = vpadalq_u32(vsum0, vtmp0);
91+
vsum1 = vpadalq_u32(vsum1, vtmp1);
92+
vsum2 = vpadalq_u32(vsum2, vtmp2);
93+
vsum3 = vpadalq_u32(vsum3, vtmp3);
94+
ptr32 += 16;
95+
}
96+
nbytes %= 64;
97+
98+
/* Fold vsum2 and vsum3 into vsum0 and vsum1 */
99+
vsum0 = vpadalq_u32(vsum0, vreinterpretq_u32_u64(vsum2));
100+
vsum1 = vpadalq_u32(vsum1, vreinterpretq_u32_u64(vsum3));
101+
102+
/* Add any trailing group of 32 bytes */
103+
if (nbytes & 32)
104+
{
105+
uint32x4_t vtmp0 = vld1q_u32(ptr32);
106+
uint32x4_t vtmp1 = vld1q_u32(ptr32 + 4);
107+
vsum0 = vpadalq_u32(vsum0, vtmp0);
108+
vsum1 = vpadalq_u32(vsum1, vtmp1);
109+
ptr32 += 8;
110+
nbytes -= 32;
111+
}
112+
Assert(nbytes < 32);
113+
114+
/* Fold vsum1 into vsum0 */
115+
vsum0 = vpadalq_u32(vsum0, vreinterpretq_u32_u64(vsum1));
116+
117+
/* Add any trailing group of 16 bytes */
118+
if (nbytes & 16)
119+
{
120+
uint32x4_t vtmp = vld1q_u32(ptr32);
121+
vsum0 = vpadalq_u32(vsum0, vtmp);
122+
ptr32 += 4;
123+
nbytes -= 16;
124+
}
125+
Assert(nbytes < 16);
126+
127+
/* Add any trailing group of 8 bytes */
128+
if (nbytes & 8)
129+
{
130+
uint32x2_t vtmp = vld1_u32(ptr32);
131+
vsum0 = vaddw_u32(vsum0, vtmp);
132+
ptr32 += 2;
133+
nbytes -= 8;
134+
}
135+
Assert(nbytes < 8);
136+
137+
uint64_t val = vaddlvq_u32(vreinterpretq_u32_u64(vsum0));
138+
sum += val >> 32;
139+
sum += (uint32_t) val;
140+
141+
/* Handle any trailing 0..7 bytes */
142+
sum = slurp_tail64(sum, ptr32, nbytes);
143+
144+
fold:
145+
return fold_and_swap(sum, swap);
146+
}

0 commit comments

Comments
 (0)