Skip to content

Commit 5671983

Browse files
kmvijay authored and rickystewart committed
runtime: improvement in memmove for s390x
The Memmove/Memcpy routine for the s390x architecture is optimized by using better instructions for particular sizes.

goos: linux
goarch: s390x
pkg: runtime
│ Orig_Memmove_for_benchstat.log │ New_backwards_and_forwards_Memmove_for_benchstat.log │
│ sec/op │ sec/op vs base │
Memmove/0 2.138n ± 4% 2.020n ± 0% -5.52% (p=0.000 n=10)
Memmove/1 2.076n ± 1% 2.079n ± 1% ~ (p=0.592 n=10)
Memmove/2 2.115n ± 3% 2.123n ± 1% ~ (p=0.325 n=10)
Memmove/3 2.414n ± 0% 2.401n ± 1% ~ (p=0.158 n=10)
Memmove/4 2.116n ± 0% 2.127n ± 18% +0.52% (p=0.000 n=10)
Memmove/5 2.419n ± 0% 2.414n ± 1% -0.21% (p=0.034 n=10)
Memmove/6 2.351n ± 1% 2.337n ± 1% ~ (p=0.137 n=10)
Memmove/7 2.833n ± 4% 2.764n ± 1% -2.42% (p=0.003 n=10)
Memmove/8 2.010n ± 0% 2.001n ± 1% ~ (p=0.100 n=10)
Memmove/9 2.380n ± 0% 2.388n ± 0% +0.34% (p=0.000 n=10)
Memmove/10 2.617n ± 1% 2.612n ± 1% ~ (p=0.564 n=10)
Memmove/11 2.827n ± 3% 2.792n ± 0% -1.24% (p=0.001 n=10)
Memmove/12 2.397n ± 0% 2.400n ± 1% ~ (p=0.781 n=10)
Memmove/13 2.705n ± 0% 2.704n ± 0% ~ (p=0.136 n=10)
Memmove/14 3.000n ± 0% 2.991n ± 1% -0.28% (p=0.035 n=10)
Memmove/15 3.118n ± 0% 3.142n ± 1% ~ (p=0.210 n=10)
Memmove/16 2.390n ± 0% 2.391n ± 0% ~ (p=0.611 n=10)
Memmove/32 5.128n ± 6% 4.287n ± 0% -16.41% (p=0.000 n=10)
Memmove/64 8.425n ± 3% 4.100n ± 0% -51.33% (p=0.000 n=10)
Memmove/128 5.197n ± 0% 4.857n ± 0% -6.53% (p=0.001 n=10)
Memmove/256 5.774n ± 0% 5.920n ± 2% +2.52% (p=0.000 n=10)
Memmove/512 10.79n ± 0% 10.70n ± 1% -0.88% (p=0.000 n=10)
Memmove/1024 19.86n ± 0% 19.79n ± 0% -0.30% (p=0.000 n=10)
Memmove/2048 36.38n ± 0% 35.94n ± 0% -1.20% (p=0.000 n=10)
Memmove/4096 72.12n ± 3% 69.65n ± 0% -3.42% (p=0.000 n=10)
MemmoveOverlap/32 4.481n ± 1% 4.430n ± 0% -1.13% (p=0.002 n=10)
MemmoveOverlap/64 5.932n ± 1% 5.886n ± 0% -0.78% (p=0.000 n=10)
MemmoveOverlap/128 7.950n ± 10% 6.591n ± 0% -17.10% (p=0.000 n=10)
MemmoveOverlap/256 11.24n ± 1% 10.20n ± 0% -9.25% (p=0.000 n=10)
MemmoveOverlap/512 18.49n ± 0% 16.80n ± 2% -9.11% (p=0.000 n=10)
MemmoveOverlap/1024 32.64n ± 0% 28.68n ± 0% -12.13% (p=0.000 n=10)
MemmoveOverlap/2048 62.79n ± 0% 56.34n ± 1% -10.28% (p=0.000 n=10)
MemmoveOverlap/4096 123.2n ± 0% 105.8n ± 0% -14.12% (p=0.000 n=10)
MemmoveUnalignedDst/0 2.571n ± 10% 2.225n ± 2% -13.46% (p=0.000 n=10)
MemmoveUnalignedDst/1 2.323n ± 0% 2.322n ± 0% -0.04% (p=0.000 n=10)
MemmoveUnalignedDst/2 2.370n ± 0% 2.373n ± 0% +0.13% (p=0.020 n=10)
MemmoveUnalignedDst/3 2.701n ± 0% 2.700n ± 0% -0.04% (p=0.006 n=10)
MemmoveUnalignedDst/4 2.350n ± 4% 2.318n ± 2% ~ (p=0.076 n=10)
MemmoveUnalignedDst/5 2.654n ± 6% 2.673n ± 3% ~ (p=0.738 n=10)
MemmoveUnalignedDst/6 2.764n ± 1% 2.744n ± 0% -0.71% (p=0.000 n=10)
MemmoveUnalignedDst/7 3.118n ± 5% 3.107n ± 7% ~ (p=0.971 n=10)
MemmoveUnalignedDst/8 2.399n ± 0% 2.399n ± 0% 0.00% (p=0.037 n=10)
MemmoveUnalignedDst/9 2.704n ± 0% 2.861n ± 4% +5.79% (p=0.000 n=10)
MemmoveUnalignedDst/10 2.972n ± 5% 2.976n ± 6% ~ (p=0.778 n=10)
MemmoveUnalignedDst/11 3.156n ± 4% 3.118n ± 0% -1.22% (p=0.001 n=10)
MemmoveUnalignedDst/12 2.704n ± 0% 2.703n ± 0% -0.04% (p=0.000 n=10)
MemmoveUnalignedDst/13 3.101n ± 0% 3.102n ± 0% ~ (p=0.690 n=10)
MemmoveUnalignedDst/14 3.188n ± 0% 3.186n ± 0% -0.05% (p=0.000 n=10)
MemmoveUnalignedDst/15 3.613n ± 3% 3.547n ± 2% ~ (p=0.085 n=10)
MemmoveUnalignedDst/16 2.772n ± 0% 2.771n ± 0% -0.04% (p=0.001 n=10)
MemmoveUnalignedDst/32 5.401n ± 10% 4.426n ± 0% -18.06% (p=0.000 n=10)
MemmoveUnalignedDst/64 9.037n ± 11% 4.381n ± 0% -51.51% (p=0.000 n=10)
MemmoveUnalignedDst/128 5.239n ± 0% 5.580n ± 0% +6.52% (p=0.000 n=10)
MemmoveUnalignedDst/256 6.563n ± 1% 6.372n ± 3% -2.90% (p=0.004 n=10)
MemmoveUnalignedDst/512 10.99n ± 1% 10.32n ± 4% -6.10% (p=0.000 n=10)
MemmoveUnalignedDst/1024 20.55n ± 2% 20.18n ± 0% ~ (p=0.125 n=10)
MemmoveUnalignedDst/2048 37.62n ± 1% 36.52n ± 0% -2.92% (p=0.000 n=10)
MemmoveUnalignedDst/4096 72.48n ± 1% 70.33n ± 0% -2.98% (p=0.000 n=10)
MemmoveUnalignedDstOverlap/32 5.032n ± 7% 4.823n ± 3% ~ (p=0.342 n=10)
MemmoveUnalignedDstOverlap/64 6.021n ± 11% 6.197n ± 7% ~ (p=0.085 n=10)
MemmoveUnalignedDstOverlap/128 7.979n ± 1% 6.742n ± 0% -15.51% (p=0.000 n=10)
MemmoveUnalignedDstOverlap/256 11.51n ± 1% 10.20n ± 0% -11.38% (p=0.000 n=10)
MemmoveUnalignedDstOverlap/512 19.72n ± 1% 16.50n ± 1% -16.38% (p=0.000 n=10)
MemmoveUnalignedDstOverlap/1024 34.94n ± 0% 28.73n ± 0% -17.76% (p=0.000 n=10)
MemmoveUnalignedDstOverlap/2048 64.63n ± 0% 57.02n ± 1% -11.77% (p=0.000 n=10)
MemmoveUnalignedDstOverlap/4096 124.8n ± 0% 106.6n ± 0% -14.62% (p=0.000 n=10)
MemmoveUnalignedSrc/0 2.571n ± 13% 2.235n ± 1% -13.05% (p=0.000 n=10)
MemmoveUnalignedSrc/1 2.375n ± 3% 2.374n ± 0% -0.04% (p=0.000 n=10)
MemmoveUnalignedSrc/2 2.389n ± 0% 2.389n ± 0% ~ (p=0.721 n=10)
MemmoveUnalignedSrc/3 2.704n ± 0% 2.703n ± 0% -0.04% (p=0.003 n=10)
MemmoveUnalignedSrc/4 2.345n ± 5% 2.325n ± 3% ~ (p=0.170 n=10)
MemmoveUnalignedSrc/5 2.716n ± 5% 2.654n ± 8% ~ (p=0.383 n=10)
MemmoveUnalignedSrc/6 2.772n ± 1% 2.759n ± 0% -0.49% (p=0.001 n=10)
MemmoveUnalignedSrc/7 3.123n ± 9% 3.127n ± 4% ~ (p=0.315 n=10)
MemmoveUnalignedSrc/8 2.407n ± 0% 2.406n ± 0% -0.04% (p=0.000 n=10)
MemmoveUnalignedSrc/9 2.705n ± 0% 2.724n ± 5% ~ (p=0.694 n=10)
MemmoveUnalignedSrc/10 2.978n ± 4% 2.933n ± 6% ~ (p=0.280 n=10)
MemmoveUnalignedSrc/11 3.166n ± 7% 3.130n ± 0% -1.14% (p=0.007 n=10)
MemmoveUnalignedSrc/12 2.704n ± 0% 2.704n ± 0% ~ (p=0.054 n=10)
MemmoveUnalignedSrc/13 3.132n ± 0% 3.141n ± 1% ~ (p=0.565 n=10)
MemmoveUnalignedSrc/14 3.224n ± 0% 3.224n ± 0% ~ (p=1.000 n=10)
MemmoveUnalignedSrc/15 3.589n ± 4% 3.547n ± 3% ~ (p=0.060 n=10)
MemmoveUnalignedSrc/16 2.762n ± 0% 2.754n ± 0% -0.29% (p=0.000 n=10)
MemmoveUnalignedSrc/32 5.402n ± 9% 4.603n ± 0% -14.79% (p=0.000 n=10)
MemmoveUnalignedSrc/64 8.548n ± 10% 4.604n ± 1% -46.13% (p=0.000 n=10)
MemmoveUnalignedSrc/128 5.204n ± 0% 5.381n ± 0% +3.41% (p=0.000 n=10)
MemmoveUnalignedSrc/256 6.221n ± 0% 6.231n ± 0% +0.14% (p=0.004 n=10)
MemmoveUnalignedSrc/512 11.18n ± 2% 10.17n ± 0% -9.03% (p=0.000 n=10)
MemmoveUnalignedSrc/1024 20.45n ± 1% 20.84n ± 3% +1.86% (p=0.016 n=10)
MemmoveUnalignedSrc/2048 37.54n ± 5% 36.46n ± 0% -2.88% (p=0.000 n=10)
MemmoveUnalignedSrc/4096 73.55n ± 5% 69.93n ± 0% -4.92% (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_16_0 3.465n ± 0% 3.464n ± 0% -0.03% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_16_0 3.465n ± 0% 3.464n ± 0% -0.03% (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_16_1 3.465n ± 0% 3.464n ± 0% -0.03% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_16_1 3.465n ± 0% 3.464n ± 0% -0.03% (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_16_4 3.465n ± 0% 3.464n ± 0% -0.03% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_16_4 3.465n ± 0% 3.464n ± 0% -0.03% (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_16_7 3.465n ± 0% 3.464n ± 0% -0.03% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_16_7 3.465n ± 0% 3.464n ± 0% -0.03% (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_64_0 7.997n ± 6% 5.292n ± 0% -33.83% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_64_0 8.261n ± 10% 5.398n ± 0% -34.66% (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_64_1 8.193n ± 6% 5.388n ± 0% -34.24% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_64_1 8.990n ± 12% 5.398n ± 0% -39.96% (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_64_4 8.585n ± 3% 5.388n ± 0% -37.24% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_64_4 8.508n ± 2% 5.398n ± 0% -36.55% (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_64_7 8.440n ± 5% 5.388n ± 0% -36.16% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_64_7 8.469n ± 5% 5.398n ± 0% -36.26% (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_256_0 6.849n ± 0% 7.089n ± 0% +3.50% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_256_0 7.067n ± 0% 7.090n ± 1% ~ (p=0.748 n=10)
MemmoveUnalignedSrcDst/f_256_1 7.036n ± 0% 7.316n ± 0% +3.99% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_256_1 7.064n ± 0% 7.106n ± 1% ~ (p=0.096 n=10)
MemmoveUnalignedSrcDst/f_256_4 7.035n ± 0% 7.318n ± 0% +4.02% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_256_4 7.067n ± 0% 7.105n ± 1% ~ (p=0.389 n=10)
MemmoveUnalignedSrcDst/f_256_7 7.035n ± 0% 7.317n ± 0% +4.01% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_256_7 7.064n ± 0% 7.103n ± 1% +0.56% (p=0.011 n=10)
MemmoveUnalignedSrcDst/f_4096_0 72.34n ± 1% 69.97n ± 0% -3.27% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_4096_0 72.27n ± 1% 69.99n ± 0% -3.15% (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_4096_1 73.70n ± 1% 72.84n ± 0% -1.16% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_4096_1 71.24n ± 2% 69.80n ± 0% -2.03% (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_4096_4 73.90n ± 0% 72.85n ± 0% -1.42% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_4096_4 70.58n ± 0% 69.79n ± 0% -1.11% (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_4096_7 73.71n ± 0% 72.82n ± 0% -1.20% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_4096_7 70.58n ± 0% 69.77n ± 0% -1.14% (p=0.000 n=10)
MemmoveUnalignedSrcDst/f_65536_0 1.204µ ± 0% 1.210µ ± 0% +0.50% (p=0.000 n=10)
MemmoveUnalignedSrcDst/b_65536_0 1.268µ ± 0% 1.275µ ± 0% +0.55% (p=0.001 n=10)
MemmoveUnalignedSrcDst/f_65536_1 1.253µ ± 3% 1.259µ ± 0% ~ (p=0.439 n=10)
MemmoveUnalignedSrcDst/b_65536_1 1.246µ ± 2% 1.256µ ± 0% +0.80% (p=0.021 n=10)
MemmoveUnalignedSrcDst/f_65536_4 1.254µ ± 1% 1.259µ ± 0% +0.40% (p=0.019 n=10)
MemmoveUnalignedSrcDst/b_65536_4 1.246µ ± 0% 1.256µ ± 0% +0.76% (p=0.001 n=10)
MemmoveUnalignedSrcDst/f_65536_7 1.253µ ± 1% 1.259µ ± 0% +0.48% (p=0.032 n=10)
MemmoveUnalignedSrcDst/b_65536_7 1.270µ ± 6% 1.256µ ± 0% ~ (p=0.614 n=10)
MemmoveUnalignedSrcOverlap/32 4.712n ± 5% 4.433n ± 1% -5.91% (p=0.001 n=10)
MemmoveUnalignedSrcOverlap/64 6.043n ± 3% 5.895n ± 0% -2.44% (p=0.000 n=10)
MemmoveUnalignedSrcOverlap/128 7.869n ± 5% 8.031n ± 1% ~ (p=0.256 n=10)
MemmoveUnalignedSrcOverlap/256 11.41n ± 1% 10.59n ± 0% -7.19% (p=0.000 n=10)
MemmoveUnalignedSrcOverlap/512 21.14n ± 1% 16.92n ± 0% -20.00% (p=0.000 n=10)
MemmoveUnalignedSrcOverlap/1024 39.55n ± 1% 29.91n ± 1% -24.36% (p=0.000 n=10)
MemmoveUnalignedSrcOverlap/2048 77.01n ± 0% 58.59n ± 0% -23.93% (p=0.000 n=10)
MemmoveUnalignedSrcOverlap/4096 152.0n ± 0% 112.6n ± 0% -25.95% (p=0.000 n=10)
MemmoveKnownSize112 2.310n ± 0% 2.309n ± 0% -0.04% (p=0.000 n=10)
MemmoveKnownSize128 2.502n ± 0% 2.502n ± 0% ~ (p=0.139 n=10)
MemmoveKnownSize192 3.850n ± 0% 3.849n ± 0% -0.03% (p=0.000 n=10)
MemmoveKnownSize248 4.042n ± 0% 4.041n ± 0% -0.04% (p=0.000 n=10)
MemmoveKnownSize256 4.042n ± 0% 4.041n ± 0% -0.02% (p=0.001 n=10)
MemmoveKnownSize512 8.116n ± 0% 8.114n ± 0% -0.02% (p=0.002 n=10)
MemmoveKnownSize1024 16.20n ± 0% 16.18n ± 0% -0.09% (p=0.001 n=10)
geomean 9.386n 8.773n -6.54%

Change-Id: I9476a27bc8aa935a977767bec6aafd6d721f9924
1 parent e3cacd9 commit 5671983

File tree

1 file changed

+67
-32
lines changed

1 file changed

+67
-32
lines changed

src/runtime/memmove_s390x.s

Lines changed: 67 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ TEXT runtime·memmove(SB),NOSPLIT|NOFRAME,$0-24
1515
CMPBEQ R6, R4, done
1616

1717
start:
18+
moveLE16:
1819
CMPBLE R5, $3, move0to3
1920
CMPBLE R5, $7, move4to7
2021
CMPBLE R5, $11, move8to11
@@ -27,48 +28,82 @@ start:
2728
RET
2829

2930
movemt16:
30-
CMPBGT R4, R6, forwards
31+
CMPBGT R4, R6, forwards_copy
3132
ADD R5, R4, R7
32-
CMPBLE R7, R6, forwards
33+
CMPBLE R7, R6, forwards_copy
3334
ADD R5, R6, R8
34-
backwards:
35-
MOVD -8(R7), R3
36-
MOVD R3, -8(R8)
37-
MOVD -16(R7), R3
38-
MOVD R3, -16(R8)
35+
36+
// backwards_copy is used in below scenario:
37+
// 1. When src and dst are overlapping, and the dst is at higher address than src.
38+
backwards_copy:
39+
MOVD R5, R8
40+
SRD $4, R8
3941
ADD $-16, R5
40-
ADD $-16, R7
41-
ADD $-16, R8
42-
CMP R5, $16
43-
BGE backwards
44-
BR start
45-
46-
forwards:
47-
CMPBGT R5, $64, forwards_fast
48-
MOVD 0(R4), R3
49-
MOVD R3, 0(R6)
50-
MOVD 8(R4), R3
51-
MOVD R3, 8(R6)
52-
ADD $16, R4
53-
ADD $16, R6
42+
CMPBGE R8, $8, moveGE128
43+
44+
moveLT128:
45+
MOVD 8(R4)(R5), R3
46+
MOVD R3, 8(R6)(R5)
47+
MOVD 0(R4)(R5), R3
48+
MOVD R3, 0(R6)(R5)
5449
ADD $-16, R5
55-
CMP R5, $16
56-
BGE forwards
57-
BR start
50+
BRCTG R8, moveLT128
51+
ADD $16, R5
52+
BR moveLE16
53+
54+
moveGE128:
55+
ADD $-48, R5
56+
SRD $2, R8, R0
5857

59-
forwards_fast:
60-
CMP R5, $256
61-
BLE forwards_small
58+
move_large_64_loop:
59+
VL 0(R4)(R5), V1
60+
VL 16(R4)(R5), V2
61+
VL 32(R4)(R5), V3
62+
VL 48(R4)(R5), V4
63+
VST V1, 0(R6)(R5)
64+
VST V2, 16(R6)(R5)
65+
VST V3, 32(R6)(R5)
66+
VST V4, 48(R6)(R5)
67+
ADD $-64, R5
68+
BRCTG R0, move_large_64_loop
69+
ADD $48, R5
70+
AND $3, R8
71+
BNE moveLT128
72+
ADD $16, R5
73+
BR moveLE16
74+
75+
// forwards_copy is used in below scenarios:
76+
// 1. When src and dst are non-overlapping.
77+
// 2. When src and dst are overlapping, but src is at higher address than dst.
78+
forwards_copy:
79+
MOVD R5, R8
80+
SRD $8, R8
81+
CMPBNE R8, $0, moveGE256
82+
83+
use_exrl:
84+
CMPBEQ R5, $0, done
85+
ADD $-1, R5
86+
EXRL $memmove_exrl_mvc<>(SB), R5
87+
RET
88+
89+
moveGE256:
90+
CMP R8, $4096
91+
BGT moveGT1MB
92+
93+
mvc_loop:
6294
MVC $256, 0(R4), 0(R6)
6395
ADD $256, R4
6496
ADD $256, R6
6597
ADD $-256, R5
66-
BR forwards_fast
98+
BRCTG R8, mvc_loop
99+
BR use_exrl
67100

68-
forwards_small:
69-
CMPBEQ R5, $0, done
70-
ADD $-1, R5
71-
EXRL $memmove_exrl_mvc<>(SB), R5
101+
moveGT1MB:
102+
MOVD R5, R7
103+
104+
mvcle_loop:
105+
MVCLE 0, R4, R6
106+
BVS mvcle_loop
72107
RET
73108

74109
move0to3:

0 commit comments

Comments
 (0)