
Commit ee4caa4

Author: Jamil Nimeh
8349106: Change ChaCha20 intrinsic to use quarter-round parallel implementation on aarch64
Reviewed-by: aph
Parent: b985347

1 file changed: +165 −95 lines

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp

Lines changed: 165 additions & 95 deletions
@@ -4341,22 +4341,48 @@ class StubGenerator: public StubCodeGenerator {
     return start;
   }

-  // ChaCha20 block function. This version parallelizes by loading
-  // individual 32-bit state elements into vectors for four blocks
-  // (e.g. all four blocks' worth of state[0] in one register, etc.)
+  // ChaCha20 block function. This version parallelizes 4 quarter
+  // round operations at a time. It uses 16 SIMD registers to
+  // produce 4 blocks of key stream.
   //
   // state (int[16]) = c_rarg0
-  // keystream (byte[1024]) = c_rarg1
+  // keystream (byte[256]) = c_rarg1
   // return - number of bytes of keystream (always 256)
-  address generate_chacha20Block_blockpar() {
-    Label L_twoRounds, L_cc20_const;
+  //
+  // In this approach, we load the 512-bit start state sequentially into
+  // 4 128-bit vectors. We then make 4 4-vector copies of that starting
+  // state, with each successive set of 4 vectors having a +1 added into
+  // the first 32-bit lane of the 4th vector in that group (the counter).
+  // By doing this, we can perform the block function on 4 512-bit blocks
+  // within one run of this intrinsic.
+  // The alignment of the data across the 4-vector group is such that at
+  // the start it is already aligned for the first round of each two-round
+  // loop iteration. In other words, the corresponding lanes of each vector
+  // will contain the values needed for that quarter round operation (e.g.
+  // elements 0/4/8/12, 1/5/9/13, 2/6/10/14, etc.).
+  // In between each full round, a lane shift must occur. Within a loop
+  // iteration, between the first and second rounds, the 2nd, 3rd, and 4th
+  // vectors are rotated left 32, 64 and 96 bits, respectively. The result
+  // is effectively a diagonal orientation in columnar form. After the
+  // second full round, those registers are left-rotated again, this time
+  // 96, 64, and 32 bits - returning the vectors to their columnar organization.
+  // After all 10 iterations, the original state is added to each 4-vector
+  // working state along with the add mask, and the 4 vector groups are
+  // sequentially written to the memory dedicated for the output key stream.
+  //
+  // For a more detailed explanation, see Goll and Gueron, "Vectorization of
+  // ChaCha Stream Cipher", 2014 11th Int. Conf. on Information Technology:
+  // New Generations, Las Vegas, NV, USA, April 2014, DOI: 10.1109/ITNG.2014.33
+  address generate_chacha20Block_qrpar() {
+    Label L_Q_twoRounds, L_Q_cc20_const;
     // The constant data is broken into two 128-bit segments to be loaded
-    // onto FloatRegisters. The first 128 bits are a counter add overlay
-    // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
+    // onto SIMD registers. The first 128 bits are a counter add overlay
+    // that adds +1/+0/+0/+0 to the vectors holding replicated state[12].
     // The second 128-bits is a table constant used for 8-bit left rotations.
-    __ BIND(L_cc20_const);
-    __ emit_int64(0x0000000100000000UL);
-    __ emit_int64(0x0000000300000002UL);
+    // on 32-bit lanes within a SIMD register.
+    __ BIND(L_Q_cc20_const);
+    __ emit_int64(0x0000000000000001UL);
+    __ emit_int64(0x0000000000000000UL);
     __ emit_int64(0x0605040702010003UL);
     __ emit_int64(0x0E0D0C0F0A09080BUL);

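A note for readers of this hunk: cc20_quarter_round applies the standard ChaCha20 quarter round (RFC 7539) to the four 32-bit lanes of each vector, and the second constant pair above (0x0605040702010003 / 0x0E0D0C0F0A09080B) is the tbl byte-index table that realizes the rotate-left-by-8 as a byte shuffle on each little-endian 32-bit lane. For reference, a minimal scalar sketch of the quarter round; rotl32 and chacha20_qround are illustrative names, not part of the JDK sources:

#include <cstdint>

// Rotate a 32-bit word left by n bits (n in 1..31 for our call sites).
static inline uint32_t rotl32(uint32_t x, int n) {
  return (x << n) | (x >> (32 - n));
}

// One ChaCha20 quarter round over state indices a/b/c/d, per RFC 7539.
// The intrinsic performs the same add/xor/rotate sequence on 4S lanes,
// using the tbl byte shuffle for the rotate-by-8 step.
static void chacha20_qround(uint32_t st[16], int a, int b, int c, int d) {
  st[a] += st[b]; st[d] ^= st[a]; st[d] = rotl32(st[d], 16);
  st[c] += st[d]; st[b] ^= st[c]; st[b] = rotl32(st[b], 12);
  st[a] += st[b]; st[d] ^= st[a]; st[d] = rotl32(st[d], 8);
  st[c] += st[d]; st[b] ^= st[c]; st[b] = rotl32(st[b], 7);
}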
@@ -4366,100 +4392,144 @@ class StubGenerator: public StubCodeGenerator {
     address start = __ pc();
     __ enter();

-    int i, j;
     const Register state = c_rarg0;
     const Register keystream = c_rarg1;
     const Register loopCtr = r10;
     const Register tmpAddr = r11;

-    const FloatRegister stateFirst = v0;
-    const FloatRegister stateSecond = v1;
-    const FloatRegister stateThird = v2;
-    const FloatRegister stateFourth = v3;
-    const FloatRegister origCtrState = v28;
-    const FloatRegister scratch = v29;
+    const FloatRegister aState = v0;
+    const FloatRegister bState = v1;
+    const FloatRegister cState = v2;
+    const FloatRegister dState = v3;
+    const FloatRegister a1Vec = v4;
+    const FloatRegister b1Vec = v5;
+    const FloatRegister c1Vec = v6;
+    const FloatRegister d1Vec = v7;
+    // Skip the callee-saved registers v8 - v15
+    const FloatRegister a2Vec = v16;
+    const FloatRegister b2Vec = v17;
+    const FloatRegister c2Vec = v18;
+    const FloatRegister d2Vec = v19;
+    const FloatRegister a3Vec = v20;
+    const FloatRegister b3Vec = v21;
+    const FloatRegister c3Vec = v22;
+    const FloatRegister d3Vec = v23;
+    const FloatRegister a4Vec = v24;
+    const FloatRegister b4Vec = v25;
+    const FloatRegister c4Vec = v26;
+    const FloatRegister d4Vec = v27;
+    const FloatRegister scratch = v28;
+    const FloatRegister addMask = v29;
     const FloatRegister lrot8Tbl = v30;

-    // Organize SIMD registers in an array that facilitates
-    // putting repetitive opcodes into loop structures. It is
-    // important that each grouping of 4 registers is monotonically
-    // increasing to support the requirements of multi-register
-    // instructions (e.g. ld4r, st4, etc.)
-    const FloatRegister workSt[16] = {
-      v4, v5, v6, v7, v16, v17, v18, v19,
-      v20, v21, v22, v23, v24, v25, v26, v27
-    };
-
-    // Load from memory and interlace across 16 SIMD registers,
-    // With each word from memory being broadcast to all lanes of
-    // each successive SIMD register.
-    //    Addr(0) -> All lanes in workSt[i]
-    //    Addr(4) -> All lanes workSt[i + 1], etc.
-    __ mov(tmpAddr, state);
-    for (i = 0; i < 16; i += 4) {
-      __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
-          __ post(tmpAddr, 16));
-    }
-
-    // Pull in constant data. The first 16 bytes are the add overlay
-    // which is applied to the vector holding the counter (state[12]).
-    // The second 16 bytes is the index register for the 8-bit left
-    // rotation tbl instruction.
-    __ adr(tmpAddr, L_cc20_const);
-    __ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr));
-    __ addv(workSt[12], __ T4S, workSt[12], origCtrState);
-
-    // Set up the 10 iteration loop and perform all 8 quarter round ops
+    // Load the initial state in the first 4 quadword registers,
+    // then copy the initial state into the next 4 quadword registers
+    // that will be used for the working state.
+    __ ld1(aState, bState, cState, dState, __ T16B, Address(state));
+
+    // Load the index register for 2 constant 128-bit data fields.
+    // The first represents the +1/+0/+0/+0 add mask. The second is
+    // the 8-bit left rotation.
+    __ adr(tmpAddr, L_Q_cc20_const);
+    __ ldpq(addMask, lrot8Tbl, Address(tmpAddr));
+
+    __ mov(a1Vec, __ T16B, aState);
+    __ mov(b1Vec, __ T16B, bState);
+    __ mov(c1Vec, __ T16B, cState);
+    __ mov(d1Vec, __ T16B, dState);
+
+    __ mov(a2Vec, __ T16B, aState);
+    __ mov(b2Vec, __ T16B, bState);
+    __ mov(c2Vec, __ T16B, cState);
+    __ addv(d2Vec, __ T4S, d1Vec, addMask);
+
+    __ mov(a3Vec, __ T16B, aState);
+    __ mov(b3Vec, __ T16B, bState);
+    __ mov(c3Vec, __ T16B, cState);
+    __ addv(d3Vec, __ T4S, d2Vec, addMask);
+
+    __ mov(a4Vec, __ T16B, aState);
+    __ mov(b4Vec, __ T16B, bState);
+    __ mov(c4Vec, __ T16B, cState);
+    __ addv(d4Vec, __ T4S, d3Vec, addMask);
+
+    // Set up the 10 iteration loop
     __ mov(loopCtr, 10);
-    __ BIND(L_twoRounds);
-
-    __ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12],
-        scratch, lrot8Tbl);
-    __ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13],
-        scratch, lrot8Tbl);
-    __ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14],
-        scratch, lrot8Tbl);
-    __ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15],
-        scratch, lrot8Tbl);
-
-    __ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15],
-        scratch, lrot8Tbl);
-    __ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12],
-        scratch, lrot8Tbl);
-    __ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13],
-        scratch, lrot8Tbl);
-    __ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14],
-        scratch, lrot8Tbl);
+    __ BIND(L_Q_twoRounds);
+
+    // The first set of operations on the vectors covers the first 4 quarter
+    // round operations:
+    //  Qround(state, 0, 4, 8,12)
+    //  Qround(state, 1, 5, 9,13)
+    //  Qround(state, 2, 6,10,14)
+    //  Qround(state, 3, 7,11,15)
+    __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl);
+    __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl);
+    __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl);
+    __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl);
+
+    // Shuffle the b1Vec/c1Vec/d1Vec to reorganize the state vectors to
+    // diagonals. The a1Vec does not need to change orientation.
+    __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, true);
+    __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, true);
+    __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, true);
+    __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, true);
+
+    // The second set of operations on the vectors covers the second 4 quarter
+    // round operations, now acting on the diagonals:
+    //  Qround(state, 0, 5,10,15)
+    //  Qround(state, 1, 6,11,12)
+    //  Qround(state, 2, 7, 8,13)
+    //  Qround(state, 3, 4, 9,14)
+    __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl);
+    __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl);
+    __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl);
+    __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl);
+
+    // Before we start the next iteration, we need to perform shuffles
+    // on the b/c/d vectors to move them back to columnar organizations
+    // from their current diagonal orientation.
+    __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, false);
+    __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, false);
+    __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, false);
+    __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, false);

     // Decrement and iterate
     __ sub(loopCtr, loopCtr, 1);
-    __ cbnz(loopCtr, L_twoRounds);
-
-    __ mov(tmpAddr, state);
-
-    // Add the starting state back to the post-loop keystream
-    // state. We read/interlace the state array from memory into
-    // 4 registers similar to what we did in the beginning. Then
-    // add the counter overlay onto workSt[12] at the end.
-    for (i = 0; i < 16; i += 4) {
-      __ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S,
-          __ post(tmpAddr, 16));
-      __ addv(workSt[i], __ T4S, workSt[i], stateFirst);
-      __ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond);
-      __ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird);
-      __ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth);
-    }
-    __ addv(workSt[12], __ T4S, workSt[12], origCtrState); // Add ctr mask
-
-    // Write to key stream, storing the same element out of workSt[0..15]
-    // to consecutive 4-byte offsets in the key stream buffer, then repeating
-    // for the next element position.
-    for (i = 0; i < 4; i++) {
-      for (j = 0; j < 16; j += 4) {
-        __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
-            __ post(keystream, 16));
-      }
-    }
+    __ cbnz(loopCtr, L_Q_twoRounds);
+
+    // Once the counter reaches zero, we fall out of the loop
+    // and need to add the initial state back into the working state
+    // represented by the a/b/c/d1Vec registers. This is destructive
+    // on the dState register but we no longer will need it.
+    __ addv(a1Vec, __ T4S, a1Vec, aState);
+    __ addv(b1Vec, __ T4S, b1Vec, bState);
+    __ addv(c1Vec, __ T4S, c1Vec, cState);
+    __ addv(d1Vec, __ T4S, d1Vec, dState);
+
+    __ addv(a2Vec, __ T4S, a2Vec, aState);
+    __ addv(b2Vec, __ T4S, b2Vec, bState);
+    __ addv(c2Vec, __ T4S, c2Vec, cState);
+    __ addv(dState, __ T4S, dState, addMask);
+    __ addv(d2Vec, __ T4S, d2Vec, dState);
+
+    __ addv(a3Vec, __ T4S, a3Vec, aState);
+    __ addv(b3Vec, __ T4S, b3Vec, bState);
+    __ addv(c3Vec, __ T4S, c3Vec, cState);
+    __ addv(dState, __ T4S, dState, addMask);
+    __ addv(d3Vec, __ T4S, d3Vec, dState);
+
+    __ addv(a4Vec, __ T4S, a4Vec, aState);
+    __ addv(b4Vec, __ T4S, b4Vec, bState);
+    __ addv(c4Vec, __ T4S, c4Vec, cState);
+    __ addv(dState, __ T4S, dState, addMask);
+    __ addv(d4Vec, __ T4S, d4Vec, dState);
+
+    // Write the final state back to the result buffer
+    __ st1(a1Vec, b1Vec, c1Vec, d1Vec, __ T16B, __ post(keystream, 64));
+    __ st1(a2Vec, b2Vec, c2Vec, d2Vec, __ T16B, __ post(keystream, 64));
+    __ st1(a3Vec, b3Vec, c3Vec, d3Vec, __ T16B, __ post(keystream, 64));
+    __ st1(a4Vec, b4Vec, c4Vec, d4Vec, __ T16B, __ post(keystream, 64));

     __ mov(r0, 256); // Return length of output keystream
     __ leave();
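A note on the shuffle steps above: cc20_shift_lane_org rotates the lanes of the b/c/d vectors by one, two, and three 32-bit positions so that the diagonal quarter rounds (0/5/10/15, 1/6/11/12, ...) line up in corresponding lanes, and the 'false' form applies the inverse rotation. A lane-level sketch of the equivalent operation, under the assumption that the helper behaves as the comments describe; the generated code uses vector instructions rather than a scalar loop, and these names are illustrative:

#include <cstdint>

// Rotate the four 32-bit lanes of a 128-bit vector left by n lane positions.
static void rotate_lanes(uint32_t v[4], int n) {
  uint32_t t[4];
  for (int i = 0; i < 4; i++) t[i] = v[(i + n) % 4];
  for (int i = 0; i < 4; i++) v[i] = t[i];
}

// Columnar <-> diagonal reorganization of the b/c/d state rows.
// to_diag == true  : rotate b/c/d left by 32/64/96 bits (1/2/3 lanes).
// to_diag == false : rotate by 96/64/32 bits, undoing the above.
static void shift_lane_org(uint32_t b[4], uint32_t c[4], uint32_t d[4],
                           bool to_diag) {
  rotate_lanes(b, to_diag ? 1 : 3);
  rotate_lanes(c, 2); // a 2-lane rotation is its own inverse
  rotate_lanes(d, to_diag ? 3 : 1);
}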
@@ -8866,7 +8936,7 @@ class StubGenerator: public StubCodeGenerator {
 #endif // COMPILER2

     if (UseChaCha20Intrinsics) {
-      StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
+      StubRoutines::_chacha20Block = generate_chacha20Block_qrpar();
     }

     if (UseBASE64Intrinsics) {