@@ -4341,22 +4341,48 @@ class StubGenerator: public StubCodeGenerator {
43414341 return start;
43424342 }
43434343
4344- // ChaCha20 block function. This version parallelizes by loading
4345- // individual 32-bit state elements into vectors for four blocks
4346- // (e.g. all four blocks' worth of state[0] in one register, etc.)
4344+ // ChaCha20 block function. This version parallelizes 4 quarter
4345+ // round operations at a time. It uses 16 SIMD registers to
4346+ // produce 4 blocks of key stream.
43474347 //
43484348 // state (int[16]) = c_rarg0
4349- // keystream (byte[1024 ]) = c_rarg1
4349+ // keystream (byte[256]) = c_rarg1
43504350 // return - number of bytes of keystream (always 256)
4351- address generate_chacha20Block_blockpar () {
4352- Label L_twoRounds, L_cc20_const;
4351+ //
4352+ // In this approach, we load the 512-bit start state sequentially into
4353+ // 4 128-bit vectors. We then make 4 4-vector copies of that starting
4354+ // state, with each successive set of 4 vectors having a +1 added into
4355+ // the first 32-bit lane of the 4th vector in that group (the counter).
4356+ // By doing this, we can perform the block function on 4 512-bit blocks
4357+ // within one run of this intrinsic.
4358+ // The alignment of the data across the 4-vector group is such that at
4359+ // the start it is already aligned for the first round of each two-round
4360+ // loop iteration. In other words, the corresponding lanes of each vector
4361+ // will contain the values needed for that quarter round operation (e.g.
4362+ // elements 0/4/8/12, 1/5/9/13, 2/6/10/14, etc.).
4363+ // In between each full round, a lane shift must occur. Within a loop
4364+ // iteration, between the first and second rounds, the 2nd, 3rd, and 4th
4365+ // vectors are rotated left 32, 64 and 96 bits, respectively. The result
4366+ // is effectively a diagonal orientation in columnar form. After the
4367+ // second full round, those registers are left-rotated again, this time
4368+ // 96, 64, and 32 bits - returning the vectors to their columnar organization.
4369+ // After all 10 iterations, the original state is added to each 4-vector
4370+ // working state along with the add mask, and the 4 vector groups are
4371+ // sequentially written to the memory dedicated for the output key stream.
4372+ //
4373+ // For a more detailed explanation, see Goll and Gueron, "Vectorization of
4374+ // ChaCha Stream Cipher", 2014 11th Int. Conf. on Information Technology:
4375+ // New Generations, Las Vegas, NV, USA, April 2014, DOI: 10.1109/ITNG.2014.33
4376+ address generate_chacha20Block_qrpar () {
4377+ Label L_Q_twoRounds, L_Q_cc20_const;
43534378 // The constant data is broken into two 128-bit segments to be loaded
4354- // onto FloatRegisters . The first 128 bits are a counter add overlay
4355- // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
4379+ // onto SIMD registers. The first 128 bits are a counter add overlay
4380+ // that adds +1/+0/+0/+0 to the lanes of a vector holding state[12..15].
43564381 // The second 128 bits are a table constant used for 8-bit left rotations.
4357- __ BIND (L_cc20_const);
4358- __ emit_int64 (0x0000000100000000UL );
4359- __ emit_int64 (0x0000000300000002UL );
4382+ // The rotations operate on 32-bit lanes within a SIMD register.
4383+ __ BIND (L_Q_cc20_const);
4384+ __ emit_int64 (0x0000000000000001UL );
4385+ __ emit_int64 (0x0000000000000000UL );
43604386 __ emit_int64 (0x0605040702010003UL );
43614387 __ emit_int64 (0x0E0D0C0F0A09080BUL );
43624388
@@ -4366,100 +4392,144 @@ class StubGenerator: public StubCodeGenerator {
43664392 address start = __ pc ();
43674393 __ enter ();
43684394
4369- int i, j;
43704395 const Register state = c_rarg0;
43714396 const Register keystream = c_rarg1;
43724397 const Register loopCtr = r10;
43734398 const Register tmpAddr = r11;
43744399
4375- const FloatRegister stateFirst = v0;
4376- const FloatRegister stateSecond = v1;
4377- const FloatRegister stateThird = v2;
4378- const FloatRegister stateFourth = v3;
4379- const FloatRegister origCtrState = v28;
4380- const FloatRegister scratch = v29;
4400+ const FloatRegister aState = v0;
4401+ const FloatRegister bState = v1;
4402+ const FloatRegister cState = v2;
4403+ const FloatRegister dState = v3;
4404+ const FloatRegister a1Vec = v4;
4405+ const FloatRegister b1Vec = v5;
4406+ const FloatRegister c1Vec = v6;
4407+ const FloatRegister d1Vec = v7;
4408+ // Skip the callee-saved registers v8 - v15
4409+ const FloatRegister a2Vec = v16;
4410+ const FloatRegister b2Vec = v17;
4411+ const FloatRegister c2Vec = v18;
4412+ const FloatRegister d2Vec = v19;
4413+ const FloatRegister a3Vec = v20;
4414+ const FloatRegister b3Vec = v21;
4415+ const FloatRegister c3Vec = v22;
4416+ const FloatRegister d3Vec = v23;
4417+ const FloatRegister a4Vec = v24;
4418+ const FloatRegister b4Vec = v25;
4419+ const FloatRegister c4Vec = v26;
4420+ const FloatRegister d4Vec = v27;
4421+ const FloatRegister scratch = v28;
4422+ const FloatRegister addMask = v29;
43814423 const FloatRegister lrot8Tbl = v30;
43824424
4383- // Organize SIMD registers in an array that facilitates
4384- // putting repetitive opcodes into loop structures. It is
4385- // important that each grouping of 4 registers is monotonically
4386- // increasing to support the requirements of multi-register
4387- // instructions (e.g. ld4r, st4, etc.)
4388- const FloatRegister workSt[16 ] = {
4389- v4, v5, v6, v7, v16, v17, v18, v19,
4390- v20, v21, v22, v23, v24, v25, v26, v27
4391- };
4392-
4393- // Load from memory and interlace across 16 SIMD registers,
4394- // With each word from memory being broadcast to all lanes of
4395- // each successive SIMD register.
4396- // Addr(0) -> All lanes in workSt[i]
4397- // Addr(4) -> All lanes workSt[i + 1], etc.
4398- __ mov (tmpAddr, state);
4399- for (i = 0 ; i < 16 ; i += 4 ) {
4400- __ ld4r (workSt[i], workSt[i + 1 ], workSt[i + 2 ], workSt[i + 3 ], __ T4S,
4401- __ post (tmpAddr, 16 ));
4402- }
4403-
4404- // Pull in constant data. The first 16 bytes are the add overlay
4405- // which is applied to the vector holding the counter (state[12]).
4406- // The second 16 bytes is the index register for the 8-bit left
4407- // rotation tbl instruction.
4408- __ adr (tmpAddr, L_cc20_const);
4409- __ ldpq (origCtrState, lrot8Tbl, Address (tmpAddr));
4410- __ addv (workSt[12 ], __ T4S, workSt[12 ], origCtrState);
4411-
4412- // Set up the 10 iteration loop and perform all 8 quarter round ops
4425+ // Load the initial state in the first 4 quadword registers,
4426+ // then copy the initial state into the next 4 quadword registers
4427+ // that will be used for the working state.
4428+ __ ld1 (aState, bState, cState, dState, __ T16B, Address (state));
4429+
4430+ // Load the 2 constant 128-bit data fields via their address.
4431+ // The first represents the +1/+0/+0/+0 add mask. The second is
4432+ // the table constant for the 8-bit left rotation.
4433+ __ adr (tmpAddr, L_Q_cc20_const);
4434+ __ ldpq (addMask, lrot8Tbl, Address (tmpAddr));
4435+
4436+ __ mov (a1Vec, __ T16B, aState);
4437+ __ mov (b1Vec, __ T16B, bState);
4438+ __ mov (c1Vec, __ T16B, cState);
4439+ __ mov (d1Vec, __ T16B, dState);
4440+
4441+ __ mov (a2Vec, __ T16B, aState);
4442+ __ mov (b2Vec, __ T16B, bState);
4443+ __ mov (c2Vec, __ T16B, cState);
4444+ __ addv (d2Vec, __ T4S, d1Vec, addMask);
4445+
4446+ __ mov (a3Vec, __ T16B, aState);
4447+ __ mov (b3Vec, __ T16B, bState);
4448+ __ mov (c3Vec, __ T16B, cState);
4449+ __ addv (d3Vec, __ T4S, d2Vec, addMask);
4450+
4451+ __ mov (a4Vec, __ T16B, aState);
4452+ __ mov (b4Vec, __ T16B, bState);
4453+ __ mov (c4Vec, __ T16B, cState);
4454+ __ addv (d4Vec, __ T4S, d3Vec, addMask);
4455+
4456+ // Set up the 10 iteration loop
44134457 __ mov (loopCtr, 10 );
4414- __ BIND (L_twoRounds);
4415-
4416- __ cc20_quarter_round (workSt[0 ], workSt[4 ], workSt[8 ], workSt[12 ],
4417- scratch, lrot8Tbl);
4418- __ cc20_quarter_round (workSt[1 ], workSt[5 ], workSt[9 ], workSt[13 ],
4419- scratch, lrot8Tbl);
4420- __ cc20_quarter_round (workSt[2 ], workSt[6 ], workSt[10 ], workSt[14 ],
4421- scratch, lrot8Tbl);
4422- __ cc20_quarter_round (workSt[3 ], workSt[7 ], workSt[11 ], workSt[15 ],
4423- scratch, lrot8Tbl);
4424-
4425- __ cc20_quarter_round (workSt[0 ], workSt[5 ], workSt[10 ], workSt[15 ],
4426- scratch, lrot8Tbl);
4427- __ cc20_quarter_round (workSt[1 ], workSt[6 ], workSt[11 ], workSt[12 ],
4428- scratch, lrot8Tbl);
4429- __ cc20_quarter_round (workSt[2 ], workSt[7 ], workSt[8 ], workSt[13 ],
4430- scratch, lrot8Tbl);
4431- __ cc20_quarter_round (workSt[3 ], workSt[4 ], workSt[9 ], workSt[14 ],
4432- scratch, lrot8Tbl);
4458+ __ BIND (L_Q_twoRounds);
4459+
4460+ // The first set of operations on the vectors covers the first 4 quarter
4461+ // round operations:
4462+ // Qround(state, 0, 4, 8,12)
4463+ // Qround(state, 1, 5, 9,13)
4464+ // Qround(state, 2, 6,10,14)
4465+ // Qround(state, 3, 7,11,15)
4466+ __ cc20_quarter_round (a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl);
4467+ __ cc20_quarter_round (a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl);
4468+ __ cc20_quarter_round (a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl);
4469+ __ cc20_quarter_round (a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl);
4470+
4471+ // Shuffle the b1Vec/c1Vec/d1Vec to reorganize the state vectors to
4472+ // diagonals. The a1Vec does not need to change orientation.
4473+ __ cc20_shift_lane_org (b1Vec, c1Vec, d1Vec, true );
4474+ __ cc20_shift_lane_org (b2Vec, c2Vec, d2Vec, true );
4475+ __ cc20_shift_lane_org (b3Vec, c3Vec, d3Vec, true );
4476+ __ cc20_shift_lane_org (b4Vec, c4Vec, d4Vec, true );
4477+
4478+ // The second set of operations on the vectors covers the second 4 quarter
4479+ // round operations, now acting on the diagonals:
4480+ // Qround(state, 0, 5,10,15)
4481+ // Qround(state, 1, 6,11,12)
4482+ // Qround(state, 2, 7, 8,13)
4483+ // Qround(state, 3, 4, 9,14)
4484+ __ cc20_quarter_round (a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl);
4485+ __ cc20_quarter_round (a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl);
4486+ __ cc20_quarter_round (a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl);
4487+ __ cc20_quarter_round (a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl);
4488+
4489+ // Before we start the next iteration, we need to perform shuffles
4490+ // on the b/c/d vectors to move them back to columnar organizations
4491+ // from their current diagonal orientation.
4492+ __ cc20_shift_lane_org (b1Vec, c1Vec, d1Vec, false );
4493+ __ cc20_shift_lane_org (b2Vec, c2Vec, d2Vec, false );
4494+ __ cc20_shift_lane_org (b3Vec, c3Vec, d3Vec, false );
4495+ __ cc20_shift_lane_org (b4Vec, c4Vec, d4Vec, false );
44334496
44344497 // Decrement and iterate
44354498 __ sub (loopCtr, loopCtr, 1 );
4436- __ cbnz (loopCtr, L_twoRounds);
4437-
4438- __ mov (tmpAddr, state);
4439-
4440- // Add the starting state back to the post-loop keystream
4441- // state. We read/interlace the state array from memory into
4442- // 4 registers similar to what we did in the beginning. Then
4443- // add the counter overlay onto workSt[12] at the end.
4444- for (i = 0 ; i < 16 ; i += 4 ) {
4445- __ ld4r (stateFirst, stateSecond, stateThird, stateFourth, __ T4S,
4446- __ post (tmpAddr, 16 ));
4447- __ addv (workSt[i], __ T4S, workSt[i], stateFirst);
4448- __ addv (workSt[i + 1 ], __ T4S, workSt[i + 1 ], stateSecond);
4449- __ addv (workSt[i + 2 ], __ T4S, workSt[i + 2 ], stateThird);
4450- __ addv (workSt[i + 3 ], __ T4S, workSt[i + 3 ], stateFourth);
4451- }
4452- __ addv (workSt[12 ], __ T4S, workSt[12 ], origCtrState); // Add ctr mask
4453-
4454- // Write to key stream, storing the same element out of workSt[0..15]
4455- // to consecutive 4-byte offsets in the key stream buffer, then repeating
4456- // for the next element position.
4457- for (i = 0 ; i < 4 ; i++) {
4458- for (j = 0 ; j < 16 ; j += 4 ) {
4459- __ st4 (workSt[j], workSt[j + 1 ], workSt[j + 2 ], workSt[j + 3 ], __ S, i,
4460- __ post (keystream, 16 ));
4461- }
4462- }
4499+ __ cbnz (loopCtr, L_Q_twoRounds);
4500+
4501+ // Once the counter reaches zero, we fall out of the loop
4502+ // and need to add the initial state back into each of the four
4503+ // working states (a/b/c/d1Vec through a/b/c/d4Vec). Adding the add mask
4504+ // into dState is destructive, but dState is not needed afterwards.
4505+ __ addv (a1Vec, __ T4S, a1Vec, aState);
4506+ __ addv (b1Vec, __ T4S, b1Vec, bState);
4507+ __ addv (c1Vec, __ T4S, c1Vec, cState);
4508+ __ addv (d1Vec, __ T4S, d1Vec, dState);
4509+
4510+ __ addv (a2Vec, __ T4S, a2Vec, aState);
4511+ __ addv (b2Vec, __ T4S, b2Vec, bState);
4512+ __ addv (c2Vec, __ T4S, c2Vec, cState);
4513+ __ addv (dState, __ T4S, dState, addMask);
4514+ __ addv (d2Vec, __ T4S, d2Vec, dState);
4515+
4516+ __ addv (a3Vec, __ T4S, a3Vec, aState);
4517+ __ addv (b3Vec, __ T4S, b3Vec, bState);
4518+ __ addv (c3Vec, __ T4S, c3Vec, cState);
4519+ __ addv (dState, __ T4S, dState, addMask);
4520+ __ addv (d3Vec, __ T4S, d3Vec, dState);
4521+
4522+ __ addv (a4Vec, __ T4S, a4Vec, aState);
4523+ __ addv (b4Vec, __ T4S, b4Vec, bState);
4524+ __ addv (c4Vec, __ T4S, c4Vec, cState);
4525+ __ addv (dState, __ T4S, dState, addMask);
4526+ __ addv (d4Vec, __ T4S, d4Vec, dState);
4527+
4528+ // Write the final state back to the result buffer
4529+ __ st1 (a1Vec, b1Vec, c1Vec, d1Vec, __ T16B, __ post (keystream, 64 ));
4530+ __ st1 (a2Vec, b2Vec, c2Vec, d2Vec, __ T16B, __ post (keystream, 64 ));
4531+ __ st1 (a3Vec, b3Vec, c3Vec, d3Vec, __ T16B, __ post (keystream, 64 ));
4532+ __ st1 (a4Vec, b4Vec, c4Vec, d4Vec, __ T16B, __ post (keystream, 64 ));
44634533
44644534 __ mov (r0, 256 ); // Return length of output keystream
44654535 __ leave ();
@@ -8866,7 +8936,7 @@ class StubGenerator: public StubCodeGenerator {
88668936#endif // COMPILER2
88678937
88688938 if (UseChaCha20Intrinsics) {
8869- StubRoutines::_chacha20Block = generate_chacha20Block_blockpar ();
8939+ StubRoutines::_chacha20Block = generate_chacha20Block_qrpar ();
88708940 }
88718941
88728942 if (UseBASE64Intrinsics) {
0 commit comments