@@ -5103,6 +5103,262 @@ class StubGenerator: public StubCodeGenerator {
     return (address) start;
   }
 
+  void adler32_process_bytes(Register buff, Register s1, Register s2, VectorRegister vtable,
+    VectorRegister vzero, VectorRegister vbytes, VectorRegister vs1acc, VectorRegister vs2acc,
+    Register temp0, Register temp1, Register temp2, Register temp3,
+    VectorRegister vtemp1, VectorRegister vtemp2, int step, Assembler::LMUL lmul) {
+
+    assert((lmul == Assembler::m4 && step == 64) ||
+           (lmul == Assembler::m2 && step == 32) ||
+           (lmul == Assembler::m1 && step == 16),
+           "LMUL should be aligned with step: m4 and 64, m2 and 32 or m1 and 16");
+    // Below is the function for calculating the Adler32 checksum with a 64-, 32- or 16-byte step (LMUL = m4, m2 or m1).
+    // The results are in v12, v13, ..., v22, v23. The example below is for the 64-byte step case.
+    // We use b1, b2, ..., b64 to denote the 64 bytes loaded in each iteration.
+    // In non-vectorized code, we update s1 and s2 as:
+    //   s1 <- s1 + b1
+    //   s2 <- s2 + s1
+    //   s1 <- s1 + b2
+    //   s2 <- s2 + s1
+    //   ...
+    //   s1 <- s1 + b64
+    //   s2 <- s2 + s1
+    // Putting the above assignments together, we have:
+    //   s1_new = s1 + b1 + b2 + ... + b64
+    //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b64)
+    //          = s2 + s1 * 64 + (b1 * 64 + b2 * 63 + ... + b64 * 1)
+    //          = s2 + s1 * 64 + (b1, b2, ..., b64) dot (64, 63, ..., 1)
+
+    __ mv(temp3, step);
+    // Load data
+    __ vsetvli(temp0, temp3, Assembler::e8, lmul);
+    __ vle8_v(vbytes, buff);
+    __ addi(buff, buff, step);
+
+    // Upper bound of the reduction sum for s1_new:
+    // 0xFF * 64 = 0x3FC0, so:
+    // 1. We need to do a vector-widening reduction sum
+    // 2. It is safe to perform sign-extension during vmv.x.s with 16-bit elements
+    __ vwredsumu_vs(vs1acc, vbytes, vzero);
+    // Multiplication for s2_new
+    __ vwmulu_vv(vs2acc, vtable, vbytes);
+
+    // s2 = s2 + s1 * step
+    __ slli(temp1, s1, exact_log2(step));
+    __ add(s2, s2, temp1);
+
+    // Summing up the calculated results for s2_new
+    if (MaxVectorSize > 16) {
+      __ vsetvli(temp0, temp3, Assembler::e16, lmul);
+    } else {
+      // For vlen == 16, half of the vector-widening multiplication result lands in the
+      // successor of the vs2acc group, in which case we need to double the vector
+      // register group width in order to reduction-sum all of it
+      Assembler::LMUL lmulx2 = (lmul == Assembler::m1) ? Assembler::m2 :
+                               (lmul == Assembler::m2) ? Assembler::m4 : Assembler::m8;
+      __ vsetvli(temp0, temp3, Assembler::e16, lmulx2);
+    }
+    // Upper bound for the reduction sum:
+    // 0xFF * (64 + 63 + ... + 2 + 1) = 0x817E0 max for the whole register group, so:
+    // 1. We need to do a vector-widening reduction sum
+    // 2. It is safe to perform sign-extension during vmv.x.s with 32-bit elements
+    __ vwredsumu_vs(vtemp1, vs2acc, vzero);
+
+    // Extracting the results for:
+    // s1_new
+    __ vmv_x_s(temp0, vs1acc);
+    __ add(s1, s1, temp0);
+    // s2_new
+    __ vsetvli(temp0, temp3, Assembler::e32, Assembler::m1);
+    __ vmv_x_s(temp1, vtemp1);
+    __ add(s2, s2, temp1);
+  }
+
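A quick way to sanity-check the reformulation in the comment above, without any RVV details, is a scalar sketch. The function and names below are illustrative only and are not part of the patch: it updates (s1, s2) over one step-sized block both with the per-byte recurrence and with the sum/dot-product form the vector code uses. The bound comments correspond to the 0x3FC0 and 0x817E0 limits that justify the e16/e32 widening choices, and the weights step, step-1, ..., 1 are exactly what vid.v/vrsub.vx later materialize in the vtable registers.

#include <cassert>
#include <cstdint>

// Illustrative only: update (s1, s2) over one block of `step` bytes, once with the
// naive per-byte recurrence and once with the reformulation used by the vector code.
// Unsigned wraparound makes the two forms agree bit-for-bit.
static void adler32_block_update(const uint8_t* b, int step, uint32_t& s1, uint32_t& s2) {
  // Naive form: s1 += b[i]; s2 += s1; for every byte.
  uint32_t n1 = s1, n2 = s2;
  for (int i = 0; i < step; i++) {
    n1 += b[i];
    n2 += n1;
  }

  // Reformulated form: s1 += sum(b); s2 += s1_old * step + dot(b, {step, step-1, ..., 1}).
  uint32_t sum = 0, dot = 0;
  for (int i = 0; i < step; i++) {
    sum += b[i];                         // <= 0xFF * 64 = 0x3FC0, so 16-bit accumulators suffice
    dot += (uint32_t)(step - i) * b[i];  // <= 0xFF * (64 + 63 + ... + 1) = 0x817E0, so 32-bit suffices
  }
  uint32_t r1 = s1 + sum;
  uint32_t r2 = s2 + s1 * (uint32_t)step + dot;

  assert(n1 == r1 && n2 == r2);
  s1 = r1;
  s2 = r2;
}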
+  /**
+   * int java.util.zip.Adler32.updateBytes(int adler, byte[] b, int off, int len)
+   *
+   * Arguments:
+   *
+   * Inputs:
+   *   c_rarg0 - int adler
+   *   c_rarg1 - byte* buff (b + off)
+   *   c_rarg2 - int len
+   *
+   * Output:
+   *   c_rarg0 - int adler result
+   */
+  address generate_updateBytesAdler32() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
+    address start = __ pc();
+
+    Label L_nmax, L_nmax_loop, L_nmax_loop_entry, L_by16, L_by16_loop,
+          L_by16_loop_unroll, L_by1_loop, L_do_mod, L_combine, L_by1;
+
+    // Aliases
+    Register adler = c_rarg0;
+    Register s1    = c_rarg0;
+    Register s2    = c_rarg3;
+    Register buff  = c_rarg1;
+    Register len   = c_rarg2;
+    Register nmax  = c_rarg4;
+    Register base  = c_rarg5;
+    Register count = c_rarg6;
+    Register temp0 = x28; // t3
+    Register temp1 = x29; // t4
+    Register temp2 = x30; // t5
+    Register temp3 = x31; // t6
+
+    VectorRegister vzero     = v31;
+    VectorRegister vbytes    = v8;  // group: v8, v9, v10, v11
+    VectorRegister vs1acc    = v12; // group: v12, v13, v14, v15
+    VectorRegister vs2acc    = v16; // group: v16, v17, v18, v19, v20, v21, v22, v23
+    VectorRegister vtable_64 = v24; // group: v24, v25, v26, v27
+    VectorRegister vtable_32 = v4;  // group: v4, v5
+    VectorRegister vtable_16 = v30;
+    VectorRegister vtemp1    = v28;
+    VectorRegister vtemp2    = v29;
+
+    // Max number of bytes we can process before having to take the mod
+    // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
+    const uint64_t BASE = 0xfff1;
+    const uint64_t NMAX = 0x15B0;
+
+    // Loop steps
+    int step_64 = 64;
+    int step_32 = 32;
+    int step_16 = 16;
+    int step_1  = 1;
+
+    __ enter(); // Required for proper stackwalking of RuntimeStub frame
+    __ mv(temp1, 64);
+    __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m4);
+
+    // Generating accumulation coefficients for further calculations
+    // vtable_64:
+    __ vid_v(vtemp1);
+    __ vrsub_vx(vtable_64, vtemp1, temp1);
+    // vtable_64 group now contains { 0x40, 0x3f, 0x3e, ..., 0x3, 0x2, 0x1 }
+
+    // vtable_32:
+    __ mv(temp1, 32);
+    __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m2);
+    __ vid_v(vtemp1);
+    __ vrsub_vx(vtable_32, vtemp1, temp1);
+    // vtable_32 group now contains { 0x20, 0x1f, 0x1e, ..., 0x3, 0x2, 0x1 }
+
+    // vtable_16:
+    __ vsetivli(temp0, 16, Assembler::e8, Assembler::m1);
+    __ mv(temp1, 16);
+    __ vid_v(vtemp1);
+    __ vrsub_vx(vtable_16, vtemp1, temp1);
+    // vtable_16 now contains { 0x10, 0xf, 0xe, ..., 0x3, 0x2, 0x1 }
+
+    __ vmv_v_i(vzero, 0);
+
+    __ mv(base, BASE);
+    __ mv(nmax, NMAX);
+
+    // s1 is initialized to the lower 16 bits of adler
+    // s2 is initialized to the upper 16 bits of adler
+    __ srliw(s2, adler, 16);       // s2 = ((adler >> 16) & 0xffff)
+    __ zero_extend(s1, adler, 16); // s1 = (adler & 0xffff)
+
+    // The pipelined loop needs at least 16 elements for one iteration.
+    // It does check this, but it is more efficient to skip to the cleanup loop here.
+    __ mv(temp0, step_16);
+    __ bgeu(len, temp0, L_nmax);
+    __ beqz(len, L_combine);
+
+    // Jumping to L_by1_loop
+    __ sub(len, len, step_1);
+    __ j(L_by1_loop);
+
+    __ bind(L_nmax);
+    __ sub(len, len, nmax);
+    __ sub(count, nmax, 16);
+    __ bltz(len, L_by16);
+
+    // Align L_nmax loop by 64
+    __ bind(L_nmax_loop_entry);
+    __ sub(count, count, 32);
+
+    __ bind(L_nmax_loop);
+    adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
+      vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
+      vtemp1, vtemp2, step_64, Assembler::m4);
+    __ sub(count, count, step_64);
+    __ bgtz(count, L_nmax_loop);
+
+    // 48 bytes of the current NMAX block remain (three 16-byte iterations):
+    // process them as one 32-byte and one 16-byte step
+    adler32_process_bytes(buff, s1, s2, vtable_32, vzero,
+      vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
+      vtemp1, vtemp2, step_32, Assembler::m2);
+    adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
+      vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
+      vtemp1, vtemp2, step_16, Assembler::m1);
+
+    // s1 = s1 % BASE
+    __ remuw(s1, s1, base);
+    // s2 = s2 % BASE
+    __ remuw(s2, s2, base);
+
+    __ sub(len, len, nmax);
+    __ sub(count, nmax, 16);
+    __ bgez(len, L_nmax_loop_entry);
+
+    __ bind(L_by16);
+    __ add(len, len, count);
+    __ bltz(len, L_by1);
+    // Trying to unroll
+    __ mv(temp3, step_64);
+    __ blt(len, temp3, L_by16_loop);
+
+    __ bind(L_by16_loop_unroll);
+    adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
+      vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
+      vtemp1, vtemp2, step_64, Assembler::m4);
+    __ sub(len, len, step_64);
+    // temp3 still holds 64 at this point
+    __ bge(len, temp3, L_by16_loop_unroll);
+
+    __ bind(L_by16_loop);
+    adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
+      vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
+      vtemp1, vtemp2, step_16, Assembler::m1);
+    __ sub(len, len, step_16);
+    __ bgez(len, L_by16_loop);
+
+    __ bind(L_by1);
+    __ add(len, len, 15);
+    __ bltz(len, L_do_mod);
+
+    __ bind(L_by1_loop);
+    __ lbu(temp0, Address(buff, 0));
+    __ addi(buff, buff, step_1);
+    __ add(s1, temp0, s1);
+    __ add(s2, s2, s1);
+    __ sub(len, len, step_1);
+    __ bgez(len, L_by1_loop);
+
+    __ bind(L_do_mod);
+    // s1 = s1 % BASE
+    __ remuw(s1, s1, base);
+    // s2 = s2 % BASE
+    __ remuw(s2, s2, base);
+
+    // Combine lower bits and higher bits
+    // adler = s1 | (s2 << 16)
+    __ bind(L_combine);
+    __ slli(s2, s2, 16);
+    __ orr(s1, s1, s2);
+
+    __ leave(); // Required for proper stackwalking of RuntimeStub frame
+    __ ret();
+
+    return start;
+  }
+
 #endif // COMPILER2_OR_JVMCI
 
 #ifdef COMPILER2
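The NMAX comment inside generate_updateBytesAdler32 states the bound that lets the stub defer the remuw reductions until a whole 5552-byte block has been accumulated. As a side check, and purely for illustration (the constexpr names below are invented, not part of the patch), the bound can be verified mechanically: 5552 is the last n for which the worst-case unreduced s2 still fits in 32 bits.

#include <cstdint>

// Hypothetical compile-time check, for illustration only.
constexpr uint64_t kBase = 0xfff1;   // 65521, largest prime below 2^16
constexpr uint64_t kNmax = 0x15B0;   // 5552

// Worst-case value of s2 after n unreduced bytes of 0xFF, starting with s1, s2 <= BASE - 1:
// s2 <= (BASE - 1) + n * (BASE - 1) + 255 * n * (n + 1) / 2, i.e. the expression in the comment.
constexpr uint64_t adler32_worst_s2(uint64_t n) {
  return 255 * n * (n + 1) / 2 + (n + 1) * (kBase - 1);
}

static_assert(adler32_worst_s2(kNmax)     <= UINT32_MAX, "an NMAX block fits in 32 bits");
static_assert(adler32_worst_s2(kNmax + 1) >  UINT32_MAX, "NMAX is the largest such n");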
@@ -5746,6 +6002,10 @@ static const int64_t right_3_bits = right_n_bits(3);
       StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
     }
 
+    if (UseAdler32Intrinsics) {
+      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
+    }
+
 #endif // COMPILER2_OR_JVMCI
   }
 
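The second hunk only installs the stub when UseAdler32Intrinsics is set; how that flag gets its riscv default is outside this diff. Since the stub relies on RVV instructions, the enabling logic presumably lives in the CPU-feature setup. The following is a purely hypothetical sketch of that kind of guard; the UseRVV condition and the warning text are assumptions, not the actual vm_version_riscv.cpp change.

// Hypothetical sketch only: gate the flag on vector support during CPU feature initialization.
if (UseRVV) {
  if (FLAG_IS_DEFAULT(UseAdler32Intrinsics)) {
    FLAG_SET_DEFAULT(UseAdler32Intrinsics, true);
  }
} else if (UseAdler32Intrinsics) {
  warning("Adler32 intrinsic requires RVV instructions (not available on this CPU)");
  FLAG_SET_DEFAULT(UseAdler32Intrinsics, false);
}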