@@ -2491,41 +2491,80 @@ static void float_to_float16_v_slow_path(C2_MacroAssembler& masm,
24912491#define __ masm.
24922492 VectorRegister dst = stub.data <0 >();
24932493 VectorRegister src = stub.data <1 >();
2494- VectorRegister tmp = stub.data <2 >();
2494+ VectorRegister vtmp = stub.data <2 >();
2495+ assert_different_registers (dst, src, vtmp);
2496+
24952497 __ bind (stub.entry ());
24962498
2499+ // Active elements (NaNs) are marked in v0 mask register.
24972500 // LMUL is already set to mf2 in float_to_float16_v.
24982501
2499- // preserve the payloads of non-canonical NaNs.
2500- __ vnsra_wi (dst, src, 13 , Assembler::v0_t );
2501-
2502- // preserve the sign bit.
2503- __ vnsra_wi (tmp, src, 26 , Assembler::v0_t );
2504- __ vsll_vi (tmp, tmp, 10 , Assembler::v0_t );
2505- __ mv (t0, 0x3ff );
2506- __ vor_vx (tmp, tmp, t0, Assembler::v0_t );
2507-
2508- // get the result by merging sign bit and payloads of preserved non-canonical NaNs.
2509- __ vand_vv (dst, dst, tmp, Assembler::v0_t );
2502+ // Float (32 bits)
2503+ // Bit: 31 30 to 23 22 to 0
2504+ // +---+------------------+-----------------------------+
2505+ // | S | Exponent | Mantissa (Fraction) |
2506+ // +---+------------------+-----------------------------+
2507+ // 1 bit 8 bits 23 bits
2508+ //
2509+ // Float (16 bits)
2510+ // Bit: 15 14 to 10 9 to 0
2511+ // +---+----------------+------------------+
2512+ // | S | Exponent | Mantissa |
2513+ // +---+----------------+------------------+
2514+ // 1 bit 5 bits 10 bits
2515+ const int fp_sign_bits = 1 ;
2516+ const int fp32_bits = 32 ;
2517+ const int fp32_mantissa_2nd_part_bits = 9 ;
2518+ const int fp32_mantissa_3rd_part_bits = 4 ;
2519+ const int fp16_exponent_bits = 5 ;
2520+ const int fp16_mantissa_bits = 10 ;
2521+
2522+ // preserve the sign bit and exponent, clear mantissa.
2523+ __ vnsra_wi (dst, src, fp32_bits - fp_sign_bits - fp16_exponent_bits, Assembler::v0_t );
2524+ __ vsll_vi (dst, dst, fp16_mantissa_bits, Assembler::v0_t );
2525+
2526+ // Preserve high order bit of float NaN in the
2527+ // binary16 result NaN (tenth bit); OR in remaining
2528+ // bits into lower 9 bits of binary 16 significand.
2529+ // | (doppel & 0x007f_e000) >> 13 // 10 bits
2530+ // | (doppel & 0x0000_1ff0) >> 4 // 9 bits
2531+ // | (doppel & 0x0000_000f)); // 4 bits
2532+ //
2533+ // Check j.l.Float.floatToFloat16 for more information.
2534+ // 10 bits
2535+ __ vnsrl_wi (vtmp, src, fp32_mantissa_2nd_part_bits + fp32_mantissa_3rd_part_bits, Assembler::v0_t );
2536+ __ mv (t0, 0x3ff ); // retain first part of mantissa in a float 32
2537+ __ vand_vx (vtmp, vtmp, t0, Assembler::v0_t );
2538+ __ vor_vv (dst, dst, vtmp, Assembler::v0_t );
2539+ // 9 bits
2540+ __ vnsrl_wi (vtmp, src, fp32_mantissa_3rd_part_bits, Assembler::v0_t );
2541+ __ mv (t0, 0x1ff ); // retain second part of mantissa in a float 32
2542+ __ vand_vx (vtmp, vtmp, t0, Assembler::v0_t );
2543+ __ vor_vv (dst, dst, vtmp, Assembler::v0_t );
2544+ // 4 bits
2545+ // Narrow shift is necessary to move data from 32 bits element to 16 bits element in vector register.
2546+ __ vnsrl_wi (vtmp, src, 0 , Assembler::v0_t );
2547+ __ vand_vi (vtmp, vtmp, 0xf , Assembler::v0_t );
2548+ __ vor_vv (dst, dst, vtmp, Assembler::v0_t );
25102549
25112550 __ j (stub.continuation ());
25122551#undef __
25132552}
25142553
25152554// j.l.Float.floatToFloat16
2516- void C2_MacroAssembler::float_to_float16_v (VectorRegister dst, VectorRegister src, VectorRegister vtmp,
2517- Register tmp, uint vector_length) {
2555+ void C2_MacroAssembler::float_to_float16_v (VectorRegister dst, VectorRegister src,
2556+ VectorRegister vtmp, Register tmp, uint vector_length) {
25182557 assert_different_registers (dst, src, vtmp);
25192558
25202559 auto stub = C2CodeStub::make<VectorRegister, VectorRegister, VectorRegister>
2521- (dst, src, vtmp, 28 , float_to_float16_v_slow_path);
2560+ (dst, src, vtmp, 56 , float_to_float16_v_slow_path);
25222561
25232562 // On riscv, NaN needs a special process as vfncvt_f_f_w does not work in that case.
25242563
25252564 vsetvli_helper (BasicType::T_FLOAT, vector_length, Assembler::m1);
25262565
25272566 // check whether there is a NaN.
2528- // replace v_fclass with vmseq_vv as performance optimization.
2567+ // replace v_fclass with vmfne_vv as performance optimization.
25292568 vmfne_vv (v0, src, src);
25302569 vcpop_m (t0, v0);
25312570
0 commit comments