@@ -2489,41 +2489,80 @@ static void float_to_float16_v_slow_path(C2_MacroAssembler& masm,
24892489#define __ masm.
24902490 VectorRegister dst = stub.data <0 >();
24912491 VectorRegister src = stub.data <1 >();
2492- VectorRegister tmp = stub.data <2 >();
2492+ VectorRegister vtmp = stub.data <2 >();
2493+ assert_different_registers (dst, src, vtmp);
2494+
24932495 __ bind (stub.entry ());
24942496
2497+ // Active elements (NaNs) are marked in v0 mask register.
24952498 // LMUL is already set to mf2 in float_to_float16_v.
24962499
2497- // preserve the payloads of non-canonical NaNs.
2498- __ vnsra_wi (dst, src, 13 , Assembler::v0_t );
2499-
2500- // preserve the sign bit.
2501- __ vnsra_wi (tmp, src, 26 , Assembler::v0_t );
2502- __ vsll_vi (tmp, tmp, 10 , Assembler::v0_t );
2503- __ mv (t0, 0x3ff );
2504- __ vor_vx (tmp, tmp, t0, Assembler::v0_t );
2505-
2506- // get the result by merging sign bit and payloads of preserved non-canonical NaNs.
2507- __ vand_vv (dst, dst, tmp, Assembler::v0_t );
2500+ // Float (32 bits)
2501+ // Bit: 31 30 to 23 22 to 0
2502+ // +---+------------------+-----------------------------+
2503+ // | S | Exponent | Mantissa (Fraction) |
2504+ // +---+------------------+-----------------------------+
2505+ // 1 bit 8 bits 23 bits
2506+ //
2507+ // Float (16 bits)
2508+ // Bit: 15 14 to 10 9 to 0
2509+ // +---+----------------+------------------+
2510+ // | S | Exponent | Mantissa |
2511+ // +---+----------------+------------------+
2512+ // 1 bit 5 bits 10 bits
2513+ const int fp_sign_bits = 1 ;
2514+ const int fp32_bits = 32 ;
2515+ const int fp32_mantissa_2nd_part_bits = 9 ;
2516+ const int fp32_mantissa_3rd_part_bits = 4 ;
2517+ const int fp16_exponent_bits = 5 ;
2518+ const int fp16_mantissa_bits = 10 ;
2519+
2520+ // preserve the sign bit and exponent, clear mantissa.
2521+ __ vnsra_wi (dst, src, fp32_bits - fp_sign_bits - fp16_exponent_bits, Assembler::v0_t );
2522+ __ vsll_vi (dst, dst, fp16_mantissa_bits, Assembler::v0_t );
2523+
2524+ // Preserve high order bit of float NaN in the
2525+ // binary16 result NaN (tenth bit); OR in remaining
2526+ // bits into lower 9 bits of binary16 significand.
2527+ // | (doppel & 0x007f_e000) >> 13 // 10 bits
2528+ // | (doppel & 0x0000_1ff0) >> 4 // 9 bits
2529+ // | (doppel & 0x0000_000f)); // 4 bits
2530+ //
2531+ // Check j.l.Float.floatToFloat16 for more information.
2532+ // 10 bits
2533+ __ vnsrl_wi (vtmp, src, fp32_mantissa_2nd_part_bits + fp32_mantissa_3rd_part_bits, Assembler::v0_t );
2534+ __ mv (t0, 0x3ff ); // retain first part of mantissa in a float 32
2535+ __ vand_vx (vtmp, vtmp, t0, Assembler::v0_t );
2536+ __ vor_vv (dst, dst, vtmp, Assembler::v0_t );
2537+ // 9 bits
2538+ __ vnsrl_wi (vtmp, src, fp32_mantissa_3rd_part_bits, Assembler::v0_t );
2539+ __ mv (t0, 0x1ff ); // retain second part of mantissa in a float 32
2540+ __ vand_vx (vtmp, vtmp, t0, Assembler::v0_t );
2541+ __ vor_vv (dst, dst, vtmp, Assembler::v0_t );
2542+ // 4 bits
2543+ // A narrowing shift is necessary to move data from 32-bit elements to 16-bit elements in the vector register.
2544+ __ vnsrl_wi (vtmp, src, 0 , Assembler::v0_t );
2545+ __ vand_vi (vtmp, vtmp, 0xf , Assembler::v0_t );
2546+ __ vor_vv (dst, dst, vtmp, Assembler::v0_t );
25082547
25092548 __ j (stub.continuation ());
25102549#undef __
25112550}
25122551
25132552// j.l.Float.floatToFloat16
2514- void C2_MacroAssembler::float_to_float16_v (VectorRegister dst, VectorRegister src, VectorRegister vtmp,
2515- Register tmp, uint vector_length) {
2553+ void C2_MacroAssembler::float_to_float16_v (VectorRegister dst, VectorRegister src,
2554+ VectorRegister vtmp, Register tmp, uint vector_length) {
25162555 assert_different_registers (dst, src, vtmp);
25172556
25182557 auto stub = C2CodeStub::make<VectorRegister, VectorRegister, VectorRegister>
2519- (dst, src, vtmp, 28 , float_to_float16_v_slow_path);
2558+ (dst, src, vtmp, 56 , float_to_float16_v_slow_path);
25202559
25212560 // On riscv, NaN needs a special process as vfncvt_f_f_w does not work in that case.
25222561
25232562 vsetvli_helper (BasicType::T_FLOAT, vector_length, Assembler::m1);
25242563
25252564 // check whether there is a NaN.
2526- // replace v_fclass with vmseq_vv as performance optimization.
2565+ // replace v_fclass with vmfne_vv as performance optimization.
25272566 vmfne_vv (v0, src, src);
25282567 vcpop_m (t0, v0);
25292568
0 commit comments