diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.cpp b/src/hotspot/cpu/x86/macroAssembler_x86.cpp
index 44f1a35d443ab..c63065dd5e657 100644
--- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp
@@ -9242,10 +9242,109 @@ void MacroAssembler::fill64(Register dst, int disp, XMMRegister xmm, bool use64b
   fill64(Address(dst, disp), xmm, use64byteVector);
 }
 
+// Fill the trailing 'length' elements (each (1 << shift) bytes wide) at
+// [dst + disp] from xmm with progressively narrower unmasked stores
+// (32/16/8/4/2/1 bytes). xmm is expected to hold the broadcast fill value.
+// Precondition: at most 32 bytes remain, i.e. length <= (32 >> shift).
+// Clobbers dst and length (both are advanced as bytes are stored) and temp.
+void MacroAssembler::fill32_tail(uint shift, Register dst, int disp, XMMRegister xmm,
+                                 Register length, Register temp) {
+  // This stub assumes that fill size <= 32 bytes (i.e. length <= (32 >> shift))
+  assert(MaxVectorSize >= 32, "vector length should be >= 32");
+  Label L16, L8, L4, L2, L1, L_done;
+  // Fastpath for fill size <= 4 bytes
+  cmpq(length, 4 >> shift);
+  jcc(Assembler::lessEqual, L4);
+
+  // 32-byte store
+  cmpq(length, 32 >> shift);
+  jcc(Assembler::less, L16);
+  vmovdqu(Address(dst, disp), xmm);
+  addq(dst, 32);
+  subq(length, 32 >> shift);
+  jcc(Assembler::equal, L_done);
+
+  // 16-byte store
+  bind(L16);
+  cmpq(length, 16 >> shift);
+  jcc(Assembler::less, L8);
+  movdqu(Address(dst, disp), xmm);
+  addq(dst, 16);
+  subq(length, 16 >> shift);
+
+  // 8-byte store
+  bind(L8);
+  cmpq(length, 8 >> shift);
+  jcc(Assembler::less, L4);
+  movq(Address(dst, disp), xmm);
+  addq(dst, 8);
+  subq(length, 8 >> shift);
+
+  // Stores narrower than one element are only emitted when (N >> shift) != 0.
+  // Otherwise the guard would compare length against 0, "less" would never be
+  // taken, and the store would always run and write past the filled region.
+
+  // 4-byte store
+  bind(L4);
+  if ((4 >> shift) != 0) {
+    // temp holds low 8 bytes of xmm for 4/2/1B stores
+    movq(temp, xmm);
+    cmpq(length, 4 >> shift);
+    jcc(Assembler::less, L2);
+    movl(Address(dst, disp), temp);
+    addq(dst, 4);
+    subq(length, 4 >> shift);
+  }
+
+  // 2-byte store
+  bind(L2);
+  if ((2 >> shift) != 0) {
+    cmpq(length, 2 >> shift);
+    jcc(Assembler::less, L1);
+    movw(Address(dst, disp), temp);
+    addq(dst, 2);
+    subq(length, 2 >> shift);
+  }
+
+  // 1-byte store
+  bind(L1);
+  if (shift == 0) {
+    testq(length, length);
+    jcc(Assembler::zero, L_done);
+    movb(Address(dst, disp), temp);
+  }
+
+  bind(L_done);
+}
+
+// Fill the trailing 'length' elements using one optional 32-byte store
+// followed by fill32_tail. Expects length <= (64 >> shift), because
+// fill32_tail handles at most 32 bytes after that store. The fill32_tail
+// sequence is emitted once per branch; it clobbers dst, length and temp.
+void MacroAssembler::fill64_tail(uint shift, Register dst, int disp,
+                                 XMMRegister xmm, Register length,
+                                 Register temp) {
+  assert(MaxVectorSize >= 32, "vector length should be >= 32");
+  Label L32, L_exit;
+  // Check if size > 32B
+  cmpq(length, 32 >> shift);
+  jcc(Assembler::lessEqual, L32);
+  fill32(dst, disp, xmm);
+  subq(length, 32 >> shift);
+  fill32_tail(shift, dst, disp + 32, xmm, length, temp);
+  jmp(L_exit);
+  // Size <= 32B
+  bind(L32);
+  fill32_tail(shift, dst, disp, xmm, length, temp);
+  bind(L_exit);
+}
+
 void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register value,
                                         Register count, Register rtmp, XMMRegister xtmp) {
   Label L_exit;
   Label L_fill_start;
+  Label L_fill_32_tail;
+  Label L_fill_64_tail;
   Label L_fill_64_bytes;
   Label L_fill_96_bytes;
   Label L_fill_128_bytes;
@@ -9274,42 +9373,43 @@ void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register va
 
   if ((avx3threshold != 0) || (MaxVectorSize == 32)) {
 
-    if (MaxVectorSize == 64) {
-      cmpq(count, avx3threshold >> shift);
-      jcc(Assembler::greater, L_fill_zmm_sequence);
-    }
-
     evpbroadcast(type, xtmp, value, Assembler::AVX_256bit);
 
     bind(L_fill_start);
 
     cmpq(count, 32 >> shift);
-    jccb(Assembler::greater, L_fill_64_bytes);
-    fill32_masked(shift, to, 0, xtmp, k2, count, rtmp);
-    jmp(L_exit);
+    jcc(Assembler::lessEqual, L_fill_32_tail);
 
     bind(L_fill_64_bytes);
+
+    if (MaxVectorSize == 64) {
+      cmpq(count, avx3threshold >> shift);
+      jcc(Assembler::greater, L_fill_zmm_sequence);
+    }
+
     cmpq(count, 64 >> shift);
     jccb(Assembler::greater, L_fill_96_bytes);
-    fill64_masked(shift, to, 0, xtmp, k2, count, rtmp);
-    jmp(L_exit);
+    fill32(to, 0, xtmp);
+    addptr(to, 32);
+    subq(count, 32 >> shift);
+    jmp(L_fill_32_tail);
 
     bind(L_fill_96_bytes);
     cmpq(count, 96 >> shift);
     jccb(Assembler::greater, L_fill_128_bytes);
     fill64(to, 0, xtmp);
+    addptr(to, 64);
     subq(count, 64 >> shift);
-    fill32_masked(shift, to, 64, xtmp, k2, count, rtmp);
-    jmp(L_exit);
+    jmp(L_fill_32_tail);
 
     bind(L_fill_128_bytes);
     cmpq(count, 128 >> shift);
     jccb(Assembler::greater, L_fill_128_bytes_loop_pre_header);
     fill64(to, 0, xtmp);
     fill32(to, 64, xtmp);
+    addptr(to, 96);
     subq(count, 96 >> shift);
-    fill32_masked(shift, to, 96, xtmp, k2, count, rtmp);
-    jmp(L_exit);
+    jmp(L_fill_32_tail);
 
     bind(L_fill_128_bytes_loop_pre_header);
     {
@@ -9361,25 +9461,24 @@ void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register va
     bind(L_fill_start_zmm_sequence);
     cmpq(count, 64 >> shift);
     jccb(Assembler::greater, L_fill_128_bytes_zmm);
-    fill64_masked(shift, to, 0, xtmp, k2, count, rtmp, true);
-    jmp(L_exit);
+    jmp(L_fill_64_tail);
 
     bind(L_fill_128_bytes_zmm);
     cmpq(count, 128 >> shift);
     jccb(Assembler::greater, L_fill_192_bytes_zmm);
     fill64(to, 0, xtmp, true);
+    addptr(to, 64);
     subq(count, 64 >> shift);
-    fill64_masked(shift, to, 64, xtmp, k2, count, rtmp, true);
-    jmp(L_exit);
+    jmp(L_fill_64_tail);
 
     bind(L_fill_192_bytes_zmm);
     cmpq(count, 192 >> shift);
     jccb(Assembler::greater, L_fill_192_bytes_loop_pre_header_zmm);
     fill64(to, 0, xtmp, true);
     fill64(to, 64, xtmp, true);
+    addptr(to, 128);
     subq(count, 128 >> shift);
-    fill64_masked(shift, to, 128, xtmp, k2, count, rtmp, true);
-    jmp(L_exit);
+    jmp(L_fill_64_tail);
 
     bind(L_fill_192_bytes_loop_pre_header_zmm);
     {
@@ -9415,7 +9514,20 @@ void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register va
     addq(count, 192 >> shift);
     jcc(Assembler::zero, L_exit);
     jmp(L_fill_start_zmm_sequence);
+
+    bind(L_fill_64_tail);
+    cmpq(count, 32 >> shift);
+    jcc(Assembler::less, L_fill_32_tail);
+    fill32(to, 0, xtmp);
+    // Still testing the cmpq above; assumes fill32 emits only flag-preserving stores.
+    jcc(Assembler::equal, L_exit);
+    subq(count, 32 >> shift);
+    addptr(to, 32);
   }
+
+  bind(L_fill_32_tail);
+  fill32_tail(shift, to, 0, xtmp, count, rtmp);
+
   bind(L_exit);
 }
 #endif //COMPILER2_OR_JVMCI
diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.hpp b/src/hotspot/cpu/x86/macroAssembler_x86.hpp
index 4cecaa55345c9..f1b41e2a5a0f5 100644
--- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp
@@ -2023,10 +2023,16 @@ class MacroAssembler: public Assembler {
                      XMMRegister xmm, KRegister mask, Register length,
                      Register temp, bool use64byteVector = false);
 
+  void fill64_tail(uint shift, Register dst, int disp,
+                   XMMRegister xmm, Register length, Register temp);
+
   void fill32_masked(uint shift, Register dst, int disp,
                      XMMRegister xmm, KRegister mask, Register length,
                      Register temp);
 
+  void fill32_tail(uint shift, Register dst, int disp,
+                   XMMRegister xmm, Register length, Register temp);
+
   void fill32(Address dst, XMMRegister xmm);
 
   void fill32(Register dst, int disp, XMMRegister xmm);