From f18b385e910ecb455dfcab937eb42d38e725192a Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Thu, 20 Nov 2025 13:53:58 -0800 Subject: [PATCH 1/4] 8349452: Fix performance regression for Arrays.fill() with AVX512 --- src/hotspot/cpu/x86/macroAssembler_x86.cpp | 128 ++++++++++++++++++--- src/hotspot/cpu/x86/macroAssembler_x86.hpp | 6 + 2 files changed, 115 insertions(+), 19 deletions(-) diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.cpp b/src/hotspot/cpu/x86/macroAssembler_x86.cpp index 44f1a35d443ab..6fe600163eb97 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp @@ -9200,9 +9200,18 @@ void MacroAssembler::fill64_masked(uint shift, Register dst, int disp, assert(MaxVectorSize >= 32, "vector length should be >= 32"); const BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG}; if (!use64byteVector) { + Label L32, L_exit; + // Check if size > 32B + cmpq(length, 32 >> shift); + jcc(Assembler::lessEqual, L32); fill32(dst, disp, xmm); subptr(length, 32 >> shift); fill32_masked(shift, dst, disp + 32, xmm, mask, length, temp); + jmp(L_exit); + // Size <= 32B + bind(L32); + fill32_masked(shift, dst, disp, xmm, mask, length, temp); + bind(L_exit); } else { assert(MaxVectorSize == 64, "vector length != 64"); fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_512bit); @@ -9242,6 +9251,84 @@ void MacroAssembler::fill64(Register dst, int disp, XMMRegister xmm, bool use64b fill64(Address(dst, disp), xmm, use64byteVector); } +void MacroAssembler::fill32_unmasked(uint shift, Register dst, int disp, XMMRegister xmm, + Register length, + Register temp) { + // This stub assumes that fill size <= 32 bytes (i.e. 
length <= (32 >> shift)); length counts elements of (1 << shift) bytes, and dst, length and temp are all modified by the emitted code + assert(MaxVectorSize >= 32, "vector length should be >= 32"); + Label L16, L8, L4, L2, L1, L_done; + // Fastpath for fill size <= 4 bytes + cmpq(length, 4 >> shift); + jcc(Assembler::lessEqual, L4); + + // 32-byte store + cmpq(length, 32 >> shift); + jcc(Assembler::less, L16); + vmovdqu(Address(dst, disp), xmm); + addq(dst, 32); + subq(length, 32 >> shift); + + // 16-byte store + bind(L16); + cmpq(length, 16 >> shift); + jcc(Assembler::less, L8); + movdqu(Address(dst, disp), xmm); + addq(dst, 16); + subq(length, 16 >> shift); + + // 8-byte store + bind(L8); + cmpq(length, 8 >> shift); + jcc(Assembler::less, L4); + movq(Address(dst, disp), xmm); + addq(dst, 8); + subq(length, 8 >> shift); + + // 4-byte store (compare is clamped to one element: for shift == 3, 4 >> shift is 0 and 'less' would never skip the store) + bind(L4); + // temp holds low 8 bytes of xmm for 4/2/1B stores + movq(temp, xmm); + cmpq(length, (shift < 3) ? (4 >> shift) : 1); + jcc(Assembler::less, L2); + movl(Address(dst, disp), temp); + addq(dst, 4); + subq(length, 4 >> shift); + + // 2-byte store (compare is clamped to one element: for shift >= 2, 2 >> shift is 0 and 'less' would never skip the store, writing 2 bytes past the fill) + bind(L2); + cmpq(length, (shift < 2) ? (2 >> shift) : 1); + jcc(Assembler::less, L1); + movw(Address(dst, disp), temp); + addq(dst, 2); + subq(length, 2 >> shift); + + // 1-byte store (only taken when an element is still left over) + bind(L1); + testq(length, length); + jcc(Assembler::zero, L_done); + movb(Address(dst, disp), temp); + + bind(L_done); +} + +void MacroAssembler::fill64_unmasked(uint shift, Register dst, int disp, + XMMRegister xmm, Register length, + Register temp) { + assert(MaxVectorSize >= 32, "vector length should be >= 32"); + Label L32, L_exit; + // Check if size > 32B + cmpq(length, 32 >> shift); + jcc(Assembler::lessEqual, L32); + fill32(dst, disp, xmm); + subq(length, 32 >> shift); + fill32_unmasked(shift, dst, disp + 32, xmm, length, temp); + jmp(L_exit); + // Size <= 32B + bind(L32); + fill32_unmasked(shift, dst, disp, xmm, length, temp); + bind(L_exit); +} + void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register value, Register count, Register rtmp, XMMRegister xtmp) { Label L_exit; @@ -9274,41 +9361,44 @@ void
MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register va if ((avx3threshold != 0) || (MaxVectorSize == 32)) { - if (MaxVectorSize == 64) { - cmpq(count, avx3threshold >> shift); - jcc(Assembler::greater, L_fill_zmm_sequence); - } - evpbroadcast(type, xtmp, value, Assembler::AVX_256bit); bind(L_fill_start); cmpq(count, 32 >> shift); - jccb(Assembler::greater, L_fill_64_bytes); - fill32_masked(shift, to, 0, xtmp, k2, count, rtmp); + jcc(Assembler::greater, L_fill_64_bytes); + fill32_unmasked(shift, to, 0, xtmp, count, rtmp); jmp(L_exit); bind(L_fill_64_bytes); + + if (MaxVectorSize == 64) { + cmpq(count, avx3threshold >> shift); + jcc(Assembler::greater, L_fill_zmm_sequence); + } + cmpq(count, 64 >> shift); - jccb(Assembler::greater, L_fill_96_bytes); - fill64_masked(shift, to, 0, xtmp, k2, count, rtmp); + jcc(Assembler::greater, L_fill_96_bytes); + fill32(to, 0, xtmp); + subq(count, 32 >> shift); + fill32_unmasked(shift, to, 32, xtmp, count, rtmp); jmp(L_exit); bind(L_fill_96_bytes); cmpq(count, 96 >> shift); - jccb(Assembler::greater, L_fill_128_bytes); + jcc(Assembler::greater, L_fill_128_bytes); fill64(to, 0, xtmp); subq(count, 64 >> shift); - fill32_masked(shift, to, 64, xtmp, k2, count, rtmp); + fill32_unmasked(shift, to, 64, xtmp, count, rtmp); jmp(L_exit); bind(L_fill_128_bytes); cmpq(count, 128 >> shift); - jccb(Assembler::greater, L_fill_128_bytes_loop_pre_header); + jcc(Assembler::greater, L_fill_128_bytes_loop_pre_header); fill64(to, 0, xtmp); fill32(to, 64, xtmp); subq(count, 96 >> shift); - fill32_masked(shift, to, 96, xtmp, k2, count, rtmp); + fill32_unmasked(shift, to, 96, xtmp, count, rtmp); jmp(L_exit); bind(L_fill_128_bytes_loop_pre_header); @@ -9360,25 +9450,25 @@ void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register va bind(L_fill_start_zmm_sequence); cmpq(count, 64 >> shift); - jccb(Assembler::greater, L_fill_128_bytes_zmm); - fill64_masked(shift, to, 0, xtmp, k2, count, rtmp, true); + 
jcc(Assembler::greater, L_fill_128_bytes_zmm); + fill64_unmasked(shift, to, 0, xtmp, count, rtmp); jmp(L_exit); bind(L_fill_128_bytes_zmm); cmpq(count, 128 >> shift); - jccb(Assembler::greater, L_fill_192_bytes_zmm); + jcc(Assembler::greater, L_fill_192_bytes_zmm); fill64(to, 0, xtmp, true); subq(count, 64 >> shift); - fill64_masked(shift, to, 64, xtmp, k2, count, rtmp, true); + fill64_unmasked(shift, to, 64, xtmp, count, rtmp); jmp(L_exit); bind(L_fill_192_bytes_zmm); cmpq(count, 192 >> shift); - jccb(Assembler::greater, L_fill_192_bytes_loop_pre_header_zmm); + jcc(Assembler::greater, L_fill_192_bytes_loop_pre_header_zmm); fill64(to, 0, xtmp, true); fill64(to, 64, xtmp, true); subq(count, 128 >> shift); - fill64_masked(shift, to, 128, xtmp, k2, count, rtmp, true); + fill64_unmasked(shift, to, 128, xtmp, count, rtmp); jmp(L_exit); bind(L_fill_192_bytes_loop_pre_header_zmm); diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.hpp b/src/hotspot/cpu/x86/macroAssembler_x86.hpp index 4cecaa55345c9..a40a4c70024da 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp @@ -2023,10 +2023,16 @@ class MacroAssembler: public Assembler { XMMRegister xmm, KRegister mask, Register length, Register temp, bool use64byteVector = false); + void fill64_unmasked(uint shift, Register dst, int disp, + XMMRegister xmm, Register length, Register temp); + void fill32_masked(uint shift, Register dst, int disp, XMMRegister xmm, KRegister mask, Register length, Register temp); + void fill32_unmasked(uint shift, Register dst, int disp, + XMMRegister xmm, Register length, Register temp); + void fill32(Address dst, XMMRegister xmm); void fill32(Register dst, int disp, XMMRegister xmm); From ee1db381bbb13d725006b44aa6e221654a599e0c Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Thu, 20 Nov 2025 17:10:46 -0800 Subject: [PATCH 2/4] undo size check for fill64_masked --- src/hotspot/cpu/x86/macroAssembler_x86.cpp | 9 --------- 1 file changed, 9 
deletions(-) diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.cpp b/src/hotspot/cpu/x86/macroAssembler_x86.cpp index 6fe600163eb97..1d452792de284 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp @@ -9200,18 +9200,9 @@ void MacroAssembler::fill64_masked(uint shift, Register dst, int disp, assert(MaxVectorSize >= 32, "vector length should be >= 32"); const BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG}; if (!use64byteVector) { - Label L32, L_exit; - // Check if size > 32B - cmpq(length, 32 >> shift); - jcc(Assembler::lessEqual, L32); fill32(dst, disp, xmm); subptr(length, 32 >> shift); fill32_masked(shift, dst, disp + 32, xmm, mask, length, temp); - jmp(L_exit); - // Size <= 32B - bind(L32); - fill32_masked(shift, dst, disp, xmm, mask, length, temp); - bind(L_exit); } else { assert(MaxVectorSize == 64, "vector length != 64"); fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_512bit); From 1371d556095012fcab8150ebdbc1ac36ba62ed80 Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Fri, 21 Nov 2025 15:50:49 -0800 Subject: [PATCH 3/4] refactor code to use fill32_tail at the end of the stub --- src/hotspot/cpu/x86/macroAssembler_x86.cpp | 53 +++++++++++++--------- src/hotspot/cpu/x86/macroAssembler_x86.hpp | 4 +- 2 files changed, 34 insertions(+), 23 deletions(-) diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.cpp b/src/hotspot/cpu/x86/macroAssembler_x86.cpp index 1d452792de284..de48577cf9151 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp @@ -9242,9 +9242,8 @@ void MacroAssembler::fill64(Register dst, int disp, XMMRegister xmm, bool use64b fill64(Address(dst, disp), xmm, use64byteVector); } -void MacroAssembler::fill32_unmasked(uint shift, Register dst, int disp, XMMRegister xmm, - Register length, - Register temp) { +void MacroAssembler::fill32_tail(uint shift, Register dst, int disp, XMMRegister xmm, + Register 
length, Register temp) { // This stub assumes that fill size <= 32 bytes (i.e. length <= (32 >> shift)) assert(MaxVectorSize >= 32, "vector length should be >= 32"); Label L16, L8, L4, L2, L1, L_done; @@ -9258,6 +9257,7 @@ void MacroAssembler::fill32_unmasked(uint shift, Register dst, int disp, XMMRegi vmovdqu(Address(dst, disp), xmm); addq(dst, 32); subq(length, 32 >> shift); + jcc(Assembler::equal, L_done); // 16-byte store bind(L16); @@ -9302,7 +9302,7 @@ void MacroAssembler::fill32_unmasked(uint shift, Register dst, int disp, XMMRegi bind(L_done); } -void MacroAssembler::fill64_unmasked(uint shift, Register dst, int disp, +void MacroAssembler::fill64_tail(uint shift, Register dst, int disp, XMMRegister xmm, Register length, Register temp) { assert(MaxVectorSize >= 32, "vector length should be >= 32"); @@ -9312,11 +9312,11 @@ void MacroAssembler::fill64_unmasked(uint shift, Register dst, int disp, jcc(Assembler::lessEqual, L32); fill32(dst, disp, xmm); subq(length, 32 >> shift); - fill32_unmasked(shift, dst, disp + 32, xmm, length, temp); + fill32_tail(shift, dst, disp + 32, xmm, length, temp); jmp(L_exit); // Size <= 32B bind(L32); - fill32_unmasked(shift, dst, disp, xmm, length, temp); + fill32_tail(shift, dst, disp, xmm, length, temp); bind(L_exit); } @@ -9324,6 +9324,8 @@ void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register va Register count, Register rtmp, XMMRegister xtmp) { Label L_exit; Label L_fill_start; + Label L_fill_32_tail; + Label L_fill_64_tail; Label L_fill_64_bytes; Label L_fill_96_bytes; Label L_fill_128_bytes; @@ -9357,9 +9359,7 @@ void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register va bind(L_fill_start); cmpq(count, 32 >> shift); - jcc(Assembler::greater, L_fill_64_bytes); - fill32_unmasked(shift, to, 0, xtmp, count, rtmp); - jmp(L_exit); + jcc(Assembler::lessEqual, L_fill_32_tail); bind(L_fill_64_bytes); @@ -9371,26 +9371,26 @@ void MacroAssembler::generate_fill_avx3(BasicType type, 
Register to, Register va cmpq(count, 64 >> shift); jcc(Assembler::greater, L_fill_96_bytes); fill32(to, 0, xtmp); + addptr(to, 32); subq(count, 32 >> shift); - fill32_unmasked(shift, to, 32, xtmp, count, rtmp); - jmp(L_exit); + jmp(L_fill_32_tail); bind(L_fill_96_bytes); cmpq(count, 96 >> shift); jcc(Assembler::greater, L_fill_128_bytes); fill64(to, 0, xtmp); + addptr(to, 64); subq(count, 64 >> shift); - fill32_unmasked(shift, to, 64, xtmp, count, rtmp); - jmp(L_exit); + jmp(L_fill_32_tail); bind(L_fill_128_bytes); cmpq(count, 128 >> shift); jcc(Assembler::greater, L_fill_128_bytes_loop_pre_header); fill64(to, 0, xtmp); fill32(to, 64, xtmp); + addptr(to, 96); subq(count, 96 >> shift); - fill32_unmasked(shift, to, 96, xtmp, count, rtmp); - jmp(L_exit); + jmp(L_fill_32_tail); bind(L_fill_128_bytes_loop_pre_header); { @@ -9442,25 +9442,24 @@ void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register va bind(L_fill_start_zmm_sequence); cmpq(count, 64 >> shift); jcc(Assembler::greater, L_fill_128_bytes_zmm); - fill64_unmasked(shift, to, 0, xtmp, count, rtmp); - jmp(L_exit); + jmp(L_fill_64_tail); bind(L_fill_128_bytes_zmm); cmpq(count, 128 >> shift); jcc(Assembler::greater, L_fill_192_bytes_zmm); fill64(to, 0, xtmp, true); + addptr(to, 64); subq(count, 64 >> shift); - fill64_unmasked(shift, to, 64, xtmp, count, rtmp); - jmp(L_exit); + jmp(L_fill_64_tail); bind(L_fill_192_bytes_zmm); cmpq(count, 192 >> shift); jcc(Assembler::greater, L_fill_192_bytes_loop_pre_header_zmm); fill64(to, 0, xtmp, true); fill64(to, 64, xtmp, true); + addptr(to, 128); subq(count, 128 >> shift); - fill64_unmasked(shift, to, 128, xtmp, count, rtmp); - jmp(L_exit); + jmp(L_fill_64_tail); bind(L_fill_192_bytes_loop_pre_header_zmm); { @@ -9496,7 +9495,19 @@ void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register va addq(count, 192 >> shift); jcc(Assembler::zero, L_exit); jmp(L_fill_start_zmm_sequence); + + bind(L_fill_64_tail); + cmpq(count, 32 >> shift); + 
jcc(Assembler::less, L_fill_32_tail); + fill32(to, 0, xtmp); + jcc(Assembler::equal, L_exit); + subq(count, 32 >> shift); + addptr(to, 32); } + + bind(L_fill_32_tail); + fill32_tail(shift, to, 0, xtmp, count, rtmp); + bind(L_exit); } #endif //COMPILER2_OR_JVMCI diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.hpp b/src/hotspot/cpu/x86/macroAssembler_x86.hpp index a40a4c70024da..f1b41e2a5a0f5 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp @@ -2023,14 +2023,14 @@ class MacroAssembler: public Assembler { XMMRegister xmm, KRegister mask, Register length, Register temp, bool use64byteVector = false); - void fill64_unmasked(uint shift, Register dst, int disp, + void fill64_tail(uint shift, Register dst, int disp, XMMRegister xmm, Register length, Register temp); void fill32_masked(uint shift, Register dst, int disp, XMMRegister xmm, KRegister mask, Register length, Register temp); - void fill32_unmasked(uint shift, Register dst, int disp, + void fill32_tail(uint shift, Register dst, int disp, XMMRegister xmm, Register length, Register temp); void fill32(Address dst, XMMRegister xmm); From 57dc6c4ae41727c630ca16eba33d3ac7c89695eb Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Fri, 21 Nov 2025 15:58:13 -0800 Subject: [PATCH 4/4] undo jccb to jcc change as needed --- src/hotspot/cpu/x86/macroAssembler_x86.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.cpp b/src/hotspot/cpu/x86/macroAssembler_x86.cpp index de48577cf9151..c63065dd5e657 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp @@ -9369,7 +9369,7 @@ void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register va } cmpq(count, 64 >> shift); - jcc(Assembler::greater, L_fill_96_bytes); + jccb(Assembler::greater, L_fill_96_bytes); fill32(to, 0, xtmp); addptr(to, 32); subq(count, 32 >> shift); @@ -9377,7 +9377,7 @@ 
void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register va bind(L_fill_96_bytes); cmpq(count, 96 >> shift); - jcc(Assembler::greater, L_fill_128_bytes); + jccb(Assembler::greater, L_fill_128_bytes); fill64(to, 0, xtmp); addptr(to, 64); subq(count, 64 >> shift); @@ -9385,7 +9385,7 @@ void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register va bind(L_fill_128_bytes); cmpq(count, 128 >> shift); - jcc(Assembler::greater, L_fill_128_bytes_loop_pre_header); + jccb(Assembler::greater, L_fill_128_bytes_loop_pre_header); fill64(to, 0, xtmp); fill32(to, 64, xtmp); addptr(to, 96); @@ -9441,12 +9441,12 @@ void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register va bind(L_fill_start_zmm_sequence); cmpq(count, 64 >> shift); - jcc(Assembler::greater, L_fill_128_bytes_zmm); + jccb(Assembler::greater, L_fill_128_bytes_zmm); jmp(L_fill_64_tail); bind(L_fill_128_bytes_zmm); cmpq(count, 128 >> shift); - jcc(Assembler::greater, L_fill_192_bytes_zmm); + jccb(Assembler::greater, L_fill_192_bytes_zmm); fill64(to, 0, xtmp, true); addptr(to, 64); subq(count, 64 >> shift); @@ -9454,7 +9454,7 @@ void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register va bind(L_fill_192_bytes_zmm); cmpq(count, 192 >> shift); - jcc(Assembler::greater, L_fill_192_bytes_loop_pre_header_zmm); + jccb(Assembler::greater, L_fill_192_bytes_loop_pre_header_zmm); fill64(to, 0, xtmp, true); fill64(to, 64, xtmp, true); addptr(to, 128);