Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 100 additions & 19 deletions src/hotspot/cpu/x86/macroAssembler_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9242,6 +9242,84 @@ void MacroAssembler::fill64(Register dst, int disp, XMMRegister xmm, bool use64b
fill64(Address(dst, disp), xmm, use64byteVector);
}

// Emits an unmasked tail fill of at most 32 bytes at Address(dst, disp).
//
//   shift  - log2 of the element size in bytes; 'length' is counted in elements,
//            so every byte count N below is compared as (N >> shift)
//   dst    - destination base register; advanced by the size of each chunk that
//            is stored (clobbered)
//   disp   - constant displacement applied to dst on every store
//   xmm    - vector register supplying the bytes to store
//   length - number of elements left to fill; decremented as chunks are stored
//            (clobbered)
//   temp   - scratch register; receives the low 8 bytes of xmm for the
//            4/2/1-byte stores
//
// The fill is a descending cascade of 32/16/8/4/2/1-byte stores; each stage is
// taken only if at least that many bytes remain.
//
// A stage narrower than one element ((N >> shift) == 0) must not be emitted at
// all: 'cmpq(length, 0); jcc(less, ...)' never branches for a non-negative
// length, so such a stage would always store N bytes past the end of the fill
// (e.g. a stray 2-byte store for int fills, shift == 2).
//
// NOTE(review): the review thread on this change suggested renaming this helper
// to fill32_tail and using it in place of fill32_masked.
void MacroAssembler::fill32_unmasked(uint shift, Register dst, int disp, XMMRegister xmm,
                                     Register length, Register temp) {
  // This stub assumes that fill size <= 32 bytes (i.e. length <= (32 >> shift))
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  assert(shift <= 3, "element size should be <= 8 bytes");
  Label L16, L8, L4, L2, L1, L_done;

  // Fastpath for fill size <= 4 bytes
  cmpq(length, 4 >> shift);
  jcc(Assembler::lessEqual, L4);

  // 32-byte store
  cmpq(length, 32 >> shift);
  jcc(Assembler::less, L16);
  vmovdqu(Address(dst, disp), xmm);
  addq(dst, 32);
  subq(length, 32 >> shift);

  // 16-byte store
  bind(L16);
  cmpq(length, 16 >> shift);
  jcc(Assembler::less, L8);
  movdqu(Address(dst, disp), xmm);
  addq(dst, 16);
  subq(length, 16 >> shift);

  // 8-byte store
  bind(L8);
  cmpq(length, 8 >> shift);
  jcc(Assembler::less, L4);
  movq(Address(dst, disp), xmm);
  addq(dst, 8);
  subq(length, 8 >> shift);

  // 4-byte store (only possible for elements of at most 4 bytes)
  bind(L4);
  if ((4 >> shift) != 0) {
    // temp holds low 8 bytes of xmm for 4/2/1B stores
    movq(temp, xmm);
    cmpq(length, 4 >> shift);
    jcc(Assembler::less, L2);
    movl(Address(dst, disp), temp);
    addq(dst, 4);
    subq(length, 4 >> shift);
  }

  // 2-byte store (only possible for elements of at most 2 bytes)
  bind(L2);
  if ((2 >> shift) != 0) {
    cmpq(length, 2 >> shift);
    jcc(Assembler::less, L1);
    movw(Address(dst, disp), temp);
    addq(dst, 2);
    subq(length, 2 >> shift);
  }

  // 1-byte store (only possible for byte elements)
  bind(L1);
  if ((1 >> shift) != 0) {
    testq(length, length);
    jcc(Assembler::zero, L_done);
    movb(Address(dst, disp), temp);
  }

  bind(L_done);
}

void MacroAssembler::fill64_unmasked(uint shift, Register dst, int disp,
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could be called as fill64_tail. Also good to replace overall fill64_masked with fill64_tail.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please see this suggestion incorporated in the updated code.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks @vamsi-parasa. It will be also good to remove fill64_masked and fill32_masked overall.

XMMRegister xmm, Register length,
Register temp) {
assert(MaxVectorSize >= 32, "vector length should be >= 32");
Label L32, L_exit;
// Check if size > 32B
cmpq(length, 32 >> shift);
jcc(Assembler::lessEqual, L32);
fill32(dst, disp, xmm);
subq(length, 32 >> shift);
fill32_unmasked(shift, dst, disp + 32, xmm, length, temp);
jmp(L_exit);
// Size <= 32B
bind(L32);
fill32_unmasked(shift, dst, disp, xmm, length, temp);
bind(L_exit);
}

void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register value,
Register count, Register rtmp, XMMRegister xtmp) {
Label L_exit;
Expand Down Expand Up @@ -9274,41 +9352,44 @@ void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register va

if ((avx3threshold != 0) || (MaxVectorSize == 32)) {

if (MaxVectorSize == 64) {
cmpq(count, avx3threshold >> shift);
jcc(Assembler::greater, L_fill_zmm_sequence);
}

evpbroadcast(type, xtmp, value, Assembler::AVX_256bit);

bind(L_fill_start);

cmpq(count, 32 >> shift);
jccb(Assembler::greater, L_fill_64_bytes);
fill32_masked(shift, to, 0, xtmp, k2, count, rtmp);
jcc(Assembler::greater, L_fill_64_bytes);
fill32_unmasked(shift, to, 0, xtmp, count, rtmp);
jmp(L_exit);
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of repeating fill32_unmasked multiple times, you could jmp to, say, L_fill_32_tail and emit the fill32_unmasked code there just once.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please see this suggestion incorporated in the updated code.


bind(L_fill_64_bytes);

if (MaxVectorSize == 64) {
cmpq(count, avx3threshold >> shift);
jcc(Assembler::greater, L_fill_zmm_sequence);
}

cmpq(count, 64 >> shift);
jccb(Assembler::greater, L_fill_96_bytes);
fill64_masked(shift, to, 0, xtmp, k2, count, rtmp);
jcc(Assembler::greater, L_fill_96_bytes);
fill32(to, 0, xtmp);
subq(count, 32 >> shift);
fill32_unmasked(shift, to, 32, xtmp, count, rtmp);
jmp(L_exit);

bind(L_fill_96_bytes);
cmpq(count, 96 >> shift);
jccb(Assembler::greater, L_fill_128_bytes);
jcc(Assembler::greater, L_fill_128_bytes);
fill64(to, 0, xtmp);
subq(count, 64 >> shift);
fill32_masked(shift, to, 64, xtmp, k2, count, rtmp);
fill32_unmasked(shift, to, 64, xtmp, count, rtmp);
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of repeating fill64_unmasked multiple times, you could jmp to, say, L_fill_64_tail and emit the fill64_unmasked code there just once.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please see this suggestion incorporated in the updated code.

jmp(L_exit);

bind(L_fill_128_bytes);
cmpq(count, 128 >> shift);
jccb(Assembler::greater, L_fill_128_bytes_loop_pre_header);
jcc(Assembler::greater, L_fill_128_bytes_loop_pre_header);
fill64(to, 0, xtmp);
fill32(to, 64, xtmp);
subq(count, 96 >> shift);
fill32_masked(shift, to, 96, xtmp, k2, count, rtmp);
fill32_unmasked(shift, to, 96, xtmp, count, rtmp);
jmp(L_exit);

bind(L_fill_128_bytes_loop_pre_header);
Expand Down Expand Up @@ -9360,25 +9441,25 @@ void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register va

bind(L_fill_start_zmm_sequence);
cmpq(count, 64 >> shift);
jccb(Assembler::greater, L_fill_128_bytes_zmm);
fill64_masked(shift, to, 0, xtmp, k2, count, rtmp, true);
jcc(Assembler::greater, L_fill_128_bytes_zmm);
fill64_unmasked(shift, to, 0, xtmp, count, rtmp);
jmp(L_exit);

bind(L_fill_128_bytes_zmm);
cmpq(count, 128 >> shift);
jccb(Assembler::greater, L_fill_192_bytes_zmm);
jcc(Assembler::greater, L_fill_192_bytes_zmm);
fill64(to, 0, xtmp, true);
subq(count, 64 >> shift);
fill64_masked(shift, to, 64, xtmp, k2, count, rtmp, true);
fill64_unmasked(shift, to, 64, xtmp, count, rtmp);
jmp(L_exit);

bind(L_fill_192_bytes_zmm);
cmpq(count, 192 >> shift);
jccb(Assembler::greater, L_fill_192_bytes_loop_pre_header_zmm);
jcc(Assembler::greater, L_fill_192_bytes_loop_pre_header_zmm);
fill64(to, 0, xtmp, true);
fill64(to, 64, xtmp, true);
subq(count, 128 >> shift);
fill64_masked(shift, to, 128, xtmp, k2, count, rtmp, true);
fill64_unmasked(shift, to, 128, xtmp, count, rtmp);
jmp(L_exit);

bind(L_fill_192_bytes_loop_pre_header_zmm);
Expand Down
6 changes: 6 additions & 0 deletions src/hotspot/cpu/x86/macroAssembler_x86.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2023,10 +2023,16 @@ class MacroAssembler: public Assembler {
XMMRegister xmm, KRegister mask, Register length,
Register temp, bool use64byteVector = false);

// Unmasked fill starting at Address(dst, disp). 'length' is counted in elements
// of (1 << shift) bytes. When more than 32 bytes remain, a 32-byte store is
// emitted first and the rest is delegated to fill32_unmasked(); otherwise the
// whole fill is delegated to fill32_unmasked(). 'temp' is a scratch register.
void fill64_unmasked(uint shift, Register dst, int disp,
XMMRegister xmm, Register length, Register temp);

// Masked variant of the 32-byte tail fill; takes an additional opmask register.
// NOTE(review): the definition is not visible in this excerpt -- presumably it
// fills 'length' elements of (1 << shift) bytes with a masked vector store;
// confirm against the implementation.
void fill32_masked(uint shift, Register dst, int disp,
XMMRegister xmm, KRegister mask, Register length,
Register temp);

// Unmasked tail fill of at most 32 bytes starting at Address(dst, disp), done
// as a descending cascade of 32/16/8/4/2/1-byte stores. 'length' is counted in
// elements of (1 << shift) bytes. Both 'dst' and 'length' are modified as
// chunks are stored; 'temp' is a scratch register that receives the low
// 8 bytes of xmm for the narrow stores.
void fill32_unmasked(uint shift, Register dst, int disp,
XMMRegister xmm, Register length, Register temp);

// Store of xmm to the given address.
// NOTE(review): definition not visible in this excerpt -- presumably a
// full 32-byte vector store; confirm against the implementation.
void fill32(Address dst, XMMRegister xmm);

// Register + displacement form of fill32; fill64_unmasked() uses it for the
// leading 32 bytes before delegating the remainder to fill32_unmasked().
void fill32(Register dst, int disp, XMMRegister xmm);
Expand Down