Skip to content

Commit 6d36eb7

Browse files
author
Jatin Bhateja
committed
8322768: Optimize non-subword vector compress and expand APIs for AVX2 target.
Reviewed-by: epeter, sviswanathan
1 parent 9d1a6d1 commit 6d36eb7

File tree

10 files changed

+364
-18
lines changed

10 files changed

+364
-18
lines changed

src/hotspot/cpu/x86/assembler_x86.hpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* This code is free software; you can redistribute it and/or modify it
@@ -816,8 +816,8 @@ class Assembler : public AbstractAssembler {
816816
void check_relocation(RelocationHolder const& rspec, int format);
817817
#endif
818818

819-
void emit_data(jint data, relocInfo::relocType rtype, int format);
820-
void emit_data(jint data, RelocationHolder const& rspec, int format);
819+
void emit_data(jint data, relocInfo::relocType rtype, int format = 0);
820+
void emit_data(jint data, RelocationHolder const& rspec, int format = 0);
821821
void emit_data64(jlong data, relocInfo::relocType rtype, int format = 0);
822822
void emit_data64(jlong data, RelocationHolder const& rspec, int format = 0);
823823

src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5282,6 +5282,42 @@ void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Regis
52825282
kmov(dst, rtmp2);
52835283
}
52845284

5285+
#ifdef _LP64
5286+
// Emits the table-driven compress/expand sequence for vectors whose lanes are
// 4 or 8 bytes wide (asserted below):
//   1. move one bit per lane of 'mask' into rtmp,
//   2. use that value to select a 32 byte row of the compress or expand permute
//      table published by StubRoutines::x86,
//   3. permute 'src' with that row into 'dst',
//   4. blend 'dst' with an all-zero vector, using the row itself as blend mask.
// opcode must be Op_CompressV or Op_ExpandV and picks which table is used.
// rtmp, rscratch, permv and xtmp are overwritten by the emitted code.
void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5287+
XMMRegister mask, Register rtmp, Register rscratch,
5288+
XMMRegister permv, XMMRegister xtmp, BasicType bt,
5289+
int vec_enc) {
5290+
assert(type2aelembytes(bt) >= 4, "");
5291+
assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5292+
address compress_perm_table = nullptr;
5293+
address expand_perm_table = nullptr;
5294+
if (type2aelembytes(bt) == 8) {
5295+
compress_perm_table = StubRoutines::x86::compress_perm_table64();
5296+
expand_perm_table = StubRoutines::x86::expand_perm_table64();
5297+
// 8 byte lanes: collect one mask bit per quadword lane.
vmovmskpd(rtmp, mask, vec_enc);
5298+
} else {
5299+
compress_perm_table = StubRoutines::x86::compress_perm_table32();
5300+
expand_perm_table = StubRoutines::x86::expand_perm_table32();
5301+
// 4 byte lanes: collect one mask bit per doubleword lane.
vmovmskps(rtmp, mask, vec_enc);
5302+
}
5303+
shlq(rtmp, 5); // rtmp *= 32: byte offset of the 32 byte permute row selected by the mask bits.
5304+
if (opcode == Op_CompressV) {
5305+
lea(rscratch, ExternalAddress(compress_perm_table));
5306+
} else {
5307+
lea(rscratch, ExternalAddress(expand_perm_table));
5308+
}
5309+
// rtmp = table base + row offset, then load the whole row into permv.
addptr(rtmp, rscratch);
5310+
vmovdqu(permv, Address(rtmp));
5311+
// NOTE(review): the permute is always encoded as 256 bit, independent of vec_enc;
// rows for 8 byte lanes hold doubleword index pairs, so vpermps serves both lane sizes.
vpermps(dst, permv, src, Assembler::AVX_256bit);
5312+
vpxor(xtmp, xtmp, xtmp, vec_enc);
5313+
// Blend the result with zero vector using permute mask, each column entry
5314+
// in a permute table row contains either a valid permute index or a -1 (default)
5315+
// value, this can potentially be used as a blending mask after
5316+
// compressing/expanding the source vector lanes.
5317+
vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
5318+
}
5319+
#endif
5320+
52855321
void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
52865322
bool merge, BasicType bt, int vec_enc) {
52875323
if (opcode == Op_CompressV) {

src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* This code is free software; you can redistribute it and/or modify it
@@ -390,6 +390,10 @@
390390

391391
void vector_round_float_avx(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
392392
Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4);
393+
394+
void vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, XMMRegister mask,
395+
Register rtmp, Register rscratch, XMMRegister permv, XMMRegister xtmp,
396+
BasicType bt, int vec_enc);
393397
#endif // _LP64
394398

395399
void udivI(Register rax, Register divisor, Register rdx);

src/hotspot/cpu/x86/stubGenerator_x86_64.cpp

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -951,6 +951,92 @@ address StubGenerator::generate_fp_mask(const char *stub_name, int64_t mask) {
951951
return start;
952952
}
953953

954+
// Emits a read-only compress permute index table into the stub code area and
// returns the address of its first row. esize is the lane size in bits: 32
// produces 256 rows of eight 32 bit entries, 64 (asserted otherwise) produces
// 16 rows of four 64 bit entries. Either way a row occupies 32 bytes.
// Within a row the indices of the set mask bits are packed at the front in
// ascending order and the remaining entries are filled with -1.
address StubGenerator::generate_compress_perm_table(const char *stub_name, int32_t esize) {
955+
__ align(CodeEntryAlignment);
956+
StubCodeMark mark(this, "StubRoutines", stub_name);
957+
address start = __ pc();
958+
if (esize == 32) {
959+
// Loop to generate 256 x 8 int compression permute index table. A row is
960+
// accessed using 8 bit index computed using vector mask. An entry in
961+
// a row holds either a valid permute index corresponding to set bit position
962+
// or a -1 (default) value.
963+
for (int mask = 0; mask < 256; mask++) {
964+
int ctr = 0;
965+
for (int j = 0; j < 8; j++) {
966+
if (mask & (1 << j)) {
967+
__ emit_data(j, relocInfo::none);
968+
ctr++;
969+
}
970+
}
971+
// Pad the rest of the row with the default value.
for (; ctr < 8; ctr++) {
972+
__ emit_data(-1, relocInfo::none);
973+
}
974+
}
975+
} else {
976+
assert(esize == 64, "");
977+
// Loop to generate 16 x 4 long compression permute index table. A row is
978+
// accessed using 4 bit index computed using vector mask. An entry in
979+
// a row holds either a valid permute index pair for a quadword corresponding
980+
// to set bit position or a -1 (default) value.
981+
for (int mask = 0; mask < 16; mask++) {
982+
int ctr = 0;
983+
for (int j = 0; j < 4; j++) {
984+
if (mask & (1 << j)) {
985+
// Quadword j is addressed as the doubleword pair (2j, 2j + 1).
__ emit_data(2 * j, relocInfo::none);
986+
__ emit_data(2 * j + 1, relocInfo::none);
987+
ctr++;
988+
}
989+
}
990+
// Pad the rest of the row with the default value.
for (; ctr < 4; ctr++) {
991+
__ emit_data64(-1L, relocInfo::none);
992+
}
993+
}
994+
}
995+
return start;
996+
}
997+
998+
// Emits a read-only expand permute index table into the stub code area and
// returns the address of its first row. esize is the lane size in bits: 32
// produces 256 rows of eight 32 bit entries, 64 (asserted otherwise) produces
// 16 rows of four 64 bit entries. Either way a row occupies 32 bytes.
// Within a row, the entry at each set mask bit position receives the next
// source lane index (counting up from 0); entries at clear positions are -1.
address StubGenerator::generate_expand_perm_table(const char *stub_name, int32_t esize) {
999+
__ align(CodeEntryAlignment);
1000+
StubCodeMark mark(this, "StubRoutines", stub_name);
1001+
address start = __ pc();
1002+
if (esize == 32) {
1003+
// Loop to generate 256 x 8 int expand permute index table. A row is accessed
1004+
// using 8 bit index computed using vector mask. An entry in a row holds either
1005+
// a valid permute index (starting from least significant lane) placed at position
1006+
// corresponding to set bit position or a -1 (default) value.
1007+
for (int mask = 0; mask < 256; mask++) {
1008+
int ctr = 0;
1009+
for (int j = 0; j < 8; j++) {
1010+
if (mask & (1 << j)) {
1011+
__ emit_data(ctr++, relocInfo::none);
1012+
} else {
1013+
__ emit_data(-1, relocInfo::none);
1014+
}
1015+
}
1016+
}
1017+
} else {
1018+
assert(esize == 64, "");
1019+
// Loop to generate 16 x 4 long expand permute index table. A row is accessed
1020+
// using 4 bit index computed using vector mask. An entry in a row holds either
1021+
// a valid doubleword permute index pair representing a quadword index (starting
1022+
// from least significant lane) placed at position corresponding to set bit
1023+
// position or a -1 (default) value.
1024+
for (int mask = 0; mask < 16; mask++) {
1025+
int ctr = 0;
1026+
for (int j = 0; j < 4; j++) {
1027+
if (mask & (1 << j)) {
1028+
// Source quadword ctr is addressed as the doubleword pair (2ctr, 2ctr + 1).
__ emit_data(2 * ctr, relocInfo::none);
1029+
__ emit_data(2 * ctr + 1, relocInfo::none);
1030+
ctr++;
1031+
} else {
1032+
__ emit_data64(-1L, relocInfo::none);
1033+
}
1034+
}
1035+
}
1036+
}
1037+
return start;
1038+
}
1039+
9541040
address StubGenerator::generate_vector_mask(const char *stub_name, int64_t mask) {
9551041
__ align(CodeEntryAlignment);
9561042
StubCodeMark mark(this, "StubRoutines", stub_name);
@@ -4095,6 +4181,13 @@ void StubGenerator::generate_compiler_stubs() {
40954181
StubRoutines::x86::_vector_reverse_byte_perm_mask_int = generate_vector_reverse_byte_perm_mask_int("perm_mask_int");
40964182
StubRoutines::x86::_vector_reverse_byte_perm_mask_short = generate_vector_reverse_byte_perm_mask_short("perm_mask_short");
40974183

4184+
if (VM_Version::supports_avx2() && !VM_Version::supports_avx512vl()) {
4185+
StubRoutines::x86::_compress_perm_table32 = generate_compress_perm_table("compress_perm_table32", 32);
4186+
StubRoutines::x86::_compress_perm_table64 = generate_compress_perm_table("compress_perm_table64", 64);
4187+
StubRoutines::x86::_expand_perm_table32 = generate_expand_perm_table("expand_perm_table32", 32);
4188+
StubRoutines::x86::_expand_perm_table64 = generate_expand_perm_table("expand_perm_table64", 64);
4189+
}
4190+
40984191
if (VM_Version::supports_avx2() && !VM_Version::supports_avx512_vpopcntdq()) {
40994192
// lut implementation influenced by counting 1s algorithm from section 5-1 of Hackers' Delight.
41004193
StubRoutines::x86::_vector_popcount_lut = generate_popcount_avx_lut("popcount_lut");

src/hotspot/cpu/x86/stubGenerator_x86_64.hpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* This code is free software; you can redistribute it and/or modify it
@@ -99,6 +99,10 @@ class StubGenerator: public StubCodeGenerator {
9999

100100
address generate_fp_mask(const char *stub_name, int64_t mask);
101101

102+
address generate_compress_perm_table(const char *stub_name, int32_t esize);
103+
104+
address generate_expand_perm_table(const char *stub_name, int32_t esize);
105+
102106
address generate_vector_mask(const char *stub_name, int64_t mask);
103107

104108
address generate_vector_byte_perm_mask(const char *stub_name);

src/hotspot/cpu/x86/stubRoutines_x86.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2013, 2023, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2013, 2024, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* This code is free software; you can redistribute it and/or modify it
@@ -82,6 +82,10 @@ address StubRoutines::x86::_join_0_1_base64 = nullptr;
8282
address StubRoutines::x86::_join_1_2_base64 = nullptr;
8383
address StubRoutines::x86::_join_2_3_base64 = nullptr;
8484
address StubRoutines::x86::_decoding_table_base64 = nullptr;
85+
address StubRoutines::x86::_compress_perm_table32 = nullptr;
86+
address StubRoutines::x86::_compress_perm_table64 = nullptr;
87+
address StubRoutines::x86::_expand_perm_table32 = nullptr;
88+
address StubRoutines::x86::_expand_perm_table64 = nullptr;
8589
#endif
8690
address StubRoutines::x86::_pshuffle_byte_flip_mask_addr = nullptr;
8791

src/hotspot/cpu/x86/stubRoutines_x86.hpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2013, 2023, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2013, 2024, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* This code is free software; you can redistribute it and/or modify it
@@ -37,7 +37,7 @@ enum platform_dependent_constants {
3737
_continuation_stubs_code_size = 1000 LP64_ONLY(+1000),
3838
// AVX512 intrinsics add more code in 64-bit VM,
3939
// Windows have more code to save/restore registers
40-
_compiler_stubs_code_size = 20000 LP64_ONLY(+32000) WINDOWS_ONLY(+2000),
40+
_compiler_stubs_code_size = 20000 LP64_ONLY(+39000) WINDOWS_ONLY(+2000),
4141
_final_stubs_code_size = 10000 LP64_ONLY(+20000) WINDOWS_ONLY(+2000) ZGC_ONLY(+20000)
4242
};
4343

@@ -58,6 +58,10 @@ class x86 {
5858
static address _float_sign_flip;
5959
static address _double_sign_mask;
6060
static address _double_sign_flip;
61+
static address _compress_perm_table32;
62+
static address _compress_perm_table64;
63+
static address _expand_perm_table32;
64+
static address _expand_perm_table64;
6165

6266
public:
6367

@@ -338,6 +342,10 @@ class x86 {
338342
static address base64_decoding_table_addr() { return _decoding_table_base64; }
339343
static address base64_AVX2_decode_tables_addr() { return _avx2_decode_tables_base64; }
340344
static address base64_AVX2_decode_LUT_tables_addr() { return _avx2_decode_lut_tables_base64; }
345+
static address compress_perm_table32() { return _compress_perm_table32; }
346+
static address compress_perm_table64() { return _compress_perm_table64; }
347+
static address expand_perm_table32() { return _expand_perm_table32; }
348+
static address expand_perm_table64() { return _expand_perm_table64; }
341349
#endif
342350
static address pshuffle_byte_flip_mask_addr() { return _pshuffle_byte_flip_mask_addr; }
343351
static address arrays_hashcode_powers_of_31() { return (address)_arrays_hashcode_powers_of_31; }

src/hotspot/cpu/x86/stubRoutines_x86_64.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,4 +44,3 @@ address StubRoutines::x86::_float_sign_mask = nullptr;
4444
address StubRoutines::x86::_float_sign_flip = nullptr;
4545
address StubRoutines::x86::_double_sign_mask = nullptr;
4646
address StubRoutines::x86::_double_sign_flip = nullptr;
47-

src/hotspot/cpu/x86/x86.ad

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1425,6 +1425,8 @@ bool Matcher::match_rule_supported(int opcode) {
14251425
return false;
14261426
}
14271427
break;
1428+
case Op_CompressV:
1429+
case Op_ExpandV:
14281430
case Op_PopCountVL:
14291431
if (UseAVX < 2) {
14301432
return false;
@@ -1659,12 +1661,6 @@ bool Matcher::match_rule_supported(int opcode) {
16591661
return false;
16601662
}
16611663
break;
1662-
case Op_CompressV:
1663-
case Op_ExpandV:
1664-
if (!VM_Version::supports_avx512vl()) {
1665-
return false;
1666-
}
1667-
break;
16681664
case Op_SqrtF:
16691665
if (UseSSE < 1) {
16701666
return false;
@@ -1952,13 +1948,12 @@ bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
19521948
if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) {
19531949
return false;
19541950
}
1955-
if (size_in_bits < 128 ) {
1951+
if (!is_LP64 && !VM_Version::supports_avx512vl() && size_in_bits < 512) {
19561952
return false;
19571953
}
1958-
if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
1954+
if (size_in_bits < 128 ) {
19591955
return false;
19601956
}
1961-
break;
19621957
case Op_VectorLongToMask:
19631958
if (UseAVX < 1 || !is_LP64) {
19641959
return false;
@@ -9178,8 +9173,26 @@ instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp,
91789173
%}
91799174

91809175
// --------------------------------- Compress/Expand Operations ---------------------------
9176+
#ifdef _LP64
9177+
// Matches CompressV and ExpandV with a vector (not kReg) mask when AVX512VL is
// not available and the vector is at most 32 bytes long. The ideal opcode is
// forwarded to vector_compress_expand_avx2, which selects compress vs. expand.
// dst is TEMP_DEF and perm, xtmp, rtmp, rscratch are temporaries; flags are killed.
// NOTE(review): the format string prints "vector_compress" for ExpandV matches too.
instruct vcompress_reg_avx(vec dst, vec src, vec mask, rRegI rtmp, rRegL rscratch, vec perm, vec xtmp, rFlagsReg cr) %{
9178+
predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
9179+
match(Set dst (CompressV src mask));
9180+
match(Set dst (ExpandV src mask));
9181+
effect(TEMP_DEF dst, TEMP perm, TEMP xtmp, TEMP rtmp, TEMP rscratch, KILL cr);
9182+
format %{ "vector_compress $dst, $src, $mask \t!using $xtmp, $rtmp, $rscratch and $perm as TEMP" %}
9183+
ins_encode %{
9184+
int opcode = this->ideal_Opcode();
9185+
int vlen_enc = vector_length_encoding(this);
9186+
BasicType bt = Matcher::vector_element_basic_type(this);
9187+
__ vector_compress_expand_avx2(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$XMMRegister, $rtmp$$Register,
9188+
$rscratch$$Register, $perm$$XMMRegister, $xtmp$$XMMRegister, bt, vlen_enc);
9189+
%}
9190+
ins_pipe( pipe_slow );
9191+
%}
9192+
#endif
91819193

91829194
instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{
9195+
predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
91839196
match(Set dst (CompressV src mask));
91849197
match(Set dst (ExpandV src mask));
91859198
format %{ "vector_compress_expand $dst, $src, $mask" %}

0 commit comments

Comments
 (0)