Skip to content

Commit 72989e0

Browse files
committed
8340093: C2 SuperWord: implement cost model
Reviewed-by: kvn, qamai
1 parent 6e838d6 commit 72989e0

File tree

13 files changed

+2884
-94
lines changed

13 files changed

+2884
-94
lines changed

src/hotspot/cpu/aarch64/aarch64_vector.ad

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -129,18 +129,24 @@ source %{
129129
bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) {
130130
if (UseSVE == 0) {
131131
// These operations are not profitable to be vectorized on NEON, because no direct
132-
// NEON instructions support them. But the match rule support for them is profitable for
133-
// Vector API intrinsics.
132+
// NEON instructions support them. They require multiple instructions, which is more
133+
// expensive in almost all cases where we would auto vectorize.
134+
// But the match rule support for them is profitable for Vector API intrinsics.
134135
if ((opcode == Op_VectorCastD2X && (bt == T_INT || bt == T_SHORT)) ||
135136
(opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
136137
(opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
137138
(opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
139+
opcode == Op_MulVL ||
138140
// The implementations of Op_AddReductionVD/F in Neon are for the Vector API only.
139141
// They are not suitable for auto-vectorization because the result would not conform
140142
// to the JLS, Section Evaluation Order.
143+
// Note: we could implement sequential reductions for these reduction operators, but
144+
// this will still almost never lead to speedups, because the sequential
145+
// reductions are latency limited along the reduction chain, and not
146+
// throughput limited. This is unlike unordered reductions (associative op)
147+
// and element-wise ops which are usually throughput limited.
141148
opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
142-
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF ||
143-
opcode == Op_MulVL) {
149+
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF) {
144150
return false;
145151
}
146152
}

src/hotspot/cpu/aarch64/aarch64_vector_ad.m4

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -119,18 +119,24 @@ source %{
119119
bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) {
120120
if (UseSVE == 0) {
121121
// These operations are not profitable to be vectorized on NEON, because no direct
122-
// NEON instructions support them. But the match rule support for them is profitable for
123-
// Vector API intrinsics.
122+
// NEON instructions support them. They require multiple instructions, which is more
123+
// expensive in almost all cases where we would auto vectorize.
124+
// But the match rule support for them is profitable for Vector API intrinsics.
124125
if ((opcode == Op_VectorCastD2X && (bt == T_INT || bt == T_SHORT)) ||
125126
(opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
126127
(opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
127128
(opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
129+
opcode == Op_MulVL ||
128130
// The implementations of Op_AddReductionVD/F in Neon are for the Vector API only.
129131
// They are not suitable for auto-vectorization because the result would not conform
130132
// to the JLS, Section Evaluation Order.
133+
// Note: we could implement sequential reductions for these reduction operators, but
134+
// this will still almost never lead to speedups, because the sequential
135+
// reductions are latency limited along the reduction chain, and not
136+
// throughput limited. This is unlike unordered reductions (associative op)
137+
// and element-wise ops which are usually throughput limited.
131138
opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
132-
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF ||
133-
opcode == Op_MulVL) {
139+
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF) {
134140
return false;
135141
}
136142
}

src/hotspot/share/opto/superword.cpp

Lines changed: 40 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,7 @@ SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer) :
4242
),
4343
_vpointer_for_main_loop_alignment(nullptr),
4444
_aw_for_main_loop_alignment(0),
45-
_do_vector_loop(phase()->C->do_vector_loop()), // whether to do vectorization/simd style
46-
_num_work_vecs(0), // amount of vector work we have
47-
_num_reductions(0) // amount of reduction work we have
45+
_do_vector_loop(phase()->C->do_vector_loop()) // whether to do vectorization/simd style
4846
{
4947
}
5048

@@ -1567,18 +1565,6 @@ void SuperWord::filter_packs_for_implemented() {
15671565

15681566
// Remove packs that are not profitable.
15691567
void SuperWord::filter_packs_for_profitable() {
1570-
// Count the number of reductions vs other vector ops, for the
1571-
// reduction profitability heuristic.
1572-
for (int i = 0; i < _packset.length(); i++) {
1573-
Node_List* pack = _packset.at(i);
1574-
Node* n = pack->at(0);
1575-
if (is_marked_reduction(n)) {
1576-
_num_reductions++;
1577-
} else {
1578-
_num_work_vecs++;
1579-
}
1580-
}
1581-
15821568
// Remove packs that are not profitable
15831569
auto filter = [&](const Node_List* pack) {
15841570
return profitable(pack);
@@ -1595,31 +1581,7 @@ bool SuperWord::implemented(const Node_List* pack, const uint size) const {
15951581
if (p0 != nullptr) {
15961582
int opc = p0->Opcode();
15971583
if (is_marked_reduction(p0)) {
1598-
const Type *arith_type = p0->bottom_type();
1599-
// This heuristic predicts that 2-element reductions for INT/LONG are not
1600-
// profitable. This heuristic was added in JDK-8078563. The argument
1601-
// was that reductions are not just a single instruction, but multiple, and
1602-
// hence it is not directly clear that they are profitable. If we only have
1603-
// two elements per vector, then the performance gains from non-reduction
1604-
// vectors are at most going from 2 scalar instructions to 1 vector instruction.
1605-
// But a 2-element reduction vector goes from 2 scalar instructions to
1606-
// 3 instructions (1 shuffle and two reduction ops).
1607-
// However, this optimization assumes that these reductions stay in the loop
1608-
// which may not be true any more in most cases after the introduction of:
1609-
// See: VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop
1610-
// Hence, this heuristic has room for improvement.
1611-
bool is_two_element_int_or_long_reduction = (size == 2) &&
1612-
(arith_type->basic_type() == T_INT ||
1613-
arith_type->basic_type() == T_LONG);
1614-
if (is_two_element_int_or_long_reduction && AutoVectorizationOverrideProfitability != 2) {
1615-
#ifndef PRODUCT
1616-
if (is_trace_superword_rejections()) {
1617-
tty->print_cr("\nPerformance heuristic: 2-element INT/LONG reduction not profitable.");
1618-
tty->print_cr(" Can override with AutoVectorizationOverrideProfitability=2");
1619-
}
1620-
#endif
1621-
return false;
1622-
}
1584+
const Type* arith_type = p0->bottom_type();
16231585
retValue = ReductionNode::implemented(opc, size, arith_type->basic_type());
16241586
} else if (VectorNode::is_convert_opcode(opc)) {
16251587
retValue = VectorCastNode::implemented(opc, size, velt_basic_type(p0->in(1)), velt_basic_type(p0));
@@ -1791,26 +1753,6 @@ bool SuperWord::profitable(const Node_List* p) const {
17911753
// The second input has to be the vector we wanted to reduce,
17921754
// but it was not packed.
17931755
return false;
1794-
} else if (_num_work_vecs == _num_reductions && AutoVectorizationOverrideProfitability != 2) {
1795-
// This heuristic predicts that the reduction is not profitable.
1796-
// Reduction vectors can be expensive, because they require multiple
1797-
// operations to fold all the lanes together. Hence, vectorizing the
1798-
// reduction is not profitable on its own. Hence, we need a lot of
1799-
// other "work vectors" that deliver performance improvements to
1800-
// balance out the performance loss due to reductions.
1801-
// This heuristic is a bit simplistic, and assumes that the reduction
1802-
// vector stays in the loop. But in some cases, we can move the
1803-
// reduction out of the loop, replacing it with a single vector op.
1804-
// See: VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop
1805-
// Hence, this heuristic has room for improvement.
1806-
#ifndef PRODUCT
1807-
if (is_trace_superword_rejections()) {
1808-
tty->print_cr("\nPerformance heuristic: not enough vectors in the loop to make");
1809-
tty->print_cr(" reduction profitable.");
1810-
tty->print_cr(" Can override with AutoVectorizationOverrideProfitability=2");
1811-
}
1812-
#endif
1813-
return false;
18141756
} else if (second_pk->size() != p->size()) {
18151757
return false;
18161758
}
@@ -1969,19 +1911,53 @@ bool SuperWord::do_vtransform() const {
19691911
vtransform.optimize();
19701912

19711913
if (!vtransform.schedule()) { return false; }
1972-
if (vtransform.has_store_to_load_forwarding_failure()) { return false; }
1914+
1915+
if (!vtransform.is_profitable()) { return false; }
1916+
1917+
vtransform.apply();
1918+
return true;
1919+
}
1920+
1921+
// Check the cost model and other heuristics.
1922+
// Can be overridden with AutoVectorizationOverrideProfitability.
1923+
bool VTransform::is_profitable() const {
1924+
assert(_graph.is_scheduled(), "must already be scheduled");
19731925

19741926
if (AutoVectorizationOverrideProfitability == 0) {
19751927
#ifndef PRODUCT
1976-
if (is_trace_superword_any()) {
1928+
if (_trace._info) {
19771929
tty->print_cr("\nForced bailout of vectorization (AutoVectorizationOverrideProfitability=0).");
19781930
}
19791931
#endif
19801932
return false;
19811933
}
19821934

1983-
vtransform.apply();
1984-
return true;
1935+
if (AutoVectorizationOverrideProfitability == 2) {
1936+
#ifndef PRODUCT
1937+
if (_trace._info) {
1938+
tty->print_cr("\nForced vectorization, ignoring profitability (AutoVectorizationOverrideProfitability=2).");
1939+
}
1940+
#endif
1941+
return true;
1942+
}
1943+
1944+
// Note: currently we only do throughput-based cost-modeling. In the future, we could
1945+
// also implement latency-based cost-modeling and take store-to-load-forwarding
1946+
// failures into account as the latency between the load and store. This would
1947+
// allow a more precise tradeoff between the forwarding failure penalty versus
1948+
// the vectorization gains.
1949+
if (has_store_to_load_forwarding_failure()) { return false; }
1950+
1951+
// Cost-model
1952+
float scalar_cost = _vloop_analyzer.cost_for_scalar_loop();
1953+
float vector_cost = cost_for_vector_loop();
1954+
#ifndef PRODUCT
1955+
if (_trace._info) {
1956+
tty->print_cr("\nVTransform: scalar_cost = %.2f vs vector_cost = %.2f",
1957+
scalar_cost, vector_cost);
1958+
}
1959+
#endif
1960+
return vector_cost < scalar_cost;
19851961
}
19861962

19871963
// Apply the vectorization, i.e. we irreversibly edit the C2 graph. At this point, all

src/hotspot/share/opto/superword.hpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -549,8 +549,6 @@ class SuperWord : public ResourceObj {
549549

550550
private:
551551
bool _do_vector_loop; // whether to do vectorization/simd style
552-
int _num_work_vecs; // Number of non memory vector operations
553-
int _num_reductions; // Number of reduction expressions applied
554552

555553
// Accessors
556554
Arena* arena() { return &_arena; }

src/hotspot/share/opto/traceAutoVectorizationTag.hpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
flags(MEMORY_SLICES, "Trace VLoopMemorySlices") \
3939
flags(BODY, "Trace VLoopBody") \
4040
flags(TYPES, "Trace VLoopTypes") \
41-
flags(POINTERS, "Trace VLoopPointers") \
41+
flags(POINTERS, "Trace VLoopVPointers") \
4242
flags(DEPENDENCY_GRAPH, "Trace VLoopDependencyGraph") \
4343
flags(SW_ADJACENT_MEMOPS, "Trace SuperWord::find_adjacent_memop_pairs") \
4444
flags(SW_REJECTIONS, "Trace SuperWord rejections (non vectorizations)") \
@@ -47,6 +47,8 @@
4747
flags(SW_VERBOSE, "Trace SuperWord verbose (all SW tags enabled)") \
4848
flags(VTRANSFORM, "Trace VTransform Graph") \
4949
flags(OPTIMIZATION, "Trace VTransform::optimize") \
50+
flags(COST, "Trace cost of VLoop (scalar) and VTransform (vector)") \
51+
flags(COST_VERBOSE, "Trace like COST, but more verbose") \
5052
flags(ALIGN_VECTOR, "Trace AlignVector") \
5153
flags(SPECULATIVE_ALIASING_ANALYSIS, "Trace Speculative Aliasing Analysis") \
5254
flags(SPECULATIVE_RUNTIME_CHECKS, "Trace VTransform::apply_speculative_runtime_checks") \

src/hotspot/share/opto/vectorization.cpp

Lines changed: 103 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -287,7 +287,7 @@ void VLoopVPointers::compute_and_cache_vpointers() {
287287
int pointers_idx = 0;
288288
_body.for_each_mem([&] (MemNode* const mem, int bb_idx) {
289289
// Placement new: construct directly into the array.
290-
::new (&_vpointers[pointers_idx]) VPointer(mem, _vloop);
290+
::new (&_vpointers[pointers_idx]) VPointer(mem, _vloop, _pointer_expression_nodes);
291291
_bb_idx_to_vpointer.at_put(bb_idx, pointers_idx);
292292
pointers_idx++;
293293
});
@@ -541,6 +541,108 @@ void VLoopDependencyGraph::PredsIterator::next() {
541541
}
542542
}
543543

544+
// Cost-model heuristic for nodes that do not contribute to computational
545+
// cost inside the loop.
546+
bool VLoopAnalyzer::has_zero_cost(Node* n) const {
547+
// Outside body?
548+
if (!_vloop.in_bb(n)) { return true; }
549+
550+
// Internal nodes of pointer expressions are most likely folded into
551+
// the load / store and have no additional cost.
552+
if (vpointers().is_in_pointer_expression(n)) { return true; }
553+
554+
// Not all AddP nodes can be detected in VPointer parsing, so
555+
// we filter them out here.
556+
// We don't want to explicitly model the cost of control flow,
557+
// since we have the same CFG structure before and after
558+
// vectorization: A loop head, a loop exit, with a backedge.
559+
if (n->is_AddP() || // Pointer expression
560+
n->is_CFG() || // CFG
561+
n->is_Phi() || // CFG
562+
n->is_Cmp() || // CFG
563+
n->is_Bool()) { // CFG
564+
return true;
565+
}
566+
567+
// All other nodes have a non-zero cost.
568+
return false;
569+
}
570+
571+
// Compute the cost over all operations in the (scalar) loop.
572+
float VLoopAnalyzer::cost_for_scalar_loop() const {
573+
#ifndef PRODUCT
574+
if (_vloop.is_trace_cost()) {
575+
tty->print_cr("\nVLoopAnalyzer::cost_for_scalar_loop:");
576+
}
577+
#endif
578+
579+
float sum = 0;
580+
for (int j = 0; j < body().body().length(); j++) {
581+
Node* n = body().body().at(j);
582+
if (!has_zero_cost(n)) {
583+
float c = cost_for_scalar_node(n->Opcode());
584+
sum += c;
585+
#ifndef PRODUCT
586+
if (_vloop.is_trace_cost_verbose()) {
587+
tty->print_cr(" -> cost = %.2f for %d %s", c, n->_idx, n->Name());
588+
}
589+
#endif
590+
}
591+
}
592+
593+
#ifndef PRODUCT
594+
if (_vloop.is_trace_cost()) {
595+
tty->print_cr(" total_cost = %.2f", sum);
596+
}
597+
#endif
598+
return sum;
599+
}
600+
601+
// For now, we use unit cost. We might refine that in the future.
602+
// If needed, we could also use platform specific costs, if the
603+
// default here is not accurate enough.
604+
float VLoopAnalyzer::cost_for_scalar_node(int opcode) const {
605+
float c = 1;
606+
#ifndef PRODUCT
607+
if (_vloop.is_trace_cost()) {
608+
tty->print_cr(" cost = %.2f opc=%s", c, NodeClassNames[opcode]);
609+
}
610+
#endif
611+
return c;
612+
}
613+
614+
// For now, we use unit cost. We might refine that in the future.
615+
// If needed, we could also use platform specific costs, if the
616+
// default here is not accurate enough.
617+
float VLoopAnalyzer::cost_for_vector_node(int opcode, int vlen, BasicType bt) const {
618+
float c = 1;
619+
#ifndef PRODUCT
620+
if (_vloop.is_trace_cost()) {
621+
tty->print_cr(" cost = %.2f opc=%s vlen=%d bt=%s",
622+
c, NodeClassNames[opcode], vlen, type2name(bt));
623+
}
624+
#endif
625+
return c;
626+
}
627+
628+
// For now, we use unit cost, i.e. we count the number of backend instructions
629+
// that the vtnode will use. We might refine that in the future.
630+
// If needed, we could also use platform specific costs, if the
631+
// default here is not accurate enough.
632+
float VLoopAnalyzer::cost_for_vector_reduction_node(int opcode, int vlen, BasicType bt, bool requires_strict_order) const {
633+
// Each reduction is composed of multiple instructions, each estimated with a unit cost.
634+
// Strict order (linear chain): 2 * vlen (a shuffle/extract plus a reduce per lane). Non-strict (recursive halving): 2 * log2(vlen) (a shuffle plus a reduce per halving step).
635+
float c = requires_strict_order ? 2 * vlen : 2 * exact_log2(vlen);
636+
#ifndef PRODUCT
637+
if (_vloop.is_trace_cost()) {
638+
tty->print_cr(" cost = %.2f opc=%s vlen=%d bt=%s requires_strict_order=%s",
639+
c, NodeClassNames[opcode], vlen, type2name(bt),
640+
requires_strict_order ? "true" : "false");
641+
}
642+
#endif
643+
return c;
644+
}
645+
544646
// Computing aliasing runtime check using init and last of main-loop
545647
// -----------------------------------------------------------------
546648
//

0 commit comments

Comments
 (0)