@@ -42,9 +42,7 @@ SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer) :
4242 ),
4343 _vpointer_for_main_loop_alignment(nullptr ),
4444 _aw_for_main_loop_alignment(0 ),
45- _do_vector_loop(phase()->C->do_vector_loop()), // whether to do vectorization/simd style
46- _num_work_vecs(0 ), // amount of vector work we have
47- _num_reductions(0 ) // amount of reduction work we have
45+ _do_vector_loop(phase()->C->do_vector_loop()) // whether to do vectorization/simd style
4846{
4947}
5048
@@ -1567,18 +1565,6 @@ void SuperWord::filter_packs_for_implemented() {
15671565
15681566// Remove packs that are not profitable.
15691567void SuperWord::filter_packs_for_profitable () {
1570- // Count the number of reductions vs other vector ops, for the
1571- // reduction profitability heuristic.
1572- for (int i = 0 ; i < _packset.length (); i++) {
1573- Node_List* pack = _packset.at (i);
1574- Node* n = pack->at (0 );
1575- if (is_marked_reduction (n)) {
1576- _num_reductions++;
1577- } else {
1578- _num_work_vecs++;
1579- }
1580- }
1581-
15821568 // Remove packs that are not profitable
15831569 auto filter = [&](const Node_List* pack) {
15841570 return profitable (pack);
@@ -1595,31 +1581,7 @@ bool SuperWord::implemented(const Node_List* pack, const uint size) const {
15951581 if (p0 != nullptr ) {
15961582 int opc = p0->Opcode ();
15971583 if (is_marked_reduction (p0)) {
1598- const Type *arith_type = p0->bottom_type ();
1599- // This heuristic predicts that 2-element reductions for INT/LONG are not
1600- // profitable. This heuristic was added in JDK-8078563. The argument
1601- // was that reductions are not just a single instruction, but multiple, and
1602- // hence it is not directly clear that they are profitable. If we only have
1603- // two elements per vector, then the performance gains from non-reduction
1604- // vectors are at most going from 2 scalar instructions to 1 vector instruction.
1605- // But a 2-element reduction vector goes from 2 scalar instructions to
1606- // 3 instructions (1 shuffle and two reduction ops).
1607- // However, this optimization assumes that these reductions stay in the loop
1608- // which may not be true any more in most cases after the introduction of:
1609- // See: VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop
1610- // Hence, this heuristic has room for improvement.
1611- bool is_two_element_int_or_long_reduction = (size == 2 ) &&
1612- (arith_type->basic_type () == T_INT ||
1613- arith_type->basic_type () == T_LONG);
1614- if (is_two_element_int_or_long_reduction && AutoVectorizationOverrideProfitability != 2 ) {
1615- #ifndef PRODUCT
1616- if (is_trace_superword_rejections ()) {
1617- tty->print_cr (" \n Performance heuristic: 2-element INT/LONG reduction not profitable." );
1618- tty->print_cr (" Can override with AutoVectorizationOverrideProfitability=2" );
1619- }
1620- #endif
1621- return false ;
1622- }
1584+ const Type* arith_type = p0->bottom_type ();
16231585 retValue = ReductionNode::implemented (opc, size, arith_type->basic_type ());
16241586 } else if (VectorNode::is_convert_opcode (opc)) {
16251587 retValue = VectorCastNode::implemented (opc, size, velt_basic_type (p0->in (1 )), velt_basic_type (p0));
@@ -1791,26 +1753,6 @@ bool SuperWord::profitable(const Node_List* p) const {
17911753 // The second input has to be the vector we wanted to reduce,
17921754 // but it was not packed.
17931755 return false ;
1794- } else if (_num_work_vecs == _num_reductions && AutoVectorizationOverrideProfitability != 2 ) {
1795- // This heuristic predicts that the reduction is not profitable.
1796- // Reduction vectors can be expensive, because they require multiple
1797- // operations to fold all the lanes together. Hence, vectorizing the
1798- // reduction is not profitable on its own. Hence, we need a lot of
1799- // other "work vectors" that deliver performance improvements to
1800- // balance out the performance loss due to reductions.
1801- // This heuristic is a bit simplistic, and assumes that the reduction
1802- // vector stays in the loop. But in some cases, we can move the
1803- // reduction out of the loop, replacing it with a single vector op.
1804- // See: VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop
1805- // Hence, this heuristic has room for improvement.
1806- #ifndef PRODUCT
1807- if (is_trace_superword_rejections ()) {
1808- tty->print_cr (" \n Performance heuristic: not enough vectors in the loop to make" );
1809- tty->print_cr (" reduction profitable." );
1810- tty->print_cr (" Can override with AutoVectorizationOverrideProfitability=2" );
1811- }
1812- #endif
1813- return false ;
18141756 } else if (second_pk->size () != p->size ()) {
18151757 return false ;
18161758 }
@@ -1969,19 +1911,53 @@ bool SuperWord::do_vtransform() const {
19691911 vtransform.optimize ();
19701912
19711913 if (!vtransform.schedule ()) { return false ; }
1972- if (vtransform.has_store_to_load_forwarding_failure ()) { return false ; }
1914+
1915+ if (!vtransform.is_profitable ()) { return false ; }
1916+
1917+ vtransform.apply ();
1918+ return true ;
1919+ }
1920+
1921+ // Check the cost model and other profitability heuristics; the graph must already be scheduled.
1922+ // Can be overridden with AutoVectorizationOverrideProfitability: 0 forces a bailout, 2 forces vectorization.
1923+ bool VTransform::is_profitable () const {
1924+ assert (_graph.is_scheduled (), " must already be scheduled" );
19731925
19741926 if (AutoVectorizationOverrideProfitability == 0 ) {
19751927#ifndef PRODUCT
1976- if (is_trace_superword_any () ) {
1928+ if (_trace. _info ) {
19771929 tty->print_cr (" \n Forced bailout of vectorization (AutoVectorizationOverrideProfitability=0)." );
19781930 }
19791931#endif
19801932 return false ;
19811933 }
19821934
1983- vtransform.apply ();
1984- return true ;
1935+ if (AutoVectorizationOverrideProfitability == 2 ) {
1936+ #ifndef PRODUCT
1937+ if (_trace._info ) {
1938+ tty->print_cr (" \n Forced vectorization, ignoring profitability (AutoVectorizationOverrideProfitability=2)." );
1939+ }
1940+ #endif
1941+ return true ;
1942+ }
1943+
1944+ // Note: currently we only do throughput-based cost-modeling. In the future, we could
1945+ // also implement latency-based cost-modeling and take store-to-load-forwarding
1946+ // failures into account as the latency between the load and store. This would
1947+ // allow a more precise tradeoff between the forwarding failure penalty versus
1948+ // the vectorization gains.
1949+ if (has_store_to_load_forwarding_failure ()) { return false ; }
1950+
1951+ // Cost model: profitable only if the vector loop's cost is strictly lower than the scalar loop's cost.
1952+ float scalar_cost = _vloop_analyzer.cost_for_scalar_loop ();
1953+ float vector_cost = cost_for_vector_loop ();
1954+ #ifndef PRODUCT
1955+ if (_trace._info ) {
1956+ tty->print_cr (" \n VTransform: scalar_cost = %.2f vs vector_cost = %.2f" ,
1957+ scalar_cost, vector_cost);
1958+ }
1959+ #endif
1960+ return vector_cost < scalar_cost;
19851961}
19861962
19871963// Apply the vectorization, i.e. we irreversibly edit the C2 graph. At this point, all
0 commit comments