Skip to content

Commit 6a42fba

Browse files
committed
8279258: Auto-vectorization enhancement for two-dimensional array operations
Reviewed-by: neliasso, kvn
1 parent 8d0f385 commit 6a42fba

File tree

5 files changed

+218
-8
lines changed

5 files changed

+218
-8
lines changed

src/hotspot/share/opto/loopTransform.cpp

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2000, 2021, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2000, 2022, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* This code is free software; you can redistribute it and/or modify it
@@ -899,15 +899,27 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
899899
return false;
900900
}
901901

902+
bool should_unroll = true;
903+
902904
// When unroll count is greater than LoopUnrollMin, don't unroll if:
903905
// the residual iterations are more than 10% of the trip count
904906
// and rounds of "unroll,optimize" are not making significant progress
905907
// Progress defined as current size less than 20% larger than previous size.
906908
if (UseSuperWord && cl->node_count_before_unroll() > 0 &&
907909
future_unroll_cnt > LoopUnrollMin &&
908-
(future_unroll_cnt - 1) * (100.0 / LoopPercentProfileLimit) > cl->profile_trip_cnt() &&
910+
is_residual_iters_large(future_unroll_cnt, cl) &&
909911
1.2 * cl->node_count_before_unroll() < (double)_body.size()) {
910-
return false;
912+
if ((cl->slp_max_unroll() == 0) && !is_residual_iters_large(cl->unrolled_count(), cl)) {
913+
// cl->slp_max_unroll() == 0 means that no previous slp analysis has passed.
914+
// slp analysis may fail because the loop IR is too complicated, especially during the early stage
915+
// of loop unrolling analysis. But after several rounds of loop unrolling and other optimizations,
916+
// it's possible that the loop IR becomes simple enough to pass the slp analysis.
917+
// So we don't return immediately, in the hope that the next slp analysis can succeed.
918+
should_unroll = false;
919+
future_unroll_cnt = cl->unrolled_count();
920+
} else {
921+
return false;
922+
}
911923
}
912924

913925
Node *init_n = cl->init_trip();
@@ -985,7 +997,7 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
985997
}
986998

987999
// Only attempt slp analysis when user controls do not prohibit it
988-
if (LoopMaxUnroll > _local_loop_unroll_factor) {
1000+
if (!cl->range_checks_present() && (LoopMaxUnroll > _local_loop_unroll_factor)) {
9891001
// Once policy_slp_analysis succeeds, mark the loop with the
9901002
// maximal unroll factor so that we minimize analysis passes
9911003
if (future_unroll_cnt >= _local_loop_unroll_factor) {
@@ -1003,15 +1015,15 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
10031015

10041016
if (cl->has_passed_slp()) {
10051017
if (slp_max_unroll_factor >= future_unroll_cnt) {
1006-
return phase->may_require_nodes(estimate);
1018+
return should_unroll && phase->may_require_nodes(estimate);
10071019
}
10081020
return false; // Loop too big.
10091021
}
10101022

10111023
// Check for being too big
10121024
if (body_size > (uint)_local_loop_unroll_limit) {
10131025
if ((cl->is_subword_loop() || xors_in_loop >= 4) && body_size < 4u * LoopUnrollLimit) {
1014-
return phase->may_require_nodes(estimate);
1026+
return should_unroll && phase->may_require_nodes(estimate);
10151027
}
10161028
return false; // Loop too big.
10171029
}
@@ -1024,7 +1036,7 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
10241036
}
10251037

10261038
// Unroll once! (Each trip will soon do double iterations)
1027-
return phase->may_require_nodes(estimate);
1039+
return should_unroll && phase->may_require_nodes(estimate);
10281040
}
10291041

10301042
void IdealLoopTree::policy_unroll_slp_analysis(CountedLoopNode *cl, PhaseIdealLoop *phase, int future_unroll_cnt) {
@@ -3528,6 +3540,8 @@ bool IdealLoopTree::iteration_split_impl(PhaseIdealLoop *phase, Node_List &old_n
35283540
if (should_rce) {
35293541
if (phase->do_range_check(this, old_new) != 0) {
35303542
cl->mark_has_range_checks();
3543+
} else {
3544+
cl->clear_has_range_checks();
35313545
}
35323546
} else if (PostLoopMultiversioning) {
35333547
phase->has_range_checks(this);

src/hotspot/share/opto/loopnode.hpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 1998, 2021, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 1998, 2022, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* This code is free software; you can redistribute it and/or modify it
@@ -114,6 +114,7 @@ class LoopNode : public RegionNode {
114114
void mark_loop_vectorized() { _loop_flags |= VectorizedLoop; }
115115
void mark_has_atomic_post_loop() { _loop_flags |= HasAtomicPostLoop; }
116116
void mark_has_range_checks() { _loop_flags |= HasRangeChecks; }
117+
void clear_has_range_checks() { _loop_flags &= ~HasRangeChecks; }
117118
void mark_is_multiversioned() { _loop_flags |= IsMultiversioned; }
118119
void mark_strip_mined() { _loop_flags |= StripMined; }
119120
void clear_strip_mined() { _loop_flags &= ~StripMined; }
@@ -773,6 +774,12 @@ class IdealLoopTree : public ResourceObj {
773774

774775
// Estimate the number of nodes resulting from control and data flow merge.
775776
uint est_loop_flow_merge_sz() const;
777+
778+
// Check if the number of residual iterations is large with unroll_cnt.
779+
// Return true if the residual iterations are more than 10% of the trip count.
780+
bool is_residual_iters_large(int unroll_cnt, CountedLoopNode *cl) const {
781+
return (unroll_cnt - 1) * (100.0 / LoopPercentProfileLimit) > cl->profile_trip_cnt();
782+
}
776783
};
777784

778785
// -----------------------------PhaseIdealLoop---------------------------------
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
/*
2+
* Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
3+
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4+
*
5+
* This code is free software; you can redistribute it and/or modify it
6+
* under the terms of the GNU General Public License version 2 only, as
7+
* published by the Free Software Foundation.
8+
*
9+
* This code is distributed in the hope that it will be useful, but WITHOUT
10+
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11+
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12+
* version 2 for more details (a copy is included in the LICENSE file that
13+
* accompanied this code).
14+
*
15+
* You should have received a copy of the GNU General Public License version
16+
* 2 along with this work; if not, write to the Free Software Foundation,
17+
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18+
*
19+
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20+
* or visit www.oracle.com if you need additional information or have any
21+
* questions.
22+
*/
23+
24+
package compiler.c2.irTests;
25+
26+
import compiler.lib.ir_framework.*;
27+
28+
/*
29+
* @test
30+
* @bug 8279258
31+
* @summary Auto-vectorization enhancement for two-dimensional array operations
32+
* @library /test/lib /
33+
* @run driver compiler.c2.irTests.TestAutoVectorization2DArray
34+
*/
35+
36+
public class TestAutoVectorization2DArray {
37+
final private static int NUM = 64;
38+
39+
private static double[][] a = new double[NUM][NUM];
40+
private static double[][] b = new double[NUM][NUM];
41+
private static double[][] c = new double[NUM][NUM];
42+
43+
public static void main(String[] args) {
44+
TestFramework.run();
45+
}
46+
47+
@Test
48+
@IR(counts = { IRNode.LOAD_VECTOR, " >0 " })
49+
@IR(counts = { IRNode.ADD_VD, " >0 " })
50+
@IR(counts = { IRNode.STORE_VECTOR, " >0 " })
51+
private static void testDouble(double[][] a , double[][] b, double[][] c) {
52+
for(int i = 0; i < a.length; i++) {
53+
for (int j = 0; j < a[0].length; j++) {
54+
a[i][j] = b[i][j] + c[i][j];
55+
}
56+
}
57+
}
58+
59+
@Run(test = "testDouble")
60+
private void testDouble_runner() {
61+
testDouble(a, b, c);
62+
}
63+
}

test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ public class IRNode {
7474
public static final String STORE_D = START + "StoreD" + MID + END;
7575
public static final String STORE_P = START + "StoreP" + MID + END;
7676
public static final String STORE_N = START + "StoreN" + MID + END;
77+
public static final String STORE_VECTOR = START + "StoreVector" + MID + END;
7778
public static final String STORE_OF_CLASS = COMPOSITE_PREFIX + START + "Store(B|C|S|I|L|F|D|P|N)" + MID + "@\\S*" + IS_REPLACED + STORE_OF_CLASS_POSTFIX;
7879
public static final String STORE_B_OF_CLASS = COMPOSITE_PREFIX + START + "StoreB" + MID + "@\\S*" + IS_REPLACED + STORE_OF_CLASS_POSTFIX;
7980
public static final String STORE_C_OF_CLASS = COMPOSITE_PREFIX + START + "StoreC" + MID + "@\\S*" + IS_REPLACED + STORE_OF_CLASS_POSTFIX;
@@ -96,6 +97,7 @@ public class IRNode {
9697
public static final String LOAD_D = START + "LoadD" + MID + END;
9798
public static final String LOAD_P = START + "LoadP" + MID + END;
9899
public static final String LOAD_N = START + "LoadN" + MID + END;
100+
public static final String LOAD_VECTOR = START + "LoadVector" + MID + END;
99101
public static final String LOAD_OF_CLASS = COMPOSITE_PREFIX + START + "Load(B|UB|S|US|I|L|F|D|P|N)" + MID + "@\\S*"+ IS_REPLACED + LOAD_OF_CLASS_POSTFIX;
100102
public static final String LOAD_B_OF_CLASS = COMPOSITE_PREFIX + START + "LoadB" + MID + "@\\S*" + IS_REPLACED + LOAD_OF_CLASS_POSTFIX;
101103
public static final String LOAD_UB_OF_CLASS = COMPOSITE_PREFIX + START + "LoadUB" + MID + "@\\S*" + IS_REPLACED + LOAD_OF_CLASS_POSTFIX;
@@ -139,6 +141,7 @@ public class IRNode {
139141
public static final String LSHIFT_L = START + "LShiftL" + MID + END;
140142
public static final String ADD_I = START + "AddI" + MID + END;
141143
public static final String ADD_L = START + "AddL" + MID + END;
144+
public static final String ADD_VD = START + "AddVD" + MID + END;
142145
public static final String SUB_I = START + "SubI" + MID + END;
143146
public static final String SUB_L = START + "SubL" + MID + END;
144147
public static final String MUL_I = START + "MulI" + MID + END;
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
/*
2+
* Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
3+
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4+
*
5+
* This code is free software; you can redistribute it and/or modify it
6+
* under the terms of the GNU General Public License version 2 only, as
7+
* published by the Free Software Foundation.
8+
*
9+
* This code is distributed in the hope that it will be useful, but WITHOUT
10+
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11+
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12+
* version 2 for more details (a copy is included in the LICENSE file that
13+
* accompanied this code).
14+
*
15+
* You should have received a copy of the GNU General Public License version
16+
* 2 along with this work; if not, write to the Free Software Foundation,
17+
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18+
*
19+
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20+
* or visit www.oracle.com if you need additional information or have any
21+
* questions.
22+
*/
23+
24+
package org.openjdk.bench.vm.compiler;
25+
26+
import org.openjdk.jmh.annotations.*;
27+
import org.openjdk.jmh.infra.*;
28+
import java.util.concurrent.TimeUnit;
29+
30+
@Warmup(iterations = 3, time = 5, timeUnit = TimeUnit.SECONDS)
31+
@Measurement(iterations = 3, time = 5, timeUnit = TimeUnit.SECONDS)
32+
@BenchmarkMode(Mode.Throughput)
33+
@OutputTimeUnit(TimeUnit.SECONDS)
34+
@State(Scope.Thread)
35+
@Fork(value=1)
36+
public class AutoVectorization2DArray {
37+
@Param({"16", "32", "64"})
38+
private int LEN;
39+
40+
private byte[][] a_byte;
41+
private byte[][] b_byte;
42+
private byte[][] c_byte;
43+
44+
private int[][] a_int;
45+
private int[][] b_int;
46+
private int[][] c_int;
47+
48+
private double[][] a_double;
49+
private double[][] b_double;
50+
private double[][] c_double;
51+
52+
@Setup
53+
public void init() {
54+
a_byte = new byte[LEN][LEN];
55+
b_byte = new byte[LEN][LEN];
56+
c_byte = new byte[LEN][LEN];
57+
58+
a_int = new int[LEN][LEN];
59+
b_int = new int[LEN][LEN];
60+
c_int = new int[LEN][LEN];
61+
62+
a_double = new double[LEN][LEN];
63+
b_double = new double[LEN][LEN];
64+
c_double = new double[LEN][LEN];
65+
}
66+
67+
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
68+
private int run_byte(int count, byte[][] a , byte[][] b, byte[][] c) {
69+
for(int i = 0; i < a.length; i++) {
70+
for (int j = 0; j < a[0].length; j++) {
71+
a[i][j] = (byte)(b[i][j] + c[i][j]);
72+
}
73+
}
74+
return a[count][count];
75+
}
76+
77+
@Benchmark
78+
public void test_run_byte(Blackhole bh) {
79+
int r = 0;
80+
for(int i = 0 ; i < 100; i++) {
81+
r += run_byte(i % a_byte.length, a_byte, b_byte, c_byte);
82+
}
83+
bh.consume(r);
84+
}
85+
86+
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
87+
private int run_int(int count, int[][] a, int[][] b, int[][] c) {
88+
for(int i = 0; i < a.length; i++) {
89+
for (int j = 0; j < a[0].length; j++) {
90+
a[i][j] = b[i][j] + c[i][j];
91+
}
92+
}
93+
return a[count][count];
94+
}
95+
96+
@Benchmark
97+
public void test_run_int(Blackhole bh) {
98+
int r = 0;
99+
for(int i = 0 ; i < 100; i++) {
100+
r += run_int(i % a_int.length, a_int, b_int, c_int);
101+
}
102+
bh.consume(r);
103+
}
104+
105+
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
106+
private double run_double(int count, double[][] a, double[][] b, double[][] c) {
107+
for(int i = 0; i < a.length; i++) {
108+
for (int j = 0; j < a[0].length; j++) {
109+
a[i][j] = b[i][j] + c[i][j];
110+
}
111+
}
112+
return a[count][count];
113+
}
114+
115+
@Benchmark
116+
public void test_run_double(Blackhole bh) {
117+
double r = 0;
118+
for(int i = 0 ; i < 100; i++) {
119+
r += run_double(i % a_double.length, a_double, b_double, c_double);
120+
}
121+
bh.consume(r);
122+
}
123+
}

0 commit comments

Comments
 (0)