Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 18 additions & 5 deletions src/hotspot/share/opto/vectorization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,10 @@ VStatus VLoopAnalyzer::setup_submodules_helper() {
return body_status;
}

_memory_slices.find_memory_slices();
VStatus slices_status = _memory_slices.find_memory_slices();
if (!slices_status.is_success()) {
return slices_status;
}

// If there is no memory slice detected, it means there is no store.
// If there is no reduction and no store, then we give up, because
Expand All @@ -207,9 +210,11 @@ VStatus VLoopAnalyzer::setup_submodules_helper() {
}

// There are 2 kinds of slices:
// - No memory phi: only loads. All have the same input memory state from before the loop.
// - No memory phi: only loads.
// - Usually, all loads have the same input memory state from before the loop.
// - Only rarely this is not the case, and we just bail out for now.
// - With memory phi. Chain of memory operations inside the loop.
void VLoopMemorySlices::find_memory_slices() {
VStatus VLoopMemorySlices::find_memory_slices() {
Compile* C = _vloop.phase()->C;
// We iterate over the body, which is topologically sorted. Hence, if there is a phi
// in a slice, we will find it first, and the loads and stores afterwards.
Expand All @@ -228,8 +233,15 @@ void VLoopMemorySlices::find_memory_slices() {
PhiNode* head = _heads.at(alias_idx);
if (head == nullptr) {
// We did not find a phi on this slice yet -> must be a slice with only loads.
assert(_inputs.at(alias_idx) == nullptr || _inputs.at(alias_idx) == load->in(1),
"not yet touched or the same input");
// For now, we can only handle slices with a single memory input before the loop,
// so if we find multiple, we bail out of auto vectorization. If this becomes
// too restrictive in the future, we could consider tracking multiple inputs.
// Different memory inputs can for example happen if one load has its memory state
// optimized, and the other load fails to have it optimized, for example because
// it does not end up on the IGVN worklist any more.
if (_inputs.at(alias_idx) != nullptr && _inputs.at(alias_idx) != load->in(1)) {
return VStatus::make_failure(FAILURE_DIFFERENT_MEMORY_INPUT);
}
_inputs.at_put(alias_idx, load->in(1));
} // else: the load belongs to a slice with a phi that already set heads and inputs.
#ifdef ASSERT
Expand All @@ -243,6 +255,7 @@ void VLoopMemorySlices::find_memory_slices() {
}
}
NOT_PRODUCT( if (_vloop.is_trace_memory_slices()) { print(); } )
return VStatus::make_success();
}

#ifndef PRODUCT
Expand Down
6 changes: 4 additions & 2 deletions src/hotspot/share/opto/vectorization.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023, 2025, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2023, 2026, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2023, Arm Limited. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
Expand Down Expand Up @@ -504,6 +504,8 @@ class VLoopBody : public StackObj {
//
class VLoopMemorySlices : public StackObj {
private:
static constexpr char const* FAILURE_DIFFERENT_MEMORY_INPUT = "Load only slice has multiple memory inputs";

const VLoop& _vloop;
const VLoopBody& _body;

Expand All @@ -521,7 +523,7 @@ class VLoopMemorySlices : public StackObj {
const GrowableArray<Node*>& inputs() const { return _inputs; }
const GrowableArray<PhiNode*>& heads() const { return _heads; }

void find_memory_slices();
VStatus find_memory_slices();
void get_slice_in_reverse_order(PhiNode* head, MemNode* tail, GrowableArray<MemNode*>& slice) const;
bool same_memory_slice(MemNode* m1, MemNode* m2) const;

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
/*
* Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/

/*
* @test id=all-flags
* @summary Test a case where we can have one memory slice that has only loads,
* but the loads from the slice do not have all the same input memory
* state from before the loop. This is rather rare but it can happen.
* @bug 8373453
* @run main/othervm
* -XX:CompileCommand=compileonly,${test.main.class}::test
* -Xbatch -XX:-TieredCompilation
* ${test.main.class}
*/

/*
* @test id=fewer-flags
* @bug 8373453
* @run main/othervm
* -XX:CompileCommand=compileonly,${test.main.class}::test
* ${test.main.class}
*/

/*
* @test id=vanilla
* @bug 8373453
* @run main ${test.main.class}
*/

package compiler.loopopts.superword;

/**
 * Regression test for bug 8373453: a memory slice that contains only loads,
 * where the loads do not all have the same input memory state from before
 * the loop.
 *
 * The test has no explicit checks; it only runs {@link #test()} many times.
 * NOTE(review): presumably it passes as long as the VM does not fail while
 * compiling or running test() -- confirm against the bug report.
 */
public class TestLoadSliceWithMultipleMemoryInputStates {
/**
 * Reproducer body: nested loops that repeatedly unbox and re-box the local
 * {@code Byte x}. Takes no arguments and returns nothing; {@code x} is not
 * read again after the loops, so only the shape of the code matters.
 */
static void test() {
// The relevant slice is the value field of the Byte Objects.
Byte x = 1;

// Only the even iteration (i == 0) executes the body below.
for (int i = 0; i < 2; i++) {
if ((i & 1) == 0) {
// It is not clear why this loop is needed, but the reproducer is very
// sensitive to it: N cannot even be replaced with the literal 32.
int N = 32;
for (int j = 0; j < N; j++) {
if (j == 1) {
// Unbox x to a byte and box it again (only when j == 1).
x = (byte) x;
}
}

for (int j = 0; j < 32; j++) {
// The call below has an effect on the memory state.
// If we optimize the Load for Byte::value, we can bypass
// this call, since we know that Byte::value cannot be
// modified during the call.
Object o = 1;
o.toString();

for (int k = 0; k < 32; k++) { // OSR around here
// Loads of the x byte field have different memory input states.
// This is because some loads can split their memory state
// through a phi further up, and others are not put back on
// the IGVN worklist and are thus not optimized and keep
// the old memory state. Both are correct though.
x = (byte) (x + 1);
}
}
}
}
}

/**
 * Entry point: calls {@link #test()} 10_000 times; {@code args} is unused.
 * NOTE(review): presumably the repetition is there to make test() hot enough
 * to be JIT-compiled -- confirm.
 */
public static void main(String[] args) {
for (int i = 0; i < 10_000; i++) {
test();
}
}
}