3 changes: 3 additions & 0 deletions make/common/native/Flags.gmk
@@ -234,6 +234,9 @@ define SetupLinkerFlags
ifeq ($(call isTargetOs, macosx), true)
$1_EXTRA_LDFLAGS += -Wl,-object_path_lto,$$($1_OBJECT_DIR)/$$($1_NAME)_lto_helper.o
endif
ifeq ($(TOOLCHAIN_TYPE), microsoft)
$1_EXTRA_LDFLAGS += -LTCGOUT:$$($1_OBJECT_DIR)/$$($1_NAME).iobj
endif
endif

$1_EXTRA_LDFLAGS += $$($1_LDFLAGS_$(OPENJDK_TARGET_OS_TYPE)) $$($1_LDFLAGS_$(OPENJDK_TARGET_OS)) \
29 changes: 18 additions & 11 deletions src/hotspot/cpu/aarch64/aarch64_vector.ad
@@ -346,8 +346,14 @@ source %{
}

bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect* vt) {
// Only SVE has partial vector operations
if (UseSVE == 0) {
// 1. Only SVE requires partial vector operations.
// 2. The vector size in bytes must be smaller than MaxVectorSize.
// 3. Predicated vectors have a mask input, which guarantees that
// out-of-bounds lanes remain inactive.
int length_in_bytes = vt->length_in_bytes();
if (UseSVE == 0 ||
length_in_bytes == MaxVectorSize ||
node->is_predicated_vector()) {
return false;
}

@@ -370,21 +376,22 @@ source %{
return !node->in(1)->is_Con();
case Op_LoadVector:
case Op_StoreVector:
// We use NEON load/store instructions if the vector length is <= 128 bits.
return vt->length_in_bytes() > 16;
case Op_AddReductionVI:
case Op_AddReductionVL:
// We may prefer using NEON instructions rather than SVE partial operations.
return !VM_Version::use_neon_for_vector(vt->length_in_bytes());
// For these ops, we prefer using NEON instructions rather than SVE
// predicated instructions for better performance.
return !VM_Version::use_neon_for_vector(length_in_bytes);
case Op_MinReductionV:
case Op_MaxReductionV:
// For BYTE/SHORT/INT/FLOAT/DOUBLE types, we may prefer using NEON
// instructions rather than SVE partial operations.
// For BYTE/SHORT/INT/FLOAT/DOUBLE types, we prefer using NEON
// instructions rather than SVE predicated instructions for
// better performance.
return vt->element_basic_type() == T_LONG ||
!VM_Version::use_neon_for_vector(vt->length_in_bytes());
!VM_Version::use_neon_for_vector(length_in_bytes);
default:
// For other ops whose vector size is smaller than the max vector size, a
// full-sized unpredicated operation does not impact the final vector result.
// For other ops whose vector size is smaller than the max vector
// size, a full-sized unpredicated operation does not impact the
// vector result.
return false;
}
}
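To make the load/store rule above concrete: a minimal, standalone sketch, assuming an SVE machine with a 32-byte MaxVectorSize (the constant and function name are illustrative, not HotSpot identifiers). NEON covers vectors up to 128 bits, so only longer but still sub-maximal vectors need an SVE predicated form.

    // Illustrative model of the Op_LoadVector/Op_StoreVector case; the
    // constant and function name are assumptions, not HotSpot identifiers.
    #include <cstdio>
    #include <initializer_list>

    constexpr int kMaxVectorSize = 32; // assumed SVE machine, 256-bit vectors

    // A plain vector load/store needs an SVE predicated (partial) form only
    // when it is wider than NEON's 16 bytes yet narrower than the full
    // hardware vector width.
    bool load_store_needs_partial(int length_in_bytes) {
      return length_in_bytes > 16 && length_in_bytes < kMaxVectorSize;
    }

    int main() {
      for (int bytes : {8, 16, 24, 32}) {
        std::printf("%2d-byte vector -> %s\n", bytes,
                    load_store_needs_partial(bytes) ? "SVE predicated"
                                                    : "NEON or full-width SVE");
      }
      return 0;
    }

The same change is mirrored in the generated .ad file above and its .m4 source below.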
29 changes: 18 additions & 11 deletions src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
@@ -336,8 +336,14 @@ source %{
}

bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect* vt) {
// Only SVE has partial vector operations
if (UseSVE == 0) {
// 1. Only SVE requires partial vector operations.
// 2. The vector size in bytes must be smaller than MaxVectorSize.
// 3. Predicated vectors have a mask input, which guarantees that
// out-of-bounds lanes remain inactive.
int length_in_bytes = vt->length_in_bytes();
if (UseSVE == 0 ||
length_in_bytes == MaxVectorSize ||
node->is_predicated_vector()) {
return false;
}

@@ -360,21 +366,22 @@ source %{
return !node->in(1)->is_Con();
case Op_LoadVector:
case Op_StoreVector:
// We use NEON load/store instructions if the vector length is <= 128 bits.
return vt->length_in_bytes() > 16;
case Op_AddReductionVI:
case Op_AddReductionVL:
// We may prefer using NEON instructions rather than SVE partial operations.
return !VM_Version::use_neon_for_vector(vt->length_in_bytes());
// For these ops, we prefer using NEON instructions rather than SVE
// predicated instructions for better performance.
return !VM_Version::use_neon_for_vector(length_in_bytes);
case Op_MinReductionV:
case Op_MaxReductionV:
// For BYTE/SHORT/INT/FLOAT/DOUBLE types, we may prefer using NEON
// instructions rather than SVE partial operations.
// For BYTE/SHORT/INT/FLOAT/DOUBLE types, we prefer using NEON
// instructions rather than SVE predicated instructions for
// better performance.
return vt->element_basic_type() == T_LONG ||
!VM_Version::use_neon_for_vector(vt->length_in_bytes());
!VM_Version::use_neon_for_vector(length_in_bytes);
default:
// For other ops whose vector size is smaller than the max vector size, a
// full-sized unpredicated operation does not impact the final vector result.
// For other ops whose vector size is smaller than the max vector
// size, a full-sized unpredicated operation does not impact the
// vector result.
return false;
}
}
1 change: 0 additions & 1 deletion src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp
@@ -5379,7 +5379,6 @@ void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
assert (UseCompressedClassPointers, "should only be used for compressed headers");
assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
int index = oop_recorder()->find_index(k);
assert(! Universe::heap()->is_in(k), "should not be an oop");

InstructionMark im(this);
RelocationHolder rspec = metadata_Relocation::spec(index);
29 changes: 29 additions & 0 deletions src/hotspot/cpu/ppc/ppc.ad
@@ -6335,8 +6335,36 @@ instruct loadConD_Ex(regD dst, immD src) %{
// Prefetch instructions.
// Must be safe to execute with invalid address (cannot fault).

// Special prefetch versions which use the dcbz instruction.
instruct prefetch_alloc_zero(indirectMemory mem, iRegLsrc src) %{
match(PrefetchAllocation (AddP mem src));
predicate(AllocatePrefetchStyle == 3);
ins_cost(MEMORY_REF_COST);

format %{ "PREFETCH $mem, 2, $src \t// Prefetch write-many with zero" %}
size(4);
ins_encode %{
__ dcbz($src$$Register, $mem$$base$$Register);
%}
ins_pipe(pipe_class_memory);
%}

instruct prefetch_alloc_zero_no_offset(indirectMemory mem) %{
match(PrefetchAllocation mem);
predicate(AllocatePrefetchStyle == 3);
ins_cost(MEMORY_REF_COST);

format %{ "PREFETCH $mem, 2 \t// Prefetch write-many with zero" %}
size(4);
ins_encode %{
__ dcbz($mem$$base$$Register);
%}
ins_pipe(pipe_class_memory);
%}

instruct prefetch_alloc(indirectMemory mem, iRegLsrc src) %{
match(PrefetchAllocation (AddP mem src));
predicate(AllocatePrefetchStyle != 3);
ins_cost(MEMORY_REF_COST);

format %{ "PREFETCH $mem, 2, $src \t// Prefetch write-many" %}
@@ -6349,6 +6377,7 @@ instruct prefetch_alloc(indirectMemory mem, iRegLsrc src) %{

instruct prefetch_alloc_no_offset(indirectMemory mem) %{
match(PrefetchAllocation mem);
predicate(AllocatePrefetchStyle != 3);
ins_cost(MEMORY_REF_COST);

format %{ "PREFETCH $mem, 2 \t// Prefetch write-many" %}
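Conceptually, AllocatePrefetchStyle == 3 swaps a read prefetch for dcbz, which establishes a zero-filled cache line for the upcoming allocation without fetching the line's stale contents from memory. A rough C++ model of that idea, assuming a 128-byte line size; real PPC hardware does this in a single instruction.

    #include <cstddef>
    #include <cstring>

    constexpr std::size_t kCacheLine = 128; // assumed PPC cache-line size

    // Stand-in for dcbz: make the next allocation line cache-resident and
    // zero-filled, so the allocator's upcoming stores never wait on a read
    // of the line's old contents.
    inline void prefetch_alloc_zero(void* next_alloc_line) {
      std::memset(next_alloc_line, 0, kCacheLine);
    }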
1 change: 0 additions & 1 deletion src/hotspot/cpu/riscv/macroAssembler_riscv.cpp
@@ -4933,7 +4933,6 @@ void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
assert (UseCompressedClassPointers, "should only be used for compressed headers");
assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
int index = oop_recorder()->find_index(k);
assert(!Universe::heap()->is_in(k), "should not be an oop");

narrowKlass nk = CompressedKlassPointers::encode(k);
relocate(metadata_Relocation::spec(index), [&] {
17 changes: 17 additions & 0 deletions src/hotspot/share/gc/serial/serialHeap.cpp
@@ -630,6 +630,14 @@ bool SerialHeap::requires_barriers(stackChunkOop obj) const {

// Returns "TRUE" iff "p" points into the committed areas of the heap.
bool SerialHeap::is_in(const void* p) const {
// precondition
verify_not_in_native_if_java_thread();

if (!is_in_reserved(p)) {
// Outside the reserved heap range, so it cannot be in either generation.
return false;
}

return _young_gen->is_in(p) || _old_gen->is_in(p);
}

@@ -797,3 +805,12 @@ void SerialHeap::gc_epilogue(bool full) {

MetaspaceCounters::update_performance_counters();
};

#ifdef ASSERT
void SerialHeap::verify_not_in_native_if_java_thread() {
if (Thread::current()->is_Java_thread()) {
JavaThread* thread = JavaThread::current();
assert(thread->thread_state() != _thread_in_native, "precondition");
}
}
#endif
2 changes: 2 additions & 0 deletions src/hotspot/share/gc/serial/serialHeap.hpp
@@ -111,6 +111,8 @@ class SerialHeap : public CollectedHeap {
void print_tracing_info() const override;
void stop() override {};

static void verify_not_in_native_if_java_thread() NOT_DEBUG_RETURN;

public:
// Returns JNI_OK on success
jint initialize() override;
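The NOT_DEBUG_RETURN declaration follows HotSpot's debug-only-member idiom: debug builds declare the method normally and define it out of line, while release builds expand the macro to an empty inline body so the call compiles to nothing. A simplified sketch of the pattern, with hypothetical names:

    // Simplified sketch; Example and verify_something are stand-ins.
    #ifdef ASSERT
    #define NOT_DEBUG_RETURN /* declared here, defined out of line below */
    #else
    #define NOT_DEBUG_RETURN {} /* expands to an empty inline body */
    #endif

    struct Example {
      static void verify_something() NOT_DEBUG_RETURN;
    };

    #ifdef ASSERT
    void Example::verify_something() {
      // real precondition checks, compiled only into debug builds
    }
    #endif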
28 changes: 27 additions & 1 deletion src/hotspot/share/gc/shared/threadLocalAllocBuffer.cpp
@@ -37,6 +37,7 @@
#include "utilities/copy.hpp"

size_t ThreadLocalAllocBuffer::_max_size = 0;
int ThreadLocalAllocBuffer::_reserve_for_allocation_prefetch = 0;
unsigned int ThreadLocalAllocBuffer::_target_refills = 0;

ThreadLocalAllocBuffer::ThreadLocalAllocBuffer() :
@@ -224,6 +225,30 @@ void ThreadLocalAllocBuffer::startup_initialization() {
// abort during VM initialization.
_target_refills = MAX2(_target_refills, 2U);

#ifdef COMPILER2
// If the C2 compiler is present, extra space is needed at the end of
// TLABs, otherwise prefetching instructions generated by the C2
// compiler will fault (due to accessing memory outside of heap).
// The amount of space is the max of the number of lines to
// prefetch for array and for instance allocations. (Extra space must be
// reserved to accommodate both types of allocations.)
//
// Only SPARC-specific BIS instructions are known to fault. (Those
// instructions are generated if AllocatePrefetchStyle==3 and
// AllocatePrefetchInstr==1). To be on the safe side, however,
// extra space is reserved for all combinations of
// AllocatePrefetchStyle and AllocatePrefetchInstr.
//
// If the C2 compiler is not present, no space is reserved.

// +1 for rounding up to next cache line, +1 to be safe
if (CompilerConfig::is_c2_or_jvmci_compiler_enabled()) {
int lines = MAX2(AllocatePrefetchLines, AllocateInstancePrefetchLines) + 2;
_reserve_for_allocation_prefetch = (AllocatePrefetchDistance + AllocatePrefetchStepSize * lines) /
(int)HeapWordSize;
}
#endif

// During jvm startup, the main thread is initialized
// before the heap is initialized. So reinitialize it now.
guarantee(Thread::current()->is_Java_thread(), "tlab initialization thread not Java thread");
@@ -429,7 +454,8 @@ void ThreadLocalAllocStats::publish() {
}

size_t ThreadLocalAllocBuffer::end_reserve() {
return CollectedHeap::lab_alignment_reserve();
size_t reserve_size = CollectedHeap::lab_alignment_reserve();
return MAX2(reserve_size, (size_t)_reserve_for_allocation_prefetch);
}

const HeapWord* ThreadLocalAllocBuffer::start_relaxed() const {
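A worked example of the reserve computation above, using assumed, illustrative flag values (actual defaults vary by platform):

    #include <algorithm>
    #include <cstdio>

    int main() {
      // Assumed, illustrative flag values; real defaults are set per platform.
      int AllocatePrefetchLines         = 3;
      int AllocateInstancePrefetchLines = 1;
      int AllocatePrefetchDistance      = 192;
      int AllocatePrefetchStepSize      = 64;
      int HeapWordSize                  = 8;

      // +1 for rounding up to the next cache line, +1 to be safe.
      int lines = std::max(AllocatePrefetchLines, AllocateInstancePrefetchLines) + 2;
      int reserve = (AllocatePrefetchDistance + AllocatePrefetchStepSize * lines)
                    / HeapWordSize;
      std::printf("reserve = %d heap words (%d bytes)\n",
                  reserve, reserve * HeapWordSize); // 64 heap words (512 bytes)
      return 0;
    }

end_reserve() then hands out the larger of this prefetch reserve and the usual LAB alignment reserve.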
1 change: 1 addition & 0 deletions src/hotspot/share/gc/shared/threadLocalAllocBuffer.hpp
@@ -58,6 +58,7 @@ class ThreadLocalAllocBuffer: public CHeapObj<mtThread> {
size_t _allocated_before_last_gc; // total bytes allocated up until the last gc

static size_t _max_size; // maximum size of any TLAB
static int _reserve_for_allocation_prefetch; // Reserve at the end of the TLAB
static unsigned _target_refills; // expected number of refills between GCs

unsigned _number_of_refills;
4 changes: 2 additions & 2 deletions src/hotspot/share/gc/shenandoah/shenandoahBarrierSet.hpp
@@ -128,8 +128,8 @@ class ShenandoahBarrierSet: public BarrierSet {
void write_ref_array(HeapWord* start, size_t count);

private:
template <class T>
inline void arraycopy_marking(T* dst, size_t count);
template <bool IS_GENERATIONAL, class T>
void arraycopy_marking(T* dst, size_t count);
template <class T>
inline void arraycopy_evacuation(T* src, size_t count);
template <class T>
11 changes: 8 additions & 3 deletions src/hotspot/share/gc/shenandoah/shenandoahBarrierSet.inline.hpp
@@ -429,7 +429,11 @@ void ShenandoahBarrierSet::arraycopy_barrier(T* src, T* dst, size_t count) {
// If marking old or young, we must evaluate the SATB barrier. This will be the only
// action if we are not marking old. If we are marking old, we must still evaluate the
// load reference barrier for a young collection.
arraycopy_marking(dst, count);
if (_heap->mode()->is_generational()) {
arraycopy_marking<true>(dst, count);
} else {
arraycopy_marking<false>(dst, count);
}
}

if ((gc_state & ShenandoahHeap::EVACUATION) != 0) {
@@ -441,11 +445,12 @@
}
}

template <class T>
template <bool IS_GENERATIONAL, class T>
void ShenandoahBarrierSet::arraycopy_marking(T* dst, size_t count) {
assert(_heap->is_concurrent_mark_in_progress(), "only during marking");
if (ShenandoahSATBBarrier) {
if (!_heap->marking_context()->allocated_after_mark_start(reinterpret_cast<HeapWord*>(dst))) {
if (!_heap->marking_context()->allocated_after_mark_start(reinterpret_cast<HeapWord*>(dst)) ||
(IS_GENERATIONAL && _heap->heap_region_containing(dst)->is_old() && _heap->is_concurrent_young_mark_in_progress())) {
arraycopy_work<T, false, false, true>(dst, count);
}
}
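Passing the generational mode as a template parameter hoists the mode check out of the per-element path: each instantiation constant-folds the flag, leaving a single runtime branch at the call site. A generic sketch of the pattern, with hypothetical names:

    #include <cstddef>

    template <bool IS_GENERATIONAL>
    void copy_with_barrier(int* dst, std::size_t count) {
      for (std::size_t i = 0; i < count; i++) {
        if (IS_GENERATIONAL) {
          // Extra old-gen bookkeeping; folded away in the <false> instantiation.
        }
        // Common barrier work on dst[i]...
      }
    }

    void copy_dispatch(bool generational, int* dst, std::size_t count) {
      // One branch here, instead of one per element inside the loop.
      if (generational) {
        copy_with_barrier<true>(dst, count);
      } else {
        copy_with_barrier<false>(dst, count);
      }
    }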
3 changes: 2 additions & 1 deletion src/hotspot/share/opto/macro.cpp
@@ -1914,7 +1914,8 @@ Node* PhaseMacroExpand::prefetch_allocation(Node* i_o, Node*& needgc_false,
transform_later(cache_adr);
cache_adr = new CastP2XNode(needgc_false, cache_adr);
transform_later(cache_adr);
// Address is aligned to execute prefetch to the beginning of cache line size.
// Address is aligned to execute prefetch to the beginning of cache line size
// (it is important when BIS instruction is used on SPARC as prefetch).
Node* mask = _igvn.MakeConX(~(intptr_t)(step_size-1));
cache_adr = new AndXNode(cache_adr, mask);
transform_later(cache_adr);
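The mask computation this comment refers to aligns the prefetch address down to a cache-line boundary by clearing the low bits. A worked example, assuming a 64-byte step size:

    #include <cstdint>
    #include <cstdio>

    int main() {
      std::intptr_t step_size = 64;          // assumed cache-line/step size
      std::intptr_t mask = ~(step_size - 1); // ...1111000000 in binary
      std::intptr_t addr = 0x1234;
      // Clearing the low 6 bits aligns down: 0x1234 -> 0x1200.
      std::printf("%#lx -> %#lx\n", (long)addr, (long)(addr & mask));
      return 0;
    }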
4 changes: 4 additions & 0 deletions src/hotspot/share/opto/matcher.hpp
@@ -329,6 +329,10 @@ class Matcher : public PhaseTransform {

static bool match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt);

// Determines whether a vector operation needs to be implemented as a partial
// operation, with a mask ensuring that only the lanes in the range
// [0, vector_length) are processed. This applies to operations whose vector
// length is less than the hardware-supported maximum vector length. Returns
// true if the operation requires masking, false otherwise.
static bool vector_needs_partial_operations(Node* node, const TypeVect* vt);

static bool vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen);
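As a concrete picture of what "partial" means here: lanes below the logical vector length are active, while higher hardware lanes are masked off and left untouched. A scalar model, with an assumed 8-lane width and hypothetical names:

    #include <array>
    #include <cstddef>

    constexpr std::size_t kMaxLanes = 8; // assumed hardware vector width

    // Predicated add: lanes [0, vlen) are active; lanes [vlen, kMaxLanes)
    // are inactive and keep their previous contents.
    void partial_add(std::array<int, kMaxLanes>& dst,
                     const std::array<int, kMaxLanes>& src,
                     std::size_t vlen) {
      for (std::size_t lane = 0; lane < kMaxLanes; lane++) {
        if (lane < vlen) {
          dst[lane] += src[lane];
        }
      }
    }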