diff --git a/make/common/native/Flags.gmk b/make/common/native/Flags.gmk index efb4c08e74c..6353b490654 100644 --- a/make/common/native/Flags.gmk +++ b/make/common/native/Flags.gmk @@ -234,6 +234,9 @@ define SetupLinkerFlags ifeq ($(call isTargetOs, macosx), true) $1_EXTRA_LDFLAGS += -Wl,-object_path_lto,$$($1_OBJECT_DIR)/$$($1_NAME)_lto_helper.o endif + ifeq ($(TOOLCHAIN_TYPE), microsoft) + $1_EXTRA_LDFLAGS += -LTCGOUT:$$($1_OBJECT_DIR)/$$($1_NAME).iobj + endif endif $1_EXTRA_LDFLAGS += $$($1_LDFLAGS_$(OPENJDK_TARGET_OS_TYPE)) $$($1_LDFLAGS_$(OPENJDK_TARGET_OS)) \ diff --git a/src/hotspot/cpu/aarch64/aarch64_vector.ad b/src/hotspot/cpu/aarch64/aarch64_vector.ad index 842784d1a29..78ef121bd29 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector.ad +++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad @@ -346,8 +346,14 @@ source %{ } bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect* vt) { - // Only SVE has partial vector operations - if (UseSVE == 0) { + // 1. Only SVE requires partial vector operations. + // 2. The vector size in bytes must be smaller than MaxVectorSize. + // 3. Predicated vectors have a mask input, which guarantees that + // out-of-bounds lanes remain inactive. + int length_in_bytes = vt->length_in_bytes(); + if (UseSVE == 0 || + length_in_bytes == MaxVectorSize || + node->is_predicated_vector()) { return false; } @@ -370,21 +376,22 @@ source %{ return !node->in(1)->is_Con(); case Op_LoadVector: case Op_StoreVector: - // We use NEON load/store instructions if the vector length is <= 128 bits. - return vt->length_in_bytes() > 16; case Op_AddReductionVI: case Op_AddReductionVL: - // We may prefer using NEON instructions rather than SVE partial operations. - return !VM_Version::use_neon_for_vector(vt->length_in_bytes()); + // For these ops, we prefer using NEON instructions rather than SVE + // predicated instructions for better performance. + return !VM_Version::use_neon_for_vector(length_in_bytes); case Op_MinReductionV: case Op_MaxReductionV: - // For BYTE/SHORT/INT/FLOAT/DOUBLE types, we may prefer using NEON - // instructions rather than SVE partial operations. + // For BYTE/SHORT/INT/FLOAT/DOUBLE types, we prefer using NEON + // instructions rather than SVE predicated instructions for + // better performance. return vt->element_basic_type() == T_LONG || - !VM_Version::use_neon_for_vector(vt->length_in_bytes()); + !VM_Version::use_neon_for_vector(length_in_bytes); default: - // For other ops whose vector size is smaller than the max vector size, a - // full-sized unpredicated operation does not impact the final vector result. + // For other ops whose vector size is smaller than the max vector + // size, a full-sized unpredicated operation does not impact the + // vector result. return false; } } diff --git a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 index dff82ce95ac..66dc22c3758 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 +++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 @@ -336,8 +336,14 @@ source %{ } bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect* vt) { - // Only SVE has partial vector operations - if (UseSVE == 0) { + // 1. Only SVE requires partial vector operations. + // 2. The vector size in bytes must be smaller than MaxVectorSize. + // 3. Predicated vectors have a mask input, which guarantees that + // out-of-bounds lanes remain inactive. 
+ int length_in_bytes = vt->length_in_bytes(); + if (UseSVE == 0 || + length_in_bytes == MaxVectorSize || + node->is_predicated_vector()) { return false; } @@ -360,21 +366,22 @@ source %{ return !node->in(1)->is_Con(); case Op_LoadVector: case Op_StoreVector: - // We use NEON load/store instructions if the vector length is <= 128 bits. - return vt->length_in_bytes() > 16; case Op_AddReductionVI: case Op_AddReductionVL: - // We may prefer using NEON instructions rather than SVE partial operations. - return !VM_Version::use_neon_for_vector(vt->length_in_bytes()); + // For these ops, we prefer using NEON instructions rather than SVE + // predicated instructions for better performance. + return !VM_Version::use_neon_for_vector(length_in_bytes); case Op_MinReductionV: case Op_MaxReductionV: - // For BYTE/SHORT/INT/FLOAT/DOUBLE types, we may prefer using NEON - // instructions rather than SVE partial operations. + // For BYTE/SHORT/INT/FLOAT/DOUBLE types, we prefer using NEON + // instructions rather than SVE predicated instructions for + // better performance. return vt->element_basic_type() == T_LONG || - !VM_Version::use_neon_for_vector(vt->length_in_bytes()); + !VM_Version::use_neon_for_vector(length_in_bytes); default: - // For other ops whose vector size is smaller than the max vector size, a - // full-sized unpredicated operation does not impact the final vector result. + // For other ops whose vector size is smaller than the max vector + // size, a full-sized unpredicated operation does not impact the + // vector result. return false; } } diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp index ceedb4f1063..2ccc755be3c 100644 --- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp @@ -5379,7 +5379,6 @@ void MacroAssembler::set_narrow_klass(Register dst, Klass* k) { assert (UseCompressedClassPointers, "should only be used for compressed headers"); assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder"); int index = oop_recorder()->find_index(k); - assert(! Universe::heap()->is_in(k), "should not be an oop"); InstructionMark im(this); RelocationHolder rspec = metadata_Relocation::spec(index); diff --git a/src/hotspot/cpu/ppc/ppc.ad b/src/hotspot/cpu/ppc/ppc.ad index aa00609094e..2a0a9149bb3 100644 --- a/src/hotspot/cpu/ppc/ppc.ad +++ b/src/hotspot/cpu/ppc/ppc.ad @@ -6335,8 +6335,36 @@ instruct loadConD_Ex(regD dst, immD src) %{ // Prefetch instructions. // Must be safe to execute with invalid address (cannot fault). +// Special prefetch versions which use the dcbz instruction. 
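The dcbz-based rules that follow below match PrefetchAllocation to dcbz, a PowerPC instruction that zeroes an entire cache line without first reading it from memory; since the JVM must zero newly allocated memory anyway, this "prefetch" also does useful work. As a rough illustration of the path being exercised — a minimal sketch in which AllocatePrefetchStyle is the real HotSpot flag, but the demo class and its loop bounds are made up for the example:

    // Run on PPC with: java -XX:AllocatePrefetchStyle=3 AllocPrefetchDemo
    // Style 3 selects the zeroing prefetch matched by the prefetch_alloc_zero
    // rules below.
    public class AllocPrefetchDemo {
        public static void main(String[] args) {
            long sink = 0;
            for (int i = 0; i < 10_000_000; i++) {
                // A hot allocation loop: on this path C2 emits a
                // PrefetchAllocation node a fixed distance ahead of the TLAB top.
                byte[] b = new byte[64];
                sink += b.length;
            }
            System.out.println(sink);
        }
    }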
+instruct prefetch_alloc_zero(indirectMemory mem, iRegLsrc src) %{ + match(PrefetchAllocation (AddP mem src)); + predicate(AllocatePrefetchStyle == 3); + ins_cost(MEMORY_REF_COST); + + format %{ "PREFETCH $mem, 2, $src \t// Prefetch write-many with zero" %} + size(4); + ins_encode %{ + __ dcbz($src$$Register, $mem$$base$$Register); + %} + ins_pipe(pipe_class_memory); +%} + +instruct prefetch_alloc_zero_no_offset(indirectMemory mem) %{ + match(PrefetchAllocation mem); + predicate(AllocatePrefetchStyle == 3); + ins_cost(MEMORY_REF_COST); + + format %{ "PREFETCH $mem, 2 \t// Prefetch write-many with zero" %} + size(4); + ins_encode %{ + __ dcbz($mem$$base$$Register); + %} + ins_pipe(pipe_class_memory); +%} + instruct prefetch_alloc(indirectMemory mem, iRegLsrc src) %{ match(PrefetchAllocation (AddP mem src)); + predicate(AllocatePrefetchStyle != 3); ins_cost(MEMORY_REF_COST); format %{ "PREFETCH $mem, 2, $src \t// Prefetch write-many" %} @@ -6349,6 +6377,7 @@ instruct prefetch_alloc(indirectMemory mem, iRegLsrc src) %{ instruct prefetch_alloc_no_offset(indirectMemory mem) %{ match(PrefetchAllocation mem); + predicate(AllocatePrefetchStyle != 3); ins_cost(MEMORY_REF_COST); format %{ "PREFETCH $mem, 2 \t// Prefetch write-many" %} diff --git a/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp b/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp index 7a8496ae42b..8ffd54fd7ee 100644 --- a/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp +++ b/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp @@ -4933,7 +4933,6 @@ void MacroAssembler::set_narrow_klass(Register dst, Klass* k) { assert (UseCompressedClassPointers, "should only be used for compressed headers"); assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder"); int index = oop_recorder()->find_index(k); - assert(!Universe::heap()->is_in(k), "should not be an oop"); narrowKlass nk = CompressedKlassPointers::encode(k); relocate(metadata_Relocation::spec(index), [&] { diff --git a/src/hotspot/share/gc/serial/serialHeap.cpp b/src/hotspot/share/gc/serial/serialHeap.cpp index 932c06b8109..104924c1cad 100644 --- a/src/hotspot/share/gc/serial/serialHeap.cpp +++ b/src/hotspot/share/gc/serial/serialHeap.cpp @@ -630,6 +630,14 @@ bool SerialHeap::requires_barriers(stackChunkOop obj) const { // Returns "TRUE" iff "p" points into the committed areas of the heap. bool SerialHeap::is_in(const void* p) const { + // precondition + verify_not_in_native_if_java_thread(); + + if (!is_in_reserved(p)) { + // If it's not even in reserved. 
+ return false; + } + return _young_gen->is_in(p) || _old_gen->is_in(p); } @@ -797,3 +805,12 @@ void SerialHeap::gc_epilogue(bool full) { MetaspaceCounters::update_performance_counters(); }; + +#ifdef ASSERT +void SerialHeap::verify_not_in_native_if_java_thread() { + if (Thread::current()->is_Java_thread()) { + JavaThread* thread = JavaThread::current(); + assert(thread->thread_state() != _thread_in_native, "precondition"); + } +} +#endif diff --git a/src/hotspot/share/gc/serial/serialHeap.hpp b/src/hotspot/share/gc/serial/serialHeap.hpp index ee016173c2a..f5286179abf 100644 --- a/src/hotspot/share/gc/serial/serialHeap.hpp +++ b/src/hotspot/share/gc/serial/serialHeap.hpp @@ -111,6 +111,8 @@ class SerialHeap : public CollectedHeap { void print_tracing_info() const override; void stop() override {}; + static void verify_not_in_native_if_java_thread() NOT_DEBUG_RETURN; + public: // Returns JNI_OK on success jint initialize() override; diff --git a/src/hotspot/share/gc/shared/threadLocalAllocBuffer.cpp b/src/hotspot/share/gc/shared/threadLocalAllocBuffer.cpp index 2181e089663..9635ed4d0cb 100644 --- a/src/hotspot/share/gc/shared/threadLocalAllocBuffer.cpp +++ b/src/hotspot/share/gc/shared/threadLocalAllocBuffer.cpp @@ -37,6 +37,7 @@ #include "utilities/copy.hpp" size_t ThreadLocalAllocBuffer::_max_size = 0; +int ThreadLocalAllocBuffer::_reserve_for_allocation_prefetch = 0; unsigned int ThreadLocalAllocBuffer::_target_refills = 0; ThreadLocalAllocBuffer::ThreadLocalAllocBuffer() : @@ -224,6 +225,30 @@ void ThreadLocalAllocBuffer::startup_initialization() { // abort during VM initialization. _target_refills = MAX2(_target_refills, 2U); +#ifdef COMPILER2 + // If the C2 compiler is present, extra space is needed at the end of + // TLABs, otherwise prefetching instructions generated by the C2 + // compiler will fault (due to accessing memory outside of heap). + // The amount of space is the max of the number of lines to + // prefetch for array and for instance allocations. (Extra space must be + // reserved to accommodate both types of allocations.) + // + // Only SPARC-specific BIS instructions are known to fault. (Those + // instructions are generated if AllocatePrefetchStyle==3 and + // AllocatePrefetchInstr==1). To be on the safe side, however, + // extra space is reserved for all combinations of + // AllocatePrefetchStyle and AllocatePrefetchInstr. + // + // If the C2 compiler is not present, no space is reserved. + + // +1 for rounding up to next cache line, +1 to be safe + if (CompilerConfig::is_c2_or_jvmci_compiler_enabled()) { + int lines = MAX2(AllocatePrefetchLines, AllocateInstancePrefetchLines) + 2; + _reserve_for_allocation_prefetch = (AllocatePrefetchDistance + AllocatePrefetchStepSize * lines) / + (int)HeapWordSize; + } +#endif + // During jvm startup, the main thread is initialized // before the heap is initialized. So reinitialize it now. 
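To make the reservation arithmetic above concrete, here is a self-contained sketch that mirrors the C++ computation; the flag values are plausible samples (in a real VM they are CPU-dependent), not values prescribed by this patch:

    public class TlabReserveDemo {
        public static void main(String[] args) {
            int allocatePrefetchLines = 3;          // sample -XX:AllocatePrefetchLines
            int allocateInstancePrefetchLines = 1;  // sample -XX:AllocateInstancePrefetchLines
            int allocatePrefetchStepSize = 64;      // sample -XX:AllocatePrefetchStepSize
            int allocatePrefetchDistance = 192;     // sample -XX:AllocatePrefetchDistance
            int heapWordSize = 8;                   // 64-bit heap words

            // Mirrors the code above: +1 line for rounding up, +1 to be safe.
            int lines = Math.max(allocatePrefetchLines, allocateInstancePrefetchLines) + 2;
            int reserve = (allocatePrefetchDistance + allocatePrefetchStepSize * lines) / heapWordSize;
            // (192 + 64 * 5) / 8 = 64 heap words kept free at the end of each TLAB.
            System.out.println("reserve_for_allocation_prefetch = " + reserve + " words");
        }
    }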
   guarantee(Thread::current()->is_Java_thread(), "tlab initialization thread not Java thread");
@@ -429,7 +454,8 @@ void ThreadLocalAllocStats::publish() {
 }
 
 size_t ThreadLocalAllocBuffer::end_reserve() {
-  return CollectedHeap::lab_alignment_reserve();
+  size_t reserve_size = CollectedHeap::lab_alignment_reserve();
+  return MAX2(reserve_size, (size_t)_reserve_for_allocation_prefetch);
 }
 
 const HeapWord* ThreadLocalAllocBuffer::start_relaxed() const {
diff --git a/src/hotspot/share/gc/shared/threadLocalAllocBuffer.hpp b/src/hotspot/share/gc/shared/threadLocalAllocBuffer.hpp
index b64fa8d6ad1..59979646395 100644
--- a/src/hotspot/share/gc/shared/threadLocalAllocBuffer.hpp
+++ b/src/hotspot/share/gc/shared/threadLocalAllocBuffer.hpp
@@ -58,6 +58,7 @@ class ThreadLocalAllocBuffer: public CHeapObj<mtThread> {
   size_t _allocated_before_last_gc; // total bytes allocated up until the last gc
 
   static size_t _max_size;                        // maximum size of any TLAB
+  static int _reserve_for_allocation_prefetch;    // Reserve at the end of the TLAB
   static unsigned _target_refills;                // expected number of refills between GCs
 
   unsigned _number_of_refills;
diff --git a/src/hotspot/share/gc/shenandoah/shenandoahBarrierSet.hpp b/src/hotspot/share/gc/shenandoah/shenandoahBarrierSet.hpp
index 2b5bc766a46..7db478a781a 100644
--- a/src/hotspot/share/gc/shenandoah/shenandoahBarrierSet.hpp
+++ b/src/hotspot/share/gc/shenandoah/shenandoahBarrierSet.hpp
@@ -128,8 +128,8 @@ class ShenandoahBarrierSet: public BarrierSet {
   void write_ref_array(HeapWord* start, size_t count);
 
 private:
-  template <class T>
-  inline void arraycopy_marking(T* dst, size_t count);
+  template <bool IS_GENERATIONAL, class T>
+  void arraycopy_marking(T* dst, size_t count);
   template <class T>
   inline void arraycopy_evacuation(T* src, size_t count);
   template <class T>
diff --git a/src/hotspot/share/gc/shenandoah/shenandoahBarrierSet.inline.hpp b/src/hotspot/share/gc/shenandoah/shenandoahBarrierSet.inline.hpp
index adeea8ebf96..199256ca31b 100644
--- a/src/hotspot/share/gc/shenandoah/shenandoahBarrierSet.inline.hpp
+++ b/src/hotspot/share/gc/shenandoah/shenandoahBarrierSet.inline.hpp
@@ -429,7 +429,11 @@ void ShenandoahBarrierSet::arraycopy_barrier(T* src, T* dst, size_t count) {
     // If marking old or young, we must evaluate the SATB barrier. This will be the only
     // action if we are not marking old. If we are marking old, we must still evaluate the
     // load reference barrier for a young collection.
-    arraycopy_marking(dst, count);
+    if (_heap->mode()->is_generational()) {
+      arraycopy_marking<true>(dst, count);
+    } else {
+      arraycopy_marking<false>(dst, count);
+    }
   }
 
   if ((gc_state & ShenandoahHeap::EVACUATION) != 0) {
@@ -441,11 +445,12 @@ void ShenandoahBarrierSet::arraycopy_barrier(T* src, T* dst, size_t count) {
 }
 
-template <class T>
+template <bool IS_GENERATIONAL, class T>
 void ShenandoahBarrierSet::arraycopy_marking(T* dst, size_t count) {
   assert(_heap->is_concurrent_mark_in_progress(), "only during marking");
   if (ShenandoahSATBBarrier) {
-    if (!_heap->marking_context()->allocated_after_mark_start(reinterpret_cast<HeapWord*>(dst))) {
+    if (!_heap->marking_context()->allocated_after_mark_start(reinterpret_cast<HeapWord*>(dst)) ||
+        (IS_GENERATIONAL && _heap->heap_region_containing(dst)->is_old() && _heap->is_concurrent_young_mark_in_progress())) {
       arraycopy_work<T, false, false, true>(dst, count);
     }
   }
diff --git a/src/hotspot/share/opto/macro.cpp b/src/hotspot/share/opto/macro.cpp
index 6f2171bbd75..90602bc2b35 100644
--- a/src/hotspot/share/opto/macro.cpp
+++ b/src/hotspot/share/opto/macro.cpp
@@ -1914,7 +1914,8 @@ Node* PhaseMacroExpand::prefetch_allocation(Node* i_o, Node*& needgc_false,
       transform_later(cache_adr);
       cache_adr = new CastP2XNode(needgc_false, cache_adr);
       transform_later(cache_adr);
-      // Address is aligned to execute prefetch to the beginning of cache line size.
+      // Address is aligned to execute prefetch to the beginning of cache line size
+      // (it is important when BIS instruction is used on SPARC as prefetch).
       Node* mask = _igvn.MakeConX(~(intptr_t)(step_size-1));
       cache_adr = new AndXNode(cache_adr, mask);
       transform_later(cache_adr);
diff --git a/src/hotspot/share/opto/matcher.hpp b/src/hotspot/share/opto/matcher.hpp
index ca13d0166a1..a071cff9e3c 100644
--- a/src/hotspot/share/opto/matcher.hpp
+++ b/src/hotspot/share/opto/matcher.hpp
@@ -329,6 +329,10 @@ class Matcher : public PhaseTransform {
   static bool match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt);
 
+  // Determines if a vector operation needs to be implemented partially, with a mask
+  // ensuring that only the lanes in range [0, vector_length) are processed. This applies
+  // to operations whose vector length is less than the hardware-supported maximum
+  // vector length. Returns true if the operation requires masking, false otherwise.
   static bool vector_needs_partial_operations(Node* node, const TypeVect* vt);
 
   static bool vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen);
diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp
index 57b94205e5e..271dc901dcb 100644
--- a/src/hotspot/share/opto/vectornode.cpp
+++ b/src/hotspot/share/opto/vectornode.cpp
@@ -936,28 +936,26 @@ bool VectorNode::is_scalar_op_that_returns_int_but_vector_op_returns_long(int op
   }
 }
 
+// Idealize vector operations whose vector size is less than the hardware-supported
+// max vector size. Generate a vector mask for the operation. Lanes with indices
+// within the vector length are set to true, while the remaining lanes are set to
+// false. Returns the corresponding masked vector node.
+static Node* ideal_partial_operations(PhaseGVN* phase, Node* node, const TypeVect* vt) { + if (!Matcher::vector_needs_partial_operations(node, vt)) { + return nullptr; + } -Node* VectorNode::try_to_gen_masked_vector(PhaseGVN* gvn, Node* node, const TypeVect* vt) { int vopc = node->Opcode(); uint vlen = vt->length(); BasicType bt = vt->element_basic_type(); + assert(Matcher::match_rule_supported_vector_masked(vopc, vlen, bt), + "The masked feature is required for the vector operation"); + assert(Matcher::match_rule_supported_vector(Op_VectorMaskGen, vlen, bt), + "'VectorMaskGen' is required to generate a vector mask"); - // Predicated vectors do not need to add another mask input - if (node->is_predicated_vector() || !Matcher::has_predicated_vectors() || - !Matcher::match_rule_supported_vector_masked(vopc, vlen, bt) || - !Matcher::match_rule_supported_vector(Op_VectorMaskGen, vlen, bt)) { - return nullptr; - } - - Node* mask = nullptr; - // Generate a vector mask for vector operation whose vector length is lower than the - // hardware supported max vector length. - if (vt->length_in_bytes() < (uint)MaxVectorSize) { - Node* length = gvn->transform(new ConvI2LNode(gvn->makecon(TypeInt::make(vlen)))); - mask = gvn->transform(VectorMaskGenNode::make(length, bt, vlen)); - } else { - return nullptr; - } + // Generate a vector mask, with lanes inside of the vector length set to true. + Node* length = phase->transform(new ConvI2LNode(phase->makecon(TypeInt::make(vlen)))); + Node* mask = phase->transform(VectorMaskGenNode::make(length, bt, vlen)); // Generate the related masked op for vector load/store/load_gather/store_scatter. // Or append the mask to the vector op's input list by default. @@ -1037,8 +1035,9 @@ bool VectorNode::should_swap_inputs_to_help_global_value_numbering() { } Node* VectorNode::Ideal(PhaseGVN* phase, bool can_reshape) { - if (Matcher::vector_needs_partial_operations(this, vect_type())) { - return try_to_gen_masked_vector(phase, this, vect_type()); + Node* n = ideal_partial_operations(phase, this, vect_type()); + if (n != nullptr) { + return n; } // Sort inputs of commutative non-predicated vector operations to help value numbering. 
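The mask built by ideal_partial_operations has the same lane shape that the Vector API exposes at the Java level: lanes below the operation's vector length are active, hardware lanes beyond it are inactive. A small sketch of that shape using the public API (an analogy only, not the compiler's internal representation; run with --add-modules jdk.incubator.vector):

    import jdk.incubator.vector.IntVector;
    import jdk.incubator.vector.VectorMask;
    import jdk.incubator.vector.VectorSpecies;

    public class PartialMaskDemo {
        static final VectorSpecies<Integer> SPECIES = IntVector.SPECIES_128;

        public static void main(String[] args) {
            // Lanes [0, 3) set, the rest clear -- the same shape as the mask
            // VectorMaskGen produces for a partial operation of length 3.
            VectorMask<Integer> m = SPECIES.indexInRange(0, 3);
            System.out.println(m);
        }
    }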
@@ -1119,9 +1118,9 @@ LoadVectorNode* LoadVectorNode::make(int opc, Node* ctl, Node* mem, } Node* LoadVectorNode::Ideal(PhaseGVN* phase, bool can_reshape) { - const TypeVect* vt = vect_type(); - if (Matcher::vector_needs_partial_operations(this, vt)) { - return VectorNode::try_to_gen_masked_vector(phase, this, vt); + Node* n = ideal_partial_operations(phase, this, vect_type()); + if (n != nullptr) { + return n; } return LoadNode::Ideal(phase, can_reshape); } @@ -1133,9 +1132,9 @@ StoreVectorNode* StoreVectorNode::make(int opc, Node* ctl, Node* mem, Node* adr, } Node* StoreVectorNode::Ideal(PhaseGVN* phase, bool can_reshape) { - const TypeVect* vt = vect_type(); - if (Matcher::vector_needs_partial_operations(this, vt)) { - return VectorNode::try_to_gen_masked_vector(phase, this, vt); + Node* n = ideal_partial_operations(phase, this, vect_type()); + if (n != nullptr) { + return n; } return StoreNode::Ideal(phase, can_reshape); } @@ -1411,11 +1410,11 @@ ReductionNode* ReductionNode::make(int opc, Node* ctrl, Node* n1, Node* n2, Basi } Node* ReductionNode::Ideal(PhaseGVN* phase, bool can_reshape) { - const TypeVect* vt = vect_type(); - if (Matcher::vector_needs_partial_operations(this, vt)) { - return VectorNode::try_to_gen_masked_vector(phase, this, vt); + Node* n = ideal_partial_operations(phase, this, vect_type()); + if (n != nullptr) { + return n; } - return nullptr; + return Node::Ideal(phase, can_reshape); } // Convert fromLong to maskAll if the input sets or unsets all lanes. @@ -1893,11 +1892,11 @@ Node* VectorMaskOpNode::make(Node* mask, const Type* ty, int mopc) { } Node* VectorMaskOpNode::Ideal(PhaseGVN* phase, bool can_reshape) { - const TypeVect* vt = vect_type(); - if (Matcher::vector_needs_partial_operations(this, vt)) { - return VectorNode::try_to_gen_masked_vector(phase, this, vt); + Node* n = ideal_partial_operations(phase, this, vect_type()); + if (n != nullptr) { + return n; } - return nullptr; + return TypeNode::Ideal(phase, can_reshape); } Node* VectorMaskCastNode::Identity(PhaseGVN* phase) { diff --git a/src/hotspot/share/opto/vectornode.hpp b/src/hotspot/share/opto/vectornode.hpp index 427aeff53fc..dc7aa13cf36 100644 --- a/src/hotspot/share/opto/vectornode.hpp +++ b/src/hotspot/share/opto/vectornode.hpp @@ -117,7 +117,6 @@ class VectorNode : public TypeNode { static bool is_vector_bitwise_not_pattern(Node* n); static Node* degenerate_vector_rotate(Node* n1, Node* n2, bool is_rotate_left, int vlen, BasicType bt, PhaseGVN* phase); - static Node* try_to_gen_masked_vector(PhaseGVN* gvn, Node* node, const TypeVect* vt); // [Start, end) half-open range defining which operands are vectors static void vector_operands(Node* n, uint* start, uint* end); diff --git a/src/hotspot/share/runtime/vmStructs.cpp b/src/hotspot/share/runtime/vmStructs.cpp index 25a99c2d758..85a2ef8b507 100644 --- a/src/hotspot/share/runtime/vmStructs.cpp +++ b/src/hotspot/share/runtime/vmStructs.cpp @@ -353,6 +353,7 @@ nonstatic_field(ThreadLocalAllocBuffer, _pf_top, HeapWord*) \ nonstatic_field(ThreadLocalAllocBuffer, _desired_size, size_t) \ nonstatic_field(ThreadLocalAllocBuffer, _refill_waste_limit, size_t) \ + static_field(ThreadLocalAllocBuffer, _reserve_for_allocation_prefetch, int) \ static_field(ThreadLocalAllocBuffer, _target_refills, unsigned) \ nonstatic_field(ThreadLocalAllocBuffer, _number_of_refills, unsigned) \ nonstatic_field(ThreadLocalAllocBuffer, _refill_waste, unsigned) \ diff --git a/src/jdk.hotspot.agent/share/classes/sun/jvm/hotspot/runtime/ThreadLocalAllocBuffer.java 
b/src/jdk.hotspot.agent/share/classes/sun/jvm/hotspot/runtime/ThreadLocalAllocBuffer.java index e23e63806bd..1dc67330d3d 100644 --- a/src/jdk.hotspot.agent/share/classes/sun/jvm/hotspot/runtime/ThreadLocalAllocBuffer.java +++ b/src/jdk.hotspot.agent/share/classes/sun/jvm/hotspot/runtime/ThreadLocalAllocBuffer.java @@ -76,9 +76,10 @@ private long alignmentReserve() { private long endReserve() { long labAlignmentReserve = VM.getVM().getLabAlignmentReserve(); + long reserveForAllocationPrefetch = VM.getVM().getReserveForAllocationPrefetch(); long heapWordSize = VM.getVM().getHeapWordSize(); - return labAlignmentReserve * heapWordSize; + return Math.max(labAlignmentReserve, reserveForAllocationPrefetch) * heapWordSize; } /** Support for iteration over heap -- not sure how this will diff --git a/src/jdk.hotspot.agent/share/classes/sun/jvm/hotspot/runtime/VM.java b/src/jdk.hotspot.agent/share/classes/sun/jvm/hotspot/runtime/VM.java index 1607563150a..dc27a4fc59e 100644 --- a/src/jdk.hotspot.agent/share/classes/sun/jvm/hotspot/runtime/VM.java +++ b/src/jdk.hotspot.agent/share/classes/sun/jvm/hotspot/runtime/VM.java @@ -123,6 +123,7 @@ public class VM { private int invocationEntryBCI; private ReversePtrs revPtrs; private VMRegImpl vmregImpl; + private int reserveForAllocationPrefetch; private int labAlignmentReserve; // System.getProperties from debuggee VM @@ -446,6 +447,8 @@ private VM(TypeDataBase db, JVMDebugger debugger, boolean isBigEndian) { boolType = (CIntegerType) db.lookupType("bool"); Type threadLocalAllocBuffer = db.lookupType("ThreadLocalAllocBuffer"); + CIntegerField reserveForAllocationPrefetchField = threadLocalAllocBuffer.getCIntegerField("_reserve_for_allocation_prefetch"); + reserveForAllocationPrefetch = (int)reserveForAllocationPrefetchField.getCInteger(intType); Type collectedHeap = db.lookupType("CollectedHeap"); CIntegerField labAlignmentReserveField = collectedHeap.getCIntegerField("_lab_alignment_reserve"); @@ -912,6 +915,10 @@ public String getVMInternalInfo() { return vmInternalInfo; } + public int getReserveForAllocationPrefetch() { + return reserveForAllocationPrefetch; + } + public int getLabAlignmentReserve() { return labAlignmentReserve; } diff --git a/test/hotspot/jtreg/ProblemList.txt b/test/hotspot/jtreg/ProblemList.txt index 6c3d907961d..ddc6e55dc05 100644 --- a/test/hotspot/jtreg/ProblemList.txt +++ b/test/hotspot/jtreg/ProblemList.txt @@ -187,9 +187,3 @@ vmTestbase/nsk/jdwp/ThreadReference/ForceEarlyReturn/forceEarlyReturn001/forceEa vmTestbase/nsk/monitoring/ThreadMXBean/ThreadInfo/Multi/Multi005/TestDescription.java 8076494 windows-x64 vmTestbase/nsk/monitoring/ThreadMXBean/findMonitorDeadlockedThreads/find006/TestDescription.java 8310144 macosx-aarch64 - -vmTestbase/nsk/monitoring/MemoryPoolMBean/isUsageThresholdExceeded/isexceeded001/TestDescription.java 8373022 generic-all -vmTestbase/nsk/monitoring/MemoryPoolMBean/isUsageThresholdExceeded/isexceeded002/TestDescription.java 8373022 generic-all -vmTestbase/nsk/monitoring/MemoryPoolMBean/isUsageThresholdExceeded/isexceeded003/TestDescription.java 8373022 generic-all -vmTestbase/nsk/monitoring/MemoryPoolMBean/isUsageThresholdExceeded/isexceeded004/TestDescription.java 8373022 generic-all -vmTestbase/nsk/monitoring/MemoryPoolMBean/isUsageThresholdExceeded/isexceeded005/TestDescription.java 8373022 generic-all diff --git a/test/hotspot/jtreg/compiler/c2/irTests/TestFloat16ScalarOperations.java b/test/hotspot/jtreg/compiler/c2/irTests/TestFloat16ScalarOperations.java index c8ee5e730fa..445fef5e55a 100644 --- 
a/test/hotspot/jtreg/compiler/c2/irTests/TestFloat16ScalarOperations.java
+++ b/test/hotspot/jtreg/compiler/c2/irTests/TestFloat16ScalarOperations.java
@@ -68,6 +68,15 @@ public class TestFloat16ScalarOperations {
     private static final Float16 RANDOM4 = Float16.valueOf(genF.next());
     private static final Float16 RANDOM5 = Float16.valueOf(genF.next());
 
+    // We have to ensure that the constants are not special values that lead the operations to
+    // constant fold. For example "x + 0" could constant fold to "x", so we need to ensure
+    // that the add constant is not zero.
+    private static Generator<Float> genSmallRangeF = G.uniformFloats(0.1f, 0.9f);
+    private static final Float16 RANDOM_CON_ADD = Float16.valueOf(genSmallRangeF.next());
+    private static final Float16 RANDOM_CON_SUB = Float16.valueOf(genSmallRangeF.next());
+    private static final Float16 RANDOM_CON_MUL = Float16.valueOf(genSmallRangeF.next());
+    private static final Float16 RANDOM_CON_DIV = Float16.valueOf(genSmallRangeF.next());
+
     private static Float16 RANDOM1_VAR = RANDOM1;
     private static Float16 RANDOM2_VAR = RANDOM2;
     private static Float16 RANDOM3_VAR = RANDOM3;
@@ -435,10 +444,10 @@ public void checkExactFP16ConstantPatterns(short actual) {
     @Warmup(10000)
     public short testRandomFP16ConstantPatternSet1() {
         short res = 0;
-        res += Float.floatToFloat16(RANDOM1_VAR.floatValue() + RANDOM2.floatValue());
-        res += Float.floatToFloat16(RANDOM2_VAR.floatValue() - RANDOM3.floatValue());
-        res += Float.floatToFloat16(RANDOM3_VAR.floatValue() * RANDOM4.floatValue());
-        res += Float.floatToFloat16(RANDOM4_VAR.floatValue() / RANDOM5.floatValue());
+        res += Float.floatToFloat16(RANDOM1_VAR.floatValue() + RANDOM_CON_ADD.floatValue());
+        res += Float.floatToFloat16(RANDOM2_VAR.floatValue() - RANDOM_CON_SUB.floatValue());
+        res += Float.floatToFloat16(RANDOM3_VAR.floatValue() * RANDOM_CON_MUL.floatValue());
+        res += Float.floatToFloat16(RANDOM4_VAR.floatValue() / RANDOM_CON_DIV.floatValue());
         return res;
     }
 
@@ -456,10 +465,10 @@ public void checkRandomFP16ConstantPatternSet1(short actual) {
     @Warmup(10000)
     public short testRandomFP16ConstantPatternSet2() {
         short res = 0;
-        res += Float.floatToFloat16(RANDOM2.floatValue() + RANDOM1_VAR.floatValue());
-        res += Float.floatToFloat16(RANDOM3.floatValue() - RANDOM2_VAR.floatValue());
-        res += Float.floatToFloat16(RANDOM4.floatValue() * RANDOM3_VAR.floatValue());
-        res += Float.floatToFloat16(RANDOM5.floatValue() / RANDOM4_VAR.floatValue());
+        res += Float.floatToFloat16(RANDOM_CON_ADD.floatValue() + RANDOM1_VAR.floatValue());
+        res += Float.floatToFloat16(RANDOM_CON_SUB.floatValue() - RANDOM2_VAR.floatValue());
+        res += Float.floatToFloat16(RANDOM_CON_MUL.floatValue() * RANDOM3_VAR.floatValue());
+        res += Float.floatToFloat16(RANDOM_CON_DIV.floatValue() / RANDOM4_VAR.floatValue());
         return res;
     }
 
diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
index 85595b9b632..a9d7426b2e8 100644
--- a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
+++ b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
@@ -1448,6 +1448,16 @@ public class IRNode {
         beforeMatchingNameRegex(VECTOR_MASK_LANE_IS_SET, "ExtractUB");
     }
 
+    public static final String VECTOR_MASK_GEN = PREFIX + "VECTOR_MASK_GEN" + POSTFIX;
+    static {
+        beforeMatchingNameRegex(VECTOR_MASK_GEN, "VectorMaskGen");
+    }
+
+    public static final String VECTOR_MASK_FIRST_TRUE = PREFIX + "VECTOR_MASK_FIRST_TRUE" + POSTFIX;
+    static {
+        beforeMatchingNameRegex(VECTOR_MASK_FIRST_TRUE, "VectorMaskFirstTrue");
+    }
+
     // Can only be used if avx512_vnni is available.
     public static final String MUL_ADD_VS2VI_VNNI = PREFIX + "MUL_ADD_VS2VI_VNNI" + POSTFIX;
     static {
diff --git a/test/hotspot/jtreg/compiler/vectorapi/TestVectorLoadStoreOptimization.java b/test/hotspot/jtreg/compiler/vectorapi/TestVectorLoadStoreOptimization.java
new file mode 100644
index 00000000000..c603f450d0c
--- /dev/null
+++ b/test/hotspot/jtreg/compiler/vectorapi/TestVectorLoadStoreOptimization.java
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package compiler.vectorapi;
+
+import compiler.lib.generators.*;
+import compiler.lib.ir_framework.*;
+import jdk.incubator.vector.*;
+import jdk.test.lib.Asserts;
+
+/**
+ * @test 8371603
+ * @key randomness
+ * @library /test/lib /
+ * @summary Test the missing optimization issues for vector load/store caused by JDK-8286941
+ * @modules jdk.incubator.vector
+ *
+ * @run driver ${test.main.class}
+ */
+public class TestVectorLoadStoreOptimization {
+    private static final int LENGTH = 1024;
+    private static final Generators random = Generators.G;
+
+    private static final VectorSpecies<Integer> SPECIES = IntVector.SPECIES_PREFERRED;
+
+    private static int[] a;
+
+    static {
+        a = new int[LENGTH];
+        random.fill(random.ints(), a);
+    }
+
+    // Test that "LoadVectorNode::Ideal()" calls "LoadNode::Ideal()" as expected,
+    // which sees the previous stores that go to the same position independently,
+    // and optimizes out the load with matched store values.
+    @Test
+    @IR(counts = { IRNode.LOAD_VECTOR_I, "1" },
+        applyIfCPUFeatureOr = {"asimd", "true", "avx", "true", "rvv", "true"})
+    public static void testLoadVector() {
+        IntVector v1 = IntVector.fromArray(SPECIES, a, 0);
+        v1.intoArray(a, SPECIES.length());
+        v1.intoArray(a, 2 * SPECIES.length());
+        // The second load vector equals the first one and should be optimized
+        // out by "LoadNode::Ideal()".
+        IntVector v2 = IntVector.fromArray(SPECIES, a, SPECIES.length());
+        v2.intoArray(a, 3 * SPECIES.length());
+    }
+
+    @Check(test = "testLoadVector")
+    public static void testLoadVectorVerify() {
+        for (int i = SPECIES.length(); i < 4 * SPECIES.length(); i += SPECIES.length()) {
+            for (int j = 0; j < SPECIES.length(); j++) {
+                Asserts.assertEquals(a[i + j], a[j]);
+            }
+        }
+    }
+
+    // Test that "StoreVectorNode::Ideal()" calls "StoreNode::Ideal()" as expected,
+    // which can get rid of previous stores that go to the same position.
+ @Test + @IR(counts = { IRNode.STORE_VECTOR, "1" }, + applyIfCPUFeatureOr = {"asimd", "true", "avx", "true", "rvv", "true"}) + public static void testStoreVector() { + IntVector v1 = IntVector.fromArray(SPECIES, a, 0 * SPECIES.length()); + IntVector v2 = IntVector.fromArray(SPECIES, a, 1 * SPECIES.length()); + // Useless store to same position as below, which should be optimized out by + // "StoreNode::Ideal()". + v1.intoArray(a, 3 * SPECIES.length()); + v2.intoArray(a, 3 * SPECIES.length()); + } + + @Check(test = "testStoreVector") + public static void testStoreVectorVerify() { + for (int i = 3 * SPECIES.length(); i < 4 * SPECIES.length(); i++) { + Asserts.assertEquals(a[i], a[i - 2 * SPECIES.length()]); + } + } + + public static void main(String[] args) { + TestFramework testFramework = new TestFramework(); + testFramework.setDefaultWarmup(10000) + .addFlags("--add-modules=jdk.incubator.vector") + .start(); + } +} \ No newline at end of file diff --git a/test/hotspot/jtreg/compiler/vectorapi/TestVectorOperationsWithPartialSize.java b/test/hotspot/jtreg/compiler/vectorapi/TestVectorOperationsWithPartialSize.java new file mode 100644 index 00000000000..6fd20b7e2fb --- /dev/null +++ b/test/hotspot/jtreg/compiler/vectorapi/TestVectorOperationsWithPartialSize.java @@ -0,0 +1,432 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */
+
+package compiler.vectorapi;
+
+import compiler.lib.generators.*;
+import compiler.lib.ir_framework.*;
+import jdk.incubator.vector.*;
+import jdk.test.lib.Asserts;
+
+/**
+ * @test 8371603
+ * @key randomness
+ * @library /test/lib /
+ * @summary Test vector operations with vector size less than MaxVectorSize
+ * @modules jdk.incubator.vector
+ *
+ * @run driver ${test.main.class}
+ */
+
+public class TestVectorOperationsWithPartialSize {
+    private static final int SIZE = 1024;
+    private static final Generators random = Generators.G;
+
+    private static final VectorSpecies<Integer> ISPEC_128 = IntVector.SPECIES_128;
+    private static final VectorSpecies<Long> LSPEC_128 = LongVector.SPECIES_128;
+    private static final VectorSpecies<Float> FSPEC_128 = FloatVector.SPECIES_128;
+    private static final VectorSpecies<Double> DSPEC_128 = DoubleVector.SPECIES_128;
+    private static final VectorSpecies<Integer> ISPEC_256 = IntVector.SPECIES_256;
+    private static final VectorSpecies<Long> LSPEC_256 = LongVector.SPECIES_256;
+
+    private static int[] ia;
+    private static int[] ib;
+    private static long[] la;
+    private static long[] lb;
+    private static float[] fa;
+    private static float[] fb;
+    private static double[] da;
+    private static double[] db;
+    private static boolean[] m;
+    private static boolean[] mr;
+    private static int[] indices;
+
+    static {
+        ia = new int[SIZE];
+        ib = new int[SIZE];
+        la = new long[SIZE];
+        lb = new long[SIZE];
+        fa = new float[SIZE];
+        fb = new float[SIZE];
+        da = new double[SIZE];
+        db = new double[SIZE];
+        m = new boolean[SIZE];
+        mr = new boolean[SIZE];
+        indices = new int[SIZE];
+
+        random.fill(random.ints(), ia);
+        random.fill(random.longs(), la);
+        random.fill(random.floats(), fa);
+        random.fill(random.doubles(), da);
+        random.fill(random.uniformInts(0, ISPEC_128.length()), indices);
+        for (int i = 0; i < SIZE; i++) {
+            m[i] = i % 2 == 0;
+        }
+    }
+
+    // ================ Load/Store/Gather/Scatter Tests ==================
+
+    private static void verifyLoadStore(int[] expected, int[] actual, int vlen) {
+        for (int i = 0; i < vlen; i++) {
+            Asserts.assertEquals(expected[i], actual[i]);
+        }
+    }
+
+    private static void verifyLoadGatherStoreScatter(int[] expected, int[] actual, int[] indices, int vlen) {
+        for (int i = 0; i < vlen; i++) {
+            Asserts.assertEquals(expected[indices[i]], actual[indices[i]]);
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.VECTOR_MASK_GEN, "0",
+                  IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "1",
+                  IRNode.STORE_VECTOR, "1"},
+        applyIfCPUFeature = {"sve", "true"}, applyIf = {"MaxVectorSize", ">=32"})
+    public void testLoadStore_128() {
+        IntVector v = IntVector.fromArray(ISPEC_128, ia, 0);
+        v.intoArray(ib, 0);
+        verifyLoadStore(ia, ib, ISPEC_128.length());
+    }
+
+    @Test
+    @IR(counts = {IRNode.VECTOR_MASK_GEN, "1",
+                  IRNode.LOAD_VECTOR_MASKED, "1",
+                  IRNode.STORE_VECTOR_MASKED, "1"},
+        applyIfCPUFeature = {"sve", "true"}, applyIf = {"MaxVectorSize", ">=64"})
+    public void testLoadStore_256() {
+        IntVector v = IntVector.fromArray(ISPEC_256, ia, 0);
+        v.intoArray(ib, 0);
+        verifyLoadStore(ia, ib, ISPEC_256.length());
+    }
+
+    @Test
+    @IR(counts = {IRNode.VECTOR_MASK_GEN, "1",
+                  IRNode.LOAD_VECTOR_GATHER_MASKED, "1",
+                  IRNode.STORE_VECTOR_SCATTER_MASKED, "1"},
+        applyIfCPUFeature = {"sve", "true"}, applyIf = {"MaxVectorSize", ">=32"})
+    public void testLoadGatherStoreScatter_128() {
+        IntVector v = IntVector.fromArray(ISPEC_128, ia, 0, indices, 0);
+        v.intoArray(ib, 0, indices, 0);
+        verifyLoadGatherStoreScatter(ia, ib, indices, ISPEC_128.length());
+    }
+
+    @Test
+    @IR(counts = {IRNode.VECTOR_MASK_GEN, "1",
+                  IRNode.LOAD_VECTOR_GATHER_MASKED, "1",
+                  IRNode.STORE_VECTOR_SCATTER_MASKED, "1"},
+        applyIfCPUFeature = {"sve", "true"}, applyIf = {"MaxVectorSize", ">=64"})
+    public void testLoadGatherStoreScatter_256() {
+        IntVector v = IntVector.fromArray(ISPEC_256, ia, 0, indices, 0);
+        v.intoArray(ib, 0, indices, 0);
+        verifyLoadGatherStoreScatter(ia, ib, indices, ISPEC_256.length());
+    }
+
+    // ===================== Reduction Tests - Add =====================
+
+    interface binOpInt {
+        int apply(int a, int b);
+    }
+
+    interface binOpLong {
+        long apply(long a, long b);
+    }
+
+    private static int reduceLanes(int init, int[] arr, int vlen, binOpInt f) {
+        int result = init;
+        for (int i = 0; i < vlen; i++) {
+            result = f.apply(arr[i], result);
+        }
+        return result;
+    }
+
+    private static long reduceLanes(long init, long[] arr, int vlen, binOpLong f) {
+        long result = init;
+        for (int i = 0; i < vlen; i++) {
+            result = f.apply(arr[i], result);
+        }
+        return result;
+    }
+
+    // Reduction add operations with integer types are implemented with NEON SIMD instructions
+    // when the vector size is less than or equal to 128-bit.
+    @Test
+    @IR(counts = {IRNode.VECTOR_MASK_GEN, "0",
+                  IRNode.ADD_REDUCTION_VI, "1"},
+        applyIfCPUFeature = {"sve", "true"}, applyIf = {"MaxVectorSize", ">=32"})
+    public int testAddReductionInt_128() {
+        IntVector v = IntVector.fromArray(ISPEC_128, ia, 0);
+        int result = v.reduceLanes(VectorOperators.ADD);
+        Asserts.assertEquals(reduceLanes(0, ia, ISPEC_128.length(), (a, b) -> (a + b)), result);
+        return result;
+    }
+
+    @Test
+    @IR(counts = {IRNode.VECTOR_MASK_GEN, "1",
+                  IRNode.ADD_REDUCTION_VI, "1"},
+        applyIfCPUFeature = {"sve", "true"}, applyIf = {"MaxVectorSize", ">=64"})
+    public int testAddReductionInt_256() {
+        IntVector v = IntVector.fromArray(ISPEC_256, ia, 0);
+        int result = v.reduceLanes(VectorOperators.ADD);
+        Asserts.assertEquals(reduceLanes(0, ia, ISPEC_256.length(), (a, b) -> (a + b)), result);
+        return result;
+    }
+
+    // Reduction add operations with long types are implemented with NEON SIMD instructions
+    // when the vector size is less than or equal to 128-bit.
+    @Test
+    @IR(counts = {IRNode.VECTOR_MASK_GEN, "0",
+                  IRNode.ADD_REDUCTION_VL, "1"},
+        applyIfCPUFeature = {"sve", "true"}, applyIf = {"MaxVectorSize", ">=32"})
+    public long testAddReductionLong_128() {
+        LongVector v = LongVector.fromArray(LSPEC_128, la, 0);
+        long result = v.reduceLanes(VectorOperators.ADD);
+        Asserts.assertEquals(reduceLanes(0L, la, LSPEC_128.length(), (a, b) -> (a + b)), result);
+        return result;
+    }
+
+    @Test
+    @IR(counts = {IRNode.VECTOR_MASK_GEN, "1",
+                  IRNode.ADD_REDUCTION_VL, "1"},
+        applyIfCPUFeature = {"sve", "true"}, applyIf = {"MaxVectorSize", ">=64"})
+    public long testAddReductionLong_256() {
+        LongVector v = LongVector.fromArray(LSPEC_256, la, 0);
+        long result = v.reduceLanes(VectorOperators.ADD);
+        Asserts.assertEquals(reduceLanes(0L, la, LSPEC_256.length(), (a, b) -> (a + b)), result);
+        return result;
+    }
+
+    private static void verifyAddReductionFloat(float actual, float[] arr, int vlen) {
+        float expected = 0.0f;
+        for (int i = 0; i < vlen; i++) {
+            expected += arr[i];
+        }
+        // Floating point addition reduction ops may introduce rounding errors.
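The ulp-based tolerance used here is needed because a vector reduction combines lanes in a different order than the sequential loop, and floating-point addition is not associative. A stand-alone sketch of the effect, with values picked to make the difference visible, before the verification code continues below:

    public class FpOrderDemo {
        public static void main(String[] args) {
            float[] v = {1e8f, 1.0f, -1e8f, 1.0f};

            // Sequential order: ((0 + 1e8) + 1) absorbs the 1, then -1e8, then +1.
            float seq = 0.0f;
            for (float x : v) {
                seq += x;
            }

            // Pairwise order, like a tree-shaped lane reduction:
            float pair = (v[0] + v[2]) + (v[1] + v[3]);

            System.out.println(seq + " vs " + pair);  // prints 1.0 vs 2.0
        }
    }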
+ float ROUNDING_ERROR_FACTOR_ADD = 10.0f; + float tolerance = Math.ulp(expected) * ROUNDING_ERROR_FACTOR_ADD; + if (Math.abs(expected - actual) > tolerance) { + throw new RuntimeException( + "assertEqualsWithTolerance" + + ": expected " + expected + " but was " + actual + + " (tolerance: " + tolerance + ", diff: " + Math.abs(expected - actual) + ")" + ); + } + } + + private static void verifyAddReductionDouble(double actual, double[] arr, int vlen) { + double expected = 0.0; + for (int i = 0; i < vlen; i++) { + expected += arr[i]; + } + // Floating point addition reduction ops may introduce rounding errors. + double ROUNDING_ERROR_FACTOR_ADD = 10.0; + double tolerance = Math.ulp(expected) * ROUNDING_ERROR_FACTOR_ADD; + if (Math.abs(expected - actual) > tolerance) { + throw new RuntimeException( + "assertEqualsWithTolerance" + + ": expected " + expected + " but was " + actual + + " (tolerance: " + tolerance + ", diff: " + Math.abs(expected - actual) + ")" + ); + } + } + + @Test + @IR(counts = {IRNode.VECTOR_MASK_GEN, "1", + IRNode.ADD_REDUCTION_VF, "1"}, + applyIfCPUFeature = {"sve", "true"}, applyIf = {"MaxVectorSize", ">=32"}) + public float testAddReductionFloat() { + FloatVector v = FloatVector.fromArray(FSPEC_128, fa, 0); + float result = v.reduceLanes(VectorOperators.ADD); + verifyAddReductionFloat(result, fa, FSPEC_128.length()); + return result; + } + + + @Test + @IR(counts = {IRNode.VECTOR_MASK_GEN, "1", + IRNode.ADD_REDUCTION_VD, "1"}, + applyIfCPUFeature = {"sve", "true"}, applyIf = {"MaxVectorSize", ">=32"}) + public double testAddReductionDouble() { + DoubleVector v = DoubleVector.fromArray(DSPEC_128, da, 0); + double result = v.reduceLanes(VectorOperators.ADD); + verifyAddReductionDouble(result, da, DSPEC_128.length()); + return result; + } + + // ============== Reduction Tests - Logical ============== + + @Test + @IR(counts = {IRNode.VECTOR_MASK_GEN, "1", + IRNode.AND_REDUCTION_V, "1"}, + applyIfCPUFeature = {"sve", "true"}, applyIf = {"MaxVectorSize", ">=32"}) + public int testAndReduction() { + IntVector v = IntVector.fromArray(ISPEC_128, ia, 0); + int result = v.reduceLanes(VectorOperators.AND); + Asserts.assertEquals(reduceLanes(-1, ia, ISPEC_128.length(), (a, b) -> (a & b)), result); + return result; + } + + @Test + @IR(counts = {IRNode.VECTOR_MASK_GEN, "1", + IRNode.OR_REDUCTION_V, "1"}, + applyIfCPUFeature = {"sve", "true"}, applyIf = {"MaxVectorSize", ">=32"}) + public int testOrReduction() { + IntVector v = IntVector.fromArray(ISPEC_128, ia, 0); + int result = v.reduceLanes(VectorOperators.OR); + Asserts.assertEquals(reduceLanes(0, ia, ISPEC_128.length(), (a, b) -> (a | b)), result); + return result; + } + + @Test + @IR(counts = {IRNode.VECTOR_MASK_GEN, "1", + IRNode.XOR_REDUCTION_V, "1"}, + applyIfCPUFeature = {"sve", "true"}, applyIf = {"MaxVectorSize", ">=32"}) + public int testXorReduction() { + IntVector v = IntVector.fromArray(ISPEC_128, ia, 0); + int result = v.reduceLanes(VectorOperators.XOR); + Asserts.assertEquals(reduceLanes(0, ia, ISPEC_128.length(), (a, b) -> (a ^ b)), result); + return result; + } + + // ===================== Reduction Tests - Min/Max ===================== + + // Reduction min operations with non-long types are implemented with NEON SIMD instructions + // when the vector size is less than or equal to 128-bit. 
+    @Test
+    @IR(counts = {IRNode.VECTOR_MASK_GEN, "0",
+                  IRNode.MIN_REDUCTION_V, "1"},
+        applyIfCPUFeature = {"sve", "true"}, applyIf = {"MaxVectorSize", ">= 32"})
+    public int testMinReductionInt_128() {
+        IntVector v = IntVector.fromArray(ISPEC_128, ia, 0);
+        int result = v.reduceLanes(VectorOperators.MIN);
+        Asserts.assertEquals(reduceLanes(Integer.MAX_VALUE, ia, ISPEC_128.length(), (a, b) -> Math.min(a, b)), result);
+        return result;
+    }
+
+    @Test
+    @IR(counts = {IRNode.VECTOR_MASK_GEN, "1",
+                  IRNode.MIN_REDUCTION_V, "1"},
+        applyIfCPUFeature = {"sve", "true"}, applyIf = {"MaxVectorSize", ">= 64"})
+    public int testMinReductionInt_256() {
+        IntVector v = IntVector.fromArray(ISPEC_256, ia, 0);
+        int result = v.reduceLanes(VectorOperators.MIN);
+        Asserts.assertEquals(reduceLanes(Integer.MAX_VALUE, ia, ISPEC_256.length(), (a, b) -> Math.min(a, b)), result);
+        return result;
+    }
+
+    // Reduction max operations with non-long types are implemented with NEON SIMD instructions
+    // when the vector size is less than or equal to 128-bit.
+    @Test
+    @IR(counts = {IRNode.VECTOR_MASK_GEN, "0",
+                  IRNode.MAX_REDUCTION_V, "1"},
+        applyIfCPUFeature = {"sve", "true"}, applyIf = {"MaxVectorSize", ">= 32"})
+    public int testMaxReductionInt_128() {
+        IntVector v = IntVector.fromArray(ISPEC_128, ia, 0);
+        int result = v.reduceLanes(VectorOperators.MAX);
+        Asserts.assertEquals(reduceLanes(Integer.MIN_VALUE, ia, ISPEC_128.length(), (a, b) -> Math.max(a, b)), result);
+        return result;
+    }
+
+    @Test
+    @IR(counts = {IRNode.VECTOR_MASK_GEN, "1",
+                  IRNode.MAX_REDUCTION_V, "1"},
+        applyIfCPUFeature = {"sve", "true"}, applyIf = {"MaxVectorSize", ">= 64"})
+    public int testMaxReductionInt_256() {
+        IntVector v = IntVector.fromArray(ISPEC_256, ia, 0);
+        int result = v.reduceLanes(VectorOperators.MAX);
+        Asserts.assertEquals(reduceLanes(Integer.MIN_VALUE, ia, ISPEC_256.length(), (a, b) -> Math.max(a, b)), result);
+        return result;
+    }
+
+    @Test
+    @IR(counts = {IRNode.VECTOR_MASK_GEN, "1",
+                  IRNode.MIN_REDUCTION_V, "1"},
+        applyIfCPUFeature = {"sve", "true"}, applyIf = {"MaxVectorSize", ">= 32"})
+    public static long testMinReductionLong() {
+        LongVector v = LongVector.fromArray(LSPEC_128, la, 0);
+        long result = v.reduceLanes(VectorOperators.MIN);
+        Asserts.assertEquals(reduceLanes(Long.MAX_VALUE, la, LSPEC_128.length(), (a, b) -> Math.min(a, b)), result);
+        return result;
+    }
+
+    @Test
+    @IR(counts = {IRNode.VECTOR_MASK_GEN, "1",
+                  IRNode.MAX_REDUCTION_V, "1"},
+        applyIfCPUFeature = {"sve", "true"}, applyIf = {"MaxVectorSize", ">= 32"})
+    public static long testMaxReductionLong() {
+        LongVector v = LongVector.fromArray(LSPEC_128, la, 0);
+        long result = v.reduceLanes(VectorOperators.MAX);
+        Asserts.assertEquals(reduceLanes(Long.MIN_VALUE, la, LSPEC_128.length(), (a, b) -> Math.max(a, b)), result);
+        return result;
+    }
+
+    // ====================== VectorMask Tests ======================
+
+    @Test
+    @IR(counts = {IRNode.VECTOR_MASK_GEN, "1",
+                  IRNode.VECTOR_LOAD_MASK, "1"},
+        applyIfCPUFeature = {"sve", "true"}, applyIf = {"MaxVectorSize", ">= 32"})
+    public static void testLoadMask() {
+        VectorMask<Integer> vm = VectorMask.fromArray(ISPEC_128, m, 0);
+        vm.not().intoArray(mr, 0);
+        // Verify that the mask is loaded correctly.
+        for (int i = 0; i < ISPEC_128.length(); i++) {
+            Asserts.assertEquals(!m[i], mr[i]);
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.VECTOR_MASK_GEN, "1",
+                  IRNode.VECTOR_MASK_CMP, "1"},
+        applyIfCPUFeature = {"sve", "true"}, applyIf = {"MaxVectorSize", ">= 32"})
+    public static void testVectorMaskCmp() {
+        IntVector v1 = IntVector.fromArray(ISPEC_128, ia, 0);
+        IntVector v2 = IntVector.fromArray(ISPEC_128, ib, 0);
+        VectorMask<Integer> vm = v1.compare(VectorOperators.LT, v2);
+        vm.intoArray(mr, 0);
+        // Verify that the mask is generated correctly.
+        for (int i = 0; i < ISPEC_128.length(); i++) {
+            Asserts.assertEquals(ia[i] < ib[i], mr[i]);
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.VECTOR_MASK_GEN, "1",
+                  IRNode.VECTOR_MASK_FIRST_TRUE, "1"},
+        applyIfCPUFeature = {"sve", "true"}, applyIf = {"MaxVectorSize", ">= 32"})
+    public static int testFirstTrue() {
+        VectorMask<Integer> vm = ISPEC_128.maskAll(false);
+        int result = vm.firstTrue();
+        // The result is the vector length if no lane is true.
+        // This is the default behavior of the firstTrue method.
+        Asserts.assertEquals(ISPEC_128.length(), result);
+        return result;
+    }
+
+    public static void main(String[] args) {
+        TestFramework testFramework = new TestFramework();
+        testFramework.setDefaultWarmup(10000)
+                     .addFlags("--add-modules=jdk.incubator.vector")
+                     .start();
+    }
+}
diff --git a/test/hotspot/jtreg/vmTestbase/nsk/monitoring/MemoryPoolMBean/isUsageThresholdExceeded/isexceeded001.java b/test/hotspot/jtreg/vmTestbase/nsk/monitoring/MemoryPoolMBean/isUsageThresholdExceeded/isexceeded001.java
index a684c03e67a..5fbb4d2444e 100644
--- a/test/hotspot/jtreg/vmTestbase/nsk/monitoring/MemoryPoolMBean/isUsageThresholdExceeded/isexceeded001.java
+++ b/test/hotspot/jtreg/vmTestbase/nsk/monitoring/MemoryPoolMBean/isUsageThresholdExceeded/isexceeded001.java
@@ -92,7 +92,8 @@ public static int run(String[] argv, PrintStream out) {
             // but cannot assume this affects the pool we are testing.
             b = new byte[INCREMENT];
 
-            isExceeded = monitor.isUsageThresholdExceeded(pool);
+            // Ensure the observation of isExceeded is sticky to match peakUsage.
+            isExceeded = isExceeded || monitor.isUsageThresholdExceeded(pool);
             log.display("  Allocated heap. isExceeded = " + isExceeded);
 
             // Fetch usage information: use peak usage in comparisons below, in case usage went up and then down.
diff --git a/test/jdk/ProblemList.txt b/test/jdk/ProblemList.txt
index 72a248408ac..caa9ed76c70 100644
--- a/test/jdk/ProblemList.txt
+++ b/test/jdk/ProblemList.txt
@@ -500,6 +500,7 @@ java/awt/GraphicsDevice/DisplayModes/UnknownRefrshRateTest.java 8286436 macosx-a
 java/awt/image/multiresolution/MultiresolutionIconTest.java 8291979 linux-x64,windows-all
 java/awt/event/SequencedEvent/MultipleContextsFunctionalTest.java 8305061 macosx-x64
 sun/java2d/DirectX/OnScreenRenderingResizeTest/OnScreenRenderingResizeTest.java 8301177 linux-x64
+sun/awt/image/bug8038000.java 8373065 generic-all
 
 # Several tests which fail on some hidpi systems/macosx12-aarch64 system
 java/awt/Window/8159168/SetShapeTest.java 8274106 macosx-aarch64
diff --git a/test/jdk/java/lang/management/MemoryMXBean/MemoryManagement.java b/test/jdk/java/lang/management/MemoryMXBean/MemoryManagement.java
index b136b724b71..f6c7446d1f3 100644
--- a/test/jdk/java/lang/management/MemoryMXBean/MemoryManagement.java
+++ b/test/jdk/java/lang/management/MemoryMXBean/MemoryManagement.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -49,7 +49,7 @@
  *
  * @modules jdk.management
  *
- * @run main/timeout=600 MemoryManagement
+ * @run main/othervm/timeout=600 -Xmn8m MemoryManagement
  */
 
 import java.lang.management.*;
@@ -58,6 +58,10 @@ import javax.management.openmbean.CompositeData;
 
 public class MemoryManagement {
+
+    private static final int YOUNG_GEN_SIZE = 8 * 1024 * 1024; // Must match -Xmn set on the @run line
+    private static final int NUM_CHUNKS = 2;
+
     private static final MemoryMXBean mm = ManagementFactory.getMemoryMXBean();
     private static final List<MemoryPoolMXBean> pools = Collections.synchronizedList(ManagementFactory.getMemoryPoolMXBeans());
@@ -66,9 +70,6 @@ public class MemoryManagement {
     private static volatile MemoryPoolMXBean mpool = null;
     private static volatile boolean trace = false;
    private static volatile boolean testFailed = false;
-    private static final int NUM_CHUNKS = 2;
-    // Must match -Xmn set on the @run line
-    private static final int YOUNG_GEN_SIZE = 8 * 1024 * 1024;
     private static volatile long chunkSize;
     private static volatile int listenerInvoked = 0;
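The MemoryManagement change pins the young generation with -Xmn8m and keeps the Java constant in sync, so the chunk size derived from it stays predictable. A hypothetical sketch of the sizing relationship the test depends on (the real test also consults pool usage at run time):

    public class ChunkSizeDemo {
        // Must match -Xmn8m on the @run line, exactly as in the test above.
        private static final int YOUNG_GEN_SIZE = 8 * 1024 * 1024;
        private static final int NUM_CHUNKS = 2;

        public static void main(String[] args) {
            // Sizing chunks so that NUM_CHUNKS allocations fill the young
            // generation forces allocation behavior the test can predict.
            long chunkSize = YOUNG_GEN_SIZE / NUM_CHUNKS;
            System.out.println("chunkSize = " + chunkSize + " bytes");
        }
    }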