diff --git a/hipamd/hip-config-amd.cmake b/hipamd/hip-config-amd.cmake
index b27029be8c..00f05dc9c9 100755
--- a/hipamd/hip-config-amd.cmake
+++ b/hipamd/hip-config-amd.cmake
@@ -178,6 +178,12 @@ if( CLANGRT_Error )
 else()
   # Add support for __fp16 and _Float16, explicitly link with compiler-rt
   if( "${CLANGRT_BUILTINS_FETCH_EXIT_CODE}" STREQUAL "0" )
+    # The HIP_CXX_COMPILER by default prefers backward slashes for path separators on Windows.
+    # Prefer forward slashes here to avoid escaping issues on certain build systems.
+    if(WIN32)
+      string(REPLACE "\\" "/" CLANGRT_BUILTINS ${CLANGRT_BUILTINS})
+    endif()
+
     # CLANG_RT Builtins found Successfully Set interface link libraries property
     set_property(TARGET hip::host APPEND PROPERTY INTERFACE_LINK_LIBRARIES "${CLANGRT_BUILTINS}")
     set_property(TARGET hip::device APPEND PROPERTY INTERFACE_LINK_LIBRARIES "${CLANGRT_BUILTINS}")
diff --git a/hipamd/include/hip/amd_detail/amd_hip_atomic.h b/hipamd/include/hip/amd_detail/amd_hip_atomic.h
index c02a57b079..e35a79abdc 100644
--- a/hipamd/include/hip/amd_detail/amd_hip_atomic.h
+++ b/hipamd/include/hip/amd_detail/amd_hip_atomic.h
@@ -612,11 +612,17 @@ float atomicMin(float* addr, float val) {
 #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
   return unsafeAtomicMin(addr, val);
 #else
+  typedef union u_hold {
+    float a;
+    unsigned int b;
+  } u_hold_t;
+  u_hold_t u{val};
+  bool neg_zero = 0x80000000U == u.b;
 #if __has_builtin(__hip_atomic_load) && \
     __has_builtin(__hip_atomic_compare_exchange_strong)
   float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
   bool done = false;
-  while (!done && value > val) {
+  while (!done && (value > val || (neg_zero && value == 0.0f))) {
     done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED,
                                                 __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
   }
@@ -625,7 +631,7 @@ float atomicMin(float* addr, float val) {
   unsigned int *uaddr = (unsigned int *)addr;
   unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
   bool done = false;
-  while (!done && __uint_as_float(value) > val) {
+  while (!done && (__uint_as_float(value) > val || (neg_zero && __uint_as_float(value) == 0.0f))) {
     done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
                                        __ATOMIC_RELAXED, __ATOMIC_RELAXED);
   }
@@ -658,11 +664,17 @@ double atomicMin(double* addr, double val) {
 #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
   return unsafeAtomicMin(addr, val);
 #else
+  typedef union u_hold {
+    double a;
+    unsigned long long b;
+  } u_hold_t;
+  u_hold_t u{val};
+  bool neg_zero = 0x8000000000000000ULL == u.b;
 #if __has_builtin(__hip_atomic_load) && \
     __has_builtin(__hip_atomic_compare_exchange_strong)
   double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
   bool done = false;
-  while (!done && value > val) {
+  while (!done && (value > val || (neg_zero && value == 0.0))) {
     done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED,
                                                 __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
   }
@@ -671,7 +683,8 @@ double atomicMin(double* addr, double val) {
   unsigned long long *uaddr = (unsigned long long *)addr;
   unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
   bool done = false;
-  while (!done && __longlong_as_double(value) > val) {
+  while (!done &&
+         (__longlong_as_double(value) > val || (neg_zero && __longlong_as_double(value) == 0.0))) {
     done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
                                        __ATOMIC_RELAXED, __ATOMIC_RELAXED);
   }
@@ -856,11 +869,17 @@ float atomicMax(float* addr, float
val) { #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__) return unsafeAtomicMax(addr, val); #else + typedef union u_hold { + float a; + unsigned int b; + } u_hold_t; + u_hold_t u{val}; + bool neg_zero = 0x80000000U == u.b; #if __has_builtin(__hip_atomic_load) && \ __has_builtin(__hip_atomic_compare_exchange_strong) float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); bool done = false; - while (!done && value < val) { + while (!done && (value < val || (neg_zero && value == 0.0f))) { done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); } @@ -869,7 +888,7 @@ float atomicMax(float* addr, float val) { unsigned int *uaddr = (unsigned int *)addr; unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED); bool done = false; - while (!done && __uint_as_float(value) < val) { + while (!done && (__uint_as_float(value) < val || (neg_zero && __uint_as_float(value) == 0.0f))) { done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false, __ATOMIC_RELAXED, __ATOMIC_RELAXED); } @@ -902,11 +921,17 @@ double atomicMax(double* addr, double val) { #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__) return unsafeAtomicMax(addr, val); #else + typedef union u_hold { + double a; + unsigned long long b; + } u_hold_t; + u_hold_t u{val}; + bool neg_zero = 0x8000000000000000ULL == u.b; #if __has_builtin(__hip_atomic_load) && \ __has_builtin(__hip_atomic_compare_exchange_strong) double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); bool done = false; - while (!done && value < val) { + while (!done && (value < val || (neg_zero && value == 0.0))) { done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); } @@ -915,7 +940,8 @@ double atomicMax(double* addr, double val) { unsigned long long *uaddr = (unsigned long long *)addr; unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED); bool done = false; - while (!done && __longlong_as_double(value) < val) { + while (!done && + (__longlong_as_double(value) < val || (neg_zero && __longlong_as_double(value) == 0.0))) { done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false, __ATOMIC_RELAXED, __ATOMIC_RELAXED); } diff --git a/hipamd/include/hip/amd_detail/amd_warp_functions.h b/hipamd/include/hip/amd_detail/amd_warp_functions.h index 64c2740132..98f8896cd9 100644 --- a/hipamd/include/hip/amd_detail/amd_warp_functions.h +++ b/hipamd/include/hip/amd_detail/amd_warp_functions.h @@ -103,14 +103,16 @@ unsigned long long int __ballot64(int predicate) { return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE); } +// See amd_warp_sync_functions.h for an explanation of this preprocessor flag. +#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS // Since threads in a wave do not make independent progress, __activemask() // always returns the exact active mask, i.e, all active threads in the wave. - __device__ inline unsigned long long __activemask() { return __ballot(true); } +#endif // HIP_ENABLE_WARP_SYNC_BUILTINS __device__ static inline unsigned int __lane_id() { return __builtin_amdgcn_mbcnt_hi( diff --git a/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h b/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h index 5ce2581a8d..b8c67a8972 100644 --- a/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h +++ b/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h @@ -22,6 +22,15 @@ THE SOFTWARE. 
#pragma once +// Warp sync builtins (with explicit mask argument) introduced in ROCm 6.1 as a +// preview to allow end-users to adapt to the new interface involving 64-bit +// masks. These are disabled by default, and can be enabled by setting the macro +// below. The builtins will be enabled unconditionally in ROCm 6.2. +// +// This arrangement also applies to the __activemask() builtin defined in +// amd_warp_functions.h. +#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS + #if !defined(__HIPCC_RTC__) #include "amd_warp_functions.h" #include "hip_assert.h" @@ -259,3 +268,5 @@ T __shfl_xor_sync(MaskT mask, T var, int laneMask, #undef __hip_do_sync #undef __hip_check_mask + +#endif // HIP_ENABLE_WARP_SYNC_BUILTINS diff --git a/hipamd/include/hip/amd_detail/hip_api_trace.hpp b/hipamd/include/hip/amd_detail/hip_api_trace.hpp index 7bae717bf5..21f37258da 100644 --- a/hipamd/include/hip/amd_detail/hip_api_trace.hpp +++ b/hipamd/include/hip/amd_detail/hip_api_trace.hpp @@ -936,6 +936,7 @@ typedef hipError_t (*t_hipGraphExecExternalSemaphoresWaitNodeSetParams)(hipGraph typedef hipError_t (*t_hipGraphAddNode)(hipGraphNode_t *pGraphNode, hipGraph_t graph, const hipGraphNode_t *pDependencies, size_t numDependencies, hipGraphNodeParams *nodeParams); +typedef hipError_t (*t_hipExtGetLastError)(); // HIP Compiler dispatch table struct HipCompilerDispatchTable { @@ -1396,4 +1397,5 @@ struct HipDispatchTable { t_hipGraphExecExternalSemaphoresSignalNodeSetParams hipGraphExecExternalSemaphoresSignalNodeSetParams_fn; t_hipGraphExecExternalSemaphoresWaitNodeSetParams hipGraphExecExternalSemaphoresWaitNodeSetParams_fn; t_hipGraphAddNode hipGraphAddNode_fn; + t_hipExtGetLastError hipExtGetLastError_fn; }; diff --git a/hipamd/src/CMakeLists.txt b/hipamd/src/CMakeLists.txt index b038fd72dc..b2bf28bbaa 100644 --- a/hipamd/src/CMakeLists.txt +++ b/hipamd/src/CMakeLists.txt @@ -59,6 +59,9 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake") if(BUILD_SHARED_LIBS) add_library(amdhip64 SHARED) + if(WIN32) + set_target_properties(amdhip64 PROPERTIES RUNTIME_OUTPUT_NAME "amdhip64_${HIP_VERSION_MAJOR}") + endif() # Windows doesn't have a strip utility, so CMAKE_STRIP won't be set. 
if((CMAKE_BUILD_TYPE STREQUAL "Release") AND NOT ("${CMAKE_STRIP}" STREQUAL "")) add_custom_command(TARGET amdhip64 POST_BUILD COMMAND ${CMAKE_STRIP} $) diff --git a/hipamd/src/amdhip.def b/hipamd/src/amdhip.def index 2133815d86..f7ac5df298 100644 --- a/hipamd/src/amdhip.def +++ b/hipamd/src/amdhip.def @@ -427,7 +427,6 @@ hipMemcpy2DAsync_spt hipMemcpyFromSymbolAsync_spt hipMemcpyToSymbolAsync_spt hipMemcpyFromArray_spt -hipMemcpy2DToArray_spt hipMemcpy2DFromArrayAsync_spt hipMemcpy2DToArrayAsync_spt hipDrvGetErrorName diff --git a/hipamd/src/hip_api_trace.cpp b/hipamd/src/hip_api_trace.cpp index 2ec7ab86d7..c06682fbdd 100644 --- a/hipamd/src/hip_api_trace.cpp +++ b/hipamd/src/hip_api_trace.cpp @@ -756,6 +756,7 @@ hipError_t hipGraphExternalSemaphoresWaitNodeSetParams( hipGraphNode_t hNode, const hipExternalSemaphoreWaitNodeParams* nodeParams); hipError_t hipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams* launchParamsList, unsigned int numDevices, unsigned int flags); +hipError_t hipExtGetLastError(); } // namespace hip namespace hip { @@ -808,6 +809,7 @@ void UpdateDispatchTable(HipDispatchTable* ptrDispatchTable) { ptrDispatchTable->hipCtxSetSharedMemConfig_fn = hip::hipCtxSetSharedMemConfig; ptrDispatchTable->hipCtxSynchronize_fn = hip::hipCtxSynchronize; ptrDispatchTable->hipDestroyExternalMemory_fn = hip::hipDestroyExternalMemory; + ptrDispatchTable->hipDestroyExternalSemaphore_fn = hip::hipDestroyExternalSemaphore; ptrDispatchTable->hipDestroySurfaceObject_fn = hip::hipDestroySurfaceObject; ptrDispatchTable->hipDestroyTextureObject_fn = hip::hipDestroyTextureObject; ptrDispatchTable->hipDeviceCanAccessPeer_fn = hip::hipDeviceCanAccessPeer; @@ -1224,6 +1226,7 @@ void UpdateDispatchTable(HipDispatchTable* ptrDispatchTable) { ptrDispatchTable->hipGetStreamDeviceId_fn = hip::hipGetStreamDeviceId; ptrDispatchTable->hipDrvGraphAddMemsetNode_fn = hip::hipDrvGraphAddMemsetNode; ptrDispatchTable->hipGetDevicePropertiesR0000_fn = hip::hipGetDevicePropertiesR0000; + ptrDispatchTable->hipExtGetLastError_fn = hip::hipExtGetLastError; } #if HIP_ROCPROFILER_REGISTER > 0 diff --git a/hipamd/src/hip_context.cpp b/hipamd/src/hip_context.cpp index 47569f0669..4b7f5c8098 100644 --- a/hipamd/src/hip_context.cpp +++ b/hipamd/src/hip_context.cpp @@ -90,7 +90,7 @@ void setCurrentDevice(unsigned int index) { hip::Stream* getStream(hipStream_t stream, bool wait) { if (stream == nullptr) { - return getNullStream(); + return getNullStream(wait); } else { hip::Stream* hip_stream = reinterpret_cast(stream); if (wait && !(hip_stream->Flags() & hipStreamNonBlocking)) { @@ -128,9 +128,9 @@ int getDeviceID(amd::Context& ctx) { } // ================================================================================================ -hip::Stream* getNullStream() { +hip::Stream* getNullStream(bool wait ) { Device* device = getCurrentDevice(); - return device ? device->NullStream() : nullptr; + return device ? 
device->NullStream(wait) : nullptr; } hipError_t hipInit(unsigned int flags) { diff --git a/hipamd/src/hip_device.cpp b/hipamd/src/hip_device.cpp index 5657fd6371..bcf6830f12 100644 --- a/hipamd/src/hip_device.cpp +++ b/hipamd/src/hip_device.cpp @@ -30,7 +30,7 @@ namespace hip { // ================================================================================================ -hip::Stream* Device::NullStream() { +hip::Stream* Device::NullStream(bool wait) { if (null_stream_ == nullptr) { null_stream_ = new Stream(this, Stream::Priority::Normal, 0, true); } @@ -38,8 +38,10 @@ hip::Stream* Device::NullStream() { if (null_stream_ == nullptr) { return nullptr; } - // Wait for all active streams before executing commands on the default - iHipWaitActiveStreams(null_stream_); + if (wait == true) { + // Wait for all active streams before executing commands on the default + iHipWaitActiveStreams(null_stream_); + } return null_stream_; } @@ -463,6 +465,8 @@ hipError_t ihipGetDeviceProperties(hipDeviceProp_tR0600* props, int device) { deviceProps.timelineSemaphoreInteropSupported = 0; deviceProps.unifiedFunctionPointers = 0; + deviceProps.integrated = info.accelerator_; + *props = deviceProps; return hipSuccess; } diff --git a/hipamd/src/hip_embed_pch.sh b/hipamd/src/hip_embed_pch.sh index 2d998407ae..9b01da8726 100755 --- a/hipamd/src/hip_embed_pch.sh +++ b/hipamd/src/hip_embed_pch.sh @@ -203,7 +203,7 @@ __hipRTC_header_size: EOF set -x - $LLVM_DIR/bin/clang -O3 --hip-path=$HIP_INC_DIR/.. -std=c++14 -nogpulib --hip-version=4.4 -isystem $HIP_INC_DIR -isystem $HIP_BUILD_INC_DIR -isystem $HIP_AMD_INC_DIR --cuda-device-only -D__HIPCC_RTC__ -x hip $tmp/hipRTC_header.h -E -o $tmp/hiprtc && + $LLVM_DIR/bin/clang -O3 --hip-path=$HIP_INC_DIR/.. -std=c++14 -nogpulib --hip-version=4.4 -isystem $HIP_INC_DIR -isystem $HIP_BUILD_INC_DIR -isystem $HIP_AMD_INC_DIR --cuda-device-only -D__HIPCC_RTC__ -x hip $tmp/hipRTC_header.h -E -P -o $tmp/hiprtc && cat $macroFile >> $tmp/hiprtc && $LLVM_DIR/bin/llvm-mc -o $tmp/hiprtc_header.o $tmp/hipRTC_header.mcin --filetype=obj && $LLVM_DIR/bin/clang $tmp/hiprtc_header.o -o $rtc_shared_lib_out -shared && diff --git a/hipamd/src/hip_event.cpp b/hipamd/src/hip_event.cpp index d233e42438..f527766e4b 100644 --- a/hipamd/src/hip_event.cpp +++ b/hipamd/src/hip_event.cpp @@ -85,8 +85,6 @@ hipError_t Event::synchronize() { event_->awaitCompletion(); } } - // Release freed memory for all memory pools on the device - hip_device->ReleaseFreedMemory(); return hipSuccess; } @@ -443,7 +441,12 @@ hipError_t hipEventSynchronize(hipEvent_t event) { if (hip::Stream::StreamCaptureOngoing(e->GetCaptureStream()) == true) { HIP_RETURN(hipErrorStreamCaptureUnsupported); } - HIP_RETURN(e->synchronize()); + + hipError_t status = e->synchronize(); + // Release freed memory for all memory pools on the device + g_devices[e->deviceId()]->ReleaseFreedMemory(); + + HIP_RETURN(status); } hipError_t ihipEventQuery(hipEvent_t event) { diff --git a/hipamd/src/hip_fatbin.cpp b/hipamd/src/hip_fatbin.cpp index 9d5591d053..ac8775597d 100644 --- a/hipamd/src/hip_fatbin.cpp +++ b/hipamd/src/hip_fatbin.cpp @@ -1,5 +1,5 @@ /* -Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2023 - 2024 Advanced Micro Devices, Inc. All rights reserved. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -115,10 +115,9 @@ void ListAllDeviceWithNoCOFromBundle(const std::unordered_map& devices) { - amd_comgr_data_t data_object; + amd_comgr_data_t data_object {0}; amd_comgr_status_t comgr_status = AMD_COMGR_STATUS_SUCCESS; hipError_t hip_status = hipSuccess; - amd_comgr_code_object_info_t* query_list_array = nullptr; // If image was passed as a pointer to our hipMod* api, we can try to extract the file name // if it was mapped by the app. Otherwise use the COMGR data API. @@ -266,12 +265,6 @@ hipError_t FatBinaryInfo::ExtractFatBinaryUsingCOMGR(const std::vectordeviceId()]->program_ = new amd::Program(*(device->asContext())); } - - if ((comgr_status = amd_comgr_release_data(data_object)) != AMD_COMGR_STATUS_SUCCESS) { - LogPrintfError("Releasing COMGR data failed with status %d ", comgr_status); - return hipErrorInvalidValue; - } - } while(0); // Clean up file and memory resouces if hip_status failed for some reason. @@ -292,7 +285,9 @@ hipError_t FatBinaryInfo::ExtractFatBinaryUsingCOMGR(const std::vector g_captureStreams; amd::Monitor g_captureStreamsLock{"StreamCaptureGlobalList"}; amd::Monitor g_streamSetLock{"StreamCaptureset"}; std::unordered_set g_allCapturingStreams; +hipError_t ihipGraphDebugDotPrint(hipGraph_t graph, const char* path, unsigned int flags); inline hipError_t ihipGraphAddNode(hip::GraphNode* graphNode, hip::Graph* graph, hip::GraphNode* const* pDependencies, size_t numDependencies, @@ -1223,6 +1224,16 @@ hipError_t ihipGraphInstantiate(hip::GraphExec** pGraphExec, hip::Graph* graph, flags); if (*pGraphExec != nullptr) { graph->SetGraphInstantiated(true); + if (DEBUG_HIP_GRAPH_DOT_PRINT) { + static int i = 1; + std::string filename = + "graph_" + std::to_string(amd::Os::getProcessId()) + "_dot_print_" + std::to_string(i++); + hipError_t status = + ihipGraphDebugDotPrint(reinterpret_cast(graph), filename.c_str(), 0); + if (status == hipSuccess) { + LogPrintfInfo("[hipGraph] graph dump:%s", filename.c_str()); + } + } return (*pGraphExec)->Init(); } else { return hipErrorOutOfMemory; @@ -1541,7 +1552,15 @@ hipError_t hipGraphExecKernelNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNo if (clonedNode == nullptr) { HIP_RETURN(hipErrorInvalidValue); } - HIP_RETURN(reinterpret_cast(clonedNode)->SetParams(pNodeParams)); + hipError_t status = reinterpret_cast(clonedNode)->SetParams(pNodeParams); + if(status != hipSuccess) { + HIP_RETURN(status); + } + if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) { + status = reinterpret_cast(hGraphExec) + ->UpdateAQLPacket(reinterpret_cast(clonedNode)); + } + HIP_RETURN(status); } hipError_t hipGraphChildGraphNodeGetGraph(hipGraphNode_t node, hipGraph_t* pGraph) { @@ -2550,13 +2569,10 @@ hipError_t hipGraphKernelNodeCopyAttributes(hipGraphNode_t hSrc, hipGraphNode_t } hipError_t ihipGraphDebugDotPrint(hipGraph_t graph, const char* path, unsigned int flags) { - if (graph == nullptr || path == nullptr) { - return hipErrorInvalidValue; - } std::ofstream fout; fout.open(path, std::ios::out); if (fout.fail()) { - ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Error during opening of file : %s", path); + LogPrintfError("[hipGraph] Error during opening of file : %s", path); return hipErrorOperatingSystem; } fout << "digraph dot {" << std::endl; @@ -2568,6 +2584,9 @@ hipError_t ihipGraphDebugDotPrint(hipGraph_t graph, const char* path, unsigned i hipError_t hipGraphDebugDotPrint(hipGraph_t graph, const 
char* path, unsigned int flags) { HIP_INIT_API(hipGraphDebugDotPrint, graph, path, flags); + if (graph == nullptr || path == nullptr) { + return hipErrorInvalidValue; + } HIP_RETURN(ihipGraphDebugDotPrint(graph, path, flags)); } diff --git a/hipamd/src/hip_graph_internal.cpp b/hipamd/src/hip_graph_internal.cpp index ee70f8b1b9..f3a5be0880 100644 --- a/hipamd/src/hip_graph_internal.cpp +++ b/hipamd/src/hip_graph_internal.cpp @@ -332,6 +332,9 @@ hipError_t GraphExec::CreateStreams(uint32_t num_streams) { } parallel_streams_.push_back(stream); } + // Don't wait for other streams to finish. + // Capture stream is to capture AQL packet. + capture_stream_ = hip::getNullStream(false); return hipSuccess; } @@ -353,13 +356,10 @@ hipError_t GraphExec::CaptureAQLPackets() { hipError_t status = hipSuccess; if (parallelLists_.size() == 1) { size_t kernArgSizeForGraph = 0; - hip::Stream* stream = nullptr; // GPU packet capture is enabled for kernel nodes. Calculate the kernel // arg size required for all graph kernel nodes to allocate for (const auto& list : parallelLists_) { - stream = GetAvailableStreams(); for (auto& node : list) { - node->SetStream(stream, this); if (node->GetType() == hipGraphNodeTypeKernel) { kernArgSizeForGraph += reinterpret_cast(node)->GetKerArgSize(); } @@ -386,7 +386,6 @@ hipError_t GraphExec::CaptureAQLPackets() { for (auto& node : topoOrder_) { if (node->GetType() == hipGraphNodeTypeKernel) { auto kernelNode = reinterpret_cast(node); - status = node->CreateCommand(node->GetQueue()); // From the kernel pool allocate the kern arg size required for the current kernel node. address kernArgOffset = allocKernArg(kernelNode->GetKernargSegmentByteSize(), kernelNode->GetKernargSegmentAlignment()); @@ -394,7 +393,7 @@ hipError_t GraphExec::CaptureAQLPackets() { return hipErrorMemoryAllocation; } // Form GPU packet capture for the kernel node. - kernelNode->CaptureAndFormPacket(kernArgOffset); + kernelNode->CaptureAndFormPacket(capture_stream_, kernArgOffset) ; } } @@ -408,7 +407,7 @@ hipError_t GraphExec::CaptureAQLPackets() { address dev_ptr = kernarg_pool_graph_ + kernarg_pool_size_graph_ - sizeof(int); *dev_ptr = host_val; if (device->info().hdpMemFlushCntl == nullptr) { - amd::Command* command = new amd::Marker(*stream, true); + amd::Command* command = new amd::Marker(*capture_stream_, true); if (command != nullptr) { command->enqueue(); command->release(); @@ -426,6 +425,50 @@ hipError_t GraphExec::CaptureAQLPackets() { return status; } +hipError_t GraphExec::UpdateAQLPacket(hip::GraphKernelNode* node) { + if (parallelLists_.size() == 1) { + size_t pool_new_usage = 0; + address result = nullptr; + if (!kernarg_graph_.empty()) { + // 1. Allocate memory for the kernel args + size_t kernArgSizeForNode = 0; + kernArgSizeForNode = node->GetKerArgSize(); + + result = amd::alignUp(kernarg_graph_.back() + kernarg_graph_cur_offset_, + node->GetKernargSegmentAlignment()); + pool_new_usage = (result + kernArgSizeForNode) - kernarg_graph_.back(); + } + if (pool_new_usage != 0 && pool_new_usage <= kernarg_graph_size_) { + kernarg_graph_cur_offset_ = pool_new_usage; + } else { + address kernarg_graph; + auto device = g_devices[ihipGetDevice()]->devices()[0]; + if (device->info().largeBar_) { + kernarg_graph = reinterpret_cast
(device->deviceLocalAlloc(kernarg_graph_size_)); + } else { + kernarg_graph = reinterpret_cast
( + device->hostAlloc(kernarg_graph_size_, 0, amd::Device::MemorySegment::kKernArg)); + } + kernarg_graph_.push_back(kernarg_graph); + kernarg_graph_cur_offset_ = 0; + + // 1. Allocate memory for the kernel args + size_t kernArgSizeForNode = 0; + kernArgSizeForNode = node->GetKerArgSize(); + result = amd::alignUp(kernarg_graph_.back() + kernarg_graph_cur_offset_, + node->GetKernargSegmentAlignment()); + const size_t pool_new_usage = (result + kernArgSizeForNode) - kernarg_graph_.back(); + if (pool_new_usage <= kernarg_graph_size_) { + kernarg_graph_cur_offset_ = pool_new_usage; + } + } + + // 2. copy kernel args / create new AQL packet + node->CaptureAndFormPacket(capture_stream_, result); + } + return hipSuccess; +} + hipError_t FillCommands(std::vector>& parallelLists, std::unordered_map>& nodeWaitLists, std::vector& topoOrder, Graph* clonedGraph, diff --git a/hipamd/src/hip_graph_internal.hpp b/hipamd/src/hip_graph_internal.hpp index 7f6543cb91..2229b75d8c 100644 --- a/hipamd/src/hip_graph_internal.hpp +++ b/hipamd/src/hip_graph_internal.hpp @@ -543,7 +543,7 @@ struct Graph { graphInstantiated_ = graphInstantiate; } }; - +struct GraphKernelNode; struct GraphExec { std::vector> parallelLists_; // Topological order of the graph doesn't include nodes embedded as part of the child graph @@ -551,6 +551,7 @@ struct GraphExec { std::unordered_map> nodeWaitLists_; struct Graph* clonedGraph_; std::vector parallel_streams_; + hip::Stream* capture_stream_; uint currentQueueIndex_; std::unordered_map clonedNodes_; amd::Command* lastEnqueuedCommand_; @@ -563,6 +564,10 @@ struct GraphExec { address kernarg_pool_graph_ = nullptr; uint32_t kernarg_pool_size_graph_ = 0; uint32_t kernarg_pool_cur_graph_offset_ = 0; + std::vector
kernarg_graph_; + uint32_t kernarg_graph_cur_offset_ = 0; + uint32_t kernarg_graph_size_ = 128 * Ki; + public: GraphExec(std::vector& topoOrder, std::vector>& lists, std::unordered_map>& nodeWaitLists, struct Graph*& clonedGraph, @@ -591,6 +596,9 @@ struct GraphExec { auto device = g_devices[ihipGetDevice()]->devices()[0]; if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) { device->hostFree(kernarg_pool_graph_, kernarg_pool_size_graph_); + for (auto& element : kernarg_graph_) { + device->hostFree(element, kernarg_graph_size_); + } } amd::ScopedLock lock(graphExecSetLock_); graphExecSet_.erase(this); @@ -636,6 +644,7 @@ struct GraphExec { hipError_t Run(hipStream_t stream); // Capture GPU Packets from graph commands hipError_t CaptureAQLPackets(); + hipError_t UpdateAQLPacket(hip::GraphKernelNode* node); }; struct ChildGraphNode : public GraphNode { @@ -793,19 +802,20 @@ class GraphKernelNode : public GraphNode { out << "];"; } - void CaptureAndFormPacket(address kernArgOffset) { - for (auto& command : commands_) { - reinterpret_cast(command)->setCapturingState( - true, GetAqlPacket(), kernArgOffset); - - // Enqueue command to capture GPU Packet. The packet is not submitted to the device. - // The packet is stored in gpuPacket_ and submitted during graph launch. - command->submit(*(command->queue())->vdev()); - // Need to ensure if the command is NDRangeKernelCommand if we capture non kernel nodes - SetKernelName(reinterpret_cast(command)->kernel().name()); - command->release(); - } + void CaptureAndFormPacket(hip::Stream* capture_stream, address kernArgOffset) { + hipError_t status = CreateCommand(capture_stream); + for (auto& command : commands_) { + reinterpret_cast(command)->setCapturingState( + true, GetAqlPacket(), kernArgOffset); + + // Enqueue command to capture GPU Packet. The packet is not submitted to the device. + // The packet is stored in gpuPacket_ and submitted during graph launch. 
+ command->submit(*(command->queue())->vdev()); + // Need to ensure if the command is NDRangeKernelCommand if we capture non kernel nodes + SetKernelName(reinterpret_cast(command)->kernel().name()); + command->release(); } + } std::string GetLabel(hipGraphDebugDotFlags flag) { hipFunction_t func = getFunc(kernelParams_, ihipGetDevice()); diff --git a/hipamd/src/hip_hcc.map.in b/hipamd/src/hip_hcc.map.in index b8f0b3da64..fc7d91e8bc 100644 --- a/hipamd/src/hip_hcc.map.in +++ b/hipamd/src/hip_hcc.map.in @@ -486,7 +486,6 @@ global: hipMemcpyFromSymbolAsync_spt; hipMemcpyToSymbolAsync_spt; hipMemcpyFromArray_spt; - hipMemcpy2DToArray_spt; hipMemcpy2DFromArrayAsync_spt; hipMemcpy2DToArrayAsync_spt; hipDrvGetErrorName; diff --git a/hipamd/src/hip_internal.hpp b/hipamd/src/hip_internal.hpp index 88b12d8451..d3d38178fa 100644 --- a/hipamd/src/hip_internal.hpp +++ b/hipamd/src/hip_internal.hpp @@ -487,7 +487,7 @@ class stream_per_thread { void setFlags(unsigned int flags) { flags_ = flags; } void Reset(); - hip::Stream* NullStream(); + hip::Stream* NullStream(bool wait = true); Stream* GetNullStream() const {return null_stream_;}; void SetActiveStatus() { @@ -572,7 +572,7 @@ class stream_per_thread { /// Get default stream associated with the ROCclr context extern hip::Stream* getNullStream(amd::Context&); /// Get default stream of the thread - extern hip::Stream* getNullStream(); + extern hip::Stream* getNullStream(bool wait = true); /// Get device ID associated with the ROCclr context int getDeviceID(amd::Context& ctx); /// Check if stream is valid diff --git a/hipamd/src/hip_memory.cpp b/hipamd/src/hip_memory.cpp index 64ed22985c..8dc0e176bd 100644 --- a/hipamd/src/hip_memory.cpp +++ b/hipamd/src/hip_memory.cpp @@ -1311,8 +1311,8 @@ hipError_t hipMemcpyToSymbol_common(const void* symbol, const void* src, size_t size_t offset, hipMemcpyKind kind, hipStream_t stream=nullptr) { CHECK_STREAM_CAPTURING(); - if (kind != hipMemcpyHostToDevice && (kind != hipMemcpyDeviceToDevice || - kind != hipMemcpyDeviceToDeviceNoCU)) { + if (kind != hipMemcpyHostToDevice && kind != hipMemcpyDeviceToDevice && + kind != hipMemcpyDeviceToDeviceNoCU) { HIP_RETURN(hipErrorInvalidMemcpyDirection); } @@ -1345,8 +1345,8 @@ hipError_t hipMemcpyFromSymbol_common(void* dst, const void* symbol, size_t size size_t offset, hipMemcpyKind kind, hipStream_t stream=nullptr) { CHECK_STREAM_CAPTURING(); - if (kind != hipMemcpyDeviceToHost && (kind != hipMemcpyDeviceToDevice || - kind != hipMemcpyDeviceToDeviceNoCU)) { + if (kind != hipMemcpyDeviceToHost && kind != hipMemcpyDeviceToDevice && + kind != hipMemcpyDeviceToDeviceNoCU) { HIP_RETURN(hipErrorInvalidMemcpyDirection); } @@ -1379,8 +1379,8 @@ hipError_t hipMemcpyToSymbolAsync_common(const void* symbol, const void* src, si size_t offset, hipMemcpyKind kind, hipStream_t stream) { STREAM_CAPTURE(hipMemcpyToSymbolAsync, stream, symbol, src, sizeBytes, offset, kind); - if (kind != hipMemcpyHostToDevice && (kind != hipMemcpyDeviceToDevice || - kind != hipMemcpyDeviceToDeviceNoCU)) { + if (kind != hipMemcpyHostToDevice && kind != hipMemcpyDeviceToDevice && + kind != hipMemcpyDeviceToDeviceNoCU) { return hipErrorInvalidMemcpyDirection; } @@ -1412,8 +1412,8 @@ hipError_t hipMemcpyFromSymbolAsync_common(void* dst, const void* symbol, size_t size_t offset, hipMemcpyKind kind, hipStream_t stream) { STREAM_CAPTURE(hipMemcpyFromSymbolAsync, stream, dst, symbol, sizeBytes, offset, kind); - if (kind != hipMemcpyDeviceToHost && (kind != hipMemcpyDeviceToDevice || - kind != 
hipMemcpyDeviceToDeviceNoCU)) { + if (kind != hipMemcpyDeviceToHost && kind != hipMemcpyDeviceToDevice && + kind != hipMemcpyDeviceToDeviceNoCU) { return hipErrorInvalidMemcpyDirection; } @@ -1820,28 +1820,43 @@ hipError_t ihipMemcpyDtoHCommand(amd::Command*& command, void* srcDevice, void* amd::Memory* srcMemory; amd::BufferRect srcRect; amd::BufferRect dstRect; + size_t dOffset = 0; + amd::Memory* dstMemory = getMemoryObject(dstHost, dOffset); + hipError_t status = ihipMemcpyDtoHValidate(srcDevice, dstHost, srcOrigin, dstOrigin, copyRegion, srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch, srcMemory, srcRect, dstRect); if (status != hipSuccess) { return status; } + amd::Coord3D srcStart(srcRect.start_, 0, 0); amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA); - amd::ReadMemoryCommand* readCommand = + if (dstMemory) { + amd::CopyMemoryCommand *copyCommand = new amd::CopyMemoryCommand( + *stream, CL_COMMAND_COPY_BUFFER_RECT, amd::Command::EventWaitList{}, + *srcMemory, *dstMemory, srcOrigin, dstOrigin, + copyRegion, srcRect, dstRect, copyMetadata); + if (copyCommand == nullptr) { + return hipErrorOutOfMemory; + } + command = copyCommand; + } else { + amd::ReadMemoryCommand* readCommand = new amd::ReadMemoryCommand(*stream, CL_COMMAND_READ_BUFFER_RECT, amd::Command::EventWaitList{}, *srcMemory, srcStart, copyRegion, dstHost, srcRect, dstRect, copyMetadata); + if (readCommand == nullptr) { + return hipErrorOutOfMemory; + } - if (readCommand == nullptr) { - return hipErrorOutOfMemory; + if (!readCommand->validatePeerMemory()) { + delete readCommand; + return hipErrorInvalidValue; + } + command = readCommand; } - if (!readCommand->validatePeerMemory()) { - delete readCommand; - return hipErrorInvalidValue; - } - command = readCommand; return hipSuccess; } @@ -1884,6 +1899,8 @@ hipError_t ihipMemcpyHtoDCommand(amd::Command*& command, const void* srcHost, vo amd::Memory* dstMemory; amd::BufferRect srcRect; amd::BufferRect dstRect; + size_t sOffset = 0; + amd::Memory* srcMemory = getMemoryObject(srcHost, sOffset); hipError_t status = ihipMemcpyHtoDValidate(srcHost, dstDevice, srcOrigin, dstOrigin, copyRegion, srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch, @@ -1891,21 +1908,33 @@ hipError_t ihipMemcpyHtoDCommand(amd::Command*& command, const void* srcHost, vo if (status != hipSuccess) { return status; } + amd::Coord3D dstStart(dstRect.start_, 0, 0); amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA); - amd::WriteMemoryCommand* writeCommand = new amd::WriteMemoryCommand( + if (srcMemory) { + amd::CopyMemoryCommand *copyCommand = new amd::CopyMemoryCommand( + *stream, CL_COMMAND_COPY_BUFFER_RECT, amd::Command::EventWaitList{}, + *srcMemory, *dstMemory, srcOrigin, dstOrigin, + copyRegion, srcRect, dstRect, copyMetadata); + if (copyCommand == nullptr) { + return hipErrorOutOfMemory; + } + command = copyCommand; + } else { + amd::WriteMemoryCommand *writeCommand = new amd::WriteMemoryCommand( *stream, CL_COMMAND_WRITE_BUFFER_RECT, amd::Command::EventWaitList{}, *dstMemory, dstStart, copyRegion, srcHost, dstRect, srcRect, copyMetadata); + if (writeCommand == nullptr) { + return hipErrorOutOfMemory; + } - if (writeCommand == nullptr) { - return hipErrorOutOfMemory; + if (!writeCommand->validatePeerMemory()) { + delete writeCommand; + return hipErrorInvalidValue; + } + command = writeCommand; } - if (!writeCommand->validatePeerMemory()) { - delete writeCommand; - return hipErrorInvalidValue; - } - command = 
writeCommand; return hipSuccess; } @@ -2068,6 +2097,8 @@ hipError_t ihipMemcpyHtoACommand(amd::Command*& command, const void* srcHost, hi hip::Stream* stream, bool isAsync = false) { amd::Image* dstImage; size_t start = 0; //!< Start offset for the copy region + size_t sOffset = 0; + amd::Memory* srcMemory = getMemoryObject(srcHost, sOffset); hipError_t status = ihipMemcpyHtoAValidate(srcHost, dstArray, srcOrigin, dstOrigin, copyRegion, srcRowPitch, srcSlicePitch, dstImage, start); @@ -2076,20 +2107,31 @@ hipError_t ihipMemcpyHtoACommand(amd::Command*& command, const void* srcHost, hi } amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA); - amd::WriteMemoryCommand* writeMemCmd = new amd::WriteMemoryCommand( + if (srcMemory) { + amd::CopyMemoryCommand *copyCommand = new amd::CopyMemoryCommand( + *stream, CL_COMMAND_COPY_BUFFER_TO_IMAGE, amd::Command::EventWaitList{}, + *srcMemory, *dstImage, srcOrigin, dstOrigin, + copyRegion, copyMetadata); + if (copyCommand == nullptr) { + return hipErrorOutOfMemory; + } + command = copyCommand; + } else { + amd::WriteMemoryCommand* writeMemCmd = new amd::WriteMemoryCommand( *stream, CL_COMMAND_WRITE_IMAGE, amd::Command::EventWaitList{}, *dstImage, dstOrigin, copyRegion, static_cast(srcHost) + start, srcRowPitch, srcSlicePitch, copyMetadata); + if (writeMemCmd == nullptr) { + return hipErrorOutOfMemory; + } - if (writeMemCmd == nullptr) { - return hipErrorOutOfMemory; + if (!writeMemCmd->validatePeerMemory()) { + delete writeMemCmd; + return hipErrorInvalidValue; + } + command = writeMemCmd; } - if (!writeMemCmd->validatePeerMemory()) { - delete writeMemCmd; - return hipErrorInvalidValue; - } - command = writeMemCmd; return hipSuccess; } @@ -2127,8 +2169,9 @@ hipError_t ihipMemcpyAtoHCommand(amd::Command*& command, hipArray_t srcArray, vo hip::Stream* stream, bool isAsync = false) { amd::Image* srcImage; amd::BufferRect dstRect; - amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA); size_t start = 0; //!< Start offset for the copy region + size_t dOffset = 0; + amd::Memory* dstMemory = getMemoryObject(dstHost, dOffset); hipError_t status = ihipMemcpyAtoHValidate(srcArray, dstHost, srcOrigin, dstOrigin, copyRegion, dstRowPitch, dstSlicePitch, srcImage, start); @@ -2136,20 +2179,32 @@ hipError_t ihipMemcpyAtoHCommand(amd::Command*& command, hipArray_t srcArray, vo return status; } - amd::ReadMemoryCommand* readMemCmd = new amd::ReadMemoryCommand( + amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA); + if (dstMemory) { + amd::CopyMemoryCommand *copyCommand = new amd::CopyMemoryCommand( + *stream, CL_COMMAND_COPY_IMAGE_TO_BUFFER, amd::Command::EventWaitList{}, + *srcImage, *dstMemory, srcOrigin, dstOrigin, copyRegion, copyMetadata); + if (copyCommand == nullptr) { + return hipErrorOutOfMemory; + } + command = copyCommand; + } else { + amd::ReadMemoryCommand* readMemCmd = new amd::ReadMemoryCommand( *stream, CL_COMMAND_READ_IMAGE, amd::Command::EventWaitList{}, *srcImage, srcOrigin, copyRegion, static_cast(dstHost) + start, dstRowPitch, dstSlicePitch, copyMetadata); - if (readMemCmd == nullptr) { - return hipErrorOutOfMemory; - } + if (readMemCmd == nullptr) { + return hipErrorOutOfMemory; + } - if (!readMemCmd->validatePeerMemory()) { - delete readMemCmd; - return hipErrorInvalidValue; + if (!readMemCmd->validatePeerMemory()) { + delete readMemCmd; + return hipErrorInvalidValue; + } + command = readMemCmd; } - command = readMemCmd; + return hipSuccess; } 
diff --git a/hipamd/src/hip_mempool_impl.cpp b/hipamd/src/hip_mempool_impl.cpp
index 6971c0fa9c..3afe065bb0 100644
--- a/hipamd/src/hip_mempool_impl.cpp
+++ b/hipamd/src/hip_mempool_impl.cpp
@@ -44,18 +44,21 @@ void Heap::AddMemory(amd::Memory* memory, const MemoryTimestamp& ts) {
 amd::Memory* Heap::FindMemory(size_t size, hip::Stream* stream, bool opportunistic, void* dptr) {
   amd::Memory* memory = nullptr;
   auto start = allocations_.lower_bound({size, nullptr});
-  // Runtime can accept an allocation with 12.5% on the size threshold
-  uint32_t i = 0;
-  for (auto it = start; (it != allocations_.end()) && (it->first.first <= (size / 8.0) * 9);) {
-    i++;
+  for (auto it = start; it != allocations_.end();) {
     bool check_address = (dptr == nullptr);
     if (it->first.second->getSvmPtr() == dptr) {
       // If the search is done for the specified address then runtime must wait
       it->second.Wait();
       check_address = true;
     }
+    // Runtime can accept an allocation within 12.5% of the size threshold
+    bool opp_mode = opportunistic;
+    if (it->first.first > (size / 8.0) * 9) {
+      // Disable opportunistic mode for more aggressive search
+      opp_mode = false;
+    }
     // Check if size can match and it's safe to use this resource.
-    if (check_address && (it->second.IsSafeFind(stream, opportunistic))) {
+    if (check_address && (it->second.IsSafeFind(stream, opp_mode))) {
       memory = it->first.second;
       total_size_ -= memory->getSize();
       // Remove found allocation from the map
@@ -219,6 +222,22 @@ void* MemoryPool::AllocateMemory(size_t size, hip::Stream* stream, void* dptr) {
 bool MemoryPool::FreeMemory(amd::Memory* memory, hip::Stream* stream) {
   amd::ScopedLock lock(lock_pool_ops_);
 
+  // If the free heap grows over the busy heap, then force release
+  if (free_heap_.GetTotalSize() > busy_heap_.GetTotalSize()) {
+    // Use event-based release to reduce memory pressure
+    constexpr size_t kBytesToHold = 0;
+    free_heap_.ReleaseAllMemory(kBytesToHold);
+
+    // If free memory is less than 12.5% of total, then force wait release
+    size_t free = 0;
+    size_t total = 0;
+    hipError_t err = hipMemGetInfo(&free, &total);
+    if ((err == hipSuccess) && (free < (total >> 3))) {
+      constexpr bool kSafeRelease = true;
+      free_heap_.ReleaseAllMemory(free_heap_.GetTotalSize() >> 1, kSafeRelease);
+    }
+  }
+
   MemoryTimestamp ts;
   // Remove memory object from the busy pool
   if (!busy_heap_.RemoveMemory(memory, &ts)) {
@@ -327,6 +346,7 @@ hipError_t MemoryPool::SetAttribute(hipMemPoolAttr attr, void* value) {
         return hipErrorInvalidValue;
       }
       free_heap_.SetMaxTotalSize(reset);
+      busy_heap_.SetMaxTotalSize(reset);
       break;
     case hipMemPoolAttrUsedMemCurrent:
       // Should be GetAttribute only
@@ -372,7 +392,8 @@ hipError_t MemoryPool::GetAttribute(hipMemPoolAttr attr, void* value) {
       break;
     case hipMemPoolAttrReservedMemHigh:
      // High watermark of all allocated memory in OS, since the last reset
-      *reinterpret_cast(value) = busy_heap_.GetTotalSize() + free_heap_.GetMaxTotalSize();
+      *reinterpret_cast(value) = busy_heap_.GetMaxTotalSize() +
+          free_heap_.GetMaxTotalSize();
       break;
     case hipMemPoolAttrUsedMemCurrent:
      // Total currently used memory by the pool
diff --git a/hipamd/src/hip_table_interface.cpp b/hipamd/src/hip_table_interface.cpp
index 8324e42371..81d681949e 100644
--- a/hipamd/src/hip_table_interface.cpp
+++ b/hipamd/src/hip_table_interface.cpp
@@ -1711,3 +1711,6 @@ hipError_t hipLaunchHostFunc_spt(hipStream_t stream, hipHostFn_t fn, void* userD
 extern "C" int hipGetStreamDeviceId(hipStream_t stream) {
   return hip::GetHipDispatchTable()->hipGetStreamDeviceId_fn(stream);
 }
+hipError_t hipExtGetLastError() { + return hip::GetHipDispatchTable()->hipExtGetLastError_fn(); +} diff --git a/hipamd/src/hip_vm.cpp b/hipamd/src/hip_vm.cpp index 2335c426bc..031c001834 100644 --- a/hipamd/src/hip_vm.cpp +++ b/hipamd/src/hip_vm.cpp @@ -288,6 +288,10 @@ hipError_t hipMemSetAccess(void* ptr, size_t size, const hipMemAccessDesc* desc, HIP_RETURN(hipErrorInvalidValue) } + if (desc[desc_idx].flags == hipMemAccessFlagsProtRead) { + HIP_RETURN(hipErrorInvalidValue) + } + auto& dev = g_devices[desc[desc_idx].location.id]; amd::Device::VmmAccess access_flags = static_cast(desc[desc_idx].flags); diff --git a/hipamd/src/hiprtc/CMakeLists.txt b/hipamd/src/hiprtc/CMakeLists.txt index b1745a02f5..6565e25ecd 100644 --- a/hipamd/src/hiprtc/CMakeLists.txt +++ b/hipamd/src/hiprtc/CMakeLists.txt @@ -168,7 +168,7 @@ generate_hiprtc_mcin("${HIPRTC_GEN_MCIN}" "${HIPRTC_GEN_PREPROCESSED}") # Note: second command appends define macros at build time. add_custom_command( OUTPUT ${HIPRTC_GEN_PREPROCESSED} - COMMAND $ -O3 --rocm-path=${PROJECT_SOURCE_DIR}/include/.. -std=c++17 -nogpulib --hip-version=${HIP_LIB_VERSION_MAJOR}.${HIP_LIB_VERSION_MINOR} -isystem ${HIP_COMMON_INCLUDE_DIR} -isystem ${PROJECT_SOURCE_DIR}/include -isystem ${PROJECT_BINARY_DIR}/include -isystem ${CMAKE_CURRENT_SOURCE_DIR}/include --cuda-device-only -D__HIPCC_RTC__ -DHIP_VERSION_MAJOR=${HIP_LIB_VERSION_MAJOR} -DHIP_VERSION_MINOR=${HIP_LIB_VERSION_MINOR} -x hip ${HIPRTC_GEN_HEADER} -E -o ${HIPRTC_GEN_PREPROCESSED} + COMMAND $ -O3 --rocm-path=${PROJECT_SOURCE_DIR}/include/.. -std=c++17 -nogpulib --hip-version=${HIP_LIB_VERSION_MAJOR}.${HIP_LIB_VERSION_MINOR} -isystem ${HIP_COMMON_INCLUDE_DIR} -isystem ${PROJECT_SOURCE_DIR}/include -isystem ${PROJECT_BINARY_DIR}/include -isystem ${CMAKE_CURRENT_SOURCE_DIR}/include --cuda-device-only -D__HIPCC_RTC__ -DHIP_VERSION_MAJOR=${HIP_LIB_VERSION_MAJOR} -DHIP_VERSION_MINOR=${HIP_LIB_VERSION_MINOR} -x hip ${HIPRTC_GEN_HEADER} -E -P -o ${HIPRTC_GEN_PREPROCESSED} COMMAND ${CMAKE_COMMAND} -DHIPRTC_ADD_MACROS=1 -DHIPRTC_HEADERS="${HIPRTC_HEADERS}" -DHIPRTC_PREPROCESSED_FILE=${HIPRTC_GEN_PREPROCESSED} -P ${HIPRTC_CMAKE} DEPENDS clang ${HIPRTC_GEN_HEADER}) add_custom_command( diff --git a/hipamd/src/hiprtc/hiprtcComgrHelper.cpp b/hipamd/src/hiprtc/hiprtcComgrHelper.cpp index debecbe910..d213afe0f5 100644 --- a/hipamd/src/hiprtc/hiprtcComgrHelper.cpp +++ b/hipamd/src/hiprtc/hiprtcComgrHelper.cpp @@ -585,6 +585,87 @@ bool createAction(amd_comgr_action_info_t& action, std::vector& opt return AMD_COMGR_STATUS_SUCCESS; } +bool compileToExecutable(const amd_comgr_data_set_t compileInputs, const std::string& isa, + std::vector& compileOptions, + std::vector& linkOptions, std::string& buildLog, + std::vector& exe) { + amd_comgr_language_t lang = AMD_COMGR_LANGUAGE_HIP; + amd_comgr_action_info_t action; + amd_comgr_data_set_t reloc; + amd_comgr_data_set_t output; + amd_comgr_data_set_t input = compileInputs; + + if (auto res = createAction(action, compileOptions, isa, lang); res != AMD_COMGR_STATUS_SUCCESS) { + return false; + } + + if (auto res = amd::Comgr::create_data_set(&reloc); res != AMD_COMGR_STATUS_SUCCESS) { + amd::Comgr::destroy_action_info(action); + return false; + } + + if (auto res = amd::Comgr::create_data_set(&output); res != AMD_COMGR_STATUS_SUCCESS) { + amd::Comgr::destroy_action_info(action); + amd::Comgr::destroy_data_set(reloc); + return false; + } + + if (auto res = amd::Comgr::do_action(AMD_COMGR_ACTION_COMPILE_SOURCE_TO_RELOCATABLE, action, + input, reloc); + res != 
AMD_COMGR_STATUS_SUCCESS) { + extractBuildLog(reloc, buildLog); + amd::Comgr::destroy_action_info(action); + amd::Comgr::destroy_data_set(reloc); + amd::Comgr::destroy_data_set(output); + return false; + } + + if (!extractBuildLog(reloc, buildLog)) { + amd::Comgr::destroy_action_info(action); + amd::Comgr::destroy_data_set(reloc); + amd::Comgr::destroy_data_set(output); + return false; + } + + amd::Comgr::destroy_action_info(action); + if (auto res = createAction(action, linkOptions, isa, lang); res != AMD_COMGR_STATUS_SUCCESS) { + amd::Comgr::destroy_action_info(action); + amd::Comgr::destroy_data_set(reloc); + amd::Comgr::destroy_data_set(output); + return false; + } + + if (auto res = amd::Comgr::do_action(AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE, action, + reloc, output); + res != AMD_COMGR_STATUS_SUCCESS) { + extractBuildLog(output, buildLog); + amd::Comgr::destroy_action_info(action); + amd::Comgr::destroy_data_set(output); + amd::Comgr::destroy_data_set(reloc); + return false; + } + + if (!extractBuildLog(output, buildLog)) { + amd::Comgr::destroy_action_info(action); + amd::Comgr::destroy_data_set(output); + amd::Comgr::destroy_data_set(reloc); + return false; + } + + if (!extractByteCodeBinary(output, AMD_COMGR_DATA_KIND_EXECUTABLE, exe)) { + amd::Comgr::destroy_action_info(action); + amd::Comgr::destroy_data_set(output); + amd::Comgr::destroy_data_set(reloc); + return false; + } + + // Clean up + amd::Comgr::destroy_action_info(action); + amd::Comgr::destroy_data_set(output); + amd::Comgr::destroy_data_set(reloc); + return true; +} + bool compileToBitCode(const amd_comgr_data_set_t compileInputs, const std::string& isa, std::vector& compileOptions, std::string& buildLog, std::vector& LLVMBitcode) { @@ -646,8 +727,7 @@ bool linkLLVMBitcode(const amd_comgr_data_set_t linkInputs, const std::string& i return false; } - if (auto res = - amd::Comgr::do_action(AMD_COMGR_ACTION_LINK_BC_TO_BC, action, linkInputs, output); + if (auto res = amd::Comgr::do_action(AMD_COMGR_ACTION_LINK_BC_TO_BC, action, linkInputs, output); res != AMD_COMGR_STATUS_SUCCESS) { amd::Comgr::destroy_action_info(action); amd::Comgr::destroy_data_set(output); @@ -915,9 +995,9 @@ bool fillMangledNames(std::vector& dataVec, std::map(it.first.data()); + char* data = const_cast(it.first.data()); if (auto res = amd::Comgr::map_name_expression_to_symbol_name(dataObject, &Size, data, NULL)) { amd::Comgr::release_data(dataObject); @@ -925,7 +1005,8 @@ bool fillMangledNames(std::vector& dataVec, std::map mName(new char[Size]()); - if (auto res = amd::Comgr::map_name_expression_to_symbol_name(dataObject, &Size, data, mName.get())) { + if (auto res = + amd::Comgr::map_name_expression_to_symbol_name(dataObject, &Size, data, mName.get())) { amd::Comgr::release_data(dataObject); return false; } diff --git a/hipamd/src/hiprtc/hiprtcComgrHelper.hpp b/hipamd/src/hiprtc/hiprtcComgrHelper.hpp index d34c9264c6..05e1c013d0 100644 --- a/hipamd/src/hiprtc/hiprtcComgrHelper.hpp +++ b/hipamd/src/hiprtc/hiprtcComgrHelper.hpp @@ -41,6 +41,10 @@ bool extractByteCodeBinary(const amd_comgr_data_set_t inDataSet, bool createAction(amd_comgr_action_info_t& action, std::vector& options, const std::string& isa, const amd_comgr_language_t lang = AMD_COMGR_LANGUAGE_NONE); +bool compileToExecutable(const amd_comgr_data_set_t compileInputs, const std::string& isa, + std::vector& compileOptions, + std::vector& linkOptions, std::string& buildLog, + std::vector& exe); bool compileToBitCode(const amd_comgr_data_set_t compileInputs, const std::string& 
isa, std::vector& compileOptions, std::string& buildLog, std::vector& LLVMBitcode); @@ -54,8 +58,8 @@ bool dumpIsaFromBC(const amd_comgr_data_set_t isaInputs, const std::string& isa, std::vector& exeOptions, std::string name, std::string& buildLog); bool demangleName(const std::string& mangledName, std::string& demangledName); std::string handleMangledName(std::string loweredName); -bool fillMangledNames(std::vector& executable, std::map& mangledNames, - bool isBitcode); +bool fillMangledNames(std::vector& executable, + std::map& mangledNames, bool isBitcode); void GenerateUniqueFileName(std::string& name); } // namespace helpers } // namespace hiprtc diff --git a/hipamd/src/hiprtc/hiprtcInternal.cpp b/hipamd/src/hiprtc/hiprtcInternal.cpp index 46a034c828..ac02aa1070 100644 --- a/hipamd/src/hiprtc/hiprtcInternal.cpp +++ b/hipamd/src/hiprtc/hiprtcInternal.cpp @@ -59,7 +59,8 @@ RTCProgram::RTCProgram(std::string name) : name_(name) { bool RTCProgram::findIsa() { const char* libName; #ifdef _WIN32 - libName = "amdhip64.dll"; + std::string dll_name = std::string("amdhip64_" + std::to_string(HIP_VERSION_MAJOR) + ".dll"); + libName = dll_name.c_str(); #else libName = "libamdhip64.so"; #endif @@ -116,7 +117,6 @@ bool RTCProgram::findIsa() { // RTC Compile Program Member Functions void RTCProgram::AppendOptions(const std::string app_env_var, std::vector* options) { - if (options == nullptr) { LogError("Append options passed is nullptr."); return; @@ -171,8 +171,6 @@ RTCCompileProgram::RTCCompileProgram(std::string name_) : RTCProgram(name_), fgp compile_options_.push_back("-fms-compatibility"); #endif AppendCompileOptions(); - - exe_options_.push_back("-O3"); } bool RTCCompileProgram::addSource(const std::string& source, const std::string& name) { @@ -261,10 +259,6 @@ bool RTCCompileProgram::transformOptions(std::vector& compile_optio i = "--offload-arch=" + val; continue; } - if (i == "--save-temps") { - settings_.dumpISA = true; - continue; - } } // Removed consumed options @@ -300,78 +294,28 @@ bool RTCCompileProgram::compile(const std::vector& options, bool fg compileOpts.reserve(compile_options_.size() + options.size() + 2); compileOpts.insert(compileOpts.end(), options.begin(), options.end()); - if (!fgpu_rdc_) { - compileOpts.push_back("-Xclang"); - compileOpts.push_back("-disable-llvm-passes"); - } - if (!transformOptions(compileOpts)) { LogError("Error in hiprtc: unable to transform options"); return false; } - if (!compileToBitCode(compile_input_, isa_, compileOpts, build_log_, LLVMBitcode_)) { - LogError("Error in hiprtc: unable to compile source to bitcode"); - return false; - } - - if (fgpu_rdc_ && !mangled_names_.empty()) { - if (!fillMangledNames(LLVMBitcode_, mangled_names_, true)) { - LogError("Error in hiprtc: unable to fill mangled names"); + if (fgpu_rdc_) { + if (!compileToBitCode(compile_input_, isa_, compileOpts, build_log_, LLVMBitcode_)) { + LogError("Error in hiprtc: unable to compile source to bitcode"); return false; } - - return true; - } - - std::string linkFileName = "linked"; - if (!addCodeObjData(link_input_, LLVMBitcode_, linkFileName, AMD_COMGR_DATA_KIND_BC)) { - LogError("Error in hiprtc: unable to add linked code object"); - return false; - } - - std::vector LinkedLLVMBitcode; - if (!linkLLVMBitcode(link_input_, isa_, link_options_, build_log_, LinkedLLVMBitcode)) { - LogError("Error in hiprtc: unable to add device libs to linked bitcode"); - return false; - } - - std::string linkedFileName = "LLVMBitcode.bc"; - if (!addCodeObjData(exec_input_, 
LinkedLLVMBitcode, linkedFileName, AMD_COMGR_DATA_KIND_BC)) {
-    LogError("Error in hiprtc: unable to add device libs linked code object");
-    return false;
-  }
-
-  std::vector exe_options;
-  // Find the options passed by the app which can be used during BC to Relocatable phase.
-  if (!findExeOptions(options, exe_options)) {
-    LogError("Error in hiprtc: unable to find executable options");
-    return false;
-  }
-
-  std::vector exeOpts(exe_options_);
-  exeOpts.reserve(exeOpts.size() + exe_options.size() + 2);
-  // Add these below options by default for optimizations during BC to Relocatable phase.
-  exeOpts.push_back("-mllvm");
-  exeOpts.push_back("-amdgpu-internalize-symbols");
-  // User provided options are appended at the end since they can override the above
-  // default options if necessary
-  exeOpts.insert(exeOpts.end(), exe_options.begin(), exe_options.end());
-
-  if (settings_.dumpISA) {
-    if (!dumpIsaFromBC(exec_input_, isa_, exeOpts, name_, build_log_)) {
-      LogError("Error in hiprtc: unable to dump isa code");
+  } else {
+    LogInfo("Using the direct comgr compile-to-executable path");
+    if (!compileToExecutable(compile_input_, isa_, compileOpts, link_options_, build_log_,
+                             executable_)) {
+      LogError("Error in hiprtc: unable to compile source to executable");
       return false;
     }
   }
-  if (!createExecutable(exec_input_, isa_, exeOpts, build_log_, executable_)) {
-    LogError("Error in hiprtc: unable to create executable");
-    return false;
-  }
-
   if (!mangled_names_.empty()) {
-    if (!fillMangledNames(executable_, mangled_names_, false)) {
+    auto& compile_step_output = fgpu_rdc_ ? LLVMBitcode_ : executable_;
+    if (!fillMangledNames(compile_step_output, mangled_names_, fgpu_rdc_)) {
       LogError("Error in hiprtc: unable to fill mangled names");
       return false;
     }
@@ -380,6 +324,7 @@ bool RTCCompileProgram::compile(const std::vector& options, bool fg
   return true;
 }
 
+
 void RTCCompileProgram::stripNamedExpression(std::string& strippedName) {
   if (strippedName.back() == ')') {
     strippedName.pop_back();
@@ -453,7 +398,6 @@ RTCLinkProgram::RTCLinkProgram(std::string name) : RTCProgram(name) {
 bool RTCLinkProgram::AddLinkerOptions(unsigned int num_options, hiprtcJIT_option* options_ptr,
                                       void** options_vals_ptr) {
   for (size_t opt_idx = 0; opt_idx < num_options; ++opt_idx) {
-
     switch (options_ptr[opt_idx]) {
       case HIPRTC_JIT_MAX_REGISTERS:
         link_args_.max_registers_ = *(reinterpret_cast(&options_vals_ptr[opt_idx]));
diff --git a/hipamd/src/hiprtc/hiprtcInternal.hpp b/hipamd/src/hiprtc/hiprtcInternal.hpp
index a1965d1b19..aae8b64f47 100644
--- a/hipamd/src/hiprtc/hiprtcInternal.hpp
+++ b/hipamd/src/hiprtc/hiprtcInternal.hpp
@@ -76,8 +76,9 @@ static amd::Monitor g_hiprtcInitlock{"hiprtcInit lock"};
 #define HIPRTC_INIT_API_INTERNAL(...) \
   amd::Thread* thread = amd::Thread::current(); \
   if (!VDI_CHECK_THREAD(thread)) { \
-    ClPrint(amd::LOG_NONE, amd::LOG_ALWAYS, "An internal error has occurred." \
-            " This may be due to insufficient memory."); \
+    ClPrint(amd::LOG_NONE, amd::LOG_ALWAYS, \
+            "An internal error has occurred."
\ + " This may be due to insufficient memory."); \ HIPRTC_RETURN(HIPRTC_ERROR_INTERNAL_ERROR); \ } \ amd::ScopedLock lock(g_hiprtcInitlock); \ @@ -107,7 +108,6 @@ static void crashWithMessage(std::string message) { } struct Settings { - bool dumpISA{false}; bool offloadArchProvided{false}; }; @@ -131,7 +131,6 @@ class RTCProgram { std::vector executable_; amd_comgr_data_set_t exec_input_; - std::vector exe_options_; }; class RTCCompileProgram : public RTCProgram { @@ -156,10 +155,8 @@ class RTCCompileProgram : public RTCProgram { bool addBuiltinHeader(); bool transformOptions(std::vector& compile_options); bool findExeOptions(const std::vector& options, - std::vector& exe_options); - void AppendCompileOptions() { - AppendOptions(HIPRTC_COMPILE_OPTIONS_APPEND, &compile_options_); - } + std::vector& exe_options); + void AppendCompileOptions() { AppendOptions(HIPRTC_COMPILE_OPTIONS_APPEND, &compile_options_); } RTCCompileProgram() = delete; RTCCompileProgram(RTCCompileProgram&) = delete; @@ -288,9 +285,7 @@ class RTCLinkProgram : public RTCProgram { bool AddLinkerData(void* image_ptr, size_t image_size, std::string link_file_name, hiprtcJITInputType input_type); bool LinkComplete(void** bin_out, size_t* size_out); - void AppendLinkerOptions() { - AppendOptions(HIPRTC_LINK_OPTIONS_APPEND, &link_options_); - } + void AppendLinkerOptions() { AppendOptions(HIPRTC_LINK_OPTIONS_APPEND, &link_options_); } }; // Thread Local Storage Variables Aggregator Class diff --git a/opencl/tests/ocltst/module/runtime/OCLPerfCounters.cpp b/opencl/tests/ocltst/module/runtime/OCLPerfCounters.cpp index 0e7de54e0d..84ac050c77 100644 --- a/opencl/tests/ocltst/module/runtime/OCLPerfCounters.cpp +++ b/opencl/tests/ocltst/module/runtime/OCLPerfCounters.cpp @@ -760,8 +760,10 @@ void OCLPerfCounters::run(void) { _wrapper->clSetDeviceClockModeAMD(global_device, setClockModeInput, &setClockModeOutput); - _wrapper->clGetPerfCounterInfoAMD(perfCounter, CL_PERFCOUNTER_DATA, + error_ = _wrapper->clGetPerfCounterInfoAMD(perfCounter, CL_PERFCOUNTER_DATA, sizeof(cl_ulong), &result, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, + "clGetPerfCounterInfoAMD failed (Hint (Linux): install hsa-amd-aqlprofile)\n"); err = _wrapper->clReleasePerfCounterAMD(perfCounter); CHECK_RESULT(err != CL_SUCCESS, "Release PerfCounter failed\n"); diff --git a/rocclr/cmake/ROCclrPAL.cmake b/rocclr/cmake/ROCclrPAL.cmake index 3cc497e0fa..da04463651 100644 --- a/rocclr/cmake/ROCclrPAL.cmake +++ b/rocclr/cmake/ROCclrPAL.cmake @@ -43,6 +43,8 @@ set(PAL_BUILD_NAVI31 ON) set(PAL_BUILD_NAVI32 ON) set(PAL_BUILD_NAVI33 ON) set(PAL_BUILD_PHOENIX1 ON) +set(PAL_BUILD_STRIX1 ON) +set(PAL_BUILD_STRIX_HALO ON) find_package(AMD_PAL) find_package(AMD_HSA_LOADER) diff --git a/rocclr/device/comgrctx.cpp b/rocclr/device/comgrctx.cpp index 67184ca944..1a8afe60a9 100644 --- a/rocclr/device/comgrctx.cpp +++ b/rocclr/device/comgrctx.cpp @@ -57,9 +57,11 @@ bool Comgr::LoadLib(bool is_versioned) { cep_.handle = Os::loadLibrary(comgr_lib_name); #endif } else { - static constexpr const char* comgr_lib_name = + std::string comgr_major_dll = "amd_comgr_" + + std::to_string(AMD_COMGR_INTERFACE_VERSION_MAJOR) + ".dll"; + static const char* comgr_lib_name = LP64_SWITCH(WINDOWS_SWITCH("amd_comgr32.dll", "libamd_comgr32.so.2"), - WINDOWS_SWITCH("amd_comgr.dll", "libamd_comgr.so.2")); + WINDOWS_SWITCH(comgr_major_dll.c_str(), "libamd_comgr.so.2")); cep_.handle = Os::loadLibrary(comgr_lib_name); } if (nullptr == cep_.handle) { diff --git a/rocclr/device/device.cpp b/rocclr/device/device.cpp 
index ac91003104..c52c2e0425 100644
--- a/rocclr/device/device.cpp
+++ b/rocclr/device/device.cpp
@@ -219,8 +219,8 @@ std::pair<const Isa*, const Isa*> Isa::supportedIsas() {
       {"gfx1101", "gfx1101", true, true, 11, 0, 1, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32},
       {"gfx1102", "gfx1102", true, true, 11, 0, 2, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32},
       {"gfx1103", "gfx1103", true, true, 11, 0, 3, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32},
-      {"gfx1150", "gfx1150", true, true, 11, 5, 0, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32},
-      {"gfx1151", "gfx1151", true, true, 11, 5, 1, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32},
+      {"gfx1150", "gfx1150", true, true, 11, 5, 0, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, //Strix
+      {"gfx1151", "gfx1151", true, true, 11, 5, 1, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, //Strix Halo
   };
   return std::make_pair(std::begin(supportedIsas_), std::end(supportedIsas_));
 }
@@ -855,7 +855,7 @@ bool Device::IpcCreate(void* dev_ptr, size_t* mem_size, void* handle, size_t* me
   auto dev_mem = static_cast(amd_mem_obj->getDeviceMemory(*this));
   auto result = dev_mem->ExportHandle(handle);
 
-  return true;
+  return result;
 }
 
 // ================================================================================================
diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp
index d00dfb4f6c..2dc7b09ee6 100644
--- a/rocclr/device/device.hpp
+++ b/rocclr/device/device.hpp
@@ -624,6 +624,8 @@ struct Info : public amd::EmbeddedObject {
   //! global CU mask which will be applied to all queues created on this device
   std::vector<uint32_t> globalCUMask_;
 
+  bool accelerator_;  //!< Accelerator or discrete graphics card.
+
   //! AQL Barrier Value Packet support
   bool aqlBarrierValue_;
 
diff --git a/rocclr/device/pal/paldevice.cpp b/rocclr/device/pal/paldevice.cpp
index 6f65ad422b..4d9df9d111 100644
--- a/rocclr/device/pal/paldevice.cpp
+++ b/rocclr/device/pal/paldevice.cpp
@@ -113,6 +113,8 @@ static constexpr PalDevice supportedPalDevices[] = {
     {11, 0, 3, Pal::GfxIpLevel::GfxIp11_0, "gfx1103", Pal::AsicRevision::Phoenix2},
     {11, 0, 3, Pal::GfxIpLevel::GfxIp11_0, "gfx1103", Pal::AsicRevision::HawkPoint1},
     {11, 0, 3, Pal::GfxIpLevel::GfxIp11_0, "gfx1103", Pal::AsicRevision::HawkPoint2},
+    {11, 5, 0, Pal::GfxIpLevel::GfxIp11_5, "gfx1150", Pal::AsicRevision::Strix1},
+    {11, 5, 1, Pal::GfxIpLevel::GfxIp11_5, "gfx1151", Pal::AsicRevision::StrixHalo},
 };
 
 static std::tuple findIsa(Pal::AsicRevision asicRevision,
@@ -2601,15 +2603,6 @@ bool Device::createBlitProgram() {
     }
   }
 
-  blitProgram_ = new BlitProgram(context_);
-  // Create blit programs
-  if (blitProgram_ == nullptr || !blitProgram_->create(this, extraBlits, ocl20)) {
-    delete blitProgram_;
-    blitProgram_ = nullptr;
-    LogError("Couldn't create blit kernels!");
-    result = false;
-  }
-
   if (settings().useLightning_) {
     const std::string TrapHandlerAsm = TrapHandlerCode;
     // Create a program for trap handler
@@ -2635,6 +2628,15 @@ bool Device::createBlitProgram() {
       DevLogPrintfError("Trap handler creation failed\n");
     }
   }
+
+  blitProgram_ = new BlitProgram(context_);
+  // Create blit programs
+  if (blitProgram_ == nullptr || !blitProgram_->create(this, extraBlits, ocl20)) {
+    delete blitProgram_;
+    blitProgram_ = nullptr;
+    LogError("Couldn't create blit kernels!");
+    result = false;
+  }
   return result;
 }
 
diff --git a/rocclr/device/pal/palsettings.cpp b/rocclr/device/pal/palsettings.cpp
index a5d08c50be..ee85a1cd87 100644
--- a/rocclr/device/pal/palsettings.cpp
+++ b/rocclr/device/pal/palsettings.cpp
@@ -170,6 +170,8 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
   amd::Os::getAppPathAndFileName(appName, appPathAndName);
 
   switch (palProp.revision) {
+    case Pal::AsicRevision::StrixHalo:
+    case Pal::AsicRevision::Strix1:
     // Fall through for Navi3x ...
     case Pal::AsicRevision::Navi33:
     case Pal::AsicRevision::Navi32:
diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp
index 3b6b4d4d54..8758c62913 100644
--- a/rocclr/device/rocm/rocdevice.cpp
+++ b/rocclr/device/rocm/rocdevice.cpp
@@ -1836,6 +1836,19 @@ bool Device::populateOCLDeviceConstants() {
         std::numeric_limits::max();  // gfx10+ does not share SGPRs between waves
   }
 
+  uint8_t memory_properties[8];
+  // Get the memory property from ROCr.
+  if (HSA_STATUS_SUCCESS != hsa_agent_get_info(bkendDevice_,
+                            (hsa_agent_info_t) HSA_AMD_AGENT_INFO_MEMORY_PROPERTIES,
+                            memory_properties)) {
+    LogError("HSA_AGENT_INFO_AMD_MEMORY_PROPERTIES query failed");
+  }
+
+  // Check if the device is APU
+  if (hsa_flag_isset64(memory_properties, HSA_AMD_MEMORY_PROPERTY_AGENT_IS_APU)) {
+    info_.accelerator_ = 1;
+  }
+
   return true;
 }
 
diff --git a/rocclr/device/rocm/rocmemory.cpp b/rocclr/device/rocm/rocmemory.cpp
index 2ef82fec0c..9887d23334 100644
--- a/rocclr/device/rocm/rocmemory.cpp
+++ b/rocclr/device/rocm/rocmemory.cpp
@@ -682,15 +682,12 @@ void Buffer::destroy() {
   }
 
   if (deviceMemory_ != nullptr) {
+    bool needUnlockHostMem = false;
     if (deviceMemory_ != owner()->getHostMem()) {
       // if they are identical, the host pointer will be
       // deallocated later on => avoid double deallocation
       if (isHostMemDirectAccess()) {
-        if (memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)) {
-          if (dev().agent_profile() != HSA_PROFILE_FULL) {
-            hsa_amd_memory_unlock(owner()->getHostMem());
-          }
-        }
+        needUnlockHostMem = true;
       } else {
         dev().memFree(deviceMemory_, size());
         const_cast<Device&>(dev()).updateFreeMemory(size(), true);
@@ -705,6 +702,15 @@
     } else if ((memFlags & CL_MEM_ALLOC_HOST_PTR) &&
                (owner()->getContext().devices().size() == 1)) {
       dev().hostFree(deviceMemory_, size());
+    } else if (isHostMemDirectAccess()) {
+      needUnlockHostMem = true;
+    }
+  }
+
+  if (needUnlockHostMem) {
+    if (memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)) {
+      if (dev().agent_profile() != HSA_PROFILE_FULL)
+        hsa_amd_memory_unlock(owner()->getHostMem());
     }
   }
 }
diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp
index 0046f130b3..42eb546eab 100644
--- a/rocclr/device/rocm/rocvirtual.cpp
+++ b/rocclr/device/rocm/rocvirtual.cpp
@@ -2857,50 +2857,43 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize)
   }
 
   uint64_t vqVA = reinterpret_cast<uint64_t>(vqMem->getDeviceMemory());
-  uint64_t pattern = 0;
-  amd::Coord3D origin(0, 0, 0);
-  amd::Coord3D region(virtualQueue_->getSize(), 1, 1);
-  if (!dev().xferMgr().fillBuffer(*vqMem, &pattern, sizeof(pattern), region, origin, region)) {
-    return false;
-  }
+  // Use shadow to prepare the data structure in host.
+  auto shadow = std::make_unique<uint8_t[]>(allocSize);
 
-  AmdVQueueHeader header = {};
+  std::memset(&shadow[0], 0, allocSize);
+
+  AmdVQueueHeader* header = reinterpret_cast<AmdVQueueHeader*>(&shadow[0]);
   // Initialize the virtual queue header
-  header.aql_slot_num = numSlots;
-  header.event_slot_num = dev().settings().numDeviceEvents_;
-  header.event_slot_mask = vqVA + eventMaskOffs;
-  header.event_slots = vqVA + eventsOffs;
-  header.aql_slot_mask = vqVA + slotMaskOffs;
-  header.wait_size = dev().settings().numWaitEvents_;
-  header.arg_size = dev().info().maxParameterSize_ + 64;
-  header.mask_groups = maskGroups_;
-
-  amd::Coord3D origin_header(0);
-  amd::Coord3D region_header(sizeof(AmdVQueueHeader));
-
-  if (!dev().xferMgr().writeBuffer(&header, *vqMem, origin_header, region_header)) {
-    return false;
-  }
+  header->aql_slot_num = numSlots;
+  header->event_slot_num = dev().settings().numDeviceEvents_;
+  header->event_slot_mask = vqVA + eventMaskOffs;
+  header->event_slots = vqVA + eventsOffs;
+  header->aql_slot_mask = vqVA + slotMaskOffs;
+  header->wait_size = dev().settings().numWaitEvents_;
+  header->arg_size = dev().info().maxParameterSize_ + 64;
+  header->mask_groups = maskGroups_;
 
   // Go over all slots and perform initialization
-  AmdAqlWrap slot = {};
   size_t offset = sizeof(AmdVQueueHeader);
   for (uint i = 0; i < numSlots; ++i) {
+    AmdAqlWrap * slot = reinterpret_cast<AmdAqlWrap*>(&shadow[0] + offset);
     uint64_t argStart = vqVA + argOffs + i * singleArgSize;
-    amd::Coord3D origin_slot(offset);
-    amd::Coord3D region_slot(sizeof(AmdAqlWrap));
-
-    slot.aql.kernarg_address = reinterpret_cast<void*>(argStart);
-    slot.wait_list = argStart + dev().info().maxParameterSize_ + 64;
-    if (!dev().xferMgr().writeBuffer(&slot, *vqMem, origin_slot, region_slot)) {
-      return false;
-    }
+    slot->aql.kernarg_address = reinterpret_cast<void*>(argStart);
+    slot->wait_list = argStart + dev().info().maxParameterSize_ + 64;
     offset += sizeof(AmdAqlWrap);
   }
 
+  amd::Coord3D origin (0, 0, 0);
+  amd::Coord3D region (allocSize, 1, 1);
+
+  // copy the data structure from host to GPU
+  if (!dev().xferMgr().writeBuffer(&shadow[0], *vqMem, origin, region)) {
+    return false;
+  }
+
   deviceQueueSize_ = deviceQueueSize;
   schedulerThreads_ = numSlots / (DeviceQueueMaskSize * maskGroups_);
 
@@ -3643,6 +3636,11 @@ void VirtualGPU::submitPerfCounter(amd::PerfCounterCommand& vcmd) {
   // one to get the profile object
   amd::PerfCounter* amdCounter = static_cast<amd::PerfCounter*>(counters[0]);
   PerfCounter* counter = static_cast<PerfCounter*>(amdCounter->getDeviceCounter());
+  if (counter == nullptr) {
+    LogError("Invalid Performance Counter");
+    vcmd.setStatus(CL_INVALID_OPERATION);
+    return;
+  }
   PerfCounterProfile* profileRef = counter->profileRef();
 
   // create the AQL packet for stop profiling
diff --git a/rocclr/utils/flags.hpp b/rocclr/utils/flags.hpp
index 51ebdd0d39..cd1f40c42d 100644
--- a/rocclr/utils/flags.hpp
+++ b/rocclr/utils/flags.hpp
@@ -245,6 +245,8 @@ release(cstring, HIPRTC_LINK_OPTIONS_APPEND, "", \
         "Set link options needed for hiprtc compilation")                                         \
 release(bool, HIP_VMEM_MANAGE_SUPPORT, true,                                                       \
         "Virtual Memory Management Support")                                                       \
+release(bool, DEBUG_HIP_GRAPH_DOT_PRINT, false,                                                    \
+        "Enable/Disable graph debug dot print dump")                                               \
 
 namespace amd {