diff --git a/.github/intel-llvm-mirror-base-commit b/.github/intel-llvm-mirror-base-commit
index fdb4d85658..92a6c05337 100644
--- a/.github/intel-llvm-mirror-base-commit
+++ b/.github/intel-llvm-mirror-base-commit
@@ -1 +1 @@
-3e95c0c70850b8b668116d9a491d25dd969c6329
+63c70a1425d2c91fa54ec6495aae8ecfa7a5a10c
diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp
index 32c3ea19b2..eb19063999 100644
--- a/source/adapters/level_zero/device.cpp
+++ b/source/adapters/level_zero/device.cpp
@@ -1139,9 +1139,31 @@ ur_result_t urDeviceGetInfo(
     return ReturnValue(Device->Platform->ZeBindlessImagesExtensionSupported &&
                        Device->ZeDeviceImageProperties->maxImageDims2D > 0);
   }
+  case UR_DEVICE_INFO_MAX_IMAGE_LINEAR_WIDTH_EXP: {
+    ze_device_image_properties_t imageProps = {};
+    imageProps.stype = ZE_STRUCTURE_TYPE_DEVICE_IMAGE_PROPERTIES;
+    ze_device_pitched_alloc_exp_properties_t imageAllocProps = {};
+    imageAllocProps.stype =
+        ZE_STRUCTURE_TYPE_PITCHED_ALLOC_DEVICE_EXP_PROPERTIES;
+    imageProps.pNext = (void *)&imageAllocProps;
+
+    ZE_CALL_NOCHECK(zeDeviceGetImageProperties, (ZeDevice, &imageProps));
+
+    return ReturnValue(imageAllocProps.maxImageLinearWidth);
+  }
+  case UR_DEVICE_INFO_MAX_IMAGE_LINEAR_HEIGHT_EXP: {
+    ze_device_image_properties_t imageProps = {};
+    imageProps.stype = ZE_STRUCTURE_TYPE_DEVICE_IMAGE_PROPERTIES;
+    ze_device_pitched_alloc_exp_properties_t imageAllocProps = {};
+    imageAllocProps.stype =
+        ZE_STRUCTURE_TYPE_PITCHED_ALLOC_DEVICE_EXP_PROPERTIES;
+    imageProps.pNext = (void *)&imageAllocProps;
+
+    ZE_CALL_NOCHECK(zeDeviceGetImageProperties, (ZeDevice, &imageProps));
+
+    return ReturnValue(imageAllocProps.maxImageLinearHeight);
+  }
   case UR_DEVICE_INFO_IMAGE_PITCH_ALIGN_EXP:
-  case UR_DEVICE_INFO_MAX_IMAGE_LINEAR_WIDTH_EXP:
-  case UR_DEVICE_INFO_MAX_IMAGE_LINEAR_HEIGHT_EXP:
   case UR_DEVICE_INFO_MAX_IMAGE_LINEAR_PITCH_EXP:
     UR_LOG(ERR, "Unsupported ParamName in urGetDeviceInfo");
     UR_LOG(ERR, "ParamName=%{}(0x{})", ParamName, logger::toHex(ParamName));
diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp
index d4203ab32a..5fecdc5b8f 100644
--- a/source/adapters/native_cpu/enqueue.cpp
+++ b/source/adapters/native_cpu/enqueue.cpp
@@ -52,17 +52,6 @@ struct NDRDescT {
 };
 } // namespace native_cpu
 
-#ifdef NATIVECPU_USE_OCK
-static native_cpu::state getResizedState(const native_cpu::NDRDescT &ndr,
-                                         size_t itemsPerThread) {
-  native_cpu::state resized_state(
-      ndr.GlobalSize[0], ndr.GlobalSize[1], ndr.GlobalSize[2], itemsPerThread,
-      ndr.LocalSize[1], ndr.LocalSize[2], ndr.GlobalOffset[0],
-      ndr.GlobalOffset[1], ndr.GlobalOffset[2]);
-  return resized_state;
-}
-#endif
-
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
     const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
@@ -112,6 +101,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   // TODO: add proper error checking
   native_cpu::NDRDescT ndr(workDim, pGlobalWorkOffset, pGlobalWorkSize,
                            pLocalWorkSize);
+  unsigned long long numWI;
+  auto umulll_overflow = [](unsigned long long a, unsigned long long b,
+                            unsigned long long *c) -> bool {
+#ifdef __GNUC__
+    return __builtin_umulll_overflow(a, b, c);
+#else
+    *c = a * b;
+    return a != 0 && b != *c / a;
+#endif
+  };
+  if (umulll_overflow(ndr.GlobalSize[0], ndr.GlobalSize[1], &numWI) ||
+      umulll_overflow(numWI, ndr.GlobalSize[2], &numWI) || numWI > SIZE_MAX) {
+    return UR_RESULT_ERROR_OUT_OF_RESOURCES;
+  }
+
   auto &tp = hQueue->getDevice()->tp;
   const size_t numParallelThreads = tp.num_threads();
   std::vector<std::future<void>> futures;
@@ -130,131 +134,56 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   auto kernel = std::make_unique<ur_kernel_handle_t_>(*hKernel);
   kernel->updateMemPool(numParallelThreads);
 
+  const size_t numWG = numWG0 * numWG1 * numWG2;
+  const size_t numWGPerThread = numWG / numParallelThreads;
+  const size_t remainderWG = numWG - numWGPerThread * numParallelThreads;
+  // The fourth value is the linearized value.
+  std::array<size_t, 4> rangeStart = {0, 0, 0, 0};
+  for (unsigned t = 0; t < numParallelThreads; ++t) {
+    auto rangeEnd = rangeStart;
+    rangeEnd[3] += numWGPerThread + (t < remainderWG);
+    if (rangeEnd[3] == rangeStart[3])
+      break;
+    rangeEnd[0] = rangeEnd[3] % numWG0;
+    rangeEnd[1] = (rangeEnd[3] / numWG0) % numWG1;
+    rangeEnd[2] = rangeEnd[3] / (numWG0 * numWG1);
+    futures.emplace_back(
+        tp.schedule_task([state, &kernel = *kernel, rangeStart,
+                          rangeEnd = rangeEnd[3], numWG0, numWG1,
 #ifndef NATIVECPU_USE_OCK
-  for (unsigned g2 = 0; g2 < numWG2; g2++) {
-    for (unsigned g1 = 0; g1 < numWG1; g1++) {
-      for (unsigned g0 = 0; g0 < numWG0; g0++) {
-        for (unsigned local2 = 0; local2 < ndr.LocalSize[2]; local2++) {
-          for (unsigned local1 = 0; local1 < ndr.LocalSize[1]; local1++) {
-            for (unsigned local0 = 0; local0 < ndr.LocalSize[0]; local0++) {
-              state.update(g0, g1, g2, local0, local1, local2);
-              kernel->_subhandler(kernel->getArgs(1, 0).data(), &state);
-            }
-          }
-        }
-      }
-    }
-  }
+                          localSize = ndr.LocalSize,
+#endif
+                          numParallelThreads](size_t threadId) mutable {
+          for (size_t g0 = rangeStart[0], g1 = rangeStart[1],
+                      g2 = rangeStart[2], g3 = rangeStart[3];
+               g3 < rangeEnd; ++g3) {
+#ifdef NATIVECPU_USE_OCK
+            state.update(g0, g1, g2);
+            kernel._subhandler(
+                kernel.getArgs(numParallelThreads, threadId).data(), &state);
 #else
-  bool isLocalSizeOne =
-      ndr.LocalSize[0] == 1 && ndr.LocalSize[1] == 1 && ndr.LocalSize[2] == 1;
-  if (isLocalSizeOne && ndr.GlobalSize[0] > numParallelThreads &&
-      !kernel->hasLocalArgs()) {
-    // If the local size is one, we make the assumption that we are running a
-    // parallel_for over a sycl::range.
-    // Todo: we could add more compiler checks and
-    // kernel properties for this (e.g. check that no barriers are called).
-
-    // Todo: this assumes that dim 0 is the best dimension over which we want to
-    // parallelize
-
-    // Since we also vectorize the kernel, and vectorization happens within the
-    // work group loop, it's better to have a large-ish local size. We can
-    // divide the global range by the number of threads, set that as the local
-    // size and peel everything else.
-
-    size_t new_num_work_groups_0 = numParallelThreads;
-    size_t itemsPerThread = ndr.GlobalSize[0] / numParallelThreads;
-
-    for (unsigned g2 = 0; g2 < numWG2; g2++) {
-      for (unsigned g1 = 0; g1 < numWG1; g1++) {
-        for (unsigned g0 = 0; g0 < new_num_work_groups_0; g0 += 1) {
-          futures.emplace_back(tp.schedule_task(
-              [ndr, itemsPerThread, &kernel = *kernel, g0, g1, g2](size_t) {
-                native_cpu::state resized_state =
-                    getResizedState(ndr, itemsPerThread);
-                resized_state.update(g0, g1, g2);
-                kernel._subhandler(kernel.getArgs().data(), &resized_state);
-              }));
-        }
-        // Peel the remaining work items. Since the local size is 1, we iterate
-        // over the work groups.
-        for (unsigned g0 = new_num_work_groups_0 * itemsPerThread; g0 < numWG0;
-             g0++) {
-          state.update(g0, g1, g2);
-          kernel->_subhandler(kernel->getArgs().data(), &state);
-        }
-      }
-    }
-
-  } else {
-    // We are running a parallel_for over an nd_range
-
-    if (numWG1 * numWG2 >= numParallelThreads) {
-      // Dimensions 1 and 2 have enough work, split them across the threadpool
-      for (unsigned g2 = 0; g2 < numWG2; g2++) {
-        for (unsigned g1 = 0; g1 < numWG1; g1++) {
-          futures.emplace_back(
-              tp.schedule_task([state, &kernel = *kernel, numWG0, g1, g2,
-                                numParallelThreads](size_t threadId) mutable {
-                for (unsigned g0 = 0; g0 < numWG0; g0++) {
-                  state.update(g0, g1, g2);
+            for (size_t local2 = 0; local2 < localSize[2]; ++local2) {
+              for (size_t local1 = 0; local1 < localSize[1]; ++local1) {
+                for (size_t local0 = 0; local0 < localSize[0]; ++local0) {
+                  state.update(g0, g1, g2, local0, local1, local2);
                   kernel._subhandler(
                       kernel.getArgs(numParallelThreads, threadId).data(),
                       &state);
                 }
-              }));
-        }
-      }
-    } else {
-      // Split dimension 0 across the threadpool
-      // Here we try to create groups of workgroups in order to reduce
-      // synchronization overhead
-      for (unsigned g2 = 0; g2 < numWG2; g2++) {
-        for (unsigned g1 = 0; g1 < numWG1; g1++) {
-          for (unsigned g0 = 0; g0 < numWG0; g0++) {
-            groups.push_back([state, g0, g1, g2, numParallelThreads](
-                                 size_t threadId,
-                                 ur_kernel_handle_t_ &kernel) mutable {
-              state.update(g0, g1, g2);
-              kernel._subhandler(
-                  kernel.getArgs(numParallelThreads, threadId).data(), &state);
-            });
-          }
-        }
-      }
-      auto numGroups = groups.size();
-      auto groupsPerThread = numGroups / numParallelThreads;
-      if (groupsPerThread) {
-        for (unsigned thread = 0; thread < numParallelThreads; thread++) {
-          futures.emplace_back(
-              tp.schedule_task([groups, thread, groupsPerThread,
-                                &kernel = *kernel](size_t threadId) {
-                for (unsigned i = 0; i < groupsPerThread; i++) {
-                  auto index = thread * groupsPerThread + i;
-                  groups[index](threadId, kernel);
-                }
-              }));
-        }
-      }
-
-      // schedule the remaining tasks
-      auto remainder = numGroups % numParallelThreads;
-      if (remainder) {
-        futures.emplace_back(
-            tp.schedule_task([groups, remainder,
-                              scheduled = numParallelThreads * groupsPerThread,
-                              &kernel = *kernel](size_t threadId) {
-              for (unsigned i = 0; i < remainder; i++) {
-                auto index = scheduled + i;
-                groups[index](threadId, kernel);
               }
-            }));
-      }
-    }
+            }
+#endif
+            if (++g0 == numWG0) {
+              g0 = 0;
+              if (++g1 == numWG1) {
+                g1 = 0;
+                ++g2;
+              }
+            }
+          }
+        }));
+    rangeStart = rangeEnd;
   }
-
-#endif // NATIVECPU_USE_OCK
 
   event->set_futures(futures);
 
   if (phEvent) {
diff --git a/source/adapters/offload/enqueue.cpp b/source/adapters/offload/enqueue.cpp
index 62b91f82af..d35d2bd40e 100644
--- a/source/adapters/offload/enqueue.cpp
+++ b/source/adapters/offload/enqueue.cpp
@@ -74,7 +74,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
       hKernel->Args.getStorageSize(), &LaunchArgs, &EventOut));
 
   if (phEvent) {
-    auto *Event = new ur_event_handle_t_();
+    auto *Event = new ur_event_handle_t_(UR_COMMAND_KERNEL_LAUNCH, hQueue);
     Event->OffloadEvent = EventOut;
     *phEvent = Event;
   }
@@ -94,10 +94,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D(
 }
 
 namespace {
-ur_result_t doMemcpy(ur_queue_handle_t hQueue, void *DestPtr,
-                     ol_device_handle_t DestDevice, const void *SrcPtr,
-                     ol_device_handle_t SrcDevice, size_t size, bool blocking,
-                     uint32_t numEventsInWaitList,
+ur_result_t doMemcpy(ur_command_t Command, ur_queue_handle_t hQueue,
+                     void *DestPtr, ol_device_handle_t DestDevice,
+                     const void *SrcPtr, ol_device_handle_t SrcDevice,
+                     size_t size, bool blocking, uint32_t numEventsInWaitList,
                      const ur_event_handle_t *phEventWaitList,
                      ur_event_handle_t *phEvent) {
   // Ignore wait list for now
@@ -111,11 +111,11 @@ ur_result_t doMemcpy(ur_queue_handle_t hQueue, void *DestPtr,
                            SrcDevice, size, phEvent ? &EventOut : nullptr));
 
   if (blocking) {
-    OL_RETURN_ON_ERR(olWaitQueue(hQueue->OffloadQueue));
+    OL_RETURN_ON_ERR(olSyncQueue(hQueue->OffloadQueue));
   }
 
   if (phEvent) {
-    auto *Event = new ur_event_handle_t_();
+    auto *Event = new ur_event_handle_t_(Command, hQueue);
     Event->OffloadEvent = EventOut;
     *phEvent = Event;
   }
@@ -131,8 +131,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead(
   char *DevPtr =
      reinterpret_cast<char *>(std::get<BufferMem>(hBuffer->Mem).Ptr);
 
-  return doMemcpy(hQueue, pDst, Adapter->HostDevice, DevPtr + offset,
-                  hQueue->OffloadDevice, size, blockingRead,
+  return doMemcpy(UR_COMMAND_MEM_BUFFER_READ, hQueue, pDst, Adapter->HostDevice,
+                  DevPtr + offset, hQueue->OffloadDevice, size, blockingRead,
                   numEventsInWaitList, phEventWaitList, phEvent);
 }
 
@@ -143,9 +143,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite(
   char *DevPtr =
      reinterpret_cast<char *>(std::get<BufferMem>(hBuffer->Mem).Ptr);
 
-  return doMemcpy(hQueue, DevPtr + offset, hQueue->OffloadDevice, pSrc,
-                  Adapter->HostDevice, size, blockingWrite, numEventsInWaitList,
-                  phEventWaitList, phEvent);
+  return doMemcpy(UR_COMMAND_MEM_BUFFER_WRITE, hQueue, DevPtr + offset,
+                  hQueue->OffloadDevice, pSrc, Adapter->HostDevice, size,
+                  blockingWrite, numEventsInWaitList, phEventWaitList, phEvent);
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead(
@@ -159,10 +159,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead(
     return Err;
   }
 
-  return doMemcpy(hQueue, pDst, Adapter->HostDevice,
-                  reinterpret_cast<const char *>(Ptr) + offset,
-                  hQueue->OffloadDevice, count, blockingRead,
-                  numEventsInWaitList, phEventWaitList, phEvent);
+  return doMemcpy(
+      UR_COMMAND_DEVICE_GLOBAL_VARIABLE_READ, hQueue, pDst, Adapter->HostDevice,
+      reinterpret_cast<const char *>(Ptr) + offset, hQueue->OffloadDevice,
+      count, blockingRead, numEventsInWaitList, phEventWaitList, phEvent);
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite(
@@ -176,18 +176,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite(
     return Err;
   }
 
-  return doMemcpy(hQueue, reinterpret_cast<char *>(Ptr) + offset,
-                  hQueue->OffloadDevice, pSrc, Adapter->HostDevice, count,
-                  blockingWrite, numEventsInWaitList, phEventWaitList, phEvent);
+  return doMemcpy(UR_COMMAND_DEVICE_GLOBAL_VARIABLE_WRITE, hQueue,
+                  reinterpret_cast<char *>(Ptr) + offset, hQueue->OffloadDevice,
+                  pSrc, Adapter->HostDevice, count, blockingWrite,
+                  numEventsInWaitList, phEventWaitList, phEvent);
 }
 
-ur_result_t enqueueNoOp(ur_queue_handle_t hQueue, ur_event_handle_t *phEvent) {
+ur_result_t enqueueNoOp(ur_command_t Type, ur_queue_handle_t hQueue,
+                        ur_event_handle_t *phEvent) {
   // This path is a no-op, but we can't output a real event because
   // Offload doesn't currently support creating arbitrary events, and we
   // don't know the last real event in the queue. Instead we just have to
   // wait on the whole queue and then return an empty (implicitly
   // finished) event.
-  *phEvent = ur_event_handle_t_::createEmptyEvent();
+  *phEvent = ur_event_handle_t_::createEmptyEvent(Type, hQueue);
   return urQueueFinish(hQueue);
 }
 
@@ -221,7 +223,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
     }
 
     if (phEvent) {
-      enqueueNoOp(hQueue, phEvent);
+      enqueueNoOp(UR_COMMAND_MEM_BUFFER_MAP, hQueue, phEvent);
     }
   }
   *ppRetMap = MapPtr;
@@ -255,7 +257,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(
     }
 
     if (phEvent) {
-      enqueueNoOp(hQueue, phEvent);
+      enqueueNoOp(UR_COMMAND_MEM_UNMAP, hQueue, phEvent);
     }
   }
   BufferImpl.unmap(pMappedPtr);
diff --git a/source/adapters/offload/event.cpp b/source/adapters/offload/event.cpp
index b4bbce8034..aab41ed3d2 100644
--- a/source/adapters/offload/event.cpp
+++ b/source/adapters/offload/event.cpp
@@ -12,6 +12,7 @@
 #include 
 
 #include "event.hpp"
+#include "queue.hpp"
 #include "ur2offload.hpp"
 
 UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo(ur_event_handle_t hEvent,
@@ -22,6 +23,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo(ur_event_handle_t hEvent,
   UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);
 
   switch (propName) {
+  case UR_EVENT_INFO_CONTEXT:
+    return ReturnValue(hEvent->UrQueue->UrContext);
+  case UR_EVENT_INFO_COMMAND_QUEUE:
+    return ReturnValue(hEvent->UrQueue);
+  case UR_EVENT_INFO_COMMAND_TYPE:
+    return ReturnValue(hEvent->Type);
   case UR_EVENT_INFO_REFERENCE_COUNT:
     return ReturnValue(hEvent->RefCount.load());
   default:
@@ -43,7 +50,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventWait(uint32_t numEvents,
                                                 const ur_event_handle_t *phEventWaitList) {
   for (uint32_t i = 0; i < numEvents; i++) {
     if (phEventWaitList[i]->OffloadEvent) {
-      OL_RETURN_ON_ERR(olWaitEvent(phEventWaitList[i]->OffloadEvent));
+      OL_RETURN_ON_ERR(olSyncEvent(phEventWaitList[i]->OffloadEvent));
     }
   }
   return UR_RESULT_SUCCESS;
 }
@@ -61,9 +68,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) {
     if (Res) {
       return offloadResultToUR(Res);
     }
+    delete hEvent;
   }
-  delete hEvent;
 
   return UR_RESULT_SUCCESS;
 }
 
diff --git a/source/adapters/offload/event.hpp b/source/adapters/offload/event.hpp
index 642c61e532..42860e68be 100644
--- a/source/adapters/offload/event.hpp
+++ b/source/adapters/offload/event.hpp
@@ -18,9 +18,14 @@
 struct ur_event_handle_t_ : RefCounted {
   ol_event_handle_t OffloadEvent;
   ur_command_t Type;
+  ur_queue_handle_t UrQueue;
 
-  static ur_event_handle_t createEmptyEvent() {
-    auto *Event = new ur_event_handle_t_();
+  ur_event_handle_t_(ur_command_t Type, ur_queue_handle_t Queue)
+      : Type(Type), UrQueue(Queue) {}
+
+  static ur_event_handle_t createEmptyEvent(ur_command_t Type,
+                                            ur_queue_handle_t Queue) {
+    auto *Event = new ur_event_handle_t_(Type, Queue);
     // Null event represents an empty event. Waiting on it is a no-op.
     Event->OffloadEvent = nullptr;
diff --git a/source/adapters/offload/queue.cpp b/source/adapters/offload/queue.cpp
index 57a10fafa0..b2e28ddd70 100644
--- a/source/adapters/offload/queue.cpp
+++ b/source/adapters/offload/queue.cpp
@@ -17,9 +17,10 @@
 #include "queue.hpp"
 #include "ur2offload.hpp"
 
-UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate(
-    [[maybe_unused]] ur_context_handle_t hContext, ur_device_handle_t hDevice,
-    const ur_queue_properties_t *, ur_queue_handle_t *phQueue) {
+UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate(ur_context_handle_t hContext,
+                                                  ur_device_handle_t hDevice,
+                                                  const ur_queue_properties_t *,
+                                                  ur_queue_handle_t *phQueue) {
 
   assert(hContext->Device == hDevice);
 
@@ -31,6 +32,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate(
   }
 
   Queue->OffloadDevice = hDevice->OffloadDevice;
+  Queue->UrContext = hContext;
 
   *phQueue = Queue;
 
@@ -69,7 +71,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) {
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(ur_queue_handle_t hQueue) {
-  return offloadResultToUR(olWaitQueue(hQueue->OffloadQueue));
+  return offloadResultToUR(olSyncQueue(hQueue->OffloadQueue));
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle(
diff --git a/source/adapters/offload/queue.hpp b/source/adapters/offload/queue.hpp
index 6afe4bf150..e9c642f528 100644
--- a/source/adapters/offload/queue.hpp
+++ b/source/adapters/offload/queue.hpp
@@ -18,4 +18,5 @@
 struct ur_queue_handle_t_ : RefCounted {
   ol_queue_handle_t OffloadQueue;
   ol_device_handle_t OffloadDevice;
+  ur_context_handle_t UrContext;
 };
diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index b7a5abac1e..4f20f4a29e 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -42,11 +42,11 @@ if (NOT DEFINED UMF_REPO)
 endif()
 
 if (NOT DEFINED UMF_TAG)
-    # commit c02a5525bd6165e137129f7c5691e0faf14b7ceb
+    # commit 1de269c00e46b7cbdbafa2247812c8c4bb4ed4a5
     # Author: Ɓukasz Stolarczuk
-    # Date:   Wed Jul 16 11:04:14 2025 +0200
-    # 1.0.0-rc2 release
-    set(UMF_TAG v1.0.0-rc2)
+    # Date:   Mon Jul 21 15:42:59 2025 +0200
+    # 1.0.0 release
+    set(UMF_TAG v1.0.0)
 endif()
 
 message(STATUS "Will fetch Unified Memory Framework from ${UMF_REPO}")
diff --git a/source/loader/CMakeLists.txt b/source/loader/CMakeLists.txt
index a36ff4f698..15dc3127d3 100644
--- a/source/loader/CMakeLists.txt
+++ b/source/loader/CMakeLists.txt
@@ -211,21 +211,10 @@ if(UR_ENABLE_SANITIZER)
     # In in-tree build, if LLVM is built with libc++, we also need to build
     # symbolizer.cpp with libc++ abi and link libc++ in.
     if(NOT UR_STANDALONE_BUILD AND LLVM_LIBCXX_USED)
-        execute_process(
-            COMMAND ${CMAKE_CXX_COMPILER} --print-file-name=libc++.a
-            OUTPUT_VARIABLE LIBCXX_PATH
-            OUTPUT_STRIP_TRAILING_WHITESPACE)
-        execute_process(
-            COMMAND ${CMAKE_CXX_COMPILER} --print-file-name=libc++abi.a
-            OUTPUT_VARIABLE LIBCXX_ABI_PATH
-            OUTPUT_STRIP_TRAILING_WHITESPACE)
         set_property(SOURCE ${symbolizer_sources} APPEND_STRING PROPERTY
             COMPILE_FLAGS " -stdlib=libc++ ")
-        if(NOT EXISTS ${LIBCXX_PATH} OR NOT EXISTS ${LIBCXX_ABI_PATH})
-            message(FATAL_ERROR "libc++ is required but can't find the libraries")
-        endif()
         # Link with gcc_s fisrt to avoid some symbols resolve to libc++/libc++abi/libunwind's one
         target_link_libraries(ur_loader PRIVATE
             gcc_s ${LIBCXX_PATH} ${LIBCXX_ABI_PATH})
     endif()
diff --git a/source/loader/layers/sanitizer/msan/msan_libdevice.hpp b/source/loader/layers/sanitizer/msan/msan_libdevice.hpp
index 575655fe99..d57d0c8553 100644
--- a/source/loader/layers/sanitizer/msan/msan_libdevice.hpp
+++ b/source/loader/layers/sanitizer/msan/msan_libdevice.hpp
@@ -14,6 +14,7 @@
 #pragma once
 
 #include "sanitizer_common/sanitizer_libdevice.hpp"
+#include 
 
 #if !defined(__SPIR__) && !defined(__SPIRV__)
 namespace ur_sanitizer_layer {
diff --git a/test/conformance/event/urEventGetProfilingInfo.cpp b/test/conformance/event/urEventGetProfilingInfo.cpp
index e4f3d1f894..87b02ca9af 100644
--- a/test/conformance/event/urEventGetProfilingInfo.cpp
+++ b/test/conformance/event/urEventGetProfilingInfo.cpp
@@ -5,6 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 #include "fixtures.h"
+#include "uur/checks.h"
 #include "uur/known_failure.h"
 
 using urEventGetProfilingInfoTest = uur::event::urEventTest;
@@ -179,8 +180,9 @@ TEST_P(urEventGetProfilingInfoTest, InvalidValue) {
   const ur_profiling_info_t property_name = UR_PROFILING_INFO_COMMAND_QUEUED;
   size_t property_size = 0;
 
-  ASSERT_SUCCESS(urEventGetProfilingInfo(event, property_name, 0, nullptr,
-                                         &property_size));
+  ASSERT_SUCCESS_OR_OPTIONAL_QUERY(
+      urEventGetProfilingInfo(event, property_name, 0, nullptr, &property_size),
+      property_name);
   ASSERT_NE(property_size, 0);
 
   uint64_t property_value = 0;
@@ -221,8 +223,10 @@ UUR_INSTANTIATE_DEVICE_TEST_SUITE(urEventGetProfilingInfoForWaitWithBarrier);
 
 TEST_P(urEventGetProfilingInfoForWaitWithBarrier, Success) {
   uint64_t submit_value = 0;
-  ASSERT_SUCCESS(urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_START,
-                                         size, &submit_value, nullptr));
+  ASSERT_SUCCESS_OR_OPTIONAL_QUERY(
+      urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_START, size,
+                              &submit_value, nullptr),
+      UR_PROFILING_INFO_COMMAND_START);
   ASSERT_NE(submit_value, 0);
 
   uint64_t complete_value = 0;
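
Note on the native_cpu overflow guard above: the launch is rejected when the product of the three global sizes overflows an unsigned long long. A minimal standalone sketch of the same check, not the adapter's code itself — the helper name and the sizes are illustrative; __builtin_umulll_overflow is the GCC/Clang builtin the hunk relies on, with the same portable division-based fallback:

#include <cstddef>
#include <cstdio>
#include <limits>

// Returns true if a * b overflows unsigned long long; *c receives the
// (possibly wrapped) product. Mirrors the lambda in the hunk above.
static bool umulllOverflow(unsigned long long a, unsigned long long b,
                           unsigned long long *c) {
#ifdef __GNUC__
  return __builtin_umulll_overflow(a, b, c);
#else
  *c = a * b;
  // If a != 0 and dividing the product by a does not give b back, it wrapped.
  return a != 0 && b != *c / a;
#endif
}

int main() {
  // 2^20 * 2^20 * 2^24 = 2^64 work-items: wraps a 64-bit unsigned long long.
  unsigned long long global[3] = {1ULL << 20, 1ULL << 20, 1ULL << 24};
  unsigned long long numWI;
  if (umulllOverflow(global[0], global[1], &numWI) ||
      umulllOverflow(numWI, global[2], &numWI) ||
      numWI > std::numeric_limits<std::size_t>::max()) {
    std::puts("launch rejected: total work-item count overflows");
    return 1;
  }
  std::printf("total work-items: %llu\n", numWI);
}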
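
Note on the new native_cpu scheduling loop: the linearized work-group range [0, numWG) is split into one contiguous chunk per thread, with the first remainderWG threads taking one extra group, and the chunk's 3-D start coordinate is decoded from the linear index. A minimal sketch of that partitioning under assumed, illustrative sizes (the real code hands each chunk to tp.schedule_task):

#include <array>
#include <cstddef>
#include <cstdio>

int main() {
  const size_t numWG0 = 5, numWG1 = 3, numWG2 = 2; // illustrative WG counts
  const size_t numThreads = 4;                     // illustrative pool size
  const size_t numWG = numWG0 * numWG1 * numWG2;   // 30 work-groups total
  const size_t perThread = numWG / numThreads;     // 7 per thread
  const size_t remainder = numWG - perThread * numThreads; // 2 left over

  // Indices 0..2 are the 3-D coordinates, index 3 the linearized value,
  // matching the std::array<size_t, 4> in the hunk above.
  std::array<size_t, 4> rangeStart = {0, 0, 0, 0};
  for (size_t t = 0; t < numThreads; ++t) {
    auto rangeEnd = rangeStart;
    rangeEnd[3] += perThread + (t < remainder); // early threads take the slack
    if (rangeEnd[3] == rangeStart[3])
      break; // more threads than work-groups: nothing left to hand out
    // Decode the linear end index back into (g0, g1, g2).
    rangeEnd[0] = rangeEnd[3] % numWG0;
    rangeEnd[1] = (rangeEnd[3] / numWG0) % numWG1;
    rangeEnd[2] = rangeEnd[3] / (numWG0 * numWG1);
    std::printf("thread %zu: linear [%zu, %zu) starting at (%zu, %zu, %zu)\n",
                t, rangeStart[3], rangeEnd[3], rangeStart[0], rangeStart[1],
                rangeStart[2]);
    // Each scheduled task then walks g3 from rangeStart[3] to rangeEnd[3],
    // carrying g0/g1/g2 forward like a 3-digit odometer, as the lambda does.
    rangeStart = rangeEnd;
  }
}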