diff --git a/sycl/cmake/modules/FetchUnifiedRuntime.cmake b/sycl/cmake/modules/FetchUnifiedRuntime.cmake index 72841724fa01d..202bfdcdc19e3 100644 --- a/sycl/cmake/modules/FetchUnifiedRuntime.cmake +++ b/sycl/cmake/modules/FetchUnifiedRuntime.cmake @@ -116,7 +116,7 @@ if(SYCL_UR_USE_FETCH_CONTENT) CACHE PATH "Path to external '${name}' adapter source dir" FORCE) endfunction() - set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") + set(UNIFIED_RUNTIME_REPO "https://github.com/Bensuo/unified-runtime.git") include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/UnifiedRuntimeTag.cmake) set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES") diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index 4c07083d355c7..91a797c4e6b85 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -6,4 +6,4 @@ # Merge pull request #2578 from Bensuo/ewan/remove_command_ref_counting # # Remove command-buffer command handle ref counting -set(UNIFIED_RUNTIME_TAG 14f4a3ba70b91b3adc411ec6bfc8ae86e948a990) +set(UNIFIED_RUNTIME_TAG "ewan/native_command") diff --git a/sycl/include/sycl/detail/backend_traits_cuda.hpp b/sycl/include/sycl/detail/backend_traits_cuda.hpp index 89bef47d01a4b..9a4df94693329 100644 --- a/sycl/include/sycl/detail/backend_traits_cuda.hpp +++ b/sycl/include/sycl/detail/backend_traits_cuda.hpp @@ -24,6 +24,7 @@ typedef struct CUctx_st *CUcontext; typedef struct CUstream_st *CUstream; typedef struct CUevent_st *CUevent; typedef struct CUmod_st *CUmodule; +typedef struct CUgraph_st *CUgraph; // As defined in the CUDA 10.1 header file. 
This requires CUDA version > 3.2 #if defined(_WIN64) || defined(__LP64__) @@ -102,6 +103,16 @@ template <> struct BackendReturn { using type = CUstream; }; +using graph = ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::modifiable>; +template <> struct BackendInput { + using type = CUgraph; +}; + +template <> struct BackendReturn { + using type = CUgraph; +}; + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index e2e87c30ea945..6401bdbbab463 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -114,7 +114,8 @@ enum class node_type { prefetch = 6, memadvise = 7, ext_oneapi_barrier = 8, - host_task = 9 + host_task = 9, + native_command = 10 }; /// Class representing a node in the graph, returned by command_graph::add(). diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 552c6afe195be..b03e1698acacf 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -1934,9 +1934,6 @@ class __SYCL_EXPORT handler { void(interop_handle)>::value> ext_codeplay_enqueue_native_command([[maybe_unused]] FuncT &&Func) { #ifndef __SYCL_DEVICE_ONLY__ - throwIfGraphAssociated< - ext::oneapi::experimental::detail::UnsupportedGraphFeatures:: - sycl_ext_codeplay_enqueue_native_command>(); ext_codeplay_enqueue_native_command_impl(Func); #endif } diff --git a/sycl/include/sycl/interop_handle.hpp b/sycl/include/sycl/interop_handle.hpp index 2e7408cf5c0f9..4415bdb2402ce 100644 --- a/sycl/include/sycl/interop_handle.hpp +++ b/sycl/include/sycl/interop_handle.hpp @@ -49,6 +49,9 @@ class interop_handle { /// interop_handle. 
__SYCL_EXPORT backend get_backend() const noexcept; + /// Returns true if command-group is being added to a graph as a node + __SYCL_EXPORT bool has_graph() const noexcept; + /// Receives a SYCL accessor that has been defined as a requirement for the /// command group, and returns the underlying OpenCL memory object that is /// used by the SYCL runtime. If the accessor passed as parameter is not part @@ -134,6 +137,26 @@ class interop_handle { #endif } + using graph = ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::modifiable>; + template + backend_return_t get_native_graph() const { +#ifndef __SYCL_DEVICE_ONLY__ + // TODO: replace the exception thrown below with the SYCL 2020 exception + // with the error code 'errc::backend_mismatch' when those new exceptions + // are ready to be used. + if (Backend != get_backend()) + throw exception(make_error_code(errc::invalid), + "Incorrect backend argument was passed"); + + // C-style cast required to allow various native types + return (backend_return_t)getNativeGraph(); +#else + // we believe this won't be ever called on device side + return 0; +#endif + } + /// Returns the SYCL application interoperability native backend object /// associated with the device associated with the SYCL queue that the host /// task was submitted to. 
The native backend object returned must be in @@ -186,8 +209,9 @@ class interop_handle { interop_handle(std::vector MemObjs, const std::shared_ptr &Queue, const std::shared_ptr &Device, - const std::shared_ptr &Context) - : MQueue(Queue), MDevice(Device), MContext(Context), + const std::shared_ptr &Context, + const ur_exp_command_buffer_handle_t &Graph) + : MQueue(Queue), MDevice(Device), MContext(Context), MGraph(Graph), MMemObjs(std::move(MemObjs)) {} template @@ -211,10 +235,12 @@ class interop_handle { getNativeQueue(int32_t &NativeHandleDesc) const; __SYCL_EXPORT ur_native_handle_t getNativeDevice() const; __SYCL_EXPORT ur_native_handle_t getNativeContext() const; + __SYCL_EXPORT ur_native_handle_t getNativeGraph() const; std::shared_ptr MQueue; std::shared_ptr MDevice; std::shared_ptr MContext; + ur_exp_command_buffer_handle_t MGraph; std::vector MMemObjs; }; diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 5f9f79d878d03..c940b35c81448 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -824,6 +824,8 @@ ur_exp_command_buffer_sync_point_t exec_graph_impl::enqueueNode( std::shared_ptr Node) { // Queue which will be used for allocation operations for accessors. 
+ // Will also be used in native commands to return to the user in + // `interop_handler::get_native_queue()` calls auto AllocaQueue = std::make_shared( DeviceImpl, sycl::detail::getSyclObjImpl(Ctx), sycl::async_handler{}, sycl::property_list{}); diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index e609123b4f285..f5d960b2d15e6 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -71,6 +71,8 @@ inline node_type getNodeTypeFromCG(sycl::detail::CGType CGType) { return node_type::host_task; case sycl::detail::CGType::ExecCommandBuffer: return node_type::subgraph; + case sycl::detail::CGType::EnqueueNativeCommand: + return node_type::native_command; default: assert(false && "Invalid Graph Node Type"); return node_type::empty; diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 005008a74ebd0..d92077084fc2b 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -454,7 +454,7 @@ class DispatchHostTask { "Host task submissions should have an associated queue"); interop_handle IH{MReqToMem, HostTask.MQueue, HostTask.MQueue->getDeviceImplPtr(), - HostTask.MQueue->getContextImplPtr()}; + HostTask.MQueue->getContextImplPtr(), nullptr}; // TODO: should all the backends that support this entry point use this // for host task? 
auto &Queue = HostTask.MQueue; @@ -2879,6 +2879,19 @@ ur_result_t enqueueReadWriteHostPipe(const QueueImplPtr &Queue, return Error; } +namespace { + +struct CommandBufferNativeCommandData { + sycl::interop_handle ih; + std::function func; +}; + +void CommandBufferInteropFreeFunc(void *InteropData) { + auto *Data = reinterpret_cast(InteropData); + return Data->func(Data->ih); +} +} // namespace + ur_result_t ExecCGCommand::enqueueImpCommandBuffer() { assert(MQueue && "Command buffer enqueue should have an associated queue"); // Wait on host command dependencies @@ -3045,6 +3058,55 @@ ur_result_t ExecCGCommand::enqueueImpCommandBuffer() { MEvent->setSyncPoint(OutSyncPoint); return UR_RESULT_SUCCESS; } + case CGType::EnqueueNativeCommand: { + // Queue is created by graph_impl before creating command to submit to + // scheduler. + const AdapterPtr &Adapter = MQueue->getAdapter(); + const auto Backend = MQueue->get_device().get_backend(); + CGHostTask *HostTask = (CGHostTask *)MCommandGroup.get(); + + // TODO - Doc this + ur_exp_command_buffer_handle_t ChildCommandBuffer = nullptr; + if (Backend == sycl::backend::ext_oneapi_cuda || + Backend == sycl::backend::ext_oneapi_hip) { + + ur_exp_command_buffer_desc_t Desc{ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC /*stype*/, + nullptr /*pnext*/, false /* updatable */, false /* in-order */, + false /* profilable*/ + }; + auto ContextImpl = sycl::detail::getSyclObjImpl(MQueue->get_context()); + auto DeviceImpl = sycl::detail::getSyclObjImpl(MQueue->get_device()); + Adapter->call( + ContextImpl->getHandleRef(), DeviceImpl->getHandleRef(), &Desc, + &ChildCommandBuffer); + } + + std::vector ReqToMem; // TODO work with buffers + interop_handle IH{ReqToMem, HostTask->MQueue, + HostTask->MQueue->getDeviceImplPtr(), + HostTask->MQueue->getContextImplPtr(), + ChildCommandBuffer ? 
ChildCommandBuffer : MCommandBuffer}; + CommandBufferNativeCommandData CustomOpData{ + IH, HostTask->MHostTask->MInteropTask}; + + Adapter->call( + MCommandBuffer, CommandBufferInteropFreeFunc, &CustomOpData, + ChildCommandBuffer, MSyncPointDeps.size(), + MSyncPointDeps.empty() ? nullptr : MSyncPointDeps.data(), + &OutSyncPoint); + + if (ChildCommandBuffer) { + ur_result_t Res = Adapter->call_nocheck< + sycl::detail::UrApiKind::urCommandBufferReleaseExp>( + ChildCommandBuffer); + (void)Res; + assert(Res == UR_RESULT_SUCCESS); + } + + MEvent->setSyncPoint(OutSyncPoint); + return UR_RESULT_SUCCESS; + } default: throw exception(make_error_code(errc::runtime), @@ -3416,7 +3478,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { EnqueueNativeCommandData CustomOpData{ interop_handle{ReqToMem, HostTask->MQueue, HostTask->MQueue->getDeviceImplPtr(), - HostTask->MQueue->getContextImplPtr()}, + HostTask->MQueue->getContextImplPtr(), nullptr}, HostTask->MHostTask->MInteropTask}; ur_bool_t NativeCommandSupport = false; diff --git a/sycl/source/interop_handle.cpp b/sycl/source/interop_handle.cpp index aabf22702ef5f..d4912c6c8b3e4 100644 --- a/sycl/source/interop_handle.cpp +++ b/sycl/source/interop_handle.cpp @@ -23,6 +23,8 @@ backend interop_handle::get_backend() const noexcept { return detail::getImplBackend(MQueue); } +bool interop_handle::has_graph() const noexcept { return MGraph != nullptr; } + ur_native_handle_t interop_handle::getNativeMem(detail::Requirement *Req) const { auto Iter = std::find_if(std::begin(MMemObjs), std::end(MMemObjs), @@ -53,5 +55,17 @@ interop_handle::getNativeQueue(int32_t &NativeHandleDesc) const { return MQueue->getNative(NativeHandleDesc); } +ur_native_handle_t interop_handle::getNativeGraph() const { + if (!MGraph) { + throw exception(make_error_code(errc::invalid), + "Command-Group is not being added as a graph node"); + } + + auto Adapter = MQueue->getAdapter(); + ur_native_handle_t Handle; + Adapter->call(MGraph, + &Handle); + return 
Handle; +} } // namespace _V1 } // namespace sycl diff --git a/sycl/test-e2e/Graph/NativeCommand/native_cuda_explicit_usm.cpp b/sycl/test-e2e/Graph/NativeCommand/native_cuda_explicit_usm.cpp new file mode 100644 index 0000000000000..10acae92cd1a5 --- /dev/null +++ b/sycl/test-e2e/Graph/NativeCommand/native_cuda_explicit_usm.cpp @@ -0,0 +1,87 @@ +// RUN: %{build} -g -o %t.out -lcuda +// RUN: %{run} %t.out +// REQUIRES: cuda + +#include "../graph_common.hpp" +#include +#include +#include + +int main() { + queue Queue; + + int *PtrX = malloc_device(Size, Queue); + int *PtrY = malloc_device(Size, Queue); + + exp_ext::command_graph Graph{Queue}; + + Graph.begin_recording(Queue); + + auto EventA = Queue.submit([&](handler &CGH) { + CGH.single_task([=]() { + for (size_t i = 0; i < Size; i++) { + PtrX[i] = i; + PtrY[i] = 0; + } + }); + }); + + auto EventB = Queue.submit([&](handler &CGH) { + CGH.depends_on(EventA); + + CGH.ext_codeplay_enqueue_native_command([=](interop_handle IH) { + if (IH.has_graph()) { + CUgraph NativeGraph = IH.get_native_graph(); + CUgraphNode Node; + // TODO figure this out + /* + CUDA_MEMCPY3D &Params + 181 std::memset(&Params, 0, sizeof(CUDA_MEMCPY3D)); + 182 + 183 Params.srcMemoryType = CU_MEMORYTYPE_DEVICE; + 184 Params.srcDevice = SrcType == + 185 ? *static_cast(SrcPtr) + 186 : 0; + 187 Params.srcHost = cType == CU_MEMORYTYPE_HOST ? SrcPtr : nullptr; + 188 Params.dstMemoryType = DstType; + 189 Params.dstDevice = + 190 DstType == CU_MEMORYTYPE_DEVICE ? *static_cast(DstPtr) : 0; + 191 Params.dstHost = DstType == CU_MEMORYTYPE_HOST ? 
DstPtr : nullptr; + 192 Params.WidthInBytes = Size; + 193 Params.Height = 1; + 194 Params.Depth = 1; + auto Res = cuGraphAddMemcpyNode1D(&Node, NativeGraph, nullptr, 0, + ((CUdeviceptr)PtrY, (CUdeviceptr)PtrX, + Size * sizeof(int), &Params, Context); + assert(Res == CUDA_SUCCESS); + */ + } else { + assert(false && "Native Handle should have a graph"); + } + }); + }); + + Queue.submit([&](handler &CGH) { + CGH.depends_on(EventB); + CGH.parallel_for(range<1>{Size}, [=](id<1> it) { PtrY[it] *= 2; }); + }); + + Graph.end_recording(); + + auto ExecGraph = Graph.finalize(); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + std::vector HostData(Size); + + Queue.copy(PtrY, HostData.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + const int Ref = i * 2; + assert(check_value(Ref, HostData[i], + std::string("HostData at index ") + std::to_string(i))); + } + + free(PtrX, Queue); + free(PtrY, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/NativeCommand/native_cuda_record_usm.cpp b/sycl/test-e2e/Graph/NativeCommand/native_cuda_record_usm.cpp new file mode 100644 index 0000000000000..8d91d51cc2798 --- /dev/null +++ b/sycl/test-e2e/Graph/NativeCommand/native_cuda_record_usm.cpp @@ -0,0 +1,90 @@ +// RUN: %{build} -g -o %t.out -lcuda +// RUN: %{run} %t.out +// REQUIRES: cuda + +#include "../graph_common.hpp" +#include +#include +#include + +int main() { + queue Queue; + + int *PtrX = malloc_device(Size, Queue); + int *PtrY = malloc_device(Size, Queue); + + exp_ext::command_graph Graph{Queue}; + + Graph.begin_recording(Queue); + + auto EventA = Queue.submit([&](handler &CGH) { + CGH.single_task([=]() { + for (size_t i = 0; i < Size; i++) { + PtrX[i] = i; + PtrY[i] = 0; + } + }); + }); + + auto EventB = Queue.submit([&](handler &CGH) { + CGH.depends_on(EventA); + + CGH.ext_codeplay_enqueue_native_command([=](interop_handle IH) { + if (IH.has_graph()) { + // Newly created stream for this node + auto NativeStream = IH.get_native_queue(); + // Graph already created 
with cuGraphCreate + CUgraph NativeGraph = IH.get_native_graph<backend::ext_oneapi_cuda>(); + + // Start stream capture + auto Res = + cuStreamBeginCapture(NativeStream, CU_STREAM_CAPTURE_MODE_GLOBAL); + assert(Res == CUDA_SUCCESS); + + // Add memcopy node + Res = cuMemcpyAsync((CUdeviceptr)PtrY, (CUdeviceptr)PtrX, + Size * sizeof(int), NativeStream); + assert(Res == CUDA_SUCCESS); + + // cuStreamEndCapture returns a new graph, if we overwrite + // "NativeGraph" it won't be picked up by the UR runtime, as it's + // a passed-by-value pointer + CUgraph RecordedGraph; + Res = cuStreamEndCapture(NativeStream, &RecordedGraph); + assert(Res == CUDA_SUCCESS); + + // Add graph to native graph as a child node + // Need to return a node object for the node to be created, + // can't be nullptr. + CUgraphNode Node; + Res = cuGraphAddChildGraphNode(&Node, NativeGraph, nullptr, 0, + RecordedGraph); + assert(Res == CUDA_SUCCESS); + } else { + assert(false && "Native Handle should have a graph"); + } + }); + }); + + Queue.submit([&](handler &CGH) { + CGH.depends_on(EventB); + CGH.parallel_for(range<1>{Size}, [=](id<1> it) { PtrY[it] *= 2; }); + }); + + Graph.end_recording(); + + auto ExecGraph = Graph.finalize(); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + std::vector<int> HostData(Size); + + Queue.copy(PtrY, HostData.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + const int Ref = i * 2; + assert(check_value(Ref, HostData[i], + std::string("HostData at index ") + std::to_string(i))); + } + + free(PtrX, Queue); + free(PtrY, Queue); + + return 0; +}