diff --git a/sycl/include/sycl/detail/nd_range_view.hpp b/sycl/include/sycl/detail/nd_range_view.hpp
new file mode 100644
index 0000000000000..ea7f403f171b3
--- /dev/null
+++ b/sycl/include/sycl/detail/nd_range_view.hpp
@@ -0,0 +1,57 @@
+//==---- nd_range_view.hpp --- SYCL iteration with reference to ranges ---==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <sycl/nd_range.hpp>
+
+namespace sycl {
+inline namespace _V1 {
+namespace detail {
+
+class NDRDescT;
+
+// The structure to keep dimension and references to ranges unified for
+// all dimensions.
+class nd_range_view {
+
+public:
+  nd_range_view() = default;
+  nd_range_view(const nd_range_view &Desc) = default;
+  nd_range_view(nd_range_view &&Desc) = default;
+  nd_range_view &operator=(const nd_range_view &Desc) = default;
+  nd_range_view &operator=(nd_range_view &&Desc) = default;
+
+  template <int Dims_>
+  nd_range_view(sycl::range<Dims_> &N, bool SetNumWorkGroups = false)
+      : MGlobalSize(&(N[0])), MSetNumWorkGroups(SetNumWorkGroups),
+        MDims{size_t(Dims_)} {}
+
+  template <int Dims_>
+  nd_range_view(sycl::range<Dims_> &GlobalSize, sycl::id<Dims_> &Offset)
+      : MGlobalSize(&(GlobalSize[0])), MOffset(&(Offset[0])),
+        MDims{size_t(Dims_)} {}
+
+  template <int Dims_>
+  nd_range_view(sycl::nd_range<Dims_> &ExecutionRange)
+      : MGlobalSize(&(ExecutionRange.globalSize[0])),
+        MLocalSize(&(ExecutionRange.localSize[0])),
+        MOffset(&(ExecutionRange.offset[0])), MDims{size_t(Dims_)} {}
+
+  sycl::detail::NDRDescT toNDRDescT() const;
+
+  const size_t *MGlobalSize = nullptr;
+  const size_t *MLocalSize = nullptr;
+  const size_t *MOffset = nullptr;
+  bool MSetNumWorkGroups = false;
+  size_t MDims = 0;
+};
+
+} // namespace detail
+} // namespace _V1
+} // namespace sycl
diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp
index d8d46d2a27814..6b2c4f65c7a85 100644
--- a/sycl/include/sycl/handler.hpp
+++ b/sycl/include/sycl/handler.hpp
@@ -362,6 +362,24 @@ class RoundedRangeKernelWithKH {
   }
 };
 
+template <typename WrapperT, typename TransformedArgType, int Dims,
+          typename KernelType,
+          std::enable_if_t<detail::KernelLambdaHasKernelHandlerArgT<
+              KernelType, TransformedArgType>::value> * = nullptr>
+auto getRangeRoundedKernelLambda(KernelType KernelFunc, range<Dims> UserRange) {
+  return detail::RoundedRangeKernelWithKH<TransformedArgType, Dims, KernelType>{
+      UserRange, KernelFunc};
+}
+
+template <typename WrapperT, typename TransformedArgType, int Dims,
+          typename KernelType,
+          std::enable_if_t<!detail::KernelLambdaHasKernelHandlerArgT<
+              KernelType, TransformedArgType>::value> * = nullptr>
+auto getRangeRoundedKernelLambda(KernelType KernelFunc, range<Dims> UserRange) {
+  return detail::RoundedRangeKernel<TransformedArgType, Dims, KernelType>{
+      UserRange, KernelFunc};
+}
+
 using std::enable_if_t;
 using sycl::detail::queue_impl;
 
@@ -1218,7 +1236,7 @@ class __SYCL_EXPORT handler {
     if (HasRoundedRange) {
       using NameWT = typename detail::get_kernel_wrapper_name_t<NameT>::name;
       auto Wrapper =
-          getRangeRoundedKernelLambda<NameWT, TransformedArgType, Dims>(
+          detail::getRangeRoundedKernelLambda<NameWT, TransformedArgType, Dims>(
               KernelFunc, UserRange);
 
       using KName = std::conditional_t<std::is_same<KernelType, NameT>::value,
@@ -3265,26 +3283,6 @@ class __SYCL_EXPORT handler {
   void GetRangeRoundingSettings(size_t &MinFactor, size_t &GoodFactor,
                                 size_t &MinRange);
 
-  template <typename WrapperT, typename TransformedArgType, int Dims,
-            typename KernelType,
-            std::enable_if_t<detail::KernelLambdaHasKernelHandlerArgT<
-                KernelType, TransformedArgType>::value> * = nullptr>
-  auto getRangeRoundedKernelLambda(KernelType KernelFunc,
-                                   range<Dims> UserRange) {
-    return detail::RoundedRangeKernelWithKH<TransformedArgType, Dims,
-                                            KernelType>{UserRange, KernelFunc};
-  }
-
-  template <typename WrapperT, typename TransformedArgType, int Dims,
-            typename KernelType,
-            std::enable_if_t<!detail::KernelLambdaHasKernelHandlerArgT<
-                KernelType, TransformedArgType>::value> * = nullptr>
-  auto getRangeRoundedKernelLambda(KernelType KernelFunc,
-                                   range<Dims> UserRange) {
-    return detail::RoundedRangeKernel<TransformedArgType, Dims, KernelType>{
-        UserRange, KernelFunc};
-  }
-
 #ifndef __INTEL_PREVIEW_BREAKING_CHANGES
   const std::shared_ptr<detail::context_impl> &getContextImplPtr() const;
 #endif
diff --git a/sycl/include/sycl/nd_range.hpp b/sycl/include/sycl/nd_range.hpp
index e4ff4881be17a..30816b8a4b354 100644
--- a/sycl/include/sycl/nd_range.hpp
+++ b/sycl/include/sycl/nd_range.hpp
@@ -15,6 +15,10 @@
 namespace sycl {
 inline namespace _V1 {
 
+namespace detail {
+class nd_range_view;
+}
+
 /// Defines the iteration domain of both the work-groups and the overall
 /// dispatch.
 ///
@@ -65,6 +69,8 @@ template <int Dimensions = 1> class nd_range {
   bool operator!=(const nd_range<Dimensions> &rhs) const {
     return !(*this == rhs);
   }
+
+  friend class sycl::_V1::detail::nd_range_view;
 };
 
 } // namespace _V1
diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp
index f7bff57c2df9a..070880f48682a 100644
--- a/sycl/include/sycl/queue.hpp
+++ b/sycl/include/sycl/queue.hpp
@@ -21,6 +21,7 @@
 #include <sycl/detail/id_queries_fit_in_int.hpp> // for checkValueRange
 #include <sycl/detail/info_desc_helpers.hpp>  // for is_queue_info_...
 #include <sycl/detail/kernel_desc.hpp>        // for KernelInfo
+#include <sycl/detail/nd_range_view.hpp>
 #include <sycl/detail/optional.hpp>
 #include <sycl/detail/owner_less_base.hpp> // for OwnerLessBase
 #include <sycl/device.hpp>                 // for device
@@ -40,6 +41,7 @@
 #include <sycl/nd_range.hpp>                         // for nd_range
 #include <sycl/property_list.hpp>                    // for property_list
 #include <sycl/range.hpp>                            // for range
+#include <sycl/range_rounding.hpp>
 #include <sycl/sycl_span.hpp>                        // for sycl::span
 
 #include <cstddef>     // for size_t
@@ -64,18 +66,16 @@ template <backend BackendName, class SyclObjectT>
 auto get_native(const SyclObjectT &Obj)
     -> backend_return_t<BackendName, SyclObjectT>;
 
-template <int Dims>
 event __SYCL_EXPORT submit_kernel_direct_with_event_impl(
-    const queue &Queue, const nd_range<Dims> &Range,
+    const queue &Queue, const detail::nd_range_view &RangeView,
     detail::HostKernelRefBase &HostKernel,
     detail::DeviceKernelInfo *DeviceKernelInfo,
     sycl::span<const event> DepEvents,
     const detail::KernelPropertyHolderStructTy &Props,
     const detail::code_location &CodeLoc, bool IsTopCodeLoc);
 
-template <int Dims>
 void __SYCL_EXPORT submit_kernel_direct_without_event_impl(
-    const queue &Queue, const nd_range<Dims> &Range,
+    const queue &Queue, const detail::nd_range_view &RangeView,
     detail::HostKernelRefBase &HostKernel,
     detail::DeviceKernelInfo *DeviceKernelInfo,
     sycl::span<const event> DepEvents,
@@ -162,118 +162,41 @@ class __SYCL_EXPORT SubmissionInfo {
 
 } // namespace v1
 
-template <detail::WrapAs WrapAs, typename LambdaArgType,
-          typename KernelName = detail::auto_name, bool EventNeeded = false,
+template <int Dims, typename LambdaArgType> struct TransformUserItemType {
+  using type = std::conditional_t<
+      std::is_convertible_v<nd_item<Dims>, LambdaArgType>, nd_item<Dims>,
+      std::conditional_t<std::is_convertible_v<item<Dims>, LambdaArgType>,
+                         item<Dims>, LambdaArgType>>;
+};
+
+template <detail::WrapAs WrapAs, typename LambdaArgType, typename KernelName,
+          bool EventNeeded = false,
           typename PropertiesT = ext::oneapi::experimental::empty_properties_t,
           typename KernelTypeUniversalRef, int Dims>
 auto submit_kernel_direct(
-    const queue &Queue, const nd_range<Dims> &Range,
+    const queue &Queue, detail::nd_range_view RangeView,
     KernelTypeUniversalRef &&KernelFunc, sycl::span<const event> DepEvents,
     const PropertiesT &ExtraProps =
         ext::oneapi::experimental::empty_properties_t{},
-    const detail::code_location &CodeLoc = detail::code_location::current()) {
-  detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
-
-  using KernelType =
-      std::remove_const_t<std::remove_reference_t<KernelTypeUniversalRef>>;
-
-  using NameT =
-      typename detail::get_kernel_name_t<KernelName, KernelType>::name;
-
-  detail::KernelWrapper<WrapAs, NameT, KernelType, LambdaArgType,
-                        PropertiesT>::wrap(KernelFunc);
-
-  HostKernelRef<KernelType, KernelTypeUniversalRef, LambdaArgType, Dims>
-      HostKernel(std::forward<KernelTypeUniversalRef>(KernelFunc));
-
-  // Instantiating the kernel on the host improves debugging.
-  // Passing this pointer to another translation unit prevents optimization.
-#ifndef NDEBUG
-  // TODO: call library to prevent dropping call due to optimization.
-  (void)
-      detail::GetInstantiateKernelOnHostPtr<KernelType, LambdaArgType, Dims>();
-#endif
-
-  detail::DeviceKernelInfo *DeviceKernelInfoPtr =
-      &detail::getDeviceKernelInfo<NameT>();
-  constexpr auto Info = detail::CompileTimeKernelInfo<NameT>;
-
-  assert(Info.Name != std::string_view{} && "Kernel must have a name!");
-
-  static_assert(
-      Info.Name == std::string_view{} || sizeof(KernelType) == Info.KernelSize,
-      "Unexpected kernel lambda size. This can be caused by an "
-      "external host compiler producing a lambda with an "
-      "unexpected layout. This is a limitation of the compiler."
-      "In many cases the difference is related to capturing constexpr "
-      "variables. In such cases removing constexpr specifier aligns the "
-      "captures between the host compiler and the device compiler."
-      "\n"
-      "In case of MSVC, passing "
-      "-fsycl-host-compiler-options='/std:c++latest' "
-      "might also help.");
-
-  detail::KernelPropertyHolderStructTy ParsedProperties;
-  if constexpr (ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod<
-                    const KernelType &>::value) {
-    // Merge properties via get() and manually specified properties.
-    // get() method is used for specifying kernel properties but properties
-    // passed via launch_config (ExtraProps) should be kernel launch properties.
-    // They are mutually exclusive, so there should not be any conflict when
-    // merging properties. merge_properties() throws if there's a conflict.
-    auto MergedProps =
-        sycl::ext::oneapi::experimental::detail::merge_properties(
-            ExtraProps,
-            KernelFunc.get(ext::oneapi::experimental::properties_tag{}));
-
-    ParsedProperties = extractKernelProperties(MergedProps);
-  } else {
-    ParsedProperties = extractKernelProperties(ExtraProps);
-  }
-
-  if constexpr (EventNeeded) {
-    return submit_kernel_direct_with_event_impl(
-        Queue, Range, HostKernel, DeviceKernelInfoPtr, DepEvents,
-        ParsedProperties, TlsCodeLocCapture.query(),
-        TlsCodeLocCapture.isToplevel());
-  } else {
-    submit_kernel_direct_without_event_impl(
-        Queue, Range, HostKernel, DeviceKernelInfoPtr, DepEvents,
-        ParsedProperties, TlsCodeLocCapture.query(),
-        TlsCodeLocCapture.isToplevel());
-  }
-}
+    const detail::code_location &CodeLoc = detail::code_location::current());
 
 template <typename KernelName = detail::auto_name, bool EventNeeded = false,
           typename PropertiesT = ext::oneapi::experimental::empty_properties_t,
           typename KernelTypeUniversalRef, int Dims>
 auto submit_kernel_direct_parallel_for(
-    const queue &Queue, const nd_range<Dims> &Range,
+    const queue &Queue, nd_range<Dims> Range,
     KernelTypeUniversalRef &&KernelFunc, sycl::span<const event> DepEvents = {},
     const PropertiesT &Props = ext::oneapi::experimental::empty_properties_t{},
-    const detail::code_location &CodeLoc = detail::code_location::current()) {
-
-  using KernelType =
-      std::remove_const_t<std::remove_reference_t<KernelTypeUniversalRef>>;
+    const detail::code_location &CodeLoc = detail::code_location::current());
 
-  using LambdaArgType =
-      sycl::detail::lambda_arg_type<KernelType, nd_item<Dims>>;
-  static_assert(
-      std::is_convertible_v<sycl::nd_item<Dims>, LambdaArgType>,
-      "Kernel argument of a sycl::parallel_for with sycl::nd_range "
-      "must be either sycl::nd_item or be convertible from sycl::nd_item");
-  using TransformedArgType = sycl::nd_item<Dims>;
-
-#ifndef __SYCL_DEVICE_ONLY__
-  detail::checkValueRange<Dims>(Range);
-#endif
-
-  return submit_kernel_direct<detail::WrapAs::parallel_for, TransformedArgType,
-                              KernelName, EventNeeded, PropertiesT,
-                              KernelTypeUniversalRef, Dims>(
-      Queue, Range, std::forward<KernelTypeUniversalRef>(KernelFunc), DepEvents,
-      Props, CodeLoc);
-}
+template <typename KernelName = detail::auto_name, bool EventNeeded = false,
+          typename PropertiesT = ext::oneapi::experimental::empty_properties_t,
+          typename KernelTypeUniversalRef, int Dims>
+auto submit_kernel_direct_parallel_for(
+    const queue &Queue, range<Dims> Range, KernelTypeUniversalRef &&KernelFunc,
+    sycl::span<const event> DepEvents = {},
+    const PropertiesT &Props = ext::oneapi::experimental::empty_properties_t{},
+    const detail::code_location &CodeLoc = detail::code_location::current());
 
 template <typename KernelName = detail::auto_name, bool EventNeeded = false,
           typename PropertiesT = ext::oneapi::experimental::empty_properties_t,
@@ -282,15 +205,7 @@ auto submit_kernel_direct_single_task(
     const queue &Queue, KernelTypeUniversalRef &&KernelFunc,
     sycl::span<const event> DepEvents = {},
     const PropertiesT &Props = ext::oneapi::experimental::empty_properties_t{},
-    const detail::code_location &CodeLoc = detail::code_location::current()) {
-
-  return submit_kernel_direct<detail::WrapAs::single_task, void, KernelName,
-                              EventNeeded, PropertiesT, KernelTypeUniversalRef,
-                              1>(
-      Queue, nd_range<1>{1, 1},
-      std::forward<KernelTypeUniversalRef>(KernelFunc), DepEvents, Props,
-      CodeLoc);
-}
+    const detail::code_location &CodeLoc = detail::code_location::current());
 
 } // namespace detail
 
@@ -3984,11 +3899,27 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
                     RestT &&...Rest) {
     constexpr detail::code_location CodeLoc = getCodeLocation<KernelName>();
     detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
-    return submit(
-        [&](handler &CGH) {
-          CGH.template parallel_for<KernelName>(Range, Properties, Rest...);
-        },
-        TlsCodeLocCapture.query());
+    using KernelType = std::remove_const_t<
+        std::remove_reference_t<std::tuple_element_t<0, std::tuple<RestT...>>>>;
+    using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
+    using TransformedArgType = std::conditional_t<
+        std::is_integral<LambdaArgType>::value && Dims == 1, item<Dims>,
+        typename detail::TransformUserItemType<Dims, LambdaArgType>::type>;
+
+    // TODO The handler-less path does not support reductions, and
+    // kernel functions with the kernel_handler type argument yet.
+    if constexpr (sizeof...(RestT) == 1 &&
+                  !(detail::KernelLambdaHasKernelHandlerArgT<
+                      KernelType, TransformedArgType>::value)) {
+      return detail::submit_kernel_direct_parallel_for<KernelName, true>(
+          *this, Range, Rest..., {}, Properties, TlsCodeLocCapture.query());
+    } else {
+      return submit(
+          [&](handler &CGH) {
+            CGH.template parallel_for<KernelName>(Range, Properties, Rest...);
+          },
+          TlsCodeLocCapture.query());
+    }
   }
 
   /// parallel_for_impl with a kernel represented as a lambda + range that
@@ -4018,12 +3949,29 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
                     RestT &&...Rest) {
     constexpr detail::code_location CodeLoc = getCodeLocation<KernelName>();
     detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
-    return submit(
-        [&](handler &CGH) {
-          CGH.depends_on(DepEvent);
-          CGH.template parallel_for<KernelName>(Range, Properties, Rest...);
-        },
-        TlsCodeLocCapture.query());
+    using KernelType = std::remove_const_t<
+        std::remove_reference_t<std::tuple_element_t<0, std::tuple<RestT...>>>>;
+    using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
+    using TransformedArgType = std::conditional_t<
+        std::is_integral<LambdaArgType>::value && Dims == 1, item<Dims>,
+        typename detail::TransformUserItemType<Dims, LambdaArgType>::type>;
+
+    // TODO The handler-less path does not support reductions, and
+    // kernel functions with the kernel_handler type argument yet.
+    if constexpr (sizeof...(RestT) == 1 &&
+                  !(detail::KernelLambdaHasKernelHandlerArgT<
+                      KernelType, TransformedArgType>::value)) {
+      return detail::submit_kernel_direct_parallel_for<KernelName, true>(
+          *this, Range, Rest..., sycl::span<const event>(&DepEvent, 1),
+          Properties, TlsCodeLocCapture.query());
+    } else {
+      return submit(
+          [&](handler &CGH) {
+            CGH.depends_on(DepEvent);
+            CGH.template parallel_for<KernelName>(Range, Properties, Rest...);
+          },
+          TlsCodeLocCapture.query());
+    }
   }
 
   /// parallel_for_impl with a kernel represented as a lambda + range that
@@ -4055,12 +4003,29 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
                     PropertiesT Properties, RestT &&...Rest) {
     constexpr detail::code_location CodeLoc = getCodeLocation<KernelName>();
     detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
-    return submit(
-        [&](handler &CGH) {
-          CGH.depends_on(DepEvents);
-          CGH.template parallel_for<KernelName>(Range, Properties, Rest...);
-        },
-        TlsCodeLocCapture.query());
+    using KernelType = std::remove_const_t<
+        std::remove_reference_t<std::tuple_element_t<0, std::tuple<RestT...>>>>;
+    using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
+    using TransformedArgType = std::conditional_t<
+        std::is_integral<LambdaArgType>::value && Dims == 1, item<Dims>,
+        typename detail::TransformUserItemType<Dims, LambdaArgType>::type>;
+
+    // TODO The handler-less path does not support reductions, and
+    // kernel functions with the kernel_handler type argument yet.
+    if constexpr (sizeof...(RestT) == 1 &&
+                  !(detail::KernelLambdaHasKernelHandlerArgT<
+                      KernelType, TransformedArgType>::value)) {
+      return detail::submit_kernel_direct_parallel_for<KernelName, true>(
+          *this, Range, Rest..., DepEvents, Properties,
+          TlsCodeLocCapture.query());
+    } else {
+      return submit(
+          [&](handler &CGH) {
+            CGH.depends_on(DepEvents);
+            CGH.template parallel_for<KernelName>(Range, Properties, Rest...);
+          },
+          TlsCodeLocCapture.query());
+    }
   }
 
   /// parallel_for_impl version with a kernel represented as a lambda + range
@@ -4102,6 +4067,235 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
   }
 };
 
+namespace detail {
+
+template <detail::WrapAs WrapAs, typename LambdaArgType, typename KernelName,
+          bool EventNeeded, typename PropertiesT,
+          typename KernelTypeUniversalRef, int Dims>
+auto submit_kernel_direct(const queue &Queue, detail::nd_range_view RangeView,
+                          KernelTypeUniversalRef &&KernelFunc,
+                          sycl::span<const event> DepEvents,
+                          const PropertiesT &ExtraProps,
+                          const detail::code_location &CodeLoc) {
+  detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
+
+  using KernelType =
+      std::remove_const_t<std::remove_reference_t<KernelTypeUniversalRef>>;
+
+  detail::KernelWrapper<WrapAs, KernelName, KernelType, LambdaArgType,
+                        PropertiesT>::wrap(KernelFunc);
+
+  HostKernelRef<KernelType, KernelTypeUniversalRef, LambdaArgType, Dims>
+      HostKernel(std::forward<KernelTypeUniversalRef>(KernelFunc));
+
+  // Instantiating the kernel on the host improves debugging.
+  // Passing this pointer to another translation unit prevents optimization.
+#ifndef NDEBUG
+  // TODO: call library to prevent dropping call due to optimization.
+  (void)
+      detail::GetInstantiateKernelOnHostPtr<KernelType, LambdaArgType, Dims>();
+#endif
+
+  detail::DeviceKernelInfo *DeviceKernelInfoPtr =
+      &detail::getDeviceKernelInfo<KernelName>();
+  constexpr auto Info = detail::CompileTimeKernelInfo<KernelName>;
+
+  assert(Info.Name != std::string_view{} && "Kernel must have a name!");
+
+  static_assert(
+      Info.Name == std::string_view{} || sizeof(KernelType) == Info.KernelSize,
+      "Unexpected kernel lambda size. This can be caused by an "
+      "external host compiler producing a lambda with an "
+      "unexpected layout. This is a limitation of the compiler."
+      "In many cases the difference is related to capturing constexpr "
+      "variables. In such cases removing constexpr specifier aligns the "
+      "captures between the host compiler and the device compiler."
+      "\n"
+      "In case of MSVC, passing "
+      "-fsycl-host-compiler-options='/std:c++latest' "
+      "might also help.");
+
+  detail::KernelPropertyHolderStructTy ParsedProperties;
+  if constexpr (ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod<
+                    const KernelType &>::value) {
+    // Merge properties via get() and manually specified properties.
+    // get() method is used for specifying kernel properties but properties
+    // passed via launch_config (ExtraProps) should be kernel launch properties.
+    // They are mutually exclusive, so there should not be any conflict when
+    // merging properties. merge_properties() throws if there's a conflict.
+    auto MergedProps =
+        sycl::ext::oneapi::experimental::detail::merge_properties(
+            ExtraProps,
+            KernelFunc.get(ext::oneapi::experimental::properties_tag{}));
+
+    ParsedProperties = extractKernelProperties(MergedProps);
+  } else {
+    ParsedProperties = extractKernelProperties(ExtraProps);
+  }
+
+  if constexpr (EventNeeded) {
+    return submit_kernel_direct_with_event_impl(
+        Queue, RangeView, HostKernel, DeviceKernelInfoPtr, DepEvents,
+        ParsedProperties, TlsCodeLocCapture.query(),
+        TlsCodeLocCapture.isToplevel());
+  } else {
+    submit_kernel_direct_without_event_impl(
+        Queue, RangeView, HostKernel, DeviceKernelInfoPtr, DepEvents,
+        ParsedProperties, TlsCodeLocCapture.query(),
+        TlsCodeLocCapture.isToplevel());
+  }
+}
+
+template <typename KernelName, bool EventNeeded, typename PropertiesT,
+          typename KernelTypeUniversalRef, int Dims>
+auto submit_kernel_direct_parallel_for(const queue &Queue, nd_range<Dims> Range,
+                                       KernelTypeUniversalRef &&KernelFunc,
+                                       sycl::span<const event> DepEvents,
+                                       const PropertiesT &Props,
+                                       const detail::code_location &CodeLoc) {
+
+  using KernelType =
+      std::remove_const_t<std::remove_reference_t<KernelTypeUniversalRef>>;
+  using NameT =
+      typename detail::get_kernel_name_t<KernelName, KernelType>::name;
+
+  using LambdaArgType =
+      sycl::detail::lambda_arg_type<KernelType, nd_item<Dims>>;
+  static_assert(
+      std::is_convertible_v<sycl::nd_item<Dims>, LambdaArgType>,
+      "Kernel argument of a sycl::parallel_for with sycl::nd_range "
+      "must be either sycl::nd_item or be convertible from sycl::nd_item");
+  using TransformedArgType = sycl::nd_item<Dims>;
+
+#ifndef __SYCL_DEVICE_ONLY__
+  detail::checkValueRange<Dims>(Range);
+#endif
+
+  return submit_kernel_direct<detail::WrapAs::parallel_for, TransformedArgType,
+                              NameT, EventNeeded, PropertiesT,
+                              KernelTypeUniversalRef, Dims>(
+      Queue, detail::nd_range_view(Range),
+      std::forward<KernelTypeUniversalRef>(KernelFunc), DepEvents, Props,
+      CodeLoc);
+}
+
+template <typename KernelName, bool EventNeeded, typename PropertiesT,
+          typename KernelTypeUniversalRef, int Dims>
+auto submit_kernel_direct_parallel_for(const queue &Queue, range<Dims> Range,
+                                       KernelTypeUniversalRef &&KernelFunc,
+                                       sycl::span<const event> DepEvents,
+                                       const PropertiesT &Props,
+                                       const detail::code_location &CodeLoc) {
+
+#ifndef __SYCL_DEVICE_ONLY__
+  if (!range_size_fits_in_size_t(Range))
+    throw sycl::exception(make_error_code(errc::runtime),
+                          "The total number of work-items in "
+                          "a range must fit within size_t");
+#endif
+
+  using KernelType =
+      std::remove_const_t<std::remove_reference_t<KernelTypeUniversalRef>>;
+  using NameT =
+      typename detail::get_kernel_name_t<KernelName, KernelType>::name;
+  using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
+
+  // If 1D kernel argument is an integral type, convert it to sycl::item<1>
+  // If user type is convertible from sycl::item/sycl::nd_item, use
+  // sycl::item/sycl::nd_item to transport item information
+  using TransformedArgType = std::conditional_t<
+      std::is_integral<LambdaArgType>::value && Dims == 1, item<Dims>,
+      typename TransformUserItemType<Dims, LambdaArgType>::type>;
+
+  static_assert(!std::is_same_v<TransformedArgType, sycl::nd_item<Dims>>,
+                "Kernel argument cannot have a sycl::nd_item type in "
+                "sycl::parallel_for with sycl::range");
+
+  static_assert(std::is_convertible_v<item<Dims>, LambdaArgType> ||
+                    std::is_convertible_v<item<Dims, false>, LambdaArgType>,
+                "sycl::parallel_for(sycl::range) kernel must have the "
+                "first argument of sycl::item type, or of a type which is "
+                "implicitly convertible from sycl::item");
+
+  using RefLambdaArgType = std::add_lvalue_reference_t<LambdaArgType>;
+  static_assert(
+      (std::is_invocable_v<KernelType, RefLambdaArgType>),
+      "SYCL kernel lambda/functor has an unexpected signature, it should be "
+      "invocable with sycl::item");
+
+  // Range rounding can be disabled by the user.
+  // Range rounding is supported only for newer SYCL standards.
+#if !defined(__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__) &&                  \
+    SYCL_LANGUAGE_VERSION >= 202012L
+  auto [RoundedRange, HasRoundedRange] =
+      detail::getRoundedRange(Range, Queue.get_device());
+  if (HasRoundedRange) {
+    using NameWT = typename detail::get_kernel_wrapper_name_t<NameT>::name;
+    auto Wrapper =
+        detail::getRangeRoundedKernelLambda<NameWT, TransformedArgType, Dims>(
+            KernelFunc, Range);
+
+    using KTypeWrapper = decltype(Wrapper);
+    using KName = std::conditional_t<std::is_same<KernelType, NameT>::value,
+                                     KTypeWrapper, NameWT>;
+#ifndef __SYCL_DEVICE_ONLY__
+    // We are executing over the rounded range, but there are still
+    // items/ids that are are constructed in ther range rounded
+    // kernel use items/ids in the user range, which means that
+    // __SYCL_ASSUME_INT can still be violated. So check the bounds
+    // of the user range, instead of the rounded range.
+    detail::checkValueRange<Dims>(Range);
+#endif
+    return submit_kernel_direct<detail::WrapAs::parallel_for,
+                                TransformedArgType, KName, EventNeeded,
+                                PropertiesT, KTypeWrapper, Dims>(
+        Queue, detail::nd_range_view(Range), std::move(Wrapper), DepEvents,
+        Props, CodeLoc);
+  } else
+#endif // !__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ &&
+       // SYCL_LANGUAGE_VERSION >= 202012L
+  {
+#ifndef __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__
+#ifndef __SYCL_DEVICE_ONLY__
+    detail::checkValueRange<Dims>(Range);
+#endif
+    return submit_kernel_direct<detail::WrapAs::parallel_for,
+                                TransformedArgType, NameT, EventNeeded,
+                                PropertiesT, KernelTypeUniversalRef, Dims>(
+        Queue, detail::nd_range_view(Range),
+        std::forward<KernelTypeUniversalRef>(KernelFunc), DepEvents, Props,
+        CodeLoc);
+
+#else
+    (void)Range;
+    (void)Props;
+    (void)KernelFunc;
+#endif // __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__
+  }
+}
+
+template <typename KernelName, bool EventNeeded, typename PropertiesT,
+          typename KernelTypeUniversalRef>
+auto submit_kernel_direct_single_task(const queue &Queue,
+                                      KernelTypeUniversalRef &&KernelFunc,
+                                      sycl::span<const event> DepEvents,
+                                      const PropertiesT &Props,
+                                      const detail::code_location &CodeLoc) {
+
+  using KernelType =
+      std::remove_const_t<std::remove_reference_t<KernelTypeUniversalRef>>;
+  using NameT =
+      typename detail::get_kernel_name_t<KernelName, KernelType>::name;
+
+  return submit_kernel_direct<detail::WrapAs::single_task, void, NameT,
+                              EventNeeded, PropertiesT, KernelTypeUniversalRef,
+                              1>(
+      Queue, detail::nd_range_view(),
+      std::forward<KernelTypeUniversalRef>(KernelFunc), DepEvents, Props,
+      CodeLoc);
+}
+} // namespace detail
+
 } // namespace _V1
 } // namespace sycl
 
diff --git a/sycl/include/sycl/range_rounding.hpp b/sycl/include/sycl/range_rounding.hpp
new file mode 100644
index 0000000000000..cf54004db4d73
--- /dev/null
+++ b/sycl/include/sycl/range_rounding.hpp
@@ -0,0 +1,161 @@
+//==----------- range_rounding.hpp --- SYCL range rounding utils -----------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <sycl/detail/export.hpp>
+#include <sycl/detail/iostream_proxy.hpp>
+#include <sycl/device.hpp>
+#include <tuple>
+
+#include <stddef.h> // for size_t
+
+namespace sycl {
+inline namespace _V1 {
+
+namespace detail {
+
+void __SYCL_EXPORT GetRangeRoundingSettings(size_t &MinFactor,
+                                            size_t &GoodFactor,
+                                            size_t &MinRange);
+
+std::tuple<std::array<size_t, 3>, bool>
+    __SYCL_EXPORT getMaxWorkGroups_v2(const device &Device);
+
+bool __SYCL_EXPORT DisableRangeRounding();
+
+bool __SYCL_EXPORT RangeRoundingTrace();
+
+template <int Dims>
+std::tuple<range<Dims>, bool> getRoundedRange(range<Dims> UserRange,
+                                              const device &Device) {
+  range<Dims> RoundedRange = UserRange;
+  // Disable the rounding-up optimizations under these conditions:
+  // 1. The env var SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING is set.
+  // 2. The kernel is provided via an interoperability method (this uses a
+  // different code path).
+  // 3. The range is already a multiple of the rounding factor.
+  //
+  // Cases 2 and 3 could be supported with extra effort.
+  // As an optimization for the common case it is an
+  // implementation choice to not support those scenarios.
+  // Note that "this_item" is a free function, i.e. not tied to any
+  // specific id or item. When concurrent parallel_fors are executing
+  // on a device it is difficult to tell which parallel_for the call is
+  // being made from. One could replicate portions of the
+  // call-graph to make this_item calls kernel-specific but this is
+  // not considered worthwhile.
+
+  // Perform range rounding if rounding-up is enabled.
+  if (DisableRangeRounding())
+    return {range<Dims>{}, false};
+
+  // Range should be a multiple of this for reasonable performance.
+  size_t MinFactorX = 16;
+  // Range should be a multiple of this for improved performance.
+  size_t GoodFactor = 32;
+  // Range should be at least this to make rounding worthwhile.
+  size_t MinRangeX = 1024;
+
+  // Check if rounding parameters have been set through environment:
+  // SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=MinRound:PreferredRound:MinRange
+  GetRangeRoundingSettings(MinFactorX, GoodFactor, MinRangeX);
+
+  // In SYCL, each dimension of a global range size is specified by
+  // a size_t, which can be up to 64 bits.  All backends should be
+  // able to accept a kernel launch with a 32-bit global range size
+  // (i.e. do not throw an error).  The OpenCL CPU backend will
+  // accept every 64-bit global range, but the GPU backends will not
+  // generally accept every 64-bit global range.  So, when we get a
+  // non-32-bit global range, we wrap the old kernel in a new kernel
+  // that has each work item peform multiple invocations the old
+  // kernel in a 32-bit global range.
+  id<Dims> MaxNWGs = [&] {
+    auto [MaxWGs, HasMaxWGs] = getMaxWorkGroups_v2(Device);
+    if (!HasMaxWGs) {
+      id<Dims> Default;
+      for (int i = 0; i < Dims; ++i)
+        Default[i] = (std::numeric_limits<int32_t>::max)();
+      return Default;
+    }
+
+    id<Dims> IdResult;
+    size_t Limit = (std::numeric_limits<int>::max)();
+    for (int i = 0; i < Dims; ++i)
+      IdResult[i] = (std::min)(Limit, MaxWGs[Dims - i - 1]);
+    return IdResult;
+  }();
+  auto M = (std::numeric_limits<uint32_t>::max)();
+  range<Dims> MaxRange;
+  for (int i = 0; i < Dims; ++i) {
+    auto DesiredSize = MaxNWGs[i] * GoodFactor;
+    MaxRange[i] =
+        DesiredSize <= M ? DesiredSize : (M / GoodFactor) * GoodFactor;
+  }
+
+  bool DidAdjust = false;
+  auto Adjust = [&](int Dim, size_t Value) {
+    if (RangeRoundingTrace())
+      std::cout << "parallel_for range adjusted at dim " << Dim << " from "
+                << RoundedRange[Dim] << " to " << Value << std::endl;
+    RoundedRange[Dim] = Value;
+    DidAdjust = true;
+  };
+
+#ifdef __SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__
+  size_t GoodExpFactor = 1;
+  switch (Dims) {
+  case 1:
+    GoodExpFactor = 32; // Make global range multiple of {32}
+    break;
+  case 2:
+    GoodExpFactor = 16; // Make global range multiple of {16, 16}
+    break;
+  case 3:
+    GoodExpFactor = 8; // Make global range multiple of {8, 8, 8}
+    break;
+  }
+
+  // Check if rounding parameters have been set through environment:
+  // SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=MinRound:PreferredRound:MinRange
+  GetRangeRoundingSettings(MinFactorX, GoodExpFactor, MinRangeX);
+
+  for (auto i = 0; i < Dims; ++i)
+    if (UserRange[i] % GoodExpFactor) {
+      Adjust(i, ((UserRange[i] / GoodExpFactor) + 1) * GoodExpFactor);
+    }
+#else
+  // Perform range rounding if there are sufficient work-items to
+  // need rounding and the user-specified range is not a multiple of
+  // a "good" value.
+  if (RoundedRange[0] % MinFactorX != 0 && RoundedRange[0] >= MinRangeX) {
+    // It is sufficient to round up just the first dimension.
+    // Multiplying the rounded-up value of the first dimension
+    // by the values of the remaining dimensions (if any)
+    // will yield a rounded-up value for the total range.
+    Adjust(0, ((RoundedRange[0] + GoodFactor - 1) / GoodFactor) * GoodFactor);
+  }
+#endif // __SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__
+#ifdef __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__
+  // If we are forcing range rounding kernels to be used, we always want the
+  // rounded range kernel to be generated, even if rounding isn't needed
+  DidAdjust = true;
+#endif // __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__
+
+  for (int i = 0; i < Dims; ++i)
+    if (RoundedRange[i] > MaxRange[i])
+      Adjust(i, MaxRange[i]);
+
+  if (!DidAdjust)
+    return {range<Dims>{}, false};
+  return {RoundedRange, true};
+}
+
+} // namespace detail
+} // namespace _V1
+} // namespace sycl
\ No newline at end of file
diff --git a/sycl/source/detail/ndrange_desc.hpp b/sycl/source/detail/ndrange_desc.hpp
index 30108d729db31..5426f2e09d43f 100644
--- a/sycl/source/detail/ndrange_desc.hpp
+++ b/sycl/source/detail/ndrange_desc.hpp
@@ -32,51 +32,62 @@ class NDRDescT {
   NDRDescT(const NDRDescT &Desc) = default;
   NDRDescT(NDRDescT &&Desc) = default;
 
-  template <int Dims_>
-  NDRDescT(sycl::range<Dims_> N, bool SetNumWorkGroups) : Dims{size_t(Dims_)} {
+  NDRDescT(const size_t *N, bool SetNumWorkGroups, int DimsVal)
+      : Dims{size_t(DimsVal)} {
     if (SetNumWorkGroups) {
-      for (size_t I = 0; I < Dims_; ++I) {
+      for (size_t I = 0; I < Dims; ++I) {
         NumWorkGroups[I] = N[I];
       }
     } else {
-      for (size_t I = 0; I < Dims_; ++I) {
+      for (size_t I = 0; I < Dims; ++I) {
         GlobalSize[I] = N[I];
       }
 
-      for (int I = Dims_; I < 3; ++I) {
+      for (int I = Dims; I < 3; ++I) {
         GlobalSize[I] = 1;
       }
     }
   }
 
   template <int Dims_>
-  NDRDescT(sycl::range<Dims_> NumWorkItems, sycl::range<Dims_> LocalSizes,
-           sycl::id<Dims_> Offset)
-      : Dims{size_t(Dims_)} {
-    for (size_t I = 0; I < Dims_; ++I) {
+  NDRDescT(sycl::range<Dims_> N, bool SetNumWorkGroups)
+      : NDRDescT(&(N[0]), SetNumWorkGroups, Dims_) {}
+
+  NDRDescT(const size_t *NumWorkItems, const size_t *LocalSizes,
+           const size_t *Offset, int DimsVal)
+      : Dims{size_t(DimsVal)} {
+    for (size_t I = 0; I < Dims; ++I) {
       GlobalSize[I] = NumWorkItems[I];
       LocalSize[I] = LocalSizes[I];
       GlobalOffset[I] = Offset[I];
     }
 
-    for (int I = Dims_; I < 3; ++I) {
+    for (int I = Dims; I < 3; ++I) {
       LocalSize[I] = LocalSizes[0] ? 1 : 0;
     }
 
-    for (int I = Dims_; I < 3; ++I) {
+    for (int I = Dims; I < 3; ++I) {
       GlobalSize[I] = 1;
     }
   }
 
   template <int Dims_>
-  NDRDescT(sycl::range<Dims_> NumWorkItems, sycl::id<Dims_> Offset)
-      : Dims{size_t(Dims_)} {
-    for (size_t I = 0; I < Dims_; ++I) {
+  NDRDescT(sycl::range<Dims_> NumWorkItems, sycl::range<Dims_> LocalSizes,
+           sycl::id<Dims_> Offset)
+      : NDRDescT(&(NumWorkItems[0]), &(LocalSizes[0]), &(Offset[0]), Dims_) {}
+
+  NDRDescT(const size_t *NumWorkItems, const size_t *Offset, int DimsVal)
+      : Dims{size_t(DimsVal)} {
+    for (size_t I = 0; I < Dims; ++I) {
       GlobalSize[I] = NumWorkItems[I];
       GlobalOffset[I] = Offset[I];
     }
   }
 
+  template <int Dims_>
+  NDRDescT(sycl::range<Dims_> NumWorkItems, sycl::id<Dims_> Offset)
+      : NDRDescT(&(NumWorkItems[0]), &(Offset[0]), Dims_) {}
+
   template <int Dims_>
   NDRDescT(sycl::nd_range<Dims_> ExecutionRange)
       : NDRDescT(ExecutionRange.get_global_range(),
diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp
index 5b7bfb5e90fae..1e01acce7f457 100644
--- a/sycl/source/detail/queue_impl.cpp
+++ b/sycl/source/detail/queue_impl.cpp
@@ -12,6 +12,7 @@
 #include <detail/queue_impl.hpp>
 #include <sycl/context.hpp>
 #include <sycl/detail/common.hpp>
+#include <sycl/detail/nd_range_view.hpp>
 #include <sycl/detail/ur.hpp>
 #include <sycl/device.hpp>
 
@@ -126,6 +127,18 @@ prepareSYCLEventAssociatedWithQueue(detail::queue_impl &QueueImpl) {
   return detail::createSyclObjFromImpl<event>(EventImpl);
 }
 
+sycl::detail::NDRDescT nd_range_view::toNDRDescT() const {
+  if (!MGlobalSize) {
+    return NDRDescT(nd_range<1>{1, 1});
+  } else if (MLocalSize) {
+    return NDRDescT(MGlobalSize, MLocalSize, MOffset, MDims);
+  } else if (MOffset) {
+    return NDRDescT(MGlobalSize, MOffset, MDims);
+  } else {
+    return NDRDescT(MGlobalSize, MSetNumWorkGroups, MDims);
+  }
+}
+
 const std::vector<event> &
 queue_impl::getExtendDependencyList(const std::vector<event> &DepEvents,
                                     std::vector<event> &MutableVec,
@@ -637,6 +650,8 @@ queue_impl::submit_direct(bool CallerNeedsEvent,
   detail::CG::StorageInitHelper CGData;
   std::unique_lock<std::mutex> Lock(MMutex);
 
+  NestedCallsTracker tracker;
+
   // Used by queue_empty() and getLastEvent()
   MEmpty.store(false, std::memory_order_release);
 
diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp
index 7c793b619ecab..87667c4ff1ef1 100644
--- a/sycl/source/detail/queue_impl.hpp
+++ b/sycl/source/detail/queue_impl.hpp
@@ -360,29 +360,29 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
     return createSyclObjFromImpl<event>(ResEvent);
   }
 
-  template <int Dims>
   event submit_kernel_direct_with_event(
-      const nd_range<Dims> &Range, detail::HostKernelRefBase &HostKernel,
+      const detail::nd_range_view &RangeView,
+      detail::HostKernelRefBase &HostKernel,
       detail::DeviceKernelInfo *DeviceKernelInfo,
       sycl::span<const event> DepEvents,
       const detail::KernelPropertyHolderStructTy &Props,
       const detail::code_location &CodeLoc, bool IsTopCodeLoc) {
     detail::EventImplPtr EventImpl = submit_kernel_direct_impl(
-        NDRDescT{Range}, HostKernel, DeviceKernelInfo,
+        RangeView.toNDRDescT(), HostKernel, DeviceKernelInfo,
         /*CallerNeedsEvent*/ true, DepEvents, Props, CodeLoc, IsTopCodeLoc);
     return createSyclObjFromImpl<event>(EventImpl);
   }
 
-  template <int Dims>
   void submit_kernel_direct_without_event(
-      const nd_range<Dims> &Range, detail::HostKernelRefBase &HostKernel,
+      const detail::nd_range_view &RangeView,
+      detail::HostKernelRefBase &HostKernel,
       detail::DeviceKernelInfo *DeviceKernelInfo,
       sycl::span<const event> DepEvents,
       const detail::KernelPropertyHolderStructTy &Props,
       const detail::code_location &CodeLoc, bool IsTopCodeLoc) {
-    submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo,
-                              /*CallerNeedsEvent*/ false, DepEvents, Props,
-                              CodeLoc, IsTopCodeLoc);
+    submit_kernel_direct_impl(
+        RangeView.toNDRDescT(), HostKernel, DeviceKernelInfo,
+        /*CallerNeedsEvent*/ false, DepEvents, Props, CodeLoc, IsTopCodeLoc);
   }
 
   void submit_without_event(const detail::type_erased_cgfo_ty &CGF,
diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp
index a58b4b234ab3a..d096f814dd2e4 100644
--- a/sycl/source/queue.cpp
+++ b/sycl/source/queue.cpp
@@ -20,8 +20,8 @@
 namespace sycl {
 inline namespace _V1 {
 
-#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
 namespace detail {
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
 SubmissionInfo::SubmissionInfo()
     : impl{std::make_shared<SubmissionInfoImpl>()} {}
 
@@ -58,10 +58,41 @@ const ext::oneapi::experimental::event_mode_enum &
 SubmissionInfo::EventMode() const {
   return impl->MEventMode;
 }
-} // namespace detail
 
 #endif // __INTEL_PREVIEW_BREAKING_CHANGES
 
+void GetRangeRoundingSettings(size_t &MinFactor, size_t &GoodFactor,
+                              size_t &MinRange) {
+  SYCLConfig<SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS>::GetSettings(
+      MinFactor, GoodFactor, MinRange);
+}
+
+std::tuple<std::array<size_t, 3>, bool>
+getMaxWorkGroups_v2(const device &Device) {
+  std::array<size_t, 3> UrResult = {};
+  auto &DeviceImpl = getSyclObjImpl(Device);
+
+  auto Ret = DeviceImpl->getAdapter().call_nocheck<UrApiKind::urDeviceGetInfo>(
+      DeviceImpl->getHandleRef(),
+      UrInfoCode<
+          ext::oneapi::experimental::info::device::max_work_groups<3>>::value,
+      sizeof(UrResult), &UrResult, nullptr);
+  if (Ret == UR_RESULT_SUCCESS) {
+    return {UrResult, true};
+  }
+  return {std::array<size_t, 3>{0, 0, 0}, false};
+}
+
+bool DisableRangeRounding() {
+  return SYCLConfig<SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING>::get();
+}
+
+bool RangeRoundingTrace() {
+  return SYCLConfig<SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE>::get();
+}
+
+} // namespace detail
+
 queue::queue(const context &SyclContext, const device_selector &DeviceSelector,
              const async_handler &AsyncHandler, const property_list &PropList) {
   const std::vector<device> Devs = SyclContext.get_devices();
@@ -463,80 +494,30 @@ void queue::ext_oneapi_set_external_event(const event &external_event) {
 
 const property_list &queue::getPropList() const { return impl->getPropList(); }
 
-template <int Dims>
 event submit_kernel_direct_with_event_impl(
-    const queue &Queue, const nd_range<Dims> &Range,
+    const queue &Queue, const detail::nd_range_view &RangeView,
     detail::HostKernelRefBase &HostKernel,
     detail::DeviceKernelInfo *DeviceKernelInfo,
     sycl::span<const event> DepEvents,
     const detail::KernelPropertyHolderStructTy &Props,
     const detail::code_location &CodeLoc, bool IsTopCodeLoc) {
   return getSyclObjImpl(Queue)->submit_kernel_direct_with_event(
-      Range, HostKernel, DeviceKernelInfo, DepEvents, Props, CodeLoc,
+      RangeView, HostKernel, DeviceKernelInfo, DepEvents, Props, CodeLoc,
       IsTopCodeLoc);
 }
 
-template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<1>(
-    const queue &Queue, const nd_range<1> &Range,
-    detail::HostKernelRefBase &HostKernel,
-    detail::DeviceKernelInfo *DeviceKernelInfo,
-    sycl::span<const event> DepEvents,
-    const detail::KernelPropertyHolderStructTy &Props,
-    const detail::code_location &CodeLoc, bool IsTopCodeLoc);
-
-template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<2>(
-    const queue &Queue, const nd_range<2> &Range,
-    detail::HostKernelRefBase &HostKernel,
-    detail::DeviceKernelInfo *DeviceKernelInfo,
-    sycl::span<const event> DepEvents,
-    const detail::KernelPropertyHolderStructTy &Props,
-    const detail::code_location &CodeLoc, bool IsTopCodeLoc);
-
-template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<3>(
-    const queue &Queue, const nd_range<3> &Range,
-    detail::HostKernelRefBase &HostKernel,
-    detail::DeviceKernelInfo *DeviceKernelInfo,
-    sycl::span<const event> DepEvents,
-    const detail::KernelPropertyHolderStructTy &Props,
-    const detail::code_location &CodeLoc, bool IsTopCodeLoc);
-
-template <int Dims>
 void submit_kernel_direct_without_event_impl(
-    const queue &Queue, const nd_range<Dims> &Range,
+    const queue &Queue, const detail::nd_range_view &RangeView,
     detail::HostKernelRefBase &HostKernel,
     detail::DeviceKernelInfo *DeviceKernelInfo,
     sycl::span<const event> DepEvents,
     const detail::KernelPropertyHolderStructTy &Props,
     const detail::code_location &CodeLoc, bool IsTopCodeLoc) {
   getSyclObjImpl(Queue)->submit_kernel_direct_without_event(
-      Range, HostKernel, DeviceKernelInfo, DepEvents, Props, CodeLoc,
+      RangeView, HostKernel, DeviceKernelInfo, DepEvents, Props, CodeLoc,
       IsTopCodeLoc);
 }
 
-template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<1>(
-    const queue &Queue, const nd_range<1> &Range,
-    detail::HostKernelRefBase &HostKernel,
-    detail::DeviceKernelInfo *DeviceKernelInfo,
-    sycl::span<const event> DepEvents,
-    const detail::KernelPropertyHolderStructTy &Props,
-    const detail::code_location &CodeLoc, bool IsTopCodeLoc);
-
-template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<2>(
-    const queue &Queue, const nd_range<2> &Range,
-    detail::HostKernelRefBase &HostKernel,
-    detail::DeviceKernelInfo *DeviceKernelInfo,
-    sycl::span<const event> DepEvents,
-    const detail::KernelPropertyHolderStructTy &Props,
-    const detail::code_location &CodeLoc, bool IsTopCodeLoc);
-
-template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<3>(
-    const queue &Queue, const nd_range<3> &Range,
-    detail::HostKernelRefBase &HostKernel,
-    detail::DeviceKernelInfo *DeviceKernelInfo,
-    sycl::span<const event> DepEvents,
-    const detail::KernelPropertyHolderStructTy &Props,
-    const detail::code_location &CodeLoc, bool IsTopCodeLoc);
-
 } // namespace _V1
 } // namespace sycl
 
diff --git a/sycl/test-e2e/Basic/test_num_kernel_copies.cpp b/sycl/test-e2e/Basic/test_num_kernel_copies.cpp
index 82f8477a10962..d0770a56696be 100644
--- a/sycl/test-e2e/Basic/test_num_kernel_copies.cpp
+++ b/sycl/test-e2e/Basic/test_num_kernel_copies.cpp
@@ -23,7 +23,8 @@ int main(int argc, char **argv) {
 
   kernel<0> krn0;
   q.parallel_for(sycl::range<1>{1}, krn0);
-  assert(copy_count == 1);
+  // The kernel is copied on the scheduler-based path only
+  assert(copy_count == 0);
   assert(move_count == 0);
   copy_count = 0;
 
diff --git a/sycl/test/abi/layout_nd_range_view.cpp b/sycl/test/abi/layout_nd_range_view.cpp
new file mode 100644
index 0000000000000..2a1d0693dab26
--- /dev/null
+++ b/sycl/test/abi/layout_nd_range_view.cpp
@@ -0,0 +1,19 @@
+// RUN: %clangxx -fsycl -c -fno-color-diagnostics -Xclang -fdump-record-layouts %s -o %t.out | FileCheck %s
+// RUN: %clangxx -fsycl -fsycl-device-only -c -fno-color-diagnostics -Xclang -fdump-record-layouts %s -o %t.out | FileCheck %s
+// REQUIRES: linux
+// UNSUPPORTED: libcxx
+
+// clang-format off
+
+#include <sycl/detail/nd_range_view.hpp>
+
+
+SYCL_EXTERNAL void nd_range_view(sycl::detail::nd_range_view) {}
+// CHECK: 0 | class sycl::detail::nd_range_view
+// CHECK-NEXT: 0 |   const size_t * MGlobalSize
+// CHECK-NEXT: 8 |   const size_t * MLocalSize
+// CHECK-NEXT: 16 |  const size_t * MOffset
+// CHECK-NEXT: 24 |  _Bool MSetNumWorkGroups
+// CHECK-NEXT: 32 |  size_t MDims
+// CHECK-NEXT: | [sizeof=40, dsize=40, align=8,
+// CHECK-NEXT: |  nvsize=40, nvalign=8]
diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump
index e42759431d374..9b7a17a78a54a 100644
--- a/sycl/test/abi/sycl_symbols_linux.dump
+++ b/sycl/test/abi/sycl_symbols_linux.dump
@@ -2985,12 +2985,8 @@ _ZN4sycl3_V121__isgreaterequal_implEdd
 _ZN4sycl3_V121__isgreaterequal_implEff
 _ZN4sycl3_V122accelerator_selector_vERKNS0_6deviceE
 _ZN4sycl3_V128verifyUSMAllocatorPropertiesERKNS0_13property_listE
-_ZN4sycl3_V136submit_kernel_direct_with_event_implILi1EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoENS0_4spanIKS2_Lm18446744073709551615EEERKNSA_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSK_5intel12experimental12cache_configENSM_17use_root_sync_keyENSM_23work_group_progress_keyENSM_22sub_group_progress_keyENSM_22work_item_progress_keyENSM_4cuda12cluster_sizeILi1EEENSW_ILi2EEENSW_ILi3EEEEEERKNSA_13code_locationEb
-_ZN4sycl3_V136submit_kernel_direct_with_event_implILi2EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoENS0_4spanIKS2_Lm18446744073709551615EEERKNSA_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSK_5intel12experimental12cache_configENSM_17use_root_sync_keyENSM_23work_group_progress_keyENSM_22sub_group_progress_keyENSM_22work_item_progress_keyENSM_4cuda12cluster_sizeILi1EEENSW_ILi2EEENSW_ILi3EEEEEERKNSA_13code_locationEb
-_ZN4sycl3_V136submit_kernel_direct_with_event_implILi3EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoENS0_4spanIKS2_Lm18446744073709551615EEERKNSA_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSK_5intel12experimental12cache_configENSM_17use_root_sync_keyENSM_23work_group_progress_keyENSM_22sub_group_progress_keyENSM_22work_item_progress_keyENSM_4cuda12cluster_sizeILi1EEENSW_ILi2EEENSW_ILi3EEEEEERKNSA_13code_locationEb
-_ZN4sycl3_V139submit_kernel_direct_without_event_implILi1EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoENS0_4spanIKNS0_5eventELm18446744073709551615EEERKNS9_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSK_5intel12experimental12cache_configENSM_17use_root_sync_keyENSM_23work_group_progress_keyENSM_22sub_group_progress_keyENSM_22work_item_progress_keyENSM_4cuda12cluster_sizeILi1EEENSW_ILi2EEENSW_ILi3EEEEEERKNS9_13code_locationEb
-_ZN4sycl3_V139submit_kernel_direct_without_event_implILi2EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoENS0_4spanIKNS0_5eventELm18446744073709551615EEERKNS9_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSK_5intel12experimental12cache_configENSM_17use_root_sync_keyENSM_23work_group_progress_keyENSM_22sub_group_progress_keyENSM_22work_item_progress_keyENSM_4cuda12cluster_sizeILi1EEENSW_ILi2EEENSW_ILi3EEEEEERKNS9_13code_locationEb
-_ZN4sycl3_V139submit_kernel_direct_without_event_implILi3EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoENS0_4spanIKNS0_5eventELm18446744073709551615EEERKNS9_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSK_5intel12experimental12cache_configENSM_17use_root_sync_keyENSM_23work_group_progress_keyENSM_22sub_group_progress_keyENSM_22work_item_progress_keyENSM_4cuda12cluster_sizeILi1EEENSW_ILi2EEENSW_ILi3EEEEEERKNS9_13code_locationEb
+_ZN4sycl3_V136submit_kernel_direct_with_event_implERKNS0_5queueERKNS0_6detail13nd_range_viewERNS4_17HostKernelRefBaseEPNS4_16DeviceKernelInfoENS0_4spanIKNS0_5eventELm18446744073709551615EEERKNS4_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSI_5intel12experimental12cache_configENSK_17use_root_sync_keyENSK_23work_group_progress_keyENSK_22sub_group_progress_keyENSK_22work_item_progress_keyENSK_4cuda12cluster_sizeILi1EEENSU_ILi2EEENSU_ILi3EEEEEERKNS4_13code_locationEb
+_ZN4sycl3_V139submit_kernel_direct_without_event_implERKNS0_5queueERKNS0_6detail13nd_range_viewERNS4_17HostKernelRefBaseEPNS4_16DeviceKernelInfoENS0_4spanIKNS0_5eventELm18446744073709551615EEERKNS4_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSI_5intel12experimental12cache_configENSK_17use_root_sync_keyENSK_23work_group_progress_keyENSK_22sub_group_progress_keyENSK_22work_item_progress_keyENSK_4cuda12cluster_sizeILi1EEENSU_ILi2EEENSU_ILi3EEEEEERKNS4_13code_locationEb
 _ZN4sycl3_V13ext5intel12experimental9pipe_base13get_pipe_nameB5cxx11EPKv
 _ZN4sycl3_V13ext5intel12experimental9pipe_base17wait_non_blockingERKNS0_5eventE
 _ZN4sycl3_V13ext5intel12experimental9pipe_base18get_pipe_name_implEPKv
@@ -3318,11 +3314,13 @@ _ZN4sycl3_V16detail17HostProfilingInfo3endEv
 _ZN4sycl3_V16detail17HostProfilingInfo5startEv
 _ZN4sycl3_V16detail17device_global_map3addEPKvPKc
 _ZN4sycl3_V16detail17reduComputeWGSizeEmmRm
+_ZN4sycl3_V16detail18RangeRoundingTraceEv
 _ZN4sycl3_V16detail18get_kernel_id_implENS1_11string_viewE
 _ZN4sycl3_V16detail18make_kernel_bundleEmRKNS0_7contextENS0_12bundle_stateENS0_7backendE
 _ZN4sycl3_V16detail18make_kernel_bundleEmRKNS0_7contextEbNS0_12bundle_stateENS0_7backendE
 _ZN4sycl3_V16detail18stringifyErrorCodeEi
 _ZN4sycl3_V16detail19getDeviceKernelInfoERKNS1_27compile_time_kernel_info_v123CompileTimeKernelInfoTyE
+_ZN4sycl3_V16detail19getMaxWorkGroups_v2ERKNS0_6deviceE
 _ZN4sycl3_V16detail19kernel_bundle_plain21ext_oneapi_get_kernelENS1_11string_viewE
 _ZN4sycl3_V16detail19kernel_bundle_plain21ext_oneapi_has_kernelENS1_11string_viewE
 _ZN4sycl3_V16detail19kernel_bundle_plain28ext_oneapi_has_device_globalENS1_11string_viewE
@@ -3330,6 +3328,7 @@ _ZN4sycl3_V16detail19kernel_bundle_plain30ext_oneapi_get_raw_kernel_nameENS1_11s
 _ZN4sycl3_V16detail19kernel_bundle_plain32set_specialization_constant_implEPKcPvm
 _ZN4sycl3_V16detail19kernel_bundle_plain33ext_oneapi_get_device_global_sizeENS1_11string_viewE
 _ZN4sycl3_V16detail19kernel_bundle_plain36ext_oneapi_get_device_global_addressENS1_11string_viewERKNS0_6deviceE
+_ZN4sycl3_V16detail20DisableRangeRoundingEv
 _ZN4sycl3_V16detail20associateWithHandlerERNS0_7handlerEPNS1_16AccessorBaseHostENS0_6access6targetE
 _ZN4sycl3_V16detail20associateWithHandlerERNS0_7handlerEPNS1_28SampledImageAccessorBaseHostENS0_12image_targetE
 _ZN4sycl3_V16detail20associateWithHandlerERNS0_7handlerEPNS1_30UnsampledImageAccessorBaseHostENS0_12image_targetE
@@ -3355,6 +3354,7 @@ _ZN4sycl3_V16detail22reduGetPreferredWGSizeERNS0_7handlerEm
 _ZN4sycl3_V16detail22reduGetPreferredWGSizeERSt10shared_ptrINS1_10queue_implEEm
 _ZN4sycl3_V16detail22removeDuplicateDevicesERKSt6vectorINS0_6deviceESaIS3_EE
 _ZN4sycl3_V16detail23constructorNotificationEPvS2_NS0_6access6targetENS3_4modeERKNS1_13code_locationE
+_ZN4sycl3_V16detail24GetRangeRoundingSettingsERmS2_S2_
 _ZN4sycl3_V16detail24find_device_intersectionERKSt6vectorINS0_13kernel_bundleILNS0_12bundle_stateE1EEESaIS5_EE
 _ZN4sycl3_V16detail26createKernelNameBasedCacheEv
 _ZN4sycl3_V16detail26isDeviceGlobalUsedInKernelEPKv
diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump
index 1db98f1b5cf9d..0c071dd4d33c6 100644
--- a/sycl/test/abi/sycl_symbols_windows.dump
+++ b/sycl/test/abi/sycl_symbols_windows.dump
@@ -286,12 +286,6 @@
 ??$is_image_handle_supported@Usampled_image_handle@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA_NAEBUimage_descriptor@01234@W4image_memory_handle_type@01234@AEBVqueue@34@@Z
 ??$is_image_handle_supported@Uunsampled_image_handle@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA_NAEBUimage_descriptor@01234@W4image_memory_handle_type@01234@AEBVdevice@34@AEBVcontext@34@@Z
 ??$is_image_handle_supported@Uunsampled_image_handle@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA_NAEBUimage_descriptor@01234@W4image_memory_handle_type@01234@AEBVqueue@34@@Z
-??$submit_kernel_direct_with_event_impl@$00@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$00@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@601@V?$span@$$CBVevent@_V1@sycl@@$0?0@01@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@601@AEBUcode_location@601@_N@Z
-??$submit_kernel_direct_with_event_impl@$01@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$01@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@601@V?$span@$$CBVevent@_V1@sycl@@$0?0@01@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@601@AEBUcode_location@601@_N@Z
-??$submit_kernel_direct_with_event_impl@$02@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$02@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@601@V?$span@$$CBVevent@_V1@sycl@@$0?0@01@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@601@AEBUcode_location@601@_N@Z
-??$submit_kernel_direct_without_event_impl@$00@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$00@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@501@V?$span@$$CBVevent@_V1@sycl@@$0?0@01@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@501@AEBUcode_location@501@_N@Z
-??$submit_kernel_direct_without_event_impl@$01@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$01@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@501@V?$span@$$CBVevent@_V1@sycl@@$0?0@01@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@501@AEBUcode_location@501@_N@Z
-??$submit_kernel_direct_without_event_impl@$02@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$02@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@501@V?$span@$$CBVevent@_V1@sycl@@$0?0@01@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@501@AEBUcode_location@501@_N@Z
 ??$update_nd_range@$00@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$00@45@@Z
 ??$update_nd_range@$01@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$01@45@@Z
 ??$update_nd_range@$02@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$02@45@@Z
@@ -4484,6 +4478,8 @@
 ?submit_impl_and_postprocess@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@AEBUcode_location@detail@23@AEBV?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@6@_N@Z
 ?submit_impl_and_postprocess@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@V123@AEBUcode_location@detail@23@AEBV?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@6@@Z
 ?submit_impl_and_postprocess@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@V123@AEBUcode_location@detail@23@AEBV?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@6@_N@Z
+?submit_kernel_direct_with_event_impl@_V1@sycl@@YA?AVevent@12@AEBVqueue@12@AEBVnd_range_view@detail@12@AEAVHostKernelRefBase@612@PEAVDeviceKernelInfo@612@V?$span@$$CBVevent@_V1@sycl@@$0?0@12@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@612@AEBUcode_location@612@_N@Z
+?submit_kernel_direct_without_event_impl@_V1@sycl@@YAXAEBVqueue@12@AEBVnd_range_view@detail@12@AEAVHostKernelRefBase@512@PEAVDeviceKernelInfo@512@V?$span@$$CBVevent@_V1@sycl@@$0?0@12@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@512@AEBUcode_location@512@_N@Z
 ?submit_with_event_impl@queue@_V1@sycl@@AEAA?AVevent@23@AEBVtype_erased_cgfo_ty@detail@23@AEBVSubmissionInfo@623@AEBUcode_location@623@_N@Z
 ?submit_with_event_impl@queue@_V1@sycl@@AEAA?AVevent@23@AEBVtype_erased_cgfo_ty@detail@23@AEBVSubmissionInfo@v1@623@AEBUcode_location@623@_N@Z
 ?submit_with_event_impl@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@AEBVSubmissionInfo@detail@23@AEBUcode_location@823@_N@Z
diff --git a/sycl/test/abi/symbol_size_alignment.cpp b/sycl/test/abi/symbol_size_alignment.cpp
index 63286da1b9786..36bf933ce6b28 100644
--- a/sycl/test/abi/symbol_size_alignment.cpp
+++ b/sycl/test/abi/symbol_size_alignment.cpp
@@ -6,6 +6,7 @@
 
 #include <sycl/accessor.hpp>
 #include <sycl/buffer.hpp>
+#include <sycl/detail/nd_range_view.hpp>
 #include <sycl/device.hpp>
 #include <sycl/device_event.hpp>
 #include <sycl/device_selector.hpp>
@@ -58,6 +59,7 @@ int main() {
 #endif
   check<image<1>, 16, 8>();
   check<kernel, 16, 8>();
+  check<detail::nd_range_view, 40, 8>();
   check<platform, 16, 8>();
 #ifdef __SYCL_DEVICE_ONLY__
   check<private_memory<int, 1>, 4, 4>();
diff --git a/sycl/test/include_deps/sycl_detail_core.hpp.cpp b/sycl/test/include_deps/sycl_detail_core.hpp.cpp
index 5174c8a29bb6b..2d0cc14cd5c26 100644
--- a/sycl/test/include_deps/sycl_detail_core.hpp.cpp
+++ b/sycl/test/include_deps/sycl_detail_core.hpp.cpp
@@ -98,6 +98,7 @@
 // CHECK-NEXT: nd_item.hpp
 // CHECK-NEXT: nd_range.hpp
 // CHECK-NEXT: detail/id_queries_fit_in_int.hpp
+// CHECK-NEXT: detail/nd_range_view.hpp
 // CHECK-NEXT: detail/optional.hpp
 // CHECK-NEXT: device.hpp
 // CHECK-NEXT: detail/string_view.hpp
@@ -151,6 +152,7 @@
 // CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp
 // CHECK-NEXT: kernel.hpp
 // CHECK-NEXT: sampler.hpp
+// CHECK-NEXT: range_rounding.hpp
 // CHECK-NEXT: sycl_span.hpp
 // CHECK-NEXT: feature_test.hpp
 // CHECK-EMPTY:
diff --git a/sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp
index 76570a99bdda7..846bd0ed4a436 100644
--- a/sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp
+++ b/sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp
@@ -102,6 +102,7 @@
 // CHECK-NEXT: nd_item.hpp
 // CHECK-NEXT: nd_range.hpp
 // CHECK-NEXT: detail/id_queries_fit_in_int.hpp
+// CHECK-NEXT: detail/nd_range_view.hpp
 // CHECK-NEXT: detail/optional.hpp
 // CHECK-NEXT: device.hpp
 // CHECK-NEXT: detail/string_view.hpp
@@ -155,5 +156,6 @@
 // CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp
 // CHECK-NEXT: kernel.hpp
 // CHECK-NEXT: sampler.hpp
+// CHECK-NEXT: range_rounding.hpp
 // CHECK-NEXT: sycl_span.hpp
 // CHECK-EMPTY:
diff --git a/sycl/test/include_deps/sycl_khr_includes_reduction.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_reduction.hpp.cpp
index 50abdf954cca0..e6b167d5102a0 100644
--- a/sycl/test/include_deps/sycl_khr_includes_reduction.hpp.cpp
+++ b/sycl/test/include_deps/sycl_khr_includes_reduction.hpp.cpp
@@ -180,8 +180,10 @@
 // CHECK-NEXT: kernel.hpp
 // CHECK-NEXT: sampler.hpp
 // CHECK-NEXT: queue.hpp
+// CHECK-NEXT: detail/nd_range_view.hpp
 // CHECK-NEXT: detail/optional.hpp
 // CHECK-NEXT: ext/oneapi/experimental/event_mode_property.hpp
+// CHECK-NEXT: range_rounding.hpp
 // CHECK-NEXT: sycl_span.hpp
 // CHECK-NEXT: usm.hpp
 // CHECK-NEXT: usm/usm_pointer_info.hpp
diff --git a/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp
index f2feef5bd9871..b24f3577906ae 100644
--- a/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp
+++ b/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp
@@ -122,6 +122,7 @@
 // CHECK-NEXT: nd_item.hpp
 // CHECK-NEXT: nd_range.hpp
 // CHECK-NEXT: detail/id_queries_fit_in_int.hpp
+// CHECK-NEXT: detail/nd_range_view.hpp
 // CHECK-NEXT: detail/optional.hpp
 // CHECK-NEXT: event.hpp
 // CHECK-NEXT: exception_list.hpp
@@ -170,6 +171,7 @@
 // CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp
 // CHECK-NEXT: kernel.hpp
 // CHECK-NEXT: sampler.hpp
+// CHECK-NEXT: range_rounding.hpp
 // CHECK-NEXT: sycl_span.hpp
 // CHECK-NEXT: usm/usm_pointer_info.hpp
 // CHECK-NEXT: usm/usm_allocator.hpp
diff --git a/sycl/unittests/scheduler/CMakeLists.txt b/sycl/unittests/scheduler/CMakeLists.txt
index afc0e185eb7c0..9041793ecdaf2 100644
--- a/sycl/unittests/scheduler/CMakeLists.txt
+++ b/sycl/unittests/scheduler/CMakeLists.txt
@@ -22,4 +22,5 @@ add_sycl_unittest(SchedulerTests OBJECT
     AccessorDefaultCtor.cpp
     HostTaskAndBarrier.cpp
     BarrierDependencies.cpp
+    NdRangeViewUsage.cpp
 )
diff --git a/sycl/unittests/scheduler/NdRangeViewUsage.cpp b/sycl/unittests/scheduler/NdRangeViewUsage.cpp
new file mode 100644
index 0000000000000..dd1e0659cb607
--- /dev/null
+++ b/sycl/unittests/scheduler/NdRangeViewUsage.cpp
@@ -0,0 +1,118 @@
+//==---- NdRangeViewUsage.cpp --- Check nd_range_view ------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include <detail/cg.hpp>
+#include <sycl/detail/nd_range_view.hpp>
+
+#include <gtest/gtest.h>
+
+template <int dims>
+void TestNdRangeView(sycl::range<dims> global, sycl::range<dims> local,
+                     sycl::id<dims> offset) {
+  {
+    sycl::nd_range<dims> nd_range{global, local, offset};
+    sycl::detail::nd_range_view r{nd_range};
+    ASSERT_EQ(r.MDims, size_t{dims});
+    for (int d = 0; d < dims; d++) {
+      ASSERT_EQ(r.MGlobalSize[d], global[d]);
+      ASSERT_EQ(r.MLocalSize[d], local[d]);
+      ASSERT_EQ(r.MOffset[d], offset[d]);
+    }
+
+    sycl::detail::NDRDescT NDRDesc = r.toNDRDescT();
+    ASSERT_EQ(NDRDesc.Dims, size_t{dims});
+    for (int d = 0; d < dims; d++) {
+      ASSERT_EQ(NDRDesc.GlobalSize[d], global[d]);
+      ASSERT_EQ(NDRDesc.LocalSize[d], local[d]);
+      ASSERT_EQ(NDRDesc.GlobalOffset[d], offset[d]);
+    }
+
+    for (int d = dims; d < 3; d++) {
+      ASSERT_EQ(NDRDesc.GlobalSize[d], 1UL);
+      ASSERT_EQ(NDRDesc.LocalSize[d], 1UL);
+    }
+  }
+  {
+    sycl::detail::nd_range_view r{global, offset};
+    ASSERT_EQ(r.MDims, size_t{dims});
+    for (int d = 0; d < dims; d++) {
+      ASSERT_EQ(r.MGlobalSize[d], global[d]);
+      ASSERT_EQ(r.MOffset[d], offset[d]);
+    }
+    ASSERT_EQ(r.MLocalSize, nullptr);
+
+    sycl::detail::NDRDescT NDRDesc = r.toNDRDescT();
+    ASSERT_EQ(NDRDesc.Dims, size_t{dims});
+    for (int d = 0; d < dims; d++) {
+      ASSERT_EQ(NDRDesc.GlobalSize[d], global[d]);
+      ASSERT_EQ(NDRDesc.GlobalOffset[d], offset[d]);
+    }
+    for (int d = dims; d < 3; d++) {
+      ASSERT_EQ(NDRDesc.GlobalSize[d], 0UL);
+      ASSERT_EQ(NDRDesc.GlobalOffset[d], 0UL);
+    }
+    for (int d = 0; d < 3; d++) {
+      ASSERT_EQ(NDRDesc.LocalSize[d], 0UL);
+    }
+  }
+  {
+    sycl::detail::nd_range_view r{global, true};
+    ASSERT_EQ(r.MDims, size_t{dims});
+    for (int d = 0; d < dims; d++) {
+      ASSERT_EQ(r.MGlobalSize[d], global[d]);
+      ASSERT_EQ(r.MSetNumWorkGroups, true);
+    }
+    ASSERT_EQ(r.MLocalSize, nullptr);
+    ASSERT_EQ(r.MOffset, nullptr);
+
+    sycl::detail::NDRDescT NDRDesc = r.toNDRDescT();
+    ASSERT_EQ(NDRDesc.Dims, size_t{dims});
+    for (int d = 0; d < dims; d++) {
+      ASSERT_EQ(NDRDesc.NumWorkGroups[d], global[d]);
+    }
+    for (int d = dims; d < 3; d++) {
+      ASSERT_EQ(NDRDesc.NumWorkGroups[d], 0UL);
+    }
+    for (int d = 0; d < 3; d++) {
+      ASSERT_EQ(NDRDesc.LocalSize[d], 0UL);
+      ASSERT_EQ(NDRDesc.GlobalSize[d], 0UL);
+      ASSERT_EQ(NDRDesc.GlobalOffset[d], 0UL);
+    }
+  }
+  {
+    sycl::detail::nd_range_view r{global, false};
+    ASSERT_EQ(r.MDims, size_t{dims});
+    for (int d = 0; d < dims; d++) {
+      ASSERT_EQ(r.MGlobalSize[d], global[d]);
+      ASSERT_EQ(r.MSetNumWorkGroups, false);
+    }
+    ASSERT_EQ(r.MLocalSize, nullptr);
+    ASSERT_EQ(r.MOffset, nullptr);
+
+    sycl::detail::NDRDescT NDRDesc = r.toNDRDescT();
+    ASSERT_EQ(NDRDesc.Dims, size_t{dims});
+    for (int d = 0; d < dims; d++) {
+      ASSERT_EQ(NDRDesc.GlobalSize[d], global[d]);
+    }
+    for (int d = dims; d < 3; d++) {
+      ASSERT_EQ(NDRDesc.GlobalSize[d], 1UL);
+    }
+    for (int d = 0; d < 3; d++) {
+      ASSERT_EQ(NDRDesc.LocalSize[d], 0UL);
+      ASSERT_EQ(NDRDesc.NumWorkGroups[d], 0UL);
+      ASSERT_EQ(NDRDesc.GlobalOffset[d], 0UL);
+    }
+  }
+}
+
+TEST(RangesRefUsage, RangesRefUsage) {
+  TestNdRangeView(sycl::range<1>{1024}, sycl::range<1>{64}, sycl::id<1>{10});
+  TestNdRangeView(sycl::range<2>{1024, 512}, sycl::range<2>{64, 32},
+                  sycl::id<2>{10, 5});
+  TestNdRangeView(sycl::range<3>{1024, 512, 256}, sycl::range<3>{64, 32, 16},
+                  sycl::id<3>{10, 5, 2});
+}