diff --git a/sycl/doc/syclcompat/README.md b/sycl/doc/syclcompat/README.md index 1ba7b75dbcd64..399af9edc5042 100644 --- a/sycl/doc/syclcompat/README.md +++ b/sycl/doc/syclcompat/README.md @@ -59,6 +59,10 @@ If available, the following extensions extend SYCLcompat functionality: https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_max_work_group_query.md) \[Optional\] +### Hardware Requirements + +Some of the functionalities provided by SYCLcompat rely on Unified Shared Memory (`aspect::usm_device_allocations`), though most of the USM-like memory APIs (malloc*, memcpy*, memset*) support hardware with only buffer/accessor support. See section [Buffer Support](#buffer-support) below. + ## Usage All functionality is available under the `syclcompat::` namespace, imported @@ -606,14 +610,6 @@ namespace syclcompat { namespace experimental { // Forward declarations for types relating to unsupported memcpy_parameter API: -enum memcpy_direction { - host_to_host, - host_to_device, - device_to_host, - device_to_device, - automatic -}; - #ifdef SYCL_EXT_ONEAPI_BINDLESS_IMAGES class image_mem_wrapper; #endif @@ -632,7 +628,6 @@ struct memcpy_parameter { data_wrapper from{}; data_wrapper to{}; sycl::range<3> size{}; - syclcompat::detail::memcpy_direction direction{syclcompat::detail::memcpy_direction::automatic}; }; /// [UNSUPPORTED] Synchronously copies 2D/3D memory data specified by \p param . @@ -709,18 +704,16 @@ enum class memory_region { using byte_t = uint8_t; -enum class target { device, local }; - template class memory_traits { public: static constexpr sycl::access::address_space asp = (Memory == memory_region::local) ? sycl::access::address_space::local_space : sycl::access::address_space::global_space; - static constexpr target target = + static constexpr sycl::target target = (Memory == memory_region::local) - ? target::local - : target::device; + ? sycl::target::local + : sycl::target::device; static constexpr sycl::access_mode mode = (Memory == memory_region::constant) ? sycl::access_mode::read @@ -731,7 +724,7 @@ public: using value_t = typename std::remove_cv_t; template using accessor_t = typename std::conditional_t< - target == target::local, + target == sycl::target::local, sycl::local_accessor, sycl::accessor>; using pointer_t = T *; @@ -855,6 +848,23 @@ public: } // syclcompat ``` +#### Buffer Support + +Although SYCLcompat is primarily designed around the Unified Shared Memory +model, there is (limited) support for the buffer/accessor model. This can be +enabled by setting the compiler define `SYCLCOMPAT_USM_LEVEL_NONE`. This macro +instructs SYCLcompat to effectively provide emulated USM pointers via a Memory +Manager singleton. + +Note that in `SYCLCOMPAT_USM_LEVEL_NONE` mode, the pointers returned by e.g. +`syclcompat::malloc`, and passed to `syclcompat::memcpy` can *only* interact +with `syclcompat` APIs. It is legal to perform pointer arithmetic on these +virtual pointers, but attempting to dereference them, passing them to `sycl` +APIs, or passing them into kernels will result in an error. + +The SYCLcompat tests with the suffix `_usmnone.cpp` provide examples of how to +use `SYCLCOMPAT_USM_LEVEL_NONE`. + ### ptr_to_int The following cuda backend specific function is introduced in order to diff --git a/sycl/include/syclcompat/defs.hpp b/sycl/include/syclcompat/defs.hpp index 2415d1fe8847c..79c3a82489a61 100644 --- a/sycl/include/syclcompat/defs.hpp +++ b/sycl/include/syclcompat/defs.hpp @@ -22,7 +22,7 @@ **************************************************************************/ // The original source was under the license below: -//==---- dpct.hpp ---------------------------------*- C++ -*----------------==// +//==---- defs.hpp ---------------------------------*- C++ -*----------------==// // // Copyright (C) Intel Corporation // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception diff --git a/sycl/include/syclcompat/memory.hpp b/sycl/include/syclcompat/memory.hpp index ccd7cab5fe177..30f4151f20960 100644 --- a/sycl/include/syclcompat/memory.hpp +++ b/sycl/include/syclcompat/memory.hpp @@ -77,7 +77,7 @@ template auto *local_mem() { return As; } -namespace experimental { +namespace detail { enum memcpy_direction { host_to_host, host_to_device, @@ -85,7 +85,7 @@ enum memcpy_direction { device_to_device, automatic }; -} +} // namespace detail template __syclcompat_inline__ @@ -111,10 +111,11 @@ enum class memory_region { usm_shared, // memory which can be accessed by host and device }; -enum class target { device, local }; - using byte_t = uint8_t; +/// Buffer type to be used in Memory Management runtime. +typedef sycl::buffer buffer_t; + /// Pitched 2D/3D memory data. class pitched_data { public: @@ -170,12 +171,131 @@ struct memcpy_parameter { data_wrapper from{}; data_wrapper to{}; sycl::range<3> size{}; - syclcompat::experimental::memcpy_direction direction{ - syclcompat::experimental::memcpy_direction::automatic}; }; } // namespace experimental namespace detail { +class mem_mgr { + mem_mgr() { + // Reserved address space, no real memory allocation happens here. +#if defined(__linux__) + mapped_address_space = + (byte_t *)mmap(nullptr, mapped_region_size, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); +#elif defined(_WIN64) + mapped_address_space = (byte_t *)VirtualAlloc( + NULL, // NULL specified as the base address parameter + mapped_region_size, // Size of allocation + MEM_RESERVE, // Allocate reserved pages + PAGE_NOACCESS); // Protection = no access +#else +#error "Only support Windows and Linux." +#endif + next_free = mapped_address_space; + }; + +public: + using buffer_id_t = int; + + struct allocation { + buffer_t buffer; + byte_t *alloc_ptr; + size_t size; + }; + + ~mem_mgr() { +#if defined(__linux__) + munmap(mapped_address_space, mapped_region_size); +#elif defined(_WIN64) + VirtualFree(mapped_address_space, 0, MEM_RELEASE); +#else +#error "Only support Windows and Linux." +#endif + }; + + mem_mgr(const mem_mgr &) = delete; + mem_mgr &operator=(const mem_mgr &) = delete; + mem_mgr(mem_mgr &&) = delete; + mem_mgr &operator=(mem_mgr &&) = delete; + + /// Allocate + void *mem_alloc(size_t size) { + if (!size) + return nullptr; + std::lock_guard lock(m_mutex); + if (next_free + size > mapped_address_space + mapped_region_size) { + throw std::runtime_error( + "[SYCLcompat] malloc: out of memory for virtual memory pool"); + } + // Allocation + sycl::range<1> buffer_range(size); + buffer_t buf(buffer_range); + allocation alloc{buf, next_free, size}; + // Map allocation to device pointer + void *result = next_free; + m_map.emplace(next_free + size, alloc); + // Update pointer to the next free space. + next_free += (size + extra_padding + alignment - 1) & ~(alignment - 1); + + return result; + } + + /// Deallocate + void mem_free(const void *ptr) { + if (!ptr) + return; + std::lock_guard lock(m_mutex); + auto it = get_map_iterator(ptr); + m_map.erase(it); + } + + /// map: device pointer -> allocation(buffer, alloc_ptr, size) + allocation translate_ptr(const void *ptr) { + std::lock_guard lock(m_mutex); + auto it = get_map_iterator(ptr); + return it->second; + } + + /// Check if the pointer represents device pointer or not. + bool is_device_ptr(const void *ptr) const { + std::lock_guard lock(m_mutex); + return (mapped_address_space <= ptr) && + (ptr < mapped_address_space + mapped_region_size); + } + + /// Returns the instance of memory manager singleton. + static mem_mgr &instance() { + static mem_mgr m; + return m; + } + +private: + std::map m_map; + mutable std::mutex m_mutex; + byte_t *mapped_address_space; + byte_t *next_free; + const size_t mapped_region_size = 128ull * 1024 * 1024 * 1024; + const size_t alignment = 256; + /// This padding may be defined to some positive value to debug + /// out of bound accesses. + const size_t extra_padding = 0; + + std::map::iterator get_map_iterator(const void *ptr) { + auto it = m_map.upper_bound((byte_t *)ptr); + if (it == m_map.end()) { + // Not a virtual pointer. + throw std::runtime_error("[SYCLcompat] can not get buffer from non-virtual pointer"); + } + const allocation &alloc = it->second; + if (ptr < alloc.alloc_ptr) { + // Out of bound. + // This may happen if there's a gap between allocations due to alignment + // or extra padding and pointer points to this gap. + throw std::runtime_error("[SYCLcompat] invalid virtual pointer"); + } + return it; + } +}; template class accessor; template class memory_traits { @@ -184,8 +304,9 @@ template class memory_traits { (Memory == memory_region::local) ? sycl::access::address_space::local_space : sycl::access::address_space::global_space; - static constexpr target target = - (Memory == memory_region::local) ? target::local : target::device; + static constexpr sycl::target target = (Memory == memory_region::local) + ? sycl::target::local + : sycl::target::device; static constexpr sycl::access_mode mode = (Memory == memory_region::constant) ? sycl::access_mode::read : sycl::access_mode::read_write; @@ -196,14 +317,20 @@ template class memory_traits { using value_t = typename std::remove_cv_t; template using accessor_t = - typename std::conditional_t, sycl::accessor>; - using pointer_t = T *; + using pointer_t = + typename std::conditional_t; }; static inline void *malloc(size_t size, sycl::queue q) { +#ifdef SYCLCOMPAT_USM_LEVEL_NONE + return mem_mgr::instance().mem_alloc(size * sizeof(byte_t)); +#else return sycl::malloc_device(size, q.get_device(), q.get_context()); +#endif // SYCLCOMPAT_USM_LEVEL_NONE } /// Calculate pitch (padded length of major dimension \p x) by rounding up to @@ -239,7 +366,24 @@ static inline void *malloc(size_t &pitch, size_t x, size_t y, size_t z, template static inline sycl::event fill(sycl::queue q, void *dev_ptr, const T &pattern, size_t count) { +#ifdef SYCLCOMPAT_USM_LEVEL_NONE + auto &mm = mem_mgr::instance(); + assert(mm.is_device_ptr(dev_ptr)); + auto alloc = mm.translate_ptr(dev_ptr); + size_t offset = (T *)dev_ptr - (T *)alloc.alloc_ptr; + + return q.submit([&](sycl::handler &cgh) { + auto r = sycl::range<1>(count); + auto o = sycl::id<1>(offset); + auto new_buffer = + alloc.buffer.reinterpret(sycl::range<1>(alloc.size / sizeof(T))); + sycl::accessor + acc(new_buffer, cgh, r, o); + cgh.fill(acc, pattern); + }); +#else return q.fill(dev_ptr, pattern, count); +#endif } /// Set \p value to the first \p size bytes starting from \p dev_ptr in \p q. @@ -251,7 +395,25 @@ static inline sycl::event fill(sycl::queue q, void *dev_ptr, const T &pattern, /// \returns An event representing the memset operation. static inline sycl::event memset(sycl::queue q, void *dev_ptr, int value, size_t size) { +#ifdef SYCLCOMPAT_USM_LEVEL_NONE + auto &mm = mem_mgr::instance(); + assert(mm.is_device_ptr(dev_ptr)); + auto alloc = mm.translate_ptr(dev_ptr); + size_t offset = (byte_t *)dev_ptr - (byte_t *)alloc.alloc_ptr; + + return q.submit([&](sycl::handler &cgh) { + auto r = sycl::range<1>(size); + auto o = sycl::id<1>(offset); + auto new_buffer = alloc.buffer.reinterpret( + sycl::range<1>(alloc.size / sizeof(byte_t))); + sycl::accessor + acc(new_buffer, cgh, r, o); + cgh.fill(acc, static_cast(value)); + }); +#else return q.memset(dev_ptr, value, size); +#endif // SYCLCOMPAT_USM_LEVEL_NONE } /// \brief Sets \p value to the 3D memory region pointed by \p data in \p q. @@ -305,6 +467,11 @@ enum class pointer_access_attribute { static pointer_access_attribute get_pointer_attribute(sycl::queue q, const void *ptr) { +#ifdef SYCLCOMPAT_USM_LEVEL_NONE + return mem_mgr::instance().is_device_ptr(ptr) + ? pointer_access_attribute::device_only + : pointer_access_attribute::host_only; +#else switch (sycl::get_pointer_type(ptr, q.get_context())) { case sycl::usm::alloc::unknown: return pointer_access_attribute::host_only; @@ -314,12 +481,12 @@ static pointer_access_attribute get_pointer_attribute(sycl::queue q, case sycl::usm::alloc::host: return pointer_access_attribute::host_device; } +#endif // SYCLCOMPAT_USM_LEVEL_NONE } -static experimental::memcpy_direction +static memcpy_direction deduce_memcpy_direction(sycl::queue q, void *to_ptr, const void *from_ptr) { // table[to_attribute][from_attribute] - using namespace experimental; // for memcpy_direction static const memcpy_direction direction_table[static_cast(pointer_access_attribute::end)] [static_cast(pointer_access_attribute::end)] = { @@ -335,7 +502,67 @@ static sycl::event memcpy(sycl::queue q, void *to_ptr, const void *from_ptr, const std::vector &dep_events = {}) { if (!size) return sycl::event{}; +#ifdef SYCLCOMPAT_USM_LEVEL_NONE + auto &mm = mem_mgr::instance(); + auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr); + + switch (real_direction) { + case host_to_host: + return q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dep_events); + cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); + }); + case host_to_device: { + auto alloc = mm.translate_ptr(to_ptr); + size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr; + return q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dep_events); + auto r = sycl::range<1>(size); + auto o = sycl::id<1>(offset); + sycl::accessor + acc(alloc.buffer, cgh, r, o); + cgh.copy(from_ptr, acc); + }); + } + case device_to_host: { + auto alloc = mm.translate_ptr(from_ptr); + size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr; + return q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dep_events); + auto r = sycl::range<1>(size); + auto o = sycl::id<1>(offset); + sycl::accessor + acc(alloc.buffer, cgh, r, o); + cgh.copy(acc, to_ptr); + }); + } + case device_to_device: { + auto to_alloc = mm.translate_ptr(to_ptr); + auto from_alloc = mm.translate_ptr(from_ptr); + size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr; + size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr; + return q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dep_events); + auto r = sycl::range<1>(size); + auto to_o = sycl::id<1>(to_offset); + auto from_o = sycl::id<1>(from_offset); + sycl::accessor + to_acc(to_alloc.buffer, cgh, r, to_o); + sycl::accessor + from_acc(from_alloc.buffer, cgh, r, from_o); + cgh.copy(from_acc, to_acc); + }); + } + default: + throw std::runtime_error("[SYCLcompat] memcpy: invalid direction value"); + } +#else return q.memcpy(to_ptr, from_ptr, size, dep_events); +#endif // SYCLCOMPAT_USM_LEVEL_NONE } // Get actual copy range and make sure it will not exceed range. @@ -453,6 +680,33 @@ memcpy(sycl::queue q, void *to_ptr, const void *from_ptr, break; } case device_to_device: +#ifdef SYCLCOMPAT_USM_LEVEL_NONE + { + auto &mm = mem_mgr::instance(); + auto to_alloc = mm.translate_ptr(to_surface); + auto from_alloc = mm.translate_ptr(from_surface); + size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr; + size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr; + event_list.push_back(q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dep_events); + auto to_o = sycl::id<1>(to_offset); + auto from_o = sycl::id<1>(from_offset); + sycl::accessor + to_acc(to_alloc.buffer, cgh, + get_copy_range(size, to_slice, to_range.get(0)), to_o); + sycl::accessor + from_acc(from_alloc.buffer, cgh, + get_copy_range(size, from_slice, from_range.get(0)), from_o); + cgh.parallel_for( + size, [=](sycl::id<3> id) { + to_acc[get_offset(id, to_slice, to_range.get(0))] = + from_acc[get_offset(id, from_slice, from_range.get(0))]; + }); + })); + } +#else event_list.push_back(q.submit([&](sycl::handler &cgh) { cgh.depends_on(dep_events); cgh.parallel_for(size, [=](sycl::id<3> id) { @@ -460,6 +714,7 @@ memcpy(sycl::queue q, void *to_ptr, const void *from_ptr, from_surface[get_offset(id, from_slice, from_range.get(0))]; }); })); +#endif // SYCLCOMPAT_USM_LEVEL_NONE break; default: throw std::runtime_error("[SYCLcompat] memcpy: invalid direction value"); @@ -498,6 +753,103 @@ static sycl::event combine_events(std::vector &events, } // namespace detail +#ifdef SYCLCOMPAT_USM_LEVEL_NONE +/// Check if the pointer \p ptr represents device pointer or not. +/// +/// \param ptr The pointer to be checked. +/// \returns true if \p ptr is a device pointer. +template static inline bool is_device_ptr(T ptr) { + if constexpr (std::is_pointer::value) { + return detail::mem_mgr::instance().is_device_ptr(ptr); + } + return false; +} +#endif + +/// Get the buffer and the offset of a piece of memory pointed to by \p ptr. +/// +/// \param ptr Pointer to a piece of memory. +/// If NULL is passed as an argument, an exception will be thrown. +/// \returns a pair containing both the buffer and the offset. +static std::pair get_buffer_and_offset(const void *ptr) { + if (ptr) { + auto alloc = detail::mem_mgr::instance().translate_ptr(ptr); + size_t offset = (byte_t *)ptr - alloc.alloc_ptr; + return std::make_pair(alloc.buffer, offset); + } else { + throw std::runtime_error( + "[SYCLcompat] NULL pointer argument in get_buffer_and_offset function is invalid"); + } +} + +/// Get the data pointed from \p ptr as a 1D buffer reinterpreted as type T. +template static sycl::buffer get_buffer(const void *ptr) { + if (!ptr) + return sycl::buffer(sycl::range<1>(0)); + auto alloc = detail::mem_mgr::instance().translate_ptr(ptr); + return alloc.buffer.reinterpret(sycl::range<1>(alloc.size / sizeof(T))); +} + +/// Get the buffer of a piece of memory pointed to by \p ptr. +/// +/// \param ptr Pointer to a piece of memory. +/// \returns the buffer. +static buffer_t get_buffer(const void *ptr) { + return detail::mem_mgr::instance().translate_ptr(ptr).buffer; +} + +/// Get the host pointer from a buffer that is mapped to virtual pointer ptr. +/// \param ptr Virtual Pointer mapped to device buffer +/// \returns A host pointer +template static inline T *get_host_ptr(const void *ptr) { + auto BufferOffset = get_buffer_and_offset(ptr); + auto host_ptr = BufferOffset.first.get_host_access() + .get_multi_ptr(); + return (T *)(host_ptr + BufferOffset.second); +} + +/// A wrapper class contains an accessor and an offset. +template +class access_wrapper { + sycl::accessor accessor; + size_t offset; + +public: + /// Construct the accessor wrapper for memory pointed by \p ptr. + /// + /// \param ptr Pointer to memory. + /// \param cgh The command group handler. + access_wrapper(const void *ptr, sycl::handler &cgh) + : accessor(get_buffer(ptr).get_access(cgh)), offset(0) { + auto alloc = detail::mem_mgr::instance().translate_ptr(ptr); + offset = (byte_t *)ptr - alloc.alloc_ptr; + } + + /// Get the device pointer. + /// + /// \returns a device pointer with offset. + dataT get_raw_pointer() const { return (dataT)(&accessor[0] + offset); } +}; + +/// Get the accessor for memory pointed by \p ptr. +/// +/// \param ptr Pointer to memory. +/// If NULL is passed as an argument, an exception will be thrown. +/// \param cgh The command group handler. +/// \returns an accessor. +template +static sycl::accessor get_access(const void *ptr, + sycl::handler &cgh) { + if (ptr) { + auto alloc = detail::mem_mgr::instance().translate_ptr(ptr); + return alloc.buffer.get_access(cgh); + } else { + throw std::runtime_error( + "[SYCLcompat] NULL pointer argument in get_access function is invalid"); + } +} + namespace experimental { namespace detail { static inline std::vector @@ -631,6 +983,19 @@ static inline void *malloc(size_t &pitch, size_t x, size_t y, return detail::malloc(pitch, x, y, 1, q); } +namespace detail { + +inline void free(void *ptr, const sycl::queue &q) { + if (ptr) { +#ifdef SYCLCOMPAT_USM_LEVEL_NONE + detail::mem_mgr::instance().mem_free(ptr); +#else + sycl::free(ptr, q.get_context()); +#endif // SYCLCOMPAT_USM_LEVEL_NONE + } +} +} // namespace detail + /// Wait on the queue \p q and free the memory \p ptr. /// \param ptr Point to free. /// \param q Queue to execute the free task. @@ -640,7 +1005,7 @@ static inline void wait_and_free(void *ptr, get_current_device().queues_wait_and_throw(); q.wait(); if (ptr) { - sycl::free(ptr, q); + detail::free(ptr, q); } } @@ -651,9 +1016,7 @@ namespace { /// \param ptr Point to free. /// \returns no return value. static inline void free(void *ptr, sycl::queue q = get_default_queue()) { - if (ptr) { - sycl::free(ptr, q); - } + detail::free(ptr, q); } } // namespace @@ -670,11 +1033,11 @@ inline sycl::event enqueue_free(const std::vector &pointers, const std::vector &events, sycl::queue q = get_default_queue()) { auto event = q.submit( - [&pointers, &events, ctxt = q.get_context()](sycl::handler &cgh) { + [&pointers, &events, &q](sycl::handler &cgh) { cgh.depends_on(events); cgh.host_task([=]() { for (auto p : pointers) - sycl::free(p, ctxt); + detail::free(p, q); }); }); get_current_device().add_event(event); @@ -1109,7 +1472,9 @@ template class accessor { const accessor_t>::type &acc) : accessor(acc, acc.get_range()) {} accessor(const accessor_t &acc, const sycl::range<3> &in_range) - : accessor(acc.get_pointer(), in_range) {} + : accessor( + acc.template get_multi_ptr().get(), + in_range) {} accessor operator[](size_t index) const { sycl::range<2> sub(_range.get(1), _range.get(2)); return accessor(_data + index * sub.size(), sub); @@ -1134,7 +1499,9 @@ template class accessor { const accessor_t>::type &acc) : accessor(acc, acc.get_range()) {} accessor(const accessor_t &acc, const sycl::range<2> &in_range) - : accessor(acc.get_pointer(), in_range) {} + : accessor( + acc.template get_multi_ptr().get(), + in_range) {} pointer_t operator[](size_t index) const { return _data + _range.get(1) * index; @@ -1148,6 +1515,8 @@ template class accessor { }; /// Device variable with address space of shared or global. +// TODO(syclcompat-lib-reviewers): This doesn't yet support multi-device (ptr +// per device) template class device_memory { public: using accessor_t = @@ -1198,6 +1567,9 @@ template class device_memory { "device memory region should be global, constant or shared"); // Make sure that singleton class dev_mgr will destruct later than this. detail::dev_mgr::instance(); +#ifdef SYCLCOMPAT_USM_LEVEL_NONE + detail::mem_mgr::instance(); +#endif } /// Constructor with range @@ -1257,9 +1629,24 @@ template class device_memory { template typename std::enable_if::type &operator[](size_t index) { init(); +#ifdef SYCLCOMPAT_USM_LEVEL_NONE + return syclcompat::get_buffer::type>( + _device_ptr) + .template get_access()[index]; +#else return _device_ptr[index]; +#endif // SYCLCOMPAT_USM_LEVEL_NONE } +#ifdef SYCLCOMPAT_USM_LEVEL_NONE + /// Get sycl::accessor for the device memory object when usm is not used. + accessor_t get_access(sycl::handler &cgh) { + return get_buffer(_device_ptr) + .template reinterpret(_range) + .template get_access::mode, + detail::memory_traits::target>(cgh); + } +#else /// Get compat_accessor with dimension info for the device memory object /// when usm is used and dimension is greater than 1. template @@ -1267,6 +1654,7 @@ template class device_memory { get_access(sycl::handler &cgh) { return syclcompat_accessor_t((T *)_device_ptr, _range); } +#endif // SYCLCOMPAT_USM_LEVEL_NONE private: device_memory(value_t *memory_ptr, size_t size, @@ -1275,6 +1663,7 @@ template class device_memory { _device_ptr(memory_ptr), _q(q) {} void allocate_device(sycl::queue q) { +#ifndef SYCLCOMPAT_USM_LEVEL_NONE if (Memory == memory_region::usm_shared) { _device_ptr = (value_t *)sycl::malloc_shared(_size, q.get_device(), q.get_context()); @@ -1287,6 +1676,7 @@ template class device_memory { sycl::ext::oneapi::property::usm::device_read_only()); return; } +#endif #endif _device_ptr = (value_t *)detail::malloc(_size, q); } @@ -1312,6 +1702,14 @@ class device_memory : public device_memory { /// Default constructor device_memory(sycl::queue q = get_default_queue()) : base(1, q) {} +#ifdef SYCLCOMPAT_USM_LEVEL_NONE + /// Get sycl::accessor for the device memory object when usm is not used. + accessor_t get_access(sycl::handler &cgh) { + auto buf = get_buffer(base::get_ptr()) + .template reinterpret(sycl::range<1>(1)); + return accessor_t(buf, cgh); + } +#endif // SYCLCOMPAT_USM_LEVEL_NONE }; template @@ -1324,6 +1722,10 @@ using shared_memory = device_memory; class pointer_attributes { public: void init(const void *ptr, sycl::queue q = get_default_queue()) { +#ifdef SYCLCOMPAT_USM_LEVEL_NONE + throw std::runtime_error( + "[SYCLcompat] pointer_attributes: only works for USM pointer."); +#else memory_type = sycl::get_pointer_type(ptr, q.get_context()); device_pointer = (memory_type != sycl::usm::alloc::unknown) ? ptr : nullptr; host_pointer = (memory_type != sycl::usm::alloc::unknown) && @@ -1332,6 +1734,7 @@ class pointer_attributes { : nullptr; sycl::device device_obj = sycl::get_pointer_device(ptr, q.get_context()); device_id = detail::dev_mgr::instance().get_device_id(device_obj); +#endif // SYCLCOMPAT_USM_LEVEL_NONE } sycl::usm::alloc get_memory_type() { return memory_type; } diff --git a/sycl/test-e2e/syclcompat/atomic/atomic_memory_acq_rel.cpp b/sycl/test-e2e/syclcompat/atomic/atomic_memory_acq_rel.cpp index 361fddb3d3907..609652a58b17d 100644 --- a/sycl/test-e2e/syclcompat/atomic/atomic_memory_acq_rel.cpp +++ b/sycl/test-e2e/syclcompat/atomic/atomic_memory_acq_rel.cpp @@ -67,7 +67,8 @@ template void test_acquire_global() { error_buf.template get_access(cgh); auto val = val_buf.template get_access(cgh); cgh.parallel_for(range<1>(N_items), [=](item<1> it) { - volatile int *val_p = val.get_pointer(); + volatile int *val_p = + val.get_multi_ptr().get(); auto atm0 = atomic_ref(val[0]); @@ -122,7 +123,8 @@ template void test_acquire_local() { val[0] = 0; val[1] = 0; it.barrier(access::fence_space::local_space); - volatile int *val_p = val.get_pointer(); + volatile int *val_p = + val.get_multi_ptr().get(); auto atm0 = atomic_ref(val[0]); @@ -169,7 +171,8 @@ template void test_release_global() { error_buf.template get_access(cgh); auto val = val_buf.template get_access(cgh); cgh.parallel_for(range<1>(N_items), [=](item<1> it) { - volatile int *val_p = val.get_pointer(); + volatile int *val_p = + val.get_multi_ptr().get(); auto atm0 = atomic_ref(val[0]); @@ -223,7 +226,8 @@ template void test_release_local() { val[0] = 0; val[1] = 0; it.barrier(access::fence_space::local_space); - volatile int *val_p = val.get_pointer(); + volatile int *val_p = + val.get_multi_ptr().get(); auto atm0 = atomic_ref(val[0]); diff --git a/sycl/test-e2e/syclcompat/atomic/atomics_verification_usmnone.cpp b/sycl/test-e2e/syclcompat/atomic/atomics_verification_usmnone.cpp new file mode 100644 index 0000000000000..299c4e914ca04 --- /dev/null +++ b/sycl/test-e2e/syclcompat/atomic/atomics_verification_usmnone.cpp @@ -0,0 +1,307 @@ +// ====------ atomics_noneusm_verification.cpp---------- -*- C++ -* ----===//// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// +// ===----------------------------------------------------------------------===// + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#define SYCLCOMPAT_USM_LEVEL_NONE +#include +#include +#include +#include +#include +#include + +#define min(a, b) (a) < (b) ? (a) : (b) +#define max(a, b) (a) > (b) ? (a) : (b) + +#define LOOP_NUM 5 + +void atomicKernel(int *atom_arr, sycl::nd_item<3> item_ct1) { + unsigned int tid = item_ct1.get_local_range().get(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + for (int i = 0; i < LOOP_NUM; i++) { + // Atomic addition + syclcompat::atomic_fetch_add(&atom_arr[0], 10); + + // Atomic exchange + syclcompat::atomic_exchange(&atom_arr[1], (int)tid); + + // Atomic maximum + syclcompat::atomic_fetch_max(&atom_arr[2], (int)tid); + + // Atomic minimum + syclcompat::atomic_fetch_min(&atom_arr[3], (int)tid); + + // Atomic increment (modulo 17+1) + syclcompat::atomic_fetch_compare_inc((unsigned int *)&atom_arr[4], + (unsigned int)17); + + // Atomic compare-and-swap + syclcompat::atomic_compare_exchange_strong(&atom_arr[6], (int)(tid - 1), + (int)tid); + + // Bitwise atomic instructions + + // Atomic AND + syclcompat::atomic_fetch_and(&atom_arr[7], (int)(2 * tid + 7)); + + // Atomic OR + syclcompat::atomic_fetch_or(&atom_arr[8], 1 << tid); + + // Atomic XOR + syclcompat::atomic_fetch_xor(&atom_arr[9], (int)tid); + } +} + +void atomicKernel_CPU(int *atom_arr, int no_of_threads) { + + for (int i = no_of_threads; i < 2 * no_of_threads; i++) { + + for (int j = 0; j < LOOP_NUM; j++) { + // Atomic addition + __sync_fetch_and_add(&atom_arr[0], 10); + + // Atomic exchange + __sync_lock_test_and_set(&atom_arr[1], i); + + // Atomic maximum + int old, expected; + do { + expected = atom_arr[2]; + old = __sync_val_compare_and_swap(&atom_arr[2], expected, + max(expected, i)); + } while (old != expected); + + // Atomic minimum + do { + expected = atom_arr[3]; + old = __sync_val_compare_and_swap(&atom_arr[3], expected, + min(expected, i)); + } while (old != expected); + + // Atomic increment (modulo 17+1) + int limit = 17; + do { + expected = atom_arr[4]; + old = __sync_val_compare_and_swap( + &atom_arr[4], expected, (expected >= limit) ? 0 : expected + 1); + } while (old != expected); + + // Atomic decrement + limit = 137; + do { + expected = atom_arr[5]; + old = __sync_val_compare_and_swap( + &atom_arr[5], expected, + ((expected == 0) || (expected > limit)) ? limit : expected - 1); + } while (old != expected); + + // Atomic compare-and-swap + __sync_val_compare_and_swap(&atom_arr[6], i - 1, i); + + // Bitwise atomic instructions + + // Atomic AND + __sync_fetch_and_and(&atom_arr[7], 2 * i + 7); + + // Atomic OR + __sync_fetch_and_or(&atom_arr[8], 1 << i); + + // Atomic XOR + // 11th element should be 0xff + __sync_fetch_and_xor(&atom_arr[9], i); + } + } +} + +int verify(int *testData, const int len) { + int val = 0; + + for (int i = 0; i < len * LOOP_NUM; ++i) { + val += 10; + } + + if (val != testData[0]) { + printf("atomicAdd failed val = %d testData = %d\n", val, testData[0]); + return false; + } + + val = 0; + + bool found = false; + + for (int i = 0; i < len; ++i) { + // second element should be a member of [0, len) + if (i == testData[1]) { + found = true; + break; + } + } + + if (!found) { + printf("atomicExch failed\n"); + return false; + } + + val = -(1 << 8); + + for (int i = 0; i < len; ++i) { + // third element should be len-1 + val = max(val, i); + } + + if (val != testData[2]) { + printf("atomicMax failed\n"); + return false; + } + + val = 1 << 8; + + for (int i = 0; i < len; ++i) { + val = min(val, i); + } + + if (val != testData[3]) { + printf("atomicMin failed\n"); + return false; + } + + int limit = 17; + val = 0; + + for (int i = 0; i < len * LOOP_NUM; ++i) { + val = (val >= limit) ? 0 : val + 1; + } + + if (val != testData[4]) { + printf("atomicInc failed\n"); + return false; + } + + limit = 137; + val = 0; + + for (int i = 0; i < len * LOOP_NUM; ++i) { + val = ((val == 0) || (val > limit)) ? limit : val - 1; + } + + found = false; + + for (int i = 0; i < len; ++i) { + // seventh element should be a member of [0, len) + if (i == testData[6]) { + found = true; + break; + } + } + + if (!found) { + printf("atomicCAS failed\n"); + return false; + } + + val = 0xff; + + for (int i = 0; i < len; ++i) { + // 8th element should be 1 + val &= (2 * i + 7); + } + + if (val != testData[7]) { + printf("atomicAnd failed\n"); + return false; + } + + val = 0; + + for (int i = 0; i < len; ++i) { + // 9th element should be 0xff + val |= (1 << i); + } + + if (val != testData[8]) { + printf("atomicOr failed\n"); + return false; + } + + val = 0xff; + + for (int i = 0; i < len; ++i) { + // 11th element should be 0xff + val ^= i; + } + + if (val != testData[9]) { + printf("atomicXor failed\n"); + return false; + } + + return true; +} + +int main(int argc, char **argv) { + + unsigned int numThreads = 256; + unsigned int numBlocks = 64; + unsigned int numData = 10; + + int *atom_arr; + + atom_arr = (int *)syclcompat::malloc(sizeof(int) * numData); + + for (unsigned int i = 0; i < numData; i++) { + *syclcompat::get_host_ptr(atom_arr + i) = 0; + } + + // To make the AND and XOR tests generate something other than 0... + *syclcompat::get_host_ptr(atom_arr + 7) = + *syclcompat::get_host_ptr(atom_arr + 9) = 0xff; + + std::cout << "Selected device: " + << syclcompat::get_default_queue() + .get_device() + .get_info() + << "\n"; + + { + std::pair atom_arr_buf_ct0 = + syclcompat::get_buffer_and_offset(atom_arr); + size_t atom_arr_offset_ct0 = atom_arr_buf_ct0.second; + syclcompat::get_default_queue().submit([&](sycl::handler &cgh) { + auto atom_arr_acc_ct0 = + atom_arr_buf_ct0.first.get_access( + cgh); + + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, numBlocks) * + sycl::range<3>(1, 1, numThreads), + sycl::range<3>(1, 1, numThreads)), + [=](sycl::nd_item<3> item_ct1) { + int *atom_arr_ct0 = (int *)(&atom_arr_acc_ct0[0] + + atom_arr_offset_ct0); + atomicKernel(atom_arr_ct0, item_ct1); + }); + }); + } + + atomicKernel_CPU(syclcompat::get_host_ptr(atom_arr), numBlocks * numThreads); + + syclcompat::get_current_device().queues_wait_and_throw(); + + // Compute & verify reference solution + int testResult = + verify(syclcompat::get_host_ptr(atom_arr), 2 * numThreads * numBlocks); + + syclcompat::free(atom_arr); + + printf("Atomics test completed, returned %s \n", + testResult ? "OK" : "ERROR!"); + exit(testResult ? 0 : -1); +} diff --git a/sycl/test-e2e/syclcompat/memory/global_memory_usmnone.cpp b/sycl/test-e2e/syclcompat/memory/global_memory_usmnone.cpp new file mode 100644 index 0000000000000..0be9c220efde6 --- /dev/null +++ b/sycl/test-e2e/syclcompat/memory/global_memory_usmnone.cpp @@ -0,0 +1,175 @@ +// ====------ global_memory.cpp---------- -*- C++ -* ----===//// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// +// ===----------------------------------------------------------------------===// + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +#define SYCLCOMPAT_USM_LEVEL_NONE +#include +#include + +class TestStruct { +public: + void test() {} + template void testTemplate() {} +}; + +template +class TemplateStuct { +public: + void test() {} + template void testTemplate() {} +}; + +syclcompat::global_memory d1_a(0); +syclcompat::global_memory d2_a(36); +syclcompat::global_memory, 0> d3_a; +syclcompat::global_memory d4_a; +syclcompat::constant_memory c1_a(16); +syclcompat::constant_memory c2_a; +syclcompat::constant_memory, 0> c3_a; +syclcompat::constant_memory c4_a; + +syclcompat::constant_memory c_2d_a(sycl::range<2>(5, 3), +{{0, 10, 20}, +{30, 40, 50}, +{60, 70, 80}, +{90, 100, 110}, +{120, 130, 140}}); +syclcompat::constant_memory c_2d_b(sycl::range<2>(3, 5), +{{0, 10, 20, 30, 40}, +{50, 60, 70, 80, 90}, +{100, 110, 120, 130, 140}}); +syclcompat::constant_memory c_2d_c(sycl::range<2>(3, 5), + {0, 10, 20, 30, 40, + 50, 60, 70, 80, 90, + 100, 110, 120, 130, 140}); +syclcompat::constant_memory c_3d(sycl::range<3>(2, 2, 4), + {0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, + 110, 120, 130, 140}); +syclcompat::constant_memory c_1d(sycl::range<1>(15), + {0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, + 110, 120, 130, 140}); + +bool verify_init(int *data) { + for(auto i = 0; i < 15; ++i) { + if (data[i] != i * 10) + return false; + } + return true; +} + +bool verify() { + const int size = 15; + auto size_bytes = 15 * sizeof(int); + + int h_result[15]; + syclcompat::memcpy(h_result, c_2d_a.get_ptr(), size_bytes); + if (!verify_init(h_result)) + return false; + syclcompat::memcpy(h_result, c_2d_b.get_ptr(), size_bytes); + if (!verify_init(h_result)) + return false; + syclcompat::memcpy(h_result, c_2d_c.get_ptr(), size_bytes); + if (!verify_init(h_result)) + return false; + syclcompat::memcpy(h_result, c_3d.get_ptr(), size_bytes); + if (!verify_init(h_result)) + return false; + syclcompat::memcpy(h_result, c_1d.get_ptr(), size_bytes); + if (!verify_init(h_result)) + return false; + return true; +} + +void test4(TemplateStuct *d3, TestStruct *d4) { + d3->test(); + d3->testTemplate(); + d4->test(); + d4->testTemplate(); +} + +void test3(TemplateStuct c3, TestStruct c4) { + c3.test(); + c3.testTemplate(); + c4.test(); + c4.testTemplate(); +} + +void test2(volatile int &a) { + a = 3; +} + +void test1(volatile int *acc_d1, int *acc_d2, int const *c1, int c2) { + unsigned d_a = 1; + *acc_d1 = 0; + *acc_d2 = d_a; + unsigned d_c = (unsigned)(*acc_d1); + unsigned *d_d = (unsigned *)acc_d2; + unsigned *d_e = (unsigned *)(acc_d2 + 5); + int *d_f = acc_d2 - 6; + test2(*acc_d1); +} + +int main() { + d1_a.init(); + d2_a.init(); + c1_a.init(); + c2_a.init(); + syclcompat::get_default_queue().submit( + [&](sycl::handler &cgh) { + auto d1_acc = d1_a.get_access(cgh); + auto d2_acc = d2_a.get_access(cgh); + auto c1_acc = c1_a.get_access(cgh); + auto c2_acc = c2_a.get_access(cgh); + cgh.parallel_for>( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item) { + test1(d1_acc.get_multi_ptr().get(), + d2_acc.get_multi_ptr().get(), + c1_acc.get_multi_ptr().get(), + c2_acc); + }); + }); + c3_a.init(); + c4_a.init(); + syclcompat::get_default_queue().submit( + [&](sycl::handler &cgh) { + auto c3_acc = c3_a.get_access(cgh); + auto c4_acc = c4_a.get_access(cgh); + cgh.parallel_for>( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=] (sycl::nd_item<3> item) { + test3(c3_acc, c4_acc); + }); + }); + + sycl::queue *q = syclcompat::get_current_device().create_queue(); + d3_a.init(*q); + d4_a.init(*q); + q->submit( + [&](sycl::handler &cgh) { + auto d3_acc = d3_a.get_access(cgh); + auto d4_acc = d4_a.get_access(cgh); + cgh.parallel_for>( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item) { + test4(d3_acc.get_multi_ptr().get(), + d4_acc.get_multi_ptr().get()); + }); + }); + + + if (verify()) { + printf("Init Constant Memory Success!\n"); + return 0; + } else { + printf("Init Constant Memory Fail!\n"); + return 1; + } +} diff --git a/sycl/test-e2e/syclcompat/memory/memcpy_3d2.cpp b/sycl/test-e2e/syclcompat/memory/memcpy_3d2.cpp index e47835d205a9b..5b0691c35059b 100644 --- a/sycl/test-e2e/syclcompat/memory/memcpy_3d2.cpp +++ b/sycl/test-e2e/syclcompat/memory/memcpy_3d2.cpp @@ -48,7 +48,6 @@ void test_memcpy3D_async_pitchedAPI() { size_t depth = 10; float *h_data; float *h_ref; - // test_feature:byte_t syclcompat::byte_t a = 'a'; assert(sizeof(syclcompat::byte_t) == 1); @@ -70,36 +69,23 @@ void test_memcpy3D_async_pitchedAPI() { // alloc memory. extent = sycl::range<3>(sizeof(float) * width, height, depth); - // test_feature:malloc d_data = (syclcompat::pitched_data)syclcompat::malloc(extent); // copy to Device. cpyParm_from_data_ct1 = syclcompat::pitched_data( (void *)h_data, sizeof(float) * width, width, height); - // test_feature:get_data_ptr - // test_feature:get_pitch - // test_feature:get_x - // test_feature:get_y if (cpyParm_from_data_ct1.get_data_ptr() != h_data || cpyParm_from_data_ct1.get_pitch() != sizeof(float) * width || cpyParm_from_data_ct1.get_x() != width || cpyParm_from_data_ct1.get_y() != height) { assert(false); } - // test_feature:set_data_ptr - // test_feature:set_pitch - // test_feature:set_x - // test_feature:set_y cpyParm_from_data_ct1.set_data_ptr((void *)h_data); cpyParm_from_data_ct1.set_pitch(sizeof(float) * width); cpyParm_from_data_ct1.set_x(width); cpyParm_from_data_ct1.set_y(height); - // test_feature:get_data_ptr - // test_feature:get_pitch - // test_feature:get_x - // test_feature:get_y if (cpyParm_from_data_ct1.get_data_ptr() != h_data || cpyParm_from_data_ct1.get_pitch() != sizeof(float) * width || cpyParm_from_data_ct1.get_x() != width || @@ -109,7 +95,6 @@ void test_memcpy3D_async_pitchedAPI() { cpyParm_to_data_ct1 = d_data; cpyParm_size_ct1 = extent; - // test_feature:memcpy_async syclcompat::memcpy_async(cpyParm_to_data_ct1, cpyParm_to_pos_ct1, cpyParm_from_data_ct1, cpyParm_from_pos_ct1, cpyParm_size_ct1); @@ -119,14 +104,12 @@ void test_memcpy3D_async_pitchedAPI() { cpyParm_to_data_ct1 = syclcompat::pitched_data( (void *)h_data, sizeof(float) * width, width, height); cpyParm_size_ct1 = extent; - // test_feature:memcpy_async syclcompat::memcpy_async(cpyParm_to_data_ct1, cpyParm_to_pos_ct1, cpyParm_from_data_ct1, cpyParm_from_pos_ct1, cpyParm_size_ct1); syclcompat::get_default_queue().wait_and_throw(); check(h_data, h_ref, width * height * depth); // memset device data. - // test_feature:memset_async syclcompat::memset_async(d_data, 0x1, extent); syclcompat::get_default_queue().wait_and_throw(); // copy back to host @@ -134,7 +117,6 @@ void test_memcpy3D_async_pitchedAPI() { cpyParm_to_data_ct1 = syclcompat::pitched_data( (void *)h_data, sizeof(float) * width, width, height); cpyParm_size_ct1 = extent; - // test_feature:memcpy_async syclcompat::memcpy_async(cpyParm_to_data_ct1, cpyParm_to_pos_ct1, cpyParm_from_data_ct1, cpyParm_from_pos_ct1, cpyParm_size_ct1); @@ -175,7 +157,6 @@ void test_memcpy3D_async_pitchedAPI_q() { // alloc memory. extent = sycl::range<3>(sizeof(float) * width, height, depth); - // test_feature:malloc d_data = (syclcompat::pitched_data)syclcompat::malloc(extent, q); // copy to Device. @@ -183,7 +164,6 @@ void test_memcpy3D_async_pitchedAPI_q() { (void *)h_data, sizeof(float) * width, width, height); cpyParm_to_data_ct1 = d_data; cpyParm_size_ct1 = extent; - // test_feature:memcpy_async syclcompat::memcpy_async(cpyParm_to_data_ct1, cpyParm_to_pos_ct1, cpyParm_from_data_ct1, cpyParm_from_pos_ct1, cpyParm_size_ct1, q); @@ -193,7 +173,6 @@ void test_memcpy3D_async_pitchedAPI_q() { cpyParm_to_data_ct1 = syclcompat::pitched_data( (void *)h_data, sizeof(float) * width, width, height); cpyParm_size_ct1 = extent; - // test_feature:memcpy_async syclcompat::memcpy_async(cpyParm_to_data_ct1, cpyParm_to_pos_ct1, cpyParm_from_data_ct1, cpyParm_from_pos_ct1, cpyParm_size_ct1, q); @@ -202,7 +181,6 @@ void test_memcpy3D_async_pitchedAPI_q() { check(h_data, h_ref, width * height * depth); // memset device data. - // test_feature:memset_async syclcompat::memset_async(d_data, 0x1, extent, q); q.wait_and_throw(); // copy back to host @@ -210,7 +188,6 @@ void test_memcpy3D_async_pitchedAPI_q() { cpyParm_to_data_ct1 = syclcompat::pitched_data( (void *)h_data, sizeof(float) * width, width, height); cpyParm_size_ct1 = extent; - // test_feature:memcpy_async syclcompat::memcpy_async(cpyParm_to_data_ct1, cpyParm_to_pos_ct1, cpyParm_from_data_ct1, cpyParm_from_pos_ct1, cpyParm_size_ct1, q); @@ -288,7 +265,6 @@ void test_memcpy3D_async_offset() { // alloc memory. extent = sycl::range<3>(sizeof(float) * width, height, depth); - // test_feature:malloc d_data = (syclcompat::pitched_data)syclcompat::malloc(extent); // copy to Device. @@ -296,7 +272,6 @@ void test_memcpy3D_async_offset() { (void *)h_data, sizeof(float) * width, width, height); cpyParm_to_data_ct1 = d_data; cpyParm_size_ct1 = extent; - // test_feature:memcpy_async syclcompat::memcpy_async(cpyParm_to_data_ct1, cpyParm_to_pos_ct1, cpyParm_from_data_ct1, cpyParm_from_pos_ct1, cpyParm_size_ct1); @@ -310,7 +285,6 @@ void test_memcpy3D_async_offset() { cpyParm_from_data_ct1 = d_data; cpyParm_to_data_ct1 = syclcompat::pitched_data( (void *)h_data, sizeof(float) * out_width, out_width, out_height); - // test_feature:memcpy_async syclcompat::memcpy_async(cpyParm_to_data_ct1, cpyParm_to_pos_ct1, cpyParm_from_data_ct1, cpyParm_from_pos_ct1, cpyParm_size_ct1); @@ -384,7 +358,6 @@ void test_memcpy3D_async_offset_q() { // alloc memory. extent = sycl::range<3>(sizeof(float) * width, height, depth); - // test_feature:malloc d_data = (syclcompat::pitched_data)syclcompat::malloc(extent, q); // copy to Device. @@ -392,7 +365,6 @@ void test_memcpy3D_async_offset_q() { (void *)h_data, sizeof(float) * width, width, height); cpyParm_to_data_ct1 = d_data; cpyParm_size_ct1 = extent; - // test_feature:memcpy_async syclcompat::memcpy_async(cpyParm_to_data_ct1, cpyParm_to_pos_ct1, cpyParm_from_data_ct1, cpyParm_from_pos_ct1, cpyParm_size_ct1, q); @@ -406,7 +378,6 @@ void test_memcpy3D_async_offset_q() { cpyParm_from_data_ct1 = d_data; cpyParm_to_data_ct1 = syclcompat::pitched_data( (void *)h_data, sizeof(float) * out_width, out_width, out_height); - // test_feature:memcpy_async syclcompat::memcpy_async(cpyParm_to_data_ct1, cpyParm_to_pos_ct1, cpyParm_from_data_ct1, cpyParm_from_pos_ct1, cpyParm_size_ct1, q); @@ -480,7 +451,6 @@ void test_memcpy3D_async_offsetZ() { // alloc memory. extent = sycl::range<3>(sizeof(float) * width, height, depth); - // test_feature:malloc d_data = (syclcompat::pitched_data)syclcompat::malloc(extent); // copy to Device. @@ -488,7 +458,6 @@ void test_memcpy3D_async_offsetZ() { (void *)h_data, sizeof(float) * width, width, height); cpyParm_to_data_ct1 = d_data; cpyParm_size_ct1 = extent; - // test_feature:memcpy_async syclcompat::memcpy_async(cpyParm_to_data_ct1, cpyParm_to_pos_ct1, cpyParm_from_data_ct1, cpyParm_from_pos_ct1, cpyParm_size_ct1); @@ -502,7 +471,6 @@ void test_memcpy3D_async_offsetZ() { cpyParm_from_data_ct1 = d_data; cpyParm_to_data_ct1 = syclcompat::pitched_data( (void *)h_data, sizeof(float) * out_width, out_width, out_height); - // test_feature:memcpy_async syclcompat::memcpy_async(cpyParm_to_data_ct1, cpyParm_to_pos_ct1, cpyParm_from_data_ct1, cpyParm_from_pos_ct1, cpyParm_size_ct1); @@ -576,7 +544,6 @@ void test_memcpy3D_async_offsetZ_q() { // alloc memory. extent = sycl::range<3>(sizeof(float) * width, height, depth); - // test_feature:malloc d_data = (syclcompat::pitched_data)syclcompat::malloc(extent, q); // copy to Device. @@ -584,7 +551,6 @@ void test_memcpy3D_async_offsetZ_q() { (void *)h_data, sizeof(float) * width, width, height); cpyParm_to_data_ct1 = d_data; cpyParm_size_ct1 = extent; - // test_feature:memcpy_async syclcompat::memcpy_async(cpyParm_to_data_ct1, cpyParm_to_pos_ct1, cpyParm_from_data_ct1, cpyParm_from_pos_ct1, cpyParm_size_ct1, q); @@ -598,7 +564,6 @@ void test_memcpy3D_async_offsetZ_q() { cpyParm_from_data_ct1 = d_data; cpyParm_to_data_ct1 = syclcompat::pitched_data( (void *)h_data, sizeof(float) * out_width, out_width, out_height); - // test_feature:memcpy_async syclcompat::memcpy_async(cpyParm_to_data_ct1, cpyParm_to_pos_ct1, cpyParm_from_data_ct1, cpyParm_from_pos_ct1, cpyParm_size_ct1, q); diff --git a/sycl/test-e2e/syclcompat/memory/memory_async.cpp b/sycl/test-e2e/syclcompat/memory/memory_async.cpp index aa3d8a4231bbb..fecd54fe921a0 100644 --- a/sycl/test-e2e/syclcompat/memory/memory_async.cpp +++ b/sycl/test-e2e/syclcompat/memory/memory_async.cpp @@ -32,17 +32,13 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out +// RUN: %{build} -DSYCLCOMPAT_USM_LEVEL_NONE -o %t.out +// RUN: %{run} %t.out // Tests for the sycl::events returned from syclcompat::*Async API calls -// TODO: Re-enable, see https://github.com/intel/llvm/issues/13636 -// and possibly related: https://github.com/intel/llvm/issues/14623 -// UNSUPPORTED: true #include - #include - #include - #include "memory_fixt.hpp" // enqueue_free is just a host task, so we are really testing the event @@ -193,6 +189,11 @@ void test_combine_events() { } int main() { +#ifdef SYCLCOMPAT_USM_LEVEL_NONE + std::cout << "Running SYCLCOMPAT_USM_LEVEL_NONE tests" << std::endl; +#else + std::cout << "Running USM tests" << std::endl; +#endif test_free_async(); test_memcpy_async1(); diff --git a/sycl/test-e2e/syclcompat/memory/memory_fixt.hpp b/sycl/test-e2e/syclcompat/memory/memory_fixt.hpp index e5b8c6ef37972..b410f2bf77b05 100644 --- a/sycl/test-e2e/syclcompat/memory/memory_fixt.hpp +++ b/sycl/test-e2e/syclcompat/memory/memory_fixt.hpp @@ -37,25 +37,39 @@ class AsyncTest { AsyncTest() : q_{syclcompat::get_default_queue()}, grid_{NUM_WG}, thread_{WG_SIZE}, size_{WG_SIZE * NUM_WG} { - d_A_ = sycl::malloc_device(size_, q_); - d_B_ = sycl::malloc_device(size_, q_); - d_C_ = sycl::malloc_device(size_, q_); + d_A_ = syclcompat::malloc(size_, q_); + d_B_ = syclcompat::malloc(size_, q_); + d_C_ = syclcompat::malloc(size_, q_); } ~AsyncTest() { - sycl::free(d_A_, q_); - sycl::free(d_B_, q_); - sycl::free(d_C_, q_); + syclcompat::free(d_A_, q_); + syclcompat::free(d_B_, q_); + syclcompat::free(d_C_, q_); } sycl::event launch_kernel() { auto &dd_A = d_A_; auto &dd_B = d_B_; auto &dd_C = d_C_; +#ifdef SYCLCOMPAT_USM_LEVEL_NONE + syclcompat::buffer_t buffer_A = syclcompat::get_buffer(d_A_); + syclcompat::buffer_t buffer_B = syclcompat::get_buffer(d_B_); + syclcompat::buffer_t buffer_C = syclcompat::get_buffer(d_C_); +#endif return q_.submit([&](sycl::handler &cgh) { +#ifdef SYCLCOMPAT_USM_LEVEL_NONE + auto A = buffer_A.get_access(cgh); + auto B = buffer_B.get_access(cgh); + auto C = buffer_C.get_access(cgh); +#else + auto A = dd_A; + auto B = dd_B; + auto C = dd_C; +#endif cgh.parallel_for(size_, [=](sycl::id<1> id) { - dd_A[id] = static_cast(id) + 1.0f; - dd_B[id] = static_cast(id) + 1.0f; - dd_C[id] = dd_A[id] + dd_B[id]; + A[id] = static_cast(id) + 1.0f; + B[id] = static_cast(id) + 1.0f; + C[id] = A[id] + B[id]; }); }); } diff --git a/sycl/test-e2e/syclcompat/memory/memory_image.cpp b/sycl/test-e2e/syclcompat/memory/memory_image.cpp index c1225dcc2708e..819a7d97d4ff6 100644 --- a/sycl/test-e2e/syclcompat/memory/memory_image.cpp +++ b/sycl/test-e2e/syclcompat/memory/memory_image.cpp @@ -205,7 +205,6 @@ void test_memcpy3D_async_parameter_offset() { // alloc memory. extent = sycl::range<3>(sizeof(float) * width, height, depth); - // test_feature:malloc d_data = (syclcompat::pitched_data)syclcompat::malloc(extent); // copy to Device. diff --git a/sycl/test-e2e/syclcompat/memory/memory_is_device_ptr_usmnone.cpp b/sycl/test-e2e/syclcompat/memory/memory_is_device_ptr_usmnone.cpp new file mode 100644 index 0000000000000..dbde31a15c63b --- /dev/null +++ b/sycl/test-e2e/syclcompat/memory/memory_is_device_ptr_usmnone.cpp @@ -0,0 +1,28 @@ +// ====------ memory_is_device_ptr.cpp---------- -*- C++ -* ----===//// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// +// ===----------------------------------------------------------------------===// + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#define SYCLCOMPAT_USM_LEVEL_NONE +#include +#include + +int main() { + float* f = (float*)syclcompat::malloc(sizeof(float)); + bool pass = false; + + if (syclcompat::is_device_ptr(f)) { + pass = true; + } + + syclcompat::free(f); + + return (pass ? 0 : 1); +} diff --git a/sycl/test-e2e/syclcompat/memory/memory_management_test1.cpp b/sycl/test-e2e/syclcompat/memory/memory_management_test1.cpp index c710d9933a622..e6c55134cdea4 100644 --- a/sycl/test-e2e/syclcompat/memory/memory_management_test1.cpp +++ b/sycl/test-e2e/syclcompat/memory/memory_management_test1.cpp @@ -422,14 +422,15 @@ template void test_fill_q() { free(h_A); } +constexpr size_t size = 2000; +constexpr size_t offset = 1000; + +syclcompat::constant_memory d_A(size); +syclcompat::constant_memory d_B(size); + void test_constant_memcpy() { std::cout << __PRETTY_FUNCTION__ << std::endl; - constexpr size_t size = 2000; - constexpr size_t offset = 1000; - - syclcompat::constant_memory d_A(size); - syclcompat::constant_memory d_B(size); float *h_A = (float *)malloc(size / 2 * sizeof(float)); float *h_B = (float *)malloc(size / 2 * sizeof(float)); diff --git a/sycl/test-e2e/syclcompat/memory/memory_management_test1_usmnone.cpp b/sycl/test-e2e/syclcompat/memory/memory_management_test1_usmnone.cpp new file mode 100644 index 0000000000000..09b51af153fa9 --- /dev/null +++ b/sycl/test-e2e/syclcompat/memory/memory_management_test1_usmnone.cpp @@ -0,0 +1,500 @@ +// ====------ memory_management_test1_usmnone.cpp---------- -*- C++ -* ----===//// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// +// ===----------------------------------------------------------------------===// + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#define SYCLCOMPAT_USM_LEVEL_NONE +#include +#include +#include "memory_common.hpp" + +void test_memcpy() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + + int Num = 5000; + int N1 = 1000; + float *h_A = (float*)malloc(Num*sizeof(float)); + float *h_B = (float*)malloc(Num*sizeof(float)); + float *h_C = (float*)malloc(Num*sizeof(float)); + + for (int i = 0; i < Num; i++) { + h_A[i] = 1.0f; + h_B[i] = 2.0f; + } + + float *d_A; + // hostA[0..999] -> deviceA[0..999] + // hostB[0..3999] -> deviceA[1000..4999] + // deviceA[0..4999] -> hostC[0..4999] + d_A = (float *)syclcompat::malloc(Num * sizeof(float)); + syclcompat::memcpy((void*) d_A, (void*) h_A, N1 * sizeof(float)); + syclcompat::memcpy((void*) (d_A + N1), (void*) h_B, (Num-N1) * sizeof(float)); + syclcompat::memcpy((void*) h_C, (void*) d_A, Num * sizeof(float)); + syclcompat::free((void*)d_A); + + syclcompat::free(0); + syclcompat::free(NULL); + syclcompat::free(nullptr); + + // verify + for(int i = 0; i < N1; i++){ + if (fabs(h_A[i] - h_C[i]) > 1e-5) { + fprintf(stderr,"Check: Elements are A = %f, B = %f, C = %f:\n", h_A[i], h_B[i], h_C[i]); + fprintf(stderr,"Result verification failed at element %d:\n", i); + exit(EXIT_FAILURE); + } + } + + for(int i = N1; i < Num; i++){ + if (fabs(h_B[i] - h_C[i]) > 1e-5) { + fprintf(stderr,"Check: Elements are A = %f, B = %f, C = %f:\n", h_A[i], h_B[i], h_C[i]); + fprintf(stderr,"Result verification failed at element %d:\n", i); + exit(EXIT_FAILURE); + } + } + + + free(h_A); + free(h_B); + free(h_C); +} + +void test_vecadd() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + + int Num = 5000; + int Offset = 0; // Current dpcpp version in ics environment has bugs with Offset > 0, + // CORC-6222 has fixed this issue, but the version of dpcpp used in ics + // environment has not cover this patch. After it has this patch, + // Offest could be set to 100, and current test case will pass. + + float *h_A = (float*)malloc(Num*sizeof(float)); + float *h_B = (float*)malloc(Num*sizeof(float)); + float *h_C = (float*)malloc(Num*sizeof(float)); + + //syclcompat::dev_mgr::instance().select_device(0); + + for (int i = 0; i < Num; i++) { + h_A[i] = 1.0f; + h_B[i] = 2.0f; + } + + float *d_A, *d_B, *d_C; + // hostA -> deviceA + // hostB -> deviceB + // kernel: deviceC = deviceA + deviceB + // deviceA -> hostC + d_A = (float *)syclcompat::malloc(Num * sizeof(float)); + d_B = (float *)syclcompat::malloc(Num * sizeof(float)); + d_C = (float *)syclcompat::malloc(Num * sizeof(float)); + syclcompat::memcpy((void*) d_A, (void*) h_A, Num * sizeof(float)); + syclcompat::memcpy((void*) d_B, (void*) h_B, Num * sizeof(float)); + + d_A += Offset; + d_B += Offset; + d_C += Offset; + + { + std::pair buffer_and_offset_A = syclcompat::get_buffer_and_offset(d_A); + size_t offset_A = buffer_and_offset_A.second; + std::pair buffer_and_offset_B = syclcompat::get_buffer_and_offset(d_B); + size_t offset_B = buffer_and_offset_A.second; + std::pair buffer_and_offset_C = syclcompat::get_buffer_and_offset(d_C); + size_t offset_C = buffer_and_offset_A.second; + syclcompat::get_default_queue().submit( + [&](sycl::handler &cgh) { + auto d_A_acc = buffer_and_offset_A.first.get_access(cgh); + auto d_B_acc = buffer_and_offset_B.first.get_access(cgh); + auto d_C_acc = buffer_and_offset_C.first.get_access(cgh); + + cgh.parallel_for( + sycl::range<1>(Num-Offset), + [=](sycl::id<1> id) { + + float *A = (float*)(&d_A_acc[0]+offset_A); + float *B = (float*)(&d_B_acc[0]+offset_B); + float *C = (float*)(&d_C_acc[0]+offset_C); + int i = id[0]; + + C[i] = A[i] + B[i]; + }); + }); + } + + syclcompat::memcpy((void*) (h_C+Offset), (void*) d_C, (Num-Offset) * sizeof(float)); + syclcompat::free((void*)d_A); + syclcompat::free((void*)d_B); + syclcompat::free((void*)d_C); + + // verify + for(int i = Offset; i < Num; i++){ + if (fabs(h_C[i] - h_A[i] - h_B[i]) > 1e-5) { + fprintf(stderr,"Check %d: Elements are A = %f, B = %f, C = %f:\n", i,h_A[i], h_B[i], h_C[i]); + fprintf(stderr,"Result verification failed at element %d:\n", i); + exit(EXIT_FAILURE); + } + } + + + free(h_A); + free(h_B); + free(h_C); +} + + +void test_memset() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + + int Num = 10; + int *h_A = (int*)malloc(Num*sizeof(int)); + + for (int i = 0; i < Num; i++) { + h_A[i] = 4; + } + + int *d_A; + + d_A = (int *)syclcompat::malloc(Num * sizeof(int)); + // hostA -> deviceA + syclcompat::memcpy((void*) d_A, (void*) h_A, Num * sizeof(int)); + + // set d_A[0,..., 6] = 0 + syclcompat::memset((void*) d_A, 0, (Num - 3) * sizeof(int)); + + // deviceA -> hostA + syclcompat::memcpy((void*) h_A, (void*) d_A, Num * sizeof(int)); + + syclcompat::free((void*)d_A); + + // check d_A[0,..., 6] = 0 + for (int i = 0; i < Num - 3; i++) { + if (h_A[i] != 0) { + fprintf(stderr, "Check: h_A[%d] is %d:\n", i, h_A[i]); + fprintf(stderr, "Result verification failed at element [%d]!\n", i); + exit(EXIT_FAILURE); + } + } + + // check d_A[7,..., 9] = 4 + for (int i = Num - 3; i < Num; i++) { + if (h_A[i] != 4) { + fprintf(stderr, "Check: h_A[%d] is %d:\n", i, h_A[i]); + fprintf(stderr, "Result verification failed at element h_A[%d]!\n", i); + exit(EXIT_FAILURE); + } + } + + + free(h_A); +} + +const unsigned int Num = 5000; +const unsigned int N1 = 1000; +syclcompat::constant_memory d_A(Num * sizeof(float)); +syclcompat::constant_memory d_B(Num * sizeof(float)); + +void test_constant_memcpy() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + + float h_A[Num]; + float h_B[Num]; + float h_C[Num]; + float h_D[Num]; + + for (int i = 0; i < Num; i++) { + h_A[i] = 1.0f; + h_B[i] = 2.0f; + } + + for (int i = 0; i < Num; i++) { + h_A[i] = 1.0f; + h_B[i] = 2.0f; + } + // hostA[0..999] -> deviceA[0..999] + // hostB[0..3999] -> deviceA[1000..4999] + // deviceA[0..4999] -> deviceB[0..4999] + // deviceA[0..4999] -> hostC[0..4999] + // deviceB[0..4999] -> hostD[0..4999] + + syclcompat::memcpy((void *)d_A.get_ptr(), (void *)&h_A[0], N1 * sizeof(float)); + syclcompat::memcpy((char *)d_A.get_ptr() + N1 * sizeof(float), (void*) h_B, (Num-N1) * sizeof(float)); + syclcompat::memcpy((void *)h_C, (void *)d_A.get_ptr(), Num * sizeof(float)); + + syclcompat::memcpy((void *)d_B.get_ptr(), (void *)d_A.get_ptr(), N1 * sizeof(float)); + syclcompat::memcpy((char *)d_B.get_ptr() + N1 * sizeof(float), (void *)((size_t)d_A.get_ptr() + N1* sizeof(float)), (Num - N1) * sizeof(float)); + syclcompat::memcpy((void *)h_D, (void *)d_B.get_ptr(), Num * sizeof(float)); + + // verify hostD + for (int i = 0; i < N1; i++) { + if (fabs(h_A[i] - h_D[i]) > 1e-5) { + fprintf(stderr, "Check: Elements are A = %f, D = %f:\n", h_A[i], h_D[i]); + fprintf(stderr, "Result verification failed at element %d:\n", i); + exit(EXIT_FAILURE); + } + } + + for (int i = N1; i < Num; i++) { + if (fabs(h_B[i] - h_D[i]) > 1e-5) { + fprintf(stderr, "Check: Elements are B = %f, D = %f:\n", h_B[i], h_D[i]); + fprintf(stderr, "Result verification failed at element %d:\n", i); + exit(EXIT_FAILURE); + } + } + +} + +void test_memcpy(sycl::queue &q) { + std::cout << __PRETTY_FUNCTION__ << std::endl; + + int Num = 5000; + int N1 = 1000; + float *h_A = (float*)malloc(Num*sizeof(float)); + float *h_B = (float*)malloc(Num*sizeof(float)); + float *h_C = (float*)malloc(Num*sizeof(float)); + + for (int i = 0; i < Num; i++) { + h_A[i] = 1.0f; + h_B[i] = 2.0f; + } + + float *d_A; + // hostA[0..999] -> deviceA[0..999] + // hostB[0..3999] -> deviceA[1000..4999] + // deviceA[0..4999] -> hostC[0..4999] + d_A = (float *)syclcompat::malloc(Num * sizeof(float), q); + syclcompat::memcpy((void*) d_A, (void*) h_A, N1 * sizeof(float), q); + syclcompat::memcpy((void*) (d_A + N1), (void*) h_B, (Num-N1) * sizeof(float), q); + syclcompat::memcpy((void*) h_C, (void*) d_A, Num * sizeof(float), q); + syclcompat::free((void*)d_A, q); + + syclcompat::free(0, q); + syclcompat::free(NULL, q); + syclcompat::free(nullptr, q); + + // verify + for(int i = 0; i < N1; i++){ + if (fabs(h_A[i] - h_C[i]) > 1e-5) { + fprintf(stderr,"Check: Elements are A = %f, B = %f, C = %f:\n", h_A[i], h_B[i], h_C[i]); + fprintf(stderr,"Result verification failed at element %d:\n", i); + exit(EXIT_FAILURE); + } + } + + for(int i = N1; i < Num; i++){ + if (fabs(h_B[i] - h_C[i]) > 1e-5) { + fprintf(stderr,"Check: Elements are A = %f, B = %f, C = %f:\n", h_A[i], h_B[i], h_C[i]); + fprintf(stderr,"Result verification failed at element %d:\n", i); + exit(EXIT_FAILURE); + } + } + + + free(h_A); + free(h_B); + free(h_C); +} + +void test_vecadd(sycl::queue &q) { + std::cout << __PRETTY_FUNCTION__ << std::endl; + int Num = 5000; + int Offset = 0; // Current dpcpp version in ics environment has bugs with Offset > 0, + // CORC-6222 has fixed this issue, but the version of dpcpp used in ics + // environment has not cover this patch. After it has this patch, + // Offest could be set to 100, and current test case will pass. + + float *h_A = (float*)malloc(Num*sizeof(float)); + float *h_B = (float*)malloc(Num*sizeof(float)); + float *h_C = (float*)malloc(Num*sizeof(float)); + + //syclcompat::dev_mgr::instance().select_device(0); + + for (int i = 0; i < Num; i++) { + h_A[i] = 1.0f; + h_B[i] = 2.0f; + } + + float *d_A, *d_B, *d_C; + // hostA -> deviceA + // hostB -> deviceB + // kernel: deviceC = deviceA + deviceB + // deviceA -> hostC + d_A = (float *)syclcompat::malloc(Num * sizeof(float), q); + d_B = (float *)syclcompat::malloc(Num * sizeof(float), q); + d_C = (float *)syclcompat::malloc(Num * sizeof(float), q); + syclcompat::memcpy((void*) d_A, (void*) h_A, Num * sizeof(float), q); + syclcompat::memcpy((void*) d_B, (void*) h_B, Num * sizeof(float), q); + + d_A += Offset; + d_B += Offset; + d_C += Offset; + + { + std::pair buffer_and_offset_A = syclcompat::get_buffer_and_offset(d_A); + size_t offset_A = buffer_and_offset_A.second; + std::pair buffer_and_offset_B = syclcompat::get_buffer_and_offset(d_B); + size_t offset_B = buffer_and_offset_A.second; + std::pair buffer_and_offset_C = syclcompat::get_buffer_and_offset(d_C); + size_t offset_C = buffer_and_offset_A.second; + syclcompat::get_default_queue().submit( + [&](sycl::handler &cgh) { + auto d_A_acc = buffer_and_offset_A.first.get_access(cgh); + auto d_B_acc = buffer_and_offset_B.first.get_access(cgh); + auto d_C_acc = buffer_and_offset_C.first.get_access(cgh); + + cgh.parallel_for( + sycl::range<1>(Num-Offset), + [=](sycl::id<1> id) { + + float *A = (float*)(&d_A_acc[0]+offset_A); + float *B = (float*)(&d_B_acc[0]+offset_B); + float *C = (float*)(&d_C_acc[0]+offset_C); + int i = id[0]; + + C[i] = A[i] + B[i]; + }); + }); + } + + syclcompat::memcpy((void*) (h_C+Offset), (void*) d_C, (Num-Offset) * sizeof(float), q); + syclcompat::free((void*)d_A, q); + syclcompat::free((void*)d_B, q); + syclcompat::free((void*)d_C, q); + + // verify + for(int i = Offset; i < Num; i++){ + if (fabs(h_C[i] - h_A[i] - h_B[i]) > 1e-5) { + fprintf(stderr,"Check %d: Elements are A = %f, B = %f, C = %f:\n", i,h_A[i], h_B[i], h_C[i]); + fprintf(stderr,"Result verification failed at element %d:\n", i); + exit(EXIT_FAILURE); + } + } + + + free(h_A); + free(h_B); + free(h_C); +} + + +void test_memset(sycl::queue &q) { + std::cout << __PRETTY_FUNCTION__ << std::endl; + + int Num = 10; + int *h_A = (int*)malloc(Num*sizeof(int)); + + for (int i = 0; i < Num; i++) { + h_A[i] = 4; + } + + int *d_A; + + d_A = (int *)syclcompat::malloc(Num * sizeof(int), q); + // hostA -> deviceA + syclcompat::memcpy((void*) d_A, (void*) h_A, Num * sizeof(int), q); + + // set d_A[0,..., 6] = 0 + syclcompat::memset((void*) d_A, 0, (Num - 3) * sizeof(int), q); + + // deviceA -> hostA + syclcompat::memcpy((void*) h_A, (void*) d_A, Num * sizeof(int), q); + + syclcompat::free((void*)d_A, q); + + // check d_A[0,..., 6] = 0 + for (int i = 0; i < Num - 3; i++) { + if (h_A[i] != 0) { + fprintf(stderr, "Check: h_A[%d] is %d:\n", i, h_A[i]); + fprintf(stderr, "Result verification failed at element [%d]!\n", i); + exit(EXIT_FAILURE); + } + } + + // check d_A[7,..., 9] = 4 + for (int i = Num - 3; i < Num; i++) { + if (h_A[i] != 4) { + fprintf(stderr, "Check: h_A[%d] is %d:\n", i, h_A[i]); + fprintf(stderr, "Result verification failed at element h_A[%d]!\n", i); + exit(EXIT_FAILURE); + } + } + + + free(h_A); +} + +void test_constant_memcpy(sycl::queue &q) { + std::cout << __PRETTY_FUNCTION__ << std::endl; + + const unsigned int Num = 5000; + const unsigned int N1 = 1000; + syclcompat::constant_memory d_A(Num * sizeof(float)); + syclcompat::constant_memory d_B(Num * sizeof(float)); + + float h_A[Num]; + float h_B[Num]; + float h_C[Num]; + float h_D[Num]; + + for (int i = 0; i < Num; i++) { + h_A[i] = 1.0f; + h_B[i] = 2.0f; + } + + for (int i = 0; i < Num; i++) { + h_A[i] = 1.0f; + h_B[i] = 2.0f; + } + // hostA[0..999] -> deviceA[0..999] + // hostB[0..3999] -> deviceA[1000..4999] + // deviceA[0..4999] -> deviceB[0..4999] + // deviceA[0..4999] -> hostC[0..4999] + // deviceB[0..4999] -> hostD[0..4999] + + syclcompat::memcpy((void *)d_A.get_ptr(), (void *)&h_A[0], N1 * sizeof(float), q); + syclcompat::memcpy((char *)d_A.get_ptr() + N1 * sizeof(float), (void*) h_B, (Num-N1) * sizeof(float), q); + syclcompat::memcpy((void *)h_C, (void *)d_A.get_ptr(), Num * sizeof(float), q); + + syclcompat::memcpy((void *)d_B.get_ptr(), (void *)d_A.get_ptr(), N1 * sizeof(float), q); + syclcompat::memcpy((char *)d_B.get_ptr() + N1 * sizeof(float), (void *)((size_t)d_A.get_ptr() + N1* sizeof(float)), (Num - N1) * sizeof(float), q); + syclcompat::memcpy((void *)h_D, (void *)d_B.get_ptr(), Num * sizeof(float), q); + + // verify hostD + for (int i = 0; i < N1; i++) { + if (fabs(h_A[i] - h_D[i]) > 1e-5) { + fprintf(stderr, "Check: Elements are A = %f, D = %f:\n", h_A[i], h_D[i]); + fprintf(stderr, "Result verification failed at element %d:\n", i); + exit(EXIT_FAILURE); + } + } + + for (int i = N1; i < Num; i++) { + if (fabs(h_B[i] - h_D[i]) > 1e-5) { + fprintf(stderr, "Check: Elements are B = %f, D = %f:\n", h_B[i], h_D[i]); + fprintf(stderr, "Result verification failed at element %d:\n", i); + exit(EXIT_FAILURE); + } + } + +} + +int main() { + test_memcpy(); + test_vecadd(); + test_memset(); + test_constant_memcpy(); + + sycl::queue q; + test_memcpy(q); + test_vecadd(q); + test_memset(q); + test_constant_memcpy(q); + + return 0; +} diff --git a/sycl/test-e2e/syclcompat/memory/memory_management_test2.cpp b/sycl/test-e2e/syclcompat/memory/memory_management_test2.cpp index c6687ed9c9dc0..47d8488294e61 100644 --- a/sycl/test-e2e/syclcompat/memory/memory_management_test2.cpp +++ b/sycl/test-e2e/syclcompat/memory/memory_management_test2.cpp @@ -187,16 +187,10 @@ void test_global_memory() { auto g_B_acc = g_B.get_access(cgh); auto g_C_acc = g_C.get_access(cgh); cgh.parallel_for(sycl::range<2>(DataW, DataH), [=](sycl::id<2> id) { - // test_feature:accessor - // test_feature:memory_region syclcompat::accessor A( g_A_acc); - // test_feature:accessor - // test_feature:memory_region syclcompat::accessor B( g_B_acc); - // test_feature:accessor - // test_feature:memory_region syclcompat::accessor C( g_C_acc); int i = id[0], j = id[1]; diff --git a/sycl/test-e2e/syclcompat/memory/memory_management_test2_usmnone.cpp b/sycl/test-e2e/syclcompat/memory/memory_management_test2_usmnone.cpp new file mode 100644 index 0000000000000..e36f2384382af --- /dev/null +++ b/sycl/test-e2e/syclcompat/memory/memory_management_test2_usmnone.cpp @@ -0,0 +1,551 @@ +// ====------ memory_management_test_mempcy_2_usmnone.cpp---------- -*- C++ -* ----===//// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// +// ===----------------------------------------------------------------------===// +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#define SYCLCOMPAT_USM_LEVEL_NONE +#include +#include +#include "memory_common.hpp" + +void check(float *h_data, float *h_ref, size_t width, size_t height, + size_t depth) { + for (int i = 0; i < width * height * depth; i++) { + float diff = fabs(h_data[i] - h_ref[i]); + if (diff > 1.e-6) { + printf("Verification failed!"); + printf("h_data[%d]=%f, h_ref[%d]=%f, diff=%f\n", i, h_data[i], i, + h_ref[i], diff); + exit(-1); + } + } +} + + +void test_mempcy_pitched() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + size_t width = 6; + size_t height = 8; + float *h_data; + float *h_ref; + size_t h_pitch = sizeof(float) * width; + h_data = (float *)malloc(sizeof(float) * width * height); + for (int i = 0; i < width * height; i++) + h_data[i] = (float)i; + + h_ref = (float *)malloc(sizeof(float) * width * height); + for (int i = 0; i < width * height; i++) + h_ref[i] = (float)i; + + // alloc device memory. + size_t d_pitch; + float *d_data; + d_data = (float *)syclcompat::malloc(d_pitch, sizeof(float) * width, height); + + // copy to Device. + syclcompat::memcpy(d_data, d_pitch, h_data, h_pitch, sizeof(float) * width, height); + + // copy back to host. + syclcompat::memcpy(h_data, h_pitch, d_data, d_pitch, sizeof(float) * width, height); + + check(h_data, h_ref, width, height, 1); + + // memset device data. + syclcompat::memset(d_data, d_pitch, 0x1, sizeof(float) * width, height); + + // copy back to host + syclcompat::memcpy(h_data, h_pitch, d_data, d_pitch, sizeof(float) * width, height); + + // memset reference data. + memset(h_ref, 0x1, width * height * sizeof(float)); + check(h_data, h_ref, width, height, 1); + + free(h_data); + free(h_ref); + syclcompat::free((void *)d_data); +} + +void test_memcpy_reinterp_kernel() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + + int Num = 5000; + + float *h_A = (float*)malloc(Num*sizeof(float)); + float *h_B = (float*)malloc(Num*sizeof(float)); + float *h_C = (float*)malloc(Num*sizeof(float)); + + for (int i = 0; i < Num; i++) { + h_A[i] = 1.0f; + h_B[i] = 2.0f; + } + + float *d_A, *d_B, *d_C; + // hostA -> deviceA + // hostB -> deviceB + // kernel: deviceC = deviceA + deviceB + // deviceA -> hostC + d_A = (float *)syclcompat::malloc(Num * sizeof(float)); + d_B = (float *)syclcompat::malloc(Num * sizeof(float)); + d_C = (float *)syclcompat::malloc(Num * sizeof(float)); + syclcompat::memcpy((void*) d_A, (void*) h_A, Num * sizeof(float)); + syclcompat::memcpy((void*) d_B, (void*) h_B, Num * sizeof(float)); + + { + syclcompat::buffer_t buffer_A = syclcompat::get_buffer(d_A); + syclcompat::buffer_t buffer_B = syclcompat::get_buffer(d_B); + syclcompat::buffer_t buffer_C = syclcompat::get_buffer(d_C); + + syclcompat::get_default_queue().submit( + [&](sycl::handler &cgh) { + auto A = buffer_A.reinterpret().get_access(cgh); + auto B = buffer_B.reinterpret().get_access(cgh); + auto C = buffer_C.reinterpret().get_access(cgh); + + cgh.parallel_for( + sycl::range<1>(Num), + [=](sycl::id<1> id) { + int i = id[0]; + + C[i] = A[i] + B[i]; + }); + }); + syclcompat::get_default_queue().wait_and_throw(); + } + + syclcompat::memcpy((void*) (h_C), (void*) d_C, (Num) * sizeof(float)); + syclcompat::free((void*)d_A); + syclcompat::free((void*)d_B); + syclcompat::free((void*)d_C); + + // verify + for(int i = 0; i < Num; i++){ + if (fabs(h_C[i] - h_A[i] - h_B[i]) > 1e-5) { + fprintf(stderr,"Check %d: Elements are A = %f, B = %f, C = %f:\n", i,h_A[i], h_B[i], h_C[i]); + fprintf(stderr,"Result verification failed at element %d:\n", i); + exit(EXIT_FAILURE); + } + } + + free(h_A); + free(h_B); + free(h_C); +} + +void test_memcpy_kernel() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + + int Num = 5000; + + float *h_A = (float*)malloc(Num*sizeof(float)); + float *h_B = (float*)malloc(Num*sizeof(float)); + float *h_C = (float*)malloc(Num*sizeof(float)); + + for (int i = 0; i < Num; i++) { + h_A[i] = 1.0f; + h_B[i] = 2.0f; + } + + float *d_A, *d_B, *d_C; + // hostA -> deviceA + // hostB -> deviceB + // kernel: deviceC = deviceA + deviceB + // deviceA -> hostC + d_A = (float *)syclcompat::malloc(Num * sizeof(float)); + d_B = (float *)syclcompat::malloc(Num * sizeof(float)); + d_C = (float *)syclcompat::malloc(Num * sizeof(float)); + syclcompat::memcpy((void*) d_A, (void*) h_A, Num * sizeof(float)); + syclcompat::memcpy((void*) d_B, (void*) h_B, Num * sizeof(float)); + + { + auto buffer_A = syclcompat::get_buffer(d_A); + auto buffer_B = syclcompat::get_buffer(d_B); + auto buffer_C = syclcompat::get_buffer(d_C); + + syclcompat::get_default_queue().submit( + [&](sycl::handler &cgh) { + auto A = buffer_A.get_access(cgh); + auto B = buffer_B.get_access(cgh); + auto C = buffer_C.get_access(cgh); + + cgh.parallel_for( + sycl::range<1>(Num), + [=](sycl::id<1> id) { + int i = id[0]; + + C[i] = A[i] + B[i]; + }); + }); + syclcompat::get_default_queue().wait_and_throw(); + } + + syclcompat::memcpy((void*) (h_C), (void*) d_C, Num * sizeof(float)); + syclcompat::free((void*)d_A); + syclcompat::free((void*)d_B); + syclcompat::free((void*)d_C); + + // verify + for(int i = 0; i < Num; i++){ + if (fabs(h_C[i] - h_A[i] - h_B[i]) > 1e-5) { + fprintf(stderr,"Check %d: Elements are A = %f, B = %f, C = %f:\n", i,h_A[i], h_B[i], h_C[i]); + fprintf(stderr,"Result verification failed at element %d:\n", i); + exit(EXIT_FAILURE); + } + } + + free(h_A); + free(h_B); + free(h_C); +} + +void test_access_wrapper() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + + int Num = 5000; + int Offset = 0; // Current dpcpp version in ics environment has bugs with Offset > 0, + // CORC-6222 has fixed this issue, but the version of dpcpp used in ics + // environment has not cover this patch. After it has this patch, + // Offest could be set to 100, and current test case will pass. + + float *h_A = (float*)malloc(Num*sizeof(float)); + float *h_B = (float*)malloc(Num*sizeof(float)); + float *h_C = (float*)malloc(Num*sizeof(float)); + + for (int i = 0; i < Num; i++) { + h_A[i] = 1.0f; + h_B[i] = 2.0f; + } + + float *d_A, *d_B, *d_C; + // hostA -> deviceA + // hostB -> deviceB + // kernel: deviceC = deviceA + deviceB + // deviceA -> hostC + d_A = (float *)syclcompat::malloc(Num * sizeof(float)); + d_B = (float *)syclcompat::malloc(Num * sizeof(float)); + d_C = (float *)syclcompat::malloc(Num * sizeof(float)); + syclcompat::memcpy((void*) d_A, (void*) h_A, Num * sizeof(float)); + syclcompat::memcpy((void*) d_B, (void*) h_B, Num * sizeof(float)); + + d_A += Offset; + d_B += Offset; + d_C += Offset; + + { + syclcompat::get_default_queue().submit( + [&](sycl::handler &cgh) { + syclcompat::access_wrapper d_A_acc(d_A, cgh); + syclcompat::access_wrapper d_B_acc(d_B, cgh); + syclcompat::access_wrapper d_C_acc(d_C, cgh); + + cgh.parallel_for( + sycl::range<1>(Num-Offset), + [=](sycl::id<1> id) { + + float *A = d_A_acc.get_raw_pointer(); + float *B = d_B_acc.get_raw_pointer(); + float *C = d_C_acc.get_raw_pointer(); + int i = id[0]; + C[i] = A[i] + B[i]; + }); + }); + syclcompat::get_default_queue().wait_and_throw(); + } + + syclcompat::memcpy((void*) (h_C+Offset), (void*) d_C, (Num-Offset) * sizeof(float)); + syclcompat::free((void*)d_A); + syclcompat::free((void*)d_B); + syclcompat::free((void*)d_C); + + // verify + for(int i = Offset; i < Num; i++){ + if (fabs(h_C[i] - h_A[i] - h_B[i]) > 1e-5) { + fprintf(stderr,"Check %d: Elements are A = %f, B = %f, C = %f:\n", i,h_A[i], h_B[i], h_C[i]); + fprintf(stderr,"Result verification failed at element %d:\n", i); + exit(EXIT_FAILURE); + } + } + + free(h_A); + free(h_B); + free(h_C); +} + +#define DataW 100 +#define DataH 100 +syclcompat::constant_memory c_A(DataW, DataH); +syclcompat::constant_memory c_B(DataW, DataH); +syclcompat::global_memory c_C(DataW, DataH); + +void test_constant_memory() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + + float h_A[DataW][DataH]; + float h_B[DataW][DataH]; + float h_C[DataW][DataH]; + + for (int i = 0; i < DataW; i++) { + for (int j = 0; j < DataH; j++) { + h_A[i][j] = 1.0f; + h_B[i][j] = 2.0f; + } + } + + c_A.init(); + c_B.init(); + c_C.init(); + syclcompat::memcpy((void *)c_A.get_ptr(), (void *)&h_A[0][0], DataW * DataH * sizeof(float)); + syclcompat::memcpy((void *)c_B.get_ptr(), (void *)&h_B[0][0], DataW * DataH * sizeof(float)); + + { + syclcompat::get_default_queue().submit( + [&](sycl::handler &cgh) { + auto c_A_acc = c_A.get_access(cgh); + auto c_B_acc = c_B.get_access(cgh); + auto c_C_acc = c_C.get_access(cgh); + cgh.parallel_for( + sycl::range<2>(DataW, DataH), + [=](sycl::id<2> id) { + syclcompat::accessor A(c_A_acc); + syclcompat::accessor B(c_B_acc); + syclcompat::accessor C(c_C_acc); + int i = id[0], j = id[1]; + C[i][j] = A[i][j] + B[i][j]; + }); + }); + syclcompat::get_default_queue().wait_and_throw(); + } + syclcompat::memcpy((void *)&h_C[0][0], (void *)c_C.get_ptr(), DataW * DataH * sizeof(float)); + + // verify hostD + for (int i = 0; i < DataW; i++) { + for (int j = 0; j < DataH; j++) { + if (fabs(h_C[i][j] - h_A[i][j] - h_B[i][j]) > 1e-5) { + fprintf(stderr, "Result verification failed at element [%d][%d]:\n", i, j); + exit(EXIT_FAILURE); + } + } + } +} + +syclcompat::global_memory g_A(DataW, DataH); +syclcompat::global_memory g_B(DataW, DataH); +syclcompat::global_memory g_C(DataW, DataH); + +void test_global_memory() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + + float h_A[DataW][DataH]; + float h_B[DataW][DataH]; + float h_C[DataW][DataH]; + + for (int i = 0; i < DataW; i++) { + for (int j = 0; j < DataH; j++) { + h_A[i][j] = 1.0f; + h_B[i][j] = 2.0f; + } + } + + g_A.init(); + g_B.init(); + g_C.init(); + + syclcompat::memcpy((void *)g_A.get_ptr(), (void *)&h_A[0][0], DataW * DataH * sizeof(float)); + syclcompat::memcpy((void *)g_B.get_ptr(), (void *)&h_B[0][0], DataW * DataH * sizeof(float)); + + { + syclcompat::get_default_queue().submit( + [&](sycl::handler &cgh) { + auto g_A_acc = g_A.get_access(cgh); + auto g_B_acc = g_B.get_access(cgh); + auto g_C_acc = g_C.get_access(cgh); + cgh.parallel_for( + sycl::range<2>(DataW, DataH), + [=](sycl::id<2> id) { + syclcompat::accessor A(g_A_acc); + syclcompat::accessor B(g_B_acc); + syclcompat::accessor C(g_C_acc); + int i = id[0], j = id[1]; + C[i][j] = A[i][j] + B[i][j]; + }); + }); + syclcompat::get_default_queue().wait_and_throw(); + } + syclcompat::memcpy((void *)&h_C[0][0], (void *)g_C.get_ptr(), DataW * DataH * sizeof(float)); + + // verify hostD + for (int i = 0; i < DataW; i++) { + for (int j = 0; j < DataH; j++) { + if (fabs(h_C[i][j] - h_A[i][j] - h_B[i][j]) > 1e-5) { + fprintf(stderr, "Result verification failed at element [%d][%d]:\n", i, j); + exit(EXIT_FAILURE); + } + } + } +} + +syclcompat::shared_memory s_A(DataW); +syclcompat::shared_memory s_B(DataW); +syclcompat::shared_memory s_C(DataW); + +void test_shared_memory() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + + s_A.init(); + s_B.init(); + s_C.init(); + + for (int i = 0; i < DataW; i++) { + s_A[i] = 1.0f; + s_B[i] = 2.0f; + } + + { + syclcompat::get_default_queue().submit( + [&](sycl::handler &cgh) { + syclcompat::access_wrapper A_acc(s_A.get_ptr(), cgh); + syclcompat::access_wrapper B_acc(s_B.get_ptr(), cgh); + syclcompat::access_wrapper C_acc(s_C.get_ptr(), cgh); + cgh.parallel_for( + sycl::range<1>(DataW), + [=](sycl::id<1> id) { + int i = id[0]; + float * A = A_acc.get_raw_pointer(); + float * B = B_acc.get_raw_pointer(); + float * C = C_acc.get_raw_pointer(); + C[i] = A[i] + B[i]; + }); + }); + syclcompat::get_default_queue().wait_and_throw(); + } + + // verify hostD + for (int i = 0; i < DataW; i++) { + for (int j = 0; j < DataH; j++) { + if (fabs(s_C[i] - s_A[i] - s_B[i]) > 1e-5) { + fprintf(stderr, "Result verification failed at element [%d][%d]:\n", i, j); + exit(EXIT_FAILURE); + } + } + } +} + +void test_local_memory() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + + int Num = 16; + + float *h_A = (float*)malloc(Num * Num * sizeof(float)); + float *h_B = (float*)malloc(Num * Num * sizeof(float)); + + for (int i = 0; i < Num; i++) { + for(int j = 0; j < Num; j++) { + h_A[i * Num + j] = 2.0f; + } + } + + float *d_A; + d_A = (float *)syclcompat::malloc(Num * Num * sizeof(float)); + + { + auto buffer_A = syclcompat::get_buffer(d_A); + + syclcompat::get_default_queue().submit( + [&](sycl::handler &cgh) { + sycl::range<2> acc_range(Num, Num); + sycl::local_accessor C_local_acc(acc_range, cgh); + auto A = buffer_A.get_access(cgh); + + cgh.parallel_for( + sycl::nd_range<2>(sycl::range<2>(Num, Num), sycl::range<2>(Num, Num)), + [=](sycl::nd_item<2> id) { + syclcompat::accessor C_local(C_local_acc, acc_range); + int i = id.get_local_id(0), j = id.get_local_id(1); + C_local[i][j] = 1; + A[i * Num + j] = C_local[i][j] * 2; + }); + }); + syclcompat::get_default_queue().wait_and_throw(); + } + + syclcompat::memcpy((void*) (h_B), (void*) d_A, Num * Num * sizeof(float)); + syclcompat::free((void*)d_A); + + // verify + for(int i = 0; i < Num * Num; i++){ + if (fabs(h_A[i] - h_B[i]) > 1e-5) { + fprintf(stderr,"Check %d: Elements are A = %f, B = %f\n", i, h_A[i], h_B[i]); + fprintf(stderr,"Result verification failed at element %d:\n", i); + exit(EXIT_FAILURE); + } + } + + free(h_A); + free(h_B); +} + +void test_mempcy_pitched(sycl::queue &q) { + std::cout << __PRETTY_FUNCTION__ << std::endl; + size_t width = 6; + size_t height = 8; + float *h_data; + float *h_ref; + size_t h_pitch = sizeof(float) * width; + h_data = (float *)malloc(sizeof(float) * width * height); + for (int i = 0; i < width * height; i++) + h_data[i] = (float)i; + + h_ref = (float *)malloc(sizeof(float) * width * height); + for (int i = 0; i < width * height; i++) + h_ref[i] = (float)i; + + // alloc device memory. + size_t d_pitch; + float *d_data; + d_data = (float *)syclcompat::malloc(d_pitch, sizeof(float) * width, height, q); + + // copy to Device. + syclcompat::memcpy(d_data, d_pitch, h_data, h_pitch, sizeof(float) * width, height, q); + + // copy back to host. + syclcompat::memcpy(h_data, h_pitch, d_data, d_pitch, sizeof(float) * width, height, q); + + check(h_data, h_ref, width, height, 1); + + // memset device data. + syclcompat::memset(d_data, d_pitch, 0x1, sizeof(float) * width, height, q); + + // copy back to host + syclcompat::memcpy(h_data, h_pitch, d_data, d_pitch, sizeof(float) * width, height, q); + + // memset reference data. + memset(h_ref, 0x1, width * height * sizeof(float)); + check(h_data, h_ref, width, height, 1); + + free(h_data); + free(h_ref); + syclcompat::free((void *)d_data, q); + + printf("test_mempcy_pitched passed!\n"); +} + +int main() { + test_mempcy_pitched(); + test_memcpy_reinterp_kernel(); + test_memcpy_kernel(); + test_access_wrapper(); + test_constant_memory(); + test_global_memory(); + test_shared_memory(); + test_local_memory(); + + sycl::queue q; + test_mempcy_pitched(q); + + return 0; +} diff --git a/sycl/test-e2e/syclcompat/memory/memory_management_test3_usmnone.cpp b/sycl/test-e2e/syclcompat/memory/memory_management_test3_usmnone.cpp new file mode 100644 index 0000000000000..3a8e86183b080 --- /dev/null +++ b/sycl/test-e2e/syclcompat/memory/memory_management_test3_usmnone.cpp @@ -0,0 +1,598 @@ +// ====------ memory_management_test3_usmnone.cpp---------- -*- C++ -* ----===//// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// +// ===----------------------------------------------------------------------===// + + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#define SYCLCOMPAT_USM_LEVEL_NONE +#include +#include +#include "memory_common.hpp" + +void check(float *h_data, float *h_ref, size_t width, size_t height, + size_t depth) { + for (int i = 0; i < width * height * depth; i++) { + float diff = fabs(h_data[i] - h_ref[i]); + if (diff > 1.e-6) { + printf("Verification failed!"); + printf("h_data[%d]=%f, h_ref[%d]=%f, diff=%f\n", i, h_data[i], i, + h_ref[i], diff); + exit(-1); + } + } +} + +void test_mempcy_async() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + + int Num = 5000; + int N1 = 1000; + float *h_A = (float*)malloc(Num*sizeof(float)); + float *h_B = (float*)malloc(Num*sizeof(float)); + float *h_C = (float*)malloc(Num*sizeof(float)); + + for (int i = 0; i < Num; i++) { + h_A[i] = 1.0f; + h_B[i] = 2.0f; + } + + float *d_A; + // hostA[0..999] -> deviceA[0..999] + // hostB[0..3999] -> deviceA[1000..4999] + // deviceA[0..4999] -> hostC[0..4999] + d_A = (float *)syclcompat::malloc(Num * sizeof(float)); + syclcompat::memcpy_async((void*) d_A, (void*) h_A, N1 * sizeof(float)); + syclcompat::memcpy_async((void*) (d_A + N1), (void*) h_B, (Num-N1) * sizeof(float)); + syclcompat::memcpy_async((void*) h_C, (void*) d_A, Num * sizeof(float)); + + syclcompat::wait_and_free((void*)d_A); + + syclcompat::free(0); + syclcompat::free(NULL); + syclcompat::free(nullptr); + + // verify + for(int i = 0; i < N1; i++){ + if (fabs(h_A[i] - h_C[i]) > 1e-5) { + fprintf(stderr,"Check: Elements are A = %f, B = %f, C = %f:\n", h_A[i], h_B[i], h_C[i]); + fprintf(stderr,"Result verification failed at element %d:\n", i); + exit(EXIT_FAILURE); + } + } + + for(int i = N1; i < Num; i++){ + if (fabs(h_B[i] - h_C[i]) > 1e-5) { + fprintf(stderr,"Check: Elements are A = %f, B = %f, C = %f:\n", h_A[i], h_B[i], h_C[i]); + fprintf(stderr,"Result verification failed at element %d:\n", i); + exit(EXIT_FAILURE); + } + } + + free(h_A); + free(h_B); + free(h_C); +} + +void test_buffer_and_offset_kernel() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + + int Num = 5000; + int Offset = 0; // Current dpcpp version in ics environment has bugs with Offset > 0, + // CORC-6222 has fixed this issue, but the version of dpcpp used in ics + // environment has not cover this patch. After it has this patch, + // Offest could be set to 100, and current test case will pass. + + float *h_A = (float*)malloc(Num*sizeof(float)); + float *h_B = (float*)malloc(Num*sizeof(float)); + float *h_C = (float*)malloc(Num*sizeof(float)); + + //syclcompat::dev_mgr::instance().select_device(0); + + for (int i = 0; i < Num; i++) { + h_A[i] = 1.0f; + h_B[i] = 2.0f; + } + + float *d_A, *d_B, *d_C; + // hostA -> deviceA + // hostB -> deviceB + // kernel: deviceC = deviceA + deviceB + // deviceA -> hostC + d_A = (float *)syclcompat::malloc(Num * sizeof(float)); + d_B = (float *)syclcompat::malloc(Num * sizeof(float)); + d_C = (float *)syclcompat::malloc(Num * sizeof(float)); + syclcompat::memcpy_async((void*) d_A, (void*) h_A, Num * sizeof(float)); + syclcompat::memcpy_async((void*) d_B, (void*) h_B, Num * sizeof(float)); + + syclcompat::get_default_queue().wait_and_throw(); + + d_A += Offset; + d_B += Offset; + d_C += Offset; + + { + std::pair buffer_and_offset_A = syclcompat::get_buffer_and_offset(d_A); + size_t offset_A = buffer_and_offset_A.second; + std::pair buffer_and_offset_B = syclcompat::get_buffer_and_offset(d_B); + size_t offset_B = buffer_and_offset_A.second; + std::pair buffer_and_offset_C = syclcompat::get_buffer_and_offset(d_C); + size_t offset_C = buffer_and_offset_A.second; + syclcompat::get_default_queue().submit( + [&](sycl::handler &cgh) { + auto d_A_acc = buffer_and_offset_A.first.get_access(cgh); + auto d_B_acc = buffer_and_offset_B.first.get_access(cgh); + auto d_C_acc = buffer_and_offset_C.first.get_access(cgh); + + cgh.parallel_for( + sycl::range<1>(Num-Offset), + [=](sycl::id<1> id) { + + float *A = (float*)(&d_A_acc[0]+offset_A); + float *B = (float*)(&d_B_acc[0]+offset_B); + float *C = (float*)(&d_C_acc[0]+offset_C); + int i = id[0]; + + C[i] = A[i] + B[i]; + }); + }); + } + syclcompat::memcpy_async((void*) (h_C+Offset), (void*) d_C, (Num-Offset) * sizeof(float)); + + syclcompat::get_default_queue().wait_and_throw(); + + syclcompat::free((void*)d_A); + syclcompat::free((void*)d_B); + syclcompat::free((void*)d_C); + + // verify + for(int i = Offset; i < Num; i++){ + if (fabs(h_C[i] - h_A[i] - h_B[i]) > 1e-5) { + fprintf(stderr,"Check %d: Elements are A = %f, B = %f, C = %f:\n", i,h_A[i], h_B[i], h_C[i]); + fprintf(stderr,"Result verification failed at element %d:\n", i); + exit(EXIT_FAILURE); + } + } + + free(h_A); + free(h_B); + free(h_C); +} + + +void test_memset_async() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + + int Num = 10; + int *h_A = (int*)malloc(Num*sizeof(int)); + + for (int i = 0; i < Num; i++) { + h_A[i] = 4; + } + + int *d_A; + + d_A = (int *)syclcompat::malloc(Num * sizeof(int)); + // hostA -> deviceA + syclcompat::memcpy_async((void*) d_A, (void*) h_A, Num * sizeof(int)); + + // set d_A[0,..., 6] = 0 + syclcompat::memset_async((void*) d_A, 0, (Num - 3) * sizeof(int)); + + // deviceA -> hostA + syclcompat::memcpy_async((void*) h_A, (void*) d_A, Num * sizeof(int)); + + syclcompat::get_default_queue().wait_and_throw(); + + syclcompat::free((void*)d_A); + + // check d_A[0,..., 6] = 0 + for (int i = 0; i < Num - 3; i++) { + if (h_A[i] != 0) { + fprintf(stderr, "Check: h_A[%d] is %d:\n", i, h_A[i]); + fprintf(stderr, "Result verification failed at element [%d]!\n", i); + exit(EXIT_FAILURE); + } + } + + // check d_A[7,..., 9] = 4 + for (int i = Num - 3; i < Num; i++) { + if (h_A[i] != 4) { + fprintf(stderr, "Check: h_A[%d] is %d:\n", i, h_A[i]); + fprintf(stderr, "Result verification failed at element h_A[%d]!\n", i); + exit(EXIT_FAILURE); + } + } + + free(h_A); +} + +const unsigned int Num = 5000; +const unsigned int N1 = 1000; +syclcompat::constant_memory d_A(Num * sizeof(float)); +syclcompat::constant_memory d_B(Num * sizeof(float)); + +void test_memcpy_async_getptr() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + + float h_A[Num]; + float h_B[Num]; + float h_C[Num]; + float h_D[Num]; + + for (int i = 0; i < Num; i++) { + h_A[i] = 1.0f; + h_B[i] = 2.0f; + } + + for (int i = 0; i < Num; i++) { + h_A[i] = 1.0f; + h_B[i] = 2.0f; + } + // hostA[0..999] -> deviceA[0..999] + // hostB[0..3999] -> deviceA[1000..4999] + // deviceA[0..4999] -> deviceB[0..4999] + // deviceA[0..4999] -> hostC[0..4999] + // deviceB[0..4999] -> hostD[0..4999] + syclcompat::memcpy_async((void *)d_A.get_ptr(), (void *)&h_A[0], N1 * sizeof(float)); + syclcompat::memcpy_async((char *)d_A.get_ptr() + N1 * sizeof(float), (void*) h_B, (Num-N1) * sizeof(float)); + syclcompat::memcpy_async((void *)h_C, (void *)d_A.get_ptr(), Num * sizeof(float)); + syclcompat::memcpy_async((void *)d_B.get_ptr(), (void *)d_A.get_ptr(), N1 * sizeof(float)); + syclcompat::memcpy_async((char *)d_B.get_ptr() + N1 * sizeof(float), (void *)((size_t)d_A.get_ptr() + N1* sizeof(float)), (Num - N1) * sizeof(float)); + syclcompat::memcpy_async((void *)h_D, (void *)d_B.get_ptr(), Num * sizeof(float)); + + syclcompat::get_default_queue().wait_and_throw(); + // verify hostD + for (int i = 0; i < N1; i++) { + if (fabs(h_A[i] - h_D[i]) > 1e-5) { + fprintf(stderr, "Check: Elements are A = %f, D = %f:\n", h_A[i], h_D[i]); + fprintf(stderr, "Result verification failed at element %d:\n", i); + exit(EXIT_FAILURE); + } + } + + for (int i = N1; i < Num; i++) { + if (fabs(h_B[i] - h_D[i]) > 1e-5) { + fprintf(stderr, "Check: Elements are B = %f, D = %f:\n", h_B[i], h_D[i]); + fprintf(stderr, "Result verification failed at element %d:\n", i); + exit(EXIT_FAILURE); + } + } +} + +void test_memcpy_pitched_async() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + size_t width = 6; + size_t height = 8; + float *h_data; + float *h_ref; + size_t h_pitch = sizeof(float) * width; + h_data = (float *)malloc(sizeof(float) * width * height); + for (int i = 0; i < width * height; i++) + h_data[i] = (float)i; + + h_ref = (float *)malloc(sizeof(float) * width * height); + for (int i = 0; i < width * height; i++) + h_ref[i] = (float)i; + + // alloc device memory. + size_t d_pitch; + float *d_data; + d_data = (float *)syclcompat::malloc(d_pitch, sizeof(float) * width, height); + + // copy to Device. + syclcompat::memcpy_async(d_data, d_pitch, h_data, h_pitch, sizeof(float) * width, height); + + // copy back to host. + syclcompat::memcpy_async(h_data, h_pitch, d_data, d_pitch, sizeof(float) * width, height); + + syclcompat::get_default_queue().wait_and_throw(); + check(h_data, h_ref, width, height, 1); + + // memset device data. + syclcompat::memset_async(d_data, d_pitch, 0x1, sizeof(float) * width, height); + + // copy back to host + syclcompat::memcpy_async(h_data, h_pitch, d_data, d_pitch, sizeof(float) * width, height); + syclcompat::get_default_queue().wait_and_throw(); + // memset reference data. + memset(h_ref, 0x1, width * height * sizeof(float)); + check(h_data, h_ref, width, height, 1); + + free(h_data); + free(h_ref); + syclcompat::free((void *)d_data); +} + +void test_mempcy_async(sycl::queue &q) { + std::cout << __PRETTY_FUNCTION__ << std::endl; + + int Num = 5000; + int N1 = 1000; + float *h_A = (float*)malloc(Num*sizeof(float)); + float *h_B = (float*)malloc(Num*sizeof(float)); + float *h_C = (float*)malloc(Num*sizeof(float)); + + for (int i = 0; i < Num; i++) { + h_A[i] = 1.0f; + h_B[i] = 2.0f; + } + + float *d_A; + // hostA[0..999] -> deviceA[0..999] + // hostB[0..3999] -> deviceA[1000..4999] + // deviceA[0..4999] -> hostC[0..4999] + d_A = (float *)syclcompat::malloc(Num * sizeof(float), q); + syclcompat::memcpy_async((void*) d_A, (void*) h_A, N1 * sizeof(float), q); + syclcompat::memcpy_async((void*) (d_A + N1), (void*) h_B, (Num-N1) * sizeof(float), q); + syclcompat::memcpy_async((void*) h_C, (void*) d_A, Num * sizeof(float), q); + q.wait_and_throw(); + syclcompat::free((void*)d_A, q); + + syclcompat::free(0, q); + syclcompat::free(NULL, q); + syclcompat::free(nullptr, q); + + // verify + for(int i = 0; i < N1; i++){ + if (fabs(h_A[i] - h_C[i]) > 1e-5) { + fprintf(stderr,"Check: Elements are A = %f, B = %f, C = %f:\n", h_A[i], h_B[i], h_C[i]); + fprintf(stderr,"Result verification failed at element %d:\n", i); + exit(EXIT_FAILURE); + } + } + + for(int i = N1; i < Num; i++){ + if (fabs(h_B[i] - h_C[i]) > 1e-5) { + fprintf(stderr,"Check: Elements are A = %f, B = %f, C = %f:\n", h_A[i], h_B[i], h_C[i]); + fprintf(stderr,"Result verification failed at element %d:\n", i); + exit(EXIT_FAILURE); + } + } + + free(h_A); + free(h_B); + free(h_C); +} + +void test_buffer_and_offset_kernel(sycl::queue &q) { + std::cout << __PRETTY_FUNCTION__ << std::endl; + int Num = 5000; + int Offset = 0; // Current dpcpp version in ics environment has bugs with Offset > 0, + // CORC-6222 has fixed this issue, but the version of dpcpp used in ics + // environment has not cover this patch. After it has this patch, + // Offest could be set to 100, and current test case will pass. + + float *h_A = (float*)malloc(Num*sizeof(float)); + float *h_B = (float*)malloc(Num*sizeof(float)); + float *h_C = (float*)malloc(Num*sizeof(float)); + + //syclcompat::dev_mgr::instance().select_device(0); + + for (int i = 0; i < Num; i++) { + h_A[i] = 1.0f; + h_B[i] = 2.0f; + } + + float *d_A, *d_B, *d_C; + // hostA -> deviceA + // hostB -> deviceB + // kernel: deviceC = deviceA + deviceB + // deviceA -> hostC + d_A = (float *)syclcompat::malloc(Num * sizeof(float), q); + d_B = (float *)syclcompat::malloc(Num * sizeof(float), q); + d_C = (float *)syclcompat::malloc(Num * sizeof(float), q); + syclcompat::memcpy_async((void*) d_A, (void*) h_A, Num * sizeof(float), q); + syclcompat::memcpy_async((void*) d_B, (void*) h_B, Num * sizeof(float), q); + q.wait_and_throw(); + d_A += Offset; + d_B += Offset; + d_C += Offset; + + { + std::pair buffer_and_offset_A = syclcompat::get_buffer_and_offset(d_A); + size_t offset_A = buffer_and_offset_A.second; + std::pair buffer_and_offset_B = syclcompat::get_buffer_and_offset(d_B); + size_t offset_B = buffer_and_offset_A.second; + std::pair buffer_and_offset_C = syclcompat::get_buffer_and_offset(d_C); + size_t offset_C = buffer_and_offset_A.second; + syclcompat::get_default_queue().submit( + [&](sycl::handler &cgh) { + auto d_A_acc = buffer_and_offset_A.first.get_access(cgh); + auto d_B_acc = buffer_and_offset_B.first.get_access(cgh); + auto d_C_acc = buffer_and_offset_C.first.get_access(cgh); + + cgh.parallel_for( + sycl::range<1>(Num-Offset), + [=](sycl::id<1> id) { + + float *A = (float*)(&d_A_acc[0]+offset_A); + float *B = (float*)(&d_B_acc[0]+offset_B); + float *C = (float*)(&d_C_acc[0]+offset_C); + int i = id[0]; + + C[i] = A[i] + B[i]; + }); + }); + } + syclcompat::memcpy_async((void*) (h_C+Offset), (void*) d_C, (Num-Offset) * sizeof(float), q); + q.wait_and_throw(); + syclcompat::free((void*)d_A, q); + syclcompat::free((void*)d_B, q); + syclcompat::free((void*)d_C, q); + + // verify + for(int i = Offset; i < Num; i++){ + if (fabs(h_C[i] - h_A[i] - h_B[i]) > 1e-5) { + fprintf(stderr,"Check %d: Elements are A = %f, B = %f, C = %f:\n", i,h_A[i], h_B[i], h_C[i]); + fprintf(stderr,"Result verification failed at element %d:\n", i); + exit(EXIT_FAILURE); + } + } + + free(h_A); + free(h_B); + free(h_C); +} + + +void test_memset_async(sycl::queue &q) { + std::cout << __PRETTY_FUNCTION__ << std::endl; + + int Num = 10; + int *h_A = (int*)malloc(Num*sizeof(int)); + + for (int i = 0; i < Num; i++) { + h_A[i] = 4; + } + + int *d_A; + + d_A = (int *)syclcompat::malloc(Num * sizeof(int), q); + // hostA -> deviceA + syclcompat::memcpy_async((void*) d_A, (void*) h_A, Num * sizeof(int), q); + + // set d_A[0,..., 6] = 0 + syclcompat::memset_async((void*) d_A, 0, (Num - 3) * sizeof(int), q); + + // deviceA -> hostA + syclcompat::memcpy_async((void*) h_A, (void*) d_A, Num * sizeof(int), q); + syclcompat::wait_and_free((void*)d_A, q); + + // check d_A[0,..., 6] = 0 + for (int i = 0; i < Num - 3; i++) { + if (h_A[i] != 0) { + fprintf(stderr, "Check: h_A[%d] is %d:\n", i, h_A[i]); + fprintf(stderr, "Result verification failed at element [%d]!\n", i); + exit(EXIT_FAILURE); + } + } + + // check d_A[7,..., 9] = 4 + for (int i = Num - 3; i < Num; i++) { + if (h_A[i] != 4) { + fprintf(stderr, "Check: h_A[%d] is %d:\n", i, h_A[i]); + fprintf(stderr, "Result verification failed at element h_A[%d]!\n", i); + exit(EXIT_FAILURE); + } + } + free(h_A); +} + +void test_memcpy_async_getptr(sycl::queue &q) { + std::cout << __PRETTY_FUNCTION__ << std::endl; + + const unsigned int Num = 5000; + const unsigned int N1 = 1000; + syclcompat::constant_memory d_A(Num * sizeof(float)); + syclcompat::constant_memory d_B(Num * sizeof(float)); + + float h_A[Num]; + float h_B[Num]; + float h_C[Num]; + float h_D[Num]; + + for (int i = 0; i < Num; i++) { + h_A[i] = 1.0f; + h_B[i] = 2.0f; + } + + for (int i = 0; i < Num; i++) { + h_A[i] = 1.0f; + h_B[i] = 2.0f; + } + // hostA[0..999] -> deviceA[0..999] + // hostB[0..3999] -> deviceA[1000..4999] + // deviceA[0..4999] -> deviceB[0..4999] + // deviceA[0..4999] -> hostC[0..4999] + // deviceB[0..4999] -> hostD[0..4999] + syclcompat::memcpy_async((void *)d_A.get_ptr(), (void *)&h_A[0], N1 * sizeof(float), q); + syclcompat::memcpy_async((char *)d_A.get_ptr() + N1 * sizeof(float), (void*) h_B, (Num-N1) * sizeof(float), q); + syclcompat::memcpy_async((void *)h_C, (void *)d_A.get_ptr(), Num * sizeof(float), q); + syclcompat::memcpy_async((void *)d_B.get_ptr(), (void *)d_A.get_ptr(), N1 * sizeof(float), q); + syclcompat::memcpy_async((char *)d_B.get_ptr() + N1 * sizeof(float), (void *)((size_t)d_A.get_ptr() + N1* sizeof(float)), (Num - N1) * sizeof(float), q); + syclcompat::memcpy_async((void *)h_D, (void *)d_B.get_ptr(), Num * sizeof(float), q); + q.wait_and_throw(); + // verify hostD + for (int i = 0; i < N1; i++) { + if (fabs(h_A[i] - h_D[i]) > 1e-5) { + fprintf(stderr, "Check: Elements are A = %f, D = %f:\n", h_A[i], h_D[i]); + fprintf(stderr, "Result verification failed at element %d:\n", i); + exit(EXIT_FAILURE); + } + } + + for (int i = N1; i < Num; i++) { + if (fabs(h_B[i] - h_D[i]) > 1e-5) { + fprintf(stderr, "Check: Elements are B = %f, D = %f:\n", h_B[i], h_D[i]); + fprintf(stderr, "Result verification failed at element %d:\n", i); + exit(EXIT_FAILURE); + } + } +} + +void test_memcpy_pitched_async(sycl::queue &q) { + std::cout << __PRETTY_FUNCTION__ << std::endl; + size_t width = 6; + size_t height = 8; + float *h_data; + float *h_ref; + size_t h_pitch = sizeof(float) * width; + h_data = (float *)malloc(sizeof(float) * width * height); + for (int i = 0; i < width * height; i++) + h_data[i] = (float)i; + + h_ref = (float *)malloc(sizeof(float) * width * height); + for (int i = 0; i < width * height; i++) + h_ref[i] = (float)i; + + // alloc device memory. + size_t d_pitch; + float *d_data; + d_data = (float *)syclcompat::malloc(d_pitch, sizeof(float) * width, height, q); + + // copy to Device. + syclcompat::memcpy_async(d_data, d_pitch, h_data, h_pitch, sizeof(float) * width, height, q); + + // copy back to host. + syclcompat::memcpy_async(h_data, h_pitch, d_data, d_pitch, sizeof(float) * width, height, q); + q.wait_and_throw(); + check(h_data, h_ref, width, height, 1); + + // memset device data. + syclcompat::memset_async(d_data, d_pitch, 0x1, sizeof(float) * width, height, q); + + // copy back to host + syclcompat::memcpy_async(h_data, h_pitch, d_data, d_pitch, sizeof(float) * width, height, q); + q.wait_and_throw(); + // memset reference data. + memset(h_ref, 0x1, width * height * sizeof(float)); + check(h_data, h_ref, width, height, 1); + + free(h_data); + free(h_ref); + syclcompat::free((void *)d_data, q); +} + +int main() { + test_mempcy_async(); + test_buffer_and_offset_kernel(); + test_memset_async(); + test_memcpy_async_getptr(); + test_memcpy_pitched_async(); + + sycl::queue q; + test_mempcy_async(q); + test_buffer_and_offset_kernel(q); + test_memset_async(q); + test_memcpy_async_getptr(q); + test_memcpy_pitched_async(q); + + return 0; +} diff --git a/sycl/test-e2e/syclcompat/memory/shared_memory_usmnone.cpp b/sycl/test-e2e/syclcompat/memory/shared_memory_usmnone.cpp new file mode 100644 index 0000000000000..2714977056312 --- /dev/null +++ b/sycl/test-e2e/syclcompat/memory/shared_memory_usmnone.cpp @@ -0,0 +1,77 @@ +// ====------ shared_memory.cpp---------- -*- C++ -* ----===//// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// +// ===----------------------------------------------------------------------===// +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#define SYCLCOMPAT_USM_LEVEL_NONE +#include +#include +#include +#include + +#define M 4 +#define N 8 + +syclcompat::shared_memory array(N); +syclcompat::shared_memory result(M*N); + +void my_kernel(float* array, float* result, + sycl::nd_item<3> item_ct1, + float *resultInGroup) +{ + + + if(item_ct1.get_group_linear_id() == 0) + array[item_ct1.get_local_id(2)] = item_ct1.get_local_id(2); + resultInGroup[item_ct1.get_local_id(2)] = item_ct1.get_group(2); + + item_ct1.barrier(); + + if (item_ct1.get_local_id(2) == 0) { + memcpy(&result[item_ct1.get_group(2)*N], resultInGroup, sizeof(float)*N); + } +} + + +int main () { + { + std::pair array_buf_ct0 = syclcompat::get_buffer_and_offset(array.get_ptr()); + size_t array_offset_ct0 = array_buf_ct0.second; + std::pair result_buf_ct1 = syclcompat::get_buffer_and_offset(result.get_ptr()); + size_t result_offset_ct1 = result_buf_ct1.second; + syclcompat::get_default_queue().submit( + [&](sycl::handler &cgh) { + sycl::local_accessor resultInGroup_acc_ct1(sycl::range<1>(8), cgh); + auto array_acc_ct0 = array_buf_ct0.first.get_access(cgh); + auto result_acc_ct1 = result_buf_ct1.first.get_access(cgh); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, M) * sycl::range<3>(1, 1, N), sycl::range<3>(1, 1, N)), + [=](sycl::nd_item<3> item_ct1) { + float *array_ct0 = (float *)(&array_acc_ct0[0] + array_offset_ct0); + float *result_ct1 = (float *)(&result_acc_ct1[0] + result_offset_ct1); + my_kernel(array_ct0, result_ct1, item_ct1, + resultInGroup_acc_ct1 + .get_multi_ptr() + .get()); + }); + }); + } + + syclcompat::get_current_device().queues_wait_and_throw(); + for(int j = 0; j < M; j++) { + for (int i = 0; i < N; i++) { + assert(result[j*N + i] == static_cast(j)); + } + } + for(int j = 0; j < N; j++) + assert(array[j] == static_cast(j)); + return 0; +} + diff --git a/sycl/test-e2e/syclcompat/memory/usm_allocations.cpp b/sycl/test-e2e/syclcompat/memory/usm_allocations.cpp index f8db37ab62304..c460c5e1304fb 100644 --- a/sycl/test-e2e/syclcompat/memory/usm_allocations.cpp +++ b/sycl/test-e2e/syclcompat/memory/usm_allocations.cpp @@ -88,7 +88,6 @@ void test_non_templated_host() { void test_deduce() { std::cout << __PRETTY_FUNCTION__ << std::endl; - using namespace syclcompat::experimental; // for memcpy_direction auto default_queue = syclcompat::get_default_queue(); if (!default_queue.get_device().has(sycl::aspect::usm_host_allocations)) return; // Skip unsupported @@ -100,33 +99,35 @@ void test_deduce() { // * to host assert(syclcompat::detail::deduce_memcpy_direction(default_queue, h_ptr, h_ptr) == - memcpy_direction::device_to_device); - assert(syclcompat::detail::deduce_memcpy_direction( - default_queue, h_ptr, sys_ptr) == memcpy_direction::host_to_host); + syclcompat::detail::memcpy_direction::device_to_device); + assert(syclcompat::detail::deduce_memcpy_direction(default_queue, h_ptr, + sys_ptr) == + syclcompat::detail::memcpy_direction::host_to_host); assert(syclcompat::detail::deduce_memcpy_direction(default_queue, h_ptr, d_ptr) == - memcpy_direction::device_to_device); + syclcompat::detail::memcpy_direction::device_to_device); // * to sys - assert(syclcompat::detail::deduce_memcpy_direction( - default_queue, sys_ptr, h_ptr) == memcpy_direction::host_to_host); + assert(syclcompat::detail::deduce_memcpy_direction(default_queue, sys_ptr, + h_ptr) == + syclcompat::detail::memcpy_direction::host_to_host); assert(syclcompat::detail::deduce_memcpy_direction(default_queue, sys_ptr, sys_ptr) == - memcpy_direction::host_to_host); + syclcompat::detail::memcpy_direction::host_to_host); assert(syclcompat::detail::deduce_memcpy_direction(default_queue, sys_ptr, d_ptr) == - memcpy_direction::device_to_host); + syclcompat::detail::memcpy_direction::device_to_host); // * to dev assert(syclcompat::detail::deduce_memcpy_direction(default_queue, d_ptr, h_ptr) == - memcpy_direction::device_to_device); + syclcompat::detail::memcpy_direction::device_to_device); assert(syclcompat::detail::deduce_memcpy_direction(default_queue, d_ptr, sys_ptr) == - memcpy_direction::host_to_device); + syclcompat::detail::memcpy_direction::host_to_device); assert(syclcompat::detail::deduce_memcpy_direction(default_queue, d_ptr, d_ptr) == - memcpy_direction::device_to_device); + syclcompat::detail::memcpy_direction::device_to_device); std::free(sys_ptr); syclcompat::free(h_ptr); diff --git a/sycl/test-e2e/syclcompat/memory/usm_shared_allocations.cpp b/sycl/test-e2e/syclcompat/memory/usm_shared_allocations.cpp index 9833f0a39242d..b3918254f144e 100644 --- a/sycl/test-e2e/syclcompat/memory/usm_shared_allocations.cpp +++ b/sycl/test-e2e/syclcompat/memory/usm_shared_allocations.cpp @@ -63,7 +63,6 @@ void test_non_templated_shared() { void test_deduce_shared() { std::cout << __PRETTY_FUNCTION__ << std::endl; - using namespace syclcompat::experimental; auto default_queue = syclcompat::get_default_queue(); int *h_ptr = (int *)syclcompat::malloc_host(sizeof(int)); @@ -74,29 +73,31 @@ void test_deduce_shared() { // * to host assert(syclcompat::detail::deduce_memcpy_direction(default_queue, h_ptr, s_ptr) == - memcpy_direction::device_to_device); + syclcompat::detail::memcpy_direction::device_to_device); // * to sys - assert(syclcompat::detail::deduce_memcpy_direction( - default_queue, sys_ptr, s_ptr) == memcpy_direction::host_to_host); + assert(syclcompat::detail::deduce_memcpy_direction(default_queue, sys_ptr, + s_ptr) == + syclcompat::detail::memcpy_direction::host_to_host); // * to dev assert(syclcompat::detail::deduce_memcpy_direction(default_queue, d_ptr, s_ptr) == - memcpy_direction::device_to_device); + syclcompat::detail::memcpy_direction::device_to_device); // * to shared assert(syclcompat::detail::deduce_memcpy_direction(default_queue, s_ptr, h_ptr) == - memcpy_direction::device_to_device); - assert(syclcompat::detail::deduce_memcpy_direction( - default_queue, s_ptr, sys_ptr) == memcpy_direction::host_to_host); + syclcompat::detail::memcpy_direction::device_to_device); + assert(syclcompat::detail::deduce_memcpy_direction(default_queue, s_ptr, + sys_ptr) == + syclcompat::detail::memcpy_direction::host_to_host); assert(syclcompat::detail::deduce_memcpy_direction(default_queue, s_ptr, d_ptr) == - memcpy_direction::device_to_device); + syclcompat::detail::memcpy_direction::device_to_device); assert(syclcompat::detail::deduce_memcpy_direction(default_queue, s_ptr, s_ptr) == - memcpy_direction::device_to_device); + syclcompat::detail::memcpy_direction::device_to_device); syclcompat::free(s_ptr); std::free(sys_ptr); diff --git a/sycl/test-e2e/syclcompat/util/util_fast_length_test_usmnone.cpp b/sycl/test-e2e/syclcompat/util/util_fast_length_test_usmnone.cpp new file mode 100644 index 0000000000000..a05595b18d6de --- /dev/null +++ b/sycl/test-e2e/syclcompat/util/util_fast_length_test_usmnone.cpp @@ -0,0 +1,169 @@ +// ====------ util_fast_length_test.cpp---------- -*- C++ -* ----===//// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// +// ===----------------------------------------------------------------------===// + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#define SYCLCOMPAT_USM_LEVEL_NONE +#include +#include + +void fast_length_test() { + + { + float inputData_0(0.8970062715); + + sycl::range<1> ndRng(1); + float *kernelResult = new float[1]; + auto testQueue = syclcompat::get_default_queue(); + { + sycl::buffer buffer(kernelResult, ndRng); + + testQueue.submit([&](sycl::handler &h) { + auto resultPtr = + buffer.template get_access(h); + + h.single_task( + [=]() { resultPtr[0] = syclcompat::fast_length(&inputData_0, 1); }); + }); + } + testQueue.wait_and_throw(); + if (fabs(inputData_0 - *kernelResult) > 1e-5) { + printf("fast_length_test 1 failed\n"); + exit(-1); + } + delete[] kernelResult; + } + + { + float inputData_0[2] = {0.8335529744, 0.7346600673}; + + sycl::range<1> ndRng(1); + float *kernelResult = new float[1]; + auto testQueue = syclcompat::get_default_queue(); + { + sycl::buffer buffer(kernelResult, ndRng); + + testQueue.submit([&](sycl::handler &h) { + auto resultPtr = + buffer.template get_access(h); + + h.single_task( + [=]() { resultPtr[0] = syclcompat::fast_length(&inputData_0[0], 2); }); + }); + } + testQueue.wait_and_throw(); + + if (fabs(sqrtf(0.8335529744 * 0.8335529744 + 0.7346600673 * 0.7346600673) - + *kernelResult) > 1e-5) { + printf("fast_length_test 2 failed\n"); + exit(-1); + } + + delete[] kernelResult; + } + + { + float inputData_0[3] = {0.1658983906, 0.590226484, 0.4891553616}; + + sycl::range<1> ndRng(1); + float *kernelResult = new float[1]; + auto testQueue = syclcompat::get_default_queue(); + { + sycl::buffer buffer(kernelResult, ndRng); + + testQueue.submit([&](sycl::handler &h) { + auto resultPtr = + buffer.template get_access(h); + + h.single_task( + [=]() { resultPtr[0] = syclcompat::fast_length(&inputData_0[0], 3); }); + }); + } + testQueue.wait_and_throw(); + + if (fabs(sqrtf(0.1658983906 * 0.1658983906 + 0.590226484 * 0.590226484 + + 0.4891553616 * 0.4891553616) - + *kernelResult) > 1e-5) { + printf("fast_length_test 3 failed\n"); + exit(-1); + } + + delete[] kernelResult; + } + + { + float inputData_0[4] = {0.6041178723, 0.7760620605, 0.2944284976, + 0.6851913766}; + + sycl::range<1> ndRng(1); + float *kernelResult = new float[1]; + auto testQueue = syclcompat::get_default_queue(); + { + sycl::buffer buffer(kernelResult, ndRng); + + testQueue.submit([&](sycl::handler &h) { + auto resultPtr = + buffer.template get_access(h); + + h.single_task( + [=]() { resultPtr[0] = syclcompat::fast_length(&inputData_0[0], 4); }); + }); + } + testQueue.wait_and_throw(); + + if (fabs(sqrtf(0.6041178723 * 0.6041178723 + 0.7760620605 * 0.7760620605 + + 0.2944284976 * 0.2944284976 + 0.6851913766 * 0.6851913766) - + *kernelResult) > 1e-5) { + printf("fast_length_test 4 failed\n"); + exit(-1); + } + + delete[] kernelResult; + } + + { + float inputData_0[5] = {0.6041178723, 0.7760620605, 0.2944284976, + 0.6851913766, 0.6851913766}; + + sycl::range<1> ndRng(1); + float *kernelResult = new float[1]; + auto testQueue = syclcompat::get_default_queue(); + { + sycl::buffer buffer(kernelResult, ndRng); + + testQueue.submit([&](sycl::handler &h) { + auto resultPtr = + buffer.template get_access(h); + + h.single_task( + [=]() { resultPtr[0] = syclcompat::fast_length(&inputData_0[0], 5); }); + }); + } + testQueue.wait_and_throw(); + + if (fabs(sqrtf(0.6041178723 * 0.6041178723 + 0.7760620605 * 0.7760620605 + + 0.2944284976 * 0.2944284976 + 0.6851913766 * 0.6851913766 + + 0.6851913766 * 0.6851913766) - + *kernelResult) > 1e-5) { + printf("fast_length_test 5 failed\n"); + exit(-1); + } + + delete[] kernelResult; + } + printf("fast_length test is passed!\n"); +} + +int main() { + + fast_length_test(); + + return 0; +}