diff --git a/examples/run/alpaka/full_chain_algorithm.cpp b/examples/run/alpaka/full_chain_algorithm.cpp index d7a4744a4e..d08445d5e7 100644 --- a/examples/run/alpaka/full_chain_algorithm.cpp +++ b/examples/run/alpaka/full_chain_algorithm.cpp @@ -29,9 +29,8 @@ full_chain_algorithm::full_chain_algorithm( m_queue(), m_vecmem_objects(m_queue), m_host_mr(host_mr), - m_cached_device_mr( - std::make_unique<::vecmem::binary_page_memory_resource>( - m_vecmem_objects.device_mr())), + m_cached_pinned_host_mr(m_vecmem_objects.host_mr()), + m_cached_device_mr(m_vecmem_objects.device_mr()), m_field_vec{0.f, 0.f, finder_config.bFieldInZ}, m_field(field), m_det_descr(det_descr), @@ -40,29 +39,27 @@ full_chain_algorithm::full_chain_algorithm( m_det_descr.get().size()), m_vecmem_objects.device_mr()), m_detector(detector), - m_clusterization(memory_resource{*m_cached_device_mr, &m_host_mr}, + m_clusterization({m_cached_device_mr, &m_cached_pinned_host_mr}, m_vecmem_objects.async_copy(), m_queue, clustering_config), - m_measurement_sorting(memory_resource{*m_cached_device_mr, &m_host_mr}, + m_measurement_sorting({m_cached_device_mr, &m_cached_pinned_host_mr}, m_vecmem_objects.async_copy(), m_queue, logger->cloneWithSuffix("MeasSortingAlg")), - m_spacepoint_formation(memory_resource{*m_cached_device_mr, &m_host_mr}, + m_spacepoint_formation({m_cached_device_mr, &m_cached_pinned_host_mr}, m_vecmem_objects.async_copy(), m_queue, logger->cloneWithSuffix("SpFormationAlg")), m_seeding(finder_config, grid_config, filter_config, - memory_resource{*m_cached_device_mr, &m_host_mr}, + {m_cached_device_mr, &m_cached_pinned_host_mr}, m_vecmem_objects.async_copy(), m_queue, logger->cloneWithSuffix("SeedingAlg")), m_track_parameter_estimation( - memory_resource{*m_cached_device_mr, &m_host_mr}, + {m_cached_device_mr, &m_cached_pinned_host_mr}, m_vecmem_objects.async_copy(), m_queue, logger->cloneWithSuffix("TrackParamEstAlg")), - m_finding(finding_config, - memory_resource{*m_cached_device_mr, &m_host_mr}, + m_finding(finding_config, {m_cached_device_mr, &m_cached_pinned_host_mr}, m_vecmem_objects.async_copy(), m_queue, logger->cloneWithSuffix("TrackFindingAlg")), - m_fitting(fitting_config, - memory_resource{*m_cached_device_mr, &m_host_mr}, + m_fitting(fitting_config, {m_cached_device_mr, &m_cached_pinned_host_mr}, m_vecmem_objects.async_copy(), m_queue, logger->cloneWithSuffix("TrackFittingAlg")), m_clustering_config(clustering_config), @@ -92,9 +89,8 @@ full_chain_algorithm::full_chain_algorithm(const full_chain_algorithm& parent) m_queue(), m_vecmem_objects(m_queue), m_host_mr(parent.m_host_mr), - m_cached_device_mr( - std::make_unique<::vecmem::binary_page_memory_resource>( - m_vecmem_objects.device_mr())), + m_cached_pinned_host_mr(m_vecmem_objects.host_mr()), + m_cached_device_mr(m_vecmem_objects.device_mr()), m_field_vec(parent.m_field_vec), m_field(parent.m_field), m_det_descr(parent.m_det_descr), @@ -103,30 +99,30 @@ full_chain_algorithm::full_chain_algorithm(const full_chain_algorithm& parent) m_det_descr.get().size()), m_vecmem_objects.device_mr()), m_detector(parent.m_detector), - m_clusterization(memory_resource{*m_cached_device_mr, &m_host_mr}, + m_clusterization({m_cached_device_mr, &m_cached_pinned_host_mr}, m_vecmem_objects.async_copy(), m_queue, parent.m_clustering_config), - m_measurement_sorting(memory_resource{*m_cached_device_mr, &m_host_mr}, + m_measurement_sorting({m_cached_device_mr, &m_cached_pinned_host_mr}, m_vecmem_objects.async_copy(), m_queue, parent.logger().cloneWithSuffix("MeasSortingAlg")), - m_spacepoint_formation(memory_resource{*m_cached_device_mr, &m_host_mr}, + m_spacepoint_formation({m_cached_device_mr, &m_cached_pinned_host_mr}, m_vecmem_objects.async_copy(), m_queue, parent.logger().cloneWithSuffix("SpFormationAlg")), m_seeding(parent.m_finder_config, parent.m_grid_config, parent.m_filter_config, - memory_resource{*m_cached_device_mr, &m_host_mr}, + {m_cached_device_mr, &m_cached_pinned_host_mr}, m_vecmem_objects.async_copy(), m_queue, parent.logger().cloneWithSuffix("SeedingAlg")), m_track_parameter_estimation( - memory_resource{*m_cached_device_mr, &m_host_mr}, + {m_cached_device_mr, &m_cached_pinned_host_mr}, m_vecmem_objects.async_copy(), m_queue, parent.logger().cloneWithSuffix("TrackParamEstAlg")), m_finding(parent.m_finding_config, - memory_resource{*m_cached_device_mr, &m_host_mr}, + {m_cached_device_mr, &m_cached_pinned_host_mr}, m_vecmem_objects.async_copy(), m_queue, parent.logger().cloneWithSuffix("TrackFindingAlg")), m_fitting(parent.m_fitting_config, - memory_resource{*m_cached_device_mr, &m_host_mr}, + {m_cached_device_mr, &m_cached_pinned_host_mr}, m_vecmem_objects.async_copy(), m_queue, parent.logger().cloneWithSuffix("TrackFittingAlg")), m_clustering_config(parent.m_clustering_config), @@ -156,7 +152,7 @@ full_chain_algorithm::output_type full_chain_algorithm::operator()( // Create device copy of input collections edm::silicon_cell_collection::buffer cells_buffer( - static_cast(cells.size()), *m_cached_device_mr); + static_cast(cells.size()), m_cached_device_mr); m_vecmem_objects.async_copy()(::vecmem::get_data(cells), cells_buffer) ->ignore(); @@ -184,8 +180,12 @@ full_chain_algorithm::output_type full_chain_algorithm::operator()( m_device_detector_view, m_field, {track_candidates, measurements}); // Copy a limited amount of result data back to the host. + const auto host_tracks = m_vecmem_objects.async_copy().to( + track_states.tracks, m_cached_pinned_host_mr, nullptr, + ::vecmem::copy::type::device_to_host); output_type result{m_host_mr}; - m_vecmem_objects.async_copy()(track_states.tracks, result)->wait(); + ::vecmem::copy host_copy; + host_copy(host_tracks, result)->wait(); return result; } @@ -207,7 +207,7 @@ bound_track_parameters_collection_types::host full_chain_algorithm::seeding( // Create device copy of input collections edm::silicon_cell_collection::buffer cells_buffer( - static_cast(cells.size()), *m_cached_device_mr); + static_cast(cells.size()), m_cached_device_mr); m_vecmem_objects.async_copy()(::vecmem::get_data(cells), cells_buffer) ->ignore(); @@ -227,8 +227,12 @@ bound_track_parameters_collection_types::host full_chain_algorithm::seeding( m_seeding(spacepoints), m_field_vec); // Copy a limited amount of result data back to the host. + const auto host_seeds = m_vecmem_objects.async_copy().to( + track_params, m_cached_pinned_host_mr, + ::vecmem::copy::type::device_to_host); bound_track_parameters_collection_types::host result{&m_host_mr}; - m_vecmem_objects.async_copy()(track_params, result)->wait(); + ::vecmem::copy host_copy; + host_copy(host_seeds, result)->wait(); return result; } diff --git a/examples/run/alpaka/full_chain_algorithm.hpp b/examples/run/alpaka/full_chain_algorithm.hpp index 8cbd6fa7fe..564b138a37 100644 --- a/examples/run/alpaka/full_chain_algorithm.hpp +++ b/examples/run/alpaka/full_chain_algorithm.hpp @@ -125,8 +125,10 @@ class full_chain_algorithm /// Host memory resource ::vecmem::memory_resource& m_host_mr; + /// Cached pinned host memory resource + mutable ::vecmem::binary_page_memory_resource m_cached_pinned_host_mr; /// Device caching memory resource - std::unique_ptr<::vecmem::binary_page_memory_resource> m_cached_device_mr; + mutable ::vecmem::binary_page_memory_resource m_cached_device_mr; /// Constant B field for the (seed) track parameter estimation traccc::vector3 m_field_vec; diff --git a/examples/run/alpaka/throughput_mt.cpp b/examples/run/alpaka/throughput_mt.cpp index 958ae518be..f159e4c010 100644 --- a/examples/run/alpaka/throughput_mt.cpp +++ b/examples/run/alpaka/throughput_mt.cpp @@ -13,9 +13,6 @@ int main(int argc, char* argv[]) { // Execute the throughput test. - static const bool use_host_caching = true; - return traccc::throughput_mt( - "Multi-threaded Alpaka GPU throughput tests", argc, argv, - use_host_caching); + return traccc::throughput_mt( + "Multi-threaded Alpaka GPU throughput tests", argc, argv); } diff --git a/examples/run/alpaka/throughput_st.cpp b/examples/run/alpaka/throughput_st.cpp index 8c51cdf05e..1084688a31 100644 --- a/examples/run/alpaka/throughput_st.cpp +++ b/examples/run/alpaka/throughput_st.cpp @@ -13,9 +13,6 @@ int main(int argc, char* argv[]) { // Execute the throughput test. - static const bool use_host_caching = true; - return traccc::throughput_st( - "Single-threaded Alpaka GPU throughput tests", argc, argv, - use_host_caching); + return traccc::throughput_st( + "Single-threaded Alpaka GPU throughput tests", argc, argv); } diff --git a/examples/run/common/throughput_mt.hpp b/examples/run/common/throughput_mt.hpp index 88c88192cb..1d84bfb1bc 100644 --- a/examples/run/common/throughput_mt.hpp +++ b/examples/run/common/throughput_mt.hpp @@ -1,15 +1,12 @@ /** TRACCC library, part of the ACTS project (R&D line) * - * (c) 2022 CERN for the benefit of the ACTS project + * (c) 2022-2025 CERN for the benefit of the ACTS project * * Mozilla Public License Version 2.0 */ #pragma once -// VecMem include(s). -#include - // System include(s). #include @@ -18,18 +15,15 @@ namespace traccc { /// Helper function running a multi-threaded throughput test /// /// @tparam FULL_CHAIN_ALG The type of the full chain algorithm to use -/// @tparam HOST_MR The host memory resource type to use +/// /// @param description A short description of the application /// @param argc The count of command line arguments (from @c main(...)) /// @param argv The command line arguments (from @c main(...)) -/// @param use_host_caching Flag specifying whether host-side memory caching -/// should be used +/// /// @return The value to be returned from @c main(...) /// -template -int throughput_mt(std::string_view description, int argc, char* argv[], - bool use_host_caching = false); +template +int throughput_mt(std::string_view description, int argc, char* argv[]); } // namespace traccc diff --git a/examples/run/common/throughput_mt.ipp b/examples/run/common/throughput_mt.ipp index d6c5385ec1..6698f82cf1 100644 --- a/examples/run/common/throughput_mt.ipp +++ b/examples/run/common/throughput_mt.ipp @@ -38,7 +38,7 @@ #include "traccc/performance/timing_info.hpp" // VecMem include(s). -#include +#include // TBB include(s). #include @@ -61,9 +61,9 @@ namespace traccc { -template -int throughput_mt(std::string_view description, int argc, char* argv[], - bool use_host_caching) { +template +int throughput_mt(std::string_view description, int argc, char* argv[]) { + std::unique_ptr ilogger = traccc::getDefaultLogger( "ThroughputExample", traccc::Logging::Level::INFO); TRACCC_LOCAL_LOGGER(std::move(ilogger)); @@ -92,20 +92,20 @@ int throughput_mt(std::string_view description, int argc, char* argv[], performance::timing_info times; // Memory resource to use in the test. - HOST_MR uncached_host_mr; + vecmem::host_memory_resource host_mr; // Construct the detector description object. - traccc::silicon_detector_description::host det_descr{uncached_host_mr}; + traccc::silicon_detector_description::host det_descr{host_mr}; traccc::io::read_detector_description( det_descr, detector_opts.detector_file, detector_opts.digitization_file, (detector_opts.use_detray_detector ? traccc::data_format::json : traccc::data_format::csv)); // Construct a Detray detector object, if supported by the configuration. - traccc::default_detector::host detector{uncached_host_mr}; + traccc::default_detector::host detector{host_mr}; if (detector_opts.use_detray_detector) { traccc::io::read_detector( - detector, uncached_host_mr, detector_opts.detector_file, + detector, host_mr, detector_opts.detector_file, detector_opts.material_file, detector_opts.grid_file); } @@ -113,7 +113,7 @@ int throughput_mt(std::string_view description, int argc, char* argv[], const auto field = details::make_magnetic_field(bfield_opts); // Read in all input events into memory. - vecmem::vector input{&uncached_host_mr}; + vecmem::vector input{&host_mr}; { performance::timer t{"File reading", times}; // Set up the container for the input events. @@ -121,7 +121,7 @@ int throughput_mt(std::string_view description, int argc, char* argv[], const std::size_t first_event = input_opts.skip; const std::size_t last_event = input_opts.skip + input_opts.events; for (std::size_t i = first_event; i < last_event; ++i) { - input.emplace_back(uncached_host_mr); + input.emplace_back(host_mr); } // Read the input cells into memory in parallel. tbb::parallel_for( @@ -138,19 +138,6 @@ int throughput_mt(std::string_view description, int argc, char* argv[], }); } - // Set up cached memory resources on top of the host memory resource - // separately for each CPU thread. - std::vector > - cached_host_mrs; - if (use_host_caching) { - cached_host_mrs.reserve(threading_opts.threads + 1); - for (std::size_t i = 0; i < threading_opts.threads + 1; ++i) { - cached_host_mrs.push_back( - std::make_unique( - uncached_host_mr)); - } - } - // Algorithm configuration(s). typename FULL_CHAIN_ALG::clustering_algorithm::config_type clustering_cfg( clusterization_opts); @@ -170,16 +157,9 @@ int throughput_mt(std::string_view description, int argc, char* argv[], std::vector algs; algs.reserve(threading_opts.threads + 1); for (std::size_t i = 0; i < threading_opts.threads + 1; ++i) { - - vecmem::memory_resource& alg_host_mr = - use_host_caching - ? static_cast( - *(cached_host_mrs.at(i))) - : static_cast(uncached_host_mr); algs.push_back( - {alg_host_mr, clustering_cfg, seedfinder_config, - spacepoint_grid_config, seedfilter_config, finding_cfg, - fitting_cfg, det_descr, field, + {host_mr, clustering_cfg, seedfinder_config, spacepoint_grid_config, + seedfilter_config, finding_cfg, fitting_cfg, det_descr, field, (detector_opts.use_detray_detector ? &detector : nullptr), logger().clone()}); } @@ -304,10 +284,9 @@ int throughput_mt(std::string_view description, int argc, char* argv[], group.wait(); } - // Delete the algorithms and host memory caches explicitly before their - // parent object would go out of scope. + // Delete the algorithms explicitly before their parent object would go out + // of scope. algs.clear(); - cached_host_mrs.clear(); // Print some results. TRACCC_INFO("Reconstructed track parameters: " << rec_track_params.load()); diff --git a/examples/run/common/throughput_st.hpp b/examples/run/common/throughput_st.hpp index 34e4ff01ff..8e559f1449 100644 --- a/examples/run/common/throughput_st.hpp +++ b/examples/run/common/throughput_st.hpp @@ -1,18 +1,12 @@ /** TRACCC library, part of the ACTS project (R&D line) * - * (c) 2022 CERN for the benefit of the ACTS project + * (c) 2022-2025 CERN for the benefit of the ACTS project * * Mozilla Public License Version 2.0 */ #pragma once -// Projection include(s). -#include "traccc/seeding/detail/seeding_config.hpp" - -// VecMem include(s). -#include - // System include(s). #include @@ -21,18 +15,15 @@ namespace traccc { /// Helper function running a single-threaded throughput test /// /// @tparam FULL_CHAIN_ALG The type of the full chain algorithm to use -/// @tparam HOST_MR The host memory resource type to use +/// /// @param description A short description of the application /// @param argc The count of command line arguments (from @c main(...)) /// @param argv The command line arguments (from @c main(...)) -/// @param use_host_caching Flag specifying whether host-side memory caching -/// should be used +/// /// @return The value to be returned from @c main(...) /// -template -int throughput_st(std::string_view description, int argc, char* argv[], - bool use_host_caching = false); +template +int throughput_st(std::string_view description, int argc, char* argv[]); } // namespace traccc diff --git a/examples/run/common/throughput_st.ipp b/examples/run/common/throughput_st.ipp index 6e12d49c66..f03922822d 100644 --- a/examples/run/common/throughput_st.ipp +++ b/examples/run/common/throughput_st.ipp @@ -37,7 +37,7 @@ #include "traccc/performance/timing_info.hpp" // VecMem include(s). -#include +#include // Indicators include(s). #include @@ -51,9 +51,9 @@ namespace traccc { -template -int throughput_st(std::string_view description, int argc, char* argv[], - bool use_host_caching) { +template +int throughput_st(std::string_view description, int argc, char* argv[]) { + std::unique_ptr logger = traccc::getDefaultLogger( "ThroughputExample", traccc::Logging::Level::INFO); @@ -80,42 +80,35 @@ int throughput_st(std::string_view description, int argc, char* argv[], performance::timing_info times; // Memory resource to use in the test. - HOST_MR uncached_host_mr; - std::unique_ptr cached_host_mr = - std::make_unique(uncached_host_mr); + vecmem::host_memory_resource host_mr; // Construct the detector description object. - traccc::silicon_detector_description::host det_descr{uncached_host_mr}; + traccc::silicon_detector_description::host det_descr{host_mr}; traccc::io::read_detector_description( det_descr, detector_opts.detector_file, detector_opts.digitization_file, (detector_opts.use_detray_detector ? traccc::data_format::json : traccc::data_format::csv)); // Construct a Detray detector object, if supported by the configuration. - traccc::default_detector::host detector{uncached_host_mr}; + traccc::default_detector::host detector{host_mr}; if (detector_opts.use_detray_detector) { traccc::io::read_detector( - detector, uncached_host_mr, detector_opts.detector_file, + detector, host_mr, detector_opts.detector_file, detector_opts.material_file, detector_opts.grid_file); } // Construct the magnetic field object. const auto field = details::make_magnetic_field(bfield_opts); - vecmem::memory_resource& alg_host_mr = - use_host_caching - ? static_cast(*cached_host_mr) - : static_cast(uncached_host_mr); - // Read in all input events into memory. - vecmem::vector input{&uncached_host_mr}; + vecmem::vector input{&host_mr}; { performance::timer t{"File reading", times}; // Read the input cells into memory event-by-event. input.reserve(input_opts.events); for (std::size_t i = input_opts.skip; i < input_opts.skip + input_opts.events; ++i) { - input.emplace_back(uncached_host_mr); + input.emplace_back(host_mr); static constexpr bool DEDUPLICATE = true; io::read_cells(input.back(), i, input_opts.directory, logger->clone(), &det_descr, input_opts.format, @@ -143,7 +136,7 @@ int throughput_st(std::string_view description, int argc, char* argv[], // Set up the full-chain algorithm. std::unique_ptr alg = std::make_unique( - alg_host_mr, clustering_cfg, seedfinder_config, spacepoint_grid_config, + host_mr, clustering_cfg, seedfinder_config, spacepoint_grid_config, seedfilter_config, finding_cfg, fitting_cfg, det_descr, field, (detector_opts.use_detray_detector ? &detector : nullptr), logger->clone("FullChainAlg")); @@ -235,7 +228,6 @@ int throughput_st(std::string_view description, int argc, char* argv[], // Explicitly delete the objects in the correct order. alg.reset(); - cached_host_mr.reset(); // Print some results. std::cout << "Reconstructed track parameters: " << rec_track_params diff --git a/examples/run/cpu/throughput_mt.cpp b/examples/run/cpu/throughput_mt.cpp index ab79e836d2..0623f0da9b 100644 --- a/examples/run/cpu/throughput_mt.cpp +++ b/examples/run/cpu/throughput_mt.cpp @@ -13,8 +13,6 @@ int main(int argc, char* argv[]) { // Execute the throughput test. - static constexpr bool USE_HOST_CACHING = false; return traccc::throughput_mt( - "Multi-threaded host-only throughput tests", argc, argv, - USE_HOST_CACHING); + "Multi-threaded host-only throughput tests", argc, argv); } diff --git a/examples/run/cpu/throughput_st.cpp b/examples/run/cpu/throughput_st.cpp index 202b50c50d..e37a9abeb9 100644 --- a/examples/run/cpu/throughput_st.cpp +++ b/examples/run/cpu/throughput_st.cpp @@ -13,8 +13,6 @@ int main(int argc, char* argv[]) { // Execute the throughput test. - static constexpr bool USE_HOST_CACHING = false; return traccc::throughput_st( - "Single-threaded host-only throughput tests", argc, argv, - USE_HOST_CACHING); + "Single-threaded host-only throughput tests", argc, argv); } diff --git a/examples/run/cuda/full_chain_algorithm.cpp b/examples/run/cuda/full_chain_algorithm.cpp index cb23475fa7..367fbd2200 100644 --- a/examples/run/cuda/full_chain_algorithm.cpp +++ b/examples/run/cuda/full_chain_algorithm.cpp @@ -43,10 +43,11 @@ full_chain_algorithm::full_chain_algorithm( std::unique_ptr logger) : messaging(logger->clone()), m_host_mr(host_mr), + m_pinned_host_mr(), + m_cached_pinned_host_mr(m_pinned_host_mr), m_stream(), m_device_mr(), - m_cached_device_mr( - std::make_unique(m_device_mr)), + m_cached_device_mr(m_device_mr), m_copy(m_stream.cudaStream()), m_field_vec{0.f, 0.f, finder_config.bFieldInZ}, m_field(make_magnetic_field(field)), @@ -56,26 +57,24 @@ full_chain_algorithm::full_chain_algorithm( m_det_descr.get().size()), m_device_mr), m_detector(detector), - m_clusterization(memory_resource{*m_cached_device_mr, &m_host_mr}, m_copy, + m_clusterization({m_cached_device_mr, &m_cached_pinned_host_mr}, m_copy, m_stream, clustering_config), - m_measurement_sorting(memory_resource{*m_cached_device_mr, &m_host_mr}, + m_measurement_sorting({m_cached_device_mr, &m_cached_pinned_host_mr}, m_copy, m_stream, logger->cloneWithSuffix("MeasSortingAlg")), - m_spacepoint_formation(memory_resource{*m_cached_device_mr, &m_host_mr}, + m_spacepoint_formation({m_cached_device_mr, &m_cached_pinned_host_mr}, m_copy, m_stream, logger->cloneWithSuffix("SpFormationAlg")), m_seeding(finder_config, grid_config, filter_config, - memory_resource{*m_cached_device_mr, &m_host_mr}, m_copy, + {m_cached_device_mr, &m_cached_pinned_host_mr}, m_copy, m_stream, logger->cloneWithSuffix("SeedingAlg")), m_track_parameter_estimation( - memory_resource{*m_cached_device_mr, &m_host_mr}, m_copy, m_stream, + {m_cached_device_mr, &m_cached_pinned_host_mr}, m_copy, m_stream, logger->cloneWithSuffix("TrackParEstAlg")), - m_finding(finding_config, - memory_resource{*m_cached_device_mr, &m_host_mr}, m_copy, - m_stream, logger->cloneWithSuffix("TrackFindingAlg")), - m_fitting(fitting_config, - memory_resource{*m_cached_device_mr, &m_host_mr}, m_copy, - m_stream, logger->cloneWithSuffix("TrackFittingAlg")), + m_finding(finding_config, {m_cached_device_mr, &m_cached_pinned_host_mr}, + m_copy, m_stream, logger->cloneWithSuffix("TrackFindingAlg")), + m_fitting(fitting_config, {m_cached_device_mr, &m_cached_pinned_host_mr}, + m_copy, m_stream, logger->cloneWithSuffix("TrackFittingAlg")), m_clustering_config(clustering_config), m_finder_config(finder_config), m_grid_config(grid_config), @@ -105,10 +104,11 @@ full_chain_algorithm::full_chain_algorithm( full_chain_algorithm::full_chain_algorithm(const full_chain_algorithm& parent) : messaging(parent.logger().clone()), m_host_mr(parent.m_host_mr), + m_pinned_host_mr(), + m_cached_pinned_host_mr(m_pinned_host_mr), m_stream(), m_device_mr(), - m_cached_device_mr( - std::make_unique(m_device_mr)), + m_cached_device_mr(m_device_mr), m_copy(m_stream.cudaStream()), m_field_vec(parent.m_field_vec), m_field(parent.m_field), @@ -118,26 +118,26 @@ full_chain_algorithm::full_chain_algorithm(const full_chain_algorithm& parent) m_det_descr.get().size()), m_device_mr), m_detector(parent.m_detector), - m_clusterization(memory_resource{*m_cached_device_mr, &m_host_mr}, m_copy, + m_clusterization({m_cached_device_mr, &m_cached_pinned_host_mr}, m_copy, m_stream, parent.m_clustering_config), - m_measurement_sorting(memory_resource{*m_cached_device_mr, &m_host_mr}, + m_measurement_sorting({m_cached_device_mr, &m_cached_pinned_host_mr}, m_copy, m_stream, parent.logger().cloneWithSuffix("MeasSortingAlg")), - m_spacepoint_formation(memory_resource{*m_cached_device_mr, &m_host_mr}, + m_spacepoint_formation({m_cached_device_mr, &m_cached_pinned_host_mr}, m_copy, m_stream, parent.logger().cloneWithSuffix("SpFormationAlg")), m_seeding(parent.m_finder_config, parent.m_grid_config, parent.m_filter_config, - memory_resource{*m_cached_device_mr, &m_host_mr}, m_copy, + {m_cached_device_mr, &m_cached_pinned_host_mr}, m_copy, m_stream, parent.logger().cloneWithSuffix("SeedingAlg")), m_track_parameter_estimation( - memory_resource{*m_cached_device_mr, &m_host_mr}, m_copy, m_stream, + {m_cached_device_mr, &m_cached_pinned_host_mr}, m_copy, m_stream, parent.logger().cloneWithSuffix("TrackParamEstAlg")), m_finding(parent.m_finding_config, - memory_resource{*m_cached_device_mr, &m_host_mr}, m_copy, + {m_cached_device_mr, &m_cached_pinned_host_mr}, m_copy, m_stream, parent.logger().cloneWithSuffix("TrackFindingAlg")), m_fitting(parent.m_fitting_config, - memory_resource{*m_cached_device_mr, &m_host_mr}, m_copy, + {m_cached_device_mr, &m_cached_pinned_host_mr}, m_copy, m_stream, parent.logger().cloneWithSuffix("TrackFittingAlg")), m_clustering_config(parent.m_clustering_config), m_finder_config(parent.m_finder_config), @@ -163,7 +163,7 @@ full_chain_algorithm::output_type full_chain_algorithm::operator()( // Create device copy of input collections edm::silicon_cell_collection::buffer cells_buffer( - static_cast(cells.size()), *m_cached_device_mr); + static_cast(cells.size()), m_cached_device_mr); m_copy(vecmem::get_data(cells), cells_buffer)->ignore(); // Run the clusterization (asynchronously). @@ -190,8 +190,12 @@ full_chain_algorithm::output_type full_chain_algorithm::operator()( m_device_detector_view, m_field, {track_candidates, measurements}); // Copy a limited amount of result data back to the host. + const auto host_tracks = + m_copy.to(track_states.tracks, m_cached_pinned_host_mr, nullptr, + vecmem::copy::type::device_to_host); output_type result{m_host_mr}; - m_copy(track_states.tracks, result)->wait(); + vecmem::copy host_copy; + host_copy(host_tracks, result)->wait(); return result; } @@ -213,7 +217,7 @@ bound_track_parameters_collection_types::host full_chain_algorithm::seeding( // Create device copy of input collections edm::silicon_cell_collection::buffer cells_buffer( - static_cast(cells.size()), *m_cached_device_mr); + static_cast(cells.size()), m_cached_device_mr); m_copy(vecmem::get_data(cells), cells_buffer)->ignore(); // Run the clusterization (asynchronously). @@ -232,8 +236,11 @@ bound_track_parameters_collection_types::host full_chain_algorithm::seeding( m_seeding(spacepoints), m_field_vec); // Copy a limited amount of result data back to the host. + const auto host_seeds = m_copy.to(track_params, m_cached_pinned_host_mr, + vecmem::copy::type::device_to_host); bound_track_parameters_collection_types::host result{&m_host_mr}; - m_copy(track_params, result)->wait(); + vecmem::copy host_copy; + host_copy(host_seeds, result)->wait(); return result; } diff --git a/examples/run/cuda/full_chain_algorithm.hpp b/examples/run/cuda/full_chain_algorithm.hpp index fe9c3155de..1dd436cae4 100644 --- a/examples/run/cuda/full_chain_algorithm.hpp +++ b/examples/run/cuda/full_chain_algorithm.hpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -121,12 +122,16 @@ class full_chain_algorithm private: /// Host memory resource vecmem::memory_resource& m_host_mr; + /// Pinned host memory resource + vecmem::cuda::host_memory_resource m_pinned_host_mr; + /// Cached pinned host memory resource + mutable vecmem::binary_page_memory_resource m_cached_pinned_host_mr; /// CUDA stream to use stream m_stream; /// Device memory resource vecmem::cuda::device_memory_resource m_device_mr; /// Device caching memory resource - std::unique_ptr m_cached_device_mr; + mutable vecmem::binary_page_memory_resource m_cached_device_mr; /// (Asynchronous) Memory copy object mutable vecmem::cuda::async_copy m_copy; diff --git a/examples/run/cuda/throughput_mt.cpp b/examples/run/cuda/throughput_mt.cpp index 4889b5c7ca..458faff04e 100644 --- a/examples/run/cuda/throughput_mt.cpp +++ b/examples/run/cuda/throughput_mt.cpp @@ -10,15 +10,9 @@ #include "full_chain_algorithm.hpp" -// VecMem include(s). -#include - int main(int argc, char* argv[]) { // Execute the throughput test. - static const bool use_host_caching = true; - return traccc::throughput_mt( - "Multi-threaded CUDA GPU throughput tests", argc, argv, - use_host_caching); + return traccc::throughput_mt( + "Multi-threaded CUDA GPU throughput tests", argc, argv); } diff --git a/examples/run/cuda/throughput_st.cpp b/examples/run/cuda/throughput_st.cpp index cedc6b18e5..d0458da681 100644 --- a/examples/run/cuda/throughput_st.cpp +++ b/examples/run/cuda/throughput_st.cpp @@ -10,15 +10,9 @@ #include "full_chain_algorithm.hpp" -// VecMem include(s). -#include - int main(int argc, char* argv[]) { // Execute the throughput test. - static const bool use_host_caching = true; - return traccc::throughput_st( - "Single-threaded CUDA GPU throughput tests", argc, argv, - use_host_caching); + return traccc::throughput_st( + "Single-threaded CUDA GPU throughput tests", argc, argv); } diff --git a/examples/run/sycl/full_chain_algorithm.hpp b/examples/run/sycl/full_chain_algorithm.hpp index a3d0f288ba..4a906d4616 100644 --- a/examples/run/sycl/full_chain_algorithm.hpp +++ b/examples/run/sycl/full_chain_algorithm.hpp @@ -26,6 +26,7 @@ #include #include #include +#include #include // System include(s). @@ -119,6 +120,10 @@ class full_chain_algorithm std::unique_ptr m_data; /// Host memory resource std::reference_wrapper m_host_mr; + /// Pinned host memory resource + vecmem::sycl::host_memory_resource m_pinned_host_mr; + /// Cached pinned host memory resource + mutable vecmem::binary_page_memory_resource m_cached_pinned_host_mr; /// Device memory resource vecmem::sycl::device_memory_resource m_device_mr; /// Device caching memory resource diff --git a/examples/run/sycl/full_chain_algorithm.sycl b/examples/run/sycl/full_chain_algorithm.sycl index 5ed191869e..772babcaf1 100644 --- a/examples/run/sycl/full_chain_algorithm.sycl +++ b/examples/run/sycl/full_chain_algorithm.sycl @@ -66,6 +66,8 @@ full_chain_algorithm::full_chain_algorithm( m_data(std::make_unique( ::handle_async_error)), m_host_mr(host_mr), + m_pinned_host_mr(&(m_data->m_queue)), + m_cached_pinned_host_mr(m_pinned_host_mr), m_device_mr{&(m_data->m_queue)}, m_cached_device_mr{m_device_mr}, m_copy{&(m_data->m_queue)}, @@ -78,31 +80,40 @@ full_chain_algorithm::full_chain_algorithm( m_device_mr}, m_detector(detector), m_device_detector{}, - m_clusterization{memory_resource{m_cached_device_mr, &(m_host_mr.get())}, - m_copy, m_data->m_queue_wrapper, clustering_config, + m_clusterization{{m_cached_device_mr, &m_cached_pinned_host_mr}, + m_copy, + m_data->m_queue_wrapper, + clustering_config, logger->clone("ClusteringAlg")}, - m_measurement_sorting( - memory_resource{m_cached_device_mr, &(m_host_mr.get())}, m_copy, - m_data->m_queue_wrapper, logger->clone("MeasSortingAlg")), - m_spacepoint_formation{ - memory_resource{m_cached_device_mr, &(m_host_mr.get())}, m_copy, - m_data->m_queue_wrapper, logger->clone("SpFormationAlg")}, + m_measurement_sorting({m_cached_device_mr, &m_cached_pinned_host_mr}, + m_copy, m_data->m_queue_wrapper, + logger->clone("MeasSortingAlg")), + m_spacepoint_formation{{m_cached_device_mr, &m_cached_pinned_host_mr}, + m_copy, + m_data->m_queue_wrapper, + logger->clone("SpFormationAlg")}, m_seeding{finder_config, grid_config, filter_config, - memory_resource{m_cached_device_mr, &(m_host_mr.get())}, + {m_cached_device_mr, &m_cached_pinned_host_mr}, m_copy, m_data->m_queue_wrapper, logger->clone("SeedingAlg")}, m_track_parameter_estimation{ - memory_resource{m_cached_device_mr, &(m_host_mr.get())}, m_copy, - m_data->m_queue_wrapper, logger->clone("TrackParEstAlg")}, + {m_cached_device_mr, &m_cached_pinned_host_mr}, + m_copy, + m_data->m_queue_wrapper, + logger->clone("TrackParEstAlg")}, m_finding{finding_config, - memory_resource{m_cached_device_mr, &(m_host_mr.get())}, m_copy, - m_data->m_queue_wrapper, logger->clone("TrackFindingAlg")}, + {m_cached_device_mr, &m_cached_pinned_host_mr}, + m_copy, + m_data->m_queue_wrapper, + logger->clone("TrackFindingAlg")}, m_fitting{fitting_config, - memory_resource{m_cached_device_mr, &(m_host_mr.get())}, m_copy, - m_data->m_queue_wrapper, logger->clone("TrackFittingAlg")}, + {m_cached_device_mr, &m_cached_pinned_host_mr}, + m_copy, + m_data->m_queue_wrapper, + logger->clone("TrackFittingAlg")}, m_clustering_config(clustering_config), m_finder_config(finder_config), m_grid_config(grid_config), @@ -131,6 +142,8 @@ full_chain_algorithm::full_chain_algorithm(const full_chain_algorithm& parent) m_data(std::make_unique( ::handle_async_error)), m_host_mr(parent.m_host_mr), + m_pinned_host_mr(&(m_data->m_queue)), + m_cached_pinned_host_mr(m_pinned_host_mr), m_device_mr{&(m_data->m_queue)}, m_cached_device_mr{m_device_mr}, m_copy{&(m_data->m_queue)}, @@ -143,32 +156,40 @@ full_chain_algorithm::full_chain_algorithm(const full_chain_algorithm& parent) m_device_mr}, m_detector(parent.m_detector), m_device_detector{}, - m_clusterization{memory_resource{m_cached_device_mr, &(m_host_mr.get())}, - m_copy, m_data->m_queue_wrapper, + m_clusterization{{m_cached_device_mr, &m_cached_pinned_host_mr}, + m_copy, + m_data->m_queue_wrapper, parent.m_clustering_config, parent.logger().clone("ClusteringAlg")}, - m_measurement_sorting( - memory_resource{m_cached_device_mr, &(m_host_mr.get())}, m_copy, - m_data->m_queue_wrapper, parent.logger().clone("MeasSortingAlg")), - m_spacepoint_formation{ - memory_resource{m_cached_device_mr, &(m_host_mr.get())}, m_copy, - m_data->m_queue_wrapper, parent.logger().clone("SpFormationAlg")}, + m_measurement_sorting({m_cached_device_mr, &m_cached_pinned_host_mr}, + m_copy, m_data->m_queue_wrapper, + parent.logger().clone("MeasSortingAlg")), + m_spacepoint_formation{{m_cached_device_mr, &m_cached_pinned_host_mr}, + m_copy, + m_data->m_queue_wrapper, + parent.logger().clone("SpFormationAlg")}, m_seeding{parent.m_finder_config, parent.m_grid_config, parent.m_filter_config, - memory_resource{m_cached_device_mr, &(m_host_mr.get())}, + {m_cached_device_mr, &m_cached_pinned_host_mr}, m_copy, m_data->m_queue_wrapper, parent.logger().clone("SeedingAlg")}, m_track_parameter_estimation{ - memory_resource{m_cached_device_mr, &(m_host_mr.get())}, m_copy, - m_data->m_queue_wrapper, parent.logger().clone("TrackParEstAlg")}, + {m_cached_device_mr, &m_cached_pinned_host_mr}, + m_copy, + m_data->m_queue_wrapper, + parent.logger().clone("TrackParEstAlg")}, m_finding{parent.m_finding_config, - memory_resource{m_cached_device_mr, &(m_host_mr.get())}, m_copy, - m_data->m_queue_wrapper, parent.logger().clone("FindingAlg")}, + {m_cached_device_mr, &m_cached_pinned_host_mr}, + m_copy, + m_data->m_queue_wrapper, + parent.logger().clone("FindingAlg")}, m_fitting{parent.m_fitting_config, - memory_resource{m_cached_device_mr, &(m_host_mr.get())}, m_copy, - m_data->m_queue_wrapper, parent.logger().clone("FittingAlg")}, + {m_cached_device_mr, &m_cached_pinned_host_mr}, + m_copy, + m_data->m_queue_wrapper, + parent.logger().clone("FittingAlg")}, m_clustering_config(parent.m_clustering_config), m_finder_config(parent.m_finder_config), m_grid_config(parent.m_grid_config), @@ -222,8 +243,12 @@ full_chain_algorithm::output_type full_chain_algorithm::operator()( m_device_detector_view, m_field, {track_candidates, measurements}); // Copy a limited amount of result data back to the host. + const auto host_tracks = + m_copy.to(track_states.tracks, m_cached_pinned_host_mr, nullptr, + vecmem::copy::type::device_to_host); output_type result{m_host_mr.get()}; - m_copy(track_states.tracks, result)->wait(); + vecmem::copy host_copy; + host_copy(host_tracks, result)->wait(); return result; } // If not, copy the measurements back to the host, and return @@ -266,9 +291,12 @@ bound_track_parameters_collection_types::host full_chain_algorithm::seeding( {0.f, 0.f, m_finder_config.bFieldInZ}); // Copy a limited amount of result data back to the host. + const auto host_seeds = m_copy.to(track_params, m_cached_pinned_host_mr, + vecmem::copy::type::device_to_host); bound_track_parameters_collection_types::host result{ &(m_host_mr.get())}; - m_copy(track_params, result)->wait(); + vecmem::copy host_copy; + host_copy(host_seeds, result)->wait(); return result; } // If not, copy the measurements back to the host, and return diff --git a/examples/run/sycl/throughput_mt.cpp b/examples/run/sycl/throughput_mt.cpp index 09eb2f1101..21fde461a6 100644 --- a/examples/run/sycl/throughput_mt.cpp +++ b/examples/run/sycl/throughput_mt.cpp @@ -10,15 +10,10 @@ #include "full_chain_algorithm.hpp" -// VecMem include(s). -#include - int main(int argc, char* argv[]) { // Execute the throughput test. - static const bool use_host_caching = true; - return traccc::throughput_mt( - "Multi-threaded SYCL GPU throughput tests", argc, argv, - use_host_caching); + + return traccc::throughput_mt( + "Multi-threaded SYCL GPU throughput tests", argc, argv); } diff --git a/examples/run/sycl/throughput_st.cpp b/examples/run/sycl/throughput_st.cpp index d54f5adb10..9290fd09d9 100644 --- a/examples/run/sycl/throughput_st.cpp +++ b/examples/run/sycl/throughput_st.cpp @@ -13,8 +13,6 @@ int main(int argc, char* argv[]) { // Execute the throughput test. - static const bool use_host_caching = true; return traccc::throughput_st( - "Single-threaded SYCL GPU throughput tests", argc, argv, - use_host_caching); + "Single-threaded SYCL GPU throughput tests", argc, argv); }