diff --git a/core/include/traccc/seeding/detail/seeding_config.hpp b/core/include/traccc/seeding/detail/seeding_config.hpp index b7d2d4e97c..6006d36956 100644 --- a/core/include/traccc/seeding/detail/seeding_config.hpp +++ b/core/include/traccc/seeding/detail/seeding_config.hpp @@ -194,9 +194,9 @@ struct seedfilter_config { float deltaInvHelixDiameter = 0.00003f / unit::mm; // the impact parameters (d0) is multiplied by this factor and subtracted // from weight - float impactWeightFactor = 1.f; + float impactWeightFactor = 0.5f; // seed weight increased by this value if a compatible seed has been found. - float compatSeedWeight = 200.f; + float compatSeedWeight = 2.5f; // minimum distance between compatible seeds to be considered for weight // boost float deltaRMin = 5.f * unit::mm; diff --git a/device/common/include/traccc/seeding/device/impl/select_seeds.ipp b/device/common/include/traccc/seeding/device/impl/select_seeds.ipp index 786331cfa6..fe3b4699f5 100644 --- a/device/common/include/traccc/seeding/device/impl/select_seeds.ipp +++ b/device/common/include/traccc/seeding/device/impl/select_seeds.ipp @@ -67,7 +67,9 @@ inline void select_seeds( const triplet_counter_spM_collection_types::const_view& spM_tc_view, const triplet_counter_collection_types::const_view& tc_view, const device_triplet_collection_types::const_view& triplet_view, - device_triplet* data, edm::seed_collection::view seed_view) { + device_triplet* data, + const vecmem::data::vector_view votes_per_triplet_view, + edm::seed_collection::view seed_view) { // Check if anything needs to be done. const triplet_counter_spM_collection_types::const_device triplet_counts_spM( @@ -85,6 +87,8 @@ inline void select_seeds( sp_view); const device_triplet_collection_types::const_device triplets(triplet_view); + const vecmem::device_vector votes_per_triplet( + votes_per_triplet_view); edm::seed_collection::device seeds_device(seed_view); // Current work item = middle spacepoint @@ -99,9 +103,14 @@ inline void select_seeds( const unsigned int end_triplets_spM = spM_counter.posTriplets + spM_counter.m_nTriplets; + // iterate over the triplets in the bin for (unsigned int i = spM_counter.posTriplets; i < end_triplets_spM; ++i) { - device_triplet aTriplet = triplets[i]; + const device_triplet& aTriplet = triplets[i]; + + if (votes_per_triplet.at(i) < 3) { + continue; + } // spacepoints bottom and top for this triplet const unsigned int spB_idx = aTriplet.spB; @@ -111,16 +120,6 @@ inline void select_seeds( const edm::spacepoint_collection::const_device::const_proxy_type spT = spacepoints.at(spT_idx); - // update weight of triplet - seed_selecting_helper::seed_weight(filter_config, spM, spB, spT, - aTriplet.weight); - - // check if it is a good triplet - if (!seed_selecting_helper::single_seed_cut(filter_config, spM, spB, - spT, aTriplet.weight)) { - continue; - } - // if the number of good triplets is larger than the threshold, // the triplet with the lowest weight is removed if (n_triplets_per_spM >= finder_config.maxSeedsPerSpM) { diff --git a/device/common/include/traccc/seeding/device/select_seeds.hpp b/device/common/include/traccc/seeding/device/select_seeds.hpp index 2e8a7ce230..916a4d4643 100644 --- a/device/common/include/traccc/seeding/device/select_seeds.hpp +++ b/device/common/include/traccc/seeding/device/select_seeds.hpp @@ -42,7 +42,9 @@ inline void select_seeds( const triplet_counter_spM_collection_types::const_view& spM_tc_view, const triplet_counter_collection_types::const_view& tc_view, const device_triplet_collection_types::const_view& triplet_view, - triplet* data, edm::seed_collection::view seed_view); + triplet* data, + const vecmem::data::vector_view votes_per_triplet_view, + edm::seed_collection::view seed_view); } // namespace traccc::device diff --git a/device/cuda/src/seeding/seed_finding.cu b/device/cuda/src/seeding/seed_finding.cu index ac0decf2df..2f26439576 100644 --- a/device/cuda/src/seeding/seed_finding.cu +++ b/device/cuda/src/seeding/seed_finding.cu @@ -20,6 +20,7 @@ #include "traccc/edm/device/doublet_counter.hpp" #include "traccc/edm/device/seeding_global_counter.hpp" #include "traccc/edm/device/triplet_counter.hpp" +#include "traccc/seeding/detail/singlet.hpp" #include "traccc/seeding/device/count_doublets.hpp" #include "traccc/seeding/device/count_triplets.hpp" #include "traccc/seeding/device/find_doublets.hpp" @@ -37,6 +38,195 @@ namespace traccc::cuda { namespace kernels { +namespace detail { +/** + * @brief Encode the state of our parameter insertion mutex. + */ +TRACCC_HOST_DEVICE inline uint64_t encode_insertion_mutex(const bool locked, + const uint32_t size, + const float max) { + + // Assert that the MSB of the size is zero + assert(size <= 0x7FFFFFFF); + const uint32_t hi = size | (locked ? 0x80000000 : 0x0); + const uint32_t lo = std::bit_cast(max); + return (static_cast(hi) << 32) | lo; +} + +/** + * @brief Decode the state of our parameter insertion mutex. + */ +TRACCC_HOST_DEVICE inline std::tuple +decode_insertion_mutex(const uint64_t val) { + const uint32_t hi = static_cast(val >> 32); + const uint32_t lo = val & 0xFFFFFFFF; + return {static_cast(hi & 0x80000000), (hi & 0x7FFFFFFF), + std::bit_cast(lo)}; +} +} // namespace detail + +__global__ inline void gather_best_triplets_per_spacepoint( + const device::device_triplet_collection_types::const_view triplet_view, + const traccc::details::spacepoint_grid_types::const_view grid_view, + vecmem::data::vector_view insertion_mutex_view, + vecmem::data::vector_view triplet_index_view, + vecmem::data::vector_view triplet_weight_view, + const unsigned int max_num_triplets_per_spacepoint) { + + const device::device_triplet_collection_types::const_device triplets( + triplet_view); + const traccc::details::spacepoint_grid_types::const_device grid(grid_view); + + vecmem::device_vector insertion_mutex( + insertion_mutex_view); + vecmem::device_vector triplet_index(triplet_index_view); + vecmem::device_vector triplet_weight(triplet_weight_view); + + unsigned int triplet_idx = blockIdx.x * blockDim.x + threadIdx.x; + + scalar weight; + bool need_to_write = true; + unsigned int current_state = 0; + unsigned int spacepoint_idx = 0; + + auto get_idx_for_state = [&triplets, &grid, + triplet_idx](unsigned int state) { + const device::device_triplet& this_triplet = triplets.at(triplet_idx); + if (state == 0) { + return this_triplet.spB; + } else if (state == 1) { + return this_triplet.spM; + } else { + return this_triplet.spT; + } + }; + + if (triplet_idx < triplets.size()) { + weight = triplets.at(triplet_idx).weight; + spacepoint_idx = get_idx_for_state(0); + } else { + need_to_write = false; + current_state = 3; + } + + while (__syncthreads_or(current_state < 3 || need_to_write)) { + if (current_state < 3 || need_to_write) { + if (need_to_write) { + vecmem::device_atomic_ref mutex( + insertion_mutex.at(spacepoint_idx)); + + unsigned long long int assumed = mutex.load(); + unsigned long long int desired_set; + + auto [locked, size, worst] = + detail::decode_insertion_mutex(assumed); + + if (need_to_write && size >= max_num_triplets_per_spacepoint && + weight <= worst) { + need_to_write = false; + } + + bool holds_lock = false; + + if (need_to_write && !locked) { + desired_set = + detail::encode_insertion_mutex(true, size, worst); + if (mutex.compare_exchange_strong(assumed, desired_set)) { + holds_lock = true; + } + } + + if (holds_lock) { + unsigned int new_size; + unsigned int offset = + spacepoint_idx * max_num_triplets_per_spacepoint; + unsigned int out_idx; + + if (size == max_num_triplets_per_spacepoint) { + new_size = size; + scalar worst_weight = + std::numeric_limits::max(); + for (unsigned int i = 0; i < size; ++i) { + if (triplet_weight.at(offset + i) < worst_weight) { + worst_weight = triplet_weight.at(offset + i); + out_idx = i; + } + } + } else { + new_size = size + 1; + out_idx = size; + } + + triplet_index.at(offset + out_idx) = triplet_idx; + triplet_weight.at(offset + out_idx) = weight; + + scalar new_worst = std::numeric_limits::max(); + + for (unsigned int i = 0; i < new_size; ++i) { + new_worst = + std::min(new_worst, triplet_weight.at(offset + i)); + } + + [[maybe_unused]] bool cas_result = + mutex.compare_exchange_strong( + desired_set, detail::encode_insertion_mutex( + false, new_size, new_worst)); + assert(cas_result); + + need_to_write = false; + } + } + + if (!need_to_write && current_state < 3) { + if (current_state < 2) { + spacepoint_idx = get_idx_for_state(++current_state); + need_to_write = true; + } else { + ++current_state; + } + } + } + } +} + +__global__ inline void gather_spacepoint_votes( + const vecmem::data::vector_view + insertion_mutex_view, + const vecmem::data::vector_view triplet_index_view, + vecmem::data::vector_view votes_per_triplet_view, + const unsigned int max_num_triplets_per_spacepoint) { + + unsigned int thread_idx = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int spacepoint_idx = thread_idx / max_num_triplets_per_spacepoint; + unsigned int triplet_idx = thread_idx % max_num_triplets_per_spacepoint; + + const vecmem::device_vector insertion_mutex( + insertion_mutex_view); + const vecmem::device_vector triplet_index( + triplet_index_view); + vecmem::device_vector votes_per_triplet( + votes_per_triplet_view); + + if (spacepoint_idx >= insertion_mutex.size()) { + return; + } + + auto [locked, size, worst] = + detail::decode_insertion_mutex(insertion_mutex.at(spacepoint_idx)); + + if (size == 0) { + return; + } + + unsigned int num_votes = + (size + max_num_triplets_per_spacepoint - 1) / size; + + if (triplet_idx < size) { + vecmem::device_atomic_ref( + votes_per_triplet.at(triplet_index.at(thread_idx))) + .fetch_add(num_votes); + } +} /// CUDA kernel for running @c traccc::device::count_doublets __global__ void count_doublets( @@ -134,6 +324,7 @@ __global__ void select_seeds( device::triplet_counter_spM_collection_types::const_view spM_tc, device::triplet_counter_collection_types::const_view midBot_tc, device::device_triplet_collection_types::view triplet_view, + const vecmem::data::vector_view votes_per_triplet_view, edm::seed_collection::view seed_view) { // Array for temporary storage of triplets for comparing within seed @@ -145,7 +336,7 @@ __global__ void select_seeds( device::select_seeds(details::global_index1(), finder_config, filter_config, spacepoints, sp_view, spM_tc, midBot_tc, triplet_view, - dataPos, seed_view); + dataPos, votes_per_triplet_view, seed_view); } } // namespace kernels @@ -184,6 +375,10 @@ edm::seed_collection::buffer seed_finding::operator()( return {0, m_mr.main}; } + // Number of spacepoints including those not in the grid. + const unsigned int num_spacepoints_total = + m_copy.get_size(spacepoints_view); + // Set up the doublet counter buffer. device::doublet_counter_collection_types::buffer doublet_counter_buffer = { num_spacepoints, m_mr.main, vecmem::data::buffer_type::resizable}; @@ -341,6 +536,64 @@ edm::seed_collection::buffer seed_finding::operator()( triplet_buffer); TRACCC_CUDA_ERROR_CHECK(cudaGetLastError()); + vecmem::data::vector_buffer votes_per_triplet_buffer( + globalCounter_host->m_nTriplets, m_mr.main); + m_copy.setup(votes_per_triplet_buffer)->wait(); + m_copy.memset(votes_per_triplet_buffer, 0)->wait(); + + { + unsigned int num_votes_per_sp = 3; + + vecmem::data::vector_buffer + best_triplets_per_spacepoint_index_buffer( + num_votes_per_sp * num_spacepoints_total, m_mr.main); + m_copy.setup(best_triplets_per_spacepoint_index_buffer)->wait(); + + vecmem::data::vector_buffer + best_triplets_per_spacepoint_insertion_mutex_buffer( + num_spacepoints_total, m_mr.main); + m_copy.setup(best_triplets_per_spacepoint_insertion_mutex_buffer) + ->wait(); + m_copy.memset(best_triplets_per_spacepoint_insertion_mutex_buffer, 0) + ->wait(); + + { + vecmem::data::vector_buffer + best_triplets_per_spacepoint_weight_buffer( + num_votes_per_sp * num_spacepoints_total, m_mr.main); + m_copy.setup(best_triplets_per_spacepoint_weight_buffer)->wait(); + + unsigned int num_threads = 64; + unsigned int num_blocks = + (globalCounter_host->m_nTriplets + num_threads - 1) / + num_threads; + + kernels::gather_best_triplets_per_spacepoint<<< + num_blocks, num_threads, 0, stream>>>( + triplet_buffer, g2_view, + best_triplets_per_spacepoint_insertion_mutex_buffer, + best_triplets_per_spacepoint_index_buffer, + best_triplets_per_spacepoint_weight_buffer, num_votes_per_sp); + + TRACCC_CUDA_ERROR_CHECK(cudaGetLastError()); + } + + { + unsigned int num_threads = 512; + unsigned int num_blocks = + (num_votes_per_sp * num_spacepoints_total + num_threads - 1) / + num_threads; + + kernels:: + gather_spacepoint_votes<<>>( + best_triplets_per_spacepoint_insertion_mutex_buffer, + best_triplets_per_spacepoint_index_buffer, + votes_per_triplet_buffer, num_votes_per_sp); + + TRACCC_CUDA_ERROR_CHECK(cudaGetLastError()); + } + } + // Create result object: collection of seeds edm::seed_collection::buffer seed_buffer( globalCounter_host->m_nTriplets, m_mr.main, @@ -362,7 +615,7 @@ edm::seed_collection::buffer seed_finding::operator()( stream>>>( m_seedfinder_config, m_seedfilter_config, spacepoints_view, g2_view, triplet_counter_spM_buffer, triplet_counter_midBot_buffer, - triplet_buffer, seed_buffer); + triplet_buffer, votes_per_triplet_buffer, seed_buffer); TRACCC_CUDA_ERROR_CHECK(cudaGetLastError()); return seed_buffer;