diff --git a/core/include/traccc/gbts_seeding/gbts_seeding_config.hpp b/core/include/traccc/gbts_seeding/gbts_seeding_config.hpp index 04d67e084e..8a55a2252a 100644 --- a/core/include/traccc/gbts_seeding/gbts_seeding_config.hpp +++ b/core/include/traccc/gbts_seeding/gbts_seeding_config.hpp @@ -48,7 +48,7 @@ struct gbts_consts { static constexpr unsigned short max_cca_iter = 15; // shared memory allocation sizes static constexpr unsigned short node_buffer_length = 128; - static constexpr unsigned short shared_state_buffer_size = 608; + static constexpr unsigned short live_path_buffer = 1024; // access into output graph static constexpr char node1 = 0; @@ -61,7 +61,7 @@ struct gbts_consts { namespace traccc { -struct gbts_algo_params { +struct gbts_graph_building_params { // edge making cuts float min_delta_phi = 0.015f; @@ -73,13 +73,14 @@ struct gbts_algo_params { float min_z0 = -160.0f; float max_z0 = 160.0f; - float maxOuterRadius = 550.0f; - float cut_zMinU = min_z0 - maxOuterRadius * 45; - float cut_zMaxU = max_z0 + maxOuterRadius * 45; // how to get ROI dzdr + float maxOuterRadius = 350.0f; + // how to get ROI dzdr + float cut_zMinU = min_z0 - maxOuterRadius * 45.0f; + float cut_zMaxU = max_z0 + maxOuterRadius * 45.0f; float max_Kappa = 3.75e-4f; - float low_Kappa_d0 = 0.0f; // used to be 0.2f - float high_Kappa_d0 = 0.0f; // used to be 1.0f + float low_Kappa_d0 = 0.00f; + float high_Kappa_d0 = 0.0f; // tau prediction cut float tMin_slope = 6.7f; @@ -96,6 +97,34 @@ struct gbts_algo_params { float cut_tau_ratio_max = 0.01f; }; +struct gbts_seed_extraction_params { + // for 900 MeV track at eta=0 + float sigmaMS = 0.016f; + // 2.5% per layer + float radLen = 0.025f; + + float sigma_x = 0.08f; + float sigma_y = 0.25f; + + float weight_x = 0.5f; + float weight_y = 0.5f; + + float maxDChi2_x = 5.0f; + float maxDChi2_y = 6.0f; + // controls if seeds of shorter lengths + // can win bidding against longer seeds + float add_hit = 14.0f; + // m_J is stored in 30 + 1 bits + // max qual = add_hit*max_length*qual_scale + float qual_scale = + 0.01f * static_cast(INT_MAX) / + (add_hit * + static_cast(traccc::device::gbts_consts::max_cca_iter)); + + float inv_max_curvature = 900.0f; + float max_z0 = 160.0f; +}; + struct gbts_seedfinder_config { bool setLinkingScheme( const std::vector>>& binTables, @@ -112,15 +141,18 @@ struct gbts_seedfinder_config { std::vector> surfaceToLayerMap{}; // tuned for 900 MeV pT cut and scaled by input minPt - gbts_algo_params algo_params{}; + gbts_graph_building_params graph_building_params{}; + + gbts_seed_extraction_params seed_extraction_params{}; // node making bin counts unsigned int n_eta_bins = 0; // calculated from input layerInfo unsigned int n_phi_bins = 128; // graph making maxiums - unsigned char max_num_neighbours = 10; - // graph extraction cuts - int minLevel = 3; // equivlent to a cut of #seed edges or #spacepoints-1 + unsigned char max_num_neighbours = 6; + // graph extraction minimum seed edge length + // equivlent to a cut of #spacepoints-1 + int minLevel = 3; }; } // namespace traccc diff --git a/core/src/gbts_seeding/gbts_seeding_config.cpp b/core/src/gbts_seeding/gbts_seeding_config.cpp index ff0f8e848e..6f32cb4211 100644 --- a/core/src/gbts_seeding/gbts_seeding_config.cpp +++ b/core/src/gbts_seeding/gbts_seeding_config.cpp @@ -103,11 +103,11 @@ bool gbts_seedfinder_config::setLinkingScheme( volumeToLayerMap[vLpair.second] = vLpair.first; // scale cuts float ptScale = 900.0f / minPt; - algo_params.min_delta_phi *= ptScale; - algo_params.dphi_coeff *= ptScale; - algo_params.min_delta_phi_low_dr *= ptScale; - algo_params.dphi_coeff_low_dr *= ptScale; - algo_params.max_Kappa *= ptScale; + graph_building_params.min_delta_phi *= ptScale; + graph_building_params.dphi_coeff *= ptScale; + graph_building_params.min_delta_phi_low_dr *= ptScale; + graph_building_params.dphi_coeff_low_dr *= ptScale; + graph_building_params.max_Kappa *= ptScale; // contianers sizes nLayers = static_cast(layerInfo.type.size()); diff --git a/device/cuda/src/gbts_seeding/gbts_seeding_algorithm.cu b/device/cuda/src/gbts_seeding/gbts_seeding_algorithm.cu index 5bf8b64b9c..271fc173db 100644 --- a/device/cuda/src/gbts_seeding/gbts_seeding_algorithm.cu +++ b/device/cuda/src/gbts_seeding/gbts_seeding_algorithm.cu @@ -32,8 +32,8 @@ struct gbts_ctx { // nEdges, nConnections, nConnectedEdges, .., nSeeds unsigned int* d_counters{}; - // device cut values - gbts_algo_params* d_algo_params; + // device side graph building cuts + gbts_graph_building_params* d_graph_building_params; // node making and binning int* d_layerCounts{}; @@ -95,23 +95,19 @@ struct gbts_ctx { int* d_active_edges{}; // d_levels[edge_idx] = the maxium length of seeds starting with this edge char* d_levels{}; - int* d_level_views{}; // edge indices by level - int* d_level_boundaries{}; // number of edges for each level in the above + // #paths, is terminus + short2* d_outgoing_paths{}; // seed-extraction walkthrough - // edge_idx and prev mini_state forms a seeds uniuqe path through the graph - int2* d_mini_states{}; + // edge_idx and prev path_store idx forms a uniuqe path through the graph + int2* d_path_store{}; int2* d_seed_proposals{}; // int quality and final mini_state_idx - kernels::edgeState* d_state_store{}; // global overflow of live edgeStates // first 32 bits are seed quality second 32 bits are seed_proposals index unsigned long long int* d_edge_bids{}; // 0 as default/is real seed, 1 as maybe seed, //-1 as maybe fake seed, -2 as fake char* d_seed_ambiguity{}; - - // output - kernels::Tracklet* d_seeds{}; }; gbts_seeding_algorithm::gbts_seeding_algorithm( @@ -132,20 +128,23 @@ gbts_seeding_algorithm::output_type gbts_seeding_algorithm::operator()( cudaStream_t stream = details::get_stream(m_stream); - cudaMalloc(&ctx.d_algo_params, sizeof(m_config.algo_params)); - cudaMemcpyAsync(ctx.d_algo_params, &m_config.algo_params, - sizeof(m_config.algo_params), cudaMemcpyHostToDevice); + cudaMalloc(&ctx.d_graph_building_params, + sizeof(m_config.graph_building_params)); + cudaMemcpyAsync( + ctx.d_graph_building_params, &m_config.graph_building_params, + sizeof(m_config.graph_building_params), cudaMemcpyHostToDevice); // 0. bin spacepoints by the maping supplied to config.m_surfaceToLayerMap ctx.nSp = m_copy.get().get_size(spacepoints); - if (ctx.nSp == 0) + if (ctx.nSp == 0) { return {0, m_mr.main}; - + } unsigned int nThreads = 128; unsigned int nBlocks = 1 + (ctx.nSp - 1) / nThreads; cudaMalloc(&ctx.d_layerCounts, (m_config.nLayers + 1) * sizeof(int)); - cudaMemset(ctx.d_layerCounts, 0, (m_config.nLayers + 1) * sizeof(int)); + cudaMemsetAsync(ctx.d_layerCounts, 0, (m_config.nLayers + 1) * sizeof(int), + stream); cudaMalloc(&ctx.d_spacepointsLayer, ctx.nSp * sizeof(short)); cudaMalloc(&ctx.d_reducedSP, ctx.nSp * sizeof(float4)); @@ -176,7 +175,7 @@ gbts_seeding_algorithm::output_type gbts_seeding_algorithm::operator()( spacepoints, measurements, ctx.d_volumeToLayerMap, ctx.d_surfaceToLayerMap, ctx.d_layerType, ctx.d_reducedSP, ctx.d_layerCounts, ctx.d_spacepointsLayer, - m_config.algo_params.type1_max_width, ctx.nSp, + m_config.graph_building_params.type1_max_width, ctx.nSp, m_config.volumeToLayerMap.size(), m_config.surfaceToLayerMap.size()); cudaStreamSynchronize(stream); @@ -271,7 +270,7 @@ gbts_seeding_algorithm::output_type gbts_seeding_algorithm::operator()( } size_t hist_size = sizeof(int) * m_config.n_eta_bins * m_config.n_phi_bins; cudaMalloc(&ctx.d_eta_phi_histo, hist_size); - cudaMemset(ctx.d_eta_phi_histo, 0, hist_size); + cudaMemsetAsync(ctx.d_eta_phi_histo, 0, hist_size, stream); cudaMalloc(&ctx.d_phi_cusums, hist_size); nBlocks = 1 + (ctx.nNodes - 1) / nNodesPerBlock; @@ -367,8 +366,8 @@ gbts_seeding_algorithm::output_type gbts_seeding_algorithm::operator()( kernels::node_sorting_kernel<<>>( ctx.d_sp_params, ctx.d_node_eta_index, ctx.d_node_phi_index, ctx.d_phi_cusums, ctx.d_node_params, ctx.d_node_index, - ctx.d_original_sp_idx, ctx.d_algo_params, nNodesPerBlock, ctx.nNodes, - m_config.n_phi_bins); + ctx.d_original_sp_idx, ctx.d_graph_building_params, nNodesPerBlock, + ctx.nNodes, m_config.n_phi_bins); cudaStreamSynchronize(stream); cudaFree(ctx.d_sp_params); @@ -435,7 +434,9 @@ gbts_seeding_algorithm::output_type gbts_seeding_algorithm::operator()( // large bins will be split into smaller sub-views int nNodesInBin1 = bin1_end - bin1_begin; - + if (bin1_begin > bin1_end) { + nNodesInBin1 = bin1_begin - bin1_end; + } int_nBinPairs += 1 + (nNodesInBin1 - 1) / traccc::device::gbts_consts::node_buffer_length; @@ -463,12 +464,13 @@ gbts_seeding_algorithm::output_type gbts_seeding_algorithm::operator()( // max radius of bin2 - min radius of bin1 float maxDeltaR = std::fabs(rb2 - rb1); - float deltaPhi = m_config.algo_params.min_delta_phi + - m_config.algo_params.dphi_coeff * maxDeltaR; + float deltaPhi = m_config.graph_building_params.min_delta_phi + + m_config.graph_building_params.dphi_coeff * maxDeltaR; if (maxDeltaR < 60) { - deltaPhi = m_config.algo_params.min_delta_phi_low_dr + - m_config.algo_params.dphi_coeff_low_dr * maxDeltaR; + deltaPhi = + m_config.graph_building_params.min_delta_phi_low_dr + + m_config.graph_building_params.dphi_coeff_low_dr * maxDeltaR; } // splitting large bins into more consistent sizes @@ -526,7 +528,7 @@ gbts_seeding_algorithm::output_type gbts_seeding_algorithm::operator()( cudaMemcpyHostToDevice, stream); cudaMalloc(&ctx.d_counters, sizeof(unsigned int) * 12); - cudaMemset(ctx.d_counters, 0, sizeof(unsigned int) * 12); + cudaMemsetAsync(ctx.d_counters, 0, sizeof(unsigned int) * 12, stream); cudaStreamSynchronize(stream); @@ -536,15 +538,17 @@ gbts_seeding_algorithm::output_type gbts_seeding_algorithm::operator()( cudaMalloc(&ctx.d_edge_nodes, sizeof(int2) * ctx.nMaxEdges); cudaMalloc(&ctx.d_num_incoming_edges, sizeof(int) * (ctx.nNodes + 1)); - cudaMemset(ctx.d_num_incoming_edges, 0, sizeof(int) * (ctx.nNodes + 1)); + cudaMemsetAsync(ctx.d_num_incoming_edges, 0, sizeof(int) * (ctx.nNodes + 1), + stream); nBlocks = ctx.nUsedBinPairs; nThreads = 128; kernels::graphEdgeMakingKernel<<>>( ctx.d_bin_pair_views, ctx.d_bin_pair_dphi, ctx.d_node_params, - ctx.d_algo_params, ctx.d_counters, ctx.d_edge_nodes, ctx.d_edge_params, - ctx.d_num_incoming_edges, ctx.nMaxEdges, m_config.n_phi_bins); + ctx.d_graph_building_params, ctx.d_counters, ctx.d_edge_nodes, + ctx.d_edge_params, ctx.d_num_incoming_edges, ctx.nMaxEdges, + m_config.n_phi_bins); cudaStreamSynchronize(stream); cudaFree(ctx.d_node_params); @@ -615,18 +619,18 @@ gbts_seeding_algorithm::output_type gbts_seeding_algorithm::operator()( data_size = ctx.nEdges * sizeof(unsigned char); cudaMalloc(&ctx.d_num_neighbours, data_size); - cudaMemset(ctx.d_num_neighbours, 0, data_size); + cudaMemsetAsync(ctx.d_num_neighbours, 0, data_size, stream); data_size = ctx.nEdges * sizeof(int); cudaMalloc(&ctx.d_reIndexer, data_size); - cudaMemset(ctx.d_reIndexer, 0xFF, data_size); + cudaMemsetAsync(ctx.d_reIndexer, 0xFF, data_size, stream); data_size = m_config.max_num_neighbours * ctx.nEdges * sizeof(int); cudaMalloc(&ctx.d_neighbours, data_size); kernels::graphEdgeMatchingKernel<<>>( - ctx.d_algo_params, ctx.d_edge_params, ctx.d_edge_nodes, + ctx.d_graph_building_params, ctx.d_edge_params, ctx.d_edge_nodes, ctx.d_num_incoming_edges, ctx.d_edge_links, ctx.d_num_neighbours, ctx.d_neighbours, ctx.d_reIndexer, ctx.d_counters, ctx.nEdges, m_config.max_num_neighbours); @@ -709,7 +713,8 @@ gbts_seeding_algorithm::output_type gbts_seeding_algorithm::operator()( data_size = ctx.nConnectedEdges * sizeof(int); cudaMalloc(&ctx.d_active_edges, data_size); - cudaMemset(ctx.d_active_edges, 0xFF, data_size); // initialize to -1 + cudaMemsetAsync(ctx.d_active_edges, 0xFF, data_size, + stream); // initialize to -1 data_size = 2 * ctx.nConnectedEdges * sizeof(unsigned char); @@ -717,16 +722,9 @@ gbts_seeding_algorithm::output_type gbts_seeding_algorithm::operator()( cudaMalloc(&ctx.d_levels, data_size); // initalize to 1 so level counts the maxium number of edge segments // for a seed originating at the edge - cudaMemset(ctx.d_levels, 0x1, data_size); - - data_size = ctx.nConnectedEdges * sizeof(int); - - cudaMalloc(&ctx.d_level_views, data_size); // level-based edge views - - data_size = (traccc::device::gbts_consts::max_cca_iter + 1) * sizeof(int); + cudaMemsetAsync(ctx.d_levels, 0x1, data_size, stream); - cudaMalloc(&ctx.d_level_boundaries, data_size); - cudaMemset(ctx.d_level_boundaries, 0, data_size); + cudaMalloc(&ctx.d_outgoing_paths, ctx.nConnectedEdges * sizeof(short2)); unsigned int nEdgesLeft = ctx.nConnectedEdges; @@ -744,8 +742,8 @@ gbts_seeding_algorithm::output_type gbts_seeding_algorithm::operator()( iter++) { kernels::CCA_IterationKernel<<>>( ctx.d_output_graph, ctx.d_levels, ctx.d_active_edges, - ctx.d_level_views, ctx.d_level_boundaries, ctx.d_counters, iter, - ctx.nConnectedEdges, m_config.max_num_neighbours); + ctx.d_outgoing_paths, ctx.d_counters, iter, ctx.nConnectedEdges, + m_config.max_num_neighbours, m_config.minLevel); cudaStreamSynchronize(stream); } @@ -762,131 +760,114 @@ gbts_seeding_algorithm::output_type gbts_seeding_algorithm::operator()( return {0, m_mr.main}; } - int nEdgesByLevel_cuml[traccc::device::gbts_consts::max_cca_iter + 1]; - nEdgesByLevel_cuml[traccc::device::gbts_consts::max_cca_iter] = 0; - - cudaMemcpyAsync(&nEdgesByLevel_cuml[0], ctx.d_level_boundaries, - sizeof(nEdgesByLevel_cuml), cudaMemcpyDeviceToHost, stream); + nThreads = 128; + nBlocks = 1 + (ctx.nConnectedEdges - 1) / nThreads; - int level_max = traccc::device::gbts_consts::max_cca_iter; - for (; nEdgesByLevel_cuml[level_max - 1] == 0; level_max--) - ; + cudaStreamSynchronize(stream); - if (level_max < m_config.minLevel) - return {0, m_mr.main}; - // 7. extract seeds, longest segment first + kernels::count_terminus_edges<<>>( + ctx.d_path_store, ctx.d_outgoing_paths, ctx.d_counters, + ctx.nConnectedEdges); - int device; - cudaGetDevice(&device); - int SM_count; - cudaDeviceGetAttribute(&SM_count, cudaDevAttrMultiProcessorCount, device); - int smem; - cudaDeviceGetAttribute(&smem, cudaDevAttrMaxSharedMemoryPerMultiprocessor, - device); + cudaStreamSynchronize(stream); - nThreads = 1024; + // nPaths to terminus, nTerminusEdges + unsigned int path_sizes[2]; + cudaMemcpyAsync(&path_sizes, &ctx.d_counters[6], 2 * sizeof(unsigned int), + cudaMemcpyDeviceToHost, stream); - nBlocks = 0; - int smem_per_block = static_cast(sizeof(kernels::edgeState)) * - traccc::device::gbts_consts::shared_state_buffer_size; + TRACCC_DEBUG(path_sizes[0] << "size of path store | nTerminusEdges " + << path_sizes[1]); - unsigned int soft_max_blocks = - static_cast(SM_count * (smem / smem_per_block)); + cudaMalloc(&ctx.d_path_store, + (path_sizes[0] + path_sizes[1]) * sizeof(int2)); + cudaMalloc(&ctx.d_seed_proposals, path_sizes[0] * sizeof(int2)); + cudaMalloc(&ctx.d_seed_ambiguity, path_sizes[0] * sizeof(char)); - unsigned int nMaxMini = 10000 + ctx.nConnectedEdges * 3; - cudaMalloc(&ctx.d_mini_states, sizeof(int2) * nMaxMini); + cudaMalloc(&ctx.d_edge_bids, + ctx.nConnectedEdges * sizeof(unsigned long long int)); + cudaMemsetAsync(ctx.d_edge_bids, 0, + ctx.nConnectedEdges * sizeof(unsigned long long int), + stream); - unsigned int nMaxStateStore = 2000 + ctx.nConnectedEdges; - cudaMalloc(&ctx.d_state_store, sizeof(kernels::edgeState) * nMaxStateStore); + kernels::add_terminus_to_path_store<<>>( + ctx.d_path_store, ctx.d_outgoing_paths, ctx.d_counters, + ctx.nConnectedEdges); - unsigned int nMaxProps = 4000 + ctx.nConnectedEdges; - cudaMalloc(&ctx.d_seed_proposals, sizeof(int2) * nMaxProps); - cudaMalloc(&ctx.d_seed_ambiguity, sizeof(char) * nMaxProps); + nBlocks = 1 + (path_sizes[1] - 1) / 16; + nThreads = 128; - cudaMalloc(&ctx.d_edge_bids, - sizeof(unsigned long long int) * ctx.nConnectedEdges); + kernels::fill_path_store<<>>( + ctx.d_path_store, ctx.d_output_graph, ctx.d_levels, ctx.d_counters, + path_sizes[1], m_config.max_num_neighbours); - unsigned int nMaxSeeds = 20 + ctx.nConnectedEdges / 20; - cudaMalloc(&ctx.d_seeds, sizeof(kernels::Tracklet) * nMaxSeeds); + nThreads = 128; + nBlocks = 1 + (path_sizes[0] + path_sizes[1] - 1) / nThreads; - int view_shift = nEdgesByLevel_cuml[0]; - for (int level = level_max - 1; level + 1 >= m_config.minLevel; level--) { - unsigned int nRootEdges = static_cast( - nEdgesByLevel_cuml[level] - nEdgesByLevel_cuml[level_max]); + kernels::fit_segments<<>>( + ctx.d_reducedSP, ctx.d_output_graph, ctx.d_path_store, + ctx.d_seed_proposals, ctx.d_edge_bids, ctx.d_seed_ambiguity, + ctx.d_levels, ctx.d_counters, path_sizes[1], m_config.minLevel, + m_config.max_num_neighbours, m_config.seed_extraction_params); - if (nRootEdges == 0) - continue; - unsigned int estimated_nStates = - static_cast(std::pow(1.3f, level + 1) * nRootEdges); + unsigned int nProps = 0; + cudaMemcpyAsync(&nProps, &ctx.d_counters[8], sizeof(unsigned int), + cudaMemcpyDeviceToHost, stream); - nBlocks += - 1 + estimated_nStates / - traccc::device::gbts_consts::shared_state_buffer_size; + TRACCC_DEBUG("nProps " << nProps); - if (nBlocks > soft_max_blocks || level_max - level > 3 || - level + 1 == m_config.minLevel) { + cudaStreamSynchronize(stream); - int view_min = view_shift - nEdgesByLevel_cuml[level]; - int view_max = view_shift - nEdgesByLevel_cuml[level_max]; + cudaFree(ctx.d_levels); + cudaFree(ctx.d_outgoing_paths); + cudaFree(ctx.d_reducedSP); - if (view_min == view_max) - continue; + if (nProps == 0) { + return {0, m_mr.main}; + } - cudaMemset(ctx.d_edge_bids, 0, - sizeof(unsigned long long int) * ctx.nConnectedEdges); + nThreads = 128; + nBlocks = 1 + (nProps - 1) / nThreads; - kernels::seed_extracting_kernel<<>>( - view_min, view_max, ctx.d_level_views, ctx.d_levels, - ctx.d_reducedSP, ctx.d_output_graph, ctx.d_mini_states, - ctx.d_state_store, ctx.d_edge_bids, ctx.d_seed_ambiguity, - ctx.d_seed_proposals, ctx.d_seeds, ctx.d_counters, - m_config.minLevel, nMaxMini, nMaxProps, - nMaxStateStore / nBlocks, nMaxSeeds, - m_config.max_num_neighbours); + kernels::reset_edge_bids<<>>( + ctx.d_path_store, ctx.d_seed_proposals, ctx.d_edge_bids, + ctx.d_seed_ambiguity, ctx.d_counters, -1); - level_max = level; - nBlocks = 0; - } - } - cudaStreamSynchronize(stream); + for (int round = 0; round < 5; ++round) { - cudaFree(ctx.d_levels); - cudaFree(ctx.d_level_views); - cudaFree(ctx.d_level_boundaries); - cudaFree(ctx.d_state_store); - cudaFree(ctx.d_mini_states); - cudaFree(ctx.d_seed_proposals); - cudaFree(ctx.d_edge_bids); - cudaFree(ctx.d_seed_ambiguity); + cudaMemsetAsync(ctx.d_edge_bids, 0, + ctx.nConnectedEdges * sizeof(unsigned long long int), + stream); - cudaFree(ctx.d_output_graph); - cudaFree(ctx.d_algo_params); - cudaFree(ctx.d_reducedSP); + kernels::seeds_rebid_for_edges<<>>( + ctx.d_path_store, ctx.d_seed_proposals, ctx.d_edge_bids, + ctx.d_seed_ambiguity, nProps); - cudaMemcpyAsync(&ctx.nSeeds, &ctx.d_counters[9], sizeof(unsigned int), - cudaMemcpyDeviceToHost, stream); + kernels::reset_edge_bids<<>>( + ctx.d_path_store, ctx.d_seed_proposals, ctx.d_edge_bids, + ctx.d_seed_ambiguity, ctx.d_counters, round); + } - if (ctx.nSeeds > nMaxSeeds) - ctx.nSeeds = nMaxSeeds; - if (ctx.nSeeds == 0) - return {0, m_mr.main}; + cudaFree(ctx.d_edge_bids); + cudaFree(ctx.d_counters); + cudaFree(ctx.d_graph_building_params); // 8. convert to 3sp seeds and make output buffer edm::seed_collection::buffer output_seeds( - ctx.nSeeds, m_mr.main, vecmem::data::buffer_type::resizable); + nProps, m_mr.main, vecmem::data::buffer_type::resizable); m_copy.get().setup(output_seeds)->ignore(); - nThreads = 128; - nBlocks = 1 + (ctx.nSeeds - 1) / nThreads; - kernels::gbts_seed_conversion_kernel<<>>( - ctx.d_seeds, output_seeds, ctx.nSeeds); + ctx.d_seed_proposals, ctx.d_seed_ambiguity, ctx.d_path_store, + ctx.d_output_graph, output_seeds, nProps, m_config.max_num_neighbours); cudaStreamSynchronize(stream); - cudaFree(ctx.d_seeds); - cudaFree(ctx.d_counters); + cudaFree(ctx.d_output_graph); + cudaFree(ctx.d_path_store); + cudaFree(ctx.d_seed_proposals); + cudaFree(ctx.d_seed_ambiguity); error = cudaGetLastError(); diff --git a/device/cuda/src/gbts_seeding/kernels/GbtsGraphMakingKernels.cuh b/device/cuda/src/gbts_seeding/kernels/GbtsGraphMakingKernels.cuh index 96cd847589..8a6c22989c 100644 --- a/device/cuda/src/gbts_seeding/kernels/GbtsGraphMakingKernels.cuh +++ b/device/cuda/src/gbts_seeding/kernels/GbtsGraphMakingKernels.cuh @@ -35,7 +35,8 @@ inline __device__ __host__ half4 make_half4(const __half x, const __half y, __global__ static void graphEdgeMakingKernel( const uint4* d_bin_pair_views, const float* d_bin_pair_dphi, - const float* d_node_params, const gbts_algo_params* d_algo_params, + const float* d_node_params, + const gbts_graph_building_params* d_graph_building_params, unsigned int* d_counters, int2* d_edge_nodes, half4* d_edge_params, int* d_num_outgoing_edges, const unsigned int nMaxEdges, const unsigned int nPhiBins) { @@ -71,15 +72,15 @@ __global__ static void graphEdgeMakingKernel( num_nodes1 = views.y - begin_bin1; num_nodes2 = views.w - begin_bin2; - minDeltaRad = d_algo_params->minDeltaRadius; - min_z0 = d_algo_params->min_z0; - max_z0 = d_algo_params->max_z0; - maxOuterRad = d_algo_params->maxOuterRadius; - min_zU = d_algo_params->cut_zMinU; - max_zU = d_algo_params->cut_zMaxU; - max_kappa = d_algo_params->max_Kappa; - low_Kappa_d0 = d_algo_params->low_Kappa_d0; - high_Kappa_d0 = d_algo_params->high_Kappa_d0; + minDeltaRad = d_graph_building_params->minDeltaRadius; + min_z0 = d_graph_building_params->min_z0; + max_z0 = d_graph_building_params->max_z0; + maxOuterRad = d_graph_building_params->maxOuterRadius; + min_zU = d_graph_building_params->cut_zMinU; + max_zU = d_graph_building_params->cut_zMaxU; + max_kappa = d_graph_building_params->max_Kappa; + low_Kappa_d0 = d_graph_building_params->low_Kappa_d0; + high_Kappa_d0 = d_graph_building_params->high_Kappa_d0; } __syncthreads(); @@ -221,14 +222,15 @@ __global__ static void graphEdgeMakingKernel( unsigned int nEdges = atomicAdd(&d_counters[0], 1); if (nEdges < nMaxEdges) { __half exp_eta = __float2half(sqrtf(1 + tau * tau) - tau); - atomicAdd(&d_num_outgoing_edges[globalIdx2], 1); + // edge linking order is inside->out + atomicAdd(&d_num_outgoing_edges[begin_bin1 + n1Idx], 1); d_edge_nodes[nEdges] = - make_int2(begin_bin1 + n1Idx, globalIdx2); + make_int2(globalIdx2, begin_bin1 + n1Idx); d_edge_params[nEdges] = make_half4( - exp_eta, __float2half(curv), __float2half(phi1 + curv * r1), - __float2half(phi2 + curv * r2)); + exp_eta, __float2half(curv), __float2half(phi2 + curv * r2), + __float2half(phi1 + curv * r1)); } } } @@ -244,21 +246,21 @@ __global__ static void graphEdgeLinkingKernel(const int2* d_edge_nodes, if (edge_idx >= nEdges) { return; } - int n2Idx = d_edge_nodes[edge_idx].y; // global index of n2 + int sharedNode = d_edge_nodes[edge_idx].y; // this converts num_outgoing_edges to the start postion for each node in // d_edge_links - int pos = atomicSub(&d_num_outgoing_edges[n2Idx], 1); - // this edge starts from n2, matching will check edge's n1 and then loop - // over edges outgoing from that node + int pos = atomicSub(&d_num_outgoing_edges[sharedNode], 1); + // provides views of edges leaving the sharedNode for linking d_edge_links[pos - 1] = edge_idx; } __global__ static void graphEdgeMatchingKernel( - const gbts_algo_params* d_algo_params, const half4* d_edge_params, - const int2* d_edge_nodes, const int* d_num_outgoing_edges, - const int* d_edge_links, unsigned char* d_num_neighbours, int* d_neighbours, - int* d_reIndexer, unsigned int* d_counters, const unsigned int nEdges, + const gbts_graph_building_params* d_graph_building_params, + const half4* d_edge_params, const int2* d_edge_nodes, + const int* d_num_outgoing_edges, const int* d_edge_links, + unsigned char* d_num_neighbours, int* d_neighbours, int* d_reIndexer, + unsigned int* d_counters, const unsigned int nEdges, const unsigned int nMaxNei) { __shared__ __half cut_dphi_max; __shared__ __half cut_dcurv_max; @@ -267,9 +269,10 @@ __global__ static void graphEdgeMatchingKernel( __shared__ __half PI_2_h; __shared__ __half ONE_h; if (threadIdx.x == 0) { - cut_dphi_max = __float2half(d_algo_params->cut_dphi_max); - cut_dcurv_max = __float2half(d_algo_params->cut_dcurv_max); - cut_tau_ratio_max = __float2half(d_algo_params->cut_tau_ratio_max); + cut_dphi_max = __float2half(d_graph_building_params->cut_dphi_max); + cut_dcurv_max = __float2half(d_graph_building_params->cut_dcurv_max); + cut_tau_ratio_max = + __float2half(d_graph_building_params->cut_tau_ratio_max); PI_h = __float2half(CUDART_PI_F); PI_2_h = __float2half(2 * CUDART_PI_F); @@ -282,12 +285,12 @@ __global__ static void graphEdgeMatchingKernel( if (edge1_idx >= nEdges) { return; } - // global index of n1 node of the edge1 - int n1Idx = d_edge_nodes[edge1_idx].x; - int link_begin = d_num_outgoing_edges[n1Idx]; - // the number of edges which has n1 as their starting node (n2) - int nLinks = d_num_outgoing_edges[n1Idx + 1] - link_begin; + int sharedNode = d_edge_nodes[edge1_idx].x; + + int link_begin = d_num_outgoing_edges[sharedNode]; + // the number of edges leaving the sharedNode + int nLinks = d_num_outgoing_edges[sharedNode + 1] - link_begin; if (nLinks == 0) { return; } diff --git a/device/cuda/src/gbts_seeding/kernels/GbtsGraphProcessingKernels.cuh b/device/cuda/src/gbts_seeding/kernels/GbtsGraphProcessingKernels.cuh index 16e952d650..f357a04611 100644 --- a/device/cuda/src/gbts_seeding/kernels/GbtsGraphProcessingKernels.cuh +++ b/device/cuda/src/gbts_seeding/kernels/GbtsGraphProcessingKernels.cuh @@ -16,20 +16,19 @@ namespace traccc::cuda::kernels { -// currently 80 bytes -> 5 v4 loads/stores -struct __align__(16) edgeState { +struct edgeState { __device__ inline void initialize(const float4& node1_params, const float4& node2_params); __device__ inline float& m_Cx(const int i, const int j) { - return Cx[i + j + 1 * (i == 0) * (j == 0)]; + return Cx[i + j + 1 * (i != 0) * (j != 0)]; } __device__ inline float& m_Cy(const int i, const int j) { return Cy[i + j]; } __device__ inline const float& m_Cx(const int i, const int j) const { - return Cx[i + j + 1 * (i == 0) * (j == 0)]; + return Cx[i + j + 1 * (i != 0) * (j != 0)]; } __device__ inline const float& m_Cy(const int i, const int j) const { return Cy[i + j]; @@ -38,17 +37,13 @@ struct __align__(16) edgeState { float m_X[3], m_Y[2]; float m_c, m_s, m_refX, m_refY; - int m_J : 31; + float m_J; - bool m_head_node_type : 1; - - unsigned int m_mini_idx : 27; - unsigned int m_length : 5; - int m_edge_idx; + bool m_head_node_type; // upper triangle of the Cov matrix for the parabola in the x,y plane since // symetry gives the rest - float Cx[5]; //(0,0), (0,1), (0,2), (1,1), (1,2), (2,2) + float Cx[6]; //(0,0), (0,1), (0,2), (1,1), (1,2), (2,2) // Cov matrix for the linear fit of eta and z float Cy[3]; //(0,0), (0,1), (1,1) }; @@ -61,22 +56,21 @@ struct Tracklet { /** @brief Performs one iteration of the CCA over the graph to calculate * potential seed length * - * This also constructs a level view over the graph edges for use in seed - * extraction + * also counts the size of d_path_store to describe all the paths back + * to the inner-most (terminus) edges * * @param[in] d_output_graph see comments in device_context.h * @param[in] d_levels is the maximum seed length originating at each edge * @param[in/out] d_active_edges stores the edge_idx for edges that have not - * reached their level - * @param[out] d_level_views / d_level_boundaires together they construct a - * view over the edges by level for use in seed extraction + * reached their level + * @param[out] d_outgoing_paths [#paths needed, is-terminus] * @param[internal] d_counters[5 and 6] used for counting the last block and * for the number of active edges */ __global__ static void CCA_IterationKernel( const int* d_output_graph, char* d_levels, int* d_active_edges, - int* d_level_views, int* d_level_boundaries, unsigned int* d_counters, - int iter, unsigned int nEdges, unsigned int max_num_neighbours) { + short2* d_outgoing_paths, unsigned int* d_counters, int iter, + unsigned int nEdges, unsigned int max_num_neighbours, int minLevel) { __shared__ unsigned int nEdgesLeft; unsigned int edge_size = 2 + 1 + max_num_neighbours; @@ -117,13 +111,34 @@ __global__ static void CCA_IterationKernel( } } // add all remianing edges to level_views on the last iteration - if (localChange && - iter < traccc::device::gbts_consts::max_cca_iter - 1) { - // nChanges - unsigned int edgesLeftPlace = atomicAdd(&d_counters[4 - toggle], 1); - d_active_edges[edgesLeftPlace] = edgeIdx; // for the next iteration + if (localChange) { + if (iter == traccc::device::gbts_consts::max_cca_iter - 1) { + // shorten paths longer than max_cca_iter + d_outgoing_paths[edgeIdx].y = -1; + } else { + unsigned int edgesLeftPlace = + atomicAdd(&d_counters[4 - toggle], 1); + d_active_edges[edgesLeftPlace] = + edgeIdx; // for the next iteration + } } else { - d_level_views[atomicAdd(&d_counters[6], 1)] = edgeIdx; + short out_paths = 0; + for (int nIdx = 0; nIdx < nNei; ++nIdx) { + int nextEdgeIdx = + d_output_graph[edge_pos + + traccc::device::gbts_consts::nei_start + + nIdx]; + if (next_level == 1 + d_levels[nextEdgeIdx]) { + // calculate the #d_state_store nodes for segment extraction + // starting at this edge + out_paths += 1 + d_outgoing_paths[nextEdgeIdx].x; + } + // flag as not terminus edge + d_outgoing_paths[nextEdgeIdx].y = -1; + } + // flag as long enough segement to become a seed + d_outgoing_paths[edgeIdx] = + make_short2(out_paths, (next_level >= minLevel) - 1); } // store new level and ensure all final // levels are on both sides of the array @@ -134,24 +149,165 @@ __global__ static void CCA_IterationKernel( if (threadIdx.x == 0) { if (atomicAdd(&d_counters[5], 1) == gridDim.x - 1) { // this is the last block - d_level_boundaries[iter] = nEdgesLeft; d_counters[3 + toggle] = 0; d_counters[5] = 0; } } } +void __global__ count_terminus_edges(int2* d_path_store, + short2* d_outgoing_paths, + unsigned int* d_counters, + unsigned int nEdges) { + + // count in shared first to reduce global atomics + __shared__ int outgoingCount; + + if (threadIdx.x == 0) { + outgoingCount = 0; + } + __syncthreads(); + + int edge_idx = threadIdx.x + blockIdx.x * blockDim.x; + if (edge_idx < nEdges) { + + short2 out_paths = d_outgoing_paths[edge_idx]; + // only count terminus edges that could lead to a seed + // fill the first part of path_store so fitting can skip go-nowhere + // paths + if (out_paths.y != -1) { + d_outgoing_paths[edge_idx].y = atomicAdd(&d_counters[7], 1); + atomicAdd(&outgoingCount, out_paths.x); + } + } + __syncthreads(); + if (threadIdx.x == 0) { + atomicAdd(&d_counters[6], outgoingCount); + } +} + +void __global__ add_terminus_to_path_store(int2* d_path_store, + short2* d_outgoing_paths, + unsigned int* d_counters, + unsigned int nEdges) { + + int edge_idx = threadIdx.x + blockIdx.x * blockDim.x; + if (edge_idx >= nEdges) { + return; + } + short2 out_paths = d_outgoing_paths[edge_idx]; + if (out_paths.y == -1) { + return; + } + // -1 flags as the terminus of a path + d_path_store[out_paths.y] = make_int2(edge_idx, -1); +} + +// each node in the path_store defines a unique path through the graph +// but includes subsets like 0->1->2 and 1->2 +// The paths order the edges in reverse order compared to graph linking +// extracting all paths here allows for fitting to occor down know paths in +// registers +void __global__ fill_path_store(int2* d_path_store, int* d_output_graph, + char* d_levels, unsigned int* d_counters, + unsigned int nTerminus, + unsigned int max_num_neighbours) { + + __shared__ int2 live_paths[traccc::device::gbts_consts::live_path_buffer]; + __shared__ int n_live_paths; + + if (threadIdx.x == 0) { + n_live_paths = 0; + } + __syncthreads(); + + int edge_size = 2 + 1 + max_num_neighbours; + + unsigned int path_idx = threadIdx.x + blockIdx.x * 16; + // populate live_paths with terminus to start exploration from + if (threadIdx.x < 16 && path_idx < nTerminus) { + int2 path = d_path_store[path_idx]; + int nNei = d_output_graph[traccc::device::gbts_consts::nNei + + edge_size * path.x]; + char level = d_levels[path.x]; + for (int nei = 0; nei < nNei; ++nei) { + int edge_idx = + d_output_graph[traccc::device::gbts_consts::nei_start + nei + + edge_size * path.x]; + // only search down longest path + if (level != d_levels[edge_idx] + 1) { + continue; + } + int live_idx = atomicAdd(&n_live_paths, 1); + if (live_idx >= traccc::device::gbts_consts::live_path_buffer) { + break; + } + int new_path_idx = atomicAdd(&d_counters[7], 1); + // head edge idx, link back + d_path_store[new_path_idx] = make_int2(edge_idx, path_idx); + live_paths[live_idx] = make_int2(edge_idx, new_path_idx); + } + } + __syncthreads(); + + int2 path = make_int2(0, 0); + bool has_path = false; + + while (n_live_paths > 0) { + has_path = false; + if (threadIdx.x == 0) { + n_live_paths = min(n_live_paths, + traccc::device::gbts_consts::live_path_buffer); + } + __syncthreads(); + // get path + if (threadIdx.x < n_live_paths) { + path = live_paths[n_live_paths - threadIdx.x - 1]; + has_path = true; + } + __syncthreads(); + if (threadIdx.x == 0) { + n_live_paths = + n_live_paths < blockDim.x ? 0 : n_live_paths - blockDim.x; + } + __syncthreads(); + if (has_path) { + int nNei = d_output_graph[traccc::device::gbts_consts::nNei + + edge_size * path.x]; + char level = d_levels[path.x]; + for (int nei = 0; nei < nNei; ++nei) { + int edge_idx = + d_output_graph[traccc::device::gbts_consts::nei_start + + nei + edge_size * path.x]; + // only search down longest segments + if (level != d_levels[edge_idx] + 1) { + continue; + } + int live_idx = atomicAdd(&n_live_paths, 1); + if (live_idx >= traccc::device::gbts_consts::live_path_buffer) { + break; + } + path_idx = atomicAdd(&d_counters[7], 1); + // head edge idx, link back + d_path_store[path_idx] = make_int2(edge_idx, path.y); + live_paths[live_idx] = make_int2(edge_idx, path_idx); + } + } + // wait for live_paths to repopulate + __syncthreads(); + } +} + /** @brief initialize the Kalman filter for this new edgeState from the starting * edge (2 nodes) * * @param[in] node1_params / node2_params the nodes of the starting edge. Node - * 1 is the inner node and we filter outside in + * 1 is the inner node and we filter outside in */ __device__ inline void edgeState::initialize( const float4& node1_params, const float4& node2_params) { // x, y, z,type - m_J = 0; - m_length = 1; + m_J = 0.0f; m_head_node_type = (node1_params.w < 0); // n2->n1 @@ -204,51 +360,28 @@ __device__ inline void edgeState::initialize( * Seed extraction goes outside in * * @param[out] new_ts output edgeState for the updated seed including node1 - * @param[in] ts input edgeState is const because it will be re used for each - * of its head edge's connections + * this is also used for register space + * @param[in] ts input edgeState * @param[in] node1_params params of the inner node of the new edge to be added - * to the seed + * to the seed */ -inline __device__ bool update( - edgeState* new_ts, const edgeState* ts, - const float4& node1_params) { // params are x, y, z, type - - const float sigma_t = 0.0003f; - const float sigma_w = .00009f; +inline __device__ bool update(edgeState* new_ts, const edgeState* ts, + const float4& node1_params, + // node params are x, y, z, type + const gbts_seed_extraction_params& KF_params) { - const float sigmaMS = 0.016f; + float tau2 = ts->m_Y[1] * ts->m_Y[1]; + float invSin2 = 1 + tau2; - const float sigma_x = 0.25f; // was 0.22 - const float sigma_y = 2.5f; // was 1.7 + float lenCorr = (node1_params.w != -1) ? invSin2 : invSin2 / tau2; + float minPtFrac = fabsf(ts->m_X[2]) * KF_params.inv_max_curvature; - const float weight_x = 0.5f; - const float weight_y = 0.5f; - - const float maxDChi2_x = 60.0f; // was 35.0; - const float maxDChi2_y = 60.0f; // was 31.0; - - const float add_hit = 14.0f; - // m_J is stored in 30 + sign bits so max qual = INT_MAX/2 = - // add_hit*max_length*qual_scale - const float qual_scale = - 0.5 * static_cast(INT_MAX) / - static_cast(add_hit * - traccc::device::gbts_consts::max_cca_iter) - - 1; + float corrMS = KF_params.sigmaMS * minPtFrac; + float sigma2 = KF_params.radLen * lenCorr * corrMS * corrMS; // /invSin2; // add ms. - float m_Cx22 = ts->m_Cx(2, 2) + sigma_w * sigma_w; - float m_Cx11 = ts->m_Cx(1, 1) + sigma_t * sigma_t; - - float t2 = node1_params.w != -1 ? 1 + ts->m_Y[1] * ts->m_Y[1] - : 1 + 1 / (ts->m_Y[1] * ts->m_Y[1]); - - float s1 = sigmaMS * t2; - float s2 = s1 * s1; - - s2 *= sqrtf(t2); - - float m_Cy11 = ts->m_Cy(1, 1) + s2; + float m_Cx11 = ts->m_Cx(1, 1) + sigma2; + float m_Cy11 = ts->m_Cy(1, 1) + sigma2; // extrapolation @@ -274,18 +407,20 @@ inline __device__ bool update( new_ts->m_Cx(0, 0) = ts->m_Cx(0, 0) + 2 * ts->m_Cx(0, 1) * A + 2 * ts->m_Cx(0, 2) * B + A * m_Cx11 * A + - 2 * A * ts->m_Cx(1, 2) * B + B * m_Cx22 * B; + 2 * A * ts->m_Cx(1, 2) * B + B * ts->m_Cx(2, 2) * B; new_ts->m_Cx(0, 1) = ts->m_Cx(0, 1) + m_Cx11 * A + ts->m_Cx(1, 2) * B + - ts->m_Cx(0, 2) * A + A * A * ts->m_Cx(1, 2) + - A * m_Cx22 * B; + ts->m_Cx(0, 2) * A + A * ts->m_Cx(1, 2) * A + + A * ts->m_Cx(2, 2) * B; - new_ts->m_Cx(0, 2) = ts->m_Cx(0, 2) + ts->m_Cx(1, 2) * A + m_Cx22 * B; + new_ts->m_Cx(0, 2) = + ts->m_Cx(0, 2) + ts->m_Cx(1, 2) * A + ts->m_Cx(2, 2) * B; - new_ts->m_Cx(1, 1) = m_Cx11 + 2 * A * ts->m_Cx(1, 2) + A * m_Cx22 * A; - new_ts->m_Cx(1, 2) = ts->m_Cx(1, 2) + m_Cx22 * A; + new_ts->m_Cx(1, 1) = + m_Cx11 + 2 * A * ts->m_Cx(1, 2) + A * ts->m_Cx(2, 2) * A; + new_ts->m_Cx(1, 2) = ts->m_Cx(1, 2) + ts->m_Cx(2, 2) * A; - new_ts->m_Cx(2, 2) = m_Cx22; + new_ts->m_Cx(2, 2) = ts->m_Cx(2, 2); new_ts->m_Y[0] = ts->m_Y[0] + ts->m_Y[1] * dr; new_ts->m_Y[1] = ts->m_Y[1]; @@ -304,47 +439,60 @@ inline __device__ bool update( if (!ts->m_head_node_type) { // barrel TO-DO: split into barrel Pixel and barrel SCT - sigma_rz = sigma_y * sigma_y; + sigma_rz = KF_params.sigma_y; } else { - sigma_rz = sigma_y * ts->m_Y[1]; - sigma_rz = sigma_rz * sigma_rz; + sigma_rz = KF_params.sigma_y * ts->m_Y[1]; } - float Dx = 1 / (new_ts->m_Cx(0, 0) + sigma_x * sigma_x); + float inv_Dx = new_ts->m_Cx(0, 0) + KF_params.sigma_x * KF_params.sigma_x; + float Dx = 1 / inv_Dx; - float Dy = 1 / (new_ts->m_Cy(0, 0) + sigma_rz); + float Dy = 1 / (new_ts->m_Cy(0, 0) + sigma_rz * sigma_rz); float dchi2_x = resid_x * resid_x * Dx; float dchi2_y = resid_y * resid_y * Dy; - if (dchi2_x > maxDChi2_x || dchi2_y > maxDChi2_y) { + if (dchi2_x > KF_params.maxDChi2_x || dchi2_y > KF_params.maxDChi2_y) { return false; } // state update - new_ts->m_J = - ts->m_J + - static_cast((add_hit - dchi2_x * weight_x - dchi2_y * weight_y) * - qual_scale); - - new_ts->m_length = ts->m_length + 1; + new_ts->m_J = ts->m_J + (KF_params.add_hit - dchi2_x * KF_params.weight_x - + dchi2_y * KF_params.weight_y); for (int i = 0; i < 3; i++) { new_ts->m_X[i] += Dx * new_ts->m_Cx(0, i) * resid_x; } + + if (fabsf(new_ts->m_X[2]) * KF_params.inv_max_curvature > 1.0f) { + return false; + } + for (int i = 0; i < 2; i++) { new_ts->m_Y[i] += Dx * new_ts->m_Cy(0, i) * resid_y; } - new_ts->m_Cx(2, 2) -= Dx * new_ts->m_Cx(0, 2) * new_ts->m_Cx(0, 2); - new_ts->m_Cx(1, 2) -= Dx * new_ts->m_Cx(0, 1) * new_ts->m_Cx(0, 2); - new_ts->m_Cx(1, 1) -= Dx * new_ts->m_Cx(0, 1) * new_ts->m_Cx(0, 1); - new_ts->m_Cx(0, 2) -= Dx * new_ts->m_Cx(0, 0) * new_ts->m_Cx(0, 2); - new_ts->m_Cx(0, 1) -= Dx * new_ts->m_Cx(0, 0) * new_ts->m_Cx(0, 1); - new_ts->m_Cx(0, 0) -= Dx * new_ts->m_Cx(0, 0) * new_ts->m_Cx(0, 0); - new_ts->m_Cy(1, 1) -= Dx * new_ts->m_Cy(0, 1) * new_ts->m_Cy(0, 1); - new_ts->m_Cy(0, 1) -= Dx * new_ts->m_Cy(0, 0) * new_ts->m_Cy(0, 1); - new_ts->m_Cy(0, 0) -= Dx * new_ts->m_Cy(0, 0) * new_ts->m_Cy(0, 0); + float z0 = new_ts->m_Y[0] - new_ts->m_refY * ts->m_Y[1]; + if (fabsf(z0) > KF_params.max_z0) { + return false; + } + + // less loss from float precsion this way (helps prevent sign change) + new_ts->m_Cx(2, 2) = Dx * (new_ts->m_Cx(2, 2) * inv_Dx - + new_ts->m_Cx(0, 2) * new_ts->m_Cx(0, 2)); + new_ts->m_Cx(1, 2) = Dx * (new_ts->m_Cx(1, 2) * inv_Dx - + new_ts->m_Cx(0, 1) * new_ts->m_Cx(0, 2)); + new_ts->m_Cx(1, 1) = Dx * (new_ts->m_Cx(1, 1) * inv_Dx - + new_ts->m_Cx(0, 1) * new_ts->m_Cx(0, 1)); + new_ts->m_Cx(0, 2) = Dx * (new_ts->m_Cx(0, 2) * inv_Dx - + new_ts->m_Cx(0, 0) * new_ts->m_Cx(0, 2)); + new_ts->m_Cx(0, 1) = Dx * (new_ts->m_Cx(0, 1) * inv_Dx - + new_ts->m_Cx(0, 0) * new_ts->m_Cx(0, 1)); + new_ts->m_Cx(0, 0) *= Dx * (KF_params.sigma_x * KF_params.sigma_x); + + new_ts->m_Cy(1, 1) -= Dy * new_ts->m_Cy(0, 1) * new_ts->m_Cy(0, 1); + new_ts->m_Cy(0, 1) -= Dy * new_ts->m_Cy(0, 0) * new_ts->m_Cy(0, 1); + new_ts->m_Cy(0, 0) -= Dy * new_ts->m_Cy(0, 0) * new_ts->m_Cy(0, 0); new_ts->m_c = ts->m_c; new_ts->m_s = ts->m_s; @@ -356,396 +504,220 @@ inline __device__ bool update( /** @brief Performs seed disambiguation through seeds biding to use edges with * seed quality * - * @param[in] m_J is the quality metric output by the Kalman filter - * @param[in] mini_idx the index of final mini_state, backtracking from this - * gives the seeds path (edges->nodes) - * @param[in] d_mini_states stores the path each seed took through the graph in - * reverse order + * @param[in] qual is the quality metric output by the Kalman filter + * @param[in] path_idx the index of inital path for this seed + * @param[in] d_path_store stores the path each seed took through the graph in + * reverse order * @param[in] prop_idx the index of this new seeds proposition in - * d_seed_proposals + * d_seed_proposals * @param[in/out] d_edge_bids is [int_m_J, prop_idx] so that atomicMax will - * swap it out with higher quality bids. The index is then used to flag the - * replaced seed as maybe fake + * swap it out with higher quality bids. The index is then used to flag the + * replaced seed as maybe fake * @param[out] d_seed_proposals stores the information needed to construct an - * output Tracklet for this seed + * output Tracklet for this seed * @param[out] d_seed_ambiguity here is 0 if the seed is the highest quality - * seed using all of its edges and -1 otherwise + * seed using all of its edges and -1 otherwise */ -inline __device__ void add_seed_proposal(const int m_J, const int mini_idx, +inline __device__ void add_seed_proposal(const int qual, const int path_idx, const unsigned int prop_idx, char* d_seed_ambiguity, int2* d_seed_proposals, unsigned long long int* d_edge_bids, - const int2* d_mini_states) { + const int2* d_path_store, + char depth = -1) { // new seed bids for its edges - d_seed_proposals[prop_idx] = make_int2(m_J, mini_idx); + d_seed_proposals[prop_idx] = make_int2(qual, path_idx); d_seed_ambiguity[prop_idx] = 0; __threadfence(); // ensure above proposal info is written before biding unsigned long long int seed_bid = - (static_cast(m_J) << 32) | + (static_cast(qual) << 32) | (static_cast(prop_idx)); - int2 mini_state; - for (int next_mini = mini_idx; next_mini >= 0;) { - mini_state = d_mini_states[next_mini]; + // dummy path to start the loop + int2 path = make_int2(0, d_seed_proposals[prop_idx].y); + while (path.y >= 0 && depth != 0) { + path = d_path_store[path.y]; + depth--; unsigned long long int competing_offer = - atomicMax(&d_edge_bids[mini_state.x], seed_bid); + atomicMax(&d_edge_bids[path.x], seed_bid); if (competing_offer > seed_bid) { d_seed_ambiguity[prop_idx] = -1; } else if (competing_offer != 0) { d_seed_ambiguity[competing_offer & 0xFFFFFFFFLL] = -1; } // default bids are 0 so no need to replace - - next_mini = mini_state.y; } } -/** @brief This kerenel extracts the seeds from the graph starting with sets of - * edges with simmlar levels - * - * We start with higher levels so that the best seeds are found first and - * pruned from the graph The last block to finsh perfoms disambiguation through - * iteritive biding and fills d_seeds with the winning seeds with the nodes in - * inside-out order - * - * @param[in] view_min/view_max the range of input edges in the d_level_views - * to start forming seeds from - * @param[in] d_level_views view on the edges of d_output_graph calculated by - * the CCA - * @param[in/out] d_levels the level of each edge by edge, -1 signifes an edge - * that is allready used in seed found from a previous iteration and so has been - * removed from the graph - * @param[in] d_sp_params x,y,z,cluster-width for all nodes. Here cluster width - * denotes if a sp is in the barrel (cw != -1 => barrel) - * @param[in] d_output_graph stores the nodes, number of neighbours and - * self-referential neighbour index. - * @param[in] minLevel is length, in edges, of the smallest accepable seed (and - * is 3 by default so 4 space points) - * @param[internal] d_counters[7,8 and 10] are used to track the length of - * d_mini_state, d_seed_proposals and number of finished blocks - * @param[internal] d_seed_ambiguity, d_edge_bids used in seed diambiguation - * for details see add_seed_proposal - * @param[internal] d_mini_states stores the path each seed took through the - * graph in reverse order (inside out) - * @param[internal] d_seed_proposals stores the infomation needed to construct - * an output Tracklet for this seed - * @param[out] d_seeds stores the output seeds after disambiguation. This is - * what gets transfered back to CPU and is the final output of these kernels - * @param[out] d_counters[9] number of seeds in d_seeds - */ -__global__ void seed_extracting_kernel( - int view_min, int view_max, int* d_level_views, char* d_levels, - float4* d_sp_params, int* d_output_graph, int2* d_mini_states, - edgeState* d_state_store, unsigned long long int* d_edge_bids, - char* d_seed_ambiguity, int2* d_seed_proposals, Tracklet* d_seeds, - unsigned int* d_counters, const int minLevel, const unsigned int nMaxMini, - const unsigned int nMaxProps, const unsigned int nMaxStateStorePerBlock, - const unsigned int nMaxSeeds, const unsigned int max_num_neighbours) { +void __global__ fit_segments( + float4* d_sp_params, int* d_output_graph, int2* d_path_store, + int2* d_seed_proposals, unsigned long long int* d_edge_bids, + char* d_seed_ambiguity, char* d_levels, unsigned int* d_counters, + unsigned int nTerminusEdges, int minLevel, unsigned int max_num_neighbours, + gbts_seed_extraction_params seed_extraction_params) { + + // take an extracted path and fit it to produce a quality score + unsigned int path_idx = + threadIdx.x + blockIdx.x * blockDim.x + nTerminusEdges; + if (path_idx >= d_counters[7]) { + return; + } + int edge_size = 2 + 1 + max_num_neighbours; - __shared__ int block_start; + char length = 1; - __shared__ int total_live_states; - __shared__ int nStates; - __shared__ int nSharedSpace; + bool toggle = false; + edgeState state1; + edgeState state2; - __shared__ edgeState - current_states[traccc::device::gbts_consts::shared_state_buffer_size]; + int2 path = d_path_store[path_idx]; - int edge_size = 2 + 1 + max_num_neighbours; - if (threadIdx.x == 0) { - total_live_states = 0; + int nodeidx = + d_output_graph[traccc::device::gbts_consts::node1 + edge_size * path.x]; + float4 node1 = d_sp_params[nodeidx]; + nodeidx = + d_output_graph[traccc::device::gbts_consts::node2 + edge_size * path.x]; + float4 node2 = d_sp_params[nodeidx]; - int total_nStates = view_max - view_min; - nStates = 1 + (total_nStates - 1) / gridDim.x; + state1.initialize(node2, node1); + while (path.y >= 0) { + path = d_path_store[path.y]; - block_start = view_min + nStates * blockIdx.x; - if (block_start >= view_max) { - nStates = 0; - } else if (block_start + nStates >= view_max) { - nStates = view_max - block_start; + node2 = d_sp_params[d_output_graph[traccc::device::gbts_consts::node2 + + edge_size * path.x]]; + if (toggle) { + if (!update(&state1, &state2, node2, seed_extraction_params)) { + state1 = state2; + break; + } + } else if (!update(&state2, &state1, node2, seed_extraction_params)) { + break; } + toggle = !toggle; + length++; } - __syncthreads(); - // assign root edges to blocks and populate shared with inital states - // must have less input than shared space - for (int root_edge_idx = threadIdx.x; root_edge_idx < nStates; - root_edge_idx += blockDim.x) { - - int edge_idx = d_level_views[block_start + root_edge_idx]; - char level = d_levels[edge_idx]; - if (level == -1) { - continue; - } - int edge_pos = edge_size * edge_idx; - - float4 node1_params = - d_sp_params[d_output_graph[edge_pos + - traccc::device::gbts_consts::node1]]; - float4 node2_params = - d_sp_params[d_output_graph[edge_pos + - traccc::device::gbts_consts::node2]]; - - int root_idx = atomicAdd(&total_live_states, 1); - current_states[root_idx].initialize(node1_params, node2_params); - current_states[root_idx].m_edge_idx = edge_idx; - - unsigned int mini_idx = atomicAdd(&d_counters[7], 1); - d_mini_states[mini_idx] = - make_int2(edge_idx, -1); // prev mini -1 for roots with no prev - current_states[root_idx].m_mini_idx = mini_idx; + // only keep long enough seeds + if (length < minLevel) { + return; } - __syncthreads(); - if (threadIdx.x == 0) { - nStates = total_live_states; // update after removed edges are exculded + int qual = 0; + if (toggle) { + qual = static_cast(seed_extraction_params.qual_scale * state2.m_J); + } else { + qual = static_cast(seed_extraction_params.qual_scale * state1.m_J); } - edgeState state; - edgeState new_state; - - __syncthreads(); - while (total_live_states > 0) { - // propogate to next level - bool has_state = false; - if (threadIdx.x < nStates) { - state = current_states[nStates - 1 - threadIdx.x]; - has_state = true; - } else if (threadIdx.x < total_live_states) { - state = d_state_store[total_live_states - 1 - threadIdx.x + - nMaxStateStorePerBlock * blockIdx.x]; - has_state = true; - } - __syncthreads(); - if (threadIdx.x == 0) { // update state counts - total_live_states = (total_live_states < blockDim.x) - ? 0 - : total_live_states - blockDim.x; - - nStates = (nStates < blockDim.x) ? 0 : nStates - blockDim.x; - nSharedSpace = total_live_states - nStates; - // total state count when shared memory is - // filled - max shared states - } - __syncthreads(); - if (has_state) { - - int edge_idx = state.m_edge_idx; - - int edge_pos = edge_idx * edge_size; - - unsigned char nNei = - d_output_graph[edge_pos + traccc::device::gbts_consts::nNei]; - - char edge_level = d_levels[edge_idx]; - - bool no_updates = true; - for (unsigned char nei = 0; nei < nNei; nei++) { + int prop_idx = atomicAdd(&d_counters[8], 1); + // perform first round of bidding for disambiguation + // only on the outermost edge + add_seed_proposal(qual, path_idx, prop_idx, d_seed_ambiguity, + d_seed_proposals, d_edge_bids, d_path_store, 1); +} - int nei_idx = - d_output_graph[edge_pos + - traccc::device::gbts_consts::nei_start + - nei]; +void __global__ reset_edge_bids(int2* d_path_store, int2* d_seed_proposals, + unsigned long long int* d_edge_bids, + char* d_seed_ambiguity, + unsigned int* d_counters, int round) { - char nei_level = d_levels[nei_idx]; - if (edge_level - 1 != nei_level) { - continue; - } - float4 node1_params = d_sp_params - [d_output_graph[edge_size * nei_idx + - traccc::device::gbts_consts::node1]]; - bool success = update(&new_state, &state, node1_params); + int nProps = d_counters[8]; + // first round find best seed starting at each edge + for (int prop_idx = threadIdx.x + blockIdx.x * blockDim.x; + prop_idx < nProps; prop_idx += blockDim.x * gridDim.x) { - if (!success) { - continue; - } - no_updates = false; - - new_state.m_edge_idx = nei_idx; - new_state.m_mini_idx = atomicAdd(&d_counters[7], 1); - - if (new_state.m_mini_idx < nMaxMini) { - d_mini_states[new_state.m_mini_idx] = - make_int2(nei_idx, state.m_mini_idx); - if (d_output_graph[edge_size * nei_idx + - traccc::device::gbts_consts::nNei] == - 0) { - - // no neighbours so will fail next round anyway - // so save shared - if (new_state.m_length >= minLevel) { - unsigned prop_idx = atomicAdd(&d_counters[8], 1); - if (prop_idx < nMaxProps) { - add_seed_proposal(new_state.m_J, - new_state.m_mini_idx, - prop_idx, d_seed_ambiguity, - d_seed_proposals, d_edge_bids, - d_mini_states); - } - } - - } else { - int stateStoreIdx = atomicAdd(&total_live_states, 1) - - traccc::device::gbts_consts:: - shared_state_buffer_size; - - if (stateStoreIdx < nSharedSpace) { - current_states[atomicAdd(&nStates, 1)] = new_state; - } else { - if (stateStoreIdx < nMaxStateStorePerBlock) { - d_state_store[stateStoreIdx + - nMaxStateStorePerBlock * - blockIdx.x] = new_state; - } else { - d_counters[10] = stateStoreIdx; - } - } - } - } - } - if (no_updates) { - if (state.m_length >= minLevel) { - unsigned int prop_idx = atomicAdd(&d_counters[8], 1); - if (prop_idx < nMaxProps) { - add_seed_proposal(state.m_J, state.m_mini_idx, prop_idx, - d_seed_ambiguity, d_seed_proposals, - d_edge_bids, d_mini_states); - } - } + char ambi = d_seed_ambiguity[prop_idx]; + if (round == -1) { + if (ambi == 0) { + // rebid 'best seed from edge' in later rounds + d_seed_ambiguity[prop_idx] = 1; + continue; + } else { + d_seed_ambiguity[prop_idx] = -2; + continue; } - } // if has state - __syncthreads(); // wait for current_states to repopulate - } - __syncthreads(); - // move remianing seed props to seeds after - // all tracking for this set is done - if (threadIdx.x == 0) { - nStates = atomicAdd(&d_counters[11], 1); - } - __syncthreads(); - if (nStates != gridDim.x - 1) { - return; - } - unsigned int nProps = d_counters[8]; - __syncthreads(); - // reset for next launch - if (threadIdx.x == 0) { - // exit if any overflows have occured - if (nProps > nMaxProps || d_counters[7] > nMaxMini || - d_counters[10] != 0) { - - nStates = 0; - } else { - nStates = 1; - } - d_counters[11] = 0; - d_counters[10] = 0; - d_counters[7] = 0; - d_counters[8] = 0; - } - __syncthreads(); - if (nProps == 0 || nStates == 0) { - return; - } - for (int round = 0; round < 5 && nStates > 0; round++) { - // re-check maybe seeds that don't clash with a definte seed - if (threadIdx.x == 0) { - nStates = 0; + } else if (ambi == -2 | ambi == 0) { + // only reset maybes + continue; } - __syncthreads(); // fit maybe seeds into unused spaces - for (unsigned int prop_idx = threadIdx.x; prop_idx < nProps; - prop_idx += blockDim.x) { - - char ambiguity = d_seed_ambiguity[prop_idx]; - if (ambiguity == 0 || ambiguity == -2) { - continue; // is not ambiguous - } - int2 prop = d_seed_proposals[prop_idx]; + int2 prop = d_seed_proposals[prop_idx]; - bool isgood = true; + bool isgood = true; - int2 mini_state; - for (int next_mini = prop.y; next_mini >= 0;) { - mini_state = d_mini_states[next_mini]; - next_mini = mini_state.y; + // dummy path to start the loop + int2 path = make_int2(0, prop.y); + while (path.y >= 0) { + path = d_path_store[path.y]; - unsigned long long int best_bid = d_edge_bids[mini_state.x]; - if (best_bid == 0) { - continue; // already reset - } - if (d_seed_ambiguity[best_bid & 0xFFFFFFFFLL] == 0) { - isgood = false; - break; - } // clashes with definate seed - d_edge_bids[mini_state.x] = - 0; // reset edge bid from (possibly) fake seed + unsigned long long int best_bid = d_edge_bids[path.x]; + if (d_seed_ambiguity[best_bid & 0xFFFFFFFFLL] == 0) { + isgood = false; + break; } - if (isgood) { - d_seed_ambiguity[prop_idx] = 1; - atomicAdd(&nStates, 1); - } // flag as maybe seed - else - d_seed_ambiguity[prop_idx] = -2; // definate fake } - __syncthreads(); - for (unsigned int prop_idx = threadIdx.x; prop_idx < nProps; - prop_idx += blockDim.x) { - if (d_seed_ambiguity[prop_idx] != 1) { - continue; - } - int2 prop = d_seed_proposals[prop_idx]; - - add_seed_proposal(prop.x, prop.y, prop_idx, d_seed_ambiguity, - d_seed_proposals, d_edge_bids, - d_mini_states); // reset and re bid + if (isgood) { + d_seed_ambiguity[prop_idx] = 1; + } // flag as maybe seed + else { + d_seed_ambiguity[prop_idx] = -2; + // definate fake (never the best) } - __syncthreads(); } - __syncthreads(); - for (unsigned int prop_idx = threadIdx.x; prop_idx < nProps; - prop_idx += blockDim.x) { - if (d_seed_ambiguity[prop_idx] != 0) { +} + +// TO-DO?: reset prop count each iter and make new props like CCA active_edges +void __global__ seeds_rebid_for_edges(int2* d_path_store, + int2* d_seed_proposals, + unsigned long long int* d_edge_bids, + char* d_seed_ambiguity, + unsigned int nProps) { + + for (int prop_idx = threadIdx.x + blockIdx.x * blockDim.x; + prop_idx < nProps; prop_idx += blockDim.x * gridDim.x) { + + char ambi = d_seed_ambiguity[prop_idx]; + if (ambi == -2 | ambi == 0) { + // only rebid for maybes continue; } int2 prop = d_seed_proposals[prop_idx]; - unsigned int seed_idx = atomicAdd(&d_counters[9], 1); - if (seed_idx > nMaxSeeds) { - break; - } - // add good seed to output - int2 mini_state; - int length = 0; - for (int next_mini = prop.y; next_mini >= 0; length++) { - - mini_state = d_mini_states[next_mini]; - next_mini = mini_state.y; - - d_seeds[seed_idx].nodes[length] = - d_output_graph[mini_state.x * edge_size + - traccc::device::gbts_consts::node1]; - d_levels[mini_state.x] = -1; // remove edge from graph - } - d_seeds[seed_idx].nodes[length] = - d_output_graph[mini_state.x * edge_size + - traccc::device::gbts_consts::node2]; - d_seeds[seed_idx].size = ++length; + add_seed_proposal(prop.x, prop.y, prop_idx, d_seed_ambiguity, + d_seed_proposals, d_edge_bids, d_path_store, -1); } - __syncthreads(); } void __global__ gbts_seed_conversion_kernel( - Tracklet* d_seeds, edm::seed_collection::view output_seeds, - const unsigned int nSeeds) { + int2* d_seed_proposals, char* d_seed_ambiguity, int2* d_path_store, + int* d_output_graph, edm::seed_collection::view output_seeds, + const unsigned int nProps, const unsigned int max_num_neighbours) { + + int edge_size = 2 + 1 + max_num_neighbours; edm::seed_collection::device seeds_device(output_seeds); - for (int tracklet = threadIdx.x + blockIdx.x * blockDim.x; - tracklet < nSeeds; tracklet += blockDim.x * gridDim.x) { - int length = d_seeds[tracklet].size; + for (int prop_idx = threadIdx.x + blockIdx.x * blockDim.x; + prop_idx < nProps; prop_idx += blockDim.x * gridDim.x) { + + if (d_seed_ambiguity[prop_idx] == -2) { + // drop seeds that lost the bidding + continue; + } + Tracklet seed; + seed.size = 0; + // dummy path to start the loop + int2 path = make_int2(0, d_seed_proposals[prop_idx].y); + while (path.y >= 0) { + path = d_path_store[path.y]; + seed.nodes[seed.size++] = + d_output_graph[traccc::device::gbts_consts::node1 + + edge_size * path.x]; + } + seed.nodes[seed.size++] = + d_output_graph[traccc::device::gbts_consts::node2 + + edge_size * path.x]; // sample begining, middle, end sp from tracklet for now - seeds_device.push_back({d_seeds[tracklet].nodes[0], - d_seeds[tracklet].nodes[length / 2], - d_seeds[tracklet].nodes[length - 1]}); + seeds_device.push_back({seed.nodes[seed.size - 1], + seed.nodes[(1 + seed.size) / 2 - 1], + seed.nodes[0]}); } } diff --git a/device/cuda/src/gbts_seeding/kernels/GbtsNodesMakingKernels.cuh b/device/cuda/src/gbts_seeding/kernels/GbtsNodesMakingKernels.cuh index 8cb3402179..d7ba61bda3 100644 --- a/device/cuda/src/gbts_seeding/kernels/GbtsNodesMakingKernels.cuh +++ b/device/cuda/src/gbts_seeding/kernels/GbtsNodesMakingKernels.cuh @@ -281,9 +281,9 @@ __global__ void eta_phi_prefix_sum_kernel(const int* d_eta_node_counter, __global__ void node_sorting_kernel( const float4* d_sp_params, const int* d_node_eta_index, const int* d_node_phi_index, int* d_phi_cusums, float* d_node_params, - int* d_node_index, int* d_original_sp_idx, const gbts_algo_params* ap, - const unsigned int nNodesPerBlock, const unsigned int nNodes, - const unsigned int nPhiBins) { + int* d_node_index, int* d_original_sp_idx, + const gbts_graph_building_params* ap, const unsigned int nNodesPerBlock, + const unsigned int nNodes, const unsigned int nPhiBins) { int begin_node = blockIdx.x * nNodesPerBlock;