@@ -1499,15 +1499,6 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
14991499 unsigned int thirdSegmentIndex = triplets.segmentIndices ()[outerTripletIndex][0 ];
15001500 unsigned int fourthSegmentIndex = triplets.segmentIndices ()[outerTripletIndex][1 ];
15011501
1502- unsigned int innerOuterOuterMiniDoubletIndex =
1503- segments.mdIndices ()[secondSegmentIndex][1 ]; // inner triplet outer segment outer MD index
1504- unsigned int outerInnerInnerMiniDoubletIndex =
1505- segments.mdIndices ()[thirdSegmentIndex][0 ]; // outer triplet inner segment inner MD index
1506-
1507- // this cut reduces the number of candidates by a factor of 3, i.e., 2 out of 3 warps can end right here!
1508- if (innerOuterOuterMiniDoubletIndex != outerInnerInnerMiniDoubletIndex)
1509- return false ;
1510-
15111502 unsigned int firstMDIndex = segments.mdIndices ()[firstSegmentIndex][0 ];
15121503 unsigned int secondMDIndex = segments.mdIndices ()[secondSegmentIndex][0 ];
15131504 unsigned int thirdMDIndex = segments.mdIndices ()[secondSegmentIndex][1 ];
@@ -1699,8 +1690,27 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
16991690 const float ptCut) const {
17001691 ALPAKA_ASSERT_ACC ((alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[1 ] == 1 ) &&
17011692 (alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[2 ] == 1 ));
1693+
1694+ int & matchCount = alpaka::declareSharedVar<int , __COUNTER__>(acc);
1695+
1696+ const auto threadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
1697+ const auto blockDim = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
1698+
1699+ const int threadIdX = threadIdx.x ();
1700+ const int threadIdY = threadIdx.y ();
1701+ const int blockSizeX = blockDim.x ();
1702+ const int blockSizeY = blockDim.y ();
1703+ const int blockSize = blockSizeX * blockSizeY;
1704+ const int flatThreadIdxXY = threadIdY * blockSizeX + threadIdX;
1705+ const int flatThreadExtent = blockSize; // total threads per block
1706+
17021707 for (int iter : cms::alpakatools::uniform_groups_z (acc, nEligibleT5Modules)) {
17031708 uint16_t lowerModule1 = ranges.indicesOfEligibleT5Modules ()[iter];
1709+
1710+ if (cms::alpakatools::once_per_block (acc)) {
1711+ matchCount = 0 ;
1712+ }
1713+
17041714 short layer2_adjustment;
17051715 int layer = modules.layers ()[lowerModule1];
17061716 if (layer == 1 ) {
@@ -1712,107 +1722,158 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
17121722 else {
17131723 continue ;
17141724 }
1725+
17151726 unsigned int nInnerTriplets = tripletsOccupancy.nTriplets ()[lowerModule1];
1727+
1728+ alpaka::syncBlockThreads (acc);
1729+
1730+ // Step 1: Make inner and outer triplet pairs
17161731 for (unsigned int innerTripletArrayIndex : cms::alpakatools::uniform_elements_y (acc, nInnerTriplets)) {
17171732 unsigned int innerTripletIndex = ranges.tripletModuleIndices ()[lowerModule1] + innerTripletArrayIndex;
1718- uint16_t lowerModule2 = triplets.lowerModuleIndices ()[innerTripletIndex][1 ];
17191733 uint16_t lowerModule3 = triplets.lowerModuleIndices ()[innerTripletIndex][2 ];
17201734 unsigned int nOuterTriplets = tripletsOccupancy.nTriplets ()[lowerModule3];
17211735 for (unsigned int outerTripletArrayIndex : cms::alpakatools::uniform_elements_x (acc, nOuterTriplets)) {
17221736 unsigned int outerTripletIndex = ranges.tripletModuleIndices ()[lowerModule3] + outerTripletArrayIndex;
1723- uint16_t lowerModule4 = triplets.lowerModuleIndices ()[outerTripletIndex][1 ];
1724- uint16_t lowerModule5 = triplets.lowerModuleIndices ()[outerTripletIndex][2 ];
1725-
1726- float innerRadius, outerRadius, bridgeRadius, regressionCenterX, regressionCenterY, regressionRadius,
1727- rzChiSquared, chiSquared, nonAnchorChiSquared, dBeta1, dBeta2; // required for making distributions
1728-
1729- float t5Embed[Params_T5::kEmbed ] = {0 .f };
1730-
1731- bool tightCutFlag = false ;
1732- bool success = runQuintupletDefaultAlgo (acc,
1733- modules,
1734- mds,
1735- segments,
1736- triplets,
1737- lowerModule1,
1738- lowerModule2,
1739- lowerModule3,
1740- lowerModule4,
1741- lowerModule5,
1742- innerTripletIndex,
1743- outerTripletIndex,
1744- innerRadius,
1745- outerRadius,
1746- bridgeRadius,
1747- regressionCenterX,
1748- regressionCenterY,
1749- regressionRadius,
1750- rzChiSquared,
1751- chiSquared,
1752- nonAnchorChiSquared,
1753- dBeta1,
1754- dBeta2,
1755- tightCutFlag,
1756- t5Embed,
1757- ptCut);
1758-
1759- if (success) {
1760- int totOccupancyQuintuplets = alpaka::atomicAdd (
1761- acc, &quintupletsOccupancy.totOccupancyQuintuplets ()[lowerModule1], 1u , alpaka::hierarchy::Threads{});
1762- if (totOccupancyQuintuplets >= ranges.quintupletModuleOccupancy ()[lowerModule1]) {
1737+
1738+ unsigned int secondSegmentIndex = triplets.segmentIndices ()[innerTripletIndex][1 ];
1739+ unsigned int thirdSegmentIndex = triplets.segmentIndices ()[outerTripletIndex][0 ];
1740+
1741+ unsigned int miniDoublet3Index =
1742+ segments.mdIndices ()[secondSegmentIndex][1 ]; // inner triplet outer segment outer MD index
1743+ unsigned int outerInnerInnerMiniDoubletIndex =
1744+ segments.mdIndices ()[thirdSegmentIndex][0 ]; // outer triplet inner segment inner MD index
1745+
1746+ // this cut reduces the number of candidates by a factor of 3, i.e., 2 out of 3 warps can end right here!
1747+ if (miniDoublet3Index != outerInnerInnerMiniDoubletIndex)
1748+ continue ;
1749+
1750+ // Match inner Sg and Outer Sg
1751+ int mIdx = alpaka::atomicAdd (acc, &matchCount, 1 , alpaka::hierarchy::Threads{});
1752+ unsigned int quintupletIndex = ranges.quintupletModuleIndices ()[lowerModule1] + mIdx ;
1753+
17631754#ifdef WARNINGS
1764- printf (" Quintuplet excess alert! Module index = %d, Occupancy = %d\n " ,
1765- lowerModule1,
1766- totOccupancyQuintuplets);
1755+ const unsigned int rightBound =
1756+ static_cast <unsigned int >(ranges.quintupletModuleIndices ()[lowerModule1 + 1 ]);
1757+ if (quintupletIndex >= rightBound) {
1758+ printf (
1759+ " Quintuplet module occupancy alert! module quintuplet starting index = %d, Pair quintuplet index = "
1760+ " %d, next module quintuplet starting index = %d\n " ,
1761+ ranges.quintupletModuleIndices ()[lowerModule1],
1762+ mIdx ,
1763+ ranges.quintupletModuleIndices ()[lowerModule1 + 1 ]);
1764+ }
17671765#endif
1768- } else {
1769- int quintupletModuleIndex = alpaka::atomicAdd (
1770- acc, &quintupletsOccupancy.nQuintuplets ()[lowerModule1], 1u , alpaka::hierarchy::Threads{});
1771- // this if statement should never get executed!
1772- if (ranges.quintupletModuleIndices ()[lowerModule1] == -1 ) {
1766+
1767+ quintuplets.preAllocatedTripletIndices ()[quintupletIndex][0 ] = innerTripletIndex;
1768+ quintuplets.preAllocatedTripletIndices ()[quintupletIndex][1 ] = outerTripletIndex;
1769+ }
1770+ }
1771+
1772+ alpaka::syncBlockThreads (acc);
1773+ if (matchCount == 0 ) {
1774+ continue ;
1775+ }
1776+
1777+ // Step 2: Parallel processing of triplet pairs
1778+ for (int i = flatThreadIdxXY; i < matchCount; i += flatThreadExtent) {
1779+ unsigned int quintupletIndex = ranges.quintupletModuleIndices ()[lowerModule1] + i;
1780+ int innerTripletIndex = quintuplets.preAllocatedTripletIndices ()[quintupletIndex][0 ];
1781+ int outerTripletIndex = quintuplets.preAllocatedTripletIndices ()[quintupletIndex][1 ];
1782+
1783+ uint16_t lowerModule2 = triplets.lowerModuleIndices ()[innerTripletIndex][1 ];
1784+ uint16_t lowerModule3 = triplets.lowerModuleIndices ()[innerTripletIndex][2 ];
1785+ uint16_t lowerModule4 = triplets.lowerModuleIndices ()[outerTripletIndex][1 ];
1786+ uint16_t lowerModule5 = triplets.lowerModuleIndices ()[outerTripletIndex][2 ];
1787+
1788+ float innerRadius, outerRadius, bridgeRadius, regressionCenterX, regressionCenterY, regressionRadius,
1789+ rzChiSquared, chiSquared, nonAnchorChiSquared, dBeta1, dBeta2; // required for making distributions
1790+
1791+ float t5Embed[Params_T5::kEmbed ] = {0 .f };
1792+
1793+ bool tightCutFlag = false ;
1794+
1795+ bool success = runQuintupletDefaultAlgo (acc,
1796+ modules,
1797+ mds,
1798+ segments,
1799+ triplets,
1800+ lowerModule1,
1801+ lowerModule2,
1802+ lowerModule3,
1803+ lowerModule4,
1804+ lowerModule5,
1805+ innerTripletIndex,
1806+ outerTripletIndex,
1807+ innerRadius,
1808+ outerRadius,
1809+ bridgeRadius,
1810+ regressionCenterX,
1811+ regressionCenterY,
1812+ regressionRadius,
1813+ rzChiSquared,
1814+ chiSquared,
1815+ nonAnchorChiSquared,
1816+ dBeta1,
1817+ dBeta2,
1818+ tightCutFlag,
1819+ t5Embed,
1820+ ptCut);
1821+ if (success) {
1822+ int totOccupancyQuintuplets = alpaka::atomicAdd (
1823+ acc, &quintupletsOccupancy.totOccupancyQuintuplets ()[lowerModule1], 1u , alpaka::hierarchy::Threads{});
1824+ if (totOccupancyQuintuplets >= ranges.quintupletModuleOccupancy ()[lowerModule1]) {
1825+ #ifdef WARNINGS
1826+ printf (" Quintuplet excess alert! Module index = %d, Occupancy = %d\n " ,
1827+ lowerModule1,
1828+ totOccupancyQuintuplets);
1829+ #endif
1830+ } else {
1831+ int quintupletModuleIndex = alpaka::atomicAdd (
1832+ acc, &quintupletsOccupancy.nQuintuplets ()[lowerModule1], 1u , alpaka::hierarchy::Threads{});
1833+ // this if statement should never get executed!
1834+ if (ranges.quintupletModuleIndices ()[lowerModule1] == -1 ) {
17731835#ifdef WARNINGS
1774- printf (" Quintuplets : no memory for module at module index = %d\n " , lowerModule1);
1836+ printf (" Quintuplets : no memory for module at module index = %d\n " , lowerModule1);
17751837#endif
1776- } else {
1777- unsigned int quintupletIndex = ranges.quintupletModuleIndices ()[lowerModule1] + quintupletModuleIndex;
1778- float phi = mds.anchorPhi ()[segments.mdIndices ()[triplets.segmentIndices ()[innerTripletIndex][0 ]]
1779- [layer2_adjustment]];
1780- float eta = mds.anchorEta ()[segments.mdIndices ()[triplets.segmentIndices ()[innerTripletIndex][0 ]]
1781- [layer2_adjustment]];
1782- float pt = (innerRadius + outerRadius) * k2Rinv1GeVf;
1783- float scores = chiSquared + nonAnchorChiSquared;
1784- addQuintupletToMemory (triplets,
1785- quintuplets,
1786- innerTripletIndex,
1787- outerTripletIndex,
1788- lowerModule1,
1789- lowerModule2,
1790- lowerModule3,
1791- lowerModule4,
1792- lowerModule5,
1793- innerRadius,
1794- bridgeRadius,
1795- outerRadius,
1796- regressionCenterX,
1797- regressionCenterY,
1798- regressionRadius,
1799- rzChiSquared,
1800- chiSquared,
1801- nonAnchorChiSquared,
1802- dBeta1,
1803- dBeta2,
1804- pt,
1805- eta,
1806- phi,
1807- scores,
1808- layer,
1809- quintupletIndex,
1810- t5Embed,
1811- tightCutFlag);
1812-
1813- triplets.partOfT5 ()[quintuplets.tripletIndices ()[quintupletIndex][0 ]] = true ;
1814- triplets.partOfT5 ()[quintuplets.tripletIndices ()[quintupletIndex][1 ]] = true ;
1815- }
1838+ } else {
1839+ unsigned int quintupletIndex = ranges.quintupletModuleIndices ()[lowerModule1] + quintupletModuleIndex;
1840+ float phi = mds.anchorPhi ()[segments.mdIndices ()[triplets.segmentIndices ()[innerTripletIndex][0 ]]
1841+ [layer2_adjustment]];
1842+ float eta = mds.anchorEta ()[segments.mdIndices ()[triplets.segmentIndices ()[innerTripletIndex][0 ]]
1843+ [layer2_adjustment]];
1844+ float pt = (innerRadius + outerRadius) * k2Rinv1GeVf;
1845+ float scores = chiSquared + nonAnchorChiSquared;
1846+ addQuintupletToMemory (triplets,
1847+ quintuplets,
1848+ innerTripletIndex,
1849+ outerTripletIndex,
1850+ lowerModule1,
1851+ lowerModule2,
1852+ lowerModule3,
1853+ lowerModule4,
1854+ lowerModule5,
1855+ innerRadius,
1856+ bridgeRadius,
1857+ outerRadius,
1858+ regressionCenterX,
1859+ regressionCenterY,
1860+ regressionRadius,
1861+ rzChiSquared,
1862+ chiSquared,
1863+ nonAnchorChiSquared,
1864+ dBeta1,
1865+ dBeta2,
1866+ pt,
1867+ eta,
1868+ phi,
1869+ scores,
1870+ layer,
1871+ quintupletIndex,
1872+ t5Embed,
1873+ tightCutFlag);
1874+
1875+ triplets.partOfT5 ()[quintuplets.tripletIndices ()[quintupletIndex][0 ]] = true ;
1876+ triplets.partOfT5 ()[quintuplets.tripletIndices ()[quintupletIndex][1 ]] = true ;
18161877 }
18171878 }
18181879 }
0 commit comments