Skip to content

Commit 7c4b566

Browse files
committed
Reshape LST T3 and T5 kernels to increase the active threads per warp
1 parent 1db657f commit 7c4b566

File tree

4 files changed

+392
-514
lines changed

4 files changed

+392
-514
lines changed

RecoTracker/LSTCore/interface/QuintupletsSoA.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99

1010
namespace lst {
1111
GENERATE_SOA_LAYOUT(QuintupletsSoALayout,
12-
SOA_COLUMN(ArrayUx2, tripletIndices), // inner and outer triplet indices
12+
SOA_COLUMN(ArrayUx2,
13+
preAllocatedTripletIndices), // pre-allocated the theoretical max triplet indices
14+
SOA_COLUMN(ArrayUx2, tripletIndices), // inner and outer triplet indices
1315
SOA_COLUMN(Params_T5::ArrayU16xLayers, lowerModuleIndices), // lower module index in each layer
1416
SOA_COLUMN(Params_T5::ArrayU8xLayers, logicalLayers), // layer ID
1517
SOA_COLUMN(Params_T5::ArrayUxHits, hitIndices), // hit indices

RecoTracker/LSTCore/interface/TripletsSoA.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99

1010
namespace lst {
1111
GENERATE_SOA_LAYOUT(TripletsSoALayout,
12-
SOA_COLUMN(ArrayUx2, segmentIndices), // inner and outer segment indices
12+
SOA_COLUMN(ArrayUx2,
13+
preAllocatedSegmentIndices), // pre-allocated the theoretical max segment indices
14+
SOA_COLUMN(ArrayUx2, segmentIndices), // inner and outer segment indices
1315
SOA_COLUMN(Params_T3::ArrayU16xLayers, lowerModuleIndices), // lower module index in each layer
1416
SOA_COLUMN(Params_T3::ArrayU8xLayers, logicalLayers), // layer ID
1517
SOA_COLUMN(Params_T3::ArrayUxHits, hitIndices), // hit indices
@@ -22,8 +24,6 @@ namespace lst {
2224
SOA_COLUMN(float, displacedScore), // DNN confidence score for real (displaced) t3
2325
SOA_COLUMN(unsigned int, connectedMax), // number of outer-triplets that pass the MD-equality cut
2426
#ifdef CUT_VALUE_DEBUG
25-
SOA_COLUMN(float, zOut),
26-
SOA_COLUMN(float, rtOut),
2727
SOA_COLUMN(float, betaInCut),
2828
#endif
2929
SOA_COLUMN(bool, partOfPT5), // is it used in a pT5

RecoTracker/LSTCore/src/alpaka/Quintuplet.h

Lines changed: 160 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -1499,15 +1499,6 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
14991499
unsigned int thirdSegmentIndex = triplets.segmentIndices()[outerTripletIndex][0];
15001500
unsigned int fourthSegmentIndex = triplets.segmentIndices()[outerTripletIndex][1];
15011501

1502-
unsigned int innerOuterOuterMiniDoubletIndex =
1503-
segments.mdIndices()[secondSegmentIndex][1]; //inner triplet outer segment outer MD index
1504-
unsigned int outerInnerInnerMiniDoubletIndex =
1505-
segments.mdIndices()[thirdSegmentIndex][0]; //outer triplet inner segment inner MD index
1506-
1507-
//this cut reduces the number of candidates by a factor of 3, i.e., 2 out of 3 warps can end right here!
1508-
if (innerOuterOuterMiniDoubletIndex != outerInnerInnerMiniDoubletIndex)
1509-
return false;
1510-
15111502
unsigned int firstMDIndex = segments.mdIndices()[firstSegmentIndex][0];
15121503
unsigned int secondMDIndex = segments.mdIndices()[secondSegmentIndex][0];
15131504
unsigned int thirdMDIndex = segments.mdIndices()[secondSegmentIndex][1];
@@ -1699,8 +1690,27 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
16991690
const float ptCut) const {
17001691
ALPAKA_ASSERT_ACC((alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[1] == 1) &&
17011692
(alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[2] == 1));
1693+
1694+
int& matchCount = alpaka::declareSharedVar<int, __COUNTER__>(acc);
1695+
1696+
const auto threadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
1697+
const auto blockDim = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
1698+
1699+
const int threadIdX = threadIdx.x();
1700+
const int threadIdY = threadIdx.y();
1701+
const int blockSizeX = blockDim.x();
1702+
const int blockSizeY = blockDim.y();
1703+
const int blockSize = blockSizeX * blockSizeY;
1704+
const int flatThreadIdxXY = threadIdY * blockSizeX + threadIdX;
1705+
const int flatThreadExtent = blockSize; // total threads per block
1706+
17021707
for (int iter : cms::alpakatools::uniform_groups_z(acc, nEligibleT5Modules)) {
17031708
uint16_t lowerModule1 = ranges.indicesOfEligibleT5Modules()[iter];
1709+
1710+
if (cms::alpakatools::once_per_block(acc)) {
1711+
matchCount = 0;
1712+
}
1713+
17041714
short layer2_adjustment;
17051715
int layer = modules.layers()[lowerModule1];
17061716
if (layer == 1) {
@@ -1712,107 +1722,158 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
17121722
else {
17131723
continue;
17141724
}
1725+
17151726
unsigned int nInnerTriplets = tripletsOccupancy.nTriplets()[lowerModule1];
1727+
1728+
alpaka::syncBlockThreads(acc);
1729+
1730+
// Step 1: Make inner and outer triplet pairs
17161731
for (unsigned int innerTripletArrayIndex : cms::alpakatools::uniform_elements_y(acc, nInnerTriplets)) {
17171732
unsigned int innerTripletIndex = ranges.tripletModuleIndices()[lowerModule1] + innerTripletArrayIndex;
1718-
uint16_t lowerModule2 = triplets.lowerModuleIndices()[innerTripletIndex][1];
17191733
uint16_t lowerModule3 = triplets.lowerModuleIndices()[innerTripletIndex][2];
17201734
unsigned int nOuterTriplets = tripletsOccupancy.nTriplets()[lowerModule3];
17211735
for (unsigned int outerTripletArrayIndex : cms::alpakatools::uniform_elements_x(acc, nOuterTriplets)) {
17221736
unsigned int outerTripletIndex = ranges.tripletModuleIndices()[lowerModule3] + outerTripletArrayIndex;
1723-
uint16_t lowerModule4 = triplets.lowerModuleIndices()[outerTripletIndex][1];
1724-
uint16_t lowerModule5 = triplets.lowerModuleIndices()[outerTripletIndex][2];
1725-
1726-
float innerRadius, outerRadius, bridgeRadius, regressionCenterX, regressionCenterY, regressionRadius,
1727-
rzChiSquared, chiSquared, nonAnchorChiSquared, dBeta1, dBeta2; //required for making distributions
1728-
1729-
float t5Embed[Params_T5::kEmbed] = {0.f};
1730-
1731-
bool tightCutFlag = false;
1732-
bool success = runQuintupletDefaultAlgo(acc,
1733-
modules,
1734-
mds,
1735-
segments,
1736-
triplets,
1737-
lowerModule1,
1738-
lowerModule2,
1739-
lowerModule3,
1740-
lowerModule4,
1741-
lowerModule5,
1742-
innerTripletIndex,
1743-
outerTripletIndex,
1744-
innerRadius,
1745-
outerRadius,
1746-
bridgeRadius,
1747-
regressionCenterX,
1748-
regressionCenterY,
1749-
regressionRadius,
1750-
rzChiSquared,
1751-
chiSquared,
1752-
nonAnchorChiSquared,
1753-
dBeta1,
1754-
dBeta2,
1755-
tightCutFlag,
1756-
t5Embed,
1757-
ptCut);
1758-
1759-
if (success) {
1760-
int totOccupancyQuintuplets = alpaka::atomicAdd(
1761-
acc, &quintupletsOccupancy.totOccupancyQuintuplets()[lowerModule1], 1u, alpaka::hierarchy::Threads{});
1762-
if (totOccupancyQuintuplets >= ranges.quintupletModuleOccupancy()[lowerModule1]) {
1737+
1738+
unsigned int secondSegmentIndex = triplets.segmentIndices()[innerTripletIndex][1];
1739+
unsigned int thirdSegmentIndex = triplets.segmentIndices()[outerTripletIndex][0];
1740+
1741+
unsigned int miniDoublet3Index =
1742+
segments.mdIndices()[secondSegmentIndex][1]; //inner triplet outer segment outer MD index
1743+
unsigned int outerInnerInnerMiniDoubletIndex =
1744+
segments.mdIndices()[thirdSegmentIndex][0]; //outer triplet inner segment inner MD index
1745+
1746+
//this cut reduces the number of candidates by a factor of 3, i.e., 2 out of 3 warps can end right here!
1747+
if (miniDoublet3Index != outerInnerInnerMiniDoubletIndex)
1748+
continue;
1749+
1750+
// Match inner Sg and Outer Sg
1751+
int mIdx = alpaka::atomicAdd(acc, &matchCount, 1, alpaka::hierarchy::Threads{});
1752+
unsigned int quintupletIndex = ranges.quintupletModuleIndices()[lowerModule1] + mIdx;
1753+
17631754
#ifdef WARNINGS
1764-
printf("Quintuplet excess alert! Module index = %d, Occupancy = %d\n",
1765-
lowerModule1,
1766-
totOccupancyQuintuplets);
1755+
const unsigned int rightBound =
1756+
static_cast<unsigned int>(ranges.quintupletModuleIndices()[lowerModule1 + 1]);
1757+
if (quintupletIndex >= rightBound) {
1758+
printf(
1759+
"Quintuplet module occupancy alert! module quintuplet starting index = %d, Pair quintuplet index = "
1760+
"%d, next module quintuplet starting index = %d\n",
1761+
ranges.quintupletModuleIndices()[lowerModule1],
1762+
mIdx,
1763+
ranges.quintupletModuleIndices()[lowerModule1 + 1]);
1764+
}
17671765
#endif
1768-
} else {
1769-
int quintupletModuleIndex = alpaka::atomicAdd(
1770-
acc, &quintupletsOccupancy.nQuintuplets()[lowerModule1], 1u, alpaka::hierarchy::Threads{});
1771-
//this if statement should never get executed!
1772-
if (ranges.quintupletModuleIndices()[lowerModule1] == -1) {
1766+
1767+
quintuplets.preAllocatedTripletIndices()[quintupletIndex][0] = innerTripletIndex;
1768+
quintuplets.preAllocatedTripletIndices()[quintupletIndex][1] = outerTripletIndex;
1769+
}
1770+
}
1771+
1772+
alpaka::syncBlockThreads(acc);
1773+
if (matchCount == 0) {
1774+
continue;
1775+
}
1776+
1777+
// Step 2: Parallel processing of triplet pairs
1778+
for (int i = flatThreadIdxXY; i < matchCount; i += flatThreadExtent) {
1779+
unsigned int quintupletIndex = ranges.quintupletModuleIndices()[lowerModule1] + i;
1780+
int innerTripletIndex = quintuplets.preAllocatedTripletIndices()[quintupletIndex][0];
1781+
int outerTripletIndex = quintuplets.preAllocatedTripletIndices()[quintupletIndex][1];
1782+
1783+
uint16_t lowerModule2 = triplets.lowerModuleIndices()[innerTripletIndex][1];
1784+
uint16_t lowerModule3 = triplets.lowerModuleIndices()[innerTripletIndex][2];
1785+
uint16_t lowerModule4 = triplets.lowerModuleIndices()[outerTripletIndex][1];
1786+
uint16_t lowerModule5 = triplets.lowerModuleIndices()[outerTripletIndex][2];
1787+
1788+
float innerRadius, outerRadius, bridgeRadius, regressionCenterX, regressionCenterY, regressionRadius,
1789+
rzChiSquared, chiSquared, nonAnchorChiSquared, dBeta1, dBeta2; //required for making distributions
1790+
1791+
float t5Embed[Params_T5::kEmbed] = {0.f};
1792+
1793+
bool tightCutFlag = false;
1794+
1795+
bool success = runQuintupletDefaultAlgo(acc,
1796+
modules,
1797+
mds,
1798+
segments,
1799+
triplets,
1800+
lowerModule1,
1801+
lowerModule2,
1802+
lowerModule3,
1803+
lowerModule4,
1804+
lowerModule5,
1805+
innerTripletIndex,
1806+
outerTripletIndex,
1807+
innerRadius,
1808+
outerRadius,
1809+
bridgeRadius,
1810+
regressionCenterX,
1811+
regressionCenterY,
1812+
regressionRadius,
1813+
rzChiSquared,
1814+
chiSquared,
1815+
nonAnchorChiSquared,
1816+
dBeta1,
1817+
dBeta2,
1818+
tightCutFlag,
1819+
t5Embed,
1820+
ptCut);
1821+
if (success) {
1822+
int totOccupancyQuintuplets = alpaka::atomicAdd(
1823+
acc, &quintupletsOccupancy.totOccupancyQuintuplets()[lowerModule1], 1u, alpaka::hierarchy::Threads{});
1824+
if (totOccupancyQuintuplets >= ranges.quintupletModuleOccupancy()[lowerModule1]) {
1825+
#ifdef WARNINGS
1826+
printf("Quintuplet excess alert! Module index = %d, Occupancy = %d\n",
1827+
lowerModule1,
1828+
totOccupancyQuintuplets);
1829+
#endif
1830+
} else {
1831+
int quintupletModuleIndex = alpaka::atomicAdd(
1832+
acc, &quintupletsOccupancy.nQuintuplets()[lowerModule1], 1u, alpaka::hierarchy::Threads{});
1833+
//this if statement should never get executed!
1834+
if (ranges.quintupletModuleIndices()[lowerModule1] == -1) {
17731835
#ifdef WARNINGS
1774-
printf("Quintuplets : no memory for module at module index = %d\n", lowerModule1);
1836+
printf("Quintuplets : no memory for module at module index = %d\n", lowerModule1);
17751837
#endif
1776-
} else {
1777-
unsigned int quintupletIndex = ranges.quintupletModuleIndices()[lowerModule1] + quintupletModuleIndex;
1778-
float phi = mds.anchorPhi()[segments.mdIndices()[triplets.segmentIndices()[innerTripletIndex][0]]
1779-
[layer2_adjustment]];
1780-
float eta = mds.anchorEta()[segments.mdIndices()[triplets.segmentIndices()[innerTripletIndex][0]]
1781-
[layer2_adjustment]];
1782-
float pt = (innerRadius + outerRadius) * k2Rinv1GeVf;
1783-
float scores = chiSquared + nonAnchorChiSquared;
1784-
addQuintupletToMemory(triplets,
1785-
quintuplets,
1786-
innerTripletIndex,
1787-
outerTripletIndex,
1788-
lowerModule1,
1789-
lowerModule2,
1790-
lowerModule3,
1791-
lowerModule4,
1792-
lowerModule5,
1793-
innerRadius,
1794-
bridgeRadius,
1795-
outerRadius,
1796-
regressionCenterX,
1797-
regressionCenterY,
1798-
regressionRadius,
1799-
rzChiSquared,
1800-
chiSquared,
1801-
nonAnchorChiSquared,
1802-
dBeta1,
1803-
dBeta2,
1804-
pt,
1805-
eta,
1806-
phi,
1807-
scores,
1808-
layer,
1809-
quintupletIndex,
1810-
t5Embed,
1811-
tightCutFlag);
1812-
1813-
triplets.partOfT5()[quintuplets.tripletIndices()[quintupletIndex][0]] = true;
1814-
triplets.partOfT5()[quintuplets.tripletIndices()[quintupletIndex][1]] = true;
1815-
}
1838+
} else {
1839+
unsigned int quintupletIndex = ranges.quintupletModuleIndices()[lowerModule1] + quintupletModuleIndex;
1840+
float phi = mds.anchorPhi()[segments.mdIndices()[triplets.segmentIndices()[innerTripletIndex][0]]
1841+
[layer2_adjustment]];
1842+
float eta = mds.anchorEta()[segments.mdIndices()[triplets.segmentIndices()[innerTripletIndex][0]]
1843+
[layer2_adjustment]];
1844+
float pt = (innerRadius + outerRadius) * k2Rinv1GeVf;
1845+
float scores = chiSquared + nonAnchorChiSquared;
1846+
addQuintupletToMemory(triplets,
1847+
quintuplets,
1848+
innerTripletIndex,
1849+
outerTripletIndex,
1850+
lowerModule1,
1851+
lowerModule2,
1852+
lowerModule3,
1853+
lowerModule4,
1854+
lowerModule5,
1855+
innerRadius,
1856+
bridgeRadius,
1857+
outerRadius,
1858+
regressionCenterX,
1859+
regressionCenterY,
1860+
regressionRadius,
1861+
rzChiSquared,
1862+
chiSquared,
1863+
nonAnchorChiSquared,
1864+
dBeta1,
1865+
dBeta2,
1866+
pt,
1867+
eta,
1868+
phi,
1869+
scores,
1870+
layer,
1871+
quintupletIndex,
1872+
t5Embed,
1873+
tightCutFlag);
1874+
1875+
triplets.partOfT5()[quintuplets.tripletIndices()[quintupletIndex][0]] = true;
1876+
triplets.partOfT5()[quintuplets.tripletIndices()[quintupletIndex][1]] = true;
18161877
}
18171878
}
18181879
}

0 commit comments

Comments
 (0)