@@ -953,43 +953,65 @@ class AMDGPULowerModuleLDS {
953953 return NewGV;
954954 }
955955
956+ /// Assigns an absolute address for special kinds of GVs like semaphores and
957+ /// barriers. Does this in two rounds: first by assigning a module-absolute
958+ /// address for any GV that is indirectly used by more than one kernel, and
959+ /// second by computing a kernel-relative assignment for any GVs remaining.
956960 bool lowerSpecialLDSVariables (
957961 Module &M, LDSUsesInfoTy &LDSUsesInfo,
958962 VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
959963 bool Changed = false ;
960- constexpr unsigned NumScopes =
961- static_cast <unsigned >(Barrier::Scope::NUM_SCOPES);
962964 const DataLayout &DL = M.getDataLayout ();
965+
966+ unsigned NumSemAbsolutes[MAX_WAVES_PER_WAVEGROUP] = {0 };
967+ constexpr unsigned NumBarScopes =
968+ static_cast <unsigned >(Barrier::Scope::NUM_SCOPES);
969+ unsigned NumBarAbsolutes[NumBarScopes] = {0 };
970+
963971 // The 1st round: give module-absolute assignments
964- unsigned NumAbsolutes[NumScopes] = {0 };
965972 std::vector<GlobalVariable *> OrderedGVs;
966973 for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
967974 GlobalVariable *GV = K.first ;
968- if (!isNamedBarrier (*GV))
975+ if (!( isNamedBarrier (*GV) || isLDSSemaphore (*GV) ))
969976 continue ;
970- // give a module-absolute assignment if it is indirectly accessed by
977+
978+ // Give a module-absolute assignment if it is indirectly accessed by
971979 // multiple kernels. This is not precise, but we don't want to duplicate
972980 // a function when it is called by multiple kernels.
973981 if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size () > 1 ) {
974982 OrderedGVs.push_back (GV);
975983 } else {
976- // leave it to the 2nd round, which will give a kernel-relative
977- // assignment if it is only indirectly accessed by one kernel
984+ // Leave it to the 2nd round, which will give a kernel-relative
985+ // assignment if it is only indirectly accessed by one kernel.
978986 LDSUsesInfo.direct_access [*K.second .begin ()].insert (GV);
979987 }
980988 LDSToKernelsThatNeedToAccessItIndirectly.erase (GV);
981989 }
982990 OrderedGVs = sortByName (std::move (OrderedGVs));
983991 for (GlobalVariable *GV : OrderedGVs) {
984- TargetExtType *ExtTy = isNamedBarrier (*GV);
985- unsigned BarrierScope = ExtTy->getIntParameter (0 );
986- unsigned BarId = NumAbsolutes[BarrierScope] + 1 ;
987- unsigned BarCnt = DL.getTypeAllocSize (GV->getValueType ()) / 16 ;
988- NumAbsolutes[BarrierScope] += BarCnt;
989-
990- // 4 bits for alignment, 5 bits for the barrier num,
991- // 3 bits for the barrier scope
992- unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4 ;
992+ unsigned Offset;
993+ if (TargetExtType *ExtTy = isNamedBarrier (*GV)) {
994+ unsigned BarrierScope = ExtTy->getIntParameter (0 );
995+ unsigned BarId = NumBarAbsolutes[BarrierScope] + 1 ;
996+ unsigned BarCnt = DL.getTypeAllocSize (GV->getValueType ()) / 16 ;
997+ NumBarAbsolutes[BarrierScope] += BarCnt;
998+
999+ // 4 bits for alignment, 5 bits for the barrier num,
1000+ // 3 bits for the barrier scope
1001+ Offset = 0x802000u | BarrierScope << 9 | BarId << 4 ;
1002+
1003+ } else if (TargetExtType *ExtTy = isLDSSemaphore (*GV)) {
1004+ unsigned OwningRank = ExtTy->getIntParameter (0 );
1005+ assert (OwningRank < MAX_WAVES_PER_WAVEGROUP);
1006+ unsigned Num = ++NumSemAbsolutes[OwningRank];
1007+
1008+ // 4 bits for alignment, 4 bits for the semaphore num,
1009+ // 4 bits for the owning rank
1010+ Offset = 0x801000u | OwningRank << 8 | Num << 4 ;
1011+
1012+ } else
1013+ llvm_unreachable (" Unhandled special variable type." );
1014+
9931015 recordLDSAbsoluteAddress (&M, GV, Offset);
9941016 }
9951017 OrderedGVs.clear ();
@@ -1005,32 +1027,52 @@ class AMDGPULowerModuleLDS {
10051027 }
10061028 OrderedKernels = sortByName (std::move (OrderedKernels));
10071029
1008- DenseMap<Function *, unsigned > Kernel2BarId[NumScopes];
1030+ DenseMap<Function *, unsigned > Kernel2BarId[NumBarScopes];
1031+ DenseMap<Function *, unsigned > Kernel2SemRelative[MAX_WAVES_PER_WAVEGROUP];
10091032 for (Function *F : OrderedKernels) {
1033+
1034+ // Collect all globals for each kernel.
10101035 for (GlobalVariable *GV : LDSUsesInfo.direct_access [F]) {
1011- if (!isNamedBarrier (*GV))
1036+ if (!( isNamedBarrier (*GV) || isLDSSemaphore (*GV) ))
10121037 continue ;
10131038
10141039 LDSUsesInfo.direct_access [F].erase (GV);
10151040 if (GV->isAbsoluteSymbolRef ()) {
1016- // already assigned
1041+ // Already assigned.
10171042 continue ;
10181043 }
10191044 OrderedGVs.push_back (GV);
10201045 }
1046+
10211047 OrderedGVs = sortByName (std::move (OrderedGVs));
10221048 for (GlobalVariable *GV : OrderedGVs) {
10231049 // GV could also be used directly by other kernels. If so, we need to
10241050 // create a new GV used only by this kernel and its function.
10251051 auto NewGV = uniquifyGVPerKernel (M, GV, F);
10261052 Changed |= (NewGV != GV);
1027- TargetExtType *ExtTy = isNamedBarrier (*GV);
1028- unsigned BarrierScope = ExtTy->getIntParameter (0 );
1029- unsigned BarId = Kernel2BarId[BarrierScope][F];
1030- BarId += NumAbsolutes[BarrierScope] + 1 ;
1031- unsigned BarCnt = DL.getTypeAllocSize (GV->getValueType ()) / 16 ;
1032- Kernel2BarId[BarrierScope][F] += BarCnt;
1033- unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4 ;
1053+ unsigned Offset;
1054+ if (TargetExtType *ExtTy = isNamedBarrier (*GV)) {
1055+ // Place each barrier in the next open slot above the module-relative
1056+ // and already assigned kernel-relative barriers.
1057+ unsigned BarrierScope = ExtTy->getIntParameter (0 );
1058+ unsigned BarId = Kernel2BarId[BarrierScope][F];
1059+ BarId += NumBarAbsolutes[BarrierScope] + 1 ;
1060+ unsigned BarCnt = DL.getTypeAllocSize (GV->getValueType ()) / 16 ;
1061+ Kernel2BarId[BarrierScope][F] += BarCnt;
1062+ Offset = 0x802000u | BarrierScope << 9 | BarId << 4 ;
1063+
1064+ } else if (TargetExtType *ExtTy = isLDSSemaphore (*GV)) {
1065+ // Determine which semaphore GVs were already assigned, and for the
1066+ // remaining ones assign the semaphore nums above.
1067+ unsigned OwningRank =
1068+ ExtTy->getIntParameter (0 ) % MAX_WAVES_PER_WAVEGROUP;
1069+ unsigned Num = NumSemAbsolutes[OwningRank];
1070+ Kernel2SemRelative[OwningRank][F]++;
1071+ Num += Kernel2SemRelative[OwningRank][F];
1072+ Offset = 0x801000u | OwningRank << 8 | Num << 4 ;
1073+
1074+ } else
1075+ llvm_unreachable (" Unhandled special variable type." );
10341076 recordLDSAbsoluteAddress (&M, NewGV, Offset);
10351077 }
10361078 OrderedGVs.clear ();
@@ -1039,7 +1081,7 @@ class AMDGPULowerModuleLDS {
10391081 for (auto &K : LDSUsesInfo.indirect_access ) {
10401082 assert (isKernelLDS (K.first ));
10411083 for (GlobalVariable *GV : K.second ) {
1042- if (isNamedBarrier (*GV))
1084+ if (isNamedBarrier (*GV) || isLDSSemaphore (*GV) )
10431085 K.second .erase (GV);
10441086 }
10451087 }
0 commit comments