ktf
diff --git a/‎Detectors/CTP/reconstruction/src/RawDataDecoder.cxx‎
Lines changed: 4 additions & 2 deletions b/‎Detectors/CTP/reconstruction/src/RawDataDecoder.cxx‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎Detectors/CTP/workflow/src/RawDecoderSpec.cxx‎
Lines changed: 4 additions & 2 deletions b/‎Detectors/CTP/workflow/src/RawDecoderSpec.cxx‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎GPU/Common/GPUCommonDef.h‎
Lines changed: 4 additions & 0 deletions b/‎GPU/Common/GPUCommonDef.h‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx‎
Lines changed: 5 additions & 0 deletions b/‎GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc‎
Lines changed: 2 additions & 84 deletions b/‎GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc‎
Lines changed: 2 additions & 84 deletions
diff --git a/‎GPU/GPUTracking/Base/cuda/GPUReconstructionCUDArtc.cu‎
Lines changed: 1 addition & 0 deletions b/‎GPU/GPUTracking/Base/cuda/GPUReconstructionCUDArtc.cu‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎GPU/GPUTracking/Base/hip/CMakeLists.txt‎
Lines changed: 5 additions & 0 deletions b/‎GPU/GPUTracking/Base/hip/CMakeLists.txt‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎GPU/GPUTracking/CMakeLists.txt‎
Lines changed: 3 additions & 2 deletions b/‎GPU/GPUTracking/CMakeLists.txt‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎GPU/GPUTracking/DataCompression/GPUTPCCompression.cxx‎
Lines changed: 1 addition & 1 deletion b/‎GPU/GPUTracking/DataCompression/GPUTPCCompression.cxx‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.cxx‎
Lines changed: 5 additions & 5 deletions b/‎GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.cxx‎
Lines changed: 5 additions & 5 deletions
@@ -615,8 +615,9 @@ int RawDataDecoder::checkReadoutConsistentncy(o2::pmr::vector<CTPDigit>& digits,
           continue;
         }
         mClassCountersA[i]++;
-        if (cls->descriptor == nullptr)
+        if (cls->descriptor == nullptr) {
           continue;
+        }
         uint64_t clsinpmask = cls->descriptor->getInputsMask();
         uint64_t diginpmask = digit.CTPInputMask.to_ullong();
         if (!((clsinpmask & diginpmask) == clsinpmask)) {
@@ -632,8 +633,9 @@ int RawDataDecoder::checkReadoutConsistentncy(o2::pmr::vector<CTPDigit>& digits,
     // if inps => class mask
     for (auto const& cls : mCTPConfig.getCTPClasses()) {
       // cls.printStream(std::cout);
-      if (cls.descriptor == nullptr)
+      if (cls.descriptor == nullptr) {
         continue;
+      }
       uint64_t clsinpmask = cls.descriptor->getInputsMask(); // class definition
       uint64_t diginpmask = digit.CTPInputMask.to_ullong();
       uint64_t digclsmask = digit.CTPClassMask.to_ullong();
 
@@ -71,17 +71,19 @@ void RawDecoderSpec::endOfStream(framework::EndOfStreamContext& ec)
   std::cout << std::endl;
   LOG(info) << " Lost due to the shift:" << mDecoder.getLostDueToShift();
   LOG(info) << "Number of missing TF:" << nmiss << std::endl;
-  if (mDecoder.getErrorIR() || mDecoder.getErrorTCR())
+  if (mDecoder.getErrorIR() || mDecoder.getErrorTCR()) {
     LOG(error) << "# of IR errors:" << mDecoder.getErrorIR() << " TCR errors:" << mDecoder.getErrorTCR() << std::endl;
+  }
   std::array<uint64_t, o2::ctp::CTP_NCLASSES> clsA = mDecoder.getClassCountersA();
   std::array<uint64_t, o2::ctp::CTP_NCLASSES> clsB = mDecoder.getClassCountersB();
   std::array<uint64_t, o2::ctp::CTP_NCLASSES> clsEA = mDecoder.getClassErrorsA();
   std::array<uint64_t, o2::ctp::CTP_NCLASSES> clsEB = mDecoder.getClassErrorsB();
 
   for (int i = 0; i < o2::ctp::CTP_NCLASSES; i++) {
     bool print = clsA[i] > 0 || clsB[i] > 0 || clsEA[i] > 0 || clsEB[i] > 0;
-    if (clsEA[i])
+    if (clsEA[i]) {
       LOG(error) << " Class without inputs:";
+    }
     LOG(important) << "CLASS:" << i << " Cls=>Inp:" << clsA[i] << " Inp=>Cls:" << clsB[i] << "  ErrorsCls=>Inps:" << clsEA[i] << "  MissingInps=>Cls:" << clsEB[i];
   }
 }
 
@@ -72,6 +72,10 @@
   #define GPUCA_RTC_SPECIAL_CODE(...)
 #endif
 
+#ifndef GPUCA_RTC_CONSTEXPR
+  #define GPUCA_RTC_CONSTEXPR
+#endif
+
 #ifndef GPUCA_DETERMINISTIC_CODE
   #ifdef GPUCA_DETERMINISTIC_MODE
     #define GPUCA_DETERMINISTIC_CODE(det, indet) det // In deterministic mode, take deterministic code path
 
@@ -38,6 +38,11 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
 {
   std::string rtcparam = std::string("#define GPUCA_RTC_CODE\n") +
                          std::string(GetProcessingSettings().rtc.optSpecialCode ? "#define GPUCA_RTC_SPECIAL_CODE(...) __VA_ARGS__\n" : "#define GPUCA_RTC_SPECIAL_CODE(...)\n") +
+#ifndef GPUCA_HIP_WORKAROUND_CONSTEXPR // TODO: Fixme, once we have C++ P2280R4 in Clang
+                         std::string(GetProcessingSettings().rtc.optConstexpr ? "#define GPUCA_RTC_CONSTEXPR constexpr\n" : "#define GPUCA_RTC_CONSTEXPR\n") +
+#else
+                         std::string("#define GPUCA_RTC_CONSTEXPR\n") +
+#endif
                          GPUParamRTC::generateRTCCode(param(), GetProcessingSettings().rtc.optConstexpr);
   if (filename == "") {
     filename = "/tmp/o2cagpu_rtc_";
 
@@ -14,88 +14,6 @@
 
 #if defined(GPUCA_SPECIALIZE_THRUST_SORTS) && !defined(GPUCA_GPUCODE_COMPILEKERNELS)
 
-namespace o2::gpu::internal
-{
-namespace // anonymous
-{
-struct MergeBorderTracks_compMax {
-  GPUd() bool operator()(const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b)
-  {
-    return GPUCA_DETERMINISTIC_CODE((a.fMax != b.fMax) ? (a.fMax < b.fMax) : (a.fId < b.fId), a.fMax < b.fMax);
-  }
-};
-struct MergeBorderTracks_compMin {
-  GPUd() bool operator()(const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b)
-  {
-    return GPUCA_DETERMINISTIC_CODE((a.fMin != b.fMin) ? (a.fMin < b.fMin) : (a.fId < b.fId), a.fMin < b.fMin);
-  }
-};
-
-struct GPUTPCGMMergerSortTracks_comp {
-  const GPUTPCGMMergedTrack* const mCmp;
-  GPUhd() GPUTPCGMMergerSortTracks_comp(GPUTPCGMMergedTrack* cmp) : mCmp(cmp) {}
-  GPUd() bool operator()(const int32_t aa, const int32_t bb)
-  {
-    const GPUTPCGMMergedTrack& GPUrestrict() a = mCmp[aa];
-    const GPUTPCGMMergedTrack& GPUrestrict() b = mCmp[bb];
-    if (a.CCE() != b.CCE()) {
-      return a.CCE() > b.CCE();
-    }
-    if (a.Legs() != b.Legs()) {
-      return a.Legs() > b.Legs();
-    }
-    GPUCA_DETERMINISTIC_CODE( // clang-format off
-      if (a.NClusters() != b.NClusters()) {
-        return a.NClusters() > b.NClusters();
-      } if (CAMath::Abs(a.GetParam().GetQPt()) != CAMath::Abs(b.GetParam().GetQPt())) {
-        return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt());
-      } if (a.GetParam().GetY() != b.GetParam().GetY()) {
-        return a.GetParam().GetY() > b.GetParam().GetY();
-      }
-      return aa > bb;
-    , // !GPUCA_DETERMINISTIC_CODE
-      return a.NClusters() > b.NClusters();
-    ) // clang-format on
-  }
-};
-
-struct GPUTPCGMMergerSortTracksQPt_comp {
-  const GPUTPCGMMergedTrack* const mCmp;
-  GPUhd() GPUTPCGMMergerSortTracksQPt_comp(GPUTPCGMMergedTrack* cmp) : mCmp(cmp) {}
-  GPUd() bool operator()(const int32_t aa, const int32_t bb)
-  {
-    const GPUTPCGMMergedTrack& GPUrestrict() a = mCmp[aa];
-    const GPUTPCGMMergedTrack& GPUrestrict() b = mCmp[bb];
-    GPUCA_DETERMINISTIC_CODE( // clang-format off
-      if (CAMath::Abs(a.GetParam().GetQPt()) != CAMath::Abs(b.GetParam().GetQPt())) {
-        return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt());
-      } if (a.GetParam().GetY() != b.GetParam().GetY()) {
-        return a.GetParam().GetY() > b.GetParam().GetY();
-      }
-      return a.GetParam().GetZ() > b.GetParam().GetZ();
-    , // !GPUCA_DETERMINISTIC_CODE
-      return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt());
-    ) // clang-format on
-  }
-};
-
-struct GPUTPCGMMergerMergeLoopers_comp {
-  GPUd() bool operator()(const MergeLooperParam& a, const MergeLooperParam& b)
-  {
-    return CAMath::Abs(a.refz) < CAMath::Abs(b.refz);
-  }
-};
-
-struct GPUTPCGMO2OutputSort_comp {
-  GPUd() bool operator()(const GPUTPCGMMerger::tmpSort& a, const GPUTPCGMMerger::tmpSort& b)
-  {
-    return (a.y > b.y);
-  }
-};
-
-} // anonymous namespace
-} // namespace o2::gpu::internal
-
 template <>
 inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed<GPUTPCGMMergerMergeBorders, 3>(const krnlSetupTime& _xyz, GPUTPCGMBorderRange* const& range, int32_t const& N, int32_t const& cmpMax)
 {
@@ -109,13 +27,13 @@ inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed
 template <>
 inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed<GPUTPCGMMergerSortTracks, 0>(const krnlSetupTime& _xyz)
 {
-  GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackOrderProcess(), processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracks_comp(mProcessorsShadow->tpcMerger.OutputTracks()));
+  GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackOrderProcess(), processors()->tpcMerger.NMergedTracks(), GPUTPCGMMergerSortTracks_comp(mProcessorsShadow->tpcMerger.MergedTracks()));
 }
 
 template <>
 inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed<GPUTPCGMMergerSortTracksQPt, 0>(const krnlSetupTime& _xyz)
 {
-  GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackSort(), processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracksQPt_comp(mProcessorsShadow->tpcMerger.OutputTracks()));
+  GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackSort(), processors()->tpcMerger.NMergedTracks(), GPUTPCGMMergerSortTracksQPt_comp(mProcessorsShadow->tpcMerger.MergedTracks()));
 }
 
 template <>
 
@@ -18,6 +18,7 @@
 // Keep some preprocessor calls unprocessed
 #define GPUCA_RTC_SPECIAL_CODE(...) GPUCA_RTC_SPECIAL_CODE(__VA_ARGS__)
 #define GPUCA_DETERMINISTIC_CODE(...) GPUCA_DETERMINISTIC_CODE(__VA_ARGS__)
+#define GPUCA_RTC_CONSTEXPR GPUCA_RTC_CONSTEXPR
 
 // GPUReconstructionCUDAIncludesSystem.h prepended by CMake without preprocessor running
 #include "GPUReconstructionCUDADef.h"
 
@@ -270,3 +270,8 @@ add_dependencies(GPUTrackingHIPExternalProvider O2::GPUTracking) # must not depe
 if(NOT DEFINED GPUCA_HIP_HIPIFY_FROM_CUDA OR "${GPUCA_HIP_HIPIFY_FROM_CUDA}")
   add_dependencies(GPUTrackingHIPExternalProvider ${MODULE}_HIPIFIED)
 endif()
+
+set_source_files_properties("${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPGenRTC.cxx"
+TARGET_DIRECTORY O2::GPUTrackingHIP
+PROPERTIES
+COMPILE_DEFINITIONS "GPUCA_HIP_WORKAROUND_CONSTEXPR")
@@ -16,11 +16,12 @@ set(MODULE GPUTracking)
 
 if(GPUCA_DETERMINISTIC_MODE GREATER_EQUAL ${GPUCA_DETERMINISTIC_MODE_MAP_NO_FAST_MATH})
   set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} ${GPUCA_CXX_NO_FAST_MATH_FLAGS}")
+elseif(NOT CMAKE_BUILD_TYPE_UPPER STREQUAL "DEBUG")
   if(GPUCA_DETERMINISTIC_MODE GREATER_EQUAL ${GPUCA_DETERMINISTIC_MODE_MAP_OPTO2})
     set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} -O2")
+  else()
+    set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} -O3 -ffast-math")
   endif()
-elseif(NOT CMAKE_BUILD_TYPE_UPPER STREQUAL "DEBUG")
-  set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} -O3 -ffast-math")
 endif()
 set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} ${GPUCA_CXX_DENORMALS_FLAGS}")
 
 
@@ -125,7 +125,7 @@ void GPUTPCCompression::SetMaxData(const GPUTrackingInOutPointers& io)
   mMaxClusterFactorBase1024 = mMaxClusters > 100000000 ? mRec->MemoryScalers()->NTPCUnattachedHitsBase1024(mRec->GetParam().rec.tpc.rejectionStrategy) : 1024;
   mMaxClustersInCache = mMaxClusters * mMaxClusterFactorBase1024 / 1024;
   mMaxTrackClusters = mRec->GetConstantMem().tpcMerger.NOutputTrackClusters(); // TODO: Why is this not using ioPtrs? Could remove GPUConstantMem.h include
-  mMaxTracks = mRec->GetConstantMem().tpcMerger.NOutputTracks();
+  mMaxTracks = mRec->GetConstantMem().tpcMerger.NMergedTracks();
   if (mMaxClusters % 16) {
     mMaxClusters += 16 - (mMaxClusters % 16);
   }
 
@@ -201,7 +201,7 @@ GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step1un
     const uint32_t iSector = iSectorRow / GPUCA_ROW_COUNT;
     const uint32_t iRow = iSectorRow % GPUCA_ROW_COUNT;
     const uint32_t idOffset = clusters->clusterOffset[iSector][iRow];
-    const uint32_t idOffsetOut = clusters->clusterOffset[iSector][iRow] * compressor.mMaxClusterFactorBase1024 / 1024;
+    const uint32_t idOffsetOut = clusters->clusterOffset[iSector][iRow] * compressor.mMaxClusterFactorBase1024 / 1024;                           // 32 bit enough for number of clusters per row * 1024
     const uint32_t idOffsetOutMax = ((const uint32_t*)clusters->clusterOffset[iSector])[iRow + 1] * compressor.mMaxClusterFactorBase1024 / 1024; // Array out of bounds access is ok, since it goes to the correct nClustersTotal
     if (iThread == nThreads - 1) {
       smem.nCount = 0;
@@ -214,7 +214,7 @@ GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step1un
     const uint32_t nn = CAMath::nextMultipleOf<GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCompressionKernels_step1unattached)>(clusters->nClusters[iSector][iRow]);
     for (uint32_t i = iThread; i < nn + nThreads; i += nThreads) {
       const int32_t idx = idOffset + i;
-      int32_t cidx = 0;
+      int32_t storeCluster = 0;
       do {
         if (i >= clusters->nClusters[iSector][iRow]) {
           break;
@@ -239,13 +239,13 @@ GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step1un
             break;
           }
         }
-        cidx = 1;
+        storeCluster = 1;
       } while (false);
 
       GPUbarrier();
-      int32_t myIndex = work_group_scan_inclusive_add(cidx);
+      int32_t myIndex = work_group_scan_inclusive_add(storeCluster);
       int32_t storeLater = -1;
-      if (cidx) {
+      if (storeCluster) {
         if (smem.nCount + myIndex <= GPUCA_TPC_COMP_CHUNK_SIZE) {
           sortBuffer[smem.nCount + myIndex - 1] = i;
         } else {
Original file line number	Diff line number	Diff line change
`@@ -125,7 +125,7 @@ void GPUTPCCompression::SetMaxData(const GPUTrackingInOutPointers& io)`
`125`	`125`	`mMaxClusterFactorBase1024 = mMaxClusters > 100000000 ? mRec->MemoryScalers()->NTPCUnattachedHitsBase1024(mRec->GetParam().rec.tpc.rejectionStrategy) : 1024;`
`126`	`126`	`mMaxClustersInCache = mMaxClusters * mMaxClusterFactorBase1024 / 1024;`
`127`	`127`	`mMaxTrackClusters = mRec->GetConstantMem().tpcMerger.NOutputTrackClusters(); // TODO: Why is this not using ioPtrs? Could remove GPUConstantMem.h include`
`128`		`- mMaxTracks = mRec->GetConstantMem().tpcMerger.NOutputTracks();`
	`128`	`+ mMaxTracks = mRec->GetConstantMem().tpcMerger.NMergedTracks();`
`129`	`129`	`if (mMaxClusters % 16) {`
`130`	`130`	`mMaxClusters += 16 - (mMaxClusters % 16);`
`131`	`131`	`}`