Skip to content

Commit c1bea85

Browse files
committed
[RF] Avoid expensive double loop in setData() for likelihoods
About 2 years ago in 85c5cb4, I made the mistake of introducing a potentially expensive double loop when setting the data for a likelihood: the `RooEvaluatorWrapper` iterates over all data variables to set, and for each of them it calls `RooFit::Evaluator::setInput()`, which has a loop over all nodes in the computation graph to identify for which one the data is set. This caused a big performance hit when instantiating likelihoods with the new vectorizing CPU backend, manifesting as 10 minutes for `createNLL` for the big ATLAS Higgs combination benchmark instead of 1 minute. This commit fixes the problem by caching all nodes in the computation graph in a hash map for fast lookup.
1 parent 26d24de commit c1bea85

File tree

2 files changed

+47
-41
lines changed

2 files changed

+47
-41
lines changed

roofit/roofitcore/inc/RooFit/Evaluator.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,8 @@ class Evaluator {
6464
bool _needToUpdateOutputSizes = false;
6565
RooFit::EvalContext _evalContextCPU;
6666
RooFit::EvalContext _evalContextCUDA;
67-
std::vector<NodeInfo> _nodes; // the ordered computation graph
67+
std::vector<NodeInfo> _nodes; // the ordered computation graph
68+
std::unordered_map<TNamed const *, NodeInfo *> _nodesMap; // for quick lookup of nodes
6869
std::stack<std::unique_ptr<ChangeOperModeRAII>> _changeOperModeRAIIs;
6970
};
7071

roofit/roofitcore/src/RooFit/Evaluator.cxx

Lines changed: 45 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,8 @@ Evaluator::Evaluator(const RooAbsReal &absReal, bool useGPU)
177177

178178
_nodes.emplace_back();
179179
auto &nodeInfo = _nodes.back();
180+
_nodesMap[arg->namePtr()] = &nodeInfo;
181+
180182
nodeInfo.absArg = arg;
181183
nodeInfo.originalOperMode = arg->operMode();
182184
nodeInfo.iNode = iNode;
@@ -244,49 +246,51 @@ void Evaluator::setInput(std::string const &name, std::span<const double> inputA
244246
throw std::runtime_error("Evaluator can only take device array as input in CUDA mode!");
245247
}
246248

247-
auto namePtr = RooNameReg::ptr(name.c_str());
249+
// Check if "name" is used in the computation graph. If yes, add the span to
250+
// the data map and set the node info accordingly.
248251

249-
// Iterate over the given data spans and add them to the data map. Check if
250-
// they are used in the computation graph. If yes, add the span to the data
251-
// map and set the node info accordingly.
252-
std::size_t iNode = 0;
253-
for (auto &info : _nodes) {
254-
const bool fromArrayInput = info.absArg->namePtr() == namePtr;
255-
if (fromArrayInput) {
256-
info.fromArrayInput = true;
257-
info.absArg->setDataToken(iNode);
258-
info.outputSize = inputArray.size();
259-
if (_useGPU && info.outputSize <= 1) {
260-
// Empty or scalar observables from the data don't need to be
261-
// copied to the GPU.
262-
_evalContextCPU.set(info.absArg, inputArray);
263-
_evalContextCUDA.set(info.absArg, inputArray);
264-
} else if (_useGPU && info.outputSize > 1) {
265-
// For simplicity, we put the data on both host and device for
266-
// now. This could be optimized by inspecting the clients of the
267-
// variable.
268-
if (isOnDevice) {
269-
_evalContextCUDA.set(info.absArg, inputArray);
270-
auto gpuSpan = _evalContextCUDA.at(info.absArg);
271-
info.buffer = _bufferManager->makeCpuBuffer(gpuSpan.size());
272-
info.buffer->assignFromDevice(gpuSpan);
273-
_evalContextCPU.set(info.absArg, {info.buffer->hostReadPtr(), gpuSpan.size()});
274-
} else {
275-
_evalContextCPU.set(info.absArg, inputArray);
276-
auto cpuSpan = _evalContextCPU.at(info.absArg);
277-
info.buffer = _bufferManager->makeGpuBuffer(cpuSpan.size());
278-
info.buffer->assignFromHost(cpuSpan);
279-
_evalContextCUDA.set(info.absArg, {info.buffer->deviceReadPtr(), cpuSpan.size()});
280-
}
281-
} else {
282-
_evalContextCPU.set(info.absArg, inputArray);
283-
}
284-
}
285-
info.isDirty = !info.fromArrayInput;
286-
++iNode;
287-
}
252+
auto found = _nodesMap.find(RooNameReg::ptr(name.c_str()));
253+
254+
if (found == _nodesMap.end())
255+
return;
288256

289257
_needToUpdateOutputSizes = true;
258+
259+
NodeInfo &info = *found->second;
260+
261+
info.fromArrayInput = true;
262+
info.absArg->setDataToken(info.iNode);
263+
info.outputSize = inputArray.size();
264+
265+
if (!_useGPU) {
266+
_evalContextCPU.set(info.absArg, inputArray);
267+
return;
268+
}
269+
270+
if (info.outputSize <= 1) {
271+
// Empty or scalar observables from the data don't need to be
272+
// copied to the GPU.
273+
_evalContextCPU.set(info.absArg, inputArray);
274+
_evalContextCUDA.set(info.absArg, inputArray);
275+
return;
276+
}
277+
278+
// For simplicity, we put the data on both host and device for
279+
// now. This could be optimized by inspecting the clients of the
280+
// variable.
281+
if (isOnDevice) {
282+
_evalContextCUDA.set(info.absArg, inputArray);
283+
auto gpuSpan = _evalContextCUDA.at(info.absArg);
284+
info.buffer = _bufferManager->makeCpuBuffer(gpuSpan.size());
285+
info.buffer->assignFromDevice(gpuSpan);
286+
_evalContextCPU.set(info.absArg, {info.buffer->hostReadPtr(), gpuSpan.size()});
287+
} else {
288+
_evalContextCPU.set(info.absArg, inputArray);
289+
auto cpuSpan = _evalContextCPU.at(info.absArg);
290+
info.buffer = _bufferManager->makeGpuBuffer(cpuSpan.size());
291+
info.buffer->assignFromHost(cpuSpan);
292+
_evalContextCUDA.set(info.absArg, {info.buffer->deviceReadPtr(), cpuSpan.size()});
293+
}
290294
}
291295

292296
void Evaluator::updateOutputSizes()
@@ -309,6 +313,7 @@ void Evaluator::updateOutputSizes()
309313

310314
for (auto &info : _nodes) {
311315
info.outputSize = outputSizeMap.at(info.absArg);
316+
info.isDirty = true;
312317

313318
// In principle we don't need dirty flag propagation because the driver
314319
// takes care of deciding which node needs to be re-evaluated. However,

0 commit comments

Comments
 (0)