Skip to content

Commit 93663e3

Browse files
committed
Add benchmark on CMS onnx file (DDB_B1.onnx) used with batch size=1
Modify both ONNXRuntimeInference and SOFIEInference to support models with multiple inputs
1 parent 3ff625d commit 93663e3

File tree

4 files changed

+140
-58
lines changed

4 files changed

+140
-58
lines changed

root/tmva/sofie/CMakeLists.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -190,8 +190,8 @@ add_dependencies(RDF_SOFIE_Inference SofieCompileModels)
190190
#if (ROOT_PLATFORM MATCHES "linux|macosx" AND CMAKE_SYSTEM_PROCESSOR MATCHES x86_64 AND CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
191191
## assume we run only on linux/macos with gnu or gcc
192192
set(gnu-flags $<$<CXX_COMPILER_ID:GNU>:-fno-signaling-nans>)
193-
target_compile_options(SOFIEInference PRIVATE -mavx2 ${gnu-flags} -fno-trapping-math -O3)
194-
target_compile_options(RDF_SOFIE_Inference PRIVATE -mavx2 ${gnu-flags} -fno-trapping-math -O3)
193+
target_compile_options(SOFIEInference PRIVATE ${gnu-flags} -fno-trapping-math -O3)
194+
target_compile_options(RDF_SOFIE_Inference PRIVATE ${gnu-flags} -fno-trapping-math -O3)
195195
#endif()
196196

197197
endif() # endif blas
@@ -230,4 +230,4 @@ if (ONNXRuntime_FOUND)
230230
)
231231
target_link_directories(RDF_ONNXRuntime_Inference PRIVATE ${ONNXRuntime_LIBRARIES})
232232
target_include_directories(RDF_ONNXRuntime_Inference PRIVATE ${ONNXRuntime_INCLUDE_DIR})
233-
endif()
233+
endif()

root/tmva/sofie/ONNXRuntimeInference_Template.cxx.in

Lines changed: 84 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -16,75 +16,108 @@ using namespace std;
1616
static void @FUNC_NAME@(benchmark::State& state, string model_path)
1717
{
1818
Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "benchmark");
19-
19+
2020
Ort::SessionOptions session_options;
2121
session_options.SetIntraOpNumThreads(1);
2222
session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
2323

2424
//std::cout << "benchmarking model " << model_path << std::endl;
2525
Ort::Session session(env, model_path.c_str(), session_options);
2626

27-
vector<const char*> input_node_names(1);
28-
vector<const char*> output_node_names(1);
29-
27+
int nin = session.GetInputCount();
28+
int nout = 1;
29+
30+
vector<const char*> input_node_names(nin);
31+
vector<const char*> output_node_names(nout);
32+
3033
Ort::AllocatorWithDefaultOptions allocator;
31-
input_node_names[0] = session.GetInputName(0, allocator);
32-
output_node_names[0] = session.GetOutputName(0, allocator);
34+
for (int i = 0; i < nin; i++)
35+
input_node_names[i] = session.GetInputName(i, allocator);
36+
for (int i = 0; i < nout; i++)
37+
output_node_names[i] = session.GetOutputName(i, allocator);
3338

3439
// Getting the shapes
40+
vector<vector<int64_t>> input_node_dims(nin);
41+
vector<vector<int64_t>> output_node_dims(nout);
42+
43+
for (int i = 0; i < nin; i++)
44+
input_node_dims[i] = session.GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape();
45+
for (int i = 0; i < nout; i++)
46+
output_node_dims[i] = session.GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape();
47+
48+
for (int i = 0; i < nin; i++) {
49+
std::cout << "input " << input_node_names[i] << " shape : ";
50+
for (int j = 0; j < input_node_dims[i].size(); j++)
51+
std::cout << " " << input_node_dims[i][j];
52+
std::cout << std::endl;
53+
}
54+
// fix negative shapes
55+
for (int i = 0; i < nin; i++) {
56+
for (int j = 0; j < input_node_dims[i].size(); j++) {
57+
if (input_node_dims[i][j] < 0) input_node_dims[i][j] = - input_node_dims[i][j];
58+
}
59+
}
3560

36-
vector<int64_t> input_node_dims = session
37-
.GetInputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape();
38-
vector<int64_t> output_node_dims = session
39-
.GetOutputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape();
4061

4162
// Calculating the dimension of the input tensor
4263
int nevts = 64;
43-
int bsize = input_node_dims[0];
64+
int bsize = input_node_dims[0][0]; // assume this
4465
//std::cout << "Using bsize = " << bsize << std::endl;
4566
int nbatches = nevts / bsize;
4667

47-
size_t input_tensor_size = accumulate(input_node_dims.begin(),
48-
input_node_dims.end(), 1, multiplies<int>());
49-
vector<float> input_tensor_values(input_tensor_size*nbatches);
50-
//std::cout << "input tensor size " << input_tensor_size << " " << input_tensor_values.size() << std::endl;
51-
52-
// Input tensor initialization
53-
static std::uniform_real_distribution<float> distribution(-1,1);
54-
static std::default_random_engine generator;
55-
std::generate(input_tensor_values.begin(), input_tensor_values.end(), []() { return distribution(generator); });
56-
//fill_n(input_tensor_values.begin(), input_tensor_size, 1.0);
57-
58-
auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
59-
// Ort::Value input_tensor = Ort::Value::CreateTensor<float>(memory_info,
60-
// input_tensor_values.data(), input_tensor_size,
61-
// input_node_dims.data(), input_node_dims.size());
62-
63-
// Running the model
64-
float * floatarr = nullptr;
65-
66-
double totDuration = 0;
67-
int ntimes = 0;
68-
for (auto _ : state) {
69-
auto t1 = std::chrono::high_resolution_clock::now();
70-
size_t input_offset = 0;
71-
for (int i = 0; i < nevts; i += bsize) {
72-
// if (input_offset > input_tensor_values.size()) {
73-
// std::cout << "Error in input size " << i << " " << nevts << " " << model_path << std::endl;
74-
// throw std::runtime_error("Bad input size ");
75-
// }
76-
Ort::Value input_tensor = Ort::Value::CreateTensor<float>(
77-
memory_info, input_tensor_values.data()+input_offset, input_tensor_size, input_node_dims.data(), input_node_dims.size());
78-
auto output_tensors = session.Run(Ort::RunOptions{nullptr}, input_node_names.data(), &input_tensor, 1,
79-
output_node_names.data(), 1);
80-
floatarr = output_tensors.front().GetTensorMutableData<float>();
81-
input_offset += input_tensor_size;
82-
}
68+
std::vector<std::vector<float>> inputData(nin);
69+
std::vector<size_t> inputSizes(nin);
70+
71+
for (int i = 0; i < nin; i++) {
72+
size_t input_tensor_size = accumulate(input_node_dims[i].begin(), input_node_dims[i].end(), 1, multiplies<int>());
73+
inputSizes[i] = input_tensor_size;
74+
auto &input_tensor_values = inputData[i];
75+
input_tensor_values.resize(input_tensor_size * nbatches);
76+
// std::cout << "input tensor size " << input_tensor_size << " " << input_tensor_values.size() << std::endl;
77+
78+
// Input tensor initialization
79+
static std::uniform_real_distribution<float> distribution(-1, 1);
80+
static std::default_random_engine generator;
81+
std::generate(input_tensor_values.begin(), input_tensor_values.end(), []() { return distribution(generator); });
82+
// fill_n(input_tensor_values.begin(), input_tensor_size, 1.0);
83+
}
8384

84-
auto t2 = std::chrono::high_resolution_clock::now();
85-
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
86-
totDuration += duration / 1.E3; // in milliseconds
87-
ntimes++;
85+
auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
86+
// Ort::Value input_tensor = Ort::Value::CreateTensor<float>(memory_info,
87+
// input_tensor_values.data(), input_tensor_size,
88+
// input_node_dims.data(), input_node_dims.size());
89+
90+
// Running the model
91+
float *floatarr = nullptr;
92+
93+
std::vector<Ort::Value> input_tensors;
94+
95+
double totDuration = 0;
96+
int ntimes = 0;
97+
for (auto _ : state) {
98+
auto t1 = std::chrono::high_resolution_clock::now();
99+
std::vector<size_t> input_offset(nin);
100+
for (int i = 0; i < nevts; i += bsize) {
101+
// if (input_offset > input_tensor_values.size()) {
102+
// std::cout << "Error in input size " << i << " " << nevts << " " << model_path << std::endl;
103+
// throw std::runtime_error("Bad input size ");
104+
// }
105+
for (int k = 0; k < nin; k++) {
106+
input_tensors.emplace_back(Ort::Value::CreateTensor<float>(memory_info, inputData[k].data() + input_offset[k],
107+
inputSizes[k], input_node_dims[k].data(), input_node_dims[k].size()));
108+
}
109+
auto output_tensors = session.Run(Ort::RunOptions{nullptr}, input_node_names.data(), input_tensors.data(), nin,
110+
output_node_names.data(), nout);
111+
floatarr = output_tensors.front().GetTensorMutableData<float>();
112+
for (int k = 0; k < nin; k++) {
113+
input_offset[k] += inputSizes[k];
114+
}
115+
}
116+
117+
auto t2 = std::chrono::high_resolution_clock::now();
118+
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
119+
totDuration += duration / 1.E3; // in milliseconds
120+
ntimes++;
88121
}
89122
//for (int i = 0; i < 10; i++)
90123
// printf("%f\t", i, floatarr[i]);

root/tmva/sofie/SOFIEInference.cxx

Lines changed: 53 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,16 +25,17 @@
2525
#include "GRU_d10_L20_h8_B1.hxx"
2626
#include "LSTM_d10_L20_h8_B1.hxx"
2727
#include "higgs_model_dense.hxx"
28+
#include "DDB_B1.hxx" // CMS onnx model
2829

2930
#include "resnet18v1.hxx"
3031
#include "TMath.h"
3132

3233

3334
using namespace std;
34-
bool verbose = false;
35+
bool verbose = false;
3536
template <class S>
3637
void BM_SOFIE_Inference(benchmark::State &state)
37-
{
38+
{
3839
size_t inputSize = state.range(0); // input size (without batch size)
3940
size_t bsize = (state.range(1) > 0) ? state.range(1) : 0;
4041
size_t nevts = 64;
@@ -74,8 +75,56 @@ void BM_SOFIE_Inference(benchmark::State &state)
7475
// }
7576
//if (verbose) std::cout << "output : " << output.size() << " : " << output.front() << " ......" << output.back() << std::endl;
7677
}
77-
//typedef TMVA_SOFIE_Conv_d100_L1_B1::Session S1;
78-
//BENCHMARK(BM_SOFIE_Inference<S1>);//->Name( "Conv_d100_L1_B1");
78+
79+
// inference for model with 3 inputs
80+
template <class S>
81+
void BM_SOFIE_Inference_3(benchmark::State &state)
82+
{
83+
size_t bsize = state.range(0); // batch size
84+
size_t inputSize1 = state.range(1); // input 1 size
85+
size_t inputSize2 = state.range(2); // input 2 size
86+
size_t inputSize3 = state.range(3);
87+
88+
size_t nevts = 64;
89+
size_t nrep = nevts / bsize;
90+
91+
size_t eventSize = inputSize1 + inputSize2+inputSize3;
92+
93+
vector<float> input1(inputSize1*nevts);
94+
vector<float> input2(inputSize2*nevts);
95+
vector<float> input3(inputSize3*nevts);
96+
97+
static std::uniform_real_distribution<float> distribution(-1, 1);
98+
static std::default_random_engine generator;
99+
std::generate(input1.begin(), input1.end(), []() { return distribution(generator); });
100+
std::generate(input2.begin(), input2.end(), []() { return distribution(generator); });
101+
std::generate(input3.begin(), input3.end(), []() { return distribution(generator); });
102+
103+
S s("");
104+
105+
std::cout << "initi done - do benchmark \n";
106+
107+
double totDuration = 0;
108+
int ntimes = 0;
109+
for (auto _ : state) {
110+
auto t1 = std::chrono::high_resolution_clock::now();
111+
for (int i = 0; i < nevts; i += bsize) {
112+
float * p1 = input1.data()+ inputSize1*i;
113+
float * p2 = input2.data()+ inputSize2*i;
114+
float * p3 = input3.data()+ inputSize3*i;
115+
auto y = s.infer(p1,p2,p3);
116+
}
117+
auto t2 = std::chrono::high_resolution_clock::now();
118+
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
119+
totDuration += duration / 1.E3; // in milliseconds
120+
ntimes++;
121+
}
122+
123+
state.counters["time/evt(ms)"] = totDuration / double(ntimes * nevts);
124+
}
125+
126+
// CMS benchmark (3 inputs)
127+
BENCHMARK_TEMPLATE(BM_SOFIE_Inference_3, TMVA_SOFIE_DDB_B1::Session)->Name("DDB_B1")->Args({1, 1*27, 60*8, 5*2})->Unit(benchmark::kMillisecond);
79128

80129
//Gemm benchmarks
81130
BENCHMARK_TEMPLATE(BM_SOFIE_Inference, TMVA_SOFIE_Linear_16::Session)->Name("Linear_16")->Args({100, 16})->Unit(benchmark::kMillisecond);
165 KB
Binary file not shown.

0 commit comments

Comments
 (0)