Skip to content

Commit 70da9f5

Browse files
committed
[tmva][sofie] Fix the memory optimization for intermediate tensor
Fix an issue with merging free chunks in the list of available_stack memory. This will make it easier to re-use the memory more efficiently. In addition, order the output tensors by decreasing size. Also add debug printing of the currently allocated and available chunks during the process.
1 parent 04c9691 commit 70da9f5

File tree

2 files changed

+161
-72
lines changed

2 files changed

+161
-72
lines changed

tmva/sofie/inc/TMVA/RModel.hxx

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,9 @@ private:
1717
int fVerbose = 0;
1818
int fBatchSize = -1;
1919
long fReadPos = 0; // reading file position
20-
size_t fConstantTensorSize = 0;
21-
size_t fWeightsTensorSize = 0;
22-
size_t fOtherTensorSize = 0;
20+
size_t fConstantTensorSize = 0; // size (in Bytes) of the allocated constant tensors
21+
size_t fWeightsTensorSize = 0; // size (in Bytes) of the allocated weight tensors
22+
size_t fOtherTensorSize = 0; // size (in Bytes) of intermediate tensors which are not managed by the memory pool
2323

2424
OptimizationLevel fOptimizationLevel = OptimizationLevel::kExtended;
2525

@@ -167,12 +167,17 @@ public:
167167

168168
void SetOptimizationLevel(const OptimizationLevel &optim_level) { fOptimizationLevel = optim_level; }
169169

170-
size_t GetConstantTensorSize() const { return fConstantTensorSize;}
171-
size_t GetWeightsTensorSize() const { return fWeightsTensorSize;}
172-
size_t GetOtherTensorSize() const { return fOtherTensorSize;}
170+
// get the size in bytes of the constant tensors
171+
size_t GetConstantTensorSize() const { return fConstantTensorSize; }
172+
// get the size in bytes of the weight tensors
173+
size_t GetWeightsTensorSize() const { return fWeightsTensorSize; }
174+
// get the size in bytes of the intermediate tensors which are not part of the memory pool
175+
size_t GetOtherTensorSize() const { return fOtherTensorSize; }
176+
// get the size in bytes of the intermediate tensors managed by the memory pool
173177
size_t GetIntermediateTensorSize() const {
174-
return (!fIntermediateMemoryInfo.total_stack.empty()) ?
175-
fIntermediateMemoryInfo.total_stack.rbegin()->first + fIntermediateMemoryInfo.total_stack.rbegin()->second.tensor_size : 0;
178+
return (!fIntermediateMemoryInfo.total_stack.empty())
179+
? fIntermediateMemoryInfo.total_stack.rbegin()->first + fIntermediateMemoryInfo.total_stack.rbegin()->second.tensor_size
180+
: 0;
176181
}
177182

178183
protected:

tmva/sofie/src/RModel.cxx

Lines changed: 148 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -330,6 +330,13 @@ std::string RModel::AllocateIntermediateMemory(std::span<const std::string_view>
330330
{
331331
std::stringstream code;
332332

333+
if (fVerbose) {
334+
std::cout << "Total chunks allocated\n";
335+
for (auto chunk = fIntermediateMemoryInfo.total_stack.begin(); chunk != fIntermediateMemoryInfo.total_stack.end(); ++chunk) {
336+
std::cout << "..... chunk " << chunk->first << " size " << chunk->second.tensor_size << " " << chunk->second.tensor_name << std::endl;
337+
}
338+
}
339+
333340
auto declareIntermediateTensor = [this, &code](std::string const &name, size_t size, size_t location) {
334341
std::string typeName = ConvertTypeToString(GetTensorType(name));
335342
code << "\n // Allocating memory for intermediate tensor " << name << " with size " << size << " bytes";
@@ -338,89 +345,161 @@ std::string RModel::AllocateIntermediateMemory(std::span<const std::string_view>
338345
<< "*>(fIntermediateMemoryPool.data() + " << location << ");\n";
339346
};
340347

348+
if (fVerbose) std::cout << "*** AllocateIntermediateMemory: Loop on op output tensors\n";
349+
// order output tensors by size
350+
std::vector<TensorMemoryInfo> ordered_output_tensors;
351+
341352
for (auto &it : op_output_tensors) {
342-
std::string name = std::string{it};
343-
bool allocated = false;
344-
if (GetTensorType(name) == ETensorType::BOOL ||
345-
fInitializedTensors.find(name) != fInitializedTensors.end() ||
346-
fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end()) continue;
353+
auto name = std::string(it);
354+
if (GetTensorType(name) == ETensorType::BOOL || fInitializedTensors.find(name) != fInitializedTensors.end() ||
355+
fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end())
356+
continue;
357+
358+
auto tensor_size = GetTypeSize(GetTensorType(name)) * ConvertShapeToLength(GetTensorShape(name));
359+
// important fill the pair in the ordered output tensors with the string view and not the string
360+
TensorMemoryInfo tmi = {it, tensor_size};
361+
ordered_output_tensors.push_back(tmi);
362+
}
363+
std::sort(ordered_output_tensors.begin(), ordered_output_tensors.end(),
364+
[](const TensorMemoryInfo &a, const TensorMemoryInfo &b) { return a.tensor_size > b.tensor_size; });
347365

348-
auto tensor_size = GetTypeSize(GetTensorType(name)) * ConvertShapeToLength(GetTensorShape(name));
366+
for (auto &it : ordered_output_tensors) {
367+
bool allocated = false;
368+
std::string name = std::string{it.tensor_name};
369+
size_t tensor_size = it.tensor_size;
370+
if (fVerbose)
371+
std::cout << "output tensor " << name << " size " << tensor_size << std::endl;
349372

350-
for (auto chunk = fIntermediateMemoryInfo.available_stack.begin(); chunk != fIntermediateMemoryInfo.available_stack.end(); ) {
373+
for (auto chunk = fIntermediateMemoryInfo.available_stack.begin();
374+
chunk != fIntermediateMemoryInfo.available_stack.end();) {
351375

352-
// check if available memory chunks can accommodate the tensor
353-
if (chunk->second >= tensor_size) {
354-
auto new_chunk = fIntermediateMemoryInfo.total_stack[chunk->first].split(it, tensor_size);
355-
auto new_chunk_location = chunk->first+chunk->second-tensor_size;
356-
fIntermediateMemoryInfo.total_stack[new_chunk_location] = new_chunk;
376+
if (fVerbose) std::cout << ".. available chunk " << chunk->first << " with size = " << chunk->second;
377+
// check if available memory chunks can accommodate the tensor
378+
if (chunk->second >= tensor_size) {
379+
// need to use here string_view (i.e it.tensor_name)
380+
// split returns the new chunk with size of new tensor. The free chunk is before the used one
381+
auto new_chunk = fIntermediateMemoryInfo.total_stack[chunk->first].split(it.tensor_name, tensor_size);
382+
auto new_chunk_location = chunk->first + chunk->second - tensor_size;
383+
fIntermediateMemoryInfo.total_stack[new_chunk_location] = new_chunk;
357384

358-
declareIntermediateTensor(name, tensor_size, new_chunk_location);
359-
chunk->second -= tensor_size;
385+
declareIntermediateTensor(name, tensor_size, new_chunk_location);
386+
chunk->second -= tensor_size;
360387

361-
allocated = true;
388+
allocated = true;
362389

363-
if (chunk->second == 0) {
364-
chunk = fIntermediateMemoryInfo.available_stack.erase(chunk);
365-
}
390+
if (fVerbose) std::cout << " is re-used and split in a new of size " << new_chunk.tensor_size << " at " << new_chunk_location;
366391

367-
break;
368-
}
369-
++chunk;
392+
if (chunk->second == 0) {
393+
if (fVerbose) std::cout << " and deleted since size matches";
394+
fIntermediateMemoryInfo.available_stack.erase(chunk);
370395
}
396+
if (fVerbose) std::cout << std::endl;
397+
break;
398+
} else if (chunk->first == fIntermediateMemoryInfo.available_stack.rbegin()->first &&
399+
fIntermediateMemoryInfo.total_stack.rbegin()->first == chunk->first) {
400+
// case last available chunk is the last in the memory, we can increase that one
401+
fIntermediateMemoryInfo.total_stack[chunk->first] = {it.tensor_name, tensor_size};
402+
declareIntermediateTensor(name, tensor_size, chunk->first);
403+
fIntermediateMemoryInfo.available_stack.erase(chunk);
404+
allocated = true;
405+
if (fVerbose) std::cout << " is extended with a bigger one of size " << tensor_size << std::endl;
406+
break;
407+
}
408+
++chunk;
409+
if (fVerbose) std::cout << std::endl;
410+
}
371411

372-
if (!allocated) {
373-
size_t chunk_idx = fIntermediateMemoryInfo.total_stack.empty()
374-
? 0
375-
: fIntermediateMemoryInfo.total_stack.rbegin()->first + fIntermediateMemoryInfo.total_stack.rbegin()->second.tensor_size;
412+
if (!allocated) {
413+
size_t chunk_idx = fIntermediateMemoryInfo.total_stack.empty()
414+
? 0
415+
: fIntermediateMemoryInfo.total_stack.rbegin()->first +
416+
fIntermediateMemoryInfo.total_stack.rbegin()->second.tensor_size;
376417

377-
fIntermediateMemoryInfo.total_stack[chunk_idx] = {it, tensor_size};
418+
fIntermediateMemoryInfo.total_stack[chunk_idx] = it;
378419

379-
declareIntermediateTensor(name, tensor_size, chunk_idx);
380-
}
420+
declareIntermediateTensor(name, tensor_size, chunk_idx);
421+
422+
if (fVerbose) std::cout << "no chunk available - add in total stack a new chunk with size of tensor and idx : " << chunk_idx
423+
<< std::endl;
424+
}
381425
}
382426
return code.str();
383427
}
384428

385429
void RModel::CheckAndFlushIntermediateMemory(std::span<const std::string_view> op_input_tensors, const size_t& op_idx){
386-
for (auto &it : op_input_tensors){
430+
if (fVerbose) std::cout << "*** CheckAndFlushIntermediateMemory: Loop on input tensors for op " << op_idx << "\n";
431+
//print available chunks
432+
if (fVerbose) std::cout << "available chunks before freeing them : \n";
433+
for (auto chunk = fIntermediateMemoryInfo.available_stack.begin();
434+
chunk != fIntermediateMemoryInfo.available_stack.end(); chunk++) {
435+
if (fVerbose) std::cout << "-- free chunk " << chunk->first << " size = " << chunk->second << std::endl;
436+
}
437+
for (auto &it : op_input_tensors) {
387438
// last occurence of the tensor is reached => flush it from memory
439+
if (fVerbose) std::cout << ".. input tensors : " << it;
388440
if (fIntermediateTensorFrequencyLookup[it] == op_idx) {
441+
if (fVerbose) std::cout << " flash condition is met - looping on chunks to find matching one \n";
389442
for (auto chunk = fIntermediateMemoryInfo.total_stack.begin();
390-
chunk != fIntermediateMemoryInfo.total_stack.end(); ++chunk ) {
391-
if (chunk->second.tensor_name == it) {
392-
393-
// check if nearby chunks in available memory can coalesce
394-
auto first_greater = fIntermediateMemoryInfo.available_stack.upper_bound(chunk->first); // smallest element greater than the flushed chunk idx
395-
auto last_smaller = (first_greater == fIntermediateMemoryInfo.available_stack.begin()) ? fIntermediateMemoryInfo.available_stack.end() : std::prev(first_greater); // largest element smaller than the flushed chunk idx
396-
397-
// check if the next stack entry is actually adjacent in memory
398-
if (last_smaller->first+last_smaller->second + 1 == chunk->first){
399-
last_smaller->second += chunk->second.tensor_size;
400-
fIntermediateMemoryInfo.total_stack[last_smaller->first].merge(chunk->second);
401-
402-
if (last_smaller->first + last_smaller->second + 1 == first_greater->first){
403-
fIntermediateMemoryInfo.total_stack[last_smaller->first].merge(fIntermediateMemoryInfo.total_stack[first_greater->first]);
404-
first_greater = fIntermediateMemoryInfo.available_stack.erase(first_greater);
405-
}
406-
} else{
407-
if (chunk->first + chunk->second.tensor_size + 1 == first_greater->first){
408-
fIntermediateMemoryInfo.total_stack[chunk->first].merge(fIntermediateMemoryInfo.total_stack[first_greater->first]);
409-
first_greater = fIntermediateMemoryInfo.available_stack.erase(first_greater);
410-
}
411-
fIntermediateMemoryInfo.available_stack.insert({
412-
chunk->first,
413-
chunk->second.tensor_size
414-
});
415-
}
443+
chunk != fIntermediateMemoryInfo.total_stack.end(); ++chunk) {
444+
if (fVerbose) std::cout << "--- chunk " << chunk->first << " , " << chunk->second.tensor_name << " size " << chunk->second.tensor_size;
445+
if (chunk->second.tensor_name == it) {
446+
if (fVerbose) std::cout << " -- Found chunk corresponding to input tensor: " << chunk->first;
447+
// check if nearby chunks in available memory can coalesce
448+
auto first_greater = fIntermediateMemoryInfo.available_stack.upper_bound(
449+
chunk->first); // smallest element greater than the flushed chunk idx
450+
auto last_smaller = (first_greater == fIntermediateMemoryInfo.available_stack.begin())
451+
? fIntermediateMemoryInfo.available_stack.end()
452+
: std::prev(first_greater); // largest element smaller than the flushed chunk idx
453+
454+
// check if the next stack entry is actually adjacent in memory
455+
456+
if (last_smaller != fIntermediateMemoryInfo.available_stack.end() &&
457+
last_smaller->first + last_smaller->second == chunk->first) {
458+
// merge chunk with previous one
459+
last_smaller->second += chunk->second.tensor_size;
460+
fIntermediateMemoryInfo.total_stack[last_smaller->first].merge(chunk->second);
461+
if (fVerbose) std::cout << " is adjacent in memory with previous one - merge ";
462+
if (first_greater != fIntermediateMemoryInfo.available_stack.end() &&
463+
last_smaller->first + last_smaller->second == first_greater->first) {
464+
// merge also with following one
465+
last_smaller->second += first_greater->second;
466+
fIntermediateMemoryInfo.total_stack[last_smaller->first].merge(
467+
fIntermediateMemoryInfo.total_stack[first_greater->first]);
468+
// delete merged one in available stack and in total stack
469+
fIntermediateMemoryInfo.total_stack.erase(first_greater->first);
470+
fIntermediateMemoryInfo.available_stack.erase(first_greater);
471+
if (fVerbose) std::cout << " merge also with following that is free ";
472+
}
473+
fIntermediateMemoryInfo.total_stack.erase(chunk->first);
474+
if (fVerbose) std::cout << std::endl;
475+
break;
476+
} else if (first_greater != fIntermediateMemoryInfo.available_stack.end() &&
477+
chunk->first + chunk->second.tensor_size == first_greater->first) {
478+
// merge with first greater
479+
if (fVerbose) std::cout << " is adjacent in memory with following one - merge \n";
480+
// cannot modify idx of first_greter. Insert a new one and delete previous one
481+
size_t new_size = chunk->second.tensor_size + first_greater->second;
482+
size_t first_greater_idx = first_greater->first;
483+
fIntermediateMemoryInfo.available_stack.erase(first_greater);
484+
// cannot use anymore first_greater
485+
fIntermediateMemoryInfo.available_stack.insert({chunk->first, new_size});
486+
fIntermediateMemoryInfo.total_stack[chunk->first].merge(
487+
fIntermediateMemoryInfo.total_stack[first_greater_idx]);
488+
fIntermediateMemoryInfo.total_stack.erase(first_greater_idx);
489+
} else {
490+
fIntermediateMemoryInfo.available_stack.insert({chunk->first, chunk->second.tensor_size});
491+
if (fVerbose) std::cout << " insert in the available stack the chunk with size " << chunk->second.tensor_size << std::endl;
416492
}
493+
chunk->second.tensor_name = "free";
494+
break;
495+
}
417496
}
497+
} else {
498+
if (fVerbose) std::cout << std::endl;
418499
}
419500
}
420501
}
421502

422-
423-
424503
void RModel::Initialize(int batchSize, bool verbose) {
425504
std::map<std::string, size_t> inputParams;
426505
if (batchSize > 0) {
@@ -609,12 +688,12 @@ void RModel::GenerateInitializedTensorInfo()
609688

610689
for (auto &i : fInitializedTensors) {
611690
if (!fUseWeightFile || i.second.IsConstantTensor()) {
612-
if (i.second.type() == ETensorType::FLOAT) {
691+
if (i.second.type() == ETensorType::FLOAT) {
613692
fGC += GenerateConstantTensorCode<float>(i);
614-
fConstantTensorSize += ConvertShapeToLength(i.second.shape())*4;
693+
fConstantTensorSize += ConvertShapeToLength(i.second.shape()) * 4;
615694
} else if (i.second.type() == ETensorType::INT64) {
616695
fGC += GenerateConstantTensorCode<int64_t>(i);
617-
fConstantTensorSize += ConvertShapeToLength(i.second.shape())*8;
696+
fConstantTensorSize += ConvertShapeToLength(i.second.shape()) * 8;
618697
}
619698

620699
} else {
@@ -623,7 +702,7 @@ void RModel::GenerateInitializedTensorInfo()
623702
if (i.second.type() == ETensorType::FLOAT) {
624703
fGC += "std::vector<float> fTensor_" + i.first + " = std::vector<float>(" + std::to_string(length) + ");\n";
625704
fGC += "float * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n";
626-
fWeightsTensorSize += ConvertShapeToLength(i.second.shape())*4;
705+
fWeightsTensorSize += ConvertShapeToLength(i.second.shape()) * 4;
627706
}
628707
}
629708
}
@@ -661,17 +740,17 @@ void RModel::GenerateIntermediateTensorInfo() {
661740
if (i.second.type == ETensorType::FLOAT) {
662741
tensor_declaration_block += "std::vector<float> fTensor_" + i.first + " = std::vector<float>(" + std::to_string(length) + ");\n";
663742
tensor_declaration_block += "float * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n";
664-
fOtherTensorSize += 4*length;
743+
fOtherTensorSize += 4 * length;
665744
}
666745
else if (i.second.type == ETensorType::DOUBLE) {
667746
tensor_declaration_block += "std::vector<double> fTensor_" + i.first + " = std::vector<double>(" + std::to_string(length) + ");\n";
668747
tensor_declaration_block += "double * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n";
669-
fOtherTensorSize += 8*length;
748+
fOtherTensorSize += 8 * length;
670749
}
671750
else if (i.second.type == ETensorType::INT64) {
672751
tensor_declaration_block += "std::vector<int64_t> fTensor_" + i.first + " = std::vector<int64_t>(" + std::to_string(length) + ");\n";
673752
tensor_declaration_block += "int64_t * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n";
674-
fOtherTensorSize += 8*length;
753+
fOtherTensorSize += 8 * length;
675754
}
676755
}
677756
}
@@ -853,6 +932,11 @@ void RModel::GenerateSessionCode()
853932
std::string intermediate_memory_alloc_string = "";
854933
intermediate_memory_alloc_string += "\n// --- Positioning intermediate tensor memory --";
855934
for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) {
935+
if (fVerbose) {
936+
auto op = fOperators[op_idx].get();
937+
std::cout << "\n******************\n analyzing input/output operator " << op_idx << " "
938+
<< typeid(*op).name() << std::endl;
939+
}
856940
intermediate_memory_alloc_string += AllocateIntermediateMemory(fOperators[op_idx]->GetOpOutputTensors());
857941
CheckAndFlushIntermediateMemory(fOperators[op_idx]->GetOpInputTensors(), op_idx);
858942
}

0 commit comments

Comments
 (0)