@@ -330,6 +330,13 @@ std::string RModel::AllocateIntermediateMemory(std::span<const std::string_view>
{
   std::stringstream code;

+   if (fVerbose) {
+      std::cout << " Total chunks allocated\n";
+      for (auto chunk = fIntermediateMemoryInfo.total_stack.begin(); chunk != fIntermediateMemoryInfo.total_stack.end(); ++chunk) {
+         std::cout << " ..... chunk " << chunk->first << " size " << chunk->second.tensor_size << " " << chunk->second.tensor_name << std::endl;
+      }
+   }
+
   auto declareIntermediateTensor = [this, &code](std::string const &name, size_t size, size_t location) {
      std::string typeName = ConvertTypeToString(GetTensorType(name));
      code << "\n// Allocating memory for intermediate tensor " << name << " with size " << size << " bytes";
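Note on the emitted code: the body of the declareIntermediateTensor lambda is split between this hunk and the next, but the visible fragments (the typeName lookup here, the "*>(fIntermediateMemoryPool.data() + location)" tail below) suggest that for each pooled tensor the generated session code contains a declaration of roughly the following form. Tensor name, type, size and offset below are made up for illustration:

// Allocating memory for intermediate tensor output0 with size 4096 bytes
float * tensor_output0 = reinterpret_cast<float*>(fIntermediateMemoryPool.data() + 1024);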
@@ -338,89 +345,161 @@ std::string RModel::AllocateIntermediateMemory(std::span<const std::string_view>
           << " *>(fIntermediateMemoryPool.data() + " << location << ");\n";
   };

+   if (fVerbose) std::cout << " *** AllocateIntermediateMemory: Loop on op output tensors\n";
+   // order output tensors by size
+   std::vector<TensorMemoryInfo> ordered_output_tensors;
+
   for (auto &it : op_output_tensors) {
-      std::string name = std::string{it};
-      bool allocated = false;
-      if (GetTensorType(name) == ETensorType::BOOL ||
-          fInitializedTensors.find(name) != fInitializedTensors.end() ||
-          fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end()) continue;
+      auto name = std::string(it);
+      if (GetTensorType(name) == ETensorType::BOOL || fInitializedTensors.find(name) != fInitializedTensors.end() ||
+          fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end())
+         continue;
+
+      auto tensor_size = GetTypeSize(GetTensorType(name)) * ConvertShapeToLength(GetTensorShape(name));
+      // important: fill the pair in ordered_output_tensors with the string_view, not the string
+      TensorMemoryInfo tmi = {it, tensor_size};
+      ordered_output_tensors.push_back(tmi);
+   }
+   std::sort(ordered_output_tensors.begin(), ordered_output_tensors.end(),
+             [](const TensorMemoryInfo &a, const TensorMemoryInfo &b) { return a.tensor_size > b.tensor_size; });

-      auto tensor_size = GetTypeSize(GetTensorType(name)) * ConvertShapeToLength(GetTensorShape(name));
+   for (auto &it : ordered_output_tensors) {
+      bool allocated = false;
+      std::string name = std::string{it.tensor_name};
+      size_t tensor_size = it.tensor_size;
+      if (fVerbose)
+         std::cout << " output tensor " << name << " size " << tensor_size << std::endl;

-      for (auto chunk = fIntermediateMemoryInfo.available_stack.begin(); chunk != fIntermediateMemoryInfo.available_stack.end(); ) {
+      for (auto chunk = fIntermediateMemoryInfo.available_stack.begin();
+           chunk != fIntermediateMemoryInfo.available_stack.end();) {

-         // check if available memory chunks can accommodate the tensor
-         if (chunk->second >= tensor_size) {
-            auto new_chunk = fIntermediateMemoryInfo.total_stack[chunk->first].split(it, tensor_size);
-            auto new_chunk_location = chunk->first +chunk->second -tensor_size;
-            fIntermediateMemoryInfo.total_stack[new_chunk_location] = new_chunk;
+         if (fVerbose) std::cout << " .. available chunk " << chunk->first << " with size = " << chunk->second;
+         // check if available memory chunks can accommodate the tensor
+         if (chunk->second >= tensor_size) {
+            // need to use the string_view here (i.e. it.tensor_name)
+            // split returns the new chunk with the size of the new tensor; the free chunk sits before the used one
+            auto new_chunk = fIntermediateMemoryInfo.total_stack[chunk->first].split(it.tensor_name, tensor_size);
+            auto new_chunk_location = chunk->first + chunk->second - tensor_size;
+            fIntermediateMemoryInfo.total_stack[new_chunk_location] = new_chunk;

-            declareIntermediateTensor(name, tensor_size, new_chunk_location);
-            chunk->second -= tensor_size;
+            declareIntermediateTensor(name, tensor_size, new_chunk_location);
+            chunk->second -= tensor_size;

-            allocated = true;
+            allocated = true;

-            if (chunk->second == 0) {
-               chunk = fIntermediateMemoryInfo.available_stack.erase(chunk);
-            }
+            if (fVerbose) std::cout << " is re-used and split in a new chunk of size " << new_chunk.tensor_size << " at " << new_chunk_location;

-            break;
-         }
-         ++chunk;
+            if (chunk->second == 0) {
+               if (fVerbose) std::cout << " and deleted since size matches ";
+               fIntermediateMemoryInfo.available_stack.erase(chunk);
            }
+            if (fVerbose) std::cout << std::endl;
+            break;
+         } else if (chunk->first == fIntermediateMemoryInfo.available_stack.rbegin()->first &&
+                    fIntermediateMemoryInfo.total_stack.rbegin()->first == chunk->first) {
+            // case where the last available chunk is also the last one in memory: we can grow it
+            fIntermediateMemoryInfo.total_stack[chunk->first] = {it.tensor_name, tensor_size};
+            declareIntermediateTensor(name, tensor_size, chunk->first);
+            fIntermediateMemoryInfo.available_stack.erase(chunk);
+            allocated = true;
+            if (fVerbose) std::cout << " is extended with a bigger one of size " << tensor_size << std::endl;
+            break;
+         }
+         ++chunk;
+         if (fVerbose) std::cout << std::endl;
+      }

-      if (!allocated) {
-         size_t chunk_idx = fIntermediateMemoryInfo.total_stack.empty()
-                               ? 0
-                               : fIntermediateMemoryInfo.total_stack.rbegin()->first + fIntermediateMemoryInfo.total_stack.rbegin()->second.tensor_size;
+      if (!allocated) {
+         size_t chunk_idx = fIntermediateMemoryInfo.total_stack.empty()
+                               ? 0
+                               : fIntermediateMemoryInfo.total_stack.rbegin()->first +
+                                    fIntermediateMemoryInfo.total_stack.rbegin()->second.tensor_size;

-         fIntermediateMemoryInfo.total_stack[chunk_idx] = {it, tensor_size};
+         fIntermediateMemoryInfo.total_stack[chunk_idx] = it;

-         declareIntermediateTensor(name, tensor_size, chunk_idx);
-      }
+         declareIntermediateTensor(name, tensor_size, chunk_idx);
+
+         if (fVerbose) std::cout << " no chunk available - add in the total stack a new chunk with the size of the tensor, at idx " << chunk_idx
+                                 << std::endl;
+      }
   }
   return code.str();
}

void RModel::CheckAndFlushIntermediateMemory(std::span<const std::string_view> op_input_tensors, const size_t &op_idx){
-   for (auto &it : op_input_tensors){
+   if (fVerbose) std::cout << " *** CheckAndFlushIntermediateMemory: Loop on input tensors for op " << op_idx << "\n";
+   // print the available chunks
+   if (fVerbose) std::cout << " available chunks before freeing them : \n";
+   for (auto chunk = fIntermediateMemoryInfo.available_stack.begin();
+        chunk != fIntermediateMemoryInfo.available_stack.end(); chunk++) {
+      if (fVerbose) std::cout << " -- free chunk " << chunk->first << " size = " << chunk->second << std::endl;
+   }
+   for (auto &it : op_input_tensors) {
      // last occurrence of the tensor is reached => flush it from memory
+      if (fVerbose) std::cout << " .. input tensor : " << it;
      if (fIntermediateTensorFrequencyLookup[it] == op_idx) {
+         if (fVerbose) std::cout << " flush condition is met - looping on chunks to find the matching one \n";
         for (auto chunk = fIntermediateMemoryInfo.total_stack.begin();
-              chunk != fIntermediateMemoryInfo.total_stack.end(); ++chunk ) {
-            if (chunk->second.tensor_name == it) {
-
-               // check if nearby chunks in available memory can coalesce
-               auto first_greater = fIntermediateMemoryInfo.available_stack.upper_bound(chunk->first); // smallest element greater than the flushed chunk idx
-               auto last_smaller = (first_greater == fIntermediateMemoryInfo.available_stack.begin()) ? fIntermediateMemoryInfo.available_stack.end() : std::prev(first_greater); // largest element smaller than the flushed chunk idx
-
-               // check if the next stack entry is actually adjacent in memory
-               if (last_smaller->first +last_smaller->second + 1 == chunk->first){
-                  last_smaller->second += chunk->second.tensor_size;
-                  fIntermediateMemoryInfo.total_stack[last_smaller->first].merge(chunk->second);
-
-                  if (last_smaller->first + last_smaller->second + 1 == first_greater->first){
-                     fIntermediateMemoryInfo.total_stack[last_smaller->first].merge(fIntermediateMemoryInfo.total_stack[first_greater->first]);
-                     first_greater = fIntermediateMemoryInfo.available_stack.erase(first_greater);
-                  }
-               } else {
-                  if (chunk->first + chunk->second.tensor_size + 1 == first_greater->first){
-                     fIntermediateMemoryInfo.total_stack[chunk->first].merge(fIntermediateMemoryInfo.total_stack[first_greater->first]);
-                     first_greater = fIntermediateMemoryInfo.available_stack.erase(first_greater);
-                  }
-                  fIntermediateMemoryInfo.available_stack.insert({
-                     chunk->first,
-                     chunk->second.tensor_size
-                  });
-               }
+              chunk != fIntermediateMemoryInfo.total_stack.end(); ++chunk) {
+            if (fVerbose) std::cout << " --- chunk " << chunk->first << " , " << chunk->second.tensor_name << " size " << chunk->second.tensor_size;
+            if (chunk->second.tensor_name == it) {
+               if (fVerbose) std::cout << " -- Found chunk corresponding to input tensor: " << chunk->first;
+               // check if nearby chunks in available memory can coalesce
+               auto first_greater = fIntermediateMemoryInfo.available_stack.upper_bound(
+                  chunk->first); // smallest element greater than the flushed chunk idx
+               auto last_smaller = (first_greater == fIntermediateMemoryInfo.available_stack.begin())
+                                      ? fIntermediateMemoryInfo.available_stack.end()
+                                      : std::prev(first_greater); // largest element smaller than the flushed chunk idx
+
+               // check if the next stack entry is actually adjacent in memory
+
+               if (last_smaller != fIntermediateMemoryInfo.available_stack.end() &&
+                   last_smaller->first + last_smaller->second == chunk->first) {
+                  // merge the chunk with the previous one
+                  last_smaller->second += chunk->second.tensor_size;
+                  fIntermediateMemoryInfo.total_stack[last_smaller->first].merge(chunk->second);
+                  if (fVerbose) std::cout << " is adjacent in memory with previous one - merge ";
+                  if (first_greater != fIntermediateMemoryInfo.available_stack.end() &&
+                      last_smaller->first + last_smaller->second == first_greater->first) {
+                     // merge also with the following one
+                     last_smaller->second += first_greater->second;
+                     fIntermediateMemoryInfo.total_stack[last_smaller->first].merge(
+                        fIntermediateMemoryInfo.total_stack[first_greater->first]);
+                     // delete the merged chunk from the available stack and from the total stack
+                     fIntermediateMemoryInfo.total_stack.erase(first_greater->first);
+                     fIntermediateMemoryInfo.available_stack.erase(first_greater);
+                     if (fVerbose) std::cout << " merge also with following one that is free ";
+                  }
+                  fIntermediateMemoryInfo.total_stack.erase(chunk->first);
+                  if (fVerbose) std::cout << std::endl;
+                  break;
+               } else if (first_greater != fIntermediateMemoryInfo.available_stack.end() &&
+                          chunk->first + chunk->second.tensor_size == first_greater->first) {
+                  // merge with first_greater
+                  if (fVerbose) std::cout << " is adjacent in memory with following one - merge \n";
+                  // cannot modify the idx of first_greater: insert a new entry and delete the previous one
+                  size_t new_size = chunk->second.tensor_size + first_greater->second;
+                  size_t first_greater_idx = first_greater->first;
+                  fIntermediateMemoryInfo.available_stack.erase(first_greater);
+                  // first_greater cannot be used any more
+                  fIntermediateMemoryInfo.available_stack.insert({chunk->first, new_size});
+                  fIntermediateMemoryInfo.total_stack[chunk->first].merge(
+                     fIntermediateMemoryInfo.total_stack[first_greater_idx]);
+                  fIntermediateMemoryInfo.total_stack.erase(first_greater_idx);
+               } else {
+                  fIntermediateMemoryInfo.available_stack.insert({chunk->first, chunk->second.tensor_size});
+                  if (fVerbose) std::cout << " insert in the available stack the chunk with size " << chunk->second.tensor_size << std::endl;
               }
+               chunk->second.tensor_name = "free";
+               break;
+            }
         }
+      } else {
+         if (fVerbose) std::cout << std::endl;
      }
   }
}

-
-
void RModel::Initialize(int batchSize, bool verbose) {
   std::map<std::string, size_t> inputParams;
   if (batchSize > 0) {
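The bookkeeping behind the two functions above is an offset-ordered map of chunks: total_stack records every chunk of the pool (used or free) and available_stack maps the offset of each free chunk to its size. Allocation takes the tail of a fitting free chunk (or appends/grows at the pool end), and flushing returns a chunk to available_stack and coalesces it with adjacent free neighbours. Note the corrected adjacency test: a free chunk at offset 0 with size 64 ends exactly at offset 64, so `last_smaller->first + last_smaller->second == chunk->first` is the right check, while the removed variant with the extra `+ 1` could never match. Below is a minimal standalone sketch of that idea, not the RModel code itself: offsets and sizes only, illustrative names, and without the special case above that grows the last free chunk in place.

#include <cstddef>
#include <iterator>
#include <map>

// Toy pool bookkeeping: free_chunks maps pool offset -> size of the free chunk.
struct PoolSketch {
   std::map<std::size_t, std::size_t> free_chunks;
   std::size_t pool_end = 0; // current total pool size

   // Reserve `size` bytes: reuse the tail of the first fitting free chunk,
   // otherwise append at the end of the pool. Returns the chosen offset.
   std::size_t allocate(std::size_t size) {
      for (auto it = free_chunks.begin(); it != free_chunks.end(); ++it) {
         if (it->second >= size) {
            std::size_t offset = it->first + it->second - size; // use the tail of the chunk
            it->second -= size;
            if (it->second == 0)
               free_chunks.erase(it);
            return offset;
         }
      }
      std::size_t offset = pool_end;
      pool_end += size;
      return offset;
   }

   // Return a chunk to the free list and coalesce it with adjacent free neighbours.
   void release(std::size_t offset, std::size_t size) {
      auto next = free_chunks.upper_bound(offset);
      auto prev = (next == free_chunks.begin()) ? free_chunks.end() : std::prev(next);
      if (prev != free_chunks.end() && prev->first + prev->second == offset) {
         prev->second += size; // merge with the preceding free chunk
         if (next != free_chunks.end() && prev->first + prev->second == next->first) {
            prev->second += next->second; // and with the following one
            free_chunks.erase(next);
         }
      } else if (next != free_chunks.end() && offset + size == next->first) {
         std::size_t merged = size + next->second; // merge with the following free chunk
         free_chunks.erase(next);
         free_chunks[offset] = merged;
      } else {
         free_chunks[offset] = size; // isolated free chunk
      }
   }
};

Keeping the free chunks in a std::map ordered by offset is what makes upper_bound/std::prev give the two potential neighbours directly, mirroring the first_greater/last_smaller lookup in the commit.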
@@ -609,12 +688,12 @@ void RModel::GenerateInitializedTensorInfo()

   for (auto &i : fInitializedTensors) {
      if (!fUseWeightFile || i.second.IsConstantTensor()) {
-         if (i.second.type() == ETensorType::FLOAT) {
+         if (i.second.type() == ETensorType::FLOAT) {
            fGC += GenerateConstantTensorCode<float>(i);
-            fConstantTensorSize += ConvertShapeToLength(i.second.shape())* 4;
+            fConstantTensorSize += ConvertShapeToLength(i.second.shape()) * 4;
         } else if (i.second.type() == ETensorType::INT64) {
            fGC += GenerateConstantTensorCode<int64_t>(i);
-            fConstantTensorSize += ConvertShapeToLength(i.second.shape())* 8;
+            fConstantTensorSize += ConvertShapeToLength(i.second.shape()) * 8;
         }

      } else {
@@ -623,7 +702,7 @@ void RModel::GenerateInitializedTensorInfo()
         if (i.second.type() == ETensorType::FLOAT) {
            fGC += "std::vector<float> fTensor_" + i.first + " = std::vector<float>(" + std::to_string(length) + ");\n";
            fGC += "float * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n";
-            fWeightsTensorSize += ConvertShapeToLength(i.second.shape())* 4;
+            fWeightsTensorSize += ConvertShapeToLength(i.second.shape()) * 4;
         }
      }
   }
@@ -661,17 +740,17 @@ void RModel::GenerateIntermediateTensorInfo() {
      if (i.second.type == ETensorType::FLOAT) {
         tensor_declaration_block += "std::vector<float> fTensor_" + i.first + " = std::vector<float>(" + std::to_string(length) + ");\n";
         tensor_declaration_block += "float * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n";
-         fOtherTensorSize += 4 * length;
+         fOtherTensorSize += 4 * length;
      }
      else if (i.second.type == ETensorType::DOUBLE) {
         tensor_declaration_block += "std::vector<double> fTensor_" + i.first + " = std::vector<double>(" + std::to_string(length) + ");\n";
         tensor_declaration_block += "double * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n";
-         fOtherTensorSize += 8 * length;
+         fOtherTensorSize += 8 * length;
      }
      else if (i.second.type == ETensorType::INT64) {
         tensor_declaration_block += "std::vector<int64_t> fTensor_" + i.first + " = std::vector<int64_t>(" + std::to_string(length) + ");\n";
         tensor_declaration_block += "int64_t * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n";
-         fOtherTensorSize += 8 * length;
+         fOtherTensorSize += 8 * length;
      }
   }
}
@@ -853,6 +932,11 @@ void RModel::GenerateSessionCode()
   std::string intermediate_memory_alloc_string = "";
   intermediate_memory_alloc_string += "\n// --- Positioning intermediate tensor memory --";
   for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) {
+      if (fVerbose) {
+         auto op = fOperators[op_idx].get();
+         std::cout << "\n******************\nanalyzing input/output operator " << op_idx << " "
+                   << typeid(*op).name() << std::endl;
+      }
      intermediate_memory_alloc_string += AllocateIntermediateMemory(fOperators[op_idx]->GetOpOutputTensors());
      CheckAndFlushIntermediateMemory(fOperators[op_idx]->GetOpInputTensors(), op_idx);
   }
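For completeness, a rough usage sketch (not part of this commit) of how the verbose tracing added above would be exercised. It assumes the usual SOFIE entry points (RModelParser_ONNX::Parse, RModel::Generate, RModel::OutputGenerated) and that fVerbose is driven by the verbose argument of RModel::Initialize shown earlier in this diff; the model file name is hypothetical.

#include "TMVA/RModel.hxx"
#include "TMVA/RModelParser_ONNX.hxx"

int main() {
   TMVA::Experimental::SOFIE::RModelParser_ONNX parser;
   auto model = parser.Parse("model.onnx");   // hypothetical ONNX file
   model.Initialize(1, /*verbose=*/true);     // batch size 1, enable the fVerbose printouts
   model.Generate();                          // GenerateSessionCode drives AllocateIntermediateMemory / CheckAndFlushIntermediateMemory
   model.OutputGenerated("model.hxx");
   return 0;
}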