Skip to content

Commit e7df14d

Browse files
ai-edge-bot and copybara-github
authored and committed
Try to reuse the mmapped quantization scale to avoid copying it
We can't optimize zero points, since the flatbuffer defines zero_point as long (int64_t) while TfLiteIntArray uses int32_t.
PiperOrigin-RevId: 847795250
1 parent 63806ac commit e7df14d

File tree

12 files changed

+84
-36
lines changed

12 files changed

+84
-36
lines changed

tflite/core/c/c_api_test.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1610,8 +1610,8 @@ TEST(CApiSimple, OpaqueApiAccessors) {
16101610
TfLiteQuantization new_quantization{};
16111611
new_quantization.type = kTfLiteAffineQuantization;
16121612
TfLiteAffineQuantization* affine_quant =
1613-
(TfLiteAffineQuantization*)malloc(
1614-
sizeof(TfLiteAffineQuantization));
1613+
(TfLiteAffineQuantization*)calloc(
1614+
1, sizeof(TfLiteAffineQuantization));
16151615
affine_quant->scale = TfLiteFloatArrayCreate(1);
16161616
affine_quant->zero_point = TfLiteIntArrayCreate(1);
16171617
new_quantization.params = affine_quant;

tflite/core/c/common.cc

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -241,11 +241,17 @@ void TfLiteQuantizationFree(TfLiteQuantization* quantization) {
241241
if (quantization->type == kTfLiteAffineQuantization) {
242242
TfLiteAffineQuantization* q_params =
243243
reinterpret_cast<TfLiteAffineQuantization*>(quantization->params);
244-
if (q_params->scale) {
244+
// Only free arrays that are owned (not borrowed from mmap).
245+
// When flag is clear (0), the array is owned and must be freed.
246+
// When flag is set (1), the array is borrowed from mmap and must NOT be
247+
// freed.
248+
if (q_params->scale &&
249+
!(q_params->ownership_flags & kTfLiteQuantizationScaleBorrowed)) {
245250
TfLiteFloatArrayFree(q_params->scale);
246251
q_params->scale = nullptr;
247252
}
248-
if (q_params->zero_point) {
253+
if (q_params->zero_point &&
254+
!(q_params->ownership_flags & kTfLiteQuantizationZeroPointBorrowed)) {
249255
TfLiteIntArrayFree(q_params->zero_point);
250256
q_params->zero_point = nullptr;
251257
}

tflite/core/c/common.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -357,8 +357,24 @@ typedef struct TfLiteAffineQuantization {
357357
TfLiteFloatArray* scale;
358358
TfLiteIntArray* zero_point;
359359
int32_t quantized_dimension;
360+
// Bit flags for ownership tracking to enable zero-copy quantization.
361+
// When a flag is set, the corresponding array points to memory-mapped (mmap)
362+
// data and must NOT be freed. When clear, the array is heap-allocated and
363+
// must be freed. Zero-initialization (via calloc) means "owned" by default.
364+
// Note: scale and zero_point may have different ownership - scale can be
365+
// borrowed from mmap while zero_point must be copied (int64->int32
366+
// conversion).
367+
uint8_t ownership_flags; // See kTfLiteQuantization* constants below
360368
} TfLiteAffineQuantization;
361369

370+
/// Bit flags for TfLiteAffineQuantization::ownership_flags.
371+
/// When set, the array is borrowed (from mmap) and must NOT be freed.
372+
/// When clear, the array is owned (heap-allocated) and must be freed.
373+
enum {
374+
kTfLiteQuantizationScaleBorrowed = (1 << 0),
375+
kTfLiteQuantizationZeroPointBorrowed = (1 << 1),
376+
};
377+
362378
/// Parameters for blockwise quantization across the output channels dimension.
363379
/// For a particular value in quantized_dimension, quantized values can be
364380
/// converted back to float using:

tflite/core/c/common_test.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ TEST(Quantization, TestQuantizationFree) {
142142
t.quantization.type = kTfLiteAffineQuantization;
143143
t.sparsity = nullptr;
144144
auto* params = reinterpret_cast<TfLiteAffineQuantization*>(
145-
malloc(sizeof(TfLiteAffineQuantization)));
145+
calloc(1, sizeof(TfLiteAffineQuantization)));
146146
params->scale = TfLiteFloatArrayCreate(3);
147147
params->zero_point = TfLiteIntArrayCreate(3);
148148
t.quantization.params = reinterpret_cast<void*>(params);
@@ -907,7 +907,7 @@ TEST(TensorCloneTest, CloneATensorAttributes) {
907907
auto dims_signature_data = BuildTfLiteArray<int>({11, 12, 13});
908908
TfLiteAffineQuantization* affine_quantization =
909909
reinterpret_cast<TfLiteAffineQuantization*>(
910-
malloc(sizeof(TfLiteAffineQuantization)));
910+
calloc(1, sizeof(TfLiteAffineQuantization)));
911911
affine_quantization->scale = BuildTfLiteArray<float>({7, 8, 9}).release();
912912
affine_quantization->zero_point = BuildTfLiteArray({4, 5, 6}).release();
913913
affine_quantization->quantized_dimension = 34;

tflite/core/interpreter_builder.cc

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -488,10 +488,36 @@ TfLiteStatus InterpreterBuilder::ParseQuantization(
488488
quantization->type = kTfLiteAffineQuantization;
489489
auto* affine_quantization = reinterpret_cast<TfLiteAffineQuantization*>(
490490
malloc(sizeof(TfLiteAffineQuantization)));
491-
affine_quantization->scale = TfLiteFloatArrayCreate(num_scales);
492-
for (size_t i = 0; i < num_scales; ++i) {
493-
affine_quantization->scale->data[i] = src_quantization->scale()->Get(i);
491+
// Memory optimization: When the model is mmap-backed, avoid copying scale
492+
// data by directly referencing the flatbuffer's scale array. The memory
493+
// layout of flatbuffers::Vector<float> ([uint32_t length][float data[]]) is
494+
// compatible with TfLiteFloatArray ([int size][float data[]]) on platforms
495+
// where sizeof(int) == sizeof(uint32_t).
496+
//
497+
// Note: zero_point cannot use this optimization because the flatbuffer uses
498+
// int64_t but TfLiteIntArray uses int32_t, requiring conversion.
499+
const bool can_borrow_scale =
500+
allocation_ != nullptr &&
501+
allocation_->type() == Allocation::Type::kMMap &&
502+
sizeof(int) == sizeof(uint32_t);
503+
504+
// Initialize ownership_flags to 0 (all arrays owned by default).
505+
affine_quantization->ownership_flags = 0;
506+
if (can_borrow_scale) {
507+
// Borrow scale directly from mmap'd flatbuffer (zero-copy).
508+
// The flatbuffers::Vector<float> pointer points to the length field,
509+
// followed by the float data - matching TfLiteFloatArray layout.
510+
affine_quantization->scale = const_cast<TfLiteFloatArray*>(
511+
reinterpret_cast<const TfLiteFloatArray*>(src_quantization->scale()));
512+
affine_quantization->ownership_flags |= kTfLiteQuantizationScaleBorrowed;
513+
} else {
514+
// Copy scale data to heap (original behavior).
515+
affine_quantization->scale = TfLiteFloatArrayCreate(num_scales);
516+
for (size_t i = 0; i < num_scales; ++i) {
517+
affine_quantization->scale->data[i] = src_quantization->scale()->Get(i);
518+
}
494519
}
520+
// Zero point must always be copied due to int64_t -> int32_t conversion.
495521
if (all_zero_points_same) {
496522
affine_quantization->zero_point = TfLiteIntArrayCreate(1);
497523
affine_quantization->zero_point->data[0] = zero_point;

tflite/core/model_building.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ TfLiteQuantization ToTfLiteQuantization(Quantization quantization) {
219219
Overload([&q](NoQuantization) { q.type = kTfLiteNoQuantization; },
220220
[&q](const AffineQuantization& src) {
221221
q.type = kTfLiteAffineQuantization;
222-
q.params = calloc(sizeof(TfLiteAffineQuantization), 1);
222+
q.params = calloc(1, sizeof(TfLiteAffineQuantization));
223223
TfLiteAffineQuantization& qa =
224224
*reinterpret_cast<TfLiteAffineQuantization*>(q.params);
225225
qa.quantized_dimension = src.axis;

tflite/delegates/delegate_test.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ TEST_F(TestDelegate, StaticDelegateMakesGraphImmutable) {
164164
TfLiteQuantization quant = {};
165165
quant.type = kTfLiteAffineQuantization;
166166
auto quant_params = static_cast<TfLiteAffineQuantization*>(
167-
malloc(sizeof(TfLiteAffineQuantization)));
167+
calloc(1, sizeof(TfLiteAffineQuantization)));
168168
quant_params->scale = nullptr;
169169
quant_params->zero_point = nullptr;
170170
quant_params->quantized_dimension = 0;

tflite/delegates/gpu/common/model_builder_test.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1168,7 +1168,7 @@ class InterpreterQuantized : public DelegatedInterpreter {
11681168
TfLiteQuantization rw_quantization;
11691169
rw_quantization.type = kTfLiteAffineQuantization;
11701170
auto* rw_affine_quantization = static_cast<TfLiteAffineQuantization*>(
1171-
malloc(sizeof(TfLiteAffineQuantization)));
1171+
calloc(1, sizeof(TfLiteAffineQuantization)));
11721172
rw_affine_quantization->scale = TfLiteFloatArrayCreate(1);
11731173
rw_affine_quantization->zero_point = TfLiteIntArrayCreate(1);
11741174
rw_affine_quantization->scale->data[0] = scale;

tflite/interpreter_test.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -231,7 +231,7 @@ TEST(BasicInterpreter, CheckQuantization) {
231231
TfLiteQuantization rw_quantization;
232232
rw_quantization.type = kTfLiteAffineQuantization;
233233
auto* rw_affine_quantization = static_cast<TfLiteAffineQuantization*>(
234-
malloc(sizeof(TfLiteAffineQuantization)));
234+
calloc(1, sizeof(TfLiteAffineQuantization)));
235235
rw_affine_quantization->scale = TfLiteFloatArrayCreate(1);
236236
rw_affine_quantization->zero_point = TfLiteIntArrayCreate(1);
237237
rw_affine_quantization->scale->data[0] = scale;
@@ -241,7 +241,7 @@ TEST(BasicInterpreter, CheckQuantization) {
241241
TfLiteQuantization ro_quantization;
242242
ro_quantization.type = kTfLiteAffineQuantization;
243243
auto* ro_affine_quantization = static_cast<TfLiteAffineQuantization*>(
244-
malloc(sizeof(TfLiteAffineQuantization)));
244+
calloc(1, sizeof(TfLiteAffineQuantization)));
245245
ro_affine_quantization->scale = TfLiteFloatArrayCreate(1);
246246
ro_affine_quantization->zero_point = TfLiteIntArrayCreate(1);
247247
ro_affine_quantization->scale->data[0] = scale;

tflite/kernels/batch_matmul.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -662,7 +662,7 @@ TfLiteTensor* GetTempRhs(TfLiteContext* context, TfLiteNode* node,
662662
free(transposed_rhs->quantization.params);
663663
}
664664
transposed_rhs->quantization.params =
665-
malloc(sizeof(TfLiteAffineQuantization));
665+
calloc(1, sizeof(TfLiteAffineQuantization));
666666
const auto* rhs_affine_quantization =
667667
reinterpret_cast<TfLiteAffineQuantization*>(rhs->quantization.params);
668668
auto* transposed_rhs_affine_quantization =

0 commit comments

Comments
 (0)