diff --git a/tflite/core/c/c_api_test.cc b/tflite/core/c/c_api_test.cc index d48ea5a2e0..eedbcaa17f 100644 --- a/tflite/core/c/c_api_test.cc +++ b/tflite/core/c/c_api_test.cc @@ -1610,8 +1610,8 @@ TEST(CApiSimple, OpaqueApiAccessors) { TfLiteQuantization new_quantization{}; new_quantization.type = kTfLiteAffineQuantization; TfLiteAffineQuantization* affine_quant = - (TfLiteAffineQuantization*)malloc( - sizeof(TfLiteAffineQuantization)); + (TfLiteAffineQuantization*)calloc( + 1, sizeof(TfLiteAffineQuantization)); affine_quant->scale = TfLiteFloatArrayCreate(1); affine_quant->zero_point = TfLiteIntArrayCreate(1); new_quantization.params = affine_quant; diff --git a/tflite/core/c/common.cc b/tflite/core/c/common.cc index 43005846f0..5d8f5363a5 100644 --- a/tflite/core/c/common.cc +++ b/tflite/core/c/common.cc @@ -241,11 +241,17 @@ void TfLiteQuantizationFree(TfLiteQuantization* quantization) { if (quantization->type == kTfLiteAffineQuantization) { TfLiteAffineQuantization* q_params = reinterpret_cast(quantization->params); - if (q_params->scale) { + // Only free arrays that are owned (not borrowed from mmap). + // When flag is clear (0), the array is owned and must be freed. + // When flag is set (1), the array is borrowed from mmap and must NOT be + // freed. 
+ if (q_params->scale && + !(q_params->ownership_flags & kTfLiteQuantizationScaleBorrowed)) { TfLiteFloatArrayFree(q_params->scale); q_params->scale = nullptr; } - if (q_params->zero_point) { + if (q_params->zero_point && + !(q_params->ownership_flags & kTfLiteQuantizationZeroPointBorrowed)) { TfLiteIntArrayFree(q_params->zero_point); q_params->zero_point = nullptr; } diff --git a/tflite/core/c/common.h b/tflite/core/c/common.h index 4f3e1555c5..ba3ff4428c 100644 --- a/tflite/core/c/common.h +++ b/tflite/core/c/common.h @@ -357,8 +357,24 @@ typedef struct TfLiteAffineQuantization { TfLiteFloatArray* scale; TfLiteIntArray* zero_point; int32_t quantized_dimension; + // Bit flags for ownership tracking to enable zero-copy quantization. + // When a flag is set, the corresponding array points to memory-mapped (mmap) + // data and must NOT be freed. When clear, the array is heap-allocated and + // must be freed. Zero-initialization (via calloc) means "owned" by default. + // Note: scale and zero_point may have different ownership - scale can be + // borrowed from mmap while zero_point must be copied (int64->int32 + // conversion). + uint8_t ownership_flags; // See kTfLiteQuantization* constants below } TfLiteAffineQuantization; +/// Bit flags for TfLiteAffineQuantization::ownership_flags. +/// When set, the array is borrowed (from mmap) and must NOT be freed. +/// When clear, the array is owned (heap-allocated) and must be freed. +enum { + kTfLiteQuantizationScaleBorrowed = (1 << 0), + kTfLiteQuantizationZeroPointBorrowed = (1 << 1), +}; + /// Parameters for blockwise quantization across the output channels dimension. 
/// For a particular value in quantized_dimension, quantized values can be /// converted back to float using: diff --git a/tflite/core/c/common_test.cc b/tflite/core/c/common_test.cc index ca17e88408..b124b41a63 100644 --- a/tflite/core/c/common_test.cc +++ b/tflite/core/c/common_test.cc @@ -142,7 +142,7 @@ TEST(Quantization, TestQuantizationFree) { t.quantization.type = kTfLiteAffineQuantization; t.sparsity = nullptr; auto* params = reinterpret_cast( - malloc(sizeof(TfLiteAffineQuantization))); + calloc(1, sizeof(TfLiteAffineQuantization))); params->scale = TfLiteFloatArrayCreate(3); params->zero_point = TfLiteIntArrayCreate(3); t.quantization.params = reinterpret_cast(params); @@ -907,7 +907,7 @@ TEST(TensorCloneTest, CloneATensorAttributes) { auto dims_signature_data = BuildTfLiteArray({11, 12, 13}); TfLiteAffineQuantization* affine_quantization = reinterpret_cast( - malloc(sizeof(TfLiteAffineQuantization))); + calloc(1, sizeof(TfLiteAffineQuantization))); affine_quantization->scale = BuildTfLiteArray({7, 8, 9}).release(); affine_quantization->zero_point = BuildTfLiteArray({4, 5, 6}).release(); affine_quantization->quantized_dimension = 34; diff --git a/tflite/core/interpreter_builder.cc b/tflite/core/interpreter_builder.cc index 3cdfd84fbc..62be5c827a 100644 --- a/tflite/core/interpreter_builder.cc +++ b/tflite/core/interpreter_builder.cc @@ -488,10 +488,36 @@ TfLiteStatus InterpreterBuilder::ParseQuantization( quantization->type = kTfLiteAffineQuantization; auto* affine_quantization = reinterpret_cast( malloc(sizeof(TfLiteAffineQuantization))); - affine_quantization->scale = TfLiteFloatArrayCreate(num_scales); - for (size_t i = 0; i < num_scales; ++i) { - affine_quantization->scale->data[i] = src_quantization->scale()->Get(i); + // Memory optimization: When the model is mmap-backed, avoid copying scale + // data by directly referencing the flatbuffer's scale array. 
The memory + // layout of flatbuffers::Vector<float> ([uint32_t length][float data[]]) is + // compatible with TfLiteFloatArray ([int size][float data[]]) on platforms + // where sizeof(int) == sizeof(uint32_t) and the host is little-endian. + // NOTE(review): host byte order is not checked by can_borrow_scale below. + // Note: zero_point cannot use this optimization because the flatbuffer uses + // int64_t but TfLiteIntArray uses int32_t, requiring conversion. + const bool can_borrow_scale = + allocation_ != nullptr && + allocation_->type() == Allocation::Type::kMMap && + sizeof(int) == sizeof(uint32_t); + + // Initialize ownership_flags to 0 (all arrays owned by default). + affine_quantization->ownership_flags = 0; + if (can_borrow_scale) { + // Borrow scale directly from mmap'd flatbuffer (zero-copy). + // The flatbuffers::Vector<float> pointer points to the length field, + // followed by the float data - matching TfLiteFloatArray layout. + affine_quantization->scale = const_cast( + reinterpret_cast(src_quantization->scale())); + affine_quantization->ownership_flags |= kTfLiteQuantizationScaleBorrowed; + } else { + // Copy scale data to heap (original behavior). + affine_quantization->scale = TfLiteFloatArrayCreate(num_scales); + for (size_t i = 0; i < num_scales; ++i) { + affine_quantization->scale->data[i] = src_quantization->scale()->Get(i); + } } + // Zero point must always be copied due to int64_t -> int32_t conversion. 
if (all_zero_points_same) { affine_quantization->zero_point = TfLiteIntArrayCreate(1); affine_quantization->zero_point->data[0] = zero_point; diff --git a/tflite/core/model_building.cc b/tflite/core/model_building.cc index 1b845f0ce8..c22e3e3a36 100644 --- a/tflite/core/model_building.cc +++ b/tflite/core/model_building.cc @@ -219,7 +219,7 @@ TfLiteQuantization ToTfLiteQuantization(Quantization quantization) { Overload([&q](NoQuantization) { q.type = kTfLiteNoQuantization; }, [&q](const AffineQuantization& src) { q.type = kTfLiteAffineQuantization; - q.params = calloc(sizeof(TfLiteAffineQuantization), 1); + q.params = calloc(1, sizeof(TfLiteAffineQuantization)); TfLiteAffineQuantization& qa = *reinterpret_cast(q.params); qa.quantized_dimension = src.axis; diff --git a/tflite/delegates/delegate_test.cc b/tflite/delegates/delegate_test.cc index 956c1b513b..6884da6799 100644 --- a/tflite/delegates/delegate_test.cc +++ b/tflite/delegates/delegate_test.cc @@ -164,7 +164,7 @@ TEST_F(TestDelegate, StaticDelegateMakesGraphImmutable) { TfLiteQuantization quant = {}; quant.type = kTfLiteAffineQuantization; auto quant_params = static_cast( - malloc(sizeof(TfLiteAffineQuantization))); + calloc(1, sizeof(TfLiteAffineQuantization))); quant_params->scale = nullptr; quant_params->zero_point = nullptr; quant_params->quantized_dimension = 0; diff --git a/tflite/delegates/gpu/common/model_builder_test.cc b/tflite/delegates/gpu/common/model_builder_test.cc index aac1cc9eba..074bc25872 100644 --- a/tflite/delegates/gpu/common/model_builder_test.cc +++ b/tflite/delegates/gpu/common/model_builder_test.cc @@ -1168,7 +1168,7 @@ class InterpreterQuantized : public DelegatedInterpreter { TfLiteQuantization rw_quantization; rw_quantization.type = kTfLiteAffineQuantization; auto* rw_affine_quantization = static_cast( - malloc(sizeof(TfLiteAffineQuantization))); + calloc(1, sizeof(TfLiteAffineQuantization))); rw_affine_quantization->scale = TfLiteFloatArrayCreate(1); 
rw_affine_quantization->zero_point = TfLiteIntArrayCreate(1); rw_affine_quantization->scale->data[0] = scale; diff --git a/tflite/interpreter_test.cc b/tflite/interpreter_test.cc index 0b1db3f659..b1106478e1 100644 --- a/tflite/interpreter_test.cc +++ b/tflite/interpreter_test.cc @@ -231,7 +231,7 @@ TEST(BasicInterpreter, CheckQuantization) { TfLiteQuantization rw_quantization; rw_quantization.type = kTfLiteAffineQuantization; auto* rw_affine_quantization = static_cast( - malloc(sizeof(TfLiteAffineQuantization))); + calloc(1, sizeof(TfLiteAffineQuantization))); rw_affine_quantization->scale = TfLiteFloatArrayCreate(1); rw_affine_quantization->zero_point = TfLiteIntArrayCreate(1); rw_affine_quantization->scale->data[0] = scale; @@ -241,7 +241,7 @@ TEST(BasicInterpreter, CheckQuantization) { TfLiteQuantization ro_quantization; ro_quantization.type = kTfLiteAffineQuantization; auto* ro_affine_quantization = static_cast( - malloc(sizeof(TfLiteAffineQuantization))); + calloc(1, sizeof(TfLiteAffineQuantization))); ro_affine_quantization->scale = TfLiteFloatArrayCreate(1); ro_affine_quantization->zero_point = TfLiteIntArrayCreate(1); ro_affine_quantization->scale->data[0] = scale; diff --git a/tflite/kernels/batch_matmul.cc b/tflite/kernels/batch_matmul.cc index df55f5a055..756b6ac62a 100644 --- a/tflite/kernels/batch_matmul.cc +++ b/tflite/kernels/batch_matmul.cc @@ -662,7 +662,7 @@ TfLiteTensor* GetTempRhs(TfLiteContext* context, TfLiteNode* node, free(transposed_rhs->quantization.params); } transposed_rhs->quantization.params = - malloc(sizeof(TfLiteAffineQuantization)); + calloc(1, sizeof(TfLiteAffineQuantization)); const auto* rhs_affine_quantization = reinterpret_cast(rhs->quantization.params); auto* transposed_rhs_affine_quantization = diff --git a/tflite/kernels/kernel_util_test.cc b/tflite/kernels/kernel_util_test.cc index db86eb2a62..305d345e0a 100644 --- a/tflite/kernels/kernel_util_test.cc +++ b/tflite/kernels/kernel_util_test.cc @@ -303,7 +303,7 @@ 
TEST_F(QuantizationParamsTest, PerChannelConvolution) { input->params = input_quant; input->quantization.type = kTfLiteAffineQuantization; auto* input_params = reinterpret_cast( - malloc(sizeof(TfLiteAffineQuantization))); + calloc(1, sizeof(TfLiteAffineQuantization))); input_params->scale = TfLiteFloatArrayCreate(1); input_params->scale->data[0] = 0.5; input_params->zero_point = TfLiteIntArrayCreate(1); @@ -323,7 +323,7 @@ TEST_F(QuantizationParamsTest, PerChannelConvolution) { filter->params = filter_quant; filter->quantization.type = kTfLiteAffineQuantization; auto* filter_params = reinterpret_cast( - malloc(sizeof(TfLiteAffineQuantization))); + calloc(1, sizeof(TfLiteAffineQuantization))); filter_params->scale = TfLiteFloatArrayCreate(3); filter_params->scale->data[0] = 0.25; filter_params->scale->data[1] = 0.125; @@ -344,7 +344,7 @@ TEST_F(QuantizationParamsTest, PerChannelConvolution) { bias->params = bias_quant; bias->quantization.type = kTfLiteAffineQuantization; auto* bias_params = reinterpret_cast( - malloc(sizeof(TfLiteAffineQuantization))); + calloc(1, sizeof(TfLiteAffineQuantization))); bias_params->scale = TfLiteFloatArrayCreate(3); bias_params->scale->data[0] = 0.125; bias_params->scale->data[1] = 0.0625; @@ -364,7 +364,7 @@ TEST_F(QuantizationParamsTest, PerChannelConvolution) { output->params = output_quant; output->quantization.type = kTfLiteAffineQuantization; auto* output_params = reinterpret_cast( - malloc(sizeof(TfLiteAffineQuantization))); + calloc(1, sizeof(TfLiteAffineQuantization))); output_params->scale = TfLiteFloatArrayCreate(1); output_params->scale->data[0] = 0.5; output_params->zero_point = TfLiteIntArrayCreate(1); @@ -403,7 +403,7 @@ TEST_F(QuantizationParamsTest, CheckAndPopulateShift) { input->params = input_quant; input->quantization.type = kTfLiteAffineQuantization; auto* input_params = reinterpret_cast( - malloc(sizeof(TfLiteAffineQuantization))); + calloc(1, sizeof(TfLiteAffineQuantization))); input_params->scale = 
TfLiteFloatArrayCreate(1); input_params->scale->data[0] = 0.5; input_params->zero_point = TfLiteIntArrayCreate(1); @@ -423,7 +423,7 @@ TEST_F(QuantizationParamsTest, CheckAndPopulateShift) { filter->params = filter_quant; filter->quantization.type = kTfLiteAffineQuantization; auto* filter_params = reinterpret_cast( - malloc(sizeof(TfLiteAffineQuantization))); + calloc(1, sizeof(TfLiteAffineQuantization))); // Create scale of size one. filter_params->scale = TfLiteFloatArrayCreate(1); filter_params->scale->data[0] = 0.25; @@ -441,7 +441,7 @@ TEST_F(QuantizationParamsTest, CheckAndPopulateShift) { bias->params = bias_quant; bias->quantization.type = kTfLiteAffineQuantization; auto* bias_params = reinterpret_cast( - malloc(sizeof(TfLiteAffineQuantization))); + calloc(1, sizeof(TfLiteAffineQuantization))); bias_params->scale = TfLiteFloatArrayCreate(3); bias_params->scale->data[0] = 0.125; bias_params->scale->data[1] = 0.0625; @@ -461,7 +461,7 @@ TEST_F(QuantizationParamsTest, CheckAndPopulateShift) { output->params = output_quant; output->quantization.type = kTfLiteAffineQuantization; auto* output_params = reinterpret_cast( - malloc(sizeof(TfLiteAffineQuantization))); + calloc(1, sizeof(TfLiteAffineQuantization))); output_params->scale = TfLiteFloatArrayCreate(1); output_params->scale->data[0] = 0.5; output_params->zero_point = TfLiteIntArrayCreate(1); @@ -504,7 +504,7 @@ TEST_F(QuantizationParamsTest, CheckAndPopulateZeroValue) { input->params = input_quant; input->quantization.type = kTfLiteAffineQuantization; auto* input_params = reinterpret_cast( - malloc(sizeof(TfLiteAffineQuantization))); + calloc(1, sizeof(TfLiteAffineQuantization))); input_params->scale = TfLiteFloatArrayCreate(1); input_params->scale->data[0] = 1; input_params->zero_point = TfLiteIntArrayCreate(1); @@ -524,7 +524,7 @@ TEST_F(QuantizationParamsTest, CheckAndPopulateZeroValue) { filter->params = filter_quant; filter->quantization.type = kTfLiteAffineQuantization; auto* filter_params = 
reinterpret_cast( - malloc(sizeof(TfLiteAffineQuantization))); + calloc(1, sizeof(TfLiteAffineQuantization))); filter_params->scale = TfLiteFloatArrayCreate(3); filter_params->scale->data[0] = std::ldexp(1.0f, -31); filter_params->scale->data[1] = std::ldexp(1.0f, -32); @@ -545,7 +545,7 @@ TEST_F(QuantizationParamsTest, CheckAndPopulateZeroValue) { bias->params = bias_quant; bias->quantization.type = kTfLiteAffineQuantization; auto* bias_params = reinterpret_cast( - malloc(sizeof(TfLiteAffineQuantization))); + calloc(1, sizeof(TfLiteAffineQuantization))); bias_params->scale = TfLiteFloatArrayCreate(3); bias_params->scale->data[0] = std::ldexp(1.0f, -31); bias_params->scale->data[1] = std::ldexp(1.0f, -32); @@ -565,7 +565,7 @@ TEST_F(QuantizationParamsTest, CheckAndPopulateZeroValue) { output->params = output_quant; output->quantization.type = kTfLiteAffineQuantization; auto* output_params = reinterpret_cast( - malloc(sizeof(TfLiteAffineQuantization))); + calloc(1, sizeof(TfLiteAffineQuantization))); output_params->scale = TfLiteFloatArrayCreate(1); output_params->scale->data[0] = 1; output_params->zero_point = TfLiteIntArrayCreate(1); @@ -603,7 +603,7 @@ TEST_F(QuantizationParamsTest, CheckAndPopulateUint8) { input->params = input_quant; input->quantization.type = kTfLiteAffineQuantization; auto* input_params = reinterpret_cast( - malloc(sizeof(TfLiteAffineQuantization))); + calloc(1, sizeof(TfLiteAffineQuantization))); input_params->scale = TfLiteFloatArrayCreate(1); input_params->scale->data[0] = 1; input_params->zero_point = TfLiteIntArrayCreate(1); @@ -623,7 +623,7 @@ TEST_F(QuantizationParamsTest, CheckAndPopulateUint8) { filter->params = filter_quant; filter->quantization.type = kTfLiteAffineQuantization; auto* filter_params = reinterpret_cast( - malloc(sizeof(TfLiteAffineQuantization))); + calloc(1, sizeof(TfLiteAffineQuantization))); filter_params->scale = TfLiteFloatArrayCreate(1); int32_t two_pow_neg_31 = 0x30000000; // 2^-31 so shift = -30. 
filter_params->scale->data[0] = *reinterpret_cast(&two_pow_neg_31); @@ -641,7 +641,7 @@ TEST_F(QuantizationParamsTest, CheckAndPopulateUint8) { bias->params = bias_quant; bias->quantization.type = kTfLiteAffineQuantization; auto* bias_params = reinterpret_cast( - malloc(sizeof(TfLiteAffineQuantization))); + calloc(1, sizeof(TfLiteAffineQuantization))); bias_params->scale = TfLiteFloatArrayCreate(1); bias_params->scale->data[0] = 4.6566129e-10; // 2^-31 bias_params->zero_point = TfLiteIntArrayCreate(1); @@ -657,7 +657,7 @@ TEST_F(QuantizationParamsTest, CheckAndPopulateUint8) { output->params = output_quant; output->quantization.type = kTfLiteAffineQuantization; auto* output_params = reinterpret_cast( - malloc(sizeof(TfLiteAffineQuantization))); + calloc(1, sizeof(TfLiteAffineQuantization))); output_params->scale = TfLiteFloatArrayCreate(1); output_params->scale->data[0] = 1; output_params->zero_point = TfLiteIntArrayCreate(1); @@ -695,7 +695,7 @@ TEST_F(QuantizationParamsTest, CheckAndPopulateWithoutBias) { input->params = input_quant; input->quantization.type = kTfLiteAffineQuantization; auto* input_params = reinterpret_cast( - malloc(sizeof(TfLiteAffineQuantization))); + calloc(1, sizeof(TfLiteAffineQuantization))); input_params->scale = TfLiteFloatArrayCreate(1); input_params->scale->data[0] = 1; input_params->zero_point = TfLiteIntArrayCreate(1); @@ -715,7 +715,7 @@ TEST_F(QuantizationParamsTest, CheckAndPopulateWithoutBias) { filter->params = filter_quant; filter->quantization.type = kTfLiteAffineQuantization; auto* filter_params = reinterpret_cast( - malloc(sizeof(TfLiteAffineQuantization))); + calloc(1, sizeof(TfLiteAffineQuantization))); filter_params->scale = TfLiteFloatArrayCreate(1); int32_t two_pow_neg_31 = 0x30000000; // 2^-31 so shift = -30. 
filter_params->scale->data[0] = *reinterpret_cast(&two_pow_neg_31); @@ -733,7 +733,7 @@ TEST_F(QuantizationParamsTest, CheckAndPopulateWithoutBias) { output->params = output_quant; output->quantization.type = kTfLiteAffineQuantization; auto* output_params = reinterpret_cast( - malloc(sizeof(TfLiteAffineQuantization))); + calloc(1, sizeof(TfLiteAffineQuantization))); output_params->scale = TfLiteFloatArrayCreate(1); output_params->scale->data[0] = 1; output_params->zero_point = TfLiteIntArrayCreate(1); @@ -770,7 +770,7 @@ TEST_F(QuantizationParamsTest, ActivationRangeQuantizedOverflow) { output->params = output_quant; output->quantization.type = kTfLiteAffineQuantization; auto* output_params = reinterpret_cast( - malloc(sizeof(TfLiteAffineQuantization))); + calloc(1, sizeof(TfLiteAffineQuantization))); output_params->scale = TfLiteFloatArrayCreate(1); output_params->scale->data[0] = 1; output_params->zero_point = TfLiteIntArrayCreate(1); diff --git a/tflite/kernels/test_util.h b/tflite/kernels/test_util.h index 67740a848c..c7dd418d3b 100644 --- a/tflite/kernels/test_util.h +++ b/tflite/kernels/test_util.h @@ -1243,7 +1243,7 @@ class SingleOpModel { TfLiteQuantizationFree(&t->quantization); t->quantization.type = kTfLiteAffineQuantization; auto* affine_quantization = reinterpret_cast( - malloc(sizeof(TfLiteAffineQuantization))); + calloc(1, sizeof(TfLiteAffineQuantization))); affine_quantization->quantized_dimension = 0; affine_quantization->scale = TfLiteFloatArrayCreate(1); affine_quantization->zero_point = TfLiteIntArrayCreate(1);