Commit 7a5bccc

majiddadashi authored and copybara-github committed
Add support for kTfLiteInt2 (srq) in tfl.fully_connected.
PiperOrigin-RevId: 822405584
1 parent 8afe70b commit 7a5bccc

File tree

6 files changed: +181, -24 lines changed

tflite/core/kernels/register.cc

Lines changed: 1 addition & 1 deletion
@@ -83,7 +83,7 @@ BuiltinOpResolver::BuiltinOpResolver() {
                      Register_EMBEDDING_LOOKUP_SPARSE());
   AddBuiltin(BuiltinOperator_FULLY_CONNECTED, Register_FULLY_CONNECTED(),
              /* min_version = */ 1,
-             /* max_version = */ 13);
+             /* max_version = */ 14);
   AddBuiltin(BuiltinOperator_LSH_PROJECTION, Register_LSH_PROJECTION());
   AddBuiltin(BuiltinOperator_HASHTABLE_LOOKUP, Register_HASHTABLE_LOOKUP());
   AddBuiltin(BuiltinOperator_SOFTMAX, Register_SOFTMAX(),

tflite/kernels/BUILD

Lines changed: 1 addition & 0 deletions
@@ -2225,6 +2225,7 @@ cc_test(
         "//tflite/core/api",
         "//tflite/kernels/internal:tensor_utils",
         "//tflite/schema:schema_fbs",
+        "@com_google_absl//absl/log:absl_check",
         "@com_google_absl//absl/memory",
         "@com_google_googletest//:gtest",
         "@flatbuffers",

tflite/kernels/fully_connected.cc

Lines changed: 26 additions & 12 deletions
@@ -186,7 +186,7 @@ inline TfLiteStatus CheckTypes(TfLiteContext* context,
                                TfLiteFullyConnectedParams* params) {
   const bool is_quantized =
       ((filter->type == kTfLiteUInt8) || (filter->type == kTfLiteInt8) ||
-       (filter->type == kTfLiteInt4));
+       (filter->type == kTfLiteInt4) || (filter->type == kTfLiteInt2));
   const bool is_hybrid = is_quantized && (input->type == kTfLiteFloat32);
   const bool is_shuffled =
       is_quantized && (params->weights_format ==
@@ -448,7 +448,8 @@ TfLiteStatus PrepareImpl(TfLiteContext* context, TfLiteNode* node,
     TF_LITE_ENSURE(context,
                    input->type == kTfLiteInt8 || input->type == kTfLiteInt16);
     TF_LITE_ENSURE(context, (filter->type == kTfLiteInt8 ||
-                             filter->type == kTfLiteInt4));
+                             filter->type == kTfLiteInt4 ||
+                             filter->type == kTfLiteInt2));
     TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size,
                       per_channel_quantization_size);
     TF_LITE_ENSURE_EQ(
@@ -654,7 +655,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input));
   const bool is_quantized =
       ((filter->type == kTfLiteUInt8) || (filter->type == kTfLiteInt8) ||
-       (filter->type == kTfLiteInt4));
+       (filter->type == kTfLiteInt4) || (filter->type == kTfLiteInt2));
   const bool is_hybrid = is_quantized && (input->type == kTfLiteFloat32);
   const bool is_pie = kernel_type == kLegacyPie;

@@ -666,7 +667,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
                        params->activation == kTfLiteActReluN1To1 ||
                        params->activation == kTfLiteActRelu6);
   }
-  if (filter->type == kTfLiteInt4) {
+  if (filter->type == kTfLiteInt4 || filter->type == kTfLiteInt2) {
     TF_LITE_ENSURE_MSG(
         context,
         kTfLiteOk == VerifyQuantizationZeroPoint(filter, /*expected_value=*/0),
@@ -1420,6 +1421,7 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
     case kTfLiteUInt8:
       if (kernel_type == kReference) {
         TF_LITE_ENSURE(context, filter->type != kTfLiteInt4);
+        TF_LITE_ENSURE(context, filter->type != kTfLiteInt2);
         reference_ops::FullyConnected(
             op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
             GetTensorShape(filter), GetTensorData<uint8_t>(filter),
@@ -1456,8 +1458,10 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
             "Invalid quantized and sparse fully-connected format.");
         return kTfLiteError;
       }
-      // Int4 support for sparse filter tensor is currently not supported
+      // Int4/Int2 support for sparse filter tensor is currently not
+      // supported
       TF_LITE_ENSURE(context, filter->type != kTfLiteInt4);
+      TF_LITE_ENSURE(context, filter->type != kTfLiteInt2);
       if (sparsity.dim_metadata_size == kDimMetadataSizeBlockSparse &&
           sparsity.dim_metadata[2].dense_size == 16) {
         // Block sparse with block size of 1x16.
@@ -1485,6 +1489,14 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
               GetTensorShape(filter).FlatSize(), /*bit_width=*/4,
               unpacked_filter_data.get());
           filter_data = unpacked_filter_data.get();
+        } else if (filter->type == kTfLiteInt2) {
+          const size_t bytes_unpacked = filter->bytes * 4;
+          unpacked_filter_data = std::make_unique<int8_t[]>(bytes_unpacked);
+          tflite::tensor_utils::UnpackPackedIntToInt8(
+              GetTensorData<int8_t>(filter),
+              GetTensorShape(filter).FlatSize(), /*bit_width=*/2,
+              unpacked_filter_data.get());
+          filter_data = unpacked_filter_data.get();
         } else {
           filter_data = GetTensorData<int8_t>(filter);
         }
@@ -1514,6 +1526,14 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
               GetTensorShape(filter).FlatSize(), /*bit_width=*/4,
               unpacked_filter_data.get());
           filter_data = unpacked_filter_data.get();
+        } else if (filter->type == kTfLiteInt2) {
+          const size_t bytes_unpacked = filter->bytes * 4;
+          unpacked_filter_data = std::make_unique<int8_t[]>(bytes_unpacked);
+          tflite::tensor_utils::UnpackPackedIntToInt8(
+              GetTensorData<int8_t>(filter),
+              GetTensorShape(filter).FlatSize(), /*bit_width=*/2,
+              unpacked_filter_data.get());
+          filter_data = unpacked_filter_data.get();
         } else {
           filter_data = GetTensorData<int8_t>(filter);
         }
@@ -1762,14 +1782,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         return kTfLiteError;
       }
     case kTfLiteInt8:
-      if (params->weights_format == kTfLiteFullyConnectedWeightsFormatDefault) {
-        return EvalQuantized<kernel_type>(context, node, params, data, input,
-                                          filter, bias, output);
-      } else {
-        TF_LITE_KERNEL_LOG(context, "Unhandled fully-connected weights format");
-        return kTfLiteError;
-      }
     case kTfLiteInt4:
+    case kTfLiteInt2:
       if (params->weights_format == kTfLiteFullyConnectedWeightsFormatDefault) {
         return EvalQuantized<kernel_type>(context, node, params, data, input,
                                           filter, bias, output);
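
The new kTfLiteInt2 branches mirror the existing Int4 path: a packed filter of filter->bytes bytes expands into filter->bytes * 4 int8 values, since each byte carries four 2-bit weights. A minimal sketch of what a bit_width=2 unpack amounts to, assuming four values per byte with element 0 in the least significant bits; the byte ordering is an assumption here, and the authoritative implementation is tflite::tensor_utils::UnpackPackedIntToInt8:

#include <cstdint>

// Sketch only: expand packed 2-bit weights into sign-extended int8 values.
// Assumes four 2-bit fields per byte, element 0 in the lowest bits.
void UnpackInt2ToInt8Sketch(const int8_t* packed, int num_elements,
                            int8_t* unpacked) {
  for (int i = 0; i < num_elements; ++i) {
    const uint8_t byte = static_cast<uint8_t>(packed[i / 4]);
    const uint8_t two_bits = (byte >> (2 * (i % 4))) & 0x3;
    // Sign-extend into the 2-bit two's-complement range [-2, 1].
    unpacked[i] = static_cast<int8_t>((two_bits ^ 0x2) - 0x2);
  }
}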

tflite/kernels/fully_connected_test.cc

Lines changed: 127 additions & 10 deletions
@@ -30,6 +30,7 @@ limitations under the License.

 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "absl/log/absl_check.h"
 #include "tflite/core/interpreter.h"
 #include "tflite/kernels/test_util.h"
 #include "tflite/schema/schema_generated.h"
@@ -159,22 +160,34 @@ class BaseFullyConnectedOpModel : public SingleOpModel {
       std::vector<int64_t> per_channel_quantization_offsets(
           per_channel_quantization_scales.size(), 0);
       weights_ = AddInput({filter_type,
-                           {units_, input_size_},
-                           0,
-                           0,
-                           0,
-                           0,
-                           true,
+                           /*shape=*/{units_, input_size_},
+                           /*min=*/0,
+                           /*max=*/0,
+                           /*scale=*/0,
+                           /*zero_point=*/0,
+                           /*per_channel_quantization=*/true,
                            per_channel_quantization_scales,
                            per_channel_quantization_offsets,
-                           0});
+                           /*channel_index=*/0});
     } else {
       // per-tensor
       float min = input.min;
       float max = input.max;
-      if (filter_type == TensorType_INT4 || filter_type == TensorType_INT8) {
-        min = filter_type == TensorType_INT4 ? -7.f : -63.5f;
-        max = filter_type == TensorType_INT4 ? 7.f : 64.f;
+      switch (filter_type) {
+        case TensorType_INT4:
+          min = -7.f;
+          max = 7.f;
+          break;
+        case TensorType_INT2:
+          min = -2.f;
+          max = 2.f;
+          break;
+        case TensorType_INT8:
+          min = -63.5f;
+          max = 64.f;
+          break;
+        default:
+          break;
       }
       weights_ = AddInput({filter_type, {units_, input_size_}, min, max});
     }
@@ -292,6 +305,13 @@ class QuantizedFullyConnectedOpModel : public BaseFullyConnectedOpModel {
     QuantizeAndPopulate4bit(weights_, data);
   }

+  void SetWeights2bit(const std::vector<float>& data) {
+    TfLiteTensor* t = interpreter_->tensor(weights_);
+    std::vector<int8_t> u =
+        Quantize<int8_t>(data, t->params.scale, t->params.zero_point, t->type);
+    PopulateTensor2bit(weights_, 0, u.data(), u.data() + u.size());
+  }
+
   template <typename T>
   void ShuffleAndSetWeights(const std::vector<float>& data, int input_depth,
                             int output_depth) {
@@ -372,6 +392,12 @@ class PerChannelQuantizedFullyConnectedOpModel
     PerChannelSymmetricQuantizeAndPopulate(weights_, data);
   }

+  void SetWeights2bit(const std::vector<float>& data) {
+    // 2 bit logic handled in PerChannelSymmetricQuantizeAndPopulate.
+    ABSL_CHECK_EQ(interpreter_->tensor(weights_)->type, kTfLiteInt2);
+    PerChannelSymmetricQuantizeAndPopulate(weights_, data);
+  }
+
   template <typename T>
   void SetInput(const std::vector<float>& data) {
     QuantizeAndPopulate<T>(input_, data);
@@ -734,6 +760,38 @@ TEST_P(QuantizedFullyConnectedOpTest, SimpleTestQuantizedInt4) {
   EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAre(103, 104, 105, 97, 98, 99));
 }

+TEST_P(QuantizedFullyConnectedOpTest, SimpleTestQuantizedInt2) {
+  QuantizedFullyConnectedOpModel m(
+      GetRegistration(), /*units=*/3, /*batches*/ 2,
+      /*input=*/{TensorType_INT8, {2, 10}, -63.5, 64},
+      /*output=*/{TensorType_INT8, {}, -127, 128}, TensorType_INT32, false,
+      false, ActivationFunctionType_RELU,
+      FullyConnectedOptionsWeightsFormat_DEFAULT, -1, TensorType_INT2);
+
+  m.SetWeights2bit({
+      1, 0, 1, 0, 1, 0, 1, 0, -1, 0,  // u = 0
+      1, 0, 1, 0, 1, 0, 1, 0, -1, 0,  // u = 1
+      1, 0, 1, 0, 1, 0, 1, 0, -1, 0,  // u = 2
+  });
+  m.SetBias({1., 2., 3.});
+
+  m.SetInput<int8_t>({
+      1, 2, 3, 4, 5, 6, 7, 8, -9, -10,  // b = 0
+      1, 2, 3, 4, 5, 6, 7, -8, 9, -10,  // b = 1
+  });
+
+  // The quantization parameters for the model.
+  // input s, zp: 0.5, -1
+  // filter s, zp: 0.5, 0
+  // output s, zp: 1, -1
+
+  ASSERT_EQ(m.Invoke(), kTfLiteOk);
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
+              testing::Pointwise(testing::FloatEq(),
+                                 {26.0, 27.0, 28.0, 8.0, 9.0, 10.0}));
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAre(25, 26, 27, 7, 8, 9));
+}
+
 TEST_P(QuantizedFullyConnectedOpTest, SimpleTestQuantizedInt8) {
   QuantizedFullyConnectedOpModel m(
       GetRegistration(), /*units=*/3, /*batches*/ 2,
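
Sanity-checking the Int2 test expectations above against the float reference: for batch 0, unit 0, the dot product is 1*1 + 1*3 + 1*5 + 1*7 + (-1)*(-9) = 25, plus bias 1, giving 26; for batch 1 the -1 weight lands on the +9 input, so the sum is 1 + 3 + 5 + 7 - 9 = 7, plus bias 1, giving 8. With output scale 1 and zero point -1, each stored int8 value is one less than its dequantized value, hence ElementsAre(25, 26, 27, 7, 8, 9).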
@@ -863,6 +921,34 @@ TEST_P(QuantizedFullyConnectedOpTest, SimpleTestPerChannelQuantizedInt4) {
   EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAre(103, 104, 105, 97, 98, 99));
 }

+TEST_P(QuantizedFullyConnectedOpTest, SimpleTestPerChannelQuantizedInt2) {
+  PerChannelQuantizedFullyConnectedOpModel m(
+      GetRegistration(), /*units=*/3, /*batches*/ 2,
+      /*input=*/{TensorType_INT8, {2, 10}, -63.5, 64},
+      /*per_channel_quantization_scales=*/{1.0, 1.0, 1.0},
+      /*output=*/{TensorType_INT8, {}, -127, 128},
+      /*bias_type=*/TensorType_INT32, false, false, ActivationFunctionType_RELU,
+      FullyConnectedOptionsWeightsFormat_DEFAULT, -1, TensorType_INT2);
+
+  m.SetWeights2bit({
+      1, 0, 1, 0, 1, 0, 1, 0, -1, 0,  // u = 0
+      1, 0, 1, 0, 1, 0, 1, 0, -1, 0,  // u = 1
+      1, 0, 1, 0, 1, 0, 1, 0, -1, 0,  // u = 2
+  });
+  m.SetBias({1, 2, 3});
+
+  m.SetInput<int8_t>({
+      1, 2, 3, 4, 5, 6, 7, 8, -9, -10,  // b = 0
+      1, 2, 3, 4, 5, 6, 7, -8, 9, -10,  // b = 1
+  });
+
+  ASSERT_EQ(m.Invoke(), kTfLiteOk);
+
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear({26, 27, 28, 8, 9, 10})));
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAre(25, 26, 27, 7, 8, 9));
+}
+
 TEST_P(QuantizedFullyConnectedOpTest, SimpleTestQuantizedInt16NoBias) {
   const float scale = 128.0 / 65536;
   QuantizedFullyConnectedOpModel m(
@@ -1018,6 +1104,37 @@ TEST_P(QuantizedFullyConnectedOpTest,
               ElementsAre(1536, 2048, 2560, 11776, 12288, 12800));
 }

+TEST_P(QuantizedFullyConnectedOpTest,
+       SimpleTestPerChannelQuantizedInt16Bias32Weight2) {
+  const float scale = 128.0 / 65536;
+  PerChannelQuantizedFullyConnectedOpModel m(
+      GetRegistration(), /*units=*/3, /*batches*/ 2,
+      /*input=*/{TensorType_INT16, {2, 10}, 0, 0, scale, 0},
+      /*per_channel_quantization_scales=*/{1.0, 1.0, 1.0},
+      /*output=*/{TensorType_INT16, {}, 0, 0, scale, 0},
+      /*bias_type=*/TensorType_INT32, false, false, ActivationFunctionType_RELU,
+      FullyConnectedOptionsWeightsFormat_DEFAULT, -1, TensorType_INT2);
+
+  m.SetWeights2bit({
+      1, 0, 1, 0, 1, 0, 1, 0, -1, 0,  // u = 0
+      1, 0, 1, 0, 1, 0, 1, 0, -1, 0,  // u = 1
+      1, 0, 1, 0, 1, 0, 1, 0, -1, 0,  // u = 2
+  });
+  m.SetBias({1, 2, 3});
+
+  m.SetInput<int16_t>({
+      1, 2, 3, 4, 5, 6, 7, 8, -9, -10,  // b = 0
+      1, 2, 3, 4, 5, 6, 7, -8, 9, -10,  // b = 1
+  });
+
+  ASSERT_EQ(m.Invoke(), kTfLiteOk);
+
+  EXPECT_THAT(m.GetDequantizedOutput<int16_t>(),
+              ElementsAreArray(ArrayFloatNear({26, 27, 28, 8, 9, 10})));
+  EXPECT_THAT(m.GetOutput<int16_t>(),
+              ElementsAre(13312, 13824, 14336, 4096, 4608, 5120));
+}
+
 TEST_P(QuantizedFullyConnectedOpTest, SimpleTestQuantizedInt16Bias64) {
   const float scale = 128.0 / 65536;
   QuantizedFullyConnectedOpModel m(
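
In the int16 variant above, the output scale is 128.0 / 65536 = 1/512 with zero point 0, so each dequantized value v is stored as v * 512 in the raw tensor: 26 * 512 = 13312, 27 * 512 = 13824, 28 * 512 = 14336, and 8, 9, 10 map to 4096, 4608, 5120.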

tflite/kernels/register_ref.cc

Lines changed: 1 addition & 1 deletion
@@ -280,7 +280,7 @@ BuiltinRefOpResolver::BuiltinRefOpResolver() {
                      Register_EMBEDDING_LOOKUP_SPARSE());
   AddBuiltin(BuiltinOperator_FULLY_CONNECTED, Register_FULLY_CONNECTED_REF(),
              /* min_version */ 1,
-             /* max_version */ 11);
+             /* max_version */ 14);
   AddBuiltin(BuiltinOperator_LSH_PROJECTION, Register_LSH_PROJECTION());
   AddBuiltin(BuiltinOperator_HASHTABLE_LOOKUP, Register_HASHTABLE_LOOKUP());
   AddBuiltin(BuiltinOperator_SOFTMAX, Register_SOFTMAX_REF(),

tflite/kernels/test_util.h

Lines changed: 25 additions & 0 deletions
@@ -109,6 +109,9 @@ inline std::vector<T> Quantize(const std::vector<float>& data, float scale,
   if (type == kTfLiteInt4) {
     min = -7;
     max = 7;
+  } else if (type == kTfLiteInt2) {
+    min = -2;
+    max = 1;
   }

   q.reserve(data.size());
@@ -570,6 +573,15 @@ class SingleOpModel {
                        quantized_output.data() + quantized_output.size());
   }

+  void QuantizeAndPopulate2bit(int index, const std::vector<float>& data) {
+    TfLiteTensor* t = interpreter_->tensor(index);
+    t->type = kTfLiteInt2;
+    std::vector<int8_t> quantized_output =
+        Quantize<int8_t>(data, t->params.scale, t->params.zero_point, t->type);
+    PopulateTensor2bit(index, /*offset=*/0, quantized_output.data(),
+                       quantized_output.data() + quantized_output.size());
+  }
+
   void SymmetricQuantizeAndPopulate(int index, const std::vector<float>& data) {
     std::vector<int8_t> q = QuantizeTensor(index, data);
     PopulateTensor(index, /*offset=*/0, reinterpret_cast<uint8_t*>(q.data()),
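
QuantizeAndPopulate2bit relies on PopulateTensor2bit, whose implementation is not part of this diff; presumably it packs the quantized int8 values into the same four-per-byte layout the kernel unpacks. A hypothetical sketch of such a packer, where the function name, signature, and bit ordering are assumptions rather than the library's API:

#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical inverse of the 2-bit unpack: pack int8 values that are
// already in the representable range [-2, 1] into 2-bit fields, four per
// byte, element 0 in the lowest bits.
std::vector<int8_t> PackInt8ToInt2Sketch(const std::vector<int8_t>& values) {
  std::vector<int8_t> packed((values.size() + 3) / 4, 0);
  for (size_t i = 0; i < values.size(); ++i) {
    const uint8_t two_bits = static_cast<uint8_t>(values[i]) & 0x3;
    const uint8_t byte = static_cast<uint8_t>(packed[i / 4]);
    packed[i / 4] = static_cast<int8_t>(byte | (two_bits << (2 * (i % 4))));
  }
  return packed;
}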
@@ -583,6 +595,10 @@ class SingleOpModel {
       std::vector<int8_t> q = Quantize<int8_t>(data, t->params.scale,
                                                t->params.zero_point, t->type);
       PopulateTensor4bit(index, /*offset=*/0, q.data(), q.data() + q.size());
+    } else if (t->type == kTfLiteInt2) {
+      std::vector<int8_t> q = Quantize<int8_t>(data, t->params.scale,
+                                               t->params.zero_point, t->type);
+      PopulateTensor2bit(index, /*offset=*/0, q.data(), q.data() + q.size());
     } else {
       std::vector<int8_t> q = QuantizeTensor(index, data);
       PopulateTensor(index, /*offset=*/0, q.data(), q.data() + q.size());
@@ -663,6 +679,9 @@ class SingleOpModel {
       PopulateTensor4bit(index, /*offset=*/0, quantized_output.data(),
                          quantized_output.data() + quantized_output.size());

+    } else if (t->type == kTfLiteInt2) {
+      PopulateTensor2bit(index, /*offset=*/0, quantized_output.data(),
+                         quantized_output.data() + quantized_output.size());
     } else {
       PopulateTensor(index, /*offset=*/0, quantized_output.data(),
                      quantized_output.data() + quantized_output.size());
@@ -888,6 +907,9 @@ class SingleOpModel {
     } else if (t.type == TensorType_INT4) {
       std::tie(t.scale, t.zero_point) =
           QuantizationParams<int8_t>(t.min, t.max, kTfLiteInt4);
+    } else if (t.type == TensorType_INT2) {
+      std::tie(t.scale, t.zero_point) =
+          QuantizationParams<int8_t>(t.min, t.max, kTfLiteInt2);
     } else {
       ABSL_LOG(FATAL) << "No support for the requested quantized type";
     }
@@ -940,6 +962,9 @@ class SingleOpModel {
     if (type == kTfLiteInt4) {
       qmin = -7;
       qmax = 7;
+    } else if (type == kTfLiteInt2) {
+      qmin = -2;
+      qmax = 2;
     } else {
       qmin = std::numeric_limits<T>::min();
       qmax = std::numeric_limits<T>::max();
