Migrate NLClassifier to using TextPreprocessor

ziyeqinghan · tflite-support-robot · commit aed9982779b0 · 2021-11-16T23:44:36.000-08:00
PiperOrigin-RevId: 410445262
diff --git a/tensorflow_lite_support/cc/task/processor/text_preprocessor.cc b/tensorflow_lite_support/cc/task/processor/text_preprocessor.cc
@@ -214,7 +214,8 @@ absl::Status TextPreprocessor::BertPreprocess(const std::string& input_text) {
 
 absl::Status TextPreprocessor::RegexPreprocess(const std::string& input_text) {
   TfLiteTensor* input_tensor = GetTensor();
-  auto regex_tokenizer = dynamic_cast<RegexTokenizer*>(tokenizer_.get());
+  auto regex_tokenizer = std::unique_ptr<RegexTokenizer>(
+      dynamic_cast<RegexTokenizer*>(tokenizer_.release()));
 
   //                              |<-------sentence_length-------->|
   // input_tensor                 <START>, t1, t2... <PAD>, <PAD>...
diff --git a/tensorflow_lite_support/cc/task/text/nlclassifier/BUILD b/tensorflow_lite_support/cc/task/text/nlclassifier/BUILD
@@ -20,7 +20,6 @@ cc_library_with_tflite(
         "@org_tensorflow//tensorflow/lite/core/shims:builtin_ops",
         "//tensorflow_lite_support/cc/task/core:base_task_api",
         "//tensorflow_lite_support/cc/task/core:task_api_factory",
-        "//tensorflow_lite_support/cc/task/processor:text_preprocessor",
     ],
     deps = [
         "//tensorflow_lite_support/cc:common",
@@ -29,7 +28,10 @@ cc_library_with_tflite(
         "//tensorflow_lite_support/cc/task/core:category",
         "//tensorflow_lite_support/cc/task/core:task_utils",
         "//tensorflow_lite_support/cc/task/text/proto:nl_classifier_options_proto_inc",
+        "//tensorflow_lite_support/cc/text/tokenizers:regex_tokenizer",
+        "//tensorflow_lite_support/cc/text/tokenizers:tokenizer",
         "//tensorflow_lite_support/cc/utils:common_utils",
+        "//tensorflow_lite_support/metadata/cc:metadata_extractor",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/status",
diff --git a/tensorflow_lite_support/cc/task/text/nlclassifier/nl_classifier.cc b/tensorflow_lite_support/cc/task/text/nlclassifier/nl_classifier.cc
@@ -35,6 +35,8 @@ limitations under the License.
 #include "tensorflow_lite_support/cc/task/core/category.h"
 #include "tensorflow_lite_support/cc/task/core/task_api_factory.h"
 #include "tensorflow_lite_support/cc/task/core/task_utils.h"
+#include "tensorflow_lite_support/cc/text/tokenizers/regex_tokenizer.h"
+#include "tensorflow_lite_support/cc/text/tokenizers/tokenizer.h"
 #include "tensorflow_lite_support/cc/utils/common_utils.h"
 
 namespace tflite {
@@ -49,16 +51,22 @@ using ::tflite::TensorMetadata;
 using ::tflite::support::CreateStatusWithPayload;
 using ::tflite::support::StatusOr;
 using ::tflite::support::TfLiteSupportStatus;
+using ::tflite::support::text::tokenizer::RegexTokenizer;
+using ::tflite::support::text::tokenizer::Tokenizer;
+using ::tflite::support::text::tokenizer::TokenizerResult;
 using ::tflite::support::utils::LoadVocabFromBuffer;
 using ::tflite::task::core::Category;
 using ::tflite::task::core::Dequantize;
 using ::tflite::task::core::GetStringAtIndex;
+using ::tflite::task::core::PopulateTensor;
 using ::tflite::task::core::TaskAPIFactory;
 // To differentiate it from the struct option,
 // tflite::task::text::nl_classifier::NLClassifierOptions.
 using NLClassifierProtoOptions = ::tflite::task::text::NLClassifierOptions;
 
 namespace {
+constexpr int kRegexTokenizerInputTensorIndex = 0;
+constexpr int kRegexTokenizerProcessUnitIndex = 0;
 
 absl::Status SanityCheckOptions(const NLClassifierProtoOptions& options) {
   if (!options.has_base_options()) {
@@ -69,6 +77,78 @@ absl::Status SanityCheckOptions(const NLClassifierProtoOptions& options) {
   return absl::OkStatus();
 }
 
+// Resolves the first entry of `associated_files` to its contents by asking
+// `metadata_extractor` for the file with that entry's name.
+//
+// Returns kInvalidArgument (kMetadataInvalidTokenizerError) when the list is
+// null, empty, or its first entry has no name; otherwise forwards whatever
+// GetAssociatedFile() yields. `metadata_extractor` is dereferenced unchecked.
+StatusOr<absl::string_view> CheckAndLoadFirstAssociatedFile(
+    const flatbuffers::Vector<flatbuffers::Offset<tflite::AssociatedFile>>*
+        associated_files,
+    const tflite::metadata::ModelMetadataExtractor* metadata_extractor) {
+  const bool has_named_first_file = associated_files != nullptr &&
+                                    associated_files->size() >= 1 &&
+                                    associated_files->Get(0)->name() != nullptr;
+  if (!has_named_first_file) {
+    return CreateStatusWithPayload(
+        absl::StatusCode::kInvalidArgument,
+        "Invalid vocab_file from input process unit.",
+        TfLiteSupportStatus::kMetadataInvalidTokenizerError);
+  }
+
+  const std::string first_file_name = associated_files->Get(0)->name()->str();
+  ASSIGN_OR_RETURN(absl::string_view file_contents,
+                   metadata_extractor->GetAssociatedFile(first_file_name));
+  return file_contents;
+}
+
+// Builds a RegexTokenizer from `tokenizer_process_unit`, reading the vocabulary
+// from the first associated file referenced by its RegexTokenizerOptions.
+//
+// Returns:
+//   - kInvalidArgument if either pointer is null, the options table is absent,
+//     the vocab file or delim_regex_pattern is missing, or the resulting
+//     tokenizer lacks an <UNKNOWN> or <PAD> token;
+//   - kNotFound if the process unit's options_type is not
+//     RegexTokenizerOptions;
+//   - any error produced while loading the associated vocab file.
+// All errors generated here carry kMetadataInvalidTokenizerError.
+StatusOr<std::unique_ptr<Tokenizer>> CreateRegexTokenizerFromProcessUnit(
+    const tflite::ProcessUnit* tokenizer_process_unit,
+    const tflite::metadata::ModelMetadataExtractor* metadata_extractor) {
+  if (metadata_extractor == nullptr || tokenizer_process_unit == nullptr) {
+    return CreateStatusWithPayload(
+        absl::StatusCode::kInvalidArgument,
+        "No metadata or input process unit found.",
+        TfLiteSupportStatus::kMetadataInvalidTokenizerError);
+  }
+
+  if (tokenizer_process_unit->options_type() !=
+      ProcessUnitOptions_RegexTokenizerOptions) {
+    return CreateStatusWithPayload(
+        absl::StatusCode::kNotFound,
+        absl::StrCat(
+            "Incorrect options_type:", tokenizer_process_unit->options_type(),
+            " need RegexTokenizerOptions."),
+        TfLiteSupportStatus::kMetadataInvalidTokenizerError);
+  }
+
+  const tflite::RegexTokenizerOptions* options =
+      tokenizer_process_unit->options_as<RegexTokenizerOptions>();
+  // Guard against a union tag without a payload: options_as<>() can return
+  // nullptr even though options_type() matched above.
+  if (options == nullptr) {
+    return CreateStatusWithPayload(
+        absl::StatusCode::kInvalidArgument,
+        "Invalid RegexTokenizerOptions from input process unit.",
+        TfLiteSupportStatus::kMetadataInvalidTokenizerError);
+  }
+
+  ASSIGN_OR_RETURN(absl::string_view vocab_buffer,
+                   CheckAndLoadFirstAssociatedFile(options->vocab_file(),
+                                                   metadata_extractor));
+  if (options->delim_regex_pattern() == nullptr) {
+    return CreateStatusWithPayload(
+        absl::StatusCode::kInvalidArgument,
+        "Invalid delim_regex_pattern from input process unit.",
+        TfLiteSupportStatus::kMetadataInvalidTokenizerError);
+  }
+
+  std::unique_ptr<RegexTokenizer> regex_tokenizer =
+      absl::make_unique<RegexTokenizer>(options->delim_regex_pattern()->str(),
+                                        vocab_buffer.data(),
+                                        vocab_buffer.size());
+
+  // The ids themselves are not needed here; the two lookups below only verify
+  // that the vocab defines both special tokens.
+  int unknown_token_id = 0;
+  if (!regex_tokenizer->GetUnknownToken(&unknown_token_id)) {
+    return CreateStatusWithPayload(
+        absl::StatusCode::kInvalidArgument,
+        "RegexTokenizer doesn't have <UNKNOWN> token.",
+        TfLiteSupportStatus::kMetadataInvalidTokenizerError);
+  }
+
+  int pad_token_id = 0;
+  if (!regex_tokenizer->GetPadToken(&pad_token_id)) {
+    return CreateStatusWithPayload(
+        absl::StatusCode::kInvalidArgument,
+        "RegexTokenizer doesn't have <PAD> token.",
+        TfLiteSupportStatus::kMetadataInvalidTokenizerError);
+  }
+  // std::move is kept: the return converts unique_ptr<RegexTokenizer> into
+  // StatusOr<unique_ptr<Tokenizer>>.
+  return std::move(regex_tokenizer);
+}
+
 }  // namespace
 
 const NLClassifierOptions& NLClassifier::GetOptions() const {
@@ -121,7 +201,58 @@ std::vector<Category> NLClassifier::Classify(const std::string& text) {
 
 absl::Status NLClassifier::Preprocess(
     const std::vector<TfLiteTensor*>& input_tensors, const std::string& input) {
-  return preprocessor_->Preprocess(input);
+  TfLiteTensor* input_tensor = FindTensorWithNameOrIndex(
+      input_tensors, GetMetadataExtractor()->GetInputTensorMetadata(),
+      struct_options_.input_tensor_name, struct_options_.input_tensor_index);
+  if (input_tensor == nullptr) {
+    return CreateStatusWithPayload(
+        absl::StatusCode::kInvalidArgument,
+        "No input tensor found from NLClassifierOptions.",
+        TfLiteSupportStatus::kInputTensorNotFoundError);
+  }
+
+  if (HasRegexTokenizerMetadata()) {
+    //                              |<-------sentence_length-------->|
+    // input_tensor                 <START>, t1, t2... <PAD>, <PAD>...
+    // <START> is optional, t1, t2... will be replaced by <UNKNOWN> if it's not
+    // found in tokenizer vocab.
+    TokenizerResult result = tokenizer_->Tokenize(input);
+
+    size_t max_sentence_length = input_tensor->dims->size == 2
+                                     ? input_tensor->dims->data[1]
+                                     : input_tensor->dims->data[0];
+
+    int unknown_token_id = 0;
+    tokenizer_->GetUnknownToken(&unknown_token_id);
+
+    int pad_token_id = 0;
+    tokenizer_->GetPadToken(&pad_token_id);
+
+    std::vector<int> input_tokens(max_sentence_length, pad_token_id);
+    int start_token_id = 0;
+    size_t input_token_index = 0;
+    if (tokenizer_->GetStartToken(&start_token_id)) {
+      input_tokens[0] = start_token_id;
+      input_token_index = 1;
+    }
+
+    for (size_t i = 0; (i < result.subwords.size()) &&
+                       (input_token_index < max_sentence_length);
+         ++i, ++input_token_index) {
+      const std::string& token = result.subwords[i];
+      int token_id = 0;
+      if (tokenizer_->LookupId(token, &token_id)) {
+        input_tokens[input_token_index] = token_id;
+      } else {
+        input_tokens[input_token_index] = unknown_token_id;
+      }
+    }
+
+    RETURN_IF_ERROR(PopulateTensor(input_tokens, input_tensor));
+  } else {
+    RETURN_IF_ERROR(PopulateTensor(input, input_tensor));
+  }
+  return absl::OkStatus();
 }
 
 StatusOr<std::vector<Category>> NLClassifier::Postprocess(
@@ -196,23 +327,38 @@ absl::Status NLClassifier::Initialize(
 
 absl::Status NLClassifier::Initialize(const NLClassifierOptions& options) {
   struct_options_ = options;
-
-  int input_index = FindTensorIndex(
+  // Input tensor should be type INT32 when the model has RegexTokenizer
+  // metadata, and type STRING otherwise.
+  auto input_tensor = FindTensorWithNameOrIndex(
       GetInputTensors(), GetMetadataExtractor()->GetInputTensorMetadata(),
       options.input_tensor_name, options.input_tensor_index);
-
-  if (input_index < 0 || input_index >= GetInputCount()) {
+  if (input_tensor == nullptr) {
     return CreateStatusWithPayload(
         StatusCode::kInvalidArgument,
         absl::StrCat("No input tensor found with name ",
                      options.input_tensor_name, " or at index ",
                      options.input_tensor_index),
         TfLiteSupportStatus::kInputTensorNotFoundError);
   }
-
-  // Create preprocessor.
-  ASSIGN_OR_RETURN(preprocessor_, processor::TextPreprocessor::Create(
-                                      GetTfLiteEngine(), {input_index}));
+  if (HasRegexTokenizerMetadata()) {
+    if (input_tensor->type != kTfLiteInt32) {
+      return CreateStatusWithPayload(
+          StatusCode::kInvalidArgument,
+          absl::StrCat("Type mismatch for input tensor ", input_tensor->name,
+                       ". Requested INT32, got ",
+                       TfLiteTypeGetName(input_tensor->type), "."),
+          TfLiteSupportStatus::kInvalidInputTensorTypeError);
+    }
+    RETURN_IF_ERROR(SetupRegexTokenizer());
+  } else {
+    if (input_tensor->type != kTfLiteString) {
+      return CreateStatusWithPayload(
+          StatusCode::kInvalidArgument,
+          absl::StrCat("Type mismatch for input tensor ", input_tensor->name,
+                       ". Requested STRING, got ",
+                       TfLiteTypeGetName(input_tensor->type), "."),
+          TfLiteSupportStatus::kInvalidInputTensorTypeError);
+    }
+  }
 
   // output score tensor should be type
   // UINT8/INT8/INT16(quantized) or FLOAT32/FLOAT64(dequantized) or BOOL
@@ -334,6 +480,35 @@ StatusOr<std::unique_ptr<NLClassifier>> NLClassifier::CreateFromFdAndOptions(
   return std::move(nl_classifier);
 }
 
+// Reports whether the metadata of the input tensor at
+// kRegexTokenizerInputTensorIndex contains a process unit of type
+// RegexTokenizerOptions. Returns false when that tensor has no metadata, when
+// the lookup fails, or when the lookup yields a null process unit.
+bool NLClassifier::HasRegexTokenizerMetadata() {
+  const auto* metadata_extractor = GetMetadataExtractor();
+  const TensorMetadata* regex_input_metadata =
+      metadata_extractor->GetInputTensorMetadata(
+          kRegexTokenizerInputTensorIndex);
+  if (regex_input_metadata != nullptr) {
+    const tflite::support::StatusOr<const tflite::ProcessUnit*> process_unit =
+        metadata_extractor->FindFirstProcessUnit(
+            *regex_input_metadata, ProcessUnitOptions_RegexTokenizerOptions);
+    return process_unit.ok() && process_unit.value() != nullptr;
+  }
+  return false;
+}
+
+// Creates the RegexTokenizer described by the model metadata and stores it in
+// `tokenizer_`.
+//
+// The process unit is read from slot kRegexTokenizerProcessUnitIndex of the
+// input tensor at kRegexTokenizerInputTensorIndex. Missing tensor metadata, a
+// missing process-unit list, or a list that is too short results in a null
+// process unit being passed to CreateRegexTokenizerFromProcessUnit(), which
+// reports it as an error, instead of dereferencing a null pointer here.
+//
+// Returns any error from CreateRegexTokenizerFromProcessUnit(), or kInternal if
+// the created tokenizer is not a RegexTokenizer.
+absl::Status NLClassifier::SetupRegexTokenizer() {
+  const TensorMetadata* input_tensor_metadata =
+      GetMetadataExtractor()->GetInputTensorMetadata(
+          kRegexTokenizerInputTensorIndex);
+
+  const tflite::ProcessUnit* tokenizer_process_unit = nullptr;
+  if (input_tensor_metadata != nullptr &&
+      input_tensor_metadata->process_units() != nullptr &&
+      input_tensor_metadata->process_units()->size() >
+          static_cast<size_t>(kRegexTokenizerProcessUnitIndex)) {
+    tokenizer_process_unit = input_tensor_metadata->process_units()->Get(
+        kRegexTokenizerProcessUnitIndex);
+  }
+
+  ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer> base_tokenizer,
+                   CreateRegexTokenizerFromProcessUnit(
+                       tokenizer_process_unit, GetMetadataExtractor()));
+
+  // Only hand ownership over once the downcast is known to succeed, so a
+  // tokenizer of the wrong type is neither leaked nor stored as null.
+  auto* regex_tokenizer = dynamic_cast<RegexTokenizer*>(base_tokenizer.get());
+  if (regex_tokenizer == nullptr) {
+    return CreateStatusWithPayload(
+        absl::StatusCode::kInternal,
+        "Tokenizer created from metadata is not a RegexTokenizer.",
+        TfLiteSupportStatus::kMetadataInvalidTokenizerError);
+  }
+  static_cast<void>(base_tokenizer.release());
+  tokenizer_.reset(regex_tokenizer);
+
+  return absl::OkStatus();
+}
+
 }  // namespace nlclassifier
 }  // namespace text
 }  // namespace task
diff --git a/tensorflow_lite_support/cc/task/text/nlclassifier/nl_classifier.h b/tensorflow_lite_support/cc/task/text/nlclassifier/nl_classifier.h
@@ -34,8 +34,8 @@ limitations under the License.
 #include "tensorflow_lite_support/cc/port/statusor.h"
 #include "tensorflow_lite_support/cc/task/core/base_task_api.h"
 #include "tensorflow_lite_support/cc/task/core/category.h"
-#include "tensorflow_lite_support/cc/task/processor/text_preprocessor.h"
 #include "tensorflow_lite_support/cc/task/text/proto/nl_classifier_options_proto_inc.h"
+#include "tensorflow_lite_support/cc/text/tokenizers/regex_tokenizer.h"
 
 namespace tflite {
 namespace task {
@@ -181,41 +181,25 @@ class NLClassifier : public core::BaseTaskApi<std::vector<core::Category>,
       const flatbuffers::Vector<flatbuffers::Offset<TensorMetadata>>*
           metadata_array,
       const std::string& name, int index) {
-    int tensor_index = FindTensorIndex(tensors, metadata_array, name, index);
-    return tensor_index >= 0 && tensor_index < tensors.size()
-               ? tensors[tensor_index]
-               : nullptr;
-  }
-
-  // Gets the tensor index of the specified tensor name from a vector of tensors
-  // Return nullptr if no tensor is found by name (metadata tensor name or model
-  // tensor name).
-  template <typename TensorType>
-  static int FindTensorIndex(
-      const std::vector<TensorType*>& tensors,
-      const flatbuffers::Vector<flatbuffers::Offset<TensorMetadata>>*
-          metadata_array,
-      const std::string& name, int default_index) {
     if (metadata_array != nullptr && metadata_array->size() == tensors.size()) {
       for (int i = 0; i < metadata_array->size(); i++) {
         if (strcmp(name.data(), metadata_array->Get(i)->name()->c_str()) == 0) {
-          return i;
+          return tensors[i];
         }
       }
     }
 
-    for (int i = 0; i < tensors.size(); i++) {
-      TensorType* tensor = tensors[i];
+    for (TensorType* tensor : tensors) {
       if (tensor->name == name) {
-        return i;
+        return tensor;
       }
     }
-    return default_index;
+    return index >= 0 && index < tensors.size() ? tensors[index] : nullptr;
   }
 
  private:
-  std::unique_ptr<tflite::task::processor::TextPreprocessor> preprocessor_ =
-      nullptr;
+  bool HasRegexTokenizerMetadata();
+  absl::Status SetupRegexTokenizer();
 
   std::unique_ptr<tflite::task::text::NLClassifierOptions> proto_options_;
 
@@ -226,6 +210,7 @@ class NLClassifier : public core::BaseTaskApi<std::vector<core::Category>,
   // labels vector initialized from output tensor's associated file, if one
   // exists.
   std::unique_ptr<std::vector<std::string>> labels_vector_;
+  std::unique_ptr<tflite::support::text::tokenizer::RegexTokenizer> tokenizer_;
 };
 
 }  // namespace nlclassifier