Migrate NLClassifier to using TextPreprocessor

lu-wang-g · tflite-support-robot · commit a2786869b4f5 · 2021-11-16T21:25:20.000-08:00
PiperOrigin-RevId: 410426912
diff --git a/tensorflow_lite_support/cc/task/processor/text_preprocessor.cc b/tensorflow_lite_support/cc/task/processor/text_preprocessor.cc
@@ -214,8 +214,7 @@ absl::Status TextPreprocessor::BertPreprocess(const std::string& input_text) {
 
 absl::Status TextPreprocessor::RegexPreprocess(const std::string& input_text) {
   TfLiteTensor* input_tensor = GetTensor();
-  auto regex_tokenizer = std::unique_ptr<RegexTokenizer>(
-      dynamic_cast<RegexTokenizer*>(tokenizer_.release()));
+  auto regex_tokenizer = dynamic_cast<RegexTokenizer*>(tokenizer_.get());
 
   //                              |<-------sentence_length-------->|
   // input_tensor                 <START>, t1, t2... <PAD>, <PAD>...
diff --git a/tensorflow_lite_support/cc/task/text/nlclassifier/BUILD b/tensorflow_lite_support/cc/task/text/nlclassifier/BUILD
@@ -20,6 +20,7 @@ cc_library_with_tflite(
         "@org_tensorflow//tensorflow/lite/core/shims:builtin_ops",
         "//tensorflow_lite_support/cc/task/core:base_task_api",
         "//tensorflow_lite_support/cc/task/core:task_api_factory",
+        "//tensorflow_lite_support/cc/task/processor:text_preprocessor",
     ],
     deps = [
         "//tensorflow_lite_support/cc:common",
@@ -28,10 +29,7 @@ cc_library_with_tflite(
         "//tensorflow_lite_support/cc/task/core:category",
         "//tensorflow_lite_support/cc/task/core:task_utils",
         "//tensorflow_lite_support/cc/task/text/proto:nl_classifier_options_proto_inc",
-        "//tensorflow_lite_support/cc/text/tokenizers:regex_tokenizer",
-        "//tensorflow_lite_support/cc/text/tokenizers:tokenizer",
         "//tensorflow_lite_support/cc/utils:common_utils",
-        "//tensorflow_lite_support/metadata/cc:metadata_extractor",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/status",
diff --git a/tensorflow_lite_support/cc/task/text/nlclassifier/nl_classifier.cc b/tensorflow_lite_support/cc/task/text/nlclassifier/nl_classifier.cc
@@ -35,8 +35,6 @@ limitations under the License.
 #include "tensorflow_lite_support/cc/task/core/category.h"
 #include "tensorflow_lite_support/cc/task/core/task_api_factory.h"
 #include "tensorflow_lite_support/cc/task/core/task_utils.h"
-#include "tensorflow_lite_support/cc/text/tokenizers/regex_tokenizer.h"
-#include "tensorflow_lite_support/cc/text/tokenizers/tokenizer.h"
 #include "tensorflow_lite_support/cc/utils/common_utils.h"
 
 namespace tflite {
@@ -51,22 +49,16 @@ using ::tflite::TensorMetadata;
 using ::tflite::support::CreateStatusWithPayload;
 using ::tflite::support::StatusOr;
 using ::tflite::support::TfLiteSupportStatus;
-using ::tflite::support::text::tokenizer::RegexTokenizer;
-using ::tflite::support::text::tokenizer::Tokenizer;
-using ::tflite::support::text::tokenizer::TokenizerResult;
 using ::tflite::support::utils::LoadVocabFromBuffer;
 using ::tflite::task::core::Category;
 using ::tflite::task::core::Dequantize;
 using ::tflite::task::core::GetStringAtIndex;
-using ::tflite::task::core::PopulateTensor;
 using ::tflite::task::core::TaskAPIFactory;
 // To differenciate it with the struct option,
 // tflite::task::text::nl_classifier::NLClassifierOptions.
 using NLClassifierProtoOptions = ::tflite::task::text::NLClassifierOptions;
 
 namespace {
-constexpr int kRegexTokenizerInputTensorIndex = 0;
-constexpr int kRegexTokenizerProcessUnitIndex = 0;
 
 absl::Status SanityCheckOptions(const NLClassifierProtoOptions& options) {
   if (!options.has_base_options()) {
@@ -77,78 +69,6 @@ absl::Status SanityCheckOptions(const NLClassifierProtoOptions& options) {
   return absl::OkStatus();
 }
 
-StatusOr<absl::string_view> CheckAndLoadFirstAssociatedFile(
-    const flatbuffers::Vector<flatbuffers::Offset<tflite::AssociatedFile>>*
-        associated_files,
-    const tflite::metadata::ModelMetadataExtractor* metadata_extractor) {
-  if (associated_files == nullptr || associated_files->size() < 1 ||
-      associated_files->Get(0)->name() == nullptr) {
-    return CreateStatusWithPayload(
-        absl::StatusCode::kInvalidArgument,
-        "Invalid vocab_file from input process unit.",
-        TfLiteSupportStatus::kMetadataInvalidTokenizerError);
-  }
-  ASSIGN_OR_RETURN(absl::string_view vocab_buffer,
-                   metadata_extractor->GetAssociatedFile(
-                       associated_files->Get(0)->name()->str()));
-  return vocab_buffer;
-}
-
-StatusOr<std::unique_ptr<Tokenizer>> CreateRegexTokenizerFromProcessUnit(
-    const tflite::ProcessUnit* tokenizer_process_unit,
-    const tflite::metadata::ModelMetadataExtractor* metadata_extractor) {
-  if (metadata_extractor == nullptr || tokenizer_process_unit == nullptr) {
-    return CreateStatusWithPayload(
-        absl::StatusCode::kInvalidArgument,
-        "No metadata or input process unit found.",
-        TfLiteSupportStatus::kMetadataInvalidTokenizerError);
-  }
-
-  if (tokenizer_process_unit->options_type() !=
-      ProcessUnitOptions_RegexTokenizerOptions) {
-    return CreateStatusWithPayload(
-        absl::StatusCode::kNotFound,
-        absl::StrCat(
-            "Incorrect options_type:", tokenizer_process_unit->options_type(),
-            " need RegexTokenizerOptions."),
-        TfLiteSupportStatus::kMetadataInvalidTokenizerError);
-  }
-
-  const tflite::RegexTokenizerOptions* options =
-      tokenizer_process_unit->options_as<RegexTokenizerOptions>();
-  ASSIGN_OR_RETURN(absl::string_view vocab_buffer,
-                   CheckAndLoadFirstAssociatedFile(options->vocab_file(),
-                                                   metadata_extractor));
-  if (options->delim_regex_pattern() == nullptr) {
-    return CreateStatusWithPayload(
-        absl::StatusCode::kInvalidArgument,
-        "Invalid delim_regex_pattern from input process unit.",
-        TfLiteSupportStatus::kMetadataInvalidTokenizerError);
-  }
-
-  std::unique_ptr<RegexTokenizer> regex_tokenizer =
-      absl::make_unique<RegexTokenizer>(options->delim_regex_pattern()->str(),
-                                        vocab_buffer.data(),
-                                        vocab_buffer.size());
-
-  int unknown_token_id = 0;
-  if (!regex_tokenizer->GetUnknownToken(&unknown_token_id)) {
-    return CreateStatusWithPayload(
-        absl::StatusCode::kInvalidArgument,
-        "RegexTokenizer doesn't have <UNKNOWN> token.",
-        TfLiteSupportStatus::kMetadataInvalidTokenizerError);
-  }
-
-  int pad_token_id = 0;
-  if (!regex_tokenizer->GetPadToken(&pad_token_id)) {
-    return CreateStatusWithPayload(
-        absl::StatusCode::kInvalidArgument,
-        "RegexTokenizer doesn't have <PAD> token.",
-        TfLiteSupportStatus::kMetadataInvalidTokenizerError);
-  }
-  return std::move(regex_tokenizer);
-}
-
 }  // namespace
 
 const NLClassifierOptions& NLClassifier::GetOptions() const {
@@ -201,58 +121,7 @@ std::vector<Category> NLClassifier::Classify(const std::string& text) {
 
 absl::Status NLClassifier::Preprocess(
     const std::vector<TfLiteTensor*>& input_tensors, const std::string& input) {
-  TfLiteTensor* input_tensor = FindTensorWithNameOrIndex(
-      input_tensors, GetMetadataExtractor()->GetInputTensorMetadata(),
-      struct_options_.input_tensor_name, struct_options_.input_tensor_index);
-  if (input_tensor == nullptr) {
-    return CreateStatusWithPayload(
-        absl::StatusCode::kInvalidArgument,
-        "No input tensor found from NLClassifierOptions.",
-        TfLiteSupportStatus::kInputTensorNotFoundError);
-  }
-
-  if (HasRegexTokenizerMetadata()) {
-    //                              |<-------sentence_length-------->|
-    // input_tensor                 <START>, t1, t2... <PAD>, <PAD>...
-    // <START> is optional, t1, t2... will be replaced by <UNKNOWN> if it's not
-    // found in tokenizer vocab.
-    TokenizerResult result = tokenizer_->Tokenize(input);
-
-    size_t max_sentence_length = input_tensor->dims->size == 2
-                                     ? input_tensor->dims->data[1]
-                                     : input_tensor->dims->data[0];
-
-    int unknown_token_id = 0;
-    tokenizer_->GetUnknownToken(&unknown_token_id);
-
-    int pad_token_id = 0;
-    tokenizer_->GetPadToken(&pad_token_id);
-
-    std::vector<int> input_tokens(max_sentence_length, pad_token_id);
-    int start_token_id = 0;
-    size_t input_token_index = 0;
-    if (tokenizer_->GetStartToken(&start_token_id)) {
-      input_tokens[0] = start_token_id;
-      input_token_index = 1;
-    }
-
-    for (size_t i = 0; (i < result.subwords.size()) &&
-                       (input_token_index < max_sentence_length);
-         ++i, ++input_token_index) {
-      const std::string& token = result.subwords[i];
-      int token_id = 0;
-      if (tokenizer_->LookupId(token, &token_id)) {
-        input_tokens[input_token_index] = token_id;
-      } else {
-        input_tokens[input_token_index] = unknown_token_id;
-      }
-    }
-
-    RETURN_IF_ERROR(PopulateTensor(input_tokens, input_tensor));
-  } else {
-    RETURN_IF_ERROR(PopulateTensor(input, input_tensor));
-  }
-  return absl::OkStatus();
+  return preprocessor_->Preprocess(input);
 }
 
 StatusOr<std::vector<Category>> NLClassifier::Postprocess(
@@ -327,38 +196,23 @@ absl::Status NLClassifier::Initialize(
 
 absl::Status NLClassifier::Initialize(const NLClassifierOptions& options) {
   struct_options_ = options;
-  // input tensor should be type STRING
-  auto input_tensor = FindTensorWithNameOrIndex(
+
+  int input_index = FindTensorIndex(
       GetInputTensors(), GetMetadataExtractor()->GetInputTensorMetadata(),
       options.input_tensor_name, options.input_tensor_index);
-  if (input_tensor == nullptr) {
+
+  if (input_index < 0 || input_index >= GetInputCount()) {
     return CreateStatusWithPayload(
         StatusCode::kInvalidArgument,
         absl::StrCat("No input tensor found with name ",
                      options.input_tensor_name, " or at index ",
                      options.input_tensor_index),
         TfLiteSupportStatus::kInputTensorNotFoundError);
   }
-  if (HasRegexTokenizerMetadata()) {
-    if (input_tensor->type != kTfLiteInt32) {
-      return CreateStatusWithPayload(
-          StatusCode::kInvalidArgument,
-          absl::StrCat("Type mismatch for input tensor ", input_tensor->name,
-                       ". Requested INT32, got ",
-                       TfLiteTypeGetName(input_tensor->type), "."),
-          TfLiteSupportStatus::kInvalidInputTensorTypeError);
-    }
-    RETURN_IF_ERROR(SetupRegexTokenizer());
-  } else {
-    if (input_tensor->type != kTfLiteString) {
-      return CreateStatusWithPayload(
-          StatusCode::kInvalidArgument,
-          absl::StrCat("Type mismatch for input tensor ", input_tensor->name,
-                       ". Requested STRING, got ",
-                       TfLiteTypeGetName(input_tensor->type), "."),
-          TfLiteSupportStatus::kInvalidInputTensorTypeError);
-    }
-  }
+
+  // Create preprocessor.
+  ASSIGN_OR_RETURN(preprocessor_, processor::TextPreprocessor::Create(
+                                      GetTfLiteEngine(), {input_index}));
 
   // output score tensor should be type
   // UINT8/INT8/INT16(quantized) or FLOAT32/FLOAT64(dequantized) or BOOL
@@ -480,35 +334,6 @@ StatusOr<std::unique_ptr<NLClassifier>> NLClassifier::CreateFromFdAndOptions(
   return std::move(nl_classifier);
 }
 
-bool NLClassifier::HasRegexTokenizerMetadata() {
-  const TensorMetadata* input_tensor_metadata =
-      GetMetadataExtractor()->GetInputTensorMetadata(
-          kRegexTokenizerInputTensorIndex);
-  if (input_tensor_metadata == nullptr) {
-    return false;
-  }
-  tflite::support::StatusOr<const tflite::ProcessUnit*> status =
-      GetMetadataExtractor()->FindFirstProcessUnit(
-          *input_tensor_metadata, ProcessUnitOptions_RegexTokenizerOptions);
-  return status.ok() ? status.value() != nullptr : false;
-}
-
-absl::Status NLClassifier::SetupRegexTokenizer() {
-  ASSIGN_OR_RETURN(
-      std::unique_ptr<Tokenizer> base_tokenizer,
-      CreateRegexTokenizerFromProcessUnit(
-          GetMetadataExtractor()
-              ->GetInputTensorMetadata(kRegexTokenizerInputTensorIndex)
-              ->process_units()
-              ->Get(kRegexTokenizerProcessUnitIndex),
-          GetMetadataExtractor()));
-
-  tokenizer_ = std::unique_ptr<RegexTokenizer>(
-      dynamic_cast<RegexTokenizer*>(base_tokenizer.release()));
-
-  return absl::OkStatus();
-}
-
 }  // namespace nlclassifier
 }  // namespace text
 }  // namespace task
diff --git a/tensorflow_lite_support/cc/task/text/nlclassifier/nl_classifier.h b/tensorflow_lite_support/cc/task/text/nlclassifier/nl_classifier.h
@@ -34,8 +34,8 @@ limitations under the License.
 #include "tensorflow_lite_support/cc/port/statusor.h"
 #include "tensorflow_lite_support/cc/task/core/base_task_api.h"
 #include "tensorflow_lite_support/cc/task/core/category.h"
+#include "tensorflow_lite_support/cc/task/processor/text_preprocessor.h"
 #include "tensorflow_lite_support/cc/task/text/proto/nl_classifier_options_proto_inc.h"
-#include "tensorflow_lite_support/cc/text/tokenizers/regex_tokenizer.h"
 
 namespace tflite {
 namespace task {
@@ -181,25 +181,41 @@ class NLClassifier : public core::BaseTaskApi<std::vector<core::Category>,
       const flatbuffers::Vector<flatbuffers::Offset<TensorMetadata>>*
           metadata_array,
       const std::string& name, int index) {
+    int tensor_index = FindTensorIndex(tensors, metadata_array, name, index);
+    return tensor_index >= 0 && tensor_index < tensors.size()
+               ? tensors[tensor_index]
+               : nullptr;
+  }
+
+  // Gets the index of the tensor with the specified name from a vector of
+  // tensors. Returns default_index if no tensor is found by name (metadata
+  // tensor name or model tensor name).
+  template <typename TensorType>
+  static int FindTensorIndex(
+      const std::vector<TensorType*>& tensors,
+      const flatbuffers::Vector<flatbuffers::Offset<TensorMetadata>>*
+          metadata_array,
+      const std::string& name, int default_index) {
     if (metadata_array != nullptr && metadata_array->size() == tensors.size()) {
       for (int i = 0; i < metadata_array->size(); i++) {
         if (strcmp(name.data(), metadata_array->Get(i)->name()->c_str()) == 0) {
-          return tensors[i];
+          return i;
         }
       }
     }
 
-    for (TensorType* tensor : tensors) {
+    for (int i = 0; i < tensors.size(); i++) {
+      TensorType* tensor = tensors[i];
       if (tensor->name == name) {
-        return tensor;
+        return i;
       }
     }
-    return index >= 0 && index < tensors.size() ? tensors[index] : nullptr;
+    return default_index;
   }
 
  private:
-  bool HasRegexTokenizerMetadata();
-  absl::Status SetupRegexTokenizer();
+  std::unique_ptr<tflite::task::processor::TextPreprocessor> preprocessor_ =
+      nullptr;
 
   std::unique_ptr<tflite::task::text::NLClassifierOptions> proto_options_;
 
@@ -210,7 +226,6 @@ class NLClassifier : public core::BaseTaskApi<std::vector<core::Category>,
   // labels vector initialized from output tensor's associated file, if one
   // exists.
   std::unique_ptr<std::vector<std::string>> labels_vector_;
-  std::unique_ptr<tflite::support::text::tokenizer::RegexTokenizer> tokenizer_;
 };
 
 }  // namespace nlclassifier