
Commit b91c2e9

hheydary authored and copybara-github committed
Add LoRA support to AI Edge Transformers.
PiperOrigin-RevId: 713037640
1 parent 603e8ea commit b91c2e9

File tree

4 files changed: +227 -9 lines changed

ai_edge_torch/generative/examples/cpp/BUILD
ai_edge_torch/generative/examples/cpp/text_generator_main.cc
ai_edge_torch/generative/examples/cpp/utils.cc
ai_edge_torch/generative/examples/cpp/utils.h

ai_edge_torch/generative/examples/cpp/BUILD

Lines changed: 8 additions & 0 deletions
@@ -24,9 +24,17 @@ package(

 cc_library(
     name = "utils",
+    srcs = ["utils.cc"],
     hdrs = ["utils.h"],
     deps = [
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
+        "@org_tensorflow//tensorflow/lite:framework",
         "@org_tensorflow//tensorflow/lite:util",
+        "@org_tensorflow//tensorflow/lite/schema:schema_fbs",
     ],
 )

ai_edge_torch/generative/examples/cpp/text_generator_main.cc

Lines changed: 29 additions & 9 deletions
@@ -71,10 +71,12 @@ ABSL_FLAG(std::string, stop_token, "",
 ABSL_FLAG(int, num_threads, 4, "Number of threads to use. Defaults to 4.");
 ABSL_FLAG(std::string, weight_cache_path, "",
           "XNNPACK weight caching path, e.g. /tmp/model.xnnpack_cache.");
+ABSL_FLAG(std::string, lora_path, "", "Optional path to LoRA artifact.");

 namespace {

 using ai_edge_torch::examples::AlignedAllocator;
+using ai_edge_torch::examples::LoRA;

 std::unique_ptr<tflite::FlatBufferModel> LoadModel() {
   std::unique_ptr<tflite::FlatBufferModel> model =
@@ -172,23 +174,32 @@ void PrepareRunner(
 tflite::SignatureRunner* GetPrefillRunner(
     tflite::Interpreter* interpreter, std::size_t num_input_tokens,
     std::map<std::string, std::vector<float, AlignedAllocator<float>>>&
-        kv_cache) {
-  // Find the prefill signature that best matches the input token size.
+        kv_cache,
+    const LoRA* lora) {
+  // Find the prefill signature length that best matches the input token size.
   tflite::SignatureRunner* runner = nullptr;
+  int best_seq_size = -1;
   int delta = std::numeric_limits<int>::max();
   for (const std::string* key : interpreter->signature_keys()) {
-    if (!absl::StrContains(*key, "prefill")) {
+    if (!absl::StrContains(*key, "prefill") ||
+        absl::StrContains(*key, "lora")) {
       continue;
     }
     TfLiteTensor* input_pos = interpreter->GetSignatureRunner(key->c_str())
                                   ->input_tensor("input_pos");
     // The expected shape for input position is [Seq].
     int seq_size = input_pos->dims->data[0];
     if (num_input_tokens <= seq_size && seq_size - num_input_tokens < delta) {
-      runner = interpreter->GetSignatureRunner(key->c_str());
+      if (lora == nullptr) {
+        runner = interpreter->GetSignatureRunner(key->c_str());
+      }
+      best_seq_size = seq_size;
       delta = seq_size - num_input_tokens;
     }
   }
+  if (lora != nullptr) {
+    runner = lora->GetPrefillRunner(interpreter, best_seq_size);
+  }
   MINIMAL_CHECK(runner != nullptr);
   PrepareRunner(runner, kv_cache);
   return runner;
@@ -197,8 +208,11 @@ tflite::SignatureRunner* GetPrefillRunner(
 tflite::SignatureRunner* GetDecodeRunner(
     tflite::Interpreter* interpreter,
     std::map<std::string, std::vector<float, AlignedAllocator<float>>>&
-        kv_cache) {
-  tflite::SignatureRunner* runner = interpreter->GetSignatureRunner("decode");
+        kv_cache,
+    LoRA* lora) {
+  tflite::SignatureRunner* runner =
+      lora == nullptr ? interpreter->GetSignatureRunner("decode")
+                      : lora->GetDecodeRunner(interpreter);
   MINIMAL_CHECK(runner != nullptr);
   PrepareRunner(runner, kv_cache);
   return runner;
@@ -242,7 +256,13 @@ int main(int argc, char* argv[]) {
       LoadSentencePieceProcessor();
   std::map<std::string, std::vector<float, AlignedAllocator<float>>> kv_cache =
       BuildKVCache(interpreter.get());
-  MINIMAL_CHECK(!kv_cache.empty())
+  MINIMAL_CHECK(!kv_cache.empty());
+
+  std::unique_ptr<LoRA> lora = nullptr;
+  if (!absl::GetFlag(FLAGS_lora_path).empty()) {
+    lora = LoRA::FromFile(absl::GetFlag(FLAGS_lora_path));
+    MINIMAL_CHECK(lora != nullptr);
+  }

   // Tokenize the input prompt.
   std::string prompt = absl::GetFlag(FLAGS_prompt);
@@ -263,10 +283,10 @@ int main(int argc, char* argv[]) {
   // Get prefill and decode signature runners.
   std::size_t effective_prefill_token_size = prompt_tokens.size() - 1;
   tflite::SignatureRunner* prefill_runner = GetPrefillRunner(
-      interpreter.get(), effective_prefill_token_size, kv_cache);
+      interpreter.get(), effective_prefill_token_size, kv_cache, lora.get());
   MINIMAL_CHECK(prefill_runner != nullptr);
   tflite::SignatureRunner* decode_runner =
-      GetDecodeRunner(interpreter.get(), kv_cache);
+      GetDecodeRunner(interpreter.get(), kv_cache, lora.get());
   MINIMAL_CHECK(decode_runner != nullptr);

   // Get Input Tensors for each of the runners.
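The prefill-runner selection above changes in only one place when LoRA is enabled: the loop over the base "prefill" signatures is still used to find the smallest exported sequence length that fits the prompt, but the runner itself is then taken from the LoRA object. A minimal sketch of that selection rule, with an illustrative function name and a plain vector standing in for the interpreter's signature list:

#include <limits>
#include <vector>

// Illustrative only: mirrors the matching logic in GetPrefillRunner. Returns
// the smallest exported prefill length that can hold the prompt, or -1 if
// none is large enough.
int PickPrefillSeqLen(const std::vector<int>& available_lengths,
                      int num_input_tokens) {
  int best_seq_size = -1;
  int delta = std::numeric_limits<int>::max();
  for (int seq_size : available_lengths) {
    if (num_input_tokens <= seq_size && seq_size - num_input_tokens < delta) {
      best_seq_size = seq_size;
      delta = seq_size - num_input_tokens;
    }
  }
  return best_seq_size;
}

With --lora_path set, this best-matching length is what GetPrefillRunner passes to LoRA::GetPrefillRunner instead of using the base signature directly.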
ai_edge_torch/generative/examples/cpp/utils.cc

Lines changed: 133 additions & 0 deletions
@@ -0,0 +1,133 @@
+/* Copyright 2025 The AI Edge Torch Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "ai_edge_torch/generative/examples/cpp/utils.h"
+
+#include <cstddef>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/memory/memory.h"
+#include "absl/strings/match.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/string_view.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/model_builder.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/signature_runner.h"
+
+namespace ai_edge_torch::examples {
+
+std::unique_ptr<LoRA> LoRA::FromFile(absl::string_view path) {
+  std::unique_ptr<tflite::FlatBufferModel> model =
+      tflite::FlatBufferModel::VerifyAndBuildFromFile(path.data());
+  if (model == nullptr) {
+    return nullptr;
+  }
+
+  int rank = -1;
+  absl::flat_hash_map<std::string, std::vector<float, AlignedAllocator<float>>>
+      tensors;
+  for (const auto& tensor :
+       *model->GetModel()->subgraphs()->Get(0)->tensors()) {
+    size_t size = 1;
+    for (const int& dim : *tensor->shape()) {
+      size *= dim;
+    }
+    std::vector<float, AlignedAllocator<float>> buffer(size);
+    const auto* data =
+        model->GetModel()->buffers()->Get(tensor->buffer())->data();
+    memcpy(buffer.data(), data->data(), data->size());
+    tensors.emplace(*tensor->name(), std::move(buffer));
+
+    if (tensor->name()->str() == "lora_atten_q_a_prime_weight_0") {
+      rank = tensor->shape()->Get(1);
+    }
+  }
+  if (rank == -1) {
+    return nullptr;
+  }
+
+  return absl::WrapUnique(new LoRA(rank, std::move(tensors)));
+}
+
+tflite::SignatureRunner* LoRA::GetPrefillRunner(
+    tflite::Interpreter* interpreter, int matched_sequence_length) const {
+  std::string signature_name =
+      absl::StrFormat("prefill_%d_lora_r%d", matched_sequence_length, rank_);
+  return GetRunnerHelper(interpreter, signature_name);
+}
+
+tflite::SignatureRunner* LoRA::GetDecodeRunner(
+    tflite::Interpreter* interpreter) const {
+  std::string signature_name = absl::StrFormat("decode_lora_r%d", rank_);
+  return GetRunnerHelper(interpreter, signature_name);
+};
+
+tflite::SignatureRunner* LoRA::GetRunnerHelper(
+    tflite::Interpreter* interpreter, absl::string_view signature_name) const {
+  tflite::SignatureRunner* runner =
+      interpreter->GetSignatureRunner(signature_name.data());
+  if (runner == nullptr) {
+    return nullptr;
+  }
+
+  absl::flat_hash_set<std::string> lora_input_tensors;
+  lora_input_tensors.reserve(runner->input_size());
+  for (const char* input_name : runner->input_names()) {
+    if (absl::StrContains(input_name, "lora")) {
+      lora_input_tensors.insert(input_name);
+    }
+  }
+
+  if (lora_input_tensors.size() < tensors_.size()) {
+    return nullptr;
+  }
+
+  for (const auto& [name, buffer] : tensors_) {
+    TfLiteTensor* tensor = runner->input_tensor(name.c_str());
+    if (tensor == nullptr) {
+      return nullptr;
+    }
+    lora_input_tensors.erase(name);
+    TfLiteCustomAllocation allocation = {
+        .data = static_cast<void*>(const_cast<float*>(buffer.data())),
+        .bytes = buffer.size() * sizeof(float)};
+    if (runner->SetCustomAllocationForInputTensor(name.c_str(), allocation) !=
+        kTfLiteOk) {
+      return nullptr;
+    }
+  };
+  if (runner->AllocateTensors() != kTfLiteOk) {
+    return nullptr;
+  }
+
+  for (const auto& name : lora_input_tensors) {
+    TfLiteTensor* tensor = runner->input_tensor(name.c_str());
+    if (tensor == nullptr) {
+      return nullptr;
+    }
+    memset(tensor->data.data, 0, tensor->bytes);
+  }
+
+  return runner;
+}
+
+} // namespace ai_edge_torch::examples
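GetPrefillRunner and GetDecodeRunner above do little more than format a signature name from the matched sequence length and the adapter rank (inferred in FromFile from the second dimension of lora_atten_q_a_prime_weight_0) and hand it to GetRunnerHelper. A small sketch of that naming convention; the helper names and the example values in the comments are illustrative, only the format strings come from the code above:

#include <string>

#include "absl/strings/str_format.h"

// e.g. seq_len = 128, rank = 16 -> "prefill_128_lora_r16"
std::string LoraPrefillSignature(int seq_len, int rank) {
  return absl::StrFormat("prefill_%d_lora_r%d", seq_len, rank);
}

// e.g. rank = 16 -> "decode_lora_r16"
std::string LoraDecodeSignature(int rank) {
  return absl::StrFormat("decode_lora_r%d", rank);
}

The converted model therefore has to export these LoRA-specific signatures alongside the base prefill/decode ones; otherwise GetSignatureRunner returns nullptr and the helper bails out.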

ai_edge_torch/generative/examples/cpp/utils.h

Lines changed: 57 additions & 0 deletions
@@ -1,8 +1,31 @@
+/* Copyright 2025 The AI Edge Torch Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
 #ifndef THIRD_PARTY_PY_AI_EDGE_TORCH_GENERATIVE_EXAMPLES_CPP_UTILS_H_
 #define THIRD_PARTY_PY_AI_EDGE_TORCH_GENERATIVE_EXAMPLES_CPP_UTILS_H_

 #include <cstddef>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>

+#include "absl/container/flat_hash_map.h"
+#include "absl/strings/string_view.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/signature_runner.h"
 #include "tensorflow/lite/util.h"

 namespace ai_edge_torch::examples {
@@ -39,6 +62,40 @@ class AlignedAllocator {
   void deallocate(T* ptr, std::size_t n) { free(ptr); }
 };

+// An example implementation of a LoRA adapter manager for the TFLite
+// interpreter. The class loads an adapter from a flatbuffers file and
+// provides helper methods for finding the right signature and setting the
+// appropriate input tensors. Note the use of custom allocations to ensure
+// zero-copy loading and cheap hot-swapping between multiple adapters.
+class LoRA {
+ public:
+  static std::unique_ptr<LoRA> FromFile(absl::string_view path);
+
+  tflite::SignatureRunner* GetPrefillRunner(tflite::Interpreter* interpreter,
+                                            int matched_sequence_length) const;
+  tflite::SignatureRunner* GetDecodeRunner(
+      tflite::Interpreter* interpreter) const;
+
+  int rank() const { return rank_; };
+
+ private:
+  explicit LoRA(int rank,
+                absl::flat_hash_map<std::string,
+                                    std::vector<float, AlignedAllocator<float>>>
+                    tensors)
+      : rank_(rank), tensors_(std::move(tensors)) {}
+
+  tflite::SignatureRunner* GetRunnerHelper(
+      tflite::Interpreter* interpreter, absl::string_view signature_name) const;
+
+  // The rank of the LoRA adapter.
+  const int rank_;
+  // A map of names to LoRA tensors.
+  const absl::flat_hash_map<std::string,
+                            std::vector<float, AlignedAllocator<float>>>
+      tensors_;
+};
+
 } // namespace ai_edge_torch::examples

 #endif // THIRD_PARTY_PY_AI_EDGE_TORCH_GENERATIVE_EXAMPLES_CPP_UTILS_H_
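Putting the pieces together, a minimal usage sketch of the LoRA class declared above; the adapter path and the sequence length are placeholders, and error handling is reduced to early returns:

#include <memory>

#include "ai_edge_torch/generative/examples/cpp/utils.h"
#include "tensorflow/lite/interpreter.h"
#include "tensorflow/lite/signature_runner.h"

using ai_edge_torch::examples::LoRA;

// Binds a LoRA adapter to the interpreter's "*_lora_r<rank>" signatures.
bool ApplyAdapter(tflite::Interpreter* interpreter) {
  // Rank is inferred from the adapter's tensor shapes during loading.
  std::unique_ptr<LoRA> lora = LoRA::FromFile("/tmp/adapter.tflite");
  if (lora == nullptr) return false;

  tflite::SignatureRunner* prefill =
      lora->GetPrefillRunner(interpreter, /*matched_sequence_length=*/128);
  tflite::SignatureRunner* decode = lora->GetDecodeRunner(interpreter);
  return prefill != nullptr && decode != nullptr;
}

Because the LoRA inputs are bound with SetCustomAllocationForInputTensor, the adapter weights stay in the buffers owned by the LoRA object, which is what enables the zero-copy loading and cheap adapter swapping mentioned in the class comment.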
