|
| 1 | +/* |
| 2 | + * Copyright (c) Meta Platforms, Inc. and affiliates. |
| 3 | + * All rights reserved. |
| 4 | + * |
| 5 | + * This source code is licensed under the BSD-style license found in the |
| 6 | + * LICENSE file in the root directory of this source tree. |
| 7 | + * @lint-ignore-every CLANGTIDY facebook-hte-Deprecated |
| 8 | + */ |
| 9 | +#include <gflags/gflags.h> |
| 10 | + |
| 11 | +#include <executorch/examples/models/llama/runner/runner.h> |
| 12 | + |
| 13 | +#if defined(ET_USE_THREADPOOL) |
| 14 | +#include <executorch/extension/threadpool/cpuinfo_utils.h> |
| 15 | +#include <executorch/extension/threadpool/threadpool.h> |
| 16 | +#endif |
| 17 | + |
| 18 | +#if defined(__linux__) || defined(__ANDROID__) || defined(__unix__) |
| 19 | +#include <sys/resource.h> |
| 20 | +#endif |
/// Returns the peak resident set size (max RSS) of this process in bytes,
/// or 0 when the platform is unsupported or getrusage() fails.
///
/// __APPLE__/__MACH__ are deliberately excluded: ru_maxrss on macOS has
/// returned bytes on some releases and kilobytes on others, so the unit
/// cannot be relied upon there.
inline size_t get_rss_bytes() {
#if defined(__linux__) || defined(__ANDROID__) || defined(__unix__)
  struct rusage usage_info;
  if (getrusage(RUSAGE_SELF, &usage_info) != 0) {
    return 0;  // getrusage() failed.
  }
  // ru_maxrss is reported in kilobytes on these platforms; convert to bytes.
  return static_cast<size_t>(usage_info.ru_maxrss) * 1024;
#else
  // Unsupported platform (e.g. Windows).
  return 0;
#endif
}
| 35 | + |
// ---- Command-line flags (gflags) ----

// Path to the LoRA-adapted llama program (.pte). Its foundation weights are
// expected to live in the shared external data file (--data_path).
DEFINE_string(lora_model_path, "llama_3_2_1B_lora.pte",
              "LoRA model serialized in flatbuffer format.");
// Path to the base (non-LoRA) llama program (.pte), sharing the same
// external data file.
DEFINE_string(llama_model_path, "llama_3_2_1B.pte",
              "Model serialized in flatbuffer format.");
// Path to the external tensor data (.ptd) holding the foundation weights
// shared by both programs above.
DEFINE_string(data_path, "foundation.ptd",
              "Data serialized in flatbuffer format.");

// Path to the tokenizer model used by both runners.
DEFINE_string(tokenizer_path, "tokenizer.model", "Tokenizer stuff.");

// Prompt fed to both runners.
DEFINE_string(prompt, "The answer to the ultimate question is", "Prompt.");

// Sampling temperature; 0 makes generation a deterministic argmax.
DEFINE_double(temperature, 0.8f,
              "Temperature; Default is 0.8f. 0 = greedy argmax sampling "
              "(deterministic). Lower temperature = more deterministic");

// Token budget for each generate() call (prompt + generated tokens).
DEFINE_int32(
    seq_len, 128,
    "Total number of tokens to generate (prompt + output). Defaults to "
    "max_seq_len. If the number of input tokens + seq_len > max_seq_len, the "
    "output will be truncated to max_seq_len tokens.");
| 56 | + |
| 57 | +using namespace ::executorch::extension; |
| 58 | + |
| 59 | +int main(int argc, char *argv[]) { |
| 60 | + ET_LOG(Info, "Running program-data separation lora example..."); |
| 61 | + |
| 62 | + auto rss_0 = get_rss_bytes() / 1024.0 / 1024.0; |
| 63 | + ET_LOG(Info, "0 RSS start: %f MiB (0 if unsupported)", rss_0); |
| 64 | + |
| 65 | + gflags::ParseCommandLineFlags(&argc, &argv, true); |
| 66 | + |
| 67 | + const char *lora_model_path = FLAGS_lora_model_path.c_str(); |
| 68 | + const char *llama_model_path = FLAGS_llama_model_path.c_str(); |
| 69 | + const char *data_path = FLAGS_data_path.c_str(); |
| 70 | + |
| 71 | + const char *tokenizer_path = FLAGS_tokenizer_path.c_str(); |
| 72 | + const char *prompt = FLAGS_prompt.c_str(); |
| 73 | + float temperature = FLAGS_temperature; |
| 74 | + int32_t seq_len = 128; |
| 75 | + int32_t cpu_threads = -1; |
| 76 | + |
| 77 | +#if defined(ET_USE_THREADPOOL) |
| 78 | + uint32_t num_performant_cores = |
| 79 | + cpu_threads == -1 |
| 80 | + ? ::executorch::extension::cpuinfo::get_num_performant_cores() |
| 81 | + : static_cast<uint32_t>(cpu_threads); |
| 82 | + ET_LOG(Info, "Resetting threadpool with num threads = %d", |
| 83 | + num_performant_cores); |
| 84 | + if (num_performant_cores > 0) { |
| 85 | + ::executorch::extension::threadpool::get_threadpool() |
| 86 | + ->_unsafe_reset_threadpool(num_performant_cores); |
| 87 | + } |
| 88 | +#endif |
| 89 | + |
| 90 | + // Create runner for lora model. |
| 91 | + auto rss_1 = get_rss_bytes() / 1024.0 / 1024.0; |
| 92 | + ET_LOG(Info, "1 RSS before creating lora_runner: %f MiB (0 if unsupported)", |
| 93 | + rss_1); |
| 94 | + std::unique_ptr<::executorch::extension::llm::TextLLMRunner> lora_runner = |
| 95 | + example::create_llama_runner(lora_model_path, tokenizer_path, data_path); |
| 96 | + if (lora_runner == nullptr) { |
| 97 | + ET_LOG(Error, "Failed to create lora_runner."); |
| 98 | + return 1; |
| 99 | + } |
| 100 | + |
| 101 | + // create runner for llama model |
| 102 | + auto rss_2 = get_rss_bytes() / 1024.0 / 1024.0; |
| 103 | + ET_LOG(Info, "2 RSS before creating llama_runner: %f MiB (0 if unsupported)", |
| 104 | + rss_2); |
| 105 | + std::unique_ptr<::executorch::extension::llm::TextLLMRunner> llama_runner = |
| 106 | + example::create_llama_runner(llama_model_path, tokenizer_path, data_path); |
| 107 | + if (llama_runner == nullptr) { |
| 108 | + ET_LOG(Error, "Failed to create llama_runner."); |
| 109 | + return 1; |
| 110 | + } |
| 111 | + auto rss_3 = get_rss_bytes() / 1024.0 / 1024.0; |
| 112 | + ET_LOG(Info, |
| 113 | + "3 RSS before after creating the runners: %f MiB (0 if unsupported)", |
| 114 | + rss_3); |
| 115 | + |
| 116 | + // generate |
| 117 | + executorch::extension::llm::GenerationConfig config{ |
| 118 | + .seq_len = seq_len, .temperature = temperature}; |
| 119 | + |
| 120 | + ET_LOG(Info, "Generating with lora..."); |
| 121 | + auto rss_4 = get_rss_bytes() / 1024.0 / 1024.0; |
| 122 | + ET_LOG(Info, "4 RSS before running lora_runner: %f MiB (0 if unsupported)", |
| 123 | + rss_4); |
| 124 | + auto error = lora_runner->generate(prompt, config); |
| 125 | + if (error != executorch::runtime::Error::Ok) { |
| 126 | + ET_LOG(Error, "Failed to generate with lora_runner, error code %zu.", |
| 127 | + error); |
| 128 | + return 1; |
| 129 | + } |
| 130 | + auto rss_5 = get_rss_bytes() / 1024.0 / 1024.0; |
| 131 | + ET_LOG(Info, |
| 132 | + "5 RSS after lora_runner/before llama_runner: %f MiB " |
| 133 | + "(0 if unsupported)", |
| 134 | + rss_5); |
| 135 | + ET_LOG(Info, "Generating with llama..."); |
| 136 | + error = lora_runner->generate(prompt, config); |
| 137 | + if (error != executorch::runtime::Error::Ok) { |
| 138 | + ET_LOG(Error, "Failed to generate with llama_runner, error code %zu.", |
| 139 | + error); |
| 140 | + return 1; |
| 141 | + } |
| 142 | + auto rss_6 = get_rss_bytes() / 1024.0 / 1024.0; |
| 143 | + ET_LOG(Info, |
| 144 | + "6 RSS after llama_runner: %f MiB " |
| 145 | + "(0 if unsupported)", |
| 146 | + rss_6); |
| 147 | + |
| 148 | + return 0; |
| 149 | +} |
0 commit comments