Commit a516927

Save foundation weights separately
This diff:

1. Introduces SerializationConfig to llm_config. Currently, this allows the user to save the foundation weights in a separate file, which is mainly useful for the LoRA case.
2. Adds a pass to tag foundation (non-LoRA) weights. This happens at the top level (export_llama_lib). The tags are preserved through run_decompositions and other passes, and do not affect functionality.
3. Tags are read when placing constants into the named_data_store.
4. Tagged weights are serialized to a separate file.

Notes:

1. Adding tags to node.meta['custom']['blah'] means that they will not be discarded by run_decompositions.
2. Adding tags to the lifted model (ep.graph_module) requires the EP to check is_param_node for XNNPACK constants. Instead, we add tags to the unlifted model (ep.module()), so we do not need to go through a re-export to get the EP.
3. Not an issue for this diff, as llama doesn't have any higher-order ops. Adding tags to models with higher-order ops is problematic due to nested submodules.

Differential Revision: [D79181064](https://our.internmc.facebook.com/intern/diff/D79181064/)
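
For orientation before the per-file diffs, here is a minimal sketch of driving the new option programmatically. The import path is inferred from the llm_config Buck target in this diff, and the .ptd file name is illustrative; treat both as assumptions.

```python
# Sketch: ask the export pipeline to place foundation (non-LoRA) weights
# in a separate .ptd file. LlmConfig and SerializationConfig are introduced
# in this diff; the import path below may differ in your checkout.
from executorch.extension.llm.export.config.llm_config import LlmConfig

llm_config = LlmConfig()
llm_config.backend.xnnpack.enabled = True
# Non-LoRA constants get tagged with this file name and are serialized
# outside the PTE; LoRA weights stay inside the PTE.
llm_config.serialization.foundation_weights_file = "llama_3_2_1B_lora.ptd"
```

The CI script below exercises the same flow through hydra-style CLI overrides.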
1 parent 6bc312a

7 files changed: +118 -17 lines

.ci/scripts/test_llama_lora.sh

Lines changed: 51 additions & 14 deletions

```diff
@@ -48,8 +48,17 @@ DOWNLOADED_PATH=$(
   --model_id "${HF_MODEL_REPO}" \
   --files "adapter_config.json" "adapter_model.pt" "consolidated.00.pth" "params.json" "tokenizer.model"
 )
-EXPORTED_MODEL_NAME="llama_3_2_1B_lora.pte"
-# Export model.
+# Build llama runner.
+cmake_install_executorch_libraries
+cmake_build_llama_runner
+
+# Constants.
+RUNTIME_ARGS="--tokenizer_path=${DOWNLOADED_PATH}/tokenizer.model --temperature=0 --seq_len=20 --warmup=1"
+PROMPT="What happens if you eat watermelon seeds?"
+EXPECTED_PREFIX="What happens if you eat watermelon seeds? Watermelon seeds are a good source of vitamin C,"
+
+# Export LoRA PTE file.
+MODEL_NAME="llama_3_2_1B_lora"
 $PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
   base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
   base.params="${DOWNLOADED_PATH}/params.json" \
@@ -61,26 +70,17 @@ $PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
   model.dtype_override="fp32" \
   backend.xnnpack.enabled=true \
   backend.xnnpack.extended_ops=true \
-  export.output_name="${EXPORTED_MODEL_NAME}"
-
-# Build llama runner.
-cmake_install_executorch_libraries
-cmake_build_llama_runner
+  export.output_name="${MODEL_NAME}.pte"
 
-PROMPT="What happens if you eat watermelon seeds?"
 # Run llama runner
-RUNTIME_ARGS="--model_path=${EXPORTED_MODEL_NAME} --tokenizer_path=${DOWNLOADED_PATH}/tokenizer.model --temperature=0 --seq_len=20 --warmup=1"
-
 NOW=$(date +"%H:%M:%S")
 echo "Starting to run llama runner at ${NOW}"
 # shellcheck source=/dev/null
-cmake-out/examples/models/llama/llama_main --prompt="${PROMPT}" ${RUNTIME_ARGS} > result.txt
+cmake-out/examples/models/llama/llama_main --model_path=${MODEL_NAME}.pte --prompt="${PROMPT}" ${RUNTIME_ARGS} > result.txt
 NOW=$(date +"%H:%M:%S")
 echo "Finished at ${NOW}"
 
 RESULT=$(cat result.txt)
-EXPECTED_PREFIX="What happens if you eat watermelon seeds? Watermelon seeds are a good source of vitamin C,"
-
 if [[ "${RESULT}" == "${EXPECTED_PREFIX}"* ]]; then
   echo "Expected result prefix: ${EXPECTED_PREFIX}"
   echo "Actual result: ${RESULT}"
@@ -90,7 +90,44 @@ else
   echo "Expected result prefix: ${EXPECTED_PREFIX}"
   echo "Actual result: ${RESULT}"
   echo "Failure; results not the same"
+  cleanup_files
+  exit 1
+fi
+
+# Export LoRA PTE, PTD file.
+MODEL_SEPARATE="${MODEL_NAME}_separate"
+$PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
+  base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+  base.params="${DOWNLOADED_PATH}/params.json" \
+  base.adapter_checkpoint="${DOWNLOADED_PATH}/adapter_model.pt" \
+  base.adapter_config="${DOWNLOADED_PATH}/adapter_config.json" \
+  base.tokenizer_path="${DOWNLOADED_PATH}/tokenizer.model" \
+  model.use_kv_cache=true \
+  model.use_sdpa_with_kv_cache=true \
+  model.dtype_override="fp32" \
+  backend.xnnpack.enabled=true \
+  backend.xnnpack.extended_ops=true \
+  export.output_name="${MODEL_SEPARATE}.pte" \
+  serialization.foundation_weights_file="${MODEL_SEPARATE}.ptd"
 
+# Run llama runner.
+NOW=$(date +"%H:%M:%S")
+echo "Starting to run llama runner at ${NOW}"
+# shellcheck source=/dev/null
+cmake-out/examples/models/llama/llama_main --model_path=${MODEL_SEPARATE}.pte --data_path=${MODEL_SEPARATE}.ptd --prompt="${PROMPT}" ${RUNTIME_ARGS} > result2.txt
+NOW=$(date +"%H:%M:%S")
+echo "Finished at ${NOW}"
+
+RESULT2=$(cat result2.txt)
+if [[ "${RESULT2}" == "${EXPECTED_PREFIX}"* ]]; then
+  echo "Expected result prefix: ${EXPECTED_PREFIX}"
+  echo "Actual result: ${RESULT2}"
+  echo "Success"
+  cleanup_files
+else
+  echo "Expected result prefix: ${EXPECTED_PREFIX}"
+  echo "Actual result: ${RESULT2}"
+  echo "Failure; results not the same"
   cleanup_files
   exit 1
 fi
```

backends/xnnpack/operators/node_visitor.py

Lines changed: 3 additions & 1 deletion

```diff
@@ -621,8 +621,10 @@ def get_serialized_buffer_index(
             ConstantDataOffset(offset=UINT64_MAX, size=size, named_key=named_key)
         )
 
-        external_tag = tensor.meta.get("delegate_constant_tag", None)
+        custom_meta = tensor.meta.get("custom", None)
+        external_tag = custom_meta.get("delegate_constant_tag", None) if custom_meta else None
         if external_tag is not None:
             logging.info(
                 f"Adding constant data with name {tensor.name}, key {named_key} and external_tag {external_tag} to named_data_store"
             )
```
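
Because the tag now lives under the nested node.meta["custom"] dict (note 1 in the commit message), the visitor does a two-step lookup. A standalone sketch of the pattern, with illustrative inputs:

```python
from typing import Any, Dict, Optional


def read_delegate_tag(meta: Dict[str, Any]) -> Optional[str]:
    # Entries under meta["custom"] survive run_decompositions, which is
    # why the tag is nested rather than a top-level meta key.
    custom_meta = meta.get("custom", None)
    return custom_meta.get("delegate_constant_tag", None) if custom_meta else None


assert read_delegate_tag({"custom": {"delegate_constant_tag": "model.ptd"}}) == "model.ptd"
assert read_delegate_tag({}) is None  # untagged constants stay in the PTE
```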

examples/models/llama/TARGETS

Lines changed: 1 addition & 1 deletion

```diff
@@ -153,10 +153,10 @@ runtime.python_library(
         "//caffe2:torch",
         "//executorch/extension/llm/export/config:llm_config",
         "//executorch/backends/vulkan/_passes:vulkan_passes",
+        "//executorch/exir/passes:external_constants_pass",
         "//executorch/exir/passes:init_mutable_pass",
         "//executorch/examples/models:model_base",
         "//executorch/examples/models:models",
-        "//executorch/exir/passes:init_mutable_pass",
         "//executorch/extension/llm/custom_ops:custom_ops_aot_py",
         "//executorch/extension/llm/export:export_lib",
         # one definition has to be included in the user of the libarary
```

examples/models/llama/export_llama_lib.py

Lines changed: 16 additions & 0 deletions

```diff
@@ -1078,6 +1078,22 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901
         llm_config.backend.xnnpack.enabled = True
 
     if llm_config.backend.xnnpack.enabled:
+        if llm_config.serialization.foundation_weights_file is not None:
+            gen_tag_fn: Callable[[torch.fx.Node], str] = lambda x: (
+                llm_config.serialization.foundation_weights_file
+                if "lora" not in x.name
+                else None
+            )
+
+            from executorch.exir.passes.external_constants_pass import (
+                delegate_external_constants_pass_unlifted,
+            )
+
+            delegate_external_constants_pass_unlifted(
+                gm=builder_exported.pre_autograd_graph_module,
+                gen_tag_fn=gen_tag_fn,
+            )
+
         builder = _to_edge_and_lower_llama_xnnpack(
             builder_exported,
             modelname,
```
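
The routing rule above keys purely on the substring "lora" in the FX node name: any constant that does not mention LoRA is treated as a foundation weight and tagged with the external file name. A small illustration of the same rule (the node names are hypothetical):

```python
from typing import Optional

FOUNDATION_FILE = "llama_3_2_1B_lora_separate.ptd"


def gen_tag_fn(name: str) -> Optional[str]:
    # Same rule as the lambda above: foundation weights -> external file,
    # LoRA adapter weights -> None (kept inside the PTE).
    return FOUNDATION_FILE if "lora" not in name else None


assert gen_tag_fn("layers_0_attention_wq_weight") == FOUNDATION_FILE  # hypothetical name
assert gen_tag_fn("layers_0_attention_wq_lora_a_weight") is None  # hypothetical name
```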

exir/passes/external_constants_pass.py

Lines changed: 23 additions & 1 deletion

```diff
@@ -113,6 +113,28 @@ def delegate_external_constants_pass(
         for node in module.graph.nodes:
             if node.op == "placeholder" and is_param_node(ep, node):
                 if gen_tag_fn is not None:
-                    node.meta["delegate_constant_tag"] = gen_tag_fn(node)
+                    node.meta.setdefault("custom", {})
+                    node.meta["custom"]["delegate_constant_tag"] = gen_tag_fn(node)
+                    mutated = True
+    return PassResult(gm, mutated)
+
+
+# Note: this pass must be run on an unlifted graph, e.g. ep.module(),
+# and not on a lifted graph, e.g. ep.graph_module.
+# This is using 'get_attr' to tag constants, which only appears in
+# unlifted graphs.
+def delegate_external_constants_pass_unlifted(
+    gm: GraphModule,
+    gen_tag_fn: Optional[Callable[[torch.fx.Node], str]] = None,
+) -> PassResult:
+    mutated = False
+    for module in gm.modules():
+        if not isinstance(module, torch.fx.GraphModule):
+            continue
+        for node in module.graph.nodes:
+            if node.op == "get_attr":
+                if gen_tag_fn is not None:
+                    node.meta.setdefault("custom", {})
+                    node.meta["custom"]["delegate_constant_tag"] = gen_tag_fn(node)
                     mutated = True
     return PassResult(gm, mutated)
```
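
A minimal sketch of calling the new pass on a toy module, per the note above: it runs on the unlifted ep.module(), where constants surface as get_attr nodes. The module, export call, and .ptd name are illustrative:

```python
import torch

from executorch.exir.passes.external_constants_pass import (
    delegate_external_constants_pass_unlifted,
)


class Tiny(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.linear = torch.nn.Linear(4, 4)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.linear(x)


ep = torch.export.export(Tiny(), (torch.randn(1, 4),))
# ep.module() is the unlifted module: weights reappear as get_attr nodes,
# which is what this pass keys on (ep.graph_module would show placeholders).
result = delegate_external_constants_pass_unlifted(
    gm=ep.module(),
    gen_tag_fn=lambda node: "weights.ptd",  # route every constant to one file
)
for node in result.graph_module.graph.nodes:
    if node.op == "get_attr":
        print(node.name, "->", node.meta["custom"]["delegate_constant_tag"])
```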

extension/llm/export/builder.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -541,6 +541,9 @@ def save_to_pte(self, output_name: str) -> None:
         filename = save_pte_program(self.export_program, output_name, self.output_dir)
         self._saved_pte_filename = filename
 
+        # Write any tensor data tagged for separate .ptd files.
+        self.export_program.write_tensor_data_to_file(self.output_dir)
+
     def get_saved_pte_filename(self) -> Optional[str]:
         """
         Return the filename of the most recenet saved .pte file. Return None if the model is not saved.
```

extension/llm/export/config/llm_config.py

Lines changed: 21 additions & 0 deletions

```diff
@@ -227,6 +227,20 @@ def __post_init__(self):
         )
 
 
+@dataclass
+class SerializationConfig:
+    """
+    Configures properties relevant to the serialization process.
+
+    Attributes:
+        foundation_weights_file: configure the foundation weights of a model
+            to be placed in a separate file, external to the PTE. Pass the
+            intended file name here.
+    """
+
+    foundation_weights_file: Optional[str] = None
+
+
 ################################################################################
 ################################# DebugConfig ##################################
 ################################################################################
@@ -466,6 +480,7 @@ class LlmConfig:
     base: BaseConfig = field(default_factory=BaseConfig)
     model: ModelConfig = field(default_factory=ModelConfig)
     export: ExportConfig = field(default_factory=ExportConfig)
+    serialization: SerializationConfig = field(default_factory=SerializationConfig)
     debug: DebugConfig = field(default_factory=DebugConfig)
     quantization: QuantizationConfig = field(default_factory=QuantizationConfig)
     backend: BackendConfig = field(default_factory=BackendConfig)
@@ -546,6 +561,12 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901
         if hasattr(args, "export_only"):
             llm_config.export.export_only = args.export_only
 
+        # SerializationConfig
+        if hasattr(args, "foundation_weights_file"):
+            llm_config.serialization.foundation_weights_file = (
+                args.foundation_weights_file
+            )
+
         # QuantizationConfig
         if hasattr(args, "quantization_mode"):
             llm_config.quantization.qmode = args.quantization_mode
```
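
The new field rides the existing from_args plumbing. A minimal sketch, assuming the rest of from_args is hasattr-guarded like the hunk above (the sparse argparse.Namespace and file name are illustrative):

```python
import argparse

from executorch.extension.llm.export.config.llm_config import LlmConfig

# The flag name mirrors the hasattr check above; fields absent from the
# namespace are simply left at their dataclass defaults.
args = argparse.Namespace(foundation_weights_file="model.ptd")
llm_config = LlmConfig.from_args(args)
assert llm_config.serialization.foundation_weights_file == "model.ptd"
```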
