From a1a612b1772f12de2f480a8c62ecba27813bd33c Mon Sep 17 00:00:00 2001 From: SS-JIA Date: Mon, 18 Aug 2025 11:16:27 -0400 Subject: [PATCH] [ET-VK][AOT] Serialize constant tensors via NamedDataMap Summary: When exporting models to Vulkan backend, save constant tensors in the NamedDataMap instead of the constant data section of the delegate header. ## Motivation Prevent screen blackout (Llama 3.2 1B) / device crash (Llama 3.2 3B) when running Llama 3.2 models on Samsung Galaxy S24. This behaviour is related to high peak memory usage when loading the model. For more information, see the top diff/PR in the stack. ## Context This change is based on the equivalent change D70315207/https://github.com/pytorch/executorch/pull/9153 in XNNPACK. Test Plan: ## Memory Comparison with/without NamedDataMap Measured VmRss using ``` uint64_t getVmRssInKB() { std::ifstream statusFile("/proc/self/status"); std::string l, num; while (std::getline(statusFile, l)) { if (l.substr(0, 5) == "VmRSS") { size_t pos = l.find_first_of("0123456789"); num = l.substr(pos); break; } } uint64_t vmRssInKB = std::stoi(num); return vmRssInKB; } ``` P1908019767 (Meta only) Excerpt: ``` Log 1 | Log 2 --------------------------------------------------|-------------------------------------------------- Memory usage before model compilation: 1115416 KB | Memory usage before model compilation: 1919228 KB Memory usage after graph building: 1924340 KB | Memory usage after graph building: 1924256 KB Memory usage after graph preparation: 1798968 KB | Memory usage after graph preparation: 1782464 KB Memory usage prepack start: 1798968 KB | Memory usage prepack start: 1781968 KB Memory usage after prepack operations: 1271924 KB | Memory usage after prepack operations: 1653496 KB ``` [ghstack-poisoned] --- .../serialization/vulkan_graph_builder.py | 36 +++++++++++++++++-- .../serialization/vulkan_graph_serialize.py | 15 ++++++-- backends/vulkan/vulkan_preprocess.py | 1 + 3 files changed, 48 insertions(+), 4 
deletions(-) diff --git a/backends/vulkan/serialization/vulkan_graph_builder.py b/backends/vulkan/serialization/vulkan_graph_builder.py index b74a7fb1f8e..78ac51c8808 100644 --- a/backends/vulkan/serialization/vulkan_graph_builder.py +++ b/backends/vulkan/serialization/vulkan_graph_builder.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import ctypes +import hashlib import logging import operator from types import NoneType @@ -25,6 +27,7 @@ is_symint_node, TensorRepr, ) +from executorch.exir._serialize._named_data_store import NamedDataStore from executorch.exir.backend.utils import DelegateMappingBuilder from executorch.exir.tensor import TensorSpec @@ -56,6 +59,7 @@ def __init__( self.input_ids = [] self.output_ids = [] self.const_tensors = [] + self.named_data_store = NamedDataStore() # Mapping from Node to VkValue id self.node_to_value_ids = {} @@ -129,8 +133,36 @@ def get_param_tensor(self, node: Node) -> torch.Tensor: def maybe_add_constant_tensor(self, node: Node) -> int: constant_id = -1 if is_param_node(self.program, node): - constant_id = len(self.const_tensors) - self.const_tensors.append(self.get_param_tensor(node)) + tensor = self.get_param_tensor(node) + + # Serialize tensor data to bytes + tensor = tensor.contiguous() + size = tensor.untyped_storage().nbytes() + + if size > 0: + array_type = ctypes.c_char * size + array = ctypes.cast( + tensor.untyped_storage().data_ptr(), + ctypes.POINTER(array_type), + ).contents + + # Generate SHA256 hash as the named key + tensor_bytes = bytes(array) + sha256_hash = hashlib.sha256(tensor_bytes) + named_key = sha256_hash.hexdigest() + + # Add to named data store with 16-byte alignment (matching XNNPACK) + self.named_data_store.add_named_data( + named_key, tensor_bytes, alignment=16 + ) + + # Create VkBytes entry with named_key and set offset to indicate named data usage + constant_id = len(self.const_tensors) + 
self.const_tensors.append((named_key, size)) + else: + # Handle empty tensors + constant_id = len(self.const_tensors) + self.const_tensors.append(None) return constant_id diff --git a/backends/vulkan/serialization/vulkan_graph_serialize.py b/backends/vulkan/serialization/vulkan_graph_serialize.py index 2ceedf73d10..a83225a98b5 100644 --- a/backends/vulkan/serialization/vulkan_graph_serialize.py +++ b/backends/vulkan/serialization/vulkan_graph_serialize.py @@ -191,10 +191,20 @@ def serialize_constant_tensors( current_offset = len(raw_bytes) for tensor in const_tensors: - if tensor.numel() == 0: + # The tensor data is stored in the named data map + if isinstance(tensor, tuple): + named_key, size = tensor + vk_graph.constants.append( + VkBytes( + offset=18446744073709551615, # UINT64_MAX to indicate named data + length=size, + named_key=named_key, + ) + ) + elif tensor is None or tensor.numel() == 0: vk_graph.constants.append(VkBytes(current_offset, 0)) - continue else: + assert isinstance(tensor, torch.Tensor) array_type = ctypes.c_char * tensor.untyped_storage().nbytes() array = ctypes.cast( tensor.untyped_storage().data_ptr(), diff --git a/backends/vulkan/vulkan_preprocess.py b/backends/vulkan/vulkan_preprocess.py index 8c1165a89df..1816d9b12de 100644 --- a/backends/vulkan/vulkan_preprocess.py +++ b/backends/vulkan/vulkan_preprocess.py @@ -229,4 +229,5 @@ def preprocess( # noqa: C901 vk_graph, graph_builder.const_tensors, [] ), debug_handle_map=graph_builder.delegate_mapping_builder.get_delegate_mapping(), + data_store_output=graph_builder.named_data_store.get_named_data_store_output(), )