
Commit 5892e42

[ET-VK] Split up prepack command buffer
## Changes

* Introduce a `run_prepack()` API which combines the functionality of `encode_prepack()` and `prepack()`, but submits prepacking shaders incrementally rather than all at once.
* Introduce graph config options to control command buffer submission behaviour during prepacking.

Note that the current default values for the prepack submission thresholds were determined through experimentation. Determining optimal values for specific devices is left as a later exercise; the goal of this diff is simply to introduce the mechanism and to fix the Llama model loading crash on the Samsung S24 (described below).

## Context

Currently, ET-VK encodes all prepacking shaders and then performs prepacking by submitting a single command buffer. This approach has some drawbacks:

* CPU/GPU parallelism is reduced, since the command buffer is submitted only after all commands have been encoded.
* There can be performance issues at the Vulkan API level when processing a single "large" command buffer.

Splitting prepacking across multiple command buffers avoids both issues and improves performance.

## Llama 3.2 1B crash on Samsung S24

I have also noticed that when running large models (e.g. Llama 3.2 1B) on the Samsung S24 with ET-VK, the device's display crashes (the screen goes black and becomes unresponsive), and sometimes the device shuts down entirely. Fortunately, this change also fixes that behaviour, in addition to providing a significant boost to model load time for Llama models (from 9 s to 3 s).

## Performance Impact

* Improves model load time, especially for larger models.

## Future Work

* Deprecate the `encode_prepack()` + `prepack()` pattern in favor of the `run_prepack()` pattern.

Differential Revision: [D78275586](https://our.internmc.facebook.com/intern/diff/D78275586/)

[ghstack-poisoned]
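For reference, a minimal sketch of the intended call pattern on the integration side (the graph-building step is elided and the surrounding setup is illustrative; only the `GraphConfig` fields and `ComputeGraph` methods named in this diff are taken from it):

```cpp
#include <executorch/backends/vulkan/runtime/graph/ComputeGraph.h>

using namespace vkcompute;

// Illustrative helper, not part of this commit.
void load_and_prepack_sketch(GraphConfig config) {
  // With config.prepack_threshold_nbytes and
  // config.prepack_initial_threshold_nbytes left at 0 ("not set"), the
  // ComputeGraph constructor fills in ~20 MB defaults.
  ComputeGraph compute_graph(config);
  // ... build the graph (tensors, weights, operators) here ...

  compute_graph.prepare();
  compute_graph.prepare_pipelines();

  // Old pattern (slated for deprecation): encode every prepack shader, then
  // submit a single large command buffer.
  //   compute_graph.encode_prepack();
  //   compute_graph.prepack();

  // New pattern: encode prepack shaders and submit command buffers
  // incrementally whenever the pending staging data exceeds the thresholds.
  compute_graph.run_prepack();
}
```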
1 parent bb3c625 commit 5892e42

8 files changed: +120 lines, −26 lines

backends/vulkan/runtime/VulkanBackend.cpp

Lines changed: 1 addition & 2 deletions
@@ -503,8 +503,7 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
     compute_graph->prepare();
     compute_graph->prepare_pipelines();
 
-    compute_graph->encode_prepack();
-    compute_graph->prepack();
+    compute_graph->run_prepack();
 
     // If dynamic shapes are not expected, then the command buffer only needs to
     // be encoded once. Otherwise, wait until the first inference to encode the

backends/vulkan/runtime/api/Context.h

Lines changed: 10 additions & 0 deletions
@@ -92,6 +92,16 @@ class Context final {
     return queue_.handle;
   }
 
+  // Device Metadata
+
+  inline bool device_is_adreno() const {
+    return adapter_p_->device_type() == vkapi::DeviceType::ADRENO;
+  }
+
+  inline bool device_name_contains(const char* substr) const {
+    return adapter_p_->device_name().find(substr) != std::string::npos;
+  }
+
   // Device Caches
 
   inline vkapi::ShaderLayoutCache& shader_layout_cache() {
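The commit message leaves per-device tuning of the submission thresholds as future work; these new `Context` helpers are what such tuning could hook into. A purely hypothetical illustration (the heuristic, its values, and the `vkcompute::api` namespace qualification are assumptions, not part of this commit):

```cpp
// Hypothetical tuning hook, not part of this diff: choose a prepack submission
// threshold based on what the Vulkan adapter reports about the device.
size_t pick_prepack_threshold_nbytes(const vkcompute::api::Context& context) {
  constexpr size_t MB = 1024 * 1024;
  // Placeholder heuristic: stage smaller batches per submission on Adreno
  // GPUs (e.g. the Samsung S24 variant), larger ones elsewhere.
  if (context.device_is_adreno() || context.device_name_contains("Adreno")) {
    return 10 * MB;
  }
  return 20 * MB;
}
```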

backends/vulkan/runtime/graph/ComputeGraph.cpp

Lines changed: 42 additions & 5 deletions
@@ -15,6 +15,8 @@
 
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h>
 
+#include <iostream>
+
 namespace vkcompute {
 
 //
@@ -145,6 +147,15 @@ ComputeGraph::ComputeGraph(GraphConfig config)
   execute_descriptor_counts_.descriptor_combined_sampler_count = 0;
   execute_descriptor_counts_.descriptor_storage_image_count = 0;
 
+#define MB (1024.0 * 1024.0)
+  // If certain graph config variables are not specified, then set them
+  // automatically.
+  if (config_.prepack_threshold_nbytes == 0) {
+    config_.prepack_threshold_nbytes = 20 * MB;
+    config_.prepack_initial_threshold_nbytes = 20 * MB;
+  }
+#undef MB
+
   context_->set_cmd(/*reusable = */ true);
 }
 
@@ -212,11 +223,6 @@ utils::GPUMemoryLayout ComputeGraph::suggested_memory_layout(
   return utils::kChannelsPacked;
 }
 
-bool ComputeGraph::device_name_contains(const char* substr) {
-  return context_->adapter_ptr()->device_name().find(substr) !=
-      std::string::npos;
-}
-
 void ComputeGraph::check_no_active_value_ptrs() {
   VK_CHECK_COND(
       values_in_use_ == 0,
@@ -750,6 +756,15 @@ void ComputeGraph::prepare_pipelines() {
       vkapi::ComputePipelineCache::Hasher>();
 }
 
+void ComputeGraph::submit_current_cmd_and_wait(const bool final_use) {
+  vkapi::VulkanFence fence = context_->fences().get_fence();
+  context_->submit_cmd_to_gpu(fence.get_submit_handle(), final_use);
+  fence.wait();
+  context_->fences().return_fence(fence);
+
+  context_->flush();
+}
+
 void ComputeGraph::encode_prepack() {
   for (std::unique_ptr<PrepackNode>& node : prepack_nodes_) {
     node->encode(this);
@@ -766,6 +781,28 @@ void ComputeGraph::prepack() const {
   context_->flush();
 }
 
+void ComputeGraph::run_prepack() {
+  int i = 0;
+  bool submitted = false;
+  for (std::unique_ptr<PrepackNode>& node : prepack_nodes_) {
+    // Do not trigger on the first or last prepack node.
+    const bool not_terminal = i != 0 && i != (prepack_nodes_.size() - 1);
+    size_t threshold = submitted ? config_.prepack_threshold_nbytes
+                                 : config_.prepack_initial_threshold_nbytes;
+    if (not_terminal && staging_nbytes_in_cmd_ > threshold) {
+      submit_current_cmd_and_wait(/*final_use=*/true);
+      staging_nbytes_in_cmd_ = 0;
+      context_->set_cmd();
+      submitted = true;
+    }
+
+    node->encode(this);
+    i++;
+  }
+  submit_current_cmd_and_wait(/*final_use=*/true);
+  staging_nbytes_in_cmd_ = 0;
+}
+
 void ComputeGraph::encode_execute() {
   context_->flush();
   context_->set_cmd(/*reusable = */ true);
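To make the effect of `run_prepack()` concrete: for a hypothetical model that stages 500 MB of weight data, the default 20 MB threshold would spread prepacking over roughly 25 command buffer submissions, each followed by a fence wait and a flush, instead of encoding everything into a single command buffer and submitting it once as `encode_prepack()` + `prepack()` does.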

backends/vulkan/runtime/graph/ComputeGraph.h

Lines changed: 33 additions & 5 deletions
@@ -194,6 +194,10 @@ class ComputeGraph final {
   size_t values_in_use_ = 0;
   size_t execute_count_ = 0;
 
+  // Represents the amount of staging buffer data that will be copied if the
+  // current Context's command buffer is submitted now.
+  size_t staging_nbytes_in_cmd_ = 0;
+
  public:
   //
   // Accessors
@@ -512,14 +516,17 @@ class ComputeGraph final {
   utils::GPUMemoryLayout suggested_memory_layout(
       const std::vector<int64_t>& sizes);
 
-  inline bool device_is_adreno() {
-    return context_->adapter_ptr()->device_type() == vkapi::DeviceType::ADRENO;
+  inline bool device_is_adreno() const {
+    return context_->device_is_adreno();
   }
-  const std::string& device_name() {
-    return context()->adapter_ptr()->device_name();
+
+  const std::string& device_name() const {
+    return context_->adapter_ptr()->device_name();
   }
 
-  bool device_name_contains(const char* substr);
+  inline bool device_name_contains(const char* substr) const {
+    return context_->device_name_contains(substr);
+  }
 
   //
   // Graph Building
@@ -812,13 +819,34 @@ class ComputeGraph final {
   copy_into_staging(const ValueRef idx, const void* data, const size_t numel);
   void copy_from_staging(const ValueRef idx, void* data, const size_t numel);
 
+ protected:
+  // Command Buffer Management
+
+  /*
+   * Submits the current command buffer in the Context to the GPU for execution,
+   * and wait for it to complete before returning. This function will also flush
+   * the Context after execution.
+   */
+  void submit_current_cmd_and_wait(const bool final_use = false);
+
+ public:
   //
   // Graph Prepacking
   //
 
+  inline void update_staging_nbytes_in_cmd(const size_t staging_bytes) {
+    staging_nbytes_in_cmd_ += staging_bytes;
+  }
+
   void encode_prepack();
   void prepack() const;
 
+  /*
+   * Executes prepacking operations to transfer model weight data from the CPU
+   * to GPU.
+   */
+  void run_prepack();
+
   //
   // Graph Execution
   //

backends/vulkan/runtime/graph/GraphConfig.h

Lines changed: 14 additions & 0 deletions
@@ -36,6 +36,20 @@ struct GraphConfig final {
   // Whether or not the ComputeGraph should expect input shapes to be dynamic
   bool expect_dynamic_shapes;
 
+  // Execution properties that determine specifics re: how command buffer
+  // submission is handled, etc. 0 means this field is not set.
+
+  // During prepacking, once this threshold is reached, submit the current
+  // command buffer for execution. This allows the work to be distributed over
+  // multiple command buffer submissions, which can improve model load
+  // performance and prevent crashes when loading large models.
+  size_t prepack_threshold_nbytes = 0;
+  // Threshold used for the first command buffer submission during prepacking.
+  // This can be set to be lower than prepack_submission_threshold_nbytes to
+  // submit a command buffer for execution earlier which can improve performance
+  // by taking more advantage of parallelism between the CPU and GPU.
+  size_t prepack_initial_threshold_nbytes = 0;
+
   vkapi::Adapter* external_adapter;
 
   // Generate a default graph config with pre-configured settings
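A short sketch of how a caller might populate these fields (the helper name and the values are illustrative, and it assumes `GraphConfig` can be value-initialized as shown; leaving both fields at 0 keeps the ~20 MB defaults filled in by `ComputeGraph`'s constructor):

```cpp
#include <executorch/backends/vulkan/runtime/graph/GraphConfig.h>

// Illustrative helper, not part of this commit.
vkcompute::GraphConfig make_prepack_tuned_config() {
  vkcompute::GraphConfig config{};
  constexpr size_t MB = 1024 * 1024;

  // Submit the first prepack command buffer early, so the GPU starts copying
  // weight data while the CPU is still encoding later prepack shaders...
  config.prepack_initial_threshold_nbytes = 10 * MB;
  // ...then submit follow-up prepack command buffers every ~20 MB of staged
  // weight data.
  config.prepack_threshold_nbytes = 20 * MB;

  return config;
}
```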

backends/vulkan/runtime/graph/ops/PrepackNode.cpp

Lines changed: 1 addition & 0 deletions
@@ -62,6 +62,7 @@ api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) {
   TensorRefPtr tref = graph->get_tref(tref_);
   size_t numel = utils::multiply_integers(tref->sizes);
   api::StagingBuffer staging(graph->context(), tref->dtype, numel);
+  graph->update_staging_nbytes_in_cmd(staging.buffer().mem_size_as_size_t());
   size_t nbytes = numel * vkapi::element_size(tref->dtype);
   staging.copy_from(tref->data, nbytes);
   return staging;

backends/vulkan/runtime/vk_api/memory/Buffer.h

Lines changed: 4 additions & 0 deletions
@@ -138,6 +138,10 @@ class VulkanBuffer final {
     return buffer_properties_.size;
   }
 
+  inline size_t mem_size_as_size_t() const {
+    return utils::safe_downcast<size_t>(mem_size());
+  }
+
   inline bool has_memory() const {
     return (memory_.allocation != VK_NULL_HANDLE);
   }

extension/llm/runner/stats.h

Lines changed: 15 additions & 14 deletions
@@ -100,62 +100,63 @@ inline std::string stats_to_json_string(const Stats& stats) {
 
 inline void print_report(const Stats& stats) {
   printf("PyTorchObserver %s\n", stats_to_json_string(stats).c_str());
+  printf("\n");
 
-  ET_LOG(
-      Info,
+  printf(
       "\tPrompt Tokens: %" PRIu64 " Generated Tokens: %" PRIu64,
       stats.num_prompt_tokens,
       stats.num_generated_tokens);
+  printf("\n");
 
-  ET_LOG(
-      Info,
+  printf(
       "\tModel Load Time:\t\t%f (seconds)",
       ((double)(stats.model_load_end_ms - stats.model_load_start_ms) /
        stats.SCALING_FACTOR_UNITS_PER_SECOND));
+  printf("\n");
   double inference_time_ms =
       (double)(stats.inference_end_ms - stats.inference_start_ms);
-  ET_LOG(
-      Info,
+  printf(
      "\tTotal inference time:\t\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
      inference_time_ms / stats.SCALING_FACTOR_UNITS_PER_SECOND,
 
      (stats.num_generated_tokens) /
          (double)(stats.inference_end_ms - stats.inference_start_ms) *
          stats.SCALING_FACTOR_UNITS_PER_SECOND);
+  printf("\n");
   double prompt_eval_time =
       (double)(stats.prompt_eval_end_ms - stats.inference_start_ms);
-  ET_LOG(
-      Info,
+  printf(
      "\t\tPrompt evaluation:\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
      prompt_eval_time / stats.SCALING_FACTOR_UNITS_PER_SECOND,
      (stats.num_prompt_tokens) / prompt_eval_time *
          stats.SCALING_FACTOR_UNITS_PER_SECOND);
+  printf("\n");
 
   double eval_time =
       (double)(stats.inference_end_ms - stats.prompt_eval_end_ms);
-  ET_LOG(
-      Info,
+  printf(
      "\t\tGenerated %" PRIu64
      " tokens:\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
      stats.num_generated_tokens,
      eval_time / stats.SCALING_FACTOR_UNITS_PER_SECOND,
      stats.num_generated_tokens / eval_time *
          stats.SCALING_FACTOR_UNITS_PER_SECOND);
+  printf("\n");
 
   // Time to first token is measured from the start of inference, excluding
   // model load time.
-  ET_LOG(
-      Info,
+  printf(
      "\tTime to first generated token:\t%f (seconds)",
      ((double)(stats.first_token_ms - stats.inference_start_ms) /
       stats.SCALING_FACTOR_UNITS_PER_SECOND));
+  printf("\n");
 
-  ET_LOG(
-      Info,
+  printf(
      "\tSampling time over %" PRIu64 " tokens:\t%f (seconds)",
      stats.num_prompt_tokens + stats.num_generated_tokens,
      (double)stats.aggregate_sampling_time_ms /
          stats.SCALING_FACTOR_UNITS_PER_SECOND);
+  printf("\n");
 }
 
 } // namespace llm
