
Commit 6ef0978

spcyppt authored and facebook-github-bot committed
Merge VBE output (backend) (#4846)
Summary: Pull Request resolved: #4846

X-link: facebookresearch/FBGEMM#1775

Currently, Torchrec merges the outputs of individual VBE TBE ops so that they are ordered by rank, using [_merge_variable_batch_embeddings](https://www.internalfb.com/code/fbsource/[3bd69d7fa3534144dcb0162ca59803a6c3ff6e70]/fbcode/torchrec/distributed/embedding_lookup.py?lines=593-604). This function appears to cause a ~30% QPS regression relative to the baseline (HBM+UVM) for the Jupiter V1 model with VBE enabled.

To eliminate the `_merge_variable_batch_embeddings()` function, we pre-allocate a `vbe_output` tensor that holds the outputs of all VBE ops and compute `vbe_output_offsets` so that each individual VBE op writes to the correct location in the `vbe_output` tensor. By default, `vbe_output` and `vbe_output_offsets` are `None`, in which case each VBE op returns its own tensor as it does today. The feature is enabled when `vbe_output` and `vbe_output_offsets` are not `None`.

---

**NOTE**
1. This feature is currently supported only for Sparse TBE.
2. Support is limited to CUDA.
3. For backward compatibility, we append the newly introduced `vbe_output` to the existing API. Hence, `vbe_output` must be an `optional` tensor with a default value of `None` (a plain Tensor argument cannot have a default value).
4. We *cannot* annotate `vbe_output`, because PyTorch schema registration does not support alias annotations on optional tensors; adding one triggers the error below. This may cause issues for supporting the feature on MTIA, if MTIA relies on tensor annotations.

```
E0903 09:50:32.966235 2850885 ExceptionTracer.cpp:227] exception stack complete
terminate called after throwing an instance of 'std::runtime_error'
  what():  expected ident but found '(' here:
  split_embedding_codegen_lookup_adagrad_function_pt2(
      Tensor placeholder_autograd_tensor,
      Tensor[](a!) weights,
      Tensor D_offsets,
      SymInt total_D,
      SymInt max_D,
      Tensor hash_size_cumsum,
      int total_hash_size_bits,
      Tensor indices,
      Tensor offsets,
      int pooling_mode,
      Tensor? indice_weights,
      Tensor? feature_requires_grad,
      int output_dtype,
      Tensor?[](e!) aux_tensor,
      int[] aux_int,
      float[] aux_float,
      bool[] aux_bool,
      Tensor[](g!) momentum1,
      Tensor learning_rate_tensor,
      float[] optim_float,
      SymInt max_B=-1,
      SymInt max_B_feature_rank=-1,
      SymInt vbe_output_size=-1,
      Tensor?(t!) vbe_output=None
  ) -> Tensor
       ~ <--- HERE
```

See https://docs.google.com/document/d/1h5YyeCjYmmN-CIFB98CrBf1uMksidPbNvM1rl8yZeds/edit?tab=t.0#heading=h.tdfkkc6ujdyl

Reviewed By: q10, sryap, ionuthristodorescu

Differential Revision: D79704318

fbshipit-source-id: 633fc384e4852eb5b066fc3f278a1ba1a9d54d93
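As context for the diffs below, here is a minimal sketch of the two strategies in plain PyTorch. It is illustrative only: `run_vbe_op` and the embedding sizes are hypothetical stand-ins, not the FBGEMM or Torchrec API.

```python
from typing import Optional
import torch

# Hypothetical stand-in for one VBE TBE op producing `rows * dim` values.
def run_vbe_op(dim: int, rows: int, out: Optional[torch.Tensor] = None) -> torch.Tensor:
    result = torch.randn(rows * dim)
    if out is None:
        return result      # current path: each op returns its own tensor
    out.copy_(result)      # new path: write into the pre-allocated slice
    return out

# Current path: every op allocates its own output, then a separate merge
# pass (what _merge_variable_batch_embeddings does) reorders/concatenates.
merged = torch.cat([run_vbe_op(4, 3), run_vbe_op(8, 2)])

# New path: pre-allocate one flat vbe_output and compute per-op offsets so
# each op writes directly to its final location; no merge pass is needed.
sizes = [4 * 3, 8 * 2]
offsets = [0, sizes[0], sizes[0] + sizes[1]]   # cumulative write offsets
vbe_output = torch.empty(offsets[-1])
for (start, end), (dim, rows) in zip(zip(offsets, offsets[1:]), [(4, 3), (8, 2)]):
    run_vbe_op(dim, rows, out=vbe_output[start:end])
```

Because `vbe_output[start:end]` is a view, each op's write lands directly at its final location in the shared buffer.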
1 parent ba47c13 · commit 6ef0978

14 files changed · +226 -66 lines changed

fbgemm_gpu/codegen/genscript/generate_backward_split.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -422,6 +422,7 @@ def generate() -> None:
             "lxu_cache_locations",  # 3
             "uvm_cache_stats",  # 4
             "prev_iter_dev",  # 5
+            "vbe_output_offsets",  # 6
         ],
         "aux_int": [
             "iter",  # 0
```

fbgemm_gpu/codegen/genscript/optimizer_args.py

Lines changed: 1 addition & 3 deletions

```diff
@@ -73,9 +73,7 @@ class OptimizerArgsSetItem:
     "row_counter_dev": "(q!)",
     "row_counter_uvm": "(r!)",
     "optim_tensor": "(s!)",
-    "delta_weights_host": "(t!)",
-    "delta_weights_dev": "(u!)",
-    "delta_weights_uvm": "(v!)",
+    "vbe_output": "(t!)",
 }


 ######################################################################
```
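The `"(t!)"` values in this map are PyTorch schema alias annotations: they mark arguments the op may mutate in place, and each mutated argument needs a distinct alias-set letter, which is why `vbe_output` takes over the `(t!)` letter freed by the removed `delta_weights_*` entries. A self-contained sketch of what such an annotation means, using an invented `vbe_demo` namespace:

```python
import torch
from torch.library import Library

lib = Library("vbe_demo", "DEF")  # hypothetical namespace for illustration
# "(a!)" marks `out` as mutable: the dispatcher and autograd are told the
# op writes into it, just as "(t!)" marks vbe_output in the generated schema.
lib.define("scale_(Tensor(a!) out, float alpha) -> Tensor(a!)")

def scale_impl(out: torch.Tensor, alpha: float) -> torch.Tensor:
    return out.mul_(alpha)  # in-place write, consistent with the annotation

lib.impl("scale_", scale_impl, "CompositeExplicitAutograd")

x = torch.ones(3)
torch.ops.vbe_demo.scale_(x, 2.0)
assert torch.equal(x, torch.full((3,), 2.0))  # x was mutated in place
```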

fbgemm_gpu/codegen/training/backward/embedding_backward_split_host_template.cpp

Lines changed: 3 additions & 2 deletions

```diff
@@ -708,7 +708,7 @@ class {{ autograd_func }} :
     static auto generate_vbe_metadata_op =
         torch::Dispatcher::singleton()
             .findSchemaOrThrow("fbgemm::generate_vbe_metadata", "")
-            .typed<std::tuple<Tensor, Tensor>(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const int64_t, const bool, const c10::SymInt, const int64_t, const c10::SymInt)>();
+            .typed<std::tuple<Tensor, Tensor>(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const int64_t, const bool, const c10::SymInt, const int64_t, const c10::SymInt, const std::optional<Tensor>&)>();

     auto [
         vbe_row_output_offsets,
@@ -729,7 +729,8 @@ class {{ autograd_func }} :
     {%- endif %}
         max_B_feature_rank,
         info_B_num_bits,
-        /*total_B=*/offsets.sym_size(0) - 1
+        /*total_B=*/offsets.sym_size(0) - 1,
+        std::nullopt /* pre-allocated vbe_output is not supported in TBE interface V1 or Dense TBE */
     );
     {%- endif %}
```
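The trailing `std::nullopt` reflects the backward-compatibility pattern from the summary: the new parameter is appended at the end of the schema as `Tensor? vbe_output=None`, so call sites that predate it (TBE interface V1, Dense TBE) keep working unchanged. A minimal sketch of the pattern with an invented `vbe_compat_demo` op:

```python
import torch
from torch.library import Library

lib = Library("vbe_compat_demo", "DEF")  # hypothetical namespace
# The optional tensor is appended last, with a None default, so existing
# call sites that omit it remain valid.
lib.define("fwd(Tensor x, Tensor? vbe_output=None) -> Tensor")

def fwd_impl(x, vbe_output=None):
    if vbe_output is None:
        return x * 2             # old behavior: allocate and return
    vbe_output.copy_(x * 2)      # new behavior: fill the caller's buffer
    return vbe_output

lib.impl("fwd", fwd_impl, "CompositeExplicitAutograd")

x = torch.ones(4)
old_style = torch.ops.vbe_compat_demo.fwd(x)       # argument omitted
buf = torch.empty(4)
new_style = torch.ops.vbe_compat_demo.fwd(x, buf)  # pre-allocated output
assert torch.equal(old_style, new_style) and torch.equal(buf, new_style)
```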

fbgemm_gpu/codegen/training/forward/embedding_forward_split_meta_template.cpp

Lines changed: 7 additions & 1 deletion

```diff
@@ -6,6 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */

+// clang-format off
 {#
 // @lint-ignore LINTIGNORE
 // @lint-ignore-every CLANGFORMAT
@@ -103,7 +104,12 @@ Tensor
     const int64_t iter,
     const double gwd_lower_bound,
     {%- endif %}
+    {%- if vbe and not dense %}
+    const bool is_experimental,
+    std::optional<Tensor> vbe_output
+    {%- else %}
     const bool is_experimental
+    {%- endif %}
 ) {
   // NB: omitted the device tests TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL
   {%- if not nobag %}
@@ -210,4 +216,4 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
 {%- endfor %} {#-/* for is_gwd */#}
 {%- endif %} {#/* if (not nobag or (not weighted and not vbe)) */#}
 {%- endfor %} {#-/* for nobag */#}
-// clang-format on
+// clang-format on
```
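The meta template has to mirror the CUDA signature because, under torch.compile and FakeTensor tracing, the `Meta` kernel is called with the same arguments and must produce an output of the right shape and dtype without touching real data. A hedged sketch with an invented `vbe_meta_demo` op:

```python
import torch
from torch.library import Library

lib = Library("vbe_meta_demo", "DEF")  # hypothetical namespace
lib.define("fwd(Tensor x, SymInt vbe_output_size, Tensor? vbe_output=None) -> Tensor")

def fwd_meta(x, vbe_output_size, vbe_output=None):
    # Shape-only logic, mirroring the template above: reuse the caller's
    # buffer if provided, otherwise allocate {1, vbe_output_size}.
    if vbe_output is not None:
        return vbe_output.reshape(1, -1)
    return x.new_empty((1, vbe_output_size))

lib.impl("fwd", fwd_meta, "Meta")

with torch.device("meta"):
    out = torch.ops.vbe_meta_demo.fwd(torch.empty(8), 16)
assert out.shape == (1, 16) and out.device.type == "meta"
```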

fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu

Lines changed: 26 additions & 3 deletions

```diff
@@ -6,10 +6,10 @@
  * LICENSE file in the root directory of this source tree.
  */

-{#
 // @lint-ignore LINTIGNORE
 // @lint-ignore-every CLANGFORMAT
 // clang-format off
+{#
 // Note: clang-format off doesn't work with this templaterized code,
 // so we need to keep lint-ignore-every.
 // See https://fburl.com/dw9ljh4h
@@ -391,7 +391,12 @@ batch_index_select_dim0_codegen_forward_cuda(
     const int64_t iter,
     const double gwd_lower_bound,
     {%- endif %}
+    {%- if vbe and not dense %}
+    const bool is_experimental,
+    std::optional<Tensor> vbe_output
+    {%- else %}
     const bool is_experimental
+    {%- endif %}
     {%- endif %} {#- /*if is_index_select*/ #}
 ) {
     {%- if not nobag or is_index_select %}
@@ -529,11 +534,24 @@ batch_index_select_dim0_codegen_forward_cuda(
         o_dtype == SparseType::BF16 || o_dtype == SparseType::INT8);

     {%- if vbe %}
-    // Use a 2D tensor to make it compatible with 2D PackedTensorsAccessor of other output
+    {%- if dense %}
     output = at::empty(
         {1, vbe_output_size},
         dev_weights.options().dtype(getScalarType(o_dtype))
-    );
+        );
+    {%- else %}
+    // Use a 2D tensor to make it compatible with 2D PackedTensorsAccessor of other output
+    TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(vbe_row_output_offsets, vbe_output);
+    if (vbe_output.has_value()){
+      output = vbe_output.value().reshape({1, -1});
+    }
+    else {
+      output = at::empty(
+          {1, vbe_output_size},
+          dev_weights.options().dtype(getScalarType(o_dtype))
+      );
+    }
+    {%- endif %} {#-/* if dense */#}
     {%- else %}
     int64_t total_adjusted_D = total_D;
     if (o_dtype == SparseType::INT8) {
@@ -877,7 +895,12 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
         " int iter, "
         " float gwd_lower_bound, "
     {%- endif %}
+    {%- if vbe and not dense %}
+        " bool is_experimental,"
+        " Tensor? vbe_output"
+    {%- else %}
         " bool is_experimental"
+    {%- endif %}
         ") -> Tensor"
     {%- if not dense and not nobag and not vbe %}
         // only split_embedding_codegen_forward_[un]weighted_cuda
```
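One detail worth noting in the hunk above: `vbe_output.value().reshape({1, -1})` works as an output buffer because, for a contiguous tensor, reshape returns a view, so kernel writes to the 2D alias land directly in the caller's flat pre-allocated storage. A quick demonstration of the view semantics in PyTorch:

```python
import torch

vbe_output = torch.zeros(6)         # caller's flat, pre-allocated buffer
output = vbe_output.reshape(1, -1)  # 2D alias, as the kernel expects
assert output.data_ptr() == vbe_output.data_ptr()  # same storage, no copy

output[0, 2:5] = torch.tensor([1.0, 2.0, 3.0])     # simulate a kernel write
print(vbe_output)                   # tensor([0., 0., 1., 2., 3., 0.])
```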
