
Commit f9ccd01

Srikanth Kamath authored and facebook-github-bot committed
Back out "Merge VBE output (backend)" (pytorch#4881)
Summary:
X-link: facebookresearch/FBGEMM#1906
Pull Request resolved: pytorch#4881

Original commit changeset: 633fc384e485
Original Phabricator Diff: D79704318

This diff was identified as the blame diff by bisect for this pyper release test: https://www.internalfb.com/intern/test/562950133905300?ref_report_id=0

Reviewed By: spcyppt, aschhabra

Differential Revision: D82562539

fbshipit-source-id: 25b5ba6c913a0467faaaee9fbe2022c479bb1865
1 parent: 659d807

14 files changed: +66 −226 lines

fbgemm_gpu/codegen/genscript/generate_backward_split.py

Lines changed: 0 additions & 1 deletion
@@ -422,7 +422,6 @@ def generate() -> None:
             "lxu_cache_locations",  # 3
             "uvm_cache_stats",  # 4
             "prev_iter_dev",  # 5
-            "vbe_output_offsets",  # 6
         ],
         "aux_int": [
             "iter",  # 0
fbgemm_gpu/codegen/genscript/optimizer_args.py

Lines changed: 3 additions & 1 deletion
@@ -73,7 +73,9 @@ class OptimizerArgsSetItem:
     "row_counter_dev": "(q!)",
     "row_counter_uvm": "(r!)",
     "optim_tensor": "(s!)",
-    "vbe_output": "(t!)",
+    "delta_weights_host": "(t!)",
+    "delta_weights_dev": "(u!)",
+    "delta_weights_uvm": "(v!)",
 }
 
 ######################################################################
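The "(q!)" through "(v!)" strings in this table look like PyTorch operator-schema alias annotations: the letter names the alias set and the "!" marks the tensor as mutated in place. A minimal, self-contained sketch of how such an annotation surfaces in a registered schema (the op name and arguments here are hypothetical, not FBGEMM's generated code):

// Hypothetical op registration illustrating a "(t!)" alias annotation.
// The "!" tells the dispatcher that delta_weights_dev aliases an input
// and is written in place, which matters for autograd and
// functionalization.
#include <torch/library.h>

TORCH_LIBRARY_FRAGMENT(example, m) {
  m.def(
      "optimizer_step("
      "    Tensor(t!) delta_weights_dev,"
      "    Tensor grad"
      ") -> ()");
}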

fbgemm_gpu/codegen/training/backward/embedding_backward_split_host_template.cpp

Lines changed: 2 additions & 3 deletions
@@ -708,7 +708,7 @@ class {{ autograd_func }} :
     static auto generate_vbe_metadata_op =
         torch::Dispatcher::singleton()
             .findSchemaOrThrow("fbgemm::generate_vbe_metadata", "")
-            .typed<std::tuple<Tensor, Tensor>(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const int64_t, const bool, const c10::SymInt, const int64_t, const c10::SymInt, const std::optional<Tensor>&)>();
+            .typed<std::tuple<Tensor, Tensor>(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const int64_t, const bool, const c10::SymInt, const int64_t, const c10::SymInt)>();
 
     auto [
         vbe_row_output_offsets,

@@ -729,8 +729,7 @@ class {{ autograd_func }} :
         {%- endif %}
         max_B_feature_rank,
         info_B_num_bits,
-        /*total_B=*/offsets.sym_size(0) - 1,
-        std::nullopt /* pre-allocated vbe_output is not supported in TBE interface V1 or Dense TBE */
+        /*total_B=*/offsets.sym_size(0) - 1
     );
     {%- endif %}
 
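For context, the pattern this hunk edits is the dispatcher's typed-handle API: findSchemaOrThrow looks up the registered schema and typed<Sig>() binds it to a C++ signature that every call site must match exactly, which is why dropping the trailing std::optional<Tensor>& argument touches both the handle declaration and the call. A standalone sketch, taking the signature from the diff (the argument names in the commented call are illustrative except those visible above):

// Typed dispatcher handle after the revert; the trailing
// std::optional<Tensor>& (the pre-allocated vbe_output) is gone.
#include <ATen/core/dispatch/Dispatcher.h>
#include <ATen/core/Tensor.h>

using at::Tensor;

static auto generate_vbe_metadata_op =
    torch::Dispatcher::singleton()
        .findSchemaOrThrow("fbgemm::generate_vbe_metadata", "")
        .typed<std::tuple<Tensor, Tensor>(
            const Tensor&, const Tensor&, const Tensor&, const Tensor&,
            const int64_t, const bool, const c10::SymInt, const int64_t,
            const c10::SymInt)>();

// A call site must now pass exactly nine arguments, e.g.:
// auto [vbe_row_output_offsets, vbe_b_t_map] =
//     generate_vbe_metadata_op.call(
//         B_offsets, B_offsets_rank_per_feature,
//         output_offsets_feature_rank, D_offsets, /*D=*/-1,
//         /*nobag=*/false, max_B_feature_rank, info_B_num_bits,
//         /*total_B=*/offsets.sym_size(0) - 1);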

fbgemm_gpu/codegen/training/forward/embedding_forward_split_meta_template.cpp

Lines changed: 1 addition & 7 deletions
@@ -6,7 +6,6 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-// clang-format off
 {#
 // @lint-ignore LINTIGNORE
 // @lint-ignore-every CLANGFORMAT

@@ -104,12 +103,7 @@ Tensor
     const int64_t iter,
     const double gwd_lower_bound,
     {%- endif %}
-    {%- if vbe and not dense %}
-    const bool is_experimental,
-    std::optional<Tensor> vbe_output
-    {%- else %}
     const bool is_experimental
-    {%- endif %}
 ) {
   // NB: omitted the device tests TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL
   {%- if not nobag %}

@@ -216,4 +210,4 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
 {%- endfor %} {#-/* for is_gwd */#}
 {%- endif %} {#/* if (not nobag or (not weighted and not vbe)) */#}
 {%- endfor %} {#-/* for nobag */#}
-// clang-format on
+// clang-format on
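The meta template is the shape-inference twin of the CUDA kernel: it must produce an output with the right dtype and (symbolic) shape without touching real memory, so after this revert it no longer accepts a caller-provided vbe_output either. A rough standalone sketch of the idea, not the generated code:

// Illustrative meta-style shape function: allocate symbolically, never
// reuse a caller buffer. On the meta device, empty_symint records only
// dtype and (symbolic) shape; no storage is touched.
#include <ATen/ATen.h>

at::Tensor vbe_output_meta(const at::Tensor& dev_weights,
                           c10::SymInt vbe_output_size,
                           at::ScalarType o_dtype) {
  return at::empty_symint({c10::SymInt(1), std::move(vbe_output_size)},
                          dev_weights.options().dtype(o_dtype));
}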

fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu

Lines changed: 3 additions & 26 deletions
@@ -6,10 +6,10 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+{#
 // @lint-ignore LINTIGNORE
 // @lint-ignore-every CLANGFORMAT
 // clang-format off
-{#
 // Note: clang-format off doesn't work with this templaterized code,
 // so we need to keep lint-ignore-every.
 // See https://fburl.com/dw9ljh4h

@@ -391,12 +391,7 @@ batch_index_select_dim0_codegen_forward_cuda(
     const int64_t iter,
     const double gwd_lower_bound,
     {%- endif %}
-    {%- if vbe and not dense %}
-    const bool is_experimental,
-    std::optional<Tensor> vbe_output
-    {%- else %}
     const bool is_experimental
-    {%- endif %}
     {%- endif %} {#- /*if is_index_select*/ #}
 ) {
   {%- if not nobag or is_index_select %}

@@ -534,24 +529,11 @@ batch_index_select_dim0_codegen_forward_cuda(
       o_dtype == SparseType::BF16 || o_dtype == SparseType::INT8);
 
   {%- if vbe %}
-  {%- if dense %}
-  output = at::empty(
-      {1, vbe_output_size},
-      dev_weights.options().dtype(getScalarType(o_dtype))
-  );
-  {%- else %}
   // Use a 2D tensor to make it compatible with 2D PackedTensorsAccessor of other output
-  TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(vbe_row_output_offsets, vbe_output);
-  if (vbe_output.has_value()){
-    output = vbe_output.value().reshape({1, -1});
-  }
-  else {
-    output = at::empty(
+  output = at::empty(
       {1, vbe_output_size},
      dev_weights.options().dtype(getScalarType(o_dtype))
-    );
-  }
-  {%- endif %} {#-/* if dense */#}
+  );
   {%- else %}
   int64_t total_adjusted_D = total_D;
   if (o_dtype == SparseType::INT8) {

@@ -895,12 +877,7 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
     " int iter, "
     " float gwd_lower_bound, "
     {%- endif %}
-    {%- if vbe and not dense %}
-    " bool is_experimental,"
-    " Tensor? vbe_output"
-    {%- else %}
     " bool is_experimental"
-    {%- endif %}
     ") -> Tensor"
     {%- if not dense and not nobag and not vbe %}
     // only split_embedding_codegen_forward_[un]weighted_cuda
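After the backout, the non-dense VBE path matches the dense one again: the forward always allocates a fresh 1 × vbe_output_size tensor (kept 2D for compatibility with the 2D PackedTensorAccessor used by the fixed-batch output) instead of optionally reshaping a caller-supplied buffer. A minimal sketch of the restored allocation, with a hypothetical helper name and a plain ScalarType standing in for the template's getScalarType(o_dtype):

// Hypothetical helper mirroring the allocation restored in the template.
#include <ATen/ATen.h>

at::Tensor alloc_vbe_output(const at::Tensor& dev_weights,
                            int64_t vbe_output_size,
                            at::ScalarType o_dtype) {
  // Always freshly allocated; the pre-allocated vbe_output fast path
  // (reshape({1, -1}) of a caller buffer) has been backed out.
  return at::empty({1, vbe_output_size},
                   dev_weights.options().dtype(o_dtype));
}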
