
Commit 6ef0978

spcyppt authored and facebook-github-bot committed
Merge VBE output (backend) (#4846)
Summary: Pull Request resolved: #4846

X-link: facebookresearch/FBGEMM#1775

Currently, Torchrec merges the outputs of individual VBE TBE ops so that they are ordered by rank, using [_merge_variable_batch_embeddings](https://www.internalfb.com/code/fbsource/[3bd69d7fa3534144dcb0162ca59803a6c3ff6e70]/fbcode/torchrec/distributed/embedding_lookup.py?lines=593-604). This function appears to cause a ~30% QPS regression relative to the baseline (HBM+UVM) for the Jupiter V1 model with VBE enabled.

To eliminate the `_merge_variable_batch_embeddings()` function, we pre-allocate a `vbe_output` tensor that holds the outputs of all VBE ops and compute `vbe_output_offsets` so that each individual VBE op writes to the correct location in the `vbe_output` tensor. By default, `vbe_output` and `vbe_output_offsets` are `None`, in which case each VBE op returns its own tensor as it does today. The feature is enabled when `vbe_output` and `vbe_output_offsets` are not `None`.

---

**NOTE**
1. This feature is currently supported only for Sparse TBE.
2. Support is limited to CUDA.
3. For backward compatibility, we append the newly introduced `vbe_output` to the existing API. Hence, `vbe_output` must be an `optional` tensor with a default value of `None` (a plain Tensor argument cannot have a default value).
4. We *cannot* annotate `vbe_output`, because PyTorch schema registration does not support alias annotations on optional tensors; adding one triggers the error below. This may cause issues for supporting the feature on MTIA, if MTIA relies on tensor annotations.

```
E0903 09:50:32.966235 2850885 ExceptionTracer.cpp:227] exception stack complete
terminate called after throwing an instance of 'std::runtime_error'
  what():  expected ident but found '(' here:
  split_embedding_codegen_lookup_adagrad_function_pt2(
      Tensor placeholder_autograd_tensor,
      Tensor[](a!) weights,
      Tensor D_offsets,
      SymInt total_D,
      SymInt max_D,
      Tensor hash_size_cumsum,
      int total_hash_size_bits,
      Tensor indices,
      Tensor offsets,
      int pooling_mode,
      Tensor? indice_weights,
      Tensor? feature_requires_grad,
      int output_dtype,
      Tensor?[](e!) aux_tensor,
      int[] aux_int,
      float[] aux_float,
      bool[] aux_bool,
      Tensor[](g!) momentum1,
      Tensor learning_rate_tensor,
      float[] optim_float,
      SymInt max_B=-1,
      SymInt max_B_feature_rank=-1,
      SymInt vbe_output_size=-1,
      Tensor?(t!) vbe_output=None
  ) -> Tensor
       ~ <--- HERE
```

See https://docs.google.com/document/d/1h5YyeCjYmmN-CIFB98CrBf1uMksidPbNvM1rl8yZeds/edit?tab=t.0#heading=h.tdfkkc6ujdyl

Reviewed By: q10, sryap, ionuthristodorescu

Differential Revision: D79704318

fbshipit-source-id: 633fc384e4852eb5b066fc3f278a1ba1a9d54d93
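As context for the diffs below, here is a minimal sketch of the two strategies in plain PyTorch. It is illustrative only: `run_vbe_op` and the embedding sizes are hypothetical stand-ins, not the FBGEMM or Torchrec API.

```python
from typing import Optional
import torch

# Hypothetical stand-in for one VBE TBE op producing `rows * dim` values.
def run_vbe_op(dim: int, rows: int, out: Optional[torch.Tensor] = None) -> torch.Tensor:
    result = torch.randn(rows * dim)
    if out is None:
        return result      # current path: each op returns its own tensor
    out.copy_(result)      # new path: write into the pre-allocated slice
    return out

# Current path: every op allocates its own output, then a separate merge
# pass (what _merge_variable_batch_embeddings does) reorders/concatenates.
merged = torch.cat([run_vbe_op(4, 3), run_vbe_op(8, 2)])

# New path: pre-allocate one flat vbe_output and compute per-op offsets so
# each op writes directly to its final location; no merge pass is needed.
sizes = [4 * 3, 8 * 2]
offsets = [0, sizes[0], sizes[0] + sizes[1]]   # cumulative write offsets
vbe_output = torch.empty(offsets[-1])
for (start, end), (dim, rows) in zip(zip(offsets, offsets[1:]), [(4, 3), (8, 2)]):
    run_vbe_op(dim, rows, out=vbe_output[start:end])
```

Because `vbe_output[start:end]` is a view, each op's write lands directly at its final location in the shared buffer.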
1 parent ba47c13 · commit 6ef0978

14 files changed · +226 -66 lines changed

fbgemm_gpu/codegen/genscript/generate_backward_split.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -422,6 +422,7 @@ def generate() -> None:
             "lxu_cache_locations",  # 3
             "uvm_cache_stats",  # 4
             "prev_iter_dev",  # 5
+            "vbe_output_offsets",  # 6
         ],
         "aux_int": [
             "iter",  # 0
```

fbgemm_gpu/codegen/genscript/optimizer_args.py

Lines changed: 1 addition & 3 deletions

```diff
@@ -73,9 +73,7 @@ class OptimizerArgsSetItem:
     "row_counter_dev": "(q!)",
     "row_counter_uvm": "(r!)",
     "optim_tensor": "(s!)",
-    "delta_weights_host": "(t!)",
-    "delta_weights_dev": "(u!)",
-    "delta_weights_uvm": "(v!)",
+    "vbe_output": "(t!)",
 }


 ######################################################################
```
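The `"(t!)"` values in this map are PyTorch schema alias annotations: they mark arguments the op may mutate in place, and each mutated argument needs a distinct alias-set letter, which is why `vbe_output` takes over the `(t!)` letter freed by the removed `delta_weights_*` entries. A self-contained sketch of what such an annotation means, using an invented `vbe_demo` namespace:

```python
import torch
from torch.library import Library

lib = Library("vbe_demo", "DEF")  # hypothetical namespace for illustration
# "(a!)" marks `out` as mutable: the dispatcher and autograd are told the
# op writes into it, just as "(t!)" marks vbe_output in the generated schema.
lib.define("scale_(Tensor(a!) out, float alpha) -> Tensor(a!)")

def scale_impl(out: torch.Tensor, alpha: float) -> torch.Tensor:
    return out.mul_(alpha)  # in-place write, consistent with the annotation

lib.impl("scale_", scale_impl, "CompositeExplicitAutograd")

x = torch.ones(3)
torch.ops.vbe_demo.scale_(x, 2.0)
assert torch.equal(x, torch.full((3,), 2.0))  # x was mutated in place
```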

fbgemm_gpu/codegen/training/backward/embedding_backward_split_host_template.cpp

Lines changed: 3 additions & 2 deletions

```diff
@@ -708,7 +708,7 @@ class {{ autograd_func }} :
     static auto generate_vbe_metadata_op =
         torch::Dispatcher::singleton()
             .findSchemaOrThrow("fbgemm::generate_vbe_metadata", "")
-            .typed<std::tuple<Tensor, Tensor>(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const int64_t, const bool, const c10::SymInt, const int64_t, const c10::SymInt)>();
+            .typed<std::tuple<Tensor, Tensor>(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const int64_t, const bool, const c10::SymInt, const int64_t, const c10::SymInt, const std::optional<Tensor>&)>();

     auto [
         vbe_row_output_offsets,
@@ -729,7 +729,8 @@ class {{ autograd_func }} :
     {%- endif %}
         max_B_feature_rank,
         info_B_num_bits,
-        /*total_B=*/offsets.sym_size(0) - 1
+        /*total_B=*/offsets.sym_size(0) - 1,
+        std::nullopt /* pre-allocated vbe_output is not supported in TBE interface V1 or Dense TBE */
     );
     {%- endif %}
```
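The trailing `std::nullopt` reflects the backward-compatibility pattern from the summary: the new parameter is appended at the end of the schema as `Tensor? vbe_output=None`, so call sites that predate it (TBE interface V1, Dense TBE) keep working unchanged. A minimal sketch of the pattern with an invented `vbe_compat_demo` op:

```python
import torch
from torch.library import Library

lib = Library("vbe_compat_demo", "DEF")  # hypothetical namespace
# The optional tensor is appended last, with a None default, so existing
# call sites that omit it remain valid.
lib.define("fwd(Tensor x, Tensor? vbe_output=None) -> Tensor")

def fwd_impl(x, vbe_output=None):
    if vbe_output is None:
        return x * 2             # old behavior: allocate and return
    vbe_output.copy_(x * 2)      # new behavior: fill the caller's buffer
    return vbe_output

lib.impl("fwd", fwd_impl, "CompositeExplicitAutograd")

x = torch.ones(4)
old_style = torch.ops.vbe_compat_demo.fwd(x)       # argument omitted
buf = torch.empty(4)
new_style = torch.ops.vbe_compat_demo.fwd(x, buf)  # pre-allocated output
assert torch.equal(old_style, new_style) and torch.equal(buf, new_style)
```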

fbgemm_gpu/codegen/training/forward/embedding_forward_split_meta_template.cpp

Lines changed: 7 additions & 1 deletion

```diff
@@ -6,6 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */

+// clang-format off
 {#
 // @lint-ignore LINTIGNORE
 // @lint-ignore-every CLANGFORMAT
@@ -103,7 +104,12 @@ Tensor
     const int64_t iter,
     const double gwd_lower_bound,
     {%- endif %}
+    {%- if vbe and not dense %}
+    const bool is_experimental,
+    std::optional<Tensor> vbe_output
+    {%- else %}
     const bool is_experimental
+    {%- endif %}
 ) {
   // NB: omitted the device tests TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL
   {%- if not nobag %}
@@ -210,4 +216,4 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
 {%- endfor %} {#-/* for is_gwd */#}
 {%- endif %} {#/* if (not nobag or (not weighted and not vbe)) */#}
 {%- endfor %} {#-/* for nobag */#}
-// clang-format on
+// clang-format on
```
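The meta template has to mirror the CUDA signature because, under torch.compile and FakeTensor tracing, the `Meta` kernel is called with the same arguments and must produce an output of the right shape and dtype without touching real data. A hedged sketch with an invented `vbe_meta_demo` op:

```python
import torch
from torch.library import Library

lib = Library("vbe_meta_demo", "DEF")  # hypothetical namespace
lib.define("fwd(Tensor x, SymInt vbe_output_size, Tensor? vbe_output=None) -> Tensor")

def fwd_meta(x, vbe_output_size, vbe_output=None):
    # Shape-only logic, mirroring the template above: reuse the caller's
    # buffer if provided, otherwise allocate {1, vbe_output_size}.
    if vbe_output is not None:
        return vbe_output.reshape(1, -1)
    return x.new_empty((1, vbe_output_size))

lib.impl("fwd", fwd_meta, "Meta")

with torch.device("meta"):
    out = torch.ops.vbe_meta_demo.fwd(torch.empty(8), 16)
assert out.shape == (1, 16) and out.device.type == "meta"
```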

fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu

Lines changed: 26 additions & 3 deletions

```diff
@@ -6,10 +6,10 @@
  * LICENSE file in the root directory of this source tree.
  */

-{#
 // @lint-ignore LINTIGNORE
 // @lint-ignore-every CLANGFORMAT
 // clang-format off
+{#
 // Note: clang-format off doesn't work with this templaterized code,
 // so we need to keep lint-ignore-every.
 // See https://fburl.com/dw9ljh4h
@@ -391,7 +391,12 @@ batch_index_select_dim0_codegen_forward_cuda(
     const int64_t iter,
     const double gwd_lower_bound,
     {%- endif %}
+    {%- if vbe and not dense %}
+    const bool is_experimental,
+    std::optional<Tensor> vbe_output
+    {%- else %}
     const bool is_experimental
+    {%- endif %}
     {%- endif %} {#- /*if is_index_select*/ #}
 ) {
     {%- if not nobag or is_index_select %}
@@ -529,11 +534,24 @@ batch_index_select_dim0_codegen_forward_cuda(
         o_dtype == SparseType::BF16 || o_dtype == SparseType::INT8);

     {%- if vbe %}
-    // Use a 2D tensor to make it compatible with 2D PackedTensorsAccessor of other output
+    {%- if dense %}
     output = at::empty(
         {1, vbe_output_size},
         dev_weights.options().dtype(getScalarType(o_dtype))
-    );
+        );
+    {%- else %}
+    // Use a 2D tensor to make it compatible with 2D PackedTensorsAccessor of other output
+    TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(vbe_row_output_offsets, vbe_output);
+    if (vbe_output.has_value()){
+      output = vbe_output.value().reshape({1, -1});
+    }
+    else {
+      output = at::empty(
+          {1, vbe_output_size},
+          dev_weights.options().dtype(getScalarType(o_dtype))
+      );
+    }
+    {%- endif %} {#-/* if dense */#}
     {%- else %}
     int64_t total_adjusted_D = total_D;
     if (o_dtype == SparseType::INT8) {
@@ -877,7 +895,12 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
         " int iter, "
         " float gwd_lower_bound, "
     {%- endif %}
+    {%- if vbe and not dense %}
+        " bool is_experimental,"
+        " Tensor? vbe_output"
+    {%- else %}
         " bool is_experimental"
+    {%- endif %}
         ") -> Tensor"
     {%- if not dense and not nobag and not vbe %}
         // only split_embedding_codegen_forward_[un]weighted_cuda
```
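One detail worth noting in the hunk above: `vbe_output.value().reshape({1, -1})` works as an output buffer because, for a contiguous tensor, reshape returns a view, so kernel writes to the 2D alias land directly in the caller's flat pre-allocated storage. A quick demonstration of the view semantics in PyTorch:

```python
import torch

vbe_output = torch.zeros(6)         # caller's flat, pre-allocated buffer
output = vbe_output.reshape(1, -1)  # 2D alias, as the kernel expects
assert output.data_ptr() == vbe_output.data_ptr()  # same storage, no copy

output[0, 2:5] = torch.tensor([1.0, 2.0, 3.0])     # simulate a kernel write
print(vbe_output)                   # tensor([0., 0., 1., 2., 3., 0.])
```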
