
Commit f9ccd01

Srikanth Kamath authored and facebook-github-bot committed
Back out "Merge VBE output (backend)" (pytorch#4881)
Summary:
X-link: facebookresearch/FBGEMM#1906
Pull Request resolved: pytorch#4881

Original commit changeset: 633fc384e485
Original Phabricator Diff: D79704318

This diff was identified as the blame diff by bisect for this pyper release test: https://www.internalfb.com/intern/test/562950133905300?ref_report_id=0

Reviewed By: spcyppt, aschhabra

Differential Revision: D82562539

fbshipit-source-id: 25b5ba6c913a0467faaaee9fbe2022c479bb1865
1 parent: 659d807

14 files changed: +66 −226 lines

fbgemm_gpu/codegen/genscript/generate_backward_split.py

Lines changed: 0 additions & 1 deletion
@@ -422,7 +422,6 @@ def generate() -> None:
             "lxu_cache_locations",  # 3
             "uvm_cache_stats",  # 4
             "prev_iter_dev",  # 5
-            "vbe_output_offsets",  # 6
         ],
         "aux_int": [
             "iter",  # 0
fbgemm_gpu/codegen/genscript/optimizer_args.py

Lines changed: 3 additions & 1 deletion
@@ -73,7 +73,9 @@ class OptimizerArgsSetItem:
     "row_counter_dev": "(q!)",
     "row_counter_uvm": "(r!)",
     "optim_tensor": "(s!)",
-    "vbe_output": "(t!)",
+    "delta_weights_host": "(t!)",
+    "delta_weights_dev": "(u!)",
+    "delta_weights_uvm": "(v!)",
 }
 
 ######################################################################
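The "(q!)" through "(v!)" strings in this table look like PyTorch operator-schema alias annotations: the letter names the alias set and the "!" marks the tensor as mutated in place. A minimal, self-contained sketch of how such an annotation surfaces in a registered schema (the op name and arguments here are hypothetical, not FBGEMM's generated code):

// Hypothetical op registration illustrating a "(t!)" alias annotation.
// The "!" tells the dispatcher that delta_weights_dev aliases an input
// and is written in place, which matters for autograd and
// functionalization.
#include <torch/library.h>

TORCH_LIBRARY_FRAGMENT(example, m) {
  m.def(
      "optimizer_step("
      "    Tensor(t!) delta_weights_dev,"
      "    Tensor grad"
      ") -> ()");
}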

fbgemm_gpu/codegen/training/backward/embedding_backward_split_host_template.cpp

Lines changed: 2 additions & 3 deletions
@@ -708,7 +708,7 @@ class {{ autograd_func }} :
     static auto generate_vbe_metadata_op =
         torch::Dispatcher::singleton()
             .findSchemaOrThrow("fbgemm::generate_vbe_metadata", "")
-            .typed<std::tuple<Tensor, Tensor>(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const int64_t, const bool, const c10::SymInt, const int64_t, const c10::SymInt, const std::optional<Tensor>&)>();
+            .typed<std::tuple<Tensor, Tensor>(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const int64_t, const bool, const c10::SymInt, const int64_t, const c10::SymInt)>();
 
     auto [
         vbe_row_output_offsets,

@@ -729,8 +729,7 @@ class {{ autograd_func }} :
         {%- endif %}
         max_B_feature_rank,
         info_B_num_bits,
-        /*total_B=*/offsets.sym_size(0) - 1,
-        std::nullopt /* pre-allocated vbe_output is not supported in TBE interface V1 or Dense TBE */
+        /*total_B=*/offsets.sym_size(0) - 1
     );
     {%- endif %}
 
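For context, the pattern this hunk edits is the dispatcher's typed-handle API: findSchemaOrThrow looks up the registered schema and typed<Sig>() binds it to a C++ signature that every call site must match exactly, which is why dropping the trailing std::optional<Tensor>& argument touches both the handle declaration and the call. A standalone sketch, taking the signature from the diff (the argument names in the commented call are illustrative except those visible above):

// Typed dispatcher handle after the revert; the trailing
// std::optional<Tensor>& (the pre-allocated vbe_output) is gone.
#include <ATen/core/dispatch/Dispatcher.h>
#include <ATen/core/Tensor.h>

using at::Tensor;

static auto generate_vbe_metadata_op =
    torch::Dispatcher::singleton()
        .findSchemaOrThrow("fbgemm::generate_vbe_metadata", "")
        .typed<std::tuple<Tensor, Tensor>(
            const Tensor&, const Tensor&, const Tensor&, const Tensor&,
            const int64_t, const bool, const c10::SymInt, const int64_t,
            const c10::SymInt)>();

// A call site must now pass exactly nine arguments, e.g.:
// auto [vbe_row_output_offsets, vbe_b_t_map] =
//     generate_vbe_metadata_op.call(
//         B_offsets, B_offsets_rank_per_feature,
//         output_offsets_feature_rank, D_offsets, /*D=*/-1,
//         /*nobag=*/false, max_B_feature_rank, info_B_num_bits,
//         /*total_B=*/offsets.sym_size(0) - 1);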

fbgemm_gpu/codegen/training/forward/embedding_forward_split_meta_template.cpp

Lines changed: 1 addition & 7 deletions
@@ -6,7 +6,6 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-// clang-format off
 {#
 // @lint-ignore LINTIGNORE
 // @lint-ignore-every CLANGFORMAT

@@ -104,12 +103,7 @@ Tensor
     const int64_t iter,
     const double gwd_lower_bound,
     {%- endif %}
-    {%- if vbe and not dense %}
-    const bool is_experimental,
-    std::optional<Tensor> vbe_output
-    {%- else %}
     const bool is_experimental
-    {%- endif %}
 ) {
   // NB: omitted the device tests TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL
   {%- if not nobag %}

@@ -216,4 +210,4 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
 {%- endfor %} {#-/* for is_gwd */#}
 {%- endif %} {#/* if (not nobag or (not weighted and not vbe)) */#}
 {%- endfor %} {#-/* for nobag */#}
-// clang-format on
+// clang-format on
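The meta template is the shape-inference twin of the CUDA kernel: it must produce an output with the right dtype and (symbolic) shape without touching real memory, so after this revert it no longer accepts a caller-provided vbe_output either. A rough standalone sketch of the idea, not the generated code:

// Illustrative meta-style shape function: allocate symbolically, never
// reuse a caller buffer. On the meta device, empty_symint records only
// dtype and (symbolic) shape; no storage is touched.
#include <ATen/ATen.h>

at::Tensor vbe_output_meta(const at::Tensor& dev_weights,
                           c10::SymInt vbe_output_size,
                           at::ScalarType o_dtype) {
  return at::empty_symint({c10::SymInt(1), std::move(vbe_output_size)},
                          dev_weights.options().dtype(o_dtype));
}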

fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu

Lines changed: 3 additions & 26 deletions
@@ -6,10 +6,10 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+{#
 // @lint-ignore LINTIGNORE
 // @lint-ignore-every CLANGFORMAT
 // clang-format off
-{#
 // Note: clang-format off doesn't work with this templaterized code,
 // so we need to keep lint-ignore-every.
 // See https://fburl.com/dw9ljh4h

@@ -391,12 +391,7 @@ batch_index_select_dim0_codegen_forward_cuda(
     const int64_t iter,
     const double gwd_lower_bound,
     {%- endif %}
-    {%- if vbe and not dense %}
-    const bool is_experimental,
-    std::optional<Tensor> vbe_output
-    {%- else %}
     const bool is_experimental
-    {%- endif %}
     {%- endif %} {#- /*if is_index_select*/ #}
 ) {
   {%- if not nobag or is_index_select %}

@@ -534,24 +529,11 @@ batch_index_select_dim0_codegen_forward_cuda(
       o_dtype == SparseType::BF16 || o_dtype == SparseType::INT8);
 
   {%- if vbe %}
-  {%- if dense %}
-  output = at::empty(
-      {1, vbe_output_size},
-      dev_weights.options().dtype(getScalarType(o_dtype))
-  );
-  {%- else %}
   // Use a 2D tensor to make it compatible with 2D PackedTensorsAccessor of other output
-  TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(vbe_row_output_offsets, vbe_output);
-  if (vbe_output.has_value()){
-    output = vbe_output.value().reshape({1, -1});
-  }
-  else {
-    output = at::empty(
+  output = at::empty(
       {1, vbe_output_size},
      dev_weights.options().dtype(getScalarType(o_dtype))
-    );
-  }
-  {%- endif %} {#-/* if dense */#}
+  );
   {%- else %}
   int64_t total_adjusted_D = total_D;
   if (o_dtype == SparseType::INT8) {

@@ -895,12 +877,7 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
     " int iter, "
     " float gwd_lower_bound, "
     {%- endif %}
-    {%- if vbe and not dense %}
-    " bool is_experimental,"
-    " Tensor? vbe_output"
-    {%- else %}
     " bool is_experimental"
-    {%- endif %}
     ") -> Tensor"
     {%- if not dense and not nobag and not vbe %}
     // only split_embedding_codegen_forward_[un]weighted_cuda
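After the backout, the non-dense VBE path matches the dense one again: the forward always allocates a fresh 1 × vbe_output_size tensor (kept 2D for compatibility with the 2D PackedTensorAccessor used by the fixed-batch output) instead of optionally reshaping a caller-supplied buffer. A minimal sketch of the restored allocation, with a hypothetical helper name and a plain ScalarType standing in for the template's getScalarType(o_dtype):

// Hypothetical helper mirroring the allocation restored in the template.
#include <ATen/ATen.h>

at::Tensor alloc_vbe_output(const at::Tensor& dev_weights,
                            int64_t vbe_output_size,
                            at::ScalarType o_dtype) {
  // Always freshly allocated; the pre-allocated vbe_output fast path
  // (reshape({1, -1}) of a caller buffer) has been backed out.
  return at::empty({1, vbe_output_size},
                   dev_weights.options().dtype(o_dtype));
}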
