@@ -27,6 +27,12 @@ namespace {
 
 template <typename TKey, typename TValue>
 struct GroupEmbeddingBackWardArgs {
+  GroupEmbeddingBackWardArgs() = default;
+  GroupEmbeddingBackWardArgs(TValue *grads, TKey *sp_values,
+                             TValue *emb_variable, TValue *grads_output,
+                             int *offset_indices, int nnz)
+      : grads_(grads), sp_values_(sp_values), emb_variable_(emb_variable),
+        grads_output_(grads_output), offset_indices_(offset_indices), nnz_(nnz) {}
   TValue *grads_;
   TKey *sp_values_;
   TValue *emb_variable_;
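For orientation, the new constructor simply packs one lookup table's device pointers on the host; a minimal usage sketch (the pointer names and the `lookuper` object below are placeholders, not identifiers taken from this diff):

  // Hypothetical host-side packing of one lookup table's pointers.
  GroupEmbeddingBackWardArgs<int64_t, float> arg(
      /*grads=*/d_grads, /*sp_values=*/d_sp_values,
      /*emb_variable=*/d_emb_table, /*grads_output=*/d_grads_out,
      /*offset_indices=*/d_offsets, /*nnz=*/nnz);
  lookuper.set(arg);  // appended to h_args_, uploaded once in Backward()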
@@ -59,26 +65,29 @@ __global__ void ComputeEVGradFn(
       feature_num = args[idx].offset_indices_[bid + 1] - value_offset;
     }
 
-    float grad = args[idx].grads_[bid * dimension + tid];
-    grad = CombineGrad<combiner>(grad, feature_num);
-
-    for (int j = 0; j < feature_num; ++j) {
-      float grad_i = grad;
-      int feature_offset = (value_offset + j) * dimension;
-      if (max_norm > 0.0f) {
-        float emb_element = 0.0f;  // TODO: hujunqi get emb_weight
-        if (tid == 0) {
-          l2_sum = 0.0f;
-        }
-        tile.shfl(l2_sum, 0);
-        atomicAdd(&l2_sum, emb_element * emb_element);
-        tile.sync();
-        float l2_norm = sqrtf(l2_sum);
-        if (l2_norm > max_norm) {
-          grad_i *= max_norm / l2_norm;
+    if (feature_num > 0) {
+      float grad = args[idx].grads_[bid * dimension + tid];
+      grad = CombineGrad<combiner>(grad, feature_num);
+
+      for (int j = 0; j < feature_num; ++j) {
+        float grad_i = grad;
+        int feature_offset = (value_offset + j) * dimension;
+        if (max_norm > 0.0f) {
+          float emb_element = 0.0f;  // TODO(junqihu): get emb_weight
+          if (tid == 0) {
+            l2_sum = 0.0f;
+          }
+          tile.shfl(l2_sum, 0);
+          atomicAdd(&l2_sum, emb_element * emb_element);
+          tile.sync();
+          float l2_norm = sqrtf(l2_sum);
+          if (l2_norm > max_norm) {
+            grad_i *= max_norm / l2_norm;
+          }
         }
+        args[idx].grads_output_[(value_offset + j) * dimension + tid] =
+            grad_i;
       }
-      args[idx].grads_output_[(value_offset + j) * dimension + tid] = grad_i;
     }
   }
 }
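The kernel scales the incoming row gradient once via CombineGrad<combiner> before fanning it out to the row's features. Its definition is outside this hunk; a self-contained sketch of what such a helper typically does (the enum and enumerator names here are assumptions, and the real Combiner used in this file may differ):

  // Sketch only: "mean" divides by the row length, "sqrtn" by sqrt(n),
  // and "sum" passes the gradient through unchanged.
  enum class Combiner { kSum, kMean, kSqrtn };

  template <Combiner combiner>
  __device__ __forceinline__ float CombineGrad(float grad, int feature_num) {
    if (combiner == Combiner::kMean) return grad / feature_num;
    if (combiner == Combiner::kSqrtn)
      return grad * rsqrtf(static_cast<float>(feature_num));
    return grad;  // kSum
  }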
@@ -105,31 +114,29 @@ __global__ void ComputeSparseGradFn(
     } else {
       feature_num = args[idx].offset_indices_[bid + 1] - value_offset;
     }
-    float grad = args[idx].grads_[bid * dimension + tid];
-    // printf("feature_num is %d, grad is %lld, bid is %d, tid is %d\n",
-    //        feature_num, grad, blockIdx.x, threadIdx.x);
-    grad = CombineGrad<combiner>(grad, feature_num);
-    for (int i = 0; i < feature_num; i++) {
-      float grad_i = grad;
-      if (max_norm > 0.0f) {
-        int64_t indices = int(args[idx].sp_values_[value_offset + i]);
-        float emb_element = args[idx].emb_variable_[indices * dimension + tid];
-        // if (FastBoundsCheck(indices, args[idx].emb_row_size_)) {
-        //   emb_element =
-        //       args[idx].emb_variable_[indices * dimension + tid];
-        // }
-        if (tid == 0) {
-          l2_sum = 0.0f;
-        }
-        tile.shfl(l2_sum, 0);
-        atomicAdd(&l2_sum, emb_element * emb_element);
-        tile.sync();
-        float l2_norm = sqrtf(l2_sum);
-        if (l2_norm > max_norm) {
-          grad_i *= max_norm / l2_norm;
+
+    if (feature_num > 0) {
+      float grad = args[idx].grads_[bid * dimension + tid];
+      grad = CombineGrad<combiner>(grad, feature_num);
+      for (int i = 0; i < feature_num; i++) {
+        float grad_i = grad;
+        if (max_norm > 0.0f) {
+          int64_t indices = int(args[idx].sp_values_[value_offset + i]);
+          float emb_element =
+              args[idx].emb_variable_[indices * dimension + tid];
+          if (tid == 0) {
+            l2_sum = 0.0f;
+          }
+          tile.shfl(l2_sum, 0);
+          atomicAdd(&l2_sum, emb_element * emb_element);
+          tile.sync();
+          float l2_norm = sqrtf(l2_sum);
+          if (l2_norm > max_norm) {
+            grad_i *= max_norm / l2_norm;
+          }
         }
+        args[idx].grads_output_[(value_offset + i) * dimension + tid] = grad_i;
       }
-      args[idx].grads_output_[(value_offset + i) * dimension + tid] = grad_i;
     }
   }
 }
@@ -156,26 +163,29 @@ __global__ void NormalComputeEVGradFn(
       feature_num = args[idx].offset_indices_[bid + 1] - value_offset;
     }
 
-    float grad = args[idx].grads_[bid * dimension + tid];
-    grad = CombineGrad<combiner>(grad, feature_num);
-
-    for (int j = 0; j < feature_num; ++j) {
-      float grad_i = grad;
-      int feature_offset = (value_offset + j) * dimension;
-      if (max_norm > 0.0f) {
-        float emb_element = 0.0f;  // TODO: hujunqi get emb_weight
-        if (tid == 0) {
-          l2_sum[0] = 0.0f;
-        }
-        __syncthreads();
-        atomicAdd(l2_sum, emb_element * emb_element);
-        __syncthreads();
-        float l2_norm = sqrtf(l2_sum[0]);
-        if (l2_norm > max_norm) {
-          grad_i *= max_norm / l2_norm;
+    if (feature_num > 0) {
+      float grad = args[idx].grads_[bid * dimension + tid];
+      grad = CombineGrad<combiner>(grad, feature_num);
+
+      for (int j = 0; j < feature_num; ++j) {
+        float grad_i = grad;
+        int feature_offset = (value_offset + j) * dimension;
+        if (max_norm > 0.0f) {
+          float emb_element = 0.0f;  // TODO(junqihu): get emb_weight
+          if (tid == 0) {
+            l2_sum[0] = 0.0f;
+          }
+          __syncthreads();
+          atomicAdd(l2_sum, emb_element * emb_element);
+          __syncthreads();
+          float l2_norm = sqrtf(l2_sum[0]);
+          if (l2_norm > max_norm) {
+            grad_i *= max_norm / l2_norm;
+          }
         }
+        args[idx].grads_output_[(value_offset + j) * dimension + tid] =
+            grad_i;
       }
-      args[idx].grads_output_[(value_offset + j) * dimension + tid] = grad_i;
     }
   }
 }
@@ -201,27 +211,29 @@ __global__ void NormalComputeSparseGradFn(
     } else {
       feature_num = args[idx].offset_indices_[bid + 1] - value_offset;
     }
-    float grad = args[idx].grads_[bid * dimension + tid];
-    // printf("feature_num is %d, grad is %lld, bid is %d, tid is %d\n",
-    //        feature_num, grad, blockIdx.x, threadIdx.x);
-    grad = CombineGrad<combiner>(grad, feature_num);
-    for (int i = 0; i < feature_num; i++) {
-      float grad_i = grad;
-      if (max_norm > 0.0f) {
-        int64_t indices = int(args[idx].sp_values_[value_offset + i]);
-        float emb_element = args[idx].emb_variable_[indices * dimension + tid];
-        if (tid == 0) {
-          l2_sum[0] = 0.0f;
-        }
-        __syncthreads();
-        atomicAdd(l2_sum, emb_element * emb_element);
-        __syncthreads();
-        float l2_norm = sqrtf(l2_sum[0]);
-        if (l2_norm > max_norm) {
-          grad_i *= max_norm / l2_norm;
+
+    if (feature_num > 0) {
+      float grad = args[idx].grads_[bid * dimension + tid];
+      grad = CombineGrad<combiner>(grad, feature_num);
+      for (int i = 0; i < feature_num; i++) {
+        float grad_i = grad;
+        if (max_norm > 0.0f) {
+          int64_t indices = int(args[idx].sp_values_[value_offset + i]);
+          float emb_element =
+              args[idx].emb_variable_[indices * dimension + tid];
+          if (tid == 0) {
+            l2_sum[0] = 0.0f;
+          }
+          __syncthreads();
+          atomicAdd(l2_sum, emb_element * emb_element);
+          __syncthreads();
+          float l2_norm = sqrtf(l2_sum[0]);
+          if (l2_norm > max_norm) {
+            grad_i *= max_norm / l2_norm;
+          }
         }
+        args[idx].grads_output_[(value_offset + i) * dimension + tid] = grad_i;
      }
-      args[idx].grads_output_[(value_offset + i) * dimension + tid] = grad_i;
    }
   }
 }
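All four kernels apply the same max_norm rule: when the looked-up embedding row's L2 norm exceeds max_norm, the gradient written for that row is scaled by max_norm / l2_norm. A scalar host-side reference of that factor, handy for unit tests (not part of this diff):

  #include <cmath>

  // Reference for the per-row clipping factor computed by the kernels above.
  inline float MaxNormScale(const float *row, int dim, float max_norm) {
    float l2_sum = 0.0f;
    for (int d = 0; d < dim; ++d) l2_sum += row[d] * row[d];
    const float l2_norm = std::sqrt(l2_sum);
    return (max_norm > 0.0f && l2_norm > max_norm) ? max_norm / l2_norm : 1.0f;
  }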
@@ -231,52 +243,54 @@ __global__ void NormalComputeSparseGradFn(
 template <typename TKey, typename TValue>
 class GroupEmbeddingLookupBackWard {
  public:
-  void initialize(int dimension, int num_lookups, float max_norm) {
-    CK_CUDA_THROW_(cudaMalloc(
-        &d_args_,
-        sizeof(GroupEmbeddingBackWardArgs<TKey, TValue>) * num_lookups));
-    args_.resize(num_lookups);
+  explicit GroupEmbeddingLookupBackWard(int dimension, int num_lookups,
+                                        float max_norm,
+                                        Allocator *gpu_allocator = nullptr)
+      : alloc_(gpu_allocator) {
+    d_args_ =
+        TypedAllocator::Allocate<GroupEmbeddingBackWardArgs<TKey, TValue>>(
+            gpu_allocator, num_lookups, AllocationAttributes());
+    h_args_.reserve(num_lookups);
     max_norm_ = max_norm;
     nums_ = num_lookups;
     dimension_ = dimension;
   }
 
-  void set(int idx, TValue *grads, TValue *grads_output, int *offset_indices,
-           TKey *sp_values, TValue *emb_variable, int nnz) {
-    args_[idx].grads_ = grads;
-    args_[idx].grads_output_ = grads_output;
-    args_[idx].offset_indices_ = offset_indices;
-    args_[idx].sp_values_ = sp_values;
-    args_[idx].emb_variable_ = emb_variable;
-    args_[idx].nnz_ = nnz;
+  void set(GroupEmbeddingBackWardArgs<TKey, TValue> &arg) {
+    h_args_.emplace_back(arg);
   }
 
   ~GroupEmbeddingLookupBackWard() {
-    if (d_args_) {
-      CK_CUDA_THROW_(cudaFree(d_args_));
-    }
+    TypedAllocator::Deallocate(alloc_, d_args_, nums_);
   }
 
   template <typename GradFn>
-  void Backward(GradFn fn, int batch_size, int tile_size, cudaStream_t stream) {
+  inline void Backward(GradFn fn, int batch_size, int tile_size,
+                       cudaStream_t stream) {
     CK_CUDA_THROW_(cudaMemcpyAsync(
-        d_args_, args_.data(),
-        args_.size() * sizeof(GroupEmbeddingBackWardArgs<TKey, TValue>),
+        d_args_, h_args_.data(),
+        h_args_.size() * sizeof(GroupEmbeddingBackWardArgs<TKey, TValue>),
         cudaMemcpyHostToDevice, stream));
 
     {
-      const int block_size = (batch_size - 1) / 64 * tile_size + 1;
+      if (tile_size <= 32) {
+        const int block_size = batch_size / 64 * tile_size + 1;
 
-      fn<<<block_size, 64, 0, stream>>>(batch_size, max_norm_, nums_,
-                                        dimension_, d_args_);
+        fn<<<block_size, 64, 0, stream>>>(batch_size, max_norm_, nums_,
+                                          dimension_, d_args_);
+      } else {
+        fn<<<batch_size, tile_size, 0, stream>>>(batch_size, max_norm_, nums_,
+                                                 dimension_, d_args_);
+      }
     }
 
     CK_CUDA_THROW_(cudaGetLastError());
   }
 
  protected:
-  std::vector<GroupEmbeddingBackWardArgs<TKey, TValue>> args_;
+  std::vector<GroupEmbeddingBackWardArgs<TKey, TValue>> h_args_;
   GroupEmbeddingBackWardArgs<TKey, TValue> *d_args_;
+  Allocator *alloc_;
   float max_norm_;
   int nums_;
   int dimension_;
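GroupEmbeddingLookupBackWard now takes a TensorFlow Allocator instead of calling cudaMalloc/cudaFree directly, and collects the per-table args on the host before a single async upload. A hedged sketch of how an op might drive it; `ctx`, the pointer arrays, and `nnz` stand in for values the real op reads from its inputs:

  // Hypothetical caller inside an op's Compute(); error handling omitted.
  Allocator *gpu_alloc = ctx->get_allocator(AllocatorAttributes());
  GroupEmbeddingLookupBackWard<int64_t, float> lookuper(
      dimension_, num_lookups_, max_norm_, gpu_alloc);
  for (int i = 0; i < num_lookups_; ++i) {
    GroupEmbeddingBackWardArgs<int64_t, float> arg(
        grads[i], sp_values[i], emb_tables[i], grads_out[i], offsets[i],
        nnz[i]);
    lookuper.set(arg);
  }
  // compute<Isev, combiner>(lookuper, batch_size, stream) below then picks
  // the kernel and tile size and calls lookuper.Backward(...).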
@@ -290,56 +304,55 @@ class GroupLookupBackWardBaseOp : public OpKernel {
     OP_REQUIRES_OK(c, c->GetAttr("max_norm", &max_norm_));
     OP_REQUIRES_OK(c, c->GetAttr("num_lookups", &num_lookups_));
     OP_REQUIRES_OK(c, c->GetAttr("dimension", &dimension_));
-    lookuper_.initialize(dimension_, num_lookups_, max_norm_);
   }
 
   template <bool Isev = false, Combiner combiner>
-  inline void compute(const int batch_size, cudaStream_t stream) {
+  inline void compute(GroupEmbeddingLookupBackWard<TKey, TValue> &lookuper,
+                      const int batch_size, cudaStream_t stream) {
     if (Isev) {
       if (dimension_ <= 2) {
-        lookuper_.Backward(ComputeEVGradFn<TKey, TValue, combiner, 2>,
-                           batch_size, 2, stream);
+        lookuper.Backward(ComputeEVGradFn<TKey, TValue, combiner, 2>,
+                          batch_size, 2, stream);
      } else if (dimension_ <= 4) {
-        lookuper_.Backward(ComputeEVGradFn<TKey, TValue, combiner, 4>,
-                           batch_size, 4, stream);
+        lookuper.Backward(ComputeEVGradFn<TKey, TValue, combiner, 4>,
+                          batch_size, 4, stream);
      } else if (dimension_ <= 8) {
-        lookuper_.Backward(ComputeEVGradFn<TKey, TValue, combiner, 8>,
-                           batch_size, 8, stream);
+        lookuper.Backward(ComputeEVGradFn<TKey, TValue, combiner, 8>,
+                          batch_size, 8, stream);
      } else if (dimension_ <= 16) {
-        lookuper_.Backward(ComputeEVGradFn<TKey, TValue, combiner, 16>,
-                           batch_size, 16, stream);
+        lookuper.Backward(ComputeEVGradFn<TKey, TValue, combiner, 16>,
+                          batch_size, 16, stream);
      } else if (dimension_ <= 32) {
-        lookuper_.Backward(ComputeEVGradFn<TKey, TValue, combiner, 32>,
-                           batch_size, 32, stream);
+        lookuper.Backward(ComputeEVGradFn<TKey, TValue, combiner, 32>,
+                          batch_size, 32, stream);
      } else {
-        lookuper_.Backward(NormalComputeEVGradFn<TKey, TValue, combiner>,
-                           batch_size, 64, stream);
+        lookuper.Backward(NormalComputeEVGradFn<TKey, TValue, combiner>,
+                          batch_size, dimension_, stream);
      }
     } else {
       if (dimension_ <= 2) {
-        lookuper_.Backward(ComputeSparseGradFn<TKey, TValue, combiner, 2>,
-                           batch_size, 2, stream);
+        lookuper.Backward(ComputeSparseGradFn<TKey, TValue, combiner, 2>,
+                          batch_size, 2, stream);
      } else if (dimension_ <= 4) {
-        lookuper_.Backward(ComputeSparseGradFn<TKey, TValue, combiner, 4>,
-                           batch_size, 4, stream);
+        lookuper.Backward(ComputeSparseGradFn<TKey, TValue, combiner, 4>,
+                          batch_size, 4, stream);
      } else if (dimension_ <= 8) {
-        lookuper_.Backward(ComputeSparseGradFn<TKey, TValue, combiner, 8>,
-                           batch_size, 8, stream);
+        lookuper.Backward(ComputeSparseGradFn<TKey, TValue, combiner, 8>,
+                          batch_size, 8, stream);
      } else if (dimension_ <= 16) {
-        lookuper_.Backward(ComputeSparseGradFn<TKey, TValue, combiner, 16>,
-                           batch_size, 16, stream);
+        lookuper.Backward(ComputeSparseGradFn<TKey, TValue, combiner, 16>,
+                          batch_size, 16, stream);
      } else if (dimension_ <= 32) {
-        lookuper_.Backward(ComputeSparseGradFn<TKey, TValue, combiner, 32>,
-                           batch_size, 32, stream);
+        lookuper.Backward(ComputeSparseGradFn<TKey, TValue, combiner, 32>,
+                          batch_size, 32, stream);
      } else {
-        lookuper_.Backward(NormalComputeSparseGradFn<TKey, TValue, combiner>,
-                           batch_size, 64, stream);
+        lookuper.Backward(NormalComputeSparseGradFn<TKey, TValue, combiner>,
+                          batch_size, dimension_, stream);
      }
     }
   }
 
  protected:
-  GroupEmbeddingLookupBackWard<TKey, TValue> lookuper_;
   std::string combiner_;
   float max_norm_;
   int num_lookups_;
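For reference on the launch geometry: the dispatch above maps dimension buckets 2/4/8/16/32 to the tiled kernels, and anything wider now launches the Normal* kernels with `dimension_` threads per block (one block per batch row) instead of the previous fixed 64. A toy check of the tiled-path grid arithmetic, assuming each 64-thread block covers 64 / tile_size rows:

  #include <cstdio>

  int main() {
    const int batch_size = 1024, tile_size = 8;              // example numbers
    const int block_size = batch_size / 64 * tile_size + 1;  // 129 blocks of 64 threads
    std::printf("blocks=%d rows_covered=%d\n", block_size,
                block_size * (64 / tile_size));              // 129, 1032 >= 1024
    return 0;
  }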