
Commit db2b757

[Embedding] Fix coredump in HBM storage. (#642)
1 parent 8e3c6be commit db2b757

11 files changed: +23 −185 lines

tensorflow/core/framework/embedding/bloom_filter_policy.h

Lines changed: 0 additions & 1 deletion

@@ -179,7 +179,6 @@ class BloomFilterPolicy : public FilterPolicy<K, V, EV> {
       hash_val.emplace_back(
           FastHash64(key, seeds_[i]) % config_.num_counter);
     }
-    int64 min_freq;
     switch (config_.counter_type){
       case DT_UINT64:
         SetMinFreq<uint64>(hash_val, freq);

tensorflow/core/framework/embedding/embedding_var.h

Lines changed: 0 additions & 162 deletions

@@ -730,166 +730,4 @@ class EmbeddingVar : public ResourceBase {
 
 } // namespace tensorflow
 
-#if GOOGLE_CUDA
-namespace tensorflow {
-
-template <class K, class V>
-class EmbeddingVarGPU : public ResourceBase {
- public:
-  EmbeddingVarGPU(const string& name,
-                  embedding::GPUHashMapKV<K, V>* kv,
-                  Allocator* alloc,
-                  const EmbeddingConfig& emb_cfg = EmbeddingConfig()):
-      name_(name),
-      kv_(kv),
-      default_value_(nullptr),
-      value_len_(0),
-      emb_config_(emb_cfg) {
-    alloc_ =
-        DisableGPUEVAllocatorFromEnvironment() ? alloc : gpu_ev_allocator();
-  }
-
-  Status Init() {
-    if (kv_ == nullptr) {
-      return errors::InvalidArgument("Error to construct EmbeddingVarGPU");
-    } else {
-      return Status::OK();
-    }
-  }
-
-  Status Init(const Tensor& default_tensor,
-              int64 default_value_dim=1) {
-    if (DataTypeToEnum<V>::v() != default_tensor.dtype()) {
-      return errors::InvalidArgument(
-          "EV's default_tensor DTYPE must be same as EmbeddingVar Value Type");
-    } else if (kv_ == nullptr) {
-      return errors::InvalidArgument("Error to construct EmbeddingVarGPU");
-    } else {
-      emb_config_.default_value_dim = default_value_dim;
-      value_len_ =
-          default_tensor.NumElements() / emb_config_.default_value_dim;
-      kv_->SetValueLen(value_len_);
-      default_value_ = TypedAllocator::Allocate<V>(
-          alloc_, default_tensor.NumElements(), AllocationAttributes());
-      auto default_tensor_flat = default_tensor.flat<V>();
-      cudaMemcpy(default_value_, &default_tensor_flat(0),
-          default_tensor.TotalBytes(), cudaMemcpyDeviceToDevice);
-      return Status::OK();
-    }
-  }
-
-  void SetInitialized() {
-    is_initialized_ = true;
-  }
-
-  bool IsInitialized() const {
-    return is_initialized_;
-  }
-
-  void LookupOrCreateKey(const K* key, int32* item_idxs, size_t n,
-      const Eigen::GpuDevice& device, int64 update_version = -1) {
-    kv_->BatchLookupOrCreateKeys(key, n, item_idxs, device);
-  }
-
-  void LookupOrCreate(const K* key, V* val, V* default_v,
-      int32 default_v_num, bool is_use_default_value_tensor,
-      size_t n, const Eigen::GpuDevice& device) {
-    kv_->BatchLookupOrCreate(key, val, default_v, default_v_num,
-        is_use_default_value_tensor, n, device);
-  }
-
-  void GetSnapshot(K* keys, V* values, const Eigen::GpuDevice& device) {
-    kv_->GetSnapshot(keys, values, device);
-  }
-
-  int64 Size() const {
-    return kv_->Size();
-  }
-
-  int64 ValueLen() const {
-    return value_len_;
-  }
-
-  std::string DebugString() const {
-    return emb_config_.DebugString();
-  }
-
-  embedding::GPUHashMapKV<K, V>* kv() {
-    return kv_;
-  }
-
-  int64 MinFreq() {
-    return emb_config_.filter_freq;
-  }
-
-  float GetL2WeightThreshold() {
-    return emb_config_.l2_weight_threshold;
-  }
-
-  int32 SlotNum() {
-    return (emb_config_.block_num * (1 + emb_config_.slot_num));
-  }
-
-  int32 EmbIdx() {
-    return emb_config_.emb_index;
-  }
-
-  V* DefaultValuePtr() {
-    return default_value_;
-  }
-
-  void SetSlotNum(int64 slot_num) {
-    emb_config_.slot_num = slot_num;
-  }
-
-  int64 GetSlotNum() {
-    return emb_config_.slot_num;
-  }
-
-  V* GetDefaultValuePtr() {
-    return default_value_;
-  }
-
-  int64 GetDefaultValueDim() {
-    return emb_config_.default_value_dim;
-  }
-
-  Status Import(RestoreBuffer& restore_buff, int64 key_num,
-                int bucket_num, int64 partition_id, int64 partition_num,
-                bool is_filter, const Eigen::GpuDevice& device) {
-    return kv_->Import(restore_buff, key_num, bucket_num,
-        partition_id, partition_num, is_filter, device);
-  }
-
- private:
-  bool DisableGPUEVAllocatorFromEnvironment() {
-    bool disable_gpu_ev_allocator = false;
-    ReadBoolFromEnvVar("TF_DISABLE_GPU_EV_ALLOCATOR", true,
-        &disable_gpu_ev_allocator);
-    return disable_gpu_ev_allocator;
-  }
-
- private:
-  ~EmbeddingVarGPU() override {
-    if (emb_config_.is_primary() && emb_config_.primary_emb_index == 0) {
-      delete kv_;
-    }
-    TypedAllocator::Deallocate(alloc_, default_value_, value_len_);
-  }
-  TF_DISALLOW_COPY_AND_ASSIGN(EmbeddingVarGPU);
-
- private:
-  bool is_initialized_ = false;
-  std::string name_;
-  embedding::GPUHashMapKV<K, V>* kv_ = nullptr;
-  Allocator* alloc_ = nullptr;
-  EmbeddingConfig emb_config_;
-  V* default_value_ = nullptr;
-  int64 value_len_;
-};
-
-} // namespace tensorflow
-
-#endif // GOOGLE_CUDA
-
 #endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_H_

tensorflow/core/framework/embedding/gpu_hash_map_kv.h

Lines changed: 1 addition & 1 deletion

@@ -225,7 +225,7 @@ class GPUHashMapKV : public KVInterface<K, V> {
     return nullptr;
   }
 
-  GPUHashTable<K, V>* HashTable() {
+  GPUHashTable<K, V>* HashTable() override {
    return hash_table_;
  }
 

tensorflow/core/framework/embedding/kv_interface.h

Lines changed: 9 additions & 0 deletions

@@ -23,6 +23,9 @@ namespace tensorflow {
 template <class V>
 class ValuePtr;
 
+template <class K, class V>
+class GPUHashTable;
+
 namespace embedding {
 class Iterator {
  public:
@@ -100,6 +103,12 @@ class KVInterface {
     return Status::OK();
   }
 
+  virtual GPUHashTable<K, V>* HashTable() {
+    return nullptr;
+  }
+
+  virtual void SetValueLen(int64 value_len) {}
+
 };
 
 } // namespace embedding
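
Note on the change above: declaring HashTable() and SetValueLen() as virtuals on the base KVInterface (paired with the override in gpu_hash_map_kv.h and the storage.h change below) means calls made through a base pointer can reach the GPU-backed implementation instead of a nullptr-returning stub, which is consistent with the HBM coredump this commit fixes. The following is a minimal standalone sketch of that failure mode and of the fix; the class and method names are illustrative, not DeepRec's actual types.

#include <cassert>
#include <cstdio>

struct GpuTable { int value_len = 0; };

// Before: the base-class accessor is non-virtual, so the derived version only
// shadows it and is invisible through a base pointer.
struct StorageBefore {
  GpuTable* Table() { return nullptr; }
};
struct HbmBefore : StorageBefore {
  GpuTable table;
  GpuTable* Table() { return &table; }  // never reached via StorageBefore*
};

// After: the base declares the accessor virtual and derived classes override it.
struct StorageAfter {
  virtual ~StorageAfter() = default;
  virtual GpuTable* Table() { return nullptr; }
};
struct HbmAfter : StorageAfter {
  GpuTable table;
  GpuTable* Table() override { return &table; }  // reached via StorageAfter*
};

int main() {
  HbmBefore b;
  StorageBefore* sb = &b;
  assert(sb->Table() == nullptr);   // a caller dereferencing this would crash

  HbmAfter a;
  StorageAfter* sa = &a;
  assert(sa->Table() == &a.table);  // virtual dispatch finds the real table
  std::printf("virtual dispatch reaches the derived table\n");
  return 0;
}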

tensorflow/core/framework/embedding/single_tier_storage.h

Lines changed: 7 additions & 3 deletions

@@ -358,19 +358,23 @@ class HbmStorage : public SingleTierStorage<K, V> {
     return true;
   }
 
+  void SetValueLen(int64 value_len) override {
+    SingleTierStorage<K, V>::kv_->SetValueLen(value_len);
+  }
+
   void BatchLookupOrCreate(const K* key, V* val, V* default_v,
       int32 default_v_num, bool is_use_default_value_tensor,
-      size_t n, const Eigen::GpuDevice& device) {
+      size_t n, const Eigen::GpuDevice& device) override {
     SingleTierStorage<K, V>::kv_->BatchLookupOrCreate(key, val, default_v, default_v_num,
         is_use_default_value_tensor, n, device);
   }
 
   void BatchLookupOrCreateKeys(const K* key, int32* item_idxs, size_t n,
-      const Eigen::GpuDevice& device) {
+      const Eigen::GpuDevice& device) override {
     SingleTierStorage<K, V>::kv_->BatchLookupOrCreateKeys(key, n, item_idxs, device);
   }
 
-  GPUHashTable<K, V>* HashTable() {
+  GPUHashTable<K, V>* HashTable() override {
     return SingleTierStorage<K, V>::kv_->HashTable();
   }
 
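
The override specifiers added above also buy a compile-time guarantee: if an HbmStorage method stops matching a virtual declared in the base class, the build fails instead of silently leaving a shadowing method behind. A small sketch with illustrative names (not the repo's real signatures):

#include <cstddef>

struct BaseStorage {
  virtual ~BaseStorage() = default;
  virtual void BatchLookupOrCreateKeys(const int* keys, int* idxs, size_t n) {}
};

struct HbmLikeStorage : BaseStorage {
  // With "override", a signature that drifts away from the base virtual is a
  // compile error rather than a shadowing method that is skipped at dispatch.
  void BatchLookupOrCreateKeys(const int* keys, int* idxs, size_t n) override {
    // ... delegate to a GPU-backed KV here ...
  }
  // void BatchLookupOrCreateKeys(const long* keys, int* idxs, size_t n) override {}
  //   ^ would not compile: no matching virtual in BaseStorage
};

int main() {
  HbmLikeStorage s;
  BaseStorage* base = &s;
  int keys[2] = {1, 2};
  int idxs[2] = {0, 0};
  base->BatchLookupOrCreateKeys(keys, idxs, 2);  // dispatches to HbmLikeStorage
  return 0;
}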

tensorflow/core/framework/embedding/ssd_hash_kv.h

Lines changed: 0 additions & 1 deletion

@@ -480,7 +480,6 @@ class SSDHashKV : public KVInterface<K, V> {
     if (iter.first == EMPTY_KEY) {
       return errors::NotFound("Unable to find Key: ", key, " in SSDHashKV.");
     } else {
-      ValuePtr<V>* val = new_value_ptr_fn_(total_dims_);
       return Status::OK();
     }
   }

tensorflow/core/framework/embedding/storage.h

Lines changed: 3 additions & 4 deletions

@@ -119,6 +119,9 @@ class Storage {
       size_t n, const Eigen::GpuDevice& device) {}
   virtual void BatchLookupOrCreateKeys(const K* key, int32* item_idxs, size_t n,
       const Eigen::GpuDevice& device) {}
+  virtual GPUHashTable<K, V>* HashTable() {
+    return nullptr;
+  }
 
   virtual void InitCache(embedding::CacheStrategy cache_strategy) = 0;
   virtual int64 CacheSize() const = 0;
@@ -159,10 +162,6 @@ class Storage {
         " storage capacity: ", storage_config_.size);
   }
 
-  GPUHashTable<K, V>* HashTable() {
-    return nullptr;
-  }
-
 protected:
   int64 alloc_len_ = 0;
   int64 total_dims_ = 0;

tensorflow/core/framework/embedding/storage_manager.h

Lines changed: 1 addition & 1 deletion

@@ -50,7 +50,7 @@ class StorageManager {
     storage_->SetAllocLen(value_len, slot_num);
   }
 
-  void SetValueLen(int64 value_len){
+  void SetValueLen(int64 value_len) {
     storage_->SetValueLen(value_len);
   }
 
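
Taken together with the HbmStorage::SetValueLen override above, this lets SetValueLen travel the whole delegation chain down to the GPU hash map: StorageManager forwards to its Storage, HbmStorage forwards to its KV, and the KV records the embedding value length. A compressed sketch of that chain, assuming the base Storage declares SetValueLen as virtual and that the GPU-backed KV implements it (neither declaration is shown in this diff); the class shapes are simplified, not DeepRec's actual declarations.

#include <cstdint>
#include <cstdio>

using int64 = std::int64_t;

// KVInterface-style base: SetValueLen is a virtual no-op by default
// (mirroring the kv_interface.h change); the GPU KV overrides it.
struct KV {
  virtual ~KV() = default;
  virtual void SetValueLen(int64 value_len) {}
};
struct GpuKV : KV {
  int64 value_len = 0;
  void SetValueLen(int64 v) override { value_len = v; }
};

// Storage-style base declares the virtual; the HBM tier forwards to its KV
// (mirroring the single_tier_storage.h change).
struct Storage {
  virtual ~Storage() = default;
  virtual void SetValueLen(int64 value_len) {}
};
struct HbmStorage : Storage {
  KV* kv_;
  explicit HbmStorage(KV* kv) : kv_(kv) {}
  void SetValueLen(int64 v) override { kv_->SetValueLen(v); }
};

// StorageManager::SetValueLen simply forwards (mirroring storage_manager.h).
struct StorageManager {
  Storage* storage_;
  explicit StorageManager(Storage* s) : storage_(s) {}
  void SetValueLen(int64 v) { storage_->SetValueLen(v); }
};

int main() {
  GpuKV kv;
  HbmStorage hbm(&kv);
  StorageManager mgr(&hbm);
  mgr.SetValueLen(64);  // reaches GpuKV through the virtual chain
  std::printf("kv.value_len = %lld\n", static_cast<long long>(kv.value_len));
  return 0;
}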

tensorflow/core/kernels/kv_variable_ops.cc

Lines changed: 2 additions & 8 deletions

@@ -250,6 +250,7 @@ class InitializeKvVariableOp : public OpKernel {
         handle_self](EmbeddingVar<TKey, TValue>** ptr) {
       Allocator* gpu_allocator =
           context->device()->GetAllocator(AllocatorAttributes());
+          //context->get_allocator(AllocatorAttributes());
       auto embedding_config = EmbeddingConfig(
           emb_index_ + block_num_ * slot_index_,
           emb_index_, block_num_, slot_num_,
@@ -284,6 +285,7 @@ class InitializeKvVariableOp : public OpKernel {
         handle_primary, context](EmbeddingVar<TKey, TValue>** ptr) {
       int64 primary_slot_index(0), primary_emb_index(0);
       Allocator* gpu_allocator = context->device()->GetAllocator(AllocatorAttributes());
+      //Allocator* gpu_allocator = context->get_allocator(AllocatorAttributes());
       auto embedding_config = EmbeddingConfig(
           primary_emb_index + block_num_ * primary_slot_index,
           primary_emb_index,
@@ -380,10 +382,6 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS_ALL_INDEX)
                               .TypeConstraint<ktype>("Tkeys") \
                               .TypeConstraint<vtype>("dtype"), \
                           InitializeKvVariableOp<ktype, vtype>);
-                          //.HostMemory("resource_self") \
-                          //.HostMemory("resource_primary") \
-                          //.HostMemory("value") \
-                          //.HostMemory("empty_key") \
 
 #define REGISTER_GPU_KERNELS(type) \
   REGISTER_KERNELS(int32, type); \
@@ -444,7 +442,6 @@ class KvResourceInitCacheStrategyOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    Tensor* output;
     EmbeddingVar<TKey, TValue>* ev = nullptr;
     OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &ev));
     core::ScopedUnref unref_me(ev);
@@ -1043,12 +1040,9 @@ class KvResourceGatherGPUOp : public OpKernel {
 #define REGISTER_KERNELS(dev, ktype, vtype) \
   REGISTER_KERNEL_BUILDER(Name("KvResourceGather") \
                               .Device(DEVICE_##dev) \
-                              .HostMemory("resource") \
                               .TypeConstraint<vtype>("dtype") \
                               .TypeConstraint<ktype>("Tkeys"), \
                           KvResourceGatherGPUOp<GPUDevice, ktype, vtype>)
-                          //.HostMemory("indices") \
-                          //.HostMemory("default_value") \
 
 #define REGISTER_KERNELS_ALL(dev, type) \
   REGISTER_KERNELS(dev, int32, type); \

tensorflow/core/kernels/kv_variable_ops.h

Lines changed: 0 additions & 3 deletions

@@ -557,7 +557,6 @@ Status DynamicRestoreValue(EmbeddingVar<K, V>* ev, BundleReader* reader,
     int64 partition_id = 0, int64 partition_num = 1, bool reset_version = false) {
   string curr_partid_str = std::to_string(partition_id);
   bool filter_flag = true;
-  bool restore_filter_flag = true;
   for (int i = 0; i < orig_partnum; i++) {
     string part_id = std::to_string(i);
     string pre_subname =
@@ -1221,8 +1220,6 @@ Status EVRestoreDynamically(EmbeddingVar<K, V>* ev,
     int64 freq_filter_part_offset = subpart_filter_offset * sizeof(int64);
     int64 tot_key_filter_num =
         part_filter_offset_flat(subpart_id + 1) - subpart_filter_offset;
-    int64 tot_key_filter_bytes_read(0), tot_version_filter_bytes_read(0),
-        tot_freq_filter_bytes_read(0);
     size_t key_filter_bytes_read = 0;
     size_t version_filter_bytes_read = 0;
     size_t freq_filter_bytes_read = 0;
