
Commit db2b757

[Embedding] Fix coredump in HBM storage. (#642)
1 parent 8e3c6be commit db2b757

11 files changed: +23 −185 lines

tensorflow/core/framework/embedding/bloom_filter_policy.h

Lines changed: 0 additions & 1 deletion

@@ -179,7 +179,6 @@ class BloomFilterPolicy : public FilterPolicy<K, V, EV> {
       hash_val.emplace_back(
           FastHash64(key, seeds_[i]) % config_.num_counter);
     }
-    int64 min_freq;
     switch (config_.counter_type){
       case DT_UINT64:
         SetMinFreq<uint64>(hash_val, freq);

tensorflow/core/framework/embedding/embedding_var.h

Lines changed: 0 additions & 162 deletions

@@ -730,166 +730,4 @@ class EmbeddingVar : public ResourceBase {
 
 } // namespace tensorflow
 
-#if GOOGLE_CUDA
-namespace tensorflow {
-
-template <class K, class V>
-class EmbeddingVarGPU : public ResourceBase {
- public:
-  EmbeddingVarGPU(const string& name,
-                  embedding::GPUHashMapKV<K, V>* kv,
-                  Allocator* alloc,
-                  const EmbeddingConfig& emb_cfg = EmbeddingConfig()):
-      name_(name),
-      kv_(kv),
-      default_value_(nullptr),
-      value_len_(0),
-      emb_config_(emb_cfg) {
-    alloc_ =
-        DisableGPUEVAllocatorFromEnvironment() ? alloc : gpu_ev_allocator();
-  }
-
-  Status Init() {
-    if (kv_ == nullptr) {
-      return errors::InvalidArgument("Error to construct EmbeddingVarGPU");
-    } else {
-      return Status::OK();
-    }
-  }
-
-  Status Init(const Tensor& default_tensor,
-              int64 default_value_dim=1) {
-    if (DataTypeToEnum<V>::v() != default_tensor.dtype()) {
-      return errors::InvalidArgument(
-          "EV's default_tensor DTYPE must be same as EmbeddingVar Value Type");
-    } else if (kv_ == nullptr) {
-      return errors::InvalidArgument("Error to construct EmbeddingVarGPU");
-    } else {
-      emb_config_.default_value_dim = default_value_dim;
-      value_len_ =
-          default_tensor.NumElements() / emb_config_.default_value_dim;
-      kv_->SetValueLen(value_len_);
-      default_value_ = TypedAllocator::Allocate<V>(
-          alloc_, default_tensor.NumElements(), AllocationAttributes());
-      auto default_tensor_flat = default_tensor.flat<V>();
-      cudaMemcpy(default_value_, &default_tensor_flat(0),
-          default_tensor.TotalBytes(), cudaMemcpyDeviceToDevice);
-      return Status::OK();
-    }
-  }
-
-  void SetInitialized() {
-    is_initialized_ = true;
-  }
-
-  bool IsInitialized() const {
-    return is_initialized_;
-  }
-
-  void LookupOrCreateKey(const K* key, int32* item_idxs, size_t n,
-      const Eigen::GpuDevice& device, int64 update_version = -1) {
-    kv_->BatchLookupOrCreateKeys(key, n, item_idxs, device);
-  }
-
-  void LookupOrCreate(const K* key, V* val, V* default_v,
-      int32 default_v_num, bool is_use_default_value_tensor,
-      size_t n, const Eigen::GpuDevice& device) {
-    kv_->BatchLookupOrCreate(key, val, default_v, default_v_num,
-        is_use_default_value_tensor, n, device);
-  }
-
-  void GetSnapshot(K* keys, V* values, const Eigen::GpuDevice& device) {
-    kv_->GetSnapshot(keys, values, device);
-  }
-
-  int64 Size() const {
-    return kv_->Size();
-  }
-
-  int64 ValueLen() const {
-    return value_len_;
-  }
-
-  std::string DebugString() const {
-    return emb_config_.DebugString();
-  }
-
-  embedding::GPUHashMapKV<K, V>* kv() {
-    return kv_;
-  }
-
-  int64 MinFreq() {
-    return emb_config_.filter_freq;
-  }
-
-  float GetL2WeightThreshold() {
-    return emb_config_.l2_weight_threshold;
-  }
-
-  int32 SlotNum() {
-    return (emb_config_.block_num * (1 + emb_config_.slot_num));
-  }
-
-  int32 EmbIdx() {
-    return emb_config_.emb_index;
-  }
-
-  V* DefaultValuePtr() {
-    return default_value_;
-  }
-
-  void SetSlotNum(int64 slot_num) {
-    emb_config_.slot_num = slot_num;
-  }
-
-  int64 GetSlotNum() {
-    return emb_config_.slot_num;
-  }
-
-  V* GetDefaultValuePtr() {
-    return default_value_;
-  }
-
-  int64 GetDefaultValueDim() {
-    return emb_config_.default_value_dim;
-  }
-
-  Status Import(RestoreBuffer& restore_buff, int64 key_num,
-                int bucket_num, int64 partition_id, int64 partition_num,
-                bool is_filter, const Eigen::GpuDevice& device) {
-    return kv_->Import(restore_buff, key_num, bucket_num,
-        partition_id, partition_num, is_filter, device);
-  }
-
- private:
-  bool DisableGPUEVAllocatorFromEnvironment() {
-    bool disable_gpu_ev_allocator = false;
-    ReadBoolFromEnvVar("TF_DISABLE_GPU_EV_ALLOCATOR", true,
-        &disable_gpu_ev_allocator);
-    return disable_gpu_ev_allocator;
-  }
-
- private:
-  ~EmbeddingVarGPU() override {
-    if (emb_config_.is_primary() && emb_config_.primary_emb_index == 0) {
-      delete kv_;
-    }
-    TypedAllocator::Deallocate(alloc_, default_value_, value_len_);
-  }
-  TF_DISALLOW_COPY_AND_ASSIGN(EmbeddingVarGPU);
-
- private:
-  bool is_initialized_ = false;
-  std::string name_;
-  embedding::GPUHashMapKV<K, V>* kv_ = nullptr;
-  Allocator* alloc_ = nullptr;
-  EmbeddingConfig emb_config_;
-  V* default_value_ = nullptr;
-  int64 value_len_;
-};
-
-} // namespace tensorflow
-
-#endif // GOOGLE_CUDA
-
 #endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_H_

tensorflow/core/framework/embedding/gpu_hash_map_kv.h

Lines changed: 1 addition & 1 deletion

@@ -225,7 +225,7 @@ class GPUHashMapKV : public KVInterface<K, V> {
     return nullptr;
   }
 
-  GPUHashTable<K, V>* HashTable() {
+  GPUHashTable<K, V>* HashTable() override {
    return hash_table_;
  }
 

tensorflow/core/framework/embedding/kv_interface.h

Lines changed: 9 additions & 0 deletions

@@ -23,6 +23,9 @@ namespace tensorflow {
 template <class V>
 class ValuePtr;
 
+template <class K, class V>
+class GPUHashTable;
+
 namespace embedding {
 class Iterator {
  public:
@@ -100,6 +103,12 @@ class KVInterface {
     return Status::OK();
   }
 
+  virtual GPUHashTable<K, V>* HashTable() {
+    return nullptr;
+  }
+
+  virtual void SetValueLen(int64 value_len) {}
+
 };
 
 } // namespace embedding
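
Note on the change above: declaring HashTable() and SetValueLen() as virtuals on the base KVInterface (paired with the override in gpu_hash_map_kv.h and the storage.h change below) means calls made through a base pointer can reach the GPU-backed implementation instead of a nullptr-returning stub, which is consistent with the HBM coredump this commit fixes. The following is a minimal standalone sketch of that failure mode and of the fix; the class and method names are illustrative, not DeepRec's actual types.

#include <cassert>
#include <cstdio>

struct GpuTable { int value_len = 0; };

// Before: the base-class accessor is non-virtual, so the derived version only
// shadows it and is invisible through a base pointer.
struct StorageBefore {
  GpuTable* Table() { return nullptr; }
};
struct HbmBefore : StorageBefore {
  GpuTable table;
  GpuTable* Table() { return &table; }  // never reached via StorageBefore*
};

// After: the base declares the accessor virtual and derived classes override it.
struct StorageAfter {
  virtual ~StorageAfter() = default;
  virtual GpuTable* Table() { return nullptr; }
};
struct HbmAfter : StorageAfter {
  GpuTable table;
  GpuTable* Table() override { return &table; }  // reached via StorageAfter*
};

int main() {
  HbmBefore b;
  StorageBefore* sb = &b;
  assert(sb->Table() == nullptr);   // a caller dereferencing this would crash

  HbmAfter a;
  StorageAfter* sa = &a;
  assert(sa->Table() == &a.table);  // virtual dispatch finds the real table
  std::printf("virtual dispatch reaches the derived table\n");
  return 0;
}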

tensorflow/core/framework/embedding/single_tier_storage.h

Lines changed: 7 additions & 3 deletions

@@ -358,19 +358,23 @@ class HbmStorage : public SingleTierStorage<K, V> {
     return true;
   }
 
+  void SetValueLen(int64 value_len) override {
+    SingleTierStorage<K, V>::kv_->SetValueLen(value_len);
+  }
+
   void BatchLookupOrCreate(const K* key, V* val, V* default_v,
       int32 default_v_num, bool is_use_default_value_tensor,
-      size_t n, const Eigen::GpuDevice& device) {
+      size_t n, const Eigen::GpuDevice& device) override {
     SingleTierStorage<K, V>::kv_->BatchLookupOrCreate(key, val, default_v, default_v_num,
         is_use_default_value_tensor, n, device);
   }
 
   void BatchLookupOrCreateKeys(const K* key, int32* item_idxs, size_t n,
-      const Eigen::GpuDevice& device) {
+      const Eigen::GpuDevice& device) override {
     SingleTierStorage<K, V>::kv_->BatchLookupOrCreateKeys(key, n, item_idxs, device);
   }
 
-  GPUHashTable<K, V>* HashTable() {
+  GPUHashTable<K, V>* HashTable() override {
     return SingleTierStorage<K, V>::kv_->HashTable();
   }
 
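
The override specifiers added above also buy a compile-time guarantee: if an HbmStorage method stops matching a virtual declared in the base class, the build fails instead of silently leaving a shadowing method behind. A small sketch with illustrative names (not the repo's real signatures):

#include <cstddef>

struct BaseStorage {
  virtual ~BaseStorage() = default;
  virtual void BatchLookupOrCreateKeys(const int* keys, int* idxs, size_t n) {}
};

struct HbmLikeStorage : BaseStorage {
  // With "override", a signature that drifts away from the base virtual is a
  // compile error rather than a shadowing method that is skipped at dispatch.
  void BatchLookupOrCreateKeys(const int* keys, int* idxs, size_t n) override {
    // ... delegate to a GPU-backed KV here ...
  }
  // void BatchLookupOrCreateKeys(const long* keys, int* idxs, size_t n) override {}
  //   ^ would not compile: no matching virtual in BaseStorage
};

int main() {
  HbmLikeStorage s;
  BaseStorage* base = &s;
  int keys[2] = {1, 2};
  int idxs[2] = {0, 0};
  base->BatchLookupOrCreateKeys(keys, idxs, 2);  // dispatches to HbmLikeStorage
  return 0;
}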

tensorflow/core/framework/embedding/ssd_hash_kv.h

Lines changed: 0 additions & 1 deletion

@@ -480,7 +480,6 @@ class SSDHashKV : public KVInterface<K, V> {
     if (iter.first == EMPTY_KEY) {
       return errors::NotFound("Unable to find Key: ", key, " in SSDHashKV.");
     } else {
-      ValuePtr<V>* val = new_value_ptr_fn_(total_dims_);
       return Status::OK();
     }
   }

tensorflow/core/framework/embedding/storage.h

Lines changed: 3 additions & 4 deletions

@@ -119,6 +119,9 @@ class Storage {
       size_t n, const Eigen::GpuDevice& device) {}
   virtual void BatchLookupOrCreateKeys(const K* key, int32* item_idxs, size_t n,
       const Eigen::GpuDevice& device) {}
+  virtual GPUHashTable<K, V>* HashTable() {
+    return nullptr;
+  }
 
   virtual void InitCache(embedding::CacheStrategy cache_strategy) = 0;
   virtual int64 CacheSize() const = 0;
@@ -159,10 +162,6 @@ class Storage {
         " storage capacity: ", storage_config_.size);
   }
 
-  GPUHashTable<K, V>* HashTable() {
-    return nullptr;
-  }
-
 protected:
   int64 alloc_len_ = 0;
   int64 total_dims_ = 0;

tensorflow/core/framework/embedding/storage_manager.h

Lines changed: 1 addition & 1 deletion

@@ -50,7 +50,7 @@ class StorageManager {
     storage_->SetAllocLen(value_len, slot_num);
   }
 
-  void SetValueLen(int64 value_len){
+  void SetValueLen(int64 value_len) {
     storage_->SetValueLen(value_len);
   }
 
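
Taken together with the HbmStorage::SetValueLen override above, this lets SetValueLen travel the whole delegation chain down to the GPU hash map: StorageManager forwards to its Storage, HbmStorage forwards to its KV, and the KV records the embedding value length. A compressed sketch of that chain, assuming the base Storage declares SetValueLen as virtual and that the GPU-backed KV implements it (neither declaration is shown in this diff); the class shapes are simplified, not DeepRec's actual declarations.

#include <cstdint>
#include <cstdio>

using int64 = std::int64_t;

// KVInterface-style base: SetValueLen is a virtual no-op by default
// (mirroring the kv_interface.h change); the GPU KV overrides it.
struct KV {
  virtual ~KV() = default;
  virtual void SetValueLen(int64 value_len) {}
};
struct GpuKV : KV {
  int64 value_len = 0;
  void SetValueLen(int64 v) override { value_len = v; }
};

// Storage-style base declares the virtual; the HBM tier forwards to its KV
// (mirroring the single_tier_storage.h change).
struct Storage {
  virtual ~Storage() = default;
  virtual void SetValueLen(int64 value_len) {}
};
struct HbmStorage : Storage {
  KV* kv_;
  explicit HbmStorage(KV* kv) : kv_(kv) {}
  void SetValueLen(int64 v) override { kv_->SetValueLen(v); }
};

// StorageManager::SetValueLen simply forwards (mirroring storage_manager.h).
struct StorageManager {
  Storage* storage_;
  explicit StorageManager(Storage* s) : storage_(s) {}
  void SetValueLen(int64 v) { storage_->SetValueLen(v); }
};

int main() {
  GpuKV kv;
  HbmStorage hbm(&kv);
  StorageManager mgr(&hbm);
  mgr.SetValueLen(64);  // reaches GpuKV through the virtual chain
  std::printf("kv.value_len = %lld\n", static_cast<long long>(kv.value_len));
  return 0;
}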

tensorflow/core/kernels/kv_variable_ops.cc

Lines changed: 2 additions & 8 deletions

@@ -250,6 +250,7 @@ class InitializeKvVariableOp : public OpKernel {
         handle_self](EmbeddingVar<TKey, TValue>** ptr) {
       Allocator* gpu_allocator =
           context->device()->GetAllocator(AllocatorAttributes());
+          //context->get_allocator(AllocatorAttributes());
       auto embedding_config = EmbeddingConfig(
           emb_index_ + block_num_ * slot_index_,
           emb_index_, block_num_, slot_num_,
@@ -284,6 +285,7 @@ class InitializeKvVariableOp : public OpKernel {
         handle_primary, context](EmbeddingVar<TKey, TValue>** ptr) {
       int64 primary_slot_index(0), primary_emb_index(0);
       Allocator* gpu_allocator = context->device()->GetAllocator(AllocatorAttributes());
+      //Allocator* gpu_allocator = context->get_allocator(AllocatorAttributes());
       auto embedding_config = EmbeddingConfig(
           primary_emb_index + block_num_ * primary_slot_index,
           primary_emb_index,
@@ -380,10 +382,6 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS_ALL_INDEX)
                               .TypeConstraint<ktype>("Tkeys") \
                               .TypeConstraint<vtype>("dtype"), \
                           InitializeKvVariableOp<ktype, vtype>);
-                          //.HostMemory("resource_self") \
-                          //.HostMemory("resource_primary") \
-                          //.HostMemory("value") \
-                          //.HostMemory("empty_key") \
 
 #define REGISTER_GPU_KERNELS(type) \
   REGISTER_KERNELS(int32, type); \
@@ -444,7 +442,6 @@ class KvResourceInitCacheStrategyOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    Tensor* output;
     EmbeddingVar<TKey, TValue>* ev = nullptr;
     OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &ev));
     core::ScopedUnref unref_me(ev);
@@ -1043,12 +1040,9 @@ class KvResourceGatherGPUOp : public OpKernel {
 #define REGISTER_KERNELS(dev, ktype, vtype) \
   REGISTER_KERNEL_BUILDER(Name("KvResourceGather") \
                               .Device(DEVICE_##dev) \
-                              .HostMemory("resource") \
                               .TypeConstraint<vtype>("dtype") \
                               .TypeConstraint<ktype>("Tkeys"), \
                           KvResourceGatherGPUOp<GPUDevice, ktype, vtype>)
-                          //.HostMemory("indices") \
-                          //.HostMemory("default_value") \
 
 #define REGISTER_KERNELS_ALL(dev, type) \
   REGISTER_KERNELS(dev, int32, type); \

tensorflow/core/kernels/kv_variable_ops.h

Lines changed: 0 additions & 3 deletions

@@ -557,7 +557,6 @@ Status DynamicRestoreValue(EmbeddingVar<K, V>* ev, BundleReader* reader,
     int64 partition_id = 0, int64 partition_num = 1, bool reset_version = false) {
   string curr_partid_str = std::to_string(partition_id);
   bool filter_flag = true;
-  bool restore_filter_flag = true;
   for (int i = 0; i < orig_partnum; i++) {
     string part_id = std::to_string(i);
     string pre_subname =
@@ -1221,8 +1220,6 @@ Status EVRestoreDynamically(EmbeddingVar<K, V>* ev,
     int64 freq_filter_part_offset = subpart_filter_offset * sizeof(int64);
     int64 tot_key_filter_num =
         part_filter_offset_flat(subpart_id + 1) - subpart_filter_offset;
-    int64 tot_key_filter_bytes_read(0), tot_version_filter_bytes_read(0),
-        tot_freq_filter_bytes_read(0);
     size_t key_filter_bytes_read = 0;
     size_t version_filter_bytes_read = 0;
     size_t freq_filter_bytes_read = 0;
