From cbfad89d1d1314cf5bd3a5ed5f074e3886d2ea42 Mon Sep 17 00:00:00 2001 From: Lifann Date: Fri, 16 Jun 2023 12:12:32 +0800 Subject: [PATCH] Hkv code draft --- .../dynamic_embedding/core/BUILD | 2 + .../core/hkv_hashtable_ops.cc | 311 ++ .../core/kernels/hkv_hashtable_op_gpu.cu.cc | 882 ++++++ .../kernels/lookup_impl/lookup_table_op_hkv.h | 360 +++ .../lookup_table_op_hkv_impl.cu.cc | 37 + .../core/lib/merlin_inc/BUILD | 42 + .../lib/merlin_inc/merlin/array_kernels.cuh | 345 +++ .../lib/merlin_inc/merlin/core_kernels.cuh | 2510 +++++++++++++++++ .../core/lib/merlin_inc/merlin/debug.hpp | 78 + .../lib/merlin_inc/merlin/flexible_buffer.cuh | 60 + .../core/lib/merlin_inc/merlin/group_lock.hpp | 229 ++ .../lib/merlin_inc/merlin/initializers.cuh | 147 + .../lib/merlin_inc/merlin/memory_pool.cuh | 619 ++++ .../core/lib/merlin_inc/merlin/optimizers.cuh | 77 + .../core/lib/merlin_inc/merlin/types.cuh | 217 ++ .../core/lib/merlin_inc/merlin/utils.cuh | 368 +++ .../core/lib/merlin_inc/merlin_hashtable.cuh | 1643 +++++++++++ .../core/lib/merlin_inc/merlin_localfile.hpp | 162 ++ .../dynamic_embedding/core/lib/utils/BUILD | 12 + .../core/lib/utils/cuda_utils.cuh | 134 + .../core/ops/hkv_hashtable_ops.cc | 327 +++ .../python/ops/hkv_hashtable_ops.py | 456 +++ 22 files changed, 9018 insertions(+) create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/hkv_hashtable_ops.cc create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/kernels/hkv_hashtable_op_gpu.cu.cc create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv.h create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv_impl.cu.cc create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/BUILD create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/array_kernels.cuh create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/core_kernels.cuh create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/debug.hpp create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/flexible_buffer.cuh create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/group_lock.hpp create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/initializers.cuh create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/memory_pool.cuh create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/optimizers.cuh create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/types.cuh create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/utils.cuh create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin_hashtable.cuh create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin_localfile.hpp create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/lib/utils/BUILD create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/lib/utils/cuda_utils.cuh create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/ops/hkv_hashtable_ops.cc create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/python/ops/hkv_hashtable_ops.py diff --git 
a/tensorflow_recommenders_addons/dynamic_embedding/core/BUILD b/tensorflow_recommenders_addons/dynamic_embedding/core/BUILD index 076e2791b..da34241a0 100644 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/BUILD +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/BUILD @@ -97,3 +97,5 @@ custom_op_library( "kernels/sparse_reshape_op.cu.cc", ], ) + +# TODO: Add hkv targets. diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/hkv_hashtable_ops.cc b/tensorflow_recommenders_addons/dynamic_embedding/core/hkv_hashtable_ops.cc new file mode 100644 index 000000000..7af17be5c --- /dev/null +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/hkv_hashtable_ops.cc @@ -0,0 +1,311 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_def_builder.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow_recommenders_addons/dynamic_embedding/core/utils/utils.h" + +namespace tensorflow { + +using shape_inference::DimensionHandle; +using shape_inference::InferenceContext; +using shape_inference::ShapeAndType; +using shape_inference::ShapeHandle; + +namespace { + +Status ScalarAndTwoElementVectorInputsAndScalarOutputs(InferenceContext* c) { + ShapeHandle handle; + DimensionHandle unused_handle; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); + for (int i = 1; i < c->num_inputs(); ++i) { + TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 1, &handle)); + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_handle)); + } + for (int i = 0; i < c->num_outputs(); ++i) { + c->set_output(i, c->Scalar()); + } + return Status::OK(); +} + +} // namespace + +Status ValidateTableResourceHandle(InferenceContext* c, ShapeHandle keys, + const string& key_dtype_attr, + const string& value_dtype_attr, + bool is_lookup, + ShapeAndType* output_shape_and_type) { + auto* handle_data = c->input_handle_shapes_and_types(0); + if (handle_data == nullptr || handle_data->size() != 2) { + output_shape_and_type->shape = c->UnknownShape(); + output_shape_and_type->dtype = DT_INVALID; + } else { + const ShapeAndType& key_shape_and_type = (*handle_data)[0]; + const ShapeAndType& value_shape_and_type = (*handle_data)[1]; + DataType key_dtype; + TF_RETURN_IF_ERROR(c->GetAttr(key_dtype_attr, &key_dtype)); + if (key_shape_and_type.dtype != key_dtype) { + return errors::InvalidArgument( + "Trying to read value with wrong dtype. " + "Expected ", + DataTypeString(key_shape_and_type.dtype), " got ", + DataTypeString(key_dtype)); + } + DataType value_dtype; + TF_RETURN_IF_ERROR(c->GetAttr(value_dtype_attr, &value_dtype)); + if (value_shape_and_type.dtype != value_dtype) { + return errors::InvalidArgument( + "Trying to read value with wrong dtype. 
" + "Expected ", + DataTypeString(value_shape_and_type.dtype), " got ", + DataTypeString(value_dtype)); + } + output_shape_and_type->dtype = value_shape_and_type.dtype; + + if (is_lookup) { + if (c->RankKnown(key_shape_and_type.shape) && c->RankKnown(keys)) { + int keys_rank = c->Rank(keys); + int key_suffix_rank = c->Rank(key_shape_and_type.shape); + if (keys_rank < key_suffix_rank) { + return errors::InvalidArgument( + "Expected keys to have suffix ", + c->DebugString(key_shape_and_type.shape), + " but saw shape: ", c->DebugString(keys)); + } + for (int d = 0; d < key_suffix_rank; d++) { + // Ensure the suffix of keys match what's in the Table. + DimensionHandle dim = c->Dim(key_shape_and_type.shape, d); + TF_RETURN_IF_ERROR( + c->ReplaceDim(keys, keys_rank - key_suffix_rank + d, dim, &keys)); + } + std::vector keys_prefix_vec; + keys_prefix_vec.reserve(keys_rank - key_suffix_rank); + for (int d = 0; d < keys_rank - key_suffix_rank; ++d) { + keys_prefix_vec.push_back(c->Dim(keys, d)); + } + ShapeHandle keys_prefix = c->MakeShape(keys_prefix_vec); + TF_RETURN_IF_ERROR(c->Concatenate(keys_prefix, + value_shape_and_type.shape, + &output_shape_and_type->shape)); + } else { + output_shape_and_type->shape = c->UnknownShape(); + } + } else { + TF_RETURN_IF_ERROR(c->Concatenate(keys, value_shape_and_type.shape, + &output_shape_and_type->shape)); + } + } + return Status::OK(); +} + +Status HkvHashTableShape(InferenceContext* c, const ShapeHandle& key, + const ShapeHandle& value) { + c->set_output(0, c->Scalar()); + + ShapeHandle key_s; + TF_RETURN_IF_ERROR(c->WithRankAtMost(key, 1, &key_s)); + + DataType key_t; + TF_RETURN_IF_ERROR(c->GetAttr("key_dtype", &key_t)); + + DataType value_t; + TF_RETURN_IF_ERROR(c->GetAttr("value_dtype", &value_t)); + + c->set_output_handle_shapes_and_types( + 0, std::vector{{key_s, key_t}, {value, value_t}}); + + return Status::OK(); +} + +REGISTER_OP("TfraHkvHashTableFind") + .Input("table_handle: resource") + .Input("keys: Tin") + .Input("default_value: Tout") + .Output("values: Tout") + .Attr("Tin: type") + .Attr("Tout: type") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle handle; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); + + ShapeAndType value_shape_and_type; + TF_RETURN_IF_ERROR(ValidateTableResourceHandle( + c, + /*keys=*/c->input(1), + /*key_dtype_attr=*/"Tin", + /*value_dtype_attr=*/"Tout", + /*is_lookup=*/true, &value_shape_and_type)); + c->set_output(0, value_shape_and_type.shape); + + return Status::OK(); + }); + +REGISTER_OP("TfraHkvHashTableFindWithExists") + .Input("table_handle: resource") + .Input("keys: Tin") + .Input("default_value: Tout") + .Output("values: Tout") + .Output("exists: bool") + .Attr("Tin: type") + .Attr("Tout: type") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle handle; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); + + ShapeHandle keys = c->UnknownShapeOfRank(1); + ShapeAndType value_shape_and_type; + TF_RETURN_IF_ERROR(ValidateTableResourceHandle( + c, + /*keys=*/c->input(1), + /*key_dtype_attr=*/"Tin", + /*value_dtype_attr=*/"Tout", + /*is_lookup=*/true, &value_shape_and_type)); + c->set_output(0, value_shape_and_type.shape); + c->set_output(1, keys); + + return Status::OK(); + }); + +REGISTER_OP("TfraHkvHashTableInsert") + .Input("table_handle: resource") + .Input("keys: Tin") + .Input("values: Tout") + .Attr("Tin: type") + .Attr("Tout: type") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle handle; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); + + // TODO: Validate 
keys and values shape. + return Status::OK(); + }); + +REGISTER_OP("TfraHkvHashTableAccum") + .Input("table_handle: resource") + .Input("keys: key_dtype") + .Input("values_or_deltas: value_dtype") + .Input("exists: bool") + .Attr("key_dtype: type") + .Attr("value_dtype: type") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle handle; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); + + // TODO: Validate keys and values shape. + return Status::OK(); + }); + +REGISTER_OP("TfraHkvHashTableRemove") + .Input("table_handle: resource") + .Input("keys: Tin") + .Attr("Tin: type") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle handle; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); + TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(1), 1, &handle)); + + // TODO(turboale): Validate keys shape. + return Status::OK(); + }); + +REGISTER_OP("TfraHkvHashTableClear") + .Input("table_handle: resource") + .Attr("key_dtype: type") + .Attr("value_dtype: type"); + +REGISTER_OP("TfraHkvHashTableSize") + .Input("table_handle: resource") + .Output("size: int64") + .SetShapeFn(ScalarAndTwoElementVectorInputsAndScalarOutputs); + +REGISTER_OP("TfraHkvHashTableExport") + .Input("table_handle: resource") + .Output("keys: Tkeys") + .Output("values: Tvalues") + .Attr("Tkeys: type") + .Attr("Tvalues: type") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle handle; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); + ShapeHandle keys = c->UnknownShapeOfRank(1); + ShapeAndType value_shape_and_type; + TF_RETURN_IF_ERROR(ValidateTableResourceHandle( + c, + /*keys=*/keys, + /*key_dtype_attr=*/"Tkeys", + /*value_dtype_attr=*/"Tvalues", + /*is_lookup=*/false, &value_shape_and_type)); + c->set_output(0, keys); + c->set_output(1, value_shape_and_type.shape); + return Status::OK(); + }); + +REGISTER_OP("TfraHkvHashTableSaveToFileSystem") + .Input("table_handle: resource") + .Input("dirpath: string") + .Input("file_name: string") + .Attr("key_dtype: type") + .Attr("value_dtype: type") + .Attr("dirpath_env: string") + .Attr("append_to_file: bool") + .Attr("buffer_size: int >= 1"); + +REGISTER_OP("TfraHkvHashTableImport") + .Input("table_handle: resource") + .Input("keys: Tin") + .Input("values: Tout") + .Attr("Tin: type") + .Attr("Tout: type") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle handle; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); + + ShapeHandle keys; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &keys)); + TF_RETURN_IF_ERROR(c->Merge(keys, c->input(2), &keys)); + return Status::OK(); + }); + +REGISTER_OP("TfraHkvHashTableLoadFromFileSystem") + .Input("table_handle: resource") + .Input("dirpath: string") + .Input("file_name: string") + .Attr("key_dtype: type") + .Attr("value_dtype: type") + .Attr("dirpath_env: string") + .Attr("load_entire_dir: bool") + .Attr("buffer_size: int >= 1"); + +REGISTER_OP("TfraHkvHashTableOfTensors") + .Output("table_handle: resource") + .Attr("container: string = ''") + .Attr("shared_name: string = ''") + .Attr("use_node_name_sharing: bool = false") + .Attr("key_dtype: type") + .Attr("value_dtype: type") + .Attr("value_shape: shape = {}") + .Attr("init_capacity: int = 0") + .Attr("max_capacity: int = 0") + .SetIsStateful() + .SetShapeFn([](InferenceContext* c) { + PartialTensorShape value_p; + TF_RETURN_IF_ERROR(c->GetAttr("value_shape", &value_p)); + ShapeHandle value_s; + TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(value_p, &value_s)); + return HkvHashTableShape(c, /*key=*/c->Scalar(), /*value=*/value_s); + }); 
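+
+// Note on the capacity attributes (enforced in the GPU kernel rather than
+// here): init_capacity == 0 falls back to a built-in default, max_capacity
+// == 0 falls back to the TFRA_GPU_HASHTABLE_UPLIMIT_SIZE environment
+// variable, and a max_capacity smaller than init_capacity is raised to
+// init_capacity.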
+} // namespace tensorflow diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/hkv_hashtable_op_gpu.cu.cc b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/hkv_hashtable_op_gpu.cu.cc new file mode 100644 index 000000000..3803893c3 --- /dev/null +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/hkv_hashtable_op_gpu.cu.cc @@ -0,0 +1,882 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#if GOOGLE_CUDA + +#include "tensorflow_recommenders_addons/dynamic_embedding/core/kernels/cuckoo_hashtable_op_gpu.h" +#include "tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_gpu.h" +#include "tensorflow_recommenders_addons/dynamic_embedding/core/utils/utils.h" + +#define EIGEN_USE_GPU + +#include +#include + +#include +#include +#include +#include + +#include "tensorflow/core/framework/lookup_interface.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/variant.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/path.h" +#include "tensorflow/core/util/env_var.h" +#include "tensorflow/stream_executor/stream.h" + +#include "tensorflow/core/util/gpu_device_functions.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace tensorflow { + +using GPUDevice = Eigen::GpuDevice; + +namespace recommenders_addons { +namespace lookup { + +constexpr size_t kDefaultGpuInitCapacity = 1024; + +using tensorflow::OpKernelContext; +using tensorflow::lookup::LookupInterface; + +template +class HkvHashTableOfTensorsGpu final : public LookupInterface { + private: + + public: + HkvHashTableOfTensorsGpu(OpKernelContext* ctx, OpKernel* kernel) { + OP_REQUIRES_OK(ctx, + GetNodeAttr(kernel->def(), "value_shape", &value_shape_)); + OP_REQUIRES( + ctx, TensorShapeUtils::IsVector(value_shape_), + errors::InvalidArgument("Default value must be a vector, got shape ", + value_shape_.DebugString())); + runtime_dim_ = value_shape_.dim_size(0); + + gpu::TableWrapperInitOptions options; + + int64 init_capacity_i64 = 0; + int64 max_capacity_i64 = 0; + OP_REQUIRES_OK(ctx, GetNodeAttr(kernel->def(), "init_capacity", &init_capacity_i64)); + OP_REQUIRES_OK(ctx, GetNodeAttr(kernel->def(), "max_capacity", &max_capacity_i64)); + options.init_capacity = static_cast(init_capacity_i64); + options.max_capacity = static_cast(max_capacity_i64); + + if (options.max_capacity == 0) { + char* env_max_capacity_str = std::getenv("TFRA_GPU_HASHTABLE_UPLIMIT_SIZE"); + if (env_max_capacity_str) { + options.max_capacity = static_cast(std::atoll(env_max_capacity_str)); + LOG(WARNING) << "GPU table max capacity was not set in attribute, get " + << options.max_capacity << " from env TFRA_GPU_HASHTABLE_UPLIMIT_SIZE."; + } else { + throw std::runtime_error("max_capaicty=0 and TFRA_GPU_HASHTABLE_UPLIMIT_SIZE not set is not 
valid."); + } + } + if (options.init_capacity == 0) { + options.init_capacity = kDefaultGpuInitCapacity; + LOG(WARNING) << "GPU table init capacity was not set in attribute, use default" + << kDefaultGpuInitCapacity; + } + if (options.max_capacity < options.init_capacity) { + LOG(WARNING) << "GPU table max_capacity < init_capacity, (" << options.max_capacity + << "/" << options.init_capacity << "). Reset to " << options.init_capacity; + options.max_capacity = options.init_capacity; + } + + if (table_) { + return; + } + this->CreateTable(options, &table_); + OP_REQUIRES(ctx, (table_ != nullptr), + errors::InvalidArgument("HashTable on GPU is created failed!")); + + LOG(INFO) << "GPU table max capacity was created on max_capacity: " + << options.max_capacity << ", and init capacity: " + << options.init_capacity + << " with K=" << std::type_index(typeid(K)).name() + << ", V=" << std::type_index(typeid(V)).name(); + } + + ~HkvHashTableOfTensorsGpu() { + } + + void CreateTable(gpu::TableWrapperInitOptions& options, gpu::TableWrapper** pptable) { + gpu::CreateTableImpl(pptable, options, runtime_dim_); + } + + size_t size() const override { + tf_shared_lock l(mu_); + + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + size_t retv = table_->get_size(stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaStreamDestroy(stream)); + return retv; + } + + void size_i64(OpKernelContext* ctx, int64* s) { + tf_shared_lock l(mu_); + auto stream = ctx->eigen_device().stream(); + int64 hret = static_cast(table_->get_size(stream)); + CUDA_CHECK(cudaMemcpyAsync(s, &hret, sizeof(int64), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + } + + Status Find(OpKernelContext* ctx, const Tensor& d_keys, Tensor* value, + const Tensor& default_value) override { + size_t len = d_keys.flat().size(); + bool* d_status; + + auto value_flat = value->flat_inner_dims(); + const auto default_flat = default_value.flat(); + int64 total = value_flat.size(); + int64 default_total = default_flat.size(); + bool is_full_default = (total == default_total); + + auto stream = ctx->eigen_device().stream(); + + if (len > 0) { + size_t default_value_num = + is_full_default ? default_value.shape().dim_size(0) : 1; + CUDA_CHECK(cudaMallocAsync(&d_status, sizeof(bool) * len, stream)); + CUDA_CHECK(cudaMemsetAsync(d_status, 0, sizeof(bool) * len, stream)); + { + tf_shared_lock l(mu_); + table_->get((const K*)d_keys.tensor_data().data(), + (V*)(value->tensor_data().data()), + d_status, len, + (V*)(default_value.tensor_data().data()), + stream, is_full_default); + CUDA_CHECK(cudaStreamSynchronize(stream)); + } + CUDA_CHECK(cudaFreeAsync(d_status, stream)); + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + return Status::OK(); + } + + Status FindWithExists(OpKernelContext* ctx, const Tensor& d_keys, + Tensor* value, const Tensor& default_value, + Tensor* exists) { + size_t len = d_keys.flat().size(); + + auto value_flat = value->flat_inner_dims(); + const auto default_flat = default_value.flat(); + int64 total = value_flat.size(); + int64 default_total = default_flat.size(); + bool is_full_default = (total == default_total); + + auto stream = ctx->eigen_device().stream(); + + if (len > 0) { + size_t default_value_num = + is_full_default ? 
default_value.shape().dim_size(0) : 1; + { + tf_shared_lock l(mu_); + table_->get((const K*)d_keys.tensor_data().data(), + (V*)(value->tensor_data().data()), + (bool*)exists->tensor_data().data(), len, + (V*)(default_value.tensor_data().data()), + stream, is_full_default); + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + } + return Status::OK(); + } + + Status Insert(OpKernelContext* ctx, const Tensor& keys, + const Tensor& values) override { + size_t len = keys.flat().size(); + auto stream = ctx->eigen_device().stream(); + { + mutex_lock l(mu_); + table_->upsert((const K*)keys.tensor_data().data(), + (const V*)(values.tensor_data().data()), + len, stream); + }; + CUDA_CHECK(cudaStreamSynchronize(stream)); + + return Status::OK(); + } + + Status Accum(OpKernelContext* ctx, const Tensor& keys, + const Tensor& values_or_deltas, const Tensor& exists) { + size_t len = keys.flat().size(); + auto stream = ctx->eigen_device().stream(); + { + mutex_lock l(mu_); + table_->accum( + (const K*)keys.tensor_data().data(), + (const V*)(values_or_deltas.tensor_data().data()), + (const bool*)exists.tensor_data().data(), len, stream); + }; + CUDA_CHECK(cudaStreamSynchronize(stream)); + + return Status::OK(); + } + + Status Remove(OpKernelContext* ctx, const Tensor& keys) override { + size_t len = keys.flat().size(); + K* d_keys; + auto stream = ctx->eigen_device().stream(); + + if (len > 0) { + CUDA_CHECK(cudaMallocAsync((void**)&d_keys, sizeof(K) * len, stream)); + CUDA_CHECK(cudaMemsetAsync((void*)&d_keys, 0, sizeof(K) * len, stream)); + CUDA_CHECK(cudaMemcpyAsync((void*)d_keys, (void*)keys.tensor_data().data(), + sizeof(K) * len, cudaMemcpyDefault, stream)); + { + mutex_lock l(mu_); + table_->remove((const K*)d_keys, len, stream); + } + CUDA_CHECK(cudaFreeAsync(d_keys, stream)); + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + return Status::OK(); + } + + Status Clear(OpKernelContext* ctx) { + auto stream = ctx->eigen_device().stream(); + { + mutex_lock l(mu_); + table_->clear(stream); + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + return Status::OK(); + } + + Status ImportValues(OpKernelContext* ctx, const Tensor& keys, + const Tensor& values) override { + size_t len = keys.flat().size(); + K* d_keys; + V* d_values; + auto stream = ctx->eigen_device().stream(); + if (len > 0) { + CUDA_CHECK(cudaMallocAsync((void**)&d_keys, sizeof(K) * len, stream)); + CUDA_CHECK(cudaMemsetAsync((void*)&d_keys, 0, sizeof(K) * len, stream)); + CUDA_CHECK( + cudaMallocAsync((void**)&d_values, sizeof(V) * runtime_dim_ * len, stream)); + CUDA_CHECK( + cudaMemsetAsync((void*)&d_values, 0, sizeof(V) * runtime_dim_ * len, stream)); + CUDA_CHECK(cudaMemcpyAsync((void*)d_keys, (void*)keys.tensor_data().data(), + sizeof(K) * len, cudaMemcpyDefault, stream)); + CUDA_CHECK(cudaMemcpyAsync((void*)d_values, (void*)values.tensor_data().data(), + sizeof(V) * runtime_dim_ * len, cudaMemcpyDefault, stream)); + { + mutex_lock l(mu_); + table_->clear(stream); + table_->upsert((const K*)d_keys, + (const V*)d_values, len, stream); + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaFreeAsync(d_keys, stream)); + CUDA_CHECK(cudaFreeAsync(d_values, stream)); + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + return Status::OK(); + } + + Status ExportValues(OpKernelContext* ctx) override { + size_t len = 0; + int64 size = 0; + + const size_t offset = 0; + + Tensor* keys; + Tensor* values; + + size_t* d_dump_counter = nullptr; + auto stream = ctx->eigen_device().stream(); + + { + tf_shared_lock l(mu_); + len = 
table_->get_capacity(); + size = (int64)table_->get_size(stream); + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + + CUDA_CHECK(cudaMallocAsync(&d_dump_counter, sizeof(size_t), stream)); + CUDA_CHECK(cudaMemsetAsync(d_dump_counter, 0, sizeof(size_t), stream)); + + AllocatorAttributes attr; + //attr.set_gpu_compatible(true); + //attr.set_nic_compatible(true); + attr.set_on_host(false); + + TF_RETURN_IF_ERROR( + ctx->allocate_output("keys", TensorShape({(size)}), &keys, attr)); + TF_RETURN_IF_ERROR(ctx->allocate_output( + "values", TensorShape({size, (int64)runtime_dim_}), &values, attr)); + if (size) { + tf_shared_lock l(mu_); + table_->dump((K*)keys->flat().data(), + (V*)(values->matrix().data()), offset, + len, d_dump_counter, stream); + } + CUDA_CHECK(cudaFreeAsync(d_dump_counter, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + return Status::OK(); + } + + Status ExportValuesWithMetas(OpKernelContext* ctx) { + size_t len = 0; + int64 size = 0; + + const size_t offset = 0; + + Tensor* keys; + Tensor* values; + Tensor* metas; + + size_t* d_dump_counter = nullptr; + auto stream = ctx->eigen_device().stream(); + + { + tf_shared_lock l(mu_); + len = table_->get_capacity(); + size = (int64)table_->get_size(stream); + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + + CUDA_CHECK(cudaMallocAsync(&d_dump_counter, sizeof(size_t), stream)); + CUDA_CHECK(cudaMemsetAsync(d_dump_counter, 0, sizeof(size_t), stream)); + + AllocatorAttributes attr; + //attr.set_gpu_compatible(true); + //attr.set_nic_compatible(true); + attr.set_on_host(false); + + TF_RETURN_IF_ERROR( + ctx->allocate_output("keys", TensorShape({(size)}), &keys, attr)); + TF_RETURN_IF_ERROR(ctx->allocate_output( + "values", TensorShape({size, (int64)runtime_dim_}), &values, attr)); + TF_RETURN_IF_ERROR( + ctx->allocate_output("metas", TensorShape({(size)}), &metas, attr)); + if (size) { + tf_shared_lock l(mu_); + table_->dump_with_metas((K*)keys->flat().data(), + (V*)(values->matrix().data()), + (uint64_t*)(metas->flat().data()), + offset, len, d_dump_counter, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + } + CUDA_CHECK(cudaFreeAsync(d_dump_counter, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + return Status::OK(); + } + + Status ExportKeysAndMetas(OpKernelContext* ctx, size_t split_size) { + tf_shared_lock l(mu_); + size_t span_len = 0; + int64 size = 0; + + const size_t offset = 0; + + Tensor* keys = nullptr; + Tensor* metas = nullptr; + + auto stream = ctx->eigen_device().stream(); + + AllocatorAttributes attr; + attr.set_on_host(false); + + { + size = (int64)table_->get_size(stream); + + TF_RETURN_IF_ERROR( + ctx->allocate_output("keys", TensorShape({(size)}), &keys, attr)); + TF_RETURN_IF_ERROR( + ctx->allocate_output("metas", TensorShape({(size)}), &metas, attr)); + + if (size) { + table_->dump_keys_and_metas((K*)keys->flat().data(), + (int64*)(metas->flat().data()), + static_cast(size), + split_size, stream); + } + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + return Status::OK(); + } + + Status ExportValuesToFile(OpKernelContext* ctx, const string filepath, + const size_t buffer_size) { + auto stream = ctx->eigen_device().stream(); + + { + tf_shared_lock l(mu_); + table_->dump_to_file(filepath, runtime_dim_, stream, buffer_size); + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + return Status::OK(); + } + + Status ImportValuesFromFile(OpKernelContext* ctx, const string filepath, + const size_t buffer_size) { + auto stream = ctx->eigen_device().stream(); + + { + mutex_lock l(mu_); + + string 
keyfile = filepath + ".keys"; + FILE* tmpfd = fopen(keyfile.c_str(), "rb"); + if (tmpfd == nullptr) { + return errors::NotFound("Failed to read key file", keyfile); + } + fseek(tmpfd, 0, SEEK_END); + long int filesize = ftell(tmpfd); + size_t size = static_cast(filesize) / sizeof(K); + fseek(tmpfd, 0, SEEK_SET); + fclose(tmpfd); + + table_->clear(stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + table_->load_from_file(filepath, size, runtime_dim_, stream, + buffer_size); + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + return Status::OK(); + } + + DataType key_dtype() const override { return DataTypeToEnum::v(); } + DataType value_dtype() const override { return DataTypeToEnum::v(); } + TensorShape key_shape() const final { return TensorShape(); } + TensorShape value_shape() const override { return value_shape_; } + + private: + TensorShape value_shape_; + size_t runtime_dim_; + mutable mutex mu_; + gpu::TableWrapper* table_ = nullptr GUARDED_BY(mu_); +}; + +} // namespace lookup + +// Table lookup op. Perform the lookup operation on the given table. +class HashTableFindGpuOp : public OpKernel { + public: + explicit HashTableFindGpuOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table; + OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + + // Input 0 could be a STRING_REF or a RESOURCE + DataType expected_input_0 = DT_RESOURCE; + DataTypeVector expected_inputs = {expected_input_0, table->key_dtype(), + table->value_dtype()}; + DataTypeVector expected_outputs = {table->value_dtype()}; + OP_REQUIRES_OK(ctx, ctx->MatchSignature(expected_inputs, expected_outputs)); + + const Tensor& keys = ctx->input(1); + const Tensor& default_values = ctx->input(2); + + TensorShape output_shape = keys.shape(); + output_shape.RemoveLastDims(table->key_shape().dims()); + output_shape.AppendShape(table->value_shape()); + Tensor* out; + AllocatorAttributes attr; + attr.set_gpu_compatible(true); + OP_REQUIRES_OK(ctx, + ctx->allocate_output("values", output_shape, &out, attr)); + + OP_REQUIRES_OK(ctx, table->Find(ctx, keys, out, default_values)); + } +}; + +REGISTER_KERNEL_BUILDER( + Name(PREFIX_OP_NAME(HkvHashTableFind)).Device(DEVICE_GPU), + HashTableFindGpuOp); + +// Table lookup op. Perform the lookup operation on the given table. 
+ +template +class HashTableFindWithExistsGpuOp : public OpKernel { + public: + explicit HashTableFindWithExistsGpuOp(OpKernelConstruction* ctx) + : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table; + OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + + lookup::HkvHashTableOfTensorsGpu* table_hkv = + (lookup::HkvHashTableOfTensorsGpu*)table; + + // Input 0 could be a STRING_REF or a RESOURCE + DataType expected_input_0 = DT_RESOURCE; + DataTypeVector expected_inputs = {expected_input_0, table->key_dtype(), + table->value_dtype()}; + DataTypeVector expected_outputs = {table->value_dtype(), DT_BOOL}; + OP_REQUIRES_OK(ctx, ctx->MatchSignature(expected_inputs, expected_outputs)); + + const Tensor& keys = ctx->input(1); + const Tensor& default_values = ctx->input(2); + + TensorShape output_shape = keys.shape(); + output_shape.RemoveLastDims(table->key_shape().dims()); + output_shape.AppendShape(table->value_shape()); + Tensor* values; + Tensor* exists; + AllocatorAttributes attr; + attr.set_gpu_compatible(true); + OP_REQUIRES_OK(ctx, + ctx->allocate_output("values", output_shape, &values, attr)); + OP_REQUIRES_OK(ctx, + ctx->allocate_output("exists", keys.shape(), &exists, attr)); + + OP_REQUIRES_OK(ctx, table_hkv->FindWithExists(ctx, keys, values, + default_values, exists)); + } +}; + +// Table insert op. +class HashTableInsertGpuOp : public OpKernel { + public: + explicit HashTableInsertGpuOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table; + OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + + DataType expected_input_0 = DT_RESOURCE; + DataTypeVector expected_inputs = {expected_input_0, table->key_dtype(), + table->value_dtype()}; + OP_REQUIRES_OK(ctx, ctx->MatchSignature(expected_inputs, {})); + + const Tensor& keys = ctx->input(1); + const Tensor& values = ctx->input(2); + OP_REQUIRES_OK(ctx, table->CheckKeyAndValueTensorsForInsert(keys, values)); + OP_REQUIRES_OK(ctx, table->Insert(ctx, keys, values)); + } +}; + +REGISTER_KERNEL_BUILDER( + Name(PREFIX_OP_NAME(HkvHashTableInsert)).Device(DEVICE_GPU), + HashTableInsertGpuOp); + +// Table accum op. +template +class HashTableAccumGpuOp : public OpKernel { + public: + explicit HashTableAccumGpuOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table; + OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + lookup::HkvHashTableOfTensorsGpu* table_hkv = + (lookup::HkvHashTableOfTensorsGpu*)table; + + DataType expected_input_0 = DT_RESOURCE; + DataTypeVector expected_inputs = {expected_input_0, table->key_dtype(), + table->value_dtype(), + DataTypeToEnum::v()}; + OP_REQUIRES_OK(ctx, ctx->MatchSignature(expected_inputs, {})); + + const Tensor& keys = ctx->input(1); + const Tensor& values_or_deltas = ctx->input(2); + const Tensor& exists = ctx->input(3); + OP_REQUIRES_OK( + ctx, table->CheckKeyAndValueTensorsForInsert(keys, values_or_deltas)); + OP_REQUIRES_OK(ctx, + table_hkv->Accum(ctx, keys, values_or_deltas, exists)); + } +}; + +// Table remove op. 
+class HashTableRemoveGpuOp : public OpKernel { + public: + explicit HashTableRemoveGpuOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table; + OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + + DataType expected_input_0 = DT_RESOURCE; + DataTypeVector expected_inputs = {expected_input_0, table->key_dtype()}; + OP_REQUIRES_OK(ctx, ctx->MatchSignature(expected_inputs, {})); + + const Tensor& key = ctx->input(1); + OP_REQUIRES_OK(ctx, table->CheckKeyTensorForRemove(key)); + OP_REQUIRES_OK(ctx, table->Remove(ctx, key)); + } +}; + +REGISTER_KERNEL_BUILDER( + Name(PREFIX_OP_NAME(HkvHashTableRemove)).Device(DEVICE_GPU), + HashTableRemoveGpuOp); + +// Table clear op. +template +class HashTableClearGpuOp : public OpKernel { + public: + explicit HashTableClearGpuOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table; + OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + lookup::HkvHashTableOfTensorsGpu* table_hkv = + (lookup::HkvHashTableOfTensorsGpu*)table; + OP_REQUIRES_OK(ctx, table_hkv->Clear(ctx)); + } +}; + +// Op that returns the size of the given table. +class HashTableSizeGpuOp : public OpKernel { + public: + explicit HashTableSizeGpuOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table; + OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + lookup::HkvHashTableOfTensorsGpu* table_hkv = + (lookup::HkvHashTableOfTensorsGpu*)table; + + Tensor* out; + AllocatorAttributes attr; + attr.set_gpu_compatible(true); + attr.set_on_host(false); + + OP_REQUIRES_OK(ctx, + ctx->allocate_output("size", TensorShape({}), &out, attr)); + + int64* p_size = (int64*)out->flat().data(); + table_hkv->size_i64(ctx, p_size); + } +}; + +// Op that outputs tensors of all keys and all values. +class HashTableExportGpuOp : public OpKernel { + public: + explicit HashTableExportGpuOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table; + OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + + OP_REQUIRES_OK(ctx, table->ExportValues(ctx)); + } +}; + +REGISTER_KERNEL_BUILDER( + Name(PREFIX_OP_NAME(HkvHashTableExport)).Device(DEVICE_GPU), + HashTableExportGpuOp); + +// Op that export all keys and values to file. 
+template +class HashTableExportWithMetasGpuOp : public OpKernel { + public: + explicit HashTableExportWithMetasGpuOp(OpKernelConstruction* ctx) + : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table; + OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + lookup::HkvHashTableOfTensorsGpu* table_hkv = + (lookup::HkvHashTableOfTensorsGpu*)table; + OP_REQUIRES_OK( + ctx, table_hkv->ExportValuesWithMetas(ctx)); + } +}; + +template +class HashTableExportKeysAndMetasGpuOp : public OpKernel { + public: + explicit HashTableExportKeysAndMetasGpuOp(OpKernelConstruction* ctx) + : OpKernel(ctx) { + ctx->GetAttr("split_size", &split_size_i64_); + } + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table; + OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + lookup::HkvHashTableOfTensorsGpu* table_hkv = + (lookup::HkvHashTableOfTensorsGpu*)table; + OP_REQUIRES_OK(ctx, table_hkv->ExportKeysAndMetas(ctx, static_cast(split_size_i64_))); + } + private: + int64 split_size_i64_; +}; + +template +class HashTableExportToFileGpuOp : public OpKernel { + public: + explicit HashTableExportToFileGpuOp(OpKernelConstruction* ctx) + : OpKernel(ctx) { + int64 signed_buffer_size = 0; + ctx->GetAttr("buffer_size", &signed_buffer_size); + buffer_size_ = static_cast(signed_buffer_size); + } + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table; + OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + + const Tensor& ftensor = ctx->input(1); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(ftensor.shape()), + errors::InvalidArgument("filepath must be scalar.")); + string filepath = string(ftensor.scalar()().data()); + lookup::HkvHashTableOfTensorsGpu* table_hkv = + (lookup::HkvHashTableOfTensorsGpu*)table; + OP_REQUIRES_OK( + ctx, table_hkv->ExportValuesToFile(ctx, filepath, buffer_size_)); + } + + private: + size_t buffer_size_; +}; + +// Clear the table and insert data. +class HashTableImportGpuOp : public OpKernel { + public: + explicit HashTableImportGpuOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table; + OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + + DataType expected_input_0 = DT_RESOURCE; + DataTypeVector expected_inputs = {expected_input_0, table->key_dtype(), + table->value_dtype()}; + OP_REQUIRES_OK(ctx, ctx->MatchSignature(expected_inputs, {})); + + const Tensor& keys = ctx->input(1); + const Tensor& values = ctx->input(2); + OP_REQUIRES_OK(ctx, table->CheckKeyAndValueTensorsForImport(keys, values)); + OP_REQUIRES_OK(ctx, table->ImportValues(ctx, keys, values)); + } +}; + +REGISTER_KERNEL_BUILDER( + Name(PREFIX_OP_NAME(HkvHashTableImport)).Device(DEVICE_GPU), + HashTableImportGpuOp); + +// Clear the table and insert data from FileSystem. 
+template +class HashTableImportFromFileGpuOp : public OpKernel { + public: + explicit HashTableImportFromFileGpuOp(OpKernelConstruction* ctx) + : OpKernel(ctx) { + int64 signed_buffer_size = 0; + ctx->GetAttr("buffer_size", &signed_buffer_size); + buffer_size_ = static_cast(signed_buffer_size); + } + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table; + OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + + const Tensor& ftensor = ctx->input(1); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(ftensor.shape()), + errors::InvalidArgument("filepath must be scalar.")); + string filepath = string(ftensor.scalar()().data()); + lookup::HkvHashTableOfTensorsGpu* table_hkv = + (lookup::HkvHashTableOfTensorsGpu*)table; + OP_REQUIRES_OK( + ctx, table_hkv->ImportValuesFromFile(ctx, filepath, buffer_size_)); + } + + private: + size_t buffer_size_; +}; + +// Register the HkvHashTableOfTensors op. + +#define REGISTER_KERNEL(key_dtype, value_dtype) \ + REGISTER_KERNEL_BUILDER( \ + Name(PREFIX_OP_NAME(HkvHashTableOfTensors)) \ + .Device(DEVICE_GPU) \ + .TypeConstraint("key_dtype") \ + .TypeConstraint("value_dtype"), \ + HashTableGpuOp< \ + lookup::HkvHashTableOfTensorsGpu, \ + key_dtype, value_dtype>); \ + REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(HkvHashTableClear)) \ + .Device(DEVICE_GPU) \ + .TypeConstraint("key_dtype") \ + .TypeConstraint("value_dtype"), \ + HashTableClearGpuOp); \ + REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(HkvHashTableSize)) \ + .Device(DEVICE_GPU) \ + .TypeConstraint("key_dtype") \ + .TypeConstraint("value_dtype"), \ + HashTableSizeGpuOp); \ + REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(HkvHashTableAccum)) \ + .Device(DEVICE_GPU) \ + .TypeConstraint("key_dtype") \ + .TypeConstraint("value_dtype"), \ + HashTableAccumGpuOp); \ + REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(HkvHashTableExportWithMetas)) \ + .Device(DEVICE_GPU) \ + .TypeConstraint("key_dtype") \ + .TypeConstraint("value_dtype"), \ + HashTableExportWithMetasGpuOp); \ + REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(HkvHashTableExportToFile)) \ + .Device(DEVICE_GPU) \ + .HostMemory("filepath") \ + .TypeConstraint("key_dtype") \ + .TypeConstraint("value_dtype"), \ + HashTableExportToFileGpuOp); \ + REGISTER_KERNEL_BUILDER( \ + Name(PREFIX_OP_NAME(HkvHashTableImportFromFile)) \ + .Device(DEVICE_GPU) \ + .HostMemory("filepath") \ + .TypeConstraint("key_dtype") \ + .TypeConstraint("value_dtype"), \ + HashTableImportFromFileGpuOp); \ + REGISTER_KERNEL_BUILDER( \ + Name(PREFIX_OP_NAME(HkvHashTableFindWithExists)) \ + .Device(DEVICE_GPU) \ + .TypeConstraint("Tin") \ + .TypeConstraint("Tout"), \ + HashTableFindWithExistsGpuOp); + +REGISTER_KERNEL(int64, float); +REGISTER_KERNEL(int64, Eigen::half); +REGISTER_KERNEL(int64, int64); +REGISTER_KERNEL(int64, int32); +REGISTER_KERNEL(int64, int8); +REGISTER_KERNEL(int32, float); + +#undef REGISTER_KERNEL + +#define SINGLE_ATTR_REGISTER_KERNEL(key_dtype, value_type) \ + REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(HkvHashTableExportKeysAndMetas)) \ + .Device(DEVICE_GPU) \ + .TypeConstraint("Tkeys"), \ + HashTableExportKeysAndMetasGpuOp); + +SINGLE_ATTR_REGISTER_KERNEL(int64, float); + +#undef SINGLE_ATTR_REGISTER_KERNEL + +} // namespace recommenders_addons +} // namespace tensorflow +#endif diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv.h b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv.h new file mode 
100644 index 000000000..0da759780 --- /dev/null +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv.h @@ -0,0 +1,360 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TFRA_CORE_KERNELS_LOOKUP_TABLE_OP_GPU_H_ +#define TFRA_CORE_KERNELS_LOOKUP_TABLE_OP_GPU_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/lookup_interface.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/kernels/lookup_util.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin_hashtable.cuh" +#include "tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin_localfile.hpp" +#include "tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/types.cuh" +#include "tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/utils.cuh" + +namespace tensorflow { +namespace recommenders_addons { +namespace lookup { +namespace gpu { + +template +class KVOnlyFile : public nv::merlin::BaseKVFile { + public: + KVOnlyFile() : keys_fp_(nullptr), values_fp_(nullptr) {} + + ~KVOnlyFile() { + close(); + } + + bool open(const std::string& keys_path, const std::string& values_path, + const char* mode) { + close(); + keys_fp_ = fopen(keys_path.c_str(), mode); + if (!keys_fp_) { + return false; + } + values_fp_ = fopen(values_path.c_str(), mode); + if (!values_fp_) { + close(); + return false; + } + return true; + } + + void close() noexcept { + if (keys_fp_) { + fclose(keys_fp_); + keys_fp_ = nullptr; + } + if (values_fp_) { + fclose(values_fp_); + values_fp_ = nullptr; + } + } + + size_t read(const size_t n, const size_t dim, K* keys, V* vectors, M* metas) override { + size_t nread_keys = + fread(keys, sizeof(K), static_cast(n), keys_fp_); + size_t nread_vecs = + fread(vectors, sizeof(V) * dim, static_cast(n), values_fp_); + if (nread_keys != nread_vecs) { + LOG(INFO) << "Partially read failed. 
" << nread_keys << " kv pairs by KVOnlyFile."; + return 0; + } + LOG(INFO) << "Partially read " << nread_keys << " kv pairs by KVOnlyFile."; + return nread_keys; + } + + size_t write(const size_t n, const size_t dim, const K* keys, const V* vectors, + const M* metas) override { + size_t nwritten_keys = + fwrite(keys, sizeof(K), static_cast(n), keys_fp_); + size_t nwritten_vecs = + fwrite(vectors, sizeof(V) * dim, static_cast(n), values_fp_); + if (nwritten_keys != nwritten_vecs) { + return 0; + } + LOG(INFO) << "Partially write " << nwritten_keys << " kv pairs by KVOnlyFile."; + return nwritten_keys; + } + + private: + FILE* keys_fp_; + FILE* values_fp_; +}; + +// template to avoid multidef in compile time only. +template +__global__ void gpu_u64_to_i64_kernel(const uint64_t* u64, int64* i64, size_t len) { + size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tid < len) { + i64[tid] = static_cast(u64[tid]); + } +} + +template +__global__ void broadcast_kernel(T* data, T val, size_t n) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < n) { + data[tid] = val; + } +} + +template +void gpu_cast_u64_to_i64(const uint64_t* u64, int64* i64, size_t len, cudaStream_t stream) { + size_t block_size = nv::merlin::SAFE_GET_BLOCK_SIZE(1024); + size_t grid_size = nv::merlin::SAFE_GET_GRID_SIZE(len, block_size); + gpu_u64_to_i64_kernel<<>>(u64, i64, len); +} + +using GPUDevice = Eigen::ThreadPoolDevice; + +struct TableWrapperInitOptions { + size_t max_capacity; + size_t init_capacity; +}; + +template +class TableWrapper { + private: + //using M = uint64_t; + using Table = nv::merlin::HashTable; + + public: + TableWrapper(TableWrapperInitOptions& init_options, size_t dim) { + max_capacity_ = init_options.max_capacity; + dim_ = dim; + nv::merlin::HashTableOptions mkv_options; + mkv_options.init_capacity = std::min(init_options.init_capacity, max_capacity_); + mkv_options.max_capacity = max_capacity_; + // Since currently GPU nodes are not compatible to fast + // pcie connections for D2H non-continous wirte, so just + // use pure hbm mode now. 
+ mkv_options.max_hbm_for_vectors = std::numeric_limits::max(); + mkv_options.max_load_factor = 0.63; + mkv_options.block_size = nv::merlin::SAFE_GET_BLOCK_SIZE(1024); + mkv_options.dim = dim; + mkv_options.evict_strategy = nv::merlin::EvictStrategy::kCustomized; + block_size_ = mkv_options.block_size; + table_ = new Table(); + table_->init(mkv_options); + } + + ~TableWrapper() { delete table_; } + + void upsert(const K* d_keys, const V* d_vals, size_t len, + cudaStream_t stream) { + uint64_t t0 = (uint64_t)time(NULL); + uint64_t* timestamp_metas = nullptr; + CUDA_CHECK(cudaMallocAsync(×tamp_metas, len * sizeof(uint64_t), stream)); + CUDA_CHECK(cudaMemsetAsync(timestamp_metas, 0, len * sizeof(uint64_t), stream)); + size_t grid_size = nv::merlin::SAFE_GET_GRID_SIZE(len, block_size_); + broadcast_kernel<<>>(timestamp_metas, t0, len); + + table_->insert_or_assign(len, d_keys, d_vals, /*d_metas=*/timestamp_metas, stream); + CUDA_CHECK(cudaFreeAsync(timestamp_metas, stream)); + } + + void accum(const K* d_keys, const V* d_vals_or_deltas, + const bool* d_exists, size_t len, cudaStream_t stream) { + uint64_t t0 = (uint64_t)time(NULL); + uint64_t* timestamp_metas = nullptr; + CUDA_CHECK(cudaMallocAsync(×tamp_metas, len * sizeof(uint64_t), stream)); + CUDA_CHECK(cudaMemsetAsync(timestamp_metas, 0, len * sizeof(uint64_t), stream)); + size_t grid_size = nv::merlin::SAFE_GET_GRID_SIZE(len, block_size_); + broadcast_kernel<<>>(timestamp_metas, t0, len); + table_->accum_or_assign(len, d_keys, d_vals_or_deltas, d_exists, /*d_metas=*/timestamp_metas, stream); + CUDA_CHECK(cudaFreeAsync(timestamp_metas, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)) + } + + void dump(K* d_key, V* d_val, const size_t offset, + const size_t search_length, size_t* d_dump_counter, + cudaStream_t stream) const { + table_->export_batch(search_length, offset, d_dump_counter, d_key, d_val, /*d_metas=*/nullptr, stream); + } + + void dump_with_metas(K* d_key, V* d_val, uint64_t* d_metas, const size_t offset, + const size_t search_length, size_t* d_dump_counter, + cudaStream_t stream) const { + table_->export_batch(search_length, offset, d_dump_counter, d_key, d_val, d_metas, stream); + } + + void dump_keys_and_metas(K* keys, int64* metas, size_t len, + size_t split_len, cudaStream_t stream) const { + V* values_buf = nullptr; + size_t offset = 0; + size_t real_offset = 0; + size_t skip = split_len; + uint64_t* metas_u64 = reinterpret_cast(metas); + size_t span_len = table_->capacity(); + CUDA_CHECK(cudaMallocAsync(&values_buf, sizeof(V) * dim_ * split_len, stream)); + CUDA_CHECK(cudaMemsetAsync(values_buf, 0, sizeof(V) * dim_ * split_len, stream)); + for (; offset < span_len; offset += split_len) { + if (offset + skip > span_len) { + skip = span_len - offset; + } + // TODO: overlap the loop + size_t h_dump_counter = table_->export_batch(skip, offset, keys + real_offset, values_buf, metas_u64 + real_offset, stream); + CudaCheckError(); + + if (h_dump_counter > 0) { + gpu_cast_u64_to_i64(metas_u64 + real_offset, metas + real_offset, h_dump_counter, stream); + real_offset += h_dump_counter; + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaFreeAsync(values_buf, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + } + + void dump_to_file(const string filepath, size_t dim, + cudaStream_t stream, + const size_t buffer_size) const { + LOG(INFO) << "dump_to_file, filepath: " << filepath << ", dim: " << dim + << ", stream: " << stream << ", buffer_size: " << 
buffer_size; + std::unique_ptr> wfile; + string keyfile = ; + string valuefile = ; + string metafile = ; + + wfile.reset(new TimestampV1CompatFile); + bool open_ok = wfile->open(keyfile, valuefile, metafile, "wb"); + if (!open_ok) { + std::string error_msg = "Failed to dump to file to " + keyfile + ", " + valuefile + ", " + metafile; + throw std::runtime_error(error_msg); + } + + size_t n_saved = table_->save(wfile.get(), buffer_size, stream); + LOG(INFO) << "[op] Save " << n_saved << " pairs into keyfile: " + << keyfile << ", and valuefile: " << valuefile + << ", and metafile: " << metafile; + CUDA_CHECK(cudaStreamSynchronize(stream)); + wfile->close(); + } + + void load_from_file(const string filepath, + size_t key_num, size_t dim, cudaStream_t stream, + const size_t buffer_size) { + std::unique_ptr> rfile; + string keyfile = ; + string valuefile = ; + string metafile = ; + //rfile.reset(new TimestampV1CompatFile); + bool has_metas = false; + bool open_ok = false; + + if (is_valid_metas(keyfile, metafile)) { + rfile.reset(new TimestampV1CompatFile); + open_ok = reinterpret_cast*>(rfile.get())->open(keyfile, valuefile, metafile, "rb"); + has_metas = true; + } else { + rfile.reset(new KVOnlyFile); + open_ok = reinterpret_cast*>(rfile.get())->open(keyfile, valuefile, "rb"); + } + if (!open_ok) { + std::string error_msg = "Failed to load from file to " + keyfile + ", " + valuefile + ", " + metafile; + throw std::runtime_error("Failed to "); + } + + size_t n_loaded = table_->load(rfile.get(), buffer_size, stream); + if (has_metas) { + LOG(INFO) << "[op] Load " << n_loaded << " pairs into keyfile: " + << keyfile << ", and valuefile: " << valuefile + << ", and metafile" << metafile; + } else { + LOG(INFO) << "[op] Load " << n_loaded << " pairs into keyfile: " + << keyfile << ", and valuefile: " << valuefile; + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + if (has_metas) { + reinterpret_cast*>(rfile.get())->close(); + } else { + reinterpret_cast*>(rfile.get())->close(); + } + } + + void get(const K* d_keys, V* d_vals, bool* d_status, size_t len, + V* d_def_val, cudaStream_t stream, + bool is_full_size_default) const { + if (is_full_size_default) { + CUDA_CHECK(cudaMemcpyAsync(d_vals, d_def_val, sizeof(V) * dim_ * len, cudaMemcpyDeviceToDevice, stream)); + } else { + size_t grid_size = nv::merlin::SAFE_GET_GRID_SIZE(len, block_size_); + gpu_fill_default_values<<>>(d_vals, d_def_val, len, dim_); + } + table_->find(len, d_keys, d_vals, d_status, /*d_metas=*/nullptr, stream); + } + + // TODO: Implement a contain kernel instead of find. 
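+  // Until a dedicated kernel exists, contains() allocates a throwaway value
+  // buffer of len * dim elements and runs a normal find() only to populate
+  // d_status, then frees the buffer on the same stream.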
+ void contains(const K* d_keys, V* d_status, size_t len, cudaStream_t stream) { + // pass + V* tmp_vals = nullptr; + CUDA_CHECK(cudaMallocAsync(&tmp_vals, sizeof(V) * len * dim_, stream)); + CUDA_CHECK(cudaMemsetAsync(&tmp_vals, 0, sizeof(V) * len * dim_, stream)); + table_->find(len, d_keys, tmp_vals, d_status, /*d_metas=*/nullptr, stream); + CUDA_CHECK(cudaFreeAsync(tmp_vals, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + } + + size_t get_size(cudaStream_t stream) const { + return table_->size(stream); + } + + size_t get_capacity() const { return table_->capacity(); } + + void remove(const K* d_keys, size_t len, cudaStream_t stream) { + table_->erase(len, d_keys, stream); + } + + void clear(cudaStream_t stream) { table_->clear(stream); } + + private: + Table* table_; + size_t max_capacity_; + size_t dim_; + int block_size_; + bool dynamic_mode_; +}; + +template +void CreateTableImpl(TableWrapper** pptable, TableWrapperInitOptions& options, + size_t runtime_dim) { + *pptable = new TableWrapper(options, runtime_dim); +} + +} // namespace gpu +} // namespace lookup +} // namespace recommenders_addons +} // namespace tensorflow + +#endif // TFRA_CORE_KERNELS_LOOKUP_TABLE_OP_GPU_H_ diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv_impl.cu.cc b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv_impl.cu.cc new file mode 100644 index 000000000..47660f73c --- /dev/null +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv_impl.cu.cc @@ -0,0 +1,37 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_gpu.h" +namespace tensorflow { +namespace recommenders_addons { +namespace lookup { +namespace gpu { + +#define DEFINE_PURE_GPU_HASHTABLE(key_type, value_type) \ + template<> class TableWrapper + + +DEFINE_PURE_GPU_HASHTABLE(int64, float); +DEFINE_PURE_GPU_HASHTABLE(int64, int32); +DEFINE_PURE_GPU_HASHTABLE(int64, int64); +DEFINE_PURE_GPU_HASHTABLE(int64, int64); + +#undef DEFINE_PURE_GPU_HASHTABLE + + +} // namespace gpu +} // namespace lookup +} // namespace recommenders_addons +} // namespace tensorflow diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/BUILD b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/BUILD new file mode 100644 index 000000000..b7d22e73b --- /dev/null +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/BUILD @@ -0,0 +1,42 @@ +load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda", "if_cuda_is_configured") + +package(default_visibility = ["//visibility:public"]) + +load("//tensorflow_recommenders_addons:tensorflow_recommenders_addons.bzl", "custom_op_library", + "if_cuda_for_tf_serving", "tf_nvcc_binary") + +cc_library( + name = "merlin_kv", + hdrs = [ + "merlin/array_kernels.cuh", + "merlin/core_kernels.cuh", + "merlin/debug.hpp", + "merlin/flexible_buffer.cuh", + "merlin/memory_pool.cuh", + "merlin/group_lock.hpp", + #"merlin/initializers.cuh", + #"merlin/managed.cuh", + #"merlin/optimizers.cuh", + "merlin/types.cuh", + "merlin/utils.cuh", + "merlin_hashtable.cuh", + "merlin_localfile.hpp", + ], + deps = [ + "//tensorflow_recommenders_addons/dynamic_embedding/core/lib/utils:cuda_utils", + ], +) + +tf_nvcc_binary( + name="merlin_kv_insert_and_evict_test", + srcs=[], + cuda_srcs=[ + "insert_and_evict_test.cu.cc", + "test_util.cu.h", + ], + cuda_deps=[ + ":merlin_kv", + ], + copts=[], + linkopts=[], +) diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/array_kernels.cuh b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/array_kernels.cuh new file mode 100644 index 000000000..cd11fb713 --- /dev/null +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/array_kernels.cuh @@ -0,0 +1,345 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http:///www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "tensorflow_recommenders_addons/dynamic_embedding/core/lib/utils/cuda_utils.cuh" + +#include +#include "cuda_runtime.h" +#include "thrust/device_vector.h" +#include "thrust/execution_policy.h" +#include "thrust/scan.h" +#include "thrust/count.h" +#include "types.cuh" +#include "utils.cuh" + +namespace nv { +namespace merlin { + +template +__global__ void keys_not_empty(const K* keys, bool* masks, size_t n) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < n) { + masks[tid] = keys[tid] != EMPTY_KEY; + } +} + +template +__global__ void gpu_cell_count(const bool* masks, bool target, + Tidx* offsets, size_t n, size_t* n_existed) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + auto g = cg::tiled_partition(cg::this_thread_block()); + int rank = g.thread_rank(); + bool is_existed = false; + if (tid < n) { + if (masks[tid] == target) { + is_existed = true; + } + } + unsigned int vote = g.ballot(is_existed); + int g_ones = __popc((int)vote); + if (rank == 0 && tid < n) { + offsets[tid / TILE_SIZE] = static_cast(g_ones); + atomicAdd(static_cast(n_existed), static_cast(g_ones)); + } +} + +template +__global__ void gpu_select_key_kernel(const bool* masks, bool target, size_t n, + const Tidx* offsets, const K* __restrict keys, + K* __restrict outkeys, Tidx* outoffsets) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + auto g = cg::tiled_partition(cg::this_thread_block()); + int rank = g.thread_rank(); + + bool is_existed = false; + if (tid < n) { + if (masks[tid] == target) { + is_existed = true; + } + } + unsigned int vote = g.ballot(is_existed); + unsigned int r_vote = __brev(vote) >> (32 - TILE_SIZE); + + if (tid < n) { + r_vote = r_vote >> (TILE_SIZE - rank - 1); + if (masks[tid] == target) { + int prefix_n = __popc(r_vote) - 1; + Tidx bias = offsets[tid / TILE_SIZE] + static_cast(prefix_n); + outkeys[bias] = keys[tid]; + outoffsets[bias] = static_cast(tid); + } + } +} + +template +__global__ void gpu_select_kv_kernel(const bool* masks, bool target, size_t n, + const Tidx* offsets, + const K* __restrict keys, + V* __restrict values, + K* __restrict outkeys, + V* __restrict outvalues, + const size_t dim) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + auto g = cg::tiled_partition(cg::this_thread_block()); + int rank = g.thread_rank(); + + bool is_existed = false; + if (tid < n) { + if (masks[tid] == target) { + is_existed = true; + } + } + unsigned int vote = g.ballot(is_existed); + unsigned int r_vote = __brev(vote) >> (32 - TILE_SIZE); + + if (tid < n) { + r_vote = r_vote >> (TILE_SIZE - rank - 1); + if (masks[tid] == target) { + int prefix_n = __popc(r_vote) - 1; + Tidx bias = offsets[tid / TILE_SIZE] + static_cast(prefix_n); + outkeys[bias] = keys[tid]; + for (size_t i=0;i +__global__ void gpu_select_kvm_kernel(const bool* masks, size_t n, + const Tidx* offsets, K* __restrict keys, + V* __restrict values, M* __restrict metas, + const size_t dim) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + auto g = cg::tiled_partition(cg::this_thread_block()); + int rank = g.thread_rank(); + + bool is_existed = false; + if (tid < n) { + if (masks[tid]) { + is_existed = true; + } + } + unsigned int vote = g.ballot(is_existed); + unsigned int r_vote = __brev(vote) >> (32 - TILE_SIZE); + K empty_key = (K)EMPTY_KEY; + + if (tid < n) { + r_vote = r_vote >> (TILE_SIZE - rank - 1); + if (masks[tid]) { + int prefix_n = __popc(r_vote) - 1; + Tidx bias = offsets[tid / TILE_SIZE] + static_cast(prefix_n); + + if (bias == tid) return; + + K 
target_key = 0; + AtomicKey* atomic_key = reinterpret_cast*>(keys) + bias; + while (target_key != empty_key) { + //target_key = atomicCAS(keys + bias, empty_key, keys[tid]); + target_key = empty_key; + atomic_key->compare_exchange_weak(target_key, keys[tid], + cuda::std::memory_order_relaxed, + cuda::std::memory_order_relaxed); + } + if (metas) metas[bias] = metas[tid]; + for (size_t j = 0; j < dim; j++) { + values[dim * bias + j] = values[dim * tid + j]; + } + //atomicExch(keys + tid, empty_key); + atomic_key = reinterpret_cast*>(keys) + tid; + atomic_key->store(empty_key, cuda::std::memory_order_relaxed); + } + } +} + +template +__global__ void gpu_select_kvm_kernel_v2(size_t n, + K* __restrict keys, + V* __restrict values, + M* __restrict metas, + K* __restrict tmp_keys, + V* __restrict tmp_values, + M* __restrict tmp_metas, + size_t* cnt, + const size_t dim) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < n) { + size_t offset = atomicAdd(cnt, 1llu); + tmp_keys[offset] = keys[tid]; + for (size_t j = 0; j < dim; j++) { + tmp_values[offset * dim + j] = values[tid * dim + j]; + } + if (metas) { + tmp_metas[offset] = metas[tid]; + } + } +} + +template +void gpu_pick_kvm_inplace(size_t grid_size, size_t block_size, const bool* masks, + bool target, size_t n, size_t* n_evicted, Tidx* offsets, + K* __restrict keys, V* __restrict values, + M* __restrict metas, size_t dim, cudaStream_t stream) { + size_t n_offsets = (n + TILE_SIZE - 1) / TILE_SIZE; + gpu_cell_count + <<>>(masks, target, offsets, n, n_evicted); +#if THRUST_VERSION >= 101600 + auto policy = thrust::cuda::par_nosync.on(stream); +#else + auto policy = thrust::cuda::par.on(stream); +#endif + thrust::device_ptr d_src(offsets); + thrust::device_ptr d_dest(offsets); + thrust::exclusive_scan(policy, d_src, d_src + n_offsets, d_dest); + if (target) { + gpu_select_kvm_kernel + <<>>(masks, n, offsets, + keys, values, metas, dim); + } else { + throw std::runtime_error("Not used"); + //gpu_select_kvm_kernel_reverse + // <<>>(masks, n, offsets, + // keys, values, metas, dim); + } +} + +template +size_t gpu_pick_kvm_v2(size_t grid_size, size_t block_size, + bool target, size_t n, size_t* n_evicted, + K* __restrict keys, V* __restrict values, + M* __restrict metas, size_t dim, cudaStream_t stream) { +#if THRUST_VERSION >= 101600 + auto policy = thrust::cuda::par_nosync.on(stream); +#else + auto policy = thrust::cuda::par.on(stream); +#endif + thrust::device_ptr d_src(keys); + int empty_cnt = thrust::count(policy, d_src, d_src + n, (K)EMPTY_KEY); + size_t h_cnt = n - static_cast(empty_cnt); + if (h_cnt == 0) { + return 0; + } + K* tmp_keys = nullptr; + V* tmp_values = nullptr; + M* tmp_metas = nullptr; + if (target) { + CUDA_CHECK(cudaMallocAsync(&tmp_keys, h_cnt * sizeof(K), stream)); + CUDA_CHECK(cudaMemsetAsync(tmp_keys, 0, h_cnt * sizeof(K), stream)); + CUDA_CHECK(cudaMallocAsync(&tmp_values, h_cnt * dim * sizeof(V), stream)); + CUDA_CHECK(cudaMemsetAsync(tmp_values, 0, h_cnt * dim * sizeof(V), stream)); + if (metas) { + CUDA_CHECK(cudaMallocAsync(&tmp_metas, h_cnt * sizeof(M), stream)); + CUDA_CHECK(cudaMemsetAsync(tmp_metas, 0, h_cnt * sizeof(M), stream)); + } + gpu_select_kvm_kernel_v2 + <<>>(n, + keys, values, metas, tmp_keys, tmp_values, tmp_metas, n_evicted, dim); + CUDA_CHECK(cudaMemcpyAsync(keys, tmp_keys, h_cnt * sizeof(K), cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(values, tmp_values, h_cnt * dim * sizeof(V), cudaMemcpyDeviceToDevice, stream)); + if(metas) { + CUDA_CHECK(cudaMemcpyAsync(metas, 
tmp_metas, h_cnt * sizeof(M), cudaMemcpyDeviceToDevice, stream)); + } + CUDA_CHECK(cudaFreeAsync(tmp_keys, stream)); + CUDA_CHECK(cudaFreeAsync(tmp_values, stream)); + if (tmp_metas) { + CUDA_CHECK(cudaFreeAsync(tmp_metas, stream)); + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + } else { + throw std::runtime_error("Not used"); + //gpu_select_kvm_kernel_reverse + // <<>>(masks, n, offsets, + // keys, values, metas, dim); + } + return h_cnt; +} + +template +void gpu_pick_kvm_inplace_wrap(const bool* masks, bool target, + size_t n, size_t* n_evicted, + K* __restrict keys, V* __restrict values, + M* __restrict metas, size_t dim, cudaStream_t stream) { + size_t block_size = 256; + size_t grid_size = SAFE_GET_GRID_SIZE(n, block_size); + size_t n_offsets = (n + TILE_SIZE - 1) / TILE_SIZE; + int64_t* offsets = nullptr; + CUDA_CHECK(cudaMallocAsync(&offsets, sizeof(int64_t) * n_offsets, stream)); + gpu_pick_kvm_inplace(grid_size, block_size, + masks, target, n, n_evicted, offsets, keys, values, metas, dim, stream); + CUDA_CHECK(cudaFreeAsync(offsets, stream)); +} + +template +void gpu_pick_keys(const bool* masks, bool target, size_t n, size_t* n_evicted, + const K* __restrict keys, K* __restrict outkeys, + int64_t* outoffsets, cudaStream_t stream) { + size_t block_size = 256; + size_t grid_size = SAFE_GET_GRID_SIZE(n, block_size); + size_t n_offsets = (n + TILE_SIZE - 1) / TILE_SIZE; + int64_t* offsets = nullptr; + CUDA_CHECK(cudaMallocAsync(&offsets, sizeof(int64_t) * n_offsets, stream)); + CUDA_CHECK(cudaMemsetAsync(offsets, 0, sizeof(int64_t) * n_offsets, stream)); + + gpu_cell_count + <<>>(masks, target, offsets, n, n_evicted); +#if THRUST_VERSION >= 101600 + auto policy = thrust::cuda::par_nosync.on(stream); +#else + auto policy = thrust::cuda::par.on(stream); +#endif + thrust::device_ptr d_src(offsets); + thrust::device_ptr d_dest(offsets); + thrust::exclusive_scan(policy, d_src, d_src + n_offsets, d_dest); + gpu_select_key_kernel + <<>>(masks, target, n, offsets, + keys, outkeys, outoffsets); + CUDA_CHECK(cudaFreeAsync(offsets, stream)); +} + +template +void gpu_pick_kvs(const bool* masks, bool target, size_t n, size_t* n_evicted, + const K* __restrict keys, + V* __restrict values, + K* __restrict outkeys, + V* __restrict outvalues, + size_t dim, + cudaStream_t stream) { + size_t block_size = 256; + size_t grid_size = SAFE_GET_GRID_SIZE(n, block_size); + size_t n_offsets = (n + TILE_SIZE - 1) / TILE_SIZE; + int64_t* offsets = nullptr; + CUDA_CHECK(cudaMallocAsync(&offsets, sizeof(int64_t) * n_offsets, stream)); + CUDA_CHECK(cudaMemsetAsync(offsets, 0, sizeof(int64_t) * n_offsets, stream)); + + gpu_cell_count + <<>>(masks, target, offsets, n, n_evicted); +#if THRUST_VERSION >= 101600 + auto policy = thrust::cuda::par_nosync.on(stream); +#else + auto policy = thrust::cuda::par.on(stream); +#endif + thrust::device_ptr d_src(offsets); + thrust::device_ptr d_dest(offsets); + thrust::exclusive_scan(policy, d_src, d_src + n_offsets, d_dest); + gpu_select_kv_kernel + <<>>(masks, target, n, offsets, + keys, values, outkeys, outvalues, dim); + CUDA_CHECK(cudaFreeAsync(offsets, stream)); +} +} // namespace merlin +} // namespace nv diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/core_kernels.cuh b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/core_kernels.cuh new file mode 100644 index 000000000..43504dd68 --- /dev/null +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/core_kernels.cuh @@ -0,0 
+1,2510 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http:///www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "types.cuh" +#include "utils.cuh" + +using namespace cooperative_groups; +namespace cg = cooperative_groups; + +namespace nv { +namespace merlin { + +/* For improving performance consideration, allocating up to 64 table structures + * in constant memory is supported. To close this function, please set + * `TableOption::use_constant_memory` to `false`. + */ +constexpr int MAX_CONSTANT_TABLE = 64; +static std::mutex constant_table_mutex; +static uint64_t constant_table_flag = 0; + +__constant__ char + c_table_[sizeof(Table) * MAX_CONSTANT_TABLE]; + +template +int allocate_constant_table() { + std::lock_guard guard(constant_table_mutex); + if (constant_table_flag == std::numeric_limits::max()) return -1; + int table_index = 0; + while (constant_table_flag & (1l << table_index)) { + table_index++; + } + + constant_table_flag = constant_table_flag | (1l << table_index); + + return table_index; +} + +template +void release_constant_table(int table_index) { + std::lock_guard guard(constant_table_mutex); + if (table_index < 0 || table_index >= MAX_CONSTANT_TABLE) return; + constant_table_flag = constant_table_flag & (~(1l << table_index)); +} + +template +__global__ void create_locks(M* __restrict mutex, const size_t start, + const size_t end) { + size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (start + tid < end) { + new (mutex + start + tid) M(); + } +} + +template +__global__ void release_locks(M* __restrict mutex, const size_t start, + const size_t end) { + size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (start + tid < end) { + (mutex + start + tid)->~M(); + } +} + +template +__global__ void create_atomic_keys(Bucket* __restrict buckets, + const size_t start, const size_t end, + const size_t bucket_max_size) { + size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (start + tid < end) { + for (size_t i = 0; i < bucket_max_size; i++) + new (buckets[start + tid].keys(i)) + AtomicKey{static_cast(EMPTY_KEY)}; + } +} + +template +__global__ void create_atomic_metas(Bucket* __restrict buckets, + const size_t start, const size_t end, + const size_t bucket_max_size) { + size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (start + tid < end) { + for (size_t i = 0; i < bucket_max_size; i++) { + new (buckets[start + tid].metas(i)) + AtomicMeta{static_cast(EMPTY_META)}; + } + new (&(buckets[start + tid].cur_meta)) + AtomicMeta{static_cast(EMPTY_META)}; + new (&(buckets[start + tid].min_meta)) + AtomicMeta{static_cast(EMPTY_META)}; + new (&(buckets[start + tid].min_pos)) AtomicPos{1}; + } +} + +/* Initialize the buckets with index from start to end. 
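+
+   In outline: (a) the value storage for these buckets is carved out of
+   memory slices no larger than `bytes_per_slice`, preferring HBM and falling
+   back to pinned host memory once `remaining_hbm_for_vectors` is exhausted;
+   (b) the per-bucket key/meta arrays are allocated and their atomics are
+   constructed with EMPTY_KEY / EMPTY_META; (c) one lock per bucket is
+   constructed. Illustrative sizing only: with bucket_max_size = 128,
+   dim = 64 and float values, one bucket of vectors takes
+   128 * 64 * 4 B = 32 KB, so a 2 GB slice (the largest get_slice_size()
+   returns) holds 65536 buckets.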
*/ +template +void initialize_buckets(Table** table, const size_t start, + const size_t end) { + /* As testing results show us, when the number of buckets is greater than + * the 4 million the performance will drop significantly, we believe the + * to many pinned memory allocation causes this issue, so we change the + * strategy to allocate some memory slices whose size is not greater than + * 64GB, and put the buckets pointer point to the slices. + */ + MERLIN_CHECK(start < end, + "initialize_buckets, start should be less than end!"); + size_t buckets_num = end - start; + const size_t total_size_of_vectors = + buckets_num * (*table)->bucket_max_size * sizeof(V) * (*table)->dim; + const size_t num_of_memory_slices = + 1 + (total_size_of_vectors - 1) / (*table)->bytes_per_slice; + size_t num_of_buckets_in_one_slice = + (*table)->bytes_per_slice / + ((*table)->bucket_max_size * sizeof(V) * (*table)->dim); + size_t num_of_allocated_buckets = 0; + + realloc_managed( + &((*table)->slices), (*table)->num_of_memory_slices * sizeof(V*), + ((*table)->num_of_memory_slices + num_of_memory_slices) * sizeof(V*)); + + for (size_t i = (*table)->num_of_memory_slices; + i < (*table)->num_of_memory_slices + num_of_memory_slices; i++) { + if (i == (*table)->num_of_memory_slices + num_of_memory_slices - 1) { + num_of_buckets_in_one_slice = buckets_num - num_of_allocated_buckets; + } + size_t slice_real_size = num_of_buckets_in_one_slice * + (*table)->bucket_max_size * sizeof(V) * + (*table)->dim; + if ((*table)->remaining_hbm_for_vectors >= slice_real_size) { + CUDA_CHECK(cudaMalloc(&((*table)->slices[i]), slice_real_size)); + (*table)->remaining_hbm_for_vectors -= slice_real_size; + } else { + (*table)->is_pure_hbm = false; + CUDA_CHECK( + cudaMallocHost(&((*table)->slices[i]), slice_real_size, + cudaHostAllocMapped | cudaHostAllocWriteCombined)); + } + for (int j = 0; j < num_of_buckets_in_one_slice; j++) { + (*table)->buckets[start + num_of_allocated_buckets + j].vectors = + (*table)->slices[i] + j * (*table)->bucket_max_size * (*table)->dim; + } + num_of_allocated_buckets += num_of_buckets_in_one_slice; + } + + (*table)->num_of_memory_slices += num_of_memory_slices; + for (int i = start; i < end; i++) { + CUDA_CHECK(cudaMalloc(&((*table)->buckets[i].keys_), + (*table)->bucket_max_size * sizeof(AtomicKey))); + CUDA_CHECK(cudaMalloc(&((*table)->buckets[i].metas_), + (*table)->bucket_max_size * sizeof(AtomicMeta))); + } + + { + const size_t block_size = 512; + const size_t N = end - start + 1; + const int grid_size = SAFE_GET_GRID_SIZE(N, block_size); + create_locks<<>>((*table)->locks, start, end); + } + + { + const size_t block_size = 512; + const size_t N = end - start + 1; + const int grid_size = SAFE_GET_GRID_SIZE(N, block_size); + create_atomic_keys<<>>( + (*table)->buckets, start, end, (*table)->bucket_max_size); + } + + { + const size_t block_size = 512; + const size_t N = end - start + 1; + const int grid_size = SAFE_GET_GRID_SIZE(N, block_size); + create_atomic_metas<<>>( + (*table)->buckets, start, end, (*table)->bucket_max_size); + } + CudaCheckError(); +} + +template +size_t get_slice_size(Table** table) { + const size_t min_slice_size = + (*table)->bucket_max_size * sizeof(V) * (*table)->dim; + const size_t max_table_size = (*table)->max_size * sizeof(V) * (*table)->dim; + size_t slice_size = 0; + + if (max_table_size >= GB(16)) { + slice_size = GB(2); + } else if (max_table_size >= GB(2)) { + slice_size = MB(128); + } else if (max_table_size >= MB(128)) { + slice_size = MB(16); + } else if 
(max_table_size >= MB(16)) { + slice_size = MB(1); + } else { + slice_size = min_slice_size; + } + + return std::max(min_slice_size, slice_size); +} + +/* Initialize a Table struct. + + K: The key type + V: The value type which should be static array type and C++ class + with customized construct is not supported. + M: The meta type, the meta will be used to store the timestamp + or occurrence frequency or any thing for eviction. + DIM: Vector dimension. +*/ +template +void create_table(Table** table, const size_t dim, + const size_t init_size = 134217728, + const size_t max_size = std::numeric_limits::max(), + const size_t max_hbm_for_vectors = 0, + const size_t bucket_max_size = 128, + const size_t tile_size = 32, const bool primary = true) { + CUDA_CHECK(cudaMallocManaged((void**)table, sizeof(Table))); + CUDA_CHECK(cudaMemset(*table, 0, sizeof(Table))); + (*table)->dim = dim; + (*table)->bucket_max_size = bucket_max_size; + (*table)->max_size = std::max(init_size, max_size); + (*table)->tile_size = tile_size; + (*table)->is_pure_hbm = true; + (*table)->bytes_per_slice = get_slice_size(table); + + // The bucket number will be the minimum needed for saving memory if no + // rehash. + if ((init_size * 2) > (*table)->max_size) { + (*table)->buckets_num = + 1 + (((*table)->max_size - 1) / (*table)->bucket_max_size); + } else { + (*table)->buckets_num = 1; + while ((*table)->buckets_num * (*table)->bucket_max_size < init_size) { + (*table)->buckets_num *= 2; + } + } + + (*table)->capacity = (*table)->buckets_num * (*table)->bucket_max_size; + (*table)->max_hbm_for_vectors = max_hbm_for_vectors; + (*table)->remaining_hbm_for_vectors = max_hbm_for_vectors; + (*table)->primary = primary; + + CUDA_CHECK(cudaMalloc((void**)&((*table)->locks), + (*table)->buckets_num * sizeof(Mutex))); + CUDA_CHECK( + cudaMemset((*table)->locks, 0, (*table)->buckets_num * sizeof(Mutex))); + + CUDA_CHECK(cudaMalloc((void**)&((*table)->buckets_size), + (*table)->buckets_num * sizeof(int))); + CUDA_CHECK(cudaMemset((*table)->buckets_size, 0, + (*table)->buckets_num * sizeof(int))); + + CUDA_CHECK( + cudaMallocManaged((void**)&((*table)->buckets), + (*table)->buckets_num * sizeof(Bucket))); + CUDA_CHECK(cudaMemset((*table)->buckets, 0, + (*table)->buckets_num * sizeof(Bucket))); + + initialize_buckets(table, 0, (*table)->buckets_num); + CudaCheckError(); +} + +/* Double the capacity on storage, must be followed by calling the + * rehash_kernel. */ +template +void double_capacity(Table** table) { + realloc(&((*table)->locks), (*table)->buckets_num * sizeof(Mutex), + (*table)->buckets_num * sizeof(Mutex) * 2); + realloc(&((*table)->buckets_size), (*table)->buckets_num * sizeof(int), + (*table)->buckets_num * sizeof(int) * 2); + + realloc_managed*>( + &((*table)->buckets), (*table)->buckets_num * sizeof(Bucket), + (*table)->buckets_num * sizeof(Bucket) * 2); + + initialize_buckets(table, (*table)->buckets_num, + (*table)->buckets_num * 2); + + (*table)->capacity *= 2; + (*table)->buckets_num *= 2; +} + +/* free all of the resource of a Table. 
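+
+   It releases, in order: the per-bucket key/meta arrays, the value slices
+   (cudaFree or cudaFreeHost depending on where each slice lives), the bucket
+   locks, and the Table struct itself. A minimal pairing sketch (illustrative
+   only; `K`, `V`, `M` and the dim value are placeholders, not taken from this
+   patch):
+
+     Table<K, V, M>* table = nullptr;
+     create_table(&table, /*dim=*/64);   // template args deduced from `table`
+     // ... launch upsert / lookup kernels against the table ...
+     destroy_table(&table);              // frees HBM and pinned host slices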
*/ +template +void destroy_table(Table** table) { + for (int i = 0; i < (*table)->buckets_num; i++) { + CUDA_CHECK(cudaFree((*table)->buckets[i].keys_)); + CUDA_CHECK(cudaFree((*table)->buckets[i].metas_)); + } + + for (int i = 0; i < (*table)->num_of_memory_slices; i++) { + if (is_on_device((*table)->slices[i])) { + CUDA_CHECK(cudaFree((*table)->slices[i])); + } else { + CUDA_CHECK(cudaFreeHost((*table)->slices[i])); + } + } + { + const size_t block_size = 512; + const size_t N = (*table)->buckets_num; + const int grid_size = SAFE_GET_GRID_SIZE(N, block_size); + release_locks + <<>>((*table)->locks, 0, (*table)->buckets_num); + } + CUDA_CHECK(cudaFree((*table)->slices)); + CUDA_CHECK(cudaFree((*table)->buckets_size)); + CUDA_CHECK(cudaFree((*table)->buckets)); + CUDA_CHECK(cudaFree((*table)->locks)); + CUDA_CHECK(cudaFree(*table)); + CUDA_CHECK(cudaDeviceSynchronize()); + CudaCheckError(); +} + +template +__forceinline__ __device__ void defragmentation_for_rehash( + Bucket* __restrict bucket, uint32_t remove_pos, + const size_t bucket_max_size, const size_t buckets_num, const size_t dim) { + uint32_t key_idx; + size_t global_idx = 0; + size_t start_idx = 0; + K find_key; + K hashed_key; + + uint32_t empty_pos = remove_pos; + + int i = 1; + while (i < bucket_max_size) { + key_idx = (remove_pos + i) & (bucket_max_size - 1); + find_key = (bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed); + if (find_key == static_cast(EMPTY_KEY)) { + break; + } + hashed_key = Murmur3HashDevice(find_key); + global_idx = hashed_key % (buckets_num * bucket_max_size); + start_idx = global_idx % bucket_max_size; + + if ((start_idx <= empty_pos && empty_pos < key_idx) || + (key_idx < start_idx && start_idx <= empty_pos) || + (empty_pos <= key_idx && key_idx < start_idx)) { + const K key = + (*(bucket->keys(key_idx))).load(cuda::std::memory_order_relaxed); + (*(bucket->keys(empty_pos))).store(key, cuda::std::memory_order_relaxed); + const M meta = + (*(bucket->metas(key_idx))).load(cuda::std::memory_order_relaxed); + (*(bucket->metas(empty_pos))) + .store(meta, cuda::std::memory_order_relaxed); + for (int j = 0; j < dim; j++) { + bucket->vectors[empty_pos * dim + j] = + bucket->vectors[key_idx * dim + j]; + } + (*(bucket->keys(key_idx))) + .store(static_cast(EMPTY_KEY), cuda::std::memory_order_relaxed); + empty_pos = key_idx; + remove_pos = key_idx; + i = 1; + } else { + i++; + } + } +} + +template +__forceinline__ __device__ void refresh_bucket_meta( + cg::thread_block_tile g, Bucket* bucket, + const size_t bucket_max_size) { + M min_val = MAX_META; + int min_pos = 0; + + for (int i = g.thread_rank(); i < bucket_max_size; i += TILE_SIZE) { + const K key = (bucket->keys(i))->load(cuda::std::memory_order_relaxed); + if (key == static_cast(EMPTY_KEY) || + key == static_cast(RECLAIM_KEY)) { + continue; + } + const M meta = bucket->metas(i)->load(cuda::std::memory_order_relaxed); + if (meta < min_val) { + min_pos = i; + min_val = meta; + } + } + M global_min_val = cg::reduce(g, min_val, cg::less()); + if (min_val == global_min_val) { + bucket->min_pos.store(min_pos, cuda::std::memory_order_relaxed); + bucket->min_meta.store(min_val, cuda::std::memory_order_relaxed); + } +} + +template +__device__ __forceinline__ void copy_vector( + cg::thread_block_tile const& g, const V* src, V* dst, + const size_t dim) { + for (auto i = g.thread_rank(); i < dim; i += g.size()) { + dst[i] = src[i]; + } + + // cuda::barrier bar; + // init(&bar, 1); + // cuda::memcpy_async(g, dst, src, dim * sizeof(V), bar); + // + // 
bar.arrive_and_wait(); +} + +/* Write the N data from src to each address in *dst by using CPU threads, + * usually called by upsert kernel. + * + * @note: In some machines with AMD CPUs, the `write_kernel` has low performance + * thru PCI-E, so we try to use the `memcpy` on CPU threads for writing work to + * reach better performance. + */ +template +void write_by_cpu(V** __restrict dst, const V* __restrict src, + const int* __restrict offset, size_t dim, int N, + int n_worker = 16) { + std::vector thds; + if (n_worker < 1) n_worker = 1; + + auto functor = [dim](V** __restrict dst, const V* __restrict src, + const int* __restrict offset, int handled_size, + int trunk_size) -> void { + for (int i = handled_size; i < handled_size + trunk_size; i++) { + if (dst[i] != nullptr) { + memcpy(dst[i], src + offset[i] * dim, sizeof(V) * dim); + } + } + }; + + int32_t trunk_size_floor = N / n_worker; + int32_t trunk_size_remain = N % n_worker; + int32_t n_worker_used = trunk_size_floor == 0 ? trunk_size_remain : n_worker; + + size_t handled_size = 0; + for (int i = 0; i < n_worker_used; i++) { + int32_t cur_trunk_size = trunk_size_floor; + if (trunk_size_remain != 0) { + cur_trunk_size += 1; + trunk_size_remain--; + } + thds.push_back( + std::thread(functor, dst, src, offset, handled_size, cur_trunk_size)); + handled_size += cur_trunk_size; + } + + for (int i = 0; i < n_worker_used; i++) { + thds[i].join(); + } +} + +template +__forceinline__ __device__ void move_key_to_new_bucket( + cg::thread_block_tile g, int rank, const K& key, const M& meta, + const V* __restrict vector, Bucket* __restrict new_bucket, + const size_t new_bkt_idx, const size_t new_start_idx, + int* __restrict buckets_size, const size_t bucket_max_size, + const size_t buckets_num, const size_t dim) { + uint32_t key_pos; + unsigned empty_vote; + int local_size; + int src_lane; + + for (uint32_t tile_offset = 0; tile_offset < bucket_max_size; + tile_offset += TILE_SIZE) { + size_t key_offset = + (new_start_idx + tile_offset + rank) & (bucket_max_size - 1); + const K current_key = + (*(new_bucket->keys(key_offset))).load(cuda::std::memory_order_relaxed); + empty_vote = g.ballot(current_key == static_cast(EMPTY_KEY)); + if (empty_vote) { + src_lane = __ffs(empty_vote) - 1; + key_pos = + (new_start_idx + tile_offset + src_lane) & (bucket_max_size - 1); + local_size = buckets_size[new_bkt_idx]; + if (rank == src_lane) { + new_bucket->keys(key_pos)->store(key, cuda::std::memory_order_relaxed); + new_bucket->metas(key_pos)->store(meta, + cuda::std::memory_order_relaxed); + atomicAdd(&(buckets_size[new_bkt_idx]), 1); + } + local_size = g.shfl(local_size, src_lane); + if (local_size >= bucket_max_size) { + refresh_bucket_meta(g, new_bucket, bucket_max_size); + } + copy_vector(g, vector, new_bucket->vectors + key_pos * dim, + dim); + break; + } + } +} + +template +__global__ void rehash_kernel_for_fast_mode( + const Table* __restrict table, size_t N) { + Bucket* buckets = table->buckets; + int* __restrict buckets_size = table->buckets_size; + const size_t bucket_max_size = table->bucket_max_size; + const size_t buckets_num = table->buckets_num; + const size_t dim = table->dim; + + auto g = cg::tiled_partition(cg::this_thread_block()); + int rank = g.thread_rank(); + size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; + size_t global_idx; + uint32_t start_idx = 0; + K target_key = 0; + M target_meta = 0; + + for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { + uint32_t bkt_idx = t / TILE_SIZE; + Bucket* bucket = (buckets + bkt_idx); 
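+
+    // The loop below runs after the capacity has been doubled (see
+    // double_capacity()): under the bucket lock, every live key is re-hashed
+    // against the enlarged table, keys whose new home bucket differs are
+    // moved there via move_key_to_new_bucket(), and the hole left behind is
+    // closed by defragmentation_for_rehash() so probe chains stay contiguous;
+    // keys that still map to this bucket are left in place.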
+ + lock(g, table->locks[bkt_idx]); + uint32_t key_idx = 0; + while (key_idx < bucket_max_size) { + key_idx = g.shfl(key_idx, 0); + target_key = + (bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed); + target_meta = + bucket->metas(key_idx)->load(cuda::std::memory_order_relaxed); + if (target_key != static_cast(EMPTY_KEY) && + target_key != static_cast(RECLAIM_KEY)) { + K hashed_key = Murmur3HashDevice(target_key); + global_idx = hashed_key % (buckets_num * bucket_max_size); + uint32_t new_bkt_idx = global_idx / bucket_max_size; + if (new_bkt_idx != bkt_idx) { + start_idx = global_idx % bucket_max_size; + move_key_to_new_bucket( + g, rank, target_key, target_meta, + (bucket->vectors + key_idx * dim), buckets + new_bkt_idx, + new_bkt_idx, start_idx, buckets_size, bucket_max_size, + buckets_num, table->dim); + if (rank == 0) { + (bucket->keys(key_idx)) + ->store(static_cast(EMPTY_KEY), + cuda::std::memory_order_relaxed); + atomicSub(&(buckets_size[bkt_idx]), 1); + defragmentation_for_rehash( + bucket, key_idx, bucket_max_size, buckets_num / 2, dim); + key_idx = 0; + } + } else { + key_idx++; + } + } else { + key_idx++; + } + } + unlock(g, table->locks[bkt_idx]); + } +} + +/* Write the N data from src to each address in *dst, + usually called by upsert kernel. + + `src`: A continuous memory pointer with Vector + which can be HBM. + `dst`: A pointer of pointer to V which should be on HBM, + but each value (a pointer of V) could point to a + memory on HBM or HMEM. + `N`: Number of vectors that need to be written. +*/ +template +__global__ void write_kernel(const V* __restrict src, V** __restrict dst, + const int* __restrict src_offset, const size_t dim, + const size_t N) { + size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; + + for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { + int vec_index = int(t / dim); + int dim_index = t % dim; + + if (dst[vec_index] != nullptr) { + if (src_offset != nullptr) { + dst[vec_index][dim_index] = + src[src_offset[vec_index] * dim + dim_index]; + } else { + dst[vec_index][dim_index] = src[vec_index * dim + dim_index]; + } + } + } +} + +/* Write the values of delta_or_val into the table. If the key[i] is already in + the table indicted be @exists[i], a @delta_or_val[i] will be added to the the + existing value. if the key not exists, the value @val_or_delta[i] will be + assigned to the address @dst[i]. + + `delta_or_val`: will be treated as val and accumlating should be executed. + `dst`: A pointer of pointer to V which should be on HBM, + but each value (a pointer of V) could point to a + memory on HBM or HMEM. + `existed`: If the keys existed before this kernel is executed. + `status`: The existence status for each key when the kernel is being + executed. + + `N`: number of vectors needed to be writen. 
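+
+  Illustration (indices shown without the src_offset indirection): a slot is
+  written only when `existed[i] == status[i]`, i.e. the key's presence did not
+  change between the host-side check and kernel execution; then
+    status[i] == true   ->  dst[i][d] += delta_or_val[i * dim + d]   (accumulate)
+    status[i] == false  ->  dst[i][d]  = delta_or_val[i * dim + d]   (assign)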
+*/ +template +__global__ void write_with_accum_kernel(const V* __restrict delta_or_val, + V** __restrict dst, + const bool* __restrict existed, + const bool* __restrict status, + const int* __restrict src_offset, + const size_t dim, size_t N) { + size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; + + for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { + int vec_index = int(t / dim); + int dim_index = t % dim; + + if (dst[vec_index] != nullptr && + existed[src_offset[vec_index]] == status[src_offset[vec_index]]) { + if (status[src_offset[vec_index]]) { + dst[vec_index][dim_index] += + delta_or_val[src_offset[vec_index] * dim + dim_index]; + } else { + dst[vec_index][dim_index] = + delta_or_val[src_offset[vec_index] * dim + dim_index]; + } + } + } +} + +/* Add a @delta[i] to the the value saved in the address @dst[i]. + + `delta`: a delta value which should be add to. + `dst`: A pointer of pointer to V which should be on HBM, + but each value (a pointer of V) could point to a + memory on HBM or HMEM. + `N`: number of vectors needed to be writen. +*/ +template +__global__ void write_with_accum_kernel(const V* __restrict delta, + V** __restrict dst, + const int* __restrict src_offset, + const size_t dim, size_t N) { + size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; + + for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { + int vec_index = int(t / dim); + int dim_index = t % dim; + + if (dst[vec_index] != nullptr) { + dst[vec_index][dim_index] += + delta[src_offset[vec_index] * dim + dim_index]; + } + } +} + +/* Read the N data from src to each address in *dst, + usually called by upsert kernel. + + `src`: A pointer of pointer of V which should be on HBM, + but each value (a pointer of V) could point to a + memory on HBM or HMEM. + `dst`: A continue memory pointer with Vector + which should be HBM. + `mask`: One for each `dst`. If true, reading from src, + or false reading from default_val. + `default_val`: Default value with shape (1, DIM) or (N, DIM) + `N`: The number of vectors needed to be read. + 'full_size_default': + If true, the d_def_val will be treated as + a full size default value which shape must be (N, DIM). +*/ +template +__global__ void read_kernel(const V* const* __restrict src, V* __restrict dst, + const bool* mask, const int* __restrict dst_offset, + const size_t dim, size_t N) { + size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; + + for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { + int vec_index = int(t / dim); + int dim_index = t % dim; + int real_dst_offset = + dst_offset != nullptr ? dst_offset[vec_index] : vec_index; + + /// Copy selected values and fill in default value for all others. + if (mask[real_dst_offset] && src[vec_index] != nullptr) { + dst[real_dst_offset * dim + dim_index] = src[vec_index][dim_index]; + } + } +} + +/* Read the N data from src to each address in *dst, + * usually called by upsert kernel. + * + * `src`: A pointer of pointer of V which should be on HBM, + * but each value (a pointer of V) could point to a + * memory on HBM or HMEM. + * `dst`: A continue memory pointer with Vector + * which should be HBM. + * `N`: Number of vectors needed to be read. + */ +template +__global__ void read_kernel(const V** __restrict src, V* __restrict dst, + const int* __restrict dst_offset, const size_t dim, + const size_t N) { + size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; + + for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { + int vec_index = int(t / dim); + int real_dst_offset = + dst_offset != nullptr ? 
dst_offset[vec_index] : vec_index; + int dim_index = t % dim; + if (src[vec_index] != nullptr) { + dst[real_dst_offset * dim + dim_index] = src[vec_index * dim + dim_index]; + } + } +} + +template +__device__ __forceinline__ unsigned find_in_bucket( + cg::thread_block_tile g, Bucket* bucket, + const K& find_key, uint32_t& tile_offset, const uint32_t& start_idx, + const size_t& bucket_max_size) { + uint32_t key_pos = 0; + +#pragma unroll + for (tile_offset = 0; tile_offset < bucket_max_size; + tile_offset += TILE_SIZE) { + key_pos = + (start_idx + tile_offset + g.thread_rank()) & (bucket_max_size - 1); + auto const current_key = + bucket->keys(key_pos)->load(cuda::std::memory_order_relaxed); + auto const found_vote = g.ballot(find_key == current_key); + if (found_vote) { + return found_vote; + } + + if (g.any(current_key == static_cast(EMPTY_KEY))) { + return 0; + } + } + return 0; +} + +template +__device__ __forceinline__ OccupyResult find_without_lock( + cg::thread_block_tile g, Bucket* __restrict__ bucket, + const K desired_key, const size_t start_idx, int& key_pos, int& src_lane, + const size_t bucket_max_size) { + K expected_key = static_cast(EMPTY_KEY); + + AtomicKey* current_key; + + unsigned vote = 0; + + for (uint32_t tile_offset = 0; tile_offset < bucket_max_size; + tile_offset += TILE_SIZE) { + key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size; + + current_key = bucket->keys(key_pos); + + expected_key = current_key->load(cuda::std::memory_order_relaxed); + vote = g.ballot(desired_key == expected_key); + if (vote) { + src_lane = __ffs(vote) - 1; + key_pos = g.shfl(key_pos, src_lane); + return OccupyResult::DUPLICATE; + } + vote = g.ballot(expected_key == static_cast(EMPTY_KEY)); + if (vote) break; + } + return OccupyResult::CONTINUE; +} + +template +__device__ __inline__ OccupyResult find_and_lock_when_vacant( + cg::thread_block_tile g, Bucket* __restrict__ bucket, + const K desired_key, const M desired_meta, K& evicted_key, + const size_t start_idx, int& key_pos, int& src_lane, + const size_t bucket_max_size) { + K expected_key = static_cast(EMPTY_KEY); + + AtomicKey* current_key; + AtomicMeta* current_meta; + + K local_min_meta_key = static_cast(EMPTY_KEY); + + M local_min_meta_val = MAX_META; + M temp_min_meta_val = MAX_META; + int local_min_meta_pos = -1; + + unsigned vote = 0; + bool result = false; + + for (uint32_t tile_offset = 0; tile_offset < bucket_max_size; + tile_offset += TILE_SIZE) { + key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size; + + current_key = bucket->keys(key_pos); + + // Step 1: try find and lock the desired_key. + do { + expected_key = desired_key; + result = current_key->compare_exchange_strong( + expected_key, static_cast(LOCKED_KEY), + cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed); + vote = g.ballot(result); + if (vote) { + src_lane = __ffs(vote) - 1; + key_pos = g.shfl(key_pos, src_lane); + return OccupyResult::DUPLICATE; + } + vote = g.ballot(expected_key == static_cast(EMPTY_KEY)); + if (vote) break; + vote = g.ballot(expected_key == static_cast(LOCKED_KEY)); + } while (vote != 0); + + // Step 2: (TBD)try find empty location. 
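+    //
+    // Each lane that observed EMPTY_KEY is tried in ballot order: the chosen
+    // lane CASes EMPTY_KEY -> LOCKED_KEY and, on success, the whole tile
+    // returns OCCUPIED_EMPTY with key_pos broadcast from that lane. If the
+    // CAS loses (another tile claimed the slot first), the next candidate
+    // lane in the ballot is tried.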
+ while (vote) { + src_lane = __ffs(vote) - 1; + if (src_lane == g.thread_rank()) { + expected_key = static_cast(EMPTY_KEY); + result = current_key->compare_exchange_strong( + expected_key, static_cast(LOCKED_KEY), + cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed); + } + result = g.shfl(result, src_lane); + if (result) { + key_pos = g.shfl(key_pos, src_lane); + return OccupyResult::OCCUPIED_EMPTY; + } + vote -= ((unsigned(0x1)) << src_lane); + } + } + + for (uint32_t tile_offset = 0; tile_offset < bucket_max_size; + tile_offset += TILE_SIZE) { + key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size; + + current_meta = bucket->metas(key_pos); + + // Step 4: record min meta location. + temp_min_meta_val = current_meta->load(cuda::std::memory_order_relaxed); + if (temp_min_meta_val < local_min_meta_val) { + expected_key = + bucket->keys(key_pos)->load(cuda::std::memory_order_relaxed); + if (expected_key != static_cast(LOCKED_KEY) && + expected_key != static_cast(EMPTY_KEY)) { + local_min_meta_key = expected_key; + local_min_meta_val = temp_min_meta_val; + local_min_meta_pos = key_pos; + } + } + } + // Step 5: insert by evicting some one. + const M global_min_meta_val = + cg::reduce(g, local_min_meta_val, cg::less()); + if (desired_meta < global_min_meta_val) { + return OccupyResult::REFUSED; + } + vote = g.ballot(local_min_meta_val <= global_min_meta_val); + if (vote) { + src_lane = __ffs(vote) - 1; + result = false; + if (src_lane == g.thread_rank()) { + // TBD: Here can be compare_exchange_weak. Do benchmark. + current_key = bucket->keys(local_min_meta_pos); + current_meta = bucket->metas(local_min_meta_pos); + evicted_key = local_min_meta_key; + result = current_key->compare_exchange_strong( + local_min_meta_key, static_cast(LOCKED_KEY), + cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed); + + // Need to recover when fail. + if (result && (current_meta->load(cuda::std::memory_order_relaxed) > + global_min_meta_val)) { + current_key->store(local_min_meta_key, cuda::std::memory_order_relaxed); + result = false; + } + } + result = g.shfl(result, src_lane); + if (result) { + // Not every `evicted_key` is correct expect the `src_lane` thread. + key_pos = g.shfl(local_min_meta_pos, src_lane); + return (evicted_key == static_cast(RECLAIM_KEY)) + ? OccupyResult::OCCUPIED_RECLAIMED + : OccupyResult::EVICT; + } + } + return OccupyResult::CONTINUE; +} + +template +__device__ __forceinline__ OccupyResult find_and_lock_when_full( + cg::thread_block_tile g, Bucket* __restrict__ bucket, + const K desired_key, const M desired_meta, K& evicted_key, + const size_t start_idx, int& key_pos, int& src_lane, + const size_t bucket_max_size) { + K expected_key = static_cast(EMPTY_KEY); + + AtomicKey* current_key; + AtomicMeta* current_meta; + + K local_min_meta_key = static_cast(EMPTY_KEY); + + M local_min_meta_val = MAX_META; + M temp_min_meta_val = MAX_META; + int local_min_meta_pos = -1; + + unsigned vote = 0; + bool result = false; + + for (uint32_t tile_offset = 0; tile_offset < bucket_max_size; + tile_offset += TILE_SIZE) { + key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size; + + current_key = bucket->keys(key_pos); + + // Step 1: try find and lock the desired_key. 
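+    //
+    // Every lane CASes its slot from desired_key to LOCKED_KEY. A successful
+    // CAS means the key is already present (DUPLICATE, with key_pos/src_lane
+    // broadcast to the tile); observing LOCKED_KEY means another tile holds
+    // that slot, so the tile retries. Unlike find_and_lock_when_vacant there
+    // is no EMPTY_KEY early-out, because the bucket is known to be full here.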
+ do { + expected_key = desired_key; + result = current_key->compare_exchange_strong( + expected_key, static_cast(LOCKED_KEY), + cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed); + vote = g.ballot(result); + if (vote) { + src_lane = __ffs(vote) - 1; + key_pos = g.shfl(key_pos, src_lane); + return OccupyResult::DUPLICATE; + } + vote = g.ballot(expected_key == static_cast(LOCKED_KEY)); + } while (vote != 0); + } + + for (uint32_t tile_offset = 0; tile_offset < bucket_max_size; + tile_offset += TILE_SIZE) { + key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size; + + // Step 2: record min meta location. + temp_min_meta_val = + bucket->metas(key_pos)->load(cuda::std::memory_order_relaxed); + if (temp_min_meta_val < local_min_meta_val) { + while ((expected_key = bucket->keys(key_pos)->load( + cuda::std::memory_order_relaxed)) == + static_cast(LOCKED_KEY)) + ; + local_min_meta_key = expected_key; + local_min_meta_val = temp_min_meta_val; + local_min_meta_pos = key_pos; + } + } + + // Step 3: insert by evicting some one. + const M global_min_meta_val = + cg::reduce(g, local_min_meta_val, cg::less()); + if (desired_meta < global_min_meta_val) { + return OccupyResult::REFUSED; + } + vote = g.ballot(local_min_meta_val <= global_min_meta_val); + if (vote) { + src_lane = __ffs(vote) - 1; + result = false; + if (src_lane == g.thread_rank()) { + // TBD: Here can be compare_exchange_weak. Do benchmark. + current_key = bucket->keys(local_min_meta_pos); + current_meta = bucket->metas(local_min_meta_pos); + evicted_key = local_min_meta_key; + result = current_key->compare_exchange_strong( + local_min_meta_key, static_cast(LOCKED_KEY), + cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed); + + // Need to recover when fail. + if (result && (current_meta->load(cuda::std::memory_order_relaxed) > + global_min_meta_val)) { + current_key->store(local_min_meta_key, cuda::std::memory_order_relaxed); + result = false; + } + } + result = g.shfl(result, src_lane); + if (result) { + // Not every `evicted_key` is correct expect the `src_lane` thread. + key_pos = g.shfl(local_min_meta_pos, src_lane); + return (evicted_key == static_cast(RECLAIM_KEY)) + ? OccupyResult::OCCUPIED_RECLAIMED + : OccupyResult::EVICT; + } + } + return OccupyResult::CONTINUE; +} + +template +__device__ __forceinline__ OccupyResult find_and_lock_for_update( + cg::thread_block_tile g, Bucket* __restrict__ bucket, + const K desired_key, const size_t start_idx, int& key_pos, int& src_lane, + const size_t bucket_max_size) { + K expected_key = static_cast(EMPTY_KEY); + + AtomicKey* current_key; + + unsigned vote = 0; + bool result = false; + + for (uint32_t tile_offset = 0; tile_offset < bucket_max_size; + tile_offset += TILE_SIZE) { + key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size; + + current_key = bucket->keys(key_pos); + + // Step 1: try find and lock the desired_key. 
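+    //
+    // Update-only variant: if any lane observes EMPTY_KEY the probe chain is
+    // exhausted and the key cannot be present, so the tile returns REFUSED
+    // instead of claiming the empty slot; only an existing key is locked
+    // (DUPLICATE) and handed back for an in-place update.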
+ do { + expected_key = desired_key; + result = current_key->compare_exchange_strong( + expected_key, static_cast(LOCKED_KEY), + cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed); + vote = g.ballot(result); + if (vote) { + src_lane = __ffs(vote) - 1; + key_pos = g.shfl(key_pos, src_lane); + return OccupyResult::DUPLICATE; + } + vote = g.ballot(expected_key == static_cast(EMPTY_KEY)); + if (vote) return OccupyResult::REFUSED; + vote = g.ballot(expected_key == static_cast(LOCKED_KEY)); + } while (vote != 0); + } + return OccupyResult::REFUSED; +} + +template +__forceinline__ __device__ Bucket* get_key_position( + Bucket* __restrict buckets, const K key, size_t& bkt_idx, + size_t& start_idx, const size_t buckets_num, const size_t bucket_max_size) { + const uint32_t hashed_key = Murmur3HashDevice(key); + const size_t global_idx = hashed_key % (buckets_num * bucket_max_size); + bkt_idx = global_idx / bucket_max_size; + start_idx = global_idx % bucket_max_size; + return buckets + bkt_idx; +} + +template +__forceinline__ __device__ void update_meta(Bucket* __restrict bucket, + const int key_pos, + const M* __restrict metas, + const int key_idx) { + if (metas == nullptr) { + M cur_meta = + bucket->cur_meta.fetch_add(1, cuda::std::memory_order_relaxed) + 1; + bucket->metas(key_pos)->store(cur_meta, cuda::std::memory_order_relaxed); + } else { + bucket->metas(key_pos)->store(metas[key_idx], + cuda::std::memory_order_relaxed); + } + return; +} + +template +__global__ void upsert_kernel_with_io_core( + const Table* __restrict table, const size_t bucket_max_size, + const size_t buckets_num, const size_t dim, const K* __restrict keys, + const V* __restrict values, const M* __restrict metas, size_t N) { + auto g = cg::tiled_partition(cg::this_thread_block()); + int* buckets_size = table->buckets_size; + + for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; + t += blockDim.x * gridDim.x) { + int key_pos = -1; + size_t key_idx = t / TILE_SIZE; + + const K insert_key = keys[key_idx]; + + if (IS_RESERVED_KEY(insert_key)) continue; + + const M insert_meta = + metas != nullptr ? 
metas[key_idx] : static_cast(MAX_META); + const V* insert_value = values + key_idx * dim; + + size_t bkt_idx = 0; + size_t start_idx = 0; + int src_lane = -1; + K evicted_key; + + Bucket* bucket = + get_key_position(table->buckets, insert_key, bkt_idx, start_idx, + buckets_num, bucket_max_size); + + OccupyResult occupy_result{OccupyResult::INITIAL}; + const int bucket_size = buckets_size[bkt_idx]; + do { + if (bucket_size < bucket_max_size) { + occupy_result = find_and_lock_when_vacant( + g, bucket, insert_key, insert_meta, evicted_key, start_idx, key_pos, + src_lane, bucket_max_size); + } else { + start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; + occupy_result = find_and_lock_when_full( + g, bucket, insert_key, insert_meta, evicted_key, start_idx, key_pos, + src_lane, bucket_max_size); + } + + occupy_result = g.shfl(occupy_result, src_lane); + } while (occupy_result == OccupyResult::CONTINUE); + + if (occupy_result == OccupyResult::REFUSED) continue; + + if ((occupy_result == OccupyResult::OCCUPIED_EMPTY || + occupy_result == OccupyResult::OCCUPIED_RECLAIMED) && + g.thread_rank() == src_lane) { + atomicAdd(&(buckets_size[bkt_idx]), 1); + } + + copy_vector(g, insert_value, bucket->vectors + key_pos * dim, + dim); + if (g.thread_rank() == src_lane) { + update_meta(bucket, key_pos, metas, key_idx); + (bucket->keys(key_pos)) + ->store(insert_key, cuda::std::memory_order_relaxed); + } + } +} + +template +__global__ void upsert_and_evict_kernel_with_io_core( + const Table* __restrict table, const size_t bucket_max_size, + const size_t buckets_num, const size_t dim, const K* __restrict keys, + const V* __restrict values, const M* __restrict metas, + K* __restrict evicted_keys, V* __restrict evicted_values, + M* __restrict evicted_metas, size_t N) { + auto g = cg::tiled_partition(cg::this_thread_block()); + int* buckets_size = table->buckets_size; + + for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; + t += blockDim.x * gridDim.x) { + int key_pos = -1; + const size_t key_idx = t / TILE_SIZE; + + const K insert_key = keys[key_idx]; + + if (IS_RESERVED_KEY(insert_key)) continue; + + const M insert_meta = + metas != nullptr ? 
metas[key_idx] : static_cast(MAX_META); + const V* insert_value = values + key_idx * dim; + + size_t bkt_idx = 0; + size_t start_idx = 0; + int src_lane = -1; + K evicted_key; + + Bucket* bucket = + get_key_position(table->buckets, insert_key, bkt_idx, start_idx, + buckets_num, bucket_max_size); + + OccupyResult occupy_result{OccupyResult::INITIAL}; + const int bucket_size = buckets_size[bkt_idx]; + do { + if (bucket_size < bucket_max_size) { + occupy_result = find_and_lock_when_vacant( + g, bucket, insert_key, insert_meta, evicted_key, start_idx, key_pos, + src_lane, bucket_max_size); + } else { + start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; + occupy_result = find_and_lock_when_full( + g, bucket, insert_key, insert_meta, evicted_key, start_idx, key_pos, + src_lane, bucket_max_size); + } + occupy_result = g.shfl(occupy_result, src_lane); + } while (occupy_result == OccupyResult::CONTINUE); + + if (occupy_result == OccupyResult::REFUSED) { + copy_vector(g, insert_value, evicted_values + key_idx * dim, + dim); + continue; + } + + if ((occupy_result == OccupyResult::OCCUPIED_EMPTY || + occupy_result == OccupyResult::OCCUPIED_RECLAIMED) && + g.thread_rank() == src_lane) { + atomicAdd(&(buckets_size[bkt_idx]), 1); + } + + if (occupy_result == OccupyResult::EVICT) { + if (g.thread_rank() == src_lane) { + evicted_keys[key_idx] = evicted_key; + } + if (metas != nullptr) { + evicted_metas[key_idx] = metas[key_idx]; + } + copy_vector(g, bucket->vectors + key_pos * dim, + evicted_values + key_idx * dim, dim); + } + + copy_vector(g, insert_value, bucket->vectors + key_pos * dim, + dim); + if (g.thread_rank() == src_lane) { + update_meta(bucket, key_pos, metas, key_idx); + (bucket->keys(key_pos)) + ->store(insert_key, cuda::std::memory_order_relaxed); + } + } +} + +template +struct SelectUpsertKernelWithIO { + static void execute_kernel(const float& load_factor, const int& block_size, + const size_t bucket_max_size, + const size_t buckets_num, const size_t dim, + cudaStream_t& stream, const size_t& n, + const Table* __restrict table, + const K* __restrict keys, + const V* __restrict values, + const M* __restrict metas) { + if (load_factor <= 0.5) { + const unsigned int tile_size = 4; + const size_t N = n * tile_size; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + upsert_kernel_with_io_core + <<>>( + table, bucket_max_size, buckets_num, dim, keys, values, metas, N); + + } else if (load_factor <= 0.875) { + const unsigned int tile_size = 8; + const size_t N = n * tile_size; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + upsert_kernel_with_io_core + <<>>( + table, bucket_max_size, buckets_num, dim, keys, values, metas, N); + } else { + const unsigned int tile_size = 32; + const size_t N = n * tile_size; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + upsert_kernel_with_io_core + <<>>( + table, bucket_max_size, buckets_num, dim, keys, values, metas, N); + } + return; + } +}; + +template +struct SelectUpsertAndEvictKernelWithIO { + static void execute_kernel( + const float& load_factor, const int& block_size, + const size_t bucket_max_size, const size_t buckets_num, const size_t dim, + cudaStream_t& stream, const size_t& n, + const Table* __restrict table, const K* __restrict keys, + const V* __restrict values, const M* __restrict metas, + K* __restrict evicted_keys, V* __restrict evicted_values, + M* __restrict evicted_metas) { + if (load_factor <= 0.5) { + const unsigned int tile_size = 4; + const size_t N = n * tile_size; + const size_t grid_size = 
SAFE_GET_GRID_SIZE(N, block_size); + upsert_and_evict_kernel_with_io_core + <<>>( + table, bucket_max_size, buckets_num, dim, keys, values, metas, + evicted_keys, evicted_values, evicted_metas, N); + + } else if (load_factor <= 0.875) { + const unsigned int tile_size = 8; + const size_t N = n * tile_size; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + upsert_and_evict_kernel_with_io_core + <<>>( + table, bucket_max_size, buckets_num, dim, keys, values, metas, + evicted_keys, evicted_values, evicted_metas, N); + + } else { + const unsigned int tile_size = 32; + const size_t N = n * tile_size; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + upsert_and_evict_kernel_with_io_core + <<>>( + table, bucket_max_size, buckets_num, dim, keys, values, metas, + evicted_keys, evicted_values, evicted_metas, N); + } + return; + } +}; + +/* Upsert with the end-user specified meta. + */ +template +__global__ void upsert_kernel(const Table* __restrict table, + const size_t bucket_max_size, + const size_t buckets_num, const size_t dim, + const K* __restrict keys, V** __restrict vectors, + const M* __restrict metas, + int* __restrict src_offset, size_t N) { + auto g = cg::tiled_partition(cg::this_thread_block()); + int* buckets_size = table->buckets_size; + + for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; + t += blockDim.x * gridDim.x) { + int key_pos = -1; + size_t key_idx = t / TILE_SIZE; + + const K insert_key = keys[key_idx]; + if (IS_RESERVED_KEY(insert_key)) continue; + + const M insert_meta = + metas != nullptr ? metas[key_idx] : static_cast(MAX_META); + + size_t bkt_idx = 0; + size_t start_idx = 0; + int src_lane = -1; + K evicted_key; + + Bucket* bucket = + get_key_position(table->buckets, insert_key, bkt_idx, start_idx, + buckets_num, bucket_max_size); + + if (src_offset != nullptr && g.thread_rank() == 0) { + *(src_offset + key_idx) = key_idx; + } + + OccupyResult occupy_result{OccupyResult::INITIAL}; + const int bucket_size = buckets_size[bkt_idx]; + do { + if (bucket_size < bucket_max_size) { + occupy_result = find_and_lock_when_vacant( + g, bucket, insert_key, insert_meta, evicted_key, start_idx, key_pos, + src_lane, bucket_max_size); + } else { + start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; + occupy_result = find_and_lock_when_vacant( + g, bucket, insert_key, insert_meta, evicted_key, start_idx, key_pos, + src_lane, bucket_max_size); + } + + occupy_result = g.shfl(occupy_result, src_lane); + } while (occupy_result == OccupyResult::CONTINUE); + + if (occupy_result == OccupyResult::REFUSED) continue; + + if ((occupy_result == OccupyResult::OCCUPIED_EMPTY || + occupy_result == OccupyResult::OCCUPIED_RECLAIMED) && + g.thread_rank() == src_lane) { + atomicAdd(&(buckets_size[bkt_idx]), 1); + } + + if (g.thread_rank() == src_lane) { + *(vectors + key_idx) = (bucket->vectors + key_pos * dim); + update_meta(bucket, key_pos, metas, key_idx); + (bucket->keys(key_pos)) + ->store(insert_key, cuda::std::memory_order_relaxed); + } + } +} + +/* Accum kernel with customized metas. 
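+
+   The kernel resolves each key to a value pointer in `vectors` and records in
+   `status` whether the key was already present; a slot is only (re)claimed
+   when that observation matches the caller-supplied `existed[i]` flag. The
+   add-or-assign of the values themselves is expected to be done afterwards by
+   write_with_accum_kernel, which likewise only writes when
+   `existed[i] == status[i]`.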
+ */ +template +__global__ void accum_kernel( + const Table* __restrict table, const K* __restrict keys, + V** __restrict vectors, const M* __restrict metas, + const bool* __restrict existed, Bucket* __restrict buckets, + int* __restrict buckets_size, const size_t bucket_max_size, + const size_t buckets_num, int* __restrict src_offset, + bool* __restrict status, size_t N) { + const size_t dim = table->dim; + size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; + auto g = cg::tiled_partition(cg::this_thread_block()); + int rank = g.thread_rank(); + + for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { + int key_pos = -1; + int local_size = 0; + bool local_found = false; + unsigned found_or_empty_vote = 0; + + size_t key_idx = t / TILE_SIZE; + K insert_key = *(keys + key_idx); + + if (IS_RESERVED_KEY(insert_key)) continue; + + K hashed_key = Murmur3HashDevice(insert_key); + size_t global_idx = hashed_key % (buckets_num * bucket_max_size); + size_t bkt_idx = global_idx / bucket_max_size; + size_t start_idx = global_idx % bucket_max_size; + + int src_lane = -1; + + Bucket* bucket = buckets + bkt_idx; + lock(g, table->locks[bkt_idx]); + if (rank == 0 && src_offset != nullptr) { + *(src_offset + key_idx) = key_idx; + } + + for (uint32_t tile_offset = 0; tile_offset < bucket_max_size; + tile_offset += TILE_SIZE) { + size_t key_offset = + (start_idx + tile_offset + rank) & (bucket_max_size - 1); + K current_key = + bucket->keys(key_offset)->load(cuda::std::memory_order_relaxed); + found_or_empty_vote = g.ballot(current_key == static_cast(EMPTY_KEY) || + insert_key == current_key); + if (found_or_empty_vote) { + src_lane = __ffs(found_or_empty_vote) - 1; + key_pos = (start_idx + tile_offset + src_lane) & (bucket_max_size - 1); + local_size = buckets_size[bkt_idx]; + if (rank == src_lane) { + if (current_key == insert_key) { + local_found = true; + *(status + key_idx) = local_found; + } + if (local_found == existed[key_idx]) { + (bucket->keys(key_pos)) + ->store(insert_key, cuda::std::memory_order_relaxed); + if (!local_found) { + buckets_size[bkt_idx]++; + local_size++; + } + *(vectors + key_idx) = (bucket->vectors + key_pos * dim); + update_meta(bucket, key_pos, metas, key_idx); + } + } + local_size = g.shfl(local_size, src_lane); + if (local_size >= bucket_max_size) { + refresh_bucket_meta(g, bucket, bucket_max_size); + } + break; + } + } + if (!found_or_empty_vote) { + if (rank == (bucket->min_pos % TILE_SIZE)) { + key_pos = bucket->min_pos; + (bucket->keys(key_pos)) + ->store(insert_key, cuda::std::memory_order_relaxed); + *(vectors + key_idx) = (bucket->vectors + key_pos * dim); + update_meta(bucket, key_pos, metas, key_idx); + } + refresh_bucket_meta(g, bucket, bucket_max_size); + } + unlock(g, table->locks[bkt_idx]); + } +} + +/* lookup with IO operation. This kernel is + * usually used for the pure HBM mode for better performance. 
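+ *
+ * "With IO" here means the kernel copies found values straight into the
+ * caller-provided `values` buffer (copy_vector) instead of returning per-key
+ * pointers for a separate read pass; `metas` and `found` are filled for hit
+ * keys, and missed keys leave their `values` slots untouched. The launcher
+ * below (SelectLookupKernelWithIO) picks the probe tile size from the load
+ * factor: a tile of 4 lanes up to 0.75, otherwise 16.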
+ */ +template +__global__ void lookup_kernel_with_io(const Table* __restrict table, + const size_t bucket_max_size, + const size_t buckets_num, + const size_t dim, + const K* __restrict keys, + V* __restrict values, M* __restrict metas, + bool* __restrict found, size_t N) { + int* buckets_size = table->buckets_size; + Bucket* buckets = table->buckets; + + auto g = cg::tiled_partition(cg::this_thread_block()); + int rank = g.thread_rank(); + + for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; + t += blockDim.x * gridDim.x) { + int key_idx = t / TILE_SIZE; + + const K find_key = keys[key_idx]; + if (IS_RESERVED_KEY(find_key)) continue; + + V* find_value = values + key_idx * dim; + + int key_pos = -1; + int src_lane = -1; + size_t bkt_idx = 0; + size_t start_idx = 0; + + Bucket* bucket = get_key_position( + buckets, find_key, bkt_idx, start_idx, buckets_num, bucket_max_size); + + const int bucket_size = buckets_size[bkt_idx]; + if (bucket_size >= bucket_max_size) { + start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; + } + + OccupyResult occupy_result{OccupyResult::INITIAL}; + occupy_result = find_without_lock( + g, bucket, find_key, start_idx, key_pos, src_lane, bucket_max_size); + + if (occupy_result == OccupyResult::DUPLICATE) { + copy_vector(g, bucket->vectors + key_pos * dim, find_value, + dim); + if (rank == src_lane) { + if (metas != nullptr) { + *(metas + key_idx) = + bucket->metas(key_pos)->load(cuda::std::memory_order_relaxed); + } + if (found != nullptr) { + *(found + key_idx) = true; + } + } + } + } +} + +template +struct SelectLookupKernelWithIO { + static void execute_kernel(const float& load_factor, const int& block_size, + const size_t bucket_max_size, + const size_t buckets_num, const size_t dim, + cudaStream_t& stream, const size_t& n, + const Table* __restrict table, + const K* __restrict keys, V* __restrict values, + M* __restrict metas, bool* __restrict found) { + if (load_factor <= 0.75) { + const unsigned int tile_size = 4; + const size_t N = n * tile_size; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + lookup_kernel_with_io + <<>>(table, bucket_max_size, + buckets_num, dim, keys, values, + metas, found, N); + } else { + const unsigned int tile_size = 16; + const size_t N = n * tile_size; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + lookup_kernel_with_io + <<>>(table, bucket_max_size, + buckets_num, dim, keys, values, + metas, found, N); + } + return; + } +}; + +/* lookup kernel. 
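+ * Unlike the *_with_io variant above, this kernel does not copy vectors: on a
+ * hit it writes the in-table value address into values[i] (nullptr on a miss),
+ * and records dst_offset[i] = i so the caller can gather the results afterwards.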
+ */ +template +__global__ void lookup_kernel(const Table* __restrict table, + const size_t bucket_max_size, + const size_t buckets_num, const size_t dim, + const K* __restrict keys, V** __restrict values, + M* __restrict metas, bool* __restrict found, + int* __restrict dst_offset, size_t N) { + int* buckets_size = table->buckets_size; + Bucket* buckets = table->buckets; + + auto g = cg::tiled_partition(cg::this_thread_block()); + int rank = g.thread_rank(); + + for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; + t += blockDim.x * gridDim.x) { + int key_idx = t / TILE_SIZE; + + const K find_key = keys[key_idx]; + if (IS_RESERVED_KEY(find_key)) continue; + + int key_pos = -1; + int src_lane = -1; + size_t bkt_idx = 0; + size_t start_idx = 0; + + Bucket* bucket = get_key_position( + buckets, find_key, bkt_idx, start_idx, buckets_num, bucket_max_size); + + const int bucket_size = buckets_size[bkt_idx]; + if (bucket_size >= bucket_max_size) { + start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; + } + + if (dst_offset != nullptr && rank == 0) { + *(dst_offset + key_idx) = key_idx; + } + + OccupyResult occupy_result{OccupyResult::INITIAL}; + occupy_result = find_without_lock( + g, bucket, find_key, start_idx, key_pos, src_lane, bucket_max_size); + + if (occupy_result == OccupyResult::DUPLICATE) { + if (rank == src_lane) { + *(values + key_idx) = (bucket->vectors + key_pos * dim); + if (metas != nullptr) { + *(metas + key_idx) = + bucket->metas(key_pos)->load(cuda::std::memory_order_relaxed); + } + if (found != nullptr) { + *(found + key_idx) = true; + } + } + } else { + if (rank == 0) { + *(values + key_idx) = nullptr; + } + } + } +} + +/* lookup with IO operation. This kernel is + * usually used for the pure HBM mode for better performance. + */ +template +__global__ void lookup_ptr_kernel(const Table* __restrict table, + const size_t bucket_max_size, + const size_t buckets_num, const size_t dim, + const K* __restrict keys, + V** __restrict values, M* __restrict metas, + bool* __restrict found, size_t N) { + int* buckets_size = table->buckets_size; + Bucket* buckets = table->buckets; + + auto g = cg::tiled_partition(cg::this_thread_block()); + int rank = g.thread_rank(); + + for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; + t += blockDim.x * gridDim.x) { + int key_idx = t / TILE_SIZE; + + const K find_key = keys[key_idx]; + if (IS_RESERVED_KEY(find_key)) continue; + + int key_pos = -1; + int src_lane = -1; + size_t bkt_idx = 0; + size_t start_idx = 0; + + Bucket* bucket = get_key_position( + buckets, find_key, bkt_idx, start_idx, buckets_num, bucket_max_size); + + const int bucket_size = buckets_size[bkt_idx]; + if (bucket_size >= bucket_max_size) { + start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; + } + + OccupyResult occupy_result{OccupyResult::INITIAL}; + occupy_result = find_without_lock( + g, bucket, find_key, start_idx, key_pos, src_lane, bucket_max_size); + + if (occupy_result == OccupyResult::DUPLICATE) { + if (rank == src_lane) { + values[key_idx] = bucket->vectors + key_pos * dim; + if (metas != nullptr) { + *(metas + key_idx) = + bucket->metas(key_pos)->load(cuda::std::memory_order_relaxed); + } + if (found != nullptr) { + *(found + key_idx) = true; + } + } + } + } +} + +template +struct SelectLookupPtrKernel { + static void execute_kernel(const float& load_factor, const int& block_size, + const size_t bucket_max_size, + const size_t buckets_num, const size_t dim, + cudaStream_t& stream, const size_t& n, + const Table* __restrict table, + const K* 
__restrict keys, V** __restrict values, + M* __restrict metas, bool* __restrict found) { + if (load_factor <= 0.75) { + const unsigned int tile_size = 4; + const size_t N = n * tile_size; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + lookup_ptr_kernel + <<>>(table, bucket_max_size, + buckets_num, dim, keys, values, + metas, found, N); + } else { + const unsigned int tile_size = 16; + const size_t N = n * tile_size; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + lookup_ptr_kernel + <<>>(table, bucket_max_size, + buckets_num, dim, keys, values, + metas, found, N); + } + return; + } +}; + +/* Clear all key-value in the table. */ +template +__global__ void clear_kernel(Table* __restrict table, size_t N) { + size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; + const size_t bucket_max_size = table->bucket_max_size; + + for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { + int key_idx = t % bucket_max_size; + int bkt_idx = t / bucket_max_size; + Bucket* bucket = &(table->buckets[bkt_idx]); + + (bucket->keys(key_idx)) + ->store(static_cast(EMPTY_KEY), cuda::std::memory_order_relaxed); + if (key_idx == 0) { + table->buckets_size[bkt_idx] = 0; + } + } +} + +/* Remove specified keys. */ +template +__global__ void remove_kernel(const Table* __restrict table, + const K* __restrict keys, + Bucket* __restrict buckets, + int* __restrict buckets_size, + const size_t bucket_max_size, + const size_t buckets_num, size_t N) { + auto g = cg::tiled_partition(cg::this_thread_block()); + int rank = g.thread_rank(); + + for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; + t += blockDim.x * gridDim.x) { + int key_idx = t / TILE_SIZE; + K find_key = keys[key_idx]; + if (IS_RESERVED_KEY(find_key)) continue; + + int key_pos = -1; + + size_t bkt_idx = 0; + size_t start_idx = 0; + uint32_t tile_offset = 0; + + Bucket* bucket = get_key_position( + buckets, find_key, bkt_idx, start_idx, buckets_num, bucket_max_size); + + unsigned found_vote = 0; +#pragma unroll + for (tile_offset = 0; tile_offset < bucket_max_size; + tile_offset += TILE_SIZE) { + key_pos = (start_idx + tile_offset + rank) & (bucket_max_size - 1); + + const K current_key = + (bucket->keys(key_pos))->load(cuda::std::memory_order_relaxed); + + found_vote = g.ballot(find_key == current_key); + if (found_vote) { + break; + } + + if (g.any(current_key == static_cast(EMPTY_KEY))) { + break; + } + } + + if (found_vote) { + const int src_lane = __ffs(found_vote) - 1; + + if (g.thread_rank() == src_lane) { + const int key_pos = + (start_idx + tile_offset + src_lane) & (bucket_max_size - 1); + (bucket->keys(key_pos)) + ->store(static_cast(RECLAIM_KEY), + cuda::std::memory_order_relaxed); + (bucket->metas(key_pos)) + ->store(static_cast(EMPTY_META), + cuda::std::memory_order_relaxed); + atomicSub(&buckets_size[bkt_idx], 1); + } + break; + } + } +} + +/* Remove specified keys which match the Predict. 
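+ * Here each grid-stride index t addresses a whole bucket: every slot is
+ * scanned, and each key for which pred(key, meta, pattern, threshold) holds is
+ * reclaimed (key set to RECLAIM_KEY, meta to EMPTY_META), the bucket size is
+ * decremented and the global count is incremented atomically.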
*/ +template +__global__ void remove_kernel(const Table* __restrict table, + const EraseIfPredictInternal pred, + const K pattern, const M threshold, + size_t* __restrict count, + Bucket* __restrict buckets, + int* __restrict buckets_size, + const size_t bucket_max_size, + const size_t buckets_num, size_t N) { + auto g = cg::tiled_partition(cg::this_thread_block()); + + for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; + t += blockDim.x * gridDim.x) { + uint32_t bkt_idx = t; + uint32_t key_pos = 0; + + Bucket* bucket = buckets + bkt_idx; + + K current_key = 0; + M current_meta = 0; + uint32_t key_offset = 0; + while (key_offset < bucket_max_size) { + current_key = + bucket->keys(key_offset)->load(cuda::std::memory_order_relaxed); + current_meta = + bucket->metas(key_offset)->load(cuda::std::memory_order_relaxed); + if (!IS_RESERVED_KEY(current_key)) { + if (pred(current_key, current_meta, pattern, threshold)) { + atomicAdd(count, 1); + key_pos = key_offset; + (bucket->keys(key_pos)) + ->store(static_cast(RECLAIM_KEY), + cuda::std::memory_order_relaxed); + (bucket->metas(key_pos)) + ->store(static_cast(EMPTY_META), + cuda::std::memory_order_relaxed); + atomicSub(&buckets_size[bkt_idx], 1); + } else { + key_offset++; + } + } else { + key_offset++; + } + } + } +} + +/* Dump with meta. */ +template +inline std::tuple dump_kernel_shared_memory_size( + const size_t available_shared_memory) { + const size_t block_size{std::min( + available_shared_memory / 2 / sizeof(KVM), UINT64_C(1024))}; + MERLIN_CHECK( + block_size > 0, + "[HierarchicalKV] block_size <= 0, the K-V-M size may be too large!"); + + return {block_size * sizeof(KVM), block_size}; +} + +template +__global__ void dump_kernel(const Table* __restrict table, K* d_key, + V* __restrict d_val, M* __restrict d_meta, + const size_t offset, const size_t search_length, + size_t* d_dump_counter) { + extern __shared__ unsigned char s[]; + KVM* const block_tuples{reinterpret_cast*>(s)}; + + const size_t bucket_max_size{table->bucket_max_size}; + const size_t dim{table->dim}; + + __shared__ size_t block_acc; + __shared__ size_t global_acc; + + const size_t tid{blockIdx.x * blockDim.x + threadIdx.x}; + + if (threadIdx.x == 0) { + block_acc = 0; + } + __syncthreads(); + + if (tid < search_length) { + Bucket* const bucket{ + &table->buckets[(tid + offset) / bucket_max_size]}; + + const int key_idx{static_cast((tid + offset) % bucket_max_size)}; + const K key{(bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed)}; + + if (!IS_RESERVED_KEY(key)) { + size_t local_index{atomicAdd(&block_acc, 1)}; + block_tuples[local_index] = { + key, &bucket->vectors[key_idx * dim], + bucket->metas(key_idx)->load(cuda::std::memory_order_relaxed)}; + } + } + __syncthreads(); + + if (threadIdx.x == 0) { + global_acc = atomicAdd(d_dump_counter, block_acc); + } + __syncthreads(); + + if (threadIdx.x < block_acc) { + const KVM& tuple{block_tuples[threadIdx.x]}; + + const size_t j{global_acc + threadIdx.x}; + d_key[j] = tuple.key; + for (int i{0}; i < dim; ++i) { + d_val[j * dim + i] = tuple.value[i]; + } + if (d_meta != nullptr) { + d_meta[j] = tuple.meta; + } + } +} + +/* Dump with meta. 
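+ * This overload additionally filters entries with PredFunctor(key, meta,
+ * pattern, threshold). Dynamic shared memory is partitioned into per-block
+ * staging arrays of blockDim.x keys, blockDim.x * dim values and blockDim.x
+ * metas; each block then reserves a contiguous range of the output arrays via
+ * an atomicAdd on d_dump_counter before flushing its staged tuples.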
*/ +template class PredFunctor> +__global__ void dump_kernel(const Table* __restrict table, + const K pattern, const M threshold, K* d_key, + V* __restrict d_val, M* __restrict d_meta, + const size_t offset, const size_t search_length, + size_t* d_dump_counter) { + extern __shared__ unsigned char s[]; + const size_t bucket_max_size = table->bucket_max_size; + const size_t dim = table->dim; + K* smem = (K*)s; + K* block_result_key = smem; + V* block_result_val = (V*)&(smem[blockDim.x]); + M* block_result_meta = (M*)&(block_result_val[blockDim.x * dim]); + __shared__ size_t block_acc; + __shared__ size_t global_acc; + PredFunctor fn; + + const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + + if (threadIdx.x == 0) { + block_acc = 0; + } + __syncthreads(); + + if (tid < search_length) { + int bkt_idx = (tid + offset) / bucket_max_size; + int key_idx = (tid + offset) % bucket_max_size; + Bucket* bucket = &(table->buckets[bkt_idx]); + + const K key = + (bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed); + M meta = bucket->metas(key_idx)->load(cuda::std::memory_order_relaxed); + + if (key != static_cast(EMPTY_KEY) && + fn(key, meta, pattern, threshold)) { + size_t local_index = atomicAdd(&block_acc, 1); + block_result_key[local_index] = key; + for (int i = 0; i < dim; i++) { + atomicExch(&(block_result_val[local_index * dim + i]), + bucket->vectors[key_idx * dim + i]); + } + if (d_meta != nullptr) { + block_result_meta[local_index] = meta; + } + } + } + __syncthreads(); + + if (threadIdx.x == 0) { + global_acc = atomicAdd(d_dump_counter, block_acc); + } + __syncthreads(); + + if (threadIdx.x < block_acc) { + d_key[global_acc + threadIdx.x] = block_result_key[threadIdx.x]; + for (int i = 0; i < dim; i++) { + d_val[(global_acc + threadIdx.x) * dim + i] = + block_result_val[threadIdx.x * dim + i]; + } + if (d_meta != nullptr) { + d_meta[global_acc + threadIdx.x] = block_result_meta[threadIdx.x]; + } + } +} + +/* If founds[i] = true, read data from corresponding address of + * table_value_addrs and write to param_values; if founds[i] = false, write data + * from param_values to corresponding address of table_value_addrs. usually + * called by find_or_insert kernel. + */ +template +void read_or_write_by_cpu(V** __restrict table_value_addrs, + V* __restrict param_values, + const int* __restrict offset, const bool* founds, + size_t dim, int N, int n_worker = 16) { + std::vector thds; + if (n_worker < 1) n_worker = 1; + + auto functor = [founds, dim](V** __restrict table_value_addrs, + V* __restrict param_values, + const int* __restrict offset, int handled_size, + int trunk_size) -> void { + for (int i = handled_size; i < handled_size + trunk_size; i++) { + if (table_value_addrs[i] != nullptr) { + if (founds[offset[i]]) { + memcpy(param_values + offset[i] * dim, table_value_addrs[i], + sizeof(V) * dim); + } else { + memcpy(table_value_addrs[i], param_values + offset[i] * dim, + sizeof(V) * dim); + } + } + } + }; + + int32_t trunk_size_floor = N / n_worker; + int32_t trunk_size_remain = N % n_worker; + int32_t n_worker_used = trunk_size_floor == 0 ? 
trunk_size_remain : n_worker; + + size_t handled_size = 0; + for (int i = 0; i < n_worker_used; i++) { + int32_t cur_trunk_size = trunk_size_floor; + if (trunk_size_remain != 0) { + cur_trunk_size += 1; + trunk_size_remain--; + } + thds.push_back(std::thread(functor, table_value_addrs, param_values, offset, + handled_size, cur_trunk_size)); + handled_size += cur_trunk_size; + } + + for (int i = 0; i < n_worker_used; i++) { + thds[i].join(); + } +} + +/* + * find or insert with IO operation. This kernel is + * usually used for the pure HBM mode for better performance. + */ +template +__global__ void find_or_insert_kernel_with_io( + const Table* __restrict table, const size_t bucket_max_size, + const size_t buckets_num, const size_t dim, const K* __restrict keys, + V* __restrict values, M* __restrict metas, const size_t N) { + auto g = cg::tiled_partition(cg::this_thread_block()); + int* buckets_size = table->buckets_size; + + for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; + t += blockDim.x * gridDim.x) { + int key_pos = -1; + const size_t key_idx = t / TILE_SIZE; + + const K find_or_insert_key = keys[key_idx]; + + if (IS_RESERVED_KEY(find_or_insert_key)) continue; + + const M find_or_insert_meta = + metas != nullptr ? metas[key_idx] : static_cast(MAX_META); + V* find_or_insert_value = values + key_idx * dim; + + size_t bkt_idx = 0; + size_t start_idx = 0; + int src_lane = -1; + K evicted_key; + + Bucket* bucket = + get_key_position(table->buckets, find_or_insert_key, bkt_idx, + start_idx, buckets_num, bucket_max_size); + + OccupyResult occupy_result{OccupyResult::INITIAL}; + const int bucket_size = buckets_size[bkt_idx]; + do { + if (bucket_size < bucket_max_size) { + occupy_result = find_and_lock_when_vacant( + g, bucket, find_or_insert_key, find_or_insert_meta, evicted_key, + start_idx, key_pos, src_lane, bucket_max_size); + } else { + start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; + occupy_result = find_and_lock_when_full( + g, bucket, find_or_insert_key, find_or_insert_meta, evicted_key, + start_idx, key_pos, src_lane, bucket_max_size); + } + + occupy_result = g.shfl(occupy_result, src_lane); + } while (occupy_result == OccupyResult::CONTINUE); + + if (occupy_result == OccupyResult::REFUSED) continue; + + if ((occupy_result == OccupyResult::OCCUPIED_EMPTY || + occupy_result == OccupyResult::OCCUPIED_RECLAIMED) && + g.thread_rank() == src_lane) { + atomicAdd(&(buckets_size[bkt_idx]), 1); + } + + if (occupy_result == OccupyResult::DUPLICATE) { + copy_vector(g, bucket->vectors + key_pos * dim, + find_or_insert_value, dim); + if (metas != nullptr && g.thread_rank() == src_lane) { + *(metas + key_idx) = + bucket->metas(key_pos)->load(cuda::std::memory_order_relaxed); + } + } else { + copy_vector(g, find_or_insert_value, + bucket->vectors + key_pos * dim, dim); + if (g.thread_rank() == src_lane) { + update_meta(bucket, key_pos, metas, key_idx); + } + } + + if (g.thread_rank() == src_lane) { + (bucket->keys(key_pos)) + ->store(find_or_insert_key, cuda::std::memory_order_relaxed); + } + } +} + +template +struct SelectFindOrInsertKernelWithIO { + static void execute_kernel(const float& load_factor, const int& block_size, + const size_t bucket_max_size, + const size_t buckets_num, const size_t dim, + cudaStream_t& stream, const size_t& n, + const Table* __restrict table, + const K* __restrict keys, V* __restrict values, + M* __restrict metas) { + if (load_factor <= 0.75) { + const unsigned int tile_size = 4; + const size_t N = n * tile_size; + const size_t grid_size = 
SAFE_GET_GRID_SIZE(N, block_size); + find_or_insert_kernel_with_io + <<>>( + table, bucket_max_size, buckets_num, dim, keys, values, metas, N); + } else { + const unsigned int tile_size = 32; + const size_t N = n * tile_size; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + find_or_insert_kernel_with_io + <<>>( + table, bucket_max_size, buckets_num, dim, keys, values, metas, N); + } + return; + } +}; + +/* find or insert with the end-user specified meta. + */ +template +__global__ void find_or_insert_kernel( + const Table* __restrict table, const size_t bucket_max_size, + const size_t buckets_num, const size_t dim, const K* __restrict keys, + V** __restrict vectors, M* __restrict metas, bool* __restrict found, + int* __restrict keys_index, const size_t N) { + auto g = cg::tiled_partition(cg::this_thread_block()); + int* buckets_size = table->buckets_size; + + for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; + t += blockDim.x * gridDim.x) { + int key_pos = -1; + size_t key_idx = t / TILE_SIZE; + + const K find_or_insert_key = keys[key_idx]; + + if (IS_RESERVED_KEY(find_or_insert_key)) continue; + + const M find_or_insert_meta = + metas != nullptr ? metas[key_idx] : static_cast(MAX_META); + + size_t bkt_idx = 0; + size_t start_idx = 0; + int src_lane = -1; + K evicted_key; + + Bucket* bucket = + get_key_position(table->buckets, find_or_insert_key, bkt_idx, + start_idx, buckets_num, bucket_max_size); + + if (g.thread_rank() == 0) { + *(keys_index + key_idx) = key_idx; + } + + OccupyResult occupy_result{OccupyResult::INITIAL}; + const int bucket_size = buckets_size[bkt_idx]; + do { + if (bucket_size < bucket_max_size) { + occupy_result = find_and_lock_when_vacant( + g, bucket, find_or_insert_key, find_or_insert_meta, evicted_key, + start_idx, key_pos, src_lane, bucket_max_size); + } else { + start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; + occupy_result = find_and_lock_when_full( + g, bucket, find_or_insert_key, find_or_insert_meta, evicted_key, + start_idx, key_pos, src_lane, bucket_max_size); + } + + occupy_result = g.shfl(occupy_result, src_lane); + } while (occupy_result == OccupyResult::CONTINUE); + + if (occupy_result == OccupyResult::REFUSED) continue; + + if ((occupy_result == OccupyResult::OCCUPIED_EMPTY || + occupy_result == OccupyResult::OCCUPIED_RECLAIMED) && + g.thread_rank() == src_lane) { + atomicAdd(&(buckets_size[bkt_idx]), 1); + } + + if (occupy_result == OccupyResult::DUPLICATE) { + if (g.thread_rank() == src_lane) { + *(vectors + key_idx) = (bucket->vectors + key_pos * dim); + + if (found != nullptr) { + *(found + key_idx) = true; + } + + if (metas != nullptr) { + *(metas + key_idx) = + bucket->metas(key_pos)->load(cuda::std::memory_order_relaxed); + } + } + } else { + if (g.thread_rank() == src_lane) { + *(vectors + key_idx) = (bucket->vectors + key_pos * dim); + update_meta(bucket, key_pos, metas, key_idx); + } + } + + if (g.thread_rank() == src_lane) { + (bucket->keys(key_pos)) + ->store(find_or_insert_key, cuda::std::memory_order_relaxed); + } + } +} + +/* find or insert with the end-user specified meta. 
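+ * Pointer variant: vectors[i] is set to the in-table value address in both
+ * cases, found[i] is true when the key already existed and false when a new
+ * slot was claimed (in which case the meta is initialized via update_meta).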
+ */ +template +__global__ void find_ptr_or_insert_kernel( + const Table* __restrict table, const size_t bucket_max_size, + const size_t buckets_num, const size_t dim, const K* __restrict keys, + V** __restrict vectors, M* __restrict metas, bool* __restrict found, + const size_t N) { + auto g = cg::tiled_partition(cg::this_thread_block()); + int* buckets_size = table->buckets_size; + + for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; + t += blockDim.x * gridDim.x) { + int key_pos = -1; + size_t key_idx = t / TILE_SIZE; + + const K find_or_insert_key = keys[key_idx]; + + if (IS_RESERVED_KEY(find_or_insert_key)) continue; + + const M find_or_insert_meta = + metas != nullptr ? metas[key_idx] : static_cast(MAX_META); + + size_t bkt_idx = 0; + size_t start_idx = 0; + int src_lane = -1; + K evicted_key; + + Bucket* bucket = + get_key_position(table->buckets, find_or_insert_key, bkt_idx, + start_idx, buckets_num, bucket_max_size); + + OccupyResult occupy_result{OccupyResult::INITIAL}; + const int bucket_size = buckets_size[bkt_idx]; + do { + if (bucket_size < bucket_max_size) { + occupy_result = find_and_lock_when_vacant( + g, bucket, find_or_insert_key, find_or_insert_meta, evicted_key, + start_idx, key_pos, src_lane, bucket_max_size); + } else { + start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; + occupy_result = find_and_lock_when_full( + g, bucket, find_or_insert_key, find_or_insert_meta, evicted_key, + start_idx, key_pos, src_lane, bucket_max_size); + } + + occupy_result = g.shfl(occupy_result, src_lane); + } while (occupy_result == OccupyResult::CONTINUE); + + if (occupy_result == OccupyResult::REFUSED) continue; + + if ((occupy_result == OccupyResult::OCCUPIED_EMPTY || + occupy_result == OccupyResult::OCCUPIED_RECLAIMED) && + g.thread_rank() == src_lane) { + atomicAdd(&(buckets_size[bkt_idx]), 1); + } + + if (occupy_result == OccupyResult::DUPLICATE) { + if (g.thread_rank() == src_lane) { + *(vectors + key_idx) = (bucket->vectors + key_pos * dim); + *(found + key_idx) = true; + if (metas != nullptr) { + *(metas + key_idx) = + bucket->metas(key_pos)->load(cuda::std::memory_order_relaxed); + } + } + } else { + if (g.thread_rank() == src_lane) { + *(vectors + key_idx) = (bucket->vectors + key_pos * dim); + *(found + key_idx) = false; + update_meta(bucket, key_pos, metas, key_idx); + } + } + + if (g.thread_rank() == src_lane) { + (bucket->keys(key_pos)) + ->store(find_or_insert_key, cuda::std::memory_order_relaxed); + } + } +} + +template +struct SelectFindOrInsertPtrKernel { + static void execute_kernel(const float& load_factor, const int& block_size, + const size_t bucket_max_size, + const size_t buckets_num, const size_t dim, + cudaStream_t& stream, const size_t& n, + const Table* __restrict table, + const K* __restrict keys, V** __restrict values, + M* __restrict metas, bool* __restrict found) { + if (load_factor <= 0.5) { + const unsigned int tile_size = 4; + const size_t N = n * tile_size; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + find_ptr_or_insert_kernel + <<>>(table, bucket_max_size, + buckets_num, dim, keys, values, + metas, found, N); + } else if (load_factor <= 0.875) { + const unsigned int tile_size = 8; + const size_t N = n * tile_size; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + find_ptr_or_insert_kernel + <<>>(table, bucket_max_size, + buckets_num, dim, keys, values, + metas, found, N); + } else { + const unsigned int tile_size = 32; + const size_t N = n * tile_size; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, 
block_size); + find_ptr_or_insert_kernel + <<>>(table, bucket_max_size, + buckets_num, dim, keys, values, + metas, found, N); + } + return; + } +}; + +/* Read the data from address of table_value_addrs to corresponding position + in param_value if mask[i] is true, otherwise write data to table_value_addrs + form param_value, + usually called by find_or_insert kernel. + + `table_value_addrs`: A pointer of pointer of V which should be on HBM, + but each value (a pointer of V) could point to a + memory on HBM or HMEM. + `param_value`: A continue memory pointer with Vector + which should be HBM. + `mask`: One for each `param_value`. If true, reading from table_value_addrs, + or false writing table_value_addrs from param_value. + `param_key_index`: N values from address of table_value_addrs are mapped to + param_values according to param_key_index. + `dim`: the dim of value. + `N`: The number of vectors needed to be read. +*/ +template +__global__ void read_or_write_kernel(V** __restrict table_value_addrs, + V* __restrict param_values, + const bool* mask, + const int* __restrict param_key_index, + const size_t dim, const size_t N) { + size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; + + for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { + int vec_index = int(t / dim); + int dim_index = t % dim; + int real_key_index = + param_key_index != nullptr ? param_key_index[vec_index] : vec_index; + + /// if found, read the value form table, otherwise write it + if (table_value_addrs[vec_index] != nullptr) { + /// find + if (mask[real_key_index]) { + param_values[real_key_index * dim + dim_index] = + table_value_addrs[vec_index][dim_index]; + } + /// insert + else { + table_value_addrs[vec_index][dim_index] = + param_values[real_key_index * dim + dim_index]; + } + } + } +} + +/* + * update with IO operation. This kernel is + * usually used for the pure HBM mode for better performance. 
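+ * Update only touches keys that are already present: when the probe returns
+ * OccupyResult::DUPLICATE the caller's vector is copied into the bucket and the
+ * meta refreshed, while keys that cannot be found (OccupyResult::REFUSED) are
+ * skipped and nothing is inserted.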
+ */ +template +__global__ void update_kernel_with_io( + const Table* __restrict table, const size_t bucket_max_size, + const size_t buckets_num, const size_t dim, const K* __restrict keys, + const V* __restrict values, const M* __restrict metas, const size_t N) { + auto g = cg::tiled_partition(cg::this_thread_block()); + int* buckets_size = table->buckets_size; + + for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; + t += blockDim.x * gridDim.x) { + int key_pos = -1; + size_t key_idx = t / TILE_SIZE; + + const K update_key = keys[key_idx]; + + if (IS_RESERVED_KEY(update_key)) continue; + + const V* update_value = values + key_idx * dim; + + size_t bkt_idx = 0; + size_t start_idx = 0; + int src_lane = -1; + + Bucket* bucket = + get_key_position(table->buckets, update_key, bkt_idx, start_idx, + buckets_num, bucket_max_size); + + OccupyResult occupy_result{OccupyResult::INITIAL}; + const int bucket_size = buckets_size[bkt_idx]; + + if (bucket_size >= bucket_max_size) { + start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; + } + occupy_result = find_and_lock_for_update( + g, bucket, update_key, start_idx, key_pos, src_lane, bucket_max_size); + + occupy_result = g.shfl(occupy_result, src_lane); + + if (occupy_result == OccupyResult::REFUSED) continue; + + if (occupy_result == OccupyResult::DUPLICATE) { + copy_vector(g, update_value, + bucket->vectors + key_pos * dim, dim); + if (src_lane == g.thread_rank()) { + update_meta(bucket, key_pos, metas, key_idx); + } + } + + if (g.thread_rank() == src_lane) { + (bucket->keys(key_pos)) + ->store(update_key, cuda::std::memory_order_relaxed); + } + } +} + +template +struct SelectUpdateKernelWithIO { + static void execute_kernel(const float& load_factor, const int& block_size, + const size_t bucket_max_size, + const size_t buckets_num, const size_t dim, + cudaStream_t& stream, const size_t& n, + const Table* __restrict table, + const K* __restrict keys, + const V* __restrict values, + const M* __restrict metas) { + if (load_factor <= 0.75) { + const unsigned int tile_size = 4; + const size_t N = n * tile_size; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + update_kernel_with_io + <<>>( + table, bucket_max_size, buckets_num, dim, keys, values, metas, N); + } else { + const unsigned int tile_size = 32; + const size_t N = n * tile_size; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + update_kernel_with_io + <<>>( + table, bucket_max_size, buckets_num, dim, keys, values, metas, N); + } + return; + } +}; + +template +__global__ void update_kernel(const Table* __restrict table, + const size_t bucket_max_size, + const size_t buckets_num, const size_t dim, + const K* __restrict keys, V** __restrict vectors, + const M* __restrict metas, + int* __restrict src_offset, size_t N) { + auto g = cg::tiled_partition(cg::this_thread_block()); + int* buckets_size = table->buckets_size; + + for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; + t += blockDim.x * gridDim.x) { + int key_pos = -1; + size_t key_idx = t / TILE_SIZE; + + const K update_key = keys[key_idx]; + + if (IS_RESERVED_KEY(update_key)) continue; + + size_t bkt_idx = 0; + size_t start_idx = 0; + int src_lane = -1; + + Bucket* bucket = + get_key_position(table->buckets, update_key, bkt_idx, start_idx, + buckets_num, bucket_max_size); + + OccupyResult occupy_result{OccupyResult::INITIAL}; + const int bucket_size = buckets_size[bkt_idx]; + *(src_offset + key_idx) = key_idx; + + if (bucket_size >= bucket_max_size) { + start_idx = (start_idx / TILE_SIZE) * 
TILE_SIZE; + } + occupy_result = find_and_lock_for_update( + g, bucket, update_key, start_idx, key_pos, src_lane, bucket_max_size); + + occupy_result = g.shfl(occupy_result, src_lane); + + if (occupy_result == OccupyResult::REFUSED) continue; + + if (g.thread_rank() == src_lane) { + if (occupy_result == OccupyResult::DUPLICATE) { + *(vectors + key_idx) = (bucket->vectors + key_pos * dim); + update_meta(bucket, key_pos, metas, key_idx); + } else { + *(vectors + key_idx) = nullptr; + } + } + + if (g.thread_rank() == src_lane) { + (bucket->keys(key_pos)) + ->store(update_key, cuda::std::memory_order_relaxed); + } + } +} + +} // namespace merlin +} // namespace nv diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/debug.hpp b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/debug.hpp new file mode 100644 index 000000000..2d9ebfe8b --- /dev/null +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/debug.hpp @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include "tensorflow_recommenders_addons/dynamic_embedding/core/lib/utils/cuda_utils.cuh" + +namespace nv { +namespace merlin { + +class CudaException : public std::runtime_error { + public: + CudaException(const std::string& what) : runtime_error(what) {} +}; + +inline void cuda_check_(cudaError_t val, const char* file, int line) { + if (val != cudaSuccess) { + std::ostringstream os; + os << file << ':' << line << ": CUDA error " << cudaGetErrorName(val) + << " (#" << val << "): " << cudaGetErrorString(val); + throw CudaException(os.str()); + } +} + +//#ifdef CUDA_CHECK +//#error Unexpected redfinition of CUDA_CHECK! Something is wrong. +//#endif + +#ifndef CUDA_CHECK +#define CUDA_CHECK(val) \ + do { \ + nv::merlin::cuda_check_((val), __FILE__, __LINE__); \ + } while (0) +#endif // CUDA_CHECK + +class MerlinException : public std::runtime_error { + public: + MerlinException(const std::string& what) : runtime_error(what) {} +}; + +template +inline void merlin_check_(bool cond, const Msg& msg, const char* file, + int line) { + if (!cond) { + std::ostringstream os; + os << file << ':' << line << ": HierarchicalKV error " << msg; + throw MerlinException(os.str()); + } +} + +#ifdef MERLIN_CHECK +#error Unexpected redfinition of MERLIN_CHECK! Something is wrong. 
+#endif + +#define MERLIN_CHECK(cond, msg) \ + do { \ + nv::merlin::merlin_check_((cond), (msg), __FILE__, __LINE__); \ + } while (0) + +} // namespace merlin +} // namespace nv diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/flexible_buffer.cuh b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/flexible_buffer.cuh new file mode 100644 index 000000000..d01dda6d1 --- /dev/null +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/flexible_buffer.cuh @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include "utils.cuh" + +using std::cerr; +using std::endl; + +namespace nv { +namespace merlin { + +template +class FlexPinnedBuffer { + public: + FlexPinnedBuffer(const size_t size = 1) : ptr_(nullptr) { + if (!ptr_) { + size_ = size; + CUDA_CHECK(cudaMallocHost(&ptr_, sizeof(T) * size_)); + } + } + ~FlexPinnedBuffer() { + try { + if (!ptr_) CUDA_CHECK(cudaFreeHost(ptr_)); + } catch (const nv::merlin::CudaException& e) { + cerr << "[HierarchicalKV] Failed to free FlexPinnedBuffer!" << endl; + } + } + + __inline__ T* alloc_or_reuse(const size_t size = 0) { + if (size > size_) { + CUDA_CHECK(cudaFreeHost(ptr_)); + size_ = size; + CUDA_CHECK(cudaMallocHost(&ptr_, sizeof(T) * size_)); + } + return ptr_; + } + + private: + T* ptr_; + size_t size_; +}; + +} // namespace merlin +} // namespace nv \ No newline at end of file diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/group_lock.hpp b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/group_lock.hpp new file mode 100644 index 000000000..2cfdc6ed3 --- /dev/null +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/group_lock.hpp @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http:///www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Implementing a group mutex and relative lock guard for better E2E performance: + * - Allow multiple writers (like `insert_or_assign` `assign` `insert_and_evict` etc.) + * The CUDA kernels guarantee the data consistency in this situation. + * - Allow multiple readers (like `find` 'size` etc.) + * - Not allow readers and writers to run concurrently + * - The `write_read_lock` is used for special APIs (like `reserve` `erase` `clear` etc.) 
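+ *
+ * Illustrative usage of the lock guards defined below (`mtx` is a placeholder
+ * name for a table-owned mutex):
+ *
+ *   group_shared_mutex mtx;
+ *   { reader_shared_lock lock(mtx); }   // concurrent readers: find, size, ...
+ *   { writer_shared_lock lock(mtx); }   // concurrent writers: insert_or_assign, ...
+ *   { write_read_lock lock(mtx); }      // exclusive section: reserve, erase, clear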
+ */ +#include +#include +#include +#include +#include + +namespace nv { +namespace merlin { + +class group_shared_mutex { + public: + group_shared_mutex(const group_shared_mutex&) = delete; + group_shared_mutex& operator=(const group_shared_mutex&) = delete; + + group_shared_mutex() noexcept + : writer_count_(0), reader_count_(0), unique_flag_(false) {} + + void lock_read() { + for (;;) { + while (writer_count_.load(std::memory_order_acquire)) { + } + reader_count_.fetch_add(1, std::memory_order_acq_rel); + if (writer_count_.load(std::memory_order_acquire) == 0) { + break; + } + reader_count_.fetch_sub(1, std::memory_order_acq_rel); + } + } + + void unlock_read() { reader_count_.fetch_sub(1, std::memory_order_release); } + + void lock_write() { + for (;;) { + while (reader_count_.load(std::memory_order_acquire)) { + } + writer_count_.fetch_add(1, std::memory_order_acq_rel); + if (reader_count_.load(std::memory_order_acquire) == 0) { + break; + } + writer_count_.fetch_sub(1, std::memory_order_acq_rel); + } + } + + void unlock_write() { writer_count_.fetch_sub(1, std::memory_order_release); } + + void lock_write_read() { + /* Lock unique flag */ + bool expected = false; + while (!unique_flag_.compare_exchange_weak(expected, true, + std::memory_order_acq_rel)) { + expected = false; + } + + /* Ban writer */ + for (;;) { + while (writer_count_.load(std::memory_order_acquire)) { + } + reader_count_.fetch_add(1, std::memory_order_acq_rel); + if (writer_count_.load(std::memory_order_acquire) == 0) { + break; + } + reader_count_.fetch_sub(1, std::memory_order_acq_rel); + } + + /* Ban reader */ + for (;;) { + while (reader_count_.load(std::memory_order_acquire) > 1) { + } + writer_count_.fetch_add(1, std::memory_order_acq_rel); + if (reader_count_.load(std::memory_order_acquire) == 1) { + break; + } + writer_count_.fetch_sub(1, std::memory_order_acq_rel); + } + } + + void unlock_write_read() noexcept { + reader_count_.fetch_sub(1, std::memory_order_release); + writer_count_.fetch_sub(1, std::memory_order_release); + unique_flag_.store(false, std::memory_order_release); + } + + int writer_count() noexcept { + return writer_count_.load(std::memory_order_relaxed); + } + + int reader_count() noexcept { + return reader_count_.load(std::memory_order_relaxed); + } + + private: + std::atomic writer_count_; + std::atomic reader_count_; + std::atomic unique_flag_; +}; + +class reader_shared_lock { + public: + reader_shared_lock(const reader_shared_lock&) = delete; + reader_shared_lock(reader_shared_lock&&) = delete; + + reader_shared_lock& operator=(const reader_shared_lock&) = delete; + reader_shared_lock& operator=(reader_shared_lock&&) = delete; + + explicit reader_shared_lock(group_shared_mutex& mutex) : mutex_(&mutex) { + mutex_->lock_read(); + owns_ = true; + } + + explicit reader_shared_lock(group_shared_mutex& mutex, std::defer_lock_t) + : mutex_(&mutex), owns_(false) {} + + ~reader_shared_lock() { + if (owns_) { + mutex_->unlock_read(); + } + } + + void lock() noexcept { + if (!owns_) { + mutex_->lock_read(); + owns_ = true; + } + } + + bool owns_lock() const noexcept { return owns_; } + + private: + group_shared_mutex* const mutex_; + bool owns_; +}; + +class writer_shared_lock { + public: + writer_shared_lock(const writer_shared_lock&) = delete; + writer_shared_lock(writer_shared_lock&&) = delete; + + writer_shared_lock& operator=(const writer_shared_lock&) = delete; + writer_shared_lock& operator=(writer_shared_lock&&) = delete; + + explicit writer_shared_lock(group_shared_mutex& mutex) : 
mutex_(&mutex) { + mutex_->lock_write(); + owns_ = true; + } + + explicit writer_shared_lock(group_shared_mutex& mutex, std::defer_lock_t) + : mutex_(&mutex), owns_(false) {} + + ~writer_shared_lock() { + if (owns_) { + mutex_->unlock_write(); + } + } + + void lock() noexcept { + if (!owns_) { + mutex_->lock_write(); + owns_ = true; + } + } + + bool owns_lock() const noexcept { return owns_; } + + private: + group_shared_mutex* const mutex_; + bool owns_; +}; + +class write_read_lock { + public: + write_read_lock(const write_read_lock&) = delete; + write_read_lock(write_read_lock&&) = delete; + + write_read_lock& operator=(const write_read_lock&) = delete; + write_read_lock& operator=(write_read_lock&&) = delete; + + explicit write_read_lock(group_shared_mutex& mutex) : mutex_(&mutex) { + mutex_->lock_write_read(); + owns_ = true; + } + + explicit write_read_lock(group_shared_mutex& mutex, std::defer_lock_t) noexcept + : mutex_(&mutex), owns_(false) {} + + ~write_read_lock() { + if (owns_) { + mutex_->unlock_write_read(); + } + } + + void lock() { + assert(!owns_ && "[write_read_lock] trying to lock twice!"); + mutex_->lock_write_read(); + owns_ = true; + } + + bool owns_lock() const noexcept { return owns_; } + + private: + group_shared_mutex* const mutex_; + bool owns_; +}; + +} // namespace merlin +} // namespace nv \ No newline at end of file diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/initializers.cuh b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/initializers.cuh new file mode 100644 index 000000000..6df875688 --- /dev/null +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/initializers.cuh @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include "curand_philox4x32_x.h" +#include "types.cuh" +#include "utils.cuh" + +namespace nv { +namespace merlin { +namespace initializers { + +inline void cuda_rand_check_(curandStatus_t val, const char* file, int line) { + if (val != CURAND_STATUS_SUCCESS) { + throw CudaException(std::string(file) + ":" + std::to_string(line) + + ": CURAND error " + std::to_string(val)); + } +} + +#define CURAND_CHECK(val) \ + { nv::merlin::initializers::cuda_rand_check_((val), __FILE__, __LINE__); } + +template +void zeros(T* d_data, const size_t len, cudaStream_t stream) { + CUDA_CHECK(cudaMemsetAsync(d_data, 0, len, stream)); +} + +template +void random_normal(T* d_data, const size_t len, cudaStream_t stream, + const T mean = 0.0, const T stddev = 0.05, + const unsigned long long seed = 2022ULL) { + curandGenerator_t generator; + CURAND_CHECK(curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(generator, seed)); + CURAND_CHECK(curandGenerateNormal(generator, d_data, len, mean, stddev)); +} + +template +__global__ void adjust_max_min(T* d_data, const T minval, const T maxval, + const size_t N) { + int tid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tid < N) { + d_data[tid] = + d_data[tid] * (maxval - minval) + (0.5 * (maxval + minval) - 0.5); + } +} + +template +void random_uniform(T* d_data, const size_t len, cudaStream_t stream, + const T minval = 0.0, const T maxval = 1.0, + const unsigned long long seed = 2022ULL) { + curandGenerator_t generator; + + CURAND_CHECK(curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(generator, seed)); + + int N = len; + int block_size = 256; + int grid_size = (N + block_size - 1) / block_size; + CURAND_CHECK(curandGenerateUniform(generator, d_data, N)); + adjust_max_min + <<>>(d_data, minval, maxval, N); +} + +template +__global__ void init_states(curandStatePhilox4_32_10_t* states, + const unsigned long long seed, const size_t N) { + int tid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tid < N) { + curand_init(seed, tid, 0, &states[tid]); + } +} + +template +__global__ void make_truncated_normal(T* d_data, + curandStatePhilox4_32_10_t* states, + const size_t N) { + int tid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tid < N) { + constexpr T truncated_val = T(2.0); + while (fabsf(d_data[tid]) > truncated_val) { + d_data[tid] = curand_normal(&states[tid]); + } + } +} + +template +void truncated_normal(T* d_data, const size_t len, cudaStream_t stream, + const T minval = 0.0, const T maxval = 1.0, + const unsigned long long seed = 2022ULL) { + curandGenerator_t generator; + + CURAND_CHECK(curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(generator, seed)); + + int N = len; + int block_size = 256; + int grid_size = (N + block_size - 1) / block_size; + curandStatePhilox4_32_10_t* d_states; + CUDA_CHECK(cudaMallocAsync(&d_states, N, stream)); + + init_states<<>>(d_states, seed, N); + + make_truncated_normal + <<>>(d_data, d_states, N); + + adjust_max_min + <<>>(d_data, minval, maxval, N); + + CUDA_CHECK(cudaFreeAsync(d_states, stream)); +} + +template +class Initializer { + public: + virtual ~Initializer() {} + virtual void initialize(T* data, size_t len, cudaStream_t stream) {} +}; + +template +class Zeros final : public Initializer { + public: + void initialize(T* data, const size_t len, cudaStream_t stream) override { + 
zeros(data, len, stream); + } +}; + +} // namespace initializers +} // namespace merlin +} // namespace nv \ No newline at end of file diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/memory_pool.cuh b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/memory_pool.cuh new file mode 100644 index 000000000..271676a6e --- /dev/null +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/memory_pool.cuh @@ -0,0 +1,619 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include "debug.hpp" + +namespace nv { +namespace merlin { + +/** + * Allocators are used by the memory pool (and maybe other classes) to create + * RAII complient containers for buffers allocated in different memory areas. + */ +template +struct AllocatorBase { + using type = T; + using sync_unique_ptr = std::unique_ptr; + using async_unique_ptr = std::unique_ptr>; + using shared_ptr = std::shared_ptr; + + inline static sync_unique_ptr make_unique(size_t n) { + return sync_unique_ptr(Allocator::alloc(n)); + } + + inline static async_unique_ptr make_unique(size_t n, cudaStream_t stream) { + return {Allocator::alloc(n, stream), + [stream](type* p) { Allocator::free(p); }}; + } + + inline static shared_ptr make_shared(size_t n, cudaStream_t stream = 0) { + return {Allocator::alloc(n, stream), + [stream](type* p) { Allocator::free(p, stream); }}; + } + + inline void operator()(type* ptr) { Allocator::free(ptr); } +}; + +/** + * Trivial fallback implementation using the standard C++ allocator. This mostly + * exists to ensure interface correctness, and as an illustration of what a + * proper allocator implementation should look like. + */ +template +struct StandardAllocator final : AllocatorBase> { + using type = typename AllocatorBase>::type; + + static constexpr const char* name{"StandardAllocator"}; + + inline static type* alloc(size_t n, cudaStream_t stream = 0) { + return new type[n]; + } + + inline static void free(type* ptr, cudaStream_t stream = 0) { delete[] ptr; } +}; + +/** + * Claim/release buffers in pinned host memory. + */ +template +struct HostAllocator final : AllocatorBase> { + using type = typename AllocatorBase>::type; + + static constexpr const char* name{"HostAllocator"}; + + inline static type* alloc(size_t n, cudaStream_t stream = 0) { + void* ptr; + CUDA_CHECK(cudaMallocHost(&ptr, sizeof(T) * n)); + return reinterpret_cast(ptr); + } + + inline static void free(type* ptr, cudaStream_t stream = 0) { + CUDA_CHECK(cudaFreeHost(ptr)); + } +}; + +/** + * Claim/release buffers in the active CUDA device. Will not test if the correct + * device was used, and throw if CUDA runtime API response is negative. 
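+ * When a stream is passed, allocation and release go through cudaMallocAsync /
+ * cudaFreeAsync; without a stream the synchronous cudaMalloc / cudaFree are used.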
+ */ +template +struct DeviceAllocator final : AllocatorBase> { + using type = typename AllocatorBase>::type; + + static constexpr const char* name{"DeviceAllocator"}; + + inline static type* alloc(size_t n, cudaStream_t stream = 0) { + void* ptr; + cudaError_t res; + if (stream) { + res = cudaMallocAsync(&ptr, sizeof(T) * n, stream); + } else { + res = cudaMalloc(&ptr, sizeof(T) * n); + } + CUDA_CHECK(res); + return reinterpret_cast(ptr); + } + + inline static void free(type* ptr, cudaStream_t stream = 0) { + cudaError_t res; + if (stream) { + res = cudaFreeAsync(ptr, stream); + } else { + res = cudaFree(ptr); + } + CUDA_CHECK(res); + } +}; + +/** + * Helper structure to configure a memory pool. + */ +struct MemoryPoolOptions { + size_t max_stock{4}; ///< Amount of buffers to keep in reserve. + size_t max_pending{16}; ///< Maximum amount of awaitable buffers. If this + ///< limit is exceeded threads will start to block. +}; + +/** + * Forward declares required to make templated ostream overload work. + */ +template +class MemoryPool; + +template +std::ostream& operator<<(std::ostream&, const MemoryPool&); + +/** + * CUDA deferred execution aware memory pool implementation. As for every memory + * pool, the general idea is to have resuable buffers. All buffers have the same + * size. + * + * General behavior: + * + * This memory pool implementation attempts to avoid blocking before the fact, + * but also avoids relying on a background worker. + * + * Buffer borrow and return semantics tightly align with C++ RAII principles. + * That is, if a workspace is requested, any borrowed buffers will be returned + * automatically when leaving the scope. + * + * You can either borrow a single buffer, or a workspace (that is multiple + * buffers). We support dynamic and static workspaces. Static workspaces have + * the benefit that they will never require heap memory (no hidden allocations). + * + * + * Buffer borrowing: + * + * If buffers are requested, we take them from the stock, if available. If the + * stock is depleted, we check if any pending buffer has been used up by the GPU + * and adds them to the stock. If was also not successful, we allocate a new + * buffer. Buffers or workspaces (groups of buffers). + * + * When borrowing a buffer a streaming context can be specified. This context is + * relevant for allocation and during returns. It is assumed that the stream you + * provide as context will be the stream where you queue the workload. Not doing + * so may lead to undefined behavior. + * + * Buffer return: + * + * If no context is provided, we cannot make any assumptions regarding the usage + * one the device. So we sychronize the device first and then return the buffer + * to the stock. If a streaming context was provided, we queue an event and add + * the buffer to the `pending` pool. That means, the buffer has been + * reqlinquished by the CPU, but may still be used by the GPU. If no pending + * slot is available, we probe the currently pending buffers events for + * completion. Completed pending buffers are returned to the reserve. If so, we + * queue the buffer in the freed slot. If that was unsucessful (i.e., all + * currently pending buffers are still in use by the GPU), we have no choice but + * the free the buffer using the current stream. + * + * In either case, `max_reserve` represents the maxmum size of the stock. If + * returning a buffer would lead to the stock exeeding this quantity, the buffer + * is queued for destruction. 
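+ * (In this implementation the stock limit is configured through
+ * MemoryPoolOptions::max_stock.)
+ *
+ * Illustrative usage sketch; `buffer_bytes` and `stream` are placeholders
+ * chosen by the caller:
+ *
+ *   MemoryPoolOptions opt;
+ *   MemoryPool<DeviceAllocator<char>> pool(opt);
+ *   {
+ *     auto ws = pool.get_workspace<2>(buffer_bytes, stream);
+ *     float* tmp0 = ws.get<float*>(0);
+ *     float* tmp1 = ws.get<float*>(1);
+ *     // ... enqueue kernels on `stream` that use tmp0 / tmp1 ...
+ *   }  // buffers are returned to the pool when the workspace leaves scope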
+ */ +template +class MemoryPool final { + public: + using pool_type = MemoryPool; + using alloc_type = typename Allocator::type; + template + class Workspace { + public: + inline Workspace() : pool_{nullptr}, buffer_size_{0}, stream_{0} {} + + inline Workspace(pool_type* pool, cudaStream_t stream) + : pool_{pool}, buffer_size_{0}, stream_{stream} {} + + Workspace(const Workspace&) = delete; + + Workspace& operator=(const Workspace&) = delete; + + inline Workspace(Workspace&& other) + : pool_{other.pool_}, + buffer_size_{other.buffer_size_}, + stream_{other.stream_}, + buffers_{std::move(other.buffers_)} {} + + inline Workspace& operator=(Workspace&& other) { + if (pool_) { + pool_->put_raw(buffers_.begin(), buffers_.end(), buffer_size_, stream_); + } + pool_ = other.pool_; + buffer_size_ = other.buffer_size_; + stream_ = other.stream_; + buffers_ = std::move(other.buffers_); + other.pool_ = nullptr; + return *this; + } + + inline ~Workspace() { + if (pool_) { + pool_->put_raw(buffers_.begin(), buffers_.end(), buffer_size_, stream_); + } + } + + template + constexpr void at(const size_t n, T* ptr) const { + *ptr = at(n); + } + + template + constexpr T at(const size_t n) const { + return reinterpret_cast(buffers_.at(n)); + } + + template + constexpr void get(const size_t n, T* ptr) const { + *ptr = get(n); + } + + template + constexpr T get(const size_t n) const { + return reinterpret_cast(buffers_[n]); + } + + constexpr alloc_type* operator[](const size_t n) const { + return buffers_[n]; + } + + protected: + pool_type* pool_; + size_t buffer_size_; + cudaStream_t stream_; + Container buffers_; + }; + + template + class StaticWorkspace final : public Workspace> { + public: + using base_type = Workspace>; + + friend class MemoryPool; + + inline StaticWorkspace() : base_type() {} + + StaticWorkspace(const StaticWorkspace&) = delete; + + StaticWorkspace& operator=(const StaticWorkspace&) = delete; + + inline StaticWorkspace(StaticWorkspace&& other) + : base_type(std::move(other)) {} + + inline StaticWorkspace& operator=(StaticWorkspace&& other) { + base_type::operator=(std::move(other)); + return *this; + } + + private: + inline StaticWorkspace(pool_type* pool, size_t requested_buffer_size, + cudaStream_t stream) + : base_type(pool, stream) { + auto& buffers{this->buffers_}; + this->buffer_size_ = pool->get_raw(buffers.begin(), buffers.end(), + requested_buffer_size, stream); + } + }; + + class DynamicWorkspace final : public Workspace> { + public: + using base_type = Workspace>; + + friend class MemoryPool; + + inline DynamicWorkspace() : base_type() {} + + DynamicWorkspace(const DynamicWorkspace&) = delete; + + DynamicWorkspace& operator=(const DynamicWorkspace&) = delete; + + inline DynamicWorkspace(DynamicWorkspace&& other) + : base_type(std::move(other)) {} + + inline DynamicWorkspace& operator=(DynamicWorkspace&& other) { + base_type::operator=(std::move(other)); + return *this; + } + + private: + inline DynamicWorkspace(pool_type* pool, size_t n, + size_t requested_buffer_size, cudaStream_t stream) + : base_type(pool, stream) { + auto& buffers{this->buffers_}; + buffers.resize(n); + this->buffer_size_ = pool->get_raw(buffers.begin(), buffers.end(), + requested_buffer_size, stream); + } + }; + + MemoryPool(const MemoryPoolOptions& options) : options_{options} { + // Create initial buffer stock. + stock_.reserve(options_.max_stock); + + // Create enough events, so we have one per potentially pending buffer. 
+ ready_events_.resize(options_.max_pending); + for (auto& ready_event : ready_events_) { + CUDA_CHECK(cudaEventCreate(&ready_event)); + } + + // Preallocate pending. + pending_.reserve(options_.max_pending); + } + + ~MemoryPool() { + // Make sure all queued tasks are complete. + await_pending(); + + // Free event and buffer memory. + for (auto& ready_event : ready_events_) { + CUDA_CHECK(cudaEventDestroy(ready_event)); + } + + // Any remaining buffers need to be properly unallocated. + deplete_stock(); + } + + inline size_t buffer_size() const { return buffer_size_; } + + inline size_t max_batch_size(size_t max_item_size) const { + return buffer_size_ / max_item_size; + } + + template + inline size_t max_batch_size() const { + return max_batch_size(sizeof(T)); + } + + size_t current_stock() const { + std::lock_guard lock(mutex_); + return stock_.size(); + } + + size_t num_pending() const { + std::lock_guard lock(mutex_); + return pending_.size(); + } + + void await_pending(cudaStream_t stream = 0) { + std::lock_guard lock(mutex_); + while (!pending_.empty()) { + collect_pending_unsafe(stream); + if (pending_.empty()) { + break; + } + std::this_thread::yield(); + } + } + + void deplete_stock() { + std::lock_guard lock(mutex_); + for (auto& ptr : stock_) { + Allocator::free(ptr); + } + stock_.clear(); + } + + inline std::unique_ptr> + get_unique(size_t requested_buffer_size, cudaStream_t stream = 0) { + alloc_type* ptr; + const size_t allocation_size = + get_raw(&ptr, (&ptr) + 1, requested_buffer_size, stream); + return {ptr, [this, allocation_size, stream](alloc_type* p) { + put_raw(&p, (&p) + 1, allocation_size, stream); + }}; + } + + inline std::shared_ptr get_shared(size_t requested_buffer_size, + cudaStream_t stream = 0) { + alloc_type* ptr; + const size_t allocation_size = + get_raw(&ptr, (&ptr) + 1, requested_buffer_size, stream); + return {ptr, [this, allocation_size, stream](alloc_type* p) { + put_raw(&p, (&p) + 1, allocation_size, stream); + }}; + } + + template + inline StaticWorkspace get_workspace(size_t requested_buffer_size, + cudaStream_t stream = 0) { + return {this, requested_buffer_size, stream}; + } + + inline DynamicWorkspace get_workspace(size_t n, size_t requested_buffer_size, + cudaStream_t stream = 0) { + return {this, n, requested_buffer_size, stream}; + } + + friend std::ostream& operator<<(std::ostream&, const MemoryPool&); + + private: + inline void collect_pending_unsafe(cudaStream_t stream) { + auto it{std::remove_if( + pending_.begin(), pending_.end(), [this, stream](const auto& pending) { + const cudaError_t state{cudaEventQuery(std::get<2>(pending))}; + switch (state) { + case cudaSuccess: + // Stock buffers and destroy those that are no + // longer needed, but only if the allocation_size + // is still the same as the current buffer_size. + if (stock_.size() < options_.max_stock && + std::get<1>(pending) == buffer_size_) { + stock_.emplace_back(std::get<0>(pending)); + } else { + Allocator::free(std::get<0>(pending), stream); + } + ready_events_.emplace_back(std::get<2>(pending)); + return true; + case cudaErrorNotReady: + return false; + default: + CUDA_CHECK(state); + return false; + } + })}; + pending_.erase(it, pending_.end()); + } + + inline void clear_stock_unsafe(cudaStream_t stream) { + for (auto& ptr : stock_) { + Allocator::free(ptr, stream); + } + stock_.clear(); + } + + template + inline size_t get_raw(Iterator first, Iterator const last, + size_t requested_buffer_size, cudaStream_t stream) { + // Get pre-allocated buffers if stock available. 
+ size_t allocation_size; + { + std::lock_guard lock(mutex_); + + // If requested_buffer_size is within current buffer_size margins can + // reuse current buffers. + if (requested_buffer_size <= buffer_size_) { + while (first != last) { + // If no buffers available, try to make some available. + if (stock_.empty()) { + collect_pending_unsafe(stream); + if (stock_.empty()) { + // No buffers available. + break; + } + } + + // Just take the next available buffer. + *first++ = stock_.back(); + stock_.pop_back(); + } + } else { + // Drop the stock because we need more memory and those buffers have + // become useless to that end. + clear_stock_unsafe(stream); + buffer_size_ = requested_buffer_size; + } + + allocation_size = buffer_size_; + } + + // Forge new buffers until request can be filled. + for (; first != last; ++first) { + *first = Allocator::alloc(allocation_size, stream); + } + + return allocation_size; + } + + template + inline void put_raw(Iterator first, Iterator const last, + size_t allocation_size, cudaStream_t stream) { + std::lock_guard lock(mutex_); + + // If allocation_size of the workspace differs from the current buffer_size + // (i.e., somebody else requested a larger buffer since the original request + // occured), the provided buffers are incompatible and have to be discarded. + if (allocation_size != buffer_size_) { + while (first != last) { + Allocator::free(*first++); + } + return; + } + + // If the workspace that borrowed a stream was moved out of the RAII scope + // where it was created, it could happen that the stream was destroyed when + // we return the buffer ownershup. This `cudaStreamQuery` will prevent that. + if (stream && cudaStreamQuery(stream) != cudaErrorInvalidResourceHandle) { + for (; first != last; ++first) { + // Avoid adding already deallocated buffers. + if (*first == nullptr) { + continue; + } + + // Spin lock if too many pending buffers (i.e., let CPU wait for GPU). + while (ready_events_.empty()) { + collect_pending_unsafe(stream); + if (!ready_events_.empty()) { + break; + } + std::this_thread::yield(); + } + + // Queue buffer. + cudaEvent_t ready_event{ready_events_.back()}; + ready_events_.pop_back(); + CUDA_CHECK(cudaEventRecord(ready_event, stream)); + pending_.emplace_back(*first, allocation_size, ready_event); + } + } else { + // Without stream context, we must force a hard sync with the GPU. + CUDA_CHECK(cudaDeviceSynchronize()); + + for (; first != last; ++first) { + // Avoid adding already deallocated buffers. + if (*first == nullptr) { + continue; + } + + // Stock buffers and destroy those that are no longer needed. + if (stock_.size() < options_.max_stock) { + stock_.emplace_back(*first); + } else { + Allocator::free(*first); + } + } + } + } + + const MemoryPoolOptions options_; + + mutable std::mutex mutex_; + size_t buffer_size_{1}; + std::vector stock_; + std::vector ready_events_; + + std::vector> pending_; +}; + +template +std::ostream& operator<<(std::ostream& os, const MemoryPool& pool) { + std::lock_guard lock(pool.mutex_); + + for (size_t i{0}; i < 80; ++i) { + os << '-'; + } + + // Current stock. + os << "\nStock =\n"; + for (size_t i{0}; i < pool.stock_.size(); ++i) { + os << "[ " << i << " ] buffer " << static_cast(pool.stock_[i]) + << ", size = " << pool.buffer_size_ << '\n'; + } + + // Pending buffers. 
+ os << "\nPending =\n"; + for (size_t i{0}; i < pool.pending_.size(); ++i) { + os << "[ " << i + << " ] buffer = " << static_cast(std::get<0>(pool.pending_[i])) + << ", size = " << std::get<1>(pool.pending_[i]) << ", ready_event = " + << static_cast(std::get<2>(pool.pending_[i])) << '\n'; + } + + // Available ready events. + os << "\nReady Events =\n"; + for (size_t i{0}; i < pool.ready_events_.size(); ++i) { + os << "[ " << i << " ] " << static_cast(pool.ready_events_[i]) + << '\n'; + } + + for (size_t i{0}; i < 80; ++i) { + os << '-'; + } + + os << '\n'; + return os; +} + +} // namespace merlin +} // namespace nv diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/optimizers.cuh b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/optimizers.cuh new file mode 100644 index 000000000..b3fc1cc70 --- /dev/null +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/optimizers.cuh @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include "types.cuh" +#include "utils.cuh" + +namespace nv { +namespace merlin { +namespace optimizers { + +template +__global__ void adam_update_kernel(int len, float* weight, T* m, T* v, + const T* wgrad, float alpha_t, float beta1, + float beta2, float epsilon, float scaler) { + const int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + float gi = TypeConvertFunc::convert(wgrad[i]) / scaler; + float mi = + beta1 * TypeConvertFunc::convert(m[i]) + (1.f - beta1) * gi; + float vi = beta2 * TypeConvertFunc::convert(v[i]) + + (1.f - beta2) * gi * gi; + m[i] = TypeConvertFunc::convert(mi); + v[i] = TypeConvertFunc::convert(vi); + weight[i] -= alpha_t * mi / (sqrt(vi) + epsilon); + } +} + +template +__global__ void ada_grad_update_kernel(int len, float* weight, const T* wgrad, + T* sum, float lr, const float epsilon, + float scaler) { + const int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + float gi = TypeConvertFunc::convert(wgrad[i]) / scaler; + float accum_ = TypeConvertFunc::convert(__ldg(&sum[i])); + accum_ += gi * gi; + float std_ = epsilon + sqrtf(accum_); + weight[i] -= lr * gi / std_; + sum[i] = TypeConvertFunc::convert(accum_); + } +} + +template +__global__ void momentum_sgd_update_kernel(int len, float* weight, T* momentum, + const T* wgrad, float lr, + float momentum_factor, + float scaler) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx < len) { + float mv = + momentum_factor * TypeConvertFunc::convert(momentum[idx]) - + lr * TypeConvertFunc::convert(wgrad[idx]) / scaler; + momentum[idx] = TypeConvertFunc::convert(mv); + weight[idx] += mv; + } + return; +} + +} // namespace optimizers +} // namespace merlin +} // namespace nv \ No newline at end of file diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/types.cuh 
b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/types.cuh new file mode 100644 index 000000000..210acd4a5 --- /dev/null +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/types.cuh @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace nv { +namespace merlin { + +/** + * Shorthand for a Key-Value-Meta tuple. + */ +template +struct KVM { + K key; + V* value; + M meta; +}; + +constexpr uint64_t EMPTY_KEY = UINT64_C(0xFFFFFFFFFFFFFFFF); +constexpr uint64_t RECLAIM_KEY = UINT64_C(0xFFFFFFFFFFFFFFFE); +constexpr uint64_t VACANT_KEY_MASK = UINT64_C(0xFFFFFFFFFFFFFFFE); +constexpr uint64_t LOCKED_KEY = UINT64_C(0xFFFFFFFFFFFFFFFD); +constexpr uint64_t RESERVED_KEY_MASK = UINT64_C(0xFFFFFFFFFFFFFFFC); +constexpr uint64_t MAX_META = UINT64_C(0xFFFFFFFFFFFFFFFF); +constexpr uint64_t EMPTY_META = UINT64_C(0); + +#define IS_RESERVED_KEY(key) ((RESERVED_KEY_MASK & (key)) == RESERVED_KEY_MASK) +#define IS_VACANT_KEY(key) ((VACANT_KEY_MASK & (key)) == VACANT_KEY_MASK) + +template +using AtomicKey = cuda::atomic; + +template +using AtomicMeta = cuda::atomic; + +template +using AtomicPos = cuda::atomic; + +template +struct Bucket { + AtomicKey* keys_; + AtomicMeta* metas_; + V* vectors; // Pinned memory or HBM + + /* For upsert_kernel without user specified metas + recording the current meta, the cur_meta will + increment by 1 when a new inserting happens. */ + AtomicMeta cur_meta; + + /* min_meta and min_pos is for or upsert_kernel + with user specified meta. They record the minimum + meta and its pos in the bucket. */ + AtomicMeta min_meta; + AtomicPos min_pos; + + __forceinline__ __device__ AtomicKey* keys(int index) const { + return keys_ + index; + } + + __forceinline__ __device__ AtomicMeta* metas(int index) const { + return metas_ + index; + } +}; + +template +class Lock { + mutable cuda::atomic _lock; + + public: + __device__ Lock() : _lock{1} {} + + template + __forceinline__ __device__ void acquire(CG const& g, + unsigned long long lane = 0) const { + if (g.thread_rank() == lane) { + T expected = 1; + while (!_lock.compare_exchange_weak(expected, 2, + cuda::std::memory_order_acquire)) { + expected = 1; + } + } + g.sync(); + } + + template + __forceinline__ __device__ void release(CG const& g, + unsigned long long lane = 0) const { + g.sync(); + if (g.thread_rank() == lane) { + _lock.store(1, cuda::std::memory_order_release); + } + } +}; + +using Mutex = Lock; + +template +struct Table { + Bucket* buckets; + Mutex* locks; // mutex for write buckets + int* buckets_size; // size of each buckets. + V** slices; // Handles of the HBM/ HMEM slices. + size_t dim; // Dimension of the `vectors`. + size_t bytes_per_slice; // Size by byte of one slice. + size_t num_of_memory_slices; // Number of vectors memory slices. + size_t capacity = 134217728; // Initial capacity. 
+ size_t max_size = + std::numeric_limits::max(); // Up limit of the table capacity. + size_t buckets_num; // Number of the buckets. + size_t bucket_max_size = 128; // Volume of each buckets. + size_t max_hbm_for_vectors = 0; // Max HBM allocated for vectors + size_t remaining_hbm_for_vectors = 0; // Remaining HBM allocated for vectors + bool is_pure_hbm = true; // unused + bool primary = true; // unused + int slots_offset = 0; // unused + int slots_number = 0; // unused + int device_id = 0; // Device id + int tile_size; +}; + +template +using EraseIfPredictInternal = + bool (*)(const K& key, ///< iterated key in table + M& meta, ///< iterated meta in table + const K& pattern, ///< input key from caller + const M& threshold ///< input meta from caller + ); + +/** + * An abstract class provides interface between the nv::merlin::HashTable + * and a file, which enables the table to save to the file or load from + * the file, by overriding the `read` and `write` method. + * + * @tparam K The data type of the key. + * @tparam V The data type of the vector's elements. + * The item data type should be a basic data type of C++/CUDA. + * @tparam M The data type for `meta`. + * The currently supported data type is only `uint64_t`. + * + */ +template +class BaseKVFile { + public: + virtual ~BaseKVFile() {} + + /** + * Read from file and fill into the keys, values, and metas buffer. + * When calling save/load method from table, it can assume that the + * received buffer of keys, vectors, and metas are automatically + * pre-allocated. + * + * @param n The number of KV pairs expect to read. `int64_t` was used + * here to adapt to various filesytem and formats. + * @param dim The dimension of the `vectors`. + * @param keys The pointer to received buffer for keys. + * @param vectors The pointer to received buffer for vectors. + * @param metas The pointer to received buffer for metas. + * + * @return Number of KV pairs have been successfully read. + */ + virtual size_t read(const size_t n, const size_t dim, K* keys, V* vectors, + M* metas) = 0; + + /** + * Write keys, values, metas from table to the file. It defines + * an abstract method to get batch of KV pairs and write them into + * file. + * + * @param n The number of KV pairs to be written. `int64_t` was used + * here to adapt to various filesytem and formats. + * @param dim The dimension of the `vectors`. + * @param keys The keys will be written to file. + * @param vectors The vectors of values will be written to file. + * @param metas The metas will be written to file. + * + * @return Number of KV pairs have been successfully written. + */ + virtual size_t write(const size_t n, const size_t dim, const K* keys, + const V* vectors, const M* metas) = 0; +}; + +enum class OccupyResult { + INITIAL, ///< Initial status + CONTINUE, ///< Insert did not succeed, continue trying to insert + OCCUPIED_EMPTY, ///< New pair inserted successfully + OCCUPIED_RECLAIMED, + DUPLICATE, ///< Insert did not succeed, key is already present + EVICT, ///< Insert succeeded by evicting one key with minimum meta. + REFUSED, ///< Insert did not succeed, insert meta is too low. +}; + +enum class OverrideResult { + INITIAL, ///< Initial status + CONTINUE, ///< Override did not succeed, continue trying to override + SUCCESS, ///< Override successfully + REFUSED, ///< Override is refused. 
+}; + +} // namespace merlin +} // namespace nv diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/utils.cuh b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/utils.cuh new file mode 100644 index 000000000..bd60f93fc --- /dev/null +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/utils.cuh @@ -0,0 +1,368 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include "cuda_fp16.h" +#include "cuda_runtime_api.h" +#include "debug.hpp" +#include "tensorflow_recommenders_addons/dynamic_embedding/core/lib/utils/cuda_utils.cuh" + +using namespace cooperative_groups; +namespace cg = cooperative_groups; + +/* +__inline__ __device__ uint64_t atomicCAS(uint64_t* address, uint64_t compare, + uint64_t val) { + return (uint64_t)atomicCAS((unsigned long long*)address, + (unsigned long long)compare, + (unsigned long long)val); +} + +__inline__ __device__ int64_t atomicCAS(int64_t* address, int64_t compare, + int64_t val) { + return (int64_t)atomicCAS((unsigned long long*)address, + (unsigned long long)compare, + (unsigned long long)val); +} +*/ + +__inline__ __device__ uint64_t atomicExch(uint64_t* address, uint64_t val) { + return (uint64_t)atomicExch((unsigned long long*)address, + (unsigned long long)val); +} + +__inline__ __device__ int64_t atomicExch(int64_t* address, int64_t val) { + return (int64_t)atomicExch((unsigned long long*)address, + (unsigned long long)val); +} + +__inline__ __device__ signed char atomicExch(signed char* address, + signed char val) { + signed char old = *address; + *address = val; + return old; +} + +/* +__inline__ __device__ int64_t atomicAdd(int64_t* address, const int64_t val) { + return (int64_t)atomicAdd((unsigned long long*)address, val); +} + +__inline__ __device__ uint64_t atomicAdd(uint64_t* address, + const uint64_t val) { + return (uint64_t)atomicAdd((unsigned long long*)address, val); +} +*/ + +namespace nv { +namespace merlin { + +inline void __cudaCheckError(const char* file, const int line) { +#ifdef CUDA_ERROR_CHECK + cudaError err = cudaGetLastError(); + if (cudaSuccess != err) { + fprintf(stderr, "cudaCheckError() failed at %s:%i : %s\n", file, line, + cudaGetErrorString(err)); + exit(-1); + } + + // More careful checking. However, this will affect performance. + // Comment away if needed. + err = cudaDeviceSynchronize(); + if (cudaSuccess != err) { + fprintf(stderr, "cudaCheckError() with sync failed at %s:%i : %s\n", file, + line, cudaGetErrorString(err)); + exit(-1); + } +#endif + + return; +} +//#define CudaCheckError() nv::merlin::__cudaCheckError(__FILE__, __LINE__) +#define CudaCheckError() {} + +static inline size_t SAFE_GET_GRID_SIZE(size_t N, int block_size) { + return ((N) > std::numeric_limits::max()) + ? 
((1 << 30 - 1) / block_size + 1) + : (((N)-1) / block_size + 1); +} + +static inline int SAFE_GET_BLOCK_SIZE(int block_size, int device = -1) { + cudaDeviceProp prop; + int current_device = device; + if (current_device == -1) { + CUDA_CHECK(cudaGetDevice(¤t_device)); + } + CUDA_CHECK(cudaGetDeviceProperties(&prop, current_device)); + if (block_size > prop.maxThreadsPerBlock) { + fprintf(stdout, + "The requested block_size=%d exceeds the device limit, " + "the maxThreadsPerBlock=%d will be applied.\n", + block_size, prop.maxThreadsPerBlock); + } + return std::min(prop.maxThreadsPerBlock, block_size); +} + +inline uint64_t Murmur3HashHost(const uint64_t& key) { + uint64_t k = key; + k ^= k >> 33; + k *= UINT64_C(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= UINT64_C(0xc4ceb9fe1a85ec53); + k ^= k >> 33; + return k; +} + +__inline__ __device__ uint64_t Murmur3HashDevice(uint64_t const& key) { + uint64_t k = key; + k ^= k >> 33; + k *= UINT64_C(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= UINT64_C(0xc4ceb9fe1a85ec53); + k ^= k >> 33; + return k; +} + +__inline__ __device__ int64_t Murmur3HashDevice(int64_t const& key) { + uint64_t k = uint64_t(key); + k ^= k >> 33; + k *= UINT64_C(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= UINT64_C(0xc4ceb9fe1a85ec53); + k ^= k >> 33; + return int64_t(k); +} + +__inline__ __device__ uint32_t Murmur3HashDevice(uint32_t const& key) { + uint32_t k = key; + k ^= k >> 16; + k *= UINT32_C(0x85ebca6b); + k ^= k >> 13; + k *= UINT32_C(0xc2b2ae35); + k ^= k >> 16; + + return k; +} + +__inline__ __device__ int32_t Murmur3HashDevice(int32_t const& key) { + uint32_t k = uint32_t(key); + k ^= k >> 16; + k *= UINT32_C(0x85ebca6b); + k ^= k >> 13; + k *= UINT32_C(0xc2b2ae35); + k ^= k >> 16; + + return int32_t(k); +} + +class CudaDeviceRestorer { + public: + CudaDeviceRestorer() { CUDA_CHECK(cudaGetDevice(&dev_)); } + ~CudaDeviceRestorer() { CUDA_CHECK(cudaSetDevice(dev_)); } + + private: + int dev_; +}; + +static inline int get_dev(const void* ptr) { + cudaPointerAttributes attr; + CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr)); + int dev = -1; + +#if CUDART_VERSION >= 10000 + if (attr.type == cudaMemoryTypeDevice) +#else + if (attr.memoryType == cudaMemoryTypeDevice) +#endif + { + dev = attr.device; + } + return dev; +} + +static inline void switch_to_dev(const void* ptr) { + int dev = get_dev(ptr); + if (dev >= 0) { + CUDA_CHECK(cudaSetDevice(dev)); + } +} + +static inline bool is_on_device(const void* ptr) { + cudaPointerAttributes attr; + CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr)); + +#if CUDART_VERSION >= 10000 + return (attr.type == cudaMemoryTypeDevice); +#else + return (attr.memoryType == cudaMemoryTypeDevice); +#endif +} + +template +struct TypeConvertFunc; + +template <> +struct TypeConvertFunc<__half, float> { + static __forceinline__ __device__ __half convert(float val) { + return __float2half(val); + } +}; + +template <> +struct TypeConvertFunc { + static __forceinline__ __device__ float convert(__half val) { + return __half2float(val); + } +}; + +template <> +struct TypeConvertFunc { + static __forceinline__ __device__ float convert(float val) { return val; } +}; + +template <> +struct TypeConvertFunc { + static __forceinline__ __device__ float convert(long long val) { + return static_cast(val); + } +}; + +template <> +struct TypeConvertFunc { + static __forceinline__ __device__ float convert(unsigned int val) { + return static_cast(val); + } +}; + +template <> +struct TypeConvertFunc { + static __forceinline__ __device__ int convert(long long val) { + 
+    return static_cast<int>(val);
+  }
+};
+
+template <>
+struct TypeConvertFunc<int, unsigned int> {
+  static __forceinline__ __device__ int convert(unsigned int val) {
+    return static_cast<int>(val);
+  }
+};
+
+template <class P>
+void realloc(P* ptr, size_t old_size, size_t new_size) {
+  // Truncate old_size to limit downstream copy ops.
+  old_size = std::min(old_size, new_size);
+
+  // Alloc new buffer and copy old data over.
+  char* new_ptr;
+  CUDA_CHECK(cudaMalloc(&new_ptr, new_size));
+  if (*ptr != nullptr) {
+    CUDA_CHECK(cudaMemcpy(new_ptr, *ptr, old_size, cudaMemcpyDefault));
+    CUDA_CHECK(cudaFree(*ptr));
+  }
+
+  // Zero-fill remainder.
+  CUDA_CHECK(cudaMemset(new_ptr + old_size, 0, new_size - old_size));
+
+  // Switch to new pointer.
+  *ptr = reinterpret_cast<P>(new_ptr);
+  return;
+}
+
+template <class P>
+void realloc_managed(P* ptr, size_t old_size, size_t new_size) {
+  // Truncate old_size to limit downstream copy ops.
+  old_size = std::min(old_size, new_size);
+
+  // Alloc new buffer and copy old data over.
+  char* new_ptr;
+  CUDA_CHECK(cudaMallocManaged(&new_ptr, new_size));
+  if (*ptr != nullptr) {
+    CUDA_CHECK(cudaMemcpy(new_ptr, *ptr, old_size, cudaMemcpyDefault));
+    CUDA_CHECK(cudaFree(*ptr));
+  }
+
+  // Zero-fill remainder.
+  CUDA_CHECK(cudaMemset(new_ptr + old_size, 0, new_size - old_size));
+
+  // Switch to new pointer.
+  *ptr = reinterpret_cast<P>
(new_ptr); + return; +} + +template +__forceinline__ __device__ void lock( + const cg::thread_block_tile& tile, mutex& set_mutex, + unsigned long long lane = 0) { + if (THREAD_SAFE) { + set_mutex.acquire(tile, lane); + } +} + +template +__forceinline__ __device__ void unlock( + const cg::thread_block_tile& tile, mutex& set_mutex, + unsigned long long lane = 0) { + if (THREAD_SAFE) { + set_mutex.release(tile, lane); + } +} + +inline void free_pointers(cudaStream_t stream, int n, ...) { + va_list args; + va_start(args, n); + void* ptr = nullptr; + for (int i = 0; i < n; i++) { + ptr = va_arg(args, void*); + if (ptr) { + cudaPointerAttributes attr; + memset(&attr, 0, sizeof(cudaPointerAttributes)); + try { + CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr)); + if (attr.devicePointer && (!attr.hostPointer)) { + CUDA_CHECK(cudaFreeAsync(ptr, stream)); + } else if (attr.devicePointer && attr.hostPointer) { + CUDA_CHECK(cudaFreeHost(ptr)); + } else { + free(ptr); + } + } catch (const nv::merlin::CudaException& e) { + va_end(args); + throw e; + } + } + } + va_end(args); +} + +#define CUDA_FREE_POINTERS(stream, ...) \ + nv::merlin::free_pointers( \ + stream, (sizeof((void*[]){__VA_ARGS__}) / sizeof(void*)), __VA_ARGS__); + +static inline size_t GB(size_t n) { return n << 30; } + +static inline size_t MB(size_t n) { return n << 20; } + +static inline size_t KB(size_t n) { return n << 10; } + +} // namespace merlin +} // namespace nv diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin_hashtable.cuh b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin_hashtable.cuh new file mode 100644 index 000000000..19da172ea --- /dev/null +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin_hashtable.cuh @@ -0,0 +1,1643 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "merlin/array_kernels.cuh" +#include "merlin/core_kernels.cuh" +#include "merlin/flexible_buffer.cuh" +#include "merlin/group_lock.hpp" +#include "merlin/memory_pool.cuh" +#include "merlin/types.cuh" +#include "merlin/utils.cuh" + +namespace nv { +namespace merlin { + +/** + * @brief Enumeration of the eviction strategies. + * + * @note The `meta` is introduced to define the importance of each key, the + * larger, the more important, the less likely they will be evicted. On `kLru` + * mode, the `metas` parameter of the APIs should keep `nullptr`, the meta for + * each key is assigned internally in LRU(Least Recently Used) policy. On + * `kCustomized` mode, the `metas` should be provided by caller. + * + * @note Eviction occurs automatically when a bucket is full. The keys with the + * minimum `meta` value are evicted first. + * + */ +enum class EvictStrategy { + kLru = 0, ///< LRU mode. + kCustomized = 1 ///< Customized mode. +}; + +/** + * @brief The options struct of HierarchicalKV. 
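+ *
+ * A configuration sketch (the concrete values here are illustrative only):
+ *
+ * ```
+ * HashTableOptions options;
+ * options.init_capacity = 64 * 1024 * 1024;
+ * options.max_capacity = 128 * 1024 * 1024;
+ * options.dim = 64;
+ * options.max_hbm_for_vectors = nv::merlin::GB(16);
+ * options.evict_strategy = EvictStrategy::kLru;
+ * ```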
+ */ +struct HashTableOptions { + size_t init_capacity = 0; ///< The initial capacity of the hash table. + size_t max_capacity = 0; ///< The maximum capacity of the hash table. + size_t max_hbm_for_vectors = 0; ///< The maximum HBM for vectors, in bytes. + size_t max_bucket_size = 128; ///< The length of each bucket. + size_t dim = 64; ///< The dimension of the vectors. + float max_load_factor = 0.5f; ///< The max load factor before rehashing. + int block_size = 128; ///< The default block size for CUDA kernels. + int io_block_size = 1024; ///< The block size for IO CUDA kernels. + int device_id = -1; ///< The ID of device. + bool io_by_cpu = false; ///< The flag indicating if the CPU handles IO. + EvictStrategy evict_strategy = EvictStrategy::kLru; ///< The evict strategy. + bool use_constant_memory = false; ///< reserved + MemoryPoolOptions + device_memory_pool; ///< Configuration options for device memory pool. + MemoryPoolOptions + host_memory_pool; ///< Configuration options for host memory pool. +}; + +/** + * @brief A customizable template function indicates which keys should be + * erased from the hash table by returning `true`. + * + * @note The `erase_if` or `export_batch_if` API traverses all of the items by + * this function and the items that return `true` are removed or exported. + * + * Example for erase_if: + * + * ``` + * template + * __forceinline__ __device__ bool erase_if_pred(const K& key, + * M& meta, + * const K& pattern, + * const M& threshold) { + * return ((key & 0xFFFF000000000000 == pattern) && + * (meta < threshold)); + * } + * ``` + * + * Example for export_batch_if: + * ``` + * template + * __forceinline__ __device__ bool export_if_pred(const K& key, + * M& meta, + * const K& pattern, + * const M& threshold) { + * return meta >= threshold; + * } + * ``` + */ +template +using EraseIfPredict = bool (*)( + const K& key, ///< The traversed key in a hash table. + M& meta, ///< The traversed meta in a hash table. + const K& pattern, ///< The key pattern to compare with the `key` argument. + const M& threshold ///< The threshold to compare with the `meta` argument. +); + +/** + * A HierarchicalKV hash table is a concurrent and hierarchical hash table that + * is powered by GPUs and can use HBM and host memory as storage for key-value + * pairs. Support for SSD storage is a future consideration. + * + * The `meta` is introduced to define the importance of each key, the + * larger, the more important, the less likely they will be evicted. Eviction + * occurs automatically when a bucket is full. The keys with the minimum `meta` + * value are evicted first. In a customized eviction strategy, we recommend + * using the timestamp or frequency of the key occurrence as the `meta` value + * for each key. You can also assign a special value to the `meta` to + * perform a customized eviction strategy. + * + * @note By default configuration, this class is thread-safe. + * + * @tparam K The data type of the key. + * @tparam V The data type of the vector's item type. + * The item data type should be a basic data type of C++/CUDA. + * @tparam M The data type for `meta`. + * The currently supported data type is only `uint64_t`. 
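+ *
+ * A minimal end-to-end sketch, assuming the table is instantiated as
+ * `HashTable<uint64_t, float, uint64_t>` and that `d_keys`, `d_values`, and
+ * `d_founds` are device buffers prepared by the caller (illustrative names):
+ *
+ * ```
+ * using Table = nv::merlin::HashTable<uint64_t, float, uint64_t>;
+ *
+ * HashTableOptions options;
+ * options.init_capacity = 1 << 20;
+ * options.max_capacity = 1 << 22;
+ * options.dim = 64;
+ *
+ * Table table;
+ * table.init(options);
+ * table.insert_or_assign(n, d_keys, d_values, /*metas=*/nullptr, stream);
+ * table.find(n, d_keys, d_values, d_founds, /*metas=*/nullptr, stream);
+ * ```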
+ * + */ +template +class HashTable { + public: + using size_type = size_t; + using key_type = K; + using value_type = V; + using meta_type = M; + using Pred = EraseIfPredict; + + private: + using TableCore = nv::merlin::Table; + static constexpr unsigned int TILE_SIZE = 4; + + using DeviceMemoryPool = MemoryPool>; + using HostMemoryPool = MemoryPool>; + +#if THRUST_VERSION >= 101600 + static constexpr auto thrust_par = thrust::cuda::par_nosync; +#else + static constexpr auto thrust_par = thrust::cuda::par; +#endif + + public: + /** + * @brief Default constructor for the hash table class. + */ + HashTable(){}; + + /** + * @brief Frees the resources used by the hash table and destroys the hash + * table object. + */ + ~HashTable() { + if (initialized_) { + CUDA_CHECK(cudaDeviceSynchronize()); + + initialized_ = false; + destroy_table(&table_); + CUDA_CHECK(cudaFree(d_table_)); + dev_mem_pool_.reset(); + host_mem_pool_.reset(); + } + } + + private: + HashTable(const HashTable&) = delete; + HashTable& operator=(const HashTable&) = delete; + HashTable(HashTable&&) = delete; + HashTable& operator=(HashTable&&) = delete; + + public: + /** + * @brief Initialize a merlin::HashTable. + * + * @param options The configuration options. + */ + void init(const HashTableOptions options) { + if (initialized_) { + return; + } + options_ = options; + + if (options_.device_id >= 0) { + CUDA_CHECK(cudaSetDevice(options_.device_id)); + } else { + CUDA_CHECK(cudaGetDevice(&(options_.device_id))); + } + + // Construct table. + cudaDeviceProp deviceProp; + CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, options_.device_id)); + shared_mem_size_ = deviceProp.sharedMemPerBlock; + create_table( + &table_, options_.dim, options_.init_capacity, options_.max_capacity, + options_.max_hbm_for_vectors, options_.max_bucket_size); + options_.block_size = SAFE_GET_BLOCK_SIZE(options_.block_size); + reach_max_capacity_ = (options_.init_capacity * 2 > options_.max_capacity); + MERLIN_CHECK((!(options_.io_by_cpu && options_.max_hbm_for_vectors != 0)), + "[HierarchicalKV] `io_by_cpu` should not be true when " + "`max_hbm_for_vectors` is not 0!"); + CUDA_CHECK(cudaMalloc((void**)&(d_table_), sizeof(TableCore))); + + sync_table_configuration(); + + // Create memory pools. + dev_mem_pool_ = std::make_unique>>( + options_.device_memory_pool); + host_mem_pool_ = std::make_unique>>( + options_.host_memory_pool); + + CUDA_CHECK(cudaDeviceSynchronize()); + initialized_ = true; + CudaCheckError(); + } + + /** + * @brief Insert new key-value-meta tuples into the hash table. + * If the key already exists, the values and metas are assigned new values. + * + * If the target bucket is full, the keys with minimum meta will be + * overwritten by new key unless the meta of the new key is even less than + * minimum meta of the target bucket. + * + * @param n Number of key-value-meta tuples to insert or assign. + * @param keys The keys to insert on GPU-accessible memory with shape + * (n). + * @param values The values to insert on GPU-accessible memory with + * shape (n, DIM). + * @param metas The metas to insert on GPU-accessible memory with shape + * (n). + * @parblock + * The metas should be a `uint64_t` value. You can specify a value that + * such as the timestamp of the key insertion, number of the key + * occurrences, or another value to perform a custom eviction strategy. + * + * The @p metas should be `nullptr`, when the LRU eviction strategy is + * applied. 
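+ *
+ * For example (a sketch; `d_keys`, `d_vectors`, and `d_metas` are assumed to
+ * be device buffers prepared by the caller):
+ *
+ * ```
+ * // kLru: the table maintains metas internally.
+ * table.insert_or_assign(n, d_keys, d_vectors, nullptr, stream);
+ *
+ * // kCustomized: e.g. pass a timestamp per key as its meta.
+ * table.insert_or_assign(n, d_keys, d_vectors, d_metas, stream);
+ * ```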
+ * @endparblock + * + * @param stream The CUDA stream that is used to execute the operation. + * + * @param ignore_evict_strategy A boolean option indicating whether if + * the insert_or_assign ignores the evict strategy of table with current + * metas anyway. If true, it does not check whether the metas confroms to + * the evict strategy. If false, it requires the metas follow the evict + * strategy of table. + */ + void insert_or_assign(const size_type n, + const key_type* keys, // (n) + const value_type* values, // (n, DIM) + const meta_type* metas = nullptr, // (n) + cudaStream_t stream = 0, + bool ignore_evict_strategy = false) { + if (n == 0) { + return; + } + + while (!reach_max_capacity_ && + fast_load_factor(n, stream) > options_.max_load_factor) { + reserve(capacity() * 2, stream); + } + + if (!ignore_evict_strategy) { + check_evict_strategy(metas); + } + + writer_shared_lock lock(mutex_); + + if (is_fast_mode()) { + using Selector = + SelectUpsertKernelWithIO; + static thread_local int step_counter = 0; + static thread_local float load_factor = 0.0; + + if (((step_counter++) % kernel_select_interval_) == 0) { + load_factor = fast_load_factor(0, stream, false); + } + + Selector::execute_kernel( + load_factor, options_.block_size, options_.max_bucket_size, + table_->buckets_num, options_.dim, stream, n, d_table_, keys, + reinterpret_cast(values), metas); + } else { + const size_type dev_ws_size{n * (sizeof(value_type*) + sizeof(int))}; + auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; + auto d_dst{dev_ws.get(0)}; + auto d_src_offset{reinterpret_cast(d_dst + n)}; + + CUDA_CHECK(cudaMemsetAsync(d_dst, 0, dev_ws_size, stream)); + + { + const size_t block_size = options_.block_size; + const size_t N = n * TILE_SIZE; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + upsert_kernel + <<>>( + d_table_, options_.max_bucket_size, table_->buckets_num, + options_.dim, keys, d_dst, metas, d_src_offset, N); + } + + { + thrust::device_ptr d_dst_ptr( + reinterpret_cast(d_dst)); + thrust::device_ptr d_src_offset_ptr(d_src_offset); + + thrust::sort_by_key(thrust_par.on(stream), d_dst_ptr, d_dst_ptr + n, + d_src_offset_ptr, thrust::less()); + } + + if (options_.io_by_cpu) { + const size_type host_ws_size{dev_ws_size + + n * sizeof(value_type) * dim()}; + auto host_ws{host_mem_pool_->get_workspace<1>(host_ws_size, stream)}; + auto h_dst{host_ws.get(0)}; + auto h_src_offset{reinterpret_cast(h_dst + n)}; + auto h_values{reinterpret_cast(h_src_offset + n)}; + + CUDA_CHECK(cudaMemcpyAsync(h_dst, d_dst, dev_ws_size, + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(h_values, values, host_ws_size - dev_ws_size, + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + write_by_cpu(h_dst, h_values, h_src_offset, dim(), n); + } else { + const size_t block_size = options_.io_block_size; + const size_t N = n * dim(); + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + write_kernel + <<>>(values, d_dst, d_src_offset, + dim(), N); + } + } + + CudaCheckError(); + } + + /** + * @brief Insert new key-value-meta tuples into the hash table. + * If the key already exists, the values and metas are assigned new values. + * + * If the target bucket is full, the keys with minimum meta will be + * overwritten by new key unless the meta of the new key is even less than + * minimum meta of the target bucket. 
The overwritten key with minimum + * meta will be evicted, with its values and meta, to evicted_keys, + * evicted_values, evcted_metas seperately in compact format. + * + * @param n Number of key-value-meta tuples to insert or assign. + * @param keys The keys to insert on GPU-accessible memory with shape + * (n). + * @param values The values to insert on GPU-accessible memory with + * shape (n, DIM). + * @param metas The metas to insert on GPU-accessible memory with shape + * (n). + * @param metas The metas to insert on GPU-accessible memory with shape + * (n). + * @params evicted_keys The output of keys replaced with minimum meta. + * @params evicted_values The output of values replaced with minimum meta on + * keys. + * @params evicted_metas The output of metas replaced with minimum meta on + * keys. + * @parblock + * The metas should be a `uint64_t` value. You can specify a value that + * such as the timestamp of the key insertion, number of the key + * occurrences, or another value to perform a custom eviction strategy. + * + * The @p metas should be `nullptr`, when the LRU eviction strategy is + * applied. + * @endparblock + * + * @param stream The CUDA stream that is used to execute the operation. + * + * @param ignore_evict_strategy A boolean option indicating whether if + * the insert_or_assign ignores the evict strategy of table with current + * metas anyway. If true, it does not check whether the metas confroms to + * the evict strategy. If false, it requires the metas follow the evict + * strategy of table. + */ + size_type insert_and_evict(const size_type n, + const key_type* keys, // (n) + const value_type* values, // (n, DIM) + const meta_type* metas, // (n) + key_type* evicted_keys, // (n) + value_type* evicted_values, // (n, DIM) + meta_type* evicted_metas, // (n) + cudaStream_t stream = 0) { + if (n == 0) { + return 0; + } + + while (!reach_max_capacity_ && + fast_load_factor(n, stream) > options_.max_load_factor) { + reserve(capacity() * 2, stream); + } + + writer_shared_lock lock(mutex_); + + // TODO: Currently only need eviction when using HashTable as HBM cache. + if (!is_fast_mode()) { + throw std::runtime_error("Only allow insert_and_evict in pure HBM mode."); + } + + using Selector = + SelectUpsertAndEvictKernelWithIO; + static thread_local int step_counter = 0; + static thread_local float load_factor = 0.0; + + if (((step_counter++) % kernel_select_interval_) == 0) { + load_factor = fast_load_factor(0, stream, false); + } + + // always use max tile to avoid data-deps as possible. 
+ const int TILE_SIZE = 32; + size_t n_offsets = (n + TILE_SIZE - 1) / TILE_SIZE; + const size_type dev_ws_size = + n_offsets * sizeof(int64_t) + n * sizeof(bool) + sizeof(size_type); + + auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; + auto d_offsets{dev_ws.get(0)}; + auto dn_evicted = reinterpret_cast(d_offsets + n_offsets); + auto d_masks = reinterpret_cast(dn_evicted + 1); + + CUDA_CHECK( + cudaMemsetAsync(d_offsets, 0, n_offsets * sizeof(int64_t), stream)); + CUDA_CHECK(cudaMemsetAsync(dn_evicted, 0, sizeof(size_type), stream)); + CUDA_CHECK(cudaMemsetAsync(d_masks, 0, n * sizeof(bool), stream)); + + size_type block_size = options_.block_size; + size_type grid_size = SAFE_GET_GRID_SIZE(n, block_size); + CUDA_CHECK(cudaMemsetAsync(evicted_keys, static_cast(EMPTY_KEY), + n * sizeof(K), stream)); + + Selector::execute_kernel( + load_factor, options_.block_size, options_.max_bucket_size, + table_->buckets_num, options_.dim, stream, n, d_table_, keys, values, + metas, evicted_keys, evicted_values, evicted_metas); + + keys_not_empty + <<>>(evicted_keys, d_masks, n); + size_type n_evicted = 0; + gpu_pick_kvm_inplace( + grid_size, block_size, d_masks, true, n, dn_evicted, d_offsets, evicted_keys, + evicted_values, evicted_metas, dim(), stream); + CUDA_CHECK(cudaMemcpyAsync(&n_evicted, dn_evicted, sizeof(size_type), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CudaCheckError(); + return n_evicted; + } + + /** + * Searches for each key in @p keys in the hash table. + * If the key is found and the corresponding value in @p accum_or_assigns is + * `true`, the @p vectors_or_deltas is treated as a delta to the old + * value, and the delta is added to the old value of the key. + * + * If the key is not found and the corresponding value in @p accum_or_assigns + * is `false`, the @p vectors_or_deltas is treated as a new value and the + * key-value pair is updated in the table directly. + * + * @note When the key is found and the value of @p accum_or_assigns is + * `false`, or when the key is not found and the value of @p accum_or_assigns + * is `true`, nothing is changed and this operation is ignored. + * The algorithm assumes these situations occur while the key was modified or + * removed by other processes just now. + * + * @param n The number of key-value-meta tuples to process. + * @param keys The keys to insert on GPU-accessible memory with shape (n). + * @param value_or_deltas The values or deltas to insert on GPU-accessible + * memory with shape (n, DIM). + * @param accum_or_assigns The operation type with shape (n). A value of + * `true` indicates to accum and `false` indicates to assign. + * @param metas The metas to insert on GPU-accessible memory with shape (n). + * @parblock + * The metas should be a `uint64_t` value. You can specify a value that + * such as the timestamp of the key insertion, number of the key + * occurrences, or another value to perform a custom eviction strategy. + * + * The @p metas should be `nullptr`, when the LRU eviction strategy is + * applied. + * @endparblock + * + * @param stream The CUDA stream that is used to execute the operation. + * + * @param ignore_evict_strategy A boolean option indicating whether if + * the accum_or_assign ignores the evict strategy of table with current + * metas anyway. If true, it does not check whether the metas confroms to + * the evict strategy. If false, it requires the metas follow the evict + * strategy of table. 
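+ *
+ * A usage sketch (assuming `d_accum_flags` is a device array of `bool`
+ * prepared by the caller; `true` means "add delta", `false` means "assign"):
+ *
+ * ```
+ * table.accum_or_assign(n, d_keys, d_deltas_or_values, d_accum_flags,
+ *                       /*metas=*/nullptr, stream);
+ * ```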
+ * + */ + void accum_or_assign(const size_type n, + const key_type* keys, // (n) + const value_type* value_or_deltas, // (n, DIM) + const bool* accum_or_assigns, // (n) + const meta_type* metas = nullptr, // (n) + cudaStream_t stream = 0, + bool ignore_evict_strategy = false) { + if (n == 0) { + return; + } + + while (!reach_max_capacity_ && + fast_load_factor(n, stream) > options_.max_load_factor) { + reserve(capacity() * 2, stream); + } + + if (!ignore_evict_strategy) { + check_evict_strategy(metas); + } + + writer_shared_lock lock(mutex_); + + const size_type dev_ws_size{ + n * (sizeof(value_type*) + sizeof(int) + sizeof(bool))}; + auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; + auto dst{dev_ws.get(0)}; + auto src_offset{reinterpret_cast(dst + n)}; + auto founds{reinterpret_cast(src_offset + n)}; + + CUDA_CHECK(cudaMemsetAsync(dst, 0, dev_ws_size, stream)); + + { + const size_t block_size = options_.block_size; + const size_t N = n * TILE_SIZE; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + accum_kernel + <<>>( + table_, keys, dst, metas, accum_or_assigns, table_->buckets, + table_->buckets_size, table_->bucket_max_size, + table_->buckets_num, src_offset, founds, N); + } + + if (!is_fast_mode()) { + thrust::device_ptr dst_ptr(reinterpret_cast(dst)); + thrust::device_ptr src_offset_ptr(src_offset); + + thrust::sort_by_key(thrust_par.on(stream), dst_ptr, dst_ptr + n, + src_offset_ptr, thrust::less()); + } + + { + const size_t block_size = options_.io_block_size; + const size_t N = n * dim(); + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + write_with_accum_kernel + <<>>(value_or_deltas, dst, + accum_or_assigns, founds, + src_offset, dim(), N); + } + + CudaCheckError(); + } + + /** + * @brief Searches the hash table for the specified keys. + * When a key is missing, the value in @p values and @p metas will be + * inserted. + * + * @param n The number of key-value-meta tuples to search or insert. + * @param keys The keys to search on GPU-accessible memory with shape (n). + * @param values The values to search on GPU-accessible memory with + * shape (n, DIM). + * @param metas The metas to search on GPU-accessible memory with shape (n). + * @parblock + * If @p metas is `nullptr`, the meta for each key will not be returned. + * @endparblock + * @param stream The CUDA stream that is used to execute the operation. 
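+ *
+ * A usage sketch (missing keys are inserted with the caller-provided values;
+ * `d_keys` and `d_values` are assumed to be device buffers):
+ *
+ * ```
+ * table.find_or_insert(n, d_keys, d_values, /*metas=*/nullptr, stream);
+ * // On return, d_values holds the table's value for every key, whether it
+ * // was found or freshly inserted.
+ * ```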
+ * + */ + void find_or_insert(const size_type n, const key_type* keys, // (n) + value_type* values, // (n * DIM) + meta_type* metas = nullptr, // (n) + cudaStream_t stream = 0, + bool ignore_evict_strategy = false) { + if (n == 0) { + return; + } + + while (!reach_max_capacity_ && + fast_load_factor(n, stream) > options_.max_load_factor) { + reserve(capacity() * 2, stream); + } + + if (!ignore_evict_strategy) { + check_evict_strategy(metas); + } + + writer_shared_lock lock(mutex_); + + if (is_fast_mode()) { + using Selector = + SelectFindOrInsertKernelWithIO; + static thread_local int step_counter = 0; + static thread_local float load_factor = 0.0; + + if (((step_counter++) % kernel_select_interval_) == 0) { + load_factor = fast_load_factor(0, stream, false); + } + Selector::execute_kernel(load_factor, options_.block_size, + options_.max_bucket_size, table_->buckets_num, + options_.dim, stream, n, d_table_, keys, values, + metas); + } else { + const size_type dev_ws_size{ + n * (sizeof(value_type*) + sizeof(int) + sizeof(bool))}; + auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; + auto d_table_value_addrs{dev_ws.get(0)}; + auto param_key_index{reinterpret_cast(d_table_value_addrs + n)}; + auto founds{reinterpret_cast(param_key_index + n)}; + + CUDA_CHECK(cudaMemsetAsync(d_table_value_addrs, 0, dev_ws_size, stream)); + + { + const size_t block_size = options_.block_size; + const size_t N = n * TILE_SIZE; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + find_or_insert_kernel + <<>>( + d_table_, options_.max_bucket_size, table_->buckets_num, + options_.dim, keys, d_table_value_addrs, metas, founds, + param_key_index, N); + } + + { + thrust::device_ptr table_value_ptr( + reinterpret_cast(d_table_value_addrs)); + thrust::device_ptr param_key_index_ptr(param_key_index); + + thrust::sort_by_key(thrust_par.on(stream), table_value_ptr, + table_value_ptr + n, param_key_index_ptr, + thrust::less()); + } + + if (options_.io_by_cpu) { + const size_type host_ws_size{ + dev_ws_size + n * (sizeof(bool) + sizeof(value_type) * dim())}; + auto host_ws{host_mem_pool_->get_workspace<1>(host_ws_size, stream)}; + auto h_table_value_addrs{host_ws.get(0)}; + auto h_param_key_index{reinterpret_cast(h_table_value_addrs + n)}; + auto h_founds{reinterpret_cast(h_param_key_index + n)}; + auto h_param_values{reinterpret_cast(h_founds + n)}; + + CUDA_CHECK(cudaMemcpyAsync(h_table_value_addrs, d_table_value_addrs, + dev_ws_size, cudaMemcpyDeviceToHost, + stream)); + CUDA_CHECK(cudaMemcpyAsync(h_founds, founds, n * sizeof(bool), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(h_param_values, values, + n * sizeof(value_type) * dim(), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + read_or_write_by_cpu(h_table_value_addrs, h_param_values, + h_param_key_index, h_founds, dim(), n); + CUDA_CHECK(cudaMemcpyAsync(values, h_param_values, + n * sizeof(value_type) * dim(), + cudaMemcpyHostToDevice, stream)); + } else { + const size_t block_size = options_.io_block_size; + const size_t N = n * dim(); + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + read_or_write_kernel + <<>>( + d_table_value_addrs, values, founds, param_key_index, dim(), N); + } + } + + CudaCheckError(); + } + + /** + * @brief Searches the hash table for the specified keys and returns address + * of the values. When a key is missing, the value in @p values and @p metas + * will be inserted. 
+ * + * @warning This API returns internal addresses for high-performance but + * thread-unsafe. The caller is responsible for guaranteeing data consistency. + * + * @param n The number of key-value-meta tuples to search or insert. + * @param keys The keys to search on GPU-accessible memory with shape (n). + * @param values The addresses of values to search on GPU-accessible memory + * with shape (n). + * @param founds The status that indicates if the keys are found on + * @param metas The metas to search on GPU-accessible memory with shape (n). + * @parblock + * If @p metas is `nullptr`, the meta for each key will not be returned. + * @endparblock + * @param stream The CUDA stream that is used to execute the operation. + * + */ + void find_or_insert(const size_type n, const key_type* keys, // (n) + value_type** values, // (n) + bool* founds, // (n) + meta_type* metas = nullptr, // (n) + cudaStream_t stream = 0, + bool ignore_evict_strategy = false) { + if (n == 0) { + return; + } + + while (!reach_max_capacity_ && + fast_load_factor(n, stream) > options_.max_load_factor) { + reserve(capacity() * 2, stream); + } + + if (!ignore_evict_strategy) { + check_evict_strategy(metas); + } + + writer_shared_lock lock(mutex_); + + using Selector = + SelectFindOrInsertPtrKernel; + static thread_local int step_counter = 0; + static thread_local float load_factor = 0.0; + + if (((step_counter++) % kernel_select_interval_) == 0) { + load_factor = fast_load_factor(0, stream, false); + } + Selector::execute_kernel(load_factor, options_.block_size, + options_.max_bucket_size, table_->buckets_num, + options_.dim, stream, n, d_table_, keys, values, + metas, founds); + + CudaCheckError(); + } + /** + * @brief Assign new key-value-meta tuples into the hash table. + * If the key doesn't exist, the operation on the key will be ignored. + * + * @param n Number of key-value-meta tuples to insert or assign. + * @param keys The keys to insert on GPU-accessible memory with shape + * (n). + * @param values The values to insert on GPU-accessible memory with + * shape (n, DIM). + * @param metas The metas to insert on GPU-accessible memory with shape + * (n). + * @parblock + * The metas should be a `uint64_t` value. You can specify a value that + * such as the timestamp of the key insertion, number of the key + * occurrences, or another value to perform a custom eviction strategy. + * + * The @p metas should be `nullptr`, when the LRU eviction strategy is + * applied. + * @endparblock + * + * @param stream The CUDA stream that is used to execute the operation. 
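+ *
+ * For example (a sketch; only keys that already exist in the table are
+ * updated, unknown keys are silently skipped):
+ *
+ * ```
+ * table.assign(n, d_keys, d_new_values, /*metas=*/nullptr, stream);
+ * ```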
+ */ + void assign(const size_type n, + const key_type* keys, // (n) + const value_type* values, // (n, DIM) + const meta_type* metas = nullptr, // (n) + cudaStream_t stream = 0) { + if (n == 0) { + return; + } + + writer_shared_lock lock(mutex_); + + if (is_fast_mode()) { + using Selector = + SelectUpdateKernelWithIO; + static thread_local int step_counter = 0; + static thread_local float load_factor = 0.0; + + if (((step_counter++) % kernel_select_interval_) == 0) { + load_factor = fast_load_factor(0, stream, false); + } + + Selector::execute_kernel(load_factor, options_.block_size, + options_.max_bucket_size, table_->buckets_num, + options_.dim, stream, n, d_table_, keys, values, + metas); + } else { + const size_type dev_ws_size{n * (sizeof(value_type*) + sizeof(int))}; + auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; + auto d_dst{dev_ws.get(0)}; + auto d_src_offset{reinterpret_cast(d_dst + n)}; + + CUDA_CHECK(cudaMemsetAsync(d_dst, 0, dev_ws_size, stream)); + + { + const size_t block_size = options_.block_size; + const size_t N = n * TILE_SIZE; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + update_kernel + <<>>( + d_table_, options_.max_bucket_size, table_->buckets_num, + options_.dim, keys, d_dst, metas, d_src_offset, N); + } + + { + thrust::device_ptr d_dst_ptr( + reinterpret_cast(d_dst)); + thrust::device_ptr d_src_offset_ptr(d_src_offset); + + thrust::sort_by_key(thrust_par.on(stream), d_dst_ptr, d_dst_ptr + n, + d_src_offset_ptr, thrust::less()); + } + + if (options_.io_by_cpu) { + const size_type host_ws_size{dev_ws_size + + n * sizeof(value_type) * dim()}; + auto host_ws{host_mem_pool_->get_workspace<1>(host_ws_size, stream)}; + auto h_dst{host_ws.get(0)}; + auto h_src_offset{reinterpret_cast(h_dst + n)}; + auto h_values{reinterpret_cast(h_src_offset + n)}; + + CUDA_CHECK(cudaMemcpyAsync(h_dst, d_dst, dev_ws_size, + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(h_values, values, host_ws_size - dev_ws_size, + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + write_by_cpu(h_dst, h_values, h_src_offset, dim(), n); + } else { + const size_t block_size = options_.io_block_size; + const size_t N = n * dim(); + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + write_kernel + <<>>(values, d_dst, d_src_offset, + dim(), N); + } + } + + CudaCheckError(); + } + + /** + * @brief Searches the hash table for the specified keys. + * + * @note When a key is missing, the value in @p values is not changed. + * + * @param n The number of key-value-meta tuples to search. + * @param keys The keys to search on GPU-accessible memory with shape (n). + * @param values The values to search on GPU-accessible memory with + * shape (n, DIM). + * @param founds The status that indicates if the keys are found on + * GPU-accessible memory with shape (n). + * @param metas The metas to search on GPU-accessible memory with shape (n). + * @parblock + * If @p metas is `nullptr`, the meta for each key will not be returned. + * @endparblock + * @param stream The CUDA stream that is used to execute the operation. 
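+ *
+ * A lookup sketch (`d_founds` is a caller-allocated device array of `bool`
+ * with shape (n); values of missing keys are left untouched):
+ *
+ * ```
+ * table.find(n, d_keys, d_values, d_founds, /*metas=*/nullptr, stream);
+ * CUDA_CHECK(cudaStreamSynchronize(stream));
+ * ```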
+ * + */ + void find(const size_type n, const key_type* keys, // (n) + value_type* values, // (n, DIM) + bool* founds, // (n) + meta_type* metas = nullptr, // (n) + cudaStream_t stream = 0) const { + if (n == 0) { + return; + } + + CUDA_CHECK(cudaMemsetAsync(founds, 0, n * sizeof(bool), stream)); + + reader_shared_lock lock(mutex_); + + if (is_fast_mode()) { + using Selector = + SelectLookupKernelWithIO; + static thread_local int step_counter = 0; + static thread_local float load_factor = 0.0; + + if (((step_counter++) % kernel_select_interval_) == 0) { + load_factor = fast_load_factor(0, stream, false); + } + Selector::execute_kernel(load_factor, options_.block_size, + options_.max_bucket_size, table_->buckets_num, + options_.dim, stream, n, d_table_, keys, values, + metas, founds); + } else { + const size_type dev_ws_size{n * (sizeof(value_type*) + sizeof(int))}; + auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; + auto src{dev_ws.get(0)}; + auto dst_offset{reinterpret_cast(src + n)}; + + CUDA_CHECK(cudaMemsetAsync(src, 0, dev_ws_size, stream)); + + { + const size_t block_size = options_.block_size; + const size_t N = n * TILE_SIZE; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + lookup_kernel + <<>>( + d_table_, options_.max_bucket_size, table_->buckets_num, + options_.dim, keys, src, metas, founds, dst_offset, N); + } + + { + thrust::device_ptr src_ptr( + reinterpret_cast(src)); + thrust::device_ptr dst_offset_ptr(dst_offset); + + thrust::sort_by_key(thrust_par.on(stream), src_ptr, src_ptr + n, + dst_offset_ptr, thrust::less()); + } + + { + const size_t block_size = options_.io_block_size; + const size_t N = n * dim(); + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + read_kernel + <<>>(src, values, founds, + dst_offset, dim(), N); + } + } + + CudaCheckError(); + } + + /** + * @brief Searches the hash table for the specified keys and returns address + * of the values. + * + * @note When a key is missing, the data in @p values won't change. + * @warning This API returns internal addresses for high-performance but + * thread-unsafe. The caller is responsible for guaranteeing data consistency. + * + * @param n The number of key-value-meta tuples to search. + * @param keys The keys to search on GPU-accessible memory with shape (n). + * @param values The addresses of values to search on GPU-accessible memory + * with shape (n). + * @param founds The status that indicates if the keys are found on + * GPU-accessible memory with shape (n). + * @param metas The metas to search on GPU-accessible memory with shape (n). + * @parblock + * If @p metas is `nullptr`, the meta for each key will not be returned. + * @endparblock + * @param stream The CUDA stream that is used to execute the operation. 
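+ *
+ * A sketch of this address-returning overload (here `d_value_ptrs` is a
+ * device array of `value_type*` with shape (n); the returned addresses point
+ * into the table's own storage, so treat them as read-only snapshots unless
+ * you synchronize writers yourself):
+ *
+ * ```
+ * table.find(n, d_keys, d_value_ptrs, d_founds, /*metas=*/nullptr, stream);
+ * ```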
+ * + */ + void find(const size_type n, const key_type* keys, // (n) + value_type** values, // (n) + bool* founds, // (n) + meta_type* metas = nullptr, // (n) + cudaStream_t stream = 0) const { + if (n == 0) { + return; + } + + CUDA_CHECK(cudaMemsetAsync(founds, 0, n * sizeof(bool), stream)); + + reader_shared_lock lock(mutex_); + + using Selector = SelectLookupPtrKernel; + static thread_local int step_counter = 0; + static thread_local float load_factor = 0.0; + + if (((step_counter++) % kernel_select_interval_) == 0) { + load_factor = fast_load_factor(0, stream, false); + } + Selector::execute_kernel(load_factor, options_.block_size, + options_.max_bucket_size, table_->buckets_num, + options_.dim, stream, n, d_table_, keys, values, + metas, founds); + + CudaCheckError(); + } + + /** + * @brief Removes specified elements from the hash table. + * + * @param n The number of keys to remove. + * @param keys The keys to remove on GPU-accessible memory. + * @param stream The CUDA stream that is used to execute the operation. + * + */ + void erase(const size_type n, const key_type* keys, cudaStream_t stream = 0) { + if (n == 0) { + return; + } + + write_read_lock lock(mutex_); + + { + const size_t block_size = options_.block_size; + const size_t N = n * TILE_SIZE; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + remove_kernel + <<>>( + table_, keys, table_->buckets, table_->buckets_size, + table_->bucket_max_size, table_->buckets_num, N); + } + + CudaCheckError(); + return; + } + + /** + * @brief Erases all elements that satisfy the predicate @p pred from the + * hash table. + * + * The value for @p pred should be a function with type `Pred` defined like + * the following example: + * + * ``` + * template + * __forceinline__ __device__ bool erase_if_pred(const K& key, + * const M& meta, + * const K& pattern, + * const M& threshold) { + * return ((key & 0x1 == pattern) && (meta < threshold)); + * } + * ``` + * + * @param pred The predicate function with type Pred that returns `true` if + * the element should be erased. + * @param pattern The third user-defined argument to @p pred with key_type + * type. + * @param threshold The fourth user-defined argument to @p pred with meta_type + * type. + * @param stream The CUDA stream that is used to execute the operation. + * + * @return The number of elements removed. + * + */ + size_type erase_if(const Pred& pred, const key_type& pattern, + const meta_type& threshold, cudaStream_t stream = 0) { + write_read_lock lock(mutex_); + + auto dev_ws{dev_mem_pool_->get_workspace<1>(sizeof(size_type), stream)}; + auto d_count{dev_ws.get(0)}; + + CUDA_CHECK(cudaMemsetAsync(d_count, 0, sizeof(size_type), stream)); + + Pred h_pred; + CUDA_CHECK(cudaMemcpyFromSymbolAsync(&h_pred, pred, sizeof(Pred), 0, + cudaMemcpyDeviceToHost, stream)); + + { + const size_t block_size = options_.block_size; + const size_t N = table_->buckets_num; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + remove_kernel + <<>>( + table_, h_pred, pattern, threshold, d_count, table_->buckets, + table_->buckets_size, table_->bucket_max_size, + table_->buckets_num, N); + } + + size_type count = 0; + CUDA_CHECK(cudaMemcpyAsync(&count, d_count, sizeof(size_type), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + CudaCheckError(); + return count; + } + + /** + * @brief Removes all of the elements in the hash table with no release + * object. 
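+ *
+ * For comparison, the three removal paths sketched together. `d_keys`,
+ * `pattern`, `threshold` and `stream` are illustrative caller-side names,
+ * and `g_erase_pred` is assumed to be a `__device__` symbol of type `Pred`
+ * pointing at a predicate such as `erase_if_pred` above:
+ *
+ * ```
+ * table.erase(n, d_keys, stream);                            // listed keys
+ * table.erase_if(g_erase_pred, pattern, threshold, stream);  // conditional
+ * table.clear(stream);                                       // everything
+ * CUDA_CHECK(cudaStreamSynchronize(stream));
+ * ```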
+ */ + void clear(cudaStream_t stream = 0) { + write_read_lock lock(mutex_); + + const size_t block_size = options_.block_size; + const size_t N = table_->buckets_num * table_->bucket_max_size; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + clear_kernel + <<>>(table_, N); + + CudaCheckError(); + } + + public: + /** + * @brief Exports a certain number of the key-value-meta tuples from the + * hash table. + * + * @param n The maximum number of exported pairs. + * @param offset The position of the key to remove. + * @param counter Accumulates amount of successfully exported values. + * @param keys The keys to dump from GPU-accessible memory with shape (n). + * @param values The values to dump from GPU-accessible memory with shape + * (n, DIM). + * @param metas The metas to search on GPU-accessible memory with shape (n). + * @parblock + * If @p metas is `nullptr`, the meta for each key will not be returned. + * @endparblock + * + * @param stream The CUDA stream that is used to execute the operation. + * + * @return The number of elements dumped. + * + * @throw CudaException If the key-value size is too large for GPU shared + * memory. Reducing the value for @p n is currently required if this exception + * occurs. + */ + void export_batch(size_type n, const size_type offset, + size_type* counter, // (1) + key_type* keys, // (n) + value_type* values, // (n, DIM) + meta_type* metas = nullptr, // (n) + cudaStream_t stream = 0) const { + reader_shared_lock lock(mutex_); + + if (offset >= table_->capacity) { + return; + } + n = std::min(table_->capacity - offset, n); + + size_type shared_size; + size_type block_size; + std::tie(shared_size, block_size) = + dump_kernel_shared_memory_size(shared_mem_size_); + + const size_t grid_size = SAFE_GET_GRID_SIZE(n, block_size); + + dump_kernel + <<>>( + table_, keys, values, metas, offset, n, counter); + + CudaCheckError(); + } + + size_type export_batch(const size_type n, const size_type offset, + key_type* keys, // (n) + value_type* values, // (n, DIM) + meta_type* metas = nullptr, // (n) + cudaStream_t stream = 0) const { + auto dev_ws{dev_mem_pool_->get_workspace<1>(sizeof(size_type), stream)}; + auto d_counter{dev_ws.get(0)}; + + CUDA_CHECK(cudaMemsetAsync(d_counter, 0, sizeof(size_type), stream)); + export_batch(n, offset, d_counter, keys, values, metas, stream); + + size_type counter = 0; + CUDA_CHECK(cudaMemcpyAsync(&counter, d_counter, sizeof(size_type), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + return counter; + } + + /** + * @brief Exports a certain number of the key-value-meta tuples which match + * specified condition from the hash table. + * + * @param n The maximum number of exported pairs. + * The value for @p pred should be a function with type `Pred` defined like + * the following example: + * + * ``` + * template + * __forceinline__ __device__ bool export_if_pred(const K& key, + * M& meta, + * const K& pattern, + * const M& threshold) { + * + * return meta > threshold; + * } + * ``` + * + * @param pred The predicate function with type Pred that returns `true` if + * the element should be exported. + * @param pattern The third user-defined argument to @p pred with key_type + * type. + * @param threshold The fourth user-defined argument to @p pred with meta_type + * type. + * @param offset The position of the key to remove. + * @param keys The keys to dump from GPU-accessible memory with shape (n). 
+ * @param values The values to dump from GPU-accessible memory with shape
+ * (n, DIM).
+ * @param metas The metas to dump from GPU-accessible memory with shape (n).
+ * @parblock
+ * If @p metas is `nullptr`, the meta for each key will not be returned.
+ * @endparblock
+ *
+ * @param stream The CUDA stream that is used to execute the operation.
+ *
+ * @return The number of elements dumped.
+ *
+ * @throw CudaException If the key-value size is too large for GPU shared
+ * memory. Reducing the value for @p n is currently required if this exception
+ * occurs.
+ */
+ template
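+
+ /*
+ * Usage sketch for the export APIs above. `total_capacity`, `d_keys`,
+ * `d_values` and `stream` are illustrative caller-side names; the buffers
+ * must hold at least n keys and n * dim() values per call:
+ *
+ * ```
+ * for (size_t offset = 0; offset < total_capacity; offset += n) {
+ *   size_t dumped = table.export_batch(n, offset, d_keys, d_values,
+ *                                      nullptr, stream);
+ *   // The counter-returning overload synchronizes the stream before
+ *   // returning, so the first `dumped` entries are ready to consume here.
+ * }
+ * ```
+ */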