From cbfad89d1d1314cf5bd3a5ed5f074e3886d2ea42 Mon Sep 17 00:00:00 2001 From: Lifann Date: Fri, 16 Jun 2023 12:12:32 +0800 Subject: [PATCH] Hkv code draft --- .../dynamic_embedding/core/BUILD | 2 + .../core/hkv_hashtable_ops.cc | 311 ++ .../core/kernels/hkv_hashtable_op_gpu.cu.cc | 882 ++++++ .../kernels/lookup_impl/lookup_table_op_hkv.h | 360 +++ .../lookup_table_op_hkv_impl.cu.cc | 37 + .../core/lib/merlin_inc/BUILD | 42 + .../lib/merlin_inc/merlin/array_kernels.cuh | 345 +++ .../lib/merlin_inc/merlin/core_kernels.cuh | 2510 +++++++++++++++++ .../core/lib/merlin_inc/merlin/debug.hpp | 78 + .../lib/merlin_inc/merlin/flexible_buffer.cuh | 60 + .../core/lib/merlin_inc/merlin/group_lock.hpp | 229 ++ .../lib/merlin_inc/merlin/initializers.cuh | 147 + .../lib/merlin_inc/merlin/memory_pool.cuh | 619 ++++ .../core/lib/merlin_inc/merlin/optimizers.cuh | 77 + .../core/lib/merlin_inc/merlin/types.cuh | 217 ++ .../core/lib/merlin_inc/merlin/utils.cuh | 368 +++ .../core/lib/merlin_inc/merlin_hashtable.cuh | 1643 +++++++++++ .../core/lib/merlin_inc/merlin_localfile.hpp | 162 ++ .../dynamic_embedding/core/lib/utils/BUILD | 12 + .../core/lib/utils/cuda_utils.cuh | 134 + .../core/ops/hkv_hashtable_ops.cc | 327 +++ .../python/ops/hkv_hashtable_ops.py | 456 +++ 22 files changed, 9018 insertions(+) create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/hkv_hashtable_ops.cc create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/kernels/hkv_hashtable_op_gpu.cu.cc create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv.h create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv_impl.cu.cc create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/BUILD create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/array_kernels.cuh create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/core_kernels.cuh create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/debug.hpp create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/flexible_buffer.cuh create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/group_lock.hpp create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/initializers.cuh create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/memory_pool.cuh create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/optimizers.cuh create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/types.cuh create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/utils.cuh create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin_hashtable.cuh create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin_localfile.hpp create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/lib/utils/BUILD create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/lib/utils/cuda_utils.cuh create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/core/ops/hkv_hashtable_ops.cc create mode 100644 tensorflow_recommenders_addons/dynamic_embedding/python/ops/hkv_hashtable_ops.py diff --git 
a/tensorflow_recommenders_addons/dynamic_embedding/core/BUILD b/tensorflow_recommenders_addons/dynamic_embedding/core/BUILD index 076e2791b..da34241a0 100644 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/BUILD +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/BUILD @@ -97,3 +97,5 @@ custom_op_library( "kernels/sparse_reshape_op.cu.cc", ], ) + +# TODO: Add hkv targets. diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/hkv_hashtable_ops.cc b/tensorflow_recommenders_addons/dynamic_embedding/core/hkv_hashtable_ops.cc new file mode 100644 index 000000000..7af17be5c --- /dev/null +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/hkv_hashtable_ops.cc @@ -0,0 +1,311 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_def_builder.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow_recommenders_addons/dynamic_embedding/core/utils/utils.h" + +namespace tensorflow { + +using shape_inference::DimensionHandle; +using shape_inference::InferenceContext; +using shape_inference::ShapeAndType; +using shape_inference::ShapeHandle; + +namespace { + +Status ScalarAndTwoElementVectorInputsAndScalarOutputs(InferenceContext* c) { + ShapeHandle handle; + DimensionHandle unused_handle; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); + for (int i = 1; i < c->num_inputs(); ++i) { + TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 1, &handle)); + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_handle)); + } + for (int i = 0; i < c->num_outputs(); ++i) { + c->set_output(i, c->Scalar()); + } + return Status::OK(); +} + +} // namespace + +Status ValidateTableResourceHandle(InferenceContext* c, ShapeHandle keys, + const string& key_dtype_attr, + const string& value_dtype_attr, + bool is_lookup, + ShapeAndType* output_shape_and_type) { + auto* handle_data = c->input_handle_shapes_and_types(0); + if (handle_data == nullptr || handle_data->size() != 2) { + output_shape_and_type->shape = c->UnknownShape(); + output_shape_and_type->dtype = DT_INVALID; + } else { + const ShapeAndType& key_shape_and_type = (*handle_data)[0]; + const ShapeAndType& value_shape_and_type = (*handle_data)[1]; + DataType key_dtype; + TF_RETURN_IF_ERROR(c->GetAttr(key_dtype_attr, &key_dtype)); + if (key_shape_and_type.dtype != key_dtype) { + return errors::InvalidArgument( + "Trying to read value with wrong dtype. " + "Expected ", + DataTypeString(key_shape_and_type.dtype), " got ", + DataTypeString(key_dtype)); + } + DataType value_dtype; + TF_RETURN_IF_ERROR(c->GetAttr(value_dtype_attr, &value_dtype)); + if (value_shape_and_type.dtype != value_dtype) { + return errors::InvalidArgument( + "Trying to read value with wrong dtype. 
" + "Expected ", + DataTypeString(value_shape_and_type.dtype), " got ", + DataTypeString(value_dtype)); + } + output_shape_and_type->dtype = value_shape_and_type.dtype; + + if (is_lookup) { + if (c->RankKnown(key_shape_and_type.shape) && c->RankKnown(keys)) { + int keys_rank = c->Rank(keys); + int key_suffix_rank = c->Rank(key_shape_and_type.shape); + if (keys_rank < key_suffix_rank) { + return errors::InvalidArgument( + "Expected keys to have suffix ", + c->DebugString(key_shape_and_type.shape), + " but saw shape: ", c->DebugString(keys)); + } + for (int d = 0; d < key_suffix_rank; d++) { + // Ensure the suffix of keys match what's in the Table. + DimensionHandle dim = c->Dim(key_shape_and_type.shape, d); + TF_RETURN_IF_ERROR( + c->ReplaceDim(keys, keys_rank - key_suffix_rank + d, dim, &keys)); + } + std::vector keys_prefix_vec; + keys_prefix_vec.reserve(keys_rank - key_suffix_rank); + for (int d = 0; d < keys_rank - key_suffix_rank; ++d) { + keys_prefix_vec.push_back(c->Dim(keys, d)); + } + ShapeHandle keys_prefix = c->MakeShape(keys_prefix_vec); + TF_RETURN_IF_ERROR(c->Concatenate(keys_prefix, + value_shape_and_type.shape, + &output_shape_and_type->shape)); + } else { + output_shape_and_type->shape = c->UnknownShape(); + } + } else { + TF_RETURN_IF_ERROR(c->Concatenate(keys, value_shape_and_type.shape, + &output_shape_and_type->shape)); + } + } + return Status::OK(); +} + +Status HkvHashTableShape(InferenceContext* c, const ShapeHandle& key, + const ShapeHandle& value) { + c->set_output(0, c->Scalar()); + + ShapeHandle key_s; + TF_RETURN_IF_ERROR(c->WithRankAtMost(key, 1, &key_s)); + + DataType key_t; + TF_RETURN_IF_ERROR(c->GetAttr("key_dtype", &key_t)); + + DataType value_t; + TF_RETURN_IF_ERROR(c->GetAttr("value_dtype", &value_t)); + + c->set_output_handle_shapes_and_types( + 0, std::vector{{key_s, key_t}, {value, value_t}}); + + return Status::OK(); +} + +REGISTER_OP("TfraHkvHashTableFind") + .Input("table_handle: resource") + .Input("keys: Tin") + .Input("default_value: Tout") + .Output("values: Tout") + .Attr("Tin: type") + .Attr("Tout: type") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle handle; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); + + ShapeAndType value_shape_and_type; + TF_RETURN_IF_ERROR(ValidateTableResourceHandle( + c, + /*keys=*/c->input(1), + /*key_dtype_attr=*/"Tin", + /*value_dtype_attr=*/"Tout", + /*is_lookup=*/true, &value_shape_and_type)); + c->set_output(0, value_shape_and_type.shape); + + return Status::OK(); + }); + +REGISTER_OP("TfraHkvHashTableFindWithExists") + .Input("table_handle: resource") + .Input("keys: Tin") + .Input("default_value: Tout") + .Output("values: Tout") + .Output("exists: bool") + .Attr("Tin: type") + .Attr("Tout: type") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle handle; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); + + ShapeHandle keys = c->UnknownShapeOfRank(1); + ShapeAndType value_shape_and_type; + TF_RETURN_IF_ERROR(ValidateTableResourceHandle( + c, + /*keys=*/c->input(1), + /*key_dtype_attr=*/"Tin", + /*value_dtype_attr=*/"Tout", + /*is_lookup=*/true, &value_shape_and_type)); + c->set_output(0, value_shape_and_type.shape); + c->set_output(1, keys); + + return Status::OK(); + }); + +REGISTER_OP("TfraHkvHashTableInsert") + .Input("table_handle: resource") + .Input("keys: Tin") + .Input("values: Tout") + .Attr("Tin: type") + .Attr("Tout: type") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle handle; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); + + // TODO: Validate 
keys and values shape. + return Status::OK(); + }); + +REGISTER_OP("TfraHkvHashTableAccum") + .Input("table_handle: resource") + .Input("keys: key_dtype") + .Input("values_or_deltas: value_dtype") + .Input("exists: bool") + .Attr("key_dtype: type") + .Attr("value_dtype: type") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle handle; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); + + // TODO: Validate keys and values shape. + return Status::OK(); + }); + +REGISTER_OP("TfraHkvHashTableRemove") + .Input("table_handle: resource") + .Input("keys: Tin") + .Attr("Tin: type") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle handle; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); + TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(1), 1, &handle)); + + // TODO(turboale): Validate keys shape. + return Status::OK(); + }); + +REGISTER_OP("TfraHkvHashTableClear") + .Input("table_handle: resource") + .Attr("key_dtype: type") + .Attr("value_dtype: type"); + +REGISTER_OP("TfraHkvHashTableSize") + .Input("table_handle: resource") + .Output("size: int64") + .SetShapeFn(ScalarAndTwoElementVectorInputsAndScalarOutputs); + +REGISTER_OP("TfraHkvHashTableExport") + .Input("table_handle: resource") + .Output("keys: Tkeys") + .Output("values: Tvalues") + .Attr("Tkeys: type") + .Attr("Tvalues: type") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle handle; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); + ShapeHandle keys = c->UnknownShapeOfRank(1); + ShapeAndType value_shape_and_type; + TF_RETURN_IF_ERROR(ValidateTableResourceHandle( + c, + /*keys=*/keys, + /*key_dtype_attr=*/"Tkeys", + /*value_dtype_attr=*/"Tvalues", + /*is_lookup=*/false, &value_shape_and_type)); + c->set_output(0, keys); + c->set_output(1, value_shape_and_type.shape); + return Status::OK(); + }); + +REGISTER_OP("TfraHkvHashTableSaveToFileSystem") + .Input("table_handle: resource") + .Input("dirpath: string") + .Input("file_name: string") + .Attr("key_dtype: type") + .Attr("value_dtype: type") + .Attr("dirpath_env: string") + .Attr("append_to_file: bool") + .Attr("buffer_size: int >= 1"); + +REGISTER_OP("TfraHkvHashTableImport") + .Input("table_handle: resource") + .Input("keys: Tin") + .Input("values: Tout") + .Attr("Tin: type") + .Attr("Tout: type") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle handle; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); + + ShapeHandle keys; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &keys)); + TF_RETURN_IF_ERROR(c->Merge(keys, c->input(2), &keys)); + return Status::OK(); + }); + +REGISTER_OP("TfraHkvHashTableLoadFromFileSystem") + .Input("table_handle: resource") + .Input("dirpath: string") + .Input("file_name: string") + .Attr("key_dtype: type") + .Attr("value_dtype: type") + .Attr("dirpath_env: string") + .Attr("load_entire_dir: bool") + .Attr("buffer_size: int >= 1"); + +REGISTER_OP("TfraHkvHashTableOfTensors") + .Output("table_handle: resource") + .Attr("container: string = ''") + .Attr("shared_name: string = ''") + .Attr("use_node_name_sharing: bool = false") + .Attr("key_dtype: type") + .Attr("value_dtype: type") + .Attr("value_shape: shape = {}") + .Attr("init_capacity: int = 0") + .Attr("max_capacity: int = 0") + .SetIsStateful() + .SetShapeFn([](InferenceContext* c) { + PartialTensorShape value_p; + TF_RETURN_IF_ERROR(c->GetAttr("value_shape", &value_p)); + ShapeHandle value_s; + TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(value_p, &value_s)); + return HkvHashTableShape(c, /*key=*/c->Scalar(), /*value=*/value_s); + }); 
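+
+// Note on the capacity attributes (enforced in the GPU kernel rather than
+// here): init_capacity == 0 falls back to a built-in default, max_capacity
+// == 0 falls back to the TFRA_GPU_HASHTABLE_UPLIMIT_SIZE environment
+// variable, and a max_capacity smaller than init_capacity is raised to
+// init_capacity.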
+} // namespace tensorflow diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/hkv_hashtable_op_gpu.cu.cc b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/hkv_hashtable_op_gpu.cu.cc new file mode 100644 index 000000000..3803893c3 --- /dev/null +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/hkv_hashtable_op_gpu.cu.cc @@ -0,0 +1,882 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#if GOOGLE_CUDA + +#include "tensorflow_recommenders_addons/dynamic_embedding/core/kernels/cuckoo_hashtable_op_gpu.h" +#include "tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_gpu.h" +#include "tensorflow_recommenders_addons/dynamic_embedding/core/utils/utils.h" + +#define EIGEN_USE_GPU + +#include +#include + +#include +#include +#include +#include + +#include "tensorflow/core/framework/lookup_interface.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/variant.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/path.h" +#include "tensorflow/core/util/env_var.h" +#include "tensorflow/stream_executor/stream.h" + +#include "tensorflow/core/util/gpu_device_functions.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace tensorflow { + +using GPUDevice = Eigen::GpuDevice; + +namespace recommenders_addons { +namespace lookup { + +constexpr size_t kDefaultGpuInitCapacity = 1024; + +using tensorflow::OpKernelContext; +using tensorflow::lookup::LookupInterface; + +template +class HkvHashTableOfTensorsGpu final : public LookupInterface { + private: + + public: + HkvHashTableOfTensorsGpu(OpKernelContext* ctx, OpKernel* kernel) { + OP_REQUIRES_OK(ctx, + GetNodeAttr(kernel->def(), "value_shape", &value_shape_)); + OP_REQUIRES( + ctx, TensorShapeUtils::IsVector(value_shape_), + errors::InvalidArgument("Default value must be a vector, got shape ", + value_shape_.DebugString())); + runtime_dim_ = value_shape_.dim_size(0); + + gpu::TableWrapperInitOptions options; + + int64 init_capacity_i64 = 0; + int64 max_capacity_i64 = 0; + OP_REQUIRES_OK(ctx, GetNodeAttr(kernel->def(), "init_capacity", &init_capacity_i64)); + OP_REQUIRES_OK(ctx, GetNodeAttr(kernel->def(), "max_capacity", &max_capacity_i64)); + options.init_capacity = static_cast(init_capacity_i64); + options.max_capacity = static_cast(max_capacity_i64); + + if (options.max_capacity == 0) { + char* env_max_capacity_str = std::getenv("TFRA_GPU_HASHTABLE_UPLIMIT_SIZE"); + if (env_max_capacity_str) { + options.max_capacity = static_cast(std::atoll(env_max_capacity_str)); + LOG(WARNING) << "GPU table max capacity was not set in attribute, get " + << options.max_capacity << " from env TFRA_GPU_HASHTABLE_UPLIMIT_SIZE."; + } else { + throw std::runtime_error("max_capaicty=0 and TFRA_GPU_HASHTABLE_UPLIMIT_SIZE not set is not 
valid."); + } + } + if (options.init_capacity == 0) { + options.init_capacity = kDefaultGpuInitCapacity; + LOG(WARNING) << "GPU table init capacity was not set in attribute, use default" + << kDefaultGpuInitCapacity; + } + if (options.max_capacity < options.init_capacity) { + LOG(WARNING) << "GPU table max_capacity < init_capacity, (" << options.max_capacity + << "/" << options.init_capacity << "). Reset to " << options.init_capacity; + options.max_capacity = options.init_capacity; + } + + if (table_) { + return; + } + this->CreateTable(options, &table_); + OP_REQUIRES(ctx, (table_ != nullptr), + errors::InvalidArgument("HashTable on GPU is created failed!")); + + LOG(INFO) << "GPU table max capacity was created on max_capacity: " + << options.max_capacity << ", and init capacity: " + << options.init_capacity + << " with K=" << std::type_index(typeid(K)).name() + << ", V=" << std::type_index(typeid(V)).name(); + } + + ~HkvHashTableOfTensorsGpu() { + } + + void CreateTable(gpu::TableWrapperInitOptions& options, gpu::TableWrapper** pptable) { + gpu::CreateTableImpl(pptable, options, runtime_dim_); + } + + size_t size() const override { + tf_shared_lock l(mu_); + + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + size_t retv = table_->get_size(stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaStreamDestroy(stream)); + return retv; + } + + void size_i64(OpKernelContext* ctx, int64* s) { + tf_shared_lock l(mu_); + auto stream = ctx->eigen_device().stream(); + int64 hret = static_cast(table_->get_size(stream)); + CUDA_CHECK(cudaMemcpyAsync(s, &hret, sizeof(int64), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + } + + Status Find(OpKernelContext* ctx, const Tensor& d_keys, Tensor* value, + const Tensor& default_value) override { + size_t len = d_keys.flat().size(); + bool* d_status; + + auto value_flat = value->flat_inner_dims(); + const auto default_flat = default_value.flat(); + int64 total = value_flat.size(); + int64 default_total = default_flat.size(); + bool is_full_default = (total == default_total); + + auto stream = ctx->eigen_device().stream(); + + if (len > 0) { + size_t default_value_num = + is_full_default ? default_value.shape().dim_size(0) : 1; + CUDA_CHECK(cudaMallocAsync(&d_status, sizeof(bool) * len, stream)); + CUDA_CHECK(cudaMemsetAsync(d_status, 0, sizeof(bool) * len, stream)); + { + tf_shared_lock l(mu_); + table_->get((const K*)d_keys.tensor_data().data(), + (V*)(value->tensor_data().data()), + d_status, len, + (V*)(default_value.tensor_data().data()), + stream, is_full_default); + CUDA_CHECK(cudaStreamSynchronize(stream)); + } + CUDA_CHECK(cudaFreeAsync(d_status, stream)); + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + return Status::OK(); + } + + Status FindWithExists(OpKernelContext* ctx, const Tensor& d_keys, + Tensor* value, const Tensor& default_value, + Tensor* exists) { + size_t len = d_keys.flat().size(); + + auto value_flat = value->flat_inner_dims(); + const auto default_flat = default_value.flat(); + int64 total = value_flat.size(); + int64 default_total = default_flat.size(); + bool is_full_default = (total == default_total); + + auto stream = ctx->eigen_device().stream(); + + if (len > 0) { + size_t default_value_num = + is_full_default ? 
default_value.shape().dim_size(0) : 1; + { + tf_shared_lock l(mu_); + table_->get((const K*)d_keys.tensor_data().data(), + (V*)(value->tensor_data().data()), + (bool*)exists->tensor_data().data(), len, + (V*)(default_value.tensor_data().data()), + stream, is_full_default); + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + } + return Status::OK(); + } + + Status Insert(OpKernelContext* ctx, const Tensor& keys, + const Tensor& values) override { + size_t len = keys.flat().size(); + auto stream = ctx->eigen_device().stream(); + { + mutex_lock l(mu_); + table_->upsert((const K*)keys.tensor_data().data(), + (const V*)(values.tensor_data().data()), + len, stream); + }; + CUDA_CHECK(cudaStreamSynchronize(stream)); + + return Status::OK(); + } + + Status Accum(OpKernelContext* ctx, const Tensor& keys, + const Tensor& values_or_deltas, const Tensor& exists) { + size_t len = keys.flat().size(); + auto stream = ctx->eigen_device().stream(); + { + mutex_lock l(mu_); + table_->accum( + (const K*)keys.tensor_data().data(), + (const V*)(values_or_deltas.tensor_data().data()), + (const bool*)exists.tensor_data().data(), len, stream); + }; + CUDA_CHECK(cudaStreamSynchronize(stream)); + + return Status::OK(); + } + + Status Remove(OpKernelContext* ctx, const Tensor& keys) override { + size_t len = keys.flat().size(); + K* d_keys; + auto stream = ctx->eigen_device().stream(); + + if (len > 0) { + CUDA_CHECK(cudaMallocAsync((void**)&d_keys, sizeof(K) * len, stream)); + CUDA_CHECK(cudaMemsetAsync((void*)&d_keys, 0, sizeof(K) * len, stream)); + CUDA_CHECK(cudaMemcpyAsync((void*)d_keys, (void*)keys.tensor_data().data(), + sizeof(K) * len, cudaMemcpyDefault, stream)); + { + mutex_lock l(mu_); + table_->remove((const K*)d_keys, len, stream); + } + CUDA_CHECK(cudaFreeAsync(d_keys, stream)); + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + return Status::OK(); + } + + Status Clear(OpKernelContext* ctx) { + auto stream = ctx->eigen_device().stream(); + { + mutex_lock l(mu_); + table_->clear(stream); + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + return Status::OK(); + } + + Status ImportValues(OpKernelContext* ctx, const Tensor& keys, + const Tensor& values) override { + size_t len = keys.flat().size(); + K* d_keys; + V* d_values; + auto stream = ctx->eigen_device().stream(); + if (len > 0) { + CUDA_CHECK(cudaMallocAsync((void**)&d_keys, sizeof(K) * len, stream)); + CUDA_CHECK(cudaMemsetAsync((void*)&d_keys, 0, sizeof(K) * len, stream)); + CUDA_CHECK( + cudaMallocAsync((void**)&d_values, sizeof(V) * runtime_dim_ * len, stream)); + CUDA_CHECK( + cudaMemsetAsync((void*)&d_values, 0, sizeof(V) * runtime_dim_ * len, stream)); + CUDA_CHECK(cudaMemcpyAsync((void*)d_keys, (void*)keys.tensor_data().data(), + sizeof(K) * len, cudaMemcpyDefault, stream)); + CUDA_CHECK(cudaMemcpyAsync((void*)d_values, (void*)values.tensor_data().data(), + sizeof(V) * runtime_dim_ * len, cudaMemcpyDefault, stream)); + { + mutex_lock l(mu_); + table_->clear(stream); + table_->upsert((const K*)d_keys, + (const V*)d_values, len, stream); + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaFreeAsync(d_keys, stream)); + CUDA_CHECK(cudaFreeAsync(d_values, stream)); + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + return Status::OK(); + } + + Status ExportValues(OpKernelContext* ctx) override { + size_t len = 0; + int64 size = 0; + + const size_t offset = 0; + + Tensor* keys; + Tensor* values; + + size_t* d_dump_counter = nullptr; + auto stream = ctx->eigen_device().stream(); + + { + tf_shared_lock l(mu_); + len = 
table_->get_capacity(); + size = (int64)table_->get_size(stream); + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + + CUDA_CHECK(cudaMallocAsync(&d_dump_counter, sizeof(size_t), stream)); + CUDA_CHECK(cudaMemsetAsync(d_dump_counter, 0, sizeof(size_t), stream)); + + AllocatorAttributes attr; + //attr.set_gpu_compatible(true); + //attr.set_nic_compatible(true); + attr.set_on_host(false); + + TF_RETURN_IF_ERROR( + ctx->allocate_output("keys", TensorShape({(size)}), &keys, attr)); + TF_RETURN_IF_ERROR(ctx->allocate_output( + "values", TensorShape({size, (int64)runtime_dim_}), &values, attr)); + if (size) { + tf_shared_lock l(mu_); + table_->dump((K*)keys->flat().data(), + (V*)(values->matrix().data()), offset, + len, d_dump_counter, stream); + } + CUDA_CHECK(cudaFreeAsync(d_dump_counter, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + return Status::OK(); + } + + Status ExportValuesWithMetas(OpKernelContext* ctx) { + size_t len = 0; + int64 size = 0; + + const size_t offset = 0; + + Tensor* keys; + Tensor* values; + Tensor* metas; + + size_t* d_dump_counter = nullptr; + auto stream = ctx->eigen_device().stream(); + + { + tf_shared_lock l(mu_); + len = table_->get_capacity(); + size = (int64)table_->get_size(stream); + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + + CUDA_CHECK(cudaMallocAsync(&d_dump_counter, sizeof(size_t), stream)); + CUDA_CHECK(cudaMemsetAsync(d_dump_counter, 0, sizeof(size_t), stream)); + + AllocatorAttributes attr; + //attr.set_gpu_compatible(true); + //attr.set_nic_compatible(true); + attr.set_on_host(false); + + TF_RETURN_IF_ERROR( + ctx->allocate_output("keys", TensorShape({(size)}), &keys, attr)); + TF_RETURN_IF_ERROR(ctx->allocate_output( + "values", TensorShape({size, (int64)runtime_dim_}), &values, attr)); + TF_RETURN_IF_ERROR( + ctx->allocate_output("metas", TensorShape({(size)}), &metas, attr)); + if (size) { + tf_shared_lock l(mu_); + table_->dump_with_metas((K*)keys->flat().data(), + (V*)(values->matrix().data()), + (uint64_t*)(metas->flat().data()), + offset, len, d_dump_counter, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + } + CUDA_CHECK(cudaFreeAsync(d_dump_counter, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + return Status::OK(); + } + + Status ExportKeysAndMetas(OpKernelContext* ctx, size_t split_size) { + tf_shared_lock l(mu_); + size_t span_len = 0; + int64 size = 0; + + const size_t offset = 0; + + Tensor* keys = nullptr; + Tensor* metas = nullptr; + + auto stream = ctx->eigen_device().stream(); + + AllocatorAttributes attr; + attr.set_on_host(false); + + { + size = (int64)table_->get_size(stream); + + TF_RETURN_IF_ERROR( + ctx->allocate_output("keys", TensorShape({(size)}), &keys, attr)); + TF_RETURN_IF_ERROR( + ctx->allocate_output("metas", TensorShape({(size)}), &metas, attr)); + + if (size) { + table_->dump_keys_and_metas((K*)keys->flat().data(), + (int64*)(metas->flat().data()), + static_cast(size), + split_size, stream); + } + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + return Status::OK(); + } + + Status ExportValuesToFile(OpKernelContext* ctx, const string filepath, + const size_t buffer_size) { + auto stream = ctx->eigen_device().stream(); + + { + tf_shared_lock l(mu_); + table_->dump_to_file(filepath, runtime_dim_, stream, buffer_size); + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + return Status::OK(); + } + + Status ImportValuesFromFile(OpKernelContext* ctx, const string filepath, + const size_t buffer_size) { + auto stream = ctx->eigen_device().stream(); + + { + mutex_lock l(mu_); + + string 
keyfile = filepath + ".keys"; + FILE* tmpfd = fopen(keyfile.c_str(), "rb"); + if (tmpfd == nullptr) { + return errors::NotFound("Failed to read key file", keyfile); + } + fseek(tmpfd, 0, SEEK_END); + long int filesize = ftell(tmpfd); + size_t size = static_cast(filesize) / sizeof(K); + fseek(tmpfd, 0, SEEK_SET); + fclose(tmpfd); + + table_->clear(stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + table_->load_from_file(filepath, size, runtime_dim_, stream, + buffer_size); + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + return Status::OK(); + } + + DataType key_dtype() const override { return DataTypeToEnum::v(); } + DataType value_dtype() const override { return DataTypeToEnum::v(); } + TensorShape key_shape() const final { return TensorShape(); } + TensorShape value_shape() const override { return value_shape_; } + + private: + TensorShape value_shape_; + size_t runtime_dim_; + mutable mutex mu_; + gpu::TableWrapper* table_ = nullptr GUARDED_BY(mu_); +}; + +} // namespace lookup + +// Table lookup op. Perform the lookup operation on the given table. +class HashTableFindGpuOp : public OpKernel { + public: + explicit HashTableFindGpuOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table; + OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + + // Input 0 could be a STRING_REF or a RESOURCE + DataType expected_input_0 = DT_RESOURCE; + DataTypeVector expected_inputs = {expected_input_0, table->key_dtype(), + table->value_dtype()}; + DataTypeVector expected_outputs = {table->value_dtype()}; + OP_REQUIRES_OK(ctx, ctx->MatchSignature(expected_inputs, expected_outputs)); + + const Tensor& keys = ctx->input(1); + const Tensor& default_values = ctx->input(2); + + TensorShape output_shape = keys.shape(); + output_shape.RemoveLastDims(table->key_shape().dims()); + output_shape.AppendShape(table->value_shape()); + Tensor* out; + AllocatorAttributes attr; + attr.set_gpu_compatible(true); + OP_REQUIRES_OK(ctx, + ctx->allocate_output("values", output_shape, &out, attr)); + + OP_REQUIRES_OK(ctx, table->Find(ctx, keys, out, default_values)); + } +}; + +REGISTER_KERNEL_BUILDER( + Name(PREFIX_OP_NAME(HkvHashTableFind)).Device(DEVICE_GPU), + HashTableFindGpuOp); + +// Table lookup op. Perform the lookup operation on the given table. 
+ +template +class HashTableFindWithExistsGpuOp : public OpKernel { + public: + explicit HashTableFindWithExistsGpuOp(OpKernelConstruction* ctx) + : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table; + OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + + lookup::HkvHashTableOfTensorsGpu* table_hkv = + (lookup::HkvHashTableOfTensorsGpu*)table; + + // Input 0 could be a STRING_REF or a RESOURCE + DataType expected_input_0 = DT_RESOURCE; + DataTypeVector expected_inputs = {expected_input_0, table->key_dtype(), + table->value_dtype()}; + DataTypeVector expected_outputs = {table->value_dtype(), DT_BOOL}; + OP_REQUIRES_OK(ctx, ctx->MatchSignature(expected_inputs, expected_outputs)); + + const Tensor& keys = ctx->input(1); + const Tensor& default_values = ctx->input(2); + + TensorShape output_shape = keys.shape(); + output_shape.RemoveLastDims(table->key_shape().dims()); + output_shape.AppendShape(table->value_shape()); + Tensor* values; + Tensor* exists; + AllocatorAttributes attr; + attr.set_gpu_compatible(true); + OP_REQUIRES_OK(ctx, + ctx->allocate_output("values", output_shape, &values, attr)); + OP_REQUIRES_OK(ctx, + ctx->allocate_output("exists", keys.shape(), &exists, attr)); + + OP_REQUIRES_OK(ctx, table_hkv->FindWithExists(ctx, keys, values, + default_values, exists)); + } +}; + +// Table insert op. +class HashTableInsertGpuOp : public OpKernel { + public: + explicit HashTableInsertGpuOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table; + OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + + DataType expected_input_0 = DT_RESOURCE; + DataTypeVector expected_inputs = {expected_input_0, table->key_dtype(), + table->value_dtype()}; + OP_REQUIRES_OK(ctx, ctx->MatchSignature(expected_inputs, {})); + + const Tensor& keys = ctx->input(1); + const Tensor& values = ctx->input(2); + OP_REQUIRES_OK(ctx, table->CheckKeyAndValueTensorsForInsert(keys, values)); + OP_REQUIRES_OK(ctx, table->Insert(ctx, keys, values)); + } +}; + +REGISTER_KERNEL_BUILDER( + Name(PREFIX_OP_NAME(HkvHashTableInsert)).Device(DEVICE_GPU), + HashTableInsertGpuOp); + +// Table accum op. +template +class HashTableAccumGpuOp : public OpKernel { + public: + explicit HashTableAccumGpuOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table; + OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + lookup::HkvHashTableOfTensorsGpu* table_hkv = + (lookup::HkvHashTableOfTensorsGpu*)table; + + DataType expected_input_0 = DT_RESOURCE; + DataTypeVector expected_inputs = {expected_input_0, table->key_dtype(), + table->value_dtype(), + DataTypeToEnum::v()}; + OP_REQUIRES_OK(ctx, ctx->MatchSignature(expected_inputs, {})); + + const Tensor& keys = ctx->input(1); + const Tensor& values_or_deltas = ctx->input(2); + const Tensor& exists = ctx->input(3); + OP_REQUIRES_OK( + ctx, table->CheckKeyAndValueTensorsForInsert(keys, values_or_deltas)); + OP_REQUIRES_OK(ctx, + table_hkv->Accum(ctx, keys, values_or_deltas, exists)); + } +}; + +// Table remove op. 
+class HashTableRemoveGpuOp : public OpKernel { + public: + explicit HashTableRemoveGpuOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table; + OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + + DataType expected_input_0 = DT_RESOURCE; + DataTypeVector expected_inputs = {expected_input_0, table->key_dtype()}; + OP_REQUIRES_OK(ctx, ctx->MatchSignature(expected_inputs, {})); + + const Tensor& key = ctx->input(1); + OP_REQUIRES_OK(ctx, table->CheckKeyTensorForRemove(key)); + OP_REQUIRES_OK(ctx, table->Remove(ctx, key)); + } +}; + +REGISTER_KERNEL_BUILDER( + Name(PREFIX_OP_NAME(HkvHashTableRemove)).Device(DEVICE_GPU), + HashTableRemoveGpuOp); + +// Table clear op. +template +class HashTableClearGpuOp : public OpKernel { + public: + explicit HashTableClearGpuOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table; + OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + lookup::HkvHashTableOfTensorsGpu* table_hkv = + (lookup::HkvHashTableOfTensorsGpu*)table; + OP_REQUIRES_OK(ctx, table_hkv->Clear(ctx)); + } +}; + +// Op that returns the size of the given table. +class HashTableSizeGpuOp : public OpKernel { + public: + explicit HashTableSizeGpuOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table; + OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + lookup::HkvHashTableOfTensorsGpu* table_hkv = + (lookup::HkvHashTableOfTensorsGpu*)table; + + Tensor* out; + AllocatorAttributes attr; + attr.set_gpu_compatible(true); + attr.set_on_host(false); + + OP_REQUIRES_OK(ctx, + ctx->allocate_output("size", TensorShape({}), &out, attr)); + + int64* p_size = (int64*)out->flat().data(); + table_hkv->size_i64(ctx, p_size); + } +}; + +// Op that outputs tensors of all keys and all values. +class HashTableExportGpuOp : public OpKernel { + public: + explicit HashTableExportGpuOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table; + OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + + OP_REQUIRES_OK(ctx, table->ExportValues(ctx)); + } +}; + +REGISTER_KERNEL_BUILDER( + Name(PREFIX_OP_NAME(HkvHashTableExport)).Device(DEVICE_GPU), + HashTableExportGpuOp); + +// Op that export all keys and values to file. 
+template +class HashTableExportWithMetasGpuOp : public OpKernel { + public: + explicit HashTableExportWithMetasGpuOp(OpKernelConstruction* ctx) + : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table; + OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + lookup::HkvHashTableOfTensorsGpu* table_hkv = + (lookup::HkvHashTableOfTensorsGpu*)table; + OP_REQUIRES_OK( + ctx, table_hkv->ExportValuesWithMetas(ctx)); + } +}; + +template +class HashTableExportKeysAndMetasGpuOp : public OpKernel { + public: + explicit HashTableExportKeysAndMetasGpuOp(OpKernelConstruction* ctx) + : OpKernel(ctx) { + ctx->GetAttr("split_size", &split_size_i64_); + } + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table; + OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + lookup::HkvHashTableOfTensorsGpu* table_hkv = + (lookup::HkvHashTableOfTensorsGpu*)table; + OP_REQUIRES_OK(ctx, table_hkv->ExportKeysAndMetas(ctx, static_cast(split_size_i64_))); + } + private: + int64 split_size_i64_; +}; + +template +class HashTableExportToFileGpuOp : public OpKernel { + public: + explicit HashTableExportToFileGpuOp(OpKernelConstruction* ctx) + : OpKernel(ctx) { + int64 signed_buffer_size = 0; + ctx->GetAttr("buffer_size", &signed_buffer_size); + buffer_size_ = static_cast(signed_buffer_size); + } + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table; + OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + + const Tensor& ftensor = ctx->input(1); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(ftensor.shape()), + errors::InvalidArgument("filepath must be scalar.")); + string filepath = string(ftensor.scalar()().data()); + lookup::HkvHashTableOfTensorsGpu* table_hkv = + (lookup::HkvHashTableOfTensorsGpu*)table; + OP_REQUIRES_OK( + ctx, table_hkv->ExportValuesToFile(ctx, filepath, buffer_size_)); + } + + private: + size_t buffer_size_; +}; + +// Clear the table and insert data. +class HashTableImportGpuOp : public OpKernel { + public: + explicit HashTableImportGpuOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table; + OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + + DataType expected_input_0 = DT_RESOURCE; + DataTypeVector expected_inputs = {expected_input_0, table->key_dtype(), + table->value_dtype()}; + OP_REQUIRES_OK(ctx, ctx->MatchSignature(expected_inputs, {})); + + const Tensor& keys = ctx->input(1); + const Tensor& values = ctx->input(2); + OP_REQUIRES_OK(ctx, table->CheckKeyAndValueTensorsForImport(keys, values)); + OP_REQUIRES_OK(ctx, table->ImportValues(ctx, keys, values)); + } +}; + +REGISTER_KERNEL_BUILDER( + Name(PREFIX_OP_NAME(HkvHashTableImport)).Device(DEVICE_GPU), + HashTableImportGpuOp); + +// Clear the table and insert data from FileSystem. 
+template +class HashTableImportFromFileGpuOp : public OpKernel { + public: + explicit HashTableImportFromFileGpuOp(OpKernelConstruction* ctx) + : OpKernel(ctx) { + int64 signed_buffer_size = 0; + ctx->GetAttr("buffer_size", &signed_buffer_size); + buffer_size_ = static_cast(signed_buffer_size); + } + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table; + OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + + const Tensor& ftensor = ctx->input(1); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(ftensor.shape()), + errors::InvalidArgument("filepath must be scalar.")); + string filepath = string(ftensor.scalar()().data()); + lookup::HkvHashTableOfTensorsGpu* table_hkv = + (lookup::HkvHashTableOfTensorsGpu*)table; + OP_REQUIRES_OK( + ctx, table_hkv->ImportValuesFromFile(ctx, filepath, buffer_size_)); + } + + private: + size_t buffer_size_; +}; + +// Register the HkvHashTableOfTensors op. + +#define REGISTER_KERNEL(key_dtype, value_dtype) \ + REGISTER_KERNEL_BUILDER( \ + Name(PREFIX_OP_NAME(HkvHashTableOfTensors)) \ + .Device(DEVICE_GPU) \ + .TypeConstraint("key_dtype") \ + .TypeConstraint("value_dtype"), \ + HashTableGpuOp< \ + lookup::HkvHashTableOfTensorsGpu, \ + key_dtype, value_dtype>); \ + REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(HkvHashTableClear)) \ + .Device(DEVICE_GPU) \ + .TypeConstraint("key_dtype") \ + .TypeConstraint("value_dtype"), \ + HashTableClearGpuOp); \ + REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(HkvHashTableSize)) \ + .Device(DEVICE_GPU) \ + .TypeConstraint("key_dtype") \ + .TypeConstraint("value_dtype"), \ + HashTableSizeGpuOp); \ + REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(HkvHashTableAccum)) \ + .Device(DEVICE_GPU) \ + .TypeConstraint("key_dtype") \ + .TypeConstraint("value_dtype"), \ + HashTableAccumGpuOp); \ + REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(HkvHashTableExportWithMetas)) \ + .Device(DEVICE_GPU) \ + .TypeConstraint("key_dtype") \ + .TypeConstraint("value_dtype"), \ + HashTableExportWithMetasGpuOp); \ + REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(HkvHashTableExportToFile)) \ + .Device(DEVICE_GPU) \ + .HostMemory("filepath") \ + .TypeConstraint("key_dtype") \ + .TypeConstraint("value_dtype"), \ + HashTableExportToFileGpuOp); \ + REGISTER_KERNEL_BUILDER( \ + Name(PREFIX_OP_NAME(HkvHashTableImportFromFile)) \ + .Device(DEVICE_GPU) \ + .HostMemory("filepath") \ + .TypeConstraint("key_dtype") \ + .TypeConstraint("value_dtype"), \ + HashTableImportFromFileGpuOp); \ + REGISTER_KERNEL_BUILDER( \ + Name(PREFIX_OP_NAME(HkvHashTableFindWithExists)) \ + .Device(DEVICE_GPU) \ + .TypeConstraint("Tin") \ + .TypeConstraint("Tout"), \ + HashTableFindWithExistsGpuOp); + +REGISTER_KERNEL(int64, float); +REGISTER_KERNEL(int64, Eigen::half); +REGISTER_KERNEL(int64, int64); +REGISTER_KERNEL(int64, int32); +REGISTER_KERNEL(int64, int8); +REGISTER_KERNEL(int32, float); + +#undef REGISTER_KERNEL + +#define SINGLE_ATTR_REGISTER_KERNEL(key_dtype, value_type) \ + REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(HkvHashTableExportKeysAndMetas)) \ + .Device(DEVICE_GPU) \ + .TypeConstraint("Tkeys"), \ + HashTableExportKeysAndMetasGpuOp); + +SINGLE_ATTR_REGISTER_KERNEL(int64, float); + +#undef SINGLE_ATTR_REGISTER_KERNEL + +} // namespace recommenders_addons +} // namespace tensorflow +#endif diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv.h b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv.h new file mode 
100644 index 000000000..0da759780 --- /dev/null +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv.h @@ -0,0 +1,360 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TFRA_CORE_KERNELS_LOOKUP_TABLE_OP_GPU_H_ +#define TFRA_CORE_KERNELS_LOOKUP_TABLE_OP_GPU_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/lookup_interface.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/kernels/lookup_util.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin_hashtable.cuh" +#include "tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin_localfile.hpp" +#include "tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/types.cuh" +#include "tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/utils.cuh" + +namespace tensorflow { +namespace recommenders_addons { +namespace lookup { +namespace gpu { + +template +class KVOnlyFile : public nv::merlin::BaseKVFile { + public: + KVOnlyFile() : keys_fp_(nullptr), values_fp_(nullptr) {} + + ~KVOnlyFile() { + close(); + } + + bool open(const std::string& keys_path, const std::string& values_path, + const char* mode) { + close(); + keys_fp_ = fopen(keys_path.c_str(), mode); + if (!keys_fp_) { + return false; + } + values_fp_ = fopen(values_path.c_str(), mode); + if (!values_fp_) { + close(); + return false; + } + return true; + } + + void close() noexcept { + if (keys_fp_) { + fclose(keys_fp_); + keys_fp_ = nullptr; + } + if (values_fp_) { + fclose(values_fp_); + values_fp_ = nullptr; + } + } + + size_t read(const size_t n, const size_t dim, K* keys, V* vectors, M* metas) override { + size_t nread_keys = + fread(keys, sizeof(K), static_cast(n), keys_fp_); + size_t nread_vecs = + fread(vectors, sizeof(V) * dim, static_cast(n), values_fp_); + if (nread_keys != nread_vecs) { + LOG(INFO) << "Partially read failed. 
" << nread_keys << " kv pairs by KVOnlyFile."; + return 0; + } + LOG(INFO) << "Partially read " << nread_keys << " kv pairs by KVOnlyFile."; + return nread_keys; + } + + size_t write(const size_t n, const size_t dim, const K* keys, const V* vectors, + const M* metas) override { + size_t nwritten_keys = + fwrite(keys, sizeof(K), static_cast(n), keys_fp_); + size_t nwritten_vecs = + fwrite(vectors, sizeof(V) * dim, static_cast(n), values_fp_); + if (nwritten_keys != nwritten_vecs) { + return 0; + } + LOG(INFO) << "Partially write " << nwritten_keys << " kv pairs by KVOnlyFile."; + return nwritten_keys; + } + + private: + FILE* keys_fp_; + FILE* values_fp_; +}; + +// template to avoid multidef in compile time only. +template +__global__ void gpu_u64_to_i64_kernel(const uint64_t* u64, int64* i64, size_t len) { + size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tid < len) { + i64[tid] = static_cast(u64[tid]); + } +} + +template +__global__ void broadcast_kernel(T* data, T val, size_t n) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < n) { + data[tid] = val; + } +} + +template +void gpu_cast_u64_to_i64(const uint64_t* u64, int64* i64, size_t len, cudaStream_t stream) { + size_t block_size = nv::merlin::SAFE_GET_BLOCK_SIZE(1024); + size_t grid_size = nv::merlin::SAFE_GET_GRID_SIZE(len, block_size); + gpu_u64_to_i64_kernel<<>>(u64, i64, len); +} + +using GPUDevice = Eigen::ThreadPoolDevice; + +struct TableWrapperInitOptions { + size_t max_capacity; + size_t init_capacity; +}; + +template +class TableWrapper { + private: + //using M = uint64_t; + using Table = nv::merlin::HashTable; + + public: + TableWrapper(TableWrapperInitOptions& init_options, size_t dim) { + max_capacity_ = init_options.max_capacity; + dim_ = dim; + nv::merlin::HashTableOptions mkv_options; + mkv_options.init_capacity = std::min(init_options.init_capacity, max_capacity_); + mkv_options.max_capacity = max_capacity_; + // Since currently GPU nodes are not compatible to fast + // pcie connections for D2H non-continous wirte, so just + // use pure hbm mode now. 
+ mkv_options.max_hbm_for_vectors = std::numeric_limits::max(); + mkv_options.max_load_factor = 0.63; + mkv_options.block_size = nv::merlin::SAFE_GET_BLOCK_SIZE(1024); + mkv_options.dim = dim; + mkv_options.evict_strategy = nv::merlin::EvictStrategy::kCustomized; + block_size_ = mkv_options.block_size; + table_ = new Table(); + table_->init(mkv_options); + } + + ~TableWrapper() { delete table_; } + + void upsert(const K* d_keys, const V* d_vals, size_t len, + cudaStream_t stream) { + uint64_t t0 = (uint64_t)time(NULL); + uint64_t* timestamp_metas = nullptr; + CUDA_CHECK(cudaMallocAsync(×tamp_metas, len * sizeof(uint64_t), stream)); + CUDA_CHECK(cudaMemsetAsync(timestamp_metas, 0, len * sizeof(uint64_t), stream)); + size_t grid_size = nv::merlin::SAFE_GET_GRID_SIZE(len, block_size_); + broadcast_kernel<<>>(timestamp_metas, t0, len); + + table_->insert_or_assign(len, d_keys, d_vals, /*d_metas=*/timestamp_metas, stream); + CUDA_CHECK(cudaFreeAsync(timestamp_metas, stream)); + } + + void accum(const K* d_keys, const V* d_vals_or_deltas, + const bool* d_exists, size_t len, cudaStream_t stream) { + uint64_t t0 = (uint64_t)time(NULL); + uint64_t* timestamp_metas = nullptr; + CUDA_CHECK(cudaMallocAsync(×tamp_metas, len * sizeof(uint64_t), stream)); + CUDA_CHECK(cudaMemsetAsync(timestamp_metas, 0, len * sizeof(uint64_t), stream)); + size_t grid_size = nv::merlin::SAFE_GET_GRID_SIZE(len, block_size_); + broadcast_kernel<<>>(timestamp_metas, t0, len); + table_->accum_or_assign(len, d_keys, d_vals_or_deltas, d_exists, /*d_metas=*/timestamp_metas, stream); + CUDA_CHECK(cudaFreeAsync(timestamp_metas, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)) + } + + void dump(K* d_key, V* d_val, const size_t offset, + const size_t search_length, size_t* d_dump_counter, + cudaStream_t stream) const { + table_->export_batch(search_length, offset, d_dump_counter, d_key, d_val, /*d_metas=*/nullptr, stream); + } + + void dump_with_metas(K* d_key, V* d_val, uint64_t* d_metas, const size_t offset, + const size_t search_length, size_t* d_dump_counter, + cudaStream_t stream) const { + table_->export_batch(search_length, offset, d_dump_counter, d_key, d_val, d_metas, stream); + } + + void dump_keys_and_metas(K* keys, int64* metas, size_t len, + size_t split_len, cudaStream_t stream) const { + V* values_buf = nullptr; + size_t offset = 0; + size_t real_offset = 0; + size_t skip = split_len; + uint64_t* metas_u64 = reinterpret_cast(metas); + size_t span_len = table_->capacity(); + CUDA_CHECK(cudaMallocAsync(&values_buf, sizeof(V) * dim_ * split_len, stream)); + CUDA_CHECK(cudaMemsetAsync(values_buf, 0, sizeof(V) * dim_ * split_len, stream)); + for (; offset < span_len; offset += split_len) { + if (offset + skip > span_len) { + skip = span_len - offset; + } + // TODO: overlap the loop + size_t h_dump_counter = table_->export_batch(skip, offset, keys + real_offset, values_buf, metas_u64 + real_offset, stream); + CudaCheckError(); + + if (h_dump_counter > 0) { + gpu_cast_u64_to_i64(metas_u64 + real_offset, metas + real_offset, h_dump_counter, stream); + real_offset += h_dump_counter; + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaFreeAsync(values_buf, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + } + + void dump_to_file(const string filepath, size_t dim, + cudaStream_t stream, + const size_t buffer_size) const { + LOG(INFO) << "dump_to_file, filepath: " << filepath << ", dim: " << dim + << ", stream: " << stream << ", buffer_size: " << 
buffer_size; + std::unique_ptr> wfile; + string keyfile = ; + string valuefile = ; + string metafile = ; + + wfile.reset(new TimestampV1CompatFile); + bool open_ok = wfile->open(keyfile, valuefile, metafile, "wb"); + if (!open_ok) { + std::string error_msg = "Failed to dump to file to " + keyfile + ", " + valuefile + ", " + metafile; + throw std::runtime_error(error_msg); + } + + size_t n_saved = table_->save(wfile.get(), buffer_size, stream); + LOG(INFO) << "[op] Save " << n_saved << " pairs into keyfile: " + << keyfile << ", and valuefile: " << valuefile + << ", and metafile: " << metafile; + CUDA_CHECK(cudaStreamSynchronize(stream)); + wfile->close(); + } + + void load_from_file(const string filepath, + size_t key_num, size_t dim, cudaStream_t stream, + const size_t buffer_size) { + std::unique_ptr> rfile; + string keyfile = ; + string valuefile = ; + string metafile = ; + //rfile.reset(new TimestampV1CompatFile); + bool has_metas = false; + bool open_ok = false; + + if (is_valid_metas(keyfile, metafile)) { + rfile.reset(new TimestampV1CompatFile); + open_ok = reinterpret_cast*>(rfile.get())->open(keyfile, valuefile, metafile, "rb"); + has_metas = true; + } else { + rfile.reset(new KVOnlyFile); + open_ok = reinterpret_cast*>(rfile.get())->open(keyfile, valuefile, "rb"); + } + if (!open_ok) { + std::string error_msg = "Failed to load from file to " + keyfile + ", " + valuefile + ", " + metafile; + throw std::runtime_error("Failed to "); + } + + size_t n_loaded = table_->load(rfile.get(), buffer_size, stream); + if (has_metas) { + LOG(INFO) << "[op] Load " << n_loaded << " pairs into keyfile: " + << keyfile << ", and valuefile: " << valuefile + << ", and metafile" << metafile; + } else { + LOG(INFO) << "[op] Load " << n_loaded << " pairs into keyfile: " + << keyfile << ", and valuefile: " << valuefile; + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + if (has_metas) { + reinterpret_cast*>(rfile.get())->close(); + } else { + reinterpret_cast*>(rfile.get())->close(); + } + } + + void get(const K* d_keys, V* d_vals, bool* d_status, size_t len, + V* d_def_val, cudaStream_t stream, + bool is_full_size_default) const { + if (is_full_size_default) { + CUDA_CHECK(cudaMemcpyAsync(d_vals, d_def_val, sizeof(V) * dim_ * len, cudaMemcpyDeviceToDevice, stream)); + } else { + size_t grid_size = nv::merlin::SAFE_GET_GRID_SIZE(len, block_size_); + gpu_fill_default_values<<>>(d_vals, d_def_val, len, dim_); + } + table_->find(len, d_keys, d_vals, d_status, /*d_metas=*/nullptr, stream); + } + + // TODO: Implement a contain kernel instead of find. 
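+  // Until a dedicated kernel exists, contains() allocates a throwaway value
+  // buffer of len * dim elements and runs a normal find() only to populate
+  // d_status, then frees the buffer on the same stream.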
+ void contains(const K* d_keys, V* d_status, size_t len, cudaStream_t stream) { + // pass + V* tmp_vals = nullptr; + CUDA_CHECK(cudaMallocAsync(&tmp_vals, sizeof(V) * len * dim_, stream)); + CUDA_CHECK(cudaMemsetAsync(&tmp_vals, 0, sizeof(V) * len * dim_, stream)); + table_->find(len, d_keys, tmp_vals, d_status, /*d_metas=*/nullptr, stream); + CUDA_CHECK(cudaFreeAsync(tmp_vals, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + } + + size_t get_size(cudaStream_t stream) const { + return table_->size(stream); + } + + size_t get_capacity() const { return table_->capacity(); } + + void remove(const K* d_keys, size_t len, cudaStream_t stream) { + table_->erase(len, d_keys, stream); + } + + void clear(cudaStream_t stream) { table_->clear(stream); } + + private: + Table* table_; + size_t max_capacity_; + size_t dim_; + int block_size_; + bool dynamic_mode_; +}; + +template +void CreateTableImpl(TableWrapper** pptable, TableWrapperInitOptions& options, + size_t runtime_dim) { + *pptable = new TableWrapper(options, runtime_dim); +} + +} // namespace gpu +} // namespace lookup +} // namespace recommenders_addons +} // namespace tensorflow + +#endif // TFRA_CORE_KERNELS_LOOKUP_TABLE_OP_GPU_H_ diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv_impl.cu.cc b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv_impl.cu.cc new file mode 100644 index 000000000..47660f73c --- /dev/null +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv_impl.cu.cc @@ -0,0 +1,37 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_gpu.h" +namespace tensorflow { +namespace recommenders_addons { +namespace lookup { +namespace gpu { + +#define DEFINE_PURE_GPU_HASHTABLE(key_type, value_type) \ + template<> class TableWrapper + + +DEFINE_PURE_GPU_HASHTABLE(int64, float); +DEFINE_PURE_GPU_HASHTABLE(int64, int32); +DEFINE_PURE_GPU_HASHTABLE(int64, int64); +DEFINE_PURE_GPU_HASHTABLE(int64, int64); + +#undef DEFINE_PURE_GPU_HASHTABLE + + +} // namespace gpu +} // namespace lookup +} // namespace recommenders_addons +} // namespace tensorflow diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/BUILD b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/BUILD new file mode 100644 index 000000000..b7d22e73b --- /dev/null +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/BUILD @@ -0,0 +1,42 @@ +load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda", "if_cuda_is_configured") + +package(default_visibility = ["//visibility:public"]) + +load("//tensorflow_recommenders_addons:tensorflow_recommenders_addons.bzl", "custom_op_library", + "if_cuda_for_tf_serving", "tf_nvcc_binary") + +cc_library( + name = "merlin_kv", + hdrs = [ + "merlin/array_kernels.cuh", + "merlin/core_kernels.cuh", + "merlin/debug.hpp", + "merlin/flexible_buffer.cuh", + "merlin/memory_pool.cuh", + "merlin/group_lock.hpp", + #"merlin/initializers.cuh", + #"merlin/managed.cuh", + #"merlin/optimizers.cuh", + "merlin/types.cuh", + "merlin/utils.cuh", + "merlin_hashtable.cuh", + "merlin_localfile.hpp", + ], + deps = [ + "//tensorflow_recommenders_addons/dynamic_embedding/core/lib/utils:cuda_utils", + ], +) + +tf_nvcc_binary( + name="merlin_kv_insert_and_evict_test", + srcs=[], + cuda_srcs=[ + "insert_and_evict_test.cu.cc", + "test_util.cu.h", + ], + cuda_deps=[ + ":merlin_kv", + ], + copts=[], + linkopts=[], +) diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/array_kernels.cuh b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/array_kernels.cuh new file mode 100644 index 000000000..cd11fb713 --- /dev/null +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/array_kernels.cuh @@ -0,0 +1,345 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http:///www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "tensorflow_recommenders_addons/dynamic_embedding/core/lib/utils/cuda_utils.cuh" + +#include +#include "cuda_runtime.h" +#include "thrust/device_vector.h" +#include "thrust/execution_policy.h" +#include "thrust/scan.h" +#include "thrust/count.h" +#include "types.cuh" +#include "utils.cuh" + +namespace nv { +namespace merlin { + +template +__global__ void keys_not_empty(const K* keys, bool* masks, size_t n) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < n) { + masks[tid] = keys[tid] != EMPTY_KEY; + } +} + +template +__global__ void gpu_cell_count(const bool* masks, bool target, + Tidx* offsets, size_t n, size_t* n_existed) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + auto g = cg::tiled_partition(cg::this_thread_block()); + int rank = g.thread_rank(); + bool is_existed = false; + if (tid < n) { + if (masks[tid] == target) { + is_existed = true; + } + } + unsigned int vote = g.ballot(is_existed); + int g_ones = __popc((int)vote); + if (rank == 0 && tid < n) { + offsets[tid / TILE_SIZE] = static_cast(g_ones); + atomicAdd(static_cast(n_existed), static_cast(g_ones)); + } +} + +template +__global__ void gpu_select_key_kernel(const bool* masks, bool target, size_t n, + const Tidx* offsets, const K* __restrict keys, + K* __restrict outkeys, Tidx* outoffsets) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + auto g = cg::tiled_partition(cg::this_thread_block()); + int rank = g.thread_rank(); + + bool is_existed = false; + if (tid < n) { + if (masks[tid] == target) { + is_existed = true; + } + } + unsigned int vote = g.ballot(is_existed); + unsigned int r_vote = __brev(vote) >> (32 - TILE_SIZE); + + if (tid < n) { + r_vote = r_vote >> (TILE_SIZE - rank - 1); + if (masks[tid] == target) { + int prefix_n = __popc(r_vote) - 1; + Tidx bias = offsets[tid / TILE_SIZE] + static_cast(prefix_n); + outkeys[bias] = keys[tid]; + outoffsets[bias] = static_cast(tid); + } + } +} + +template +__global__ void gpu_select_kv_kernel(const bool* masks, bool target, size_t n, + const Tidx* offsets, + const K* __restrict keys, + V* __restrict values, + K* __restrict outkeys, + V* __restrict outvalues, + const size_t dim) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + auto g = cg::tiled_partition(cg::this_thread_block()); + int rank = g.thread_rank(); + + bool is_existed = false; + if (tid < n) { + if (masks[tid] == target) { + is_existed = true; + } + } + unsigned int vote = g.ballot(is_existed); + unsigned int r_vote = __brev(vote) >> (32 - TILE_SIZE); + + if (tid < n) { + r_vote = r_vote >> (TILE_SIZE - rank - 1); + if (masks[tid] == target) { + int prefix_n = __popc(r_vote) - 1; + Tidx bias = offsets[tid / TILE_SIZE] + static_cast(prefix_n); + outkeys[bias] = keys[tid]; + for (size_t i=0;i +__global__ void gpu_select_kvm_kernel(const bool* masks, size_t n, + const Tidx* offsets, K* __restrict keys, + V* __restrict values, M* __restrict metas, + const size_t dim) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + auto g = cg::tiled_partition(cg::this_thread_block()); + int rank = g.thread_rank(); + + bool is_existed = false; + if (tid < n) { + if (masks[tid]) { + is_existed = true; + } + } + unsigned int vote = g.ballot(is_existed); + unsigned int r_vote = __brev(vote) >> (32 - TILE_SIZE); + K empty_key = (K)EMPTY_KEY; + + if (tid < n) { + r_vote = r_vote >> (TILE_SIZE - rank - 1); + if (masks[tid]) { + int prefix_n = __popc(r_vote) - 1; + Tidx bias = offsets[tid / TILE_SIZE] + static_cast(prefix_n); + + if (bias == tid) return; + + K 
target_key = 0; + AtomicKey* atomic_key = reinterpret_cast*>(keys) + bias; + while (target_key != empty_key) { + //target_key = atomicCAS(keys + bias, empty_key, keys[tid]); + target_key = empty_key; + atomic_key->compare_exchange_weak(target_key, keys[tid], + cuda::std::memory_order_relaxed, + cuda::std::memory_order_relaxed); + } + if (metas) metas[bias] = metas[tid]; + for (size_t j = 0; j < dim; j++) { + values[dim * bias + j] = values[dim * tid + j]; + } + //atomicExch(keys + tid, empty_key); + atomic_key = reinterpret_cast*>(keys) + tid; + atomic_key->store(empty_key, cuda::std::memory_order_relaxed); + } + } +} + +template +__global__ void gpu_select_kvm_kernel_v2(size_t n, + K* __restrict keys, + V* __restrict values, + M* __restrict metas, + K* __restrict tmp_keys, + V* __restrict tmp_values, + M* __restrict tmp_metas, + size_t* cnt, + const size_t dim) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < n) { + size_t offset = atomicAdd(cnt, 1llu); + tmp_keys[offset] = keys[tid]; + for (size_t j = 0; j < dim; j++) { + tmp_values[offset * dim + j] = values[tid * dim + j]; + } + if (metas) { + tmp_metas[offset] = metas[tid]; + } + } +} + +template +void gpu_pick_kvm_inplace(size_t grid_size, size_t block_size, const bool* masks, + bool target, size_t n, size_t* n_evicted, Tidx* offsets, + K* __restrict keys, V* __restrict values, + M* __restrict metas, size_t dim, cudaStream_t stream) { + size_t n_offsets = (n + TILE_SIZE - 1) / TILE_SIZE; + gpu_cell_count + <<>>(masks, target, offsets, n, n_evicted); +#if THRUST_VERSION >= 101600 + auto policy = thrust::cuda::par_nosync.on(stream); +#else + auto policy = thrust::cuda::par.on(stream); +#endif + thrust::device_ptr d_src(offsets); + thrust::device_ptr d_dest(offsets); + thrust::exclusive_scan(policy, d_src, d_src + n_offsets, d_dest); + if (target) { + gpu_select_kvm_kernel + <<>>(masks, n, offsets, + keys, values, metas, dim); + } else { + throw std::runtime_error("Not used"); + //gpu_select_kvm_kernel_reverse + // <<>>(masks, n, offsets, + // keys, values, metas, dim); + } +} + +template +size_t gpu_pick_kvm_v2(size_t grid_size, size_t block_size, + bool target, size_t n, size_t* n_evicted, + K* __restrict keys, V* __restrict values, + M* __restrict metas, size_t dim, cudaStream_t stream) { +#if THRUST_VERSION >= 101600 + auto policy = thrust::cuda::par_nosync.on(stream); +#else + auto policy = thrust::cuda::par.on(stream); +#endif + thrust::device_ptr d_src(keys); + int empty_cnt = thrust::count(policy, d_src, d_src + n, (K)EMPTY_KEY); + size_t h_cnt = n - static_cast(empty_cnt); + if (h_cnt == 0) { + return 0; + } + K* tmp_keys = nullptr; + V* tmp_values = nullptr; + M* tmp_metas = nullptr; + if (target) { + CUDA_CHECK(cudaMallocAsync(&tmp_keys, h_cnt * sizeof(K), stream)); + CUDA_CHECK(cudaMemsetAsync(tmp_keys, 0, h_cnt * sizeof(K), stream)); + CUDA_CHECK(cudaMallocAsync(&tmp_values, h_cnt * dim * sizeof(V), stream)); + CUDA_CHECK(cudaMemsetAsync(tmp_values, 0, h_cnt * dim * sizeof(V), stream)); + if (metas) { + CUDA_CHECK(cudaMallocAsync(&tmp_metas, h_cnt * sizeof(M), stream)); + CUDA_CHECK(cudaMemsetAsync(tmp_metas, 0, h_cnt * sizeof(M), stream)); + } + gpu_select_kvm_kernel_v2 + <<>>(n, + keys, values, metas, tmp_keys, tmp_values, tmp_metas, n_evicted, dim); + CUDA_CHECK(cudaMemcpyAsync(keys, tmp_keys, h_cnt * sizeof(K), cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(values, tmp_values, h_cnt * dim * sizeof(V), cudaMemcpyDeviceToDevice, stream)); + if(metas) { + CUDA_CHECK(cudaMemcpyAsync(metas, 
tmp_metas, h_cnt * sizeof(M), cudaMemcpyDeviceToDevice, stream)); + } + CUDA_CHECK(cudaFreeAsync(tmp_keys, stream)); + CUDA_CHECK(cudaFreeAsync(tmp_values, stream)); + if (tmp_metas) { + CUDA_CHECK(cudaFreeAsync(tmp_metas, stream)); + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + } else { + throw std::runtime_error("Not used"); + //gpu_select_kvm_kernel_reverse + // <<>>(masks, n, offsets, + // keys, values, metas, dim); + } + return h_cnt; +} + +template +void gpu_pick_kvm_inplace_wrap(const bool* masks, bool target, + size_t n, size_t* n_evicted, + K* __restrict keys, V* __restrict values, + M* __restrict metas, size_t dim, cudaStream_t stream) { + size_t block_size = 256; + size_t grid_size = SAFE_GET_GRID_SIZE(n, block_size); + size_t n_offsets = (n + TILE_SIZE - 1) / TILE_SIZE; + int64_t* offsets = nullptr; + CUDA_CHECK(cudaMallocAsync(&offsets, sizeof(int64_t) * n_offsets, stream)); + gpu_pick_kvm_inplace(grid_size, block_size, + masks, target, n, n_evicted, offsets, keys, values, metas, dim, stream); + CUDA_CHECK(cudaFreeAsync(offsets, stream)); +} + +template +void gpu_pick_keys(const bool* masks, bool target, size_t n, size_t* n_evicted, + const K* __restrict keys, K* __restrict outkeys, + int64_t* outoffsets, cudaStream_t stream) { + size_t block_size = 256; + size_t grid_size = SAFE_GET_GRID_SIZE(n, block_size); + size_t n_offsets = (n + TILE_SIZE - 1) / TILE_SIZE; + int64_t* offsets = nullptr; + CUDA_CHECK(cudaMallocAsync(&offsets, sizeof(int64_t) * n_offsets, stream)); + CUDA_CHECK(cudaMemsetAsync(offsets, 0, sizeof(int64_t) * n_offsets, stream)); + + gpu_cell_count + <<>>(masks, target, offsets, n, n_evicted); +#if THRUST_VERSION >= 101600 + auto policy = thrust::cuda::par_nosync.on(stream); +#else + auto policy = thrust::cuda::par.on(stream); +#endif + thrust::device_ptr d_src(offsets); + thrust::device_ptr d_dest(offsets); + thrust::exclusive_scan(policy, d_src, d_src + n_offsets, d_dest); + gpu_select_key_kernel + <<>>(masks, target, n, offsets, + keys, outkeys, outoffsets); + CUDA_CHECK(cudaFreeAsync(offsets, stream)); +} + +template +void gpu_pick_kvs(const bool* masks, bool target, size_t n, size_t* n_evicted, + const K* __restrict keys, + V* __restrict values, + K* __restrict outkeys, + V* __restrict outvalues, + size_t dim, + cudaStream_t stream) { + size_t block_size = 256; + size_t grid_size = SAFE_GET_GRID_SIZE(n, block_size); + size_t n_offsets = (n + TILE_SIZE - 1) / TILE_SIZE; + int64_t* offsets = nullptr; + CUDA_CHECK(cudaMallocAsync(&offsets, sizeof(int64_t) * n_offsets, stream)); + CUDA_CHECK(cudaMemsetAsync(offsets, 0, sizeof(int64_t) * n_offsets, stream)); + + gpu_cell_count + <<>>(masks, target, offsets, n, n_evicted); +#if THRUST_VERSION >= 101600 + auto policy = thrust::cuda::par_nosync.on(stream); +#else + auto policy = thrust::cuda::par.on(stream); +#endif + thrust::device_ptr d_src(offsets); + thrust::device_ptr d_dest(offsets); + thrust::exclusive_scan(policy, d_src, d_src + n_offsets, d_dest); + gpu_select_kv_kernel + <<>>(masks, target, n, offsets, + keys, values, outkeys, outvalues, dim); + CUDA_CHECK(cudaFreeAsync(offsets, stream)); +} +} // namespace merlin +} // namespace nv diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/core_kernels.cuh b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/core_kernels.cuh new file mode 100644 index 000000000..43504dd68 --- /dev/null +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/core_kernels.cuh @@ -0,0 
+1,2510 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http:///www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "types.cuh" +#include "utils.cuh" + +using namespace cooperative_groups; +namespace cg = cooperative_groups; + +namespace nv { +namespace merlin { + +/* For improving performance consideration, allocating up to 64 table structures + * in constant memory is supported. To close this function, please set + * `TableOption::use_constant_memory` to `false`. + */ +constexpr int MAX_CONSTANT_TABLE = 64; +static std::mutex constant_table_mutex; +static uint64_t constant_table_flag = 0; + +__constant__ char + c_table_[sizeof(Table) * MAX_CONSTANT_TABLE]; + +template +int allocate_constant_table() { + std::lock_guard guard(constant_table_mutex); + if (constant_table_flag == std::numeric_limits::max()) return -1; + int table_index = 0; + while (constant_table_flag & (1l << table_index)) { + table_index++; + } + + constant_table_flag = constant_table_flag | (1l << table_index); + + return table_index; +} + +template +void release_constant_table(int table_index) { + std::lock_guard guard(constant_table_mutex); + if (table_index < 0 || table_index >= MAX_CONSTANT_TABLE) return; + constant_table_flag = constant_table_flag & (~(1l << table_index)); +} + +template +__global__ void create_locks(M* __restrict mutex, const size_t start, + const size_t end) { + size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (start + tid < end) { + new (mutex + start + tid) M(); + } +} + +template +__global__ void release_locks(M* __restrict mutex, const size_t start, + const size_t end) { + size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (start + tid < end) { + (mutex + start + tid)->~M(); + } +} + +template +__global__ void create_atomic_keys(Bucket* __restrict buckets, + const size_t start, const size_t end, + const size_t bucket_max_size) { + size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (start + tid < end) { + for (size_t i = 0; i < bucket_max_size; i++) + new (buckets[start + tid].keys(i)) + AtomicKey{static_cast(EMPTY_KEY)}; + } +} + +template +__global__ void create_atomic_metas(Bucket* __restrict buckets, + const size_t start, const size_t end, + const size_t bucket_max_size) { + size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (start + tid < end) { + for (size_t i = 0; i < bucket_max_size; i++) { + new (buckets[start + tid].metas(i)) + AtomicMeta{static_cast(EMPTY_META)}; + } + new (&(buckets[start + tid].cur_meta)) + AtomicMeta{static_cast(EMPTY_META)}; + new (&(buckets[start + tid].min_meta)) + AtomicMeta{static_cast(EMPTY_META)}; + new (&(buckets[start + tid].min_pos)) AtomicPos{1}; + } +} + +/* Initialize the buckets with index from start to end. 
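+
+   In outline: (a) the value storage for these buckets is carved out of
+   memory slices no larger than `bytes_per_slice`, preferring HBM and falling
+   back to pinned host memory once `remaining_hbm_for_vectors` is exhausted;
+   (b) the per-bucket key/meta arrays are allocated and their atomics are
+   constructed with EMPTY_KEY / EMPTY_META; (c) one lock per bucket is
+   constructed. Illustrative sizing only: with bucket_max_size = 128,
+   dim = 64 and float values, one bucket of vectors takes
+   128 * 64 * 4 B = 32 KB, so a 2 GB slice (the largest get_slice_size()
+   returns) holds 65536 buckets.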
*/ +template +void initialize_buckets(Table** table, const size_t start, + const size_t end) { + /* As testing results show us, when the number of buckets is greater than + * the 4 million the performance will drop significantly, we believe the + * to many pinned memory allocation causes this issue, so we change the + * strategy to allocate some memory slices whose size is not greater than + * 64GB, and put the buckets pointer point to the slices. + */ + MERLIN_CHECK(start < end, + "initialize_buckets, start should be less than end!"); + size_t buckets_num = end - start; + const size_t total_size_of_vectors = + buckets_num * (*table)->bucket_max_size * sizeof(V) * (*table)->dim; + const size_t num_of_memory_slices = + 1 + (total_size_of_vectors - 1) / (*table)->bytes_per_slice; + size_t num_of_buckets_in_one_slice = + (*table)->bytes_per_slice / + ((*table)->bucket_max_size * sizeof(V) * (*table)->dim); + size_t num_of_allocated_buckets = 0; + + realloc_managed( + &((*table)->slices), (*table)->num_of_memory_slices * sizeof(V*), + ((*table)->num_of_memory_slices + num_of_memory_slices) * sizeof(V*)); + + for (size_t i = (*table)->num_of_memory_slices; + i < (*table)->num_of_memory_slices + num_of_memory_slices; i++) { + if (i == (*table)->num_of_memory_slices + num_of_memory_slices - 1) { + num_of_buckets_in_one_slice = buckets_num - num_of_allocated_buckets; + } + size_t slice_real_size = num_of_buckets_in_one_slice * + (*table)->bucket_max_size * sizeof(V) * + (*table)->dim; + if ((*table)->remaining_hbm_for_vectors >= slice_real_size) { + CUDA_CHECK(cudaMalloc(&((*table)->slices[i]), slice_real_size)); + (*table)->remaining_hbm_for_vectors -= slice_real_size; + } else { + (*table)->is_pure_hbm = false; + CUDA_CHECK( + cudaMallocHost(&((*table)->slices[i]), slice_real_size, + cudaHostAllocMapped | cudaHostAllocWriteCombined)); + } + for (int j = 0; j < num_of_buckets_in_one_slice; j++) { + (*table)->buckets[start + num_of_allocated_buckets + j].vectors = + (*table)->slices[i] + j * (*table)->bucket_max_size * (*table)->dim; + } + num_of_allocated_buckets += num_of_buckets_in_one_slice; + } + + (*table)->num_of_memory_slices += num_of_memory_slices; + for (int i = start; i < end; i++) { + CUDA_CHECK(cudaMalloc(&((*table)->buckets[i].keys_), + (*table)->bucket_max_size * sizeof(AtomicKey))); + CUDA_CHECK(cudaMalloc(&((*table)->buckets[i].metas_), + (*table)->bucket_max_size * sizeof(AtomicMeta))); + } + + { + const size_t block_size = 512; + const size_t N = end - start + 1; + const int grid_size = SAFE_GET_GRID_SIZE(N, block_size); + create_locks<<>>((*table)->locks, start, end); + } + + { + const size_t block_size = 512; + const size_t N = end - start + 1; + const int grid_size = SAFE_GET_GRID_SIZE(N, block_size); + create_atomic_keys<<>>( + (*table)->buckets, start, end, (*table)->bucket_max_size); + } + + { + const size_t block_size = 512; + const size_t N = end - start + 1; + const int grid_size = SAFE_GET_GRID_SIZE(N, block_size); + create_atomic_metas<<>>( + (*table)->buckets, start, end, (*table)->bucket_max_size); + } + CudaCheckError(); +} + +template +size_t get_slice_size(Table** table) { + const size_t min_slice_size = + (*table)->bucket_max_size * sizeof(V) * (*table)->dim; + const size_t max_table_size = (*table)->max_size * sizeof(V) * (*table)->dim; + size_t slice_size = 0; + + if (max_table_size >= GB(16)) { + slice_size = GB(2); + } else if (max_table_size >= GB(2)) { + slice_size = MB(128); + } else if (max_table_size >= MB(128)) { + slice_size = MB(16); + } else if 
(max_table_size >= MB(16)) { + slice_size = MB(1); + } else { + slice_size = min_slice_size; + } + + return std::max(min_slice_size, slice_size); +} + +/* Initialize a Table struct. + + K: The key type + V: The value type which should be static array type and C++ class + with customized construct is not supported. + M: The meta type, the meta will be used to store the timestamp + or occurrence frequency or any thing for eviction. + DIM: Vector dimension. +*/ +template +void create_table(Table** table, const size_t dim, + const size_t init_size = 134217728, + const size_t max_size = std::numeric_limits::max(), + const size_t max_hbm_for_vectors = 0, + const size_t bucket_max_size = 128, + const size_t tile_size = 32, const bool primary = true) { + CUDA_CHECK(cudaMallocManaged((void**)table, sizeof(Table))); + CUDA_CHECK(cudaMemset(*table, 0, sizeof(Table))); + (*table)->dim = dim; + (*table)->bucket_max_size = bucket_max_size; + (*table)->max_size = std::max(init_size, max_size); + (*table)->tile_size = tile_size; + (*table)->is_pure_hbm = true; + (*table)->bytes_per_slice = get_slice_size(table); + + // The bucket number will be the minimum needed for saving memory if no + // rehash. + if ((init_size * 2) > (*table)->max_size) { + (*table)->buckets_num = + 1 + (((*table)->max_size - 1) / (*table)->bucket_max_size); + } else { + (*table)->buckets_num = 1; + while ((*table)->buckets_num * (*table)->bucket_max_size < init_size) { + (*table)->buckets_num *= 2; + } + } + + (*table)->capacity = (*table)->buckets_num * (*table)->bucket_max_size; + (*table)->max_hbm_for_vectors = max_hbm_for_vectors; + (*table)->remaining_hbm_for_vectors = max_hbm_for_vectors; + (*table)->primary = primary; + + CUDA_CHECK(cudaMalloc((void**)&((*table)->locks), + (*table)->buckets_num * sizeof(Mutex))); + CUDA_CHECK( + cudaMemset((*table)->locks, 0, (*table)->buckets_num * sizeof(Mutex))); + + CUDA_CHECK(cudaMalloc((void**)&((*table)->buckets_size), + (*table)->buckets_num * sizeof(int))); + CUDA_CHECK(cudaMemset((*table)->buckets_size, 0, + (*table)->buckets_num * sizeof(int))); + + CUDA_CHECK( + cudaMallocManaged((void**)&((*table)->buckets), + (*table)->buckets_num * sizeof(Bucket))); + CUDA_CHECK(cudaMemset((*table)->buckets, 0, + (*table)->buckets_num * sizeof(Bucket))); + + initialize_buckets(table, 0, (*table)->buckets_num); + CudaCheckError(); +} + +/* Double the capacity on storage, must be followed by calling the + * rehash_kernel. */ +template +void double_capacity(Table** table) { + realloc(&((*table)->locks), (*table)->buckets_num * sizeof(Mutex), + (*table)->buckets_num * sizeof(Mutex) * 2); + realloc(&((*table)->buckets_size), (*table)->buckets_num * sizeof(int), + (*table)->buckets_num * sizeof(int) * 2); + + realloc_managed*>( + &((*table)->buckets), (*table)->buckets_num * sizeof(Bucket), + (*table)->buckets_num * sizeof(Bucket) * 2); + + initialize_buckets(table, (*table)->buckets_num, + (*table)->buckets_num * 2); + + (*table)->capacity *= 2; + (*table)->buckets_num *= 2; +} + +/* free all of the resource of a Table. 
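+
+   It releases, in order: the per-bucket key/meta arrays, the value slices
+   (cudaFree or cudaFreeHost depending on where each slice lives), the bucket
+   locks, and the Table struct itself. A minimal pairing sketch (illustrative
+   only; `K`, `V`, `M` and the dim value are placeholders, not taken from this
+   patch):
+
+     Table<K, V, M>* table = nullptr;
+     create_table(&table, /*dim=*/64);   // template args deduced from `table`
+     // ... launch upsert / lookup kernels against the table ...
+     destroy_table(&table);              // frees HBM and pinned host slices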
*/ +template +void destroy_table(Table** table) { + for (int i = 0; i < (*table)->buckets_num; i++) { + CUDA_CHECK(cudaFree((*table)->buckets[i].keys_)); + CUDA_CHECK(cudaFree((*table)->buckets[i].metas_)); + } + + for (int i = 0; i < (*table)->num_of_memory_slices; i++) { + if (is_on_device((*table)->slices[i])) { + CUDA_CHECK(cudaFree((*table)->slices[i])); + } else { + CUDA_CHECK(cudaFreeHost((*table)->slices[i])); + } + } + { + const size_t block_size = 512; + const size_t N = (*table)->buckets_num; + const int grid_size = SAFE_GET_GRID_SIZE(N, block_size); + release_locks + <<>>((*table)->locks, 0, (*table)->buckets_num); + } + CUDA_CHECK(cudaFree((*table)->slices)); + CUDA_CHECK(cudaFree((*table)->buckets_size)); + CUDA_CHECK(cudaFree((*table)->buckets)); + CUDA_CHECK(cudaFree((*table)->locks)); + CUDA_CHECK(cudaFree(*table)); + CUDA_CHECK(cudaDeviceSynchronize()); + CudaCheckError(); +} + +template +__forceinline__ __device__ void defragmentation_for_rehash( + Bucket* __restrict bucket, uint32_t remove_pos, + const size_t bucket_max_size, const size_t buckets_num, const size_t dim) { + uint32_t key_idx; + size_t global_idx = 0; + size_t start_idx = 0; + K find_key; + K hashed_key; + + uint32_t empty_pos = remove_pos; + + int i = 1; + while (i < bucket_max_size) { + key_idx = (remove_pos + i) & (bucket_max_size - 1); + find_key = (bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed); + if (find_key == static_cast(EMPTY_KEY)) { + break; + } + hashed_key = Murmur3HashDevice(find_key); + global_idx = hashed_key % (buckets_num * bucket_max_size); + start_idx = global_idx % bucket_max_size; + + if ((start_idx <= empty_pos && empty_pos < key_idx) || + (key_idx < start_idx && start_idx <= empty_pos) || + (empty_pos <= key_idx && key_idx < start_idx)) { + const K key = + (*(bucket->keys(key_idx))).load(cuda::std::memory_order_relaxed); + (*(bucket->keys(empty_pos))).store(key, cuda::std::memory_order_relaxed); + const M meta = + (*(bucket->metas(key_idx))).load(cuda::std::memory_order_relaxed); + (*(bucket->metas(empty_pos))) + .store(meta, cuda::std::memory_order_relaxed); + for (int j = 0; j < dim; j++) { + bucket->vectors[empty_pos * dim + j] = + bucket->vectors[key_idx * dim + j]; + } + (*(bucket->keys(key_idx))) + .store(static_cast(EMPTY_KEY), cuda::std::memory_order_relaxed); + empty_pos = key_idx; + remove_pos = key_idx; + i = 1; + } else { + i++; + } + } +} + +template +__forceinline__ __device__ void refresh_bucket_meta( + cg::thread_block_tile g, Bucket* bucket, + const size_t bucket_max_size) { + M min_val = MAX_META; + int min_pos = 0; + + for (int i = g.thread_rank(); i < bucket_max_size; i += TILE_SIZE) { + const K key = (bucket->keys(i))->load(cuda::std::memory_order_relaxed); + if (key == static_cast(EMPTY_KEY) || + key == static_cast(RECLAIM_KEY)) { + continue; + } + const M meta = bucket->metas(i)->load(cuda::std::memory_order_relaxed); + if (meta < min_val) { + min_pos = i; + min_val = meta; + } + } + M global_min_val = cg::reduce(g, min_val, cg::less()); + if (min_val == global_min_val) { + bucket->min_pos.store(min_pos, cuda::std::memory_order_relaxed); + bucket->min_meta.store(min_val, cuda::std::memory_order_relaxed); + } +} + +template +__device__ __forceinline__ void copy_vector( + cg::thread_block_tile const& g, const V* src, V* dst, + const size_t dim) { + for (auto i = g.thread_rank(); i < dim; i += g.size()) { + dst[i] = src[i]; + } + + // cuda::barrier bar; + // init(&bar, 1); + // cuda::memcpy_async(g, dst, src, dim * sizeof(V), bar); + // + // 
bar.arrive_and_wait(); +} + +/* Write the N data from src to each address in *dst by using CPU threads, + * usually called by upsert kernel. + * + * @note: In some machines with AMD CPUs, the `write_kernel` has low performance + * thru PCI-E, so we try to use the `memcpy` on CPU threads for writing work to + * reach better performance. + */ +template +void write_by_cpu(V** __restrict dst, const V* __restrict src, + const int* __restrict offset, size_t dim, int N, + int n_worker = 16) { + std::vector thds; + if (n_worker < 1) n_worker = 1; + + auto functor = [dim](V** __restrict dst, const V* __restrict src, + const int* __restrict offset, int handled_size, + int trunk_size) -> void { + for (int i = handled_size; i < handled_size + trunk_size; i++) { + if (dst[i] != nullptr) { + memcpy(dst[i], src + offset[i] * dim, sizeof(V) * dim); + } + } + }; + + int32_t trunk_size_floor = N / n_worker; + int32_t trunk_size_remain = N % n_worker; + int32_t n_worker_used = trunk_size_floor == 0 ? trunk_size_remain : n_worker; + + size_t handled_size = 0; + for (int i = 0; i < n_worker_used; i++) { + int32_t cur_trunk_size = trunk_size_floor; + if (trunk_size_remain != 0) { + cur_trunk_size += 1; + trunk_size_remain--; + } + thds.push_back( + std::thread(functor, dst, src, offset, handled_size, cur_trunk_size)); + handled_size += cur_trunk_size; + } + + for (int i = 0; i < n_worker_used; i++) { + thds[i].join(); + } +} + +template +__forceinline__ __device__ void move_key_to_new_bucket( + cg::thread_block_tile g, int rank, const K& key, const M& meta, + const V* __restrict vector, Bucket* __restrict new_bucket, + const size_t new_bkt_idx, const size_t new_start_idx, + int* __restrict buckets_size, const size_t bucket_max_size, + const size_t buckets_num, const size_t dim) { + uint32_t key_pos; + unsigned empty_vote; + int local_size; + int src_lane; + + for (uint32_t tile_offset = 0; tile_offset < bucket_max_size; + tile_offset += TILE_SIZE) { + size_t key_offset = + (new_start_idx + tile_offset + rank) & (bucket_max_size - 1); + const K current_key = + (*(new_bucket->keys(key_offset))).load(cuda::std::memory_order_relaxed); + empty_vote = g.ballot(current_key == static_cast(EMPTY_KEY)); + if (empty_vote) { + src_lane = __ffs(empty_vote) - 1; + key_pos = + (new_start_idx + tile_offset + src_lane) & (bucket_max_size - 1); + local_size = buckets_size[new_bkt_idx]; + if (rank == src_lane) { + new_bucket->keys(key_pos)->store(key, cuda::std::memory_order_relaxed); + new_bucket->metas(key_pos)->store(meta, + cuda::std::memory_order_relaxed); + atomicAdd(&(buckets_size[new_bkt_idx]), 1); + } + local_size = g.shfl(local_size, src_lane); + if (local_size >= bucket_max_size) { + refresh_bucket_meta(g, new_bucket, bucket_max_size); + } + copy_vector(g, vector, new_bucket->vectors + key_pos * dim, + dim); + break; + } + } +} + +template +__global__ void rehash_kernel_for_fast_mode( + const Table* __restrict table, size_t N) { + Bucket* buckets = table->buckets; + int* __restrict buckets_size = table->buckets_size; + const size_t bucket_max_size = table->bucket_max_size; + const size_t buckets_num = table->buckets_num; + const size_t dim = table->dim; + + auto g = cg::tiled_partition(cg::this_thread_block()); + int rank = g.thread_rank(); + size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; + size_t global_idx; + uint32_t start_idx = 0; + K target_key = 0; + M target_meta = 0; + + for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { + uint32_t bkt_idx = t / TILE_SIZE; + Bucket* bucket = (buckets + bkt_idx); 
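+
+    // The loop below runs after the capacity has been doubled (see
+    // double_capacity()): under the bucket lock, every live key is re-hashed
+    // against the enlarged table, keys whose new home bucket differs are
+    // moved there via move_key_to_new_bucket(), and the hole left behind is
+    // closed by defragmentation_for_rehash() so probe chains stay contiguous;
+    // keys that still map to this bucket are left in place.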
+ + lock(g, table->locks[bkt_idx]); + uint32_t key_idx = 0; + while (key_idx < bucket_max_size) { + key_idx = g.shfl(key_idx, 0); + target_key = + (bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed); + target_meta = + bucket->metas(key_idx)->load(cuda::std::memory_order_relaxed); + if (target_key != static_cast(EMPTY_KEY) && + target_key != static_cast(RECLAIM_KEY)) { + K hashed_key = Murmur3HashDevice(target_key); + global_idx = hashed_key % (buckets_num * bucket_max_size); + uint32_t new_bkt_idx = global_idx / bucket_max_size; + if (new_bkt_idx != bkt_idx) { + start_idx = global_idx % bucket_max_size; + move_key_to_new_bucket( + g, rank, target_key, target_meta, + (bucket->vectors + key_idx * dim), buckets + new_bkt_idx, + new_bkt_idx, start_idx, buckets_size, bucket_max_size, + buckets_num, table->dim); + if (rank == 0) { + (bucket->keys(key_idx)) + ->store(static_cast(EMPTY_KEY), + cuda::std::memory_order_relaxed); + atomicSub(&(buckets_size[bkt_idx]), 1); + defragmentation_for_rehash( + bucket, key_idx, bucket_max_size, buckets_num / 2, dim); + key_idx = 0; + } + } else { + key_idx++; + } + } else { + key_idx++; + } + } + unlock(g, table->locks[bkt_idx]); + } +} + +/* Write the N data from src to each address in *dst, + usually called by upsert kernel. + + `src`: A continuous memory pointer with Vector + which can be HBM. + `dst`: A pointer of pointer to V which should be on HBM, + but each value (a pointer of V) could point to a + memory on HBM or HMEM. + `N`: Number of vectors that need to be written. +*/ +template +__global__ void write_kernel(const V* __restrict src, V** __restrict dst, + const int* __restrict src_offset, const size_t dim, + const size_t N) { + size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; + + for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { + int vec_index = int(t / dim); + int dim_index = t % dim; + + if (dst[vec_index] != nullptr) { + if (src_offset != nullptr) { + dst[vec_index][dim_index] = + src[src_offset[vec_index] * dim + dim_index]; + } else { + dst[vec_index][dim_index] = src[vec_index * dim + dim_index]; + } + } + } +} + +/* Write the values of delta_or_val into the table. If the key[i] is already in + the table indicted be @exists[i], a @delta_or_val[i] will be added to the the + existing value. if the key not exists, the value @val_or_delta[i] will be + assigned to the address @dst[i]. + + `delta_or_val`: will be treated as val and accumlating should be executed. + `dst`: A pointer of pointer to V which should be on HBM, + but each value (a pointer of V) could point to a + memory on HBM or HMEM. + `existed`: If the keys existed before this kernel is executed. + `status`: The existence status for each key when the kernel is being + executed. + + `N`: number of vectors needed to be writen. 
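+
+  Illustration (indices shown without the src_offset indirection): a slot is
+  written only when `existed[i] == status[i]`, i.e. the key's presence did not
+  change between the host-side check and kernel execution; then
+    status[i] == true   ->  dst[i][d] += delta_or_val[i * dim + d]   (accumulate)
+    status[i] == false  ->  dst[i][d]  = delta_or_val[i * dim + d]   (assign)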
+*/ +template +__global__ void write_with_accum_kernel(const V* __restrict delta_or_val, + V** __restrict dst, + const bool* __restrict existed, + const bool* __restrict status, + const int* __restrict src_offset, + const size_t dim, size_t N) { + size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; + + for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { + int vec_index = int(t / dim); + int dim_index = t % dim; + + if (dst[vec_index] != nullptr && + existed[src_offset[vec_index]] == status[src_offset[vec_index]]) { + if (status[src_offset[vec_index]]) { + dst[vec_index][dim_index] += + delta_or_val[src_offset[vec_index] * dim + dim_index]; + } else { + dst[vec_index][dim_index] = + delta_or_val[src_offset[vec_index] * dim + dim_index]; + } + } + } +} + +/* Add a @delta[i] to the the value saved in the address @dst[i]. + + `delta`: a delta value which should be add to. + `dst`: A pointer of pointer to V which should be on HBM, + but each value (a pointer of V) could point to a + memory on HBM or HMEM. + `N`: number of vectors needed to be writen. +*/ +template +__global__ void write_with_accum_kernel(const V* __restrict delta, + V** __restrict dst, + const int* __restrict src_offset, + const size_t dim, size_t N) { + size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; + + for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { + int vec_index = int(t / dim); + int dim_index = t % dim; + + if (dst[vec_index] != nullptr) { + dst[vec_index][dim_index] += + delta[src_offset[vec_index] * dim + dim_index]; + } + } +} + +/* Read the N data from src to each address in *dst, + usually called by upsert kernel. + + `src`: A pointer of pointer of V which should be on HBM, + but each value (a pointer of V) could point to a + memory on HBM or HMEM. + `dst`: A continue memory pointer with Vector + which should be HBM. + `mask`: One for each `dst`. If true, reading from src, + or false reading from default_val. + `default_val`: Default value with shape (1, DIM) or (N, DIM) + `N`: The number of vectors needed to be read. + 'full_size_default': + If true, the d_def_val will be treated as + a full size default value which shape must be (N, DIM). +*/ +template +__global__ void read_kernel(const V* const* __restrict src, V* __restrict dst, + const bool* mask, const int* __restrict dst_offset, + const size_t dim, size_t N) { + size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; + + for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { + int vec_index = int(t / dim); + int dim_index = t % dim; + int real_dst_offset = + dst_offset != nullptr ? dst_offset[vec_index] : vec_index; + + /// Copy selected values and fill in default value for all others. + if (mask[real_dst_offset] && src[vec_index] != nullptr) { + dst[real_dst_offset * dim + dim_index] = src[vec_index][dim_index]; + } + } +} + +/* Read the N data from src to each address in *dst, + * usually called by upsert kernel. + * + * `src`: A pointer of pointer of V which should be on HBM, + * but each value (a pointer of V) could point to a + * memory on HBM or HMEM. + * `dst`: A continue memory pointer with Vector + * which should be HBM. + * `N`: Number of vectors needed to be read. + */ +template +__global__ void read_kernel(const V** __restrict src, V* __restrict dst, + const int* __restrict dst_offset, const size_t dim, + const size_t N) { + size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; + + for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { + int vec_index = int(t / dim); + int real_dst_offset = + dst_offset != nullptr ? 
dst_offset[vec_index] : vec_index; + int dim_index = t % dim; + if (src[vec_index] != nullptr) { + dst[real_dst_offset * dim + dim_index] = src[vec_index * dim + dim_index]; + } + } +} + +template +__device__ __forceinline__ unsigned find_in_bucket( + cg::thread_block_tile g, Bucket* bucket, + const K& find_key, uint32_t& tile_offset, const uint32_t& start_idx, + const size_t& bucket_max_size) { + uint32_t key_pos = 0; + +#pragma unroll + for (tile_offset = 0; tile_offset < bucket_max_size; + tile_offset += TILE_SIZE) { + key_pos = + (start_idx + tile_offset + g.thread_rank()) & (bucket_max_size - 1); + auto const current_key = + bucket->keys(key_pos)->load(cuda::std::memory_order_relaxed); + auto const found_vote = g.ballot(find_key == current_key); + if (found_vote) { + return found_vote; + } + + if (g.any(current_key == static_cast(EMPTY_KEY))) { + return 0; + } + } + return 0; +} + +template +__device__ __forceinline__ OccupyResult find_without_lock( + cg::thread_block_tile g, Bucket* __restrict__ bucket, + const K desired_key, const size_t start_idx, int& key_pos, int& src_lane, + const size_t bucket_max_size) { + K expected_key = static_cast(EMPTY_KEY); + + AtomicKey* current_key; + + unsigned vote = 0; + + for (uint32_t tile_offset = 0; tile_offset < bucket_max_size; + tile_offset += TILE_SIZE) { + key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size; + + current_key = bucket->keys(key_pos); + + expected_key = current_key->load(cuda::std::memory_order_relaxed); + vote = g.ballot(desired_key == expected_key); + if (vote) { + src_lane = __ffs(vote) - 1; + key_pos = g.shfl(key_pos, src_lane); + return OccupyResult::DUPLICATE; + } + vote = g.ballot(expected_key == static_cast(EMPTY_KEY)); + if (vote) break; + } + return OccupyResult::CONTINUE; +} + +template +__device__ __inline__ OccupyResult find_and_lock_when_vacant( + cg::thread_block_tile g, Bucket* __restrict__ bucket, + const K desired_key, const M desired_meta, K& evicted_key, + const size_t start_idx, int& key_pos, int& src_lane, + const size_t bucket_max_size) { + K expected_key = static_cast(EMPTY_KEY); + + AtomicKey* current_key; + AtomicMeta* current_meta; + + K local_min_meta_key = static_cast(EMPTY_KEY); + + M local_min_meta_val = MAX_META; + M temp_min_meta_val = MAX_META; + int local_min_meta_pos = -1; + + unsigned vote = 0; + bool result = false; + + for (uint32_t tile_offset = 0; tile_offset < bucket_max_size; + tile_offset += TILE_SIZE) { + key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size; + + current_key = bucket->keys(key_pos); + + // Step 1: try find and lock the desired_key. + do { + expected_key = desired_key; + result = current_key->compare_exchange_strong( + expected_key, static_cast(LOCKED_KEY), + cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed); + vote = g.ballot(result); + if (vote) { + src_lane = __ffs(vote) - 1; + key_pos = g.shfl(key_pos, src_lane); + return OccupyResult::DUPLICATE; + } + vote = g.ballot(expected_key == static_cast(EMPTY_KEY)); + if (vote) break; + vote = g.ballot(expected_key == static_cast(LOCKED_KEY)); + } while (vote != 0); + + // Step 2: (TBD)try find empty location. 
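+    //
+    // Each lane that observed EMPTY_KEY is tried in ballot order: the chosen
+    // lane CASes EMPTY_KEY -> LOCKED_KEY and, on success, the whole tile
+    // returns OCCUPIED_EMPTY with key_pos broadcast from that lane. If the
+    // CAS loses (another tile claimed the slot first), the next candidate
+    // lane in the ballot is tried.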
+ while (vote) { + src_lane = __ffs(vote) - 1; + if (src_lane == g.thread_rank()) { + expected_key = static_cast(EMPTY_KEY); + result = current_key->compare_exchange_strong( + expected_key, static_cast(LOCKED_KEY), + cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed); + } + result = g.shfl(result, src_lane); + if (result) { + key_pos = g.shfl(key_pos, src_lane); + return OccupyResult::OCCUPIED_EMPTY; + } + vote -= ((unsigned(0x1)) << src_lane); + } + } + + for (uint32_t tile_offset = 0; tile_offset < bucket_max_size; + tile_offset += TILE_SIZE) { + key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size; + + current_meta = bucket->metas(key_pos); + + // Step 4: record min meta location. + temp_min_meta_val = current_meta->load(cuda::std::memory_order_relaxed); + if (temp_min_meta_val < local_min_meta_val) { + expected_key = + bucket->keys(key_pos)->load(cuda::std::memory_order_relaxed); + if (expected_key != static_cast(LOCKED_KEY) && + expected_key != static_cast(EMPTY_KEY)) { + local_min_meta_key = expected_key; + local_min_meta_val = temp_min_meta_val; + local_min_meta_pos = key_pos; + } + } + } + // Step 5: insert by evicting some one. + const M global_min_meta_val = + cg::reduce(g, local_min_meta_val, cg::less()); + if (desired_meta < global_min_meta_val) { + return OccupyResult::REFUSED; + } + vote = g.ballot(local_min_meta_val <= global_min_meta_val); + if (vote) { + src_lane = __ffs(vote) - 1; + result = false; + if (src_lane == g.thread_rank()) { + // TBD: Here can be compare_exchange_weak. Do benchmark. + current_key = bucket->keys(local_min_meta_pos); + current_meta = bucket->metas(local_min_meta_pos); + evicted_key = local_min_meta_key; + result = current_key->compare_exchange_strong( + local_min_meta_key, static_cast(LOCKED_KEY), + cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed); + + // Need to recover when fail. + if (result && (current_meta->load(cuda::std::memory_order_relaxed) > + global_min_meta_val)) { + current_key->store(local_min_meta_key, cuda::std::memory_order_relaxed); + result = false; + } + } + result = g.shfl(result, src_lane); + if (result) { + // Not every `evicted_key` is correct expect the `src_lane` thread. + key_pos = g.shfl(local_min_meta_pos, src_lane); + return (evicted_key == static_cast(RECLAIM_KEY)) + ? OccupyResult::OCCUPIED_RECLAIMED + : OccupyResult::EVICT; + } + } + return OccupyResult::CONTINUE; +} + +template +__device__ __forceinline__ OccupyResult find_and_lock_when_full( + cg::thread_block_tile g, Bucket* __restrict__ bucket, + const K desired_key, const M desired_meta, K& evicted_key, + const size_t start_idx, int& key_pos, int& src_lane, + const size_t bucket_max_size) { + K expected_key = static_cast(EMPTY_KEY); + + AtomicKey* current_key; + AtomicMeta* current_meta; + + K local_min_meta_key = static_cast(EMPTY_KEY); + + M local_min_meta_val = MAX_META; + M temp_min_meta_val = MAX_META; + int local_min_meta_pos = -1; + + unsigned vote = 0; + bool result = false; + + for (uint32_t tile_offset = 0; tile_offset < bucket_max_size; + tile_offset += TILE_SIZE) { + key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size; + + current_key = bucket->keys(key_pos); + + // Step 1: try find and lock the desired_key. 
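+    //
+    // Every lane CASes its slot from desired_key to LOCKED_KEY. A successful
+    // CAS means the key is already present (DUPLICATE, with key_pos/src_lane
+    // broadcast to the tile); observing LOCKED_KEY means another tile holds
+    // that slot, so the tile retries. Unlike find_and_lock_when_vacant there
+    // is no EMPTY_KEY early-out, because the bucket is known to be full here.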
+ do { + expected_key = desired_key; + result = current_key->compare_exchange_strong( + expected_key, static_cast(LOCKED_KEY), + cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed); + vote = g.ballot(result); + if (vote) { + src_lane = __ffs(vote) - 1; + key_pos = g.shfl(key_pos, src_lane); + return OccupyResult::DUPLICATE; + } + vote = g.ballot(expected_key == static_cast(LOCKED_KEY)); + } while (vote != 0); + } + + for (uint32_t tile_offset = 0; tile_offset < bucket_max_size; + tile_offset += TILE_SIZE) { + key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size; + + // Step 2: record min meta location. + temp_min_meta_val = + bucket->metas(key_pos)->load(cuda::std::memory_order_relaxed); + if (temp_min_meta_val < local_min_meta_val) { + while ((expected_key = bucket->keys(key_pos)->load( + cuda::std::memory_order_relaxed)) == + static_cast(LOCKED_KEY)) + ; + local_min_meta_key = expected_key; + local_min_meta_val = temp_min_meta_val; + local_min_meta_pos = key_pos; + } + } + + // Step 3: insert by evicting some one. + const M global_min_meta_val = + cg::reduce(g, local_min_meta_val, cg::less()); + if (desired_meta < global_min_meta_val) { + return OccupyResult::REFUSED; + } + vote = g.ballot(local_min_meta_val <= global_min_meta_val); + if (vote) { + src_lane = __ffs(vote) - 1; + result = false; + if (src_lane == g.thread_rank()) { + // TBD: Here can be compare_exchange_weak. Do benchmark. + current_key = bucket->keys(local_min_meta_pos); + current_meta = bucket->metas(local_min_meta_pos); + evicted_key = local_min_meta_key; + result = current_key->compare_exchange_strong( + local_min_meta_key, static_cast(LOCKED_KEY), + cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed); + + // Need to recover when fail. + if (result && (current_meta->load(cuda::std::memory_order_relaxed) > + global_min_meta_val)) { + current_key->store(local_min_meta_key, cuda::std::memory_order_relaxed); + result = false; + } + } + result = g.shfl(result, src_lane); + if (result) { + // Not every `evicted_key` is correct expect the `src_lane` thread. + key_pos = g.shfl(local_min_meta_pos, src_lane); + return (evicted_key == static_cast(RECLAIM_KEY)) + ? OccupyResult::OCCUPIED_RECLAIMED + : OccupyResult::EVICT; + } + } + return OccupyResult::CONTINUE; +} + +template +__device__ __forceinline__ OccupyResult find_and_lock_for_update( + cg::thread_block_tile g, Bucket* __restrict__ bucket, + const K desired_key, const size_t start_idx, int& key_pos, int& src_lane, + const size_t bucket_max_size) { + K expected_key = static_cast(EMPTY_KEY); + + AtomicKey* current_key; + + unsigned vote = 0; + bool result = false; + + for (uint32_t tile_offset = 0; tile_offset < bucket_max_size; + tile_offset += TILE_SIZE) { + key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size; + + current_key = bucket->keys(key_pos); + + // Step 1: try find and lock the desired_key. 
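+    //
+    // Update-only variant: if any lane observes EMPTY_KEY the probe chain is
+    // exhausted and the key cannot be present, so the tile returns REFUSED
+    // instead of claiming the empty slot; only an existing key is locked
+    // (DUPLICATE) and handed back for an in-place update.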
+ do { + expected_key = desired_key; + result = current_key->compare_exchange_strong( + expected_key, static_cast(LOCKED_KEY), + cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed); + vote = g.ballot(result); + if (vote) { + src_lane = __ffs(vote) - 1; + key_pos = g.shfl(key_pos, src_lane); + return OccupyResult::DUPLICATE; + } + vote = g.ballot(expected_key == static_cast(EMPTY_KEY)); + if (vote) return OccupyResult::REFUSED; + vote = g.ballot(expected_key == static_cast(LOCKED_KEY)); + } while (vote != 0); + } + return OccupyResult::REFUSED; +} + +template +__forceinline__ __device__ Bucket* get_key_position( + Bucket* __restrict buckets, const K key, size_t& bkt_idx, + size_t& start_idx, const size_t buckets_num, const size_t bucket_max_size) { + const uint32_t hashed_key = Murmur3HashDevice(key); + const size_t global_idx = hashed_key % (buckets_num * bucket_max_size); + bkt_idx = global_idx / bucket_max_size; + start_idx = global_idx % bucket_max_size; + return buckets + bkt_idx; +} + +template +__forceinline__ __device__ void update_meta(Bucket* __restrict bucket, + const int key_pos, + const M* __restrict metas, + const int key_idx) { + if (metas == nullptr) { + M cur_meta = + bucket->cur_meta.fetch_add(1, cuda::std::memory_order_relaxed) + 1; + bucket->metas(key_pos)->store(cur_meta, cuda::std::memory_order_relaxed); + } else { + bucket->metas(key_pos)->store(metas[key_idx], + cuda::std::memory_order_relaxed); + } + return; +} + +template +__global__ void upsert_kernel_with_io_core( + const Table* __restrict table, const size_t bucket_max_size, + const size_t buckets_num, const size_t dim, const K* __restrict keys, + const V* __restrict values, const M* __restrict metas, size_t N) { + auto g = cg::tiled_partition(cg::this_thread_block()); + int* buckets_size = table->buckets_size; + + for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; + t += blockDim.x * gridDim.x) { + int key_pos = -1; + size_t key_idx = t / TILE_SIZE; + + const K insert_key = keys[key_idx]; + + if (IS_RESERVED_KEY(insert_key)) continue; + + const M insert_meta = + metas != nullptr ? 
metas[key_idx] : static_cast(MAX_META); + const V* insert_value = values + key_idx * dim; + + size_t bkt_idx = 0; + size_t start_idx = 0; + int src_lane = -1; + K evicted_key; + + Bucket* bucket = + get_key_position(table->buckets, insert_key, bkt_idx, start_idx, + buckets_num, bucket_max_size); + + OccupyResult occupy_result{OccupyResult::INITIAL}; + const int bucket_size = buckets_size[bkt_idx]; + do { + if (bucket_size < bucket_max_size) { + occupy_result = find_and_lock_when_vacant( + g, bucket, insert_key, insert_meta, evicted_key, start_idx, key_pos, + src_lane, bucket_max_size); + } else { + start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; + occupy_result = find_and_lock_when_full( + g, bucket, insert_key, insert_meta, evicted_key, start_idx, key_pos, + src_lane, bucket_max_size); + } + + occupy_result = g.shfl(occupy_result, src_lane); + } while (occupy_result == OccupyResult::CONTINUE); + + if (occupy_result == OccupyResult::REFUSED) continue; + + if ((occupy_result == OccupyResult::OCCUPIED_EMPTY || + occupy_result == OccupyResult::OCCUPIED_RECLAIMED) && + g.thread_rank() == src_lane) { + atomicAdd(&(buckets_size[bkt_idx]), 1); + } + + copy_vector(g, insert_value, bucket->vectors + key_pos * dim, + dim); + if (g.thread_rank() == src_lane) { + update_meta(bucket, key_pos, metas, key_idx); + (bucket->keys(key_pos)) + ->store(insert_key, cuda::std::memory_order_relaxed); + } + } +} + +template +__global__ void upsert_and_evict_kernel_with_io_core( + const Table* __restrict table, const size_t bucket_max_size, + const size_t buckets_num, const size_t dim, const K* __restrict keys, + const V* __restrict values, const M* __restrict metas, + K* __restrict evicted_keys, V* __restrict evicted_values, + M* __restrict evicted_metas, size_t N) { + auto g = cg::tiled_partition(cg::this_thread_block()); + int* buckets_size = table->buckets_size; + + for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; + t += blockDim.x * gridDim.x) { + int key_pos = -1; + const size_t key_idx = t / TILE_SIZE; + + const K insert_key = keys[key_idx]; + + if (IS_RESERVED_KEY(insert_key)) continue; + + const M insert_meta = + metas != nullptr ? 
metas[key_idx] : static_cast(MAX_META); + const V* insert_value = values + key_idx * dim; + + size_t bkt_idx = 0; + size_t start_idx = 0; + int src_lane = -1; + K evicted_key; + + Bucket* bucket = + get_key_position(table->buckets, insert_key, bkt_idx, start_idx, + buckets_num, bucket_max_size); + + OccupyResult occupy_result{OccupyResult::INITIAL}; + const int bucket_size = buckets_size[bkt_idx]; + do { + if (bucket_size < bucket_max_size) { + occupy_result = find_and_lock_when_vacant( + g, bucket, insert_key, insert_meta, evicted_key, start_idx, key_pos, + src_lane, bucket_max_size); + } else { + start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; + occupy_result = find_and_lock_when_full( + g, bucket, insert_key, insert_meta, evicted_key, start_idx, key_pos, + src_lane, bucket_max_size); + } + occupy_result = g.shfl(occupy_result, src_lane); + } while (occupy_result == OccupyResult::CONTINUE); + + if (occupy_result == OccupyResult::REFUSED) { + copy_vector(g, insert_value, evicted_values + key_idx * dim, + dim); + continue; + } + + if ((occupy_result == OccupyResult::OCCUPIED_EMPTY || + occupy_result == OccupyResult::OCCUPIED_RECLAIMED) && + g.thread_rank() == src_lane) { + atomicAdd(&(buckets_size[bkt_idx]), 1); + } + + if (occupy_result == OccupyResult::EVICT) { + if (g.thread_rank() == src_lane) { + evicted_keys[key_idx] = evicted_key; + } + if (metas != nullptr) { + evicted_metas[key_idx] = metas[key_idx]; + } + copy_vector(g, bucket->vectors + key_pos * dim, + evicted_values + key_idx * dim, dim); + } + + copy_vector(g, insert_value, bucket->vectors + key_pos * dim, + dim); + if (g.thread_rank() == src_lane) { + update_meta(bucket, key_pos, metas, key_idx); + (bucket->keys(key_pos)) + ->store(insert_key, cuda::std::memory_order_relaxed); + } + } +} + +template +struct SelectUpsertKernelWithIO { + static void execute_kernel(const float& load_factor, const int& block_size, + const size_t bucket_max_size, + const size_t buckets_num, const size_t dim, + cudaStream_t& stream, const size_t& n, + const Table* __restrict table, + const K* __restrict keys, + const V* __restrict values, + const M* __restrict metas) { + if (load_factor <= 0.5) { + const unsigned int tile_size = 4; + const size_t N = n * tile_size; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + upsert_kernel_with_io_core + <<>>( + table, bucket_max_size, buckets_num, dim, keys, values, metas, N); + + } else if (load_factor <= 0.875) { + const unsigned int tile_size = 8; + const size_t N = n * tile_size; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + upsert_kernel_with_io_core + <<>>( + table, bucket_max_size, buckets_num, dim, keys, values, metas, N); + } else { + const unsigned int tile_size = 32; + const size_t N = n * tile_size; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + upsert_kernel_with_io_core + <<>>( + table, bucket_max_size, buckets_num, dim, keys, values, metas, N); + } + return; + } +}; + +template +struct SelectUpsertAndEvictKernelWithIO { + static void execute_kernel( + const float& load_factor, const int& block_size, + const size_t bucket_max_size, const size_t buckets_num, const size_t dim, + cudaStream_t& stream, const size_t& n, + const Table* __restrict table, const K* __restrict keys, + const V* __restrict values, const M* __restrict metas, + K* __restrict evicted_keys, V* __restrict evicted_values, + M* __restrict evicted_metas) { + if (load_factor <= 0.5) { + const unsigned int tile_size = 4; + const size_t N = n * tile_size; + const size_t grid_size = 
SAFE_GET_GRID_SIZE(N, block_size); + upsert_and_evict_kernel_with_io_core + <<>>( + table, bucket_max_size, buckets_num, dim, keys, values, metas, + evicted_keys, evicted_values, evicted_metas, N); + + } else if (load_factor <= 0.875) { + const unsigned int tile_size = 8; + const size_t N = n * tile_size; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + upsert_and_evict_kernel_with_io_core + <<>>( + table, bucket_max_size, buckets_num, dim, keys, values, metas, + evicted_keys, evicted_values, evicted_metas, N); + + } else { + const unsigned int tile_size = 32; + const size_t N = n * tile_size; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + upsert_and_evict_kernel_with_io_core + <<>>( + table, bucket_max_size, buckets_num, dim, keys, values, metas, + evicted_keys, evicted_values, evicted_metas, N); + } + return; + } +}; + +/* Upsert with the end-user specified meta. + */ +template +__global__ void upsert_kernel(const Table* __restrict table, + const size_t bucket_max_size, + const size_t buckets_num, const size_t dim, + const K* __restrict keys, V** __restrict vectors, + const M* __restrict metas, + int* __restrict src_offset, size_t N) { + auto g = cg::tiled_partition(cg::this_thread_block()); + int* buckets_size = table->buckets_size; + + for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; + t += blockDim.x * gridDim.x) { + int key_pos = -1; + size_t key_idx = t / TILE_SIZE; + + const K insert_key = keys[key_idx]; + if (IS_RESERVED_KEY(insert_key)) continue; + + const M insert_meta = + metas != nullptr ? metas[key_idx] : static_cast(MAX_META); + + size_t bkt_idx = 0; + size_t start_idx = 0; + int src_lane = -1; + K evicted_key; + + Bucket* bucket = + get_key_position(table->buckets, insert_key, bkt_idx, start_idx, + buckets_num, bucket_max_size); + + if (src_offset != nullptr && g.thread_rank() == 0) { + *(src_offset + key_idx) = key_idx; + } + + OccupyResult occupy_result{OccupyResult::INITIAL}; + const int bucket_size = buckets_size[bkt_idx]; + do { + if (bucket_size < bucket_max_size) { + occupy_result = find_and_lock_when_vacant( + g, bucket, insert_key, insert_meta, evicted_key, start_idx, key_pos, + src_lane, bucket_max_size); + } else { + start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; + occupy_result = find_and_lock_when_vacant( + g, bucket, insert_key, insert_meta, evicted_key, start_idx, key_pos, + src_lane, bucket_max_size); + } + + occupy_result = g.shfl(occupy_result, src_lane); + } while (occupy_result == OccupyResult::CONTINUE); + + if (occupy_result == OccupyResult::REFUSED) continue; + + if ((occupy_result == OccupyResult::OCCUPIED_EMPTY || + occupy_result == OccupyResult::OCCUPIED_RECLAIMED) && + g.thread_rank() == src_lane) { + atomicAdd(&(buckets_size[bkt_idx]), 1); + } + + if (g.thread_rank() == src_lane) { + *(vectors + key_idx) = (bucket->vectors + key_pos * dim); + update_meta(bucket, key_pos, metas, key_idx); + (bucket->keys(key_pos)) + ->store(insert_key, cuda::std::memory_order_relaxed); + } + } +} + +/* Accum kernel with customized metas. 
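+
+   The kernel resolves each key to a value pointer in `vectors` and records in
+   `status` whether the key was already present; a slot is only (re)claimed
+   when that observation matches the caller-supplied `existed[i]` flag. The
+   add-or-assign of the values themselves is expected to be done afterwards by
+   write_with_accum_kernel, which likewise only writes when
+   `existed[i] == status[i]`.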
+ */ +template +__global__ void accum_kernel( + const Table* __restrict table, const K* __restrict keys, + V** __restrict vectors, const M* __restrict metas, + const bool* __restrict existed, Bucket* __restrict buckets, + int* __restrict buckets_size, const size_t bucket_max_size, + const size_t buckets_num, int* __restrict src_offset, + bool* __restrict status, size_t N) { + const size_t dim = table->dim; + size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; + auto g = cg::tiled_partition(cg::this_thread_block()); + int rank = g.thread_rank(); + + for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { + int key_pos = -1; + int local_size = 0; + bool local_found = false; + unsigned found_or_empty_vote = 0; + + size_t key_idx = t / TILE_SIZE; + K insert_key = *(keys + key_idx); + + if (IS_RESERVED_KEY(insert_key)) continue; + + K hashed_key = Murmur3HashDevice(insert_key); + size_t global_idx = hashed_key % (buckets_num * bucket_max_size); + size_t bkt_idx = global_idx / bucket_max_size; + size_t start_idx = global_idx % bucket_max_size; + + int src_lane = -1; + + Bucket* bucket = buckets + bkt_idx; + lock(g, table->locks[bkt_idx]); + if (rank == 0 && src_offset != nullptr) { + *(src_offset + key_idx) = key_idx; + } + + for (uint32_t tile_offset = 0; tile_offset < bucket_max_size; + tile_offset += TILE_SIZE) { + size_t key_offset = + (start_idx + tile_offset + rank) & (bucket_max_size - 1); + K current_key = + bucket->keys(key_offset)->load(cuda::std::memory_order_relaxed); + found_or_empty_vote = g.ballot(current_key == static_cast(EMPTY_KEY) || + insert_key == current_key); + if (found_or_empty_vote) { + src_lane = __ffs(found_or_empty_vote) - 1; + key_pos = (start_idx + tile_offset + src_lane) & (bucket_max_size - 1); + local_size = buckets_size[bkt_idx]; + if (rank == src_lane) { + if (current_key == insert_key) { + local_found = true; + *(status + key_idx) = local_found; + } + if (local_found == existed[key_idx]) { + (bucket->keys(key_pos)) + ->store(insert_key, cuda::std::memory_order_relaxed); + if (!local_found) { + buckets_size[bkt_idx]++; + local_size++; + } + *(vectors + key_idx) = (bucket->vectors + key_pos * dim); + update_meta(bucket, key_pos, metas, key_idx); + } + } + local_size = g.shfl(local_size, src_lane); + if (local_size >= bucket_max_size) { + refresh_bucket_meta(g, bucket, bucket_max_size); + } + break; + } + } + if (!found_or_empty_vote) { + if (rank == (bucket->min_pos % TILE_SIZE)) { + key_pos = bucket->min_pos; + (bucket->keys(key_pos)) + ->store(insert_key, cuda::std::memory_order_relaxed); + *(vectors + key_idx) = (bucket->vectors + key_pos * dim); + update_meta(bucket, key_pos, metas, key_idx); + } + refresh_bucket_meta(g, bucket, bucket_max_size); + } + unlock(g, table->locks[bkt_idx]); + } +} + +/* lookup with IO operation. This kernel is + * usually used for the pure HBM mode for better performance. 
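+ *
+ * "With IO" here means the kernel copies found values straight into the
+ * caller-provided `values` buffer (copy_vector) instead of returning per-key
+ * pointers for a separate read pass; `metas` and `found` are filled for hit
+ * keys, and missed keys leave their `values` slots untouched. The launcher
+ * below (SelectLookupKernelWithIO) picks the probe tile size from the load
+ * factor: a tile of 4 lanes up to 0.75, otherwise 16.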
+ */ +template +__global__ void lookup_kernel_with_io(const Table* __restrict table, + const size_t bucket_max_size, + const size_t buckets_num, + const size_t dim, + const K* __restrict keys, + V* __restrict values, M* __restrict metas, + bool* __restrict found, size_t N) { + int* buckets_size = table->buckets_size; + Bucket* buckets = table->buckets; + + auto g = cg::tiled_partition(cg::this_thread_block()); + int rank = g.thread_rank(); + + for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; + t += blockDim.x * gridDim.x) { + int key_idx = t / TILE_SIZE; + + const K find_key = keys[key_idx]; + if (IS_RESERVED_KEY(find_key)) continue; + + V* find_value = values + key_idx * dim; + + int key_pos = -1; + int src_lane = -1; + size_t bkt_idx = 0; + size_t start_idx = 0; + + Bucket* bucket = get_key_position( + buckets, find_key, bkt_idx, start_idx, buckets_num, bucket_max_size); + + const int bucket_size = buckets_size[bkt_idx]; + if (bucket_size >= bucket_max_size) { + start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; + } + + OccupyResult occupy_result{OccupyResult::INITIAL}; + occupy_result = find_without_lock( + g, bucket, find_key, start_idx, key_pos, src_lane, bucket_max_size); + + if (occupy_result == OccupyResult::DUPLICATE) { + copy_vector(g, bucket->vectors + key_pos * dim, find_value, + dim); + if (rank == src_lane) { + if (metas != nullptr) { + *(metas + key_idx) = + bucket->metas(key_pos)->load(cuda::std::memory_order_relaxed); + } + if (found != nullptr) { + *(found + key_idx) = true; + } + } + } + } +} + +template +struct SelectLookupKernelWithIO { + static void execute_kernel(const float& load_factor, const int& block_size, + const size_t bucket_max_size, + const size_t buckets_num, const size_t dim, + cudaStream_t& stream, const size_t& n, + const Table* __restrict table, + const K* __restrict keys, V* __restrict values, + M* __restrict metas, bool* __restrict found) { + if (load_factor <= 0.75) { + const unsigned int tile_size = 4; + const size_t N = n * tile_size; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + lookup_kernel_with_io + <<>>(table, bucket_max_size, + buckets_num, dim, keys, values, + metas, found, N); + } else { + const unsigned int tile_size = 16; + const size_t N = n * tile_size; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + lookup_kernel_with_io + <<>>(table, bucket_max_size, + buckets_num, dim, keys, values, + metas, found, N); + } + return; + } +}; + +/* lookup kernel. 
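+ * Unlike the *_with_io variant above, this kernel does not copy vectors: on a
+ * hit it writes the in-table value address into values[i] (nullptr on a miss),
+ * and records dst_offset[i] = i so the caller can gather the results afterwards.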
+ */ +template +__global__ void lookup_kernel(const Table* __restrict table, + const size_t bucket_max_size, + const size_t buckets_num, const size_t dim, + const K* __restrict keys, V** __restrict values, + M* __restrict metas, bool* __restrict found, + int* __restrict dst_offset, size_t N) { + int* buckets_size = table->buckets_size; + Bucket* buckets = table->buckets; + + auto g = cg::tiled_partition(cg::this_thread_block()); + int rank = g.thread_rank(); + + for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; + t += blockDim.x * gridDim.x) { + int key_idx = t / TILE_SIZE; + + const K find_key = keys[key_idx]; + if (IS_RESERVED_KEY(find_key)) continue; + + int key_pos = -1; + int src_lane = -1; + size_t bkt_idx = 0; + size_t start_idx = 0; + + Bucket* bucket = get_key_position( + buckets, find_key, bkt_idx, start_idx, buckets_num, bucket_max_size); + + const int bucket_size = buckets_size[bkt_idx]; + if (bucket_size >= bucket_max_size) { + start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; + } + + if (dst_offset != nullptr && rank == 0) { + *(dst_offset + key_idx) = key_idx; + } + + OccupyResult occupy_result{OccupyResult::INITIAL}; + occupy_result = find_without_lock( + g, bucket, find_key, start_idx, key_pos, src_lane, bucket_max_size); + + if (occupy_result == OccupyResult::DUPLICATE) { + if (rank == src_lane) { + *(values + key_idx) = (bucket->vectors + key_pos * dim); + if (metas != nullptr) { + *(metas + key_idx) = + bucket->metas(key_pos)->load(cuda::std::memory_order_relaxed); + } + if (found != nullptr) { + *(found + key_idx) = true; + } + } + } else { + if (rank == 0) { + *(values + key_idx) = nullptr; + } + } + } +} + +/* lookup with IO operation. This kernel is + * usually used for the pure HBM mode for better performance. + */ +template +__global__ void lookup_ptr_kernel(const Table* __restrict table, + const size_t bucket_max_size, + const size_t buckets_num, const size_t dim, + const K* __restrict keys, + V** __restrict values, M* __restrict metas, + bool* __restrict found, size_t N) { + int* buckets_size = table->buckets_size; + Bucket* buckets = table->buckets; + + auto g = cg::tiled_partition(cg::this_thread_block()); + int rank = g.thread_rank(); + + for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; + t += blockDim.x * gridDim.x) { + int key_idx = t / TILE_SIZE; + + const K find_key = keys[key_idx]; + if (IS_RESERVED_KEY(find_key)) continue; + + int key_pos = -1; + int src_lane = -1; + size_t bkt_idx = 0; + size_t start_idx = 0; + + Bucket* bucket = get_key_position( + buckets, find_key, bkt_idx, start_idx, buckets_num, bucket_max_size); + + const int bucket_size = buckets_size[bkt_idx]; + if (bucket_size >= bucket_max_size) { + start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; + } + + OccupyResult occupy_result{OccupyResult::INITIAL}; + occupy_result = find_without_lock( + g, bucket, find_key, start_idx, key_pos, src_lane, bucket_max_size); + + if (occupy_result == OccupyResult::DUPLICATE) { + if (rank == src_lane) { + values[key_idx] = bucket->vectors + key_pos * dim; + if (metas != nullptr) { + *(metas + key_idx) = + bucket->metas(key_pos)->load(cuda::std::memory_order_relaxed); + } + if (found != nullptr) { + *(found + key_idx) = true; + } + } + } + } +} + +template +struct SelectLookupPtrKernel { + static void execute_kernel(const float& load_factor, const int& block_size, + const size_t bucket_max_size, + const size_t buckets_num, const size_t dim, + cudaStream_t& stream, const size_t& n, + const Table* __restrict table, + const K* 
__restrict keys, V** __restrict values, + M* __restrict metas, bool* __restrict found) { + if (load_factor <= 0.75) { + const unsigned int tile_size = 4; + const size_t N = n * tile_size; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + lookup_ptr_kernel + <<>>(table, bucket_max_size, + buckets_num, dim, keys, values, + metas, found, N); + } else { + const unsigned int tile_size = 16; + const size_t N = n * tile_size; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + lookup_ptr_kernel + <<>>(table, bucket_max_size, + buckets_num, dim, keys, values, + metas, found, N); + } + return; + } +}; + +/* Clear all key-value in the table. */ +template +__global__ void clear_kernel(Table* __restrict table, size_t N) { + size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; + const size_t bucket_max_size = table->bucket_max_size; + + for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { + int key_idx = t % bucket_max_size; + int bkt_idx = t / bucket_max_size; + Bucket* bucket = &(table->buckets[bkt_idx]); + + (bucket->keys(key_idx)) + ->store(static_cast(EMPTY_KEY), cuda::std::memory_order_relaxed); + if (key_idx == 0) { + table->buckets_size[bkt_idx] = 0; + } + } +} + +/* Remove specified keys. */ +template +__global__ void remove_kernel(const Table* __restrict table, + const K* __restrict keys, + Bucket* __restrict buckets, + int* __restrict buckets_size, + const size_t bucket_max_size, + const size_t buckets_num, size_t N) { + auto g = cg::tiled_partition(cg::this_thread_block()); + int rank = g.thread_rank(); + + for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; + t += blockDim.x * gridDim.x) { + int key_idx = t / TILE_SIZE; + K find_key = keys[key_idx]; + if (IS_RESERVED_KEY(find_key)) continue; + + int key_pos = -1; + + size_t bkt_idx = 0; + size_t start_idx = 0; + uint32_t tile_offset = 0; + + Bucket* bucket = get_key_position( + buckets, find_key, bkt_idx, start_idx, buckets_num, bucket_max_size); + + unsigned found_vote = 0; +#pragma unroll + for (tile_offset = 0; tile_offset < bucket_max_size; + tile_offset += TILE_SIZE) { + key_pos = (start_idx + tile_offset + rank) & (bucket_max_size - 1); + + const K current_key = + (bucket->keys(key_pos))->load(cuda::std::memory_order_relaxed); + + found_vote = g.ballot(find_key == current_key); + if (found_vote) { + break; + } + + if (g.any(current_key == static_cast(EMPTY_KEY))) { + break; + } + } + + if (found_vote) { + const int src_lane = __ffs(found_vote) - 1; + + if (g.thread_rank() == src_lane) { + const int key_pos = + (start_idx + tile_offset + src_lane) & (bucket_max_size - 1); + (bucket->keys(key_pos)) + ->store(static_cast(RECLAIM_KEY), + cuda::std::memory_order_relaxed); + (bucket->metas(key_pos)) + ->store(static_cast(EMPTY_META), + cuda::std::memory_order_relaxed); + atomicSub(&buckets_size[bkt_idx], 1); + } + break; + } + } +} + +/* Remove specified keys which match the Predict. 
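+ * Here each grid-stride index t addresses a whole bucket: every slot is
+ * scanned, and each key for which pred(key, meta, pattern, threshold) holds is
+ * reclaimed (key set to RECLAIM_KEY, meta to EMPTY_META), the bucket size is
+ * decremented and the global count is incremented atomically.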
*/ +template +__global__ void remove_kernel(const Table* __restrict table, + const EraseIfPredictInternal pred, + const K pattern, const M threshold, + size_t* __restrict count, + Bucket* __restrict buckets, + int* __restrict buckets_size, + const size_t bucket_max_size, + const size_t buckets_num, size_t N) { + auto g = cg::tiled_partition(cg::this_thread_block()); + + for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; + t += blockDim.x * gridDim.x) { + uint32_t bkt_idx = t; + uint32_t key_pos = 0; + + Bucket* bucket = buckets + bkt_idx; + + K current_key = 0; + M current_meta = 0; + uint32_t key_offset = 0; + while (key_offset < bucket_max_size) { + current_key = + bucket->keys(key_offset)->load(cuda::std::memory_order_relaxed); + current_meta = + bucket->metas(key_offset)->load(cuda::std::memory_order_relaxed); + if (!IS_RESERVED_KEY(current_key)) { + if (pred(current_key, current_meta, pattern, threshold)) { + atomicAdd(count, 1); + key_pos = key_offset; + (bucket->keys(key_pos)) + ->store(static_cast(RECLAIM_KEY), + cuda::std::memory_order_relaxed); + (bucket->metas(key_pos)) + ->store(static_cast(EMPTY_META), + cuda::std::memory_order_relaxed); + atomicSub(&buckets_size[bkt_idx], 1); + } else { + key_offset++; + } + } else { + key_offset++; + } + } + } +} + +/* Dump with meta. */ +template +inline std::tuple dump_kernel_shared_memory_size( + const size_t available_shared_memory) { + const size_t block_size{std::min( + available_shared_memory / 2 / sizeof(KVM), UINT64_C(1024))}; + MERLIN_CHECK( + block_size > 0, + "[HierarchicalKV] block_size <= 0, the K-V-M size may be too large!"); + + return {block_size * sizeof(KVM), block_size}; +} + +template +__global__ void dump_kernel(const Table* __restrict table, K* d_key, + V* __restrict d_val, M* __restrict d_meta, + const size_t offset, const size_t search_length, + size_t* d_dump_counter) { + extern __shared__ unsigned char s[]; + KVM* const block_tuples{reinterpret_cast*>(s)}; + + const size_t bucket_max_size{table->bucket_max_size}; + const size_t dim{table->dim}; + + __shared__ size_t block_acc; + __shared__ size_t global_acc; + + const size_t tid{blockIdx.x * blockDim.x + threadIdx.x}; + + if (threadIdx.x == 0) { + block_acc = 0; + } + __syncthreads(); + + if (tid < search_length) { + Bucket* const bucket{ + &table->buckets[(tid + offset) / bucket_max_size]}; + + const int key_idx{static_cast((tid + offset) % bucket_max_size)}; + const K key{(bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed)}; + + if (!IS_RESERVED_KEY(key)) { + size_t local_index{atomicAdd(&block_acc, 1)}; + block_tuples[local_index] = { + key, &bucket->vectors[key_idx * dim], + bucket->metas(key_idx)->load(cuda::std::memory_order_relaxed)}; + } + } + __syncthreads(); + + if (threadIdx.x == 0) { + global_acc = atomicAdd(d_dump_counter, block_acc); + } + __syncthreads(); + + if (threadIdx.x < block_acc) { + const KVM& tuple{block_tuples[threadIdx.x]}; + + const size_t j{global_acc + threadIdx.x}; + d_key[j] = tuple.key; + for (int i{0}; i < dim; ++i) { + d_val[j * dim + i] = tuple.value[i]; + } + if (d_meta != nullptr) { + d_meta[j] = tuple.meta; + } + } +} + +/* Dump with meta. 
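+ * This overload additionally filters entries with PredFunctor(key, meta,
+ * pattern, threshold). Dynamic shared memory is partitioned into per-block
+ * staging arrays of blockDim.x keys, blockDim.x * dim values and blockDim.x
+ * metas; each block then reserves a contiguous range of the output arrays via
+ * an atomicAdd on d_dump_counter before flushing its staged tuples.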
*/ +template class PredFunctor> +__global__ void dump_kernel(const Table* __restrict table, + const K pattern, const M threshold, K* d_key, + V* __restrict d_val, M* __restrict d_meta, + const size_t offset, const size_t search_length, + size_t* d_dump_counter) { + extern __shared__ unsigned char s[]; + const size_t bucket_max_size = table->bucket_max_size; + const size_t dim = table->dim; + K* smem = (K*)s; + K* block_result_key = smem; + V* block_result_val = (V*)&(smem[blockDim.x]); + M* block_result_meta = (M*)&(block_result_val[blockDim.x * dim]); + __shared__ size_t block_acc; + __shared__ size_t global_acc; + PredFunctor fn; + + const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + + if (threadIdx.x == 0) { + block_acc = 0; + } + __syncthreads(); + + if (tid < search_length) { + int bkt_idx = (tid + offset) / bucket_max_size; + int key_idx = (tid + offset) % bucket_max_size; + Bucket* bucket = &(table->buckets[bkt_idx]); + + const K key = + (bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed); + M meta = bucket->metas(key_idx)->load(cuda::std::memory_order_relaxed); + + if (key != static_cast(EMPTY_KEY) && + fn(key, meta, pattern, threshold)) { + size_t local_index = atomicAdd(&block_acc, 1); + block_result_key[local_index] = key; + for (int i = 0; i < dim; i++) { + atomicExch(&(block_result_val[local_index * dim + i]), + bucket->vectors[key_idx * dim + i]); + } + if (d_meta != nullptr) { + block_result_meta[local_index] = meta; + } + } + } + __syncthreads(); + + if (threadIdx.x == 0) { + global_acc = atomicAdd(d_dump_counter, block_acc); + } + __syncthreads(); + + if (threadIdx.x < block_acc) { + d_key[global_acc + threadIdx.x] = block_result_key[threadIdx.x]; + for (int i = 0; i < dim; i++) { + d_val[(global_acc + threadIdx.x) * dim + i] = + block_result_val[threadIdx.x * dim + i]; + } + if (d_meta != nullptr) { + d_meta[global_acc + threadIdx.x] = block_result_meta[threadIdx.x]; + } + } +} + +/* If founds[i] = true, read data from corresponding address of + * table_value_addrs and write to param_values; if founds[i] = false, write data + * from param_values to corresponding address of table_value_addrs. usually + * called by find_or_insert kernel. + */ +template +void read_or_write_by_cpu(V** __restrict table_value_addrs, + V* __restrict param_values, + const int* __restrict offset, const bool* founds, + size_t dim, int N, int n_worker = 16) { + std::vector thds; + if (n_worker < 1) n_worker = 1; + + auto functor = [founds, dim](V** __restrict table_value_addrs, + V* __restrict param_values, + const int* __restrict offset, int handled_size, + int trunk_size) -> void { + for (int i = handled_size; i < handled_size + trunk_size; i++) { + if (table_value_addrs[i] != nullptr) { + if (founds[offset[i]]) { + memcpy(param_values + offset[i] * dim, table_value_addrs[i], + sizeof(V) * dim); + } else { + memcpy(table_value_addrs[i], param_values + offset[i] * dim, + sizeof(V) * dim); + } + } + } + }; + + int32_t trunk_size_floor = N / n_worker; + int32_t trunk_size_remain = N % n_worker; + int32_t n_worker_used = trunk_size_floor == 0 ? 
trunk_size_remain : n_worker; + + size_t handled_size = 0; + for (int i = 0; i < n_worker_used; i++) { + int32_t cur_trunk_size = trunk_size_floor; + if (trunk_size_remain != 0) { + cur_trunk_size += 1; + trunk_size_remain--; + } + thds.push_back(std::thread(functor, table_value_addrs, param_values, offset, + handled_size, cur_trunk_size)); + handled_size += cur_trunk_size; + } + + for (int i = 0; i < n_worker_used; i++) { + thds[i].join(); + } +} + +/* + * find or insert with IO operation. This kernel is + * usually used for the pure HBM mode for better performance. + */ +template +__global__ void find_or_insert_kernel_with_io( + const Table* __restrict table, const size_t bucket_max_size, + const size_t buckets_num, const size_t dim, const K* __restrict keys, + V* __restrict values, M* __restrict metas, const size_t N) { + auto g = cg::tiled_partition(cg::this_thread_block()); + int* buckets_size = table->buckets_size; + + for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; + t += blockDim.x * gridDim.x) { + int key_pos = -1; + const size_t key_idx = t / TILE_SIZE; + + const K find_or_insert_key = keys[key_idx]; + + if (IS_RESERVED_KEY(find_or_insert_key)) continue; + + const M find_or_insert_meta = + metas != nullptr ? metas[key_idx] : static_cast(MAX_META); + V* find_or_insert_value = values + key_idx * dim; + + size_t bkt_idx = 0; + size_t start_idx = 0; + int src_lane = -1; + K evicted_key; + + Bucket* bucket = + get_key_position(table->buckets, find_or_insert_key, bkt_idx, + start_idx, buckets_num, bucket_max_size); + + OccupyResult occupy_result{OccupyResult::INITIAL}; + const int bucket_size = buckets_size[bkt_idx]; + do { + if (bucket_size < bucket_max_size) { + occupy_result = find_and_lock_when_vacant( + g, bucket, find_or_insert_key, find_or_insert_meta, evicted_key, + start_idx, key_pos, src_lane, bucket_max_size); + } else { + start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; + occupy_result = find_and_lock_when_full( + g, bucket, find_or_insert_key, find_or_insert_meta, evicted_key, + start_idx, key_pos, src_lane, bucket_max_size); + } + + occupy_result = g.shfl(occupy_result, src_lane); + } while (occupy_result == OccupyResult::CONTINUE); + + if (occupy_result == OccupyResult::REFUSED) continue; + + if ((occupy_result == OccupyResult::OCCUPIED_EMPTY || + occupy_result == OccupyResult::OCCUPIED_RECLAIMED) && + g.thread_rank() == src_lane) { + atomicAdd(&(buckets_size[bkt_idx]), 1); + } + + if (occupy_result == OccupyResult::DUPLICATE) { + copy_vector(g, bucket->vectors + key_pos * dim, + find_or_insert_value, dim); + if (metas != nullptr && g.thread_rank() == src_lane) { + *(metas + key_idx) = + bucket->metas(key_pos)->load(cuda::std::memory_order_relaxed); + } + } else { + copy_vector(g, find_or_insert_value, + bucket->vectors + key_pos * dim, dim); + if (g.thread_rank() == src_lane) { + update_meta(bucket, key_pos, metas, key_idx); + } + } + + if (g.thread_rank() == src_lane) { + (bucket->keys(key_pos)) + ->store(find_or_insert_key, cuda::std::memory_order_relaxed); + } + } +} + +template +struct SelectFindOrInsertKernelWithIO { + static void execute_kernel(const float& load_factor, const int& block_size, + const size_t bucket_max_size, + const size_t buckets_num, const size_t dim, + cudaStream_t& stream, const size_t& n, + const Table* __restrict table, + const K* __restrict keys, V* __restrict values, + M* __restrict metas) { + if (load_factor <= 0.75) { + const unsigned int tile_size = 4; + const size_t N = n * tile_size; + const size_t grid_size = 
SAFE_GET_GRID_SIZE(N, block_size); + find_or_insert_kernel_with_io + <<>>( + table, bucket_max_size, buckets_num, dim, keys, values, metas, N); + } else { + const unsigned int tile_size = 32; + const size_t N = n * tile_size; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + find_or_insert_kernel_with_io + <<>>( + table, bucket_max_size, buckets_num, dim, keys, values, metas, N); + } + return; + } +}; + +/* find or insert with the end-user specified meta. + */ +template +__global__ void find_or_insert_kernel( + const Table* __restrict table, const size_t bucket_max_size, + const size_t buckets_num, const size_t dim, const K* __restrict keys, + V** __restrict vectors, M* __restrict metas, bool* __restrict found, + int* __restrict keys_index, const size_t N) { + auto g = cg::tiled_partition(cg::this_thread_block()); + int* buckets_size = table->buckets_size; + + for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; + t += blockDim.x * gridDim.x) { + int key_pos = -1; + size_t key_idx = t / TILE_SIZE; + + const K find_or_insert_key = keys[key_idx]; + + if (IS_RESERVED_KEY(find_or_insert_key)) continue; + + const M find_or_insert_meta = + metas != nullptr ? metas[key_idx] : static_cast(MAX_META); + + size_t bkt_idx = 0; + size_t start_idx = 0; + int src_lane = -1; + K evicted_key; + + Bucket* bucket = + get_key_position(table->buckets, find_or_insert_key, bkt_idx, + start_idx, buckets_num, bucket_max_size); + + if (g.thread_rank() == 0) { + *(keys_index + key_idx) = key_idx; + } + + OccupyResult occupy_result{OccupyResult::INITIAL}; + const int bucket_size = buckets_size[bkt_idx]; + do { + if (bucket_size < bucket_max_size) { + occupy_result = find_and_lock_when_vacant( + g, bucket, find_or_insert_key, find_or_insert_meta, evicted_key, + start_idx, key_pos, src_lane, bucket_max_size); + } else { + start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; + occupy_result = find_and_lock_when_full( + g, bucket, find_or_insert_key, find_or_insert_meta, evicted_key, + start_idx, key_pos, src_lane, bucket_max_size); + } + + occupy_result = g.shfl(occupy_result, src_lane); + } while (occupy_result == OccupyResult::CONTINUE); + + if (occupy_result == OccupyResult::REFUSED) continue; + + if ((occupy_result == OccupyResult::OCCUPIED_EMPTY || + occupy_result == OccupyResult::OCCUPIED_RECLAIMED) && + g.thread_rank() == src_lane) { + atomicAdd(&(buckets_size[bkt_idx]), 1); + } + + if (occupy_result == OccupyResult::DUPLICATE) { + if (g.thread_rank() == src_lane) { + *(vectors + key_idx) = (bucket->vectors + key_pos * dim); + + if (found != nullptr) { + *(found + key_idx) = true; + } + + if (metas != nullptr) { + *(metas + key_idx) = + bucket->metas(key_pos)->load(cuda::std::memory_order_relaxed); + } + } + } else { + if (g.thread_rank() == src_lane) { + *(vectors + key_idx) = (bucket->vectors + key_pos * dim); + update_meta(bucket, key_pos, metas, key_idx); + } + } + + if (g.thread_rank() == src_lane) { + (bucket->keys(key_pos)) + ->store(find_or_insert_key, cuda::std::memory_order_relaxed); + } + } +} + +/* find or insert with the end-user specified meta. 
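+ * Pointer variant: vectors[i] is set to the in-table value address in both
+ * cases, found[i] is true when the key already existed and false when a new
+ * slot was claimed (in which case the meta is initialized via update_meta).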
+ */ +template +__global__ void find_ptr_or_insert_kernel( + const Table* __restrict table, const size_t bucket_max_size, + const size_t buckets_num, const size_t dim, const K* __restrict keys, + V** __restrict vectors, M* __restrict metas, bool* __restrict found, + const size_t N) { + auto g = cg::tiled_partition(cg::this_thread_block()); + int* buckets_size = table->buckets_size; + + for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; + t += blockDim.x * gridDim.x) { + int key_pos = -1; + size_t key_idx = t / TILE_SIZE; + + const K find_or_insert_key = keys[key_idx]; + + if (IS_RESERVED_KEY(find_or_insert_key)) continue; + + const M find_or_insert_meta = + metas != nullptr ? metas[key_idx] : static_cast(MAX_META); + + size_t bkt_idx = 0; + size_t start_idx = 0; + int src_lane = -1; + K evicted_key; + + Bucket* bucket = + get_key_position(table->buckets, find_or_insert_key, bkt_idx, + start_idx, buckets_num, bucket_max_size); + + OccupyResult occupy_result{OccupyResult::INITIAL}; + const int bucket_size = buckets_size[bkt_idx]; + do { + if (bucket_size < bucket_max_size) { + occupy_result = find_and_lock_when_vacant( + g, bucket, find_or_insert_key, find_or_insert_meta, evicted_key, + start_idx, key_pos, src_lane, bucket_max_size); + } else { + start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; + occupy_result = find_and_lock_when_full( + g, bucket, find_or_insert_key, find_or_insert_meta, evicted_key, + start_idx, key_pos, src_lane, bucket_max_size); + } + + occupy_result = g.shfl(occupy_result, src_lane); + } while (occupy_result == OccupyResult::CONTINUE); + + if (occupy_result == OccupyResult::REFUSED) continue; + + if ((occupy_result == OccupyResult::OCCUPIED_EMPTY || + occupy_result == OccupyResult::OCCUPIED_RECLAIMED) && + g.thread_rank() == src_lane) { + atomicAdd(&(buckets_size[bkt_idx]), 1); + } + + if (occupy_result == OccupyResult::DUPLICATE) { + if (g.thread_rank() == src_lane) { + *(vectors + key_idx) = (bucket->vectors + key_pos * dim); + *(found + key_idx) = true; + if (metas != nullptr) { + *(metas + key_idx) = + bucket->metas(key_pos)->load(cuda::std::memory_order_relaxed); + } + } + } else { + if (g.thread_rank() == src_lane) { + *(vectors + key_idx) = (bucket->vectors + key_pos * dim); + *(found + key_idx) = false; + update_meta(bucket, key_pos, metas, key_idx); + } + } + + if (g.thread_rank() == src_lane) { + (bucket->keys(key_pos)) + ->store(find_or_insert_key, cuda::std::memory_order_relaxed); + } + } +} + +template +struct SelectFindOrInsertPtrKernel { + static void execute_kernel(const float& load_factor, const int& block_size, + const size_t bucket_max_size, + const size_t buckets_num, const size_t dim, + cudaStream_t& stream, const size_t& n, + const Table* __restrict table, + const K* __restrict keys, V** __restrict values, + M* __restrict metas, bool* __restrict found) { + if (load_factor <= 0.5) { + const unsigned int tile_size = 4; + const size_t N = n * tile_size; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + find_ptr_or_insert_kernel + <<>>(table, bucket_max_size, + buckets_num, dim, keys, values, + metas, found, N); + } else if (load_factor <= 0.875) { + const unsigned int tile_size = 8; + const size_t N = n * tile_size; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + find_ptr_or_insert_kernel + <<>>(table, bucket_max_size, + buckets_num, dim, keys, values, + metas, found, N); + } else { + const unsigned int tile_size = 32; + const size_t N = n * tile_size; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, 
block_size); + find_ptr_or_insert_kernel + <<>>(table, bucket_max_size, + buckets_num, dim, keys, values, + metas, found, N); + } + return; + } +}; + +/* Read the data from address of table_value_addrs to corresponding position + in param_value if mask[i] is true, otherwise write data to table_value_addrs + form param_value, + usually called by find_or_insert kernel. + + `table_value_addrs`: A pointer of pointer of V which should be on HBM, + but each value (a pointer of V) could point to a + memory on HBM or HMEM. + `param_value`: A continue memory pointer with Vector + which should be HBM. + `mask`: One for each `param_value`. If true, reading from table_value_addrs, + or false writing table_value_addrs from param_value. + `param_key_index`: N values from address of table_value_addrs are mapped to + param_values according to param_key_index. + `dim`: the dim of value. + `N`: The number of vectors needed to be read. +*/ +template +__global__ void read_or_write_kernel(V** __restrict table_value_addrs, + V* __restrict param_values, + const bool* mask, + const int* __restrict param_key_index, + const size_t dim, const size_t N) { + size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; + + for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { + int vec_index = int(t / dim); + int dim_index = t % dim; + int real_key_index = + param_key_index != nullptr ? param_key_index[vec_index] : vec_index; + + /// if found, read the value form table, otherwise write it + if (table_value_addrs[vec_index] != nullptr) { + /// find + if (mask[real_key_index]) { + param_values[real_key_index * dim + dim_index] = + table_value_addrs[vec_index][dim_index]; + } + /// insert + else { + table_value_addrs[vec_index][dim_index] = + param_values[real_key_index * dim + dim_index]; + } + } + } +} + +/* + * update with IO operation. This kernel is + * usually used for the pure HBM mode for better performance. 
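+ * Update only touches keys that are already present: when the probe returns
+ * OccupyResult::DUPLICATE the caller's vector is copied into the bucket and the
+ * meta refreshed, while keys that cannot be found (OccupyResult::REFUSED) are
+ * skipped and nothing is inserted.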
+ */ +template +__global__ void update_kernel_with_io( + const Table* __restrict table, const size_t bucket_max_size, + const size_t buckets_num, const size_t dim, const K* __restrict keys, + const V* __restrict values, const M* __restrict metas, const size_t N) { + auto g = cg::tiled_partition(cg::this_thread_block()); + int* buckets_size = table->buckets_size; + + for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; + t += blockDim.x * gridDim.x) { + int key_pos = -1; + size_t key_idx = t / TILE_SIZE; + + const K update_key = keys[key_idx]; + + if (IS_RESERVED_KEY(update_key)) continue; + + const V* update_value = values + key_idx * dim; + + size_t bkt_idx = 0; + size_t start_idx = 0; + int src_lane = -1; + + Bucket* bucket = + get_key_position(table->buckets, update_key, bkt_idx, start_idx, + buckets_num, bucket_max_size); + + OccupyResult occupy_result{OccupyResult::INITIAL}; + const int bucket_size = buckets_size[bkt_idx]; + + if (bucket_size >= bucket_max_size) { + start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; + } + occupy_result = find_and_lock_for_update( + g, bucket, update_key, start_idx, key_pos, src_lane, bucket_max_size); + + occupy_result = g.shfl(occupy_result, src_lane); + + if (occupy_result == OccupyResult::REFUSED) continue; + + if (occupy_result == OccupyResult::DUPLICATE) { + copy_vector(g, update_value, + bucket->vectors + key_pos * dim, dim); + if (src_lane == g.thread_rank()) { + update_meta(bucket, key_pos, metas, key_idx); + } + } + + if (g.thread_rank() == src_lane) { + (bucket->keys(key_pos)) + ->store(update_key, cuda::std::memory_order_relaxed); + } + } +} + +template +struct SelectUpdateKernelWithIO { + static void execute_kernel(const float& load_factor, const int& block_size, + const size_t bucket_max_size, + const size_t buckets_num, const size_t dim, + cudaStream_t& stream, const size_t& n, + const Table* __restrict table, + const K* __restrict keys, + const V* __restrict values, + const M* __restrict metas) { + if (load_factor <= 0.75) { + const unsigned int tile_size = 4; + const size_t N = n * tile_size; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + update_kernel_with_io + <<>>( + table, bucket_max_size, buckets_num, dim, keys, values, metas, N); + } else { + const unsigned int tile_size = 32; + const size_t N = n * tile_size; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + update_kernel_with_io + <<>>( + table, bucket_max_size, buckets_num, dim, keys, values, metas, N); + } + return; + } +}; + +template +__global__ void update_kernel(const Table* __restrict table, + const size_t bucket_max_size, + const size_t buckets_num, const size_t dim, + const K* __restrict keys, V** __restrict vectors, + const M* __restrict metas, + int* __restrict src_offset, size_t N) { + auto g = cg::tiled_partition(cg::this_thread_block()); + int* buckets_size = table->buckets_size; + + for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; + t += blockDim.x * gridDim.x) { + int key_pos = -1; + size_t key_idx = t / TILE_SIZE; + + const K update_key = keys[key_idx]; + + if (IS_RESERVED_KEY(update_key)) continue; + + size_t bkt_idx = 0; + size_t start_idx = 0; + int src_lane = -1; + + Bucket* bucket = + get_key_position(table->buckets, update_key, bkt_idx, start_idx, + buckets_num, bucket_max_size); + + OccupyResult occupy_result{OccupyResult::INITIAL}; + const int bucket_size = buckets_size[bkt_idx]; + *(src_offset + key_idx) = key_idx; + + if (bucket_size >= bucket_max_size) { + start_idx = (start_idx / TILE_SIZE) * 
TILE_SIZE; + } + occupy_result = find_and_lock_for_update( + g, bucket, update_key, start_idx, key_pos, src_lane, bucket_max_size); + + occupy_result = g.shfl(occupy_result, src_lane); + + if (occupy_result == OccupyResult::REFUSED) continue; + + if (g.thread_rank() == src_lane) { + if (occupy_result == OccupyResult::DUPLICATE) { + *(vectors + key_idx) = (bucket->vectors + key_pos * dim); + update_meta(bucket, key_pos, metas, key_idx); + } else { + *(vectors + key_idx) = nullptr; + } + } + + if (g.thread_rank() == src_lane) { + (bucket->keys(key_pos)) + ->store(update_key, cuda::std::memory_order_relaxed); + } + } +} + +} // namespace merlin +} // namespace nv diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/debug.hpp b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/debug.hpp new file mode 100644 index 000000000..2d9ebfe8b --- /dev/null +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/debug.hpp @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include "tensorflow_recommenders_addons/dynamic_embedding/core/lib/utils/cuda_utils.cuh" + +namespace nv { +namespace merlin { + +class CudaException : public std::runtime_error { + public: + CudaException(const std::string& what) : runtime_error(what) {} +}; + +inline void cuda_check_(cudaError_t val, const char* file, int line) { + if (val != cudaSuccess) { + std::ostringstream os; + os << file << ':' << line << ": CUDA error " << cudaGetErrorName(val) + << " (#" << val << "): " << cudaGetErrorString(val); + throw CudaException(os.str()); + } +} + +//#ifdef CUDA_CHECK +//#error Unexpected redfinition of CUDA_CHECK! Something is wrong. +//#endif + +#ifndef CUDA_CHECK +#define CUDA_CHECK(val) \ + do { \ + nv::merlin::cuda_check_((val), __FILE__, __LINE__); \ + } while (0) +#endif // CUDA_CHECK + +class MerlinException : public std::runtime_error { + public: + MerlinException(const std::string& what) : runtime_error(what) {} +}; + +template +inline void merlin_check_(bool cond, const Msg& msg, const char* file, + int line) { + if (!cond) { + std::ostringstream os; + os << file << ':' << line << ": HierarchicalKV error " << msg; + throw MerlinException(os.str()); + } +} + +#ifdef MERLIN_CHECK +#error Unexpected redfinition of MERLIN_CHECK! Something is wrong. 
+#endif + +#define MERLIN_CHECK(cond, msg) \ + do { \ + nv::merlin::merlin_check_((cond), (msg), __FILE__, __LINE__); \ + } while (0) + +} // namespace merlin +} // namespace nv diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/flexible_buffer.cuh b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/flexible_buffer.cuh new file mode 100644 index 000000000..d01dda6d1 --- /dev/null +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/flexible_buffer.cuh @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include "utils.cuh" + +using std::cerr; +using std::endl; + +namespace nv { +namespace merlin { + +template +class FlexPinnedBuffer { + public: + FlexPinnedBuffer(const size_t size = 1) : ptr_(nullptr) { + if (!ptr_) { + size_ = size; + CUDA_CHECK(cudaMallocHost(&ptr_, sizeof(T) * size_)); + } + } + ~FlexPinnedBuffer() { + try { + if (!ptr_) CUDA_CHECK(cudaFreeHost(ptr_)); + } catch (const nv::merlin::CudaException& e) { + cerr << "[HierarchicalKV] Failed to free FlexPinnedBuffer!" << endl; + } + } + + __inline__ T* alloc_or_reuse(const size_t size = 0) { + if (size > size_) { + CUDA_CHECK(cudaFreeHost(ptr_)); + size_ = size; + CUDA_CHECK(cudaMallocHost(&ptr_, sizeof(T) * size_)); + } + return ptr_; + } + + private: + T* ptr_; + size_t size_; +}; + +} // namespace merlin +} // namespace nv \ No newline at end of file diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/group_lock.hpp b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/group_lock.hpp new file mode 100644 index 000000000..2cfdc6ed3 --- /dev/null +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/group_lock.hpp @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http:///www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Implementing a group mutex and relative lock guard for better E2E performance: + * - Allow multiple writers (like `insert_or_assign` `assign` `insert_and_evict` etc.) + * The CUDA kernels guarantee the data consistency in this situation. + * - Allow multiple readers (like `find` 'size` etc.) + * - Not allow readers and writers to run concurrently + * - The `write_read_lock` is used for special APIs (like `reserve` `erase` `clear` etc.) 
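+ *
+ * Illustrative usage of the lock guards defined below (`mtx` is a placeholder
+ * name for a table-owned mutex):
+ *
+ *   group_shared_mutex mtx;
+ *   { reader_shared_lock lock(mtx); }   // concurrent readers: find, size, ...
+ *   { writer_shared_lock lock(mtx); }   // concurrent writers: insert_or_assign, ...
+ *   { write_read_lock lock(mtx); }      // exclusive section: reserve, erase, clear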
+ */ +#include +#include +#include +#include +#include + +namespace nv { +namespace merlin { + +class group_shared_mutex { + public: + group_shared_mutex(const group_shared_mutex&) = delete; + group_shared_mutex& operator=(const group_shared_mutex&) = delete; + + group_shared_mutex() noexcept + : writer_count_(0), reader_count_(0), unique_flag_(false) {} + + void lock_read() { + for (;;) { + while (writer_count_.load(std::memory_order_acquire)) { + } + reader_count_.fetch_add(1, std::memory_order_acq_rel); + if (writer_count_.load(std::memory_order_acquire) == 0) { + break; + } + reader_count_.fetch_sub(1, std::memory_order_acq_rel); + } + } + + void unlock_read() { reader_count_.fetch_sub(1, std::memory_order_release); } + + void lock_write() { + for (;;) { + while (reader_count_.load(std::memory_order_acquire)) { + } + writer_count_.fetch_add(1, std::memory_order_acq_rel); + if (reader_count_.load(std::memory_order_acquire) == 0) { + break; + } + writer_count_.fetch_sub(1, std::memory_order_acq_rel); + } + } + + void unlock_write() { writer_count_.fetch_sub(1, std::memory_order_release); } + + void lock_write_read() { + /* Lock unique flag */ + bool expected = false; + while (!unique_flag_.compare_exchange_weak(expected, true, + std::memory_order_acq_rel)) { + expected = false; + } + + /* Ban writer */ + for (;;) { + while (writer_count_.load(std::memory_order_acquire)) { + } + reader_count_.fetch_add(1, std::memory_order_acq_rel); + if (writer_count_.load(std::memory_order_acquire) == 0) { + break; + } + reader_count_.fetch_sub(1, std::memory_order_acq_rel); + } + + /* Ban reader */ + for (;;) { + while (reader_count_.load(std::memory_order_acquire) > 1) { + } + writer_count_.fetch_add(1, std::memory_order_acq_rel); + if (reader_count_.load(std::memory_order_acquire) == 1) { + break; + } + writer_count_.fetch_sub(1, std::memory_order_acq_rel); + } + } + + void unlock_write_read() noexcept { + reader_count_.fetch_sub(1, std::memory_order_release); + writer_count_.fetch_sub(1, std::memory_order_release); + unique_flag_.store(false, std::memory_order_release); + } + + int writer_count() noexcept { + return writer_count_.load(std::memory_order_relaxed); + } + + int reader_count() noexcept { + return reader_count_.load(std::memory_order_relaxed); + } + + private: + std::atomic writer_count_; + std::atomic reader_count_; + std::atomic unique_flag_; +}; + +class reader_shared_lock { + public: + reader_shared_lock(const reader_shared_lock&) = delete; + reader_shared_lock(reader_shared_lock&&) = delete; + + reader_shared_lock& operator=(const reader_shared_lock&) = delete; + reader_shared_lock& operator=(reader_shared_lock&&) = delete; + + explicit reader_shared_lock(group_shared_mutex& mutex) : mutex_(&mutex) { + mutex_->lock_read(); + owns_ = true; + } + + explicit reader_shared_lock(group_shared_mutex& mutex, std::defer_lock_t) + : mutex_(&mutex), owns_(false) {} + + ~reader_shared_lock() { + if (owns_) { + mutex_->unlock_read(); + } + } + + void lock() noexcept { + if (!owns_) { + mutex_->lock_read(); + owns_ = true; + } + } + + bool owns_lock() const noexcept { return owns_; } + + private: + group_shared_mutex* const mutex_; + bool owns_; +}; + +class writer_shared_lock { + public: + writer_shared_lock(const writer_shared_lock&) = delete; + writer_shared_lock(writer_shared_lock&&) = delete; + + writer_shared_lock& operator=(const writer_shared_lock&) = delete; + writer_shared_lock& operator=(writer_shared_lock&&) = delete; + + explicit writer_shared_lock(group_shared_mutex& mutex) : 
mutex_(&mutex) { + mutex_->lock_write(); + owns_ = true; + } + + explicit writer_shared_lock(group_shared_mutex& mutex, std::defer_lock_t) + : mutex_(&mutex), owns_(false) {} + + ~writer_shared_lock() { + if (owns_) { + mutex_->unlock_write(); + } + } + + void lock() noexcept { + if (!owns_) { + mutex_->lock_write(); + owns_ = true; + } + } + + bool owns_lock() const noexcept { return owns_; } + + private: + group_shared_mutex* const mutex_; + bool owns_; +}; + +class write_read_lock { + public: + write_read_lock(const write_read_lock&) = delete; + write_read_lock(write_read_lock&&) = delete; + + write_read_lock& operator=(const write_read_lock&) = delete; + write_read_lock& operator=(write_read_lock&&) = delete; + + explicit write_read_lock(group_shared_mutex& mutex) : mutex_(&mutex) { + mutex_->lock_write_read(); + owns_ = true; + } + + explicit write_read_lock(group_shared_mutex& mutex, std::defer_lock_t) noexcept + : mutex_(&mutex), owns_(false) {} + + ~write_read_lock() { + if (owns_) { + mutex_->unlock_write_read(); + } + } + + void lock() { + assert(!owns_ && "[write_read_lock] trying to lock twice!"); + mutex_->lock_write_read(); + owns_ = true; + } + + bool owns_lock() const noexcept { return owns_; } + + private: + group_shared_mutex* const mutex_; + bool owns_; +}; + +} // namespace merlin +} // namespace nv \ No newline at end of file diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/initializers.cuh b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/initializers.cuh new file mode 100644 index 000000000..6df875688 --- /dev/null +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/initializers.cuh @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include "curand_philox4x32_x.h" +#include "types.cuh" +#include "utils.cuh" + +namespace nv { +namespace merlin { +namespace initializers { + +inline void cuda_rand_check_(curandStatus_t val, const char* file, int line) { + if (val != CURAND_STATUS_SUCCESS) { + throw CudaException(std::string(file) + ":" + std::to_string(line) + + ": CURAND error " + std::to_string(val)); + } +} + +#define CURAND_CHECK(val) \ + { nv::merlin::initializers::cuda_rand_check_((val), __FILE__, __LINE__); } + +template +void zeros(T* d_data, const size_t len, cudaStream_t stream) { + CUDA_CHECK(cudaMemsetAsync(d_data, 0, len, stream)); +} + +template +void random_normal(T* d_data, const size_t len, cudaStream_t stream, + const T mean = 0.0, const T stddev = 0.05, + const unsigned long long seed = 2022ULL) { + curandGenerator_t generator; + CURAND_CHECK(curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(generator, seed)); + CURAND_CHECK(curandGenerateNormal(generator, d_data, len, mean, stddev)); +} + +template +__global__ void adjust_max_min(T* d_data, const T minval, const T maxval, + const size_t N) { + int tid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tid < N) { + d_data[tid] = + d_data[tid] * (maxval - minval) + (0.5 * (maxval + minval) - 0.5); + } +} + +template +void random_uniform(T* d_data, const size_t len, cudaStream_t stream, + const T minval = 0.0, const T maxval = 1.0, + const unsigned long long seed = 2022ULL) { + curandGenerator_t generator; + + CURAND_CHECK(curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(generator, seed)); + + int N = len; + int block_size = 256; + int grid_size = (N + block_size - 1) / block_size; + CURAND_CHECK(curandGenerateUniform(generator, d_data, N)); + adjust_max_min + <<>>(d_data, minval, maxval, N); +} + +template +__global__ void init_states(curandStatePhilox4_32_10_t* states, + const unsigned long long seed, const size_t N) { + int tid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tid < N) { + curand_init(seed, tid, 0, &states[tid]); + } +} + +template +__global__ void make_truncated_normal(T* d_data, + curandStatePhilox4_32_10_t* states, + const size_t N) { + int tid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tid < N) { + constexpr T truncated_val = T(2.0); + while (fabsf(d_data[tid]) > truncated_val) { + d_data[tid] = curand_normal(&states[tid]); + } + } +} + +template +void truncated_normal(T* d_data, const size_t len, cudaStream_t stream, + const T minval = 0.0, const T maxval = 1.0, + const unsigned long long seed = 2022ULL) { + curandGenerator_t generator; + + CURAND_CHECK(curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(generator, seed)); + + int N = len; + int block_size = 256; + int grid_size = (N + block_size - 1) / block_size; + curandStatePhilox4_32_10_t* d_states; + CUDA_CHECK(cudaMallocAsync(&d_states, N, stream)); + + init_states<<>>(d_states, seed, N); + + make_truncated_normal + <<>>(d_data, d_states, N); + + adjust_max_min + <<>>(d_data, minval, maxval, N); + + CUDA_CHECK(cudaFreeAsync(d_states, stream)); +} + +template +class Initializer { + public: + virtual ~Initializer() {} + virtual void initialize(T* data, size_t len, cudaStream_t stream) {} +}; + +template +class Zeros final : public Initializer { + public: + void initialize(T* data, const size_t len, cudaStream_t stream) override { + 
zeros(data, len, stream); + } +}; + +} // namespace initializers +} // namespace merlin +} // namespace nv \ No newline at end of file diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/memory_pool.cuh b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/memory_pool.cuh new file mode 100644 index 000000000..271676a6e --- /dev/null +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/memory_pool.cuh @@ -0,0 +1,619 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include "debug.hpp" + +namespace nv { +namespace merlin { + +/** + * Allocators are used by the memory pool (and maybe other classes) to create + * RAII complient containers for buffers allocated in different memory areas. + */ +template +struct AllocatorBase { + using type = T; + using sync_unique_ptr = std::unique_ptr; + using async_unique_ptr = std::unique_ptr>; + using shared_ptr = std::shared_ptr; + + inline static sync_unique_ptr make_unique(size_t n) { + return sync_unique_ptr(Allocator::alloc(n)); + } + + inline static async_unique_ptr make_unique(size_t n, cudaStream_t stream) { + return {Allocator::alloc(n, stream), + [stream](type* p) { Allocator::free(p); }}; + } + + inline static shared_ptr make_shared(size_t n, cudaStream_t stream = 0) { + return {Allocator::alloc(n, stream), + [stream](type* p) { Allocator::free(p, stream); }}; + } + + inline void operator()(type* ptr) { Allocator::free(ptr); } +}; + +/** + * Trivial fallback implementation using the standard C++ allocator. This mostly + * exists to ensure interface correctness, and as an illustration of what a + * proper allocator implementation should look like. + */ +template +struct StandardAllocator final : AllocatorBase> { + using type = typename AllocatorBase>::type; + + static constexpr const char* name{"StandardAllocator"}; + + inline static type* alloc(size_t n, cudaStream_t stream = 0) { + return new type[n]; + } + + inline static void free(type* ptr, cudaStream_t stream = 0) { delete[] ptr; } +}; + +/** + * Claim/release buffers in pinned host memory. + */ +template +struct HostAllocator final : AllocatorBase> { + using type = typename AllocatorBase>::type; + + static constexpr const char* name{"HostAllocator"}; + + inline static type* alloc(size_t n, cudaStream_t stream = 0) { + void* ptr; + CUDA_CHECK(cudaMallocHost(&ptr, sizeof(T) * n)); + return reinterpret_cast(ptr); + } + + inline static void free(type* ptr, cudaStream_t stream = 0) { + CUDA_CHECK(cudaFreeHost(ptr)); + } +}; + +/** + * Claim/release buffers in the active CUDA device. Will not test if the correct + * device was used, and throw if CUDA runtime API response is negative. 
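+ * When a stream is passed, allocation and release go through cudaMallocAsync /
+ * cudaFreeAsync; without a stream the synchronous cudaMalloc / cudaFree are used.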
+ */ +template +struct DeviceAllocator final : AllocatorBase> { + using type = typename AllocatorBase>::type; + + static constexpr const char* name{"DeviceAllocator"}; + + inline static type* alloc(size_t n, cudaStream_t stream = 0) { + void* ptr; + cudaError_t res; + if (stream) { + res = cudaMallocAsync(&ptr, sizeof(T) * n, stream); + } else { + res = cudaMalloc(&ptr, sizeof(T) * n); + } + CUDA_CHECK(res); + return reinterpret_cast(ptr); + } + + inline static void free(type* ptr, cudaStream_t stream = 0) { + cudaError_t res; + if (stream) { + res = cudaFreeAsync(ptr, stream); + } else { + res = cudaFree(ptr); + } + CUDA_CHECK(res); + } +}; + +/** + * Helper structure to configure a memory pool. + */ +struct MemoryPoolOptions { + size_t max_stock{4}; ///< Amount of buffers to keep in reserve. + size_t max_pending{16}; ///< Maximum amount of awaitable buffers. If this + ///< limit is exceeded threads will start to block. +}; + +/** + * Forward declares required to make templated ostream overload work. + */ +template +class MemoryPool; + +template +std::ostream& operator<<(std::ostream&, const MemoryPool&); + +/** + * CUDA deferred execution aware memory pool implementation. As for every memory + * pool, the general idea is to have resuable buffers. All buffers have the same + * size. + * + * General behavior: + * + * This memory pool implementation attempts to avoid blocking before the fact, + * but also avoids relying on a background worker. + * + * Buffer borrow and return semantics tightly align with C++ RAII principles. + * That is, if a workspace is requested, any borrowed buffers will be returned + * automatically when leaving the scope. + * + * You can either borrow a single buffer, or a workspace (that is multiple + * buffers). We support dynamic and static workspaces. Static workspaces have + * the benefit that they will never require heap memory (no hidden allocations). + * + * + * Buffer borrowing: + * + * If buffers are requested, we take them from the stock, if available. If the + * stock is depleted, we check if any pending buffer has been used up by the GPU + * and adds them to the stock. If was also not successful, we allocate a new + * buffer. Buffers or workspaces (groups of buffers). + * + * When borrowing a buffer a streaming context can be specified. This context is + * relevant for allocation and during returns. It is assumed that the stream you + * provide as context will be the stream where you queue the workload. Not doing + * so may lead to undefined behavior. + * + * Buffer return: + * + * If no context is provided, we cannot make any assumptions regarding the usage + * one the device. So we sychronize the device first and then return the buffer + * to the stock. If a streaming context was provided, we queue an event and add + * the buffer to the `pending` pool. That means, the buffer has been + * reqlinquished by the CPU, but may still be used by the GPU. If no pending + * slot is available, we probe the currently pending buffers events for + * completion. Completed pending buffers are returned to the reserve. If so, we + * queue the buffer in the freed slot. If that was unsucessful (i.e., all + * currently pending buffers are still in use by the GPU), we have no choice but + * the free the buffer using the current stream. + * + * In either case, `max_reserve` represents the maxmum size of the stock. If + * returning a buffer would lead to the stock exeeding this quantity, the buffer + * is queued for destruction. 
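+ * (In this implementation the stock limit is configured through
+ * MemoryPoolOptions::max_stock.)
+ *
+ * Illustrative usage sketch; `buffer_bytes` and `stream` are placeholders
+ * chosen by the caller:
+ *
+ *   MemoryPoolOptions opt;
+ *   MemoryPool<DeviceAllocator<char>> pool(opt);
+ *   {
+ *     auto ws = pool.get_workspace<2>(buffer_bytes, stream);
+ *     float* tmp0 = ws.get<float*>(0);
+ *     float* tmp1 = ws.get<float*>(1);
+ *     // ... enqueue kernels on `stream` that use tmp0 / tmp1 ...
+ *   }  // buffers are returned to the pool when the workspace leaves scope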
+ */ +template +class MemoryPool final { + public: + using pool_type = MemoryPool; + using alloc_type = typename Allocator::type; + template + class Workspace { + public: + inline Workspace() : pool_{nullptr}, buffer_size_{0}, stream_{0} {} + + inline Workspace(pool_type* pool, cudaStream_t stream) + : pool_{pool}, buffer_size_{0}, stream_{stream} {} + + Workspace(const Workspace&) = delete; + + Workspace& operator=(const Workspace&) = delete; + + inline Workspace(Workspace&& other) + : pool_{other.pool_}, + buffer_size_{other.buffer_size_}, + stream_{other.stream_}, + buffers_{std::move(other.buffers_)} {} + + inline Workspace& operator=(Workspace&& other) { + if (pool_) { + pool_->put_raw(buffers_.begin(), buffers_.end(), buffer_size_, stream_); + } + pool_ = other.pool_; + buffer_size_ = other.buffer_size_; + stream_ = other.stream_; + buffers_ = std::move(other.buffers_); + other.pool_ = nullptr; + return *this; + } + + inline ~Workspace() { + if (pool_) { + pool_->put_raw(buffers_.begin(), buffers_.end(), buffer_size_, stream_); + } + } + + template + constexpr void at(const size_t n, T* ptr) const { + *ptr = at(n); + } + + template + constexpr T at(const size_t n) const { + return reinterpret_cast(buffers_.at(n)); + } + + template + constexpr void get(const size_t n, T* ptr) const { + *ptr = get(n); + } + + template + constexpr T get(const size_t n) const { + return reinterpret_cast(buffers_[n]); + } + + constexpr alloc_type* operator[](const size_t n) const { + return buffers_[n]; + } + + protected: + pool_type* pool_; + size_t buffer_size_; + cudaStream_t stream_; + Container buffers_; + }; + + template + class StaticWorkspace final : public Workspace> { + public: + using base_type = Workspace>; + + friend class MemoryPool; + + inline StaticWorkspace() : base_type() {} + + StaticWorkspace(const StaticWorkspace&) = delete; + + StaticWorkspace& operator=(const StaticWorkspace&) = delete; + + inline StaticWorkspace(StaticWorkspace&& other) + : base_type(std::move(other)) {} + + inline StaticWorkspace& operator=(StaticWorkspace&& other) { + base_type::operator=(std::move(other)); + return *this; + } + + private: + inline StaticWorkspace(pool_type* pool, size_t requested_buffer_size, + cudaStream_t stream) + : base_type(pool, stream) { + auto& buffers{this->buffers_}; + this->buffer_size_ = pool->get_raw(buffers.begin(), buffers.end(), + requested_buffer_size, stream); + } + }; + + class DynamicWorkspace final : public Workspace> { + public: + using base_type = Workspace>; + + friend class MemoryPool; + + inline DynamicWorkspace() : base_type() {} + + DynamicWorkspace(const DynamicWorkspace&) = delete; + + DynamicWorkspace& operator=(const DynamicWorkspace&) = delete; + + inline DynamicWorkspace(DynamicWorkspace&& other) + : base_type(std::move(other)) {} + + inline DynamicWorkspace& operator=(DynamicWorkspace&& other) { + base_type::operator=(std::move(other)); + return *this; + } + + private: + inline DynamicWorkspace(pool_type* pool, size_t n, + size_t requested_buffer_size, cudaStream_t stream) + : base_type(pool, stream) { + auto& buffers{this->buffers_}; + buffers.resize(n); + this->buffer_size_ = pool->get_raw(buffers.begin(), buffers.end(), + requested_buffer_size, stream); + } + }; + + MemoryPool(const MemoryPoolOptions& options) : options_{options} { + // Create initial buffer stock. + stock_.reserve(options_.max_stock); + + // Create enough events, so we have one per potentially pending buffer. 
+ ready_events_.resize(options_.max_pending); + for (auto& ready_event : ready_events_) { + CUDA_CHECK(cudaEventCreate(&ready_event)); + } + + // Preallocate pending. + pending_.reserve(options_.max_pending); + } + + ~MemoryPool() { + // Make sure all queued tasks are complete. + await_pending(); + + // Free event and buffer memory. + for (auto& ready_event : ready_events_) { + CUDA_CHECK(cudaEventDestroy(ready_event)); + } + + // Any remaining buffers need to be properly unallocated. + deplete_stock(); + } + + inline size_t buffer_size() const { return buffer_size_; } + + inline size_t max_batch_size(size_t max_item_size) const { + return buffer_size_ / max_item_size; + } + + template + inline size_t max_batch_size() const { + return max_batch_size(sizeof(T)); + } + + size_t current_stock() const { + std::lock_guard lock(mutex_); + return stock_.size(); + } + + size_t num_pending() const { + std::lock_guard lock(mutex_); + return pending_.size(); + } + + void await_pending(cudaStream_t stream = 0) { + std::lock_guard lock(mutex_); + while (!pending_.empty()) { + collect_pending_unsafe(stream); + if (pending_.empty()) { + break; + } + std::this_thread::yield(); + } + } + + void deplete_stock() { + std::lock_guard lock(mutex_); + for (auto& ptr : stock_) { + Allocator::free(ptr); + } + stock_.clear(); + } + + inline std::unique_ptr> + get_unique(size_t requested_buffer_size, cudaStream_t stream = 0) { + alloc_type* ptr; + const size_t allocation_size = + get_raw(&ptr, (&ptr) + 1, requested_buffer_size, stream); + return {ptr, [this, allocation_size, stream](alloc_type* p) { + put_raw(&p, (&p) + 1, allocation_size, stream); + }}; + } + + inline std::shared_ptr get_shared(size_t requested_buffer_size, + cudaStream_t stream = 0) { + alloc_type* ptr; + const size_t allocation_size = + get_raw(&ptr, (&ptr) + 1, requested_buffer_size, stream); + return {ptr, [this, allocation_size, stream](alloc_type* p) { + put_raw(&p, (&p) + 1, allocation_size, stream); + }}; + } + + template + inline StaticWorkspace get_workspace(size_t requested_buffer_size, + cudaStream_t stream = 0) { + return {this, requested_buffer_size, stream}; + } + + inline DynamicWorkspace get_workspace(size_t n, size_t requested_buffer_size, + cudaStream_t stream = 0) { + return {this, n, requested_buffer_size, stream}; + } + + friend std::ostream& operator<<(std::ostream&, const MemoryPool&); + + private: + inline void collect_pending_unsafe(cudaStream_t stream) { + auto it{std::remove_if( + pending_.begin(), pending_.end(), [this, stream](const auto& pending) { + const cudaError_t state{cudaEventQuery(std::get<2>(pending))}; + switch (state) { + case cudaSuccess: + // Stock buffers and destroy those that are no + // longer needed, but only if the allocation_size + // is still the same as the current buffer_size. + if (stock_.size() < options_.max_stock && + std::get<1>(pending) == buffer_size_) { + stock_.emplace_back(std::get<0>(pending)); + } else { + Allocator::free(std::get<0>(pending), stream); + } + ready_events_.emplace_back(std::get<2>(pending)); + return true; + case cudaErrorNotReady: + return false; + default: + CUDA_CHECK(state); + return false; + } + })}; + pending_.erase(it, pending_.end()); + } + + inline void clear_stock_unsafe(cudaStream_t stream) { + for (auto& ptr : stock_) { + Allocator::free(ptr, stream); + } + stock_.clear(); + } + + template + inline size_t get_raw(Iterator first, Iterator const last, + size_t requested_buffer_size, cudaStream_t stream) { + // Get pre-allocated buffers if stock available. 
+ size_t allocation_size; + { + std::lock_guard lock(mutex_); + + // If requested_buffer_size is within current buffer_size margins can + // reuse current buffers. + if (requested_buffer_size <= buffer_size_) { + while (first != last) { + // If no buffers available, try to make some available. + if (stock_.empty()) { + collect_pending_unsafe(stream); + if (stock_.empty()) { + // No buffers available. + break; + } + } + + // Just take the next available buffer. + *first++ = stock_.back(); + stock_.pop_back(); + } + } else { + // Drop the stock because we need more memory and those buffers have + // become useless to that end. + clear_stock_unsafe(stream); + buffer_size_ = requested_buffer_size; + } + + allocation_size = buffer_size_; + } + + // Forge new buffers until request can be filled. + for (; first != last; ++first) { + *first = Allocator::alloc(allocation_size, stream); + } + + return allocation_size; + } + + template + inline void put_raw(Iterator first, Iterator const last, + size_t allocation_size, cudaStream_t stream) { + std::lock_guard lock(mutex_); + + // If allocation_size of the workspace differs from the current buffer_size + // (i.e., somebody else requested a larger buffer since the original request + // occured), the provided buffers are incompatible and have to be discarded. + if (allocation_size != buffer_size_) { + while (first != last) { + Allocator::free(*first++); + } + return; + } + + // If the workspace that borrowed a stream was moved out of the RAII scope + // where it was created, it could happen that the stream was destroyed when + // we return the buffer ownershup. This `cudaStreamQuery` will prevent that. + if (stream && cudaStreamQuery(stream) != cudaErrorInvalidResourceHandle) { + for (; first != last; ++first) { + // Avoid adding already deallocated buffers. + if (*first == nullptr) { + continue; + } + + // Spin lock if too many pending buffers (i.e., let CPU wait for GPU). + while (ready_events_.empty()) { + collect_pending_unsafe(stream); + if (!ready_events_.empty()) { + break; + } + std::this_thread::yield(); + } + + // Queue buffer. + cudaEvent_t ready_event{ready_events_.back()}; + ready_events_.pop_back(); + CUDA_CHECK(cudaEventRecord(ready_event, stream)); + pending_.emplace_back(*first, allocation_size, ready_event); + } + } else { + // Without stream context, we must force a hard sync with the GPU. + CUDA_CHECK(cudaDeviceSynchronize()); + + for (; first != last; ++first) { + // Avoid adding already deallocated buffers. + if (*first == nullptr) { + continue; + } + + // Stock buffers and destroy those that are no longer needed. + if (stock_.size() < options_.max_stock) { + stock_.emplace_back(*first); + } else { + Allocator::free(*first); + } + } + } + } + + const MemoryPoolOptions options_; + + mutable std::mutex mutex_; + size_t buffer_size_{1}; + std::vector stock_; + std::vector ready_events_; + + std::vector> pending_; +}; + +template +std::ostream& operator<<(std::ostream& os, const MemoryPool& pool) { + std::lock_guard lock(pool.mutex_); + + for (size_t i{0}; i < 80; ++i) { + os << '-'; + } + + // Current stock. + os << "\nStock =\n"; + for (size_t i{0}; i < pool.stock_.size(); ++i) { + os << "[ " << i << " ] buffer " << static_cast(pool.stock_[i]) + << ", size = " << pool.buffer_size_ << '\n'; + } + + // Pending buffers. 
+ os << "\nPending =\n"; + for (size_t i{0}; i < pool.pending_.size(); ++i) { + os << "[ " << i + << " ] buffer = " << static_cast(std::get<0>(pool.pending_[i])) + << ", size = " << std::get<1>(pool.pending_[i]) << ", ready_event = " + << static_cast(std::get<2>(pool.pending_[i])) << '\n'; + } + + // Available ready events. + os << "\nReady Events =\n"; + for (size_t i{0}; i < pool.ready_events_.size(); ++i) { + os << "[ " << i << " ] " << static_cast(pool.ready_events_[i]) + << '\n'; + } + + for (size_t i{0}; i < 80; ++i) { + os << '-'; + } + + os << '\n'; + return os; +} + +} // namespace merlin +} // namespace nv diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/optimizers.cuh b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/optimizers.cuh new file mode 100644 index 000000000..b3fc1cc70 --- /dev/null +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/optimizers.cuh @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include "types.cuh" +#include "utils.cuh" + +namespace nv { +namespace merlin { +namespace optimizers { + +template +__global__ void adam_update_kernel(int len, float* weight, T* m, T* v, + const T* wgrad, float alpha_t, float beta1, + float beta2, float epsilon, float scaler) { + const int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + float gi = TypeConvertFunc::convert(wgrad[i]) / scaler; + float mi = + beta1 * TypeConvertFunc::convert(m[i]) + (1.f - beta1) * gi; + float vi = beta2 * TypeConvertFunc::convert(v[i]) + + (1.f - beta2) * gi * gi; + m[i] = TypeConvertFunc::convert(mi); + v[i] = TypeConvertFunc::convert(vi); + weight[i] -= alpha_t * mi / (sqrt(vi) + epsilon); + } +} + +template +__global__ void ada_grad_update_kernel(int len, float* weight, const T* wgrad, + T* sum, float lr, const float epsilon, + float scaler) { + const int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + float gi = TypeConvertFunc::convert(wgrad[i]) / scaler; + float accum_ = TypeConvertFunc::convert(__ldg(&sum[i])); + accum_ += gi * gi; + float std_ = epsilon + sqrtf(accum_); + weight[i] -= lr * gi / std_; + sum[i] = TypeConvertFunc::convert(accum_); + } +} + +template +__global__ void momentum_sgd_update_kernel(int len, float* weight, T* momentum, + const T* wgrad, float lr, + float momentum_factor, + float scaler) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx < len) { + float mv = + momentum_factor * TypeConvertFunc::convert(momentum[idx]) - + lr * TypeConvertFunc::convert(wgrad[idx]) / scaler; + momentum[idx] = TypeConvertFunc::convert(mv); + weight[idx] += mv; + } + return; +} + +} // namespace optimizers +} // namespace merlin +} // namespace nv \ No newline at end of file diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/types.cuh 
b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/types.cuh new file mode 100644 index 000000000..210acd4a5 --- /dev/null +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/types.cuh @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace nv { +namespace merlin { + +/** + * Shorthand for a Key-Value-Meta tuple. + */ +template +struct KVM { + K key; + V* value; + M meta; +}; + +constexpr uint64_t EMPTY_KEY = UINT64_C(0xFFFFFFFFFFFFFFFF); +constexpr uint64_t RECLAIM_KEY = UINT64_C(0xFFFFFFFFFFFFFFFE); +constexpr uint64_t VACANT_KEY_MASK = UINT64_C(0xFFFFFFFFFFFFFFFE); +constexpr uint64_t LOCKED_KEY = UINT64_C(0xFFFFFFFFFFFFFFFD); +constexpr uint64_t RESERVED_KEY_MASK = UINT64_C(0xFFFFFFFFFFFFFFFC); +constexpr uint64_t MAX_META = UINT64_C(0xFFFFFFFFFFFFFFFF); +constexpr uint64_t EMPTY_META = UINT64_C(0); + +#define IS_RESERVED_KEY(key) ((RESERVED_KEY_MASK & (key)) == RESERVED_KEY_MASK) +#define IS_VACANT_KEY(key) ((VACANT_KEY_MASK & (key)) == VACANT_KEY_MASK) + +template +using AtomicKey = cuda::atomic; + +template +using AtomicMeta = cuda::atomic; + +template +using AtomicPos = cuda::atomic; + +template +struct Bucket { + AtomicKey* keys_; + AtomicMeta* metas_; + V* vectors; // Pinned memory or HBM + + /* For upsert_kernel without user specified metas + recording the current meta, the cur_meta will + increment by 1 when a new inserting happens. */ + AtomicMeta cur_meta; + + /* min_meta and min_pos is for or upsert_kernel + with user specified meta. They record the minimum + meta and its pos in the bucket. */ + AtomicMeta min_meta; + AtomicPos min_pos; + + __forceinline__ __device__ AtomicKey* keys(int index) const { + return keys_ + index; + } + + __forceinline__ __device__ AtomicMeta* metas(int index) const { + return metas_ + index; + } +}; + +template +class Lock { + mutable cuda::atomic _lock; + + public: + __device__ Lock() : _lock{1} {} + + template + __forceinline__ __device__ void acquire(CG const& g, + unsigned long long lane = 0) const { + if (g.thread_rank() == lane) { + T expected = 1; + while (!_lock.compare_exchange_weak(expected, 2, + cuda::std::memory_order_acquire)) { + expected = 1; + } + } + g.sync(); + } + + template + __forceinline__ __device__ void release(CG const& g, + unsigned long long lane = 0) const { + g.sync(); + if (g.thread_rank() == lane) { + _lock.store(1, cuda::std::memory_order_release); + } + } +}; + +using Mutex = Lock; + +template +struct Table { + Bucket* buckets; + Mutex* locks; // mutex for write buckets + int* buckets_size; // size of each buckets. + V** slices; // Handles of the HBM/ HMEM slices. + size_t dim; // Dimension of the `vectors`. + size_t bytes_per_slice; // Size by byte of one slice. + size_t num_of_memory_slices; // Number of vectors memory slices. + size_t capacity = 134217728; // Initial capacity. 
+ size_t max_size = + std::numeric_limits::max(); // Up limit of the table capacity. + size_t buckets_num; // Number of the buckets. + size_t bucket_max_size = 128; // Volume of each buckets. + size_t max_hbm_for_vectors = 0; // Max HBM allocated for vectors + size_t remaining_hbm_for_vectors = 0; // Remaining HBM allocated for vectors + bool is_pure_hbm = true; // unused + bool primary = true; // unused + int slots_offset = 0; // unused + int slots_number = 0; // unused + int device_id = 0; // Device id + int tile_size; +}; + +template +using EraseIfPredictInternal = + bool (*)(const K& key, ///< iterated key in table + M& meta, ///< iterated meta in table + const K& pattern, ///< input key from caller + const M& threshold ///< input meta from caller + ); + +/** + * An abstract class provides interface between the nv::merlin::HashTable + * and a file, which enables the table to save to the file or load from + * the file, by overriding the `read` and `write` method. + * + * @tparam K The data type of the key. + * @tparam V The data type of the vector's elements. + * The item data type should be a basic data type of C++/CUDA. + * @tparam M The data type for `meta`. + * The currently supported data type is only `uint64_t`. + * + */ +template +class BaseKVFile { + public: + virtual ~BaseKVFile() {} + + /** + * Read from file and fill into the keys, values, and metas buffer. + * When calling save/load method from table, it can assume that the + * received buffer of keys, vectors, and metas are automatically + * pre-allocated. + * + * @param n The number of KV pairs expect to read. `int64_t` was used + * here to adapt to various filesytem and formats. + * @param dim The dimension of the `vectors`. + * @param keys The pointer to received buffer for keys. + * @param vectors The pointer to received buffer for vectors. + * @param metas The pointer to received buffer for metas. + * + * @return Number of KV pairs have been successfully read. + */ + virtual size_t read(const size_t n, const size_t dim, K* keys, V* vectors, + M* metas) = 0; + + /** + * Write keys, values, metas from table to the file. It defines + * an abstract method to get batch of KV pairs and write them into + * file. + * + * @param n The number of KV pairs to be written. `int64_t` was used + * here to adapt to various filesytem and formats. + * @param dim The dimension of the `vectors`. + * @param keys The keys will be written to file. + * @param vectors The vectors of values will be written to file. + * @param metas The metas will be written to file. + * + * @return Number of KV pairs have been successfully written. + */ + virtual size_t write(const size_t n, const size_t dim, const K* keys, + const V* vectors, const M* metas) = 0; +}; + +enum class OccupyResult { + INITIAL, ///< Initial status + CONTINUE, ///< Insert did not succeed, continue trying to insert + OCCUPIED_EMPTY, ///< New pair inserted successfully + OCCUPIED_RECLAIMED, + DUPLICATE, ///< Insert did not succeed, key is already present + EVICT, ///< Insert succeeded by evicting one key with minimum meta. + REFUSED, ///< Insert did not succeed, insert meta is too low. +}; + +enum class OverrideResult { + INITIAL, ///< Initial status + CONTINUE, ///< Override did not succeed, continue trying to override + SUCCESS, ///< Override successfully + REFUSED, ///< Override is refused. 
+}; + +} // namespace merlin +} // namespace nv diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/utils.cuh b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/utils.cuh new file mode 100644 index 000000000..bd60f93fc --- /dev/null +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/utils.cuh @@ -0,0 +1,368 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include "cuda_fp16.h" +#include "cuda_runtime_api.h" +#include "debug.hpp" +#include "tensorflow_recommenders_addons/dynamic_embedding/core/lib/utils/cuda_utils.cuh" + +using namespace cooperative_groups; +namespace cg = cooperative_groups; + +/* +__inline__ __device__ uint64_t atomicCAS(uint64_t* address, uint64_t compare, + uint64_t val) { + return (uint64_t)atomicCAS((unsigned long long*)address, + (unsigned long long)compare, + (unsigned long long)val); +} + +__inline__ __device__ int64_t atomicCAS(int64_t* address, int64_t compare, + int64_t val) { + return (int64_t)atomicCAS((unsigned long long*)address, + (unsigned long long)compare, + (unsigned long long)val); +} +*/ + +__inline__ __device__ uint64_t atomicExch(uint64_t* address, uint64_t val) { + return (uint64_t)atomicExch((unsigned long long*)address, + (unsigned long long)val); +} + +__inline__ __device__ int64_t atomicExch(int64_t* address, int64_t val) { + return (int64_t)atomicExch((unsigned long long*)address, + (unsigned long long)val); +} + +__inline__ __device__ signed char atomicExch(signed char* address, + signed char val) { + signed char old = *address; + *address = val; + return old; +} + +/* +__inline__ __device__ int64_t atomicAdd(int64_t* address, const int64_t val) { + return (int64_t)atomicAdd((unsigned long long*)address, val); +} + +__inline__ __device__ uint64_t atomicAdd(uint64_t* address, + const uint64_t val) { + return (uint64_t)atomicAdd((unsigned long long*)address, val); +} +*/ + +namespace nv { +namespace merlin { + +inline void __cudaCheckError(const char* file, const int line) { +#ifdef CUDA_ERROR_CHECK + cudaError err = cudaGetLastError(); + if (cudaSuccess != err) { + fprintf(stderr, "cudaCheckError() failed at %s:%i : %s\n", file, line, + cudaGetErrorString(err)); + exit(-1); + } + + // More careful checking. However, this will affect performance. + // Comment away if needed. + err = cudaDeviceSynchronize(); + if (cudaSuccess != err) { + fprintf(stderr, "cudaCheckError() with sync failed at %s:%i : %s\n", file, + line, cudaGetErrorString(err)); + exit(-1); + } +#endif + + return; +} +//#define CudaCheckError() nv::merlin::__cudaCheckError(__FILE__, __LINE__) +#define CudaCheckError() {} + +static inline size_t SAFE_GET_GRID_SIZE(size_t N, int block_size) { + return ((N) > std::numeric_limits::max()) + ? 
((1 << 30 - 1) / block_size + 1) + : (((N)-1) / block_size + 1); +} + +static inline int SAFE_GET_BLOCK_SIZE(int block_size, int device = -1) { + cudaDeviceProp prop; + int current_device = device; + if (current_device == -1) { + CUDA_CHECK(cudaGetDevice(¤t_device)); + } + CUDA_CHECK(cudaGetDeviceProperties(&prop, current_device)); + if (block_size > prop.maxThreadsPerBlock) { + fprintf(stdout, + "The requested block_size=%d exceeds the device limit, " + "the maxThreadsPerBlock=%d will be applied.\n", + block_size, prop.maxThreadsPerBlock); + } + return std::min(prop.maxThreadsPerBlock, block_size); +} + +inline uint64_t Murmur3HashHost(const uint64_t& key) { + uint64_t k = key; + k ^= k >> 33; + k *= UINT64_C(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= UINT64_C(0xc4ceb9fe1a85ec53); + k ^= k >> 33; + return k; +} + +__inline__ __device__ uint64_t Murmur3HashDevice(uint64_t const& key) { + uint64_t k = key; + k ^= k >> 33; + k *= UINT64_C(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= UINT64_C(0xc4ceb9fe1a85ec53); + k ^= k >> 33; + return k; +} + +__inline__ __device__ int64_t Murmur3HashDevice(int64_t const& key) { + uint64_t k = uint64_t(key); + k ^= k >> 33; + k *= UINT64_C(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= UINT64_C(0xc4ceb9fe1a85ec53); + k ^= k >> 33; + return int64_t(k); +} + +__inline__ __device__ uint32_t Murmur3HashDevice(uint32_t const& key) { + uint32_t k = key; + k ^= k >> 16; + k *= UINT32_C(0x85ebca6b); + k ^= k >> 13; + k *= UINT32_C(0xc2b2ae35); + k ^= k >> 16; + + return k; +} + +__inline__ __device__ int32_t Murmur3HashDevice(int32_t const& key) { + uint32_t k = uint32_t(key); + k ^= k >> 16; + k *= UINT32_C(0x85ebca6b); + k ^= k >> 13; + k *= UINT32_C(0xc2b2ae35); + k ^= k >> 16; + + return int32_t(k); +} + +class CudaDeviceRestorer { + public: + CudaDeviceRestorer() { CUDA_CHECK(cudaGetDevice(&dev_)); } + ~CudaDeviceRestorer() { CUDA_CHECK(cudaSetDevice(dev_)); } + + private: + int dev_; +}; + +static inline int get_dev(const void* ptr) { + cudaPointerAttributes attr; + CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr)); + int dev = -1; + +#if CUDART_VERSION >= 10000 + if (attr.type == cudaMemoryTypeDevice) +#else + if (attr.memoryType == cudaMemoryTypeDevice) +#endif + { + dev = attr.device; + } + return dev; +} + +static inline void switch_to_dev(const void* ptr) { + int dev = get_dev(ptr); + if (dev >= 0) { + CUDA_CHECK(cudaSetDevice(dev)); + } +} + +static inline bool is_on_device(const void* ptr) { + cudaPointerAttributes attr; + CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr)); + +#if CUDART_VERSION >= 10000 + return (attr.type == cudaMemoryTypeDevice); +#else + return (attr.memoryType == cudaMemoryTypeDevice); +#endif +} + +template +struct TypeConvertFunc; + +template <> +struct TypeConvertFunc<__half, float> { + static __forceinline__ __device__ __half convert(float val) { + return __float2half(val); + } +}; + +template <> +struct TypeConvertFunc { + static __forceinline__ __device__ float convert(__half val) { + return __half2float(val); + } +}; + +template <> +struct TypeConvertFunc { + static __forceinline__ __device__ float convert(float val) { return val; } +}; + +template <> +struct TypeConvertFunc { + static __forceinline__ __device__ float convert(long long val) { + return static_cast(val); + } +}; + +template <> +struct TypeConvertFunc { + static __forceinline__ __device__ float convert(unsigned int val) { + return static_cast(val); + } +}; + +template <> +struct TypeConvertFunc { + static __forceinline__ __device__ int convert(long long val) { + 
+    return static_cast<int>(val);
+  }
+};
+
+template <>
+struct TypeConvertFunc<int, unsigned int> {
+  static __forceinline__ __device__ int convert(unsigned int val) {
+    return static_cast<int>(val);
+  }
+};
+
+template <class P>
+void realloc(P* ptr, size_t old_size, size_t new_size) {
+  // Truncate old_size to limit downstream copy ops.
+  old_size = std::min(old_size, new_size);
+
+  // Alloc new buffer and copy old data over.
+  char* new_ptr;
+  CUDA_CHECK(cudaMalloc(&new_ptr, new_size));
+  if (*ptr != nullptr) {
+    CUDA_CHECK(cudaMemcpy(new_ptr, *ptr, old_size, cudaMemcpyDefault));
+    CUDA_CHECK(cudaFree(*ptr));
+  }
+
+  // Zero-fill remainder.
+  CUDA_CHECK(cudaMemset(new_ptr + old_size, 0, new_size - old_size));
+
+  // Switch to new pointer.
+  *ptr = reinterpret_cast<P>(new_ptr);
+  return;
+}
+
+template <class P>
+void realloc_managed(P* ptr, size_t old_size, size_t new_size) {
+  // Truncate old_size to limit downstream copy ops.
+  old_size = std::min(old_size, new_size);
+
+  // Alloc new buffer and copy old data over.
+  char* new_ptr;
+  CUDA_CHECK(cudaMallocManaged(&new_ptr, new_size));
+  if (*ptr != nullptr) {
+    CUDA_CHECK(cudaMemcpy(new_ptr, *ptr, old_size, cudaMemcpyDefault));
+    CUDA_CHECK(cudaFree(*ptr));
+  }
+
+  // Zero-fill remainder.
+  CUDA_CHECK(cudaMemset(new_ptr + old_size, 0, new_size - old_size));
+
+  // Switch to new pointer.
+  *ptr = reinterpret_cast<P>
(new_ptr); + return; +} + +template +__forceinline__ __device__ void lock( + const cg::thread_block_tile& tile, mutex& set_mutex, + unsigned long long lane = 0) { + if (THREAD_SAFE) { + set_mutex.acquire(tile, lane); + } +} + +template +__forceinline__ __device__ void unlock( + const cg::thread_block_tile& tile, mutex& set_mutex, + unsigned long long lane = 0) { + if (THREAD_SAFE) { + set_mutex.release(tile, lane); + } +} + +inline void free_pointers(cudaStream_t stream, int n, ...) { + va_list args; + va_start(args, n); + void* ptr = nullptr; + for (int i = 0; i < n; i++) { + ptr = va_arg(args, void*); + if (ptr) { + cudaPointerAttributes attr; + memset(&attr, 0, sizeof(cudaPointerAttributes)); + try { + CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr)); + if (attr.devicePointer && (!attr.hostPointer)) { + CUDA_CHECK(cudaFreeAsync(ptr, stream)); + } else if (attr.devicePointer && attr.hostPointer) { + CUDA_CHECK(cudaFreeHost(ptr)); + } else { + free(ptr); + } + } catch (const nv::merlin::CudaException& e) { + va_end(args); + throw e; + } + } + } + va_end(args); +} + +#define CUDA_FREE_POINTERS(stream, ...) \ + nv::merlin::free_pointers( \ + stream, (sizeof((void*[]){__VA_ARGS__}) / sizeof(void*)), __VA_ARGS__); + +static inline size_t GB(size_t n) { return n << 30; } + +static inline size_t MB(size_t n) { return n << 20; } + +static inline size_t KB(size_t n) { return n << 10; } + +} // namespace merlin +} // namespace nv diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin_hashtable.cuh b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin_hashtable.cuh new file mode 100644 index 000000000..19da172ea --- /dev/null +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin_hashtable.cuh @@ -0,0 +1,1643 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "merlin/array_kernels.cuh" +#include "merlin/core_kernels.cuh" +#include "merlin/flexible_buffer.cuh" +#include "merlin/group_lock.hpp" +#include "merlin/memory_pool.cuh" +#include "merlin/types.cuh" +#include "merlin/utils.cuh" + +namespace nv { +namespace merlin { + +/** + * @brief Enumeration of the eviction strategies. + * + * @note The `meta` is introduced to define the importance of each key, the + * larger, the more important, the less likely they will be evicted. On `kLru` + * mode, the `metas` parameter of the APIs should keep `nullptr`, the meta for + * each key is assigned internally in LRU(Least Recently Used) policy. On + * `kCustomized` mode, the `metas` should be provided by caller. + * + * @note Eviction occurs automatically when a bucket is full. The keys with the + * minimum `meta` value are evicted first. + * + */ +enum class EvictStrategy { + kLru = 0, ///< LRU mode. + kCustomized = 1 ///< Customized mode. +}; + +/** + * @brief The options struct of HierarchicalKV. 
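+ *
+ * A configuration sketch (the concrete values here are illustrative only):
+ *
+ * ```
+ * HashTableOptions options;
+ * options.init_capacity = 64 * 1024 * 1024;
+ * options.max_capacity = 128 * 1024 * 1024;
+ * options.dim = 64;
+ * options.max_hbm_for_vectors = nv::merlin::GB(16);
+ * options.evict_strategy = EvictStrategy::kLru;
+ * ```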
+ */ +struct HashTableOptions { + size_t init_capacity = 0; ///< The initial capacity of the hash table. + size_t max_capacity = 0; ///< The maximum capacity of the hash table. + size_t max_hbm_for_vectors = 0; ///< The maximum HBM for vectors, in bytes. + size_t max_bucket_size = 128; ///< The length of each bucket. + size_t dim = 64; ///< The dimension of the vectors. + float max_load_factor = 0.5f; ///< The max load factor before rehashing. + int block_size = 128; ///< The default block size for CUDA kernels. + int io_block_size = 1024; ///< The block size for IO CUDA kernels. + int device_id = -1; ///< The ID of device. + bool io_by_cpu = false; ///< The flag indicating if the CPU handles IO. + EvictStrategy evict_strategy = EvictStrategy::kLru; ///< The evict strategy. + bool use_constant_memory = false; ///< reserved + MemoryPoolOptions + device_memory_pool; ///< Configuration options for device memory pool. + MemoryPoolOptions + host_memory_pool; ///< Configuration options for host memory pool. +}; + +/** + * @brief A customizable template function indicates which keys should be + * erased from the hash table by returning `true`. + * + * @note The `erase_if` or `export_batch_if` API traverses all of the items by + * this function and the items that return `true` are removed or exported. + * + * Example for erase_if: + * + * ``` + * template + * __forceinline__ __device__ bool erase_if_pred(const K& key, + * M& meta, + * const K& pattern, + * const M& threshold) { + * return ((key & 0xFFFF000000000000 == pattern) && + * (meta < threshold)); + * } + * ``` + * + * Example for export_batch_if: + * ``` + * template + * __forceinline__ __device__ bool export_if_pred(const K& key, + * M& meta, + * const K& pattern, + * const M& threshold) { + * return meta >= threshold; + * } + * ``` + */ +template +using EraseIfPredict = bool (*)( + const K& key, ///< The traversed key in a hash table. + M& meta, ///< The traversed meta in a hash table. + const K& pattern, ///< The key pattern to compare with the `key` argument. + const M& threshold ///< The threshold to compare with the `meta` argument. +); + +/** + * A HierarchicalKV hash table is a concurrent and hierarchical hash table that + * is powered by GPUs and can use HBM and host memory as storage for key-value + * pairs. Support for SSD storage is a future consideration. + * + * The `meta` is introduced to define the importance of each key, the + * larger, the more important, the less likely they will be evicted. Eviction + * occurs automatically when a bucket is full. The keys with the minimum `meta` + * value are evicted first. In a customized eviction strategy, we recommend + * using the timestamp or frequency of the key occurrence as the `meta` value + * for each key. You can also assign a special value to the `meta` to + * perform a customized eviction strategy. + * + * @note By default configuration, this class is thread-safe. + * + * @tparam K The data type of the key. + * @tparam V The data type of the vector's item type. + * The item data type should be a basic data type of C++/CUDA. + * @tparam M The data type for `meta`. + * The currently supported data type is only `uint64_t`. 
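+ *
+ * A minimal end-to-end sketch, assuming the table is instantiated as
+ * `HashTable<uint64_t, float, uint64_t>` and that `d_keys`, `d_values`, and
+ * `d_founds` are device buffers prepared by the caller (illustrative names):
+ *
+ * ```
+ * using Table = nv::merlin::HashTable<uint64_t, float, uint64_t>;
+ *
+ * HashTableOptions options;
+ * options.init_capacity = 1 << 20;
+ * options.max_capacity = 1 << 22;
+ * options.dim = 64;
+ *
+ * Table table;
+ * table.init(options);
+ * table.insert_or_assign(n, d_keys, d_values, /*metas=*/nullptr, stream);
+ * table.find(n, d_keys, d_values, d_founds, /*metas=*/nullptr, stream);
+ * ```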
+ * + */ +template +class HashTable { + public: + using size_type = size_t; + using key_type = K; + using value_type = V; + using meta_type = M; + using Pred = EraseIfPredict; + + private: + using TableCore = nv::merlin::Table; + static constexpr unsigned int TILE_SIZE = 4; + + using DeviceMemoryPool = MemoryPool>; + using HostMemoryPool = MemoryPool>; + +#if THRUST_VERSION >= 101600 + static constexpr auto thrust_par = thrust::cuda::par_nosync; +#else + static constexpr auto thrust_par = thrust::cuda::par; +#endif + + public: + /** + * @brief Default constructor for the hash table class. + */ + HashTable(){}; + + /** + * @brief Frees the resources used by the hash table and destroys the hash + * table object. + */ + ~HashTable() { + if (initialized_) { + CUDA_CHECK(cudaDeviceSynchronize()); + + initialized_ = false; + destroy_table(&table_); + CUDA_CHECK(cudaFree(d_table_)); + dev_mem_pool_.reset(); + host_mem_pool_.reset(); + } + } + + private: + HashTable(const HashTable&) = delete; + HashTable& operator=(const HashTable&) = delete; + HashTable(HashTable&&) = delete; + HashTable& operator=(HashTable&&) = delete; + + public: + /** + * @brief Initialize a merlin::HashTable. + * + * @param options The configuration options. + */ + void init(const HashTableOptions options) { + if (initialized_) { + return; + } + options_ = options; + + if (options_.device_id >= 0) { + CUDA_CHECK(cudaSetDevice(options_.device_id)); + } else { + CUDA_CHECK(cudaGetDevice(&(options_.device_id))); + } + + // Construct table. + cudaDeviceProp deviceProp; + CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, options_.device_id)); + shared_mem_size_ = deviceProp.sharedMemPerBlock; + create_table( + &table_, options_.dim, options_.init_capacity, options_.max_capacity, + options_.max_hbm_for_vectors, options_.max_bucket_size); + options_.block_size = SAFE_GET_BLOCK_SIZE(options_.block_size); + reach_max_capacity_ = (options_.init_capacity * 2 > options_.max_capacity); + MERLIN_CHECK((!(options_.io_by_cpu && options_.max_hbm_for_vectors != 0)), + "[HierarchicalKV] `io_by_cpu` should not be true when " + "`max_hbm_for_vectors` is not 0!"); + CUDA_CHECK(cudaMalloc((void**)&(d_table_), sizeof(TableCore))); + + sync_table_configuration(); + + // Create memory pools. + dev_mem_pool_ = std::make_unique>>( + options_.device_memory_pool); + host_mem_pool_ = std::make_unique>>( + options_.host_memory_pool); + + CUDA_CHECK(cudaDeviceSynchronize()); + initialized_ = true; + CudaCheckError(); + } + + /** + * @brief Insert new key-value-meta tuples into the hash table. + * If the key already exists, the values and metas are assigned new values. + * + * If the target bucket is full, the keys with minimum meta will be + * overwritten by new key unless the meta of the new key is even less than + * minimum meta of the target bucket. + * + * @param n Number of key-value-meta tuples to insert or assign. + * @param keys The keys to insert on GPU-accessible memory with shape + * (n). + * @param values The values to insert on GPU-accessible memory with + * shape (n, DIM). + * @param metas The metas to insert on GPU-accessible memory with shape + * (n). + * @parblock + * The metas should be a `uint64_t` value. You can specify a value that + * such as the timestamp of the key insertion, number of the key + * occurrences, or another value to perform a custom eviction strategy. + * + * The @p metas should be `nullptr`, when the LRU eviction strategy is + * applied. 
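+ *
+ * For example (a sketch; `d_keys`, `d_vectors`, and `d_metas` are assumed to
+ * be device buffers prepared by the caller):
+ *
+ * ```
+ * // kLru: the table maintains metas internally.
+ * table.insert_or_assign(n, d_keys, d_vectors, nullptr, stream);
+ *
+ * // kCustomized: e.g. pass a timestamp per key as its meta.
+ * table.insert_or_assign(n, d_keys, d_vectors, d_metas, stream);
+ * ```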
+ * @endparblock + * + * @param stream The CUDA stream that is used to execute the operation. + * + * @param ignore_evict_strategy A boolean option indicating whether if + * the insert_or_assign ignores the evict strategy of table with current + * metas anyway. If true, it does not check whether the metas confroms to + * the evict strategy. If false, it requires the metas follow the evict + * strategy of table. + */ + void insert_or_assign(const size_type n, + const key_type* keys, // (n) + const value_type* values, // (n, DIM) + const meta_type* metas = nullptr, // (n) + cudaStream_t stream = 0, + bool ignore_evict_strategy = false) { + if (n == 0) { + return; + } + + while (!reach_max_capacity_ && + fast_load_factor(n, stream) > options_.max_load_factor) { + reserve(capacity() * 2, stream); + } + + if (!ignore_evict_strategy) { + check_evict_strategy(metas); + } + + writer_shared_lock lock(mutex_); + + if (is_fast_mode()) { + using Selector = + SelectUpsertKernelWithIO; + static thread_local int step_counter = 0; + static thread_local float load_factor = 0.0; + + if (((step_counter++) % kernel_select_interval_) == 0) { + load_factor = fast_load_factor(0, stream, false); + } + + Selector::execute_kernel( + load_factor, options_.block_size, options_.max_bucket_size, + table_->buckets_num, options_.dim, stream, n, d_table_, keys, + reinterpret_cast(values), metas); + } else { + const size_type dev_ws_size{n * (sizeof(value_type*) + sizeof(int))}; + auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; + auto d_dst{dev_ws.get(0)}; + auto d_src_offset{reinterpret_cast(d_dst + n)}; + + CUDA_CHECK(cudaMemsetAsync(d_dst, 0, dev_ws_size, stream)); + + { + const size_t block_size = options_.block_size; + const size_t N = n * TILE_SIZE; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + upsert_kernel + <<>>( + d_table_, options_.max_bucket_size, table_->buckets_num, + options_.dim, keys, d_dst, metas, d_src_offset, N); + } + + { + thrust::device_ptr d_dst_ptr( + reinterpret_cast(d_dst)); + thrust::device_ptr d_src_offset_ptr(d_src_offset); + + thrust::sort_by_key(thrust_par.on(stream), d_dst_ptr, d_dst_ptr + n, + d_src_offset_ptr, thrust::less()); + } + + if (options_.io_by_cpu) { + const size_type host_ws_size{dev_ws_size + + n * sizeof(value_type) * dim()}; + auto host_ws{host_mem_pool_->get_workspace<1>(host_ws_size, stream)}; + auto h_dst{host_ws.get(0)}; + auto h_src_offset{reinterpret_cast(h_dst + n)}; + auto h_values{reinterpret_cast(h_src_offset + n)}; + + CUDA_CHECK(cudaMemcpyAsync(h_dst, d_dst, dev_ws_size, + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(h_values, values, host_ws_size - dev_ws_size, + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + write_by_cpu(h_dst, h_values, h_src_offset, dim(), n); + } else { + const size_t block_size = options_.io_block_size; + const size_t N = n * dim(); + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + write_kernel + <<>>(values, d_dst, d_src_offset, + dim(), N); + } + } + + CudaCheckError(); + } + + /** + * @brief Insert new key-value-meta tuples into the hash table. + * If the key already exists, the values and metas are assigned new values. + * + * If the target bucket is full, the keys with minimum meta will be + * overwritten by new key unless the meta of the new key is even less than + * minimum meta of the target bucket. 
The overwritten key with minimum + * meta will be evicted, with its values and meta, to evicted_keys, + * evicted_values, evcted_metas seperately in compact format. + * + * @param n Number of key-value-meta tuples to insert or assign. + * @param keys The keys to insert on GPU-accessible memory with shape + * (n). + * @param values The values to insert on GPU-accessible memory with + * shape (n, DIM). + * @param metas The metas to insert on GPU-accessible memory with shape + * (n). + * @param metas The metas to insert on GPU-accessible memory with shape + * (n). + * @params evicted_keys The output of keys replaced with minimum meta. + * @params evicted_values The output of values replaced with minimum meta on + * keys. + * @params evicted_metas The output of metas replaced with minimum meta on + * keys. + * @parblock + * The metas should be a `uint64_t` value. You can specify a value that + * such as the timestamp of the key insertion, number of the key + * occurrences, or another value to perform a custom eviction strategy. + * + * The @p metas should be `nullptr`, when the LRU eviction strategy is + * applied. + * @endparblock + * + * @param stream The CUDA stream that is used to execute the operation. + * + * @param ignore_evict_strategy A boolean option indicating whether if + * the insert_or_assign ignores the evict strategy of table with current + * metas anyway. If true, it does not check whether the metas confroms to + * the evict strategy. If false, it requires the metas follow the evict + * strategy of table. + */ + size_type insert_and_evict(const size_type n, + const key_type* keys, // (n) + const value_type* values, // (n, DIM) + const meta_type* metas, // (n) + key_type* evicted_keys, // (n) + value_type* evicted_values, // (n, DIM) + meta_type* evicted_metas, // (n) + cudaStream_t stream = 0) { + if (n == 0) { + return 0; + } + + while (!reach_max_capacity_ && + fast_load_factor(n, stream) > options_.max_load_factor) { + reserve(capacity() * 2, stream); + } + + writer_shared_lock lock(mutex_); + + // TODO: Currently only need eviction when using HashTable as HBM cache. + if (!is_fast_mode()) { + throw std::runtime_error("Only allow insert_and_evict in pure HBM mode."); + } + + using Selector = + SelectUpsertAndEvictKernelWithIO; + static thread_local int step_counter = 0; + static thread_local float load_factor = 0.0; + + if (((step_counter++) % kernel_select_interval_) == 0) { + load_factor = fast_load_factor(0, stream, false); + } + + // always use max tile to avoid data-deps as possible. 
+ const int TILE_SIZE = 32; + size_t n_offsets = (n + TILE_SIZE - 1) / TILE_SIZE; + const size_type dev_ws_size = + n_offsets * sizeof(int64_t) + n * sizeof(bool) + sizeof(size_type); + + auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; + auto d_offsets{dev_ws.get(0)}; + auto dn_evicted = reinterpret_cast(d_offsets + n_offsets); + auto d_masks = reinterpret_cast(dn_evicted + 1); + + CUDA_CHECK( + cudaMemsetAsync(d_offsets, 0, n_offsets * sizeof(int64_t), stream)); + CUDA_CHECK(cudaMemsetAsync(dn_evicted, 0, sizeof(size_type), stream)); + CUDA_CHECK(cudaMemsetAsync(d_masks, 0, n * sizeof(bool), stream)); + + size_type block_size = options_.block_size; + size_type grid_size = SAFE_GET_GRID_SIZE(n, block_size); + CUDA_CHECK(cudaMemsetAsync(evicted_keys, static_cast(EMPTY_KEY), + n * sizeof(K), stream)); + + Selector::execute_kernel( + load_factor, options_.block_size, options_.max_bucket_size, + table_->buckets_num, options_.dim, stream, n, d_table_, keys, values, + metas, evicted_keys, evicted_values, evicted_metas); + + keys_not_empty + <<>>(evicted_keys, d_masks, n); + size_type n_evicted = 0; + gpu_pick_kvm_inplace( + grid_size, block_size, d_masks, true, n, dn_evicted, d_offsets, evicted_keys, + evicted_values, evicted_metas, dim(), stream); + CUDA_CHECK(cudaMemcpyAsync(&n_evicted, dn_evicted, sizeof(size_type), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CudaCheckError(); + return n_evicted; + } + + /** + * Searches for each key in @p keys in the hash table. + * If the key is found and the corresponding value in @p accum_or_assigns is + * `true`, the @p vectors_or_deltas is treated as a delta to the old + * value, and the delta is added to the old value of the key. + * + * If the key is not found and the corresponding value in @p accum_or_assigns + * is `false`, the @p vectors_or_deltas is treated as a new value and the + * key-value pair is updated in the table directly. + * + * @note When the key is found and the value of @p accum_or_assigns is + * `false`, or when the key is not found and the value of @p accum_or_assigns + * is `true`, nothing is changed and this operation is ignored. + * The algorithm assumes these situations occur while the key was modified or + * removed by other processes just now. + * + * @param n The number of key-value-meta tuples to process. + * @param keys The keys to insert on GPU-accessible memory with shape (n). + * @param value_or_deltas The values or deltas to insert on GPU-accessible + * memory with shape (n, DIM). + * @param accum_or_assigns The operation type with shape (n). A value of + * `true` indicates to accum and `false` indicates to assign. + * @param metas The metas to insert on GPU-accessible memory with shape (n). + * @parblock + * The metas should be a `uint64_t` value. You can specify a value that + * such as the timestamp of the key insertion, number of the key + * occurrences, or another value to perform a custom eviction strategy. + * + * The @p metas should be `nullptr`, when the LRU eviction strategy is + * applied. + * @endparblock + * + * @param stream The CUDA stream that is used to execute the operation. + * + * @param ignore_evict_strategy A boolean option indicating whether if + * the accum_or_assign ignores the evict strategy of table with current + * metas anyway. If true, it does not check whether the metas confroms to + * the evict strategy. If false, it requires the metas follow the evict + * strategy of table. 
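+ *
+ * A usage sketch (assuming `d_accum_flags` is a device array of `bool`
+ * prepared by the caller; `true` means "add delta", `false` means "assign"):
+ *
+ * ```
+ * table.accum_or_assign(n, d_keys, d_deltas_or_values, d_accum_flags,
+ *                       /*metas=*/nullptr, stream);
+ * ```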
+ * + */ + void accum_or_assign(const size_type n, + const key_type* keys, // (n) + const value_type* value_or_deltas, // (n, DIM) + const bool* accum_or_assigns, // (n) + const meta_type* metas = nullptr, // (n) + cudaStream_t stream = 0, + bool ignore_evict_strategy = false) { + if (n == 0) { + return; + } + + while (!reach_max_capacity_ && + fast_load_factor(n, stream) > options_.max_load_factor) { + reserve(capacity() * 2, stream); + } + + if (!ignore_evict_strategy) { + check_evict_strategy(metas); + } + + writer_shared_lock lock(mutex_); + + const size_type dev_ws_size{ + n * (sizeof(value_type*) + sizeof(int) + sizeof(bool))}; + auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; + auto dst{dev_ws.get(0)}; + auto src_offset{reinterpret_cast(dst + n)}; + auto founds{reinterpret_cast(src_offset + n)}; + + CUDA_CHECK(cudaMemsetAsync(dst, 0, dev_ws_size, stream)); + + { + const size_t block_size = options_.block_size; + const size_t N = n * TILE_SIZE; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + accum_kernel + <<>>( + table_, keys, dst, metas, accum_or_assigns, table_->buckets, + table_->buckets_size, table_->bucket_max_size, + table_->buckets_num, src_offset, founds, N); + } + + if (!is_fast_mode()) { + thrust::device_ptr dst_ptr(reinterpret_cast(dst)); + thrust::device_ptr src_offset_ptr(src_offset); + + thrust::sort_by_key(thrust_par.on(stream), dst_ptr, dst_ptr + n, + src_offset_ptr, thrust::less()); + } + + { + const size_t block_size = options_.io_block_size; + const size_t N = n * dim(); + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + write_with_accum_kernel + <<>>(value_or_deltas, dst, + accum_or_assigns, founds, + src_offset, dim(), N); + } + + CudaCheckError(); + } + + /** + * @brief Searches the hash table for the specified keys. + * When a key is missing, the value in @p values and @p metas will be + * inserted. + * + * @param n The number of key-value-meta tuples to search or insert. + * @param keys The keys to search on GPU-accessible memory with shape (n). + * @param values The values to search on GPU-accessible memory with + * shape (n, DIM). + * @param metas The metas to search on GPU-accessible memory with shape (n). + * @parblock + * If @p metas is `nullptr`, the meta for each key will not be returned. + * @endparblock + * @param stream The CUDA stream that is used to execute the operation. 
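+ *
+ * A usage sketch (missing keys are inserted with the caller-provided values;
+ * `d_keys` and `d_values` are assumed to be device buffers):
+ *
+ * ```
+ * table.find_or_insert(n, d_keys, d_values, /*metas=*/nullptr, stream);
+ * // On return, d_values holds the table's value for every key, whether it
+ * // was found or freshly inserted.
+ * ```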
+ * + */ + void find_or_insert(const size_type n, const key_type* keys, // (n) + value_type* values, // (n * DIM) + meta_type* metas = nullptr, // (n) + cudaStream_t stream = 0, + bool ignore_evict_strategy = false) { + if (n == 0) { + return; + } + + while (!reach_max_capacity_ && + fast_load_factor(n, stream) > options_.max_load_factor) { + reserve(capacity() * 2, stream); + } + + if (!ignore_evict_strategy) { + check_evict_strategy(metas); + } + + writer_shared_lock lock(mutex_); + + if (is_fast_mode()) { + using Selector = + SelectFindOrInsertKernelWithIO; + static thread_local int step_counter = 0; + static thread_local float load_factor = 0.0; + + if (((step_counter++) % kernel_select_interval_) == 0) { + load_factor = fast_load_factor(0, stream, false); + } + Selector::execute_kernel(load_factor, options_.block_size, + options_.max_bucket_size, table_->buckets_num, + options_.dim, stream, n, d_table_, keys, values, + metas); + } else { + const size_type dev_ws_size{ + n * (sizeof(value_type*) + sizeof(int) + sizeof(bool))}; + auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; + auto d_table_value_addrs{dev_ws.get(0)}; + auto param_key_index{reinterpret_cast(d_table_value_addrs + n)}; + auto founds{reinterpret_cast(param_key_index + n)}; + + CUDA_CHECK(cudaMemsetAsync(d_table_value_addrs, 0, dev_ws_size, stream)); + + { + const size_t block_size = options_.block_size; + const size_t N = n * TILE_SIZE; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + find_or_insert_kernel + <<>>( + d_table_, options_.max_bucket_size, table_->buckets_num, + options_.dim, keys, d_table_value_addrs, metas, founds, + param_key_index, N); + } + + { + thrust::device_ptr table_value_ptr( + reinterpret_cast(d_table_value_addrs)); + thrust::device_ptr param_key_index_ptr(param_key_index); + + thrust::sort_by_key(thrust_par.on(stream), table_value_ptr, + table_value_ptr + n, param_key_index_ptr, + thrust::less()); + } + + if (options_.io_by_cpu) { + const size_type host_ws_size{ + dev_ws_size + n * (sizeof(bool) + sizeof(value_type) * dim())}; + auto host_ws{host_mem_pool_->get_workspace<1>(host_ws_size, stream)}; + auto h_table_value_addrs{host_ws.get(0)}; + auto h_param_key_index{reinterpret_cast(h_table_value_addrs + n)}; + auto h_founds{reinterpret_cast(h_param_key_index + n)}; + auto h_param_values{reinterpret_cast(h_founds + n)}; + + CUDA_CHECK(cudaMemcpyAsync(h_table_value_addrs, d_table_value_addrs, + dev_ws_size, cudaMemcpyDeviceToHost, + stream)); + CUDA_CHECK(cudaMemcpyAsync(h_founds, founds, n * sizeof(bool), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(h_param_values, values, + n * sizeof(value_type) * dim(), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + read_or_write_by_cpu(h_table_value_addrs, h_param_values, + h_param_key_index, h_founds, dim(), n); + CUDA_CHECK(cudaMemcpyAsync(values, h_param_values, + n * sizeof(value_type) * dim(), + cudaMemcpyHostToDevice, stream)); + } else { + const size_t block_size = options_.io_block_size; + const size_t N = n * dim(); + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + read_or_write_kernel + <<>>( + d_table_value_addrs, values, founds, param_key_index, dim(), N); + } + } + + CudaCheckError(); + } + + /** + * @brief Searches the hash table for the specified keys and returns address + * of the values. When a key is missing, the value in @p values and @p metas + * will be inserted. 
+ * + * @warning This API returns internal addresses for high-performance but + * thread-unsafe. The caller is responsible for guaranteeing data consistency. + * + * @param n The number of key-value-meta tuples to search or insert. + * @param keys The keys to search on GPU-accessible memory with shape (n). + * @param values The addresses of values to search on GPU-accessible memory + * with shape (n). + * @param founds The status that indicates if the keys are found on + * @param metas The metas to search on GPU-accessible memory with shape (n). + * @parblock + * If @p metas is `nullptr`, the meta for each key will not be returned. + * @endparblock + * @param stream The CUDA stream that is used to execute the operation. + * + */ + void find_or_insert(const size_type n, const key_type* keys, // (n) + value_type** values, // (n) + bool* founds, // (n) + meta_type* metas = nullptr, // (n) + cudaStream_t stream = 0, + bool ignore_evict_strategy = false) { + if (n == 0) { + return; + } + + while (!reach_max_capacity_ && + fast_load_factor(n, stream) > options_.max_load_factor) { + reserve(capacity() * 2, stream); + } + + if (!ignore_evict_strategy) { + check_evict_strategy(metas); + } + + writer_shared_lock lock(mutex_); + + using Selector = + SelectFindOrInsertPtrKernel; + static thread_local int step_counter = 0; + static thread_local float load_factor = 0.0; + + if (((step_counter++) % kernel_select_interval_) == 0) { + load_factor = fast_load_factor(0, stream, false); + } + Selector::execute_kernel(load_factor, options_.block_size, + options_.max_bucket_size, table_->buckets_num, + options_.dim, stream, n, d_table_, keys, values, + metas, founds); + + CudaCheckError(); + } + /** + * @brief Assign new key-value-meta tuples into the hash table. + * If the key doesn't exist, the operation on the key will be ignored. + * + * @param n Number of key-value-meta tuples to insert or assign. + * @param keys The keys to insert on GPU-accessible memory with shape + * (n). + * @param values The values to insert on GPU-accessible memory with + * shape (n, DIM). + * @param metas The metas to insert on GPU-accessible memory with shape + * (n). + * @parblock + * The metas should be a `uint64_t` value. You can specify a value that + * such as the timestamp of the key insertion, number of the key + * occurrences, or another value to perform a custom eviction strategy. + * + * The @p metas should be `nullptr`, when the LRU eviction strategy is + * applied. + * @endparblock + * + * @param stream The CUDA stream that is used to execute the operation. 
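+ *
+ * For example (a sketch; only keys that already exist in the table are
+ * updated, unknown keys are silently skipped):
+ *
+ * ```
+ * table.assign(n, d_keys, d_new_values, /*metas=*/nullptr, stream);
+ * ```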
+ */ + void assign(const size_type n, + const key_type* keys, // (n) + const value_type* values, // (n, DIM) + const meta_type* metas = nullptr, // (n) + cudaStream_t stream = 0) { + if (n == 0) { + return; + } + + writer_shared_lock lock(mutex_); + + if (is_fast_mode()) { + using Selector = + SelectUpdateKernelWithIO; + static thread_local int step_counter = 0; + static thread_local float load_factor = 0.0; + + if (((step_counter++) % kernel_select_interval_) == 0) { + load_factor = fast_load_factor(0, stream, false); + } + + Selector::execute_kernel(load_factor, options_.block_size, + options_.max_bucket_size, table_->buckets_num, + options_.dim, stream, n, d_table_, keys, values, + metas); + } else { + const size_type dev_ws_size{n * (sizeof(value_type*) + sizeof(int))}; + auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; + auto d_dst{dev_ws.get(0)}; + auto d_src_offset{reinterpret_cast(d_dst + n)}; + + CUDA_CHECK(cudaMemsetAsync(d_dst, 0, dev_ws_size, stream)); + + { + const size_t block_size = options_.block_size; + const size_t N = n * TILE_SIZE; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + update_kernel + <<>>( + d_table_, options_.max_bucket_size, table_->buckets_num, + options_.dim, keys, d_dst, metas, d_src_offset, N); + } + + { + thrust::device_ptr d_dst_ptr( + reinterpret_cast(d_dst)); + thrust::device_ptr d_src_offset_ptr(d_src_offset); + + thrust::sort_by_key(thrust_par.on(stream), d_dst_ptr, d_dst_ptr + n, + d_src_offset_ptr, thrust::less()); + } + + if (options_.io_by_cpu) { + const size_type host_ws_size{dev_ws_size + + n * sizeof(value_type) * dim()}; + auto host_ws{host_mem_pool_->get_workspace<1>(host_ws_size, stream)}; + auto h_dst{host_ws.get(0)}; + auto h_src_offset{reinterpret_cast(h_dst + n)}; + auto h_values{reinterpret_cast(h_src_offset + n)}; + + CUDA_CHECK(cudaMemcpyAsync(h_dst, d_dst, dev_ws_size, + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(h_values, values, host_ws_size - dev_ws_size, + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + write_by_cpu(h_dst, h_values, h_src_offset, dim(), n); + } else { + const size_t block_size = options_.io_block_size; + const size_t N = n * dim(); + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + write_kernel + <<>>(values, d_dst, d_src_offset, + dim(), N); + } + } + + CudaCheckError(); + } + + /** + * @brief Searches the hash table for the specified keys. + * + * @note When a key is missing, the value in @p values is not changed. + * + * @param n The number of key-value-meta tuples to search. + * @param keys The keys to search on GPU-accessible memory with shape (n). + * @param values The values to search on GPU-accessible memory with + * shape (n, DIM). + * @param founds The status that indicates if the keys are found on + * GPU-accessible memory with shape (n). + * @param metas The metas to search on GPU-accessible memory with shape (n). + * @parblock + * If @p metas is `nullptr`, the meta for each key will not be returned. + * @endparblock + * @param stream The CUDA stream that is used to execute the operation. 
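+ *
+ * A lookup sketch (`d_founds` is a caller-allocated device array of `bool`
+ * with shape (n); values of missing keys are left untouched):
+ *
+ * ```
+ * table.find(n, d_keys, d_values, d_founds, /*metas=*/nullptr, stream);
+ * CUDA_CHECK(cudaStreamSynchronize(stream));
+ * ```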
+ * + */ + void find(const size_type n, const key_type* keys, // (n) + value_type* values, // (n, DIM) + bool* founds, // (n) + meta_type* metas = nullptr, // (n) + cudaStream_t stream = 0) const { + if (n == 0) { + return; + } + + CUDA_CHECK(cudaMemsetAsync(founds, 0, n * sizeof(bool), stream)); + + reader_shared_lock lock(mutex_); + + if (is_fast_mode()) { + using Selector = + SelectLookupKernelWithIO; + static thread_local int step_counter = 0; + static thread_local float load_factor = 0.0; + + if (((step_counter++) % kernel_select_interval_) == 0) { + load_factor = fast_load_factor(0, stream, false); + } + Selector::execute_kernel(load_factor, options_.block_size, + options_.max_bucket_size, table_->buckets_num, + options_.dim, stream, n, d_table_, keys, values, + metas, founds); + } else { + const size_type dev_ws_size{n * (sizeof(value_type*) + sizeof(int))}; + auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; + auto src{dev_ws.get(0)}; + auto dst_offset{reinterpret_cast(src + n)}; + + CUDA_CHECK(cudaMemsetAsync(src, 0, dev_ws_size, stream)); + + { + const size_t block_size = options_.block_size; + const size_t N = n * TILE_SIZE; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + lookup_kernel + <<>>( + d_table_, options_.max_bucket_size, table_->buckets_num, + options_.dim, keys, src, metas, founds, dst_offset, N); + } + + { + thrust::device_ptr src_ptr( + reinterpret_cast(src)); + thrust::device_ptr dst_offset_ptr(dst_offset); + + thrust::sort_by_key(thrust_par.on(stream), src_ptr, src_ptr + n, + dst_offset_ptr, thrust::less()); + } + + { + const size_t block_size = options_.io_block_size; + const size_t N = n * dim(); + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + read_kernel + <<>>(src, values, founds, + dst_offset, dim(), N); + } + } + + CudaCheckError(); + } + + /** + * @brief Searches the hash table for the specified keys and returns address + * of the values. + * + * @note When a key is missing, the data in @p values won't change. + * @warning This API returns internal addresses for high-performance but + * thread-unsafe. The caller is responsible for guaranteeing data consistency. + * + * @param n The number of key-value-meta tuples to search. + * @param keys The keys to search on GPU-accessible memory with shape (n). + * @param values The addresses of values to search on GPU-accessible memory + * with shape (n). + * @param founds The status that indicates if the keys are found on + * GPU-accessible memory with shape (n). + * @param metas The metas to search on GPU-accessible memory with shape (n). + * @parblock + * If @p metas is `nullptr`, the meta for each key will not be returned. + * @endparblock + * @param stream The CUDA stream that is used to execute the operation. 
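+ *
+ * A sketch of this address-returning overload (here `d_value_ptrs` is a
+ * device array of `value_type*` with shape (n); the returned addresses point
+ * into the table's own storage, so treat them as read-only snapshots unless
+ * you synchronize writers yourself):
+ *
+ * ```
+ * table.find(n, d_keys, d_value_ptrs, d_founds, /*metas=*/nullptr, stream);
+ * ```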
+ * + */ + void find(const size_type n, const key_type* keys, // (n) + value_type** values, // (n) + bool* founds, // (n) + meta_type* metas = nullptr, // (n) + cudaStream_t stream = 0) const { + if (n == 0) { + return; + } + + CUDA_CHECK(cudaMemsetAsync(founds, 0, n * sizeof(bool), stream)); + + reader_shared_lock lock(mutex_); + + using Selector = SelectLookupPtrKernel; + static thread_local int step_counter = 0; + static thread_local float load_factor = 0.0; + + if (((step_counter++) % kernel_select_interval_) == 0) { + load_factor = fast_load_factor(0, stream, false); + } + Selector::execute_kernel(load_factor, options_.block_size, + options_.max_bucket_size, table_->buckets_num, + options_.dim, stream, n, d_table_, keys, values, + metas, founds); + + CudaCheckError(); + } + + /** + * @brief Removes specified elements from the hash table. + * + * @param n The number of keys to remove. + * @param keys The keys to remove on GPU-accessible memory. + * @param stream The CUDA stream that is used to execute the operation. + * + */ + void erase(const size_type n, const key_type* keys, cudaStream_t stream = 0) { + if (n == 0) { + return; + } + + write_read_lock lock(mutex_); + + { + const size_t block_size = options_.block_size; + const size_t N = n * TILE_SIZE; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + remove_kernel + <<>>( + table_, keys, table_->buckets, table_->buckets_size, + table_->bucket_max_size, table_->buckets_num, N); + } + + CudaCheckError(); + return; + } + + /** + * @brief Erases all elements that satisfy the predicate @p pred from the + * hash table. + * + * The value for @p pred should be a function with type `Pred` defined like + * the following example: + * + * ``` + * template + * __forceinline__ __device__ bool erase_if_pred(const K& key, + * const M& meta, + * const K& pattern, + * const M& threshold) { + * return ((key & 0x1 == pattern) && (meta < threshold)); + * } + * ``` + * + * @param pred The predicate function with type Pred that returns `true` if + * the element should be erased. + * @param pattern The third user-defined argument to @p pred with key_type + * type. + * @param threshold The fourth user-defined argument to @p pred with meta_type + * type. + * @param stream The CUDA stream that is used to execute the operation. + * + * @return The number of elements removed. + * + */ + size_type erase_if(const Pred& pred, const key_type& pattern, + const meta_type& threshold, cudaStream_t stream = 0) { + write_read_lock lock(mutex_); + + auto dev_ws{dev_mem_pool_->get_workspace<1>(sizeof(size_type), stream)}; + auto d_count{dev_ws.get(0)}; + + CUDA_CHECK(cudaMemsetAsync(d_count, 0, sizeof(size_type), stream)); + + Pred h_pred; + CUDA_CHECK(cudaMemcpyFromSymbolAsync(&h_pred, pred, sizeof(Pred), 0, + cudaMemcpyDeviceToHost, stream)); + + { + const size_t block_size = options_.block_size; + const size_t N = table_->buckets_num; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + remove_kernel + <<>>( + table_, h_pred, pattern, threshold, d_count, table_->buckets, + table_->buckets_size, table_->bucket_max_size, + table_->buckets_num, N); + } + + size_type count = 0; + CUDA_CHECK(cudaMemcpyAsync(&count, d_count, sizeof(size_type), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + CudaCheckError(); + return count; + } + + /** + * @brief Removes all of the elements in the hash table with no release + * object. 
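+ *
+ * For comparison, the three removal paths sketched together. `d_keys`,
+ * `pattern`, `threshold` and `stream` are illustrative caller-side names,
+ * and `g_erase_pred` is assumed to be a `__device__` symbol of type `Pred`
+ * pointing at a predicate such as `erase_if_pred` above:
+ *
+ * ```
+ * table.erase(n, d_keys, stream);                            // listed keys
+ * table.erase_if(g_erase_pred, pattern, threshold, stream);  // conditional
+ * table.clear(stream);                                       // everything
+ * CUDA_CHECK(cudaStreamSynchronize(stream));
+ * ```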
+ */ + void clear(cudaStream_t stream = 0) { + write_read_lock lock(mutex_); + + const size_t block_size = options_.block_size; + const size_t N = table_->buckets_num * table_->bucket_max_size; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + clear_kernel + <<>>(table_, N); + + CudaCheckError(); + } + + public: + /** + * @brief Exports a certain number of the key-value-meta tuples from the + * hash table. + * + * @param n The maximum number of exported pairs. + * @param offset The position of the key to remove. + * @param counter Accumulates amount of successfully exported values. + * @param keys The keys to dump from GPU-accessible memory with shape (n). + * @param values The values to dump from GPU-accessible memory with shape + * (n, DIM). + * @param metas The metas to search on GPU-accessible memory with shape (n). + * @parblock + * If @p metas is `nullptr`, the meta for each key will not be returned. + * @endparblock + * + * @param stream The CUDA stream that is used to execute the operation. + * + * @return The number of elements dumped. + * + * @throw CudaException If the key-value size is too large for GPU shared + * memory. Reducing the value for @p n is currently required if this exception + * occurs. + */ + void export_batch(size_type n, const size_type offset, + size_type* counter, // (1) + key_type* keys, // (n) + value_type* values, // (n, DIM) + meta_type* metas = nullptr, // (n) + cudaStream_t stream = 0) const { + reader_shared_lock lock(mutex_); + + if (offset >= table_->capacity) { + return; + } + n = std::min(table_->capacity - offset, n); + + size_type shared_size; + size_type block_size; + std::tie(shared_size, block_size) = + dump_kernel_shared_memory_size(shared_mem_size_); + + const size_t grid_size = SAFE_GET_GRID_SIZE(n, block_size); + + dump_kernel + <<>>( + table_, keys, values, metas, offset, n, counter); + + CudaCheckError(); + } + + size_type export_batch(const size_type n, const size_type offset, + key_type* keys, // (n) + value_type* values, // (n, DIM) + meta_type* metas = nullptr, // (n) + cudaStream_t stream = 0) const { + auto dev_ws{dev_mem_pool_->get_workspace<1>(sizeof(size_type), stream)}; + auto d_counter{dev_ws.get(0)}; + + CUDA_CHECK(cudaMemsetAsync(d_counter, 0, sizeof(size_type), stream)); + export_batch(n, offset, d_counter, keys, values, metas, stream); + + size_type counter = 0; + CUDA_CHECK(cudaMemcpyAsync(&counter, d_counter, sizeof(size_type), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + return counter; + } + + /** + * @brief Exports a certain number of the key-value-meta tuples which match + * specified condition from the hash table. + * + * @param n The maximum number of exported pairs. + * The value for @p pred should be a function with type `Pred` defined like + * the following example: + * + * ``` + * template + * __forceinline__ __device__ bool export_if_pred(const K& key, + * M& meta, + * const K& pattern, + * const M& threshold) { + * + * return meta > threshold; + * } + * ``` + * + * @param pred The predicate function with type Pred that returns `true` if + * the element should be exported. + * @param pattern The third user-defined argument to @p pred with key_type + * type. + * @param threshold The fourth user-defined argument to @p pred with meta_type + * type. + * @param offset The position of the key to remove. + * @param keys The keys to dump from GPU-accessible memory with shape (n). 
+ * @param values The values to dump from GPU-accessible memory with shape
+ * (n, DIM).
+ * @param metas The metas to dump from GPU-accessible memory with shape (n).
+ * @parblock
+ * If @p metas is `nullptr`, the meta for each key will not be returned.
+ * @endparblock
+ *
+ * @param stream The CUDA stream that is used to execute the operation.
+ *
+ * @return The number of elements dumped.
+ *
+ * @throw CudaException If the key-value size is too large for GPU shared
+ * memory. Reducing the value for @p n is currently required if this exception
+ * occurs.
+ */
+ template
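+
+ /*
+ * Usage sketch for the export APIs above. `total_capacity`, `d_keys`,
+ * `d_values` and `stream` are illustrative caller-side names; the buffers
+ * must hold at least n keys and n * dim() values per call:
+ *
+ * ```
+ * for (size_t offset = 0; offset < total_capacity; offset += n) {
+ *   size_t dumped = table.export_batch(n, offset, d_keys, d_values,
+ *                                      nullptr, stream);
+ *   // The counter-returning overload synchronizes the stream before
+ *   // returning, so the first `dumped` entries are ready to consume here.
+ * }
+ * ```
+ */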