[BUG] Value error when using insert_or_assign in 1<<24 capacity

GPU: A100
gcc: 9.2

I my tests, I found that the `insert_or_assign` API will meet value error when the capacity is large. And here is the code to reproduce the error:

```c++
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <array>
#include <iostream>
#include <thread>
#include <vector>
#include <map>
#include <unordered_map>
#include "merlin_hashtable.cuh"
#include "merlin/types.cuh"
#include "test_util.cuh"

using K = uint64_t;
using V = float;
using S = uint64_t;
using i64 = int64_t;
using u64 = uint64_t;
using f32 = float;
using EvictStrategy = nv::merlin::EvictStrategy;
using TableOptions = nv::merlin::HashTableOptions;

template <class K, class S>
struct ExportIfPredFunctor {
  __forceinline__ __device__ bool operator()(const K& key, S& score,
                                             const K& pattern,
                                             const S& threshold) {
    return score < threshold;
  }
};

void test_export_batch_if_with_limited_size() {
  constexpr uint64_t CAP = 1llu << 24;
  size_t n0 = (1llu << 23) - 163;
  size_t n1 = (1llu << 23) + 221;
  size_t n2 = (1llu << 23) - 17;
  size_t dim = 64;
  size_t table_size = 0;
  i64 pattern = 0;
  u64 threshold = 40;

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  TableOptions options;
  options.init_capacity = CAP;
  options.max_capacity = CAP;
  options.dim = dim;
  options.max_hbm_for_vectors = nv::merlin::GB(100);
  using Table = nv::merlin::HashTable<i64, f32, u64, EvictStrategy::kCustomized>;

  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(options);

  size_t* d_cnt = nullptr;
  CUDA_CHECK(cudaMallocAsync(&d_cnt, sizeof(size_t), stream));
  CUDA_CHECK(cudaMemsetAsync(d_cnt, 0, sizeof(size_t), stream));

  test_util::KVMSBuffer<i64, f32, u64> buffer0;
  buffer0.Reserve(n0, dim, stream);
  buffer0.ToRange(0, 1, stream);
  buffer0.Setscore((u64)15, stream);
  {
    test_util::KVMSBuffer<i64, f32, u64> buffer0_ev;
    buffer0_ev.Reserve(n0, dim, stream);
    buffer0_ev.ToZeros(stream);
    table->insert_or_assign(n0, buffer0.keys_ptr(), buffer0.values_ptr(), buffer0.scores_ptr(), stream, true, false);
    //table->insert_and_evict(n0, buffer0.keys_ptr(), buffer0.values_ptr(), buffer0.scores_ptr(),
    //                        buffer0_ev.keys_ptr(), buffer0_ev.values_ptr(), buffer0_ev.scores_ptr(),
    //                        d_cnt, stream, true, false);
    table_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    MERLIN_EXPECT_TRUE(table_size == n0, "Invalid table size.");
  }

  test_util::KVMSBuffer<i64, f32, u64> buffer1;
  buffer1.Reserve(n1, dim, stream);
  buffer1.ToRange(n0, 1, stream);
  buffer1.Setscore((u64)30, stream);
        {
    test_util::KVMSBuffer<i64, f32, u64> buffer1_ev;
    buffer1_ev.Reserve(n0, dim, stream);
    buffer1_ev.ToZeros(stream);
    table->insert_or_assign(n1, buffer1.keys_ptr(), buffer1.values_ptr(), buffer1.scores_ptr(), stream, true, false);
    //table->insert_and_evict(n0, buffer1.keys_ptr(), buffer1.values_ptr(), buffer1.scores_ptr(),
    //                        buffer1_ev.keys_ptr(), buffer1_ev.values_ptr(), buffer1_ev.scores_ptr(),
    //                        d_cnt, stream, true, false);
    table_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
        }

  test_util::KVMSBuffer<i64, f32, u64> buffer2;
  buffer2.Reserve(n2, dim, stream);
  buffer2.ToRange(n0 + n1, 1, stream);
  buffer2.Setscore((u64)45, stream);
        {
    test_util::KVMSBuffer<i64, f32, u64> buffer2_ev;
    buffer2_ev.Reserve(n0, dim, stream);
    buffer2_ev.ToZeros(stream);
    table->insert_or_assign(n2, buffer2.keys_ptr(), buffer2.values_ptr(), buffer2.scores_ptr(), stream, true, false);
    //table->insert_and_evict(n0, buffer2.keys_ptr(), buffer2.values_ptr(), buffer2.scores_ptr(),
    //                        buffer2_ev.keys_ptr(), buffer2_ev.values_ptr(), buffer2_ev.scores_ptr(),
    //                        d_cnt, stream, true, false);
    table_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    printf("final size: %zu, capacity: %zu\n", table_size, table->capacity());
        }

  size_t h_cnt2 = 0;

  test_util::KVMSBuffer<i64, f32, u64> buffer_out;
  buffer_out.Reserve(CAP, dim, stream);
  buffer_out.ToZeros(stream);

  CUDA_CHECK(cudaMemsetAsync(d_cnt, 0, sizeof(size_t), stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));

  bool use_pin = false;

  uint64_t t0 = test_util::getTimestamp();
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start);
  table->export_batch_if<ExportIfPredFunctor>(pattern, threshold,
                                              static_cast<size_t>(CAP), 0,
                                              d_cnt, buffer_out.keys_ptr(!use_pin),
                                              buffer_out.values_ptr(!use_pin),
                                              buffer_out.scores_ptr(!use_pin),
                                              stream);
  cudaEventRecord(stop);
  CUDA_CHECK(cudaMemcpyAsync(&h_cnt2, d_cnt, sizeof(size_t), cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));
  printf("final h_cnt2: %zu\n", h_cnt2);

  float cu_cost = 0;
  cudaEventElapsedTime(&cu_cost, start, stop);
  uint64_t t1 = test_util::getTimestamp();
  printf("final h_cnt2: %zu, cost: %zu, cu_cost: %f\n", h_cnt2, t1 - t0, cu_cost);

  if (!use_pin) {
    buffer_out.SyncData(false, stream);
  }
  CUDA_CHECK(cudaStreamSynchronize(stream));

  std::unordered_map<i64, u64> record;
  for (size_t i = 0; i < h_cnt2; i++) {
    i64 key = buffer_out.keys_ptr(false)[i];
    u64 score = buffer_out.scores_ptr(false)[i];
    MERLIN_EXPECT_TRUE(score < threshold, "");
    record[key] = score;
    for (int j = 0; j < dim; j++) {
      f32 value = buffer_out.values_ptr(false)[i * dim + j];
      //MERLIN_EXPECT_TRUE(key == static_cast<i64>(value), "");
      if (key != static_cast<i64>(value) && j == 0) {
        printf("----> check key(%lld) != value(%f) at i: %zu, j: %d\n", key, value, i, j);
      }
    }
  }
  MERLIN_EXPECT_TRUE(record.size() == h_cnt2, "");
  printf("record: %zu\n", record.size());
  printf("n0+n1: %zu\n", n0+n1);
  printf("n0+n1+n2: %zu\n", n0+n1+n2);
  printf("done\n");
}


int main() {
  test_export_batch_if_with_limited_size();
  return 0;
}
```

In the test cast, it expects value equals to key for every vec. But it will get invalid outputs in stdout.

```bash
final size: 16777216, capacity: 16777216
final h_cnt2: 8388625
final h_cnt2: 8388625, cost: 24, cu_cost: 24.695711
----> check key(16777235) != value(16777236.000000) at i: 376767, j: 0
----> check key(16777259) != value(16777260.000000) at i: 387306, j: 0
----> check key(16777241) != value(16777240.000000) at i: 490218, j: 0
----> check key(16777267) != value(16777268.000000) at i: 647383, j: 0
----> check key(16777239) != value(16777240.000000) at i: 1051866, j: 0
----> check key(16777243) != value(16777244.000000) at i: 1094834, j: 0
----> check key(16777251) != value(16777252.000000) at i: 1239658, j: 0
----> check key(16777223) != value(16777224.000000) at i: 1326925, j: 0
----> check key(16777273) != value(16777272.000000) at i: 1553255, j: 0
----> check key(16777245) != value(16777244.000000) at i: 2216367, j: 0
----> check key(16777229) != value(16777228.000000) at i: 3559115, j: 0
----> check key(16777271) != value(16777272.000000) at i: 4042540, j: 0
----> check key(16777237) != value(16777236.000000) at i: 4183398, j: 0
----> check key(16777233) != value(16777232.000000) at i: 4776781, j: 0
----> check key(16777225) != value(16777224.000000) at i: 5460733, j: 0
----> check key(16777227) != value(16777228.000000) at i: 5616921, j: 0
----> check key(16777221) != value(16777220.000000) at i: 6148795, j: 0
----> check key(16777261) != value(16777260.000000) at i: 6203695, j: 0
----> check key(16777249) != value(16777248.000000) at i: 6227725, j: 0
----> check key(16777255) != value(16777256.000000) at i: 6274915, j: 0
----> check key(16777257) != value(16777256.000000) at i: 6398638, j: 0
----> check key(16777217) != value(16777216.000000) at i: 6431965, j: 0
----> check key(16777263) != value(16777264.000000) at i: 6547061, j: 0
----> check key(16777247) != value(16777248.000000) at i: 6741382, j: 0
----> check key(16777231) != value(16777232.000000) at i: 7109209, j: 0
----> check key(16777253) != value(16777252.000000) at i: 7387730, j: 0
----> check key(16777269) != value(16777268.000000) at i: 8151589, j: 0
----> check key(16777219) != value(16777220.000000) at i: 8273935, j: 0
record: 8388625
n0+n1: 16777274
n0+n1+n2: 25165865
```

The error will not occur if using `insert_and_evict` or the capacity is 1<<23 or smaller.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[BUG] Value error when using insert_or_assign in 1<<24 capacity #203

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

[BUG] Value error when using insert_or_assign in 1<<24 capacity #203

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions