Skip to content

Commit b02d738

Browse files
thorneliurhdong
authored and committed
add HMACCUM redis module and enable BPV2 in redis impl
1 parent c2e897d commit b02d738

File tree

14 files changed

+2365
-97
lines changed

14 files changed

+2365
-97
lines changed

tensorflow_recommenders_addons/dynamic_embedding/core/kernels/redis_impl/redis_cluster_connection_pool.hpp

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1244,6 +1244,107 @@ every bucket has its own BucketContext for sending data---for locating reply-
12441244
return Status::OK();
12451245
}
12461246

1247+
virtual Status MaccumCommand(
1248+
const Tensor &keys, const Tensor &values_or_delta, const Tensor &exists,
1249+
ThreadContext *thread_context, const int64 begin, const int64 max_i,
1250+
const int64 Velems_per_dim0,
1251+
const std::vector<std::string> &keys_prefix_name_slices) override {
1252+
const int &&total = max_i - begin;
1253+
const int &&argc = total * 2 + 4;
1254+
1255+
const static char *redis_command = "HMACCUM";
1256+
const static std::size_t &&redis_command_byte = 7;
1257+
std::string dTypestr = DataTypeString(values_or_delta.dtype());
1258+
size_t dTypeStrsize = dTypestr.size();
1259+
1260+
const K *const pk_raw_end =
1261+
reinterpret_cast<const K *>(keys.tensor_data().data()) + max_i;
1262+
const K *pk_raw =
1263+
reinterpret_cast<const K *>(keys.tensor_data().data()) + begin;
1264+
1265+
const std::size_t &&V_byte_size = Velems_per_dim0 * sizeof(V);
1266+
1267+
const V *pv_raw =
1268+
reinterpret_cast<const V *>(values_or_delta.tensor_data().data()) +
1269+
begin * Velems_per_dim0;
1270+
1271+
const unsigned &storage_slice = redis_connection_params.storage_slice;
1272+
const unsigned &&vector_len =
1273+
(static_cast<int64>(reinterpret_cast<int>(argc)) /
1274+
redis_connection_params.storage_slice) +
1275+
4;
1276+
1277+
thread_context->HandleReserve(storage_slice, vector_len, total);
1278+
1279+
for (unsigned i = 0; i < storage_slice; ++i) {
1280+
thread_context->HandlePushBack(i, redis_command, redis_command_byte);
1281+
thread_context->HandlePushBack(i, keys_prefix_name_slices[i].data(),
1282+
keys_prefix_name_slices[i].size());
1283+
thread_context->HandlePushBack(i, dTypestr.c_str(), dTypeStrsize);
1284+
}
1285+
1286+
VContentAndTypeSizeResult VCATS_temp;
1287+
// std::vector<char> for storage all string in one KV pair
1288+
std::vector<std::vector<char>> buff_temp(total);
1289+
unsigned key_bucket_locs = 0;
1290+
for (int i = 0; pk_raw != pk_raw_end;
1291+
++i, ++pk_raw, pv_raw += Velems_per_dim0) {
1292+
VCATS_temp = VContentAndTypeSize<V>(VCATS_temp, Velems_per_dim0,
1293+
V_byte_size, pv_raw, buff_temp[i]);
1294+
key_bucket_locs =
1295+
KBucketNum<K>(pk_raw, storage_slice); // TODO: change it to AVX512
1296+
1297+
// Direct access to Tensor data in TensorFlow
1298+
thread_context->HandlePushBack(
1299+
key_bucket_locs, KContentPointer<K>(pk_raw), KTypeSize<K>(pk_raw));
1300+
thread_context->HandlePushBack(
1301+
key_bucket_locs, VCATS_temp.VContentPointer, VCATS_temp.VTypeSize);
1302+
}
1303+
1304+
const bool *pe_raw =
1305+
reinterpret_cast<const bool *>(exists.tensor_data().data()) + begin;
1306+
for (unsigned i = 0; i < storage_slice; ++i) {
1307+
thread_context->HandlePushBack(i, KContentPointer<bool>(pe_raw),
1308+
total * KTypeSize<bool>(pe_raw));
1309+
}
1310+
1311+
auto cmd = [](::sw::redis::Connection &connection,
1312+
const ::sw::redis::StringView &hkey,
1313+
const std::vector<const char *> *ptrs_i,
1314+
const std::vector<std::size_t> *sizes_i) {
1315+
assert(strcmp(ptrs_i->front(), "HMACCUM") == 0);
1316+
assert(sizes_i->front() == redis_command_byte);
1317+
assert(std::string(hkey.data()).compare(ptrs_i[1]) == 0);
1318+
1319+
connection.send(static_cast<int>(ptrs_i->size()),
1320+
const_cast<const char **>(ptrs_i->data()),
1321+
sizes_i->data());
1322+
};
1323+
1324+
std::vector<
1325+
std::future<std::unique_ptr<redisReply, ::sw::redis::ReplyDeleter>>>
1326+
results;
1327+
try {
1328+
for (unsigned i = 0; i < storage_slice; ++i) {
1329+
results.emplace_back(
1330+
network_worker_pool->enqueue([this, &cmd, &thread_context, i] {
1331+
return PipeExecWrite(cmd, 6U, thread_context->buckets[i]);
1332+
}));
1333+
}
1334+
for (auto &&result : results) {
1335+
result.wait();
1336+
}
1337+
if (error_ptr) {
1338+
std::rethrow_exception(error_ptr);
1339+
}
1340+
} catch (const std::exception &err) {
1341+
error_ptr = nullptr;
1342+
return errors::Unknown(err.what());
1343+
}
1344+
1345+
return Status::OK();
1346+
}
1347+
12471348
virtual Status DelCommand(
12481349
const Tensor &keys, ThreadContext *thread_context, const int64 begin,
12491350
const int64 max_i,

tensorflow_recommenders_addons/dynamic_embedding/core/kernels/redis_impl/redis_connection_pool.hpp

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1051,6 +1051,95 @@ every bucket has its own BucketContext for sending data---for locating reply-
10511051
return Status::OK();
10521052
}
10531053

1054+
  // Send one HMACCUM (accumulate) batch to a single (non-cluster) Redis.
  // Builds a single command in bucket 0 of the thread context:
  //   HMACCUM <bucket_key_name> <dtype_str> k1 v1 k2 v2 ... <exists_block>
  // for keys in the half-open range [begin, max_i).
  //
  // keys            -- tensor of K keys (read via raw tensor_data()).
  // values_or_delta -- tensor of V deltas, Velems_per_dim0 elements per key.
  // exists          -- bool tensor flagging which keys already exist.
  // Returns Status::OK() or errors::Unknown wrapping any redis++ exception.
  virtual Status MaccumCommand(
      const Tensor &keys, const Tensor &values_or_delta, const Tensor &exists,
      ThreadContext *thread_context, const int64 begin, const int64 max_i,
      const int64 Velems_per_dim0,
      const std::vector<std::string> &keys_prefix_name_slices) override {
    const int &&total = max_i - begin;
    // argv count: 2 per key (key + value) plus command, bucket name,
    // dtype string and the trailing exists block.
    const int &&argc = total * 2 + 4;

    const static char *redis_command = "HMACCUM";
    const static std::size_t redis_command_byte = 7;
    std::string dTypestr = DataTypeString(values_or_delta.dtype());

    // Single logical bucket (index 0) sized for argc arguments.
    // NOTE(review): third argument is 0 here, while the cluster variant
    // passes `total` -- confirm HandleReserve's third parameter is unused
    // on this path.
    thread_context->HandleReserve(1U, argc, 0);

    // argv pointer/length arrays that connection.send() will consume.
    std::vector<const char *> *ptrs_0 = thread_context->buckets[0]->ptrs.get();
    std::vector<std::size_t> *sizes_0 = thread_context->buckets[0]->sizes.get();

    const K *const pk_raw_end =
        reinterpret_cast<const K *>(keys.tensor_data().data()) + max_i;
    const K *pk_raw =
        reinterpret_cast<const K *>(keys.tensor_data().data()) + begin;

    const std::size_t &&V_byte_size = Velems_per_dim0 * sizeof(V);

    const V *pv_raw =
        reinterpret_cast<const V *>(values_or_delta.tensor_data().data()) +
        begin * Velems_per_dim0;

    // Fixed header: command name, bucket key name, value dtype string.
    auto ptrs_iter = ptrs_0->begin();
    *ptrs_iter = redis_command;
    ++ptrs_iter;
    *ptrs_iter = keys_prefix_name_slices[0].data();
    ++ptrs_iter;
    *ptrs_iter = dTypestr.c_str();
    ++ptrs_iter;

    // Matching lengths for the three header arguments.
    auto sizes_iter = sizes_0->begin();
    *sizes_iter = redis_command_byte;
    ++sizes_iter;
    *sizes_iter = keys_prefix_name_slices[0].size();
    ++sizes_iter;
    *sizes_iter = dTypestr.size();
    ++sizes_iter;

    VContentAndTypeSizeResult VCATS_temp;
    // std::vector<char> for storage all string in one KV pair
    std::vector<std::vector<char>> buff_temp(total);

    // Interleave key/value pointers and sizes: k1 v1 k2 v2 ...
    for (int i = 0; pk_raw != pk_raw_end;
         ++i, ++pk_raw, pv_raw += Velems_per_dim0) {
      VCATS_temp = VContentAndTypeSize<V>(VCATS_temp, Velems_per_dim0,
                                          V_byte_size, pv_raw, buff_temp[i]);

      *ptrs_iter = KContentPointer<K>(
          pk_raw);  // Direct access to Tensor data in TensorFlow
      *(++ptrs_iter) = VCATS_temp.VContentPointer;
      ++ptrs_iter;

      *sizes_iter = KTypeSize<K>(pk_raw);  // key data char size
      *(++sizes_iter) = VCATS_temp.VTypeSize;
      ++sizes_iter;
    }

    // Final argument: the raw exists-bool block for the whole range.
    const bool *pe_raw =
        reinterpret_cast<const bool *>(exists.tensor_data().data()) + begin;
    *ptrs_iter = KContentPointer<bool>(pe_raw);
    *sizes_iter = total * KTypeSize<bool>(pe_raw);

    // Sanity-check that the header slots were not overwritten.
    assert(ptrs_0->front() == redis_command);
    assert(sizes_0->front() == redis_command_byte);

    // Raw variadic send; redis++ handles reply/error translation.
    auto cmd = [](::sw::redis::Connection &connection, const int argc,
                  const std::vector<const char *> *ptrs_0,
                  const std::vector<std::size_t> *sizes_0) {
      connection.send(argc, const_cast<const char **>(ptrs_0->data()),
                      sizes_0->data());
    };

    try {
      redis_conn_write->command(cmd, argc, ptrs_0, sizes_0);
    } catch (const std::exception &err) {
      LOG(ERROR) << "RedisHandler error in MACCUM_COMMAND for HMACCUM "
                 << keys_prefix_name_slices[0] << " -- " << err.what();
      return errors::Unknown(err.what());
    }

    return Status::OK();
  }
1142+
10541143
virtual Status DelCommand(
10551144
const Tensor &keys, ThreadContext *thread_context, const int64 begin,
10561145
const int64 max_i,

tensorflow_recommenders_addons/dynamic_embedding/core/kernels/redis_impl/redis_connection_util.hpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -430,6 +430,12 @@ class RedisVirtualWrapper {
430430
const int64 begin, const int64 max_i, const int64 Velems_per_dim0,
431431
const std::vector<std::string> &keys_prefix_name_slices) = 0;
432432

433+
  // Send one HMACCUM (accumulate) batch for keys in [begin, max_i):
  // adds `values` (deltas) onto the stored embeddings, with `exists`
  // flagging keys already present. Implemented by both the cluster and
  // the single-node connection pools.
  virtual Status MaccumCommand(
      const Tensor &keys, const Tensor &values, const Tensor &exists,
      ThreadContext *thread_context, const int64 begin, const int64 max_i,
      const int64 Velems_per_dim0,
      const std::vector<std::string> &keys_prefix_name_slices) = 0;
438+
433439
virtual Status DelCommand(
434440
const Tensor &keys, ThreadContext *thread_context, const int64 begin,
435441
const int64 max_i,

tensorflow_recommenders_addons/dynamic_embedding/core/kernels/redis_impl/redis_table_op_util.hpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,26 @@ Status launchInsertCore(std::shared_ptr<RedisVirtualWrapper> _table_instance,
132132
return statu;
133133
}
134134

135+
// Run one HMACCUM batch: reserve an idle ThreadContext from the shared
// pool, forward the batch to the Redis wrapper, then release the context.
// Mirrors launchInsertCore.
Status launchAccumCore(std::shared_ptr<RedisVirtualWrapper> _table_instance,
                       std::vector<std::string> &keys_prefix_name_slices,
                       const Tensor &keys, const Tensor &values_or_delta,
                       const Tensor &exists, const int64 &Velems_per_flat2_dim0,
                       std::vector<ThreadContext *> &threads_Insert,
                       std::mutex &threads_Accum_mutex, const int64 begin,
                       const int64 end) {
  // Marks the chosen context occupied under the mutex.
  const size_t ctx_id =
      SelectAvailableThreadContext(threads_Insert, threads_Accum_mutex);
  ThreadContext *const ctx = threads_Insert.at(ctx_id);

  // Dispatch to the cluster or single-node implementation.
  Status maccum_status = _table_instance->MaccumCommand(
      keys, values_or_delta, exists, ctx, begin, end, Velems_per_flat2_dim0,
      keys_prefix_name_slices);

  // Release the context regardless of the command's outcome.
  ctx->thread_occupied.store(false, std::memory_order_release);

  return maccum_status;
}
154+
135155
Status launchDeleteCore(std::shared_ptr<RedisVirtualWrapper> _table_instance,
136156
std::vector<std::string> &keys_prefix_name_slices,
137157
const Tensor &keys,

tensorflow_recommenders_addons/dynamic_embedding/core/kernels/redis_table_op.cc

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ class RedisTableOfTensors final : public LookupInterface {
7878
std::vector<ThreadContext *> threads_Delete;
7979
std::mutex threads_Find_mutex;
8080
std::mutex threads_Insert_mutex;
81+
std::mutex threads_Accum_mutex;
8182
std::mutex threads_Delete_mutex;
8283

8384
std::vector<aiocb> IMPORT_content;
@@ -211,6 +212,42 @@ class RedisTableOfTensors final : public LookupInterface {
211212
threads_Insert_mutex, 0, total));
212213
}
213214

215+
void launchAccum_parallel(OpKernelContext *ctx,
216+
std::vector<std::string> &keys_prefix_name_slices,
217+
const Tensor &keys, const Tensor &values_or_delta,
218+
const Tensor &exists, const int64 &total,
219+
const int64 &Velems_per_flat2_dim0,
220+
std::vector<ThreadContext *> &threads_Insert) {
221+
const int64 max_parallelism = (total / multi_redis_cmd_max_argc) + 1;
222+
223+
auto shard = [this, &ctx, &total, &keys_prefix_name_slices, &keys,
224+
&values_or_delta, &exists, &Velems_per_flat2_dim0,
225+
&threads_Insert](int64 begin, int64 end) {
226+
const int64 max_i = std::min(total, end);
227+
228+
OP_REQUIRES_OK(
229+
ctx,
230+
launchAccumCore(_table_instance, keys_prefix_name_slices, keys,
231+
values_or_delta, exists, Velems_per_flat2_dim0,
232+
threads_Insert, threads_Accum_mutex, begin, max_i));
233+
};
234+
int64 slices_size = std::min(total, multi_redis_cmd_max_argc - 1);
235+
auto &worker_threads = *ctx->device()->tensorflow_cpu_worker_threads();
236+
Shard(max_parallelism, worker_threads.workers, total, slices_size, shard);
237+
}
238+
239+
  // Serial accumulate path for small batches (argc fits in one Redis
  // command): sends the whole [0, total) range in a single launchAccumCore
  // call. Reuses the threads_Insert context pool and its mutex.
  void launchAccum(OpKernelContext *ctx,
                   std::vector<std::string> &keys_prefix_name_slices,
                   const Tensor &keys, const Tensor &values_or_delta,
                   const Tensor &exists, const int64 &total,
                   const int64 &Velems_per_flat2_dim0,
                   std::vector<ThreadContext *> &threads_Insert) {
    OP_REQUIRES_OK(
        ctx, launchAccumCore(_table_instance, keys_prefix_name_slices, keys,
                             values_or_delta, exists, Velems_per_flat2_dim0,
                             threads_Insert, threads_Insert_mutex, 0, total));
  }
250+
214251
void launchDelete_parallel(OpKernelContext *ctx,
215252
std::vector<std::string> &keys_prefix_name_slices,
216253
const Tensor &keys, const int64 &total,
@@ -691,11 +728,35 @@ class RedisTableOfTensors final : public LookupInterface {
691728
return Status::OK();
692729
}
693730

731+
Status DoAccum(OpKernelContext *ctx, const Tensor &keys,
732+
const Tensor &values_or_delta, const Tensor &exists) {
733+
int64 total = keys.NumElements();
734+
const int64 Velems_per_flat2_dim0 =
735+
values_or_delta.NumElements() / keys.NumElements();
736+
737+
if (total < (multi_redis_cmd_max_argc - 1)) {
738+
launchAccum(ctx, keys_prefix_name_slices, keys, values_or_delta, exists,
739+
total, Velems_per_flat2_dim0, threads_Insert);
740+
} else {
741+
launchAccum_parallel(
742+
ctx, keys_prefix_name_slices, keys, values_or_delta, exists, total,
743+
Velems_per_flat2_dim0,
744+
threads_Insert); // redis commmand args > multi_redis_cmd_max_argc
745+
}
746+
747+
return Status::OK();
748+
}
749+
694750
Status Insert(OpKernelContext *ctx, const Tensor &keys,
695751
const Tensor &values) override {
696752
return DoInsert(false, ctx, keys, values);
697753
}
698754

755+
  // Public entry point for the accumulate op; thin forwarder to DoAccum.
  // `exists` is a bool tensor marking which keys are already in the table.
  Status Accum(OpKernelContext *ctx, const Tensor &keys,
               const Tensor &values_or_delta, const Tensor &exists) {
    return DoAccum(ctx, keys, values_or_delta, exists);
  }
759+
699760
Status Remove(OpKernelContext *ctx, const Tensor &keys) override {
700761
int64 total = keys.NumElements();
701762
if (total > 0) {
@@ -1129,6 +1190,45 @@ class HashTableInsertOp : public HashTableOpKernel {
11291190
}
11301191
};
11311192

1193+
// Table accum op.
1194+
template <class K, class V>
1195+
class HashTableAccumOp : public HashTableOpKernel {
1196+
public:
1197+
using HashTableOpKernel::HashTableOpKernel;
1198+
1199+
void Compute(OpKernelContext *ctx) override {
1200+
LookupInterface *table;
1201+
OP_REQUIRES_OK(ctx, GetTable(ctx, &table));
1202+
core::ScopedUnref unref_me(table);
1203+
1204+
RedisTableOfTensors<K, V> *redisTable = (RedisTableOfTensors<K, V> *)table;
1205+
1206+
DataTypeVector expected_inputs = {expected_input_0_, table->key_dtype(),
1207+
table->value_dtype(),
1208+
DataTypeToEnum<bool>::v()};
1209+
OP_REQUIRES_OK(ctx, ctx->MatchSignature(expected_inputs, {}));
1210+
1211+
const Tensor &keys = ctx->input(1);
1212+
const Tensor &values_or_deltas = ctx->input(2);
1213+
const Tensor &exists = ctx->input(3);
1214+
OP_REQUIRES(ctx, (values_or_deltas.dtype() != DataTypeToEnum<tstring>::v()),
1215+
errors::InvalidArgument(
1216+
"AccumOP is not supporting tstring value type!"));
1217+
OP_REQUIRES_OK(
1218+
ctx, table->CheckKeyAndValueTensorsForInsert(keys, values_or_deltas));
1219+
1220+
int64 memory_used_before = 0;
1221+
if (ctx->track_allocations()) {
1222+
memory_used_before = table->MemoryUsed();
1223+
}
1224+
OP_REQUIRES_OK(ctx, redisTable->Accum(ctx, keys, values_or_deltas, exists));
1225+
if (ctx->track_allocations()) {
1226+
ctx->record_persistent_memory_allocation(table->MemoryUsed() -
1227+
memory_used_before);
1228+
}
1229+
}
1230+
};
1231+
11321232
// Table remove op.
11331233
class HashTableRemoveOp : public HashTableOpKernel {
11341234
public:
@@ -1275,6 +1375,12 @@ REGISTER_KERNEL_BUILDER(
12751375
.TypeConstraint<key_dtype>("key_dtype") \
12761376
.TypeConstraint<value_dtype>("value_dtype"), \
12771377
redis_table::HashTableClearOp<key_dtype, value_dtype>); \
1378+
REGISTER_KERNEL_BUILDER( \
1379+
Name(PREFIX_OP_NAME(RedisTableAccum)) \
1380+
.Device(DEVICE_CPU) \
1381+
.TypeConstraint<key_dtype>("key_dtype") \
1382+
.TypeConstraint<value_dtype>("value_dtype"), \
1383+
redis_table::HashTableAccumOp<key_dtype, value_dtype>); \
12781384
REGISTER_KERNEL_BUILDER( \
12791385
Name(PREFIX_OP_NAME(RedisTableFindWithExists)) \
12801386
.Device(DEVICE_CPU) \

0 commit comments

Comments
 (0)