Skip to content

Commit 96fc416

Browse files
authored
[Feat] Try to be compatible with the Keras 3 optimizer design and support CUDNN 9.0+. (#392)
* [fix] self.params may not have a saveable attribute. * [feat] Update the config file for the new CUDNN release. Now supports CUDNN 9.0+. * [fix] Suppress nodiscard warnings and unused warnings by adding a new LOG_IF_ERROR macro and using the proper API function. * [feat] Compatible with the Keras 3 optimizer style.
1 parent ba467f4 commit 96fc416

File tree

7 files changed

+57
-42
lines changed

7 files changed

+57
-42
lines changed

build_deps/toolchains/gpu/cuda_configure.bzl

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -960,19 +960,28 @@ def _create_local_cuda_repository(repository_ctx):
960960
# Copy cudnn.h if cuDNN was not installed to CUDA_TOOLKIT_PATH.
961961
included_files = _read_dir(repository_ctx, cuda_include_path)
962962
if not any([file.endswith("cudnn.h") for file in included_files]):
963-
if [int(x) for x in cuda_config.cudnn_version.split(".")] < [8, 0]:
964-
cudnn_headers = ["cudnn.h"]
965-
else:
966-
cudnn_headers = [
963+
cudnn_headers = ["cudnn.h"]
964+
if cuda_config.cudnn_version.rsplit("_", 1)[-1] >= "9":
965+
cudnn_headers += [
966+
"cudnn_adv.h",
967+
"cudnn_backend.h",
968+
"cudnn_cnn.h",
969+
"cudnn_graph.h",
970+
"cudnn_ops.h",
971+
"cudnn_version.h",
972+
]
973+
elif cuda_config.cudnn_version.rsplit("_", 1)[-1] >= "8":
974+
cudnn_headers += [
975+
"cudnn_backend.h",
967976
"cudnn_adv_infer.h",
968977
"cudnn_adv_train.h",
969978
"cudnn_cnn_infer.h",
970979
"cudnn_cnn_train.h",
971980
"cudnn_ops_infer.h",
972981
"cudnn_ops_train.h",
973-
"cudnn.h",
974982
"cudnn_version.h",
975983
]
984+
976985
cudnn_srcs = []
977986
cudnn_outs = []
978987
for header in cudnn_headers:

tensorflow_recommenders_addons/dynamic_embedding/core/kernels/dynamic_partition_op_gpu.cu.cc

Lines changed: 1 addition & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -119,11 +119,7 @@ void MoveValues(const GPUDevice& d, int32* keys, int32* values, int32* num_runs,
119119
values, num_runs, out_size, out));
120120
}
121121

122-
struct IdentityOp {
123-
__device__ int32 __forceinline__ operator()(const int32& a) const {
124-
return a;
125-
}
126-
};
122+
struct IdentityOp {};
127123

128124
// Define an output iterator that only allows assignment to
129125
// positions between [base, base + limit).
@@ -162,27 +158,10 @@ class BoundedOutputIterator
162158
IdentityOp op, int32 size)
163159
: TransformOutputIterator(ptr, op), limit(size), base(base) {}
164160

165-
// Indirection
166-
__host__ __device__ __forceinline__ reference operator*() const {
167-
return BoundedReference(ptr, base, conversion_op, limit);
168-
}
169-
170161
// Array subscript
171162
__host__ __device__ __forceinline__ reference operator[](int32 n) const {
172163
return BoundedReference(ptr + n, base, conversion_op, limit);
173164
}
174-
175-
// Addition
176-
__host__ __device__ __forceinline__ self_type operator+(int32 n) const {
177-
self_type retval(ptr + n, base, conversion_op, limit);
178-
return retval;
179-
}
180-
181-
// Subtraction
182-
__host__ __device__ __forceinline__ self_type operator-(int32 n) const {
183-
self_type retval(ptr - n, base, conversion_op, limit);
184-
return retval;
185-
}
186165
};
187166

188167
} // namespace

tensorflow_recommenders_addons/dynamic_embedding/core/kernels/hkv_hashtable_op_gpu.cu.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -839,7 +839,7 @@ class HashTableExportKeysAndScoresGpuOp : public OpKernel {
839839
public:
840840
explicit HashTableExportKeysAndScoresGpuOp(OpKernelConstruction* ctx)
841841
: OpKernel(ctx) {
842-
ctx->GetAttr("split_size", &split_size_i64_);
842+
OP_REQUIRES_OK(ctx, ctx->GetAttr("split_size", &split_size_i64_));
843843
}
844844

845845
void Compute(OpKernelContext* ctx) override {

tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv.h

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -207,10 +207,10 @@ class RandomKVFile : public nv::merlin::BaseKVFile<K, V, S> {
207207

208208
void close() {
209209
if (key_writer_) {
210-
key_writer_->Flush();
210+
TFRA_LOG_IF_ERROR(key_writer_->Flush());
211211
}
212212
if (value_writer_) {
213-
value_writer_->Flush();
213+
TFRA_LOG_IF_ERROR(value_writer_->Flush());
214214
}
215215
}
216216

@@ -222,8 +222,9 @@ class RandomKVFile : public nv::merlin::BaseKVFile<K, V, S> {
222222
key_buffer_.reserve(key_read_byte);
223223
value_buffer_.reserve(value_read_byte);
224224

225-
key_reader_->ReadNBytes(key_read_byte, &key_buffer_);
226-
value_reader_->ReadNBytes(value_read_byte, &value_buffer_);
225+
TFRA_LOG_IF_ERROR(key_reader_->ReadNBytes(key_read_byte, &key_buffer_));
226+
TFRA_LOG_IF_ERROR(
227+
value_reader_->ReadNBytes(value_read_byte, &value_buffer_));
227228

228229
memcpy((char*)keys, key_buffer_.data(), key_buffer_.size());
229230
memcpy((char*)vectors, value_buffer_.data(), value_buffer_.size());
@@ -237,8 +238,10 @@ class RandomKVFile : public nv::merlin::BaseKVFile<K, V, S> {
237238
size_t key_write_byte = n * sizeof(K);
238239
size_t value_write_byte = n * sizeof(V) * value_dim_;
239240

240-
key_writer_->Append(StringPiece((char*)keys, key_write_byte));
241-
value_writer_->Append(StringPiece((char*)vectors, value_write_byte));
241+
TFRA_LOG_IF_ERROR(
242+
key_writer_->Append(StringPiece((char*)keys, key_write_byte)));
243+
TFRA_LOG_IF_ERROR(
244+
value_writer_->Append(StringPiece((char*)vectors, value_write_byte)));
242245

243246
return n;
244247
}
@@ -552,8 +555,8 @@ class TableWrapper {
552555
} else {
553556
wfile.reset(new RandomKVFile<K, V, uint64_t>(
554557
fs, filepath, dim, buffer_size, append_to_file));
555-
status = reinterpret_cast<RandomKVFile<K, V, uint64_t>*>(wfile.get())
556-
->open(keyfile, valuefile, "wb");
558+
status.Update(reinterpret_cast<RandomKVFile<K, V, uint64_t>*>(wfile.get())
559+
->open(keyfile, valuefile, "wb"));
557560
}
558561
if (!status.ok()) {
559562
std::string error_msg = "Failed to dump to file to " + keyfile + ", " +
@@ -603,8 +606,8 @@ class TableWrapper {
603606
} else {
604607
rfile.reset(
605608
new RandomKVFile<K, V, uint64_t>(fs, filepath, dim, buffer_size));
606-
status = reinterpret_cast<RandomKVFile<K, V, uint64_t>*>(rfile.get())
607-
->open(keyfile, valuefile, "rb");
609+
status.Update(reinterpret_cast<RandomKVFile<K, V, uint64_t>*>(rfile.get())
610+
->open(keyfile, valuefile, "rb"));
608611
}
609612
if (!status.ok()) {
610613
std::string error_msg = "Failed to load from file " + keyfile + ", " +

tensorflow_recommenders_addons/dynamic_embedding/core/utils/utils.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,4 +42,24 @@ This code is for compatibility.*/
4242
} // namespace recommenders_addons
4343
} // namespace tensorflow
4444

45+
// For propagating errors when calling a function but not return status.
46+
#if TF_VERSION_INTEGER >= 2130
47+
#define TFRA_LOG_IF_ERROR(...) \
48+
do { \
49+
const auto _status = (__VA_ARGS__); \
50+
if (TF_PREDICT_FALSE(!_status.ok())) { \
51+
MAYBE_ADD_SOURCE_LOCATION(_status) \
52+
LOG(ERROR) << _status.message(); \
53+
} \
54+
} while (0)
55+
#else
56+
#define TFRA_LOG_IF_ERROR(...) \
57+
do { \
58+
const auto _status = (__VA_ARGS__); \
59+
if (TF_PREDICT_FALSE(!_status.ok())) { \
60+
LOG(ERROR) << _status.error_message(); \
61+
} \
62+
} while (0)
63+
#endif
64+
4565
#endif // TFRA_UTILS_H_

tensorflow_recommenders_addons/dynamic_embedding/python/keras/layers/embedding.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -559,7 +559,9 @@ def __init__(self,
559559
else:
560560
self._mpi_size = mpi_size
561561
super(HvdAllToAllEmbedding, self).__init__(*args, **kwargs)
562-
if type(self.params.saveable).__name__ not in de_fs_saveable_class_names:
562+
try:
563+
assert type(self.params.saveable).__name__ in de_fs_saveable_class_names
564+
except:
563565
tf_logging.warning(
564566
"Please use FileSystemSaver in KVCreator when use HvdAllToAllEmbedding. "
565567
"It will allow TFRA save and restore KV files when Embedding tensor parallel in distributed training. "

tensorflow_recommenders_addons/dynamic_embedding/python/ops/dynamic_embedding_optimizer.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,7 @@ def _update_step_fn(var, grad):
303303
if self.jit_compile:
304304
return self._update_step_xla(grad, var, id(self._var_key(var)))
305305
else:
306-
return script_ops.py_func_common(self._update_step, [grad, var], [])
306+
return self._update_step(grad, var)
307307

308308
if not isinstance(var, de.TrainableWrapper):
309309
return _update_step_fn(var, grad)
@@ -327,9 +327,9 @@ def _update_step_fn(var, grad):
327327
_before = [v0] + s0
328328

329329
with ops.control_dependencies(_before):
330-
_apply_op = _update_step_fn(var, grad)
330+
_update_step_fn(var, grad)
331331

332-
with ops.control_dependencies([_apply_op]):
332+
with ops.control_dependencies([var]):
333333
_after = control_flow_ops.group(
334334
[var.update_op(v0=v0)] +
335335
[_s.update_op(v0=s0[si]) for si, _s in enumerate(_slots)])
@@ -514,6 +514,8 @@ def _zeros_slot(var, slot_name, op_name):
514514
def _hvd_aggregate_gradients(hvd_handle,
515515
grads_and_vars_in,
516516
sparse_as_dense=True):
517+
if hvd_handle.size() <= 1:
518+
return grads_and_vars_in
517519
var_list = []
518520
aggregated_grad = []
519521
for grad, var in grads_and_vars_in:

0 commit comments

Comments
 (0)