
Commit 1de2434

cyyever authored and facebook-github-bot committed
Fix clang-tidy and nvcc warnings (pytorch#4923)
Summary:
Pull Request resolved: pytorch#4923
X-link: facebookresearch/FBGEMM#1947
Pull Request resolved: pytorch#4918

Reviewed By: ionuthristodorescu

Differential Revision: D83070110

Pulled By: q10

fbshipit-source-id: 1b12db2de80a5bdd50a3a2fdcae332f634a7abbc
1 parent: 03f6bde

8 files changed

+17 −19 lines changed

fbgemm_gpu/codegen/training/backward/embedding_backward_split_indice_weights_template.cu

Lines changed: 2 additions & 2 deletions

@@ -98,8 +98,8 @@ __global__ __launch_bounds__(kForwardMaxThreads) void
     {%- endif %}
 ) {
   constexpr int32_t kVecWidth = 4;
-  int error_code = 0;
-  int64_t error_value;
+  [[maybe_unused]] int error_code = 0;
+  [[maybe_unused]] int64_t error_value = 0;
 
   int32_t T = D_offsets.size(0) - 1;
   auto b_t = blockIdx.x * blockDim.y + threadIdx.y;
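A note on the pattern: C++17's [[maybe_unused]] marks variables that are legitimately unused in some build configurations, which is common in generated kernels like this one. A minimal standalone sketch, not part of this commit (the LOG_ERRORS macro is illustrative):

#include <cstdint>

void record_status() {
  // Without the attribute, -Wunused-variable (or nvcc's equivalent) fires
  // in builds where LOG_ERRORS is not defined; with it, both configurations
  // compile cleanly and the generated code is unchanged.
  [[maybe_unused]] int error_code = 0;
  [[maybe_unused]] std::int64_t error_value = 0;
#ifdef LOG_ERRORS
  error_code = 1;
  error_value = -1;
#endif
}

int main() {
  record_status();
  return 0;
}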

fbgemm_gpu/codegen/training/forward/embedding_forward_split_kernel_template.cu

Lines changed: 1 addition & 1 deletion

@@ -298,7 +298,7 @@ using namespace fbgemm_gpu;
     // Cooperatively load the indices
     const overflow_safe_int_t idx = l < L ? indices[indices_start + l] : 0;
     // If idx is loaded
-    const auto offset_idx = idx * D_emb;
+    [[maybe_unused]] const auto offset_idx = idx * D_emb;
     {%- endif %}
 
     {%- if not dense and lxu_miss_rate != "cache_conflict_miss_rate::all" %}
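The same attribute also covers values that are computed unconditionally but consumed only in some instantiations, which is what the Jinja-templated kernel above generates. A hedged sketch using if constexpr as a stand-in for the template branches (all names illustrative):

#include <cstdint>

template <bool kUseCache>
std::int64_t lookup(std::int64_t idx, std::int64_t d_emb) {
  // In the kUseCache == true instantiation, offset_idx is never read,
  // so without [[maybe_unused]] that instantiation can warn.
  [[maybe_unused]] const auto offset_idx = idx * d_emb;
  if constexpr (kUseCache) {
    return 0; // cache path: offset_idx not needed
  } else {
    return offset_idx; // direct path: offset into the embedding table
  }
}

int main() {
  return (lookup<true>(3, 128) == 0 && lookup<false>(3, 128) == 384) ? 0 : 1;
}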

fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp

Lines changed: 5 additions & 5 deletions

@@ -1386,9 +1386,9 @@ void _block_bucketize_sparse_features_2d_weights_cpu_kernel(
     const std::optional<Tensor>& total_num_blocks,
     const int64_t my_size,
     const int64_t weights_dim,
-    Tensor new_lengths,
-    Tensor new_indices,
-    Tensor new_weights,
+    const Tensor& new_lengths,
+    const Tensor& new_indices,
+    const Tensor& new_weights,
     std::optional<Tensor> new_pos,
     const std::optional<Tensor>& unbucketize_permute,
     const std::optional<Tensor>& batch_size_per_feature,
@@ -1417,8 +1417,8 @@ void _block_bucketize_sparse_features_2d_weights_cpu_kernel(
   const index_t* const block_sizes_data = block_sizes.data_ptr<index_t>();
   offset_t* batch_sizes_data = nullptr;
   const auto variable_batch_size = batch_size_per_feature.has_value();
-  const auto variable_bucket_sizes = block_bucketize_pos.has_value() &&
-      block_bucketize_pos.value().size() != 0;
+  const auto variable_bucket_sizes =
+      block_bucketize_pos.has_value() && !block_bucketize_pos.value().empty();
   using uindex_t = std::make_unsigned_t<index_t>;
   using uoffset_t = std::make_unsigned_t<offset_t>;
   std::vector<int64_t> lower_bounds(indices.numel(), 0);
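Two distinct fixes here: at::Tensor is a reference-counted handle, so taking it by value costs an atomic refcount bump per call (the kind of thing clang-tidy's performance-unnecessary-value-param flags), and container emptiness reads better as empty() than size() != 0. A sketch of both, with std::vector standing in for Tensor:

#include <optional>
#include <vector>

// Read-only arguments by const reference; optional contents checked via empty().
bool has_variable_buckets(
    const std::vector<int>& new_lengths, // was: by value
    const std::optional<std::vector<int>>& bucketize_pos) {
  (void)new_lengths;
  return bucketize_pos.has_value() && !bucketize_pos.value().empty();
}

int main() {
  std::vector<int> lengths{1, 2, 3};
  return has_variable_buckets(lengths, std::nullopt) ? 1 : 0;
}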

fbgemm_gpu/src/split_embeddings_utils/generate_vbe_metadata.cu

Lines changed: 2 additions & 3 deletions

@@ -70,7 +70,7 @@ __launch_bounds__(kMaxThreads) void generate_vbe_metadata_foreach_sample_kernel(
 
 } // namespace
 
-std::tuple<int, int, int> get_max_grid_size(int device) {
+static std::tuple<int, int, int> get_max_grid_size() {
   static auto max_grid = [&]() -> std::tuple<int, int, int> {
     cudaDeviceProp prop;
     C10_CUDA_CHECK(cudaGetDeviceProperties(&prop, at::cuda::current_device()));
@@ -152,8 +152,7 @@ generate_vbe_metadata(
 
   const auto grid_dim_x = div_round_up(max_B_feature_rank, kMaxThreads);
   const dim3 grid_size(grid_dim_x, num_ranks, T);
-  const auto& [max_grid_x, max_grid_y, max_grid_z] =
-      get_max_grid_size(at::cuda::current_device());
+  const auto& [max_grid_x, max_grid_y, max_grid_z] = get_max_grid_size();
   TORCH_CHECK(
       grid_size.x > 0 && grid_size.x <= max_grid_x,
       "generate_vbe_metadata: Invalid grid_size.x ",

fbgemm_gpu/src/split_embeddings_utils/get_infos_metadata.cu

Lines changed: 1 addition & 1 deletion

@@ -14,6 +14,6 @@ using Tensor = at::Tensor;
 using namespace fbgemm_gpu;
 
 DLL_PUBLIC std::tuple<int64_t, int64_t>
-get_infos_metadata(Tensor unused, int64_t B, int64_t T) {
+get_infos_metadata(Tensor /*unused*/, int64_t B, int64_t T) {
   return get_info_B_num_bits_from_T(T, B);
 }
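Leaving a parameter unnamed, with the old name preserved in a comment, is the standard way to silence -Wunused-parameter without changing the signature callers depend on. A minimal sketch (names illustrative, not this function's real dispatch):

#include <cstdint>

// The first argument exists only to satisfy the expected signature.
std::int64_t get_metadata_sketch(int /*unused*/, std::int64_t B, std::int64_t T) {
  return B * T;
}

int main() {
  return get_metadata_sketch(0, 2, 3) == 6 ? 0 : 1;
}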

fbgemm_gpu/src/tbe/eeg/indices_generator.cpp

Lines changed: 1 addition & 1 deletion

@@ -53,7 +53,7 @@ static torch::Tensor convertVectorToTensor(
 // Metadata structure for an index
 struct IndexMetadata {
   std::vector<double> tags;
-  int64_t freq;
+  int64_t freq{};
 };
 
 torch::Tensor IndicesGenerator::generate() {
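The added braces are a default member initializer: int64_t freq{}; value-initializes freq to zero, so a default-constructed IndexMetadata can never expose an indeterminate value (what clang-tidy's cppcoreguidelines-pro-type-member-init checks for). A sketch:

#include <cstdint>
#include <vector>

struct IndexMetadataSketch {
  std::vector<double> tags; // class types default-construct to a valid state
  std::int64_t freq{};      // without {}, freq would be indeterminate
};

int main() {
  IndexMetadataSketch m;
  return m.freq == 0 ? 0 : 1;
}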

fbgemm_gpu/src/topology_utils.cpp

Lines changed: 4 additions & 4 deletions

@@ -124,11 +124,11 @@ AdjacencyMatrix<Links> get_nvlink_matrix() {
   std::unordered_map<Node, uint32_t> cuda_device_to_nvml_device;
 
   for (const auto i : c10::irange(device_count)) {
-    nvmlDevice_t handle;
+    nvmlDevice_t handle = nullptr;
     NVML_CHECK(nvmlDeviceGetHandleByIndex(i, &handle));
     nvmlPciInfo_t pci_info;
     NVML_CHECK(nvmlDeviceGetPciInfo(handle, &pci_info));
-    std::array<char, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE> pci_bus_id;
+    std::array<char, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE> pci_bus_id{};
     std::copy(
         &pci_info.busId[0],
         &pci_info.busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE],
@@ -148,7 +148,7 @@ AdjacencyMatrix<Links> get_nvlink_matrix() {
 
   std::vector<Links> links(world_size * world_size);
   for (const auto i : c10::irange(world_size)) {
-    nvmlDevice_t handle;
+    nvmlDevice_t handle = nullptr;
     NVML_CHECK(
         nvmlDeviceGetHandleByIndex(cuda_device_to_nvml_device[i], &handle));
     for (const auto link : c10::irange(NVML_NVLINK_MAX_LINKS)) {
@@ -163,7 +163,7 @@ AdjacencyMatrix<Links> get_nvlink_matrix() {
       }
      nvmlPciInfo_t pci_info;
      NVML_CHECK(nvmlDeviceGetNvLinkRemotePciInfo(handle, link, &pci_info));
-      std::array<char, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE> pci_bus_id;
+      std::array<char, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE> pci_bus_id{};
      std::copy(
          &pci_info.busId[0],
          &pci_info.busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE],
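Same idea for locals that get filled through out-parameters: initializing the handle to nullptr and brace-initializing the array ({} zero-fills every element) guarantees no path reads indeterminate bytes, even if a fill call errors out. A standalone sketch with a stand-in for the NVML calls (kBusIdSize and fill_bus_id are illustrative):

#include <array>
#include <cstring>

constexpr std::size_t kBusIdSize = 32; // stand-in for NVML's buffer size

void fill_bus_id(char* dst) {
  std::strncpy(dst, "0000:3b:00.0", kBusIdSize); // pads the remainder with '\0'
}

int main() {
  void* handle = nullptr;                // was: uninitialized handle
  std::array<char, kBusIdSize> bus_id{}; // {} zero-fills all elements up front
  fill_bus_id(bus_id.data());
  return handle == nullptr ? 0 : 1;
}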

test/PackedRequantizeAcc16Test.cc

Lines changed: 1 addition & 2 deletions

@@ -8,7 +8,6 @@
 
 #include <cpuinfo.h>
 #include <algorithm>
-#include <cmath>
 #include <numeric>
 #include <random>
 #include <vector>
@@ -458,7 +457,7 @@ TEST_P(fbgemmu8s8acc16WithQuantGranularityTest, SpMDMTest) {
         int b_remainder = 0;
         if (kidx % 2 == 1) {
           // Make sure abs(b_prev + *bptr - b_remainder) <= 128
-          int b_prev = B_csc.Values().back();
+          auto b_prev = B_csc.Values().back();
           b_remainder = std::max(b_prev + *bptr - 128, b_remainder);
           b_remainder = std::min(b_prev + *bptr + 128, b_remainder);
         }
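The auto here makes b_prev take the exact element type of B_csc.Values(), so the declaration itself cannot introduce an implicit conversion if that element type differs from int (the kind of mismatch clang-tidy's conversion checks flag). A sketch under the assumption that the stored values are int8_t:

#include <cstdint>
#include <vector>

int main() {
  std::vector<std::int8_t> values{100};
  auto b_prev = values.back(); // deduces std::int8_t: no conversion at all
  // Integer promotion still applies in arithmetic, as in the test above.
  int b_remainder = b_prev + 5 - 128;
  return b_remainder == -23 ? 0 : 1;
}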
