Apply pitrou's suggestion

andishgar · andishgar · commit deb3686dbe2f · 2025-10-10T13:23:43.000+03:30
diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc
@@ -16,7 +16,7 @@
 // under the License.
 
 #include "arrow/sparse_tensor.h"
-#include "arrow/tensor/converter.h"
+#include "arrow/tensor/converter_internal.h"
 
 #include <algorithm>
 #include <functional>
diff --git a/cpp/src/arrow/sparse_tensor_test.cc b/cpp/src/arrow/sparse_tensor_test.cc
@@ -436,6 +436,7 @@ class TestSparseCOOTensorCreationFromNegativeZero
     AssertCOOIndex(si->indices(), 0, {4});
     AssertCOOIndex(si->indices(), 1, {9});
     ASSERT_OK_AND_ASSIGN(auto new_tensor, sparse_coo_tensor->ToTensor());
+    ASSERT_OK(new_tensor->Validate());
     ASSERT_TRUE(new_tensor->Equals(*dense_tensor));
   }
 
@@ -459,6 +460,7 @@ class TestSparseCOOTensorCreationFromNegativeZero
     AssertCOOIndex(si->indices(), 0, {1, 1});
     AssertCOOIndex(si->indices(), 1, {3, 0});
     ASSERT_OK_AND_ASSIGN(auto new_tensor, sparse_coo_tensor->ToTensor());
+    ASSERT_OK(new_tensor->Validate());
     ASSERT_TRUE(new_tensor->Equals(*dense_tensor));
   }
 
@@ -485,6 +487,7 @@ class TestSparseCOOTensorCreationFromNegativeZero
     AssertCOOIndex(si->indices(), 0, {1, 1});
     AssertCOOIndex(si->indices(), 1, {3, 0});
     ASSERT_OK_AND_ASSIGN(auto new_tensor, sparse_coo_tensor->ToTensor());
+    ASSERT_OK(new_tensor->Validate());
     ASSERT_TRUE(new_tensor->Equals(*dense_tensor));
   }
 
@@ -994,6 +997,7 @@ class TestSparseCSRTensorCreationFromNegativeZero
     ASSERT_EQ(indices[0], 1);
     ASSERT_EQ(indices[1], 0);
     ASSERT_OK_AND_ASSIGN(auto new_tensor, sparse_csr_tensor->ToTensor());
+    ASSERT_OK(new_tensor->Validate());
     ASSERT_TRUE(new_tensor->Equals(*dense_tensor));
   }
 
@@ -1374,6 +1378,7 @@ class TestSparseCSCTensorCreationFromNegativeZero
     ASSERT_EQ(indices[0], 3);
     ASSERT_EQ(indices[1], 1);
     ASSERT_OK_AND_ASSIGN(auto new_tensor, sparse_csc_tensor->ToTensor());
+    ASSERT_OK(new_tensor->Validate());
     ASSERT_TRUE(new_tensor->Equals(*dense_tensor));
   }
 
@@ -1696,6 +1701,7 @@ class TestSparseCSFTensorCreationFromNegativeZero
     EXPECT_EQ(column_indices[0], 0);
     EXPECT_EQ(column_indices[1], 1);
     ASSERT_OK_AND_ASSIGN(auto new_tensor, sparse_csf_tensor->ToTensor());
+    ASSERT_OK(new_tensor->Validate());
     ASSERT_TRUE(new_tensor->Equals(*dense_tensor));
   }
 
diff --git a/cpp/src/arrow/tensor/converter_internal.h b/cpp/src/arrow/tensor/converter_internal.h
@@ -22,9 +22,11 @@
 #include <memory>
 #include <utility>
 
-#include "arrow/visit_type_inline.h"
-
 namespace arrow {
+// Forward declaration: includers must pull in arrow/visit_type_inline.h themselves.
+template <typename VISITOR, typename... ARGS>
+Status VisitTypeInline(const DataType& type, VISITOR* visitor, ARGS&&... args);
+
 namespace internal {
 
 struct SparseTensorConverterMixin {
@@ -95,25 +97,24 @@ struct ValueTypeVisitor {
 struct IndexAndValueTypeVisitor {
   template <typename IndexType, typename Function>
   enable_if_integer<IndexType, Status> Visit(const IndexType& index_type,
-                                             const std::shared_ptr<DataType>& value_type,
+                                             const DataType& value_type,
                                              Function&& function) {
     ValueTypeVisitor visitor;
-    return VisitTypeInline(*value_type, &visitor, index_type,
+    return VisitTypeInline(value_type, &visitor, index_type,  // dispatch on value type
                            std::forward<Function>(function));
   }
 
   template <typename Function>
-  Status Visit(const DataType& type, const std::shared_ptr<DataType>&, Function&&) {
+  Status Visit(const DataType& type, const DataType&, Function&&) {  // non-integer index
     return Status::Invalid("Invalid index type: ", type.name(), ". Expected integer.");
   }
 };
 
 template <typename Function>
-Status VisitValueAndIndexType(const std::shared_ptr<DataType>& value_type,
-                              const std::shared_ptr<DataType>& index_type,
+Status VisitValueAndIndexType(const DataType& value_type, const DataType& index_type,
                               Function&& function) {
   IndexAndValueTypeVisitor visitor;
-  return VisitTypeInline(*index_type, &visitor, value_type,
+  return VisitTypeInline(index_type, &visitor, value_type,  // index type first
                          std::forward<Function>(function));
 }
 
diff --git a/cpp/src/arrow/tensor/coo_converter.cc b/cpp/src/arrow/tensor/coo_converter.cc
@@ -15,9 +15,10 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include "arrow/tensor/converter.h"
+#include "arrow/tensor/converter_internal.h"
 
 #include <algorithm>
+#include <cmath>
 #include <cstdint>
 #include <memory>
 #include <numeric>
@@ -27,8 +28,11 @@
 #include "arrow/status.h"
 #include "arrow/tensor.h"
 #include "arrow/type.h"
+#include "arrow/type_traits.h"
 #include "arrow/util/checked_cast.h"
+#include "arrow/util/logging_internal.h"
 #include "arrow/util/macros.h"
+#include "arrow/visit_type_inline.h"
 
 namespace arrow {
 
@@ -38,6 +42,57 @@ namespace internal {
 
 namespace {
 
+// Cross-checks a freshly built COO index/value pair against its source tensor.
+// Fails with Status::Invalid when the non-zero count, the coordinate width or any
+// stored value disagrees with `tensor`; a pair of NaNs is accepted as a match.
+template <typename ValueType, typename IndexType>
+Status ValidateSparseCooTensorCreation(const SparseCOOIndex& sparse_coo_index,
+                                       const Buffer& sparse_coo_values_buffer,
+                                       const Tensor& tensor) {
+  using IndexCType = typename IndexType::c_type;
+  using ValueCType = typename ValueType::c_type;
+
+  const auto& coords_tensor = sparse_coo_index.indices();
+  const auto* raw_coords = coords_tensor->data()->data_as<IndexCType>();
+  const auto* stored_values = sparse_coo_values_buffer.data_as<ValueCType>();
+
+  ARROW_ASSIGN_OR_RAISE(auto non_zero_count, tensor.CountNonZero());
+  if (coords_tensor->shape()[0] != non_zero_count) {
+    return Status::Invalid("Mismatch between non-zero count in sparse tensor (",
+                           coords_tensor->shape()[0], ") and dense tensor (",
+                           non_zero_count, ")");
+  }
+  if (coords_tensor->shape()[1] != static_cast<int64_t>(tensor.shape().size())) {
+    return Status::Invalid("Mismatch between coordinate dimension in sparse tensor (",
+                           coords_tensor->shape()[1], ") and tensor shape (",
+                           tensor.shape().size(), ")");
+  }
+
+  const auto ndim = coords_tensor->shape()[1];
+  std::vector<int64_t> position(ndim);
+  for (int64_t row = 0; row < coords_tensor->shape()[0]; ++row) {
+    const auto sparse_value = stored_values[row];
+    if (!is_not_zero<ValueType>(sparse_value)) {
+      return Status::Invalid("Sparse tensor values must be non-zero");
+    }
+    // The coordinate of value `row` is read from raw_coords[row * ndim + axis].
+    for (int64_t axis = 0; axis < ndim; ++axis) {
+      position[axis] = static_cast<int64_t>(raw_coords[row * ndim + axis]);
+    }
+    const auto dense_value = tensor.Value<ValueType>(position);
+    bool both_nan = false;
+    if constexpr (is_floating_type<ValueType>::value) {
+      both_nan = std::isnan(dense_value) && std::isnan(sparse_value);
+    }
+    if (sparse_value != dense_value && !both_nan) {
+      return Status::Invalid(
+          "Inconsistent values between sparse tensor and dense tensor");
+    }
+  }
+
+  return Status::OK();
+}
+
 template <typename IndexCType>
 inline void IncrementRowMajorIndex(std::vector<IndexCType>& coord,
                                    const std::vector<int64_t>& shape) {
@@ -210,7 +265,8 @@ class SparseCOOTensorConverter {
                                            indices_shape, indices_strides);
     ARROW_ASSIGN_OR_RAISE(sparse_index, SparseCOOIndex::Make(coords, true));
     data = std::move(values_buffer);
-
+    DCHECK_OK((ValidateSparseCooTensorCreation<ValueType, IndexType>(*sparse_index, *data,
+                                                                     tensor_)));
     return Status::OK();
   }
 
@@ -272,7 +328,7 @@ Status MakeSparseCOOTensorFromTensor(const Tensor& tensor,
                                      std::shared_ptr<Buffer>* out_data) {
   SparseCOOTensorConverter converter(tensor, index_value_type, pool);
   ConverterVisitor visitor{converter};
-  ARROW_RETURN_NOT_OK(VisitValueAndIndexType(tensor.type(), index_value_type, visitor));
+  ARROW_RETURN_NOT_OK(VisitValueAndIndexType(*tensor.type(), *index_value_type, visitor));
   *out_sparse_index = checked_pointer_cast<SparseIndex>(converter.sparse_index);
   *out_data = converter.data;
   return Status::OK();
diff --git a/cpp/src/arrow/tensor/csf_converter.cc b/cpp/src/arrow/tensor/csf_converter.cc
@@ -15,9 +15,10 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include "arrow/tensor/converter.h"
+#include "arrow/tensor/converter_internal.h"
 
 #include <algorithm>
+#include <cmath>
 #include <cstdint>
 #include <limits>
 #include <memory>
@@ -29,8 +30,11 @@
 #include "arrow/status.h"
 #include "arrow/tensor.h"
 #include "arrow/type.h"
+#include "arrow/type_traits.h"
 #include "arrow/util/checked_cast.h"
+#include "arrow/util/logging_internal.h"
 #include "arrow/util/sort_internal.h"
+#include "arrow/visit_type_inline.h"
 
 namespace arrow {
 
@@ -54,6 +58,89 @@ inline void IncrementIndex(std::vector<int64_t>& coord, const std::vector<int64_
   }
 }
 
+// Recursively walks level `dim` of the CSF index over entries [start, stop) and
+// compares each stored leaf value with the dense tensor element it addresses.
+// `dim_offset` is the offset into tensor.raw_data() accumulated by the levels above
+// `dim`. Returns Status::Invalid if a stored value is zero or differs from the dense
+// value (a pair of NaNs is accepted as a match).
+template <typename ValueType, typename IndexType>
+Status CheckValues(const SparseCSFIndex& sparse_csf_index,
+                   const typename ValueType::c_type* values, const Tensor& tensor,
+                   const int64_t dim, const int64_t dim_offset, const int64_t start,
+                   const int64_t stop) {
+  using ValueCType = typename ValueType::c_type;
+  using IndexCType = typename IndexType::c_type;
+
+  const auto& indices = sparse_csf_index.indices();
+  const auto& indptr = sparse_csf_index.indptr();
+  const auto& axis_order = sparse_csf_index.axis_order();
+  const auto ndim = static_cast<int64_t>(indices.size());
+  // Bind by reference: this function recurses, so a per-call copy would add up.
+  const auto& strides = tensor.strides();
+
+  const auto& cur_indices = indices[dim];
+  const auto* indices_data = cur_indices->data()->data_as<IndexCType>() + start;
+
+  if (dim == ndim - 1) {
+    // Leaf level: position i in `values` belongs to index entry i.
+    for (int64_t i = start; i < stop; ++i, ++indices_data) {
+      const auto index = static_cast<int64_t>(*indices_data);
+      const int64_t offset = dim_offset + index * strides[axis_order[dim]];
+      const auto sparse_value = values[i];
+      const auto tensor_value =
+          *reinterpret_cast<const ValueCType*>(tensor.raw_data() + offset);
+      if (!is_not_zero<ValueType>(sparse_value)) {
+        return Status::Invalid("Sparse tensor values must be non-zero");
+      }
+      // NaN never compares equal, so a NaN pair has to be accepted explicitly.
+      bool both_nan = false;
+      if constexpr (is_floating_type<ValueType>::value) {
+        both_nan = std::isnan(tensor_value) && std::isnan(sparse_value);
+      }
+      if (sparse_value != tensor_value && !both_nan) {
+        return Status::Invalid(
+            "Inconsistent values between sparse tensor and dense tensor");
+      }
+    }
+    return Status::OK();
+  }
+
+  // Inner level: entry i spans children [indptr[i], indptr[i + 1]) of level dim + 1.
+  const auto* indptr_data = indptr[dim]->data()->data_as<IndexCType>() + start;
+  for (int64_t i = start; i < stop; ++i, ++indices_data, ++indptr_data) {
+    const auto index = static_cast<int64_t>(*indices_data);
+    const int64_t offset = dim_offset + index * strides[axis_order[dim]];
+    const auto next_start = static_cast<int64_t>(indptr_data[0]);
+    const auto next_stop = static_cast<int64_t>(indptr_data[1]);
+    ARROW_RETURN_NOT_OK((CheckValues<ValueType, IndexType>(
+        sparse_csf_index, values, tensor, dim + 1, offset, next_start, next_stop)));
+  }
+  return Status::OK();
+}
+
+// Verifies that the CSF index and values produced by the converter reproduce
+// `tensor`; returns Status::Invalid on a count, dimension or value mismatch.
+template <typename ValueType, typename IndexType>
+Status ValidateSparseTensorCSFCreation(const SparseIndex& sparse_index,
+                                       const Buffer& values_buffer,
+                                       const Tensor& tensor) {
+  const auto& sparse_csf_index = checked_cast<const SparseCSFIndex&>(sparse_index);
+  const auto* values = values_buffer.data_as<typename ValueType::c_type>();
+  const auto& indices = sparse_csf_index.indices();
+  ARROW_ASSIGN_OR_RAISE(auto non_zero_count, tensor.CountNonZero());
+  if (indices.back()->size() != non_zero_count) {
+    return Status::Invalid("Mismatch between non-zero count in sparse tensor (",
+                           indices.back()->size(), ") and dense tensor (", non_zero_count,
+                           ")");
+  } else if (indices.size() != tensor.shape().size()) {
+    return Status::Invalid("Mismatch between coordinate dimension in sparse tensor (",
+                           indices.size(), ") and tensor shape (", tensor.shape().size(),
+                           ")");
+  }
+  return CheckValues<ValueType, IndexType>(sparse_csf_index, values, tensor, 0, 0, 0,
+                                           sparse_csf_index.indptr()[0]->size() - 1);
+}
+
 // ----------------------------------------------------------------------
 // SparseTensorConverter for SparseCSFIndex
 
@@ -88,8 +175,10 @@ class SparseCSFTensorConverter {
     std::vector<int64_t> coord(ndim, 0);
     std::vector<int64_t> previous_coord(ndim, -1);
 
-    std::vector<TypedBufferBuilder<IndexCType>> indptr_buffer_builders(ndim - 1);
-    std::vector<TypedBufferBuilder<IndexCType>> indices_buffer_builders(ndim);
+    std::vector<TypedBufferBuilder<IndexCType>> indptr_buffer_builders(
+        ndim - 1, TypedBufferBuilder<IndexCType>(pool_));
+    std::vector<TypedBufferBuilder<IndexCType>> indices_buffer_builders(
+        ndim, TypedBufferBuilder<IndexCType>(pool_));
 
     auto* values = values_buffer->mutable_data_as<ValueCType>();
 
@@ -146,6 +235,8 @@ class SparseCSFTensorConverter {
     ARROW_ASSIGN_OR_RAISE(
         sparse_index, SparseCSFIndex::Make(index_value_type_, indices_shapes, axis_order,
                                            indptr_buffers, indices_buffers));
+    DCHECK_OK((ValidateSparseTensorCSFCreation<ValueType, IndexType>(*sparse_index, *data,
+                                                                     tensor_)));
     return Status::OK();
   }
 
@@ -262,7 +353,7 @@ Status MakeSparseCSFTensorFromTensor(const Tensor& tensor,
                                      std::shared_ptr<Buffer>* out_data) {
   SparseCSFTensorConverter converter(tensor, index_value_type, pool);
   ConverterVisitor visitor{converter};
-  ARROW_RETURN_NOT_OK(VisitValueAndIndexType(tensor.type(), index_value_type, visitor));
+  ARROW_RETURN_NOT_OK(VisitValueAndIndexType(*tensor.type(), *index_value_type, visitor));
   *out_sparse_index = checked_pointer_cast<SparseIndex>(converter.sparse_index);
   *out_data = converter.data;
   return Status::OK();
diff --git a/cpp/src/arrow/tensor/csx_converter.cc b/cpp/src/arrow/tensor/csx_converter.cc