Skip to content

Commit 7ef5648

Browse files
andishgar and kou authored
GH-46937: [C++] Enable arrow::EqualOptions for arrow::Table (#47164)
### Rationale for this change

Enable floating-point, schema, and metadata comparison options in `arrow::Table::Equals`.

### What changes are included in this PR?

Add support for `arrow::EqualOptions` in `arrow::Table::Equals`.

### Are these changes tested?

Yes, the relevant unit tests have been run.

### Are there any user-facing changes?

Yes, `arrow::EqualOptions` can now be used with `arrow::Table::Equals`.

* GitHub Issue: #46937

Lead-authored-by: Arash Andishgar <[email protected]>
Co-authored-by: Sutou Kouhei <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
1 parent 479662e commit 7ef5648

File tree

4 files changed

+263
-42
lines changed

4 files changed

+263
-42
lines changed

cpp/src/arrow/record_batch.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ class ARROW_EXPORT RecordBatch {
122122
///
123123
/// \param[in] other the RecordBatch to compare with
124124
/// \param[in] check_metadata if true, the schema metadata will be compared,
125-
/// regardless of the value set in \ref EqualOptions::use_metadata_
125+
/// regardless of the value set in \ref EqualOptions::use_metadata
126126
/// \param[in] opts the options for equality comparisons
127127
/// \return true if batches are equal
128128
bool Equals(const RecordBatch& other, bool check_metadata = false,

cpp/src/arrow/table.cc

Lines changed: 41 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#include "arrow/array/concatenate.h"
3131
#include "arrow/array/util.h"
3232
#include "arrow/chunked_array.h"
33+
#include "arrow/compare.h"
3334
#include "arrow/compute/cast.h"
3435
#include "arrow/pretty_print.h"
3536
#include "arrow/record_batch.h"
@@ -534,19 +535,52 @@ Result<std::shared_ptr<Table>> PromoteTableToSchema(const std::shared_ptr<Table>
534535
return Table::Make(schema, std::move(columns));
535536
}
536537

537-
bool Table::Equals(const Table& other, bool check_metadata) const {
538-
if (this == &other) {
538+
namespace {
539+
540+
// Returns true when is_floating() accepts the id of `type`, or when the same
// holds for the type of any child field, checked recursively.
bool ContainFloat(const std::shared_ptr<DataType>& type) {
541+
if (is_floating(type->id())) {
539542
return true;
540543
}
541-
if (!schema_->Equals(*other.schema(), check_metadata)) {
542-
return false;
544+
545+
// Nested types: descend into the type of every child field.
for (const auto& field : type->fields()) {
546+
if (ContainFloat(field->type())) {
547+
return true;
548+
}
549+
}
550+
return false;
551+
}
552+
553+
// Reports whether NaN handling can be disregarded for tables with `schema`:
// true when opts.nans_equal() is set, or when no field type of `schema`
// contains a floating-point type according to ContainFloat().
//
// \param[in] schema the schema whose field types are inspected
// \param[in] opts the equality options supplying nans_equal()
// \return false only if NaNs are not treated as equal and some field type
// contains a floating-point type
bool CanIgnoreNan(const Schema& schema, const EqualOptions& opts) {
554+
if (opts.nans_equal()) {
555+
return true;
556+
}
557+
558+
// The fields are only read here, so bind them by const reference.
for (const auto& field : schema.fields()) {
559+
if (ContainFloat(field->type())) {
560+
return false;
561+
}
543562
}
544-
if (this->num_columns() != other.num_columns()) {
545-
return false;
563+
return true;
564+
}
565+
566+
} // namespace
567+
568+
bool Table::Equals(const Table& other, const EqualOptions& opts) const {
569+
if (this == &other) {
570+
if (CanIgnoreNan(*schema_, opts)) {
571+
return true;
572+
}
573+
} else {
574+
if (num_columns() != other.num_columns() || num_rows_ != other.num_rows()) {
575+
return false;
576+
} else if (opts.use_schema() &&
577+
!schema_->Equals(*other.schema(), opts.use_metadata())) {
578+
return false;
579+
}
546580
}
547581

548582
for (int i = 0; i < this->num_columns(); i++) {
549-
if (!this->column(i)->Equals(other.column(i))) {
583+
if (!this->column(i)->Equals(other.column(i), opts)) {
550584
return false;
551585
}
552586
}

cpp/src/arrow/table.h

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include <vector>
2424

2525
#include "arrow/chunked_array.h" // IWYU pragma: keep
26+
#include "arrow/compare.h"
2627
#include "arrow/record_batch.h"
2728
#include "arrow/status.h"
2829
#include "arrow/type.h"
@@ -203,11 +204,24 @@ class ARROW_EXPORT Table {
203204
/// \brief Return the number of rows (equal to each column's logical length)
204205
int64_t num_rows() const { return num_rows_; }
205206

206-
/// \brief Determine if tables are equal
207+
/// \brief Determine if two tables are equal
207208
///
208-
/// Two tables can be equal only if they have equal schemas.
209-
/// However, they may be equal even if they have different chunkings.
210-
bool Equals(const Table& other, bool check_metadata = false) const;
209+
/// \param[in] other the table to compare with
210+
/// \param[in] opts the options for equality comparisons
/// (schemas are compared only when EqualOptions::use_schema is set, and
/// schema metadata only when EqualOptions::use_metadata is set as well)
211+
/// \return true if two tables are equal
212+
bool Equals(const Table& other, const EqualOptions& opts) const;
213+
214+
/// \brief Determine if two tables are equal
215+
///
216+
/// \param[in] other the table to compare with
217+
/// \param[in] check_metadata if true, the schema metadata will be compared,
218+
/// regardless of the value set in \ref EqualOptions::use_metadata
219+
/// \param[in] opts the options for equality comparisons
220+
/// \return true if two tables are equal
221+
bool Equals(const Table& other, bool check_metadata = false,
222+
const EqualOptions& opts = EqualOptions::Defaults()) const {
223+
// Apply check_metadata to the options, then delegate to the overload that
// takes only EqualOptions.
// NOTE(review): check_metadata == false is forwarded as well, so it appears
// to replace a use_metadata(true) already set on opts -- confirm intended.
return Equals(other, opts.use_metadata(check_metadata));
224+
}
211225

212226
/// \brief Make a new table by combining the chunks this table has.
213227
///

cpp/src/arrow/table_test.cc

Lines changed: 203 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include "arrow/array/data.h"
3030
#include "arrow/array/util.h"
3131
#include "arrow/chunked_array.h"
32+
#include "arrow/compare.h"
3233
#include "arrow/compute/cast.h"
3334
#include "arrow/record_batch.h"
3435
#include "arrow/status.h"
@@ -152,38 +153,210 @@ TEST_F(TestTable, AllColumnsAndFields) {
152153
ASSERT_EQ(0, fields.size());
153154
}
154155

155-
TEST_F(TestTable, Equals) {
156-
const int length = 100;
157-
MakeExample1(length);
156+
// Checks Table::Equals with default options against a table that matches, one
// with fewer columns, one with fewer rows, and one holding different values.
TEST(TestTableEquality, Equals) {
157+
const int32_t length = 10;
158158

159-
table_ = Table::Make(schema_, columns_);
159+
auto f0 = field("f0", int32());
160+
auto f1 = field("f1", uint8());
161+
auto f2 = field("f2", int16());
160162

161-
ASSERT_TRUE(table_->Equals(*table_));
162-
// Differing schema
163-
auto f0 = field("f3", int32());
164-
auto f1 = field("f4", uint8());
165-
auto f2 = field("f5", int16());
166-
std::vector<std::shared_ptr<Field>> fields = {f0, f1, f2};
167-
auto other_schema = std::make_shared<Schema>(fields);
168-
auto other = Table::Make(other_schema, columns_);
169-
ASSERT_FALSE(table_->Equals(*other));
170-
// Differing columns
171-
std::vector<std::shared_ptr<ChunkedArray>> other_columns = {
172-
std::make_shared<ChunkedArray>(
173-
gen_.ArrayOf(int32(), length, /*null_probability=*/0.3)),
174-
std::make_shared<ChunkedArray>(
175-
gen_.ArrayOf(uint8(), length, /*null_probability=*/0.3)),
176-
std::make_shared<ChunkedArray>(
177-
gen_.ArrayOf(int16(), length, /*null_probability=*/0.3))};
178-
179-
other = Table::Make(schema_, other_columns);
180-
ASSERT_FALSE(table_->Equals(*other));
181-
182-
// Differing schema metadata
183-
other_schema = schema_->WithMetadata(::arrow::key_value_metadata({"key"}, {"value"}));
184-
other = Table::Make(other_schema, columns_);
185-
ASSERT_TRUE(table_->Equals(*other));
186-
ASSERT_FALSE(table_->Equals(*other, /*check_metadata=*/true));
163+
auto schema = ::arrow::schema({f0, f1, f2});
164+
auto schema_same = ::arrow::schema({f0, f1, f2});
165+
auto schema_fewer_fields = ::arrow::schema({f0, f1});
166+
167+
random::RandomArrayGenerator gen(42);
168+
169+
auto a_f0 = gen.ArrayOf(int32(), length);
170+
auto a_f1 = gen.ArrayOf(uint8(), length);
171+
auto a_f2 = gen.ArrayOf(int16(), length);
172+
auto a_f0_half = a_f0->Slice(0, length / 2);
173+
auto a_f1_half = a_f1->Slice(0, length / 2);
174+
auto a_f2_half = a_f2->Slice(0, length / 2);
175+
auto a_f0_different = gen.ArrayOf(int32(), length);
176+
auto a_f1_different = gen.ArrayOf(uint8(), length);
177+
// Same type as f2 so that only the generated values can differ.
auto a_f2_different = gen.ArrayOf(int16(), length);
178+
179+
auto table = Table::Make(schema, {a_f0, a_f1, a_f2}, length);
180+
auto table_same = Table::Make(schema_same, {a_f0, a_f1, a_f2}, length);
181+
auto table_fewer_fields = Table::Make(schema_fewer_fields, {a_f0, a_f1}, length);
182+
// Three-field schema with three half-length columns: only the row count
// differs from `table`.
auto table_half =
183+
Table::Make(schema, {a_f0_half, a_f1_half, a_f2_half}, length / 2);
184+
// Same schema and length as `table`, built from separately generated columns.
auto table_different = Table::Make(
185+
schema, {a_f0_different, a_f1_different, a_f2_different}, length);
186+
187+
// Same Values
188+
ASSERT_TRUE(table->Equals(*table_same));
189+
190+
// Different number of columns
191+
ASSERT_FALSE(table->Equals(*table_fewer_fields));
192+
193+
// Different number of rows
194+
ASSERT_FALSE(table->Equals(*table_half));
195+
196+
// Different values
197+
ASSERT_FALSE(table->Equals(*table_different));
198+
}
199+
200+
// Checks how the use_schema / use_metadata options and the check_metadata
// argument affect Table::Equals for tables that share the same column data
// but differ in a field name or in schema metadata.
TEST(TestTableEquality, MetadataAndSchema) {
201+
const int32_t length = 10;
202+
203+
auto f0 = field("f0", int32());
204+
auto f1 = field("f1", uint8());
205+
auto f2 = field("f2", int16());
206+
auto f2_renamed = field("f2b", int16());
207+
208+
auto metadata = key_value_metadata({"foo"}, {"bar"});
209+
210+
auto schema = ::arrow::schema({f0, f1, f2});
211+
auto schema_with_metadata = schema->WithMetadata(metadata);
212+
auto schema_renamed_field = ::arrow::schema({f0, f1, f2_renamed});
213+
214+
random::RandomArrayGenerator gen(42);
215+
216+
auto a_f0 = gen.ArrayOf(int32(), length);
217+
auto a_f1 = gen.ArrayOf(uint8(), length);
218+
auto a_f2 = gen.ArrayOf(int16(), length);
219+
auto a_f2_renamed = a_f2;
220+
221+
// All Tables have the same values but different schemas.
222+
auto table = Table::Make(schema, {a_f0, a_f1, a_f2}, length);
223+
auto table_with_metadata =
224+
Table::Make(schema_with_metadata, {a_f0, a_f1, a_f2}, length);
225+
auto table_renamed_field =
226+
Table::Make(schema_renamed_field, {a_f0, a_f1, a_f2_renamed}, length);
227+
228+
auto options = EqualOptions::Defaults();
229+
230+
// Same values and types, but different field names
231+
ASSERT_FALSE(table->Equals(*table_renamed_field));
232+
ASSERT_TRUE(table->Equals(*table_renamed_field, options.use_schema(false)));
233+
234+
// Different metadata
235+
ASSERT_TRUE(table->Equals(*table_with_metadata));
236+
ASSERT_TRUE(table->Equals(*table_with_metadata, options));
237+
ASSERT_FALSE(table->Equals(*table_with_metadata,
238+
/*check_metadata=*/true));
239+
ASSERT_FALSE(table->Equals(*table_with_metadata,
240+
/*check_metadata=*/true, options.use_schema(true)));
241+
// With use_schema(false) the tables are expected to compare equal even
// though metadata checking is requested.
ASSERT_TRUE(table->Equals(*table_with_metadata,
242+
/*check_metadata=*/true, options.use_schema(false)));
243+
ASSERT_TRUE(
244+
table->Equals(*table_with_metadata, options.use_schema(true).use_metadata(false)));
245+
ASSERT_FALSE(
246+
table->Equals(*table_with_metadata, options.use_schema(true).use_metadata(true)));
247+
ASSERT_TRUE(
248+
table->Equals(*table_with_metadata, options.use_schema(false).use_metadata(true)));
249+
}
250+
251+
// Two separately built tables with identical int32 and float64 contents are
// expected to compare equal under the default options.
TEST(TestTableEqualityFloatType, SameValue) {
252+
auto schema = ::arrow::schema({field("f0", int32()), field("f1", float64())});
253+
auto table = TableFromJSON(
254+
schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": 5.0}, {"f0": 3, "f1": 6.0}])"});
255+
auto other_table = TableFromJSON(
256+
schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": 5.0}, {"f0": 3, "f1": 6.0}])"});
257+
258+
ASSERT_TRUE(table->Equals(*other_table));
259+
}
260+
261+
// The two tables differ only in the sign of their zero values in f1: they are
// expected to be equal by default and unequal with signed_zeros_equal(false).
// NOTE(review): "Singed" looks like a typo for "Signed"; renaming the test
// would change its gtest filter name.
TEST(TestTableEqualityFloatType, SingedZero) {
262+
auto schema = ::arrow::schema({field("f0", int32()), field("f1", float64())});
263+
auto table = TableFromJSON(
264+
schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": -0.0}, {"f0": 3, "f1": 0.0}])"});
265+
auto other_table = TableFromJSON(
266+
schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": 0.0}, {"f0": 3, "f1": -0.0}])"});
267+
auto options = EqualOptions::Defaults();
268+
269+
ASSERT_TRUE(table->Equals(*other_table, options));
270+
ASSERT_FALSE(table->Equals(*other_table, options.signed_zeros_equal(false)));
271+
}
272+
273+
// A table holding Inf is expected to differ from one holding -Inf in the same
// position and to equal one holding Inf there.
TEST(TestTableEqualityFloatType, Infinity) {
274+
auto schema = ::arrow::schema({field("f0", int32()), field("f1", float64())});
275+
auto table = TableFromJSON(
276+
schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": 5.0}, {"f0": 3, "f1": Inf}])"});
277+
auto table_different_inf = TableFromJSON(
278+
schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": 5.0}, {"f0": 3, "f1": -Inf}])"});
279+
auto table_same_inf = TableFromJSON(
280+
schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": 5.0}, {"f0": 3, "f1": Inf}])"});
281+
282+
ASSERT_FALSE(table->Equals(*table_different_inf));
283+
ASSERT_TRUE(table->Equals(*table_same_inf));
284+
}
285+
286+
// Two tables with a NaN in the same position are expected to be unequal under
// the default options and equal once nans_equal(true) is requested.
TEST(TestTableEqualityFloatType, NaN) {
287+
auto schema = ::arrow::schema({field("f0", int32()), field("f1", float64())});
288+
auto table = TableFromJSON(
289+
schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": NaN}, {"f0": 3, "f1": 6.0}])"});
290+
auto other_table = TableFromJSON(
291+
schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": NaN}, {"f0": 3, "f1": 6.0}])"});
292+
auto options = EqualOptions::Defaults();
293+
294+
ASSERT_FALSE(table->Equals(*other_table, options));
295+
ASSERT_TRUE(table->Equals(*other_table, options.nans_equal(true)));
296+
}
297+
298+
// The f1 values of the two tables differ by 0.0001: the tables are expected to
// be unequal by default, equal with an absolute tolerance of 1e-3, and unequal
// with an absolute tolerance of 1e-5.
TEST(TestTableEqualityFloatType, Approximate) {
299+
auto schema = ::arrow::schema({field("f0", int32()), field("f1", float64())});
300+
auto table = TableFromJSON(
301+
schema,
302+
{R"([{"f0": 1, "f1": 4.0001}, {"f0": 2, "f1": 5.0001}, {"f0": 3, "f1": 6.0001}])"});
303+
auto other_table = TableFromJSON(
304+
schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": 5.0}, {"f0": 3, "f1": 6.0}])"});
305+
auto options = EqualOptions::Defaults();
306+
307+
ASSERT_FALSE(table->Equals(*other_table, options));
308+
309+
ASSERT_TRUE(table->Equals(*other_table, options.use_atol(true).atol(1e-3)));
310+
311+
ASSERT_FALSE(table->Equals(*other_table, options.use_atol(true).atol(1e-5)));
312+
}
313+
314+
// Compares a table with itself when no column is floating-point: equality is
// expected with and without nans_equal(true).
TEST(TestTableEqualitySameAddress, NonFloatType) {
315+
auto schema = ::arrow::schema({field("f0", int32()), field("f1", uint8())});
316+
auto table = TableFromJSON(
317+
schema, {R"([{"f0": 1, "f1": 4}, {"f0": 2, "f1": 5}, {"f0": 3, "f1": 6}])"});
318+
// Copy of the pointer, so both names refer to the same Table object.
auto other_table = table;
319+
auto options = EqualOptions::Defaults();
320+
321+
ASSERT_TRUE(table->Equals(*other_table, options));
322+
ASSERT_TRUE(table->Equals(*other_table, options.nans_equal(true)));
323+
}
324+
325+
// Compares a table with itself when it has a struct column whose children are
// utf8 and int64: equality is expected with and without nans_equal(true).
TEST(TestTableEqualitySameAddress, NestedTypesWithoutFloatType) {
326+
auto schema = ::arrow::schema(
327+
{field("f0", int32()), field("f1", struct_({{"f2", utf8()}, {"f3", int64()}}))});
328+
auto table = TableFromJSON(
329+
schema,
330+
{R"([{"f0": 1, "f1": {"f2": "4", "f3": 7}}, {"f0": 2, "f1": {"f2": "5", "f3": 8}}, {"f0": 3, "f1": {"f2" : "6", "f3": 9}}])"});
331+
// Copy of the pointer, so both names refer to the same Table object.
auto other_table = table;
332+
auto options = EqualOptions::Defaults();
333+
334+
ASSERT_TRUE(table->Equals(*other_table, options));
335+
ASSERT_TRUE(table->Equals(*other_table, options.nans_equal(true)));
336+
}
337+
338+
// Compares a table with itself when its float64 column holds a NaN: the
// comparison is expected to fail by default and to succeed with
// nans_equal(true).
TEST(TestTableEqualitySameAddress, FloatType) {
339+
auto schema = ::arrow::schema({field("f0", int32()), field("f1", float64())});
340+
auto table = TableFromJSON(
341+
schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": NaN}, {"f0": 3, "f1": 6.0}])"});
342+
// Copy of the pointer, so both names refer to the same Table object.
auto other_table = table;
343+
auto options = EqualOptions::Defaults();
344+
345+
ASSERT_FALSE(table->Equals(*other_table, options));
346+
ASSERT_TRUE(table->Equals(*other_table, options.nans_equal(true)));
347+
}
348+
349+
// Compares a table with itself when a float64 child of its struct column holds
// a NaN: the comparison is expected to fail by default and to succeed with
// nans_equal(true).
TEST(TestTableEqualitySameAddress, NestedTypesWithFloatType) {
350+
auto schema = ::arrow::schema(
351+
{field("f0", int32()), field("f1", struct_({{"f2", utf8()}, {"f3", float64()}}))});
352+
auto table = TableFromJSON(
353+
schema,
354+
{R"([{"f0": 1, "f1": {"f2": "4", "f3": 7.0}}, {"f0": 2, "f1": {"f2": "5", "f3": NaN}}, {"f0": 3,"f1": {"f2" : "6", "f3": 9.0}}])"});
355+
// Copy of the pointer, so both names refer to the same Table object.
auto other_table = table;
356+
auto options = EqualOptions::Defaults();
357+
358+
ASSERT_FALSE(table->Equals(*other_table, options));
359+
ASSERT_TRUE(table->Equals(*other_table, options.nans_equal(true)));
187360
}
188361

189362
TEST_F(TestTable, MakeEmpty) {

0 commit comments

Comments
 (0)