Skip to content

Commit 93a3482

Browse files
committed
remove remainder
1 parent ebfff37 commit 93a3482

File tree

6 files changed

+120
-76
lines changed

6 files changed

+120
-76
lines changed

cpp/src/arrow/compute/kernels/scalar_cast_test.cc

Lines changed: 56 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
#include "arrow/chunked_array.h"
3232
#include "arrow/extension_type.h"
3333
#include "arrow/status.h"
34+
#include "arrow/testing/builder.h"
3435
#include "arrow/testing/extension_type.h"
3536
#include "arrow/testing/gtest_util.h"
3637
#include "arrow/testing/random.h"
@@ -55,34 +56,65 @@ using internal::checked_pointer_cast;
5556

5657
namespace compute {
5758

59+
// Build array with invalid UTF-8 without JSON parsing (simdjson validates UTF-8)
60+
template <typename TYPE>
61+
static std::shared_ptr<Array> InvalidUtf8Impl() {
62+
std::vector<std::string> values = {"Hi", "olá mundo", "你好世界", "", "\xa0\xa1"};
63+
std::shared_ptr<Array> out;
64+
ArrayFromVector<TYPE, std::string>(values, &out);
65+
return out;
66+
}
67+
5868
static std::shared_ptr<Array> InvalidUtf8(std::shared_ptr<DataType> type) {
59-
return ArrayFromJSON(type,
60-
"["
61-
R"(
62-
"Hi",
63-
"olá mundo",
64-
"你好世界",
65-
"",
66-
)"
67-
"\"\xa0\xa1\""
68-
"]");
69+
switch (type->id()) {
70+
case Type::BINARY:
71+
return InvalidUtf8Impl<BinaryType>();
72+
case Type::STRING:
73+
return InvalidUtf8Impl<StringType>();
74+
case Type::LARGE_BINARY:
75+
return InvalidUtf8Impl<LargeBinaryType>();
76+
case Type::LARGE_STRING:
77+
return InvalidUtf8Impl<LargeStringType>();
78+
case Type::BINARY_VIEW:
79+
return InvalidUtf8Impl<BinaryViewType>();
80+
case Type::STRING_VIEW:
81+
return InvalidUtf8Impl<StringViewType>();
82+
default:
83+
return nullptr;
84+
}
85+
}
86+
87+
// Build array with invalid UTF-8 without JSON parsing (simdjson validates UTF-8)
88+
// All values are 3 bytes for fixed_size_binary(3) compatibility
89+
template <typename TYPE>
90+
static std::shared_ptr<Array> FixedSizeInvalidUtf8Impl(
91+
const std::shared_ptr<DataType>& type) {
92+
std::vector<std::string> values = {"Hi!", "", "", " ", "\xa0\xa1\xa2"};
93+
std::shared_ptr<Array> out;
94+
ArrayFromVector<TYPE, std::string>(type, values, &out);
95+
return out;
6996
}
7097

7198
static std::shared_ptr<Array> FixedSizeInvalidUtf8(std::shared_ptr<DataType> type) {
72-
if (type->id() == Type::FIXED_SIZE_BINARY) {
73-
// Assume a particular width for testing
74-
EXPECT_EQ(3, checked_cast<const FixedSizeBinaryType&>(*type).byte_width());
75-
}
76-
return ArrayFromJSON(type,
77-
"["
78-
R"(
79-
"Hi!",
80-
"lá",
81-
"你",
82-
" ",
83-
)"
84-
"\"\xa0\xa1\xa2\""
85-
"]");
99+
switch (type->id()) {
100+
case Type::FIXED_SIZE_BINARY:
101+
EXPECT_EQ(3, checked_cast<const FixedSizeBinaryType&>(*type).byte_width());
102+
return FixedSizeInvalidUtf8Impl<FixedSizeBinaryType>(type);
103+
case Type::BINARY:
104+
return FixedSizeInvalidUtf8Impl<BinaryType>(type);
105+
case Type::STRING:
106+
return FixedSizeInvalidUtf8Impl<StringType>(type);
107+
case Type::LARGE_BINARY:
108+
return FixedSizeInvalidUtf8Impl<LargeBinaryType>(type);
109+
case Type::LARGE_STRING:
110+
return FixedSizeInvalidUtf8Impl<LargeStringType>(type);
111+
case Type::BINARY_VIEW:
112+
return FixedSizeInvalidUtf8Impl<BinaryViewType>(type);
113+
case Type::STRING_VIEW:
114+
return FixedSizeInvalidUtf8Impl<StringViewType>(type);
115+
default:
116+
return nullptr;
117+
}
86118
}
87119

88120
static std::vector<std::shared_ptr<DataType>> kNumericTypes = {

cpp/src/arrow/compute/kernels/scalar_string_test.cc

Lines changed: 38 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1102,7 +1102,8 @@ TYPED_TEST(TestStringKernels, Utf8Reverse) {
11021102

11031103
// inputs with malformed utf8 chars would produce garbage output, but the end result
11041104
// would produce arrays with same lengths. Hence checking offset buffer equality
1105-
auto malformed_input = ArrayFromJSON(this->type(), "[\"ɑ\xFFɑa\", \"ɽ\xe1\xbdɽa\"]");
1105+
// Use MakeArray for invalid UTF-8 since simdjson validates UTF-8 in JSON
1106+
auto malformed_input = this->MakeArray({"ɑ\xFFɑa", "ɽ\xe1\xbdɽa"});
11061107
const Result<Datum>& res = CallFunction("utf8_reverse", {malformed_input});
11071108
ASSERT_TRUE(res->array()->buffers[1]->Equals(*malformed_input->data()->buffers[1]));
11081109
}
@@ -2513,104 +2514,99 @@ TYPED_TEST(TestBinaryKernels, SliceBytesBasic) {
25132514
}
25142515

25152516
TYPED_TEST(TestBinaryKernels, SliceBytesPosPos) {
  // The inputs and expectations contain bytes that are not valid UTF-8, so
  // they are built with MakeArray instead of JSON (simdjson validates UTF-8).
  const auto input =
      this->MakeArray({"", "a", "ab", "a\xc2\xa2", "ab\xc2\xa2", "ab\xc2\xffZ"});

  SliceOptions options{2, 4};
  const auto expected = this->MakeArray({"", "", "", "\xa2", "\xc2\xa2", "\xc2\xff"});
  this->CheckUnary("binary_slice", input, expected, &options);

  SliceOptions options_step{1, 5, 2};
  const auto expected_step = this->MakeArray({"", "", "b", "\xc2", "b\xa2", "b\xff"});
  this->CheckUnary("binary_slice", input, expected_step, &options_step);

  SliceOptions options_step_neg{5, 1, -2};
  const auto expected_step_neg = this->MakeArray({"", "", "", "\xa2", "\xa2", "Z\xc2"});
  this->CheckUnary("binary_slice", input, expected_step_neg, &options_step_neg);

  // Same options with stop moved to 0; the fifth input value differs here.
  options_step_neg.stop = 0;
  const auto input_stop_zero =
      this->MakeArray({"", "a", "ab", "a\xc2\xa2", "aZ\xc2\xa2", "ab\xc2\xffZ"});
  const auto expected_stop_zero =
      this->MakeArray({"", "", "b", "\xa2", "\xa2Z", "Z\xc2"});
  this->CheckUnary("binary_slice", input_stop_zero, expected_stop_zero,
                   &options_step_neg);
}
25392539

25402540
TYPED_TEST(TestBinaryKernels, SliceBytesPosNeg) {
  // The inputs and expectations contain bytes that are not valid UTF-8, so
  // they are built with MakeArray instead of JSON (simdjson validates UTF-8).
  const auto input =
      this->MakeArray({"", "a", "ab", "a\xc2\xa2", "aZ\xc2\xa2", "ab\xc2\xffZ"});

  SliceOptions options{2, -1};
  const auto expected = this->MakeArray({"", "", "", "", "\xc2", "\xc2\xff"});
  this->CheckUnary("binary_slice", input, expected, &options);

  SliceOptions options_step{1, -1, 2};
  const auto expected_step = this->MakeArray({"", "", "", "\xc2", "Z", "b\xff"});
  this->CheckUnary("binary_slice", input, expected_step, &options_step);

  // The negative-step cases use their own input arrays.
  SliceOptions options_step_neg{3, -4, -2};
  const auto input_step_neg =
      this->MakeArray({"", "a", "ab", "Z\xc2\xa2", "aZ\xc2\xa2", "ab\xc2\xffZ"});
  const auto expected_step_neg =
      this->MakeArray({"", "a", "b", "\xa2Z", "\xa2Z", "\xff"});
  this->CheckUnary("binary_slice", input_step_neg, expected_step_neg,
                   &options_step_neg);

  options_step_neg.stop = -5;
  const auto input_stop_moved =
      this->MakeArray({"", "a", "ab", "Z\xc2\xa2", "aZ\xc2\xa2", "aP\xc2\xffZ"});
  const auto expected_stop_moved =
      this->MakeArray({"", "a", "b", "\xa2Z", "\xa2Z", "\xffP"});
  this->CheckUnary("binary_slice", input_stop_moved, expected_stop_moved,
                   &options_step_neg);
}
25642563

25652564
TYPED_TEST(TestBinaryKernels, SliceBytesNegNeg) {
  // The inputs and expectations contain bytes that are not valid UTF-8, so
  // they are built with MakeArray instead of JSON (simdjson validates UTF-8).
  SliceOptions options{-2, -1};
  const auto first_input =
      this->MakeArray({"", "a", "ab", "Z\xc2\xa2", "aZ\xc2\xa2", "ab\xc2\xffZ"});
  const auto expected = this->MakeArray({"", "", "a", "\xc2", "\xc2", "\xff"});
  this->CheckUnary("binary_slice", first_input, expected, &options);

  // The remaining cases all slice the same input array.
  const auto input =
      this->MakeArray({"", "a", "ab", "Z\xc2\xa2", "aZ\xc2\xa2", "aP\xc2\xffZ"});

  SliceOptions options_step{-4, -1, 2};
  const auto expected_step = this->MakeArray({"", "", "a", "Z", "a\xc2", "P\xff"});
  this->CheckUnary("binary_slice", input, expected_step, &options_step);

  SliceOptions options_step_neg{-1, -3, -2};
  const auto expected_step_neg = this->MakeArray({"", "a", "b", "\xa2", "\xa2", "Z"});
  this->CheckUnary("binary_slice", input, expected_step_neg, &options_step_neg);

  options_step_neg.stop = -4;
  const auto expected_stop_moved =
      this->MakeArray({"", "a", "b", "\xa2Z", "\xa2Z", "Z\xc2"});
  this->CheckUnary("binary_slice", input, expected_stop_moved, &options_step_neg);
}
25882587

25892588
TYPED_TEST(TestBinaryKernels, SliceBytesNegPos) {
  // The inputs and expectations contain bytes that are not valid UTF-8, so
  // they are built with MakeArray instead of JSON (simdjson validates UTF-8).
  // Every case in this test slices the same input array.
  const auto input =
      this->MakeArray({"", "a", "ab", "Z\xc2\xa2", "aZ\xc2\xa2", "aP\xc2\xffZ"});

  SliceOptions options{-2, 4};
  const auto expected =
      this->MakeArray({"", "a", "ab", "\xc2\xa2", "\xc2\xa2", "\xff"});
  this->CheckUnary("binary_slice", input, expected, &options);

  SliceOptions options_step{-4, 4, 2};
  const auto expected_step = this->MakeArray({"", "a", "a", "Z\xa2", "a\xc2", "P\xff"});
  this->CheckUnary("binary_slice", input, expected_step, &options_step);

  SliceOptions options_step_neg{-1, 1, -2};
  const auto expected_step_neg = this->MakeArray({"", "", "", "\xa2", "\xa2", "Z\xc2"});
  this->CheckUnary("binary_slice", input, expected_step_neg, &options_step_neg);

  options_step_neg.stop = 0;
  const auto expected_stop_zero =
      this->MakeArray({"", "", "b", "\xa2", "\xa2Z", "Z\xc2"});
  this->CheckUnary("binary_slice", input, expected_stop_zero, &options_step_neg);
}
26152611

26162612
TYPED_TEST(TestStringKernels, PadAscii) {

cpp/src/arrow/json/from_string.cc

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,20 @@ std::string PreprocessNanInf(std::string_view json_string) {
132132
continue;
133133
}
134134
}
135+
// Check for -NaN
136+
if (i + 4 <= json_string.size() &&
137+
(json_string[i] == '-' && json_string[i + 1] == 'N' &&
138+
json_string[i + 2] == 'a' && json_string[i + 3] == 'N')) {
139+
bool before_ok =
140+
(i == 0 || !std::isalnum(static_cast<unsigned char>(json_string[i - 1])));
141+
bool after_ok = (i + 4 >= json_string.size() ||
142+
!std::isalnum(static_cast<unsigned char>(json_string[i + 4])));
143+
if (before_ok && after_ok) {
144+
result += "\"-NaN\"";
145+
i += 3;
146+
continue;
147+
}
148+
}
135149
}
136150

137151
result.push_back(c);
@@ -381,7 +395,7 @@ enable_if_half_float<T, Status> ConvertNumber(simdjson::dom::element json_obj,
381395
} else if (json_obj.is_string()) {
382396
// Handle NaN/Inf that were preprocessed to strings
383397
std::string_view str = json_obj.get_string().value();
384-
if (str == "NaN") {
398+
if (str == "NaN" || str == "-NaN") {
385399
*out = Float16(std::nan("")).bits();
386400
return Status::OK();
387401
} else if (str == "Inf") {
@@ -416,7 +430,7 @@ enable_if_physical_floating_point<T, Status> ConvertNumber(
416430
} else if (json_obj.is_string()) {
417431
// Handle NaN/Inf that were preprocessed to strings
418432
std::string_view str = json_obj.get_string().value();
419-
if (str == "NaN") {
433+
if (str == "NaN" || str == "-NaN") {
420434
*out = static_cast<typename T::c_type>(std::nan(""));
421435
return Status::OK();
422436
} else if (str == "Inf") {

cpp/src/arrow/meson.build

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -308,10 +308,7 @@ if needs_csv
308308
endif
309309

310310
if needs_json or needs_integration
311-
simdjson_dep = dependency(
312-
'simdjson',
313-
fallback: ['simdjson', 'simdjson_dep'],
314-
)
311+
simdjson_dep = dependency('simdjson', fallback: ['simdjson', 'simdjson_dep'])
315312
else
316313
simdjson_dep = disabler()
317314
endif

cpp/src/arrow/scalar_test.cc

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include <gtest/gtest.h>
2929

3030
#include "arrow/array.h"
31+
#include "arrow/array/builder_binary.h"
3132
#include "arrow/array/util.h"
3233
#include "arrow/buffer.h"
3334
#include "arrow/compute/cast.h"
@@ -1321,7 +1322,13 @@ class TestListLikeScalar : public ::testing::Test {
13211322

13221323
{
13231324
// Invalid UTF8 in child data
1324-
ScalarType scalar(ArrayFromJSON(utf8(), "[null, null, \"\xff\"]"));
1325+
// Build the array without JSON parsing since simdjson validates UTF-8
1326+
StringBuilder builder;
1327+
ASSERT_OK(builder.AppendNull());
1328+
ASSERT_OK(builder.AppendNull());
1329+
ASSERT_OK(builder.Append("\xff"));
1330+
ASSERT_OK_AND_ASSIGN(auto arr, builder.Finish());
1331+
ScalarType scalar(arr);
13251332
ASSERT_OK(scalar.Validate());
13261333
ASSERT_RAISES(Invalid, scalar.ValidateFull());
13271334
}

cpp/subprojects/packagefiles/simdjson/meson.build

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,7 @@
1515
# specific language governing permissions and limitations
1616
# under the License.
1717

18-
project('simdjson', 'cpp',
19-
version: '3.12.3',
20-
default_options: ['cpp_std=c++17'])
18+
project('simdjson', 'cpp', version: '3.12.3', default_options: ['cpp_std=c++17'])
2119

2220
simdjson_lib = static_library(
2321
'simdjson',

0 commit comments

Comments
 (0)