Skip to content

Commit 174e514

Browse files
committed
work
1 parent 3e6c5ba commit 174e514

File tree

2 files changed

+20
-6
lines changed

2 files changed

+20
-6
lines changed

cpp/src/arrow/compute/kernels/scalar_string_test.cc

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1225,8 +1225,8 @@ TYPED_TEST(TestStringKernels, Utf8Upper) {
12251225
// test maximum buffer growth
12261226
this->CheckUnary("utf8_upper", "[\"ɑɑɑɑ\"]", this->type(), "[\"ⱭⱭⱭⱭ\"]");
12271227

1228-
// Test invalid data
1229-
auto invalid_input = ArrayFromJSON(this->type(), "[\"ɑa\xFFɑ\", \"ɽ\xe1\xbdɽaa\"]");
1228+
// Test invalid data - construct the array with MakeArray instead of ArrayFromJSON, since simdjson validates UTF-8 strictly
1229+
auto invalid_input = this->MakeArray({"ɑa\xFFɑ", "ɽ\xe1\xbdɽaa"});
12301230
EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("Invalid UTF8 sequence"),
12311231
CallFunction("utf8_upper", {invalid_input}));
12321232
}
@@ -1247,8 +1247,8 @@ TYPED_TEST(TestStringKernels, Utf8Lower) {
12471247
// test maximum buffer growth
12481248
this->CheckUnary("utf8_lower", "[\"ȺȺȺȺ\"]", this->type(), "[\"ⱥⱥⱥⱥ\"]");
12491249

1250-
// Test invalid data
1251-
auto invalid_input = ArrayFromJSON(this->type(), "[\"Ⱥa\xFF\", \"\xe1\xbdⱤaA\"]");
1250+
// Test invalid data - construct the array with MakeArray instead of ArrayFromJSON, since simdjson validates UTF-8 strictly
1251+
auto invalid_input = this->MakeArray({"Ⱥa\xFF", "\xe1\xbdⱤaA"});
12521252
EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("Invalid UTF8 sequence"),
12531253
CallFunction("utf8_lower", {invalid_input}));
12541254
}
@@ -1267,8 +1267,8 @@ TYPED_TEST(TestStringKernels, Utf8SwapCase) {
12671267
this->CheckUnary("utf8_swapcase", "[\"hEllO, WoRld!\", \"$. A35?\"]", this->type(),
12681268
"[\"HeLLo, wOrLD!\", \"$. a35?\"]");
12691269

1270-
// Test invalid data
1271-
auto invalid_input = ArrayFromJSON(this->type(), "[\"Ⱥa\xFF\", \"\xe1\xbdⱤaA\"]");
1270+
// Test invalid data - construct the array with MakeArray instead of ArrayFromJSON, since simdjson validates UTF-8 strictly
1271+
auto invalid_input = this->MakeArray({"Ⱥa\xFF", "\xe1\xbdⱤaA"});
12721272
EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("Invalid UTF8 sequence"),
12731273
CallFunction("utf8_swapcase", {invalid_input}));
12741274
}

cpp/src/arrow/dataset/file_json.cc

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,18 @@ namespace {
5353

5454
using ReaderPtr = std::shared_ptr<json::StreamingReader>;
5555

56+
// Strip a UTF-8 byte order mark (BOM) from the beginning of `data` if present.
//
// Returns `data` with its first three bytes removed when they are exactly
// 0xEF 0xBB 0xBF; otherwise returns `data` unchanged (including when `data`
// is shorter than three bytes). The result is a std::string_view, so no bytes
// are copied and it refers to the same underlying storage as the argument.
57+
std::string_view StripBOM(std::string_view data) {
58+
// UTF-8 BOM is the three-byte sequence 0xEF 0xBB 0xBF
59+
if (data.size() >= 3) {
60+
// View the characters as uint8_t so the comparisons below are made on unsigned byte values
const auto* bytes = reinterpret_cast<const uint8_t*>(data.data());
61+
if (bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
62+
// Drop only the BOM; everything after it is returned as-is
return data.substr(3);
63+
}
64+
}
65+
// No BOM (or input too short to contain one): return the input untouched
return data;
66+
}
67+
5668
struct JsonInspectedFragment : public InspectedFragment {
5769
JsonInspectedFragment() : InspectedFragment({}) {}
5870
JsonInspectedFragment(std::vector<std::string> column_names,
@@ -137,6 +149,8 @@ json::ParseOptions GetInitialParseOptions(json::ParseOptions options) {
137149

138150
Result<std::shared_ptr<StructType>> ParseToStructType(
139151
std::string_view data, const json::ParseOptions& parse_options, MemoryPool* pool) {
152+
// Strip UTF-8 BOM if present
153+
data = StripBOM(data);
140154
auto full_buffer = std::make_shared<Buffer>(data);
141155
std::shared_ptr<Buffer> buffer, partial;
142156
auto chunker = json::MakeChunker(parse_options);

0 commit comments

Comments
 (0)