Skip to content

Commit 72b0346

Browse files
HuaHuaY and wgtmac authored
GH-41239: [C++] Support to write csv header without quotes (#47524)
### Rationale for this change

Provide an option to control whether the CSV header is written with quotes in the C++ CSV writer.

### What changes are included in this PR?

1. Add a `QuotingStyle quoting_header` option to `WriteOptions` that determines whether `CSVWriterImpl::WriteHeader` writes quotes around column names.
2. Move part of the code in `UnquotedColumnPopulator::CheckStringArrayHasNoStructuralChars` into a new function so that the structural-character check can be reused.

### Are these changes tested?

Yes.

### Are there any user-facing changes?

Adds a `QuotingStyle quoting_header` option to `WriteOptions`.

* GitHub Issue: #41239

Lead-authored-by: Zehua Zou <[email protected]>
Co-authored-by: Zehua Zou <[email protected]>
Co-authored-by: Gang Wu <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
1 parent 931acd8 commit 72b0346

File tree

3 files changed

+121
-41
lines changed

3 files changed

+121
-41
lines changed

cpp/src/arrow/csv/options.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,12 @@ struct ARROW_EXPORT WriteOptions {
209209
/// \brief Quoting style
210210
QuotingStyle quoting_style = QuotingStyle::Needed;
211211

212+
/// \brief Quoting style of header
213+
///
214+
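/// Note that `QuotingStyle::Needed` and `QuotingStyle::AllValid` have the same
215+
/// effect of quoting all column names. With `QuotingStyle::None`, column names are
/// written unquoted, and writing the header fails with an Invalid status if a name
/// contains a structural character (the delimiter, a double quote, CR or LF).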
216+
QuotingStyle quoting_header = QuotingStyle::Needed;
217+
212218
/// Create write options with default values
213219
static WriteOptions Defaults();
214220

cpp/src/arrow/csv/writer.cc

Lines changed: 87 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,8 @@ int64_t CountQuotes(std::string_view s) {
105105

106106
// Matching quote pair character length.
107107
constexpr int64_t kQuoteCount = 2;
108-
constexpr int64_t kQuoteDelimiterCount = kQuoteCount + /*end_char*/ 1;
108+
// Delimiter character length.
109+
constexpr int64_t kDelimiterCount = 1;
109110

110111
// Interface for generating CSV data per column.
111112
// The intended usage is to iteratively call UpdateRowLengths for a column and
@@ -176,6 +177,34 @@ char* Escape(std::string_view s, char* out) {
176177
return out;
177178
}
178179

180+
// Return the index of the first structural char in the input, or `buffer_size`
// when the input contains none. A structural char
181+
// is a character that needs quoting and/or escaping: '\n', '\r', '"' or `delimiter`.
//
// \param data pointer to the bytes to scan
// \param buffer_size number of bytes readable from `data`
// \param delimiter the field delimiter, treated as structural
// \return offset of the first structural byte; equals `buffer_size` if none is found
182+
int64_t StopAtStructuralChar(const uint8_t* data, const int64_t buffer_size,
183+
const char delimiter) {
184+
int64_t offset = 0;
185+
#if defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_NEON)
// Fast path: test 16 bytes per iteration. On a hit the loop stops WITHOUT
// advancing `offset`, so the scalar loop below pinpoints the exact byte.
186+
// _mm_cmpistrc gives slightly better performance than the naive approach,
187+
// probably doesn't deserve the effort
188+
using simd_batch = xsimd::make_sized_batch_t<uint8_t, 16>;
189+
while ((offset + 16) <= buffer_size) {
190+
const auto v = simd_batch::load_unaligned(data + offset);
191+
if (xsimd::any((v == '\n') | (v == '\r') | (v == '"') | (v == delimiter))) {
192+
break;
193+
}
194+
offset += 16;
195+
}
196+
#endif
197+
while (offset < buffer_size) {
198+
// Byte-by-byte scan from `offset`: either the SIMD loop stopped on a block that
// contains a structural char, or fewer than 16 bytes remain to check (this loop
// also scans the whole input when the SIMD path is not compiled in).
199+
const char c = static_cast<char>(data[offset]);
200+
if (c == '\n' || c == '\r' || c == '"' || c == delimiter) {
201+
break;
202+
}
203+
++offset;
204+
}
205+
// `offset == buffer_size` here means no structural char was found.
return offset;
206+
}
207+
179208
// Populator used for non-string/binary types, or when unquoted strings/binary types are
180209
// desired. It assumes the strings in the casted array do not require quoting or escaping.
181210
// This is enforced by setting reject_values_with_quotes to true, in which case a check
@@ -268,35 +297,18 @@ class UnquotedColumnPopulator : public ColumnPopulator {
268297
// scan the underlying string array buffer as a single big string
269298
const uint8_t* const data = array.raw_data() + array.value_offset(0);
270299
const int64_t buffer_size = array.total_values_length();
271-
int64_t offset = 0;
272-
#if defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_NEON)
273-
// _mm_cmpistrc gives slightly better performance than the naive approach,
274-
// probably doesn't deserve the effort
275-
using simd_batch = xsimd::make_sized_batch_t<uint8_t, 16>;
276-
while ((offset + 16) <= buffer_size) {
277-
const auto v = simd_batch::load_unaligned(data + offset);
278-
if (xsimd::any((v == '\n') | (v == '\r') | (v == '"') | (v == delimiter))) {
279-
break;
280-
}
281-
offset += 16;
282-
}
283-
#endif
284-
while (offset < buffer_size) {
285-
// error happened or remaining bytes to check
286-
const char c = static_cast<char>(data[offset]);
287-
if (c == '\n' || c == '\r' || c == '"' || c == delimiter) {
288-
// extract the offending string from array per offset
289-
const auto* offsets = array.raw_value_offsets();
290-
const auto index =
291-
std::upper_bound(offsets, offsets + array.length(), offset + offsets[0]) -
292-
offsets;
293-
DCHECK_GT(index, 0);
294-
return Status::Invalid(
295-
"CSV values may not contain structural characters if quoting style is "
296-
"\"None\". See RFC4180. Invalid value: ",
297-
array.GetView(index - 1));
298-
}
299-
++offset;
300+
if (int64_t offset = StopAtStructuralChar(data, buffer_size, delimiter);
301+
offset != buffer_size) {
302+
// extract the offending string from array per offset
303+
const auto* offsets = array.raw_value_offsets();
304+
const auto index =
305+
std::upper_bound(offsets, offsets + array.length(), offset + offsets[0]) -
306+
offsets;
307+
DCHECK_GT(index, 0);
308+
return Status::Invalid(
309+
"CSV values may not contain structural characters if quoting style is "
310+
"\"None\". See RFC4180. Invalid value: ",
311+
array.GetView(index - 1));
300312
}
301313
return Status::OK();
302314
}
@@ -578,26 +590,62 @@ class CSVWriterImpl : public ipc::RecordBatchWriter {
578590
return Status::OK();
579591
}
580592

581-
int64_t CalculateHeaderSize() const {
593+
int64_t CalculateHeaderSize(QuotingStyle quoting_style) const {
582594
int64_t header_length = 0;
583595
for (int col = 0; col < schema_->num_fields(); col++) {
584596
const std::string& col_name = schema_->field(col)->name();
585597
header_length += col_name.size();
586-
header_length += CountQuotes(col_name);
598+
switch (quoting_style) {
599+
case QuotingStyle::None:
600+
break;
601+
case QuotingStyle::Needed:
602+
case QuotingStyle::AllValid:
603+
header_length += CountQuotes(col_name);
604+
break;
605+
}
606+
}
607+
header_length += kDelimiterCount * (schema_->num_fields() - 1) + options_.eol.size();
608+
switch (quoting_style) {
609+
case QuotingStyle::None:
610+
break;
611+
case QuotingStyle::Needed:
612+
case QuotingStyle::AllValid:
613+
header_length += kQuoteCount * schema_->num_fields();
614+
break;
587615
}
588-
// header_length + ([quotes + ','] * schema_->num_fields()) + (eol - ',')
589-
return header_length + (kQuoteDelimiterCount * schema_->num_fields()) +
590-
(options_.eol.size() - 1);
616+
return header_length;
591617
}
592618

593619
Status WriteHeader() {
594620
// Only called once, as part of initialization
595-
RETURN_NOT_OK(data_buffer_->Resize(CalculateHeaderSize(), /*shrink_to_fit=*/false));
621+
RETURN_NOT_OK(data_buffer_->Resize(CalculateHeaderSize(options_.quoting_header),
622+
/*shrink_to_fit=*/false));
596623
char* next = reinterpret_cast<char*>(data_buffer_->mutable_data());
597624
for (int col = 0; col < schema_->num_fields(); ++col) {
598-
*next++ = '"';
599-
next = Escape(schema_->field(col)->name(), next);
600-
*next++ = '"';
625+
const std::string& col_name = schema_->field(col)->name();
626+
switch (options_.quoting_header) {
627+
case QuotingStyle::None:
628+
if (StopAtStructuralChar(reinterpret_cast<const uint8_t*>(col_name.c_str()),
629+
col_name.length(), options_.delimiter) !=
630+
static_cast<int64_t>(col_name.length())) {
631+
return Status::Invalid(
632+
"CSV header may not contain structural characters if quoting style is "
633+
"\"None\". See RFC4180. Invalid value: ",
634+
col_name);
635+
}
636+
memcpy(next, col_name.data(), col_name.size());
637+
next += col_name.size();
638+
break;
639+
case QuotingStyle::Needed:
640+
case QuotingStyle::AllValid:
641+
// QuotingStyle::Needed is defined as always quoting string/binary data,
642+
// regardless of whether it contains structural chars.
643+
// We use consistent semantics for header names, which are strings.
644+
*next++ = '"';
645+
next = Escape(schema_->field(col)->name(), next);
646+
*next++ = '"';
647+
break;
648+
}
601649
if (col != schema_->num_fields() - 1) {
602650
*next++ = options_.delimiter;
603651
}

cpp/src/arrow/csv/writer_test.cc

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,10 +61,12 @@ WriteOptions DefaultTestOptions(bool include_header = false,
6161
const std::string& null_string = "",
6262
QuotingStyle quoting_style = QuotingStyle::Needed,
6363
const std::string& eol = "\n", char delimiter = ',',
64-
int batch_size = 5) {
64+
int batch_size = 5,
65+
QuotingStyle quoting_header = QuotingStyle::Needed) {
6566
WriteOptions options;
6667
options.batch_size = batch_size;
6768
options.include_header = include_header;
69+
options.quoting_header = quoting_header;
6870
options.null_string = null_string;
6971
options.eol = eol;
7072
options.quoting_style = quoting_style;
@@ -91,6 +93,17 @@ std::vector<WriterTestParams> GenerateTestCases() {
9193
auto dummy_schema = schema({field("a", uint8())});
9294
std::string dummy_batch_data = R"([{"a": null}])";
9395

96+
auto header_without_structural_charaters =
97+
schema({field("a ", uint64()), field("b", int32())});
98+
std::string expected_header_without_structural_charaters =
99+
std::string(R"(a ,b)") + "\n";
100+
auto expected_status_no_quotes_with_structural_in_header = [](const char* header) {
101+
return Status::Invalid(
102+
"CSV header may not contain structural characters if quoting "
103+
"style is \"None\". See RFC4180. Invalid value: ",
104+
header);
105+
};
106+
94107
// Schema to test various types.
95108
auto abc_schema = schema({
96109
field("a", uint64()),
@@ -279,7 +292,20 @@ std::vector<WriterTestParams> GenerateTestCases() {
279292
{schema_custom_delimiter, batch_custom_delimiter,
280293
DefaultTestOptions(/*include_header=*/false, /*null_string=*/"",
281294
QuotingStyle::Needed, /*eol=*/";", /*delimiter=*/';'),
282-
/*expected_output*/ "", expected_status_illegal_delimiter(';')}};
295+
/*expected_output*/ "", expected_status_illegal_delimiter(';')},
296+
{header_without_structural_charaters, "[]",
297+
DefaultTestOptions(/*include_header=*/true, /*null_string=*/"",
298+
QuotingStyle::Needed, /*eol=*/"\n",
299+
/*delimiter=*/',', /*batch_size=*/5,
300+
/*quoting_header=*/QuotingStyle::None),
301+
expected_header_without_structural_charaters},
302+
{abc_schema, "[]",
303+
DefaultTestOptions(/*include_header=*/true, /*null_string=*/"",
304+
QuotingStyle::Needed, /*eol=*/"\n",
305+
/*delimiter=*/',', /*batch_size=*/5,
306+
/*quoting_header=*/QuotingStyle::None),
307+
"", expected_status_no_quotes_with_structural_in_header("b\"")},
308+
};
283309
}
284310

285311
class TestWriteCSV : public ::testing::TestWithParam<WriterTestParams> {

0 commit comments

Comments
 (0)