@@ -105,7 +105,8 @@ int64_t CountQuotes(std::string_view s) {
105105
// Number of characters added by the pair of quotes surrounding a quoted value.
constexpr int64_t kQuoteCount = 2;
// Number of characters occupied by a single field delimiter.
constexpr int64_t kDelimiterCount = 1;
109110
110111// Interface for generating CSV data per column.
111112// The intended usage is to iteratively call UpdateRowLengths for a column and
@@ -176,6 +177,34 @@ char* Escape(std::string_view s, char* out) {
176177 return out;
177178}
178179
// Return the index of the first structural char in the input, or `buffer_size`
// if the input contains none. A structural char is a character that needs
// quoting and/or escaping: '\n', '\r', '"' or `delimiter`.
//
// `data` must point to at least `buffer_size` readable bytes.
int64_t StopAtStructuralChar(const uint8_t* data, const int64_t buffer_size,
                             const char delimiter) {
  int64_t offset = 0;
#if defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_NEON)
  // _mm_cmpistrc gives slightly better performance than the naive approach,
  // probably doesn't deserve the effort
  using simd_batch = xsimd::make_sized_batch_t<uint8_t, 16>;
  // Skip whole 16-byte blocks that contain no structural char. The loop stops
  // at the start of the first block that has one; the exact position inside
  // that block is found by the scalar loop below.
  while ((offset + 16) <= buffer_size) {
    const auto v = simd_batch::load_unaligned(data + offset);
    if (xsimd::any((v == '\n') | (v == '\r') | (v == '"') | (v == delimiter))) {
      break;
    }
    offset += 16;
  }
#endif
  // Scalar scan: either the vectorized loop stopped on a block containing a
  // structural char, or fewer than 16 bytes remain (or no SIMD is available).
  while (offset < buffer_size) {
    const char c = static_cast<char>(data[offset]);
    if (c == '\n' || c == '\r' || c == '"' || c == delimiter) {
      break;
    }
    ++offset;
  }
  return offset;
}
207+
179208// Populator used for non-string/binary types, or when unquoted strings/binary types are
180209// desired. It assumes the strings in the casted array do not require quoting or escaping.
181210// This is enforced by setting reject_values_with_quotes to true, in which case a check
@@ -268,35 +297,18 @@ class UnquotedColumnPopulator : public ColumnPopulator {
268297 // scan the underlying string array buffer as a single big string
269298 const uint8_t * const data = array.raw_data () + array.value_offset (0 );
270299 const int64_t buffer_size = array.total_values_length ();
271- int64_t offset = 0 ;
272- #if defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_NEON)
273- // _mm_cmpistrc gives slightly better performance than the naive approach,
274- // probably doesn't deserve the effort
275- using simd_batch = xsimd::make_sized_batch_t <uint8_t , 16 >;
276- while ((offset + 16 ) <= buffer_size) {
277- const auto v = simd_batch::load_unaligned (data + offset);
278- if (xsimd::any ((v == ' \n ' ) | (v == ' \r ' ) | (v == ' "' ) | (v == delimiter))) {
279- break ;
280- }
281- offset += 16 ;
282- }
283- #endif
284- while (offset < buffer_size) {
285- // error happened or remaining bytes to check
286- const char c = static_cast <char >(data[offset]);
287- if (c == ' \n ' || c == ' \r ' || c == ' "' || c == delimiter) {
288- // extract the offending string from array per offset
289- const auto * offsets = array.raw_value_offsets ();
290- const auto index =
291- std::upper_bound (offsets, offsets + array.length (), offset + offsets[0 ]) -
292- offsets;
293- DCHECK_GT (index, 0 );
294- return Status::Invalid (
295- " CSV values may not contain structural characters if quoting style is "
296- " \" None\" . See RFC4180. Invalid value: " ,
297- array.GetView (index - 1 ));
298- }
299- ++offset;
300+ if (int64_t offset = StopAtStructuralChar (data, buffer_size, delimiter);
301+ offset != buffer_size) {
302+ // extract the offending string from array per offset
303+ const auto * offsets = array.raw_value_offsets ();
304+ const auto index =
305+ std::upper_bound (offsets, offsets + array.length (), offset + offsets[0 ]) -
306+ offsets;
307+ DCHECK_GT (index, 0 );
308+ return Status::Invalid (
309+ " CSV values may not contain structural characters if quoting style is "
310+ " \" None\" . See RFC4180. Invalid value: " ,
311+ array.GetView (index - 1 ));
300312 }
301313 return Status::OK ();
302314 }
@@ -578,26 +590,62 @@ class CSVWriterImpl : public ipc::RecordBatchWriter {
578590 return Status::OK ();
579591 }
580592
581- int64_t CalculateHeaderSize () const {
593+ int64_t CalculateHeaderSize (QuotingStyle quoting_style ) const {
582594 int64_t header_length = 0 ;
583595 for (int col = 0 ; col < schema_->num_fields (); col++) {
584596 const std::string& col_name = schema_->field (col)->name ();
585597 header_length += col_name.size ();
586- header_length += CountQuotes (col_name);
598+ switch (quoting_style) {
599+ case QuotingStyle::None:
600+ break ;
601+ case QuotingStyle::Needed:
602+ case QuotingStyle::AllValid:
603+ header_length += CountQuotes (col_name);
604+ break ;
605+ }
606+ }
607+ header_length += kDelimiterCount * (schema_->num_fields () - 1 ) + options_.eol .size ();
608+ switch (quoting_style) {
609+ case QuotingStyle::None:
610+ break ;
611+ case QuotingStyle::Needed:
612+ case QuotingStyle::AllValid:
613+ header_length += kQuoteCount * schema_->num_fields ();
614+ break ;
587615 }
588- // header_length + ([quotes + ','] * schema_->num_fields()) + (eol - ',')
589- return header_length + (kQuoteDelimiterCount * schema_->num_fields ()) +
590- (options_.eol .size () - 1 );
616+ return header_length;
591617 }
592618
593619 Status WriteHeader () {
594620 // Only called once, as part of initialization
595- RETURN_NOT_OK (data_buffer_->Resize (CalculateHeaderSize (), /* shrink_to_fit=*/ false ));
621+ RETURN_NOT_OK (data_buffer_->Resize (CalculateHeaderSize (options_.quoting_header ),
622+ /* shrink_to_fit=*/ false ));
596623 char * next = reinterpret_cast <char *>(data_buffer_->mutable_data ());
597624 for (int col = 0 ; col < schema_->num_fields (); ++col) {
598- *next++ = ' "' ;
599- next = Escape (schema_->field (col)->name (), next);
600- *next++ = ' "' ;
625+ const std::string& col_name = schema_->field (col)->name ();
626+ switch (options_.quoting_header ) {
627+ case QuotingStyle::None:
628+ if (StopAtStructuralChar (reinterpret_cast <const uint8_t *>(col_name.c_str ()),
629+ col_name.length (), options_.delimiter ) !=
630+ static_cast <int64_t >(col_name.length ())) {
631+ return Status::Invalid (
632+ " CSV header may not contain structural characters if quoting style is "
633+ " \" None\" . See RFC4180. Invalid value: " ,
634+ col_name);
635+ }
636+ memcpy (next, col_name.data (), col_name.size ());
637+ next += col_name.size ();
638+ break ;
639+ case QuotingStyle::Needed:
640+ case QuotingStyle::AllValid:
641+ // QuotingStyle::Needed is defined as always quoting string/binary data,
642+ // regardless of whether it contains structural chars.
643+ // We use consistent semantics for header names, which are strings.
644+ *next++ = ' "' ;
645+ next = Escape (schema_->field (col)->name (), next);
646+ *next++ = ' "' ;
647+ break ;
648+ }
601649 if (col != schema_->num_fields () - 1 ) {
602650 *next++ = options_.delimiter ;
603651 }
0 commit comments