Skip to content

Commit c6090ed

Browse files
authored
apacheGH-49069: [C++] Share Trie instances across CSV value decoders (apache#49070)
### Rationale for this change The CSV converter was building identical Trie data structures (for null/true/false values) in every decoder instance, causing duplicate memory allocation and initialization overhead. ### What changes are included in this PR? - Introduced `TrieCache` struct to hold shared Trie instances (null_trie, true_trie, false_trie) - Updated `ValueDecoder` and all decoder subclasses to accept and reference a shared `TrieCache` instead of building their own Tries - Updated `Converter` base class to create one `TrieCache` per converter and pass it to all decoders ### Are these changes tested? Yes, all existing tests. I ran a simple benchmark showing roughly 2-4% faster converter creation, and obviously less memory usage. ### Are there any user-facing changes? No. * GitHub Issue: apache#49069 Authored-by: Hyukjin Kwon <[email protected]> Signed-off-by: Sutou Kouhei <[email protected]>
1 parent 9106671 commit c6090ed

File tree

2 files changed

+66
-44
lines changed

2 files changed

+66
-44
lines changed

cpp/src/arrow/csv/converter.cc

Lines changed: 63 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -105,31 +105,45 @@ Status PresizeBuilder(const BlockParser& parser, BuilderType* builder) {
105105
}
106106
}
107107

108+
/////////////////////////////////////////////////////////////////////////
109+
// Shared Tries cache to avoid rebuilding them for each decoder instance
110+
111+
struct TrieCache {
112+
Trie null_trie;
113+
Trie true_trie;
114+
Trie false_trie;
115+
116+
static Result<std::shared_ptr<TrieCache>> Make(const ConvertOptions& options) {
117+
auto cache = std::make_shared<TrieCache>();
118+
RETURN_NOT_OK(InitializeTrie(options.null_values, &cache->null_trie));
119+
RETURN_NOT_OK(InitializeTrie(options.true_values, &cache->true_trie));
120+
RETURN_NOT_OK(InitializeTrie(options.false_values, &cache->false_trie));
121+
return cache;
122+
}
123+
};
124+
108125
/////////////////////////////////////////////////////////////////////////
109126
// Per-type value decoders
110127

111128
struct ValueDecoder {
112129
explicit ValueDecoder(const std::shared_ptr<DataType>& type,
113-
const ConvertOptions& options)
114-
: type_(type), options_(options) {}
130+
const ConvertOptions& options, const TrieCache* trie_cache)
131+
: type_(type), options_(options), trie_cache_(trie_cache) {}
115132

116-
Status Initialize() {
117-
// TODO no need to build a separate Trie for each instance
118-
return InitializeTrie(options_.null_values, &null_trie_);
119-
}
133+
Status Initialize() { return Status::OK(); }
120134

121135
bool IsNull(const uint8_t* data, uint32_t size, bool quoted) {
122136
if (quoted && !options_.quoted_strings_can_be_null) {
123137
return false;
124138
}
125-
return null_trie_.Find(std::string_view(reinterpret_cast<const char*>(data), size)) >=
126-
0;
139+
return trie_cache_->null_trie.Find(
140+
std::string_view(reinterpret_cast<const char*>(data), size)) >= 0;
127141
}
128142

129143
protected:
130-
Trie null_trie_;
131144
const std::shared_ptr<DataType> type_;
132145
const ConvertOptions& options_;
146+
const TrieCache* trie_cache_;
133147
};
134148

135149
//
@@ -140,8 +154,9 @@ struct FixedSizeBinaryValueDecoder : public ValueDecoder {
140154
using value_type = const uint8_t*;
141155

142156
explicit FixedSizeBinaryValueDecoder(const std::shared_ptr<DataType>& type,
143-
const ConvertOptions& options)
144-
: ValueDecoder(type, options),
157+
const ConvertOptions& options,
158+
const TrieCache* trie_cache)
159+
: ValueDecoder(type, options, trie_cache),
145160
byte_width_(checked_cast<const FixedSizeBinaryType&>(*type).byte_width()) {}
146161

147162
Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) {
@@ -207,8 +222,8 @@ struct NumericValueDecoder : public ValueDecoder {
207222
using value_type = typename T::c_type;
208223

209224
NumericValueDecoder(const std::shared_ptr<DataType>& type,
210-
const ConvertOptions& options)
211-
: ValueDecoder(type, options),
225+
const ConvertOptions& options, const TrieCache* trie_cache)
226+
: ValueDecoder(type, options, trie_cache),
212227
concrete_type_(checked_cast<const T&>(*type)),
213228
string_converter_(MakeStringConverter<T>(options)) {}
214229

@@ -236,31 +251,20 @@ struct BooleanValueDecoder : public ValueDecoder {
236251

237252
using ValueDecoder::ValueDecoder;
238253

239-
Status Initialize() {
240-
// TODO no need to build separate Tries for each instance
241-
RETURN_NOT_OK(InitializeTrie(options_.true_values, &true_trie_));
242-
RETURN_NOT_OK(InitializeTrie(options_.false_values, &false_trie_));
243-
return ValueDecoder::Initialize();
244-
}
245-
246254
Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) {
247255
// XXX should quoted values be allowed at all?
248-
if (false_trie_.Find(std::string_view(reinterpret_cast<const char*>(data), size)) >=
249-
0) {
256+
if (trie_cache_->false_trie.Find(
257+
std::string_view(reinterpret_cast<const char*>(data), size)) >= 0) {
250258
*out = false;
251259
return Status::OK();
252260
}
253-
if (ARROW_PREDICT_TRUE(true_trie_.Find(std::string_view(
261+
if (ARROW_PREDICT_TRUE(trie_cache_->true_trie.Find(std::string_view(
254262
reinterpret_cast<const char*>(data), size)) >= 0)) {
255263
*out = true;
256264
return Status::OK();
257265
}
258266
return GenericConversionError(type_, data, size);
259267
}
260-
261-
protected:
262-
Trie true_trie_;
263-
Trie false_trie_;
264268
};
265269

266270
//
@@ -271,8 +275,8 @@ struct DecimalValueDecoder : public ValueDecoder {
271275
using value_type = Decimal128;
272276

273277
explicit DecimalValueDecoder(const std::shared_ptr<DataType>& type,
274-
const ConvertOptions& options)
275-
: ValueDecoder(type, options),
278+
const ConvertOptions& options, const TrieCache* trie_cache)
279+
: ValueDecoder(type, options, trie_cache),
276280
decimal_type_(internal::checked_cast<const DecimalType&>(*type_)),
277281
type_precision_(decimal_type_.precision()),
278282
type_scale_(decimal_type_.scale()) {}
@@ -310,8 +314,10 @@ struct CustomDecimalPointValueDecoder : public ValueDecoder {
310314
using value_type = typename WrappedDecoder::value_type;
311315

312316
explicit CustomDecimalPointValueDecoder(const std::shared_ptr<DataType>& type,
313-
const ConvertOptions& options)
314-
: ValueDecoder(type, options), wrapped_decoder_(type, options) {}
317+
const ConvertOptions& options,
318+
const TrieCache* trie_cache)
319+
: ValueDecoder(type, options, trie_cache),
320+
wrapped_decoder_(type, options, trie_cache) {}
315321

316322
Status Initialize() {
317323
RETURN_NOT_OK(wrapped_decoder_.Initialize());
@@ -321,7 +327,7 @@ struct CustomDecimalPointValueDecoder : public ValueDecoder {
321327
mapping_[options_.decimal_point] = '.';
322328
mapping_['.'] = options_.decimal_point; // error out on standard decimal point
323329
temp_.resize(30);
324-
return Status::OK();
330+
return ValueDecoder::Initialize();
325331
}
326332

327333
Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) {
@@ -357,8 +363,9 @@ struct InlineISO8601ValueDecoder : public ValueDecoder {
357363
using value_type = int64_t;
358364

359365
explicit InlineISO8601ValueDecoder(const std::shared_ptr<DataType>& type,
360-
const ConvertOptions& options)
361-
: ValueDecoder(type, options),
366+
const ConvertOptions& options,
367+
const TrieCache* trie_cache)
368+
: ValueDecoder(type, options, trie_cache),
362369
unit_(checked_cast<const TimestampType&>(*type_).unit()),
363370
expect_timezone_(!checked_cast<const TimestampType&>(*type_).timezone().empty()) {
364371
}
@@ -396,8 +403,9 @@ struct SingleParserTimestampValueDecoder : public ValueDecoder {
396403
using value_type = int64_t;
397404

398405
explicit SingleParserTimestampValueDecoder(const std::shared_ptr<DataType>& type,
399-
const ConvertOptions& options)
400-
: ValueDecoder(type, options),
406+
const ConvertOptions& options,
407+
const TrieCache* trie_cache)
408+
: ValueDecoder(type, options, trie_cache),
401409
unit_(checked_cast<const TimestampType&>(*type_).unit()),
402410
expect_timezone_(!checked_cast<const TimestampType&>(*type_).timezone().empty()),
403411
parser_(*options_.timestamp_parsers[0]) {}
@@ -436,8 +444,9 @@ struct MultipleParsersTimestampValueDecoder : public ValueDecoder {
436444
using value_type = int64_t;
437445

438446
explicit MultipleParsersTimestampValueDecoder(const std::shared_ptr<DataType>& type,
439-
const ConvertOptions& options)
440-
: ValueDecoder(type, options),
447+
const ConvertOptions& options,
448+
const TrieCache* trie_cache)
449+
: ValueDecoder(type, options, trie_cache),
441450
unit_(checked_cast<const TimestampType&>(*type_).unit()),
442451
expect_timezone_(!checked_cast<const TimestampType&>(*type_).timezone().empty()),
443452
parsers_(GetParsers(options_)) {}
@@ -477,8 +486,9 @@ struct DurationValueDecoder : public ValueDecoder {
477486
using value_type = int64_t;
478487

479488
explicit DurationValueDecoder(const std::shared_ptr<DataType>& type,
480-
const ConvertOptions& options)
481-
: ValueDecoder(type, options),
489+
const ConvertOptions& options,
490+
const TrieCache* trie_cache)
491+
: ValueDecoder(type, options, trie_cache),
482492
concrete_type_(checked_cast<const DurationType&>(*type)),
483493
string_converter_() {}
484494

@@ -517,7 +527,8 @@ class NullConverter : public ConcreteConverter {
517527
public:
518528
NullConverter(const std::shared_ptr<DataType>& type, const ConvertOptions& options,
519529
MemoryPool* pool)
520-
: ConcreteConverter(type, options, pool), decoder_(type_, options_) {}
530+
: ConcreteConverter(type, options, pool),
531+
decoder_(type_, options_, static_cast<const TrieCache*>(trie_cache_.get())) {}
521532

522533
Result<std::shared_ptr<Array>> Convert(const BlockParser& parser,
523534
int32_t col_index) override {
@@ -551,7 +562,8 @@ class PrimitiveConverter : public ConcreteConverter {
551562
public:
552563
PrimitiveConverter(const std::shared_ptr<DataType>& type, const ConvertOptions& options,
553564
MemoryPool* pool)
554-
: ConcreteConverter(type, options, pool), decoder_(type_, options_) {}
565+
: ConcreteConverter(type, options, pool),
566+
decoder_(type_, options_, static_cast<const TrieCache*>(trie_cache_.get())) {}
555567

556568
Result<std::shared_ptr<Array>> Convert(const BlockParser& parser,
557569
int32_t col_index) override {
@@ -593,7 +605,8 @@ class TypedDictionaryConverter : public ConcreteDictionaryConverter {
593605
TypedDictionaryConverter(const std::shared_ptr<DataType>& value_type,
594606
const ConvertOptions& options, MemoryPool* pool)
595607
: ConcreteDictionaryConverter(value_type, options, pool),
596-
decoder_(value_type, options_) {}
608+
decoder_(value_type, options_, static_cast<const TrieCache*>(trie_cache_.get())) {
609+
}
597610

598611
Result<std::shared_ptr<Array>> Convert(const BlockParser& parser,
599612
int32_t col_index) override {
@@ -684,7 +697,13 @@ std::shared_ptr<ConverterType> MakeRealConverter(const std::shared_ptr<DataType>
684697

685698
Converter::Converter(const std::shared_ptr<DataType>& type, const ConvertOptions& options,
686699
MemoryPool* pool)
687-
: options_(options), pool_(pool), type_(type) {}
700+
: options_(options), pool_(pool), type_(type) {
701+
// Build shared Trie cache (errors handled in Initialize())
702+
auto maybe_cache = TrieCache::Make(options);
703+
if (maybe_cache.ok()) {
704+
trie_cache_ = std::static_pointer_cast<void>(*std::move(maybe_cache));
705+
}
706+
}
688707

689708
DictionaryConverter::DictionaryConverter(const std::shared_ptr<DataType>& value_type,
690709
const ConvertOptions& options, MemoryPool* pool)

cpp/src/arrow/csv/converter.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,9 @@ class ARROW_EXPORT Converter {
5757
const ConvertOptions& options_;
5858
MemoryPool* pool_;
5959
std::shared_ptr<DataType> type_;
60+
// Opaque TrieCache pointer. TrieCache destructor is called via control block.
61+
// https://en.cppreference.com/w/cpp/memory/shared_ptr
62+
std::shared_ptr<void> trie_cache_;
6063
};
6164

6265
class ARROW_EXPORT DictionaryConverter : public Converter {

0 commit comments

Comments
 (0)