@@ -105,31 +105,45 @@ Status PresizeBuilder(const BlockParser& parser, BuilderType* builder) {
105105 }
106106}
107107
108+ // ///////////////////////////////////////////////////////////////////////
109+ // Shared Tries cache to avoid rebuilding them for each decoder instance
110+
111+ struct TrieCache {
112+ Trie null_trie;
113+ Trie true_trie;
114+ Trie false_trie;
115+
116+ static Result<std::shared_ptr<TrieCache>> Make (const ConvertOptions& options) {
117+ auto cache = std::make_shared<TrieCache>();
118+ RETURN_NOT_OK (InitializeTrie (options.null_values , &cache->null_trie ));
119+ RETURN_NOT_OK (InitializeTrie (options.true_values , &cache->true_trie ));
120+ RETURN_NOT_OK (InitializeTrie (options.false_values , &cache->false_trie ));
121+ return cache;
122+ }
123+ };
124+
108125// ///////////////////////////////////////////////////////////////////////
109126// Per-type value decoders
110127
111128struct ValueDecoder {
112129 explicit ValueDecoder (const std::shared_ptr<DataType>& type,
113- const ConvertOptions& options)
114- : type_(type), options_(options) {}
130+ const ConvertOptions& options, const TrieCache* trie_cache )
131+ : type_(type), options_(options), trie_cache_(trie_cache) {}
115132
116- Status Initialize () {
117- // TODO no need to build a separate Trie for each instance
118- return InitializeTrie (options_.null_values , &null_trie_);
119- }
133+ Status Initialize () { return Status::OK (); }
120134
121135 bool IsNull (const uint8_t * data, uint32_t size, bool quoted) {
122136 if (quoted && !options_.quoted_strings_can_be_null ) {
123137 return false ;
124138 }
125- return null_trie_ .Find (std::string_view ( reinterpret_cast < const char *>(data), size)) >=
126- 0 ;
139+ return trie_cache_-> null_trie .Find (
140+ std::string_view ( reinterpret_cast < const char *>(data), size)) >= 0 ;
127141 }
128142
129143 protected:
130- Trie null_trie_;
131144 const std::shared_ptr<DataType> type_;
132145 const ConvertOptions& options_;
146+ const TrieCache* trie_cache_;
133147};
134148
135149//
@@ -140,8 +154,9 @@ struct FixedSizeBinaryValueDecoder : public ValueDecoder {
140154 using value_type = const uint8_t *;
141155
142156 explicit FixedSizeBinaryValueDecoder (const std::shared_ptr<DataType>& type,
143- const ConvertOptions& options)
144- : ValueDecoder(type, options),
157+ const ConvertOptions& options,
158+ const TrieCache* trie_cache)
159+ : ValueDecoder(type, options, trie_cache),
145160 byte_width_(checked_cast<const FixedSizeBinaryType&>(*type).byte_width()) {}
146161
147162 Status Decode (const uint8_t * data, uint32_t size, bool quoted, value_type* out) {
@@ -207,8 +222,8 @@ struct NumericValueDecoder : public ValueDecoder {
207222 using value_type = typename T::c_type;
208223
209224 NumericValueDecoder (const std::shared_ptr<DataType>& type,
210- const ConvertOptions& options)
211- : ValueDecoder(type, options),
225+ const ConvertOptions& options, const TrieCache* trie_cache )
226+ : ValueDecoder(type, options, trie_cache ),
212227 concrete_type_ (checked_cast<const T&>(*type)),
213228 string_converter_(MakeStringConverter<T>(options)) {}
214229
@@ -236,31 +251,20 @@ struct BooleanValueDecoder : public ValueDecoder {
236251
237252 using ValueDecoder::ValueDecoder;
238253
239- Status Initialize () {
240- // TODO no need to build separate Tries for each instance
241- RETURN_NOT_OK (InitializeTrie (options_.true_values , &true_trie_));
242- RETURN_NOT_OK (InitializeTrie (options_.false_values , &false_trie_));
243- return ValueDecoder::Initialize ();
244- }
245-
246254 Status Decode (const uint8_t * data, uint32_t size, bool quoted, value_type* out) {
247255 // XXX should quoted values be allowed at all?
248- if (false_trie_ .Find (std::string_view ( reinterpret_cast < const char *>(data), size)) >=
249- 0 ) {
256+ if (trie_cache_-> false_trie .Find (
257+ std::string_view ( reinterpret_cast < const char *>(data), size)) >= 0 ) {
250258 *out = false ;
251259 return Status::OK ();
252260 }
253- if (ARROW_PREDICT_TRUE (true_trie_ .Find (std::string_view (
261+ if (ARROW_PREDICT_TRUE (trie_cache_-> true_trie .Find (std::string_view (
254262 reinterpret_cast <const char *>(data), size)) >= 0 )) {
255263 *out = true ;
256264 return Status::OK ();
257265 }
258266 return GenericConversionError (type_, data, size);
259267 }
260-
261- protected:
262- Trie true_trie_;
263- Trie false_trie_;
264268};
265269
266270//
@@ -271,8 +275,8 @@ struct DecimalValueDecoder : public ValueDecoder {
271275 using value_type = Decimal128;
272276
273277 explicit DecimalValueDecoder (const std::shared_ptr<DataType>& type,
274- const ConvertOptions& options)
275- : ValueDecoder(type, options),
278+ const ConvertOptions& options, const TrieCache* trie_cache )
279+ : ValueDecoder(type, options, trie_cache ),
276280 decimal_type_(internal::checked_cast<const DecimalType&>(*type_)),
277281 type_precision_(decimal_type_.precision()),
278282 type_scale_(decimal_type_.scale()) {}
@@ -310,8 +314,10 @@ struct CustomDecimalPointValueDecoder : public ValueDecoder {
310314 using value_type = typename WrappedDecoder::value_type;
311315
312316 explicit CustomDecimalPointValueDecoder (const std::shared_ptr<DataType>& type,
313- const ConvertOptions& options)
314- : ValueDecoder(type, options), wrapped_decoder_(type, options) {}
317+ const ConvertOptions& options,
318+ const TrieCache* trie_cache)
319+ : ValueDecoder(type, options, trie_cache),
320+ wrapped_decoder_(type, options, trie_cache) {}
315321
316322 Status Initialize () {
317323 RETURN_NOT_OK (wrapped_decoder_.Initialize ());
@@ -321,7 +327,7 @@ struct CustomDecimalPointValueDecoder : public ValueDecoder {
321327 mapping_[options_.decimal_point ] = ' .' ;
322328 mapping_[' .' ] = options_.decimal_point ; // error out on standard decimal point
323329 temp_.resize (30 );
324- return Status::OK ();
330+ return ValueDecoder::Initialize ();
325331 }
326332
327333 Status Decode (const uint8_t * data, uint32_t size, bool quoted, value_type* out) {
@@ -357,8 +363,9 @@ struct InlineISO8601ValueDecoder : public ValueDecoder {
357363 using value_type = int64_t ;
358364
359365 explicit InlineISO8601ValueDecoder (const std::shared_ptr<DataType>& type,
360- const ConvertOptions& options)
361- : ValueDecoder(type, options),
366+ const ConvertOptions& options,
367+ const TrieCache* trie_cache)
368+ : ValueDecoder(type, options, trie_cache),
362369 unit_(checked_cast<const TimestampType&>(*type_).unit()),
363370 expect_timezone_(!checked_cast<const TimestampType&>(*type_).timezone().empty()) {
364371 }
@@ -396,8 +403,9 @@ struct SingleParserTimestampValueDecoder : public ValueDecoder {
396403 using value_type = int64_t ;
397404
398405 explicit SingleParserTimestampValueDecoder (const std::shared_ptr<DataType>& type,
399- const ConvertOptions& options)
400- : ValueDecoder(type, options),
406+ const ConvertOptions& options,
407+ const TrieCache* trie_cache)
408+ : ValueDecoder(type, options, trie_cache),
401409 unit_(checked_cast<const TimestampType&>(*type_).unit()),
402410 expect_timezone_(!checked_cast<const TimestampType&>(*type_).timezone().empty()),
403411 parser_(*options_.timestamp_parsers[0 ]) {}
@@ -436,8 +444,9 @@ struct MultipleParsersTimestampValueDecoder : public ValueDecoder {
436444 using value_type = int64_t ;
437445
438446 explicit MultipleParsersTimestampValueDecoder (const std::shared_ptr<DataType>& type,
439- const ConvertOptions& options)
440- : ValueDecoder(type, options),
447+ const ConvertOptions& options,
448+ const TrieCache* trie_cache)
449+ : ValueDecoder(type, options, trie_cache),
441450 unit_(checked_cast<const TimestampType&>(*type_).unit()),
442451 expect_timezone_(!checked_cast<const TimestampType&>(*type_).timezone().empty()),
443452 parsers_(GetParsers(options_)) {}
@@ -477,8 +486,9 @@ struct DurationValueDecoder : public ValueDecoder {
477486 using value_type = int64_t ;
478487
479488 explicit DurationValueDecoder (const std::shared_ptr<DataType>& type,
480- const ConvertOptions& options)
481- : ValueDecoder(type, options),
489+ const ConvertOptions& options,
490+ const TrieCache* trie_cache)
491+ : ValueDecoder(type, options, trie_cache),
482492 concrete_type_(checked_cast<const DurationType&>(*type)),
483493 string_converter_() {}
484494
@@ -517,7 +527,8 @@ class NullConverter : public ConcreteConverter {
517527 public:
518528 NullConverter (const std::shared_ptr<DataType>& type, const ConvertOptions& options,
519529 MemoryPool* pool)
520- : ConcreteConverter(type, options, pool), decoder_(type_, options_) {}
530+ : ConcreteConverter(type, options, pool),
531+ decoder_ (type_, options_, static_cast <const TrieCache*>(trie_cache_.get())) {}
521532
522533 Result<std::shared_ptr<Array>> Convert (const BlockParser& parser,
523534 int32_t col_index) override {
@@ -551,7 +562,8 @@ class PrimitiveConverter : public ConcreteConverter {
551562 public:
552563 PrimitiveConverter (const std::shared_ptr<DataType>& type, const ConvertOptions& options,
553564 MemoryPool* pool)
554- : ConcreteConverter(type, options, pool), decoder_(type_, options_) {}
565+ : ConcreteConverter(type, options, pool),
566+ decoder_ (type_, options_, static_cast <const TrieCache*>(trie_cache_.get())) {}
555567
556568 Result<std::shared_ptr<Array>> Convert (const BlockParser& parser,
557569 int32_t col_index) override {
@@ -593,7 +605,8 @@ class TypedDictionaryConverter : public ConcreteDictionaryConverter {
593605 TypedDictionaryConverter (const std::shared_ptr<DataType>& value_type,
594606 const ConvertOptions& options, MemoryPool* pool)
595607 : ConcreteDictionaryConverter(value_type, options, pool),
596- decoder_ (value_type, options_) {}
608+ decoder_ (value_type, options_, static_cast <const TrieCache*>(trie_cache_.get())) {
609+ }
597610
598611 Result<std::shared_ptr<Array>> Convert (const BlockParser& parser,
599612 int32_t col_index) override {
@@ -684,7 +697,13 @@ std::shared_ptr<ConverterType> MakeRealConverter(const std::shared_ptr<DataType>
684697
685698Converter::Converter (const std::shared_ptr<DataType>& type, const ConvertOptions& options,
686699 MemoryPool* pool)
687- : options_(options), pool_(pool), type_(type) {}
700+ : options_(options), pool_(pool), type_(type) {
701+ // Build shared Trie cache (errors handled in Initialize())
702+ auto maybe_cache = TrieCache::Make (options);
703+ if (maybe_cache.ok ()) {
704+ trie_cache_ = std::static_pointer_cast<void >(*std::move (maybe_cache));
705+ }
706+ }
688707
689708DictionaryConverter::DictionaryConverter (const std::shared_ptr<DataType>& value_type,
690709 const ConvertOptions& options, MemoryPool* pool)
0 commit comments