@@ -57,9 +57,6 @@ void ArrowStringHandler::convert_type(
57
57
const std::shared_ptr<StringPool>& string_pool) const {
58
58
using ArcticStringColumnTag = ScalarTagType<DataTypeTag<DataType::UTF_DYNAMIC64>>;
59
59
auto input_data = source_column.data ();
60
- auto pos = input_data.cbegin <ArcticStringColumnTag>();
61
- const auto end = input_data.cend <ArcticStringColumnTag>();
62
-
63
60
struct DictEntry {
64
61
int32_t offset_buffer_pos_;
65
62
int64_t string_buffer_pos_;
@@ -75,29 +72,52 @@ void ArrowStringHandler::convert_type(
75
72
int32_t unique_offset_count = 0 ;
76
73
auto dest_ptr = reinterpret_cast <int32_t *>(dest_column.bytes_at (mapping.offset_bytes_ , source_column.row_count () * sizeof (int32_t )));
77
74
75
+ util::BitSet bitset;
76
+ util::BitSet::bulk_insert_iterator inserter (bitset);
77
+ const auto end = input_data.cend <ArcticStringColumnTag, IteratorType::ENUMERATED>();
78
78
// First go through the source column once to compute the size of offset and string buffers.
79
- while (pos != end) {
80
- auto [entry, is_emplaced] = unique_offsets.try_emplace (*pos, DictEntry{unique_offset_count, bytes, string_pool->get_const_view (*pos)});
81
- if (is_emplaced) {
82
- bytes += entry->second .strv .size ();
83
- unique_offsets_in_order.push_back (*pos);
84
- ++unique_offset_count;
79
+ // TODO: This can't be right if the column was sparse as it has only been decoded, not expanded
80
+ for (auto en = input_data.cbegin <ArcticStringColumnTag, IteratorType::ENUMERATED>(); en != end; ++en) {
81
+ if (is_a_string (en->value ())) {
82
+ auto [entry, is_emplaced] = unique_offsets.try_emplace (en->value (), DictEntry{static_cast <int32_t >(unique_offset_count), bytes, string_pool->get_const_view (en->value ())});
83
+ if (is_emplaced) {
84
+ bytes += entry->second .strv .size ();
85
+ unique_offsets_in_order.push_back (en->value ());
86
+ ++unique_offset_count;
87
+ }
88
+ *dest_ptr = entry->second .offset_buffer_pos_ ;
89
+ } else {
90
+ inserter = en->idx ();
85
91
}
86
- ++pos;
87
- *dest_ptr++ = entry->second .offset_buffer_pos_ ;
92
+ ++dest_ptr;
88
93
}
89
- auto & string_buffer = dest_column.create_extra_buffer (mapping.offset_bytes_ , ExtraBufferType::STRING, bytes, AllocationType::DETACHABLE);
90
- auto & offsets_buffer = dest_column.create_extra_buffer (mapping.offset_bytes_ , ExtraBufferType::OFFSET, (unique_offsets_in_order.size () + 1 ) * sizeof (int64_t ), AllocationType::DETACHABLE);
91
- // Then go through unique_offsets to fill up the offset and string buffers.
92
- auto offsets_ptr = reinterpret_cast <int64_t *>(offsets_buffer.data ());
93
- auto string_ptr = reinterpret_cast <char *>(string_buffer.data ());
94
- for (auto unique_offset: unique_offsets_in_order) {
95
- const auto & entry = unique_offsets[unique_offset];
96
- *offsets_ptr++ = entry.string_buffer_pos_ ;
97
- memcpy (string_ptr, entry.strv .data (), entry.strv .size ());
98
- string_ptr += entry.strv .size ();
94
+ inserter.flush ();
95
+ // At this point bitset has ones where the source column contained None or NaN
96
+ // Inverting it and shrinking it to the source column size then yields a sparse map for the input data
97
+ bitset.invert ();
98
+ // TODO: row_count() here won't be right when the original data was sparse, but we don't support sparse
99
+ // string columns yet anyway
100
+ bitset.resize (source_column.row_count ());
101
+ if (bitset.count () != bitset.size ()) {
102
+ handle_truncation (bitset, mapping.truncate_ );
103
+ create_dense_bitmap (mapping.offset_bytes_ , bitset, dest_column, AllocationType::DETACHABLE);
104
+ } // else there weren't any Nones or NaNs
105
+ // bitset.count() == 0 is the special case where all of the rows contained None or NaN. In this case, do not create
106
+ // the extra string and offset buffers. string_dict_from_block will then do the right thing and call minimal_strings_dict
107
+ if (bitset.count () > 0 ) {
108
+ auto & string_buffer = dest_column.create_extra_buffer (mapping.offset_bytes_ , ExtraBufferType::STRING, bytes, AllocationType::DETACHABLE);
109
+ auto & offsets_buffer = dest_column.create_extra_buffer (mapping.offset_bytes_ , ExtraBufferType::OFFSET, (unique_offsets_in_order.size () + 1 ) * sizeof (int64_t ), AllocationType::DETACHABLE);
110
+ // Then go through unique_offsets to fill up the offset and string buffers.
111
+ auto offsets_ptr = reinterpret_cast <int64_t *>(offsets_buffer.data ());
112
+ auto string_ptr = reinterpret_cast <char *>(string_buffer.data ());
113
+ for (auto unique_offset: unique_offsets_in_order) {
114
+ const auto & entry = unique_offsets[unique_offset];
115
+ *offsets_ptr++ = entry.string_buffer_pos_ ;
116
+ memcpy (string_ptr, entry.strv .data (), entry.strv .size ());
117
+ string_ptr += entry.strv .size ();
118
+ }
119
+ *offsets_ptr = bytes;
99
120
}
100
- *offsets_ptr = bytes;
101
121
}
102
122
103
123
TypeDescriptor ArrowStringHandler::output_type (const TypeDescriptor&) const {
0 commit comments