
Commit dd746f0

Arrow fix string cols with nones and nans (#2586)
#### What does this implement or fix?

Allow reading into Arrow string columns containing Nones and NaNs.
1 parent 5bbe6f3 commit dd746f0

File tree

- cpp/arcticdb/arrow/arrow_handlers.cpp
- cpp/arcticdb/pipeline/column_mapping.hpp
- cpp/arcticdb/pipeline/read_frame.cpp
- python/tests/unit/arcticdb/version_store/test_arrow.py

4 files changed: +127 -70 lines

cpp/arcticdb/arrow/arrow_handlers.cpp

Lines changed: 42 additions & 22 deletions
```diff
@@ -57,9 +57,6 @@ void ArrowStringHandler::convert_type(
         const std::shared_ptr<StringPool>& string_pool) const {
     using ArcticStringColumnTag = ScalarTagType<DataTypeTag<DataType::UTF_DYNAMIC64>>;
     auto input_data = source_column.data();
-    auto pos = input_data.cbegin<ArcticStringColumnTag>();
-    const auto end = input_data.cend<ArcticStringColumnTag>();
-
     struct DictEntry {
         int32_t offset_buffer_pos_;
         int64_t string_buffer_pos_;
@@ -75,29 +72,52 @@
     int32_t unique_offset_count = 0;
     auto dest_ptr = reinterpret_cast<int32_t*>(dest_column.bytes_at(mapping.offset_bytes_, source_column.row_count() * sizeof(int32_t)));
 
+    util::BitSet bitset;
+    util::BitSet::bulk_insert_iterator inserter(bitset);
+    const auto end = input_data.cend<ArcticStringColumnTag, IteratorType::ENUMERATED>();
     // First go through the source column once to compute the size of offset and string buffers.
-    while(pos != end) {
-        auto [entry, is_emplaced] = unique_offsets.try_emplace(*pos, DictEntry{unique_offset_count, bytes, string_pool->get_const_view(*pos)});
-        if(is_emplaced) {
-            bytes += entry->second.strv.size();
-            unique_offsets_in_order.push_back(*pos);
-            ++unique_offset_count;
+    // TODO: This can't be right if the column was sparse, as it has only been decoded, not expanded
+    for (auto en = input_data.cbegin<ArcticStringColumnTag, IteratorType::ENUMERATED>(); en != end; ++en) {
+        if (is_a_string(en->value())) {
+            auto [entry, is_emplaced] = unique_offsets.try_emplace(en->value(), DictEntry{static_cast<int32_t>(unique_offset_count), bytes, string_pool->get_const_view(en->value())});
+            if (is_emplaced) {
+                bytes += entry->second.strv.size();
+                unique_offsets_in_order.push_back(en->value());
+                ++unique_offset_count;
+            }
+            *dest_ptr = entry->second.offset_buffer_pos_;
+        } else {
+            inserter = en->idx();
         }
-        ++pos;
-        *dest_ptr++ = entry->second.offset_buffer_pos_;
+        ++dest_ptr;
     }
-    auto& string_buffer = dest_column.create_extra_buffer(mapping.offset_bytes_, ExtraBufferType::STRING, bytes, AllocationType::DETACHABLE);
-    auto& offsets_buffer = dest_column.create_extra_buffer(mapping.offset_bytes_, ExtraBufferType::OFFSET, (unique_offsets_in_order.size() + 1) * sizeof(int64_t), AllocationType::DETACHABLE);
-    // Then go through unique_offsets to fill up the offset and string buffers.
-    auto offsets_ptr = reinterpret_cast<int64_t*>(offsets_buffer.data());
-    auto string_ptr = reinterpret_cast<char*>(string_buffer.data());
-    for (auto unique_offset: unique_offsets_in_order) {
-        const auto& entry = unique_offsets[unique_offset];
-        *offsets_ptr++ = entry.string_buffer_pos_;
-        memcpy(string_ptr, entry.strv.data(), entry.strv.size());
-        string_ptr += entry.strv.size();
+    inserter.flush();
+    // At this point the bitset has ones where the source column contained None or NaN.
+    // Inverting it and shrinking it to the source column size then makes a sparse map for the input data.
+    bitset.invert();
+    // TODO: row_count() here won't be right when the original data was sparse, but we don't support sparse
+    // string columns yet anyway
+    bitset.resize(source_column.row_count());
+    if (bitset.count() != bitset.size()) {
+        handle_truncation(bitset, mapping.truncate_);
+        create_dense_bitmap(mapping.offset_bytes_, bitset, dest_column, AllocationType::DETACHABLE);
+    } // else there weren't any Nones or NaNs
+    // bitset.count() == 0 is the special case where all of the rows contained None or NaN. In this case, do not create
+    // the extra string and offset buffers. string_dict_from_block will then do the right thing and call minimal_strings_dict.
+    if (bitset.count() > 0) {
+        auto& string_buffer = dest_column.create_extra_buffer(mapping.offset_bytes_, ExtraBufferType::STRING, bytes, AllocationType::DETACHABLE);
+        auto& offsets_buffer = dest_column.create_extra_buffer(mapping.offset_bytes_, ExtraBufferType::OFFSET, (unique_offsets_in_order.size() + 1) * sizeof(int64_t), AllocationType::DETACHABLE);
+        // Then go through unique_offsets to fill up the offset and string buffers.
+        auto offsets_ptr = reinterpret_cast<int64_t*>(offsets_buffer.data());
+        auto string_ptr = reinterpret_cast<char*>(string_buffer.data());
+        for (auto unique_offset: unique_offsets_in_order) {
+            const auto& entry = unique_offsets[unique_offset];
+            *offsets_ptr++ = entry.string_buffer_pos_;
+            memcpy(string_ptr, entry.strv.data(), entry.strv.size());
+            string_ptr += entry.strv.size();
+        }
+        *offsets_ptr = bytes;
     }
-    *offsets_ptr = bytes;
 }
 
 TypeDescriptor ArrowStringHandler::output_type(const TypeDescriptor&) const {
```
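
For orientation, the reworked pass above does three things in one sweep: it assigns each distinct string a dictionary code in first-seen order, writes that code into the destination offsets, and records the row index of every None/NaN in a bitset that is later inverted into a validity map. Below is a minimal standalone sketch of the same bookkeeping, using std::optional<std::string> in place of ArcticDB's string-pool offsets; build_dictionary and DictResult are illustrative names, not part of the codebase.

```cpp
#include <cstdint>
#include <optional>
#include <string>
#include <unordered_map>
#include <vector>

struct DictResult {
    std::vector<int32_t> codes;          // per-row index into `dictionary`
    std::vector<std::string> dictionary; // unique strings, in first-seen order
    std::vector<bool> validity;          // true where the row held a real string
};

// Single pass over the rows, mirroring the enumerated loop in convert_type:
// strings get a dictionary code, nulls get a cleared validity bit.
DictResult build_dictionary(const std::vector<std::optional<std::string>>& rows) {
    DictResult out;
    out.codes.resize(rows.size(), 0);
    out.validity.resize(rows.size(), false);
    std::unordered_map<std::string, int32_t> seen;
    for (size_t i = 0; i < rows.size(); ++i) {
        if (rows[i].has_value()) { // plays the role of is_a_string(en->value())
            auto [it, inserted] =
                seen.try_emplace(*rows[i], static_cast<int32_t>(out.dictionary.size()));
            if (inserted)
                out.dictionary.push_back(*rows[i]);
            out.codes[i] = it->second;
            out.validity[i] = true; // the inverse of recording en->idx() in the null bitset
        }
        // For None/NaN rows, codes[i] stays 0 and validity[i] stays false,
        // just as the real code leaves *dest_ptr untouched and marks the bitset.
    }
    return out;
}
```

Feeding it {"a", std::nullopt, "a"} yields codes {0, 0, 0}, dictionary {"a"}, and validity {true, false, true}: the null row keeps a placeholder code and is masked out by its validity bit, which mirrors how the handler above treats Nones and NaNs.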

cpp/arcticdb/pipeline/column_mapping.hpp

Lines changed: 49 additions & 0 deletions
```diff
@@ -6,6 +6,7 @@
  */
 
 #pragma once
+#include <arcticdb/column_store/column.hpp>
 #include <arcticdb/entity/types.hpp>
 #include <arcticdb/util/bitset.hpp>
 #include <optional>
@@ -103,4 +104,52 @@ struct StaticColumnMappingIterator {
     [[nodiscard]] size_t index_fieldcount() const;
 };
 
+inline void handle_truncation(
+        Column& dest_column,
+        const ColumnTruncation& truncate) {
+    if(dest_column.num_blocks() == 1 && truncate.start_ && truncate.end_) {
+        dest_column.truncate_single_block(*truncate.start_, *truncate.end_);
+    } else {
+        if(truncate.start_)
+            dest_column.truncate_first_block(*truncate.start_);
+        if(truncate.end_)
+            dest_column.truncate_last_block(*truncate.end_);
+    }
+}
+
+inline void handle_truncation(
+        Column& dest_column,
+        const ColumnMapping& mapping) {
+    handle_truncation(dest_column, mapping.truncate_);
+}
+
+inline void handle_truncation(util::BitSet& bv, const ColumnTruncation& truncate) {
+    if (truncate.start_) {
+        bv = util::truncate_sparse_map(bv, *truncate.start_, truncate.end_.value_or(bv.size()));
+    } else if (truncate.end_) {
+        // More efficient than util::truncate_sparse_map as it avoids a copy
+        bv.resize(*truncate.end_);
+    }
+}
+
+inline void create_dense_bitmap(size_t offset, const util::BitSet& sparse_map, Column& dest_column, AllocationType allocation_type) {
+    auto& sparse_buffer = dest_column.create_extra_buffer(
+        offset,
+        ExtraBufferType::BITMAP,
+        bitset_packed_size_bytes(sparse_map.size()),
+        allocation_type);
+
+    bitset_to_packed_bits(sparse_map, sparse_buffer.data());
+}
+
+inline void create_dense_bitmap_all_zeros(size_t offset, size_t num_bits, Column& dest_column, AllocationType allocation_type) {
+    auto num_bytes = bitset_packed_size_bytes(num_bits);
+    auto& sparse_buffer = dest_column.create_extra_buffer(
+        offset,
+        ExtraBufferType::BITMAP,
+        num_bytes,
+        allocation_type);
+    std::memset(sparse_buffer.data(), 0, num_bytes);
+}
+
 } // namespace arcticdb
```
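
create_dense_bitmap above delegates the actual bit packing to bitset_to_packed_bits. Assuming that helper emits an Arrow-style validity bitmap (least-significant bit first, so bit i % 8 of byte i / 8 corresponds to row i), a self-contained stand-in looks like the sketch below; pack_validity is a hypothetical name, and the real helper operates on util::BitSet rather than std::vector<bool>.

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Illustrative stand-in for bitset_to_packed_bits: packs a validity vector
// into Arrow-style bytes, one bit per row, least-significant bit first.
std::vector<uint8_t> pack_validity(const std::vector<bool>& valid) {
    // (n + 7) / 8 bytes, analogous to bitset_packed_size_bytes
    std::vector<uint8_t> packed((valid.size() + 7) / 8, 0);
    for (size_t i = 0; i < valid.size(); ++i) {
        if (valid[i])
            packed[i / 8] |= static_cast<uint8_t>(1u << (i % 8));
    }
    return packed;
}
```

With the inverted bitset produced in arrow_handlers.cpp, a set bit means the row holds a real string, matching Arrow's convention that set validity bits mark non-null values.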

cpp/arcticdb/pipeline/read_frame.cpp

Lines changed: 0 additions & 48 deletions
These helpers are deleted here verbatim; they now live as inline functions in cpp/arcticdb/pipeline/column_mapping.hpp (above), where the Arrow string handler can also reach them.

```diff
@@ -243,54 +243,6 @@ void decode_index_field(
     }
 }
 
-void handle_truncation(
-        Column& dest_column,
-        const ColumnTruncation& truncate) {
-    if(dest_column.num_blocks() == 1 && truncate.start_ && truncate.end_) {
-        dest_column.truncate_single_block(*truncate.start_, *truncate.end_);
-    } else {
-        if(truncate.start_)
-            dest_column.truncate_first_block(*truncate.start_);
-        if(truncate.end_)
-            dest_column.truncate_last_block(*truncate.end_);
-    }
-}
-
-void handle_truncation(
-        Column& dest_column,
-        const ColumnMapping& mapping) {
-    handle_truncation(dest_column, mapping.truncate_);
-}
-
-void handle_truncation(util::BitSet& bv, const ColumnTruncation& truncate) {
-    if (truncate.start_) {
-        bv = util::truncate_sparse_map(bv, *truncate.start_, truncate.end_.value_or(bv.size()));
-    } else if (truncate.end_) {
-        // More efficient than util::truncate_sparse_map as it avoids a copy
-        bv.resize(*truncate.end_);
-    }
-}
-
-void create_dense_bitmap(size_t offset, const util::BitSet& sparse_map, Column& dest_column, AllocationType allocation_type) {
-    auto& sparse_buffer = dest_column.create_extra_buffer(
-        offset,
-        ExtraBufferType::BITMAP,
-        bitset_packed_size_bytes(sparse_map.size()),
-        allocation_type);
-
-    bitset_to_packed_bits(sparse_map, sparse_buffer.data());
-}
-
-void create_dense_bitmap_all_zeros(size_t offset, size_t num_bits, Column& dest_column, AllocationType allocation_type) {
-    auto num_bytes = bitset_packed_size_bytes(num_bits);
-    auto& sparse_buffer = dest_column.create_extra_buffer(
-        offset,
-        ExtraBufferType::BITMAP,
-        num_bytes,
-        allocation_type);
-    std::memset(sparse_buffer.data(), 0, num_bytes);
-}
-
 void decode_or_expand(
     const uint8_t*& data,
     Column& dest_column,
```
python/tests/unit/arcticdb/version_store/test_arrow.py

Lines changed: 36 additions & 0 deletions
```diff
@@ -93,6 +93,42 @@ def test_strings_basic(lmdb_version_store_arrow, dynamic_strings):
     assert_frame_equal_with_arrow(table, df)
 
 
+@pytest.mark.parametrize("row_range", [None, (2, 3), (2, 4), (2, 5), (2, 6), (3, 4), (3, 5), (3, 6)])
+def test_strings_with_nones_and_nans(lmdb_version_store_tiny_segment, row_range):
+    lib = lmdb_version_store_tiny_segment
+    lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW)
+    # lmdb_version_store_tiny_segment has 2 rows per segment.
+    # This column is constructed so that every 2-element permutation of strings, Nones, and NaNs is tested.
+    df = pd.DataFrame(
+        {
+            "x": [
+                "a",
+                "b",
+                "c",
+                None,
+                None,
+                "d",
+                "e",
+                np.nan,
+                np.nan,
+                "f",
+                None,
+                None,
+                None,
+                np.nan,
+                np.nan,
+                None,
+                np.nan,
+                np.nan,
+            ]
+        }
+    )
+    lib.write("arrow", df, dynamic_strings=True)
+    table = lib.read("arrow", row_range=row_range).data
+    expected = lib.read("arrow", row_range=row_range, output_format=OutputFormat.PANDAS).data
+    assert_frame_equal_with_arrow(table, expected)
+
+
 @pytest.mark.skipif(WINDOWS, reason="Fixed-width string columns not supported on Windows")
 def test_fixed_width_strings(lmdb_version_store_arrow):
     lib = lmdb_version_store_arrow
```
