Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmake/external_dependencies.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ if(SPARROW_IPC_BUILD_TESTS)

# Iterate over all the files in the arrow-testing-data source directory. When a file is gzip-compressed (.gz), extract it in place.
file(GLOB_RECURSE arrow_testing_data_targz_files CONFIGURE_DEPENDS
"${arrow-testing_SOURCE_DIR}/data/arrow-ipc-stream/integration/1.0.0-littleendian/*.json.gz"
"${arrow-testing_SOURCE_DIR}/data/arrow-ipc-stream/integration/cpp-21.0.0/*.json.gz"
)
foreach(file_path IN LISTS arrow_testing_data_targz_files)
cmake_path(GET file_path PARENT_PATH parent_dir)
Expand Down
2 changes: 1 addition & 1 deletion conanfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def configure(self):
self.options.rm_safe("fPIC")

def requirements(self):
self.requires("sparrow/1.0.0")
self.requires("sparrow/1.2.0", options={"json_reader": True})
self.requires(f"flatbuffers/{self._flatbuffers_version}")
if self.options.get_safe("build_tests"):
self.test_requires("doctest/2.4.12")
Expand Down
68 changes: 68 additions & 0 deletions include/sparrow_ipc/deserialize_decimal_array.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#pragma once

#include <span>

#include <sparrow/arrow_interface/arrow_array_schema_proxy.hpp>
#include <sparrow/decimal_array.hpp>

#include "Message_generated.h"
#include "sparrow_ipc/arrow_interface/arrow_array.hpp"
#include "sparrow_ipc/arrow_interface/arrow_schema.hpp"
#include "sparrow_ipc/deserialize_utils.hpp"

namespace sparrow_ipc
{
/**
 * @brief Builds a sparrow::decimal_array<T> whose buffers point directly into an IPC message body.
 *
 * Two consecutive entries of record_batch.buffers() are consumed, starting at
 * @p buffer_index: first the validity bitmap, then the decimal data buffer.
 * @p buffer_index is advanced past both on success.
 *
 * No buffer is copied: the resulting array references memory inside @p body.
 * NOTE(review): presumably the caller must keep the storage behind @p body alive
 * for as long as the returned array is used - confirm against callers.
 *
 * @tparam T sparrow decimal type; sizeof(typename T::integer_type) selects the bit
 *           width written into the Arrow format string.
 * @param record_batch Flatbuffer record batch describing the buffers.
 * @param body Message body that the buffer offsets refer to.
 * @param name Field name. NOTE(review): forwarded as name.data(), so it is presumably
 *             expected to be null-terminated - confirm against make_non_owning_arrow_schema.
 * @param metadata Optional field metadata forwarded to the schema.
 * @param buffer_index In/out index of the next unread entry of record_batch.buffers().
 * @param scale Decimal scale.
 * @param precision Decimal precision.
 * @return The decimal array wrapping the non-owning ArrowArray/ArrowSchema pair.
 * @throws std::runtime_error if the data buffer index is out of range or the data
 *         buffer does not lie entirely inside @p body. Errors raised by
 *         utils::get_bitmap_pointer_and_null_count propagate unchanged.
 */
template <sparrow::decimal_type T>
[[nodiscard]] sparrow::decimal_array<T> deserialize_non_owning_decimal(
    const org::apache::arrow::flatbuf::RecordBatch& record_batch,
    std::span<const uint8_t> body,
    std::string_view name,
    const std::optional<std::vector<sparrow::metadata_pair>>& metadata,
    size_t& buffer_index,
    int32_t scale,
    int32_t precision
)
{
    constexpr std::size_t sizeof_decimal = sizeof(typename T::integer_type);
    // Arrow format string: "d:precision,scale[,bitwidth]".
    std::string format_str = "d:" + std::to_string(precision) + "," + std::to_string(scale);
    if constexpr (sizeof_decimal != 16)  // We don't need to specify the size for 128-bit
                                         // decimals
    {
        format_str += "," + std::to_string(sizeof_decimal * 8);
    }

    ArrowSchema schema = make_non_owning_arrow_schema(
        format_str,
        name.data(),
        metadata,
        std::nullopt,
        0,
        nullptr,
        nullptr
    );
    const auto [bitmap_ptr, null_count] = utils::get_bitmap_pointer_and_null_count(
        record_batch,
        body,
        buffer_index++
    );

    // The bitmap lookup validates its own index; the data buffer index must be
    // validated here because flatbuffers' Vector::Get does not check bounds in
    // release builds.
    if (buffer_index >= static_cast<size_t>(record_batch.buffers()->size()))
    {
        throw std::runtime_error("Buffer index out of range");
    }
    const auto buffer_metadata = record_batch.buffers()->Get(buffer_index++);

    // offset/length are signed 64-bit values read from the message. Reject negative
    // values and compare without adding them, so the check itself cannot overflow.
    const auto data_offset = buffer_metadata->offset();
    const auto data_length = buffer_metadata->length();
    if (data_offset < 0 || data_length < 0
        || static_cast<std::uint64_t>(data_offset) > body.size()
        || static_cast<std::uint64_t>(data_length) > body.size() - static_cast<std::uint64_t>(data_offset))
    {
        throw std::runtime_error("Data buffer exceeds body size");
    }

    // The Arrow C interface exposes buffers as non-const pointers, hence the const_cast.
    auto buffer_ptr = const_cast<uint8_t*>(body.data() + data_offset);
    std::vector<std::uint8_t*> buffers = {bitmap_ptr, buffer_ptr};
    ArrowArray array = make_non_owning_arrow_array(
        record_batch.length(),
        null_count,
        0,
        std::move(buffers),
        0,
        nullptr,
        nullptr
    );
    sparrow::arrow_proxy ap{std::move(array), std::move(schema)};
    return sparrow::decimal_array<T>(std::move(ap));
}
}
35 changes: 35 additions & 0 deletions include/sparrow_ipc/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@

#include <cstdint>
#include <optional>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

#include <sparrow/record_batch.hpp>

Expand All @@ -20,6 +22,39 @@ namespace sparrow_ipc::utils
SPARROW_IPC_API std::pair<org::apache::arrow::flatbuf::Type, flatbuffers::Offset<void>>
get_flatbuffer_type(flatbuffers::FlatBufferBuilder& builder, std::string_view format_str);

/**
 * @brief Splits the part of a string that follows its first ':' into ','-separated fields.
 *
 * Only the first ':' in the input is significant; everything after it is split
 * on every ','. Empty fields are preserved, so "d:1,,2" yields {"1", "", "2"}
 * and "d:1," yields {"1", ""}.
 *
 * The returned views refer to the characters of @p str; they are only valid
 * while the character storage behind @p str is alive.
 *
 * @param str Input string to parse (e.g., "prefix:word1,word2,word3")
 * @return std::vector<std::string_view> Fields found after the first ':', in order.
 *         Returns an empty vector if ':' is not found or if nothing follows it.
 *
 * @code
 * extract_words_after_colon("d:128,10")  // returns {"128", "10"}
 * extract_words_after_colon("w:256")     // returns {"256"}
 * extract_words_after_colon("w:")        // returns {}
 * extract_words_after_colon("no_colon")  // returns {}
 * @endcode
 */
SPARROW_IPC_API std::vector<std::string_view> extract_words_after_colon(std::string_view str);

/**
 * @brief Parse a whole string_view as a base-10 int32_t using std::from_chars.
 *
 * Parsing succeeds only when every character of @p str is consumed, so input
 * with trailing characters (e.g. "12abc") is rejected. Values that do not fit
 * in int32_t are rejected as well. As with std::from_chars in general, leading
 * whitespace is not skipped and a leading '+' is not accepted; a leading '-'
 * is.
 *
 * @param str The string view to parse
 * @return std::optional<int32_t> The parsed integer value, or std::nullopt if parsing fails
 *
 * @code
 * parse_to_int32("123")    // returns std::optional<int32_t>(123)
 * parse_to_int32("-7")     // returns std::optional<int32_t>(-7)
 * parse_to_int32("12abc")  // returns std::nullopt
 * parse_to_int32("abc")    // returns std::nullopt
 * parse_to_int32("")       // returns std::nullopt
 * @endcode
 */
SPARROW_IPC_API std::optional<int32_t> parse_to_int32(std::string_view str);

/**
* @brief Checks if all record batches in a collection have consistent structure.
*
Expand Down
65 changes: 65 additions & 0 deletions src/deserialize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@

#include <sparrow/types/data_type.hpp>

#include "sparrow_ipc/deserialize_decimal_array.hpp"
#include "sparrow_ipc/deserialize_fixedsizebinary_array.hpp"
#include "sparrow_ipc/deserialize_primitive_array.hpp"
#include "sparrow_ipc/deserialize_variable_size_binary_array.hpp"
#include "sparrow_ipc/encapsulated_message.hpp"
#include "sparrow_ipc/magic_values.hpp"
#include "sparrow_ipc/metadata.hpp"

Expand Down Expand Up @@ -191,6 +193,69 @@ namespace sparrow_ipc
)
);
break;
case org::apache::arrow::flatbuf::Type::Decimal:
{
    // Decimal columns: pick the sparrow decimal storage type matching the bit
    // width declared in the schema, then deserialize without copying buffers.
    const auto decimal_field = field->type_as_Decimal();
    if (decimal_field == nullptr)
    {
        // The generated accessor returns nullptr when the type table is absent.
        throw std::runtime_error("Decimal field is missing its type description.");
    }
    const auto scale = decimal_field->scale();
    const auto precision = decimal_field->precision();
    switch (decimal_field->bitWidth())
    {
        case 32:
            arrays.emplace_back(
                deserialize_non_owning_decimal<sparrow::decimal<int32_t>>(
                    record_batch,
                    encapsulated_message.body(),
                    name,
                    metadata,
                    buffer_index,
                    scale,
                    precision
                )
            );
            break;
        case 64:
            arrays.emplace_back(
                deserialize_non_owning_decimal<sparrow::decimal<int64_t>>(
                    record_batch,
                    encapsulated_message.body(),
                    name,
                    metadata,
                    buffer_index,
                    scale,
                    precision
                )
            );
            break;
        case 128:
            arrays.emplace_back(
                deserialize_non_owning_decimal<sparrow::decimal<sparrow::int128_t>>(
                    record_batch,
                    encapsulated_message.body(),
                    name,
                    metadata,
                    buffer_index,
                    scale,
                    precision
                )
            );
            break;
        case 256:
            arrays.emplace_back(
                deserialize_non_owning_decimal<sparrow::decimal<sparrow::int256_t>>(
                    record_batch,
                    encapsulated_message.body(),
                    name,
                    metadata,
                    buffer_index,
                    scale,
                    precision
                )
            );
            break;
        default:
            // Silently skipping the column would leave buffer_index pointing at
            // this column's buffers and desynchronize every following column.
            throw std::runtime_error("Unsupported decimal bit width.");
    }
    break;
}
default:
throw std::runtime_error("Unsupported type.");
}
Expand Down
4 changes: 4 additions & 0 deletions src/deserialize_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@ namespace sparrow_ipc::utils
size_t index
)
{
if(index >= static_cast<size_t>(record_batch.buffers()->size()))
{
throw std::runtime_error("Buffer index out of range");
}
const auto bitmap_metadata = record_batch.buffers()->Get(index);
if (bitmap_metadata->length() == 0)
{
Expand Down
116 changes: 78 additions & 38 deletions src/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,40 +3,14 @@
#include <charconv>
#include <stdexcept>
#include <string>
#include <vector>

#include "sparrow.hpp"

namespace sparrow_ipc
{
namespace
{
// Parse the format string
// The format string is expected to be "w:size", "+w:size", "d:precision,scale", etc
std::optional<int32_t> parse_format(std::string_view format_str, std::string_view sep)
{
// Find the position of the delimiter
const auto sep_pos = format_str.find(sep);
if (sep_pos == std::string_view::npos)
{
return std::nullopt;
}

std::string_view substr_str(format_str.data() + sep_pos + 1, format_str.size() - sep_pos - 1);

int32_t substr_size = 0;
const auto [ptr, ec] = std::from_chars(
substr_str.data(),
substr_str.data() + substr_str.size(),
substr_size
);

if (ec != std::errc() || ptr != substr_str.data() + substr_str.size())
{
return std::nullopt;
}
return substr_size;
}

// Creates a Flatbuffers Decimal type from a format string
// The format string is expected to be in the format "d:precision,scale"
std::pair<org::apache::arrow::flatbuf::Type, flatbuffers::Offset<void>> get_flatbuffer_decimal_type(
Expand All @@ -45,23 +19,21 @@ namespace sparrow_ipc
const int32_t bitWidth
)
{
// Decimal requires precision and scale. We need to parse the format_str.
// Format: "d:precision,scale"
const auto scale = parse_format(format_str, ",");
if (!scale.has_value())
const std::vector<std::string_view> words = utils::extract_words_after_colon(format_str);
if (words.size() < 2)
{
throw std::runtime_error(
"Failed to parse Decimal " + std::to_string(bitWidth)
+ " scale from format string: " + std::string(format_str)
+ " from format string: " + std::string(format_str)
);
}
const size_t comma_pos = format_str.find(',');
const auto precision = parse_format(format_str.substr(0, comma_pos), ":");
if (!precision.has_value())
const auto scale = utils::parse_to_int32(words[1]);
const auto precision = utils::parse_to_int32(words[0]);
if (!scale.has_value() || !precision.has_value())
{
throw std::runtime_error(
"Failed to parse Decimal " + std::to_string(bitWidth)
+ " precision from format string: " + std::string(format_str)
+ " precision/scale from format string: " + std::string(format_str)
);
}
const auto decimal_type = org::apache::arrow::flatbuf::CreateDecimal(
Expand All @@ -81,6 +53,59 @@ namespace sparrow_ipc
return (n + 7) & -8;
}

// Splits everything after the first ':' of `str` on ','.
// Yields no fields when there is no ':' or when nothing follows it; otherwise
// every field (including empty ones) is returned as a view into `str`.
std::vector<std::string_view> extract_words_after_colon(std::string_view str)
{
    const auto colon = str.find(':');
    if (colon == std::string_view::npos || colon + 1 == str.size())
    {
        return {};
    }

    // `tail` always holds the not-yet-split remainder of the input.
    std::string_view tail = str.substr(colon + 1);
    std::vector<std::string_view> words;
    for (;;)
    {
        const auto comma = tail.find(',');
        if (comma == std::string_view::npos)
        {
            // Last field (or the only one when no comma is present).
            words.push_back(tail);
            break;
        }
        words.push_back(tail.substr(0, comma));
        tail.remove_prefix(comma + 1);
    }
    return words;
}

// Converts `str` to a base-10 int32_t with std::from_chars.
// Returns std::nullopt unless the conversion succeeds and consumes all of `str`.
std::optional<int32_t> parse_to_int32(std::string_view str)
{
    const char* const first = str.data();
    const char* const last = first + str.size();

    int32_t parsed = 0;
    const std::from_chars_result outcome = std::from_chars(first, last, parsed);

    // Success requires no error code and no unparsed trailing characters.
    if (outcome.ec == std::errc() && outcome.ptr == last)
    {
        return parsed;
    }
    return std::nullopt;
}

std::pair<org::apache::arrow::flatbuf::Type, flatbuffers::Offset<void>>
get_flatbuffer_type(flatbuffers::FlatBufferBuilder& builder, std::string_view format_str)
{
Expand Down Expand Up @@ -355,7 +380,14 @@ namespace sparrow_ipc
{
// FixedSizeList requires listSize. We need to parse the format_str.
// Format: "+w:size"
const auto list_size = parse_format(format_str, ":");
const auto words = utils::extract_words_after_colon(format_str);
if (words.empty())
{
throw std::runtime_error(
"Failed to parse FixedSizeList size from format string: " + std::string(format_str)
);
}
const auto list_size = utils::parse_to_int32(words[0]);
if (!list_size.has_value())
{
throw std::runtime_error(
Expand Down Expand Up @@ -423,7 +455,15 @@ namespace sparrow_ipc
{
// FixedSizeBinary requires byteWidth. We need to parse the format_str.
// Format: "w:size"
const auto byte_width = parse_format(format_str, ":");
const auto words = utils::extract_words_after_colon(format_str);
if (words.empty())
{
throw std::runtime_error(
"Failed to parse FixedWidthBinary size from format string: "
+ std::string(format_str)
);
}
const auto byte_width = utils::parse_to_int32(words[0]);
if (!byte_width.has_value())
{
throw std::runtime_error(
Expand Down
Loading