Skip to content

Commit 610259f

Browse files
committed
Handle different data types
1 parent 7a7d44d commit 610259f

File tree

1 file changed

+315
-16
lines changed

1 file changed

+315
-16
lines changed

src/serialize.cpp

Lines changed: 315 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
#include <charconv>
12
#include <cstdint>
23
#include <cstring>
34
#include <optional>
@@ -18,30 +19,328 @@ namespace
1819
return (n + 7) & -8;
1920
}
2021

21-
// TODO Complete this with all possible formats?
22-
std::pair<org::apache::arrow::flatbuf::Type, flatbuffers::Offset<void>>
23-
get_flatbuffer_type(flatbuffers::FlatBufferBuilder& builder, const char* format_str)
22+
// Parse the format string
23+
std::optional<int32_t> parse_format(std::string_view format_str, std::string_view sep)
2424
{
25-
if (format_str == sparrow::data_type_to_format(sparrow::data_type::INT32))
25+
// Find the position of the colon
26+
auto sep_pos = format_str.find(sep);
27+
if (sep_pos == std::string_view::npos)
2628
{
27-
auto int_type = org::apache::arrow::flatbuf::CreateInt(builder, 32, true);
28-
return {org::apache::arrow::flatbuf::Type::Int, int_type.Union()};
29+
return std::nullopt;
2930
}
30-
else if (format_str == sparrow::data_type_to_format(sparrow::data_type::FLOAT))
31+
32+
std::string_view substr_str(format_str.data() + sep_pos + 1, format_str.size() - sep_pos - 1);
33+
34+
int32_t substr_size = 0;
35+
auto [ptr, ec] = std::from_chars(substr_str.data(), substr_str.data() + substr_str.size(), substr_size);
36+
37+
if (ec != std::errc() || ptr != substr_str.data() + substr_str.size())
3138
{
32-
auto fp_type = org::apache::arrow::flatbuf::CreateFloatingPoint(
33-
builder, org::apache::arrow::flatbuf::Precision::SINGLE);
34-
return {org::apache::arrow::flatbuf::Type::FloatingPoint, fp_type.Union()};
39+
return std::nullopt;
3540
}
36-
else if (format_str == sparrow::data_type_to_format(sparrow::data_type::DOUBLE))
41+
return substr_size;
42+
}
43+
44+
std::pair<org::apache::arrow::flatbuf::Type, flatbuffers::Offset<void>>
45+
get_flatbuffer_decimal_type(flatbuffers::FlatBufferBuilder& builder, std::string_view format_str, int32_t bitWidth)
46+
{
47+
// Decimal requires precision and scale. We need to parse the format_str.
48+
// Format: "d:precision,scale"
49+
auto scale = parse_format(format_str, ",");
50+
if (!scale.has_value())
3751
{
38-
auto fp_type = org::apache::arrow::flatbuf::CreateFloatingPoint(
39-
builder, org::apache::arrow::flatbuf::Precision::DOUBLE);
40-
return {org::apache::arrow::flatbuf::Type::FloatingPoint, fp_type.Union()};
52+
throw std::runtime_error("Failed to parse Decimal256 scale from format string: " + std::string(format_str));
4153
}
42-
else
54+
size_t comma_pos = format_str.find(',');
55+
auto precision = parse_format(format_str.substr(0, comma_pos), ":");
56+
if (!precision.has_value())
57+
{
58+
throw std::runtime_error("Failed to parse Decimal256 precision from format string: " + std::string(format_str));
59+
}
60+
auto decimal_type = org::apache::arrow::flatbuf::CreateDecimal(builder, precision.value(), scale.value(), bitWidth);
61+
return {org::apache::arrow::flatbuf::Type::Decimal, decimal_type.Union()};
62+
}
63+
64+
std::pair<org::apache::arrow::flatbuf::Type, flatbuffers::Offset<void>>
65+
get_flatbuffer_type(flatbuffers::FlatBufferBuilder& builder, std::string_view format_str)
66+
{
67+
auto type = sparrow::format_to_data_type(format_str);
68+
switch (type)
4369
{
44-
throw std::runtime_error("Unsupported data type for serialization");
70+
case sparrow::data_type::NA:
71+
{
72+
auto null_type = org::apache::arrow::flatbuf::CreateNull(builder);
73+
return {org::apache::arrow::flatbuf::Type::Null, null_type.Union()};
74+
}
75+
case sparrow::data_type::BOOL:
76+
{
77+
auto bool_type = org::apache::arrow::flatbuf::CreateBool(builder);
78+
return {org::apache::arrow::flatbuf::Type::Bool, bool_type.Union()};
79+
}
80+
case sparrow::data_type::UINT8:
81+
{
82+
auto int_type = org::apache::arrow::flatbuf::CreateInt(builder, 8, false);
83+
return {org::apache::arrow::flatbuf::Type::Int, int_type.Union()};
84+
}
85+
case sparrow::data_type::INT8:
86+
{
87+
auto int_type = org::apache::arrow::flatbuf::CreateInt(builder, 8, true);
88+
return {org::apache::arrow::flatbuf::Type::Int, int_type.Union()};
89+
}
90+
case sparrow::data_type::UINT16:
91+
{
92+
auto int_type = org::apache::arrow::flatbuf::CreateInt(builder, 16, false);
93+
return {org::apache::arrow::flatbuf::Type::Int, int_type.Union()};
94+
}
95+
case sparrow::data_type::INT16:
96+
{
97+
auto int_type = org::apache::arrow::flatbuf::CreateInt(builder, 16, true);
98+
return {org::apache::arrow::flatbuf::Type::Int, int_type.Union()};
99+
}
100+
case sparrow::data_type::UINT32:
101+
{
102+
auto int_type = org::apache::arrow::flatbuf::CreateInt(builder, 32, false);
103+
return {org::apache::arrow::flatbuf::Type::Int, int_type.Union()};
104+
}
105+
case sparrow::data_type::INT32:
106+
{
107+
auto int_type = org::apache::arrow::flatbuf::CreateInt(builder, 32, true);
108+
return {org::apache::arrow::flatbuf::Type::Int, int_type.Union()};
109+
}
110+
case sparrow::data_type::UINT64:
111+
{
112+
auto int_type = org::apache::arrow::flatbuf::CreateInt(builder, 64, false);
113+
return {org::apache::arrow::flatbuf::Type::Int, int_type.Union()};
114+
}
115+
case sparrow::data_type::INT64:
116+
{
117+
auto int_type = org::apache::arrow::flatbuf::CreateInt(builder, 64, true);
118+
return {org::apache::arrow::flatbuf::Type::Int, int_type.Union()};
119+
}
120+
case sparrow::data_type::HALF_FLOAT:
121+
{
122+
auto fp_type = org::apache::arrow::flatbuf::CreateFloatingPoint(
123+
builder, org::apache::arrow::flatbuf::Precision::HALF);
124+
return {org::apache::arrow::flatbuf::Type::FloatingPoint, fp_type.Union()};
125+
}
126+
case sparrow::data_type::FLOAT:
127+
{
128+
auto fp_type = org::apache::arrow::flatbuf::CreateFloatingPoint(
129+
builder, org::apache::arrow::flatbuf::Precision::SINGLE);
130+
return {org::apache::arrow::flatbuf::Type::FloatingPoint, fp_type.Union()};
131+
}
132+
case sparrow::data_type::DOUBLE:
133+
{
134+
auto fp_type = org::apache::arrow::flatbuf::CreateFloatingPoint(
135+
builder, org::apache::arrow::flatbuf::Precision::DOUBLE);
136+
return {org::apache::arrow::flatbuf::Type::FloatingPoint, fp_type.Union()};
137+
}
138+
case sparrow::data_type::STRING:
139+
{
140+
auto string_type = org::apache::arrow::flatbuf::CreateUtf8(builder);
141+
return {org::apache::arrow::flatbuf::Type::Utf8, string_type.Union()};
142+
}
143+
case sparrow::data_type::LARGE_STRING:
144+
{
145+
auto large_string_type = org::apache::arrow::flatbuf::CreateLargeUtf8(builder);
146+
return {org::apache::arrow::flatbuf::Type::LargeUtf8, large_string_type.Union()};
147+
}
148+
case sparrow::data_type::BINARY:
149+
{
150+
auto binary_type = org::apache::arrow::flatbuf::CreateBinary(builder);
151+
return {org::apache::arrow::flatbuf::Type::Binary, binary_type.Union()};
152+
}
153+
case sparrow::data_type::LARGE_BINARY:
154+
{
155+
auto large_binary_type = org::apache::arrow::flatbuf::CreateLargeBinary(builder);
156+
return {org::apache::arrow::flatbuf::Type::LargeBinary, large_binary_type.Union()};
157+
}
158+
case sparrow::data_type::STRING_VIEW:
159+
{
160+
auto string_view_type = org::apache::arrow::flatbuf::CreateUtf8View(builder);
161+
return {org::apache::arrow::flatbuf::Type::Utf8View, string_view_type.Union()};
162+
}
163+
case sparrow::data_type::BINARY_VIEW:
164+
{
165+
auto binary_view_type = org::apache::arrow::flatbuf::CreateBinaryView(builder);
166+
return {org::apache::arrow::flatbuf::Type::BinaryView, binary_view_type.Union()};
167+
}
168+
case sparrow::data_type::DATE_DAYS:
169+
{
170+
auto date_type = org::apache::arrow::flatbuf::CreateDate(builder, org::apache::arrow::flatbuf::DateUnit::DAY);
171+
return {org::apache::arrow::flatbuf::Type::Date, date_type.Union()};
172+
}
173+
case sparrow::data_type::DATE_MILLISECONDS:
174+
{
175+
auto date_type = org::apache::arrow::flatbuf::CreateDate(builder, org::apache::arrow::flatbuf::DateUnit::MILLISECOND);
176+
return {org::apache::arrow::flatbuf::Type::Date, date_type.Union()};
177+
}
178+
case sparrow::data_type::TIMESTAMP_SECONDS:
179+
{
180+
auto timestamp_type = org::apache::arrow::flatbuf::CreateTimestamp(builder, org::apache::arrow::flatbuf::TimeUnit::SECOND);
181+
return {org::apache::arrow::flatbuf::Type::Timestamp, timestamp_type.Union()};
182+
}
183+
case sparrow::data_type::TIMESTAMP_MILLISECONDS:
184+
{
185+
auto timestamp_type = org::apache::arrow::flatbuf::CreateTimestamp(builder, org::apache::arrow::flatbuf::TimeUnit::MILLISECOND);
186+
return {org::apache::arrow::flatbuf::Type::Timestamp, timestamp_type.Union()};
187+
}
188+
case sparrow::data_type::TIMESTAMP_MICROSECONDS:
189+
{
190+
auto timestamp_type = org::apache::arrow::flatbuf::CreateTimestamp(builder, org::apache::arrow::flatbuf::TimeUnit::MICROSECOND);
191+
return {org::apache::arrow::flatbuf::Type::Timestamp, timestamp_type.Union()};
192+
}
193+
case sparrow::data_type::TIMESTAMP_NANOSECONDS:
194+
{
195+
auto timestamp_type = org::apache::arrow::flatbuf::CreateTimestamp(builder, org::apache::arrow::flatbuf::TimeUnit::NANOSECOND);
196+
return {org::apache::arrow::flatbuf::Type::Timestamp, timestamp_type.Union()};
197+
}
198+
case sparrow::data_type::DURATION_SECONDS:
199+
{
200+
auto duration_type = org::apache::arrow::flatbuf::CreateDuration(builder, org::apache::arrow::flatbuf::TimeUnit::SECOND);
201+
return {org::apache::arrow::flatbuf::Type::Duration, duration_type.Union()};
202+
}
203+
case sparrow::data_type::DURATION_MILLISECONDS:
204+
{
205+
auto duration_type = org::apache::arrow::flatbuf::CreateDuration(builder, org::apache::arrow::flatbuf::TimeUnit::MILLISECOND);
206+
return {org::apache::arrow::flatbuf::Type::Duration, duration_type.Union()};
207+
}
208+
case sparrow::data_type::DURATION_MICROSECONDS:
209+
{
210+
auto duration_type = org::apache::arrow::flatbuf::CreateDuration(builder, org::apache::arrow::flatbuf::TimeUnit::MICROSECOND);
211+
return {org::apache::arrow::flatbuf::Type::Duration, duration_type.Union()};
212+
}
213+
case sparrow::data_type::DURATION_NANOSECONDS:
214+
{
215+
auto duration_type = org::apache::arrow::flatbuf::CreateDuration(builder, org::apache::arrow::flatbuf::TimeUnit::NANOSECOND);
216+
return {org::apache::arrow::flatbuf::Type::Duration, duration_type.Union()};
217+
}
218+
case sparrow::data_type::INTERVAL_MONTHS:
219+
{
220+
auto interval_type = org::apache::arrow::flatbuf::CreateInterval(builder, org::apache::arrow::flatbuf::IntervalUnit::YEAR_MONTH);
221+
return {org::apache::arrow::flatbuf::Type::Interval, interval_type.Union()};
222+
}
223+
case sparrow::data_type::INTERVAL_DAYS_TIME:
224+
{
225+
auto interval_type = org::apache::arrow::flatbuf::CreateInterval(builder, org::apache::arrow::flatbuf::IntervalUnit::DAY_TIME);
226+
return {org::apache::arrow::flatbuf::Type::Interval, interval_type.Union()};
227+
}
228+
case sparrow::data_type::INTERVAL_MONTHS_DAYS_NANOSECONDS:
229+
{
230+
auto interval_type = org::apache::arrow::flatbuf::CreateInterval(builder, org::apache::arrow::flatbuf::IntervalUnit::MONTH_DAY_NANO);
231+
return {org::apache::arrow::flatbuf::Type::Interval, interval_type.Union()};
232+
}
233+
case sparrow::data_type::TIME_SECONDS:
234+
{
235+
auto time_type = org::apache::arrow::flatbuf::CreateTime(builder, org::apache::arrow::flatbuf::TimeUnit::SECOND, 32);
236+
return {org::apache::arrow::flatbuf::Type::Time, time_type.Union()};
237+
}
238+
case sparrow::data_type::TIME_MILLISECONDS:
239+
{
240+
auto time_type = org::apache::arrow::flatbuf::CreateTime(builder, org::apache::arrow::flatbuf::TimeUnit::MILLISECOND, 32);
241+
return {org::apache::arrow::flatbuf::Type::Time, time_type.Union()};
242+
}
243+
case sparrow::data_type::TIME_MICROSECONDS:
244+
{
245+
auto time_type = org::apache::arrow::flatbuf::CreateTime(builder, org::apache::arrow::flatbuf::TimeUnit::MICROSECOND, 64);
246+
return {org::apache::arrow::flatbuf::Type::Time, time_type.Union()};
247+
}
248+
case sparrow::data_type::TIME_NANOSECONDS:
249+
{
250+
auto time_type = org::apache::arrow::flatbuf::CreateTime(builder, org::apache::arrow::flatbuf::TimeUnit::NANOSECOND, 64);
251+
return {org::apache::arrow::flatbuf::Type::Time, time_type.Union()};
252+
}
253+
case sparrow::data_type::LIST:
254+
{
255+
auto list_type = org::apache::arrow::flatbuf::CreateList(builder);
256+
return {org::apache::arrow::flatbuf::Type::List, list_type.Union()};
257+
}
258+
case sparrow::data_type::LARGE_LIST:
259+
{
260+
auto large_list_type = org::apache::arrow::flatbuf::CreateLargeList(builder);
261+
return {org::apache::arrow::flatbuf::Type::LargeList, large_list_type.Union()};
262+
}
263+
case sparrow::data_type::LIST_VIEW:
264+
{
265+
auto list_view_type = org::apache::arrow::flatbuf::CreateListView(builder);
266+
return {org::apache::arrow::flatbuf::Type::ListView, list_view_type.Union()};
267+
}
268+
case sparrow::data_type::LARGE_LIST_VIEW:
269+
{
270+
auto large_list_view_type = org::apache::arrow::flatbuf::CreateLargeListView(builder);
271+
return {org::apache::arrow::flatbuf::Type::LargeListView, large_list_view_type.Union()};
272+
}
273+
case sparrow::data_type::FIXED_SIZED_LIST:
274+
{
275+
// FixedSizeList requires listSize. We need to parse the format_str.
276+
// Format: "+w:size"
277+
auto list_size = parse_format(format_str, ":");
278+
if (!list_size.has_value())
279+
{
280+
throw std::runtime_error("Failed to parse FixedSizeList size from format string: " + std::string(format_str));
281+
}
282+
283+
auto fixed_size_list_type = org::apache::arrow::flatbuf::CreateFixedSizeList(builder, list_size.value());
284+
return {org::apache::arrow::flatbuf::Type::FixedSizeList, fixed_size_list_type.Union()};
285+
}
286+
case sparrow::data_type::STRUCT:
287+
{
288+
auto struct_type = org::apache::arrow::flatbuf::CreateStruct_(builder);
289+
return {org::apache::arrow::flatbuf::Type::Struct_, struct_type.Union()};
290+
}
291+
case sparrow::data_type::MAP:
292+
{
293+
auto map_type = org::apache::arrow::flatbuf::CreateMap(builder, false); // not sorted keys
294+
return {org::apache::arrow::flatbuf::Type::Map, map_type.Union()};
295+
}
296+
case sparrow::data_type::DENSE_UNION:
297+
{
298+
auto union_type = org::apache::arrow::flatbuf::CreateUnion(builder, org::apache::arrow::flatbuf::UnionMode::Dense, 0);
299+
return {org::apache::arrow::flatbuf::Type::Union, union_type.Union()};
300+
}
301+
case sparrow::data_type::SPARSE_UNION:
302+
{
303+
auto union_type = org::apache::arrow::flatbuf::CreateUnion(builder, org::apache::arrow::flatbuf::UnionMode::Sparse, 0);
304+
return {org::apache::arrow::flatbuf::Type::Union, union_type.Union()};
305+
}
306+
case sparrow::data_type::RUN_ENCODED:
307+
{
308+
auto run_end_encoded_type = org::apache::arrow::flatbuf::CreateRunEndEncoded(builder);
309+
return {org::apache::arrow::flatbuf::Type::RunEndEncoded, run_end_encoded_type.Union()};
310+
}
311+
case sparrow::data_type::DECIMAL32:
312+
{
313+
return get_flatbuffer_decimal_type(builder, format_str, 32);
314+
}
315+
case sparrow::data_type::DECIMAL64:
316+
{
317+
return get_flatbuffer_decimal_type(builder, format_str, 64);
318+
}
319+
case sparrow::data_type::DECIMAL128:
320+
{
321+
return get_flatbuffer_decimal_type(builder, format_str, 128);
322+
}
323+
case sparrow::data_type::DECIMAL256:
324+
{
325+
return get_flatbuffer_decimal_type(builder, format_str, 256);
326+
}
327+
case sparrow::data_type::FIXED_WIDTH_BINARY:
328+
{
329+
// FixedSizeBinary requires byteWidth. We need to parse the format_str.
330+
// Format: "w:size"
331+
auto byte_width = parse_format(format_str, ":");
332+
if (!byte_width.has_value())
333+
{
334+
throw std::runtime_error("Failed to parse FixedWidthBinary size from format string: " + std::string(format_str));
335+
}
336+
337+
auto fixed_width_binary_type = org::apache::arrow::flatbuf::CreateFixedSizeBinary(builder, byte_width.value());
338+
return {org::apache::arrow::flatbuf::Type::FixedSizeBinary, fixed_width_binary_type.Union()};
339+
}
340+
default:
341+
{
342+
throw std::runtime_error("Unsupported data type for serialization");
343+
}
45344
}
46345
}
47346
}

0 commit comments

Comments
 (0)