|
1 | | -#include <charconv> |
2 | 1 | #include <cstdint> |
3 | 2 | #include <cstring> |
4 | 3 | #include <optional> |
|
10 | 9 | #include "Schema_generated.h" |
11 | 10 |
|
12 | 11 | #include "serialize.hpp" |
13 | | - |
14 | | -namespace |
15 | | -{ |
// Rounds `n` up to the nearest multiple of 8 (values already on an 8-byte
// boundary are returned unchanged). The Arrow IPC format requires message
// bodies to be padded to such a boundary.
int64_t align_to_8(int64_t n)
{
    constexpr int64_t alignment = 8;
    constexpr int64_t low_bits_mask = alignment - 1;
    // Adding the mask bumps any non-aligned value past the next boundary;
    // clearing the three low bits then snaps it back onto that boundary.
    const int64_t bumped = n + low_bits_mask;
    return bumped & ~low_bits_mask;
}
21 | | - |
// Parses the integer that follows the first occurrence of `sep` in `format_str`.
//
// Everything after the separator must be a base-10 integer that fits in an
// int32_t, with no trailing characters; e.g. parse_format("w:16", ":") yields 16.
//
// Returns std::nullopt when the separator is not found, when nothing follows
// it, or when the remainder is not entirely a valid in-range integer.
// An empty `sep` matches at position 0, so the whole string is parsed.
std::optional<int32_t> parse_format(std::string_view format_str, std::string_view sep)
{
    const auto sep_pos = format_str.find(sep);
    if (sep_pos == std::string_view::npos)
    {
        return std::nullopt;
    }

    // Skip the whole separator rather than a single character: this handles
    // multi-character separators and keeps the offset within bounds when the
    // separator is empty (find() guarantees sep_pos + sep.size() <= size()).
    const std::string_view number_str = format_str.substr(sep_pos + sep.size());

    int32_t value = 0;
    const char* const last = number_str.data() + number_str.size();
    const auto [ptr, ec] = std::from_chars(number_str.data(), last, value);

    // Reject both conversion errors and partially consumed input ("12x").
    if (ec != std::errc() || ptr != last)
    {
        return std::nullopt;
    }
    return value;
}
43 | | - |
44 | | - std::pair<org::apache::arrow::flatbuf::Type, flatbuffers::Offset<void>> |
45 | | - get_flatbuffer_decimal_type(flatbuffers::FlatBufferBuilder& builder, std::string_view format_str, int32_t bitWidth) |
46 | | - { |
47 | | - // Decimal requires precision and scale. We need to parse the format_str. |
48 | | - // Format: "d:precision,scale" |
49 | | - auto scale = parse_format(format_str, ","); |
50 | | - if (!scale.has_value()) |
51 | | - { |
52 | | - throw std::runtime_error("Failed to parse Decimal256 scale from format string: " + std::string(format_str)); |
53 | | - } |
54 | | - size_t comma_pos = format_str.find(','); |
55 | | - auto precision = parse_format(format_str.substr(0, comma_pos), ":"); |
56 | | - if (!precision.has_value()) |
57 | | - { |
58 | | - throw std::runtime_error("Failed to parse Decimal256 precision from format string: " + std::string(format_str)); |
59 | | - } |
60 | | - auto decimal_type = org::apache::arrow::flatbuf::CreateDecimal(builder, precision.value(), scale.value(), bitWidth); |
61 | | - return {org::apache::arrow::flatbuf::Type::Decimal, decimal_type.Union()}; |
62 | | - } |
63 | | - |
64 | | - std::pair<org::apache::arrow::flatbuf::Type, flatbuffers::Offset<void>> |
65 | | - get_flatbuffer_type(flatbuffers::FlatBufferBuilder& builder, std::string_view format_str) |
66 | | - { |
67 | | - auto type = sparrow::format_to_data_type(format_str); |
68 | | - switch (type) |
69 | | - { |
70 | | - case sparrow::data_type::NA: |
71 | | - { |
72 | | - auto null_type = org::apache::arrow::flatbuf::CreateNull(builder); |
73 | | - return {org::apache::arrow::flatbuf::Type::Null, null_type.Union()}; |
74 | | - } |
75 | | - case sparrow::data_type::BOOL: |
76 | | - { |
77 | | - auto bool_type = org::apache::arrow::flatbuf::CreateBool(builder); |
78 | | - return {org::apache::arrow::flatbuf::Type::Bool, bool_type.Union()}; |
79 | | - } |
80 | | - case sparrow::data_type::UINT8: |
81 | | - { |
82 | | - auto int_type = org::apache::arrow::flatbuf::CreateInt(builder, 8, false); |
83 | | - return {org::apache::arrow::flatbuf::Type::Int, int_type.Union()}; |
84 | | - } |
85 | | - case sparrow::data_type::INT8: |
86 | | - { |
87 | | - auto int_type = org::apache::arrow::flatbuf::CreateInt(builder, 8, true); |
88 | | - return {org::apache::arrow::flatbuf::Type::Int, int_type.Union()}; |
89 | | - } |
90 | | - case sparrow::data_type::UINT16: |
91 | | - { |
92 | | - auto int_type = org::apache::arrow::flatbuf::CreateInt(builder, 16, false); |
93 | | - return {org::apache::arrow::flatbuf::Type::Int, int_type.Union()}; |
94 | | - } |
95 | | - case sparrow::data_type::INT16: |
96 | | - { |
97 | | - auto int_type = org::apache::arrow::flatbuf::CreateInt(builder, 16, true); |
98 | | - return {org::apache::arrow::flatbuf::Type::Int, int_type.Union()}; |
99 | | - } |
100 | | - case sparrow::data_type::UINT32: |
101 | | - { |
102 | | - auto int_type = org::apache::arrow::flatbuf::CreateInt(builder, 32, false); |
103 | | - return {org::apache::arrow::flatbuf::Type::Int, int_type.Union()}; |
104 | | - } |
105 | | - case sparrow::data_type::INT32: |
106 | | - { |
107 | | - auto int_type = org::apache::arrow::flatbuf::CreateInt(builder, 32, true); |
108 | | - return {org::apache::arrow::flatbuf::Type::Int, int_type.Union()}; |
109 | | - } |
110 | | - case sparrow::data_type::UINT64: |
111 | | - { |
112 | | - auto int_type = org::apache::arrow::flatbuf::CreateInt(builder, 64, false); |
113 | | - return {org::apache::arrow::flatbuf::Type::Int, int_type.Union()}; |
114 | | - } |
115 | | - case sparrow::data_type::INT64: |
116 | | - { |
117 | | - auto int_type = org::apache::arrow::flatbuf::CreateInt(builder, 64, true); |
118 | | - return {org::apache::arrow::flatbuf::Type::Int, int_type.Union()}; |
119 | | - } |
120 | | - case sparrow::data_type::HALF_FLOAT: |
121 | | - { |
122 | | - auto fp_type = org::apache::arrow::flatbuf::CreateFloatingPoint( |
123 | | - builder, org::apache::arrow::flatbuf::Precision::HALF); |
124 | | - return {org::apache::arrow::flatbuf::Type::FloatingPoint, fp_type.Union()}; |
125 | | - } |
126 | | - case sparrow::data_type::FLOAT: |
127 | | - { |
128 | | - auto fp_type = org::apache::arrow::flatbuf::CreateFloatingPoint( |
129 | | - builder, org::apache::arrow::flatbuf::Precision::SINGLE); |
130 | | - return {org::apache::arrow::flatbuf::Type::FloatingPoint, fp_type.Union()}; |
131 | | - } |
132 | | - case sparrow::data_type::DOUBLE: |
133 | | - { |
134 | | - auto fp_type = org::apache::arrow::flatbuf::CreateFloatingPoint( |
135 | | - builder, org::apache::arrow::flatbuf::Precision::DOUBLE); |
136 | | - return {org::apache::arrow::flatbuf::Type::FloatingPoint, fp_type.Union()}; |
137 | | - } |
138 | | - case sparrow::data_type::STRING: |
139 | | - { |
140 | | - auto string_type = org::apache::arrow::flatbuf::CreateUtf8(builder); |
141 | | - return {org::apache::arrow::flatbuf::Type::Utf8, string_type.Union()}; |
142 | | - } |
143 | | - case sparrow::data_type::LARGE_STRING: |
144 | | - { |
145 | | - auto large_string_type = org::apache::arrow::flatbuf::CreateLargeUtf8(builder); |
146 | | - return {org::apache::arrow::flatbuf::Type::LargeUtf8, large_string_type.Union()}; |
147 | | - } |
148 | | - case sparrow::data_type::BINARY: |
149 | | - { |
150 | | - auto binary_type = org::apache::arrow::flatbuf::CreateBinary(builder); |
151 | | - return {org::apache::arrow::flatbuf::Type::Binary, binary_type.Union()}; |
152 | | - } |
153 | | - case sparrow::data_type::LARGE_BINARY: |
154 | | - { |
155 | | - auto large_binary_type = org::apache::arrow::flatbuf::CreateLargeBinary(builder); |
156 | | - return {org::apache::arrow::flatbuf::Type::LargeBinary, large_binary_type.Union()}; |
157 | | - } |
158 | | - case sparrow::data_type::STRING_VIEW: |
159 | | - { |
160 | | - auto string_view_type = org::apache::arrow::flatbuf::CreateUtf8View(builder); |
161 | | - return {org::apache::arrow::flatbuf::Type::Utf8View, string_view_type.Union()}; |
162 | | - } |
163 | | - case sparrow::data_type::BINARY_VIEW: |
164 | | - { |
165 | | - auto binary_view_type = org::apache::arrow::flatbuf::CreateBinaryView(builder); |
166 | | - return {org::apache::arrow::flatbuf::Type::BinaryView, binary_view_type.Union()}; |
167 | | - } |
168 | | - case sparrow::data_type::DATE_DAYS: |
169 | | - { |
170 | | - auto date_type = org::apache::arrow::flatbuf::CreateDate(builder, org::apache::arrow::flatbuf::DateUnit::DAY); |
171 | | - return {org::apache::arrow::flatbuf::Type::Date, date_type.Union()}; |
172 | | - } |
173 | | - case sparrow::data_type::DATE_MILLISECONDS: |
174 | | - { |
175 | | - auto date_type = org::apache::arrow::flatbuf::CreateDate(builder, org::apache::arrow::flatbuf::DateUnit::MILLISECOND); |
176 | | - return {org::apache::arrow::flatbuf::Type::Date, date_type.Union()}; |
177 | | - } |
178 | | - case sparrow::data_type::TIMESTAMP_SECONDS: |
179 | | - { |
180 | | - auto timestamp_type = org::apache::arrow::flatbuf::CreateTimestamp(builder, org::apache::arrow::flatbuf::TimeUnit::SECOND); |
181 | | - return {org::apache::arrow::flatbuf::Type::Timestamp, timestamp_type.Union()}; |
182 | | - } |
183 | | - case sparrow::data_type::TIMESTAMP_MILLISECONDS: |
184 | | - { |
185 | | - auto timestamp_type = org::apache::arrow::flatbuf::CreateTimestamp(builder, org::apache::arrow::flatbuf::TimeUnit::MILLISECOND); |
186 | | - return {org::apache::arrow::flatbuf::Type::Timestamp, timestamp_type.Union()}; |
187 | | - } |
188 | | - case sparrow::data_type::TIMESTAMP_MICROSECONDS: |
189 | | - { |
190 | | - auto timestamp_type = org::apache::arrow::flatbuf::CreateTimestamp(builder, org::apache::arrow::flatbuf::TimeUnit::MICROSECOND); |
191 | | - return {org::apache::arrow::flatbuf::Type::Timestamp, timestamp_type.Union()}; |
192 | | - } |
193 | | - case sparrow::data_type::TIMESTAMP_NANOSECONDS: |
194 | | - { |
195 | | - auto timestamp_type = org::apache::arrow::flatbuf::CreateTimestamp(builder, org::apache::arrow::flatbuf::TimeUnit::NANOSECOND); |
196 | | - return {org::apache::arrow::flatbuf::Type::Timestamp, timestamp_type.Union()}; |
197 | | - } |
198 | | - case sparrow::data_type::DURATION_SECONDS: |
199 | | - { |
200 | | - auto duration_type = org::apache::arrow::flatbuf::CreateDuration(builder, org::apache::arrow::flatbuf::TimeUnit::SECOND); |
201 | | - return {org::apache::arrow::flatbuf::Type::Duration, duration_type.Union()}; |
202 | | - } |
203 | | - case sparrow::data_type::DURATION_MILLISECONDS: |
204 | | - { |
205 | | - auto duration_type = org::apache::arrow::flatbuf::CreateDuration(builder, org::apache::arrow::flatbuf::TimeUnit::MILLISECOND); |
206 | | - return {org::apache::arrow::flatbuf::Type::Duration, duration_type.Union()}; |
207 | | - } |
208 | | - case sparrow::data_type::DURATION_MICROSECONDS: |
209 | | - { |
210 | | - auto duration_type = org::apache::arrow::flatbuf::CreateDuration(builder, org::apache::arrow::flatbuf::TimeUnit::MICROSECOND); |
211 | | - return {org::apache::arrow::flatbuf::Type::Duration, duration_type.Union()}; |
212 | | - } |
213 | | - case sparrow::data_type::DURATION_NANOSECONDS: |
214 | | - { |
215 | | - auto duration_type = org::apache::arrow::flatbuf::CreateDuration(builder, org::apache::arrow::flatbuf::TimeUnit::NANOSECOND); |
216 | | - return {org::apache::arrow::flatbuf::Type::Duration, duration_type.Union()}; |
217 | | - } |
218 | | - case sparrow::data_type::INTERVAL_MONTHS: |
219 | | - { |
220 | | - auto interval_type = org::apache::arrow::flatbuf::CreateInterval(builder, org::apache::arrow::flatbuf::IntervalUnit::YEAR_MONTH); |
221 | | - return {org::apache::arrow::flatbuf::Type::Interval, interval_type.Union()}; |
222 | | - } |
223 | | - case sparrow::data_type::INTERVAL_DAYS_TIME: |
224 | | - { |
225 | | - auto interval_type = org::apache::arrow::flatbuf::CreateInterval(builder, org::apache::arrow::flatbuf::IntervalUnit::DAY_TIME); |
226 | | - return {org::apache::arrow::flatbuf::Type::Interval, interval_type.Union()}; |
227 | | - } |
228 | | - case sparrow::data_type::INTERVAL_MONTHS_DAYS_NANOSECONDS: |
229 | | - { |
230 | | - auto interval_type = org::apache::arrow::flatbuf::CreateInterval(builder, org::apache::arrow::flatbuf::IntervalUnit::MONTH_DAY_NANO); |
231 | | - return {org::apache::arrow::flatbuf::Type::Interval, interval_type.Union()}; |
232 | | - } |
233 | | - case sparrow::data_type::TIME_SECONDS: |
234 | | - { |
235 | | - auto time_type = org::apache::arrow::flatbuf::CreateTime(builder, org::apache::arrow::flatbuf::TimeUnit::SECOND, 32); |
236 | | - return {org::apache::arrow::flatbuf::Type::Time, time_type.Union()}; |
237 | | - } |
238 | | - case sparrow::data_type::TIME_MILLISECONDS: |
239 | | - { |
240 | | - auto time_type = org::apache::arrow::flatbuf::CreateTime(builder, org::apache::arrow::flatbuf::TimeUnit::MILLISECOND, 32); |
241 | | - return {org::apache::arrow::flatbuf::Type::Time, time_type.Union()}; |
242 | | - } |
243 | | - case sparrow::data_type::TIME_MICROSECONDS: |
244 | | - { |
245 | | - auto time_type = org::apache::arrow::flatbuf::CreateTime(builder, org::apache::arrow::flatbuf::TimeUnit::MICROSECOND, 64); |
246 | | - return {org::apache::arrow::flatbuf::Type::Time, time_type.Union()}; |
247 | | - } |
248 | | - case sparrow::data_type::TIME_NANOSECONDS: |
249 | | - { |
250 | | - auto time_type = org::apache::arrow::flatbuf::CreateTime(builder, org::apache::arrow::flatbuf::TimeUnit::NANOSECOND, 64); |
251 | | - return {org::apache::arrow::flatbuf::Type::Time, time_type.Union()}; |
252 | | - } |
253 | | - case sparrow::data_type::LIST: |
254 | | - { |
255 | | - auto list_type = org::apache::arrow::flatbuf::CreateList(builder); |
256 | | - return {org::apache::arrow::flatbuf::Type::List, list_type.Union()}; |
257 | | - } |
258 | | - case sparrow::data_type::LARGE_LIST: |
259 | | - { |
260 | | - auto large_list_type = org::apache::arrow::flatbuf::CreateLargeList(builder); |
261 | | - return {org::apache::arrow::flatbuf::Type::LargeList, large_list_type.Union()}; |
262 | | - } |
263 | | - case sparrow::data_type::LIST_VIEW: |
264 | | - { |
265 | | - auto list_view_type = org::apache::arrow::flatbuf::CreateListView(builder); |
266 | | - return {org::apache::arrow::flatbuf::Type::ListView, list_view_type.Union()}; |
267 | | - } |
268 | | - case sparrow::data_type::LARGE_LIST_VIEW: |
269 | | - { |
270 | | - auto large_list_view_type = org::apache::arrow::flatbuf::CreateLargeListView(builder); |
271 | | - return {org::apache::arrow::flatbuf::Type::LargeListView, large_list_view_type.Union()}; |
272 | | - } |
273 | | - case sparrow::data_type::FIXED_SIZED_LIST: |
274 | | - { |
275 | | - // FixedSizeList requires listSize. We need to parse the format_str. |
276 | | - // Format: "+w:size" |
277 | | - auto list_size = parse_format(format_str, ":"); |
278 | | - if (!list_size.has_value()) |
279 | | - { |
280 | | - throw std::runtime_error("Failed to parse FixedSizeList size from format string: " + std::string(format_str)); |
281 | | - } |
282 | | - |
283 | | - auto fixed_size_list_type = org::apache::arrow::flatbuf::CreateFixedSizeList(builder, list_size.value()); |
284 | | - return {org::apache::arrow::flatbuf::Type::FixedSizeList, fixed_size_list_type.Union()}; |
285 | | - } |
286 | | - case sparrow::data_type::STRUCT: |
287 | | - { |
288 | | - auto struct_type = org::apache::arrow::flatbuf::CreateStruct_(builder); |
289 | | - return {org::apache::arrow::flatbuf::Type::Struct_, struct_type.Union()}; |
290 | | - } |
291 | | - case sparrow::data_type::MAP: |
292 | | - { |
293 | | - auto map_type = org::apache::arrow::flatbuf::CreateMap(builder, false); // not sorted keys |
294 | | - return {org::apache::arrow::flatbuf::Type::Map, map_type.Union()}; |
295 | | - } |
296 | | - case sparrow::data_type::DENSE_UNION: |
297 | | - { |
298 | | - auto union_type = org::apache::arrow::flatbuf::CreateUnion(builder, org::apache::arrow::flatbuf::UnionMode::Dense, 0); |
299 | | - return {org::apache::arrow::flatbuf::Type::Union, union_type.Union()}; |
300 | | - } |
301 | | - case sparrow::data_type::SPARSE_UNION: |
302 | | - { |
303 | | - auto union_type = org::apache::arrow::flatbuf::CreateUnion(builder, org::apache::arrow::flatbuf::UnionMode::Sparse, 0); |
304 | | - return {org::apache::arrow::flatbuf::Type::Union, union_type.Union()}; |
305 | | - } |
306 | | - case sparrow::data_type::RUN_ENCODED: |
307 | | - { |
308 | | - auto run_end_encoded_type = org::apache::arrow::flatbuf::CreateRunEndEncoded(builder); |
309 | | - return {org::apache::arrow::flatbuf::Type::RunEndEncoded, run_end_encoded_type.Union()}; |
310 | | - } |
311 | | - case sparrow::data_type::DECIMAL32: |
312 | | - { |
313 | | - return get_flatbuffer_decimal_type(builder, format_str, 32); |
314 | | - } |
315 | | - case sparrow::data_type::DECIMAL64: |
316 | | - { |
317 | | - return get_flatbuffer_decimal_type(builder, format_str, 64); |
318 | | - } |
319 | | - case sparrow::data_type::DECIMAL128: |
320 | | - { |
321 | | - return get_flatbuffer_decimal_type(builder, format_str, 128); |
322 | | - } |
323 | | - case sparrow::data_type::DECIMAL256: |
324 | | - { |
325 | | - return get_flatbuffer_decimal_type(builder, format_str, 256); |
326 | | - } |
327 | | - case sparrow::data_type::FIXED_WIDTH_BINARY: |
328 | | - { |
329 | | - // FixedSizeBinary requires byteWidth. We need to parse the format_str. |
330 | | - // Format: "w:size" |
331 | | - auto byte_width = parse_format(format_str, ":"); |
332 | | - if (!byte_width.has_value()) |
333 | | - { |
334 | | - throw std::runtime_error("Failed to parse FixedWidthBinary size from format string: " + std::string(format_str)); |
335 | | - } |
336 | | - |
337 | | - auto fixed_width_binary_type = org::apache::arrow::flatbuf::CreateFixedSizeBinary(builder, byte_width.value()); |
338 | | - return {org::apache::arrow::flatbuf::Type::FixedSizeBinary, fixed_width_binary_type.Union()}; |
339 | | - } |
340 | | - default: |
341 | | - { |
342 | | - throw std::runtime_error("Unsupported data type for serialization"); |
343 | | - } |
344 | | - } |
345 | | - } |
346 | | -} |
| 12 | +#include "utils.hpp" |
347 | 13 |
|
348 | 14 | template <typename T> |
349 | 15 | std::vector<uint8_t> serialize_primitive_array(const sparrow::primitive_array<T>& arr) |
|
0 commit comments