1+ #include < charconv>
12#include < cstdint>
23#include < cstring>
34#include < optional>
@@ -18,30 +19,328 @@ namespace
1819 return (n + 7 ) & -8 ;
1920 }
2021
// Parses the integer that follows the first occurrence of `sep` in `format_str`.
//
// Used to pull numeric parameters out of Arrow format strings, e.g. the `16`
// in "+w:16" (sep ":") or the `10` in "d:19,10" (sep ",").
//
// @param format_str  Text to search.
// @param sep         Separator that precedes the integer. It may be longer
//                    than one character.
// @return The parsed value, or std::nullopt when `sep` is not found, when the
//         text after it is empty or not a base-10 integer, when characters
//         remain after the integer, or when the value does not fit in int32_t.
std::optional<int32_t> parse_format(std::string_view format_str, std::string_view sep)
{
    const auto sep_pos = format_str.find(sep);
    if (sep_pos == std::string_view::npos)
    {
        return std::nullopt;
    }

    // Skip the whole separator rather than a single character, so that
    // multi-character separators do not leak into the number being parsed.
    const std::string_view number = format_str.substr(sep_pos + sep.size());
    const char* const first = number.data();
    const char* const last = first + number.size();

    int32_t value = 0;
    const auto [ptr, ec] = std::from_chars(first, last, value);

    // Require that the entire remainder was consumed: "12x" is rejected.
    if (ec != std::errc() || ptr != last)
    {
        return std::nullopt;
    }
    return value;
}

44+ std::pair<org::apache::arrow::flatbuf::Type, flatbuffers::Offset<void >>
45+ get_flatbuffer_decimal_type (flatbuffers::FlatBufferBuilder& builder, std::string_view format_str, int32_t bitWidth)
46+ {
47+ // Decimal requires precision and scale. We need to parse the format_str.
48+ // Format: "d:precision,scale"
49+ auto scale = parse_format (format_str, " ," );
50+ if (!scale.has_value ())
3751 {
38- auto fp_type = org::apache::arrow::flatbuf::CreateFloatingPoint (
39- builder, org::apache::arrow::flatbuf::Precision::DOUBLE);
40- return {org::apache::arrow::flatbuf::Type::FloatingPoint, fp_type.Union ()};
52+ throw std::runtime_error (" Failed to parse Decimal256 scale from format string: " + std::string (format_str));
4153 }
42- else
54+ size_t comma_pos = format_str.find (' ,' );
55+ auto precision = parse_format (format_str.substr (0 , comma_pos), " :" );
56+ if (!precision.has_value ())
57+ {
58+ throw std::runtime_error (" Failed to parse Decimal256 precision from format string: " + std::string (format_str));
59+ }
60+ auto decimal_type = org::apache::arrow::flatbuf::CreateDecimal (builder, precision.value (), scale.value (), bitWidth);
61+ return {org::apache::arrow::flatbuf::Type::Decimal, decimal_type.Union ()};
62+ }
63+
64+ std::pair<org::apache::arrow::flatbuf::Type, flatbuffers::Offset<void >>
65+ get_flatbuffer_type (flatbuffers::FlatBufferBuilder& builder, std::string_view format_str)
66+ {
67+ auto type = sparrow::format_to_data_type (format_str);
68+ switch (type)
4369 {
44- throw std::runtime_error (" Unsupported data type for serialization" );
70+ case sparrow::data_type::NA:
71+ {
72+ auto null_type = org::apache::arrow::flatbuf::CreateNull (builder);
73+ return {org::apache::arrow::flatbuf::Type::Null, null_type.Union ()};
74+ }
75+ case sparrow::data_type::BOOL:
76+ {
77+ auto bool_type = org::apache::arrow::flatbuf::CreateBool (builder);
78+ return {org::apache::arrow::flatbuf::Type::Bool, bool_type.Union ()};
79+ }
80+ case sparrow::data_type::UINT8:
81+ {
82+ auto int_type = org::apache::arrow::flatbuf::CreateInt (builder, 8 , false );
83+ return {org::apache::arrow::flatbuf::Type::Int, int_type.Union ()};
84+ }
85+ case sparrow::data_type::INT8:
86+ {
87+ auto int_type = org::apache::arrow::flatbuf::CreateInt (builder, 8 , true );
88+ return {org::apache::arrow::flatbuf::Type::Int, int_type.Union ()};
89+ }
90+ case sparrow::data_type::UINT16:
91+ {
92+ auto int_type = org::apache::arrow::flatbuf::CreateInt (builder, 16 , false );
93+ return {org::apache::arrow::flatbuf::Type::Int, int_type.Union ()};
94+ }
95+ case sparrow::data_type::INT16:
96+ {
97+ auto int_type = org::apache::arrow::flatbuf::CreateInt (builder, 16 , true );
98+ return {org::apache::arrow::flatbuf::Type::Int, int_type.Union ()};
99+ }
100+ case sparrow::data_type::UINT32:
101+ {
102+ auto int_type = org::apache::arrow::flatbuf::CreateInt (builder, 32 , false );
103+ return {org::apache::arrow::flatbuf::Type::Int, int_type.Union ()};
104+ }
105+ case sparrow::data_type::INT32:
106+ {
107+ auto int_type = org::apache::arrow::flatbuf::CreateInt (builder, 32 , true );
108+ return {org::apache::arrow::flatbuf::Type::Int, int_type.Union ()};
109+ }
110+ case sparrow::data_type::UINT64:
111+ {
112+ auto int_type = org::apache::arrow::flatbuf::CreateInt (builder, 64 , false );
113+ return {org::apache::arrow::flatbuf::Type::Int, int_type.Union ()};
114+ }
115+ case sparrow::data_type::INT64:
116+ {
117+ auto int_type = org::apache::arrow::flatbuf::CreateInt (builder, 64 , true );
118+ return {org::apache::arrow::flatbuf::Type::Int, int_type.Union ()};
119+ }
120+ case sparrow::data_type::HALF_FLOAT:
121+ {
122+ auto fp_type = org::apache::arrow::flatbuf::CreateFloatingPoint (
123+ builder, org::apache::arrow::flatbuf::Precision::HALF);
124+ return {org::apache::arrow::flatbuf::Type::FloatingPoint, fp_type.Union ()};
125+ }
126+ case sparrow::data_type::FLOAT:
127+ {
128+ auto fp_type = org::apache::arrow::flatbuf::CreateFloatingPoint (
129+ builder, org::apache::arrow::flatbuf::Precision::SINGLE);
130+ return {org::apache::arrow::flatbuf::Type::FloatingPoint, fp_type.Union ()};
131+ }
132+ case sparrow::data_type::DOUBLE:
133+ {
134+ auto fp_type = org::apache::arrow::flatbuf::CreateFloatingPoint (
135+ builder, org::apache::arrow::flatbuf::Precision::DOUBLE);
136+ return {org::apache::arrow::flatbuf::Type::FloatingPoint, fp_type.Union ()};
137+ }
138+ case sparrow::data_type::STRING:
139+ {
140+ auto string_type = org::apache::arrow::flatbuf::CreateUtf8 (builder);
141+ return {org::apache::arrow::flatbuf::Type::Utf8, string_type.Union ()};
142+ }
143+ case sparrow::data_type::LARGE_STRING:
144+ {
145+ auto large_string_type = org::apache::arrow::flatbuf::CreateLargeUtf8 (builder);
146+ return {org::apache::arrow::flatbuf::Type::LargeUtf8, large_string_type.Union ()};
147+ }
148+ case sparrow::data_type::BINARY:
149+ {
150+ auto binary_type = org::apache::arrow::flatbuf::CreateBinary (builder);
151+ return {org::apache::arrow::flatbuf::Type::Binary, binary_type.Union ()};
152+ }
153+ case sparrow::data_type::LARGE_BINARY:
154+ {
155+ auto large_binary_type = org::apache::arrow::flatbuf::CreateLargeBinary (builder);
156+ return {org::apache::arrow::flatbuf::Type::LargeBinary, large_binary_type.Union ()};
157+ }
158+ case sparrow::data_type::STRING_VIEW:
159+ {
160+ auto string_view_type = org::apache::arrow::flatbuf::CreateUtf8View (builder);
161+ return {org::apache::arrow::flatbuf::Type::Utf8View, string_view_type.Union ()};
162+ }
163+ case sparrow::data_type::BINARY_VIEW:
164+ {
165+ auto binary_view_type = org::apache::arrow::flatbuf::CreateBinaryView (builder);
166+ return {org::apache::arrow::flatbuf::Type::BinaryView, binary_view_type.Union ()};
167+ }
168+ case sparrow::data_type::DATE_DAYS:
169+ {
170+ auto date_type = org::apache::arrow::flatbuf::CreateDate (builder, org::apache::arrow::flatbuf::DateUnit::DAY);
171+ return {org::apache::arrow::flatbuf::Type::Date, date_type.Union ()};
172+ }
173+ case sparrow::data_type::DATE_MILLISECONDS:
174+ {
175+ auto date_type = org::apache::arrow::flatbuf::CreateDate (builder, org::apache::arrow::flatbuf::DateUnit::MILLISECOND);
176+ return {org::apache::arrow::flatbuf::Type::Date, date_type.Union ()};
177+ }
178+ case sparrow::data_type::TIMESTAMP_SECONDS:
179+ {
180+ auto timestamp_type = org::apache::arrow::flatbuf::CreateTimestamp (builder, org::apache::arrow::flatbuf::TimeUnit::SECOND);
181+ return {org::apache::arrow::flatbuf::Type::Timestamp, timestamp_type.Union ()};
182+ }
183+ case sparrow::data_type::TIMESTAMP_MILLISECONDS:
184+ {
185+ auto timestamp_type = org::apache::arrow::flatbuf::CreateTimestamp (builder, org::apache::arrow::flatbuf::TimeUnit::MILLISECOND);
186+ return {org::apache::arrow::flatbuf::Type::Timestamp, timestamp_type.Union ()};
187+ }
188+ case sparrow::data_type::TIMESTAMP_MICROSECONDS:
189+ {
190+ auto timestamp_type = org::apache::arrow::flatbuf::CreateTimestamp (builder, org::apache::arrow::flatbuf::TimeUnit::MICROSECOND);
191+ return {org::apache::arrow::flatbuf::Type::Timestamp, timestamp_type.Union ()};
192+ }
193+ case sparrow::data_type::TIMESTAMP_NANOSECONDS:
194+ {
195+ auto timestamp_type = org::apache::arrow::flatbuf::CreateTimestamp (builder, org::apache::arrow::flatbuf::TimeUnit::NANOSECOND);
196+ return {org::apache::arrow::flatbuf::Type::Timestamp, timestamp_type.Union ()};
197+ }
198+ case sparrow::data_type::DURATION_SECONDS:
199+ {
200+ auto duration_type = org::apache::arrow::flatbuf::CreateDuration (builder, org::apache::arrow::flatbuf::TimeUnit::SECOND);
201+ return {org::apache::arrow::flatbuf::Type::Duration, duration_type.Union ()};
202+ }
203+ case sparrow::data_type::DURATION_MILLISECONDS:
204+ {
205+ auto duration_type = org::apache::arrow::flatbuf::CreateDuration (builder, org::apache::arrow::flatbuf::TimeUnit::MILLISECOND);
206+ return {org::apache::arrow::flatbuf::Type::Duration, duration_type.Union ()};
207+ }
208+ case sparrow::data_type::DURATION_MICROSECONDS:
209+ {
210+ auto duration_type = org::apache::arrow::flatbuf::CreateDuration (builder, org::apache::arrow::flatbuf::TimeUnit::MICROSECOND);
211+ return {org::apache::arrow::flatbuf::Type::Duration, duration_type.Union ()};
212+ }
213+ case sparrow::data_type::DURATION_NANOSECONDS:
214+ {
215+ auto duration_type = org::apache::arrow::flatbuf::CreateDuration (builder, org::apache::arrow::flatbuf::TimeUnit::NANOSECOND);
216+ return {org::apache::arrow::flatbuf::Type::Duration, duration_type.Union ()};
217+ }
218+ case sparrow::data_type::INTERVAL_MONTHS:
219+ {
220+ auto interval_type = org::apache::arrow::flatbuf::CreateInterval (builder, org::apache::arrow::flatbuf::IntervalUnit::YEAR_MONTH);
221+ return {org::apache::arrow::flatbuf::Type::Interval, interval_type.Union ()};
222+ }
223+ case sparrow::data_type::INTERVAL_DAYS_TIME:
224+ {
225+ auto interval_type = org::apache::arrow::flatbuf::CreateInterval (builder, org::apache::arrow::flatbuf::IntervalUnit::DAY_TIME);
226+ return {org::apache::arrow::flatbuf::Type::Interval, interval_type.Union ()};
227+ }
228+ case sparrow::data_type::INTERVAL_MONTHS_DAYS_NANOSECONDS:
229+ {
230+ auto interval_type = org::apache::arrow::flatbuf::CreateInterval (builder, org::apache::arrow::flatbuf::IntervalUnit::MONTH_DAY_NANO);
231+ return {org::apache::arrow::flatbuf::Type::Interval, interval_type.Union ()};
232+ }
233+ case sparrow::data_type::TIME_SECONDS:
234+ {
235+ auto time_type = org::apache::arrow::flatbuf::CreateTime (builder, org::apache::arrow::flatbuf::TimeUnit::SECOND, 32 );
236+ return {org::apache::arrow::flatbuf::Type::Time, time_type.Union ()};
237+ }
238+ case sparrow::data_type::TIME_MILLISECONDS:
239+ {
240+ auto time_type = org::apache::arrow::flatbuf::CreateTime (builder, org::apache::arrow::flatbuf::TimeUnit::MILLISECOND, 32 );
241+ return {org::apache::arrow::flatbuf::Type::Time, time_type.Union ()};
242+ }
243+ case sparrow::data_type::TIME_MICROSECONDS:
244+ {
245+ auto time_type = org::apache::arrow::flatbuf::CreateTime (builder, org::apache::arrow::flatbuf::TimeUnit::MICROSECOND, 64 );
246+ return {org::apache::arrow::flatbuf::Type::Time, time_type.Union ()};
247+ }
248+ case sparrow::data_type::TIME_NANOSECONDS:
249+ {
250+ auto time_type = org::apache::arrow::flatbuf::CreateTime (builder, org::apache::arrow::flatbuf::TimeUnit::NANOSECOND, 64 );
251+ return {org::apache::arrow::flatbuf::Type::Time, time_type.Union ()};
252+ }
253+ case sparrow::data_type::LIST:
254+ {
255+ auto list_type = org::apache::arrow::flatbuf::CreateList (builder);
256+ return {org::apache::arrow::flatbuf::Type::List, list_type.Union ()};
257+ }
258+ case sparrow::data_type::LARGE_LIST:
259+ {
260+ auto large_list_type = org::apache::arrow::flatbuf::CreateLargeList (builder);
261+ return {org::apache::arrow::flatbuf::Type::LargeList, large_list_type.Union ()};
262+ }
263+ case sparrow::data_type::LIST_VIEW:
264+ {
265+ auto list_view_type = org::apache::arrow::flatbuf::CreateListView (builder);
266+ return {org::apache::arrow::flatbuf::Type::ListView, list_view_type.Union ()};
267+ }
268+ case sparrow::data_type::LARGE_LIST_VIEW:
269+ {
270+ auto large_list_view_type = org::apache::arrow::flatbuf::CreateLargeListView (builder);
271+ return {org::apache::arrow::flatbuf::Type::LargeListView, large_list_view_type.Union ()};
272+ }
273+ case sparrow::data_type::FIXED_SIZED_LIST:
274+ {
275+ // FixedSizeList requires listSize. We need to parse the format_str.
276+ // Format: "+w:size"
277+ auto list_size = parse_format (format_str, " :" );
278+ if (!list_size.has_value ())
279+ {
280+ throw std::runtime_error (" Failed to parse FixedSizeList size from format string: " + std::string (format_str));
281+ }
282+
283+ auto fixed_size_list_type = org::apache::arrow::flatbuf::CreateFixedSizeList (builder, list_size.value ());
284+ return {org::apache::arrow::flatbuf::Type::FixedSizeList, fixed_size_list_type.Union ()};
285+ }
286+ case sparrow::data_type::STRUCT:
287+ {
288+ auto struct_type = org::apache::arrow::flatbuf::CreateStruct_ (builder);
289+ return {org::apache::arrow::flatbuf::Type::Struct_, struct_type.Union ()};
290+ }
291+ case sparrow::data_type::MAP:
292+ {
293+ auto map_type = org::apache::arrow::flatbuf::CreateMap (builder, false ); // not sorted keys
294+ return {org::apache::arrow::flatbuf::Type::Map, map_type.Union ()};
295+ }
296+ case sparrow::data_type::DENSE_UNION:
297+ {
298+ auto union_type = org::apache::arrow::flatbuf::CreateUnion (builder, org::apache::arrow::flatbuf::UnionMode::Dense, 0 );
299+ return {org::apache::arrow::flatbuf::Type::Union, union_type.Union ()};
300+ }
301+ case sparrow::data_type::SPARSE_UNION:
302+ {
303+ auto union_type = org::apache::arrow::flatbuf::CreateUnion (builder, org::apache::arrow::flatbuf::UnionMode::Sparse, 0 );
304+ return {org::apache::arrow::flatbuf::Type::Union, union_type.Union ()};
305+ }
306+ case sparrow::data_type::RUN_ENCODED:
307+ {
308+ auto run_end_encoded_type = org::apache::arrow::flatbuf::CreateRunEndEncoded (builder);
309+ return {org::apache::arrow::flatbuf::Type::RunEndEncoded, run_end_encoded_type.Union ()};
310+ }
311+ case sparrow::data_type::DECIMAL32:
312+ {
313+ return get_flatbuffer_decimal_type (builder, format_str, 32 );
314+ }
315+ case sparrow::data_type::DECIMAL64:
316+ {
317+ return get_flatbuffer_decimal_type (builder, format_str, 64 );
318+ }
319+ case sparrow::data_type::DECIMAL128:
320+ {
321+ return get_flatbuffer_decimal_type (builder, format_str, 128 );
322+ }
323+ case sparrow::data_type::DECIMAL256:
324+ {
325+ return get_flatbuffer_decimal_type (builder, format_str, 256 );
326+ }
327+ case sparrow::data_type::FIXED_WIDTH_BINARY:
328+ {
329+ // FixedSizeBinary requires byteWidth. We need to parse the format_str.
330+ // Format: "w:size"
331+ auto byte_width = parse_format (format_str, " :" );
332+ if (!byte_width.has_value ())
333+ {
334+ throw std::runtime_error (" Failed to parse FixedWidthBinary size from format string: " + std::string (format_str));
335+ }
336+
337+ auto fixed_width_binary_type = org::apache::arrow::flatbuf::CreateFixedSizeBinary (builder, byte_width.value ());
338+ return {org::apache::arrow::flatbuf::Type::FixedSizeBinary, fixed_width_binary_type.Union ()};
339+ }
340+ default :
341+ {
342+ throw std::runtime_error (" Unsupported data type for serialization" );
343+ }
45344 }
46345 }
47346}
0 commit comments