Skip to content

Commit ebd2d66

Browse files
committed
feat: implement literal expressions with binary serialization support
1 parent b8895e2 commit ebd2d66

File tree

10 files changed

+42
-293
lines changed

10 files changed

+42
-293
lines changed

src/iceberg/CMakeLists.txt

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -49,12 +49,11 @@ set(ICEBERG_SOURCES
4949
manifest_reader_internal.cc
5050
manifest_writer.cc
5151
arrow_c_data_guard_internal.cc
52+
util/conversions.cc
5253
util/decimal.cc
53-
util/murmurhash3_internal.cc
54-
util/timepoint.cc
5554
util/gzip_internal.cc
56-
util/conversions.cc
57-
util/literal_format.cc)
55+
util/murmurhash3_internal.cc
56+
util/timepoint.cc)
5857

5958
set(ICEBERG_STATIC_BUILD_INTERFACE_LIBS)
6059
set(ICEBERG_SHARED_BUILD_INTERFACE_LIBS)

src/iceberg/expression/literal.cc

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,6 @@
2424

2525
#include "iceberg/exception.h"
2626
#include "iceberg/util/conversions.h"
27-
#include "iceberg/util/literal_format.h"
2827

2928
namespace iceberg {
3029

src/iceberg/expression/literal.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -143,8 +143,8 @@ class ICEBERG_EXPORT Literal {
143143
private:
144144
Literal(Value value, std::shared_ptr<PrimitiveType> type);
145145

146-
friend class LiteralCaster;
147146
friend class Conversions;
147+
friend class LiteralCaster;
148148

149149
Value value_;
150150
std::shared_ptr<PrimitiveType> type_;

src/iceberg/util/conversions.cc

Lines changed: 22 additions & 71 deletions
Original file line number | Diff line number | Diff line change
@@ -19,23 +19,24 @@
1919

2020
#include "iceberg/util/conversions.h"
2121

22-
#include <cctype>
22+
#include <array>
2323
#include <cstring>
24-
#include <ranges>
24+
#include <span>
25+
#include <string>
2526

26-
#include "iceberg/exception.h"
27-
#include "iceberg/type.h"
2827
#include "iceberg/util/endian.h"
2928
#include "iceberg/util/macros.h"
3029

3130
namespace iceberg {
3231

33-
/// \brief Write a value in little-endian format to the buffer.
32+
/// \brief Write a value in little-endian format and return as vector.
3433
template <EndianConvertible T>
35-
void WriteLittleEndian(std::vector<uint8_t>& buffer, T value) {
34+
std::vector<uint8_t> WriteLittleEndian(T value) {
3635
value = ToLittleEndian(value);
3736
const auto* bytes = reinterpret_cast<const uint8_t*>(&value);
38-
buffer.insert(buffer.end(), bytes, bytes + sizeof(T));
37+
std::vector<uint8_t> result;
38+
result.insert(result.end(), bytes, bytes + sizeof(T));
39+
return result;
3940
}
4041

4142
/// \brief Read a value in little-endian format from the data.
@@ -58,78 +59,63 @@ Result<std::vector<uint8_t>> Conversions::ToBytes(const PrimitiveType& type,
5859

5960
switch (type_id) {
6061
case TypeId::kBoolean: {
61-
// 0x00 for false, 0x01 for true
6262
result.push_back(std::get<bool>(value) ? 0x01 : 0x00);
6363
return result;
6464
}
6565

6666
case TypeId::kInt: {
67-
// Stored as 4-byte little-endian
68-
WriteLittleEndian(result, std::get<int32_t>(value));
67+
result = WriteLittleEndian(std::get<int32_t>(value));
6968
return result;
7069
}
7170

7271
case TypeId::kDate: {
73-
// Stores days from 1970-01-01 in a 4-byte little-endian int
74-
WriteLittleEndian(result, std::get<int32_t>(value));
72+
result = WriteLittleEndian(std::get<int32_t>(value));
7573
return result;
7674
}
7775

7876
case TypeId::kLong: {
79-
// Stored as 8-byte little-endian
80-
WriteLittleEndian(result, std::get<int64_t>(value));
77+
result = WriteLittleEndian(std::get<int64_t>(value));
8178
return result;
8279
}
8380

8481
case TypeId::kTime: {
85-
// Stores microseconds from midnight in an 8-byte little-endian long
86-
WriteLittleEndian(result, std::get<int64_t>(value));
82+
result = WriteLittleEndian(std::get<int64_t>(value));
8783
return result;
8884
}
8985

9086
case TypeId::kTimestamp: {
91-
// Stores microseconds from 1970-01-01 00:00:00.000000 in an 8-byte little-endian
92-
// long
93-
WriteLittleEndian(result, std::get<int64_t>(value));
87+
result = WriteLittleEndian(std::get<int64_t>(value));
9488
return result;
9589
}
9690

9791
case TypeId::kTimestampTz: {
98-
// Stores microseconds from 1970-01-01 00:00:00.000000 UTC in an 8-byte
99-
// little-endian long
100-
WriteLittleEndian(result, std::get<int64_t>(value));
92+
result = WriteLittleEndian(std::get<int64_t>(value));
10193
return result;
10294
}
10395

10496
case TypeId::kFloat: {
105-
// Stored as 4-byte little-endian
106-
WriteLittleEndian(result, std::get<float>(value));
97+
result = WriteLittleEndian(std::get<float>(value));
10798
return result;
10899
}
109100

110101
case TypeId::kDouble: {
111-
// Stored as 8-byte little-endian
112-
WriteLittleEndian(result, std::get<double>(value));
102+
result = WriteLittleEndian(std::get<double>(value));
113103
return result;
114104
}
115105

116106
case TypeId::kString: {
117-
// UTF-8 bytes (without length)
118107
const auto& str = std::get<std::string>(value);
119108
result.insert(result.end(), str.begin(), str.end());
120109
return result;
121110
}
122111

123112
case TypeId::kBinary: {
124-
// Binary value (without length)
125113
const auto& binary_data = std::get<std::vector<uint8_t>>(value);
126114
result.insert(result.end(), binary_data.begin(), binary_data.end());
127115
return result;
128116
}
129117

130118
case TypeId::kFixed: {
131-
// Fixed(L) - Binary value, could be stored in std::array<uint8_t, 16> or
132-
// std::vector<uint8_t>
133119
if (std::holds_alternative<std::array<uint8_t, 16>>(value)) {
134120
const auto& fixed_bytes = std::get<std::array<uint8_t, 16>>(value);
135121
result.insert(result.end(), fixed_bytes.begin(), fixed_bytes.end());
@@ -144,13 +130,7 @@ Result<std::vector<uint8_t>> Conversions::ToBytes(const PrimitiveType& type,
144130
}
145131
return result;
146132
}
147-
148-
case TypeId::kUuid: {
149-
// 16-byte big-endian value
150-
const auto& uuid_bytes = std::get<std::array<uint8_t, 16>>(value);
151-
WriteBigEndian16(result, uuid_bytes);
152-
return result;
153-
}
133+
// TODO(Li Feiyang): Add support for UUID and Decimal
154134

155135
default:
156136
return NotSupported("Serialization for type {} is not supported", type.ToString());
@@ -174,9 +154,8 @@ Result<std::vector<uint8_t>> Conversions::ToBytes(const Literal& literal) {
174154

175155
Result<Literal::Value> Conversions::FromBytes(const PrimitiveType& type,
176156
std::span<const uint8_t> data) {
177-
// Empty data represents null value
178157
if (data.empty()) {
179-
return Literal::Value{std::monostate{}};
158+
return InvalidArgument("Data cannot be empty");
180159
}
181160

182161
const auto type_id = type.type_id();
@@ -187,7 +166,6 @@ Result<Literal::Value> Conversions::FromBytes(const PrimitiveType& type,
187166
return InvalidArgument("Boolean requires 1 byte, got {}", data.size());
188167
}
189168
ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<uint8_t>(data));
190-
// 0x00 for false, non-zero byte for true
191169
return Literal::Value{static_cast<bool>(value != 0x00)};
192170
}
193171

@@ -215,20 +193,14 @@ Result<Literal::Value> Conversions::FromBytes(const PrimitiveType& type,
215193
case TypeId::kTimestampTz: {
216194
int64_t value;
217195
if (data.size() == 8) {
218-
// Standard 8-byte long
219196
ICEBERG_ASSIGN_OR_RAISE(auto long_value, ReadLittleEndian<int64_t>(data));
220197
value = long_value;
221198
} else if (data.size() == 4) {
222199
// Type was promoted from int to long
223200
ICEBERG_ASSIGN_OR_RAISE(auto int_value, ReadLittleEndian<int32_t>(data));
224201
value = static_cast<int64_t>(int_value);
225202
} else {
226-
auto type_name_view = ToString(type_id);
227-
std::string type_name{type_name_view};
228-
if (!type_name.empty()) {
229-
type_name[0] = static_cast<char>(std::toupper(type_name[0]));
230-
}
231-
return InvalidArgument("{} requires 4 or 8 bytes, got {}", type_name,
203+
return InvalidArgument("{} requires 4 or 8 bytes, got {}", ToString(type_id),
232204
data.size());
233205
}
234206

@@ -246,7 +218,6 @@ Result<Literal::Value> Conversions::FromBytes(const PrimitiveType& type,
246218

247219
case TypeId::kDouble: {
248220
if (data.size() == 8) {
249-
// Standard 8-byte double
250221
ICEBERG_ASSIGN_OR_RAISE(auto double_value, ReadLittleEndian<double>(data));
251222
return Literal::Value{double_value};
252223
} else if (data.size() == 4) {
@@ -276,27 +247,7 @@ Result<Literal::Value> Conversions::FromBytes(const PrimitiveType& type,
276247
return Literal::Value{std::vector<uint8_t>(data.begin(), data.end())};
277248
}
278249
}
279-
280-
case TypeId::kUuid: {
281-
if (data.size() != 16) {
282-
return InvalidArgument("UUID requires 16 bytes, got {}", data.size());
283-
}
284-
ICEBERG_ASSIGN_OR_RAISE(auto uuid_value, ReadBigEndian16(data));
285-
return Literal::Value{uuid_value};
286-
}
287-
288-
case TypeId::kDecimal: {
289-
if (data.size() > 16) {
290-
return InvalidArgument(
291-
"Decimal data too large, maximum 16 bytes supported, got {}", data.size());
292-
}
293-
294-
std::array<uint8_t, 16> decimal_bytes{};
295-
// Copy data to the end of the array (big-endian format for decimals)
296-
// This handles variable-length decimals by right-aligning them
297-
std::ranges::copy(data, decimal_bytes.end() - data.size());
298-
return Literal::Value{decimal_bytes};
299-
}
250+
// TODO(Li Feiyang): Add support for UUID and Decimal
300251

301252
default:
302253
return NotSupported("Deserialization for type {} is not supported",
@@ -314,10 +265,10 @@ Result<Literal> Conversions::FromBytes(std::shared_ptr<PrimitiveType> type,
314265

315266
// If we got a null value (monostate), create a null Literal
316267
if (std::holds_alternative<std::monostate>(value)) {
317-
return Literal::Null(type);
268+
return Literal::Null(std::move(type));
318269
}
319270

320-
return Literal(value, type);
271+
return Literal(std::move(value), std::move(type));
321272
}
322273

323274
} // namespace iceberg

src/iceberg/util/conversions.h

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -19,25 +19,33 @@
1919

2020
#pragma once
2121

22+
#include <span>
2223
#include <vector>
2324

2425
#include "iceberg/expression/literal.h"
25-
#include "iceberg/iceberg_export.h"
2626
#include "iceberg/result.h"
27-
#include "iceberg/type.h"
27+
#include "iceberg/type_fwd.h"
28+
29+
/// \file iceberg/util/conversions.h
30+
/// \brief Conversion utilities for primitive types
2831

2932
namespace iceberg {
33+
34+
/// \brief Conversion utilities for primitive types
3035
class ICEBERG_EXPORT Conversions {
3136
public:
37+
/// \brief Convert a literal value to bytes
3238
static Result<std::vector<uint8_t>> ToBytes(const PrimitiveType& type,
3339
const Literal::Value& value);
3440

3541
static Result<std::vector<uint8_t>> ToBytes(const Literal& literal);
3642

43+
/// \brief Convert bytes to a literal value
3744
static Result<Literal::Value> FromBytes(const PrimitiveType& type,
3845
std::span<const uint8_t> data);
3946

4047
static Result<Literal> FromBytes(std::shared_ptr<PrimitiveType> type,
4148
std::span<const uint8_t> data);
4249
};
50+
4351
} // namespace iceberg

src/iceberg/util/endian.h

Lines changed: 3 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -19,16 +19,11 @@
1919

2020
#pragma once
2121

22-
// #include <bit>
23-
// #include <concepts>
24-
// #include <cstdint>
2522
#include <algorithm>
23+
#include <array>
2624
#include <bit>
27-
#include <cstring>
28-
#include <span>
29-
#include <vector>
30-
31-
#include "iceberg/result.h"
25+
#include <concepts>
26+
#include <cstdint>
3227

3328
/// \file iceberg/util/endian.h
3429
/// \brief Endianness conversion utilities

src/iceberg/util/literal_format.cc

Lines changed: 0 additions & 76 deletions
This file was deleted.

0 commit comments

Comments
 (0)