Skip to content

Commit 9b9848f

Browse files
committed
feat: add NullType and fix string truncate
1 parent c3bbd2b commit 9b9848f

File tree

12 files changed

+97
-51
lines changed

12 files changed

+97
-51
lines changed

src/iceberg/expression/literal.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,8 @@ Literal::Literal(Value value, std::shared_ptr<PrimitiveType> type)
126126
: value_(std::move(value)), type_(std::move(type)) {}
127127

128128
// Factory methods
129+
/// \brief Create a null literal: the value holds std::monostate and the type is
/// the shared null type instance.
Literal Literal::Null() {
  return Literal(Value{std::monostate{}}, iceberg::null());
}
130+
129131
Literal Literal::Boolean(bool value) { return {Value{value}, iceberg::boolean()}; }
130132

131133
Literal Literal::Int(int32_t value) { return {Value{value}, iceberg::int32()}; }
@@ -205,6 +207,9 @@ std::partial_ordering Literal::operator<=>(const Literal& other) const {
205207

206208
// Same type comparison for normal values
207209
switch (type_->type_id()) {
210+
case TypeId::kNull:
211+
// Nulls are equivalent
212+
return std::partial_ordering::equivalent;
208213
case TypeId::kBoolean: {
209214
auto this_val = std::get<bool>(value_);
210215
auto other_val = std::get<bool>(other.value_);

src/iceberg/expression/literal.h

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -48,17 +48,19 @@ class ICEBERG_EXPORT Literal {
4848
bool operator==(const AboveMax&) const = default;
4949
std::strong_ordering operator<=>(const AboveMax&) const = default;
5050
};
51-
using Value = std::variant<bool, // for boolean
52-
int32_t, // for int, date
53-
int64_t, // for long, timestamp, timestamp_tz, time
54-
float, // for float
55-
double, // for double
56-
std::string, // for string
51+
using Value = std::variant<std::monostate, // for null
52+
bool, // for boolean
53+
int32_t, // for int, date
54+
int64_t, // for long, timestamp, timestamp_tz, time
55+
float, // for float
56+
double, // for double
57+
std::string, // for string
5758
std::vector<uint8_t>, // for binary, fixed
5859
std::array<uint8_t, 16>, // for uuid and decimal
5960
BelowMin, AboveMax>;
6061

6162
/// \brief Factory methods for primitive types
63+
static Literal Null();
6264
static Literal Boolean(bool value);
6365
static Literal Int(int32_t value);
6466
static Literal Date(int32_t value);

src/iceberg/json_internal.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -477,8 +477,9 @@ nlohmann::json ToJson(const Type& type) {
477477
}
478478
case TypeId::kUuid:
479479
return "uuid";
480+
default:
481+
std::unreachable();
480482
}
481-
std::unreachable();
482483
}
483484

484485
nlohmann::json ToJson(const Schema& schema) {

src/iceberg/schema_internal.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <cstring>
2323
#include <optional>
2424
#include <string>
25+
#include <utility>
2526

2627
#include "iceberg/schema.h"
2728
#include "iceberg/type.h"
@@ -139,6 +140,8 @@ ArrowErrorCode ToArrowSchema(const Type& type, bool optional, std::string_view n
139140
ArrowMetadataBuilderAppend(&metadata_buffer, ArrowCharView(kArrowExtensionName),
140141
ArrowCharView(kArrowUuidExtensionName)));
141142
} break;
143+
default:
144+
std::unreachable();
142145
}
143146

144147
if (!name.empty()) {

src/iceberg/transform.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
#include <optional>
2727
#include <variant>
2828

29-
#include "iceberg/arrow_c_data.h"
3029
#include "iceberg/expression/literal.h"
3130
#include "iceberg/iceberg_export.h"
3231
#include "iceberg/result.h"
@@ -173,7 +172,7 @@ class ICEBERG_EXPORT TransformFunction {
173172
virtual ~TransformFunction() = default;
174173
TransformFunction(TransformType transform_type, std::shared_ptr<Type> source_type);
175174
/// \brief Transform an input Literal to a new Literal
176-
virtual Result<std::optional<Literal>> Transform(const Literal& literal) = 0;
175+
virtual Result<Literal> Transform(const Literal& literal) = 0;
177176
/// \brief Get the transform type
178177
TransformType transform_type() const;
179178
/// \brief Get the source type of transform function

src/iceberg/transform_function.cc

Lines changed: 27 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include <chrono>
2424
#include <type_traits>
2525
#include <utility>
26+
#include <variant>
2627

2728
#include "iceberg/type.h"
2829
#include "iceberg/util/murmurhash3_internal.h"
@@ -32,9 +33,7 @@ namespace iceberg {
3233
IdentityTransform::IdentityTransform(std::shared_ptr<Type> const& source_type)
3334
: TransformFunction(TransformType::kIdentity, source_type) {}
3435

35-
Result<std::optional<Literal>> IdentityTransform::Transform(const Literal& literal) {
36-
return literal;
37-
}
36+
/// \brief Identity transform: the result is a copy of the input literal.
Result<Literal> IdentityTransform::Transform(const Literal& literal) {
  return literal;
}
3837

3938
Result<std::shared_ptr<Type>> IdentityTransform::ResultType() const {
4039
return source_type();
@@ -53,7 +52,7 @@ BucketTransform::BucketTransform(std::shared_ptr<Type> const& source_type,
5352
int32_t num_buckets)
5453
: TransformFunction(TransformType::kBucket, source_type), num_buckets_(num_buckets) {}
5554

56-
Result<std::optional<Literal>> BucketTransform::Transform(const Literal& literal) {
55+
Result<Literal> BucketTransform::Transform(const Literal& literal) {
5756
assert(literal.type() == source_type());
5857
if (literal.IsBelowMin() || literal.IsAboveMax()) {
5958
return InvalidArgument(
@@ -74,7 +73,8 @@ Result<std::optional<Literal>> BucketTransform::Transform(const Literal& literal
7473
MurmurHash3_x86_32(value.data(), value.size(), 0, &hash_value);
7574
} else if constexpr (std::is_same_v<T, std::vector<uint8_t>>) {
7675
MurmurHash3_x86_32(value.data(), value.size(), 0, &hash_value);
77-
} else if constexpr (std::is_same_v<T, bool> || std::is_same_v<T, float> ||
76+
} else if constexpr (std::is_same_v<T, std::monostate> ||
77+
std::is_same_v<T, bool> || std::is_same_v<T, float> ||
7878
std::is_same_v<T, double> ||
7979
std::is_same_v<T, Literal::BelowMin> ||
8080
std::is_same_v<T, Literal::AboveMax>) {
@@ -128,7 +128,7 @@ TruncateTransform::TruncateTransform(std::shared_ptr<Type> const& source_type,
128128
int32_t width)
129129
: TransformFunction(TransformType::kTruncate, source_type), width_(width) {}
130130

131-
Result<std::optional<Literal>> TruncateTransform::Transform(const Literal& literal) {
131+
Result<Literal> TruncateTransform::Transform(const Literal& literal) {
132132
assert(literal.type() == source_type());
133133
if (literal.IsBelowMin() || literal.IsAboveMax()) {
134134
return InvalidArgument(
@@ -150,17 +150,25 @@ Result<std::optional<Literal>> TruncateTransform::Transform(const Literal& liter
150150
return NotImplemented("Truncate for Decimal is not implemented yet");
151151
}
152152
case TypeId::kString: {
153+
// Strings are truncated to a valid UTF-8 string with no more than L code points.
153154
auto value = std::get<std::string>(literal.value());
154-
if (value.size() > static_cast<size_t>(width_)) {
155-
size_t safe_point = width_;
156-
while (safe_point > 0 && (value[safe_point] & 0xC0) == 0x80) {
157-
// Find the last valid UTF-8 character boundary before or at width_
158-
safe_point--;
155+
size_t code_point_count = 0;
156+
size_t safe_point = 0;
157+
158+
for (size_t i = 0; i < value.size(); ++i) {
159+
// Start of a new UTF-8 code point
160+
if ((value[i] & 0xC0) != 0x80) {
161+
code_point_count++;
162+
if (code_point_count > static_cast<size_t>(width_)) {
163+
safe_point = i;
164+
break;
165+
}
159166
}
160-
// Resize the string to the safe point
161-
value.resize(safe_point);
162167
}
163168

169+
if (safe_point != 0) {
170+
value.resize(safe_point); // Resize the string to the safe point
171+
}
164172
return Literal::String(value);
165173
}
166174
case TypeId::kBinary: {
@@ -204,7 +212,7 @@ Result<std::unique_ptr<TransformFunction>> TruncateTransform::Make(
204212
/// \brief Construct a year transform over the given source type.
///
/// The base class is tagged with TransformType::kYear so that transform_type()
/// reports this function as a year transform, in the same way the month, day and
/// hour transforms tag themselves with their own transform type.
YearTransform::YearTransform(std::shared_ptr<Type> const& source_type)
    : TransformFunction(TransformType::kYear, source_type) {}
206214

207-
Result<std::optional<Literal>> YearTransform::Transform(const Literal& literal) {
215+
Result<Literal> YearTransform::Transform(const Literal& literal) {
208216
assert(literal.type() == source_type());
209217
if (literal.IsBelowMin() || literal.IsAboveMax()) {
210218
return InvalidArgument(
@@ -256,7 +264,7 @@ Result<std::unique_ptr<TransformFunction>> YearTransform::Make(
256264
MonthTransform::MonthTransform(std::shared_ptr<Type> const& source_type)
257265
: TransformFunction(TransformType::kMonth, source_type) {}
258266

259-
Result<std::optional<Literal>> MonthTransform::Transform(const Literal& literal) {
267+
Result<Literal> MonthTransform::Transform(const Literal& literal) {
260268
assert(literal.type() == source_type());
261269
if (literal.IsBelowMin() || literal.IsAboveMax()) {
262270
return InvalidArgument(
@@ -320,7 +328,7 @@ Result<std::unique_ptr<TransformFunction>> MonthTransform::Make(
320328
DayTransform::DayTransform(std::shared_ptr<Type> const& source_type)
321329
: TransformFunction(TransformType::kDay, source_type) {}
322330

323-
Result<std::optional<Literal>> DayTransform::Transform(const Literal& literal) {
331+
Result<Literal> DayTransform::Transform(const Literal& literal) {
324332
assert(literal.type() == source_type());
325333
if (literal.IsBelowMin() || literal.IsAboveMax()) {
326334
return InvalidArgument(
@@ -371,7 +379,7 @@ Result<std::unique_ptr<TransformFunction>> DayTransform::Make(
371379
HourTransform::HourTransform(std::shared_ptr<Type> const& source_type)
372380
: TransformFunction(TransformType::kHour, source_type) {}
373381

374-
Result<std::optional<Literal>> HourTransform::Transform(const Literal& literal) {
382+
Result<Literal> HourTransform::Transform(const Literal& literal) {
375383
assert(literal.type() == source_type());
376384
if (literal.IsBelowMin() || literal.IsAboveMax()) {
377385
return InvalidArgument(
@@ -420,8 +428,8 @@ Result<std::unique_ptr<TransformFunction>> HourTransform::Make(
420428
VoidTransform::VoidTransform(std::shared_ptr<Type> const& source_type)
421429
: TransformFunction(TransformType::kVoid, source_type) {}
422430

423-
Result<std::optional<Literal>> VoidTransform::Transform(const Literal& literal) {
424-
return std::nullopt;
431+
/// \brief Void transform: every input maps to the null literal.
Result<Literal> VoidTransform::Transform(const Literal& literal) {
  // The input literal is not inspected; the result is always Literal::Null().
  return Literal::Null();
}
426434

427435
Result<std::shared_ptr<Type>> VoidTransform::ResultType() const { return source_type(); }

src/iceberg/transform_function.h

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ class IdentityTransform : public TransformFunction {
3131
explicit IdentityTransform(std::shared_ptr<Type> const& source_type);
3232

3333
/// \brief Returns the same Literal as the input.
34-
Result<std::optional<Literal>> Transform(const Literal& literal) override;
34+
Result<Literal> Transform(const Literal& literal) override;
3535

3636
/// \brief Returns the same type as the source type if it is valid.
3737
Result<std::shared_ptr<Type>> ResultType() const override;
@@ -51,7 +51,7 @@ class BucketTransform : public TransformFunction {
5151
BucketTransform(std::shared_ptr<Type> const& source_type, int32_t num_buckets);
5252

5353
/// \brief Applies the bucket hash function to the input Literal.
54-
Result<std::optional<Literal>> Transform(const Literal& literal) override;
54+
Result<Literal> Transform(const Literal& literal) override;
5555

5656
/// \brief Returns INT32 as the output type.
5757
Result<std::shared_ptr<Type>> ResultType() const override;
@@ -75,7 +75,7 @@ class TruncateTransform : public TransformFunction {
7575
TruncateTransform(std::shared_ptr<Type> const& source_type, int32_t width);
7676

7777
/// \brief Truncates the input Literal to the specified width.
78-
Result<std::optional<Literal>> Transform(const Literal& literal) override;
78+
Result<Literal> Transform(const Literal& literal) override;
7979

8080
/// \brief Returns the same type as source_type.
8181
Result<std::shared_ptr<Type>> ResultType() const override;
@@ -98,7 +98,7 @@ class YearTransform : public TransformFunction {
9898
explicit YearTransform(std::shared_ptr<Type> const& source_type);
9999

100100
/// \brief Extract a date or timestamp year, as years from 1970.
101-
Result<std::optional<Literal>> Transform(const Literal& literal) override;
101+
Result<Literal> Transform(const Literal& literal) override;
102102

103103
/// \brief Returns INT32 as the output type.
104104
Result<std::shared_ptr<Type>> ResultType() const override;
@@ -117,7 +117,7 @@ class MonthTransform : public TransformFunction {
117117
explicit MonthTransform(std::shared_ptr<Type> const& source_type);
118118

119119
/// \brief Extract a date or timestamp month, as months from 1970-01-01.
120-
Result<std::optional<Literal>> Transform(const Literal& literal) override;
120+
Result<Literal> Transform(const Literal& literal) override;
121121

122122
/// \brief Returns INT32 as the output type.
123123
Result<std::shared_ptr<Type>> ResultType() const override;
@@ -136,7 +136,7 @@ class DayTransform : public TransformFunction {
136136
explicit DayTransform(std::shared_ptr<Type> const& source_type);
137137

138138
/// \brief Extract a date or timestamp day, as days from 1970-01-01.
139-
Result<std::optional<Literal>> Transform(const Literal& literal) override;
139+
Result<Literal> Transform(const Literal& literal) override;
140140

141141
/// \brief Returns INT32 as the output type.
142142
Result<std::shared_ptr<Type>> ResultType() const override;
@@ -155,7 +155,7 @@ class HourTransform : public TransformFunction {
155155
explicit HourTransform(std::shared_ptr<Type> const& source_type);
156156

157157
/// \brief Extract a timestamp hour, as hours from 1970-01-01 00:00:00.
158-
Result<std::optional<Literal>> Transform(const Literal& literal) override;
158+
Result<Literal> Transform(const Literal& literal) override;
159159

160160
/// \brief Returns INT32 as the output type.
161161
Result<std::shared_ptr<Type>> ResultType() const override;
@@ -174,7 +174,7 @@ class VoidTransform : public TransformFunction {
174174
explicit VoidTransform(std::shared_ptr<Type> const& source_type);
175175

176176
/// \brief Returns a null literal.
177-
Result<std::optional<Literal>> Transform(const Literal& literal) override;
177+
Result<Literal> Transform(const Literal& literal) override;
178178

179179
/// \brief Returns null type or a sentinel type indicating void.
180180
Result<std::shared_ptr<Type>> ResultType() const override;

src/iceberg/type.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,10 @@ bool MapType::Equals(const Type& other) const {
194194
return fields_ == map.fields_;
195195
}
196196

197+
/// \brief Type id of the null type, taken from the class constant kTypeId.
TypeId NullType::type_id() const { return kTypeId; }
198+
/// \brief String form of the null type.
std::string NullType::ToString() const {
  return "null";
}
199+
/// \brief Two types compare equal here when the other type's id is kTypeId.
bool NullType::Equals(const Type& other) const {
  return kTypeId == other.type_id();
}
200+
197201
TypeId BooleanType::type_id() const { return kTypeId; }
198202
std::string BooleanType::ToString() const { return "boolean"; }
199203
bool BooleanType::Equals(const Type& other) const { return other.type_id() == kTypeId; }
@@ -296,6 +300,7 @@ bool BinaryType::Equals(const Type& other) const { return other.type_id() == kTy
296300
return result; \
297301
}
298302

303+
TYPE_FACTORY(null, NullType)
299304
TYPE_FACTORY(boolean, BooleanType)
300305
TYPE_FACTORY(int32, IntType)
301306
TYPE_FACTORY(int64, LongType)

src/iceberg/type.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,23 @@ class ICEBERG_EXPORT MapType : public NestedType {
191191
/// Primitive types do not have nested fields.
192192
/// @{
193193

194+
/// \brief A data type that has no physical storage.
195+
/// Technically, this is not an Iceberg primitive type, but we treat it as one for
196+
/// convenience.
197+
class ICEBERG_EXPORT NullType : public PrimitiveType {
 public:
  /// Type id constant for this class.
  static constexpr TypeId kTypeId = TypeId::kNull;

  NullType() = default;
  ~NullType() override = default;

  /// \brief Type id of this type.
  TypeId type_id() const override;
  /// \brief String form of this type.
  std::string ToString() const override;

 protected:
  /// \brief Compare this type with another type.
  bool Equals(const Type& other) const override;
};
210+
194211
/// \brief A data type representing a boolean (true or false).
195212
class ICEBERG_EXPORT BooleanType : public PrimitiveType {
196213
public:
@@ -451,6 +468,8 @@ class ICEBERG_EXPORT UuidType : public PrimitiveType {
451468
/// Factory functions for creating primitive data types
452469
/// @{
453470

471+
/// \brief Return a NullType instance.
472+
ICEBERG_EXPORT const std::shared_ptr<NullType>& null();
454473
/// \brief Return a BooleanType instance.
455474
ICEBERG_EXPORT const std::shared_ptr<BooleanType>& boolean();
456475
/// \brief Return an IntType instance.

src/iceberg/type_fwd.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ enum class TypeId {
3636
kStruct,
3737
kList,
3838
kMap,
39+
kNull, // Note: A type having no physical storage. This is not an iceberg type, we add
40+
// it to simplify the code logic.
3941
kBoolean,
4042
kInt,
4143
kLong,
@@ -69,6 +71,7 @@ class LongType;
6971
class ListType;
7072
class MapType;
7173
class NestedType;
74+
class NullType;
7275
class PartitionField;
7376
class PartitionSpec;
7477
class PrimitiveType;

0 commit comments

Comments
 (0)