Skip to content

Commit 080b10a

Browse files
committed
feat: add truncate_utils.h
1 parent 8c685e8 commit 080b10a

File tree

2 files changed

+67
-26
lines changed

2 files changed

+67
-26
lines changed

src/iceberg/transform_function.cc

Lines changed: 12 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727

2828
#include "iceberg/type.h"
2929
#include "iceberg/util/murmurhash3_internal.h"
30+
#include "iceberg/util/truncate_utils.h"
3031

3132
namespace iceberg {
3233

@@ -57,7 +58,7 @@ Result<Literal> BucketTransform::Transform(const Literal& literal) {
5758
"Cannot apply bucket transform to literal with value {} of type {}",
5859
literal.ToString(), source_type()->ToString());
5960
}
60-
if (literal.IsNull()) {
61+
if (literal.IsNull()) [[unlikely]] {
6162
return Literal::Null(iceberg::int32());
6263
}
6364

@@ -135,19 +136,19 @@ Result<Literal> TruncateTransform::Transform(const Literal& literal) {
135136
"Cannot apply truncate transform to literal with value {} of type {}",
136137
literal.ToString(), source_type()->ToString());
137138
}
138-
if (literal.IsNull()) {
139+
if (literal.IsNull()) [[unlikely]] {
139140
// Return null as is
140141
return literal;
141142
}
142143

143144
switch (source_type()->type_id()) {
144145
case TypeId::kInt: {
145146
auto value = std::get<int32_t>(literal.value());
146-
return Literal::Int(value - (((value % width_) + width_) % width_));
147+
return Literal::Int(TruncateUtils::TruncateInt(value, width_));
147148
}
148149
case TypeId::kLong: {
149150
auto value = std::get<int64_t>(literal.value());
150-
return Literal::Long(value - (((value % width_) + width_) % width_));
151+
return Literal::Long(TruncateUtils::TruncateLong(value, width_));
151152
}
152153
case TypeId::kDecimal: {
153154
// TODO(zhjwpku): Handle decimal truncation logic here
@@ -156,26 +157,11 @@ Result<Literal> TruncateTransform::Transform(const Literal& literal) {
156157
case TypeId::kString: {
157158
// Strings are truncated to a valid UTF-8 string with no more than L code points.
158159
auto value = std::get<std::string>(literal.value());
159-
size_t code_point_count = 0;
160-
size_t safe_point = 0;
161-
162-
for (size_t i = 0; i < value.size(); ++i) {
163-
// Start of a new UTF-8 code point
164-
if ((value[i] & 0xC0) != 0x80) {
165-
code_point_count++;
166-
if (code_point_count > static_cast<size_t>(width_)) {
167-
safe_point = i;
168-
break;
169-
}
170-
}
171-
}
172-
173-
if (safe_point != 0) {
174-
value.resize(safe_point); // Resize the string to the safe point
175-
}
176-
return Literal::String(value);
160+
return Literal::String(TruncateUtils::TruncateUTF8(std::move(value), width_));
177161
}
178162
case TypeId::kBinary: {
163+
/// In contrast to strings, binary values do not have an assumed encoding and are
164+
/// truncated to L bytes.
179165
auto value = std::get<std::vector<uint8_t>>(literal.value());
180166
if (value.size() > static_cast<size_t>(width_)) {
181167
value.resize(width_);
@@ -221,7 +207,7 @@ Result<Literal> YearTransform::Transform(const Literal& literal) {
221207
"Cannot apply year transform to literal with value {} of type {}",
222208
literal.ToString(), source_type()->ToString());
223209
}
224-
if (literal.IsNull()) {
210+
if (literal.IsNull()) [[unlikely]] {
225211
return Literal::Null(iceberg::int32());
226212
}
227213

@@ -274,7 +260,7 @@ Result<Literal> MonthTransform::Transform(const Literal& literal) {
274260
"Cannot apply month transform to literal with value {} of type {}",
275261
literal.ToString(), source_type()->ToString());
276262
}
277-
if (literal.IsNull()) {
263+
if (literal.IsNull()) [[unlikely]] {
278264
return Literal::Null(iceberg::int32());
279265
}
280266

@@ -339,7 +325,7 @@ Result<Literal> DayTransform::Transform(const Literal& literal) {
339325
"Cannot apply day transform to literal with value {} of type {}",
340326
literal.ToString(), source_type()->ToString());
341327
}
342-
if (literal.IsNull()) {
328+
if (literal.IsNull()) [[unlikely]] {
343329
return Literal::Null(iceberg::int32());
344330
}
345331

@@ -394,7 +380,7 @@ Result<Literal> HourTransform::Transform(const Literal& literal) {
394380
literal.ToString(), source_type()->ToString());
395381
}
396382

397-
if (literal.IsNull()) {
383+
if (literal.IsNull()) [[unlikely]] {
398384
return Literal::Null(int32());
399385
}
400386

src/iceberg/util/truncate_utils.h

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#pragma once
21+
22+
#include <cstdint>
23+
#include <string>
24+
25+
#include "iceberg/iceberg_export.h"
26+
27+
namespace iceberg {
28+
29+
/// \brief Stateless helpers holding the value arithmetic used by the truncate
/// transform.
///
/// The export macro is placed between the class-key and the class name so that it
/// applies to the class itself rather than to a (non-existent) declarator.
class ICEBERG_EXPORT TruncateUtils {
 public:
  /// \brief Truncate a UTF-8 encoded string to at most `L` code points.
  ///
  /// Code points are counted by looking at lead bytes only: every byte that is not
  /// a continuation byte (bit pattern 10xxxxxx) is treated as the start of a new
  /// code point. The input is not otherwise validated as UTF-8.
  ///
  /// \param source The string to truncate; it is modified in place and moved into
  ///        the return value.
  /// \param L Maximum number of code points to keep. `L == 0` yields an empty
  ///        string.
  /// \return `source` cut immediately before the first byte of code point
  ///         number `L + 1`, or unchanged if it holds `L` or fewer code points.
  static std::string TruncateUTF8(std::string&& source, size_t L) {
    size_t code_point_count = 0;

    for (size_t i = 0; i < source.size(); ++i) {
      // Go through unsigned char so the bit test does not depend on whether plain
      // char is signed on the target platform.
      const bool starts_code_point =
          (static_cast<unsigned char>(source[i]) & 0xC0) != 0x80;
      if (starts_code_point && ++code_point_count > L) {
        // Byte i opens code point L + 1, so bytes [0, i) are exactly the first L
        // code points. Resizing here (instead of remembering i and testing it
        // against 0 afterwards) keeps a cut at offset 0 from being mistaken for
        // "nothing to cut".
        source.resize(i);
        break;
      }
    }

    // `source` is a named rvalue reference, i.e. an lvalue expression; the
    // explicit move avoids copying the buffer into the return value.
    return std::move(source);
  }

  /// \brief Round a 32-bit integer down to a multiple of `width`.
  ///
  /// \param value The value to truncate.
  /// \param width The truncation width. Must be positive; `width == 0` is a
  ///        modulo by zero.
  /// \return `value - r`, where `r` is the non-negative remainder of `value`
  ///         modulo `width` (so negative values round toward negative infinity).
  static int32_t TruncateInt(int32_t value, int32_t width) {
    return FloorToMultiple<int32_t>(value, width);
  }

  /// \brief Round a 64-bit integer down to a multiple of `width`.
  ///
  /// \param value The value to truncate.
  /// \param width The truncation width. Must be positive; `width == 0` is a
  ///        modulo by zero.
  /// \return `value - r`, where `r` is the non-negative remainder of `value`
  ///         modulo `width` (so negative values round toward negative infinity).
  static int64_t TruncateLong(int64_t value, int64_t width) {
    return FloorToMultiple<int64_t>(value, width);
  }

 private:
  /// Shared implementation of TruncateInt / TruncateLong.
  template <typename T>
  static T FloorToMultiple(T value, T width) {
    // The built-in % truncates toward zero, so the remainder carries the sign of
    // `value`. Shift a negative remainder into [0, width) with a single
    // conditional add; unlike ((value % width) + width) % width, no intermediate
    // can exceed the range of T for a positive width.
    T remainder = value % width;
    if (remainder < 0) {
      remainder += width;
    }
    return value - remainder;
  }
};
54+
55+
} // namespace iceberg

0 commit comments

Comments
 (0)