Skip to content

Commit ab63064

Browse files
committed
feat: optimize truncate(col) == value to startsWith predicate
Rewrite truncate(col, width) == "value" predicates to col STARTS_WITH "value" for string columns when the literal has exactly `width` UTF-8 code points. This enables better predicate pushdown to storage formats and efficient use of prefix indexes.

The optimization applies when ALL conditions are met:
- Operation is equality
- Term is a truncate transform
- Literal is a string type
- Literal UTF-8 code point count equals truncate width (CRITICAL)
- Literal is not null (safety check)
- Width is not zero (Transform rejects this, but guard added for safety)

Correctly handles UTF-8 multi-byte characters:
- "José" = 4 code points, 5 bytes (é = 2 bytes)
- "Hi👋" = 3 code points, 6 bytes (👋 = 4 bytes)

Implementation details:
- Uses type-safe Transform::param() API (no brittle regex parsing)
- Binary string comparison semantics (no collation issues)
- Benefits from strict metrics evaluation for startsWith
- Counts code points (not grapheme clusters) per Iceberg spec
- Short-string invariance: when source < width, truncate returns full string, so equality implies exact match and STARTS_WITH remains valid

Added Transform::param() public API for accessing transform parameters, replacing brittle ToString() regex parsing. Comprehensive tests including UTF-8 edge cases and empty strings.
1 parent 4f84053 commit ab63064

File tree

3 files changed

+312
-1
lines changed

3 files changed

+312
-1
lines changed

src/iceberg/expression/predicate.cc

Lines changed: 86 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,9 @@
2525
#include "iceberg/exception.h"
2626
#include "iceberg/expression/expressions.h"
2727
#include "iceberg/expression/literal.h"
28+
#include "iceberg/expression/term.h"
2829
#include "iceberg/result.h"
30+
#include "iceberg/transform.h"
2931
#include "iceberg/type.h"
3032
#include "iceberg/util/checked_cast.h"
3133
#include "iceberg/util/formatter_internal.h"
@@ -143,6 +145,27 @@ bool IsFloatingType(TypeId type) {
143145
return type == TypeId::kFloat || type == TypeId::kDouble;
144146
}
145147

148+
/// \brief Returns how many code points a UTF-8 encoded string contains.
///
/// Every byte that is not a continuation byte (bit pattern 10xxxxxx) is treated as
/// the first byte of a code point, so the result is simply the number of such lead
/// bytes. The input is not validated.
///
/// The original author notes this is meant to agree with
/// TruncateUtils::TruncateUTF8 (not verifiable from this block alone).
///
/// NOTE: code points are counted, not grapheme clusters (user-perceived characters).
/// "é" written as e + combining-acute (U+0065 U+0301) yields 2, whereas the
/// precomposed character (U+00E9) yields 1.
///
/// \param str The UTF-8 encoded string
/// \return The number of code points (not bytes, not graphemes)
inline int32_t CountUTF8CodePoints(std::string_view str) {
  constexpr unsigned char kTopTwoBits = 0xC0;
  constexpr unsigned char kContinuationTag = 0x80;

  int32_t lead_bytes = 0;
  for (std::size_t i = 0; i < str.size(); ++i) {
    const auto byte = static_cast<unsigned char>(str[i]);
    // Continuation bytes belong to a code point that was already counted.
    const bool is_continuation = (byte & kTopTwoBits) == kContinuationTag;
    lead_bytes += is_continuation ? 0 : 1;
  }
  return lead_bytes;
}
168+
146169
} // namespace
147170

148171
template <typename B>
@@ -226,7 +249,69 @@ Result<std::shared_ptr<Expression>> UnboundPredicate<B>::BindLiteralOperation(
226249
}
227250
}
228251

229-
// TODO(gangwu): translate truncate(col) == value to startsWith(value)
252+
// Optimize: translate truncate(col, width) == value to col startsWith(value)
253+
// This optimization allows better predicate pushdown and index usage
254+
// IMPORTANT: Only valid when literal has exactly `width` UTF-8 code points
255+
//
256+
// NOTE: This rewrite is safe because:
257+
// - Iceberg string comparisons are binary (byte-for-byte), no collation
258+
// - STARTS_WITH uses the same binary comparison semantics as equality
259+
// - truncate(col, w) == "value" ⟺ col STARTS_WITH "value" when len(value) == w
260+
// - When source has < w code points, truncate returns full string; equality
261+
// implies exact match, so STARTS_WITH remains valid (short-string invariance)
262+
if (BASE::op() == Expression::Operation::kEq &&
263+
bound_term->kind() == Term::Kind::kTransform) {
264+
// Safe to cast after kind check confirms it's a transform
265+
auto* transform_term = dynamic_cast<BoundTransform*>(bound_term.get());
266+
if (!transform_term) {
267+
// Should never happen after kind check, but be defensive
268+
return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term),
269+
std::move(literal));
270+
}
271+
272+
if (transform_term->transform()->transform_type() == TransformType::kTruncate &&
273+
literal.type()->type_id() == TypeId::kString &&
274+
!literal.IsNull()) { // Null safety: skip null literals
275+
276+
// Extract width parameter using type-safe API
277+
auto width_opt = transform_term->transform()->param();
278+
if (!width_opt) {
279+
// Should never happen for truncate, but be defensive
280+
return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term),
281+
std::move(literal));
282+
}
283+
284+
int32_t truncate_width = *width_opt;
285+
286+
// Skip width=0: truncate(col, 0) == "" would rewrite to STARTS_WITH("")
287+
// which is tautologically true and could accidentally broaden filters
288+
// (Note: Transform::Truncate already validates width > 0, but defensive check)
289+
if (truncate_width == 0) {
290+
return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term),
291+
std::move(literal));
292+
}
293+
294+
auto& string_value = std::get<std::string>(literal.value());
295+
296+
// Count UTF-8 code points (not bytes!)
297+
// Truncate uses code points: "José" has 4 code points but 5 bytes
298+
int32_t code_point_count = CountUTF8CodePoints(string_value);
299+
300+
// Only optimize if literal code point count equals truncate width
301+
// Example: truncate(col, 5) == "Alice" (5 code points) can be optimized
302+
// truncate(col, 10) == "abc" (3 code points) CANNOT
303+
// truncate(col, 4) == "José" (4 code points, 5 bytes) CAN be optimized
304+
if (code_point_count == truncate_width) {
305+
// Rewrite: truncate(col, width) == "value" → col STARTS_WITH "value"
306+
// This benefits from strict metrics evaluation for startsWith in manifest
307+
// filtering
308+
return std::make_shared<BoundLiteralPredicate>(Expression::Operation::kStartsWith,
309+
transform_term->reference(),
310+
std::move(literal));
311+
}
312+
}
313+
}
314+
230315
return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term),
231316
std::move(literal));
232317
}

src/iceberg/test/predicate_test.cc

Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
* under the License.
1818
*/
1919

20+
#include "iceberg/expression/predicate.h"
21+
2022
#include "iceberg/expression/expressions.h"
2123
#include "iceberg/schema.h"
2224
#include "iceberg/test/matchers.h"
@@ -433,4 +435,215 @@ TEST_F(PredicateTest, ComplexExpressionCombinations) {
433435
EXPECT_EQ(nested->op(), Expression::Operation::kAnd);
434436
}
435437

438+
TEST_F(PredicateTest, TruncateOptimizationToStartsWith) {
  // Binding truncate(name, 5) == "Alice" is expected to yield name STARTS_WITH "Alice".

  // Unbound term: truncate(name, 5)
  auto truncated_name = Expressions::Truncate("name", 5);

  // Unbound predicate: truncate(name, 5) == "Alice"
  auto unbound_eq = std::make_shared<UnboundPredicate<BoundTransform>>(
      Expression::Operation::kEq, truncated_name, Literal::String("Alice"));

  // Resolve against the fixture schema
  auto bind_outcome = unbound_eq->Bind(*schema_, /*case_sensitive=*/true);
  ASSERT_THAT(bind_outcome, IsOk());
  auto bound_expr = bind_outcome.value();

  // The equality must have been rewritten to STARTS_WITH
  EXPECT_EQ(bound_expr->op(), Expression::Operation::kStartsWith);

  // The result is still a literal predicate
  auto* as_literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_expr.get());
  ASSERT_NE(as_literal_pred, nullptr);

  // Its term is now the plain "name" reference rather than the transform
  EXPECT_EQ(as_literal_pred->term()->kind(), Term::Kind::kReference);

  // The literal itself is carried over untouched
  EXPECT_EQ(as_literal_pred->literal(), Literal::String("Alice"));
}
466+
467+
TEST_F(PredicateTest, TruncateOptimizationNotAppliedForNonEquality) {
  // Only equality is rewritten; any other comparison must bind unchanged.

  auto truncated_name = Expressions::Truncate("name", 5);

  // truncate(name, 5) < "Bob" -- a less-than, so no rewrite is expected
  auto unbound_lt = std::make_shared<UnboundPredicate<BoundTransform>>(
      Expression::Operation::kLt, truncated_name, Literal::String("Bob"));
  auto bind_outcome = unbound_lt->Bind(*schema_, /*case_sensitive=*/true);
  ASSERT_THAT(bind_outcome, IsOk());
  auto bound_expr = bind_outcome.value();

  // Operation stays kLt instead of turning into STARTS_WITH
  EXPECT_EQ(bound_expr->op(), Expression::Operation::kLt);

  // And the predicate still targets the transform term
  auto* as_literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_expr.get());
  ASSERT_NE(as_literal_pred, nullptr);
  EXPECT_EQ(as_literal_pred->term()->kind(), Term::Kind::kTransform);
}
487+
488+
TEST_F(PredicateTest, TruncateOptimizationNotAppliedForNonString) {
  // The rewrite is restricted to string literals. Truncate is applied to a binary
  // column here, so the predicate must be left as-is.

  // Stand-alone schema holding a single optional binary field
  auto schema_with_binary = std::make_shared<Schema>(
      std::vector<SchemaField>{SchemaField::MakeOptional(1, "data", binary())},
      /*schema_id=*/0);

  auto truncated_data = Expressions::Truncate("data", 10);
  auto unbound_eq = std::make_shared<UnboundPredicate<BoundTransform>>(
      Expression::Operation::kEq, truncated_data,
      Literal::Binary({0x01, 0x02, 0x03, 0x04, 0x05}));

  auto bind_outcome = unbound_eq->Bind(*schema_with_binary, /*case_sensitive=*/true);
  ASSERT_THAT(bind_outcome, IsOk());
  auto bound_expr = bind_outcome.value();

  // Operation stays kEq; a binary literal is never turned into STARTS_WITH
  EXPECT_EQ(bound_expr->op(), Expression::Operation::kEq);

  // And the predicate still targets the transform term
  auto* as_literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_expr.get());
  ASSERT_NE(as_literal_pred, nullptr);
  EXPECT_EQ(as_literal_pred->term()->kind(), Term::Kind::kTransform);
}
515+
516+
TEST_F(PredicateTest, TruncateOptimizationNotAppliedForWidthMismatch) {
  // CRITICAL: a literal whose length differs from the truncate width must not be
  // rewritten. truncate(col, 10) == "abc" is not equivalent to STARTS_WITH "abc":
  // "abc1234567" starts with "abc" yet fails the truncate equality.

  auto truncated_name = Expressions::Truncate("name", 10);

  // Literal length is 3 while the truncate width is 10
  auto unbound_eq = std::make_shared<UnboundPredicate<BoundTransform>>(
      Expression::Operation::kEq, truncated_name, Literal::String("abc"));

  auto bind_outcome = unbound_eq->Bind(*schema_, /*case_sensitive=*/true);
  ASSERT_THAT(bind_outcome, IsOk());
  auto bound_expr = bind_outcome.value();

  // Operation stays kEq instead of turning into STARTS_WITH
  EXPECT_EQ(bound_expr->op(), Expression::Operation::kEq);

  // The transform term has not been stripped away
  auto* as_literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_expr.get());
  ASSERT_NE(as_literal_pred, nullptr);
  EXPECT_EQ(as_literal_pred->term()->kind(), Term::Kind::kTransform);
}
539+
540+
TEST_F(PredicateTest, TruncateOptimizationAppliedWhenLengthMatches) {
  // A literal whose length equals the truncate width must trigger the rewrite.

  auto truncated_name = Expressions::Truncate("name", 5);

  // "Alice" is 5 characters long, the same as the truncate width
  auto unbound_eq = std::make_shared<UnboundPredicate<BoundTransform>>(
      Expression::Operation::kEq, truncated_name, Literal::String("Alice"));

  auto bind_outcome = unbound_eq->Bind(*schema_, /*case_sensitive=*/true);
  ASSERT_THAT(bind_outcome, IsOk());
  auto bound_expr = bind_outcome.value();

  // Equality has become STARTS_WITH
  EXPECT_EQ(bound_expr->op(), Expression::Operation::kStartsWith);

  // The term is the bare column reference, showing the rewrite happened
  auto* as_literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_expr.get());
  ASSERT_NE(as_literal_pred, nullptr);
  EXPECT_EQ(as_literal_pred->term()->kind(), Term::Kind::kReference);
}
561+
562+
TEST_F(PredicateTest, TruncateOptimizationWithUTF8Accents) {
  // CRITICAL: width is measured in code points, not bytes.
  // "José" is 4 UTF-8 code points but 5 bytes (é = 0xC3 0xA9).

  auto truncated_name = Expressions::Truncate("name", 4);

  // 4 code points in the literal vs. truncate width 4 -> they match
  auto unbound_eq = std::make_shared<UnboundPredicate<BoundTransform>>(
      Expression::Operation::kEq, truncated_name, Literal::String("José"));

  auto bind_outcome = unbound_eq->Bind(*schema_, /*case_sensitive=*/true);
  ASSERT_THAT(bind_outcome, IsOk());
  auto bound_expr = bind_outcome.value();

  // Code point count equals the width, so the rewrite to STARTS_WITH applies
  EXPECT_EQ(bound_expr->op(), Expression::Operation::kStartsWith);

  auto* as_literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_expr.get());
  ASSERT_NE(as_literal_pred, nullptr);
  EXPECT_EQ(as_literal_pred->term()->kind(), Term::Kind::kReference);
}
583+
584+
TEST_F(PredicateTest, TruncateOptimizationWithUTF8Emoji) {
  // Exercises a 4-byte UTF-8 sequence.
  // "Hi👋" is 3 UTF-8 code points but 6 bytes (👋 = 0xF0 0x9F 0x91 0x8B).

  auto truncated_name = Expressions::Truncate("name", 3);

  // 3 code points in the literal vs. truncate width 3
  auto unbound_eq = std::make_shared<UnboundPredicate<BoundTransform>>(
      Expression::Operation::kEq, truncated_name, Literal::String("Hi👋"));

  auto bind_outcome = unbound_eq->Bind(*schema_, /*case_sensitive=*/true);
  ASSERT_THAT(bind_outcome, IsOk());
  auto bound_expr = bind_outcome.value();

  // Equality has become STARTS_WITH
  EXPECT_EQ(bound_expr->op(), Expression::Operation::kStartsWith);

  auto* as_literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_expr.get());
  ASSERT_NE(as_literal_pred, nullptr);
  EXPECT_EQ(as_literal_pred->term()->kind(), Term::Kind::kReference);
}
605+
606+
TEST_F(PredicateTest, TruncateOptimizationNotAppliedWhenUTF8LengthMismatch) {
  // "José" is 4 code points; with a truncate width of 5 the counts differ,
  // so the predicate must not be rewritten.

  auto truncated_name = Expressions::Truncate("name", 5);

  auto unbound_eq = std::make_shared<UnboundPredicate<BoundTransform>>(
      Expression::Operation::kEq, truncated_name, Literal::String("José"));

  auto bind_outcome = unbound_eq->Bind(*schema_, /*case_sensitive=*/true);
  ASSERT_THAT(bind_outcome, IsOk());
  auto bound_expr = bind_outcome.value();

  // 4 code points != width 5 -> operation stays kEq
  EXPECT_EQ(bound_expr->op(), Expression::Operation::kEq);

  auto* as_literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_expr.get());
  ASSERT_NE(as_literal_pred, nullptr);
  EXPECT_EQ(as_literal_pred->term()->kind(), Term::Kind::kTransform);
}
626+
627+
TEST_F(PredicateTest, TruncateOptimizationNotAppliedForEmptyLiteralWithNonZeroWidth) {
  // An empty literal has 0 code points, which can never equal a positive width,
  // so no rewrite is expected.
  // NOTE: per the original author, width=0 is rejected by Transform::Truncate and is
  // therefore not covered here.

  auto truncated_name = Expressions::Truncate("name", 5);

  auto unbound_eq = std::make_shared<UnboundPredicate<BoundTransform>>(
      Expression::Operation::kEq, truncated_name, Literal::String(""));

  auto bind_outcome = unbound_eq->Bind(*schema_, /*case_sensitive=*/true);
  ASSERT_THAT(bind_outcome, IsOk());
  auto bound_expr = bind_outcome.value();

  // 0 code points != width 5 -> operation stays kEq
  EXPECT_EQ(bound_expr->op(), Expression::Operation::kEq);

  auto* as_literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_expr.get());
  ASSERT_NE(as_literal_pred, nullptr);
  EXPECT_EQ(as_literal_pred->term()->kind(), Term::Kind::kTransform);
}
648+
436649
} // namespace iceberg

src/iceberg/transform.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,19 @@ class ICEBERG_EXPORT Transform : public util::Formattable {
141141
/// \brief Returns the transform type.
142142
TransformType transform_type() const;
143143

144+
/// \brief Returns the optional parameter for parameterized transforms.
145+
///
146+
/// For transforms like bucket(N) or truncate(W), returns the parameter value.
147+
/// For non-parameterized transforms (identity, year, etc.), returns std::nullopt.
148+
///
149+
/// \return The parameter if present, otherwise std::nullopt
150+
std::optional<int32_t> param() const {
151+
if (auto* p = std::get_if<int32_t>(&param_)) {
152+
return *p;
153+
}
154+
return std::nullopt;
155+
}
156+
144157
/// \brief Binds this transform to a source type, returning a typed TransformFunction.
145158
///
146159
/// This creates a concrete transform implementation based on the transform type and

0 commit comments

Comments
 (0)