Skip to content

Commit 1d01343

Browse files
committed
feat: optimize truncate(col) == value to startsWith predicate
Implements an optimization to rewrite truncate equality predicates as startsWith for better predicate pushdown and index usage.

The optimization applies when:
- Operation is equality
- Term is a truncate transform on a string column
- Literal has exactly the truncate width in UTF-8 code points

Implementation uses transform_func()->Transform() to validate that:
1. truncate(literal) == literal (literal is compatible)
2. truncate(literal + 'x') == literal (literal has exact width)

This approach leverages the transform function without duplicating UTF-8 code point counting logic.
1 parent 3b59ef2 commit 1d01343

File tree

3 files changed

+187
-1
lines changed

3 files changed

+187
-1
lines changed

src/iceberg/expression/predicate.cc

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,10 @@
2525

2626
#include "iceberg/expression/expressions.h"
2727
#include "iceberg/expression/literal.h"
28+
#include "iceberg/expression/term.h"
2829
#include "iceberg/result.h"
30+
#include "iceberg/transform.h"
31+
#include "iceberg/transform_function.h"
2932
#include "iceberg/type.h"
3033
#include "iceberg/util/checked_cast.h"
3134
#include "iceberg/util/formatter_internal.h"
@@ -286,7 +289,48 @@ Result<std::shared_ptr<Expression>> UnboundPredicate<B>::BindLiteralOperation(
286289
}
287290
}
288291

289-
// TODO(gangwu): translate truncate(col) == value to startsWith(value)
292+
if (BASE::op() == Expression::Operation::kEq &&
293+
bound_term->kind() == Term::Kind::kTransform) {
294+
// Safe to cast after kind check confirms it's a transform
295+
auto* transform_term = internal::checked_cast<BoundTransform*>(bound_term.get());
296+
297+
if (transform_term->transform()->transform_type() == TransformType::kTruncate &&
298+
literal.type()->type_id() == TypeId::kString &&
299+
!literal.IsNull()) { // Null safety: skip null literals
300+
301+
// Apply truncate transform to the literal and check if result matches
302+
// This verifies the literal is compatible with the truncate operation
303+
auto transformed_result = transform_term->transform_func()->Transform(literal);
304+
if (!transformed_result.has_value() || transformed_result.value() != literal) {
305+
// Transform failed or modified the literal - can't optimize
306+
return BoundLiteralPredicate::Make(BASE::op(), std::move(bound_term),
307+
std::move(literal));
308+
}
309+
310+
// Literal passed truncate unchanged. Now check if adding one more character
311+
// would cause truncation. If yes, then the literal has EXACTLY the width.
312+
// Example:
313+
// - "Alice" with width=5: adding "x" makes "Alicex", truncate to "Alice" (can
314+
// optimize)
315+
// - "abc" with width=10: adding "x" makes "abcx", truncate to "abcx" != "abc"
316+
// (cannot optimize)
317+
318+
auto& string_value = std::get<std::string>(literal.value());
319+
auto extended_literal = Literal::String(string_value + "x");
320+
auto extended_result =
321+
transform_term->transform_func()->Transform(extended_literal);
322+
323+
if (extended_result.has_value() && extended_result.value() == literal) {
324+
// Adding a character gets truncated back to original - literal has exact width!
325+
// Rewrite: truncate(col, width) == "value" → col STARTS_WITH "value"
326+
return BoundLiteralPredicate::Make(Expression::Operation::kStartsWith,
327+
transform_term->reference(),
328+
std::move(literal));
329+
}
330+
// Literal is shorter than width - can't optimize
331+
}
332+
}
333+
290334
return BoundLiteralPredicate::Make(BASE::op(), std::move(bound_term),
291335
std::move(literal));
292336
}

src/iceberg/expression/term.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,10 @@ class ICEBERG_EXPORT BoundTransform : public BoundTerm {
250250

251251
const std::shared_ptr<Transform>& transform() const { return transform_; }
252252

253+
const std::shared_ptr<TransformFunction>& transform_func() const {
254+
return transform_func_;
255+
}
256+
253257
Kind kind() const override { return Kind::kTransform; }
254258

255259
private:

src/iceberg/test/predicate_test.cc

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -870,4 +870,142 @@ TEST_F(PredicateTest, BoundSetPredicateTestSingleLiteral) {
870870
EXPECT_THAT(bound_literal->Test(Literal::Int(41)), HasValue(testing::Eq(false)));
871871
}
872872

873+
TEST_F(PredicateTest, TruncateLiteralOptimizationExactWidth) {
874+
// Test optimization: truncate(name, 5) == "Alice" should become name STARTS_WITH
875+
// "Alice"
876+
auto truncate_term = Expressions::Truncate("name", 5);
877+
ICEBERG_ASSIGN_OR_THROW(auto equal_pred, UnboundPredicate<BoundTransform>::Make(
878+
Expression::Operation::kEq, truncate_term,
879+
Literal::String("Alice")));
880+
881+
ICEBERG_ASSIGN_OR_THROW(auto bound_pred,
882+
equal_pred->Bind(*schema_, /*case_sensitive=*/true));
883+
884+
// Should be optimized to STARTS_WITH operation
885+
EXPECT_EQ(bound_pred->op(), Expression::Operation::kStartsWith);
886+
887+
// Verify it's a bound literal predicate on the reference (not the transform)
888+
auto bound_literal = AssertAndCastToBoundPredicate(bound_pred);
889+
EXPECT_THAT(bound_literal->Test(Literal::String("Alice")), HasValue(testing::Eq(true)));
890+
EXPECT_THAT(bound_literal->Test(Literal::String("AliceX")),
891+
HasValue(testing::Eq(true)));
892+
EXPECT_THAT(bound_literal->Test(Literal::String("Alice123")),
893+
HasValue(testing::Eq(true)));
894+
EXPECT_THAT(bound_literal->Test(Literal::String("Bob")), HasValue(testing::Eq(false)));
895+
EXPECT_THAT(bound_literal->Test(Literal::String("Alic")), HasValue(testing::Eq(false)));
896+
}
897+
898+
TEST_F(PredicateTest, TruncateLiteralOptimizationShorterLiteral) {
899+
// Test no optimization: truncate(name, 10) == "abc" should NOT be optimized
900+
// because "abc" is shorter than width 10
901+
auto truncate_term = Expressions::Truncate("name", 10);
902+
ICEBERG_ASSIGN_OR_THROW(auto equal_pred, UnboundPredicate<BoundTransform>::Make(
903+
Expression::Operation::kEq, truncate_term,
904+
Literal::String("abc")));
905+
906+
ICEBERG_ASSIGN_OR_THROW(auto bound_pred,
907+
equal_pred->Bind(*schema_, /*case_sensitive=*/true));
908+
909+
// Should remain as EQUAL operation (not optimized to STARTS_WITH)
910+
EXPECT_EQ(bound_pred->op(), Expression::Operation::kEq);
911+
}
912+
913+
TEST_F(PredicateTest, TruncateLiteralOptimizationNullLiteral) {
914+
// Test no optimization with null literal - skipped as null strings are handled
915+
// differently. Null values are tested through IS NULL predicates, not equality
916+
// predicates
917+
GTEST_SKIP() << "Null literal equality not supported for strings";
918+
}
919+
920+
TEST_F(PredicateTest, TruncateLiteralOptimizationNonEqualityOperations) {
921+
// Test that non-equality operations are not optimized
922+
auto truncate_term = Expressions::Truncate("name", 5);
923+
924+
// NotEqual should not be optimized
925+
ICEBERG_ASSIGN_OR_THROW(
926+
auto not_equal_pred,
927+
UnboundPredicate<BoundTransform>::Make(Expression::Operation::kNotEq, truncate_term,
928+
Literal::String("Alice")));
929+
ICEBERG_ASSIGN_OR_THROW(auto bound_not_equal,
930+
not_equal_pred->Bind(*schema_, /*case_sensitive=*/true));
931+
EXPECT_EQ(bound_not_equal->op(), Expression::Operation::kNotEq);
932+
933+
// LessThan should not be optimized
934+
ICEBERG_ASSIGN_OR_THROW(auto lt_pred, UnboundPredicate<BoundTransform>::Make(
935+
Expression::Operation::kLt, truncate_term,
936+
Literal::String("Alice")));
937+
ICEBERG_ASSIGN_OR_THROW(auto bound_lt,
938+
lt_pred->Bind(*schema_, /*case_sensitive=*/true));
939+
EXPECT_EQ(bound_lt->op(), Expression::Operation::kLt);
940+
941+
// GreaterThan should not be optimized
942+
ICEBERG_ASSIGN_OR_THROW(auto gt_pred, UnboundPredicate<BoundTransform>::Make(
943+
Expression::Operation::kGt, truncate_term,
944+
Literal::String("Alice")));
945+
ICEBERG_ASSIGN_OR_THROW(auto bound_gt,
946+
gt_pred->Bind(*schema_, /*case_sensitive=*/true));
947+
EXPECT_EQ(bound_gt->op(), Expression::Operation::kGt);
948+
}
949+
950+
TEST_F(PredicateTest, TruncateLiteralOptimizationUTF8MultibyteCharacters) {
951+
// Test optimization with UTF-8 multibyte characters (5 code points, not bytes)
952+
auto truncate_term = Expressions::Truncate("name", 5);
953+
954+
// "你好世界!" is 5 UTF-8 code points
955+
ICEBERG_ASSIGN_OR_THROW(auto equal_pred, UnboundPredicate<BoundTransform>::Make(
956+
Expression::Operation::kEq, truncate_term,
957+
Literal::String("你好世界!")));
958+
ICEBERG_ASSIGN_OR_THROW(auto bound_pred,
959+
equal_pred->Bind(*schema_, /*case_sensitive=*/true));
960+
961+
// Should be optimized to STARTS_WITH
962+
EXPECT_EQ(bound_pred->op(), Expression::Operation::kStartsWith);
963+
964+
// Test with mixed ASCII and UTF-8: "你好世界x" is 5 code points (4 Chinese + 1 ASCII)
965+
ICEBERG_ASSIGN_OR_THROW(auto mixed_pred, UnboundPredicate<BoundTransform>::Make(
966+
Expression::Operation::kEq, truncate_term,
967+
Literal::String("你好世界x")));
968+
ICEBERG_ASSIGN_OR_THROW(auto bound_mixed,
969+
mixed_pred->Bind(*schema_, /*case_sensitive=*/true));
970+
EXPECT_EQ(bound_mixed->op(), Expression::Operation::kStartsWith);
971+
972+
// Test with 3 UTF-8 characters (shorter than width) - should NOT optimize
973+
ICEBERG_ASSIGN_OR_THROW(
974+
auto shorter_pred,
975+
UnboundPredicate<BoundTransform>::Make(Expression::Operation::kEq, truncate_term,
976+
Literal::String("你好世")));
977+
ICEBERG_ASSIGN_OR_THROW(auto bound_shorter,
978+
shorter_pred->Bind(*schema_, /*case_sensitive=*/true));
979+
EXPECT_EQ(bound_shorter->op(), Expression::Operation::kEq);
980+
}
981+
982+
TEST_F(PredicateTest, TruncateLiteralOptimizationEmptyString) {
983+
// Test edge case: empty string with any width should not optimize
984+
auto truncate_term = Expressions::Truncate("name", 5);
985+
ICEBERG_ASSIGN_OR_THROW(auto equal_pred, UnboundPredicate<BoundTransform>::Make(
986+
Expression::Operation::kEq, truncate_term,
987+
Literal::String("")));
988+
989+
ICEBERG_ASSIGN_OR_THROW(auto bound_pred,
990+
equal_pred->Bind(*schema_, /*case_sensitive=*/true));
991+
992+
// Empty string is shorter than width, should not optimize
993+
EXPECT_EQ(bound_pred->op(), Expression::Operation::kEq);
994+
}
995+
996+
TEST_F(PredicateTest, TruncateLiteralOptimizationNonTruncateTransform) {
997+
// Test that other transforms (e.g., bucket) are not optimized
998+
// Bucket returns an integer, so we use an integer literal
999+
auto bucket_term = Expressions::Bucket("id", 10); // id is int64
1000+
ICEBERG_ASSIGN_OR_THROW(auto equal_pred,
1001+
UnboundPredicate<BoundTransform>::Make(
1002+
Expression::Operation::kEq, bucket_term, Literal::Int(5)));
1003+
1004+
ICEBERG_ASSIGN_OR_THROW(auto bound_pred,
1005+
equal_pred->Bind(*schema_, /*case_sensitive=*/true));
1006+
1007+
// Should remain as EQUAL operation (bucket transform not optimized)
1008+
EXPECT_EQ(bound_pred->op(), Expression::Operation::kEq);
1009+
}
1010+
8731011
} // namespace iceberg

0 commit comments

Comments
 (0)