Skip to content

Commit 9032da6

Browse files
committed
optimize truncate(col) == value to startsWith predicate
Rewrite truncate(col) == "value" predicates to col STARTS_WITH "value" for string columns. This enables better predicate pushdown to storage formats and efficient use of prefix indexes. The optimization applies only when all of the following hold: (1) the operation is equality, (2) the term is a truncate transform, and (3) the literal is a string type. Tests were added to verify correct application and edge cases.
1 parent 4f84053 commit 9032da6

File tree

2 files changed

+95
-1
lines changed

2 files changed

+95
-1
lines changed

src/iceberg/expression/predicate.cc

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,9 @@
2525
#include "iceberg/exception.h"
2626
#include "iceberg/expression/expressions.h"
2727
#include "iceberg/expression/literal.h"
28+
#include "iceberg/expression/term.h"
2829
#include "iceberg/result.h"
30+
#include "iceberg/transform.h"
2931
#include "iceberg/type.h"
3032
#include "iceberg/util/checked_cast.h"
3133
#include "iceberg/util/formatter_internal.h"
@@ -226,7 +228,21 @@ Result<std::shared_ptr<Expression>> UnboundPredicate<B>::BindLiteralOperation(
226228
}
227229
}
228230

229-
// TODO(gangwu): translate truncate(col) == value to startsWith(value)
231+
// Optimize: translate truncate(col) == value to col startsWith(value)
232+
// This optimization allows better predicate pushdown and index usage
233+
if (BASE::op() == Expression::Operation::kEq &&
234+
bound_term->kind() == Term::Kind::kTransform) {
235+
auto* transform_term = dynamic_cast<BoundTransform*>(bound_term.get());
236+
if (transform_term &&
237+
transform_term->transform()->transform_type() == TransformType::kTruncate &&
238+
literal.type()->type_id() == TypeId::kString) {
239+
// Rewrite: truncate(col, width) == "value" → col STARTS_WITH "value"
240+
return std::make_shared<BoundLiteralPredicate>(
241+
Expression::Operation::kStartsWith, transform_term->reference(),
242+
std::move(literal));
243+
}
244+
}
245+
230246
return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term),
231247
std::move(literal));
232248
}

src/iceberg/test/predicate_test.cc

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
*/
1919

2020
#include "iceberg/expression/expressions.h"
21+
#include "iceberg/expression/predicate.h"
2122
#include "iceberg/schema.h"
2223
#include "iceberg/test/matchers.h"
2324
#include "iceberg/type.h"
@@ -433,4 +434,81 @@ TEST_F(PredicateTest, ComplexExpressionCombinations) {
433434
EXPECT_EQ(nested->op(), Expression::Operation::kAnd);
434435
}
435436

437+
TEST_F(PredicateTest, TruncateOptimizationToStartsWith) {
438+
// Verifies that binding truncate(name, 5) == "Alice" yields a STARTS_WITH predicate
// on the underlying column reference rather than an equality on the transform term.
// NOTE(review): "Alice" has exactly 5 characters, i.e. the same length as the truncate
// width. A literal shorter or longer than the width is not exercised here; presumably
// truncate(col, w) == v is only a prefix match when v has exactly w characters --
// confirm the rewrite is intended for other lengths and add coverage.
439+
440+
// Build the unbound transform term truncate(name, 5).
441+
auto truncate_expr = Expressions::Truncate("name", 5);
442+
443+
// Unbound predicate under test: truncate(name, 5) == "Alice".
444+
auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>(
445+
Expression::Operation::kEq, truncate_expr, Literal::String("Alice"));
446+
447+
// Bind case-sensitively against the fixture schema. schema_ is defined outside this
// view; presumably it contains a string field called "name" -- verify in the fixture.
448+
auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true);
449+
ASSERT_THAT(bound_result, IsOk());
450+
auto bound_pred = bound_result.value();
451+
452+
// The bound expression must report kStartsWith instead of the original kEq.
453+
EXPECT_EQ(bound_pred->op(), Expression::Operation::kStartsWith);
454+
455+
// The result must be a BoundLiteralPredicate; ASSERT stops the test before the
// pointer is dereferenced below if the cast fails.
456+
auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get());
457+
ASSERT_NE(literal_pred, nullptr);
458+
459+
// The predicate's term must be a plain reference (kReference), not a transform.
460+
EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kReference);
461+
462+
// The literal must be carried over unchanged as the string "Alice".
463+
EXPECT_EQ(literal_pred->literal(), Literal::String("Alice"));
464+
}
465+
466+
TEST_F(PredicateTest, TruncateOptimizationNotAppliedForNonEquality) {
467+
// Verifies that a non-equality comparison on a truncate term is bound as-is:
// the operation stays kLt and the term stays a transform.
468+
469+
// Unbound transform term truncate(name, 5).
auto truncate_expr = Expressions::Truncate("name", 5);
470+
471+
// Unbound predicate truncate(name, 5) < "Bob", bound case-sensitively against the
// fixture schema (schema_ is defined outside this view).
472+
auto truncate_lt_pred = std::make_shared<UnboundPredicate<BoundTransform>>(
473+
Expression::Operation::kLt, truncate_expr, Literal::String("Bob"));
474+
auto bound_lt_result = truncate_lt_pred->Bind(*schema_, /*case_sensitive=*/true);
475+
ASSERT_THAT(bound_lt_result, IsOk());
476+
auto bound_lt = bound_lt_result.value();
477+
478+
// The operation must still be kLt; no conversion to STARTS_WITH is expected.
479+
EXPECT_EQ(bound_lt->op(), Expression::Operation::kLt);
480+
481+
// The bound predicate's term must still be of kind kTransform. ASSERT guards the
// dereference in case the dynamic_cast fails.
482+
auto* literal_pred_lt = dynamic_cast<BoundLiteralPredicate*>(bound_lt.get());
483+
ASSERT_NE(literal_pred_lt, nullptr);
484+
EXPECT_EQ(literal_pred_lt->term()->kind(), Term::Kind::kTransform);
485+
}
486+
487+
TEST_F(PredicateTest, TruncateOptimizationNotAppliedForNonString) {
488+
// Verifies that an equality on a truncate term with a binary literal is bound as-is:
// the operation stays kEq and the term stays a transform.
489+
// (The rewrite in predicate.cc is gated on the literal's type id being kString.)
490+
491+
// Local schema (id 0) with a single optional binary field "data" (field id 1),
// used instead of the fixture schema.
492+
auto binary_schema = std::make_shared<Schema>(
493+
std::vector<SchemaField>{SchemaField::MakeOptional(1, "data", binary())},
494+
/*schema_id=*/0);
495+
496+
// Unbound predicate truncate(data, 10) == <5-byte binary literal>.
auto truncate_expr = Expressions::Truncate("data", 10);
497+
auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>(
498+
Expression::Operation::kEq, truncate_expr,
499+
Literal::Binary({0x01, 0x02, 0x03, 0x04, 0x05}));
500+
501+
// Bind case-sensitively against the local binary schema.
auto bound_result = truncate_eq_pred->Bind(*binary_schema, /*case_sensitive=*/true);
502+
ASSERT_THAT(bound_result, IsOk());
503+
auto bound_pred = bound_result.value();
504+
505+
// The operation must still be kEq; a non-string literal must not trigger the
// STARTS_WITH rewrite.
506+
EXPECT_EQ(bound_pred->op(), Expression::Operation::kEq);
507+
508+
// The bound predicate's term must still be of kind kTransform. ASSERT guards the
// dereference in case the dynamic_cast fails.
509+
auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get());
510+
ASSERT_NE(literal_pred, nullptr);
511+
EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kTransform);
512+
}
513+
436514
} // namespace iceberg

0 commit comments

Comments
 (0)