Skip to content

Commit b705315

Browse files
committed
optimize truncate(col) == value to startsWith predicate
Rewrite truncate(col, width) == "value" predicates to col STARTS_WITH "value" for string columns when literal.length() == width. This enables better predicate pushdown to storage formats and efficient use of prefix indexes.

The optimization applies only when ALL of the following conditions are met:
- The operation is equality.
- The term is a truncate transform.
- The literal is a string type.
- The literal length equals the truncate width (CRITICAL for correctness).

Without the width check, truncate(col, 10) == "abc" would incorrectly match "abc1234567" if rewritten to STARTS_WITH. Added comprehensive tests, including the width-mismatch edge case.
1 parent 4f84053 commit b705315

File tree

2 files changed

+158
-1
lines changed

2 files changed

+158
-1
lines changed

src/iceberg/expression/predicate.cc

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,14 @@
2121

2222
#include <algorithm>
2323
#include <format>
24+
#include <regex>
2425

2526
#include "iceberg/exception.h"
2627
#include "iceberg/expression/expressions.h"
2728
#include "iceberg/expression/literal.h"
29+
#include "iceberg/expression/term.h"
2830
#include "iceberg/result.h"
31+
#include "iceberg/transform.h"
2932
#include "iceberg/type.h"
3033
#include "iceberg/util/checked_cast.h"
3134
#include "iceberg/util/formatter_internal.h"
@@ -226,7 +229,37 @@ Result<std::shared_ptr<Expression>> UnboundPredicate<B>::BindLiteralOperation(
226229
}
227230
}
228231

229-
// TODO(gangwu): translate truncate(col) == value to startsWith(value)
232+
// Optimize: translate truncate(col, width) == value to col startsWith(value)
233+
// This optimization allows better predicate pushdown and index usage
234+
// IMPORTANT: Only valid when literal.length() == width
235+
if (BASE::op() == Expression::Operation::kEq &&
236+
bound_term->kind() == Term::Kind::kTransform) {
237+
auto* transform_term = dynamic_cast<BoundTransform*>(bound_term.get());
238+
if (transform_term &&
239+
transform_term->transform()->transform_type() == TransformType::kTruncate &&
240+
literal.type()->type_id() == TypeId::kString) {
241+
// Extract width from transform string (format: "truncate[width]")
242+
std::string transform_str = transform_term->transform()->ToString();
243+
std::regex width_regex(R"(truncate\[(\d+)\])");
244+
std::smatch match;
245+
246+
if (std::regex_match(transform_str, match, width_regex)) {
247+
int32_t truncate_width = std::stoi(match[1].str());
248+
auto& string_value = std::get<std::string>(literal.value());
249+
250+
// Only optimize if literal length equals truncate width
251+
// Example: truncate(col, 5) == "Alice" can be optimized
252+
// truncate(col, 10) == "abc" CANNOT (would incorrectly match "abc1234567")
253+
if (static_cast<int32_t>(string_value.length()) == truncate_width) {
254+
// Rewrite: truncate(col, width) == "value" → col STARTS_WITH "value"
255+
return std::make_shared<BoundLiteralPredicate>(
256+
Expression::Operation::kStartsWith, transform_term->reference(),
257+
std::move(literal));
258+
}
259+
}
260+
}
261+
}
262+
230263
return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term),
231264
std::move(literal));
232265
}

src/iceberg/test/predicate_test.cc

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
*/
1919

2020
#include "iceberg/expression/expressions.h"
21+
#include "iceberg/expression/predicate.h"
2122
#include "iceberg/schema.h"
2223
#include "iceberg/test/matchers.h"
2324
#include "iceberg/type.h"
@@ -433,4 +434,127 @@ TEST_F(PredicateTest, ComplexExpressionCombinations) {
433434
EXPECT_EQ(nested->op(), Expression::Operation::kAnd);
434435
}
435436

437+
TEST_F(PredicateTest, TruncateOptimizationToStartsWith) {
  // Binding truncate(name, 5) == "Alice" is expected to yield
  // name STARTS_WITH "Alice" on the untransformed column.
  auto unbound = std::make_shared<UnboundPredicate<BoundTransform>>(
      Expression::Operation::kEq, Expressions::Truncate("name", 5),
      Literal::String("Alice"));

  auto binding = unbound->Bind(*schema_, /*case_sensitive=*/true);
  ASSERT_THAT(binding, IsOk());
  auto bound = binding.value();

  // The equality must have been replaced by a prefix match.
  EXPECT_EQ(bound->op(), Expression::Operation::kStartsWith);

  // The result must be a literal predicate ...
  auto* as_literal_predicate = dynamic_cast<BoundLiteralPredicate*>(bound.get());
  ASSERT_NE(as_literal_predicate, nullptr);

  // ... over a plain column reference rather than the transform ...
  EXPECT_EQ(as_literal_predicate->term()->kind(), Term::Kind::kReference);

  // ... carrying the original literal unchanged.
  EXPECT_EQ(as_literal_predicate->literal(), Literal::String("Alice"));
}
465+
466+
TEST_F(PredicateTest, TruncateOptimizationNotAppliedForNonEquality) {
  // A non-equality comparison on a truncate term must be left untouched:
  // truncate(name, 5) < "Bob" stays a kLt predicate over the transform.
  auto unbound = std::make_shared<UnboundPredicate<BoundTransform>>(
      Expression::Operation::kLt, Expressions::Truncate("name", 5),
      Literal::String("Bob"));

  auto binding = unbound->Bind(*schema_, /*case_sensitive=*/true);
  ASSERT_THAT(binding, IsOk());
  auto bound = binding.value();

  // Operation is preserved, not turned into STARTS_WITH.
  EXPECT_EQ(bound->op(), Expression::Operation::kLt);

  // The bound term is still the transform.
  auto* as_literal_predicate = dynamic_cast<BoundLiteralPredicate*>(bound.get());
  ASSERT_NE(as_literal_predicate, nullptr);
  EXPECT_EQ(as_literal_predicate->term()->kind(), Term::Kind::kTransform);
}
486+
487+
TEST_F(PredicateTest, TruncateOptimizationNotAppliedForNonString) {
  // The rewrite is restricted to string literals; an equality on a truncated
  // binary column must keep its original shape.

  // Schema with a single optional binary column named "data".
  auto schema_with_binary = std::make_shared<Schema>(
      std::vector<SchemaField>{SchemaField::MakeOptional(1, "data", binary())},
      /*schema_id=*/0);

  // truncate(data, 10) == <5 bytes>
  auto unbound = std::make_shared<UnboundPredicate<BoundTransform>>(
      Expression::Operation::kEq, Expressions::Truncate("data", 10),
      Literal::Binary({0x01, 0x02, 0x03, 0x04, 0x05}));

  auto binding = unbound->Bind(*schema_with_binary, /*case_sensitive=*/true);
  ASSERT_THAT(binding, IsOk());
  auto bound = binding.value();

  // Still an equality, not a STARTS_WITH.
  EXPECT_EQ(bound->op(), Expression::Operation::kEq);

  // Still bound to the transform term.
  auto* as_literal_predicate = dynamic_cast<BoundLiteralPredicate*>(bound.get());
  ASSERT_NE(as_literal_predicate, nullptr);
  EXPECT_EQ(as_literal_predicate->term()->kind(), Term::Kind::kTransform);
}
513+
514+
TEST_F(PredicateTest, TruncateOptimizationNotAppliedForWidthMismatch) {
  // Guards the correctness condition of the rewrite: when the literal is not
  // exactly as long as the truncate width, no rewrite may happen.
  // truncate(name, 10) == "abc" must not become STARTS_WITH "abc", because
  // "abc1234567" has that prefix yet does not truncate to "abc".
  auto unbound = std::make_shared<UnboundPredicate<BoundTransform>>(
      Expression::Operation::kEq, Expressions::Truncate("name", 10),
      Literal::String("abc"));  // 3 characters vs. width 10

  auto binding = unbound->Bind(*schema_, /*case_sensitive=*/true);
  ASSERT_THAT(binding, IsOk());
  auto bound = binding.value();

  // The predicate stays an equality.
  EXPECT_EQ(bound->op(), Expression::Operation::kEq);

  // And it is still expressed over the transform term.
  auto* as_literal_predicate = dynamic_cast<BoundLiteralPredicate*>(bound.get());
  ASSERT_NE(as_literal_predicate, nullptr);
  EXPECT_EQ(as_literal_predicate->term()->kind(), Term::Kind::kTransform);
}
537+
538+
TEST_F(PredicateTest, TruncateOptimizationAppliedWhenLengthMatches) {
  // Positive counterpart of the width-mismatch case: "Alice" has 5 characters,
  // equal to the truncate width 5, so the rewrite is expected to apply.
  auto unbound = std::make_shared<UnboundPredicate<BoundTransform>>(
      Expression::Operation::kEq, Expressions::Truncate("name", 5),
      Literal::String("Alice"));

  auto binding = unbound->Bind(*schema_, /*case_sensitive=*/true);
  ASSERT_THAT(binding, IsOk());
  auto bound = binding.value();

  // Equality was rewritten to a prefix match.
  EXPECT_EQ(bound->op(), Expression::Operation::kStartsWith);

  // The term is now a direct column reference.
  auto* as_literal_predicate = dynamic_cast<BoundLiteralPredicate*>(bound.get());
  ASSERT_NE(as_literal_predicate, nullptr);
  EXPECT_EQ(as_literal_predicate->term()->kind(), Term::Kind::kReference);
}
559+
436560
} // namespace iceberg

0 commit comments

Comments
 (0)