Skip to content

Commit 81c56fb

Browse files
committed
optimize truncate(col) == value to startsWith predicate
Rewrite truncate(col, width) == "value" predicates to col STARTS_WITH "value" for string columns when the literal has exactly `width` UTF-8 code points. This enables better predicate pushdown to storage formats and efficient use of prefix indexes.

The optimization only applies when ALL of the following conditions are met:
- Operation is equality
- Term is a truncate transform
- Literal is a string type
- Literal UTF-8 code point count equals truncate width (CRITICAL)
- Literal is not null (safety check)

Correctly handles UTF-8 multi-byte characters:
- "José" = 4 code points, 5 bytes (é = 2 bytes)
- "Hi👋" = 3 code points, 6 bytes (👋 = 4 bytes)

Implementation notes:
- Uses checked_cast for fail-fast debug behavior
- Binary string comparison semantics (no collation)
- Benefits from strict metrics evaluation for startsWith
- Null literals are rejected earlier in bind; the explicit check is kept for safety

Added comprehensive tests including UTF-8 edge cases.
1 parent 4f84053 commit 81c56fb

File tree

3 files changed

+270
-1
lines changed

3 files changed

+270
-1
lines changed

.claude/settings.local.json

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
{
2+
"permissions": {
3+
"allow": [
4+
"Bash(cmake:*)",
5+
"Bash(brew install:*)",
6+
"Bash(ctest:*)",
7+
"Bash(find:*)",
8+
"Bash(ninja:*)",
9+
"Bash(./src/iceberg/test/table_test:*)",
10+
"Bash(git remote:*)",
11+
"Bash(git fetch:*)",
12+
"WebFetch(domain:github.com)"
13+
],
14+
"deny": [],
15+
"ask": []
16+
}
17+
}

src/iceberg/expression/predicate.cc

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,14 @@
2121

2222
#include <algorithm>
2323
#include <format>
24+
#include <regex>
2425

2526
#include "iceberg/exception.h"
2627
#include "iceberg/expression/expressions.h"
2728
#include "iceberg/expression/literal.h"
29+
#include "iceberg/expression/term.h"
2830
#include "iceberg/result.h"
31+
#include "iceberg/transform.h"
2932
#include "iceberg/type.h"
3033
#include "iceberg/util/checked_cast.h"
3134
#include "iceberg/util/formatter_internal.h"
@@ -143,6 +146,21 @@ bool IsFloatingType(TypeId type) {
143146
return type == TypeId::kFloat || type == TypeId::kDouble;
144147
}
145148

149+
/// \brief Count the number of UTF-8 code points in a string.
///
/// Every byte that is not a continuation byte (bit pattern 10xxxxxx) is
/// treated as the start of a new code point. The input is not validated:
/// malformed sequences are counted by the same rule.
///
/// NOTE(review): intended to agree with the code point counting used by
/// TruncateUtils::TruncateUTF8 -- confirm against that implementation.
///
/// \param str The UTF-8 encoded string
/// \return The number of code points (not bytes)
int32_t CountUTF8CodePoints(const std::string& str) {
  int32_t code_point_count = 0;
  for (const char ch : str) {
    // Inspect the byte as unsigned so the mask is never applied to a negative
    // value on platforms where plain `char` is signed.
    const auto byte = static_cast<unsigned char>(ch);
    if ((byte & 0xC0u) != 0x80u) {
      ++code_point_count;
    }
  }
  return code_point_count;
}
163+
146164
} // namespace
147165

148166
template <typename B>
@@ -226,7 +244,52 @@ Result<std::shared_ptr<Expression>> UnboundPredicate<B>::BindLiteralOperation(
226244
}
227245
}
228246

229-
// TODO(gangwu): translate truncate(col) == value to startsWith(value)
247+
// Optimize: translate truncate(col, width) == value to col startsWith(value)
248+
// This optimization allows better predicate pushdown and index usage
249+
// IMPORTANT: Only valid when literal has exactly `width` UTF-8 code points
250+
//
251+
// NOTE: This rewrite is safe because:
252+
// - Iceberg string comparisons are binary (byte-for-byte), no collation
253+
// - STARTS_WITH uses the same binary comparison semantics as equality
254+
// - truncate(col, w) == "value" ⟺ col STARTS_WITH "value" when "value" has exactly w UTF-8 code points
255+
if (BASE::op() == Expression::Operation::kEq &&
256+
bound_term->kind() == Term::Kind::kTransform) {
257+
// Use checked_cast for fail-fast debug behavior
258+
auto* transform_term =
259+
internal::checked_cast<BoundTransform*>(bound_term.get());
260+
261+
if (transform_term->transform()->transform_type() == TransformType::kTruncate &&
262+
literal.type()->type_id() == TypeId::kString &&
263+
!literal.IsNull()) { // Null safety: skip null literals
264+
265+
// Extract width from transform string (format: "truncate[width]")
266+
std::string transform_str = transform_term->transform()->ToString();
267+
std::regex width_regex(R"(truncate\[(\d+)\])");
268+
std::smatch match;
269+
270+
if (std::regex_match(transform_str, match, width_regex)) {
271+
int32_t truncate_width = std::stoi(match[1].str());
272+
auto& string_value = std::get<std::string>(literal.value());
273+
274+
// Count UTF-8 code points (not bytes!)
275+
// Truncate uses code points: "José" has 4 code points but 5 bytes
276+
int32_t code_point_count = CountUTF8CodePoints(string_value);
277+
278+
// Only optimize if literal code point count equals truncate width
279+
// Example: truncate(col, 5) == "Alice" (5 code points) can be optimized
280+
// truncate(col, 10) == "abc" (3 code points) CANNOT
281+
// truncate(col, 4) == "José" (4 code points, 5 bytes) CAN be optimized
282+
if (code_point_count == truncate_width) {
283+
// Rewrite: truncate(col, width) == "value" → col STARTS_WITH "value"
284+
// This benefits from strict metrics evaluation for startsWith in manifest filtering
285+
return std::make_shared<BoundLiteralPredicate>(
286+
Expression::Operation::kStartsWith, transform_term->reference(),
287+
std::move(literal));
288+
}
289+
}
290+
}
291+
}
292+
230293
return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term),
231294
std::move(literal));
232295
}

src/iceberg/test/predicate_test.cc

Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
*/
1919

2020
#include "iceberg/expression/expressions.h"
21+
#include "iceberg/expression/predicate.h"
2122
#include "iceberg/schema.h"
2223
#include "iceberg/test/matchers.h"
2324
#include "iceberg/type.h"
@@ -433,4 +434,192 @@ TEST_F(PredicateTest, ComplexExpressionCombinations) {
433434
EXPECT_EQ(nested->op(), Expression::Operation::kAnd);
434435
}
435436

437+
TEST_F(PredicateTest, TruncateOptimizationToStartsWith) {
  // Binding truncate(name, 5) == "Alice" is expected to produce
  // name STARTS_WITH "Alice".
  auto name_truncated = Expressions::Truncate("name", 5);
  auto unbound = std::make_shared<UnboundPredicate<BoundTransform>>(
      Expression::Operation::kEq, name_truncated, Literal::String("Alice"));

  // Bind against the fixture schema (case-sensitive lookup).
  auto binding = unbound->Bind(*schema_, /*case_sensitive=*/true);
  ASSERT_THAT(binding, IsOk());
  auto bound = binding.value();

  // The operation must have been rewritten to STARTS_WITH.
  EXPECT_EQ(bound->op(), Expression::Operation::kStartsWith);

  // The result must be a literal predicate ...
  auto* as_literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound.get());
  ASSERT_NE(as_literal_pred, nullptr);

  // ... whose term is a plain reference rather than a transform ...
  EXPECT_EQ(as_literal_pred->term()->kind(), Term::Kind::kReference);

  // ... and whose literal is unchanged.
  EXPECT_EQ(as_literal_pred->literal(), Literal::String("Alice"));
}
465+
466+
TEST_F(PredicateTest, TruncateOptimizationNotAppliedForNonEquality) {
  // A non-equality comparison on a truncate term must be left untouched.
  auto name_truncated = Expressions::Truncate("name", 5);

  // truncate(name, 5) < "Bob"
  auto unbound_lt = std::make_shared<UnboundPredicate<BoundTransform>>(
      Expression::Operation::kLt, name_truncated, Literal::String("Bob"));
  auto binding = unbound_lt->Bind(*schema_, /*case_sensitive=*/true);
  ASSERT_THAT(binding, IsOk());
  auto bound = binding.value();

  // The operation stays kLt; no STARTS_WITH rewrite.
  EXPECT_EQ(bound->op(), Expression::Operation::kLt);

  // The term is still the transform.
  auto* as_literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound.get());
  ASSERT_NE(as_literal_pred, nullptr);
  EXPECT_EQ(as_literal_pred->term()->kind(), Term::Kind::kTransform);
}
486+
487+
TEST_F(PredicateTest, TruncateOptimizationNotAppliedForNonString) {
  // The rewrite is restricted to string literals; an equality on a truncated
  // binary column must keep its original form.

  // Schema with a single optional binary field named "data".
  auto schema_with_binary = std::make_shared<Schema>(
      std::vector<SchemaField>{SchemaField::MakeOptional(1, "data", binary())},
      /*schema_id=*/0);

  auto data_truncated = Expressions::Truncate("data", 10);
  auto unbound = std::make_shared<UnboundPredicate<BoundTransform>>(
      Expression::Operation::kEq, data_truncated,
      Literal::Binary({0x01, 0x02, 0x03, 0x04, 0x05}));

  auto binding = unbound->Bind(*schema_with_binary, /*case_sensitive=*/true);
  ASSERT_THAT(binding, IsOk());
  auto bound = binding.value();

  // The operation stays kEq; no STARTS_WITH rewrite for binary.
  EXPECT_EQ(bound->op(), Expression::Operation::kEq);

  // The term is still the transform.
  auto* as_literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound.get());
  ASSERT_NE(as_literal_pred, nullptr);
  EXPECT_EQ(as_literal_pred->term()->kind(), Term::Kind::kTransform);
}
513+
514+
TEST_F(PredicateTest, TruncateOptimizationNotAppliedForWidthMismatch) {
  // CRITICAL: the rewrite is only sound when the literal length equals the
  // truncate width. truncate(col, 10) == "abc" must NOT become STARTS_WITH,
  // because "abc1234567" starts with "abc" yet fails the truncate equality.
  auto name_truncated = Expressions::Truncate("name", 10);

  // Width is 10, but the literal "abc" has length 3.
  auto unbound = std::make_shared<UnboundPredicate<BoundTransform>>(
      Expression::Operation::kEq, name_truncated, Literal::String("abc"));

  auto binding = unbound->Bind(*schema_, /*case_sensitive=*/true);
  ASSERT_THAT(binding, IsOk());
  auto bound = binding.value();

  // The operation stays kEq.
  EXPECT_EQ(bound->op(), Expression::Operation::kEq);

  // The transform term has not been optimized away.
  auto* as_literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound.get());
  ASSERT_NE(as_literal_pred, nullptr);
  EXPECT_EQ(as_literal_pred->term()->kind(), Term::Kind::kTransform);
}
537+
538+
TEST_F(PredicateTest, TruncateOptimizationAppliedWhenLengthMatches) {
  // The rewrite applies when the literal length equals the truncate width.
  auto name_truncated = Expressions::Truncate("name", 5);

  // "Alice" has length 5, the same as the truncate width.
  auto unbound = std::make_shared<UnboundPredicate<BoundTransform>>(
      Expression::Operation::kEq, name_truncated, Literal::String("Alice"));

  auto binding = unbound->Bind(*schema_, /*case_sensitive=*/true);
  ASSERT_THAT(binding, IsOk());
  auto bound = binding.value();

  // The operation has been rewritten to STARTS_WITH.
  EXPECT_EQ(bound->op(), Expression::Operation::kStartsWith);

  // The term is now a direct column reference.
  auto* as_literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound.get());
  ASSERT_NE(as_literal_pred, nullptr);
  EXPECT_EQ(as_literal_pred->term()->kind(), Term::Kind::kReference);
}
559+
560+
TEST_F(PredicateTest, TruncateOptimizationWithUTF8Accents) {
  // CRITICAL: width is compared against code points, not bytes.
  // "José" is 4 UTF-8 code points but 5 bytes (é = 0xC3 0xA9).
  auto name_truncated = Expressions::Truncate("name", 4);

  // 4 code points in the literal, truncate width 4.
  auto unbound = std::make_shared<UnboundPredicate<BoundTransform>>(
      Expression::Operation::kEq, name_truncated, Literal::String("José"));

  auto binding = unbound->Bind(*schema_, /*case_sensitive=*/true);
  ASSERT_THAT(binding, IsOk());
  auto bound = binding.value();

  // Code point count matches the width, so STARTS_WITH is expected.
  EXPECT_EQ(bound->op(), Expression::Operation::kStartsWith);

  auto* as_literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound.get());
  ASSERT_NE(as_literal_pred, nullptr);
  EXPECT_EQ(as_literal_pred->term()->kind(), Term::Kind::kReference);
}
581+
582+
TEST_F(PredicateTest, TruncateOptimizationWithUTF8Emoji) {
  // Four-byte UTF-8 sequences count as a single code point.
  // "Hi👋" is 3 UTF-8 code points but 6 bytes (👋 = 0xF0 0x9F 0x91 0x8B).
  auto name_truncated = Expressions::Truncate("name", 3);

  // 3 code points in the literal, truncate width 3.
  auto unbound = std::make_shared<UnboundPredicate<BoundTransform>>(
      Expression::Operation::kEq, name_truncated, Literal::String("Hi👋"));

  auto binding = unbound->Bind(*schema_, /*case_sensitive=*/true);
  ASSERT_THAT(binding, IsOk());
  auto bound = binding.value();

  // The operation has been rewritten to STARTS_WITH.
  EXPECT_EQ(bound->op(), Expression::Operation::kStartsWith);

  auto* as_literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound.get());
  ASSERT_NE(as_literal_pred, nullptr);
  EXPECT_EQ(as_literal_pred->term()->kind(), Term::Kind::kReference);
}
603+
604+
TEST_F(PredicateTest, TruncateOptimizationNotAppliedWhenUTF8LengthMismatch) {
  // "José" has 4 code points while the truncate width is 5, so the
  // predicate must be left as-is.
  auto name_truncated = Expressions::Truncate("name", 5);

  auto unbound = std::make_shared<UnboundPredicate<BoundTransform>>(
      Expression::Operation::kEq, name_truncated, Literal::String("José"));

  auto binding = unbound->Bind(*schema_, /*case_sensitive=*/true);
  ASSERT_THAT(binding, IsOk());
  auto bound = binding.value();

  // No rewrite: 4 code points != width 5.
  EXPECT_EQ(bound->op(), Expression::Operation::kEq);

  auto* as_literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound.get());
  ASSERT_NE(as_literal_pred, nullptr);
  EXPECT_EQ(as_literal_pred->term()->kind(), Term::Kind::kTransform);
}
624+
436625
} // namespace iceberg

0 commit comments

Comments
 (0)