Skip to content

Commit a777ab7

Browse files
committed
optimize truncate(col) == value to startsWith predicate
Rewrite truncate(col, width) == "value" predicates to col STARTS_WITH "value" for string columns when the literal has exactly `width` UTF-8 code points. This enables better predicate pushdown to storage formats and efficient use of prefix indexes.

The optimization applies only when ALL of the following conditions are met:
- Operation is equality
- Term is a truncate transform
- Literal is a string type
- Literal UTF-8 code point count equals truncate width (CRITICAL)
- Literal is not null (safety check)
- Width is not zero (Transform rejects this, but guard added for safety)

Correctly handles UTF-8 multi-byte characters:
- "José" = 4 code points, 5 bytes (é = 2 bytes)
- "Hi👋" = 3 code points, 6 bytes (👋 = 4 bytes)

Implementation details:
- Uses checked_cast for fail-fast debug behavior
- Binary string comparison semantics (no collation issues)
- Benefits from strict metrics evaluation for startsWith
- Static regex to avoid recompilation overhead
- Counts code points (not grapheme clusters) per Iceberg spec
- Short-string invariance: when the source has fewer than `width` code points, truncate returns the full string, so equality implies exact match and STARTS_WITH remains valid (such a source can neither equal nor start with a literal of exactly `width` code points, so both forms reject it)
- TODO: Replace ToString/regex with direct width getter when available

Comprehensive tests including UTF-8 edge cases and empty strings.
1 parent 4f84053 commit a777ab7

File tree

2 files changed

+296
-1
lines changed

2 files changed

+296
-1
lines changed

src/iceberg/expression/predicate.cc

Lines changed: 85 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,14 @@
2121

2222
#include <algorithm>
2323
#include <format>
24+
#include <regex>
2425

2526
#include "iceberg/exception.h"
2627
#include "iceberg/expression/expressions.h"
2728
#include "iceberg/expression/literal.h"
29+
#include "iceberg/expression/term.h"
2830
#include "iceberg/result.h"
31+
#include "iceberg/transform.h"
2932
#include "iceberg/type.h"
3033
#include "iceberg/util/checked_cast.h"
3134
#include "iceberg/util/formatter_internal.h"
@@ -143,6 +146,27 @@ bool IsFloatingType(TypeId type) {
143146
return type == TypeId::kFloat || type == TypeId::kDouble;
144147
}
145148

149+
/// \brief Return how many code points a UTF-8 encoded string contains.
///
/// A byte starts a new code point unless it is a continuation byte, i.e. unless
/// its two most significant bits are `10`. The result is therefore the number of
/// non-continuation bytes. The input is not validated; stray continuation bytes
/// are simply not counted.
///
/// Meant to count the same way as TruncateUtils::TruncateUTF8 (defined elsewhere;
/// not verified from this file).
///
/// NOTE: Code points are counted, not grapheme clusters (user-perceived
/// characters). A combining sequence such as U+0065 U+0301 ("e" followed by a
/// combining acute accent) yields 2, while the precomposed U+00E9 yields 1.
///
/// \param str The UTF-8 encoded string
/// \return The number of code points (not bytes, not graphemes)
inline int32_t CountUTF8CodePoints(const std::string& str) {
  // Mask selecting the two high bits, and the value they take on a continuation byte.
  constexpr unsigned char kHighTwoBits = 0xC0;
  constexpr unsigned char kContinuationTag = 0x80;

  int32_t total = 0;
  for (const char raw_byte : str) {
    const auto byte = static_cast<unsigned char>(raw_byte);
    const bool begins_code_point = (byte & kHighTwoBits) != kContinuationTag;
    total += begins_code_point ? 1 : 0;
  }
  return total;
}
169+
146170
} // namespace
147171

148172
template <typename B>
@@ -226,7 +250,67 @@ Result<std::shared_ptr<Expression>> UnboundPredicate<B>::BindLiteralOperation(
226250
}
227251
}
228252

229-
// TODO(gangwu): translate truncate(col) == value to startsWith(value)
253+
// Optimize: translate truncate(col, width) == value to col startsWith(value)
254+
// This optimization allows better predicate pushdown and index usage
255+
// IMPORTANT: Only valid when literal has exactly `width` UTF-8 code points
256+
//
257+
// NOTE: This rewrite is safe because:
258+
// - Iceberg string comparisons are binary (byte-for-byte), no collation
259+
// - STARTS_WITH uses the same binary comparison semantics as equality
260+
// - truncate(col, w) == "value" ⟺ col STARTS_WITH "value" when len(value) == w
261+
// - When source has < w code points, truncate returns full string; equality
262+
// implies exact match, so STARTS_WITH remains valid (short-string invariance)
263+
if (BASE::op() == Expression::Operation::kEq &&
264+
bound_term->kind() == Term::Kind::kTransform) {
265+
// Use checked_cast for fail-fast debug behavior
266+
auto* transform_term =
267+
internal::checked_cast<BoundTransform*>(bound_term.get());
268+
269+
if (transform_term->transform()->transform_type() == TransformType::kTruncate &&
270+
literal.type()->type_id() == TypeId::kString &&
271+
!literal.IsNull()) { // Null safety: skip null literals
272+
273+
// TODO: Avoid ToString/regex parsing once Transform API exposes width directly
274+
// (e.g., TruncateTransform::width() getter would be cleaner and faster)
275+
// Extract width from transform string (format: "truncate[width]")
276+
std::string transform_str = transform_term->transform()->ToString();
277+
278+
// Static regex to avoid recompilation on each bind (micro-optimization)
279+
static const std::regex width_regex(R"(truncate\[(\d+)\])");
280+
std::smatch match;
281+
282+
if (std::regex_match(transform_str, match, width_regex)) {
283+
int32_t truncate_width = std::stoi(match[1].str());
284+
285+
// Skip width=0: truncate(col, 0) == "" would rewrite to STARTS_WITH("")
286+
// which is tautologically true and could accidentally broaden filters
287+
if (truncate_width == 0) {
288+
// Don't optimize; let the normal predicate handle this edge case
289+
return std::make_shared<BoundLiteralPredicate>(
290+
BASE::op(), std::move(bound_term), std::move(literal));
291+
}
292+
293+
auto& string_value = std::get<std::string>(literal.value());
294+
295+
// Count UTF-8 code points (not bytes!)
296+
// Truncate uses code points: "José" has 4 code points but 5 bytes
297+
int32_t code_point_count = CountUTF8CodePoints(string_value);
298+
299+
// Only optimize if literal code point count equals truncate width
300+
// Example: truncate(col, 5) == "Alice" (5 code points) can be optimized
301+
// truncate(col, 10) == "abc" (3 code points) CANNOT
302+
// truncate(col, 4) == "José" (4 code points, 5 bytes) CAN be optimized
303+
if (code_point_count == truncate_width) {
304+
// Rewrite: truncate(col, width) == "value" → col STARTS_WITH "value"
305+
// This benefits from strict metrics evaluation for startsWith in manifest filtering
306+
return std::make_shared<BoundLiteralPredicate>(
307+
Expression::Operation::kStartsWith, transform_term->reference(),
308+
std::move(literal));
309+
}
310+
}
311+
}
312+
}
313+
230314
return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term),
231315
std::move(literal));
232316
}

src/iceberg/test/predicate_test.cc

Lines changed: 211 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
*/
1919

2020
#include "iceberg/expression/expressions.h"
21+
#include "iceberg/expression/predicate.h"
2122
#include "iceberg/schema.h"
2223
#include "iceberg/test/matchers.h"
2324
#include "iceberg/type.h"
@@ -433,4 +434,214 @@ TEST_F(PredicateTest, ComplexExpressionCombinations) {
433434
EXPECT_EQ(nested->op(), Expression::Operation::kAnd);
434435
}
435436

437+
TEST_F(PredicateTest, TruncateOptimizationToStartsWith) {
  // Binding truncate(name, 5) == "Alice" is expected to produce
  // name STARTS_WITH "Alice".
  using Op = Expression::Operation;

  auto name_prefix = Expressions::Truncate("name", 5);
  auto unbound = std::make_shared<UnboundPredicate<BoundTransform>>(
      Op::kEq, name_prefix, Literal::String("Alice"));

  auto binding = unbound->Bind(*schema_, /*case_sensitive=*/true);
  ASSERT_THAT(binding, IsOk());
  auto rewritten = binding.value();

  // The equality has been replaced by a prefix match.
  EXPECT_EQ(rewritten->op(), Op::kStartsWith);

  // The result is a literal predicate.
  auto* as_literal = dynamic_cast<BoundLiteralPredicate*>(rewritten.get());
  ASSERT_NE(as_literal, nullptr);

  // It targets the column reference directly rather than the transform.
  EXPECT_EQ(as_literal->term()->kind(), Term::Kind::kReference);

  // The literal is carried over unchanged.
  EXPECT_EQ(as_literal->literal(), Literal::String("Alice"));
}
465+
466+
TEST_F(PredicateTest, TruncateOptimizationNotAppliedForNonEquality) {
  // Only equality is rewritten; truncate(name, 5) < "Bob" must bind unchanged.
  using Op = Expression::Operation;

  auto name_prefix = Expressions::Truncate("name", 5);
  auto unbound_lt = std::make_shared<UnboundPredicate<BoundTransform>>(
      Op::kLt, name_prefix, Literal::String("Bob"));

  auto binding = unbound_lt->Bind(*schema_, /*case_sensitive=*/true);
  ASSERT_THAT(binding, IsOk());
  auto bound = binding.value();

  // The operation is still less-than.
  EXPECT_EQ(bound->op(), Op::kLt);

  // The predicate still applies to the transform term.
  auto* as_literal = dynamic_cast<BoundLiteralPredicate*>(bound.get());
  ASSERT_NE(as_literal, nullptr);
  EXPECT_EQ(as_literal->term()->kind(), Term::Kind::kTransform);
}
486+
487+
TEST_F(PredicateTest, TruncateOptimizationNotAppliedForNonString) {
  // The rewrite is limited to string literals. A truncate over a binary column
  // compared with a binary literal must be left as an equality on the transform.
  using Op = Expression::Operation;

  // Single-column schema whose only field is an optional binary "data".
  auto schema_with_binary = std::make_shared<Schema>(
      std::vector<SchemaField>{SchemaField::MakeOptional(1, "data", binary())},
      /*schema_id=*/0);

  auto data_prefix = Expressions::Truncate("data", 10);
  auto unbound = std::make_shared<UnboundPredicate<BoundTransform>>(
      Op::kEq, data_prefix, Literal::Binary({0x01, 0x02, 0x03, 0x04, 0x05}));

  auto binding = unbound->Bind(*schema_with_binary, /*case_sensitive=*/true);
  ASSERT_THAT(binding, IsOk());
  auto bound = binding.value();

  // No conversion to STARTS_WITH for binary.
  EXPECT_EQ(bound->op(), Op::kEq);

  // The transform term is preserved.
  auto* as_literal = dynamic_cast<BoundLiteralPredicate*>(bound.get());
  ASSERT_NE(as_literal, nullptr);
  EXPECT_EQ(as_literal->term()->kind(), Term::Kind::kTransform);
}
513+
514+
TEST_F(PredicateTest, TruncateOptimizationNotAppliedForWidthMismatch) {
  // CRITICAL: the rewrite is only sound when the literal length equals the
  // truncate width. With truncate(name, 10) == "abc", a value such as
  // "abc1234567" starts with "abc" yet does not satisfy the truncate equality,
  // so the predicate must not become STARTS_WITH.
  using Op = Expression::Operation;

  auto name_prefix = Expressions::Truncate("name", 10);

  // Literal of length 3 against width 10.
  auto unbound = std::make_shared<UnboundPredicate<BoundTransform>>(
      Op::kEq, name_prefix, Literal::String("abc"));

  auto binding = unbound->Bind(*schema_, /*case_sensitive=*/true);
  ASSERT_THAT(binding, IsOk());
  auto bound = binding.value();

  // Equality is retained.
  EXPECT_EQ(bound->op(), Op::kEq);

  // The transform has not been stripped away.
  auto* as_literal = dynamic_cast<BoundLiteralPredicate*>(bound.get());
  ASSERT_NE(as_literal, nullptr);
  EXPECT_EQ(as_literal->term()->kind(), Term::Kind::kTransform);
}
537+
538+
TEST_F(PredicateTest, TruncateOptimizationAppliedWhenLengthMatches) {
  // A literal whose length equals the truncate width triggers the rewrite:
  // "Alice" has length 5 and the width is 5.
  using Op = Expression::Operation;

  auto name_prefix = Expressions::Truncate("name", 5);
  auto unbound = std::make_shared<UnboundPredicate<BoundTransform>>(
      Op::kEq, name_prefix, Literal::String("Alice"));

  auto binding = unbound->Bind(*schema_, /*case_sensitive=*/true);
  ASSERT_THAT(binding, IsOk());
  auto rewritten = binding.value();

  // Rewritten to a prefix match.
  EXPECT_EQ(rewritten->op(), Op::kStartsWith);

  // The term is now the plain column reference.
  auto* as_literal = dynamic_cast<BoundLiteralPredicate*>(rewritten.get());
  ASSERT_NE(as_literal, nullptr);
  EXPECT_EQ(as_literal->term()->kind(), Term::Kind::kReference);
}
559+
560+
TEST_F(PredicateTest, TruncateOptimizationWithUTF8Accents) {
  // CRITICAL: width is compared against code points, not bytes.
  // "José" is 4 code points but 5 bytes (é encodes as 0xC3 0xA9), so it
  // matches a truncate width of 4.
  using Op = Expression::Operation;

  auto name_prefix = Expressions::Truncate("name", 4);
  auto unbound = std::make_shared<UnboundPredicate<BoundTransform>>(
      Op::kEq, name_prefix, Literal::String("José"));

  auto binding = unbound->Bind(*schema_, /*case_sensitive=*/true);
  ASSERT_THAT(binding, IsOk());
  auto rewritten = binding.value();

  // Code point count equals the width, so the rewrite applies.
  EXPECT_EQ(rewritten->op(), Op::kStartsWith);

  auto* as_literal = dynamic_cast<BoundLiteralPredicate*>(rewritten.get());
  ASSERT_NE(as_literal, nullptr);
  EXPECT_EQ(as_literal->term()->kind(), Term::Kind::kReference);
}
581+
582+
TEST_F(PredicateTest, TruncateOptimizationWithUTF8Emoji) {
  // Four-byte UTF-8 sequences count as a single code point.
  // "Hi👋" is 3 code points but 6 bytes (👋 encodes as 0xF0 0x9F 0x91 0x8B),
  // so it matches a truncate width of 3.
  using Op = Expression::Operation;

  auto name_prefix = Expressions::Truncate("name", 3);
  auto unbound = std::make_shared<UnboundPredicate<BoundTransform>>(
      Op::kEq, name_prefix, Literal::String("Hi👋"));

  auto binding = unbound->Bind(*schema_, /*case_sensitive=*/true);
  ASSERT_THAT(binding, IsOk());
  auto rewritten = binding.value();

  // Rewritten to a prefix match.
  EXPECT_EQ(rewritten->op(), Op::kStartsWith);

  auto* as_literal = dynamic_cast<BoundLiteralPredicate*>(rewritten.get());
  ASSERT_NE(as_literal, nullptr);
  EXPECT_EQ(as_literal->term()->kind(), Term::Kind::kReference);
}
603+
604+
TEST_F(PredicateTest, TruncateOptimizationNotAppliedWhenUTF8LengthMismatch) {
  // "José" is 5 bytes but only 4 code points. Against a width of 5 the code
  // point count does not match, so no rewrite may happen.
  using Op = Expression::Operation;

  auto name_prefix = Expressions::Truncate("name", 5);
  auto unbound = std::make_shared<UnboundPredicate<BoundTransform>>(
      Op::kEq, name_prefix, Literal::String("José"));

  auto binding = unbound->Bind(*schema_, /*case_sensitive=*/true);
  ASSERT_THAT(binding, IsOk());
  auto bound = binding.value();

  // 4 code points != width 5: equality is kept.
  EXPECT_EQ(bound->op(), Op::kEq);

  auto* as_literal = dynamic_cast<BoundLiteralPredicate*>(bound.get());
  ASSERT_NE(as_literal, nullptr);
  EXPECT_EQ(as_literal->term()->kind(), Term::Kind::kTransform);
}
624+
625+
TEST_F(PredicateTest, TruncateOptimizationNotAppliedForEmptyLiteralWithNonZeroWidth) {
  // An empty literal has 0 code points, which cannot equal a positive width,
  // so the predicate must not be rewritten.
  // NOTE: width=0 is rejected by Transform::Truncate, so not tested here
  using Op = Expression::Operation;

  auto name_prefix = Expressions::Truncate("name", 5);
  auto unbound = std::make_shared<UnboundPredicate<BoundTransform>>(
      Op::kEq, name_prefix, Literal::String(""));

  auto binding = unbound->Bind(*schema_, /*case_sensitive=*/true);
  ASSERT_THAT(binding, IsOk());
  auto bound = binding.value();

  // 0 code points != width 5: equality is kept.
  EXPECT_EQ(bound->op(), Op::kEq);

  auto* as_literal = dynamic_cast<BoundLiteralPredicate*>(bound.get());
  ASSERT_NE(as_literal, nullptr);
  EXPECT_EQ(as_literal->term()->kind(), Term::Kind::kTransform);
}
646+
436647
} // namespace iceberg

0 commit comments

Comments
 (0)