optimize truncate(col) == value to startsWith predicate

shangxinli · shangxinli · commit a777ab7e924a · 2025-11-03T08:52:41.000-08:00
Rewrite truncate(col, width) == "value" predicates to col STARTS_WITH "value"
for string columns when the literal has exactly `width` UTF-8 code points.
This enables better predicate pushdown to storage formats and efficient use
of prefix indexes.

The optimization applies when ALL conditions are met:
- Operation is equality
- Term is a truncate transform
- Literal is a string type
- Literal UTF-8 code point count equals truncate width (CRITICAL)
- Literal is not null (safety check)
- Width is not zero (Transform rejects this, but guard added for safety)

Correctly handles UTF-8 multi-byte characters:
- "José" = 4 code points, 5 bytes (é = 2 bytes)
- "Hi👋" = 3 code points, 6 bytes (👋 = 4 bytes)

Implementation details:
- Uses checked_cast for fail-fast debug behavior
- Binary string comparison semantics (no collation issues)
- Benefits from strict metrics evaluation for startsWith
- Static regex to avoid recompilation overhead
- Counts code points (not grapheme clusters) per Iceberg spec
- Short-string invariance: when the source has < width code points, truncate
  returns the full string, so equality implies an exact match and STARTS_WITH remains valid
- TODO: Replace ToString/regex with direct width getter when available

Comprehensive tests including UTF-8 edge cases and empty strings.
diff --git a/src/iceberg/expression/predicate.cc b/src/iceberg/expression/predicate.cc
@@ -21,11 +21,14 @@
 
 #include <algorithm>
 #include <format>
+#include <regex>
 
 #include "iceberg/exception.h"
 #include "iceberg/expression/expressions.h"
 #include "iceberg/expression/literal.h"
+#include "iceberg/expression/term.h"
 #include "iceberg/result.h"
+#include "iceberg/transform.h"
 #include "iceberg/type.h"
 #include "iceberg/util/checked_cast.h"
 #include "iceberg/util/formatter_internal.h"
@@ -143,6 +146,27 @@ bool IsFloatingType(TypeId type) {
   return type == TypeId::kFloat || type == TypeId::kDouble;
 }
 
+/// \brief Count the UTF-8 code points in a string by counting non-continuation bytes.
+/// Intended to agree with TruncateUtils::TruncateUTF8 (not visible here — TODO confirm).
+///
+/// NOTE: This counts code points, not grapheme clusters (user-perceived characters),
+/// so a combining mark is counted separately from the character it modifies.
+/// Example: "é" as e + combining-acute (U+0065 U+0301) = 2 code points,
+///          but "é" as single precomposed character (U+00E9) = 1 code point.
+/// The input is not validated: every byte that is not 10xxxxxx adds one to the count.
+/// \param str The string to scan; expected to be well-formed UTF-8
+/// \return The number of bytes in `str` that are not UTF-8 continuation bytes
+inline int32_t CountUTF8CodePoints(const std::string& str) {
+  int32_t code_point_count = 0;
+  for (size_t i = 0; i < str.size(); ++i) {
+    // A byte begins a code point unless its top two bits are 10 (continuation byte)
+    if ((str[i] & 0xC0) != 0x80) {
+      code_point_count++;
+    }
+  }
+  return code_point_count;
+}
+
 }  // namespace
 
 template <typename B>
@@ -226,7 +250,67 @@ Result<std::shared_ptr<Expression>> UnboundPredicate<B>::BindLiteralOperation(
     }
   }
 
-  // TODO(gangwu): translate truncate(col) == value to startsWith(value)
+  // Optimize: translate truncate(col, width) == value to col startsWith(value)
+  // This optimization allows better predicate pushdown and index usage
+  // IMPORTANT: Only valid when literal has exactly `width` UTF-8 code points
+  //
+  // NOTE: This rewrite is safe because:
+  // - Iceberg string comparisons are binary (byte-for-byte), no collation
+  // - STARTS_WITH uses the same binary comparison semantics as equality
+  // - truncate(col, w) == "value" ⟺ col STARTS_WITH "value" when len(value) == w
+  // - When source has < w code points, truncate returns full string; equality
+  //   implies exact match, so STARTS_WITH remains valid (short-string invariance)
+  if (BASE::op() == Expression::Operation::kEq &&
+      bound_term->kind() == Term::Kind::kTransform) {
+    // Use checked_cast for fail-fast debug behavior
+    auto* transform_term =
+        internal::checked_cast<BoundTransform*>(bound_term.get());
+
+    if (transform_term->transform()->transform_type() == TransformType::kTruncate &&
+        literal.type()->type_id() == TypeId::kString &&
+        !literal.IsNull()) {  // Null safety: skip null literals
+
+      // TODO: Avoid ToString/regex parsing once Transform API exposes width directly
+      // (e.g., TruncateTransform::width() getter would be cleaner and faster)
+      // Extract width from transform string (format: "truncate[width]")
+      std::string transform_str = transform_term->transform()->ToString();
+
+      // Static regex to avoid recompilation on each bind (micro-optimization)
+      static const std::regex width_regex(R"(truncate\[(\d+)\])");
+      std::smatch match;
+
+      if (std::regex_match(transform_str, match, width_regex)) {
+        int32_t truncate_width = std::stoi(match[1].str());
+
+        // Skip width=0: truncate(col, 0) == "" would rewrite to STARTS_WITH("")
+        // which is tautologically true and could accidentally broaden filters
+        if (truncate_width == 0) {
+          // Don't optimize; let the normal predicate handle this edge case
+          return std::make_shared<BoundLiteralPredicate>(
+              BASE::op(), std::move(bound_term), std::move(literal));
+        }
+
+        auto& string_value = std::get<std::string>(literal.value());
+
+        // Count UTF-8 code points (not bytes!)
+        // Truncate uses code points: "José" has 4 code points but 5 bytes
+        int32_t code_point_count = CountUTF8CodePoints(string_value);
+
+        // Only optimize if literal code point count equals truncate width
+        // Example: truncate(col, 5) == "Alice" (5 code points) can be optimized
+        //          truncate(col, 10) == "abc" (3 code points) CANNOT
+        //          truncate(col, 4) == "José" (4 code points, 5 bytes) CAN be optimized
+        if (code_point_count == truncate_width) {
+          // Rewrite: truncate(col, width) == "value" → col STARTS_WITH "value"
+          // This benefits from strict metrics evaluation for startsWith in manifest filtering
+          return std::make_shared<BoundLiteralPredicate>(
+              Expression::Operation::kStartsWith, transform_term->reference(),
+              std::move(literal));
+        }
+      }
+    }
+  }
+
   return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term),
                                                  std::move(literal));
 }
diff --git a/src/iceberg/test/predicate_test.cc b/src/iceberg/test/predicate_test.cc
@@ -18,6 +18,7 @@
  */
 
 #include "iceberg/expression/expressions.h"
+#include "iceberg/expression/predicate.h"
 #include "iceberg/schema.h"
 #include "iceberg/test/matchers.h"
 #include "iceberg/type.h"
@@ -433,4 +434,214 @@ TEST_F(PredicateTest, ComplexExpressionCombinations) {
   EXPECT_EQ(nested->op(), Expression::Operation::kAnd);
 }
 
+TEST_F(PredicateTest, TruncateOptimizationToStartsWith) {
+  // Binding truncate(name, 5) == "Alice" should yield name STARTS_WITH "Alice"
+
+  // Build the unbound transform term truncate(name, 5)
+  auto truncate_expr = Expressions::Truncate("name", 5);
+
+  // Unbound predicate: truncate(name, 5) == "Alice" (literal has exactly 5 code points)
+  auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>(
+      Expression::Operation::kEq, truncate_expr, Literal::String("Alice"));
+
+  // Bind against the fixture schema (assumed to contain a string column "name")
+  auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true);
+  ASSERT_THAT(bound_result, IsOk());
+  auto bound_pred = bound_result.value();
+
+  // The operation must have been rewritten from kEq to kStartsWith
+  EXPECT_EQ(bound_pred->op(), Expression::Operation::kStartsWith);
+
+  // The rewritten expression must still be a BoundLiteralPredicate
+  auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get());
+  ASSERT_NE(literal_pred, nullptr);
+
+  // The transform is dropped: the term is a plain reference, not a transform
+  EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kReference);
+
+  // The literal is carried over unchanged
+  EXPECT_EQ(literal_pred->literal(), Literal::String("Alice"));
+}
+
+TEST_F(PredicateTest, TruncateOptimizationNotAppliedForNonEquality) {
+  // The STARTS_WITH rewrite is gated on kEq; any other operation must bind unchanged
+
+  auto truncate_expr = Expressions::Truncate("name", 5);
+
+  // truncate(name, 5) < "Bob": a less-than comparison, so no rewrite is expected
+  auto truncate_lt_pred = std::make_shared<UnboundPredicate<BoundTransform>>(
+      Expression::Operation::kLt, truncate_expr, Literal::String("Bob"));
+  auto bound_lt_result = truncate_lt_pred->Bind(*schema_, /*case_sensitive=*/true);
+  ASSERT_THAT(bound_lt_result, IsOk());
+  auto bound_lt = bound_lt_result.value();
+
+  // The operation stays kLt rather than becoming kStartsWith
+  EXPECT_EQ(bound_lt->op(), Expression::Operation::kLt);
+
+  // The bound term keeps the truncate transform instead of collapsing to a reference
+  auto* literal_pred_lt = dynamic_cast<BoundLiteralPredicate*>(bound_lt.get());
+  ASSERT_NE(literal_pred_lt, nullptr);
+  EXPECT_EQ(literal_pred_lt->term()->kind(), Term::Kind::kTransform);
+}
+
+TEST_F(PredicateTest, TruncateOptimizationNotAppliedForNonString) {
+  // The rewrite requires a string literal; a binary literal must bind unchanged.
+  // NOTE(review): the 5-byte literal also differs from width 10; a 10-byte one would isolate the type guard
+
+  // Standalone schema with a single optional binary column "data" (field id 1)
+  auto binary_schema = std::make_shared<Schema>(
+      std::vector<SchemaField>{SchemaField::MakeOptional(1, "data", binary())},
+      /*schema_id=*/0);
+
+  auto truncate_expr = Expressions::Truncate("data", 10);
+  auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>(
+      Expression::Operation::kEq, truncate_expr,
+      Literal::Binary({0x01, 0x02, 0x03, 0x04, 0x05}));
+
+  auto bound_result = truncate_eq_pred->Bind(*binary_schema, /*case_sensitive=*/true);
+  ASSERT_THAT(bound_result, IsOk());
+  auto bound_pred = bound_result.value();
+
+  // The operation stays kEq: the rewrite only fires when the literal type is string
+  EXPECT_EQ(bound_pred->op(), Expression::Operation::kEq);
+
+  // The bound term keeps the truncate transform instead of collapsing to a reference
+  auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get());
+  ASSERT_NE(literal_pred, nullptr);
+  EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kTransform);
+}
+
+TEST_F(PredicateTest, TruncateOptimizationNotAppliedForWidthMismatch) {
+  // Correctness guard: no rewrite when the literal's code point count != truncate width.
+  // truncate(col, 10) == "abc" must NOT become col STARTS_WITH "abc", because a value
+  // such as "abc1234567" starts with "abc" yet truncates to itself, which is not "abc"
+
+  auto truncate_expr = Expressions::Truncate("name", 10);
+
+  // The literal "abc" has 3 code points while the truncate width is 10
+  auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>(
+      Expression::Operation::kEq, truncate_expr, Literal::String("abc"));
+
+  auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true);
+  ASSERT_THAT(bound_result, IsOk());
+  auto bound_pred = bound_result.value();
+
+  // The operation stays kEq rather than becoming kStartsWith
+  EXPECT_EQ(bound_pred->op(), Expression::Operation::kEq);
+
+  // The bound term keeps the truncate transform instead of collapsing to a reference
+  auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get());
+  ASSERT_NE(literal_pred, nullptr);
+  EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kTransform);
+}
+
+TEST_F(PredicateTest, TruncateOptimizationAppliedWhenLengthMatches) {
+  // Positive case: same inputs as TruncateOptimizationToStartsWith (width 5, "Alice")
+
+  auto truncate_expr = Expressions::Truncate("name", 5);
+
+  // The literal "Alice" has 5 code points, equal to the truncate width of 5
+  auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>(
+      Expression::Operation::kEq, truncate_expr, Literal::String("Alice"));
+
+  auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true);
+  ASSERT_THAT(bound_result, IsOk());
+  auto bound_pred = bound_result.value();
+
+  // The operation must have been rewritten from kEq to kStartsWith
+  EXPECT_EQ(bound_pred->op(), Expression::Operation::kStartsWith);
+
+  // The transform is dropped: the term is a plain reference, not a transform
+  auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get());
+  ASSERT_NE(literal_pred, nullptr);
+  EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kReference);
+}
+
+TEST_F(PredicateTest, TruncateOptimizationWithUTF8Accents) {
+  // The width comparison must use code points, not bytes:
+  // "José" is 4 UTF-8 code points but 5 bytes (é encodes as 0xC3 0xA9)
+
+  auto truncate_expr = Expressions::Truncate("name", 4);
+
+  // 4 code points equals width 4; a byte-length check (5) would wrongly skip the rewrite
+  auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>(
+      Expression::Operation::kEq, truncate_expr, Literal::String("José"));
+
+  auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true);
+  ASSERT_THAT(bound_result, IsOk());
+  auto bound_pred = bound_result.value();
+
+  // The operation must have been rewritten to kStartsWith (code point count matches)
+  EXPECT_EQ(bound_pred->op(), Expression::Operation::kStartsWith);
+
+  auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get());
+  ASSERT_NE(literal_pred, nullptr);
+  EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kReference);
+}
+
+TEST_F(PredicateTest, TruncateOptimizationWithUTF8Emoji) {
+  // Covers a 4-byte UTF-8 sequence:
+  // "Hi👋" is 3 UTF-8 code points but 6 bytes (👋 encodes as 0xF0 0x9F 0x91 0x8B)
+
+  auto truncate_expr = Expressions::Truncate("name", 3);
+
+  // 3 code points equals width 3, so the rewrite is expected
+  auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>(
+      Expression::Operation::kEq, truncate_expr, Literal::String("Hi👋"));
+
+  auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true);
+  ASSERT_THAT(bound_result, IsOk());
+  auto bound_pred = bound_result.value();
+
+  // The operation must have been rewritten from kEq to kStartsWith
+  EXPECT_EQ(bound_pred->op(), Expression::Operation::kStartsWith);
+
+  auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get());
+  ASSERT_NE(literal_pred, nullptr);
+  EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kReference);
+}
+
+TEST_F(PredicateTest, TruncateOptimizationNotAppliedWhenUTF8LengthMismatch) {
+  // "José" is 5 bytes but only 4 code points, so width 5 matches the byte length only;
+  // a byte-based length check would wrongly rewrite here, so no rewrite is expected
+
+  auto truncate_expr = Expressions::Truncate("name", 5);
+
+  auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>(
+      Expression::Operation::kEq, truncate_expr, Literal::String("José"));
+
+  auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true);
+  ASSERT_THAT(bound_result, IsOk());
+  auto bound_pred = bound_result.value();
+
+  // The operation stays kEq (4 code points != width 5)
+  EXPECT_EQ(bound_pred->op(), Expression::Operation::kEq);
+
+  auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get());
+  ASSERT_NE(literal_pred, nullptr);
+  EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kTransform);
+}
+
+TEST_F(PredicateTest, TruncateOptimizationNotAppliedForEmptyLiteralWithNonZeroWidth) {
+  // An empty literal has 0 code points, which cannot equal a non-zero width,
+  // so truncate(name, 5) == "" must bind without the STARTS_WITH rewrite.
+  // NOTE: width=0 is reportedly rejected by Transform::Truncate (TODO confirm); not tested here
+
+  auto truncate_expr = Expressions::Truncate("name", 5);
+
+  auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>(
+      Expression::Operation::kEq, truncate_expr, Literal::String(""));
+
+  auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true);
+  ASSERT_THAT(bound_result, IsOk());
+  auto bound_pred = bound_result.value();
+
+  // The operation stays kEq (0 code points != width 5)
+  EXPECT_EQ(bound_pred->op(), Expression::Operation::kEq);
+
+  auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get());
+  ASSERT_NE(literal_pred, nullptr);
+  EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kTransform);
+}
+
 }  // namespace iceberg