feat: optimize truncate(col) == value to startsWith predicate

shangxinli · shangxinli · commit ab63064da61b · 2025-11-03T10:53:42.000-08:00
Rewrite truncate(col, width) == "value" predicates to col STARTS_WITH "value"
for string columns when the literal has exactly `width` UTF-8 code points.
This enables better predicate pushdown to storage formats and efficient use
of prefix indexes.

The optimization applies when ALL conditions are met:
- Operation is equality
- Term is a truncate transform
- Literal is a string type
- Literal UTF-8 code point count equals truncate width (CRITICAL)
- Literal is not null (safety check)
- Width is not zero (Transform rejects this, but guard added for safety)

Correctly handles UTF-8 multi-byte characters:
- "José" = 4 code points, 5 bytes (é = 2 bytes)
- "Hi👋" = 3 code points, 6 bytes (👋 = 4 bytes)

Implementation details:
- Uses type-safe Transform::param() API (no brittle regex parsing)
- Binary string comparison semantics (no collation issues)
- Benefits from strict metrics evaluation for startsWith
- Counts code points (not grapheme clusters) per Iceberg spec
- Short-string invariance: when source &lt; width, truncate returns full
  string, so equality implies exact match and STARTS_WITH remains valid

Added Transform::param() public API for accessing transform parameters,
replacing brittle ToString() regex parsing.

Comprehensive tests including UTF-8 edge cases and empty strings.
diff --git a/src/iceberg/expression/predicate.cc b/src/iceberg/expression/predicate.cc
@@ -25,7 +25,9 @@
 #include "iceberg/exception.h"
 #include "iceberg/expression/expressions.h"
 #include "iceberg/expression/literal.h"
+#include "iceberg/expression/term.h"
 #include "iceberg/result.h"
+#include "iceberg/transform.h"
 #include "iceberg/type.h"
 #include "iceberg/util/checked_cast.h"
 #include "iceberg/util/formatter_internal.h"
@@ -143,6 +145,27 @@ bool IsFloatingType(TypeId type) {
   return type == TypeId::kFloat || type == TypeId::kDouble;
 }
 
+/// \brief Count the number of UTF-8 code points in a string.
+/// This matches the behavior of TruncateUtils::TruncateUTF8.
+///
+/// NOTE: This counts code points, not grapheme clusters (user-perceived characters).
+/// Per the Iceberg spec, combining marks count as separate code points.
+/// Example: "é" as e + combining-acute (U+0065 U+0301) = 2 code points,
+///          but "é" as single precomposed character (U+00E9) = 1 code point.
+///
+/// \param str The UTF-8 encoded string
+/// \return The number of code points (not bytes, not graphemes)
+inline int32_t CountUTF8CodePoints(std::string_view str) {
+  int32_t code_point_count = 0;
+  for (unsigned char c : str) {
+    // Start of a new UTF-8 code point (not a continuation byte 10xxxxxx)
+    if ((c & 0xC0) != 0x80) {
+      ++code_point_count;
+    }
+  }
+  return code_point_count;
+}
+
 }  // namespace
 
 template <typename B>
@@ -226,7 +249,69 @@ Result<std::shared_ptr<Expression>> UnboundPredicate<B>::BindLiteralOperation(
     }
   }
 
-  // TODO(gangwu): translate truncate(col) == value to startsWith(value)
+  // Optimize: translate truncate(col, width) == value to col startsWith(value)
+  // This optimization allows better predicate pushdown and index usage
+  // IMPORTANT: Only valid when literal has exactly `width` UTF-8 code points
+  //
+  // NOTE: This rewrite is safe because:
+  // - Iceberg string comparisons are binary (byte-for-byte), no collation
+  // - STARTS_WITH uses the same binary comparison semantics as equality
+  // - truncate(col, w) == "value" ⟺ col STARTS_WITH "value" when len(value) == w
+  // - When source has < w code points, truncate returns full string; equality
+  //   implies exact match, so STARTS_WITH remains valid (short-string invariance)
+  if (BASE::op() == Expression::Operation::kEq &&
+      bound_term->kind() == Term::Kind::kTransform) {
+    // Safe to cast after kind check confirms it's a transform
+    auto* transform_term = dynamic_cast<BoundTransform*>(bound_term.get());
+    if (!transform_term) {
+      // Should never happen after kind check, but be defensive
+      return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term),
+                                                     std::move(literal));
+    }
+
+    if (transform_term->transform()->transform_type() == TransformType::kTruncate &&
+        literal.type()->type_id() == TypeId::kString &&
+        !literal.IsNull()) {  // Null safety: skip null literals
+
+      // Extract width parameter using type-safe API
+      auto width_opt = transform_term->transform()->param();
+      if (!width_opt) {
+        // Should never happen for truncate, but be defensive
+        return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term),
+                                                       std::move(literal));
+      }
+
+      int32_t truncate_width = *width_opt;
+
+      // Skip width=0: truncate(col, 0) == "" would rewrite to STARTS_WITH("")
+      // which is tautologically true and could accidentally broaden filters
+      // (Note: Transform::Truncate already validates width > 0, but defensive check)
+      if (truncate_width == 0) {
+        return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term),
+                                                       std::move(literal));
+      }
+
+      auto& string_value = std::get<std::string>(literal.value());
+
+      // Count UTF-8 code points (not bytes!)
+      // Truncate uses code points: "José" has 4 code points but 5 bytes
+      int32_t code_point_count = CountUTF8CodePoints(string_value);
+
+      // Only optimize if literal code point count equals truncate width
+      // Example: truncate(col, 5) == "Alice" (5 code points) can be optimized
+      //          truncate(col, 10) == "abc" (3 code points) CANNOT
+      //          truncate(col, 4) == "José" (4 code points, 5 bytes) CAN be optimized
+      if (code_point_count == truncate_width) {
+        // Rewrite: truncate(col, width) == "value" → col STARTS_WITH "value"
+        // This benefits from strict metrics evaluation for startsWith in manifest
+        // filtering
+        return std::make_shared<BoundLiteralPredicate>(Expression::Operation::kStartsWith,
+                                                       transform_term->reference(),
+                                                       std::move(literal));
+      }
+    }
+  }
+
   return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term),
                                                  std::move(literal));
 }
diff --git a/src/iceberg/test/predicate_test.cc b/src/iceberg/test/predicate_test.cc
@@ -17,6 +17,8 @@
  * under the License.
  */
 
+#include "iceberg/expression/predicate.h"
+
 #include "iceberg/expression/expressions.h"
 #include "iceberg/schema.h"
 #include "iceberg/test/matchers.h"
@@ -433,4 +435,215 @@ TEST_F(PredicateTest, ComplexExpressionCombinations) {
   EXPECT_EQ(nested->op(), Expression::Operation::kAnd);
 }
 
+TEST_F(PredicateTest, TruncateOptimizationToStartsWith) {
+  // Test that truncate(col) == "value" is optimized to col STARTS_WITH "value"
+
+  // Create a truncate transform expression: truncate(name, 5)
+  auto truncate_expr = Expressions::Truncate("name", 5);
+
+  // Create predicate: truncate(name, 5) == "Alice"
+  auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>(
+      Expression::Operation::kEq, truncate_expr, Literal::String("Alice"));
+
+  // Bind the predicate to the schema
+  auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true);
+  ASSERT_THAT(bound_result, IsOk());
+  auto bound_pred = bound_result.value();
+
+  // After optimization, it should be a STARTS_WITH operation
+  EXPECT_EQ(bound_pred->op(), Expression::Operation::kStartsWith);
+
+  // Verify it's a BoundLiteralPredicate
+  auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get());
+  ASSERT_NE(literal_pred, nullptr);
+
+  // The term should now be a direct reference to "name", not a transform
+  EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kReference);
+
+  // The literal should still be "Alice"
+  EXPECT_EQ(literal_pred->literal(), Literal::String("Alice"));
+}
+
+TEST_F(PredicateTest, TruncateOptimizationNotAppliedForNonEquality) {
+  // Test that optimization is NOT applied for non-equality operations
+
+  auto truncate_expr = Expressions::Truncate("name", 5);
+
+  // Test with less-than (should NOT be optimized)
+  auto truncate_lt_pred = std::make_shared<UnboundPredicate<BoundTransform>>(
+      Expression::Operation::kLt, truncate_expr, Literal::String("Bob"));
+  auto bound_lt_result = truncate_lt_pred->Bind(*schema_, /*case_sensitive=*/true);
+  ASSERT_THAT(bound_lt_result, IsOk());
+  auto bound_lt = bound_lt_result.value();
+
+  // Should remain as kLt, not converted to STARTS_WITH
+  EXPECT_EQ(bound_lt->op(), Expression::Operation::kLt);
+
+  // The term should still be a transform
+  auto* literal_pred_lt = dynamic_cast<BoundLiteralPredicate*>(bound_lt.get());
+  ASSERT_NE(literal_pred_lt, nullptr);
+  EXPECT_EQ(literal_pred_lt->term()->kind(), Term::Kind::kTransform);
+}
+
+TEST_F(PredicateTest, TruncateOptimizationNotAppliedForNonString) {
+  // Test that optimization is NOT applied for non-string types
+  // (truncate can also work on binary types, but optimization only applies to strings)
+
+  // Create a schema with binary field
+  auto binary_schema = std::make_shared<Schema>(
+      std::vector<SchemaField>{SchemaField::MakeOptional(1, "data", binary())},
+      /*schema_id=*/0);
+
+  auto truncate_expr = Expressions::Truncate("data", 10);
+  auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>(
+      Expression::Operation::kEq, truncate_expr,
+      Literal::Binary({0x01, 0x02, 0x03, 0x04, 0x05}));
+
+  auto bound_result = truncate_eq_pred->Bind(*binary_schema, /*case_sensitive=*/true);
+  ASSERT_THAT(bound_result, IsOk());
+  auto bound_pred = bound_result.value();
+
+  // Should remain as kEq, not converted to STARTS_WITH (binary doesn't support
+  // startsWith)
+  EXPECT_EQ(bound_pred->op(), Expression::Operation::kEq);
+
+  // The term should still be a transform
+  auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get());
+  ASSERT_NE(literal_pred, nullptr);
+  EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kTransform);
+}
+
+TEST_F(PredicateTest, TruncateOptimizationNotAppliedForWidthMismatch) {
+  // CRITICAL TEST: Optimization must NOT apply when literal length != truncate width
+  // Example: truncate(col, 10) == "abc" should NOT become STARTS_WITH
+  // Because "abc1234567" would match STARTS_WITH but NOT truncate equality
+
+  auto truncate_expr = Expressions::Truncate("name", 10);
+
+  // Literal "abc" has length 3, but truncate width is 10
+  auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>(
+      Expression::Operation::kEq, truncate_expr, Literal::String("abc"));
+
+  auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true);
+  ASSERT_THAT(bound_result, IsOk());
+  auto bound_pred = bound_result.value();
+
+  // Should remain as kEq, NOT converted to STARTS_WITH
+  EXPECT_EQ(bound_pred->op(), Expression::Operation::kEq);
+
+  // The term should still be a transform (not optimized away)
+  auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get());
+  ASSERT_NE(literal_pred, nullptr);
+  EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kTransform);
+}
+
+TEST_F(PredicateTest, TruncateOptimizationAppliedWhenLengthMatches) {
+  // Test that optimization IS applied when literal length == truncate width
+
+  auto truncate_expr = Expressions::Truncate("name", 5);
+
+  // Literal "Alice" has length 5, matching truncate width 5
+  auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>(
+      Expression::Operation::kEq, truncate_expr, Literal::String("Alice"));
+
+  auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true);
+  ASSERT_THAT(bound_result, IsOk());
+  auto bound_pred = bound_result.value();
+
+  // Should be optimized to STARTS_WITH
+  EXPECT_EQ(bound_pred->op(), Expression::Operation::kStartsWith);
+
+  // The term should be a direct reference (optimization applied)
+  auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get());
+  ASSERT_NE(literal_pred, nullptr);
+  EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kReference);
+}
+
+TEST_F(PredicateTest, TruncateOptimizationWithUTF8Accents) {
+  // CRITICAL: Test UTF-8 code points vs bytes
+  // "José" = 4 UTF-8 code points but 5 bytes (é = 0xC3 0xA9)
+
+  auto truncate_expr = Expressions::Truncate("name", 4);
+
+  // "José" has 4 code points, matching truncate width 4
+  auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>(
+      Expression::Operation::kEq, truncate_expr, Literal::String("José"));
+
+  auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true);
+  ASSERT_THAT(bound_result, IsOk());
+  auto bound_pred = bound_result.value();
+
+  // Should be optimized to STARTS_WITH (code points match)
+  EXPECT_EQ(bound_pred->op(), Expression::Operation::kStartsWith);
+
+  auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get());
+  ASSERT_NE(literal_pred, nullptr);
+  EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kReference);
+}
+
+TEST_F(PredicateTest, TruncateOptimizationWithUTF8Emoji) {
+  // Test multi-byte UTF-8 characters
+  // "Hi👋" = 3 UTF-8 code points but 6 bytes (👋 = 4 bytes: 0xF0 0x9F 0x91 0x8B)
+
+  auto truncate_expr = Expressions::Truncate("name", 3);
+
+  // "Hi👋" has 3 code points
+  auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>(
+      Expression::Operation::kEq, truncate_expr, Literal::String("Hi👋"));
+
+  auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true);
+  ASSERT_THAT(bound_result, IsOk());
+  auto bound_pred = bound_result.value();
+
+  // Should be optimized to STARTS_WITH
+  EXPECT_EQ(bound_pred->op(), Expression::Operation::kStartsWith);
+
+  auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get());
+  ASSERT_NE(literal_pred, nullptr);
+  EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kReference);
+}
+
+TEST_F(PredicateTest, TruncateOptimizationNotAppliedWhenUTF8LengthMismatch) {
+  // "José" has 4 code points but we're comparing against width 5
+  // Should NOT optimize
+
+  auto truncate_expr = Expressions::Truncate("name", 5);
+
+  auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>(
+      Expression::Operation::kEq, truncate_expr, Literal::String("José"));
+
+  auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true);
+  ASSERT_THAT(bound_result, IsOk());
+  auto bound_pred = bound_result.value();
+
+  // Should NOT be optimized (code points 4 != width 5)
+  EXPECT_EQ(bound_pred->op(), Expression::Operation::kEq);
+
+  auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get());
+  ASSERT_NE(literal_pred, nullptr);
+  EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kTransform);
+}
+
+TEST_F(PredicateTest, TruncateOptimizationNotAppliedForEmptyLiteralWithNonZeroWidth) {
+  // Empty literal with w > 0 should NOT optimize
+  // Empty string has 0 code points, which != width
+  // NOTE: width=0 is rejected by Transform::Truncate, so not tested here
+
+  auto truncate_expr = Expressions::Truncate("name", 5);
+
+  auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>(
+      Expression::Operation::kEq, truncate_expr, Literal::String(""));
+
+  auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true);
+  ASSERT_THAT(bound_result, IsOk());
+  auto bound_pred = bound_result.value();
+
+  // Should NOT be optimized (0 code points != width 5)
+  EXPECT_EQ(bound_pred->op(), Expression::Operation::kEq);
+
+  auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get());
+  ASSERT_NE(literal_pred, nullptr);
+  EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kTransform);
+}
+
 }  // namespace iceberg
diff --git a/src/iceberg/transform.h b/src/iceberg/transform.h
@@ -141,6 +141,19 @@ class ICEBERG_EXPORT Transform : public util::Formattable {
   /// \brief Returns the transform type.
   TransformType transform_type() const;
 
+  /// \brief Returns the optional parameter for parameterized transforms.
+  ///
+  /// For transforms like bucket(N) or truncate(W), returns the parameter value.
+  /// For non-parameterized transforms (identity, year, etc.), returns std::nullopt.
+  ///
+  /// \return The parameter if present, otherwise std::nullopt
+  std::optional<int32_t> param() const {
+    if (auto* p = std::get_if<int32_t>(&param_)) {
+      return *p;
+    }
+    return std::nullopt;
+  }
+
   /// \brief Binds this transform to a source type, returning a typed TransformFunction.
   ///
   /// This creates a concrete transform implementation based on the transform type and