|
17 | 17 | * under the License. |
18 | 18 | */ |
19 | 19 |
|
| 20 | +#include "iceberg/expression/predicate.h" |
| 21 | + |
20 | 22 | #include "iceberg/expression/expressions.h" |
21 | 23 | #include "iceberg/schema.h" |
22 | 24 | #include "iceberg/test/matchers.h" |
@@ -433,4 +435,215 @@ TEST_F(PredicateTest, ComplexExpressionCombinations) { |
433 | 435 | EXPECT_EQ(nested->op(), Expression::Operation::kAnd); |
434 | 436 | } |
435 | 437 |
|
| 438 | +TEST_F(PredicateTest, TruncateOptimizationToStartsWith) { |
| 439 | + // Test that truncate(col) == "value" is optimized to col STARTS_WITH "value" |
| 440 | + |
| 441 | + // Create a truncate transform expression: truncate(name, 5) |
| 442 | + auto truncate_expr = Expressions::Truncate("name", 5); |
| 443 | + |
| 444 | + // Create predicate: truncate(name, 5) == "Alice" |
| 445 | + auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>( |
| 446 | + Expression::Operation::kEq, truncate_expr, Literal::String("Alice")); |
| 447 | + |
| 448 | + // Bind the predicate to the schema |
| 449 | + auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true); |
| 450 | + ASSERT_THAT(bound_result, IsOk()); |
| 451 | + auto bound_pred = bound_result.value(); |
| 452 | + |
| 453 | + // After optimization, it should be a STARTS_WITH operation |
| 454 | + EXPECT_EQ(bound_pred->op(), Expression::Operation::kStartsWith); |
| 455 | + |
| 456 | + // Verify it's a BoundLiteralPredicate |
| 457 | + auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get()); |
| 458 | + ASSERT_NE(literal_pred, nullptr); |
| 459 | + |
| 460 | + // The term should now be a direct reference to "name", not a transform |
| 461 | + EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kReference); |
| 462 | + |
| 463 | + // The literal should still be "Alice" |
| 464 | + EXPECT_EQ(literal_pred->literal(), Literal::String("Alice")); |
| 465 | +} |
| 466 | + |
| 467 | +TEST_F(PredicateTest, TruncateOptimizationNotAppliedForNonEquality) { |
| 468 | + // Test that optimization is NOT applied for non-equality operations |
| 469 | + |
| 470 | + auto truncate_expr = Expressions::Truncate("name", 5); |
| 471 | + |
| 472 | + // Test with less-than (should NOT be optimized) |
| 473 | + auto truncate_lt_pred = std::make_shared<UnboundPredicate<BoundTransform>>( |
| 474 | + Expression::Operation::kLt, truncate_expr, Literal::String("Bob")); |
| 475 | + auto bound_lt_result = truncate_lt_pred->Bind(*schema_, /*case_sensitive=*/true); |
| 476 | + ASSERT_THAT(bound_lt_result, IsOk()); |
| 477 | + auto bound_lt = bound_lt_result.value(); |
| 478 | + |
| 479 | + // Should remain as kLt, not converted to STARTS_WITH |
| 480 | + EXPECT_EQ(bound_lt->op(), Expression::Operation::kLt); |
| 481 | + |
| 482 | + // The term should still be a transform |
| 483 | + auto* literal_pred_lt = dynamic_cast<BoundLiteralPredicate*>(bound_lt.get()); |
| 484 | + ASSERT_NE(literal_pred_lt, nullptr); |
| 485 | + EXPECT_EQ(literal_pred_lt->term()->kind(), Term::Kind::kTransform); |
| 486 | +} |
| 487 | + |
| 488 | +TEST_F(PredicateTest, TruncateOptimizationNotAppliedForNonString) { |
| 489 | + // Test that optimization is NOT applied for non-string types |
| 490 | + // (truncate can also work on binary types, but optimization only applies to strings) |
| 491 | + |
| 492 | + // Create a schema with binary field |
| 493 | + auto binary_schema = std::make_shared<Schema>( |
| 494 | + std::vector<SchemaField>{SchemaField::MakeOptional(1, "data", binary())}, |
| 495 | + /*schema_id=*/0); |
| 496 | + |
| 497 | + auto truncate_expr = Expressions::Truncate("data", 10); |
| 498 | + auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>( |
| 499 | + Expression::Operation::kEq, truncate_expr, |
| 500 | + Literal::Binary({0x01, 0x02, 0x03, 0x04, 0x05})); |
| 501 | + |
| 502 | + auto bound_result = truncate_eq_pred->Bind(*binary_schema, /*case_sensitive=*/true); |
| 503 | + ASSERT_THAT(bound_result, IsOk()); |
| 504 | + auto bound_pred = bound_result.value(); |
| 505 | + |
| 506 | + // Should remain as kEq, not converted to STARTS_WITH (binary doesn't support |
| 507 | + // startsWith) |
| 508 | + EXPECT_EQ(bound_pred->op(), Expression::Operation::kEq); |
| 509 | + |
| 510 | + // The term should still be a transform |
| 511 | + auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get()); |
| 512 | + ASSERT_NE(literal_pred, nullptr); |
| 513 | + EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kTransform); |
| 514 | +} |
| 515 | + |
| 516 | +TEST_F(PredicateTest, TruncateOptimizationNotAppliedForWidthMismatch) { |
| 517 | + // CRITICAL TEST: Optimization must NOT apply when literal length != truncate width |
| 518 | + // Example: truncate(col, 10) == "abc" should NOT become STARTS_WITH |
| 519 | + // Because "abc1234567" would match STARTS_WITH but NOT truncate equality |
| 520 | + |
| 521 | + auto truncate_expr = Expressions::Truncate("name", 10); |
| 522 | + |
| 523 | + // Literal "abc" has length 3, but truncate width is 10 |
| 524 | + auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>( |
| 525 | + Expression::Operation::kEq, truncate_expr, Literal::String("abc")); |
| 526 | + |
| 527 | + auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true); |
| 528 | + ASSERT_THAT(bound_result, IsOk()); |
| 529 | + auto bound_pred = bound_result.value(); |
| 530 | + |
| 531 | + // Should remain as kEq, NOT converted to STARTS_WITH |
| 532 | + EXPECT_EQ(bound_pred->op(), Expression::Operation::kEq); |
| 533 | + |
| 534 | + // The term should still be a transform (not optimized away) |
| 535 | + auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get()); |
| 536 | + ASSERT_NE(literal_pred, nullptr); |
| 537 | + EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kTransform); |
| 538 | +} |
| 539 | + |
| 540 | +TEST_F(PredicateTest, TruncateOptimizationAppliedWhenLengthMatches) { |
| 541 | + // Test that optimization IS applied when literal length == truncate width |
| 542 | + |
| 543 | + auto truncate_expr = Expressions::Truncate("name", 5); |
| 544 | + |
| 545 | + // Literal "Alice" has length 5, matching truncate width 5 |
| 546 | + auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>( |
| 547 | + Expression::Operation::kEq, truncate_expr, Literal::String("Alice")); |
| 548 | + |
| 549 | + auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true); |
| 550 | + ASSERT_THAT(bound_result, IsOk()); |
| 551 | + auto bound_pred = bound_result.value(); |
| 552 | + |
| 553 | + // Should be optimized to STARTS_WITH |
| 554 | + EXPECT_EQ(bound_pred->op(), Expression::Operation::kStartsWith); |
| 555 | + |
| 556 | + // The term should be a direct reference (optimization applied) |
| 557 | + auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get()); |
| 558 | + ASSERT_NE(literal_pred, nullptr); |
| 559 | + EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kReference); |
| 560 | +} |
| 561 | + |
| 562 | +TEST_F(PredicateTest, TruncateOptimizationWithUTF8Accents) { |
| 563 | + // CRITICAL: Test UTF-8 code points vs bytes |
| 564 | + // "José" = 4 UTF-8 code points but 5 bytes (é = 0xC3 0xA9) |
| 565 | + |
| 566 | + auto truncate_expr = Expressions::Truncate("name", 4); |
| 567 | + |
| 568 | + // "José" has 4 code points, matching truncate width 4 |
| 569 | + auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>( |
| 570 | + Expression::Operation::kEq, truncate_expr, Literal::String("José")); |
| 571 | + |
| 572 | + auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true); |
| 573 | + ASSERT_THAT(bound_result, IsOk()); |
| 574 | + auto bound_pred = bound_result.value(); |
| 575 | + |
| 576 | + // Should be optimized to STARTS_WITH (code points match) |
| 577 | + EXPECT_EQ(bound_pred->op(), Expression::Operation::kStartsWith); |
| 578 | + |
| 579 | + auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get()); |
| 580 | + ASSERT_NE(literal_pred, nullptr); |
| 581 | + EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kReference); |
| 582 | +} |
| 583 | + |
| 584 | +TEST_F(PredicateTest, TruncateOptimizationWithUTF8Emoji) { |
| 585 | + // Test multi-byte UTF-8 characters |
| 586 | + // "Hi👋" = 3 UTF-8 code points but 6 bytes (👋 = 4 bytes: 0xF0 0x9F 0x91 0x8B) |
| 587 | + |
| 588 | + auto truncate_expr = Expressions::Truncate("name", 3); |
| 589 | + |
| 590 | + // "Hi👋" has 3 code points |
| 591 | + auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>( |
| 592 | + Expression::Operation::kEq, truncate_expr, Literal::String("Hi👋")); |
| 593 | + |
| 594 | + auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true); |
| 595 | + ASSERT_THAT(bound_result, IsOk()); |
| 596 | + auto bound_pred = bound_result.value(); |
| 597 | + |
| 598 | + // Should be optimized to STARTS_WITH |
| 599 | + EXPECT_EQ(bound_pred->op(), Expression::Operation::kStartsWith); |
| 600 | + |
| 601 | + auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get()); |
| 602 | + ASSERT_NE(literal_pred, nullptr); |
| 603 | + EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kReference); |
| 604 | +} |
| 605 | + |
| 606 | +TEST_F(PredicateTest, TruncateOptimizationNotAppliedWhenUTF8LengthMismatch) { |
| 607 | + // "José" has 4 code points but we're comparing against width 5 |
| 608 | + // Should NOT optimize |
| 609 | + |
| 610 | + auto truncate_expr = Expressions::Truncate("name", 5); |
| 611 | + |
| 612 | + auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>( |
| 613 | + Expression::Operation::kEq, truncate_expr, Literal::String("José")); |
| 614 | + |
| 615 | + auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true); |
| 616 | + ASSERT_THAT(bound_result, IsOk()); |
| 617 | + auto bound_pred = bound_result.value(); |
| 618 | + |
| 619 | + // Should NOT be optimized (code points 4 != width 5) |
| 620 | + EXPECT_EQ(bound_pred->op(), Expression::Operation::kEq); |
| 621 | + |
| 622 | + auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get()); |
| 623 | + ASSERT_NE(literal_pred, nullptr); |
| 624 | + EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kTransform); |
| 625 | +} |
| 626 | + |
| 627 | +TEST_F(PredicateTest, TruncateOptimizationNotAppliedForEmptyLiteralWithNonZeroWidth) { |
| 628 | + // Empty literal with w > 0 should NOT optimize |
| 629 | + // Empty string has 0 code points, which != width |
| 630 | + // NOTE: width=0 is rejected by Transform::Truncate, so not tested here |
| 631 | + |
| 632 | + auto truncate_expr = Expressions::Truncate("name", 5); |
| 633 | + |
| 634 | + auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>( |
| 635 | + Expression::Operation::kEq, truncate_expr, Literal::String("")); |
| 636 | + |
| 637 | + auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true); |
| 638 | + ASSERT_THAT(bound_result, IsOk()); |
| 639 | + auto bound_pred = bound_result.value(); |
| 640 | + |
| 641 | + // Should NOT be optimized (0 code points != width 5) |
| 642 | + EXPECT_EQ(bound_pred->op(), Expression::Operation::kEq); |
| 643 | + |
| 644 | + auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get()); |
| 645 | + ASSERT_NE(literal_pred, nullptr); |
| 646 | + EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kTransform); |
| 647 | +} |
| 648 | + |
436 | 649 | } // namespace iceberg |
0 commit comments