@@ -1271,6 +1271,263 @@ TEST_F(TransformProjectTest, TruncateProjectStringStartsWith) {
12711271 " Hello" );
12721272}
12731273
1274+ TEST_F (TransformProjectTest, TruncateProjectStringStartsWithCodePointCountLessThanWidth) {
1275+ auto transform = Transform::Truncate (5 );
1276+
1277+ // Code point count < width (multi-byte UTF-8 characters)
1278+ // "😜🧐" has 2 code points, width is 5
1279+ auto unbound_emoji_short = Expressions::StartsWith (" value" , " 😜🧐" );
1280+ ICEBERG_ASSIGN_OR_THROW (
1281+ auto bound_emoji_short,
1282+ unbound_emoji_short->Bind (*string_schema_, /* case_sensitive=*/ true ));
1283+ auto bound_pred_emoji_short =
1284+ std::dynamic_pointer_cast<BoundPredicate>(bound_emoji_short);
1285+ ASSERT_NE (bound_pred_emoji_short, nullptr );
1286+
1287+ ICEBERG_ASSIGN_OR_THROW (auto projected_emoji_short,
1288+ transform->Project (" part" , bound_pred_emoji_short));
1289+ ASSERT_NE (projected_emoji_short, nullptr );
1290+ EXPECT_EQ (projected_emoji_short->op (), Expression::Operation::kStartsWith );
1291+
1292+ auto unbound_projected_emoji_short =
1293+ internal::checked_pointer_cast<UnboundPredicateImpl<BoundReference>>(
1294+ std::move (projected_emoji_short));
1295+ ASSERT_NE (unbound_projected_emoji_short, nullptr );
1296+ EXPECT_EQ (unbound_projected_emoji_short->op (), Expression::Operation::kStartsWith );
1297+ EXPECT_EQ (unbound_projected_emoji_short->literals ().size (), 1 );
1298+ EXPECT_EQ (
1299+ std::get<std::string>(unbound_projected_emoji_short->literals ().front ().value ()),
1300+ " 😜🧐" );
1301+ }
1302+
1303+ TEST_F (TransformProjectTest, TruncateProjectStringStartsWithCodePointCountEqualToWidth) {
1304+ auto transform = Transform::Truncate (5 );
1305+
1306+ // Code point count == width (exactly 5 code points)
1307+ // "😜🧐🤔🤪🥳" has exactly 5 code points
1308+ auto unbound_emoji_equal = Expressions::StartsWith (" value" , " 😜🧐🤔🤪🥳" );
1309+ ICEBERG_ASSIGN_OR_THROW (
1310+ auto bound_emoji_equal,
1311+ unbound_emoji_equal->Bind (*string_schema_, /* case_sensitive=*/ true ));
1312+ auto bound_pred_emoji_equal =
1313+ std::dynamic_pointer_cast<BoundPredicate>(bound_emoji_equal);
1314+ ASSERT_NE (bound_pred_emoji_equal, nullptr );
1315+
1316+ ICEBERG_ASSIGN_OR_THROW (auto projected_emoji_equal,
1317+ transform->Project (" part" , bound_pred_emoji_equal));
1318+ ASSERT_NE (projected_emoji_equal, nullptr );
1319+ EXPECT_EQ (projected_emoji_equal->op (), Expression::Operation::kEq );
1320+
1321+ auto unbound_projected_emoji_equal =
1322+ internal::checked_pointer_cast<UnboundPredicateImpl<BoundReference>>(
1323+ std::move (projected_emoji_equal));
1324+ ASSERT_NE (unbound_projected_emoji_equal, nullptr );
1325+ EXPECT_EQ (unbound_projected_emoji_equal->op (), Expression::Operation::kEq );
1326+ EXPECT_EQ (unbound_projected_emoji_equal->literals ().size (), 1 );
1327+ EXPECT_EQ (
1328+ std::get<std::string>(unbound_projected_emoji_equal->literals ().front ().value ()),
1329+ " 😜🧐🤔🤪🥳" );
1330+ }
1331+
1332+ TEST_F (TransformProjectTest,
1333+ TruncateProjectStringStartsWithCodePointCountGreaterThanWidth) {
1334+ auto transform = Transform::Truncate (5 );
1335+
1336+ // Code point count > width (truncate to 5 code points)
1337+ // "😜🧐🤔🤪🥳😵💫😂" has 7 code points, should truncate to 5
1338+ auto unbound_emoji_long =
1339+ Expressions::StartsWith (" value" , " 😜🧐🤔🤪🥳😵💫😂" );
1340+ ICEBERG_ASSIGN_OR_THROW (
1341+ auto bound_emoji_long,
1342+ unbound_emoji_long->Bind (*string_schema_, /* case_sensitive=*/ true ));
1343+ auto bound_pred_emoji_long =
1344+ std::dynamic_pointer_cast<BoundPredicate>(bound_emoji_long);
1345+ ASSERT_NE (bound_pred_emoji_long, nullptr );
1346+
1347+ ICEBERG_ASSIGN_OR_THROW (auto projected_emoji_long,
1348+ transform->Project (" part" , bound_pred_emoji_long));
1349+ ASSERT_NE (projected_emoji_long, nullptr );
1350+ EXPECT_EQ (projected_emoji_long->op (), Expression::Operation::kStartsWith );
1351+
1352+ auto unbound_projected_emoji_long =
1353+ internal::checked_pointer_cast<UnboundPredicateImpl<BoundReference>>(
1354+ std::move (projected_emoji_long));
1355+ ASSERT_NE (unbound_projected_emoji_long, nullptr );
1356+ EXPECT_EQ (unbound_projected_emoji_long->op (), Expression::Operation::kStartsWith );
1357+ EXPECT_EQ (unbound_projected_emoji_long->literals ().size (), 1 );
1358+ EXPECT_EQ (
1359+ std::get<std::string>(unbound_projected_emoji_long->literals ().front ().value ()),
1360+ " 😜🧐🤔🤪🥳" );
1361+ }
1362+
1363+ TEST_F (TransformProjectTest, TruncateProjectStringStartsWithMixedAsciiAndMultiByte) {
1364+ auto transform = Transform::Truncate (5 );
1365+
1366+ // Mixed ASCII and multi-byte UTF-8 characters
1367+ // "a😜b🧐c" has 5 code points (3 ASCII + 2 emojis)
1368+ auto unbound_mixed_equal = Expressions::StartsWith (" value" , " a😜b🧐c" );
1369+ ICEBERG_ASSIGN_OR_THROW (
1370+ auto bound_mixed_equal,
1371+ unbound_mixed_equal->Bind (*string_schema_, /* case_sensitive=*/ true ));
1372+ auto bound_pred_mixed_equal =
1373+ std::dynamic_pointer_cast<BoundPredicate>(bound_mixed_equal);
1374+ ASSERT_NE (bound_pred_mixed_equal, nullptr );
1375+
1376+ ICEBERG_ASSIGN_OR_THROW (auto projected_mixed_equal,
1377+ transform->Project (" part" , bound_pred_mixed_equal));
1378+ ASSERT_NE (projected_mixed_equal, nullptr );
1379+ EXPECT_EQ (projected_mixed_equal->op (), Expression::Operation::kEq );
1380+
1381+ auto unbound_projected_mixed_equal =
1382+ internal::checked_pointer_cast<UnboundPredicateImpl<BoundReference>>(
1383+ std::move (projected_mixed_equal));
1384+ ASSERT_NE (unbound_projected_mixed_equal, nullptr );
1385+ EXPECT_EQ (unbound_projected_mixed_equal->op (), Expression::Operation::kEq );
1386+ EXPECT_EQ (unbound_projected_mixed_equal->literals ().size (), 1 );
1387+ EXPECT_EQ (
1388+ std::get<std::string>(unbound_projected_mixed_equal->literals ().front ().value ()),
1389+ " a😜b🧐c" );
1390+ }
1391+
1392+ TEST_F (TransformProjectTest, TruncateProjectStringStartsWithChineseCharactersShort) {
1393+ auto transform = Transform::Truncate (5 );
1394+
1395+ // Chinese characters (3-byte UTF-8)
1396+ // "你好世界" has 4 code points, width is 5
1397+ auto unbound_chinese_short = Expressions::StartsWith (" value" , " 你好世界" );
1398+ ICEBERG_ASSIGN_OR_THROW (
1399+ auto bound_chinese_short,
1400+ unbound_chinese_short->Bind (*string_schema_, /* case_sensitive=*/ true ));
1401+ auto bound_pred_chinese_short =
1402+ std::dynamic_pointer_cast<BoundPredicate>(bound_chinese_short);
1403+ ASSERT_NE (bound_pred_chinese_short, nullptr );
1404+
1405+ ICEBERG_ASSIGN_OR_THROW (auto projected_chinese_short,
1406+ transform->Project (" part" , bound_pred_chinese_short));
1407+ ASSERT_NE (projected_chinese_short, nullptr );
1408+ EXPECT_EQ (projected_chinese_short->op (), Expression::Operation::kStartsWith );
1409+
1410+ auto unbound_projected_chinese_short =
1411+ internal::checked_pointer_cast<UnboundPredicateImpl<BoundReference>>(
1412+ std::move (projected_chinese_short));
1413+ ASSERT_NE (unbound_projected_chinese_short, nullptr );
1414+ EXPECT_EQ (unbound_projected_chinese_short->op (), Expression::Operation::kStartsWith );
1415+ EXPECT_EQ (unbound_projected_chinese_short->literals ().size (), 1 );
1416+ EXPECT_EQ (
1417+ std::get<std::string>(unbound_projected_chinese_short->literals ().front ().value ()),
1418+ " 你好世界" );
1419+ }
1420+
1421+ TEST_F (TransformProjectTest, TruncateProjectStringStartsWithChineseCharactersEqualWidth) {
1422+ auto transform = Transform::Truncate (5 );
1423+
1424+ // Chinese characters exactly matching width
1425+ // "你好世界好" has exactly 5 code points
1426+ auto unbound_chinese_equal = Expressions::StartsWith (" value" , " 你好世界好" );
1427+ ICEBERG_ASSIGN_OR_THROW (
1428+ auto bound_chinese_equal,
1429+ unbound_chinese_equal->Bind (*string_schema_, /* case_sensitive=*/ true ));
1430+ auto bound_pred_chinese_equal =
1431+ std::dynamic_pointer_cast<BoundPredicate>(bound_chinese_equal);
1432+ ASSERT_NE (bound_pred_chinese_equal, nullptr );
1433+
1434+ ICEBERG_ASSIGN_OR_THROW (auto projected_chinese_equal,
1435+ transform->Project (" part" , bound_pred_chinese_equal));
1436+ ASSERT_NE (projected_chinese_equal, nullptr );
1437+ EXPECT_EQ (projected_chinese_equal->op (), Expression::Operation::kEq );
1438+
1439+ auto unbound_projected_chinese_equal =
1440+ internal::checked_pointer_cast<UnboundPredicateImpl<BoundReference>>(
1441+ std::move (projected_chinese_equal));
1442+ ASSERT_NE (unbound_projected_chinese_equal, nullptr );
1443+ EXPECT_EQ (unbound_projected_chinese_equal->op (), Expression::Operation::kEq );
1444+ EXPECT_EQ (unbound_projected_chinese_equal->literals ().size (), 1 );
1445+ EXPECT_EQ (
1446+ std::get<std::string>(unbound_projected_chinese_equal->literals ().front ().value ()),
1447+ " 你好世界好" );
1448+ }
1449+
1450+ TEST_F (TransformProjectTest,
1451+ TruncateProjectStringNotStartsWithCodePointCountEqualToWidth) {
1452+ auto transform = Transform::Truncate (5 );
1453+
1454+ // NotStartsWith with code point count == width
1455+ // Should convert to NotEq
1456+ auto unbound_not_starts_equal = Expressions::NotStartsWith (" value" , " 😜🧐🤔🤪🥳" );
1457+ ICEBERG_ASSIGN_OR_THROW (
1458+ auto bound_not_starts_equal,
1459+ unbound_not_starts_equal->Bind (*string_schema_, /* case_sensitive=*/ true ));
1460+ auto bound_pred_not_starts_equal =
1461+ std::dynamic_pointer_cast<BoundPredicate>(bound_not_starts_equal);
1462+ ASSERT_NE (bound_pred_not_starts_equal, nullptr );
1463+
1464+ ICEBERG_ASSIGN_OR_THROW (auto projected_not_starts_equal,
1465+ transform->Project (" part" , bound_pred_not_starts_equal));
1466+ ASSERT_NE (projected_not_starts_equal, nullptr );
1467+ EXPECT_EQ (projected_not_starts_equal->op (), Expression::Operation::kNotEq );
1468+
1469+ auto unbound_projected_not_starts_equal =
1470+ internal::checked_pointer_cast<UnboundPredicateImpl<BoundReference>>(
1471+ std::move (projected_not_starts_equal));
1472+ ASSERT_NE (unbound_projected_not_starts_equal, nullptr );
1473+ EXPECT_EQ (unbound_projected_not_starts_equal->op (), Expression::Operation::kNotEq );
1474+ EXPECT_EQ (unbound_projected_not_starts_equal->literals ().size (), 1 );
1475+ EXPECT_EQ (std::get<std::string>(
1476+ unbound_projected_not_starts_equal->literals ().front ().value ()),
1477+ " 😜🧐🤔🤪🥳" );
1478+ }
1479+
1480+ TEST_F (TransformProjectTest,
1481+ TruncateProjectStringNotStartsWithCodePointCountLessThanWidth) {
1482+ auto transform = Transform::Truncate (5 );
1483+
1484+ // NotStartsWith with code point count < width
1485+ // Should remain NotStartsWith
1486+ auto unbound_not_starts_short = Expressions::NotStartsWith (" value" , " 😜🧐" );
1487+ ICEBERG_ASSIGN_OR_THROW (
1488+ auto bound_not_starts_short,
1489+ unbound_not_starts_short->Bind (*string_schema_, /* case_sensitive=*/ true ));
1490+ auto bound_pred_not_starts_short =
1491+ std::dynamic_pointer_cast<BoundPredicate>(bound_not_starts_short);
1492+ ASSERT_NE (bound_pred_not_starts_short, nullptr );
1493+
1494+ ICEBERG_ASSIGN_OR_THROW (auto projected_not_starts_short,
1495+ transform->Project (" part" , bound_pred_not_starts_short));
1496+ ASSERT_NE (projected_not_starts_short, nullptr );
1497+ EXPECT_EQ (projected_not_starts_short->op (), Expression::Operation::kNotStartsWith );
1498+
1499+ auto unbound_projected_not_starts_short =
1500+ internal::checked_pointer_cast<UnboundPredicateImpl<BoundReference>>(
1501+ std::move (projected_not_starts_short));
1502+ ASSERT_NE (unbound_projected_not_starts_short, nullptr );
1503+ EXPECT_EQ (unbound_projected_not_starts_short->op (),
1504+ Expression::Operation::kNotStartsWith );
1505+ EXPECT_EQ (unbound_projected_not_starts_short->literals ().size (), 1 );
1506+ EXPECT_EQ (std::get<std::string>(
1507+ unbound_projected_not_starts_short->literals ().front ().value ()),
1508+ " 😜🧐" );
1509+ }
1510+
1511+ TEST_F (TransformProjectTest,
1512+ TruncateProjectStringNotStartsWithCodePointCountGreaterThanWidth) {
1513+ auto transform = Transform::Truncate (5 );
1514+
1515+ // NotStartsWith with code point count > width
1516+ // Should return nullptr (cannot project)
1517+ auto unbound_not_starts_long =
1518+ Expressions::NotStartsWith (" value" , " 😜🧐🤔🤪🥳😵💫😂" );
1519+ ICEBERG_ASSIGN_OR_THROW (
1520+ auto bound_not_starts_long,
1521+ unbound_not_starts_long->Bind (*string_schema_, /* case_sensitive=*/ true ));
1522+ auto bound_pred_not_starts_long =
1523+ std::dynamic_pointer_cast<BoundPredicate>(bound_not_starts_long);
1524+ ASSERT_NE (bound_pred_not_starts_long, nullptr );
1525+
1526+ ICEBERG_ASSIGN_OR_THROW (auto projected_not_starts_long,
1527+ transform->Project (" part" , bound_pred_not_starts_long));
1528+ EXPECT_EQ (projected_not_starts_long, nullptr );
1529+ }
1530+
12741531TEST_F (TransformProjectTest, YearProjectEquality) {
12751532 auto transform = Transform::Year ();
12761533
@@ -1398,4 +1655,31 @@ TEST_F(TransformProjectTest, TemporalProjectInSet) {
13981655 EXPECT_EQ (projected_in->op (), Expression::Operation::kIn );
13991656}
14001657
1658+ TEST_F (TransformProjectTest, DayTimestampProjectionFix) {
1659+ auto transform = Transform::Day ();
1660+
1661+ // Predicate: value < 1970-01-01 00:00:00 (0)
1662+ // This implies value <= -1 micros.
1663+ // day(-1 micros) = -1 day (1969-12-31).
1664+ // If we don't fix, we project to day <= -1.
1665+ // If we fix (for buggy writers), we project to day <= 0.
1666+ auto unbound = Expressions::LessThan (" value" , Literal::Timestamp (0 ));
1667+
1668+ ICEBERG_ASSIGN_OR_THROW (auto bound,
1669+ unbound->Bind (*timestamp_schema_, /* case_sensitive=*/ true ));
1670+ auto bound_pred = std::dynamic_pointer_cast<BoundPredicate>(bound);
1671+ ASSERT_NE (bound_pred, nullptr );
1672+
1673+ ICEBERG_ASSIGN_OR_THROW (auto projected, transform->Project (" part" , bound_pred));
1674+ ASSERT_NE (projected, nullptr );
1675+
1676+ auto unbound_projected =
1677+ internal::checked_pointer_cast<UnboundPredicateImpl<BoundReference>>(
1678+ std::move (projected));
1679+ EXPECT_EQ (unbound_projected->op (), Expression::Operation::kLtEq );
1680+ ASSERT_EQ (unbound_projected->literals ().size (), 1 );
1681+ int32_t val = std::get<int32_t >(unbound_projected->literals ().front ().value ());
1682+ EXPECT_EQ (val, 0 ) << " Expected projected value to be 0 (fix applied), but got " << val;
1683+ }
1684+
14011685} // namespace iceberg
0 commit comments