Skip to content

Commit 99098e8

Browse files
committed
fix: review comments
1 parent 252b2ab commit 99098e8

File tree

4 files changed

+401
-187
lines changed

4 files changed

+401
-187
lines changed

src/iceberg/test/transform_test.cc

Lines changed: 284 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1271,6 +1271,263 @@ TEST_F(TransformProjectTest, TruncateProjectStringStartsWith) {
12711271
"Hello");
12721272
}
12731273

1274+
TEST_F(TransformProjectTest, TruncateProjectStringStartsWithCodePointCountLessThanWidth) {
1275+
auto transform = Transform::Truncate(5);
1276+
1277+
// Code point count < width (multi-byte UTF-8 characters)
1278+
// "😜🧐" has 2 code points, width is 5
1279+
auto unbound_emoji_short = Expressions::StartsWith("value", "😜🧐");
1280+
ICEBERG_ASSIGN_OR_THROW(
1281+
auto bound_emoji_short,
1282+
unbound_emoji_short->Bind(*string_schema_, /*case_sensitive=*/true));
1283+
auto bound_pred_emoji_short =
1284+
std::dynamic_pointer_cast<BoundPredicate>(bound_emoji_short);
1285+
ASSERT_NE(bound_pred_emoji_short, nullptr);
1286+
1287+
ICEBERG_ASSIGN_OR_THROW(auto projected_emoji_short,
1288+
transform->Project("part", bound_pred_emoji_short));
1289+
ASSERT_NE(projected_emoji_short, nullptr);
1290+
EXPECT_EQ(projected_emoji_short->op(), Expression::Operation::kStartsWith);
1291+
1292+
auto unbound_projected_emoji_short =
1293+
internal::checked_pointer_cast<UnboundPredicateImpl<BoundReference>>(
1294+
std::move(projected_emoji_short));
1295+
ASSERT_NE(unbound_projected_emoji_short, nullptr);
1296+
EXPECT_EQ(unbound_projected_emoji_short->op(), Expression::Operation::kStartsWith);
1297+
EXPECT_EQ(unbound_projected_emoji_short->literals().size(), 1);
1298+
EXPECT_EQ(
1299+
std::get<std::string>(unbound_projected_emoji_short->literals().front().value()),
1300+
"😜🧐");
1301+
}
1302+
1303+
TEST_F(TransformProjectTest, TruncateProjectStringStartsWithCodePointCountEqualToWidth) {
1304+
auto transform = Transform::Truncate(5);
1305+
1306+
// Code point count == width (exactly 5 code points)
1307+
// "😜🧐🤔🤪🥳" has exactly 5 code points
1308+
auto unbound_emoji_equal = Expressions::StartsWith("value", "😜🧐🤔🤪🥳");
1309+
ICEBERG_ASSIGN_OR_THROW(
1310+
auto bound_emoji_equal,
1311+
unbound_emoji_equal->Bind(*string_schema_, /*case_sensitive=*/true));
1312+
auto bound_pred_emoji_equal =
1313+
std::dynamic_pointer_cast<BoundPredicate>(bound_emoji_equal);
1314+
ASSERT_NE(bound_pred_emoji_equal, nullptr);
1315+
1316+
ICEBERG_ASSIGN_OR_THROW(auto projected_emoji_equal,
1317+
transform->Project("part", bound_pred_emoji_equal));
1318+
ASSERT_NE(projected_emoji_equal, nullptr);
1319+
EXPECT_EQ(projected_emoji_equal->op(), Expression::Operation::kEq);
1320+
1321+
auto unbound_projected_emoji_equal =
1322+
internal::checked_pointer_cast<UnboundPredicateImpl<BoundReference>>(
1323+
std::move(projected_emoji_equal));
1324+
ASSERT_NE(unbound_projected_emoji_equal, nullptr);
1325+
EXPECT_EQ(unbound_projected_emoji_equal->op(), Expression::Operation::kEq);
1326+
EXPECT_EQ(unbound_projected_emoji_equal->literals().size(), 1);
1327+
EXPECT_EQ(
1328+
std::get<std::string>(unbound_projected_emoji_equal->literals().front().value()),
1329+
"😜🧐🤔🤪🥳");
1330+
}
1331+
1332+
TEST_F(TransformProjectTest,
1333+
TruncateProjectStringStartsWithCodePointCountGreaterThanWidth) {
1334+
auto transform = Transform::Truncate(5);
1335+
1336+
// Code point count > width (truncate to 5 code points)
1337+
// "😜🧐🤔🤪🥳😵‍💫😂" has 7 code points, should truncate to 5
1338+
auto unbound_emoji_long =
1339+
Expressions::StartsWith("value", "😜🧐🤔🤪🥳😵‍💫😂");
1340+
ICEBERG_ASSIGN_OR_THROW(
1341+
auto bound_emoji_long,
1342+
unbound_emoji_long->Bind(*string_schema_, /*case_sensitive=*/true));
1343+
auto bound_pred_emoji_long =
1344+
std::dynamic_pointer_cast<BoundPredicate>(bound_emoji_long);
1345+
ASSERT_NE(bound_pred_emoji_long, nullptr);
1346+
1347+
ICEBERG_ASSIGN_OR_THROW(auto projected_emoji_long,
1348+
transform->Project("part", bound_pred_emoji_long));
1349+
ASSERT_NE(projected_emoji_long, nullptr);
1350+
EXPECT_EQ(projected_emoji_long->op(), Expression::Operation::kStartsWith);
1351+
1352+
auto unbound_projected_emoji_long =
1353+
internal::checked_pointer_cast<UnboundPredicateImpl<BoundReference>>(
1354+
std::move(projected_emoji_long));
1355+
ASSERT_NE(unbound_projected_emoji_long, nullptr);
1356+
EXPECT_EQ(unbound_projected_emoji_long->op(), Expression::Operation::kStartsWith);
1357+
EXPECT_EQ(unbound_projected_emoji_long->literals().size(), 1);
1358+
EXPECT_EQ(
1359+
std::get<std::string>(unbound_projected_emoji_long->literals().front().value()),
1360+
"😜🧐🤔🤪🥳");
1361+
}
1362+
1363+
TEST_F(TransformProjectTest, TruncateProjectStringStartsWithMixedAsciiAndMultiByte) {
1364+
auto transform = Transform::Truncate(5);
1365+
1366+
// Mixed ASCII and multi-byte UTF-8 characters
1367+
// "a😜b🧐c" has 5 code points (3 ASCII + 2 emojis)
1368+
auto unbound_mixed_equal = Expressions::StartsWith("value", "a😜b🧐c");
1369+
ICEBERG_ASSIGN_OR_THROW(
1370+
auto bound_mixed_equal,
1371+
unbound_mixed_equal->Bind(*string_schema_, /*case_sensitive=*/true));
1372+
auto bound_pred_mixed_equal =
1373+
std::dynamic_pointer_cast<BoundPredicate>(bound_mixed_equal);
1374+
ASSERT_NE(bound_pred_mixed_equal, nullptr);
1375+
1376+
ICEBERG_ASSIGN_OR_THROW(auto projected_mixed_equal,
1377+
transform->Project("part", bound_pred_mixed_equal));
1378+
ASSERT_NE(projected_mixed_equal, nullptr);
1379+
EXPECT_EQ(projected_mixed_equal->op(), Expression::Operation::kEq);
1380+
1381+
auto unbound_projected_mixed_equal =
1382+
internal::checked_pointer_cast<UnboundPredicateImpl<BoundReference>>(
1383+
std::move(projected_mixed_equal));
1384+
ASSERT_NE(unbound_projected_mixed_equal, nullptr);
1385+
EXPECT_EQ(unbound_projected_mixed_equal->op(), Expression::Operation::kEq);
1386+
EXPECT_EQ(unbound_projected_mixed_equal->literals().size(), 1);
1387+
EXPECT_EQ(
1388+
std::get<std::string>(unbound_projected_mixed_equal->literals().front().value()),
1389+
"a😜b🧐c");
1390+
}
1391+
1392+
TEST_F(TransformProjectTest, TruncateProjectStringStartsWithChineseCharactersShort) {
1393+
auto transform = Transform::Truncate(5);
1394+
1395+
// Chinese characters (3-byte UTF-8)
1396+
// "你好世界" has 4 code points, width is 5
1397+
auto unbound_chinese_short = Expressions::StartsWith("value", "你好世界");
1398+
ICEBERG_ASSIGN_OR_THROW(
1399+
auto bound_chinese_short,
1400+
unbound_chinese_short->Bind(*string_schema_, /*case_sensitive=*/true));
1401+
auto bound_pred_chinese_short =
1402+
std::dynamic_pointer_cast<BoundPredicate>(bound_chinese_short);
1403+
ASSERT_NE(bound_pred_chinese_short, nullptr);
1404+
1405+
ICEBERG_ASSIGN_OR_THROW(auto projected_chinese_short,
1406+
transform->Project("part", bound_pred_chinese_short));
1407+
ASSERT_NE(projected_chinese_short, nullptr);
1408+
EXPECT_EQ(projected_chinese_short->op(), Expression::Operation::kStartsWith);
1409+
1410+
auto unbound_projected_chinese_short =
1411+
internal::checked_pointer_cast<UnboundPredicateImpl<BoundReference>>(
1412+
std::move(projected_chinese_short));
1413+
ASSERT_NE(unbound_projected_chinese_short, nullptr);
1414+
EXPECT_EQ(unbound_projected_chinese_short->op(), Expression::Operation::kStartsWith);
1415+
EXPECT_EQ(unbound_projected_chinese_short->literals().size(), 1);
1416+
EXPECT_EQ(
1417+
std::get<std::string>(unbound_projected_chinese_short->literals().front().value()),
1418+
"你好世界");
1419+
}
1420+
1421+
TEST_F(TransformProjectTest, TruncateProjectStringStartsWithChineseCharactersEqualWidth) {
1422+
auto transform = Transform::Truncate(5);
1423+
1424+
// Chinese characters exactly matching width
1425+
// "你好世界好" has exactly 5 code points
1426+
auto unbound_chinese_equal = Expressions::StartsWith("value", "你好世界好");
1427+
ICEBERG_ASSIGN_OR_THROW(
1428+
auto bound_chinese_equal,
1429+
unbound_chinese_equal->Bind(*string_schema_, /*case_sensitive=*/true));
1430+
auto bound_pred_chinese_equal =
1431+
std::dynamic_pointer_cast<BoundPredicate>(bound_chinese_equal);
1432+
ASSERT_NE(bound_pred_chinese_equal, nullptr);
1433+
1434+
ICEBERG_ASSIGN_OR_THROW(auto projected_chinese_equal,
1435+
transform->Project("part", bound_pred_chinese_equal));
1436+
ASSERT_NE(projected_chinese_equal, nullptr);
1437+
EXPECT_EQ(projected_chinese_equal->op(), Expression::Operation::kEq);
1438+
1439+
auto unbound_projected_chinese_equal =
1440+
internal::checked_pointer_cast<UnboundPredicateImpl<BoundReference>>(
1441+
std::move(projected_chinese_equal));
1442+
ASSERT_NE(unbound_projected_chinese_equal, nullptr);
1443+
EXPECT_EQ(unbound_projected_chinese_equal->op(), Expression::Operation::kEq);
1444+
EXPECT_EQ(unbound_projected_chinese_equal->literals().size(), 1);
1445+
EXPECT_EQ(
1446+
std::get<std::string>(unbound_projected_chinese_equal->literals().front().value()),
1447+
"你好世界好");
1448+
}
1449+
1450+
TEST_F(TransformProjectTest,
1451+
TruncateProjectStringNotStartsWithCodePointCountEqualToWidth) {
1452+
auto transform = Transform::Truncate(5);
1453+
1454+
// NotStartsWith with code point count == width
1455+
// Should convert to NotEq
1456+
auto unbound_not_starts_equal = Expressions::NotStartsWith("value", "😜🧐🤔🤪🥳");
1457+
ICEBERG_ASSIGN_OR_THROW(
1458+
auto bound_not_starts_equal,
1459+
unbound_not_starts_equal->Bind(*string_schema_, /*case_sensitive=*/true));
1460+
auto bound_pred_not_starts_equal =
1461+
std::dynamic_pointer_cast<BoundPredicate>(bound_not_starts_equal);
1462+
ASSERT_NE(bound_pred_not_starts_equal, nullptr);
1463+
1464+
ICEBERG_ASSIGN_OR_THROW(auto projected_not_starts_equal,
1465+
transform->Project("part", bound_pred_not_starts_equal));
1466+
ASSERT_NE(projected_not_starts_equal, nullptr);
1467+
EXPECT_EQ(projected_not_starts_equal->op(), Expression::Operation::kNotEq);
1468+
1469+
auto unbound_projected_not_starts_equal =
1470+
internal::checked_pointer_cast<UnboundPredicateImpl<BoundReference>>(
1471+
std::move(projected_not_starts_equal));
1472+
ASSERT_NE(unbound_projected_not_starts_equal, nullptr);
1473+
EXPECT_EQ(unbound_projected_not_starts_equal->op(), Expression::Operation::kNotEq);
1474+
EXPECT_EQ(unbound_projected_not_starts_equal->literals().size(), 1);
1475+
EXPECT_EQ(std::get<std::string>(
1476+
unbound_projected_not_starts_equal->literals().front().value()),
1477+
"😜🧐🤔🤪🥳");
1478+
}
1479+
1480+
TEST_F(TransformProjectTest,
1481+
TruncateProjectStringNotStartsWithCodePointCountLessThanWidth) {
1482+
auto transform = Transform::Truncate(5);
1483+
1484+
// NotStartsWith with code point count < width
1485+
// Should remain NotStartsWith
1486+
auto unbound_not_starts_short = Expressions::NotStartsWith("value", "😜🧐");
1487+
ICEBERG_ASSIGN_OR_THROW(
1488+
auto bound_not_starts_short,
1489+
unbound_not_starts_short->Bind(*string_schema_, /*case_sensitive=*/true));
1490+
auto bound_pred_not_starts_short =
1491+
std::dynamic_pointer_cast<BoundPredicate>(bound_not_starts_short);
1492+
ASSERT_NE(bound_pred_not_starts_short, nullptr);
1493+
1494+
ICEBERG_ASSIGN_OR_THROW(auto projected_not_starts_short,
1495+
transform->Project("part", bound_pred_not_starts_short));
1496+
ASSERT_NE(projected_not_starts_short, nullptr);
1497+
EXPECT_EQ(projected_not_starts_short->op(), Expression::Operation::kNotStartsWith);
1498+
1499+
auto unbound_projected_not_starts_short =
1500+
internal::checked_pointer_cast<UnboundPredicateImpl<BoundReference>>(
1501+
std::move(projected_not_starts_short));
1502+
ASSERT_NE(unbound_projected_not_starts_short, nullptr);
1503+
EXPECT_EQ(unbound_projected_not_starts_short->op(),
1504+
Expression::Operation::kNotStartsWith);
1505+
EXPECT_EQ(unbound_projected_not_starts_short->literals().size(), 1);
1506+
EXPECT_EQ(std::get<std::string>(
1507+
unbound_projected_not_starts_short->literals().front().value()),
1508+
"😜🧐");
1509+
}
1510+
1511+
TEST_F(TransformProjectTest,
1512+
TruncateProjectStringNotStartsWithCodePointCountGreaterThanWidth) {
1513+
auto transform = Transform::Truncate(5);
1514+
1515+
// NotStartsWith with code point count > width
1516+
// Should return nullptr (cannot project)
1517+
auto unbound_not_starts_long =
1518+
Expressions::NotStartsWith("value", "😜🧐🤔🤪🥳😵‍💫😂");
1519+
ICEBERG_ASSIGN_OR_THROW(
1520+
auto bound_not_starts_long,
1521+
unbound_not_starts_long->Bind(*string_schema_, /*case_sensitive=*/true));
1522+
auto bound_pred_not_starts_long =
1523+
std::dynamic_pointer_cast<BoundPredicate>(bound_not_starts_long);
1524+
ASSERT_NE(bound_pred_not_starts_long, nullptr);
1525+
1526+
ICEBERG_ASSIGN_OR_THROW(auto projected_not_starts_long,
1527+
transform->Project("part", bound_pred_not_starts_long));
1528+
EXPECT_EQ(projected_not_starts_long, nullptr);
1529+
}
1530+
12741531
TEST_F(TransformProjectTest, YearProjectEquality) {
12751532
auto transform = Transform::Year();
12761533

@@ -1398,4 +1655,31 @@ TEST_F(TransformProjectTest, TemporalProjectInSet) {
13981655
EXPECT_EQ(projected_in->op(), Expression::Operation::kIn);
13991656
}
14001657

1658+
TEST_F(TransformProjectTest, DayTimestampProjectionFix) {
1659+
auto transform = Transform::Day();
1660+
1661+
// Predicate: value < 1970-01-01 00:00:00 (0)
1662+
// This implies value <= -1 micros.
1663+
// day(-1 micros) = -1 day (1969-12-31).
1664+
// If we don't fix, we project to day <= -1.
1665+
// If we fix (for buggy writers), we project to day <= 0.
1666+
auto unbound = Expressions::LessThan("value", Literal::Timestamp(0));
1667+
1668+
ICEBERG_ASSIGN_OR_THROW(auto bound,
1669+
unbound->Bind(*timestamp_schema_, /*case_sensitive=*/true));
1670+
auto bound_pred = std::dynamic_pointer_cast<BoundPredicate>(bound);
1671+
ASSERT_NE(bound_pred, nullptr);
1672+
1673+
ICEBERG_ASSIGN_OR_THROW(auto projected, transform->Project("part", bound_pred));
1674+
ASSERT_NE(projected, nullptr);
1675+
1676+
auto unbound_projected =
1677+
internal::checked_pointer_cast<UnboundPredicateImpl<BoundReference>>(
1678+
std::move(projected));
1679+
EXPECT_EQ(unbound_projected->op(), Expression::Operation::kLtEq);
1680+
ASSERT_EQ(unbound_projected->literals().size(), 1);
1681+
int32_t val = std::get<int32_t>(unbound_projected->literals().front().value());
1682+
EXPECT_EQ(val, 0) << "Expected projected value to be 0 (fix applied), but got " << val;
1683+
}
1684+
14011685
} // namespace iceberg

src/iceberg/transform.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -177,8 +177,8 @@ class ICEBERG_EXPORT Transform : public util::Formattable {
177177
/// Projected(transform(value)) is true.
178178
/// \param name The name of the partition column.
179179
/// \param predicate The predicate to project.
180-
/// \return A Result containing either a unique pointer to the projected predicate or an
181-
/// Error if the projection fails.
180+
/// \return A Result containing either a unique pointer to the projected predicate,
181+
/// nullptr if the projection cannot be performed, or an Error if the projection fails.
182182
Result<std::unique_ptr<UnboundPredicate>> Project(
183183
std::string_view name, const std::shared_ptr<BoundPredicate>& predicate);
184184

0 commit comments

Comments
 (0)