Skip to content

Commit d260740

Browse files
committed
fix: replace brittle regex parsing with type-safe Transform::param() API
Address critical code review feedback by removing regex-based width extraction. Changes: - Added Transform::param() public getter returning std::optional<int32_t> - Updated truncate optimization to use param() instead of ToString() + regex - Removed <regex> include (no longer needed) - More robust: immune to ToString() format changes - Better performance: no regex compilation or string parsing - Type-safe: compile-time checked, no runtime parsing errors This fixes the most critical issue from code review - the brittle dependency on Transform::ToString() string format. The new API is production-grade and follows C++ best practices for accessing configuration parameters.
1 parent a777ab7 commit d260740

File tree

3 files changed

+53
-42
lines changed

3 files changed

+53
-42
lines changed

src/iceberg/expression/predicate.cc

Lines changed: 36 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121

2222
#include <algorithm>
2323
#include <format>
24-
#include <regex>
2524

2625
#include "iceberg/exception.h"
2726
#include "iceberg/expression/expressions.h"
@@ -263,50 +262,47 @@ Result<std::shared_ptr<Expression>> UnboundPredicate<B>::BindLiteralOperation(
263262
if (BASE::op() == Expression::Operation::kEq &&
264263
bound_term->kind() == Term::Kind::kTransform) {
265264
// Use checked_cast for fail-fast debug behavior
266-
auto* transform_term =
267-
internal::checked_cast<BoundTransform*>(bound_term.get());
265+
auto* transform_term = internal::checked_cast<BoundTransform*>(bound_term.get());
268266

269267
if (transform_term->transform()->transform_type() == TransformType::kTruncate &&
270268
literal.type()->type_id() == TypeId::kString &&
271269
!literal.IsNull()) { // Null safety: skip null literals
272270

273-
// TODO: Avoid ToString/regex parsing once Transform API exposes width directly
274-
// (e.g., TruncateTransform::width() getter would be cleaner and faster)
275-
// Extract width from transform string (format: "truncate[width]")
276-
std::string transform_str = transform_term->transform()->ToString();
277-
278-
// Static regex to avoid recompilation on each bind (micro-optimization)
279-
static const std::regex width_regex(R"(truncate\[(\d+)\])");
280-
std::smatch match;
281-
282-
if (std::regex_match(transform_str, match, width_regex)) {
283-
int32_t truncate_width = std::stoi(match[1].str());
284-
285-
// Skip width=0: truncate(col, 0) == "" would rewrite to STARTS_WITH("")
286-
// which is tautologically true and could accidentally broaden filters
287-
if (truncate_width == 0) {
288-
// Don't optimize; let the normal predicate handle this edge case
289-
return std::make_shared<BoundLiteralPredicate>(
290-
BASE::op(), std::move(bound_term), std::move(literal));
291-
}
292-
293-
auto& string_value = std::get<std::string>(literal.value());
294-
295-
// Count UTF-8 code points (not bytes!)
296-
// Truncate uses code points: "José" has 4 code points but 5 bytes
297-
int32_t code_point_count = CountUTF8CodePoints(string_value);
298-
299-
// Only optimize if literal code point count equals truncate width
300-
// Example: truncate(col, 5) == "Alice" (5 code points) can be optimized
301-
// truncate(col, 10) == "abc" (3 code points) CANNOT
302-
// truncate(col, 4) == "José" (4 code points, 5 bytes) CAN be optimized
303-
if (code_point_count == truncate_width) {
304-
// Rewrite: truncate(col, width) == "value" → col STARTS_WITH "value"
305-
// This benefits from strict metrics evaluation for startsWith in manifest filtering
306-
return std::make_shared<BoundLiteralPredicate>(
307-
Expression::Operation::kStartsWith, transform_term->reference(),
308-
std::move(literal));
309-
}
271+
// Extract width parameter using type-safe API
272+
auto width_opt = transform_term->transform()->param();
273+
if (!width_opt) {
274+
// Should never happen for truncate, but be defensive
275+
return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term),
276+
std::move(literal));
277+
}
278+
279+
int32_t truncate_width = *width_opt;
280+
281+
// Skip width=0: truncate(col, 0) == "" would rewrite to STARTS_WITH("")
282+
// which is tautologically true and could accidentally broaden filters
283+
// (Note: Transform::Truncate already validates width > 0, but defensive check)
284+
if (truncate_width == 0) {
285+
return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term),
286+
std::move(literal));
287+
}
288+
289+
auto& string_value = std::get<std::string>(literal.value());
290+
291+
// Count UTF-8 code points (not bytes!)
292+
// Truncate uses code points: "José" has 4 code points but 5 bytes
293+
int32_t code_point_count = CountUTF8CodePoints(string_value);
294+
295+
// Only optimize if literal code point count equals truncate width
296+
// Example: truncate(col, 5) == "Alice" (5 code points) can be optimized
297+
// truncate(col, 10) == "abc" (3 code points) CANNOT
298+
// truncate(col, 4) == "José" (4 code points, 5 bytes) CAN be optimized
299+
if (code_point_count == truncate_width) {
300+
// Rewrite: truncate(col, width) == "value" → col STARTS_WITH "value"
301+
// This benefits from strict metrics evaluation for startsWith in manifest
302+
// filtering
303+
return std::make_shared<BoundLiteralPredicate>(Expression::Operation::kStartsWith,
304+
transform_term->reference(),
305+
std::move(literal));
310306
}
311307
}
312308
}

src/iceberg/test/predicate_test.cc

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,9 @@
1717
* under the License.
1818
*/
1919

20-
#include "iceberg/expression/expressions.h"
2120
#include "iceberg/expression/predicate.h"
21+
22+
#include "iceberg/expression/expressions.h"
2223
#include "iceberg/schema.h"
2324
#include "iceberg/test/matchers.h"
2425
#include "iceberg/type.h"
@@ -502,7 +503,8 @@ TEST_F(PredicateTest, TruncateOptimizationNotAppliedForNonString) {
502503
ASSERT_THAT(bound_result, IsOk());
503504
auto bound_pred = bound_result.value();
504505

505-
// Should remain as kEq, not converted to STARTS_WITH (binary doesn't support startsWith)
506+
// Should remain as kEq, not converted to STARTS_WITH (binary doesn't support
507+
// startsWith)
506508
EXPECT_EQ(bound_pred->op(), Expression::Operation::kEq);
507509

508510
// The term should still be a transform

src/iceberg/transform.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,19 @@ class ICEBERG_EXPORT Transform : public util::Formattable {
141141
/// \brief Returns the transform type.
142142
TransformType transform_type() const;
143143

144+
/// \brief Returns the optional parameter for parameterized transforms.
145+
///
146+
/// For transforms like bucket(N) or truncate(W), returns the parameter value.
147+
/// For non-parameterized transforms (identity, year, etc.), returns std::nullopt.
148+
///
149+
/// \return The parameter if present, otherwise std::nullopt
150+
std::optional<int32_t> param() const {
151+
if (auto* p = std::get_if<int32_t>(&param_)) {
152+
return *p;
153+
}
154+
return std::nullopt;
155+
}
156+
144157
/// \brief Binds this transform to a source type, returning a typed TransformFunction.
145158
///
146159
/// This creates a concrete transform implementation based on the transform type and

0 commit comments

Comments
 (0)