Skip to content

Commit e5d7bb9

Browse files
committed
fix: replace brittle regex parsing with type-safe Transform::param() API
Address critical code review feedback by removing regex-based width extraction. Changes: - Added Transform::param() public getter returning std::optional<int32_t> - Updated truncate optimization to use param() instead of ToString() + regex - Removed <regex> include (no longer needed) - More robust: immune to ToString() format changes - Better performance: no regex compilation or string parsing - Type-safe: compile-time checked, no runtime parsing errors This fixes the most critical issue from code review - the brittle dependency on Transform::ToString() string format. The new API is production-grade and follows C++ best practices for accessing configuration parameters.
1 parent a777ab7 commit e5d7bb9

File tree

2 files changed

+47
-38
lines changed

2 files changed

+47
-38
lines changed

src/iceberg/expression/predicate.cc

Lines changed: 34 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121

2222
#include <algorithm>
2323
#include <format>
24-
#include <regex>
2524

2625
#include "iceberg/exception.h"
2726
#include "iceberg/expression/expressions.h"
@@ -270,43 +269,40 @@ Result<std::shared_ptr<Expression>> UnboundPredicate<B>::BindLiteralOperation(
270269
literal.type()->type_id() == TypeId::kString &&
271270
!literal.IsNull()) { // Null safety: skip null literals
272271

273-
// TODO: Avoid ToString/regex parsing once Transform API exposes width directly
274-
// (e.g., TruncateTransform::width() getter would be cleaner and faster)
275-
// Extract width from transform string (format: "truncate[width]")
276-
std::string transform_str = transform_term->transform()->ToString();
277-
278-
// Static regex to avoid recompilation on each bind (micro-optimization)
279-
static const std::regex width_regex(R"(truncate\[(\d+)\])");
280-
std::smatch match;
281-
282-
if (std::regex_match(transform_str, match, width_regex)) {
283-
int32_t truncate_width = std::stoi(match[1].str());
284-
285-
// Skip width=0: truncate(col, 0) == "" would rewrite to STARTS_WITH("")
286-
// which is tautologically true and could accidentally broaden filters
287-
if (truncate_width == 0) {
288-
// Don't optimize; let the normal predicate handle this edge case
289-
return std::make_shared<BoundLiteralPredicate>(
290-
BASE::op(), std::move(bound_term), std::move(literal));
291-
}
292-
293-
auto& string_value = std::get<std::string>(literal.value());
294-
295-
// Count UTF-8 code points (not bytes!)
296-
// Truncate uses code points: "José" has 4 code points but 5 bytes
297-
int32_t code_point_count = CountUTF8CodePoints(string_value);
298-
299-
// Only optimize if literal code point count equals truncate width
300-
// Example: truncate(col, 5) == "Alice" (5 code points) can be optimized
301-
// truncate(col, 10) == "abc" (3 code points) CANNOT
302-
// truncate(col, 4) == "José" (4 code points, 5 bytes) CAN be optimized
303-
if (code_point_count == truncate_width) {
304-
// Rewrite: truncate(col, width) == "value" → col STARTS_WITH "value"
305-
// This benefits from strict metrics evaluation for startsWith in manifest filtering
306-
return std::make_shared<BoundLiteralPredicate>(
307-
Expression::Operation::kStartsWith, transform_term->reference(),
308-
std::move(literal));
309-
}
272+
// Extract width parameter using type-safe API
273+
auto width_opt = transform_term->transform()->param();
274+
if (!width_opt) {
275+
// Should never happen for truncate, but be defensive
276+
return std::make_shared<BoundLiteralPredicate>(
277+
BASE::op(), std::move(bound_term), std::move(literal));
278+
}
279+
280+
int32_t truncate_width = *width_opt;
281+
282+
// Skip width=0: truncate(col, 0) == "" would rewrite to STARTS_WITH("")
283+
// which is tautologically true and could accidentally broaden filters
284+
// (Note: Transform::Truncate already validates width > 0, but defensive check)
285+
if (truncate_width == 0) {
286+
return std::make_shared<BoundLiteralPredicate>(
287+
BASE::op(), std::move(bound_term), std::move(literal));
288+
}
289+
290+
auto& string_value = std::get<std::string>(literal.value());
291+
292+
// Count UTF-8 code points (not bytes!)
293+
// Truncate uses code points: "José" has 4 code points but 5 bytes
294+
int32_t code_point_count = CountUTF8CodePoints(string_value);
295+
296+
// Only optimize if literal code point count equals truncate width
297+
// Example: truncate(col, 5) == "Alice" (5 code points) can be optimized
298+
// truncate(col, 10) == "abc" (3 code points) CANNOT
299+
// truncate(col, 4) == "José" (4 code points, 5 bytes) CAN be optimized
300+
if (code_point_count == truncate_width) {
301+
// Rewrite: truncate(col, width) == "value" → col STARTS_WITH "value"
302+
// This benefits from strict metrics evaluation for startsWith in manifest filtering
303+
return std::make_shared<BoundLiteralPredicate>(
304+
Expression::Operation::kStartsWith, transform_term->reference(),
305+
std::move(literal));
310306
}
311307
}
312308
}

src/iceberg/transform.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,19 @@ class ICEBERG_EXPORT Transform : public util::Formattable {
141141
/// \brief Returns the transform type.
142142
TransformType transform_type() const;
143143

144+
/// \brief Returns the optional parameter for parameterized transforms.
145+
///
146+
/// For transforms like bucket(N) or truncate(W), returns the parameter value.
147+
/// For non-parameterized transforms (identity, year, etc.), returns std::nullopt.
148+
///
149+
/// \return The parameter if present, otherwise std::nullopt
150+
std::optional<int32_t> param() const {
151+
if (auto* p = std::get_if<int32_t>(&param_)) {
152+
return *p;
153+
}
154+
return std::nullopt;
155+
}
156+
144157
/// \brief Binds this transform to a source type, returning a typed TransformFunction.
145158
///
146159
/// This creates a concrete transform implementation based on the transform type and

0 commit comments

Comments
 (0)