Skip to content

Commit f845db7

Browse files
rustyconover and claude committed
feat: add strip_accents option to inflect() for diacritics removal
Adds a strip_accents option to all three inflect() variants (table function, scalar string, scalar struct) that removes diacritical marks before applying case transformations. Uses DuckDB's bundled utf8proc for accent stripping. Default behavior preserves accents for backward compatibility. Closes #4. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 5f2fa55 commit f845db7

File tree

3 files changed

+240
-23
lines changed

3 files changed

+240
-23
lines changed

docs/README.md

Lines changed: 68 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -238,9 +238,12 @@ This example inflects the keys of a `STRUCT` or table's column names to a target
238238
SELECT * FROM inflect('case_style', {'example_field': 5, 'ExampleField2': 3});
239239

240240
-- or inflect an entire result
241-
242241
SELECT * FROM inflect('case_style', (SELECT * FROM 'example.parquet'));
243242

243+
-- optionally strip accents/diacritics from names
244+
SELECT inflect('snake', 'Libellé civilité', true);
245+
SELECT inflect('snake', {"Libellé": 1}, true);
246+
FROM inflect('snake', (SELECT * FROM 'data.parquet'), strip_accents := true);
244247
```
245248

246249
**Supported case styles:**
@@ -387,6 +390,66 @@ Single-character tokens are silently ignored (minimum 2 characters).
387390
- **Snake, kebab, screaming_snake** output is unaffected since those styles don't use mixed case
388391
- **Thread-safe**: Acronym configuration uses a read-write lock for concurrent access
389392

393+
## Accent Stripping
394+
395+
By default, accented characters (é, ü, ñ, etc.) are preserved as-is during case transformations. When working with data that contains diacritics—such as French column names—you may want fully normalized ASCII output. Use the `strip_accents` option to remove diacritical marks before applying the case transformation.
396+
397+
### Scalar String
398+
399+
Pass `true` as the third argument:
400+
401+
```sql
402+
SELECT inflect('snake', 'Libellé civilité', true) as v;
403+
┌──────────────────┐
404+
│ v │
405+
│ varchar │
406+
├──────────────────┤
407+
│ libelle_civilite │
408+
└──────────────────┘
409+
```
410+
411+
### Scalar Struct
412+
413+
Field names are stripped of accents:
414+
415+
```sql
416+
SELECT inflect('snake', {"Libellé": 1, "civilité": 2}, true) as v;
417+
┌──────────────────────────────────────────┐
418+
│ v │
419+
│ struct(libelle integer, civilite integer) │
420+
├──────────────────────────────────────────┤
421+
│ {'libelle': 1, 'civilite': 2} │
422+
└──────────────────────────────────────────┘
423+
```
424+
425+
### Table Function
426+
427+
Use the `strip_accents` named parameter:
428+
429+
```sql
430+
SELECT * FROM inflect('snake', (SELECT 1 AS "Libellé"), strip_accents := true);
431+
┌─────────┐
432+
│ libelle │
433+
│ int32 │
434+
├─────────┤
435+
│ 1 │
436+
└─────────┘
437+
```
438+
439+
### Default Behavior
440+
441+
Without `strip_accents` (or with `false`), accents are preserved—this is fully backward compatible:
442+
443+
```sql
444+
SELECT inflect('snake', 'Libellé civilité') as v;
445+
┌──────────────────────┐
446+
│ v │
447+
│ varchar │
448+
├──────────────────────┤
449+
│ libellé_civilité │
450+
└──────────────────────┘
451+
```
452+
390453
## Advanced Usage
391454

392455
### Nested Struct Transformation
@@ -466,6 +529,10 @@ A: Yes, `table_case` converts to snake_case and pluralizes the name (e.g., `User
466529

467530
A: Yes! You can nest `inflect()` calls or pipe results through multiple transformations.
468531

532+
**Q: How do I handle accented/diacritical column names?**
533+
534+
A: Use the `strip_accents` option to remove diacritics before case conversion. For scalar calls, pass `true` as the third argument: `inflect('snake', 'Libellé', true)`. For the table function, use the named parameter: `FROM inflect('snake', (SELECT ...), strip_accents := true)`.
535+
469536
## Contributing
470537

471538
The Inflector extension is open source and developed by [Query.Farm](https://query.farm).

src/inflector_extension.cpp

Lines changed: 92 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,21 @@
77
#include "duckdb/main/config.hpp"
88
#include <duckdb/parser/parsed_data/create_scalar_function_info.hpp>
99
#include <duckdb/parser/parsed_data/create_table_function_info.hpp>
10+
#include <duckdb/planner/expression/bound_function_expression.hpp>
1011
#include "rust.h"
1112
#include "query_farm_telemetry.hpp"
13+
#include "utf8proc.hpp"
14+
1215
namespace duckdb {
1316

17+
static std::string StripAccentsString(const std::string &input) {
18+
auto stripped = utf8proc_remove_accents((const utf8proc_uint8_t *)input.c_str(),
19+
(utf8proc_ssize_t)input.size());
20+
std::string result((const char *)stripped);
21+
free(stripped);
22+
return result;
23+
}
24+
1425
// Generic helper for string transformations with documentation
1526
inline void RegisterInflectorTransform(ExtensionLoader &loader, const char *sql_name,
1627
char *(*cruet_func)(const char *), const char *description,
@@ -73,15 +84,19 @@ inline void RegisterInflectorPredicate(ExtensionLoader &loader, const char *sql_
7384
}
7485

7586
struct InflectBindData : public FunctionData {
76-
InflectBindData() {
87+
InflectBindData() : strip_accents(false) {
7788
}
7889

90+
bool strip_accents;
91+
7992
unique_ptr<FunctionData> Copy() const override {
80-
return make_uniq<InflectBindData>();
93+
auto copy = make_uniq<InflectBindData>();
94+
copy->strip_accents = strip_accents;
95+
return copy;
8196
}
8297
bool Equals(const FunctionData &other_p) const override {
83-
// auto &other = other_p.Cast<CollectBindData>();
84-
return true;
98+
auto &other = other_p.Cast<InflectBindData>();
99+
return strip_accents == other.strip_accents;
85100
}
86101
};
87102

@@ -119,22 +134,35 @@ static unique_ptr<FunctionData> InflectTableBind(ClientContext &context, TableFu
119134
}
120135
TransformFunc transform = it->second;
121136

137+
bool strip_accents = false;
138+
auto sa_it = input.named_parameters.find("strip_accents");
139+
if (sa_it != input.named_parameters.end()) {
140+
strip_accents = sa_it->second.GetValue<bool>();
141+
}
142+
122143
// Process each input column
123144
for (idx_t i = 0; i < input.input_table_types.size(); i++) {
124145
auto &part_name = input.input_table_names[i];
125146
auto &part_type = input.input_table_types[i];
126147

127148
return_types.push_back(part_type);
128149

129-
char *new_name = transform(part_name.c_str());
150+
std::string name_to_transform = part_name;
151+
if (strip_accents) {
152+
name_to_transform = StripAccentsString(name_to_transform);
153+
}
154+
155+
char *new_name = transform(name_to_transform.c_str());
130156
if (!new_name) {
131157
throw InternalException("Inflector transform returned null - memory allocation failed");
132158
}
133159
names.emplace_back(new_name);
134160
free_c_string(new_name);
135161
}
136162

137-
return make_uniq<InflectBindData>();
163+
auto bind_data = make_uniq<InflectBindData>();
164+
bind_data->strip_accents = strip_accents;
165+
return bind_data;
138166
}
139167

140168
static OperatorResultType InflectInOut(ExecutionContext &context, TableFunctionInput &data_p, DataChunk &input,
@@ -154,21 +182,24 @@ static OperatorFinalizeResultType InflectInOutFinalize(ExecutionContext &context
154182
}
155183

156184
struct InflectScalarBindData : public FunctionData {
157-
InflectScalarBindData(char *(*transform_func_p)(const char *)) : transform_func(transform_func_p) {
185+
InflectScalarBindData(char *(*transform_func_p)(const char *), bool strip_accents_p = false)
186+
: transform_func(transform_func_p), strip_accents(strip_accents_p) {
158187
}
159188

160189
unique_ptr<FunctionData> Copy() const override {
161-
return make_uniq<InflectScalarBindData>(transform_func);
190+
return make_uniq<InflectScalarBindData>(transform_func, strip_accents);
162191
}
163192
bool Equals(const FunctionData &other_p) const override {
164193
auto &other = other_p.Cast<InflectScalarBindData>();
165-
return transform_func == other.transform_func;
194+
return transform_func == other.transform_func && strip_accents == other.strip_accents;
166195
}
167196

168197
char *(*transform_func)(const char *);
198+
bool strip_accents;
169199
};
170200

171-
LogicalType InflectLogicalType(const LogicalType &type, TransformFunc transform, bool recursive) {
201+
LogicalType InflectLogicalType(const LogicalType &type, TransformFunc transform, bool recursive,
202+
bool strip_accents = false) {
172203
switch (type.id()) {
173204

174205
case LogicalTypeId::STRUCT: {
@@ -182,11 +213,17 @@ LogicalType InflectLogicalType(const LogicalType &type, TransformFunc transform,
182213

183214
LogicalType updated_type = subtype;
184215
if (recursive) {
185-
updated_type = InflectLogicalType(subtype, transform, false);
216+
updated_type = InflectLogicalType(subtype, transform, false, strip_accents);
217+
}
218+
219+
// Strip accents from the name before inflecting if requested
220+
std::string name_to_transform = name;
221+
if (strip_accents) {
222+
name_to_transform = StripAccentsString(name_to_transform);
186223
}
187224

188225
// Apply name inflection here
189-
auto new_name = transform(name.c_str());
226+
auto new_name = transform(name_to_transform.c_str());
190227
if (!new_name) {
191228
throw InternalException("Inflector transform returned null - memory allocation failed");
192229
}
@@ -203,7 +240,7 @@ LogicalType InflectLogicalType(const LogicalType &type, TransformFunc transform,
203240
// Recurse into element type if allowed
204241
LogicalType elem = child_type;
205242
if (recursive) {
206-
elem = InflectLogicalType(child_type, transform, true);
243+
elem = InflectLogicalType(child_type, transform, true, strip_accents);
207244
}
208245
return LogicalType::LIST(elem);
209246
}
@@ -216,8 +253,8 @@ LogicalType InflectLogicalType(const LogicalType &type, TransformFunc transform,
216253
LogicalType new_value = value_type;
217254

218255
if (recursive) {
219-
new_key = InflectLogicalType(key_type, transform, true);
220-
new_value = InflectLogicalType(value_type, transform, true);
256+
new_key = InflectLogicalType(key_type, transform, true, strip_accents);
257+
new_value = InflectLogicalType(value_type, transform, true, strip_accents);
221258
}
222259
return LogicalType::MAP(new_key, new_value);
223260
}
@@ -231,8 +268,8 @@ LogicalType InflectLogicalType(const LogicalType &type, TransformFunc transform,
231268
unique_ptr<FunctionData> InflectScalarBind(ClientContext &context, ScalarFunction &bound_function,
232269
vector<unique_ptr<Expression>> &arguments) {
233270

234-
if (arguments.size() != 2) {
235-
throw InvalidInputException("inflect() requires exactly two arguments: function name and value to inflect");
271+
if (arguments.size() < 2 || arguments.size() > 3) {
272+
throw InvalidInputException("inflect() requires 2 or 3 arguments: format, value, and optionally strip_accents");
236273
}
237274

238275
auto &arg = arguments[0];
@@ -259,21 +296,43 @@ unique_ptr<FunctionData> InflectScalarBind(ClientContext &context, ScalarFunctio
259296
}
260297
TransformFunc transform = it->second;
261298

299+
// Check for strip_accents (3rd argument)
300+
bool strip_accents = false;
301+
if (arguments.size() == 3) {
302+
auto &sa_arg = arguments[2];
303+
if (sa_arg->HasParameter()) {
304+
throw ParameterNotResolvedException();
305+
}
306+
if (!sa_arg->IsFoldable()) {
307+
throw BinderException("inflect: strip_accents argument must be constant");
308+
}
309+
strip_accents = BooleanValue::Get(ExpressionExecutor::EvaluateScalar(context, *sa_arg));
310+
}
311+
262312
// We should deal with the type here now.
263-
bound_function.return_type = InflectLogicalType(arguments[1]->return_type, transform, true);
313+
bound_function.return_type = InflectLogicalType(arguments[1]->return_type, transform, true, strip_accents);
264314

265-
return make_uniq<InflectScalarBindData>(transform);
315+
return make_uniq<InflectScalarBindData>(transform, strip_accents);
266316
}
267317

268318
void InflectStringFunc(DataChunk &args, ExpressionState &state, Vector &result) {
319+
auto &func_expr = state.expr.Cast<BoundFunctionExpression>();
320+
auto &bind_data = func_expr.bind_info->Cast<InflectScalarBindData>();
321+
bool strip_accents = bind_data.strip_accents;
322+
269323
auto &type_vector = args.data[0];
270324
auto &source = args.data[1];
271325

272326
BinaryExecutor::Execute<string_t, string_t, string_t>(
273-
type_vector, source, result, args.size(), [&result](string_t name, string_t data) -> string_t {
327+
type_vector, source, result, args.size(),
328+
[&result, strip_accents](string_t name, string_t data) -> string_t {
274329
auto function_name = name.GetString();
275330
auto value = data.GetString();
276331

332+
if (strip_accents) {
333+
value = StripAccentsString(value);
334+
}
335+
277336
auto it = transformer_map.find(function_name);
278337
if (it == transformer_map.end()) {
279338
throw InvalidInputException(
@@ -422,6 +481,7 @@ void LoadInternal(ExtensionLoader &loader) {
422481
TableFunction("inflect", {LogicalType::VARCHAR, LogicalType::TABLE}, nullptr, InflectTableBind);
423482
inflect_table_function.in_out_function = InflectInOut;
424483
inflect_table_function.in_out_function_final = InflectInOutFinalize;
484+
inflect_table_function.named_parameters["strip_accents"] = LogicalType::BOOLEAN;
425485
CreateTableFunctionInfo table_func_info(inflect_table_function);
426486
FunctionDescription table_func_desc;
427487
table_func_desc.description = "Transforms column names in query results using the specified case format";
@@ -438,13 +498,24 @@ void LoadInternal(ExtensionLoader &loader) {
438498
// Scalar functions: inflect string values or struct field names
439499
auto scalar_function_set = ScalarFunctionSet("inflect");
440500
auto inflect_string_function = ScalarFunction("inflect", {LogicalType::VARCHAR, LogicalType::VARCHAR},
441-
LogicalType::VARCHAR, InflectStringFunc);
501+
LogicalType::VARCHAR, InflectStringFunc, InflectScalarBind);
442502
scalar_function_set.AddFunction(inflect_string_function);
443503

444504
auto inflect_struct_function = ScalarFunction("inflect", {LogicalType::VARCHAR, LogicalType::ANY}, LogicalType::ANY,
445505
InflectScalarFunc, InflectScalarBind);
446506
scalar_function_set.AddFunction(inflect_struct_function);
447507

508+
// 3-argument overloads with strip_accents boolean
509+
auto inflect_string_sa_function =
510+
ScalarFunction("inflect", {LogicalType::VARCHAR, LogicalType::VARCHAR, LogicalType::BOOLEAN},
511+
LogicalType::VARCHAR, InflectStringFunc, InflectScalarBind);
512+
scalar_function_set.AddFunction(inflect_string_sa_function);
513+
514+
auto inflect_struct_sa_function =
515+
ScalarFunction("inflect", {LogicalType::VARCHAR, LogicalType::ANY, LogicalType::BOOLEAN}, LogicalType::ANY,
516+
InflectScalarFunc, InflectScalarBind);
517+
scalar_function_set.AddFunction(inflect_struct_sa_function);
518+
448519
CreateScalarFunctionInfo scalar_func_info(scalar_function_set);
449520

450521
// Description for string variant

0 commit comments

Comments
 (0)