Skip to content

Commit 30c72a1

Browse files
authored
Add split UDF func to ease certain string handling (#2039)
Summary: Add split UDF func to ease certain string handling. This is functionality that I'm planning to use for my upcoming Kubecon demo/talk. For this use case, I want to access the individual fields of the XFCC header just like the example in the UDF docstring. Relevant Issues: N/A. Type of change: /kind feature. Test Plan: New tests pass. Changelog Message: Added `px.split` function to support parsing strings that contain delimiters. --------- Signed-off-by: Dom Del Nano <[email protected]>
1 parent 3c41d55 commit 30c72a1

File tree

3 files changed

+47
-0
lines changed

3 files changed

+47
-0
lines changed

src/carnot/funcs/builtins/json_ops.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ void RegisterJSONOpsOrDie(udf::Registry* registry) {
3131
registry->RegisterOrDie<PluckAsInt64UDF>("pluck_int64");
3232
registry->RegisterOrDie<PluckAsFloat64UDF>("pluck_float64");
3333
registry->RegisterOrDie<PluckArrayUDF>("pluck_array");
34+
registry->RegisterOrDie<SplitUDF>("split");
3435

3536
// Up to 8 script args are supported for the _script_reference UDF, due to the lack of support for
3637
// variadic UDF arguments in the UDF registry today. We should clean this up if/when variadic UDF

src/carnot/funcs/builtins/json_ops.h

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,42 @@ class PluckArrayUDF : public udf::ScalarUDF {
228228
}
229229
};
230230

231+
class SplitUDF : public udf::ScalarUDF {
232+
public:
233+
StringValue Exec(FunctionContext*, StringValue in, StringValue delimiter) {
234+
rapidjson::StringBuffer sb;
235+
rapidjson::Writer<rapidjson::StringBuffer> writer(sb);
236+
writer.StartArray();
237+
238+
for (absl::string_view part : absl::StrSplit(in.data(), delimiter.data())) {
239+
writer.String(part.data(), part.size());
240+
}
241+
242+
writer.EndArray();
243+
return sb.GetString();
244+
}
245+
246+
static udf::ScalarUDFDocBuilder Doc() {
247+
return udf::ScalarUDFDocBuilder(
248+
"Splits a string by a delimiter and a returns JSON encoded array of strings.")
249+
.Details(
250+
"This function splits a string by a delimiter and returns a JSON encoded array of "
251+
"strings. The function is useful for splitting strings and then passing the result to "
252+
"px.pluck_array in order to access individual values of a delimited string.")
253+
.Example(R"doc(
254+
| df = px.DataFrame('http_events', start_time='-5m')
255+
| # Returns By=http://frontend.px.dev;URI=http://testclient.px.dev
256+
| df.xfcc_hdr = px.pluck(df.req_headers, 'X-Forwarded-Client-Cert')
257+
| df.xfcc_parts = px.split(df.xfcc_hdr, ';')
258+
| df.by = px.pluck_array(df.xfcc_hdr, 0) # Returns "By=http://frontend.px.dev"
259+
| df.uri = px.pluck_array(df.xfcc_hdr, 1) # Returns "URI=http://testclient.px.dev"
260+
)doc")
261+
.Arg("input_str", "The string to split.")
262+
.Arg("delimiter", "The string value to split the input string.")
263+
.Returns("A JSON encoded array of the split strings.");
264+
}
265+
};
266+
231267
/**
232268
DocString intentionally omitted, this is a non-public function.
233269
This function creates a custom deep link by creating a "script reference" from a label,

src/carnot/funcs/builtins/json_ops_test.cc

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,16 @@ TEST(JSONOps, PluckArrayUDF_index_out_of_bound) {
111111
udf_tester.ForInput(kTestJSONArray, 3).Expect("");
112112
}
113113

114+
// When the delimiter occurs in the input, each delimited piece becomes its own element of the
// returned JSON array.
TEST(JSONOps, SplitUDF_with_present_delimiter) {
  auto split_tester = udf::UDFTester<SplitUDF>();
  split_tester.ForInput("foo,bar,baz", ",").Expect(R"(["foo","bar","baz"])");
}
118+
119+
// When the delimiter never occurs in the input, the whole input is returned as the single
// element of the JSON array.
TEST(JSONOps, SplitUDF_with_missing_delimiter) {
  auto split_tester = udf::UDFTester<SplitUDF>();
  split_tester.ForInput("foo,bar,baz", ";").Expect(R"(["foo,bar,baz"])");
}
123+
114124
TEST(JSONOps, ScriptReferenceUDF_no_args) {
115125
auto udf_tester = udf::UDFTester<ScriptReferenceUDF<>>();
116126
auto res = udf_tester.ForInput("text", "px/script").Result();

0 commit comments

Comments
 (0)