Skip to content

Commit 258dabb

Browse files
authored
Merge pull request #272 from ClibMouse/KQL_X3_URL_Parse
KQL URL parse using x3
2 parents 764876c + c3ef4c1 commit 258dabb

File tree

7 files changed

+240
-60
lines changed

7 files changed

+240
-60
lines changed
Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
#include <Columns/ColumnString.h>
2+
#include <DataTypes/DataTypeString.h>
3+
#include <Functions/FunctionFactory.h>
4+
#include <Functions/FunctionHelpers.h>
5+
#include <Functions/IFunction.h>
6+
#include <boost/fusion/adapted/std_pair.hpp>
7+
#include <boost/spirit/home/x3.hpp>
8+
9+
#include <format>
10+
11+
namespace x3 = boost::spirit::x3;
12+
13+
namespace
14+
{
15+
using x3::char_;
16+
using x3::lexeme;
17+
using x3::lit;
18+
19+
/// Plain aggregate that collects the pieces of a URL as the grammar recognises them.
/// Every member starts out empty and stays empty when its component is absent.
struct KQLURLstate
{
    /// Ordered list of (name, value) query arguments.
    using QueryArgs = std::vector<std::pair<std::string, std::string>>;

    std::string schema; ///< text before "://"
    std::string user; ///< user name of a "user:pass@" prefix
    std::string pass; ///< password of a "user:pass@" prefix
    std::string host; ///< host name, or the text between '[' and ']'
    std::string port; ///< text after the ':' that follows the host
    std::string path; ///< path, including its leading '/'
    std::string frag; ///< text after '#'
    QueryArgs args; ///< query arguments in the order they were parsed
};
30+
31+
/// Literal delimiters of the URL grammar. lit() parsers match their text but
/// expose no attribute, so the delimiters never end up in the captured fields.
const auto endschema = lit("://");
32+
const auto colon = lit(":");
33+
const auto at = lit("@");
34+
const auto slash = lit("/");
35+
const auto equals = lit("=");
36+
const auto fragmark = lit("#");
37+
const auto openbracket = lit("[");
38+
const auto closebracket = lit("]");
39+
const auto question = lit("?");
40+
const auto ampersand = lit("&");
41+
42+
/// Character sets that terminate a component. The rules below use them in the
/// form `char_ - endX`, i.e. "any character that does not end component X".
/// A host stops at '/', ':', '?' or '#'.
const auto endhost = char_("/:?#");
43+
/// A port stops at '/', '?' or '#'.
const auto endport = char_("/?#");
44+
/// Neither the user name nor the password may contain '@', ':', '/', '?' or '#'.
const auto endauth = char_("@:/?#");
45+
/// A path stops at '?' or '#'.
const auto endpath = char_("?#");
46+
/// A query argument value stops at '#' or '&'.
const auto endarg = char_("#&");
47+
48+
/// Semantic actions: each one copies the attribute of the component that just
/// matched (_attr) into the matching field of the KQLURLstate being built (_val).
const auto set_schema = [](auto & ctx) { _val(ctx).schema = _attr(ctx); };
49+
/// The auth attribute is a two-element sequence: element 0 is the user name,
/// element 1 is the password.
const auto set_auth = [](auto & ctx)
50+
{
51+
const auto & auth = _attr(ctx);
52+
_val(ctx).user = at_c<0>(auth);
53+
_val(ctx).pass = at_c<1>(auth);
54+
};
55+
const auto set_host = [](auto & ctx) { _val(ctx).host = _attr(ctx); };
56+
const auto set_port = [](auto & ctx) { _val(ctx).port = _attr(ctx); };
57+
const auto set_path = [](auto & ctx) { _val(ctx).path = _attr(ctx); };
58+
/// The argument attribute is a two-element sequence (name, value); every match
/// appends one pair, so arguments keep the order in which they were parsed.
const auto set_arg = [](auto & ctx)
59+
{
60+
const auto & arg = _attr(ctx);
61+
_val(ctx).args.emplace_back(at_c<0>(arg), at_c<1>(arg));
62+
};
63+
const auto set_frag = [](auto & ctx) { _val(ctx).frag = _attr(ctx); };
64+
65+
/// Wraps parser `p` in an unnamed x3::rule whose attribute type is T.
/// NOTE(review): the host rule uses it presumably so that both of its
/// alternatives yield a single std::string attribute - confirm.
template <typename T>
66+
auto as = [](auto p) { return x3::rule<struct _, T>{} = as_parser(p); };
67+
68+
/// Scheme: one or more characters followed by "://".
const auto KQL_URL_SCHEMA_def = lexeme[+(char_ - endschema) >> endschema][set_schema];
69+
/// Credentials: user ':' password '@', where neither part contains a character of `endauth`.
const auto KQL_URL_AUTH_def = lexeme[+(char_ - endauth) >> colon >> +(char_ - endauth) >> at][set_auth];
70+
/// Host: either a bracketed form "[...]" (the brackets are literals and are not
/// captured) or one or more characters up to a character of `endhost`.
const auto KQL_URL_HOST_def
71+
= lexeme[as<std::string>((openbracket >> +(char_ - closebracket) >> closebracket) | (+(char_ - endhost)))][set_host];
72+
/// Port: ':' followed by one or more characters up to a character of `endport`.
/// Nothing in this rule restricts the port to digits.
const auto KQL_URL_PORT_def = lexeme[colon >> +(char_ - endport)][set_port];
73+
/// Path: `&slash` is a look-ahead, so the path must begin with '/' and that
/// slash is kept as part of the captured path; it runs up to '?' or '#'.
const auto KQL_URL_PATH_def = lexeme[&slash >> +(char_ - endpath)][set_path];
74+
/// Query argument: '?' or '&', a name running up to '=', then '=' and a value
/// running up to '#' or '&'. The name part only stops at '='.
const auto KQL_URL_ARG_def = lexeme[(question | ampersand) >> +(char_ - equals) >> equals >> +(char_ - endarg)][set_arg];
75+
/// Fragment: '#' followed by everything that remains.
const auto KQL_URL_FRAG_def = lexeme[fragmark >> +char_][set_frag];
76+
77+
/// Top-level rule producing a KQLURLstate. The scheme is mandatory; credentials,
/// host, port, path and fragment are each optional and tried in that order, with
/// zero or more query arguments between the path and the fragment.
const x3::rule<class KQLURL, KQLURLstate> KQL_URL = "KQL URL";
78+
const auto KQL_URL_def = KQL_URL_SCHEMA_def >> -KQL_URL_AUTH_def >> -KQL_URL_HOST_def >> -KQL_URL_PORT_def >> -KQL_URL_PATH_def
79+
>> *KQL_URL_ARG_def >> -KQL_URL_FRAG_def;
80+
81+
/// Binds KQL_URL to KQL_URL_def (X3 pairs a rule with its `<name>_def` definition).
BOOST_SPIRIT_DEFINE(KQL_URL);
82+
}
83+
84+
namespace DB
85+
{
86+
namespace ErrorCodes
87+
{
88+
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
89+
}
90+
91+
class FunctionKqlParseURL : public IFunction
92+
{
93+
public:
94+
static constexpr auto name = "kql_parseurl";
95+
static FunctionPtr create(ContextPtr context) { return std::make_shared<FunctionKqlParseURL>(std::move(context)); }
96+
97+
explicit FunctionKqlParseURL(ContextPtr context_) : context(std::move(context_)) { }
98+
~FunctionKqlParseURL() override = default;
99+
100+
ColumnPtr
101+
executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override;
102+
String getName() const override { return name; }
103+
size_t getNumberOfArguments() const override { return 1; }
104+
DataTypePtr getReturnTypeImpl(const DataTypes &) const override { return std::make_shared<DataTypeString>(); }
105+
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; }
106+
107+
private:
108+
ContextPtr context;
109+
};
110+
111+
namespace
{
/// Returns `value` prepared for use inside a JSON string literal (the surrounding
/// quotes are not added): '"' and '\' get a backslash prefix and control
/// characters below 0x20 are written as \u00XX. Without this, a URL containing
/// any of those characters would make the produced document invalid JSON.
std::string escapeForJSON(const std::string & value)
{
    std::string escaped;
    escaped.reserve(value.size());
    for (const char ch : value)
    {
        const auto code = static_cast<unsigned char>(ch);
        if (ch == '"' || ch == '\\')
        {
            escaped.push_back('\\');
            escaped.push_back(ch);
        }
        else if (code < 0x20)
            escaped.append(std::format("\\u{:04x}", static_cast<unsigned>(code)));
        else
            escaped.push_back(ch);
    }
    return escaped;
}
}

/// Parses every row of the single String/FixedString argument with the KQL_URL
/// grammar and returns a String column holding one JSON object per row with the
/// keys "Scheme", "Host", "Port", "Path", "Username", "Password",
/// "Query Parameters" (a nested object) and "Fragment".
///
/// Throws ILLEGAL_TYPE_OF_ARGUMENT when the argument is not String or FixedString.
ColumnPtr
FunctionKqlParseURL::executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, const size_t input_rows_count) const
{
    /// Validate before allocating anything; report the type that was actually passed.
    if (!isStringOrFixedString(arguments.at(0).type))
        throw Exception(
            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
            "parse_url(): argument #1 - invalid data type: {}, expected string",
            arguments[0].type->getName());

    auto result = ColumnString::create();
    result->reserve(input_rows_count);

    for (size_t row = 0; row < input_rows_count; ++row)
    {
        const auto in_str = arguments[0].column->getDataAt(row).toView();

        /// The return value of parse() is not checked: the semantic actions fill
        /// `url` as components match, so input the grammar cannot fully consume
        /// still reports the parts recognised so far, and unmatched parts stay empty.
        KQLURLstate url;
        parse(in_str.begin(), in_str.end(), KQL_URL, url);

        std::string query_parameters = "{";
        bool need_separator = false;
        for (const auto & [key, value] : url.args)
        {
            if (need_separator)
                query_parameters.push_back(',');
            query_parameters.append(std::format("\"{}\":\"{}\"", escapeForJSON(key), escapeForJSON(value)));
            need_separator = true;
        }
        query_parameters.push_back('}');

        const auto out_str = std::format(
            R"({{"Scheme":"{}","Host":"{}","Port":"{}","Path":"{}","Username":"{}","Password":"{}","Query Parameters":{},"Fragment":"{}"}})",
            escapeForJSON(url.schema),
            escapeForJSON(url.host),
            escapeForJSON(url.port),
            escapeForJSON(url.path),
            escapeForJSON(url.user),
            escapeForJSON(url.pass),
            query_parameters,
            escapeForJSON(url.frag));
        result->insertData(out_str.data(), out_str.size());
    }
    return result;
}
149+
150+
/// Registers FunctionKqlParseURL with the function factory.
REGISTER_FUNCTION(KqlParseURL)
151+
{
152+
factory.registerFunction<FunctionKqlParseURL>();
153+
}
154+
}

src/Parsers/Kusto/KQL_ReleaseNote.md

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,41 @@
1-
## KQL implemented features
1+
## KQL implemented features
22
# April XX, 2023
33
## Bugfixes
4+
- Corrected an issue with parse_url in which hostnames and port numbers were not correctly parsed.
5+
```
6+
parse_url follows the following structure.
7+
8+
Scheme://Username:Password@Host:Port/Path?QueryParameters#Fragment
9+
10+
'://' is required for further parsing.
11+
All other fields are optional and are parsed from left to right.
12+
Username and Password are parsed together, require ':' and '@', and will not match if either contains '/', '?', or '#'.
13+
IPv6 addresses are required to be encapsulated in brackets.
14+
Host ends with '/', ':', '?' or '#'.
15+
Port starts with ':' and ends with '/', '?' or '#'.
16+
Path must start with '/' and ends with '?' or '#'.
17+
Query Parameters is recursive, starts with '?', ends with '#', expected to be in the form of argument=value, and separated by '&'.
18+
Fragment must start with '#'.
19+
20+
Notes on differences between ADX and ClickHouse:
21+
22+
ClickHouse will return a formatted string. 'extract_json' can be used to convert the string.
23+
print x = parse_url("http://[2001:0db8:0000:0000:0000:ff00:0042:8329]?asd=qwe&qwe=asd") | project extract_json("$.Scheme", x);
24+
ClickHouse includes Path as '/' where ADX requires anything after '/' to populate Path.
25+
print parse_url("http://host:1234/");
26+
ClickHouse includes Port where ADX requires '/' for Port and without '/' will treat Port as part of Host.
27+
print parse_url("http://host:1234?arg=value")
28+
ClickHouse includes arg value in Query parameters where ADX treats this as host.
29+
print parse_url("http://?arg=value");
30+
ClickHouse will not parse IPv6 addresses not encapsulated in brackets [RFC 3986](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.2)
31+
Correct IPv6
32+
print parse_url("http://[2001:db8:3333:4444:5555:6666:7777:8888]:1234/filepath/index.htm")
33+
Incorrect IPv6
34+
print parse_url("http://2001:db8:3333:4444:5555:6666:7777:8888:1234/filepath/index.htm");
35+
print parse_url("http://2001:db8:3333:4444:5555:6666:7777:8888/filepath/index.htm");
36+
ADX will incorrectly consume part of encapsulated IPv6 Host as Port from last colon to '/'.
37+
print parse_url("http://[2001:db8:3333:4444:5555:6666:7777:8888]/filepath/index.htm")
38+
```
439
- [arg_max()](https://learn.microsoft.com/en-us/azure/data-explorer/kusto/query/arg-max-aggfunction) and [arg_min()](https://learn.microsoft.com/en-us/azure/data-explorer/kusto/query/arg-min-aggfunction)
540
support multiple arguments now.
641
`Customers | arg_max(Age, FirstName, LastName)`
@@ -37,7 +72,6 @@
3772
select * from kql(TableWithVariousDataTypes | project JoinDate | where JoinDate between (datetime('2020-06-30') .. datetime('2025-06-30')));
3873
select * from kql(TableWithVariousDataTypes | project JoinDate | where JoinDate !between (datetime('2020-06-30') .. datetime('2025-06-30')));
3974
```
40-
4175
# March 15, 2023
4276
## Feature
4377
- KQL - improve timespan textual representation in the CLI

src/Parsers/Kusto/KustoFunctions/KQLStringFunctions.cpp

Lines changed: 1 addition & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -419,55 +419,7 @@ bool ParseJson::convertImpl(String & out, IParser::Pos & pos)
419419

420420
bool ParseURL::convertImpl(String & out, IParser::Pos & pos)
421421
{
422-
const String fn_name = getKQLFunctionName(pos);
423-
if (fn_name.empty())
424-
return false;
425-
426-
++pos;
427-
const String url = getConvertedArgument(fn_name, pos);
428-
429-
const String scheme = std::format(R"(concat('"Scheme":"', protocol({0}),'"'))", url);
430-
const String host = std::format(R"(concat('"Host":"', domain({0}),'"'))", url);
431-
String port = std::format(R"(concat('"Port":"', toString(port({0})),'"'))", url);
432-
const String path = std::format(R"(concat('"Path":"', path({0}),'"'))", url);
433-
const String username_pwd = std::format("netloc({0})", url);
434-
const String query_string = std::format("queryString({0})", url);
435-
const String fragment = std::format(R"(concat('"Fragment":"',fragment({0}),'"'))", url);
436-
const String username = std::format(
437-
R"(concat('"Username":"', arrayElement(splitByChar(':',arrayElement(splitByChar('@',{0}) ,1)),1),'"'))", username_pwd);
438-
const String password = std::format(
439-
R"(concat('"Password":"', arrayElement(splitByChar(':',arrayElement(splitByChar('@',{0}) ,1)),2),'"'))", username_pwd);
440-
String query_parameters
441-
= std::format(R"(concat('"Query Parameters":', concat('{{"', replace(replace({}, '=', '":"'),'&','","') ,'"}}')))", query_string);
442-
443-
bool all_space = true;
444-
for (char ch : url)
445-
{
446-
if (ch == '\'' || ch == '\"')
447-
continue;
448-
if (ch != ' ')
449-
{
450-
all_space = false;
451-
break;
452-
}
453-
}
454-
455-
if (all_space)
456-
{
457-
port = R"('"Port":""')";
458-
query_parameters = "'\"Query Parameters\":{}'";
459-
}
460-
out = std::format(
461-
"concat('{{',{},',',{},',',{},',',{},',',{},',',{},',',{},',',{},'}}')",
462-
scheme,
463-
host,
464-
port,
465-
path,
466-
username,
467-
password,
468-
query_parameters,
469-
fragment);
470-
return true;
422+
return directMapping(out, pos, "kql_parseurl");
471423
}
472424

473425
bool ParseURLQuery::convertImpl(String & out, IParser::Pos & pos)

src/Parsers/tests/KQL/gtest_KQL_Operators.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -250,10 +250,6 @@ INSTANTIATE_TEST_SUITE_P(ParserKQLQuery_Operators, ParserTest,
250250
"print strcmp('a','b')",
251251
"SELECT multiIf('a' = 'b', 0, 'a' < 'b', -1, 1) AS print_0"
252252
},
253-
{
254-
"print parse_url('https://john:[email protected]:1234/this/is/a/path?k1=v1&k2=v2#fragment')",
255-
"SELECT concat('{', concat('\"Scheme\":\"', protocol('https://john:[email protected]:1234/this/is/a/path?k1=v1&k2=v2#fragment'), '\"'), ',', concat('\"Host\":\"', domain('https://john:[email protected]:1234/this/is/a/path?k1=v1&k2=v2#fragment'), '\"'), ',', concat('\"Port\":\"', toString(port('https://john:[email protected]:1234/this/is/a/path?k1=v1&k2=v2#fragment')), '\"'), ',', concat('\"Path\":\"', path('https://john:[email protected]:1234/this/is/a/path?k1=v1&k2=v2#fragment'), '\"'), ',', concat('\"Username\":\"', splitByChar(':', splitByChar('@', netloc('https://john:[email protected]:1234/this/is/a/path?k1=v1&k2=v2#fragment'))[1])[1], '\"'), ',', concat('\"Password\":\"', splitByChar(':', splitByChar('@', netloc('https://john:[email protected]:1234/this/is/a/path?k1=v1&k2=v2#fragment'))[1])[2], '\"'), ',', concat('\"Query Parameters\":', concat('{\"', replace(replace(queryString('https://john:[email protected]:1234/this/is/a/path?k1=v1&k2=v2#fragment'), '=', '\":\"'), '&', '\",\"'), '\"}')), ',', concat('\"Fragment\":\"', fragment('https://john:[email protected]:1234/this/is/a/path?k1=v1&k2=v2#fragment'), '\"'), '}') AS print_0"
256-
},
257253
{
258254
"Customers | summarize t = make_list(FirstName) by FirstName",
259255
"SELECT\n FirstName,\n groupArrayIf(FirstName, FirstName IS NOT NULL) AS t\nFROM Customers\nGROUP BY FirstName"

src/Parsers/tests/KQL/gtest_KQL_StringFunctions.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,10 @@ INSTANTIATE_TEST_SUITE_P(ParserKQLQuery_String, ParserTest,
241241
{
242242
"print new_guid()",
243243
"SELECT generateUUIDv4() AS print_0"
244+
},
245+
{
246+
"print parse_url('https://john:[email protected]:1234/this/is/a/path?k1=v1&k2=v2#fragment')",
247+
"SELECT kql_parseurl('https://john:[email protected]:1234/this/is/a/path?k1=v1&k2=v2#fragment') AS print_0",
244248
},
245249
{
246250
"print str = make_string(dynamic([75, 117, 115, 116, 111]))",

tests/queries/0_stateless/02366_kql_func_string.reference

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -302,10 +302,29 @@ S3VzdG8x
302302

303303
Kusto1
304304
\N
305-
-- parse_url()
306-
{"Scheme":"scheme","Host":"","Port":"0","Path":"/this/is/a/path","Username":"username","Password":"password","Query Parameters":{"k1":"v1","k2":"v2"},"Fragment":"fragment"}
307-
-- parse_url()
305+
-- parse_url() same as ADX
306+
{"Scheme":"scheme","Host":"host","Port":"1234","Path":"/this/is/a/path","Username":"username","Password":"password","Query Parameters":{"k1":"v1","k2":"v2"},"Fragment":"fragment"}
308307
{"Scheme":"","Host":"","Port":"","Path":"","Username":"","Password":"","Query Parameters":{},"Fragment":""}
308+
{"Scheme":"http","Host":"2001:db8:3333:4444:5555:6666:7777:8888","Port":"1234","Path":"/filepath/index.htm","Username":"","Password":"","Query Parameters":{},"Fragment":""}
309+
{"Scheme":"http","Host":"host","Port":"1234","Path":"","Username":"","Password":"","Query Parameters":{},"Fragment":""}
310+
{"Scheme":"http","Host":"","Port":"","Path":"/this/is/a/path/index.htm","Username":"","Password":"","Query Parameters":{},"Fragment":""}
311+
{"Scheme":"http","Host":"","Port":"","Path":"","Username":"","Password":"","Query Parameters":{},"Fragment":"fragment"}
312+
{"Scheme":"http","Host":"host","Port":"abcd","Path":"","Username":"","Password":"","Query Parameters":{},"Fragment":""}
313+
{"Scheme":"http","Host":"host","Port":"","Path":"/filepath","Username":"","Password":"","Query Parameters":{"arg":":bogus@some"},"Fragment":""}
314+
{"Scheme":"http","Host":"","Port":"","Path":"","Username":"username","Password":"password","Query Parameters":{},"Fragment":""}
315+
-- parse_url() differs from ADX
316+
{"Scheme":"http","Host":"host","Port":"1234","Path":"/","Username":"","Password":"","Query Parameters":{},"Fragment":""}
317+
{"Scheme":"http","Host":"","Port":"1234","Path":"/","Username":"","Password":"","Query Parameters":{},"Fragment":""}
318+
{"Scheme":"http","Host":"","Port":"","Path":"","Username":"","Password":"","Query Parameters":{"arg":"value"},"Fragment":""}
319+
{"Scheme":"http","Host":"host","Port":"1234","Path":"","Username":"","Password":"","Query Parameters":{"arg":"value"},"Fragment":""}
320+
{"Scheme":"http","Host":"","Port":"","Path":"/","Username":"","Password":"","Query Parameters":{},"Fragment":""}
321+
{"Scheme":"http","Host":"","Port":"","Path":"/filepath","Username":"","Password":"","Query Parameters":{},"Fragment":""}
322+
{"Scheme":"http","Host":"","Port":"port:","Path":"/anything","Username":"","Password":"","Query Parameters":{"arg":"value"},"Fragment":""}
323+
{"Scheme":"http","Host":"","Port":"port:port","Path":"/anything","Username":"","Password":"","Query Parameters":{"arg":"value"},"Fragment":""}
324+
{"Scheme":"http","Host":"host","Port":"","Path":"/","Username":"","Password":"","Query Parameters":{},"Fragment":""}
325+
-- parse_url() invalid IPV6
326+
{"Scheme":"http","Host":"2001","Port":"db8:3333:4444:5555:6666:7777:8888:1234","Path":"/filepath/index.htm","Username":"","Password":"","Query Parameters":{},"Fragment":""}
327+
{"Scheme":"http","Host":"2001","Port":"db8:3333:4444:5555:6666:7777:8888","Path":"/filepath/index.htm","Username":"","Password":"","Query Parameters":{},"Fragment":""}
309328
-- parse_urlquery()
310329
{"Query Parameters":{"k1":"v1","k2":"v2","k3":"v3"}}
311330
-- strcat --

tests/queries/0_stateless/02366_kql_func_string.sql

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -284,10 +284,31 @@ print '-- base64_decode_tostring';
284284
print base64_decode_tostring('');
285285
print base64_decode_tostring('S3VzdG8x');
286286
print base64_decode_tostring('S3VzdG8====');
287-
print '-- parse_url()';
287+
print '-- parse_url() same as ADX';
288288
print parse_url('scheme://username:password@host:1234/this/is/a/path?k1=v1&k2=v2#fragment');
289-
print '-- parse_url()';
290289
print parse_url('');
290+
print parse_url("http://[2001:db8:3333:4444:5555:6666:7777:8888]:1234/filepath/index.htm")
291+
print parse_url("http://host");
292+
print parse_url("http://host:1234");
293+
print parse_url("http:///this/is/a/path/index.htm");
294+
print parse_url("http://#fragment");
295+
print parse_url("http://host:abcd");
296+
print parse_url('http://host/filepath?arg=:bogus@some');
297+
print parse_url("http://username:password@");
298+
print parse_url(1); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
299+
print '-- parse_url() differs from ADX';
300+
print parse_url("http://host:1234/");
301+
print parse_url("http://:1234/");
302+
print parse_url("http://?arg=value");
303+
print parse_url("http://host:1234?arg=value");
304+
print parse_url("http:///");
305+
print parse_url("http:///filepath");
306+
print parse_url("http://:port:/anything?arg=value");
307+
print parse_url("http://:port:port/anything?arg=value");
308+
print parse_url("http://host/");
309+
print '-- parse_url() invalid IPV6';
310+
print parse_url("http://2001:db8:3333:4444:5555:6666:7777:8888:1234/filepath/index.htm");
311+
print parse_url("http://2001:db8:3333:4444:5555:6666:7777:8888/filepath/index.htm");
291312
print '-- parse_urlquery()';
292313
print parse_urlquery('k1=v1&k2=v2&k3=v3');
293314
print '-- strcat --';

0 commit comments

Comments
 (0)