Skip to content

Commit 19e2a37

Browse files
feat: Add ParseKVLax for Flexible Key-Value Parsing (#4007)
* feat: improve ParseKV function to handle unquoted values with spaces
  - Replace regex-based parsing with scanner approach for better handling of complex key-value pairs
  - Add support for unquoted values containing spaces (e.g., UNIFIhost=Express 7)
  - Maintain backward compatibility with existing quoted and simple unquoted values
  - Add robust filtering to prevent false positives from invalid key patterns
  - Improve quote handling and escaping for quoted values
  - Add comprehensive test cases covering edge cases and mixed scenarios

  Fixes parsing issues with CEF logs and other formats where values contain spaces without quotes.
* fix: minor formatting cleanup in ParseKVLax helpers
* refactor: improve code style in isInsideQuotedValue using early continue (gocritic)
* fix: work on mmetc feedback
* fix: change backslash logic to even check

---------

Co-authored-by: mmetc <92726601+mmetc@users.noreply.github.com>
Co-authored-by: marco <marco@crowdsec.net>
1 parent 3aa2c7e commit 19e2a37

File tree

3 files changed

+257
-0
lines changed

3 files changed

+257
-0
lines changed

pkg/exprhelpers/expr_lib.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -461,6 +461,13 @@ var exprFuncs = []exprCustomFunc{
461461
new(func(string, map[string]any, string) error),
462462
},
463463
},
464+
{
465+
name: "ParseKVLax",
466+
function: ParseKVLax,
467+
signature: []any{
468+
new(func(string, map[string]any, string) error),
469+
},
470+
},
464471
{
465472
name: "Hostname",
466473
function: Hostname,

pkg/exprhelpers/exprlib_test.go

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2201,3 +2201,146 @@ func TestParseKv(t *testing.T) {
22012201
})
22022202
}
22032203
}
2204+
2205+
func TestParseKvLax(t *testing.T) {
2206+
err := Init(nil)
2207+
require.NoError(t, err)
2208+
2209+
tests := []struct {
2210+
name string
2211+
value string
2212+
want map[string]string
2213+
expr string
2214+
wantBuildErr bool
2215+
wantRuntimeErr bool
2216+
}{
2217+
{
2218+
name: "ParseKVLax() test: valid string",
2219+
value: "foo=bar",
2220+
want: map[string]string{"foo": "bar"},
2221+
expr: `ParseKVLax(value, out, "a")`,
2222+
},
2223+
{
2224+
name: "ParseKVLax() test: valid string multiple",
2225+
value: "foo=bar bar=foo",
2226+
want: map[string]string{"foo": "bar", "bar": "foo"},
2227+
expr: `ParseKVLax(value, out, "a")`,
2228+
},
2229+
{
2230+
name: "ParseKVLax() test: quoted string",
2231+
value: `foo="bar=toto"`,
2232+
want: map[string]string{"foo": "bar=toto"},
2233+
expr: `ParseKVLax(value, out, "a")`,
2234+
},
2235+
{
2236+
name: "ParseKVLax() test: empty unquoted string",
2237+
value: `foo= bar=toto`,
2238+
want: map[string]string{"bar": "toto", "foo": ""},
2239+
expr: `ParseKVLax(value, out, "a")`,
2240+
},
2241+
{
2242+
name: "ParseKVLax() test: empty quoted string",
2243+
value: `foo="" bar=toto`,
2244+
want: map[string]string{"bar": "toto", "foo": ""},
2245+
expr: `ParseKVLax(value, out, "a")`,
2246+
},
2247+
{
2248+
name: "ParseKVLax() test: unquoted value with spaces",
2249+
value: `UNIFIhost=Express 7 port=443`,
2250+
want: map[string]string{"UNIFIhost": "Express 7", "port": "443"},
2251+
expr: `ParseKVLax(value, out, "a")`,
2252+
},
2253+
{
2254+
name: "ParseKVLax() test: mixed quoted and unquoted with spaces",
2255+
value: `msg="Hello World" host=My Server name=test`,
2256+
want: map[string]string{"msg": "Hello World", "host": "My Server", "name": "test"},
2257+
expr: `ParseKVLax(value, out, "a")`,
2258+
},
2259+
{
2260+
name: "ParseKVLax() test: escaped quotes in quoted value",
2261+
value: `msg="He said \"Hello\"" status=ok`,
2262+
want: map[string]string{"msg": `He said "Hello"`, "status": "ok"},
2263+
expr: `ParseKVLax(value, out, "a")`,
2264+
},
2265+
{
2266+
name: "ParseKVLax() test: escaped backslashes in quoted value",
2267+
value: `path="C:\\Program Files\\App" status=running`,
2268+
want: map[string]string{"path": `C:\Program Files\App`, "status": "running"},
2269+
expr: `ParseKVLax(value, out, "a")`,
2270+
},
2271+
{
2272+
name: "ParseKVLax() test: empty unquoted value at end",
2273+
value: `host=server port=443 debug=`,
2274+
want: map[string]string{"host": "server", "port": "443", "debug": ""},
2275+
expr: `ParseKVLax(value, out, "a")`,
2276+
},
2277+
{
2278+
name: "ParseKVLax() test: complex CEF-like log extension",
2279+
value: `src=192.168.1.100 duser=admin msg=User login successful UNIFIhost=Express 7 UNIFIport=443`,
2280+
want: map[string]string{"src": "192.168.1.100", "duser": "admin", "msg": "User login successful", "UNIFIhost": "Express 7", "UNIFIport": "443"},
2281+
expr: `ParseKVLax(value, out, "a")`,
2282+
},
2283+
{
2284+
name: "ParseKVLax() test: iptables-style values with flags",
2285+
value: `RES=0x00 SYN URGP=0 ID=25029 DF PROTO=TCP`,
2286+
want: map[string]string{"RES": "0x00 SYN", "URGP": "0", "ID": "25029 DF", "PROTO": "TCP"},
2287+
expr: `ParseKVLax(value, out, "a")`,
2288+
},
2289+
{
2290+
name: "ParseKVLax() test: keycloak-style JSON values",
2291+
value: `error=user_not_found, code_id=e44d80b4-058d-4b45-b2ee-fac3d174e10c, userId=null, type=LOGIN_ERROR`,
2292+
want: map[string]string{"error": "user_not_found,", "code_id": "e44d80b4-058d-4b45-b2ee-fac3d174e10c,", "userId": "null,", "type": "LOGIN_ERROR"},
2293+
expr: `ParseKVLax(value, out, "a")`,
2294+
},
2295+
{
2296+
name: "ParseKVLax() test: key= after escaped quotes inside quoted value",
2297+
value: `msg="say \"fake=val\" here" real=value`,
2298+
want: map[string]string{"msg": `say "fake=val" here`, "real": "value"},
2299+
expr: `ParseKVLax(value, out, "a")`,
2300+
},
2301+
{
2302+
name: "ParseKVLax() test: escaped backslash before closing quote",
2303+
value: `path="C:\\" next=val`,
2304+
want: map[string]string{"path": `C:\`, "next": "val"},
2305+
expr: `ParseKVLax(value, out, "a")`,
2306+
},
2307+
{
2308+
name: "ParseKVLax() test: invalid type for first argument",
2309+
value: "",
2310+
expr: `ParseKVLax(42, out, "a")`,
2311+
wantBuildErr: true,
2312+
},
2313+
{
2314+
name: "ParseKVLax() test: no key=value pairs",
2315+
value: "no pairs here",
2316+
expr: `ParseKVLax(value, out, "a")`,
2317+
wantRuntimeErr: true,
2318+
},
2319+
}
2320+
2321+
for _, tc := range tests {
2322+
t.Run(tc.name, func(t *testing.T) {
2323+
outMap := make(map[string]any)
2324+
env := map[string]any{
2325+
"value": tc.value,
2326+
"out": outMap,
2327+
}
2328+
vm, err := expr.Compile(tc.expr, GetExprOptions(env)...)
2329+
if tc.wantBuildErr {
2330+
require.Error(t, err)
2331+
return
2332+
}
2333+
2334+
require.NoError(t, err)
2335+
2336+
_, err = expr.Run(vm, env)
2337+
if tc.wantRuntimeErr {
2338+
require.Error(t, err)
2339+
return
2340+
}
2341+
2342+
require.NoError(t, err)
2343+
assert.Equal(t, tc.want, outMap["a"])
2344+
})
2345+
}
2346+
}

pkg/exprhelpers/helpers.go

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ func init() { //nolint:gochecknoinits
6262
}
6363

6464
var keyValuePattern = regexp.MustCompile(`(?P<key>[^=\s]+)=(?:"(?P<quoted_value>[^"\\]*(?:\\.[^"\\]*)*)"|(?P<value>[^=\s]+)|\s*)`)
65+
// keyStart finds candidate "key=" tokens for ParseKVLax. A key must start with
// an ASCII letter or underscore and may continue with letters, digits, '_', '.'
// or '-'; capture group 1 holds the key without the trailing '='. The pattern
// is not anchored, so it can also match in the middle of a longer token.
var keyStart = regexp.MustCompile(`([a-zA-Z_][a-zA-Z0-9_.-]*)=`) // More restrictive key pattern for loose parsing
6566

6667
var (
6768
geoIPCityReader *geoip2.Reader
@@ -988,6 +989,112 @@ func ParseKV(params ...any) (any, error) {
988989
return nil, nil
989990
}
990991

992+
// ParseKVLax parses key-value pairs with lax matching, supporting unquoted multi-word values
993+
// by using a scanner approach instead of regex.
994+
func ParseKVLax(params ...any) (any, error) {
995+
blob := params[0].(string)
996+
target := params[1].(map[string]any)
997+
prefix := params[2].(string)
998+
999+
if _, ok := target[prefix]; !ok {
1000+
target[prefix] = make(map[string]string)
1001+
} else if _, ok := target[prefix].(map[string]string); !ok {
1002+
log.Errorf("ParseKVLax: target is not a map[string]string")
1003+
return nil, errors.New("target is not a map[string]string")
1004+
}
1005+
1006+
km := target[prefix].(map[string]string)
1007+
1008+
// Find all key= occurrences and slice values between them.
1009+
idxs := keyStart.FindAllStringSubmatchIndex(blob, -1)
1010+
if len(idxs) == 0 {
1011+
log.Errorf("could not find any key/value pair in line")
1012+
return nil, errors.New("invalid input format")
1013+
}
1014+
1015+
// Filter out matches that are inside quoted values
1016+
validIdxs := make([][]int, 0, len(idxs))
1017+
for _, m := range idxs {
1018+
keyStartPos := m[0]
1019+
// Check if this key= is inside a quoted value by looking backwards
1020+
if !isInsideQuotedValue(blob, keyStartPos) {
1021+
validIdxs = append(validIdxs, m)
1022+
}
1023+
}
1024+
1025+
if len(validIdxs) == 0 {
1026+
log.Errorf("could not find any key/value pair in line")
1027+
return nil, errors.New("invalid input format")
1028+
}
1029+
1030+
for i, m := range validIdxs {
1031+
// m layout: [ fullStart, fullEnd, group1Start, group1End ]
1032+
key := blob[m[2]:m[3]]
1033+
valStart := m[1] // right after '='
1034+
1035+
var valEnd int
1036+
if i+1 < len(validIdxs) {
1037+
valEnd = validIdxs[i+1][0] // start of next key
1038+
} else {
1039+
valEnd = len(blob)
1040+
}
1041+
1042+
raw := strings.TrimSpace(blob[valStart:valEnd])
1043+
val := parseValueLax(raw)
1044+
km[key] = val
1045+
}
1046+
1047+
log.Tracef("unmarshaled KV (lax): %+v", target[prefix])
1048+
return nil, nil
1049+
}
1050+
1051+
// parseValueLax normalises a raw, already trimmed value for lax parsing.
//
//   - A value that does not start with '"' is returned unchanged.
//   - A value that starts with '"' and ends with an unescaped '"' has both
//     quotes removed, and the escapes \" and \\ are decoded in one
//     left-to-right pass. Any other backslash is kept as-is.
//   - A value that starts with '"' but has no closing quote (including when
//     its last quote is escaped) only loses the leading quote.
func parseValueLax(s string) string {
	if s == "" || s[0] != '"' {
		return s
	}

	last := len(s) - 1

	closed := last >= 1 && s[last] == '"'
	if closed {
		// An odd number of backslashes right before the final quote means the
		// quote itself is escaped and therefore does not close the value.
		escapes := 0
		for j := last - 1; j >= 1 && s[j] == '\\'; j-- {
			escapes++
		}

		closed = escapes%2 == 0
	}

	if !closed {
		return s[1:]
	}

	body := s[1:last]
	if strings.IndexByte(body, '\\') < 0 {
		return body
	}

	// Decode in a single pass so that the backslash produced by `\\` can never
	// pair up with the character that follows it.
	var b strings.Builder

	b.Grow(len(body))

	for i := 0; i < len(body); i++ {
		c := body[i]
		if c == '\\' && i+1 < len(body) && (body[i+1] == '\\' || body[i+1] == '"') {
			i++
			c = body[i]
		}

		b.WriteByte(c)
	}

	return b.String()
}
1073+
1074+
// isInsideQuotedValue reports whether the byte at index pos of s lies inside a
// double-quoted section. It walks s from the start up to and including pos and
// flips the state at every quote preceded by an even number of backslashes;
// a quote preceded by an odd number is treated as escaped and ignored.
func isInsideQuotedValue(s string, pos int) bool {
	// Exclusive upper bound of the scan: pos itself is included, clamped to s.
	end := len(s)
	if pos < end {
		end = pos + 1
	}

	inside := false
	run := 0 // consecutive backslashes directly before the current byte

	for i := 0; i < end; i++ {
		c := s[i]
		if c == '\\' {
			run++
			continue
		}

		if c == '"' && run%2 == 0 {
			inside = !inside
		}

		run = 0
	}

	return inside
}
1097+
9911098
func Hostname(params ...any) (any, error) {
9921099
hostname, err := os.Hostname()
9931100
if err != nil {

0 commit comments

Comments
 (0)