Skip to content

Commit 6e628bc

Browse files
authored
feat: Support field:(<term>...) Lucene searches (#1315)
# Summary

This PR updates HyperDX's Lucene support to include parenthesized field searches of the form `<field>:(<term>...)`. Prior to these changes, HyperDX would ignore the `<field>` entirely and search as if the query were just `<term>...`.

With these changes, the search is performed just like a `<term>...` search, except:

1. The `<field>` is used for the search, instead of the implicit field expression (e.g. `Body` for `otel_logs`).
2. The search is performed without `hasToken()`, as we assume that fields do not have bloom filters set up (matching the current behavior for how we search fields).

This support has the added benefit of unlocking multi-token substring searches (Ref HDX-1931):

- Previously, you could not search a field for a substring with multiple tokens. For example, `error.message:*Method not allowed*` is interpreted as 3 separate terms, and only `*Method` would be associated with `error.message`. `error.message:"Method not allowed"` and `error.message:"*Method not allowed*"` look for exact matches instead of substrings.
- Now, this can be accomplished with `error.message:("Method not allowed")`. This matches the current behavior of a search like `"Method not allowed"`, which would search the source's default implicit column (e.g. `Body`) for the substring "Method not allowed".

## Testing

To test these changes, this PR adds a few dozen query parser unit test cases.
1 parent f612bf3 commit 6e628bc

File tree

3 files changed

+438
-131
lines changed

3 files changed

+438
-131
lines changed

.changeset/hungry-ways-rush.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"@hyperdx/common-utils": patch
3+
---
4+
5+
feat: Support field:(<term>...) Lucene searches
Lines changed: 255 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,30 @@
11
import { ClickhouseClient } from '@/clickhouse/node';
22
import { getMetadata } from '@/core/metadata';
3-
import { CustomSchemaSQLSerializerV2 } from '@/queryParser';
3+
import {
4+
CustomSchemaSQLSerializerV2,
5+
genEnglishExplanation,
6+
SearchQueryBuilder,
7+
} from '@/queryParser';
48

59
describe('CustomSchemaSQLSerializerV2 - json', () => {
6-
function getTestTable(field) {
7-
return { name: field, type: 'JSON' };
8-
}
910
const metadata = getMetadata(
1011
new ClickhouseClient({ host: 'http://localhost:8123' }),
1112
);
12-
// @ts-ignore
13-
metadata.getColumn = ({ column }) => {
14-
return new Promise((resolve, reject) => {
15-
if (column.indexOf('.') >= 0) return resolve(undefined);
16-
const testTable = getTestTable(column);
17-
// @ts-ignore
18-
return resolve(testTable);
19-
});
20-
};
13+
metadata.getColumn = jest.fn().mockImplementation(async ({ column }) => {
14+
if (column === 'ResourceAttributesJSON') {
15+
return { name: 'ResourceAttributesJSON', type: 'JSON' };
16+
} else if (column === 'LogAttributes') {
17+
return { name: 'LogAttributes', type: 'Map' };
18+
} else if (column === 'ServiceName') {
19+
return { name: 'ServiceName', type: 'String' };
20+
} else if (column === 'SeverityNumber') {
21+
return { name: 'SeverityNumber', type: 'UInt8' };
22+
} else if (column === 'foo') {
23+
return { name: 'foo', type: 'String' };
24+
} else {
25+
return undefined;
26+
}
27+
});
2128
const databaseName = 'testName';
2229
const tableName = 'testTable';
2330
const connectionId = 'testId';
@@ -26,89 +33,294 @@ describe('CustomSchemaSQLSerializerV2 - json', () => {
2633
databaseName,
2734
tableName,
2835
connectionId,
36+
implicitColumnExpression: 'Body',
2937
});
3038

3139
it('getColumnForField', async () => {
32-
const field1 = 'serviceName.test';
33-
const res1 = await serializer.getColumnForField(field1);
40+
const field1 = 'ResourceAttributesJSON.test';
41+
const res1 = await serializer.getColumnForField(field1, {});
3442
expect(res1).toEqual({
3543
column: '',
3644
columnJSON: {
3745
number:
38-
"dynamicType(`serviceName`.`test`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `serviceName`.`test`",
39-
string: 'toString(`serviceName`.`test`)',
46+
"dynamicType(`ResourceAttributesJSON`.`test`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `ResourceAttributesJSON`.`test`",
47+
string: 'toString(`ResourceAttributesJSON`.`test`)',
4048
},
4149
found: true,
4250
propertyType: 'json',
4351
});
44-
const field2 = 'logBody.test.nest';
45-
const res2 = await serializer.getColumnForField(field2);
52+
const field2 = 'ResourceAttributesJSON.test.nest';
53+
const res2 = await serializer.getColumnForField(field2, {});
4654
expect(res2).toEqual({
4755
column: '',
4856
columnJSON: {
4957
number:
50-
"dynamicType(`logBody`.`test`.`nest`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `logBody`.`test`.`nest`",
51-
string: 'toString(`logBody`.`test`.`nest`)',
58+
"dynamicType(`ResourceAttributesJSON`.`test`.`nest`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `ResourceAttributesJSON`.`test`.`nest`",
59+
string: 'toString(`ResourceAttributesJSON`.`test`.`nest`)',
5260
},
5361
found: true,
5462
propertyType: 'json',
5563
});
5664
});
5765

5866
it('compare - eq, isNotNull, gte, lte, lt, gt', async () => {
59-
const eqField = 'serviceName.eq.test';
67+
const eqField = 'ResourceAttributesJSON.eq.test';
6068
const eqTerm = 'testTerm';
61-
const eq1 = await serializer.eq(eqField, eqTerm, false);
62-
expect(eq1).toBe("(toString(`serviceName`.`eq`.`test`) = 'testTerm')");
63-
const eq2 = await serializer.eq(eqField, eqTerm, true);
64-
expect(eq2).toBe("(toString(`serviceName`.`eq`.`test`) != 'testTerm')");
69+
const eq1 = await serializer.eq(eqField, eqTerm, false, {});
70+
expect(eq1).toBe(
71+
"(toString(`ResourceAttributesJSON`.`eq`.`test`) = 'testTerm')",
72+
);
73+
const eq2 = await serializer.eq(eqField, eqTerm, true, {});
74+
expect(eq2).toBe(
75+
"(toString(`ResourceAttributesJSON`.`eq`.`test`) != 'testTerm')",
76+
);
6577
});
6678

6779
it('compare - isNotNull', async () => {
68-
const isNotNullField = 'serviceName.isNotNull.test';
69-
const isNotNull1 = await serializer.isNotNull(isNotNullField, false);
80+
const isNotNullField = 'ResourceAttributesJSON.isNotNull.test';
81+
const isNotNull1 = await serializer.isNotNull(isNotNullField, false, {});
7082
expect(isNotNull1).toBe(
71-
'notEmpty(toString(`serviceName`.`isNotNull`.`test`)) = 1',
83+
'notEmpty(toString(`ResourceAttributesJSON`.`isNotNull`.`test`)) = 1',
7284
);
73-
const isNotNull2 = await serializer.isNotNull(isNotNullField, true);
85+
const isNotNull2 = await serializer.isNotNull(isNotNullField, true, {});
7486
expect(isNotNull2).toBe(
75-
'notEmpty(toString(`serviceName`.`isNotNull`.`test`)) != 1',
87+
'notEmpty(toString(`ResourceAttributesJSON`.`isNotNull`.`test`)) != 1',
7688
);
7789
});
7890

7991
it('compare - gte', async () => {
80-
const gteField = 'serviceName.gte.test';
92+
const gteField = 'ResourceAttributesJSON.gte.test';
8193
const gteTerm = '30';
82-
const gte = await serializer.gte(gteField, gteTerm);
94+
const gte = await serializer.gte(gteField, gteTerm, {});
8395
expect(gte).toBe(
84-
"(dynamicType(`serviceName`.`gte`.`test`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `serviceName`.`gte`.`test` >= '30')",
96+
"(dynamicType(`ResourceAttributesJSON`.`gte`.`test`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `ResourceAttributesJSON`.`gte`.`test` >= '30')",
8597
);
8698
});
8799

88100
it('compare - lte', async () => {
89-
const lteField = 'serviceName.lte.test';
101+
const lteField = 'ResourceAttributesJSON.lte.test';
90102
const lteTerm = '40';
91-
const lte = await serializer.lte(lteField, lteTerm);
103+
const lte = await serializer.lte(lteField, lteTerm, {});
92104
expect(lte).toBe(
93-
"(dynamicType(`serviceName`.`lte`.`test`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `serviceName`.`lte`.`test` <= '40')",
105+
"(dynamicType(`ResourceAttributesJSON`.`lte`.`test`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `ResourceAttributesJSON`.`lte`.`test` <= '40')",
94106
);
95107
});
96108

97109
it('compare - gt', async () => {
98-
const gtField = 'serviceName.gt.test';
110+
const gtField = 'ResourceAttributesJSON.gt.test';
99111
const gtTerm = '70';
100-
const gt = await serializer.gt(gtField, gtTerm);
112+
const gt = await serializer.gt(gtField, gtTerm, {});
101113
expect(gt).toBe(
102-
"(dynamicType(`serviceName`.`gt`.`test`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `serviceName`.`gt`.`test` > '70')",
114+
"(dynamicType(`ResourceAttributesJSON`.`gt`.`test`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `ResourceAttributesJSON`.`gt`.`test` > '70')",
103115
);
104116
});
105117

106118
it('compare - lt', async () => {
107-
const ltField = 'serviceName.lt.test';
119+
const ltField = 'ResourceAttributesJSON.lt.test';
108120
const ltTerm = '2';
109-
const lt = await serializer.lt(ltField, ltTerm);
121+
const lt = await serializer.lt(ltField, ltTerm, {});
110122
expect(lt).toBe(
111-
"(dynamicType(`serviceName`.`lt`.`test`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `serviceName`.`lt`.`test` < '2')",
123+
"(dynamicType(`ResourceAttributesJSON`.`lt`.`test`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `ResourceAttributesJSON`.`lt`.`test` < '2')",
112124
);
113125
});
126+
127+
const testCases = [
128+
{
129+
lucene: '"foo bar baz"',
130+
sql: "((hasToken(lower(Body), lower('foo')) AND hasToken(lower(Body), lower('bar')) AND hasToken(lower(Body), lower('baz')) AND (lower(Body) LIKE lower('%foo bar baz%'))))",
131+
english: 'event has whole word "foo bar baz"',
132+
},
133+
{
134+
lucene: 'foo bar baz',
135+
sql: "((hasToken(lower(Body), lower('foo'))) AND (hasToken(lower(Body), lower('bar'))) AND (hasToken(lower(Body), lower('baz'))))",
136+
english:
137+
'event has whole word foo AND event has whole word bar AND event has whole word baz',
138+
},
139+
{
140+
lucene: 'ServiceName:foo bar baz',
141+
sql: "((ServiceName ILIKE '%foo%') AND (hasToken(lower(Body), lower('bar'))) AND (hasToken(lower(Body), lower('baz'))))",
142+
english:
143+
"'ServiceName' contains foo AND event has whole word bar AND event has whole word baz",
144+
},
145+
{
146+
lucene: 'ServiceName:"foo bar baz"',
147+
sql: "((ServiceName = 'foo bar baz'))",
148+
english: "'ServiceName' is foo bar baz",
149+
},
150+
{
151+
lucene: 'ServiceName:("foo bar baz")',
152+
sql: "(((ServiceName ILIKE '%foo bar baz%')))",
153+
english: '(ServiceName contains "foo bar baz")',
154+
},
155+
{
156+
lucene: 'ServiceName:(abc def)',
157+
sql: "(((ServiceName ILIKE '%abc%') AND (ServiceName ILIKE '%def%')))",
158+
english: '(ServiceName contains abc AND ServiceName contains def)',
159+
},
160+
{
161+
lucene: '(abc def)',
162+
sql: "(((hasToken(lower(Body), lower('abc'))) AND (hasToken(lower(Body), lower('def')))))",
163+
english: '(event has whole word abc AND event has whole word def)',
164+
},
165+
{
166+
lucene: '("abc def")',
167+
sql: "(((hasToken(lower(Body), lower('abc')) AND hasToken(lower(Body), lower('def')) AND (lower(Body) LIKE lower('%abc def%')))))",
168+
english: '(event has whole word "abc def")',
169+
},
170+
{
171+
lucene: 'foo:bar',
172+
sql: "((foo ILIKE '%bar%'))",
173+
english: "'foo' contains bar",
174+
},
175+
{
176+
lucene: '(foo:bar)',
177+
sql: "(((foo ILIKE '%bar%')))",
178+
english: "('foo' contains bar)",
179+
},
180+
{
181+
lucene: 'bar',
182+
sql: "((hasToken(lower(Body), lower('bar'))))",
183+
english: 'event has whole word bar',
184+
},
185+
{
186+
lucene: '(bar)',
187+
sql: "(((hasToken(lower(Body), lower('bar')))))",
188+
english: '(event has whole word bar)',
189+
},
190+
{
191+
lucene: 'foo:(bar)',
192+
sql: "(((foo ILIKE '%bar%')))",
193+
english: '(foo contains bar)',
194+
},
195+
{
196+
lucene: 'foo:(bar) baz',
197+
sql: "(((foo ILIKE '%bar%')) AND (hasToken(lower(Body), lower('baz'))))",
198+
english: '(foo contains bar) AND event has whole word baz',
199+
},
200+
{
201+
lucene: 'LogAttributes.error.message:("Failed to fetch")',
202+
sql: "(((`LogAttributes`['error.message'] ILIKE '%Failed to fetch%')))",
203+
english: '(LogAttributes.error.message contains "Failed to fetch")',
204+
},
205+
{
206+
lucene: 'ResourceAttributesJSON.error.message:("Failed to fetch")',
207+
sql: "(((toString(`ResourceAttributesJSON`.`error`.`message`) ILIKE '%Failed to fetch%')))",
208+
english:
209+
'(ResourceAttributesJSON.error.message contains "Failed to fetch")',
210+
},
211+
{
212+
lucene: 'SeverityNumber:>10',
213+
sql: "((SeverityNumber > '10'))",
214+
english: "'SeverityNumber' is greater than 10",
215+
},
216+
{
217+
lucene: 'ResourceAttributesJSON.error.severity:>10',
218+
sql: "((dynamicType(`ResourceAttributesJSON`.`error`.`severity`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `ResourceAttributesJSON`.`error`.`severity` > '10'))",
219+
english: "'ResourceAttributesJSON.error.severity' is greater than 10",
220+
},
221+
{
222+
lucene: 'foo:(bar baz)',
223+
sql: "(((foo ILIKE '%bar%') AND (foo ILIKE '%baz%')))",
224+
english: '(foo contains bar AND foo contains baz)',
225+
},
226+
{
227+
lucene: '-foo:bar',
228+
sql: "((foo NOT ILIKE '%bar%'))",
229+
english: "'foo' does not contain bar",
230+
},
231+
{
232+
lucene: 'NOT foo:(bar baz)',
233+
sql: "(NOT ((foo ILIKE '%bar%') AND (foo ILIKE '%baz%')))",
234+
english: 'NOT (foo contains bar AND foo contains baz)',
235+
},
236+
{
237+
lucene: '-foo:(bar baz)',
238+
sql: "(NOT ((foo ILIKE '%bar%') AND (foo ILIKE '%baz%')))",
239+
english: 'NOT (foo contains bar AND foo contains baz)',
240+
},
241+
{
242+
lucene: '-foo:(bar)',
243+
sql: "(NOT ((foo ILIKE '%bar%')))",
244+
english: 'NOT (foo contains bar)',
245+
},
246+
{
247+
lucene: '-foo:(-bar)',
248+
sql: "(NOT ((foo NOT ILIKE '%bar%')))",
249+
english: 'NOT (foo does not contain bar)',
250+
},
251+
{
252+
lucene: '*bar',
253+
sql: "((lower(Body) LIKE lower('%bar')))",
254+
english: 'event ends with bar',
255+
},
256+
{
257+
lucene: 'foo:*bar',
258+
sql: "((foo ILIKE '%bar%'))",
259+
english: "'foo' contains bar",
260+
},
261+
{
262+
lucene: 'foo:*bar*',
263+
sql: "((foo ILIKE '%bar%'))",
264+
english: "'foo' contains bar",
265+
},
266+
{
267+
lucene: 'foo:(*bar)',
268+
sql: "(((lower(foo) LIKE lower('%bar'))))",
269+
english: '(foo ends with bar)',
270+
},
271+
{
272+
lucene: 'foo:(bar*)',
273+
sql: "(((lower(foo) LIKE lower('bar%'))))",
274+
english: '(foo starts with bar)',
275+
},
276+
{
277+
lucene: 'foo:(*bar*)',
278+
sql: "(((lower(foo) LIKE lower('%bar%'))))",
279+
english: '(foo contains bar)',
280+
},
281+
{
282+
lucene: 'foo:[1 TO 5]',
283+
sql: '((foo BETWEEN 1 AND 5))',
284+
english: 'foo is between 1 and 5',
285+
},
286+
{
287+
lucene: 'foo:(bar:(baz) qux)',
288+
sql: "((((bar ILIKE '%baz%')) AND (foo ILIKE '%qux%')))",
289+
english: '((bar contains baz) AND foo contains qux)',
290+
},
291+
];
292+
293+
it.each(testCases)(
294+
'converts "$lucene" to SQL "$sql"',
295+
async ({ lucene, sql }) => {
296+
const builder = new SearchQueryBuilder(lucene, serializer);
297+
const actualSql = await builder.build();
298+
expect(actualSql).toBe(sql);
299+
},
300+
);
301+
302+
it.each(testCases)(
303+
'converts "$lucene" to english "$english"',
304+
async ({ lucene, english }) => {
305+
const actualEnglish = await genEnglishExplanation(lucene);
306+
expect(actualEnglish).toBe(english);
307+
},
308+
);
309+
310+
it('correctly searches multi-column implicit field', async () => {
311+
const serializer = new CustomSchemaSQLSerializerV2({
312+
metadata,
313+
databaseName,
314+
tableName,
315+
connectionId,
316+
implicitColumnExpression: 'Body, OtherColumn',
317+
});
318+
319+
const lucene = 'foo bar';
320+
const builder = new SearchQueryBuilder(lucene, serializer);
321+
const actualSql = await builder.build();
322+
const expectedSql =
323+
"((hasToken(lower(concatWithSeparator(';',Body,OtherColumn)), lower('foo'))) AND (hasToken(lower(concatWithSeparator(';',Body,OtherColumn)), lower('bar'))))";
324+
expect(actualSql).toBe(expectedSql);
325+
});
114326
});

0 commit comments

Comments
 (0)