Skip to content

Commit 4a85617

Browse files
authored
feat: Add hasAllTokens for text index support (#1637)
Closes HDX-3245 # Summary This PR updates the Lucene-to-SQL compilation process to generate conditions using `hasAllTokens` when the target column has a text index defined. `hasAllTokens` has a couple of limitations which are solved for: 1. The `needle` argument must be no more than 64 tokens, or `hasAllTokens` will error. To support search terms with more than 64 tokens, terms are first broken up into batches of 50 tokens, and each batch is passed to a separate `hasAllTokens` call. When multiple `hasAllTokens` calls are used, we also use substring matching `lower(Body) LIKE '%term with many tokens...%'`. 2. `hasAllTokens` may only be used when `enable_full_text_index = 1`. The existence of a text index does not guarantee that `enable_full_text_index = 1` for the current session, since the text index could have been created with a query that explicitly specified `SETTINGS enable_full_text_index = 1` on that query alone. We cannot set this option in every query HyperDX makes, because the setting was not available prior to v25.12. To solve for this, we check the value of `enable_full_text_index` in `system.settings`, and only use `hasAllTokens` if the setting exists and is enabled. ## Testing Setup ### Enable Full Text Index First, make sure you're running at least ClickHouse 25.12. Then, update the ClickHouse `users.xml`'s default profile with the following (or otherwise update your user's profile): ```xml <clickhouse> <profiles> <default> ... <enable_full_text_index>1</enable_full_text_index> </default> </profiles> ... </clickhouse> ``` ### Add a Full Text Index ```sql ALTER TABLE otel_logs ADD INDEX text_idx(Body) TYPE text(tokenizer=splitByNonAlpha, preprocessor=lower(Body)) SETTINGS enable_full_text_index=1; ALTER TABLE otel_logs MATERIALIZE INDEX text_idx; ``` ## Limitations 1. We currently only support the `splitByNonAlpha` tokenizer. If the text index is created with a different tokenizer, `hasAllTokens` will not be used. 
If needed, this limitation can be removed in the future by implementing `tokenizeTerm`, `termContainsSeparators`, and token batching logic specific to the other tokenizers. 2. This requires the latest (Beta) version of the full text index and related setting, available in ClickHouse v25.12.
1 parent 9f51920 commit 4a85617

File tree

8 files changed

+799
-8
lines changed

8 files changed

+799
-8
lines changed

.changeset/tough-swans-doubt.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
2+
"@hyperdx/common-utils": patch
3+
"@hyperdx/api": patch
4+
"@hyperdx/app": patch
5+
---
6+
7+
feat: Add hasAllTokens for text index support

packages/common-utils/src/__tests__/metadata.int.test.ts

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -522,4 +522,35 @@ describe('Metadata Integration Tests', () => {
522522
);
523523
});
524524
});
525+
526+
describe('getSetting', () => {
527+
let metadata: Metadata;
528+
beforeEach(async () => {
529+
metadata = new Metadata(hdxClient, new MetadataCache());
530+
});
531+
532+
it('should get setting that exists and is enabled', async () => {
533+
const settingValue = await metadata.getSetting({
534+
settingName: 'format_csv_allow_single_quotes',
535+
connectionId: 'test_connection',
536+
});
537+
expect(settingValue).toBe('0');
538+
});
539+
540+
it('should get setting that exists and is disabled', async () => {
541+
const settingValue = await metadata.getSetting({
542+
settingName: 'format_csv_allow_double_quotes',
543+
connectionId: 'test_connection',
544+
});
545+
expect(settingValue).toBe('1');
546+
});
547+
548+
it('should return undefined for setting that does not exist', async () => {
549+
const settingValue = await metadata.getSetting({
550+
settingName: 'enable_quantum_tunnelling',
551+
connectionId: 'test_connection',
552+
});
553+
expect(settingValue).toBeUndefined();
554+
});
555+
});
525556
});

packages/common-utils/src/__tests__/queryParser.test.ts

Lines changed: 337 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -767,6 +767,343 @@ describe('CustomSchemaSQLSerializerV2 - bloom_filter tokens() indices', () => {
767767
});
768768
});
769769

770+
describe('CustomSchemaSQLSerializerV2 - text indices', () => {
771+
const metadata = getMetadata(
772+
new ClickhouseClient({ host: 'http://localhost:8123' }),
773+
);
774+
775+
const databaseName = 'default';
776+
const tableName = 'otel_logs';
777+
const connectionId = 'test';
778+
779+
beforeEach(() => {
780+
// Mock getColumn to return Body as String column
781+
metadata.getColumn = jest.fn().mockImplementation(async ({ column }) => {
782+
if (column === 'Body') {
783+
return { name: 'Body', type: 'String' };
784+
} else if (column === 'ServiceName') {
785+
return { name: 'ServiceName', type: 'String' };
786+
}
787+
return undefined;
788+
});
789+
790+
metadata.getSetting = jest
791+
.fn()
792+
.mockImplementation(async ({ settingName }) => {
793+
if (settingName === 'enable_full_text_index') {
794+
return '1';
795+
}
796+
return undefined;
797+
});
798+
});
799+
800+
it('should use hasAllTokens when text index exists', async () => {
801+
metadata.getSkipIndices = jest.fn().mockResolvedValue([
802+
{
803+
name: 'idx_body_text',
804+
type: 'text',
805+
typeFull: 'text(tokenizer=splitByNonAlpha)',
806+
expression: 'Body',
807+
granularity: '8',
808+
},
809+
]);
810+
811+
const serializer = new CustomSchemaSQLSerializerV2({
812+
metadata,
813+
databaseName,
814+
tableName,
815+
connectionId,
816+
implicitColumnExpression: 'Body',
817+
});
818+
819+
const builder = new SearchQueryBuilder('foo', serializer);
820+
const sql = await builder.build();
821+
822+
expect(sql).toBe("((hasAllTokens(Body, 'foo')))");
823+
});
824+
825+
it('should use hasAllTokens for multi-token terms with single call', async () => {
826+
metadata.getSkipIndices = jest.fn().mockResolvedValue([
827+
{
828+
name: 'idx_body_text',
829+
type: 'text',
830+
typeFull: 'text(tokenizer=splitByNonAlpha, preprocessor=lower(Body))',
831+
expression: 'Body',
832+
granularity: '8',
833+
},
834+
]);
835+
836+
const serializer = new CustomSchemaSQLSerializerV2({
837+
metadata,
838+
databaseName,
839+
tableName,
840+
connectionId,
841+
implicitColumnExpression: 'Body',
842+
});
843+
844+
const builder = new SearchQueryBuilder('"foo bar"', serializer);
845+
const sql = await builder.build();
846+
847+
expect(sql).toContain("hasAllTokens(Body, 'foo bar')");
848+
expect(sql).toContain("(lower(Body) LIKE lower('%foo bar%'))");
849+
});
850+
851+
it('should fallback to hasToken when no text indexes are found', async () => {
852+
// Mock getSkipIndices to return empty
853+
metadata.getSkipIndices = jest.fn().mockResolvedValue([]);
854+
855+
const serializer = new CustomSchemaSQLSerializerV2({
856+
metadata,
857+
databaseName,
858+
tableName,
859+
connectionId,
860+
implicitColumnExpression: 'Body',
861+
});
862+
863+
const builder = new SearchQueryBuilder('foo', serializer);
864+
const sql = await builder.build();
865+
866+
// Should use hasToken (existing behavior)
867+
expect(sql).toBe("((hasToken(lower(Body), lower('foo'))))");
868+
});
869+
870+
it('should handle text index on a different column', async () => {
871+
metadata.getSkipIndices = jest.fn().mockResolvedValue([
872+
{
873+
name: 'idx_body_text',
874+
type: 'text',
875+
typeFull: 'text(tokenizer=splitByNonAlpha)',
876+
expression: 'OtherBody',
877+
granularity: '8',
878+
},
879+
]);
880+
881+
const serializer = new CustomSchemaSQLSerializerV2({
882+
metadata,
883+
databaseName,
884+
tableName,
885+
connectionId,
886+
implicitColumnExpression: 'Body',
887+
});
888+
889+
const builder = new SearchQueryBuilder('foo', serializer);
890+
const sql = await builder.build();
891+
892+
// Should fallback to hasToken (index doesn't use tokens())
893+
expect(sql).toBe("((hasToken(lower(Body), lower('foo'))))");
894+
});
895+
896+
it('should handle negated searches with hasAllTokens', async () => {
897+
metadata.getSkipIndices = jest.fn().mockResolvedValue([
898+
{
899+
name: 'idx_body_text',
900+
type: 'text',
901+
typeFull: 'text(tokenizer=splitByNonAlpha, preprocessor=lower(Body))',
902+
expression: 'Body',
903+
granularity: '8',
904+
},
905+
]);
906+
907+
const serializer = new CustomSchemaSQLSerializerV2({
908+
metadata,
909+
databaseName,
910+
tableName,
911+
connectionId,
912+
implicitColumnExpression: 'Body',
913+
});
914+
915+
const builder = new SearchQueryBuilder('-foo', serializer);
916+
const sql = await builder.build();
917+
918+
// Should use NOT hasAllTokens
919+
expect(sql).toBe("((NOT hasAllTokens(Body, 'foo')))");
920+
});
921+
922+
it('should not use text index for explicit field searches', async () => {
923+
metadata.getSkipIndices = jest.fn().mockResolvedValue([
924+
{
925+
name: 'idx_body_text',
926+
type: 'text',
927+
typeFull: 'text(tokenizer=splitByNonAlpha)',
928+
expression: 'Body',
929+
granularity: '8',
930+
},
931+
]);
932+
933+
const serializer = new CustomSchemaSQLSerializerV2({
934+
metadata,
935+
databaseName,
936+
tableName,
937+
connectionId,
938+
implicitColumnExpression: 'Body',
939+
});
940+
941+
// Query: 'ServiceName:foo'
942+
const builder = new SearchQueryBuilder('ServiceName:foo', serializer);
943+
const sql = await builder.build();
944+
945+
// Should use ILIKE, not hasAll or hasToken
946+
expect(sql).toContain('ILIKE');
947+
expect(sql).not.toContain('hasAll');
948+
expect(sql).not.toContain('hasToken');
949+
});
950+
951+
it('should batch tokens into groups to avoid hitting the hasAllTokens limit', async () => {
952+
metadata.getSkipIndices = jest.fn().mockResolvedValue([
953+
{
954+
name: 'idx_body_text',
955+
type: 'text',
956+
typeFull: 'text(tokenizer=splitByNonAlpha)',
957+
expression: 'Body',
958+
granularity: '8',
959+
},
960+
]);
961+
962+
const serializer = new CustomSchemaSQLSerializerV2({
963+
metadata,
964+
databaseName,
965+
tableName,
966+
connectionId,
967+
implicitColumnExpression: 'Body',
968+
});
969+
970+
const builder = new SearchQueryBuilder(
971+
'"1 2 3 4 5 6 7 8 9 10; 11 12 13 14 15 16 17 18 19 20; 21 22 23 24 25 26 27 28 29 30; 31 32 33 34 35 36 37 38 39 40; 41 42 43 44 45 46 47 48 49 50; 51 52 53 54 55 56 57 58 59 60;"',
972+
serializer,
973+
);
974+
const sql = await builder.build();
975+
976+
// Should generate separate hasAllTokens for each term (not single statement)
977+
expect(sql).toContain(
978+
"hasAllTokens(Body, '1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50') AND hasAllTokens(Body, '51 52 53 54 55 56 57 58 59 60') AND (lower(Body) LIKE lower('%1 2 3 4 5 6 7 8 9 10; 11 12 13 14 15 16 17 18 19 20; 21 22 23 24 25 26 27 28 29 30; 31 32 33 34 35 36 37 38 39 40; 41 42 43 44 45 46 47 48 49 50; 51 52 53 54 55 56 57 58 59 60;%'))",
979+
);
980+
});
981+
982+
it('should use hasAllTokens for multiple separate terms', async () => {
983+
metadata.getSkipIndices = jest.fn().mockResolvedValue([
984+
{
985+
name: 'idx_body_text',
986+
type: 'text',
987+
typeFull: 'text(tokenizer=splitByNonAlpha)',
988+
expression: 'Body',
989+
granularity: '8',
990+
},
991+
]);
992+
993+
const serializer = new CustomSchemaSQLSerializerV2({
994+
metadata,
995+
databaseName,
996+
tableName,
997+
connectionId,
998+
implicitColumnExpression: 'Body',
999+
});
1000+
1001+
const builder = new SearchQueryBuilder('foo NOT bar baz', serializer);
1002+
const sql = await builder.build();
1003+
1004+
// Should generate separate hasAllTokens for each term (not single statement)
1005+
expect(sql).toContain("hasAllTokens(Body, 'foo')");
1006+
expect(sql).toContain("NOT (hasAllTokens(Body, 'bar'))");
1007+
expect(sql).toContain("hasAllTokens(Body, 'baz')");
1008+
});
1009+
1010+
it('should not use text index when enable_full_text_index is disabled', async () => {
1011+
metadata.getSkipIndices = jest.fn().mockResolvedValue([
1012+
{
1013+
name: 'idx_body_text',
1014+
type: 'text',
1015+
typeFull: 'text(tokenizer=splitByNonAlpha)',
1016+
expression: 'Body',
1017+
granularity: '8',
1018+
},
1019+
]);
1020+
1021+
// Mock getSetting to disable full text index
1022+
metadata.getSetting = jest
1023+
.fn()
1024+
.mockImplementation(async ({ settingName }) => {
1025+
if (settingName === 'enable_full_text_index') {
1026+
return '0';
1027+
}
1028+
return undefined;
1029+
});
1030+
1031+
const serializer = new CustomSchemaSQLSerializerV2({
1032+
metadata,
1033+
databaseName,
1034+
tableName,
1035+
connectionId,
1036+
implicitColumnExpression: 'Body',
1037+
});
1038+
1039+
const builder = new SearchQueryBuilder('foo', serializer);
1040+
const sql = await builder.build();
1041+
1042+
// Should fallback to hasToken (full text index disabled)
1043+
expect(sql).toBe("((hasToken(lower(Body), lower('foo'))))");
1044+
});
1045+
1046+
it('should not use text index when enable_full_text_index is unavailable (ClickHouse version is old)', async () => {
1047+
metadata.getSkipIndices = jest.fn().mockResolvedValue([
1048+
{
1049+
name: 'idx_body_text',
1050+
type: 'text',
1051+
typeFull: 'text(tokenizer=splitByNonAlpha)',
1052+
expression: 'Body',
1053+
granularity: '8',
1054+
},
1055+
]);
1056+
1057+
// Mock getSetting to disable full text index
1058+
metadata.getSetting = jest.fn().mockResolvedValue(undefined);
1059+
1060+
const serializer = new CustomSchemaSQLSerializerV2({
1061+
metadata,
1062+
databaseName,
1063+
tableName,
1064+
connectionId,
1065+
implicitColumnExpression: 'Body',
1066+
});
1067+
1068+
const builder = new SearchQueryBuilder('foo', serializer);
1069+
const sql = await builder.build();
1070+
1071+
// Should fallback to hasToken (full text index disabled)
1072+
expect(sql).toBe("((hasToken(lower(Body), lower('foo'))))");
1073+
});
1074+
1075+
it('should not use text index when getSetting throws an error', async () => {
1076+
metadata.getSkipIndices = jest.fn().mockResolvedValue([
1077+
{
1078+
name: 'idx_body_text',
1079+
type: 'text',
1080+
typeFull: 'text(tokenizer=splitByNonAlpha)',
1081+
expression: 'Body',
1082+
granularity: '8',
1083+
},
1084+
]);
1085+
1086+
// Mock getSetting to disable full text index
1087+
metadata.getSetting = jest
1088+
.fn()
1089+
.mockRejectedValue(new Error('Failed to get setting'));
1090+
1091+
const serializer = new CustomSchemaSQLSerializerV2({
1092+
metadata,
1093+
databaseName,
1094+
tableName,
1095+
connectionId,
1096+
implicitColumnExpression: 'Body',
1097+
});
1098+
1099+
const builder = new SearchQueryBuilder('foo', serializer);
1100+
const sql = await builder.build();
1101+
1102+
// Should fallback to hasToken (full text index disabled)
1103+
expect(sql).toBe("((hasToken(lower(Body), lower('foo'))))");
1104+
});
1105+
});
1106+
7701107
describe('CustomSchemaSQLSerializerV2 - indexCoversColumn', () => {
7711108
const metadata = getMetadata(
7721109
new ClickhouseClient({ host: 'http://localhost:8123' }),

packages/common-utils/src/__tests__/renderChartConfig.test.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ describe('renderChartConfig', () => {
3838
.fn()
3939
.mockResolvedValue({ primary_key: 'timestamp' }),
4040
getSkipIndices: jest.fn().mockResolvedValue([]),
41+
getSetting: jest.fn().mockResolvedValue(undefined),
4142
} as unknown as jest.Mocked<Metadata>;
4243
});
4344

0 commit comments

Comments
 (0)