Skip to content

Commit cda8255

Browse files
authored
ES|QL: Add initial grammar and planning for RRF (snapshot) (#123396)
1 parent f1f2df7 commit cda8255

File tree

29 files changed

+3235
-2351
lines changed

29 files changed

+3235
-2351
lines changed

docs/changelog/123396.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 123396
2+
summary: Add initial grammar and planning for RRF (snapshot)
3+
area: ES|QL
4+
type: feature
5+
issues: []
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License
4+
* 2.0; you may not use this file except in compliance with the Elastic License
5+
* 2.0.
6+
*/
7+
8+
package org.elasticsearch.compute.operator;
9+
10+
import org.apache.lucene.util.BytesRef;
11+
import org.elasticsearch.compute.data.Block;
12+
import org.elasticsearch.compute.data.BytesRefBlock;
13+
import org.elasticsearch.compute.data.DoubleVector;
14+
import org.elasticsearch.compute.data.Page;
15+
16+
import java.util.HashMap;
17+
18+
/**
19+
* Updates the score column with new scores using the RRF formula.
20+
* Receives the position of the score and fork columns.
21+
* The new score we assign to each row is equal to {@code 1 / (rank_constant + row_number)}.
22+
* We use the fork discriminator column to determine the {@code row_number} for each row.
23+
*/
24+
public class RrfScoreEvalOperator extends AbstractPageMappingOperator {
25+
26+
public record Factory(int forkPosition, int scorePosition) implements OperatorFactory {
27+
@Override
28+
public Operator get(DriverContext driverContext) {
29+
return new RrfScoreEvalOperator(forkPosition, scorePosition);
30+
}
31+
32+
@Override
33+
public String describe() {
34+
return "RrfScoreEvalOperator";
35+
}
36+
37+
}
38+
39+
private final int scorePosition;
40+
private final int forkPosition;
41+
42+
private HashMap<String, Integer> counters = new HashMap<>();
43+
44+
public RrfScoreEvalOperator(int forkPosition, int scorePosition) {
45+
this.scorePosition = scorePosition;
46+
this.forkPosition = forkPosition;
47+
}
48+
49+
@Override
50+
protected Page process(Page page) {
51+
BytesRefBlock forkBlock = (BytesRefBlock) page.getBlock(forkPosition);
52+
53+
DoubleVector.Builder scores = forkBlock.blockFactory().newDoubleVectorBuilder(forkBlock.getPositionCount());
54+
55+
for (int i = 0; i < page.getPositionCount(); i++) {
56+
String fork = forkBlock.getBytesRef(i, new BytesRef()).utf8ToString();
57+
58+
int rank = counters.getOrDefault(fork, 1);
59+
counters.put(fork, rank + 1);
60+
scores.appendDouble(1.0 / (60 + rank));
61+
}
62+
63+
Block scoreBlock = scores.build().asBlock();
64+
page = page.appendBlock(scoreBlock);
65+
66+
int[] projections = new int[page.getBlockCount() - 1];
67+
68+
for (int i = 0; i < page.getBlockCount() - 1; i++) {
69+
projections[i] = i == scorePosition ? page.getBlockCount() - 1 : i;
70+
}
71+
72+
return page.projectBlocks(projections);
73+
}
74+
75+
@Override
76+
public String toString() {
77+
return "RrfScoreEvalOperator";
78+
}
79+
}
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
//
2+
// CSV spec for RRF command
3+
//
4+
5+
simpleRrf
6+
required_capability: fork
7+
required_capability: rrf
8+
required_capability: match_operator_colon
9+
10+
FROM employees METADATA _id, _index, _score
11+
| FORK ( WHERE emp_no:10001 )
12+
( WHERE emp_no:10002 )
13+
| RRF
14+
| EVAL _score = round(_score, 4)
15+
| KEEP _score, _fork, emp_no
16+
| SORT _score, _fork, emp_no
17+
;
18+
19+
_score:double | _fork:keyword | emp_no:integer
20+
0.0164 | fork1 | 10001
21+
0.0164 | fork2 | 10002
22+
;
23+
24+
rrfWithMatchAndScore
25+
required_capability: fork
26+
required_capability: rrf
27+
required_capability: match_operator_colon
28+
29+
FROM books METADATA _id, _index, _score
30+
| FORK ( WHERE title:"Tolkien" | SORT _score, _id DESC | LIMIT 3 )
31+
( WHERE author:"Tolkien" | SORT _score, _id DESC | LIMIT 3 )
32+
| RRF
33+
| EVAL _fork = mv_sort(_fork)
34+
| EVAL _score = round(_score, 5)
35+
| KEEP _score, _fork, _id
36+
;
37+
38+
_score:double | _fork:keyword | _id:keyword
39+
0.03279 | [fork1, fork2] | 4
40+
0.01613 | fork1 | 56
41+
0.01613 | fork2 | 60
42+
0.01587 | fork2 | 1
43+
0.01587 | fork1 | 26
44+
;
45+
46+
rrfWithDisjunctionAndPostFilter
47+
required_capability: fork
48+
required_capability: rrf
49+
required_capability: match_operator_colon
50+
51+
FROM books METADATA _id, _index, _score
52+
| FORK ( WHERE title:"Tolkien" OR author:"Tolkien" | SORT _score, _id DESC | LIMIT 3 )
53+
( WHERE author:"Tolkien" | SORT _score, _id DESC | LIMIT 3 )
54+
| RRF
55+
| EVAL _fork = mv_sort(_fork)
56+
| EVAL _score = round(_score, 5)
57+
| KEEP _score, _fork, _id
58+
| WHERE _score > 0.014
59+
;
60+
61+
_score:double | _fork:keyword | _id:keyword
62+
0.03252 | [fork1, fork2] | 60
63+
0.032 | [fork1, fork2] | 1
64+
0.01639 | fork2 | 4
65+
0.01587 | fork1 | 40
66+
;
67+
68+
rrfWithStats
69+
required_capability: fork
70+
required_capability: rrf
71+
required_capability: match_operator_colon
72+
73+
FROM books METADATA _id, _index, _score
74+
| FORK ( WHERE title:"Tolkien" | SORT _score, _id DESC | LIMIT 3 )
75+
( WHERE author:"Tolkien" | SORT _score, _id DESC | LIMIT 3 )
76+
( WHERE author:"Ursula K. Le Guin" AND title:"short stories" | SORT _score, _id DESC | LIMIT 3)
77+
| RRF
78+
| STATS count_fork=COUNT(*) BY _fork
79+
;
80+
81+
count_fork:long | _fork:keyword
82+
3 | fork1
83+
3 | fork2
84+
1 | fork3
85+
;
86+
87+
rrfWithMultipleForkBranches
88+
required_capability: fork
89+
required_capability: rrf
90+
required_capability: match_operator_colon
91+
92+
FROM books METADATA _id, _index, _score
93+
| FORK (WHERE author:"Keith Faulkner" AND qstr("author:Rory or author:Beverlie") | SORT _score, _id DESC | LIMIT 3)
94+
(WHERE author:"Ursula K. Le Guin" | SORT _score, _id DESC | LIMIT 3)
95+
(WHERE title:"Tolkien" AND author:"Tolkien" AND year > 2000 AND mv_count(author) == 1 | SORT _score, _id DESC | LIMIT 3)
96+
(WHERE match(author, "Keith Faulkner") AND match(author, "Rory Tyger") | SORT _score, _id DESC | LIMIT 3)
97+
| RRF
98+
| EVAL _fork = mv_sort(_fork)
99+
| EVAL _score = round(_score, 4)
100+
| EVAL title = trim(substring(title, 1, 20))
101+
| KEEP _score, author, title, _fork
102+
;
103+
104+
_score:double | author:keyword | title:keyword | _fork:keyword
105+
0.0328 | [Keith Faulkner, Rory Tyger] | Pop! Went Another Ba | [fork1, fork4]
106+
0.0164 | J.R.R. Tolkien | Letters of J R R Tol | fork3
107+
0.0164 | Ursula K. Le Guin | The wind's twelve qu | fork2
108+
0.0161 | [Beverlie Manson, Keith Faulkner] | Rainbow's End: A Mag | fork1
109+
0.0161 | Ursula K. Le Guin | The Word For World i | fork2
110+
0.0159 | Ursula K. Le Guin | The Dispossessed | fork2
111+
;

x-pack/plugin/esql/src/internalClusterTest/java/org/elasticsearch/xpack/esql/action/ForkIT.java

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,32 @@ public void testScoringKeepAndSort() {
350350
}
351351
}
352352

353+
public void testRrf() {
354+
assumeTrue("requires RRF capability", EsqlCapabilities.Cap.RRF.isEnabled());
355+
356+
var query = """
357+
FROM test METADATA _score, _id, _index
358+
| WHERE id > 2
359+
| FORK
360+
( WHERE content:"fox" | SORT _score, _id DESC )
361+
( WHERE content:"dog" | SORT _score, _id DESC )
362+
| RRF
363+
| EVAL _score = round(_score, 4)
364+
| KEEP id, content, _score, _fork
365+
""";
366+
try (var resp = run(query)) {
367+
assertColumnNames(resp.columns(), List.of("id", "content", "_score", "_fork"));
368+
assertColumnTypes(resp.columns(), List.of("integer", "keyword", "double", "keyword"));
369+
assertThat(getValuesList(resp.values()).size(), equalTo(3));
370+
Iterable<Iterable<Object>> expectedValues = List.of(
371+
List.of(6, "The quick brown fox jumps over the lazy dog", 0.0325, List.of("fork1", "fork2")),
372+
List.of(4, "The dog is brown but this document is very very long", 0.0164, "fork2"),
373+
List.of(3, "This dog is really brown", 0.0159, "fork2")
374+
);
375+
assertValues(resp.values(), expectedValues);
376+
}
377+
}
378+
353379
public void testThreeSubQueries() {
354380
var query = """
355381
FROM test

x-pack/plugin/esql/src/main/antlr/EsqlBaseLexer.g4

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ import ChangePoint,
6565
Metrics,
6666
MvExpand,
6767
Project,
68+
Rrf,
6869
Rename,
6970
Show,
7071
UnknownCommand;

0 commit comments

Comments
 (0)