Skip to content

Commit ae92c83

Browse files
committed
Fix most extractSnippets CSV tests, add some more test cases
1 parent 5f20480 commit ae92c83

File tree

2 files changed

+66 −24 lines changed

x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/HighlighterExpressionEvaluator.java

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -140,7 +140,8 @@ protected void appendMatch(BytesRefBlock.Builder builder, Scorable scorer, int d
140140
for (Text highlightText : highlight.fragments()) {
141141
byte[] highlightBytes = highlightText.bytes().bytes();
142142
if (highlightBytes.length > fragmentLength) {
143-
// TODO - This isn't a great solution, but in order to resolve character encoding issues in the
143+
// TODO - Figure out a better way to construct BytesRef
144+
// This isn't a great solution, but in order to resolve character encoding issues in the
144145
// returned BytesRef we need to ensure that the fragment size we return is equal to what was requested.
145146
// Since the highlighter's default sentence boundary scanner can return longer fragments, we're truncating for now.
146147
byte[] truncatedBytes = truncateUtf8(highlightBytes, fragmentLength);
@@ -163,7 +164,8 @@ private static byte[] truncateUtf8(byte[] bytes, int maxLength) throws Character
163164
.onUnmappableCharacter(CodingErrorAction.IGNORE);
164165

165166
CharBuffer chars = dec.decode(ByteBuffer.wrap(bytes, 0, maxLength));
166-
ByteBuffer out = StandardCharsets.UTF_8.encode(chars);
167+
String trimmed = chars.toString().trim();
168+
ByteBuffer out = StandardCharsets.UTF_8.encode(trimmed);
167169

168170
byte[] result = new byte[out.remaining()];
169171
out.get(result);

x-pack/plugin/esql/qa/testFixtures/src/main/resources/extract-snippets-function.csv-spec

Lines changed: 62 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@ FROM books
1616

1717
// tag::extract-snippets-with-field-result[]
1818
book_no:keyword | author:text | title:text | snippets:keyword
19-
1211 | Fyodor Dostoevsky | The brothers Karamazov | achievement of perhaps the
19+
1211 | Fyodor Dostoevsky | The brothers Karamazov | achievement of perhaps th
2020
// end::extract-snippets-with-field-result[]
2121
;
2222

@@ -25,18 +25,18 @@ required_capability: extract_snippets_function
2525

2626
FROM books
2727
| WHERE MATCH(description, "hobbit")
28-
| EVAL snippets = extract_snippets(description, "hobbit", 1, 25)
28+
| EVAL snippets = extract_snippets(description, "hobbit", 1, 50)
2929
| KEEP book_no, author, title, snippets
3030
| SORT book_no
3131
| LIMIT 5
3232
;
3333

3434
book_no:keyword | author:text | title:text | snippets:keyword
35-
1463 | J. R. R. Tolkien | Realms of Tolkien: Images of Middle-earth | appropriate passage from The Hobbit
36-
2301 | John Ronald Reuel Tolkien | Smith of Wootton Major & Farmer Giles of Ham | beloved author of THE HOBBIT
37-
2675 | J.R.R. Tolkien | The Lord of the Rings - Boxed Set | Bilbo Baggins is a hobbit
38-
2714 | J. R. R. Tolkien | Return of the King Being the Third Part of The Lord of the Rings | the story begun in The Hobbit
39-
2936 | John Ronald Reuel Tolkien | Fellowship of the Ring 2ND Edition | into the hands of the hobbit
35+
1463 | J. R. R. Tolkien | Realms of Tolkien: Images of Middle-earth | is accompanied by appropriate passage from The Hob
36+
2301 | John Ronald Reuel Tolkien | Smith of Wootton Major & Farmer Giles of Ham | Tolkien, beloved author of THE HOBBIT.
37+
2675 | J.R.R. Tolkien | The Lord of the Rings - Boxed Set | This beautiful gift edition of The Hobbit, J.R.R.
38+
2714 | J. R. R. Tolkien | Return of the King Being the Third Part of The Lord of the Rings | Concluding the story begun in The Hobbit, this is
39+
2936 | John Ronald Reuel Tolkien | Fellowship of the Ring 2ND Edition | them all - which has fallen into the hands of the
4040
;
4141

4242
extractMultipleSnippetsWithMatch
@@ -51,11 +51,11 @@ FROM books
5151
;
5252

5353
book_no:keyword | author:text | title:text | snippets:keyword
54-
1463 | J. R. R. Tolkien | Realms of Tolkien: Images of Middle-earth | appropriate passage from The Hobbit
55-
2301 | John Ronald Reuel Tolkien | Smith of Wootton Major & Farmer Giles of Ham | beloved author of THE HOBBIT
56-
2675 | J.R.R. Tolkien | The Lord of the Rings - Boxed Set | [Bilbo Baggins is a hobbit, beautiful gift edition of The Hobbit, Tolkien's own children, The Hobbit]
57-
2714 | J. R. R. Tolkien | Return of the King Being the Third Part of The Lord of the Rings | [the story begun in The Hobbit, , THE HOBBIT: AN UNEXPECTED, film adaptation of The Hobbit]
58-
2936 | John Ronald Reuel Tolkien | Fellowship of the Ring 2ND Edition | into the hands of the hobbit
54+
1463 | J. R. R. Tolkien | Realms of Tolkien: Images of Middle-earth | appropriate passage from
55+
2301 | John Ronald Reuel Tolkien | Smith of Wootton Major & Farmer Giles of Ham | beloved author of THE HOB
56+
2675 | J.R.R. Tolkien | The Lord of the Rings - Boxed Set | [Bilbo Baggins is a hobbit, beautiful gift edition of, Tolkien's own children, T]
57+
2714 | J. R. R. Tolkien | Return of the King Being the Third Part of The Lord of the Rings | [the story begun in The Ho, , THE HOBBIT: AN UNEXPECT, film adaptation of The Ho]
58+
2936 | John Ronald Reuel Tolkien | Fellowship of the Ring 2ND Edition | into the hands of the hob
5959
;
6060

6161

@@ -67,33 +67,73 @@ FROM books
6767
| EVAL snippets = extract_snippets(description, "hobbit", 3, 25)
6868
| MV_EXPAND snippets
6969
| KEEP book_no, author, title, snippets
70-
| SORT book_no
71-
| LIMIT 5
70+
| SORT snippets
71+
| LIMIT 9
7272
;
7373

74-
book_no:keyword | author:text | title:text | snippets:keyword
75-
1463 | J. R. R. Tolkien | Realms of Tolkien: Images of Middle-earth | appropriate passage from The Hobbit
76-
2301 | John Ronald Reuel Tolkien | Smith of Wootton Major & Farmer Giles of Ham | beloved author of THE HOBBIT
77-
2675 | J.R.R. Tolkien | The Lord of the Rings - Boxed Set | beautiful gift edition of The Hobbit
78-
2675 | J.R.R. Tolkien | The Lord of the Rings - Boxed Set | Tolkien's own children, The Hobbit
79-
2675 | J.R.R. Tolkien | The Lord of the Rings - Boxed Set | Bilbo Baggins is a hobbit
74+
book_no:keyword | author:text | title:text | snippets:keyword
75+
2714 | J. R. R. Tolkien | Return of the King Being the Third Part of The Lord of the Rings | , THE HOBBIT: AN UNEXPECT
76+
2675 | J.R.R. Tolkien | The Lord of the Rings - Boxed Set | Bilbo Baggins is a hobbit
77+
6760 | J. R. R. Tolkien | Roverandom | By the author of The Hobb
78+
7350 | [Christopher Tolkien, John Ronald Reuel Tolkien] | Return of the Shadow | The character of the hobb
79+
4289 | J R R Tolkien | Poems from the Hobbit | Tolkien's Hobbit poems in
80+
4289 | J R R Tolkien | Poems from the Hobbit | Tolkien's acclaimed The H
81+
2675 | J.R.R. Tolkien | The Lord of the Rings - Boxed Set | Tolkien's own children, T
82+
1463 | J. R. R. Tolkien | Realms of Tolkien: Images of Middle-earth | appropriate passage from
83+
2675 | J.R.R. Tolkien | The Lord of the Rings - Boxed Set | beautiful gift edition of
8084
;
8185

8286
extractMultipleSnippetsWithSomeNoMatches
8387
required_capability: extract_snippets_function
8488

8589
FROM books
8690
| WHERE MATCH(author, "Faulkner")
87-
| EVAL snippets = extract_snippets(description, "slavery", 1, 10)
91+
| EVAL snippets = extract_snippets(description, "slavery", 1, 25)
8892
| KEEP book_no, author, title, snippets
8993
| SORT book_no
9094
| LIMIT 5
9195
;
9296

9397
book_no:keyword | author:text | title:text | snippets:keyword
94-
2378 | [Carol Faulkner, Holly Byers Ochoa, Lucretia Mott] | Selected Letters of Lucretia Coffin Mott (Women in American History) | abolition of slavery
98+
2378 | [Carol Faulkner, Holly Byers Ochoa, Lucretia Mott] | Selected Letters of Lucretia Coffin Mott (Women in American History) | , and the abolition of sl
9599
2713 | William Faulkner | Collected Stories of William Faulkner | null
96100
2847 | Colleen Faulkner | To Love A Dark Stranger (Lovegram Historical Romance) | null
97101
2883 | William Faulkner | A Summer of Faulkner: As I Lay Dying/The Sound and the Fury/Light in August (Oprah's Book Club) | null
98102
3293 | Danny Faulkner | Universe by Design | null
99103
;
104+
105+
extractSnippetsWithDefaultNumSnippetsAndLength
106+
107+
FROM books
108+
| WHERE MATCH(description, "hobbit")
109+
| EVAL snippets = extract_snippets(description, "hobbit")
110+
| KEEP book_no, author, title, snippets
111+
| SORT book_no
112+
| LIMIT 5
113+
;
114+
115+
book_no:keyword | author:text | title:text | snippets:keyword
116+
1463 | J. R. R. Tolkien | Realms of Tolkien: Images of Middle-earth | from The H
117+
2301 | John Ronald Reuel Tolkien | Smith of Wootton Major & Farmer Giles of Ham | of THE HOB
118+
2675 | J.R.R. Tolkien | The Lord of the Rings - Boxed Set | of The Hob
119+
2714 | J. R. R. Tolkien | Return of the King Being the Third Part of The Lord of the Rings | in The Hob
120+
2936 | John Ronald Reuel Tolkien | Fellowship of the Ring 2ND Edition | of the hob
121+
;
122+
123+
extractSnippetsWithDefaultLength
124+
125+
FROM books
126+
| WHERE MATCH(description, "hobbit")
127+
| EVAL snippets = extract_snippets(description, "hobbit", 3)
128+
| KEEP book_no, author, title, snippets
129+
| SORT book_no
130+
| LIMIT 5
131+
;
132+
133+
book_no:keyword | author:text | title:text | snippets:keyword
134+
1463 | J. R. R. Tolkien | Realms of Tolkien: Images of Middle-earth | from The H
135+
2301 | John Ronald Reuel Tolkien | Smith of Wootton Major & Farmer Giles of Ham | of THE HOB
136+
2675 | J.R.R. Tolkien | The Lord of the Rings - Boxed Set | [of The Hob, Baggins is, children,]
137+
2714 | J. R. R. Tolkien | Return of the King Being the Third Part of The Lord of the Rings | [in The Hob, , THE HOBB, of The Hob]
138+
2936 | John Ronald Reuel Tolkien | Fellowship of the Ring 2ND Edition | of the hob
139+
;

0 commit comments

Comments
 (0)