Skip to content

Commit f86e93c

Browse files
authored
Fix tsdb codec when doc-values spread in two blocks (#108276) (#108282)
Currently, loading ordinals multiple times (after advanceExact) for documents with values spread across multiple blocks in the TSDB codec will fail due to the absence of re-seeking for the ordinals block. Doc-values of a document can spread across multiple blocks in two cases: when it has more than 128 values or when it exceeds the remaining space in the current block.
1 parent 0fe59ca commit f86e93c

File tree

3 files changed

+140
-6
lines changed

3 files changed

+140
-6
lines changed

docs/changelog/108276.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 108276
2+
summary: Fix tsdb codec when doc-values spread in two blocks
3+
area: TSDB
4+
type: bug
5+
issues: []

server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesProducer.java

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1011,8 +1011,9 @@ public long longValue() throws IOException {
10111011
final int blockIndex = index >>> ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SHIFT;
10121012
final int blockInIndex = index & ES87TSDBDocValuesFormat.NUMERIC_BLOCK_MASK;
10131013
if (blockIndex != currentBlockIndex) {
1014-
assert blockIndex > currentBlockIndex;
1015-
if (blockIndex - 1 > currentBlockIndex) {
1014+
assert blockIndex > currentBlockIndex : blockIndex + " < " + currentBlockIndex;
1015+
// no need to seek if the loading block is the next block
1016+
if (currentBlockIndex + 1 != blockIndex) {
10161017
valuesData.seek(indexReader.get(blockIndex));
10171018
}
10181019
currentBlockIndex = blockIndex;
@@ -1071,8 +1072,9 @@ public long longValue() throws IOException {
10711072
final int blockIndex = index >>> ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SHIFT;
10721073
final int blockInIndex = index & ES87TSDBDocValuesFormat.NUMERIC_BLOCK_MASK;
10731074
if (blockIndex != currentBlockIndex) {
1074-
assert blockIndex > currentBlockIndex;
1075-
if (blockIndex - 1 > currentBlockIndex) {
1075+
assert blockIndex > currentBlockIndex : blockIndex + "<=" + currentBlockIndex;
1076+
// no need to seek if the loading block is the next block
1077+
if (currentBlockIndex + 1 != blockIndex) {
10761078
valuesData.seek(indexReader.get(blockIndex));
10771079
}
10781080
currentBlockIndex = blockIndex;
@@ -1106,8 +1108,8 @@ long advance(long index) throws IOException {
11061108
final long blockIndex = index >>> ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SHIFT;
11071109
final int blockInIndex = (int) (index & ES87TSDBDocValuesFormat.NUMERIC_BLOCK_MASK);
11081110
if (blockIndex != currentBlockIndex) {
1109-
assert blockIndex > currentBlockIndex;
1110-
if (blockIndex - 1 > currentBlockIndex) {
1111+
// no need to seek if the loading block is the next block
1112+
if (currentBlockIndex + 1 != blockIndex) {
11111113
valuesData.seek(indexReader.get(blockIndex));
11121114
}
11131115
currentBlockIndex = blockIndex;

server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,20 @@
1111
import org.apache.lucene.analysis.Analyzer;
1212
import org.apache.lucene.codecs.Codec;
1313
import org.apache.lucene.document.Document;
14+
import org.apache.lucene.document.Field;
1415
import org.apache.lucene.document.SortedDocValuesField;
16+
import org.apache.lucene.document.SortedNumericDocValuesField;
1517
import org.apache.lucene.document.SortedSetDocValuesField;
18+
import org.apache.lucene.document.StringField;
1619
import org.apache.lucene.index.DirectoryReader;
1720
import org.apache.lucene.index.IndexReader;
21+
import org.apache.lucene.index.IndexWriter;
1822
import org.apache.lucene.index.IndexWriterConfig;
23+
import org.apache.lucene.index.LeafReaderContext;
1924
import org.apache.lucene.index.SortedDocValues;
25+
import org.apache.lucene.index.SortedNumericDocValues;
2026
import org.apache.lucene.index.SortedSetDocValues;
27+
import org.apache.lucene.index.StoredFields;
2128
import org.apache.lucene.search.DocIdSetIterator;
2229
import org.apache.lucene.store.Directory;
2330
import org.apache.lucene.tests.analysis.MockAnalyzer;
@@ -27,6 +34,14 @@
2734
import org.apache.lucene.util.BytesRef;
2835

2936
import java.io.IOException;
37+
import java.util.ArrayList;
38+
import java.util.Arrays;
39+
import java.util.HashMap;
40+
import java.util.List;
41+
import java.util.Map;
42+
43+
import static org.hamcrest.Matchers.equalTo;
44+
import static org.hamcrest.Matchers.greaterThanOrEqualTo;
3045

3146
public class ES87TSDBDocValuesFormatTests extends BaseDocValuesFormatTestCase {
3247

@@ -116,4 +131,116 @@ public void testSortedSetDocValuesSingleUniqueValue() throws IOException {
116131
}
117132
}
118133

134+
public void testOneDocManyValues() throws Exception {
135+
IndexWriterConfig config = new IndexWriterConfig();
136+
try (Directory dir = newDirectory(); IndexWriter writer = new IndexWriter(dir, config)) {
137+
int numValues = 128 + random().nextInt(1024); // > 2^7 to require two blocks
138+
Document d = new Document();
139+
for (int i = 0; i < numValues; i++) {
140+
d.add(new SortedSetDocValuesField("dv", new BytesRef("v-" + i)));
141+
}
142+
writer.addDocument(d);
143+
try (DirectoryReader reader = DirectoryReader.open(writer)) {
144+
LeafReaderContext leaf = reader.leaves().get(0);
145+
SortedSetDocValues dv = leaf.reader().getSortedSetDocValues("dv");
146+
for (int i = 0; i < 3; i++) {
147+
assertTrue(dv.advanceExact(0));
148+
assertThat(dv.docValueCount(), equalTo(numValues));
149+
for (int v = 0; v < dv.docValueCount(); v++) {
150+
assertThat(dv.nextOrd(), greaterThanOrEqualTo(0L));
151+
}
152+
}
153+
}
154+
}
155+
}
156+
157+
public void testManyDocsWithManyValues() throws Exception {
158+
final int numDocs = 10 + random().nextInt(20);
159+
final Map<String, List<String>> sortedSet = new HashMap<>(); // key -> doc-values
160+
final Map<String, long[]> sortedNumbers = new HashMap<>(); // key -> numbers
161+
try (Directory directory = newDirectory()) {
162+
IndexWriterConfig conf = newIndexWriterConfig();
163+
try (RandomIndexWriter writer = new RandomIndexWriter(random(), directory, conf)) {
164+
for (int i = 0; i < numDocs; i++) {
165+
Document doc = new Document();
166+
String key = "k-" + i;
167+
doc.add(new StringField("key", new BytesRef(key), Field.Store.YES));
168+
int numValues = random().nextInt(600);
169+
List<String> binary = new ArrayList<>();
170+
for (int v = 0; v < numValues; v++) {
171+
String dv = "v-" + random().nextInt(3) + ":" + v;
172+
binary.add(dv);
173+
doc.add(new SortedSetDocValuesField("binary", new BytesRef(dv)));
174+
}
175+
sortedSet.put(key, binary.stream().sorted().toList());
176+
numValues = random().nextInt(600);
177+
long[] numbers = new long[numValues];
178+
for (int v = 0; v < numValues; v++) {
179+
numbers[v] = random().nextInt(10) * 1000 + v;
180+
doc.add(new SortedNumericDocValuesField("numbers", numbers[v]));
181+
}
182+
Arrays.sort(numbers);
183+
sortedNumbers.put(key, numbers);
184+
writer.addDocument(doc);
185+
}
186+
writer.commit();
187+
}
188+
try (IndexReader reader = maybeWrapWithMergingReader(DirectoryReader.open(directory))) {
189+
for (LeafReaderContext leaf : reader.leaves()) {
190+
StoredFields storedFields = leaf.reader().storedFields();
191+
int iters = 1 + random().nextInt(5);
192+
for (int i = 0; i < iters; i++) {
193+
// check with binary
194+
SortedSetDocValues binaryDV = leaf.reader().getSortedSetDocValues("binary");
195+
int doc = random().nextInt(leaf.reader().maxDoc());
196+
while ((doc = binaryDV.advance(doc)) != DocIdSetIterator.NO_MORE_DOCS) {
197+
String key = storedFields.document(doc).getBinaryValue("key").utf8ToString();
198+
List<String> expected = sortedSet.get(key);
199+
List<String> actual = new ArrayList<>();
200+
for (int v = 0; v < binaryDV.docValueCount(); v++) {
201+
long ord = binaryDV.nextOrd();
202+
actual.add(binaryDV.lookupOrd(ord).utf8ToString());
203+
}
204+
assertEquals(expected, actual);
205+
int repeats = random().nextInt(3);
206+
for (int r = 0; r < repeats; r++) {
207+
assertTrue(binaryDV.advanceExact(doc));
208+
actual.clear();
209+
for (int v = 0; v < binaryDV.docValueCount(); v++) {
210+
long ord = binaryDV.nextOrd();
211+
actual.add(binaryDV.lookupOrd(ord).utf8ToString());
212+
}
213+
assertEquals(expected, actual);
214+
}
215+
doc++;
216+
doc += random().nextInt(3);
217+
}
218+
// check with numbers
219+
doc = random().nextInt(leaf.reader().maxDoc());
220+
SortedNumericDocValues numbersDV = leaf.reader().getSortedNumericDocValues("numbers");
221+
while ((doc = numbersDV.advance(doc)) != DocIdSetIterator.NO_MORE_DOCS) {
222+
String key = storedFields.document(doc).getBinaryValue("key").utf8ToString();
223+
long[] expected = sortedNumbers.get(key);
224+
long[] actual = new long[expected.length];
225+
for (int v = 0; v < numbersDV.docValueCount(); v++) {
226+
actual[v] = numbersDV.nextValue();
227+
}
228+
assertArrayEquals(expected, actual);
229+
int repeats = random().nextInt(3);
230+
for (int r = 0; r < repeats; r++) {
231+
assertTrue(numbersDV.advanceExact(doc));
232+
actual = new long[expected.length];
233+
for (int v = 0; v < numbersDV.docValueCount(); v++) {
234+
actual[v] = numbersDV.nextValue();
235+
}
236+
assertArrayEquals(expected, actual);
237+
}
238+
doc++;
239+
doc += random().nextInt(3);
240+
}
241+
}
242+
}
243+
}
244+
}
245+
}
119246
}

0 commit comments

Comments
 (0)