Skip to content

Commit ddabd49

Browse files
committed
Add version to choose between NO_COMPRESS and FSST
1 parent bba14ea commit ddabd49

File tree

8 files changed

+326
-26
lines changed

8 files changed

+326
-26
lines changed

server/src/main/java/org/elasticsearch/index/codec/PerFieldFormatSupplier.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import org.elasticsearch.index.IndexVersions;
2121
import org.elasticsearch.index.codec.bloomfilter.ES87BloomFilterPostingsFormat;
2222
import org.elasticsearch.index.codec.postings.ES812PostingsFormat;
23+
import org.elasticsearch.index.codec.tsdb.BinaryDVCompressionMode;
2324
import org.elasticsearch.index.codec.tsdb.es819.ES819TSDBDocValuesFormat;
2425
import org.elasticsearch.index.mapper.CompletionFieldMapper;
2526
import org.elasticsearch.index.mapper.IdFieldMapper;
@@ -36,6 +37,7 @@ public class PerFieldFormatSupplier {
3637
private static final DocValuesFormat docValuesFormat = new Lucene90DocValuesFormat();
3738
private static final KnnVectorsFormat knnVectorsFormat = new Lucene99HnswVectorsFormat();
3839
private static final ES819TSDBDocValuesFormat tsdbDocValuesFormat = new ES819TSDBDocValuesFormat();
40+
private static final DocValuesFormat stringDocValuesFormat = new ES819TSDBDocValuesFormat(BinaryDVCompressionMode.COMPRESSED_WITH_FSST);
3941
private static final ES812PostingsFormat es812PostingsFormat = new ES812PostingsFormat();
4042
private static final PostingsFormat completionPostingsFormat = PostingsFormat.forName("Completion101");
4143

@@ -105,6 +107,13 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
105107
}
106108

107109
public DocValuesFormat getDocValuesFormatForField(String field) {
110+
if (mapperService != null) {
111+
Mapper mapper = mapperService.mappingLookup().getMapper(field);
112+
if (mapper != null && mapper.typeName().equals("wildcard")) {
113+
return stringDocValuesFormat;
114+
}
115+
}
116+
108117
if (useTSDBDocValuesFormat(field)) {
109118
return tsdbDocValuesFormat;
110119
}
server/src/main/java/org/elasticsearch/index/codec/tsdb/BinaryDVCompressionMode.java

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
2+
/*
3+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
4+
* or more contributor license agreements. Licensed under the "Elastic License
5+
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
6+
* Public License v 1"; you may not use this file except in compliance with, at
7+
* your election, the "Elastic License 2.0", the "GNU Affero General Public
8+
* License v3.0 only", or the "Server Side Public License, v 1".
9+
*/
10+
11+
package org.elasticsearch.index.codec.tsdb;
12+
13+
/**
 * Selects how binary doc values are stored by the TSDB doc values format.
 * <p>
 * Each mode carries a one-byte {@link #code}; {@link #fromMode(byte)} maps such a byte back to
 * its mode. Codes must stay stable once assigned, and new modes must use a code that is not
 * already taken.
 */
public enum BinaryDVCompressionMode {

    /** Binary values are written without compression. */
    NO_COMPRESS((byte) 0),

    /** Binary values are written compressed with FSST. */
    COMPRESSED_WITH_FSST((byte) 1);

    /** Cached copy of {@link #values()} so lookups do not clone the array on every call. */
    private static final BinaryDVCompressionMode[] MODES = values();

    /** The one-byte identifier of this mode. */
    public final byte code;

    BinaryDVCompressionMode(byte code) {
        this.code = code;
    }

    /**
     * Resolves a mode from its one-byte code.
     * <p>
     * The lookup is derived from the constants' own {@link #code} values, so adding a constant
     * never requires touching this method.
     *
     * @param mode the code to resolve
     * @return the mode whose {@link #code} equals {@code mode}
     * @throws IllegalStateException if no mode uses the given code
     */
    public static BinaryDVCompressionMode fromMode(byte mode) {
        for (BinaryDVCompressionMode candidate : MODES) {
            if (candidate.code == mode) {
                return candidate;
            }
        }
        throw new IllegalStateException("unknown compression mode [" + mode + "]");
    }
}

server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java

Lines changed: 144 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@
2929
import org.apache.lucene.store.ByteArrayDataOutput;
3030
import org.apache.lucene.store.ByteBuffersDataOutput;
3131
import org.apache.lucene.store.ByteBuffersIndexOutput;
32-
import org.apache.lucene.store.ChecksumIndexInput;
3332
import org.apache.lucene.store.Directory;
3433
import org.apache.lucene.store.IOContext;
3534
import org.apache.lucene.store.IndexOutput;
@@ -45,6 +44,7 @@
4544
import org.elasticsearch.common.compress.fsst.FSST;
4645
import org.elasticsearch.common.compress.fsst.ReservoirSampler;
4746
import org.elasticsearch.core.IOUtils;
47+
import org.elasticsearch.index.codec.tsdb.BinaryDVCompressionMode;
4848
import org.elasticsearch.index.codec.tsdb.TSDBDocValuesEncoder;
4949

5050
import java.io.Closeable;
@@ -68,6 +68,7 @@ final class ES819TSDBDocValuesConsumer extends XDocValuesConsumer {
6868
private byte[] termsDictBuffer;
6969
private final int skipIndexIntervalSize;
7070
final boolean enableOptimizedMerge;
71+
private final BinaryDVCompressionMode binaryDVCompressionMode;
7172
private final SegmentWriteState state;
7273

7374
ES819TSDBDocValuesConsumer(
@@ -77,9 +78,11 @@ final class ES819TSDBDocValuesConsumer extends XDocValuesConsumer {
7778
String dataCodec,
7879
String dataExtension,
7980
String metaCodec,
80-
String metaExtension
81+
String metaExtension,
82+
BinaryDVCompressionMode binaryDVCompressionMode
8183
) throws IOException {
8284
this.termsDictBuffer = new byte[1 << 14];
85+
this.binaryDVCompressionMode = binaryDVCompressionMode;
8386
this.state = state;
8487
this.dir = state.directory;
8588
this.context = state.context;
@@ -279,7 +282,146 @@ public void mergeBinaryField(FieldInfo mergeFieldInfo, MergeState mergeState) th
279282
public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
280283
meta.writeInt(field.number);
281284
meta.writeByte(ES819TSDBDocValuesFormat.BINARY);
285+
meta.writeByte(binaryDVCompressionMode.code);
286+
switch (binaryDVCompressionMode) {
287+
case NO_COMPRESS -> doAddUncompressedBinary(field, valuesProducer);
288+
case COMPRESSED_WITH_FSST -> doAddCompressedBinaryFSST(field, valuesProducer);
289+
}
290+
}
291+
292+
public void doAddUncompressedBinary(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
293+
meta.writeInt(field.number);
294+
meta.writeByte(ES819TSDBDocValuesFormat.BINARY);
295+
296+
if (valuesProducer instanceof TsdbDocValuesProducer tsdbValuesProducer && tsdbValuesProducer.mergeStats.supported()) {
297+
final int numDocsWithField = tsdbValuesProducer.mergeStats.sumNumDocsWithField();
298+
final int minLength = tsdbValuesProducer.mergeStats.minLength();
299+
final int maxLength = tsdbValuesProducer.mergeStats.maxLength();
300+
301+
assert numDocsWithField <= maxDoc;
302+
303+
BinaryDocValues values = valuesProducer.getBinary(field);
304+
long start = data.getFilePointer();
305+
meta.writeLong(start); // dataOffset
306+
307+
OffsetsAccumulator offsetsAccumulator = null;
308+
DISIAccumulator disiAccumulator = null;
309+
try {
310+
if (numDocsWithField > 0 && numDocsWithField < maxDoc) {
311+
disiAccumulator = new DISIAccumulator(dir, context, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
312+
}
313+
314+
assert maxLength >= minLength;
315+
if (maxLength > minLength) {
316+
offsetsAccumulator = new OffsetsAccumulator(dir, context, data, numDocsWithField);
317+
}
318+
319+
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
320+
BytesRef v = values.binaryValue();
321+
data.writeBytes(v.bytes, v.offset, v.length);
322+
if (disiAccumulator != null) {
323+
disiAccumulator.addDocId(doc);
324+
}
325+
if (offsetsAccumulator != null) {
326+
offsetsAccumulator.addDoc(v.length);
327+
}
328+
}
329+
meta.writeLong(data.getFilePointer() - start); // dataLength
330+
331+
if (numDocsWithField == 0) {
332+
meta.writeLong(-2); // docsWithFieldOffset
333+
meta.writeLong(0L); // docsWithFieldLength
334+
meta.writeShort((short) -1); // jumpTableEntryCount
335+
meta.writeByte((byte) -1); // denseRankPower
336+
} else if (numDocsWithField == maxDoc) {
337+
meta.writeLong(-1); // docsWithFieldOffset
338+
meta.writeLong(0L); // docsWithFieldLength
339+
meta.writeShort((short) -1); // jumpTableEntryCount
340+
meta.writeByte((byte) -1); // denseRankPower
341+
} else {
342+
long offset = data.getFilePointer();
343+
meta.writeLong(offset); // docsWithFieldOffset
344+
final short jumpTableEntryCount = disiAccumulator.build(data);
345+
meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength
346+
meta.writeShort(jumpTableEntryCount);
347+
meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER);
348+
}
349+
350+
meta.writeInt(numDocsWithField);
351+
meta.writeInt(minLength);
352+
meta.writeInt(maxLength);
353+
if (offsetsAccumulator != null) {
354+
offsetsAccumulator.build(meta, data);
355+
}
356+
} finally {
357+
IOUtils.close(disiAccumulator, offsetsAccumulator);
358+
}
359+
} else {
360+
BinaryDocValues values = valuesProducer.getBinary(field);
361+
long start = data.getFilePointer();
362+
meta.writeLong(start); // dataOffset
363+
int numDocsWithField = 0;
364+
int minLength = Integer.MAX_VALUE;
365+
int maxLength = 0;
366+
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
367+
numDocsWithField++;
368+
BytesRef v = values.binaryValue();
369+
int length = v.length;
370+
data.writeBytes(v.bytes, v.offset, v.length);
371+
minLength = Math.min(length, minLength);
372+
maxLength = Math.max(length, maxLength);
373+
}
374+
assert numDocsWithField <= maxDoc;
375+
meta.writeLong(data.getFilePointer() - start); // dataLength
376+
377+
if (numDocsWithField == 0) {
378+
meta.writeLong(-2); // docsWithFieldOffset
379+
meta.writeLong(0L); // docsWithFieldLength
380+
meta.writeShort((short) -1); // jumpTableEntryCount
381+
meta.writeByte((byte) -1); // denseRankPower
382+
} else if (numDocsWithField == maxDoc) {
383+
meta.writeLong(-1); // docsWithFieldOffset
384+
meta.writeLong(0L); // docsWithFieldLength
385+
meta.writeShort((short) -1); // jumpTableEntryCount
386+
meta.writeByte((byte) -1); // denseRankPower
387+
} else {
388+
long offset = data.getFilePointer();
389+
meta.writeLong(offset); // docsWithFieldOffset
390+
values = valuesProducer.getBinary(field);
391+
final short jumpTableEntryCount = IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
392+
meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength
393+
meta.writeShort(jumpTableEntryCount);
394+
meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER);
395+
}
396+
397+
meta.writeInt(numDocsWithField);
398+
meta.writeInt(minLength);
399+
meta.writeInt(maxLength);
400+
if (maxLength > minLength) {
401+
start = data.getFilePointer();
402+
meta.writeLong(start);
403+
meta.writeVInt(ES819TSDBDocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT);
404+
405+
final DirectMonotonicWriter writer = DirectMonotonicWriter.getInstance(
406+
meta,
407+
data,
408+
numDocsWithField + 1,
409+
ES819TSDBDocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT
410+
);
411+
long addr = 0;
412+
writer.add(addr);
413+
values = valuesProducer.getBinary(field);
414+
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
415+
addr += values.binaryValue().length;
416+
writer.add(addr);
417+
}
418+
writer.finish();
419+
meta.writeLong(data.getFilePointer() - start);
420+
}
421+
}
422+
}
282423

424+
public void doAddCompressedBinaryFSST(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
283425
if (valuesProducer instanceof TsdbDocValuesProducer tsdbValuesProducer && tsdbValuesProducer.mergeStats.supported()) {
284426
final int numDocsWithField = tsdbValuesProducer.mergeStats.sumNumDocsWithField();
285427
final int minLength = tsdbValuesProducer.mergeStats.minLength();

server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesFormat.java

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import org.apache.lucene.index.SegmentReadState;
1515
import org.apache.lucene.index.SegmentWriteState;
1616
import org.elasticsearch.core.SuppressForbidden;
17+
import org.elasticsearch.index.codec.tsdb.BinaryDVCompressionMode;
1718

1819
import java.io.IOException;
1920

@@ -47,7 +48,8 @@ public class ES819TSDBDocValuesFormat extends org.apache.lucene.codecs.DocValues
4748
static final byte SORTED_NUMERIC = 4;
4849

4950
static final int VERSION_START = 0;
50-
static final int VERSION_CURRENT = VERSION_START;
51+
static final int VERSION_BINARY_DV_COMPRESSION = 1;
52+
static final int VERSION_CURRENT = VERSION_BINARY_DV_COMPRESSION;
5153

5254
static final int TERMS_DICT_BLOCK_LZ4_SHIFT = 6;
5355
static final int TERMS_DICT_BLOCK_LZ4_SIZE = 1 << TERMS_DICT_BLOCK_LZ4_SHIFT;
@@ -106,20 +108,26 @@ private static boolean getOptimizedMergeEnabledDefault() {
106108

107109
final int skipIndexIntervalSize;
108110
private final boolean enableOptimizedMerge;
111+
private final BinaryDVCompressionMode binaryDVCompressionMode;
109112

110113
/** Default constructor: default skip index interval size, default optimized-merge setting, and {@link BinaryDVCompressionMode#NO_COMPRESS}. */
111114
public ES819TSDBDocValuesFormat() {
112-
this(DEFAULT_SKIP_INDEX_INTERVAL_SIZE, OPTIMIZED_MERGE_ENABLE_DEFAULT);
115+
this(DEFAULT_SKIP_INDEX_INTERVAL_SIZE, OPTIMIZED_MERGE_ENABLE_DEFAULT, BinaryDVCompressionMode.NO_COMPRESS);
116+
}
117+
118+
/** Doc values fields format with the default skip index interval size and optimized-merge setting, and the given binary doc values compression mode. */
public ES819TSDBDocValuesFormat(BinaryDVCompressionMode binaryDVCompressionMode) {
119+
this(DEFAULT_SKIP_INDEX_INTERVAL_SIZE, OPTIMIZED_MERGE_ENABLE_DEFAULT, binaryDVCompressionMode);
113120
}
114121

115122
/**
 * Doc values fields format with specified skipIndexIntervalSize, optimized-merge setting and
 * binary doc values compression mode.
 *
 * @param skipIndexIntervalSize   interval size of the skip index; must be greater than 1
 * @param enableOptimizedMerge    whether the optimized merge is enabled
 * @param binaryDVCompressionMode how binary doc values are written; must not be {@code null}
 * @throws IllegalArgumentException if {@code skipIndexIntervalSize} is less than 2 or
 *                                  {@code binaryDVCompressionMode} is {@code null}
 */
public ES819TSDBDocValuesFormat(
    int skipIndexIntervalSize,
    boolean enableOptimizedMerge,
    BinaryDVCompressionMode binaryDVCompressionMode
) {
    super(CODEC_NAME);
    if (skipIndexIntervalSize < 2) {
        throw new IllegalArgumentException("skipIndexIntervalSize must be > 1, got [" + skipIndexIntervalSize + "]");
    }
    // Reject null here instead of failing later, when the consumer dereferences the mode while writing a binary field.
    if (binaryDVCompressionMode == null) {
        throw new IllegalArgumentException("binaryDVCompressionMode must not be null");
    }
    this.skipIndexIntervalSize = skipIndexIntervalSize;
    this.enableOptimizedMerge = enableOptimizedMerge;
    this.binaryDVCompressionMode = binaryDVCompressionMode;
}
124132

125133
@Override
@@ -131,7 +139,8 @@ public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOExcept
131139
DATA_CODEC,
132140
DATA_EXTENSION,
133141
META_CODEC,
134-
META_EXTENSION
142+
META_EXTENSION,
143+
binaryDVCompressionMode
135144
);
136145
}
137146

0 commit comments

Comments
 (0)