Skip to content

Commit 1365156

Browse files
authored
LUCENE-9996: Reduce RAM usage of DWPT for a single document. (#184)
With this change, doc-value terms dictionaries use a shared `ByteBlockPool` across all fields, and points, binary doc values and doc-value ordinals use slightly smaller page sizes.
1 parent 065026b commit 1365156

File tree

12 files changed

+121
-36
lines changed

12 files changed

+121
-36
lines changed

lucene/CHANGES.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -377,7 +377,9 @@ Improvements
377377

378378
Optimizations
379379
---------------------
380-
(No changes)
380+
* LUCENE-9996: Improved memory efficiency of IndexWriter's RAM buffer, in
381+
particular in the case of many fields and many indexing threads.
382+
(Adrien Grand)
381383

382384
Bug Fixes
383385
---------------------

lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,8 @@ class BinaryDocValuesWriter extends DocValuesWriter<BinaryDocValues> {
3838
/** Maximum length for a binary field. */
3939
private static final int MAX_LENGTH = ArrayUtil.MAX_ARRAY_LENGTH;
4040

41-
// 32 KB block sizes for PagedBytes storage:
42-
private static final int BLOCK_BITS = 15;
41+
// 4 kB block sizes for PagedBytes storage:
42+
private static final int BLOCK_BITS = 12;
4343

4444
private final PagedBytes bytes;
4545
private final DataOutput bytesOut;

lucene/core/src/java/org/apache/lucene/index/IndexingChain.java

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,8 @@ final class IndexingChain implements Accountable {
6464

6565
// Writes postings and term vectors:
6666
final TermsHash termsHash;
67+
// Shared pool for doc-value terms
68+
final ByteBlockPool docValuesBytePool;
6769
// Writes stored fields
6870
final StoredFieldsConsumer storedFieldsConsumer;
6971
final TermVectorsConsumer termVectorsWriter;
@@ -127,6 +129,7 @@ final class IndexingChain implements Accountable {
127129
termsHash =
128130
new FreqProxTermsWriter(
129131
intBlockAllocator, byteBlockAllocator, bytesUsed, termVectorsWriter);
132+
docValuesBytePool = new ByteBlockPool(byteBlockAllocator);
130133
}
131134

132135
private void onAbortingException(Throwable th) {
@@ -696,19 +699,19 @@ private void initializeFieldInfo(PerField pf) throws IOException {
696699
pf.docValuesWriter = new BinaryDocValuesWriter(fi, bytesUsed);
697700
break;
698701
case SORTED:
699-
pf.docValuesWriter = new SortedDocValuesWriter(fi, bytesUsed);
702+
pf.docValuesWriter = new SortedDocValuesWriter(fi, bytesUsed, docValuesBytePool);
700703
break;
701704
case SORTED_NUMERIC:
702705
pf.docValuesWriter = new SortedNumericDocValuesWriter(fi, bytesUsed);
703706
break;
704707
case SORTED_SET:
705-
pf.docValuesWriter = new SortedSetDocValuesWriter(fi, bytesUsed);
708+
pf.docValuesWriter = new SortedSetDocValuesWriter(fi, bytesUsed, docValuesBytePool);
706709
break;
707710
default:
708711
throw new AssertionError("unrecognized DocValues.Type: " + dvType);
709712
}
710713
if (fi.getPointDimensionCount() != 0) {
711-
pf.pointValuesWriter = new PointValuesWriter(byteBlockAllocator, bytesUsed, fi);
714+
pf.pointValuesWriter = new PointValuesWriter(bytesUsed, fi);
712715
}
713716
if (fi.getVectorDimension() != 0) {
714717
pf.vectorValuesWriter = new VectorValuesWriter(fi, bytesUsed);

lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,33 +20,36 @@
2020
import org.apache.lucene.codecs.MutablePointValues;
2121
import org.apache.lucene.codecs.PointsReader;
2222
import org.apache.lucene.codecs.PointsWriter;
23+
import org.apache.lucene.store.DataOutput;
2324
import org.apache.lucene.util.ArrayUtil;
24-
import org.apache.lucene.util.ByteBlockPool;
2525
import org.apache.lucene.util.BytesRef;
2626
import org.apache.lucene.util.Counter;
27+
import org.apache.lucene.util.PagedBytes;
2728

2829
/** Buffers up pending byte[][] value(s) per doc, then flushes when segment flushes. */
2930
class PointValuesWriter {
3031
private final FieldInfo fieldInfo;
31-
private final ByteBlockPool bytes;
32+
private final PagedBytes bytes;
33+
private final DataOutput bytesOut;
3234
private final Counter iwBytesUsed;
3335
private int[] docIDs;
3436
private int numPoints;
3537
private int numDocs;
3638
private int lastDocID = -1;
3739
private final int packedBytesLength;
3840

39-
PointValuesWriter(ByteBlockPool.Allocator allocator, Counter bytesUsed, FieldInfo fieldInfo) {
41+
PointValuesWriter(Counter bytesUsed, FieldInfo fieldInfo) {
4042
this.fieldInfo = fieldInfo;
4143
this.iwBytesUsed = bytesUsed;
42-
this.bytes = new ByteBlockPool(allocator);
44+
this.bytes = new PagedBytes(12);
45+
bytesOut = bytes.getDataOutput();
4346
docIDs = new int[16];
4447
iwBytesUsed.addAndGet(16 * Integer.BYTES);
4548
packedBytesLength = fieldInfo.getPointDimensionCount() * fieldInfo.getPointNumBytes();
4649
}
4750

4851
// TODO: if exactly the same value is added to exactly the same doc, should we dedup?
49-
public void addPackedValue(int docID, BytesRef value) {
52+
public void addPackedValue(int docID, BytesRef value) throws IOException {
5053
if (value == null) {
5154
throw new IllegalArgumentException(
5255
"field=" + fieldInfo.name + ": point value must not be null");
@@ -65,7 +68,9 @@ public void addPackedValue(int docID, BytesRef value) {
6568
docIDs = ArrayUtil.grow(docIDs, numPoints + 1);
6669
iwBytesUsed.addAndGet((docIDs.length - numPoints) * Integer.BYTES);
6770
}
68-
bytes.append(value);
71+
final long bytesRamBytesUsedBefore = bytes.ramBytesUsed();
72+
bytesOut.writeBytes(value.bytes, value.offset, value.length);
73+
iwBytesUsed.addAndGet(bytes.ramBytesUsed() - bytesRamBytesUsedBefore);
6974
docIDs[numPoints] = docID;
7075
if (docID != lastDocID) {
7176
numDocs++;
@@ -86,6 +91,7 @@ public int getNumDocs() {
8691

8792
public void flush(SegmentWriteState state, Sorter.DocMap sortMap, PointsWriter writer)
8893
throws IOException {
94+
final PagedBytes.Reader bytesReader = bytes.freeze(false);
8995
PointValues points =
9096
new MutablePointValues() {
9197
final int[] ords = new int[numPoints];
@@ -164,14 +170,13 @@ public int getDocID(int i) {
164170
@Override
165171
public void getValue(int i, BytesRef packedValue) {
166172
final long offset = (long) packedBytesLength * ords[i];
167-
packedValue.length = packedBytesLength;
168-
bytes.setRawBytesRef(packedValue, offset);
173+
bytesReader.fillSlice(packedValue, offset, packedBytesLength);
169174
}
170175

171176
@Override
172177
public byte getByteAt(int i, int k) {
173178
final long offset = (long) packedBytesLength * ords[i] + k;
174-
return bytes.readByte(offset);
179+
return bytesReader.getByte(offset);
175180
}
176181

177182
@Override

lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,12 +48,12 @@ class SortedDocValuesWriter extends DocValuesWriter<SortedDocValues> {
4848
private int[] finalSortedValues;
4949
private int[] finalOrdMap;
5050

51-
public SortedDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
51+
public SortedDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed, ByteBlockPool pool) {
5252
this.fieldInfo = fieldInfo;
5353
this.iwBytesUsed = iwBytesUsed;
5454
hash =
5555
new BytesRefHash(
56-
new ByteBlockPool(new ByteBlockPool.DirectTrackingAllocator(iwBytesUsed)),
56+
pool,
5757
BytesRefHash.DEFAULT_CAPACITY,
5858
new DirectBytesStartArray(BytesRefHash.DEFAULT_CAPACITY, iwBytesUsed));
5959
pending = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);

lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,12 +55,12 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
5555
private int[] finalSortedValues;
5656
private int[] finalOrdMap;
5757

58-
SortedSetDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
58+
SortedSetDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed, ByteBlockPool pool) {
5959
this.fieldInfo = fieldInfo;
6060
this.iwBytesUsed = iwBytesUsed;
6161
hash =
6262
new BytesRefHash(
63-
new ByteBlockPool(new ByteBlockPool.DirectTrackingAllocator(iwBytesUsed)),
63+
pool,
6464
BytesRefHash.DEFAULT_CAPACITY,
6565
new DirectBytesStartArray(BytesRefHash.DEFAULT_CAPACITY, iwBytesUsed));
6666
pending = PackedLongValues.packedBuilder(PackedInts.COMPACT);

lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -377,14 +377,6 @@ public void setRawBytesRef(BytesRef ref, final long offset) {
377377
}
378378
}
379379

380-
/** Read a single byte at the given {@code offset}. */
381-
public byte readByte(long offset) {
382-
int bufferIndex = (int) (offset >> BYTE_BLOCK_SHIFT);
383-
int pos = (int) (offset & BYTE_BLOCK_MASK);
384-
byte[] buffer = buffers[bufferIndex];
385-
return buffer[pos];
386-
}
387-
388380
@Override
389381
public long ramBytesUsed() {
390382
long size = BASE_RAM_BYTES;

lucene/core/src/java/org/apache/lucene/util/PagedBytes.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,17 @@ public void fillSlice(BytesRef b, long start, int length) {
100100
}
101101
}
102102

103+
/**
104+
* Get the byte at the given offset.
105+
*
106+
* @lucene.internal
107+
*/
108+
public byte getByte(long o) {
109+
final int index = (int) (o >> blockBits);
110+
final int offset = (int) (o & blockMask);
111+
return blocks[index][offset];
112+
}
113+
103114
/**
104115
* Reads length as 1 or 2 byte vInt prefix, starting at <i>start</i>.
105116
*

lucene/core/src/java/org/apache/lucene/util/packed/PackedLongValues.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ public class PackedLongValues extends LongValues implements Accountable {
2929
private static final long BASE_RAM_BYTES_USED =
3030
RamUsageEstimator.shallowSizeOfInstance(PackedLongValues.class);
3131

32-
static final int DEFAULT_PAGE_SIZE = 1024;
32+
static final int DEFAULT_PAGE_SIZE = 256;
3333
static final int MIN_PAGE_SIZE = 64;
3434
// More than 1M doesn't really make sense with these appending buffers
3535
// since their goal is to try to have small numbers of bits per value

lucene/core/src/test/org/apache/lucene/index/TestDocumentWriter.java

Lines changed: 78 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,21 +17,39 @@
1717
package org.apache.lucene.index;
1818

1919
import java.io.IOException;
20-
import org.apache.lucene.analysis.*;
20+
import java.util.function.Function;
21+
import org.apache.lucene.analysis.Analyzer;
22+
import org.apache.lucene.analysis.MockAnalyzer;
23+
import org.apache.lucene.analysis.MockTokenizer;
24+
import org.apache.lucene.analysis.TokenFilter;
25+
import org.apache.lucene.analysis.TokenStream;
26+
import org.apache.lucene.analysis.Tokenizer;
2127
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
2228
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
2329
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
30+
import org.apache.lucene.document.BinaryDocValuesField;
2431
import org.apache.lucene.document.Document;
2532
import org.apache.lucene.document.Field;
33+
import org.apache.lucene.document.Field.Store;
2634
import org.apache.lucene.document.FieldType;
35+
import org.apache.lucene.document.IntPoint;
36+
import org.apache.lucene.document.NumericDocValuesField;
37+
import org.apache.lucene.document.SortedDocValuesField;
38+
import org.apache.lucene.document.SortedNumericDocValuesField;
39+
import org.apache.lucene.document.SortedSetDocValuesField;
40+
import org.apache.lucene.document.StoredField;
41+
import org.apache.lucene.document.StringField;
2742
import org.apache.lucene.document.TextField;
43+
import org.apache.lucene.document.VectorField;
2844
import org.apache.lucene.search.DocIdSetIterator;
2945
import org.apache.lucene.store.Directory;
3046
import org.apache.lucene.util.AttributeSource;
3147
import org.apache.lucene.util.BytesRef;
3248
import org.apache.lucene.util.LuceneTestCase;
3349
import org.apache.lucene.util.TestUtil;
3450
import org.apache.lucene.util.Version;
51+
import org.hamcrest.MatcherAssert;
52+
import org.hamcrest.Matchers;
3553

3654
public class TestDocumentWriter extends LuceneTestCase {
3755
private Directory dir;
@@ -307,4 +325,63 @@ public void testLUCENE_1590() throws Exception {
307325
fi.fieldInfo("f2").getIndexOptions());
308326
reader.close();
309327
}
328+
329+
/** Make sure that each new field doesn't increase memory usage by more than 16kB on average */
330+
private void doTestRAMUsage(Function<String, IndexableField> fieldSupplier) throws IOException {
331+
try (Directory dir = newDirectory();
332+
IndexWriter w =
333+
new IndexWriter(
334+
dir,
335+
newIndexWriterConfig()
336+
.setMaxBufferedDocs(10)
337+
.setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH))) {
338+
Document doc = new Document();
339+
final int numFields = 100;
340+
for (int i = 0; i < numFields; ++i) {
341+
doc.add(fieldSupplier.apply("f" + i));
342+
}
343+
w.addDocument(doc);
344+
assertTrue(w.hasChangesInRam());
345+
MatcherAssert.assertThat(w.ramBytesUsed(), Matchers.lessThan(numFields * 16384L));
346+
}
347+
}
348+
349+
public void testRAMUsageStored() throws IOException {
350+
doTestRAMUsage(field -> new StoredField(field, new BytesRef("Lucene")));
351+
}
352+
353+
public void testRAMUsageIndexed() throws IOException {
354+
doTestRAMUsage(field -> new StringField(field, new BytesRef("Lucene"), Store.NO));
355+
}
356+
357+
public void testRAMUsagePoint() throws IOException {
358+
doTestRAMUsage(field -> new IntPoint(field, 42));
359+
}
360+
361+
public void testRAMUsageNumericDocValue() throws IOException {
362+
doTestRAMUsage(field -> new NumericDocValuesField(field, 42));
363+
}
364+
365+
public void testRAMUsageSortedDocValue() throws IOException {
366+
doTestRAMUsage(field -> new SortedDocValuesField(field, new BytesRef("Lucene")));
367+
}
368+
369+
public void testRAMUsageBinaryDocValue() throws IOException {
370+
doTestRAMUsage(field -> new BinaryDocValuesField(field, new BytesRef("Lucene")));
371+
}
372+
373+
public void testRAMUsageSortedNumericDocValue() throws IOException {
374+
doTestRAMUsage(field -> new SortedNumericDocValuesField(field, 42));
375+
}
376+
377+
public void testRAMUsageSortedSetDocValue() throws IOException {
378+
doTestRAMUsage(field -> new SortedSetDocValuesField(field, new BytesRef("Lucene")));
379+
}
380+
381+
public void testRAMUsageVector() throws IOException {
382+
doTestRAMUsage(
383+
field ->
384+
new VectorField(
385+
field, new float[] {1, 2, 3, 4}, VectorValues.SimilarityFunction.EUCLIDEAN));
386+
}
310387
}

0 commit comments

Comments
 (0)