Skip to content

Commit 6ed24d2

Browse files
iverase authored and kderusso committed
Reduce data amplification in IVFVectorsWriter (elastic#129698)
With this change, we first create the temporary file and build the posting lists; only once that temporary file has been deleted do we merge the vectors into the vec file. As a result, at most two copies of the vectors exist on disk at the same time.
1 parent 00da642 commit 6ed24d2

File tree

1 file changed

+88
-83
lines changed

1 file changed

+88
-83
lines changed

server/src/main/java/org/elasticsearch/index/codec/vectors/IVFVectorsWriter.java

Lines changed: 88 additions & 83 deletions
Original file line number | Diff line number | Diff line change
@@ -225,102 +225,107 @@ public int ordToDoc(int ord) {
225225
}
226226

227227
@Override
228-
@SuppressForbidden(reason = "require usage of Lucene's IOUtils#deleteFilesIgnoringExceptions(...)")
229228
public final void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException {
230-
rawVectorDelegate.mergeOneField(fieldInfo, mergeState);
231229
if (fieldInfo.getVectorEncoding().equals(VectorEncoding.FLOAT32)) {
232-
final int numVectors;
233-
String tempRawVectorsFileName = null;
234-
boolean success = false;
235-
// build a float vector values with random access. In order to do that we dump the vectors to
236-
// a temporary file
237-
// and write the docID follow by the vector
238-
try (IndexOutput out = mergeState.segmentInfo.dir.createTempOutput(mergeState.segmentInfo.name, "ivf_", IOContext.DEFAULT)) {
239-
tempRawVectorsFileName = out.getName();
240-
// TODO do this better, we shouldn't have to write to a temp file, we should be able to
241-
// to just from the merged vector values, the tricky part is the random access.
242-
numVectors = writeFloatVectorValues(fieldInfo, out, MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState));
243-
CodecUtil.writeFooter(out);
230+
mergeOneFieldIVF(fieldInfo, mergeState);
231+
}
232+
// we merge the vectors at the end so we only have two copies of the vectors on disk at the same time.
233+
rawVectorDelegate.mergeOneField(fieldInfo, mergeState);
234+
}
235+
236+
@SuppressForbidden(reason = "require usage of Lucene's IOUtils#deleteFilesIgnoringExceptions(...)")
237+
private void mergeOneFieldIVF(FieldInfo fieldInfo, MergeState mergeState) throws IOException {
238+
final int numVectors;
239+
String tempRawVectorsFileName = null;
240+
boolean success = false;
241+
// build a float vector values with random access. In order to do that we dump the vectors to
242+
// a temporary file
243+
// and write the docID follow by the vector
244+
try (IndexOutput out = mergeState.segmentInfo.dir.createTempOutput(mergeState.segmentInfo.name, "ivf_", IOContext.DEFAULT)) {
245+
tempRawVectorsFileName = out.getName();
246+
// TODO do this better, we shouldn't have to write to a temp file, we should be able to
247+
// to just from the merged vector values, the tricky part is the random access.
248+
numVectors = writeFloatVectorValues(fieldInfo, out, MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState));
249+
CodecUtil.writeFooter(out);
250+
success = true;
251+
} finally {
252+
if (success == false && tempRawVectorsFileName != null) {
253+
org.apache.lucene.util.IOUtils.deleteFilesIgnoringExceptions(mergeState.segmentInfo.dir, tempRawVectorsFileName);
254+
}
255+
}
256+
try (IndexInput in = mergeState.segmentInfo.dir.openInput(tempRawVectorsFileName, IOContext.DEFAULT)) {
257+
float[] calculatedGlobalCentroid = new float[fieldInfo.getVectorDimension()];
258+
final FloatVectorValues floatVectorValues = getFloatVectorValues(fieldInfo, in, numVectors);
259+
success = false;
260+
long centroidOffset;
261+
long centroidLength;
262+
String centroidTempName = null;
263+
int numCentroids;
264+
IndexOutput centroidTemp = null;
265+
CentroidAssignments centroidAssignments;
266+
try {
267+
centroidTemp = mergeState.segmentInfo.dir.createTempOutput(mergeState.segmentInfo.name, "civf_", IOContext.DEFAULT);
268+
centroidTempName = centroidTemp.getName();
269+
270+
centroidAssignments = calculateAndWriteCentroids(
271+
fieldInfo,
272+
floatVectorValues,
273+
centroidTemp,
274+
mergeState,
275+
calculatedGlobalCentroid
276+
);
277+
numCentroids = centroidAssignments.numCentroids();
278+
244279
success = true;
245280
} finally {
246-
if (success == false && tempRawVectorsFileName != null) {
247-
org.apache.lucene.util.IOUtils.deleteFilesIgnoringExceptions(mergeState.segmentInfo.dir, tempRawVectorsFileName);
281+
if (success == false && centroidTempName != null) {
282+
IOUtils.closeWhileHandlingException(centroidTemp);
283+
org.apache.lucene.util.IOUtils.deleteFilesIgnoringExceptions(mergeState.segmentInfo.dir, centroidTempName);
248284
}
249285
}
250-
try (IndexInput in = mergeState.segmentInfo.dir.openInput(tempRawVectorsFileName, IOContext.DEFAULT)) {
251-
float[] calculatedGlobalCentroid = new float[fieldInfo.getVectorDimension()];
252-
final FloatVectorValues floatVectorValues = getFloatVectorValues(fieldInfo, in, numVectors);
253-
success = false;
254-
long centroidOffset;
255-
long centroidLength;
256-
String centroidTempName = null;
257-
int numCentroids;
258-
IndexOutput centroidTemp = null;
259-
CentroidAssignments centroidAssignments;
260-
try {
261-
centroidTemp = mergeState.segmentInfo.dir.createTempOutput(mergeState.segmentInfo.name, "civf_", IOContext.DEFAULT);
262-
centroidTempName = centroidTemp.getName();
263-
264-
centroidAssignments = calculateAndWriteCentroids(
286+
try {
287+
if (numCentroids == 0) {
288+
centroidOffset = ivfCentroids.getFilePointer();
289+
writeMeta(fieldInfo, centroidOffset, 0, new long[0], null);
290+
CodecUtil.writeFooter(centroidTemp);
291+
IOUtils.close(centroidTemp);
292+
return;
293+
}
294+
CodecUtil.writeFooter(centroidTemp);
295+
IOUtils.close(centroidTemp);
296+
centroidOffset = ivfCentroids.alignFilePointer(Float.BYTES);
297+
try (IndexInput centroidsInput = mergeState.segmentInfo.dir.openInput(centroidTempName, IOContext.DEFAULT)) {
298+
ivfCentroids.copyBytes(centroidsInput, centroidsInput.length() - CodecUtil.footerLength());
299+
centroidLength = ivfCentroids.getFilePointer() - centroidOffset;
300+
301+
CentroidSupplier centroidSupplier = createCentroidSupplier(
302+
centroidsInput,
303+
numCentroids,
265304
fieldInfo,
266-
floatVectorValues,
267-
centroidTemp,
268-
mergeState,
269305
calculatedGlobalCentroid
270306
);
271-
numCentroids = centroidAssignments.numCentroids();
272-
273-
success = true;
274-
} finally {
275-
if (success == false && centroidTempName != null) {
276-
IOUtils.closeWhileHandlingException(centroidTemp);
277-
org.apache.lucene.util.IOUtils.deleteFilesIgnoringExceptions(mergeState.segmentInfo.dir, centroidTempName);
278-
}
279-
}
280-
try {
281-
if (numCentroids == 0) {
282-
centroidOffset = ivfCentroids.getFilePointer();
283-
writeMeta(fieldInfo, centroidOffset, 0, new long[0], null);
284-
CodecUtil.writeFooter(centroidTemp);
285-
IOUtils.close(centroidTemp);
286-
return;
287-
}
288-
CodecUtil.writeFooter(centroidTemp);
289-
IOUtils.close(centroidTemp);
290-
centroidOffset = ivfCentroids.alignFilePointer(Float.BYTES);
291-
try (IndexInput centroidsInput = mergeState.segmentInfo.dir.openInput(centroidTempName, IOContext.DEFAULT)) {
292-
ivfCentroids.copyBytes(centroidsInput, centroidsInput.length() - CodecUtil.footerLength());
293-
centroidLength = ivfCentroids.getFilePointer() - centroidOffset;
294-
295-
CentroidSupplier centroidSupplier = createCentroidSupplier(
296-
centroidsInput,
297-
numCentroids,
298-
fieldInfo,
299-
calculatedGlobalCentroid
300-
);
301-
302-
// build a float vector values with random access
303-
// build centroids
304-
final long[] offsets = buildAndWritePostingsLists(
305-
fieldInfo,
306-
centroidSupplier,
307-
floatVectorValues,
308-
ivfClusters,
309-
centroidAssignments.assignmentsByCluster()
310-
);
311-
assert offsets.length == centroidSupplier.size();
312-
writeMeta(fieldInfo, centroidOffset, centroidLength, offsets, calculatedGlobalCentroid);
313-
}
314-
} finally {
315-
org.apache.lucene.util.IOUtils.deleteFilesIgnoringExceptions(
316-
mergeState.segmentInfo.dir,
317-
tempRawVectorsFileName,
318-
centroidTempName
307+
308+
// build a float vector values with random access
309+
// build centroids
310+
final long[] offsets = buildAndWritePostingsLists(
311+
fieldInfo,
312+
centroidSupplier,
313+
floatVectorValues,
314+
ivfClusters,
315+
centroidAssignments.assignmentsByCluster()
319316
);
317+
assert offsets.length == centroidSupplier.size();
318+
writeMeta(fieldInfo, centroidOffset, centroidLength, offsets, calculatedGlobalCentroid);
320319
}
321320
} finally {
322-
org.apache.lucene.util.IOUtils.deleteFilesIgnoringExceptions(mergeState.segmentInfo.dir, tempRawVectorsFileName);
321+
org.apache.lucene.util.IOUtils.deleteFilesIgnoringExceptions(
322+
mergeState.segmentInfo.dir,
323+
tempRawVectorsFileName,
324+
centroidTempName
325+
);
323326
}
327+
} finally {
328+
org.apache.lucene.util.IOUtils.deleteFilesIgnoringExceptions(mergeState.segmentInfo.dir, tempRawVectorsFileName);
324329
}
325330
}
326331

0 commit comments

Comments
 (0)