[GPU] Support for performance profiling (#136021)

ldematte · web-flow · commit 6f5f26869ee5 · 2025-10-09T11:09:42.000+02:00
In order to better understand the performance characteristics of vector indexing with a GPU, this PR introduces 2 changes:
- changes to KnnIndexTester (more logging, support for different write buffer sizes in input, support for async-profiler)
- more logging in the GPU codec
diff --git a/.gitignore b/.gitignore
@@ -46,6 +46,7 @@ build/
 **/.local*
 .vagrant/
 /logs/
+**/target/
 
 # osx stuff
 .DS_Store
diff --git a/qa/vector/build.gradle b/qa/vector/build.gradle
@@ -51,13 +51,34 @@ tasks.register("checkVec", JavaExec) {
   systemProperty "es.logger.out", "console"
   systemProperty "es.logger.level", "INFO"  // Change to DEBUG if needed
   systemProperty 'es.nativelibs.path', TestUtil.getTestLibraryPath(file("../../libs/native/libraries/build/platform/").toString())
-  jvmArgs '-Xms4g', '-Xmx4g', '-Djava.util.concurrent.ForkJoinPool.common.parallelism=8', '-XX:+UnlockDiagnosticVMOptions', '-XX:+DebugNonSafepoints', '-XX:+HeapDumpOnOutOfMemoryError'
+  jvmArgs '-Xms16g', '-Xmx16g', '-Djava.util.concurrent.ForkJoinPool.common.parallelism=8', '-XX:+UnlockDiagnosticVMOptions', '-XX:+DebugNonSafepoints', '-XX:+HeapDumpOnOutOfMemoryError'
   if (buildParams.getRuntimeJavaVersion().map { it.majorVersion.toInteger() }.get() >= 21) {
     jvmArgs '--add-modules=jdk.incubator.vector', '--enable-native-access=ALL-UNNAMED'
   }
   if (System.getenv("DO_PROFILING") != null) {
     jvmArgs '-XX:StartFlightRecording=dumponexit=true,maxsize=250M,filename=knn.jfr,settings=profile.jfc'
   }
+  def asyncProfilerPath = System.getProperty("asyncProfiler.path", null)
+  if (asyncProfilerPath != null) {
+    def asyncProfilerEvent = System.getProperty("asyncProfiler.event", "cpu")
+    if (OS.current().equals(OS.MAC)) {
+      def asyncProfilerAgent = "${asyncProfilerPath}/lib/libasyncProfiler.dylib"
+      println "Using async-profiler agent ${asyncProfilerAgent}"
+
+      // MacOS implementation of async-profiler does not support wall clock profiling with another event.
+      // Wall clock times can be obtained separately by invoking this task with `-DasyncProfiler.event=wall`
+      jvmArgs "-agentpath:${asyncProfilerAgent}=start,event=${asyncProfilerEvent},interval=10ms,file=${layout.buildDirectory.asFile.get()}/tmp/elasticsearch-0_%t_%p.jfr"
+    } else if (OS.current().equals(OS.LINUX)) {
+      // Linux implementation of async-profiler uses perf_event, which allows wall clock profiling with another event (cpu)
+      def additionalWallInterval = asyncProfilerEvent.equals("cpu") ? ",wall=50ms" : ""
+
+      def asyncProfilerAgent = "${asyncProfilerPath}/lib/libasyncProfiler.so"
+      println "Using async-profiler agent ${asyncProfilerAgent}"
+      jvmArgs "-agentpath:${asyncProfilerAgent}=start,event=${asyncProfilerEvent},interval=10ms${additionalWallInterval},file=${layout.buildDirectory.asFile.get()}/tmp/elasticsearch-0_%t_%p.jfr"
+    } else {
+      println "Ignoring 'asyncProfiler.path': not available on ${OS.current()}";
+    }
+  }
   if (buildParams.getIsRuntimeJavaHomeSet()) {
     executable = "${buildParams.runtimeJavaHome.get()}/bin/java" + (OS.current() == OS.WINDOWS ? '.exe' : '')
   } else {
diff --git a/qa/vector/src/main/java/org/elasticsearch/test/knn/CmdLineArgs.java b/qa/vector/src/main/java/org/elasticsearch/test/knn/CmdLineArgs.java
@@ -9,10 +9,12 @@
 
 package org.elasticsearch.test.knn;
 
+import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.VectorEncoding;
 import org.apache.lucene.index.VectorSimilarityFunction;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.core.PathUtils;
+import org.elasticsearch.monitor.jvm.JvmInfo;
 import org.elasticsearch.xcontent.ObjectParser;
 import org.elasticsearch.xcontent.ParseField;
 import org.elasticsearch.xcontent.ToXContentObject;
@@ -53,7 +55,9 @@ record CmdLineArgs(
     VectorEncoding vectorEncoding,
     int dimensions,
     boolean earlyTermination,
-    KnnIndexTester.MergePolicyType mergePolicy
+    KnnIndexTester.MergePolicyType mergePolicy,
+    double writerBufferSizeInMb,
+    int writerMaxBufferedDocs
 ) implements ToXContentObject {
 
     static final ParseField DOC_VECTORS_FIELD = new ParseField("doc_vectors");
@@ -82,6 +86,15 @@ record CmdLineArgs(
     static final ParseField FILTER_SELECTIVITY_FIELD = new ParseField("filter_selectivity");
     static final ParseField SEED_FIELD = new ParseField("seed");
     static final ParseField MERGE_POLICY_FIELD = new ParseField("merge_policy");
+    static final ParseField WRITER_BUFFER_MB_FIELD = new ParseField("writer_buffer_mb");
+    static final ParseField WRITER_BUFFER_DOCS_FIELD = new ParseField("writer_buffer_docs");
+
+    /** In ES, the default writer buffer size is 10% of the heap space
+     * (see {@code IndexingMemoryController.INDEX_BUFFER_SIZE_SETTING}).
+     * We configure the Java heap size for this tool in {@code build.gradle}; currently we default to 16GB, so in that case
+     * the buffer size would be 1.6GB.
+     */
+    static final double DEFAULT_WRITER_BUFFER_MB = (JvmInfo.jvmInfo().getMem().getHeapMax().getBytes() / (1024.0 * 1024.0)) * 0.1;
 
     static CmdLineArgs fromXContent(XContentParser parser) throws IOException {
         Builder builder = PARSER.apply(parser, null);
@@ -117,6 +130,8 @@ static CmdLineArgs fromXContent(XContentParser parser) throws IOException {
         PARSER.declareFloat(Builder::setFilterSelectivity, FILTER_SELECTIVITY_FIELD);
         PARSER.declareLong(Builder::setSeed, SEED_FIELD);
         PARSER.declareString(Builder::setMergePolicy, MERGE_POLICY_FIELD);
+        PARSER.declareDouble(Builder::setWriterBufferMb, WRITER_BUFFER_MB_FIELD);
+        PARSER.declareInt(Builder::setWriterMaxBufferedDocs, WRITER_BUFFER_DOCS_FIELD);
     }
 
     @Override
@@ -152,6 +167,8 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
         builder.field(EARLY_TERMINATION_FIELD.getPreferredName(), earlyTermination);
         builder.field(FILTER_SELECTIVITY_FIELD.getPreferredName(), filterSelectivity);
         builder.field(SEED_FIELD.getPreferredName(), seed);
+        builder.field(WRITER_BUFFER_MB_FIELD.getPreferredName(), writerBufferSizeInMb);
+        builder.field(WRITER_BUFFER_DOCS_FIELD.getPreferredName(), writerMaxBufferedDocs);
         return builder.endObject();
     }
 
@@ -186,6 +203,13 @@ static class Builder {
         private float filterSelectivity = 1f;
         private long seed = 1751900822751L;
         private KnnIndexTester.MergePolicyType mergePolicy = null;
+        private double writerBufferSizeInMb = DEFAULT_WRITER_BUFFER_MB;
+
+        /**
+         * Elasticsearch does not set this explicitly, and in Lucene this setting is
+         * disabled by default (writer flushes by RAM usage).
+         */
+        private int writerMaxBufferedDocs = IndexWriterConfig.DISABLE_AUTO_FLUSH;
 
         public Builder setDocVectors(List<String> docVectors) {
             if (docVectors == null || docVectors.isEmpty()) {
@@ -316,6 +340,16 @@ public Builder setMergePolicy(String mergePolicy) {
             return this;
         }
 
+        public Builder setWriterBufferMb(double writerBufferSizeInMb) {
+            this.writerBufferSizeInMb = writerBufferSizeInMb;
+            return this;
+        }
+
+        public Builder setWriterMaxBufferedDocs(int writerMaxBufferedDocs) {
+            this.writerMaxBufferedDocs = writerMaxBufferedDocs;
+            return this;
+        }
+
         public CmdLineArgs build() {
             if (docVectors == null) {
                 throw new IllegalArgumentException("Document vectors path must be provided");
@@ -350,7 +384,9 @@ public CmdLineArgs build() {
                 vectorEncoding,
                 dimensions,
                 earlyTermination,
-                mergePolicy
+                mergePolicy,
+                writerBufferSizeInMb,
+                writerMaxBufferedDocs
             );
         }
     }
diff --git a/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnIndexTester.java b/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnIndexTester.java
@@ -240,7 +240,9 @@ public static void main(String[] args) throws Exception {
                     cmdLineArgs.dimensions(),
                     cmdLineArgs.vectorSpace(),
                     cmdLineArgs.numDocs(),
-                    mergePolicy
+                    mergePolicy,
+                    cmdLineArgs.writerBufferSizeInMb(),
+                    cmdLineArgs.writerMaxBufferedDocs()
                 );
                 if (cmdLineArgs.reindex() == false && Files.exists(indexPath) == false) {
                     throw new IllegalArgumentException("Index path does not exist: " + indexPath);
@@ -301,7 +303,14 @@ public String toString() {
                 return "No results available.";
             }
 
-            String[] indexingHeaders = { "index_name", "index_type", "num_docs", "index_time(ms)", "force_merge_time(ms)", "num_segments" };
+            String[] indexingHeaders = {
+                "index_name",
+                "index_type",
+                "num_docs",
+                "doc_add_time(ms)",
+                "total_index_time(ms)",
+                "force_merge_time(ms)",
+                "num_segments" };
 
             // Define column headers
             String[] searchHeaders = {
@@ -327,6 +336,7 @@ public String toString() {
                     indexResult.indexName,
                     indexResult.indexType,
                     Integer.toString(indexResult.numDocs),
+                    Long.toString(indexResult.docAddTimeMS),
                     Long.toString(indexResult.indexTimeMS),
                     Long.toString(indexResult.forceMergeTimeMS),
                     Integer.toString(indexResult.numSegments) };
@@ -409,6 +419,7 @@ private int[] calculateColumnWidths(String[] headers, String[]... data) {
 
     static class Results {
         final String indexType, indexName;
+        public long docAddTimeMS;
         int numDocs;
         final float filterSelectivity;
         long indexTimeMS;
diff --git a/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnIndexer.java b/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnIndexer.java
@@ -29,6 +29,7 @@
 import org.apache.lucene.index.ConcurrentMergeScheduler;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.index.MergePolicy;
 import org.apache.lucene.index.VectorEncoding;
 import org.apache.lucene.index.VectorSimilarityFunction;
@@ -60,7 +61,6 @@
 import static org.elasticsearch.test.knn.KnnIndexTester.logger;
 
 class KnnIndexer {
-    private static final double WRITER_BUFFER_MB = 128;
     static final String ID_FIELD = "id";
     static final String VECTOR_FIELD = "vector";
 
@@ -73,6 +73,8 @@ class KnnIndexer {
     private final int numDocs;
     private final int numIndexThreads;
     private final MergePolicy mergePolicy;
+    private final double writerBufferSizeInMb;
+    private final int writerMaxBufferedDocs;
 
     KnnIndexer(
         List<Path> docsPath,
@@ -83,7 +85,9 @@ class KnnIndexer {
         int dim,
         VectorSimilarityFunction similarityFunction,
         int numDocs,
-        MergePolicy mergePolicy
+        MergePolicy mergePolicy,
+        double writerBufferSizeInMb,
+        int writerMaxBufferedDocs
     ) {
         this.docsPath = docsPath;
         this.indexPath = indexPath;
@@ -94,12 +98,15 @@ class KnnIndexer {
         this.similarityFunction = similarityFunction;
         this.numDocs = numDocs;
         this.mergePolicy = mergePolicy;
+        this.writerBufferSizeInMb = writerBufferSizeInMb;
+        this.writerMaxBufferedDocs = writerMaxBufferedDocs;
     }
 
     void createIndex(KnnIndexTester.Results result) throws IOException, InterruptedException, ExecutionException {
         IndexWriterConfig iwc = new IndexWriterConfig().setOpenMode(IndexWriterConfig.OpenMode.CREATE);
         iwc.setCodec(codec);
-        iwc.setRAMBufferSizeMB(WRITER_BUFFER_MB);
+        iwc.setMaxBufferedDocs(writerMaxBufferedDocs);
+        iwc.setRAMBufferSizeMB(writerBufferSizeInMb);
         iwc.setUseCompoundFile(false);
         if (mergePolicy != null) {
             iwc.setMergePolicy(mergePolicy);
@@ -178,15 +185,20 @@ public boolean isEnabled(String component) {
 
                     VectorReader inReader = VectorReader.create(in, dim, vectorEncoding, offsetByteSize);
                     try (ExecutorService exec = Executors.newFixedThreadPool(numIndexThreads, r -> new Thread(r, "KnnIndexer-Thread"))) {
-                        List<Future<?>> threads = new ArrayList<>();
+                        List<Future<?>> futures = new ArrayList<>();
+                        List<IndexerThread> threads = new ArrayList<>();
                         for (int i = 0; i < numIndexThreads; i++) {
-                            Thread t = new IndexerThread(iw, inReader, dim, vectorEncoding, fieldType, numDocsIndexed, numDocs);
+                            var t = new IndexerThread(iw, inReader, dim, vectorEncoding, fieldType, numDocsIndexed, numDocs);
+                            threads.add(t);
                             t.setDaemon(true);
-                            threads.add(exec.submit(t));
+                            futures.add(exec.submit(t));
                         }
-                        for (Future<?> t : threads) {
-                            t.get();
+                        for (Future<?> future : futures) {
+                            future.get();
                         }
+                        result.docAddTimeMS = TimeUnit.NANOSECONDS.toMillis(
+                            threads.stream().mapToLong(x -> x.docAddTime).sum() / numIndexThreads
+                        );
                     }
                 }
             }
@@ -243,6 +255,9 @@ static class IndexerThread extends Thread {
         private final float[] floatVectorBuffer;
         private final VectorReader in;
 
+        long readTime;
+        long docAddTime;
+
         private IndexerThread(
             IndexWriter iw,
             VectorReader in,
@@ -289,23 +304,32 @@ private void _run() throws IOException {
                     continue;
                 }
 
-                Document doc = new Document();
+                var startRead = System.nanoTime();
+                final IndexableField field;
                 switch (vectorEncoding) {
                     case BYTE -> {
                         in.next(byteVectorBuffer);
-                        doc.add(new KnnByteVectorField(VECTOR_FIELD, byteVectorBuffer, fieldType));
+                        field = new KnnByteVectorField(VECTOR_FIELD, byteVectorBuffer, fieldType);
                     }
                     case FLOAT32 -> {
                         in.next(floatVectorBuffer);
-                        doc.add(new KnnFloatVectorField(VECTOR_FIELD, floatVectorBuffer, fieldType));
+                        field = new KnnFloatVectorField(VECTOR_FIELD, floatVectorBuffer, fieldType);
                     }
+                    default -> throw new UnsupportedOperationException();
                 }
+                long endRead = System.nanoTime();
+                readTime += (endRead - startRead);
+
+                Document doc = new Document();
+                doc.add(field);
 
                 if ((id + 1) % 25000 == 0) {
                     logger.debug("Done indexing " + (id + 1) + " documents.");
                 }
                 doc.add(new StoredField(ID_FIELD, id));
                 iw.addDocument(doc);
+
+                docAddTime += (System.nanoTime() - endRead);
             }
         }
     }
diff --git a/server/src/main/java/org/elasticsearch/index/engine/EngineConfig.java b/server/src/main/java/org/elasticsearch/index/engine/EngineConfig.java
@@ -23,6 +23,7 @@
 import org.elasticsearch.common.unit.MemorySizeValue;
 import org.elasticsearch.core.Nullable;
 import org.elasticsearch.core.TimeValue;
+import org.elasticsearch.core.UpdateForV10;
 import org.elasticsearch.index.IndexMode;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.codec.CodecProvider;
@@ -131,6 +132,7 @@ public Supplier<RetentionLeases> retentionLeasesSupplier() {
      * TODO: Remove in 9.0
      */
     @Deprecated
+    @UpdateForV10(owner = UpdateForV10.Owner.DISTRIBUTED_INDEXING)
     public static final Setting<Boolean> INDEX_OPTIMIZE_AUTO_GENERATED_IDS = Setting.boolSetting(
         "index.optimize_auto_generated_id",
         true,
@@ -213,6 +215,7 @@ public EngineConfig(
         // Add an escape hatch in case this change proves problematic - it used
         // to be a fixed amound of RAM: 256 MB.
         // TODO: Remove this escape hatch in 8.x
+        @UpdateForV10(owner = UpdateForV10.Owner.DISTRIBUTED_INDEXING)
         final String escapeHatchProperty = "es.index.memory.max_index_buffer_size";
         String maxBufferSize = System.getProperty(escapeHatchProperty);
         if (maxBufferSize != null) {
diff --git a/x-pack/plugin/gpu/src/main/java/org/elasticsearch/xpack/gpu/codec/CuVSResourceManager.java b/x-pack/plugin/gpu/src/main/java/org/elasticsearch/xpack/gpu/codec/CuVSResourceManager.java
@@ -130,6 +130,7 @@ private int numLockedResources() {
         @Override
         public ManagedCuVSResources acquire(int numVectors, int dims, CuVSMatrix.DataType dataType) throws InterruptedException {
             try {
+                var started = System.nanoTime();
                 lock.lock();
 
                 boolean allConditionsMet = false;
@@ -181,6 +182,8 @@ public ManagedCuVSResources acquire(int numVectors, int dims, CuVSMatrix.DataTyp
                         enoughResourcesCondition.await();
                     }
                 }
+                var elapsed = System.nanoTime() - started; // end minus start, so the logged duration is non-negative
+                logger.debug("Resource acquired in [{}ms]", elapsed / 1_000_000.0);
                 res.locked = true;
                 return res;
             } finally {
diff --git a/x-pack/plugin/gpu/src/main/java/org/elasticsearch/xpack/gpu/codec/ES92GpuHnswVectorsWriter.java b/x-pack/plugin/gpu/src/main/java/org/elasticsearch/xpack/gpu/codec/ES92GpuHnswVectorsWriter.java