5959
6060/**
6161 * Benchmark for measuring query performance with and without doc values skipper in Elasticsearch.
62- * <p>
63- * <b>Goal:</b> This benchmark is designed to **mimic and benchmark the execution of a range query in LogsDB**,
64- * with and without a **sparse doc values index** on the `host.name` and `@timestamp` fields.
65- * <p>
66- * <b>Document Structure:</b>
67- * - `host.name`: A keyword field (sorted, non-stored).
68- * - `@timestamp`: A numeric field, indexed for range queries and using doc values with or without a doc values sparse index.
69- * <p>
70- * <b>Index Sorting:</b>
71- * The index is sorted primarily by `host.name` (ascending) and secondarily by `@timestamp` (descending).
62+ *
63+ * <p><b>Goal:</b> This benchmark is designed to mimic and benchmark the execution of a range query in LogsDB,
64+ * with and without a sparse doc values index on the {@code host.name} and {@code @timestamp} fields.
65+ *
66+ * <p><b>Document Structure:</b>
67+ * <ul>
68+ * <li>{@code host.name}: A keyword field (sorted, non-stored).</li>
69+ * <li>{@code @timestamp}: A numeric field, indexed for range queries and using doc values with or without a doc values sparse index.</li>
70+ * </ul>
71+ *
72+ * <p><b>Index Sorting:</b>
73+ * The index is sorted primarily by {@code host.name} (ascending) and secondarily by {@code @timestamp} (descending).
7274 * Documents are grouped into batches, where each hostname gets a dedicated batch of timestamps.
73- * This is meant to simulate collection of logs from a set of hosts in a certain time interval.
74- * <p>
75- * <b>Batched Data Behavior:</b>
76- * - The `host.name` value is generated in batches (e.g., "host-0", "host-1", ...).
77- * - Each batch contains a fixed number of documents (`batchSize`).
78- * - The `@timestamp` value resets to `BASE_TIMESTAMP` at the start of each batch.
79- * - A random **timestamp delta** (0-{@code timestampIncrementMillis} ms) is added to ensure timestamps within each batch have slight
80- * variation.
81- * <p>
82- * <b>Example Output:</b>
83- * The table below shows a sample of generated documents (with a batch size of 10,000):
75+ * This is meant to simulate collecting logs from a set of hosts over a certain time interval.
76+ *
77+ * <p><b>Batched Data Behavior:</b>
78+ * <ul>
79+ * <li>The {@code host.name} value is generated in batches (e.g., "host-0", "host-1", ...).</li>
80+ * <li>Each batch contains a fixed number of documents ({@code batchSize}).</li>
81+ * <li>The {@code @timestamp} value resets to {@code BASE_TIMESTAMP} at the start of each batch.</li>
82+ * <li>A random timestamp delta (0–{@code deltaTime} ms) is added so that each document in a batch differs slightly.</li>
83+ * </ul>
8484 *
85+ * <p><b>Example Output:</b>
8586 * <pre>
8687 * | Document # | host.name | @timestamp (ms since epoch) |
87- * |-----------|----------| ---------------------------|
88- * | 1 | host-0 | 1704067200005 |
89- * | 2 | host-0 | 1704067201053 |
90- * | 3 | host-0 | 1704067202091 |
91- * | ... | ... | ... |
92- * | 10000 | host-0 | 1704077199568 |
93- * | 10001 | host-1 | 1704067200042 |
94- * | 10002 | host-1 | 1704067201099 |
95- * | ... | ... | ... |
 * |------------|-----------|-----------------------------|
89+ * | 1 | host-0 | 1704067200005 |
90+ * | 2 | host-0 | 1704067201053 |
91+ * | 3 | host-0 | 1704067202091 |
92+ * | ... | ... | ... |
93+ * | 10000 | host-0 | 1704077199568 |
94+ * | 10001 | host-1 | 1704067200042 |
95+ * | 10002 | host-1 | 1704067201099 |
96+ * | ... | ... | ... |
9697 * </pre>
9798 *
98- * <p>
99- * When running the range query we also retrieve just a fraction of the data, to simulate a real-world scenario where a
100- * dashboard requires only the most recent logs.
99+ * <p>When running the range query, we retrieve only a fraction of the total data,
100+ * simulating a real-world scenario where a dashboard only needs the most recent logs.
101101 */
102102@ BenchmarkMode (Mode .SampleTime )
103103@ OutputTimeUnit (TimeUnit .MILLISECONDS )
108108@ Measurement (iterations = 5 )
109109public class DateFieldMapperDocValuesSkipperBenchmark {
110110
111- public static void main (String [] args ) throws RunnerException {
112- final Options options = new OptionsBuilder ().include (DateFieldMapperDocValuesSkipperBenchmark .class .getSimpleName ())
113- .addProfiler (AsyncProfiler .class )
114- .build ();
115-
116- new Runner (options ).run ();
117- }
118-
    /**
     * Total number of documents to index.
     */
    @Param("1343120")
    private int nDocs;

    /**
     * Number of documents per hostname batch.
     */
    @Param({ "1340", "121300" })
    private int batchSize;

    /**
     * Upper bound (exclusive) of the random increment, in milliseconds, added to each
     * document's timestamp — the per-document delta is drawn from [0, deltaTime).
     */
    @Param("1000")
    private int deltaTime;

    /**
     * Fraction of the total time range (derived from {@code batchSize * deltaTime}) that the range query will cover.
     */
    @Param({ "0.01", "0.2", "0.8" })
    private double queryRange;

    /**
     * Number of docs to index before forcing a commit, thus creating multiple Lucene segments.
     */
    @Param({ "7390", "398470" })
    private int commitEvery;

    /**
     * Seed for random data generation.
     */
    @Param("42")
    private int seed;
@@ -143,122 +153,182 @@ public static void main(String[] args) throws RunnerException {
143153 private ExecutorService executorService ;
144154
145155 /**
146- * Sets up the benchmark by creating Lucene indexes with and without doc values skipper.
156+ * Main entry point for running this benchmark via JMH.
157+ *
158+ * @param args command line arguments (unused)
159+ * @throws RunnerException if the benchmark fails to run
160+ */
161+ public static void main (String [] args ) throws RunnerException {
162+ final Options options = new OptionsBuilder ().include (DateFieldMapperDocValuesSkipperBenchmark .class .getSimpleName ())
163+ .addProfiler (AsyncProfiler .class )
164+ .build ();
165+
166+ new Runner (options ).run ();
167+ }
168+
169+ /**
170+ * Sets up the benchmark by creating Lucene indexes (with and without doc values skipper).
171+ * Sets up a single-threaded executor for searching the indexes and avoid concurrent search threads.
147172 *
148- * @throws IOException if an error occurs during index creation.
173+ * @throws IOException if an error occurs while building the index
149174 */
150175 @ Setup (Level .Trial )
151176 public void setup () throws IOException {
152177 executorService = Executors .newSingleThreadExecutor ();
153- Directory tempDirectoryWithoutDocValuesSkipper = FSDirectory .open (Files .createTempDirectory ("temp1-" ));
154- Directory tempDirectoryWithDocValuesSkipper = FSDirectory .open (Files .createTempDirectory ("temp2-" ));
178+
179+ final Directory tempDirectoryWithoutDocValuesSkipper = FSDirectory .open (Files .createTempDirectory ("temp1-" ));
180+ final Directory tempDirectoryWithDocValuesSkipper = FSDirectory .open (Files .createTempDirectory ("temp2-" ));
155181
156182 indexSearcherWithoutDocValuesSkipper = createIndex (tempDirectoryWithoutDocValuesSkipper , false , commitEvery );
157183 indexSearcherWithDocValuesSkipper = createIndex (tempDirectoryWithDocValuesSkipper , true , commitEvery );
158184 }
159185
160186 /**
161- * Creates an {@link IndexSearcher} from a newly created {@link IndexWriter}. Documents
162- * are added to the index and committed in batches of a specified size to generate multiple segments .
187+ * Creates an {@link IndexSearcher} after indexing documents in batches.
188+ * Each batch commit forces multiple segments to be created .
163189 *
164- * @param directory the Lucene {@link Directory} where the index will be written
165- * @param withDocValuesSkipper indicates whether certain fields should skip doc values
166- * @param commitEvery the number of documents after which to force a commit
167- * @return an {@link IndexSearcher} that can be used to query the newly created index
168- * @throws IOException if an I/O error occurs during index writing or reading
190+ * @param directory the Lucene {@link Directory} for writing the index
191+ * @param withDocValuesSkipper true if we should enable doc values skipper on certain fields
192+ * @param commitEvery number of documents after which to commit (and thus segment)
193+ * @return an {@link IndexSearcher} for querying the newly built index
194+ * @throws IOException if an I/O error occurs during index writing
169195 */
170196 private IndexSearcher createIndex (final Directory directory , final boolean withDocValuesSkipper , final int commitEvery )
171197 throws IOException {
198+
172199 final IndexWriterConfig config = new IndexWriterConfig (new StandardAnalyzer ());
200+ // NOTE: index sort config matching LogsDB's sort order
173201 config .setIndexSort (
174202 new Sort (
175- new SortField (HOSTNAME_FIELD , SortField .Type .STRING , false ), // NOTE: `host.name` ascending
176- new SortedNumericSortField (TIMESTAMP_FIELD , SortField .Type .LONG , true ) // NOTE: `@timestamp` descending
203+ new SortField (HOSTNAME_FIELD , SortField .Type .STRING , false ),
204+ new SortedNumericSortField (TIMESTAMP_FIELD , SortField .Type .LONG , true )
177205 )
178206 );
179207
180208 final Random random = new Random (seed );
209+
181210 try (IndexWriter indexWriter = new IndexWriter (directory , config )) {
182211 int docCountSinceLastCommit = 0 ;
212+
183213 for (int i = 0 ; i < nDocs ; i ++) {
184214 final Document doc = new Document ();
185215 addFieldsToDocument (doc , i , withDocValuesSkipper , random );
186216 indexWriter .addDocument (doc );
187217 docCountSinceLastCommit ++;
188218
189- // NOTE: make sure we have multiple Lucene segments
219+ // Force commit periodically to create multiple Lucene segments
190220 if (docCountSinceLastCommit >= commitEvery ) {
191221 indexWriter .commit ();
192222 docCountSinceLastCommit = 0 ;
193223 }
194224 }
195225
196226 indexWriter .commit ();
197- final DirectoryReader reader = DirectoryReader .open (indexWriter );
198- // NOTE: internally Elasticsearch runs multiple search threads concurrently, (at least) one per Lucene segment.
199- // Here we simplify the benchmark making sure we have a single-threaded search execution using a single thread
200- // executor Service.
227+
228+ // Open a reader and create a searcher on top of it using a single thread executor.
229+ DirectoryReader reader = DirectoryReader .open (indexWriter );
201230 return new IndexSearcher (reader , executorService );
202231 }
203232 }
204233
234+ /**
235+ * Populates the given {@link Document} with fields, optionally using doc values skipper.
236+ *
237+ * @param doc the Lucene document to fill
238+ * @param docIndex index of the document being added
239+ * @param withDocValuesSkipper true if doc values skipper is enabled
240+ * @param random seeded {@link Random} for data variation
241+ */
205242 private void addFieldsToDocument (final Document doc , int docIndex , boolean withDocValuesSkipper , final Random random ) {
243+
206244 final int batchIndex = docIndex / batchSize ;
207245 final String hostName = "host-" + batchIndex ;
208- final long timestampDelta = random .nextInt (0 , deltaTime );
209- final long timestamp = BASE_TIMESTAMP + ((docIndex % batchSize ) * deltaTime ) + timestampDelta ;
246+
247+ // Slightly vary the timestamp in each document
248+ final long timestamp = BASE_TIMESTAMP + ((docIndex % batchSize ) * deltaTime ) + random .nextInt (0 , deltaTime );
210249
211250 if (withDocValuesSkipper ) {
212- doc .add (SortedNumericDocValuesField .indexedField (TIMESTAMP_FIELD , timestamp )); // NOTE: doc values skipper on `@timestamp`
213- doc .add (SortedDocValuesField .indexedField (HOSTNAME_FIELD , new BytesRef (hostName ))); // NOTE: doc values skipper on `host.name`
251+ // Sparse doc values index on `@timestamp` and `host.name`
252+ doc .add (SortedNumericDocValuesField .indexedField (TIMESTAMP_FIELD , timestamp ));
253+ doc .add (SortedDocValuesField .indexedField (HOSTNAME_FIELD , new BytesRef (hostName )));
214254 } else {
255+ // Standard doc values, points and inverted index
215256 doc .add (new StringField (HOSTNAME_FIELD , hostName , Field .Store .NO ));
216- doc .add (new SortedDocValuesField (HOSTNAME_FIELD , new BytesRef (hostName ))); // NOTE: doc values without the doc values skipper on
217- // `host.name`
218- doc .add (new LongPoint (TIMESTAMP_FIELD , timestamp )); // KDB tree on `@timestamp`
219- doc .add (new SortedNumericDocValuesField (TIMESTAMP_FIELD , timestamp )); // NOTE: doc values without the doc values skipper on
220- // `@timestamp`
257+ doc .add (new SortedDocValuesField (HOSTNAME_FIELD , new BytesRef (hostName )));
258+ doc .add (new LongPoint (TIMESTAMP_FIELD , timestamp ));
259+ doc .add (new SortedNumericDocValuesField (TIMESTAMP_FIELD , timestamp ));
221260 }
222261 }
223262
224263 /**
225- * Computes a dynamic timestamp upper bound based on the batch size ,
226- * timestamp increment , and user-specified fraction .
264+ * Calculates the upper bound for the timestamp range query based on {@code batchSize} ,
265+ * {@code deltaTime} , and {@code queryRange} .
227266 *
228- * @return The computed upper bound for the timestamp range query.
267+ * @return the computed upper bound for the timestamp range query
229268 */
230269 private long rangeEndTimestamp () {
231- return BASE_TIMESTAMP + (( long ) (batchSize * deltaTime * queryRange ) );
270+ return BASE_TIMESTAMP + (long ) (batchSize * deltaTime * queryRange );
232271 }
233272
273+ /**
274+ * Executes a range query without doc values skipper.
275+ *
276+ * @param bh the blackhole consuming the query result
277+ * @throws IOException if a search error occurs
278+ */
234279 @ Benchmark
235280 public void rangeQueryWithoutDocValuesSkipper (final Blackhole bh ) throws IOException {
236281 bh .consume (rangeQuery (indexSearcherWithoutDocValuesSkipper , BASE_TIMESTAMP , rangeEndTimestamp (), true ));
237282 }
238283
284+ /**
285+ * Executes a range query with doc values skipper enabled.
286+ *
287+ * @param bh the blackhole consuming the query result
288+ * @throws IOException if a search error occurs
289+ */
239290 @ Benchmark
240291 public void rangeQueryWithDocValuesSkipper (final Blackhole bh ) throws IOException {
241292 bh .consume (rangeQuery (indexSearcherWithDocValuesSkipper , BASE_TIMESTAMP , rangeEndTimestamp (), false ));
242293 }
243294
295+ /**
296+ * Runs the actual Lucene range query, optionally combining a {@link LongPoint} index query
297+ * with doc values ({@link SortedNumericDocValuesField}) via {@link IndexOrDocValuesQuery},
298+ * and then wrapping it with an {@link IndexSortSortedNumericDocValuesRangeQuery} to utilize the index sort.
299+ *
300+ * @param searcher the Lucene {@link IndexSearcher}
301+ * @param rangeStartTimestamp lower bound of the timestamp range
302+ * @param rangeEndTimestamp upper bound of the timestamp range
303+ * @param isIndexed true if we should combine indexed and doc value queries
304+ * @return the total number of matching documents
305+ * @throws IOException if a search error occurs
306+ */
244307 private long rangeQuery (final IndexSearcher searcher , long rangeStartTimestamp , long rangeEndTimestamp , boolean isIndexed )
245308 throws IOException {
309+
246310 assert rangeEndTimestamp > rangeStartTimestamp ;
311+
247312 final Query rangeQuery = isIndexed
248313 ? new IndexOrDocValuesQuery (
249314 LongPoint .newRangeQuery (TIMESTAMP_FIELD , rangeStartTimestamp , rangeEndTimestamp ),
250315 SortedNumericDocValuesField .newSlowRangeQuery (TIMESTAMP_FIELD , rangeStartTimestamp , rangeEndTimestamp )
251316 )
252317 : SortedNumericDocValuesField .newSlowRangeQuery (TIMESTAMP_FIELD , rangeStartTimestamp , rangeEndTimestamp );
318+
253319 final Query query = new IndexSortSortedNumericDocValuesRangeQuery (
254320 TIMESTAMP_FIELD ,
255321 rangeStartTimestamp ,
256322 rangeEndTimestamp ,
257323 rangeQuery
258324 );
325+
259326 return searcher .count (query );
260327 }
261328
329+ /**
330+ * Shuts down the executor service after the trial completes.
331+ */
262332 @ TearDown (Level .Trial )
263333 public void tearDown () {
264334 if (executorService != null ) {
0 commit comments