Use IntArrayList/IntHashSet to replace usages of List/Set of Integer (#14774)

easyice · easyice · commit 9c0cf3dcce43 · 2025-06-16T17:28:45.000+08:00
* Replace List<Integer> with IntArrayList

* Replace Set<Integer> with IntHashSet

* Mark UpdateGraphsUtils as @lucene.internal
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -82,6 +82,8 @@ Optimizations
 
 * GITHUB#14753: Implement IndexedDISI#docIDRunEnd. (Ge Song)
 
+* GITHUB#14774: Use IntArrayList/IntHashSet to replace usages of List/Set of Integer. (Zhang Chao)
+
 Bug Fixes
 ---------------------
 * GITHUB#14654: ValueSource.fromDoubleValuesSource(dvs).getSortField() would throw errors when
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
@@ -31,7 +31,6 @@
 
 import java.io.IOException;
 import java.util.ArrayList;
-import java.util.Collections;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
@@ -40,6 +39,7 @@
 import java.util.Set;
 import java.util.function.BooleanSupplier;
 import java.util.function.Supplier;
+import org.apache.lucene.internal.hppc.IntArrayList;
 
 /**
  * Regular Expression extension to <code>Automaton</code>.
@@ -767,14 +767,14 @@ private Automaton toAutomaton(
    * @return the original codepoint and the set of alternates
    */
   private int[] toCaseInsensitiveChar(int codepoint) {
-    List<Integer> list = new ArrayList<>();
+    IntArrayList list = new IntArrayList();
     CaseFolding.expand(
         codepoint,
         (int variant) -> {
           list.add(variant);
         });
-    Collections.sort(list);
-    return list.stream().mapToInt(Integer::intValue).toArray();
+    list.sort();
+    return list.toArray();
   }
 
   /**
@@ -785,7 +785,7 @@ private int[] toCaseInsensitiveChar(int codepoint) {
    * activated by optional flag.
    */
   private void expandCaseInsensitiveRange(
-      int start, int end, List<Integer> rangeStarts, List<Integer> rangeEnds) {
+      int start, int end, IntArrayList rangeStarts, IntArrayList rangeEnds) {
     if (start > end)
       throw new IllegalArgumentException(
           "invalid range: from (" + start + ") cannot be > to (" + end + ")");
@@ -1341,8 +1341,8 @@ final RegExp parseCharClassExp() throws IllegalArgumentException {
   }
 
   final RegExp parseCharClasses() throws IllegalArgumentException {
-    ArrayList<Integer> starts = new ArrayList<>();
-    ArrayList<Integer> ends = new ArrayList<>();
+    IntArrayList starts = new IntArrayList();
+    IntArrayList ends = new IntArrayList();
 
     do {
       // look for escape
@@ -1385,20 +1385,17 @@ final RegExp parseCharClasses() throws IllegalArgumentException {
     // not sure why we bother optimizing nodes, same automaton...
     // definitely saves time vs fixing toString()-based tests.
     if (starts.size() == 1) {
-      if (starts.get(0).intValue() == ends.get(0).intValue()) {
+      if (starts.get(0) == ends.get(0)) {
         return makeChar(flags, starts.get(0));
       } else {
         return makeCharRange(flags, starts.get(0), ends.get(0));
       }
     } else {
-      return makeCharClass(
-          flags,
-          starts.stream().mapToInt(Integer::intValue).toArray(),
-          ends.stream().mapToInt(Integer::intValue).toArray());
+      return makeCharClass(flags, starts.toArray(), ends.toArray());
     }
   }
 
-  void expandPreDefined(List<Integer> starts, List<Integer> ends) {
+  void expandPreDefined(IntArrayList starts, IntArrayList ends) {
     if (peek("\\")) {
       // escape
       starts.add((int) '\\');
@@ -1472,13 +1469,10 @@ void expandPreDefined(List<Integer> starts, List<Integer> ends) {
   final RegExp matchPredefinedCharacterClass() {
     // See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html
     if (match('\\') && peek("\\ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz")) {
-      var starts = new ArrayList<Integer>();
-      var ends = new ArrayList<Integer>();
+      var starts = new IntArrayList();
+      var ends = new IntArrayList();
       expandPreDefined(starts, ends);
-      return makeCharClass(
-          flags,
-          starts.stream().mapToInt(Integer::intValue).toArray(),
-          ends.stream().mapToInt(Integer::intValue).toArray());
+      return makeCharClass(flags, starts.toArray(), ends.toArray());
     }
 
     return null;
diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/MergingHnswGraphBuilder.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/MergingHnswGraphBuilder.java
@@ -20,7 +20,7 @@
 import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
 
 import java.io.IOException;
-import java.util.Set;
+import org.apache.lucene.internal.hppc.IntCursor;
 import org.apache.lucene.internal.hppc.IntHashSet;
 import org.apache.lucene.util.BitSet;
 
@@ -143,11 +143,11 @@ public OnHeapHnswGraph build(int maxOrd) throws IOException {
   /** Merge the smaller graph into the current larger graph. */
   private void updateGraph(HnswGraph gS, int[] ordMapS) throws IOException {
     int size = gS.size();
-    Set<Integer> j = UpdateGraphsUtils.computeJoinSet(gS);
+    IntHashSet j = UpdateGraphsUtils.computeJoinSet(gS);
 
     // for nodes that in the join set, add them directly to the graph
-    for (int node : j) {
-      addGraphNode(ordMapS[node]);
+    for (IntCursor node : j) {
+      addGraphNode(ordMapS[node.value]);
     }
 
     // for each node outside of j set:
diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/UpdateGraphsUtils.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/UpdateGraphsUtils.java
@@ -20,13 +20,14 @@
 import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
 
 import java.io.IOException;
-import java.util.HashSet;
-import java.util.Set;
+import org.apache.lucene.internal.hppc.IntHashSet;
 import org.apache.lucene.util.LongHeap;
 
 /**
  * Utility class for updating a big graph with smaller graphs. This is used during merging of
  * segments containing HNSW graphs.
+ *
+ * @lucene.internal
  */
 public class UpdateGraphsUtils {
 
@@ -37,11 +38,11 @@ public class UpdateGraphsUtils {
    *
    * @return a set of nodes that best cover the graph
    */
-  public static Set<Integer> computeJoinSet(HnswGraph graph) throws IOException {
+  public static IntHashSet computeJoinSet(HnswGraph graph) throws IOException {
     int k; // coverage for the current node
     int size = graph.size();
     LongHeap heap = new LongHeap(size);
-    Set<Integer> j = new HashSet<>();
+    IntHashSet j = new IntHashSet();
     boolean[] stale = new boolean[size];
     short[] counts = new short[size];
     long gExit = 0L;
diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java
@@ -61,6 +61,7 @@
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.VectorEncoding;
 import org.apache.lucene.index.VectorSimilarityFunction;
+import org.apache.lucene.internal.hppc.IntHashSet;
 import org.apache.lucene.search.AbstractKnnCollector;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.IndexSearcher;
@@ -537,7 +538,7 @@ public void testBuildingJoinSet() throws IOException {
     HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, M, beamWidth, seed);
     HnswGraph graph = builder.build(vectors.size());
 
-    Set<Integer> j = UpdateGraphsUtils.computeJoinSet(graph);
+    IntHashSet j = UpdateGraphsUtils.computeJoinSet(graph);
     assertTrue(
         "Join set size [" + j.size() + "] is not less than graph size [" + graph.size() + "]",
         j.size() < graph.size());
diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/KMeans.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/KMeans.java
@@ -21,12 +21,12 @@
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
-import java.util.HashSet;
 import java.util.List;
 import java.util.Random;
-import java.util.Set;
 import org.apache.lucene.index.FloatVectorValues;
 import org.apache.lucene.index.VectorSimilarityFunction;
+import org.apache.lucene.internal.hppc.IntCursor;
+import org.apache.lucene.internal.hppc.IntHashSet;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.VectorUtil;
 import org.apache.lucene.util.hnsw.NeighborQueue;
@@ -190,14 +190,14 @@ private float[][] computeCentroids(boolean normalizeCenters) throws IOException
    * centroids
    */
   private float[][] initializeForgy() throws IOException {
-    Set<Integer> selection = new HashSet<>();
+    IntHashSet selection = new IntHashSet();
     while (selection.size() < numCentroids) {
       selection.add(random.nextInt(numVectors));
     }
     float[][] initialCentroids = new float[numCentroids][];
     int i = 0;
-    for (Integer selectedIdx : selection) {
-      float[] vector = vectors.vectorValue(selectedIdx);
+    for (IntCursor selectedIdx : selection) {
+      float[] vector = vectors.vectorValue(selectedIdx.value);
       initialCentroids[i++] = ArrayUtil.copyOfSubArray(vector, 0, vector.length);
     }
     return initialCentroids;