Skip to content

Commit 9c0cf3d

Browse files
committed
Use IntArrayList/IntHashSet to replace usages of List/Set of Integer (#14774)
* Replace List<Integer> with IntArrayList
* Replace Set<Integer> with IntHashSet
* Mark UpdateGraphsUtils as lucene internal
1 parent 9f159c4 commit 9c0cf3d

File tree

6 files changed

+31
-33
lines changed

6 files changed

+31
-33
lines changed

lucene/CHANGES.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,8 @@ Optimizations
8282

8383
* GITHUB#14753: Implement IndexedDISI#docIDRunEnd. (Ge Song)
8484

85+
* GITHUB#14774: Use IntArrayList/IntHashSet to replace usages of List/Set of Integer. (Zhang Chao)
86+
8587
Bug Fixes
8688
---------------------
8789
* GITHUB#14654: ValueSource.fromDoubleValuesSource(dvs).getSortField() would throw errors when

lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java

Lines changed: 13 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@
3131

3232
import java.io.IOException;
3333
import java.util.ArrayList;
34-
import java.util.Collections;
3534
import java.util.HashSet;
3635
import java.util.Iterator;
3736
import java.util.List;
@@ -40,6 +39,7 @@
4039
import java.util.Set;
4140
import java.util.function.BooleanSupplier;
4241
import java.util.function.Supplier;
42+
import org.apache.lucene.internal.hppc.IntArrayList;
4343

4444
/**
4545
* Regular Expression extension to <code>Automaton</code>.
@@ -767,14 +767,14 @@ private Automaton toAutomaton(
767767
* @return the original codepoint and the set of alternates
768768
*/
769769
private int[] toCaseInsensitiveChar(int codepoint) {
770-
List<Integer> list = new ArrayList<>();
770+
IntArrayList list = new IntArrayList();
771771
CaseFolding.expand(
772772
codepoint,
773773
(int variant) -> {
774774
list.add(variant);
775775
});
776-
Collections.sort(list);
777-
return list.stream().mapToInt(Integer::intValue).toArray();
776+
list.sort();
777+
return list.toArray();
778778
}
779779

780780
/**
@@ -785,7 +785,7 @@ private int[] toCaseInsensitiveChar(int codepoint) {
785785
* activated by optional flag.
786786
*/
787787
private void expandCaseInsensitiveRange(
788-
int start, int end, List<Integer> rangeStarts, List<Integer> rangeEnds) {
788+
int start, int end, IntArrayList rangeStarts, IntArrayList rangeEnds) {
789789
if (start > end)
790790
throw new IllegalArgumentException(
791791
"invalid range: from (" + start + ") cannot be > to (" + end + ")");
@@ -1341,8 +1341,8 @@ final RegExp parseCharClassExp() throws IllegalArgumentException {
13411341
}
13421342

13431343
final RegExp parseCharClasses() throws IllegalArgumentException {
1344-
ArrayList<Integer> starts = new ArrayList<>();
1345-
ArrayList<Integer> ends = new ArrayList<>();
1344+
IntArrayList starts = new IntArrayList();
1345+
IntArrayList ends = new IntArrayList();
13461346

13471347
do {
13481348
// look for escape
@@ -1385,20 +1385,17 @@ final RegExp parseCharClasses() throws IllegalArgumentException {
13851385
// not sure why we bother optimizing nodes, same automaton...
13861386
// definitely saves time vs fixing toString()-based tests.
13871387
if (starts.size() == 1) {
1388-
if (starts.get(0).intValue() == ends.get(0).intValue()) {
1388+
if (starts.get(0) == ends.get(0)) {
13891389
return makeChar(flags, starts.get(0));
13901390
} else {
13911391
return makeCharRange(flags, starts.get(0), ends.get(0));
13921392
}
13931393
} else {
1394-
return makeCharClass(
1395-
flags,
1396-
starts.stream().mapToInt(Integer::intValue).toArray(),
1397-
ends.stream().mapToInt(Integer::intValue).toArray());
1394+
return makeCharClass(flags, starts.toArray(), ends.toArray());
13981395
}
13991396
}
14001397

1401-
void expandPreDefined(List<Integer> starts, List<Integer> ends) {
1398+
void expandPreDefined(IntArrayList starts, IntArrayList ends) {
14021399
if (peek("\\")) {
14031400
// escape
14041401
starts.add((int) '\\');
@@ -1472,13 +1469,10 @@ void expandPreDefined(List<Integer> starts, List<Integer> ends) {
14721469
final RegExp matchPredefinedCharacterClass() {
14731470
// See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html
14741471
if (match('\\') && peek("\\ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz")) {
1475-
var starts = new ArrayList<Integer>();
1476-
var ends = new ArrayList<Integer>();
1472+
var starts = new IntArrayList();
1473+
var ends = new IntArrayList();
14771474
expandPreDefined(starts, ends);
1478-
return makeCharClass(
1479-
flags,
1480-
starts.stream().mapToInt(Integer::intValue).toArray(),
1481-
ends.stream().mapToInt(Integer::intValue).toArray());
1475+
return makeCharClass(flags, starts.toArray(), ends.toArray());
14821476
}
14831477

14841478
return null;

lucene/core/src/java/org/apache/lucene/util/hnsw/MergingHnswGraphBuilder.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
2121

2222
import java.io.IOException;
23-
import java.util.Set;
23+
import org.apache.lucene.internal.hppc.IntCursor;
2424
import org.apache.lucene.internal.hppc.IntHashSet;
2525
import org.apache.lucene.util.BitSet;
2626

@@ -143,11 +143,11 @@ public OnHeapHnswGraph build(int maxOrd) throws IOException {
143143
/** Merge the smaller graph into the current larger graph. */
144144
private void updateGraph(HnswGraph gS, int[] ordMapS) throws IOException {
145145
int size = gS.size();
146-
Set<Integer> j = UpdateGraphsUtils.computeJoinSet(gS);
146+
IntHashSet j = UpdateGraphsUtils.computeJoinSet(gS);
147147

148148
// for nodes that in the join set, add them directly to the graph
149-
for (int node : j) {
150-
addGraphNode(ordMapS[node]);
149+
for (IntCursor node : j) {
150+
addGraphNode(ordMapS[node.value]);
151151
}
152152

153153
// for each node outside of j set:

lucene/core/src/java/org/apache/lucene/util/hnsw/UpdateGraphsUtils.java

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,14 @@
2020
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
2121

2222
import java.io.IOException;
23-
import java.util.HashSet;
24-
import java.util.Set;
23+
import org.apache.lucene.internal.hppc.IntHashSet;
2524
import org.apache.lucene.util.LongHeap;
2625

2726
/**
2827
* Utility class for updating a big graph with smaller graphs. This is used during merging of
2928
* segments containing HNSW graphs.
29+
*
30+
* @lucene.internal
3031
*/
3132
public class UpdateGraphsUtils {
3233

@@ -37,11 +38,11 @@ public class UpdateGraphsUtils {
3738
*
3839
* @return a set of nodes that best cover the graph
3940
*/
40-
public static Set<Integer> computeJoinSet(HnswGraph graph) throws IOException {
41+
public static IntHashSet computeJoinSet(HnswGraph graph) throws IOException {
4142
int k; // coverage for the current node
4243
int size = graph.size();
4344
LongHeap heap = new LongHeap(size);
44-
Set<Integer> j = new HashSet<>();
45+
IntHashSet j = new IntHashSet();
4546
boolean[] stale = new boolean[size];
4647
short[] counts = new short[size];
4748
long gExit = 0L;

lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
import org.apache.lucene.index.Term;
6262
import org.apache.lucene.index.VectorEncoding;
6363
import org.apache.lucene.index.VectorSimilarityFunction;
64+
import org.apache.lucene.internal.hppc.IntHashSet;
6465
import org.apache.lucene.search.AbstractKnnCollector;
6566
import org.apache.lucene.search.DocIdSetIterator;
6667
import org.apache.lucene.search.IndexSearcher;
@@ -537,7 +538,7 @@ public void testBuildingJoinSet() throws IOException {
537538
HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, M, beamWidth, seed);
538539
HnswGraph graph = builder.build(vectors.size());
539540

540-
Set<Integer> j = UpdateGraphsUtils.computeJoinSet(graph);
541+
IntHashSet j = UpdateGraphsUtils.computeJoinSet(graph);
541542
assertTrue(
542543
"Join set size [" + j.size() + "] is not less than graph size [" + graph.size() + "]",
543544
j.size() < graph.size());

lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/KMeans.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,12 @@
2121
import java.io.IOException;
2222
import java.util.ArrayList;
2323
import java.util.Arrays;
24-
import java.util.HashSet;
2524
import java.util.List;
2625
import java.util.Random;
27-
import java.util.Set;
2826
import org.apache.lucene.index.FloatVectorValues;
2927
import org.apache.lucene.index.VectorSimilarityFunction;
28+
import org.apache.lucene.internal.hppc.IntCursor;
29+
import org.apache.lucene.internal.hppc.IntHashSet;
3030
import org.apache.lucene.util.ArrayUtil;
3131
import org.apache.lucene.util.VectorUtil;
3232
import org.apache.lucene.util.hnsw.NeighborQueue;
@@ -190,14 +190,14 @@ private float[][] computeCentroids(boolean normalizeCenters) throws IOException
190190
* centroids
191191
*/
192192
private float[][] initializeForgy() throws IOException {
193-
Set<Integer> selection = new HashSet<>();
193+
IntHashSet selection = new IntHashSet();
194194
while (selection.size() < numCentroids) {
195195
selection.add(random.nextInt(numVectors));
196196
}
197197
float[][] initialCentroids = new float[numCentroids][];
198198
int i = 0;
199-
for (Integer selectedIdx : selection) {
200-
float[] vector = vectors.vectorValue(selectedIdx);
199+
for (IntCursor selectedIdx : selection) {
200+
float[] vector = vectors.vectorValue(selectedIdx.value);
201201
initialCentroids[i++] = ArrayUtil.copyOfSubArray(vector, 0, vector.length);
202202
}
203203
return initialCentroids;

0 commit comments

Comments
 (0)