 package org.apache.lucene.codecs;

-import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
-
 import java.io.Closeable;
 import java.io.IOException;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.List;
 import org.apache.lucene.index.DocIDMerger;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.MergeState;
-import org.apache.lucene.index.RandomAccessVectorValues;
-import org.apache.lucene.index.RandomAccessVectorValuesProducer;
-import org.apache.lucene.index.VectorSimilarityFunction;
 import org.apache.lucene.index.VectorValues;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.util.Bits;
@@ -48,7 +42,11 @@ public abstract void writeField(FieldInfo fieldInfo, KnnVectorsReader knnVectors
   /** Called once at the end before close */
   public abstract void finish() throws IOException;

-  /** Merge the vector values from multiple segments, for all fields */
+  /**
+   * Merges the segment vectors for all fields. This default implementation delegates to {@link
+   * #writeField}, passing a {@link KnnVectorsReader} that combines the vector values and ignores
+   * deleted documents.
+   */
   public void merge(MergeState mergeState) throws IOException {
     for (int i = 0; i < mergeState.fieldInfos.length; i++) {
       KnnVectorsReader reader = mergeState.knnVectorsReaders[i];
@@ -57,163 +55,106 @@ public void merge(MergeState mergeState) throws IOException {
         reader.checkIntegrity();
       }
     }
+
     for (FieldInfo fieldInfo : mergeState.mergeFieldInfos) {
       if (fieldInfo.hasVectorValues()) {
-        mergeVectors(fieldInfo, mergeState);
-      }
-    }
-    finish();
-  }
+        if (mergeState.infoStream.isEnabled("VV")) {
+          mergeState.infoStream.message("VV", "merging " + mergeState.segmentInfo);
+        }

-  private void mergeVectors(FieldInfo mergeFieldInfo, final MergeState mergeState)
-      throws IOException {
-    if (mergeState.infoStream.isEnabled("VV")) {
-      mergeState.infoStream.message("VV", "merging " + mergeState.segmentInfo);
-    }
-    // Create a new VectorValues by iterating over the sub vectors, mapping the resulting
-    // docids using docMaps in the mergeState.
-    writeField(
-        mergeFieldInfo,
-        new KnnVectorsReader() {
-          @Override
-          public long ramBytesUsed() {
-            return 0;
-          }
+        writeField(
+            fieldInfo,
+            new KnnVectorsReader() {
+              @Override
+              public long ramBytesUsed() {
+                return 0;
+              }

-          @Override
-          public void close() throws IOException {
-            throw new UnsupportedOperationException();
-          }
+              @Override
+              public void close() {
+                throw new UnsupportedOperationException();
+              }

-          @Override
-          public void checkIntegrity() throws IOException {
-            throw new UnsupportedOperationException();
-          }
+              @Override
+              public void checkIntegrity() {
+                throw new UnsupportedOperationException();
+              }

-          @Override
-          public VectorValues getVectorValues(String field) throws IOException {
-            List<VectorValuesSub> subs = new ArrayList<>();
-            int dimension = -1;
-            VectorSimilarityFunction similarityFunction = null;
-            int nonEmptySegmentIndex = 0;
-            for (int i = 0; i < mergeState.knnVectorsReaders.length; i++) {
-              KnnVectorsReader knnVectorsReader = mergeState.knnVectorsReaders[i];
-              if (knnVectorsReader != null) {
-                if (mergeFieldInfo != null && mergeFieldInfo.hasVectorValues()) {
-                  int segmentDimension = mergeFieldInfo.getVectorDimension();
-                  VectorSimilarityFunction segmentSimilarityFunction =
-                      mergeFieldInfo.getVectorSimilarityFunction();
-                  if (dimension == -1) {
-                    dimension = segmentDimension;
-                    similarityFunction = mergeFieldInfo.getVectorSimilarityFunction();
-                  } else if (dimension != segmentDimension) {
-                    throw new IllegalStateException(
-                        "Varying dimensions for vector-valued field "
-                            + mergeFieldInfo.name
-                            + ": "
-                            + dimension
-                            + "!="
-                            + segmentDimension);
-                  } else if (similarityFunction != segmentSimilarityFunction) {
-                    throw new IllegalStateException(
-                        "Varying similarity functions for vector-valued field "
-                            + mergeFieldInfo.name
-                            + ": "
-                            + similarityFunction
-                            + "!="
-                            + segmentSimilarityFunction);
-                  }
-                  VectorValues values = knnVectorsReader.getVectorValues(mergeFieldInfo.name);
-                  if (values != null) {
-                    subs.add(
-                        new VectorValuesSub(nonEmptySegmentIndex++, mergeState.docMaps[i], values));
-                  }
-                }
+              @Override
+              public VectorValues getVectorValues(String field) throws IOException {
+                return MergedVectorValues.mergeVectorValues(fieldInfo, mergeState);
               }
-            }
-            return new VectorValuesMerger(subs, mergeState);
-          }

-          @Override
-          public TopDocs search(String field, float[] target, int k, Bits acceptDocs)
-              throws IOException {
-            throw new UnsupportedOperationException();
-          }
-        });
+              @Override
+              public TopDocs search(String field, float[] target, int k, Bits acceptDocs) {
+                throw new UnsupportedOperationException();
+              }
+            });

-    if (mergeState.infoStream.isEnabled("VV")) {
-      mergeState.infoStream.message("VV", "merge done " + mergeState.segmentInfo);
+        if (mergeState.infoStream.isEnabled("VV")) {
+          mergeState.infoStream.message("VV", "merge done " + mergeState.segmentInfo);
+        }
+      }
     }
+    finish();
   }

   /** Tracks state of one sub-reader that we are merging */
   private static class VectorValuesSub extends DocIDMerger.Sub {

     final VectorValues values;
-    final int segmentIndex;
-    int count;

-    VectorValuesSub(int segmentIndex, MergeState.DocMap docMap, VectorValues values) {
+    VectorValuesSub(MergeState.DocMap docMap, VectorValues values) {
       super(docMap);
       this.values = values;
-      this.segmentIndex = segmentIndex;
       assert values.docID() == -1;
     }

     @Override
     public int nextDoc() throws IOException {
-      int docId = values.nextDoc();
-      if (docId != NO_MORE_DOCS) {
-        // Note: this does count deleted docs since they are present in the to-be-merged segment
-        ++count;
-      }
-      return docId;
+      return values.nextDoc();
     }
   }

-  /**
-   * View over multiple VectorValues supporting iterator-style access via DocIdMerger. Maintains a
-   * reverse ordinal mapping for documents having values in order to support random access by dense
-   * ordinal.
-   */
-  private static class VectorValuesMerger extends VectorValues
-      implements RandomAccessVectorValuesProducer {
+  /** View over multiple VectorValues supporting iterator-style access via DocIdMerger. */
+  public static class MergedVectorValues extends VectorValues {
     private final List<VectorValuesSub> subs;
     private final DocIDMerger<VectorValuesSub> docIdMerger;
-    private final int[] ordBase;
     private final int cost;
-    private int size;
+    private final int size;

     private int docId;
     private VectorValuesSub current;
-    /* For each doc with a vector, record its ord in the segments being merged. This enables random
-     * access into the unmerged segments using the ords from the merged segment.
-     */
-    private int[] ordMap;
-    private int ord;

-    VectorValuesMerger(List<VectorValuesSub> subs, MergeState mergeState) throws IOException {
+    /** Returns a merged view over all the segment's {@link VectorValues}. */
+    public static MergedVectorValues mergeVectorValues(FieldInfo fieldInfo, MergeState mergeState)
+        throws IOException {
+      assert fieldInfo != null && fieldInfo.hasVectorValues();
+
+      List<VectorValuesSub> subs = new ArrayList<>();
+      for (int i = 0; i < mergeState.knnVectorsReaders.length; i++) {
+        KnnVectorsReader knnVectorsReader = mergeState.knnVectorsReaders[i];
+        if (knnVectorsReader != null) {
+          VectorValues values = knnVectorsReader.getVectorValues(fieldInfo.name);
+          if (values != null) {
+            subs.add(new VectorValuesSub(mergeState.docMaps[i], values));
+          }
+        }
+      }
+      return new MergedVectorValues(subs, mergeState);
+    }
+
+    private MergedVectorValues(List<VectorValuesSub> subs, MergeState mergeState)
+        throws IOException {
       this.subs = subs;
       docIdMerger = DocIDMerger.of(subs, mergeState.needsIndexSort);
       int totalCost = 0, totalSize = 0;
       for (VectorValuesSub sub : subs) {
         totalCost += sub.values.cost();
         totalSize += sub.values.size();
       }
-      /* This size includes deleted docs, but when we iterate over docs here (nextDoc())
-       * we skip deleted docs. So we sneakily update this size once we observe that iteration is complete.
-       * That way by the time we are asked to do random access for graph building, we have a correct size.
-       */
       cost = totalCost;
       size = totalSize;
-      ordMap = new int[size];
-      ordBase = new int[subs.size()];
-      int lastBase = 0;
-      for (int k = 0; k < subs.size(); k++) {
-        int size = subs.get(k).values.size();
-        ordBase[k] = lastBase;
-        lastBase += size;
-      }
       docId = -1;
     }

@@ -227,12 +168,8 @@ public int nextDoc() throws IOException {
       current = docIdMerger.next();
       if (current == null) {
         docId = NO_MORE_DOCS;
-        /* update the size to reflect the number of *non-deleted* documents seen so we can support
-         * random access. */
-        size = ord;
       } else {
         docId = current.mappedDocID;
-        ordMap[ord++] = ordBase[current.segmentIndex] + current.count - 1;
       }
       return docId;
     }
@@ -247,11 +184,6 @@ public BytesRef binaryValue() throws IOException {
       return current.values.binaryValue();
     }

-    @Override
-    public RandomAccessVectorValues randomAccess() {
-      return new MergerRandomAccess();
-    }
-
     @Override
     public int advance(int target) {
       throw new UnsupportedOperationException();
@@ -271,52 +203,5 @@ public long cost() {
     public int dimension() {
       return subs.get(0).values.dimension();
     }
-
-    class MergerRandomAccess implements RandomAccessVectorValues {
-
-      private final List<RandomAccessVectorValues> raSubs;
-
-      MergerRandomAccess() {
-        raSubs = new ArrayList<>(subs.size());
-        for (VectorValuesSub sub : subs) {
-          if (sub.values instanceof RandomAccessVectorValuesProducer) {
-            raSubs.add(((RandomAccessVectorValuesProducer) sub.values).randomAccess());
-          } else {
-            throw new IllegalStateException(
-                "Cannot merge VectorValues without support for random access");
-          }
-        }
-      }
-
-      @Override
-      public int size() {
-        return size;
-      }
-
-      @Override
-      public int dimension() {
-        return VectorValuesMerger.this.dimension();
-      }
-
-      @Override
-      public float[] vectorValue(int target) throws IOException {
-        int unmappedOrd = ordMap[target];
-        int segmentOrd = Arrays.binarySearch(ordBase, unmappedOrd);
-        if (segmentOrd < 0) {
-          // get the index of the greatest lower bound
-          segmentOrd = -2 - segmentOrd;
-        }
-        while (segmentOrd < ordBase.length - 1 && ordBase[segmentOrd + 1] == ordBase[segmentOrd]) {
-          // forward over empty segments which will share the same ordBase
-          segmentOrd++;
-        }
-        return raSubs.get(segmentOrd).vectorValue(unmappedOrd - ordBase[segmentOrd]);
-      }
-
-      @Override
-      public BytesRef binaryValue(int targetOrd) throws IOException {
-        throw new UnsupportedOperationException();
-      }
-    }
   }
 }
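For context on how the refactored API is meant to be used: after this change a format-specific KnnVectorsWriter can rely on the default merge() above and only implement writeField, finish, and close, because during a merge the KnnVectorsReader it receives hands back the MergedVectorValues view (doc ids already remapped, deleted documents already skipped). The sketch below is illustrative only and is not part of this patch; the class name MyFlatKnnVectorsWriter and the elided encoding steps are hypothetical placeholders.

import java.io.IOException;
import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.KnnVectorsWriter;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.search.DocIdSetIterator;

// Hypothetical subclass, for illustration only. With the default merge(), the
// per-format work reduces to consuming the VectorValues handed to writeField
// and encoding them in the format's own on-disk layout.
public final class MyFlatKnnVectorsWriter extends KnnVectorsWriter {

  @Override
  public void writeField(FieldInfo fieldInfo, KnnVectorsReader knnVectorsReader)
      throws IOException {
    VectorValues vectors = knnVectorsReader.getVectorValues(fieldInfo.name);
    // Iterate documents in order; during a merge this iterator is the merged,
    // deletion-free view built by MergedVectorValues.mergeVectorValues.
    for (int doc = vectors.nextDoc();
        doc != DocIdSetIterator.NO_MORE_DOCS;
        doc = vectors.nextDoc()) {
      float[] vector = vectors.vectorValue();
      // ... write `doc` and `vector` in this format's encoding (omitted) ...
    }
  }

  @Override
  public void finish() throws IOException {
    // ... write trailing metadata and checksums (omitted) ...
  }

  @Override
  public void close() throws IOException {
    // ... release any open index outputs (omitted) ...
  }
}

A writer that still needs custom merge behavior (for example, to build a graph from the merged vectors) can override merge() itself and call MergedVectorValues.mergeVectorValues(fieldInfo, mergeState) directly, since that factory method is now public.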