Skip to content

Commit 01367c7

Browse files
committed
Merge branch 'main' into pr-12543
2 parents 8db886b + 12fc7bf commit 01367c7

File tree

121 files changed

+7766
-932
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

121 files changed

+7766
-932
lines changed

.github/workflows/distribution.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ jobs:
2626
# We want to run the distribution tests on all major OSs, but on Windows they are occasionally too slow (or hang, or the forked process is never started at all; the cause is unknown).
2727
#os: [ubuntu-latest, macos-latest, windows-latest]
2828
os: [ubuntu-latest, macos-latest]
29+
env:
30+
GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }}
2931

3032
steps:
3133
- uses: actions/checkout@v3

.github/workflows/gradle-precommit.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@ jobs:
2727
# Test JVMs.
2828
java: [ '17' ]
2929

30+
env:
31+
GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }}
32+
3033
steps:
3134
- uses: actions/checkout@v3
3235

@@ -58,6 +61,9 @@ jobs:
5861
# Test JVMs.
5962
java: [ '17' ]
6063

64+
env:
65+
GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }}
66+
6167
steps:
6268
- uses: actions/checkout@v3
6369

.github/workflows/hunspell.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ jobs:
1515

1616
runs-on: ubuntu-latest
1717

18+
env:
19+
GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }}
20+
1821
steps:
1922
- uses: actions/checkout@v3
2023

gradle/ge.gradle

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
def isCIBuild = System.getenv().keySet().find { it ==~ /(?i)((JENKINS|HUDSON)(_\w+)?|CI)/ } != null
19+
20+
gradleEnterprise {
21+
server = "https://ge.apache.org"
22+
buildScan {
23+
capture { taskInputFiles = true }
24+
uploadInBackground = !isCIBuild
25+
publishAlways()
26+
publishIfAuthenticated()
27+
obfuscation {
28+
ipAddresses { addresses -> addresses.collect { address -> "0.0.0.0"} }
29+
}
30+
}
31+
}
32+
33+
buildCache {
34+
local {
35+
enabled = !isCIBuild
36+
}
37+
38+
remote(gradleEnterprise.buildCache) {
39+
enabled = false
40+
}
41+
}

lucene/CHANGES.txt

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,9 @@ API Changes
6262

6363
* GITHUB#12599: Add RandomAccessInput#readBytes method to the RandomAccessInput interface. (Ignacio Vera)
6464

65+
* GITHUB#12709: Consolidate FSTStore and BytesStore in FST. Created FSTReader which contains the common methods
66+
of the two (Anh Dung Bui)
67+
6568
New Features
6669
---------------------
6770

@@ -150,13 +153,18 @@ API Changes
150153
* GITHUB#12592: Add RandomAccessInput#length method to the RandomAccessInput interface. In addition deprecate
151154
ByteBuffersDataInput#size in favour of this new method. (Ignacio Vera)
152155

153-
* GITHUB#12646: Move FST#addNode to FSTCompiler to avoid a circular dependency between FST and FSTCompiler
156+
* GITHUB#12646, GITHUB#12690: Move FST#addNode to FSTCompiler to avoid a circular dependency
157+
between FST and FSTCompiler (Anh Dung Bui)
154158

155159
New Features
156160
---------------------
161+
157162
* GITHUB#12548: Added similarityToQueryVector API to compute vector similarity scores
158163
with DoubleValuesSource. (Shubham Chaudhary)
159164

165+
* GITHUB#12685: Lucene now records if documents have been indexed as blocks in SegmentInfo. This is recorded on a per
166+
segment basis and maintained across merges. The property is exposed via LeafReaderMetadata. (Simon Willnauer)
167+
160168
Improvements
161169
---------------------
162170
* GITHUB#12523: TaskExecutor waits for all tasks to complete before returning when Exceptions
@@ -186,6 +194,8 @@ Improvements
186194
* GITHUB#12705: Improve handling of NullPointerException and IllegalStateException
187195
in MMapDirectory's IndexInputs. (Uwe Schindler, Michael Sokolov)
188196

197+
* GITHUB#12689: TaskExecutor to cancel all tasks on exception to avoid needless computation. (Luca Cavanna)
198+
189199
Optimizations
190200
---------------------
191201
* GITHUB#12183: Make TermStates#build concurrent. (Shubham Chaudhary)
@@ -194,7 +204,7 @@ Optimizations
194204

195205
* GITHUB#12382: Faster top-level conjunctions on term queries when sorting by
196206
descending score. (Adrien Grand)
197-
207+
198208
* GITHUB#12591: Use stable radix sort to speed up the sorting of update terms. (Guo Feng)
199209

200210
* GITHUB#12587: Use radix sort to speed up the sorting of terms in TermInSetQuery. (Guo Feng)
@@ -209,11 +219,18 @@ Optimizations
209219

210220
* GITHUB#12668: ImpactsEnums now decode frequencies lazily like PostingsEnums.
211221
(Adrien Grand)
212-
222+
213223
* GITHUB#12651: Use 2d array for OnHeapHnswGraph representation. (Patrick Zhai)
214224

215225
* GITHUB#12653: Optimize computing number of levels in MultiLevelSkipListWriter#bufferSkip. (Shubham Chaudhary)
216226

227+
* GITHUB#12589: Disjunctions now sometimes run as conjunctions when the minimum
228+
competitive score requires multiple clauses to match. (Adrien Grand)
229+
230+
* GITHUB#12710: Use Arrays#mismatch for Outputs#common operations. (Guo Feng)
231+
232+
* GITHUB#12712: Speed up sorting postings file with an offline radix sorter in BPIndexReader. (Guo Feng)
233+
217234
Changes in runtime behavior
218235
---------------------
219236

@@ -232,6 +249,8 @@ Bug Fixes
232249

233250
* GITHUB#12642: Ensure #finish only gets called once on the base collector during drill-sideways (Greg Miller)
234251

252+
* GITHUB#12682: Scorer should sum up scores into a double. (Shubham Chaudhary)
253+
235254
Build
236255
---------------------
237256

lucene/backward-codecs/src/java/module-info.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
exports org.apache.lucene.backward_codecs.lucene91;
3535
exports org.apache.lucene.backward_codecs.lucene92;
3636
exports org.apache.lucene.backward_codecs.lucene94;
37+
exports org.apache.lucene.backward_codecs.lucene95;
3738
exports org.apache.lucene.backward_codecs.packed;
3839
exports org.apache.lucene.backward_codecs.store;
3940

@@ -46,7 +47,8 @@
4647
org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsFormat,
4748
org.apache.lucene.backward_codecs.lucene91.Lucene91HnswVectorsFormat,
4849
org.apache.lucene.backward_codecs.lucene92.Lucene92HnswVectorsFormat,
49-
org.apache.lucene.backward_codecs.lucene94.Lucene94HnswVectorsFormat;
50+
org.apache.lucene.backward_codecs.lucene94.Lucene94HnswVectorsFormat,
51+
org.apache.lucene.backward_codecs.lucene95.Lucene95HnswVectorsFormat;
5052
provides org.apache.lucene.codecs.Codec with
5153
org.apache.lucene.backward_codecs.lucene80.Lucene80Codec,
5254
org.apache.lucene.backward_codecs.lucene84.Lucene84Codec,
@@ -55,5 +57,6 @@
5557
org.apache.lucene.backward_codecs.lucene90.Lucene90Codec,
5658
org.apache.lucene.backward_codecs.lucene91.Lucene91Codec,
5759
org.apache.lucene.backward_codecs.lucene92.Lucene92Codec,
58-
org.apache.lucene.backward_codecs.lucene94.Lucene94Codec;
60+
org.apache.lucene.backward_codecs.lucene94.Lucene94Codec,
61+
org.apache.lucene.backward_codecs.lucene95.Lucene95Codec;
5962
}

lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene70/Lucene70SegmentInfoFormat.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,7 @@ private SegmentInfo parseSegmentInfo(
307307
segment,
308308
docCount,
309309
isCompoundFile,
310+
false,
310311
null,
311312
diagnostics,
312313
segmentID,

lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene86/Lucene86SegmentInfoFormat.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,7 @@ private SegmentInfo parseSegmentInfo(
164164
segment,
165165
docCount,
166166
isCompoundFile,
167+
false,
167168
null,
168169
diagnostics,
169170
segmentID,

lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90Codec.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@
3636
import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat;
3737
import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
3838
import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat;
39-
import org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat;
4039
import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
4140
import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat;
4241
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
@@ -143,7 +142,7 @@ public final FieldInfosFormat fieldInfosFormat() {
143142
}
144143

145144
@Override
146-
public final SegmentInfoFormat segmentInfoFormat() {
145+
public SegmentInfoFormat segmentInfoFormat() {
147146
return segmentInfosFormat;
148147
}
149148

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.lucene.backward_codecs.lucene90;
19+
20+
import java.io.IOException;
21+
import java.util.Map;
22+
import java.util.Set;
23+
import org.apache.lucene.codecs.CodecUtil;
24+
import org.apache.lucene.codecs.SegmentInfoFormat;
25+
import org.apache.lucene.index.CorruptIndexException;
26+
import org.apache.lucene.index.IndexFileNames;
27+
import org.apache.lucene.index.IndexWriter;
28+
import org.apache.lucene.index.SegmentInfo;
29+
import org.apache.lucene.index.SegmentInfos;
30+
import org.apache.lucene.index.SortFieldProvider;
31+
import org.apache.lucene.search.Sort;
32+
import org.apache.lucene.search.SortField;
33+
import org.apache.lucene.store.ChecksumIndexInput;
34+
import org.apache.lucene.store.DataInput;
35+
import org.apache.lucene.store.DataOutput;
36+
import org.apache.lucene.store.Directory;
37+
import org.apache.lucene.store.IOContext;
38+
import org.apache.lucene.util.Version;
39+
40+
/**
41+
* Lucene 9.0 Segment info format.
42+
*
43+
* <p>Files:
44+
*
45+
* <ul>
46+
* <li><code>.si</code>: Header, SegVersion, SegMinVersion, SegSize, IsCompoundFile, Diagnostics,
47+
* Files, Attributes, IndexSort, Footer
48+
* </ul>
49+
*
50+
* Data types:
51+
*
52+
* <ul>
53+
* <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}
54+
* <li>SegSize --&gt; {@link DataOutput#writeInt Int32}
55+
* <li>SegVersion --&gt; {@link DataOutput#writeString String}
56+
* <li>SegMinVersion --&gt; {@link DataOutput#writeString String}
57+
* <li>Files --&gt; {@link DataOutput#writeSetOfStrings Set&lt;String&gt;}
58+
* <li>Diagnostics,Attributes --&gt; {@link DataOutput#writeMapOfStrings Map&lt;String,String&gt;}
59+
* <li>IsCompoundFile --&gt; {@link DataOutput#writeByte Int8}
60+
* <li>IndexSort --&gt; {@link DataOutput#writeVInt Int32} count, followed by {@code count}
61+
* SortField
62+
* <li>SortField --&gt; {@link DataOutput#writeString String} sort class, followed by a per-sort
63+
* bytestream (see {@link SortFieldProvider#readSortField(DataInput)})
64+
* <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}
65+
* </ul>
66+
*
67+
* Field Descriptions:
68+
*
69+
* <ul>
70+
* <li>SegVersion is the code version that created the segment.
71+
* <li>SegMinVersion is the minimum code version that contributed documents to the segment.
72+
* <li>SegSize is the number of documents contained in the segment index.
73+
* <li>IsCompoundFile records whether the segment is written as a compound file or not. If this is
74+
* -1, the segment is not a compound file. If it is 1, the segment is a compound file.
75+
* <li>The Diagnostics Map is privately written by {@link IndexWriter}, as a debugging aid, for
76+
* each segment it creates. It includes metadata like the current Lucene version, OS, Java
77+
* version, why the segment was created (merge, flush, addIndexes), etc.
78+
* <li>Files is a list of files referred to by this segment.
79+
* </ul>
80+
*
81+
* @see SegmentInfos
82+
* @lucene.experimental
83+
*/
84+
public class Lucene90SegmentInfoFormat extends SegmentInfoFormat {
85+
86+
/** File extension used to store {@link SegmentInfo}. */
87+
public static final String SI_EXTENSION = "si";
88+
89+
static final String CODEC_NAME = "Lucene90SegmentInfo";
90+
static final int VERSION_START = 0;
91+
static final int VERSION_CURRENT = VERSION_START;
92+
93+
/** Sole constructor. */
94+
public Lucene90SegmentInfoFormat() {}
95+
96+
@Override
97+
public SegmentInfo read(Directory dir, String segment, byte[] segmentID, IOContext context)
98+
throws IOException {
99+
final String fileName = IndexFileNames.segmentFileName(segment, "", SI_EXTENSION);
100+
try (ChecksumIndexInput input = dir.openChecksumInput(fileName)) {
101+
Throwable priorE = null;
102+
SegmentInfo si = null;
103+
try {
104+
CodecUtil.checkIndexHeader(
105+
input, CODEC_NAME, VERSION_START, VERSION_CURRENT, segmentID, "");
106+
si = parseSegmentInfo(dir, input, segment, segmentID);
107+
} catch (Throwable exception) {
108+
priorE = exception;
109+
} finally {
110+
CodecUtil.checkFooter(input, priorE);
111+
}
112+
return si;
113+
}
114+
}
115+
116+
private SegmentInfo parseSegmentInfo(
117+
Directory dir, DataInput input, String segment, byte[] segmentID) throws IOException {
118+
final Version version = Version.fromBits(input.readInt(), input.readInt(), input.readInt());
119+
byte hasMinVersion = input.readByte();
120+
final Version minVersion;
121+
switch (hasMinVersion) {
122+
case 0:
123+
minVersion = null;
124+
break;
125+
case 1:
126+
minVersion = Version.fromBits(input.readInt(), input.readInt(), input.readInt());
127+
break;
128+
default:
129+
throw new CorruptIndexException("Illegal boolean value " + hasMinVersion, input);
130+
}
131+
132+
final int docCount = input.readInt();
133+
if (docCount < 0) {
134+
throw new CorruptIndexException("invalid docCount: " + docCount, input);
135+
}
136+
final boolean isCompoundFile = input.readByte() == SegmentInfo.YES;
137+
138+
final Map<String, String> diagnostics = input.readMapOfStrings();
139+
final Set<String> files = input.readSetOfStrings();
140+
final Map<String, String> attributes = input.readMapOfStrings();
141+
142+
int numSortFields = input.readVInt();
143+
Sort indexSort;
144+
if (numSortFields > 0) {
145+
SortField[] sortFields = new SortField[numSortFields];
146+
for (int i = 0; i < numSortFields; i++) {
147+
String name = input.readString();
148+
sortFields[i] = SortFieldProvider.forName(name).readSortField(input);
149+
}
150+
indexSort = new Sort(sortFields);
151+
} else if (numSortFields < 0) {
152+
throw new CorruptIndexException("invalid index sort field count: " + numSortFields, input);
153+
} else {
154+
indexSort = null;
155+
}
156+
157+
SegmentInfo si =
158+
new SegmentInfo(
159+
dir,
160+
version,
161+
minVersion,
162+
segment,
163+
docCount,
164+
isCompoundFile,
165+
false,
166+
null,
167+
diagnostics,
168+
segmentID,
169+
attributes,
170+
indexSort);
171+
si.setFiles(files);
172+
return si;
173+
}
174+
175+
@Override
176+
public void write(Directory dir, SegmentInfo si, IOContext ioContext) throws IOException {
177+
throw new UnsupportedOperationException("Old formats can't be used for writing");
178+
}
179+
}

0 commit comments

Comments
 (0)