Commit a9a3f65

Fix regression to account for payloads while merging (#103)
Before PR #11, if any of the segments being merged had payloads for a certain field, the new merged segment would also have payloads set up for that field. PR #11 introduced a bug where the first of the merging segments alone determines whether the merged segment has payloads: if the first segment doesn't have payloads but others do, the merged segment mistakenly ends up without payloads set up. This PR fixes that bug. Relates to #11
1 parent f7a3587 commit a9a3f65
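
The rule this commit restores is effectively an OR across the segments being merged: the merged field stores payloads if any incoming segment stored them, regardless of segment order. A minimal standalone sketch of that intent follows (MergedPayloadsRule and mergedHasPayloads are hypothetical names for illustration, not part of the Lucene API or this patch):

// Sketch only: illustrates the intended "any segment wins" rule for the payloads flag.
class MergedPayloadsRule {
  static boolean mergedHasPayloads(boolean[] perSegmentHasPayloads) {
    boolean merged = false;
    for (boolean segmentHasPayloads : perSegmentHasPayloads) {
      merged |= segmentHasPayloads; // one segment with payloads is enough
    }
    return merged;
  }

  public static void main(String[] args) {
    // true regardless of which segment comes first
    System.out.println(mergedHasPayloads(new boolean[] {false, true})); // true
    System.out.println(mergedHasPayloads(new boolean[] {true, false})); // true
  }
}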

File tree

5 files changed (+102, -62 lines)


lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/TestLucene50TermVectorsFormat.java

Lines changed: 0 additions & 7 deletions
@@ -16,7 +16,6 @@
  */
 package org.apache.lucene.backward_codecs.lucene50;
 
-import java.io.IOException;
 import org.apache.lucene.backward_codecs.lucene87.Lucene87RWCodec;
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.index.BaseTermVectorsFormatTestCase;
@@ -27,10 +26,4 @@ public class TestLucene50TermVectorsFormat extends BaseTermVectorsFormatTestCase
   protected Codec getCodec() {
     return new Lucene87RWCodec();
   }
-
-  @Override
-  @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-9334")
-  public void testMerge() throws IOException {
-    super.testMerge();
-  }
 }

lucene/core/src/java/org/apache/lucene/index/FieldInfo.java

Lines changed: 1 addition & 2 deletions
@@ -552,8 +552,7 @@ void setStoreTermVectors() {
   }
 
   void setStorePayloads() {
-    if (indexOptions != IndexOptions.NONE
-        && indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
+    if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
       storePayloads = true;
     }
     this.checkConsistency();
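
The dropped IndexOptions.NONE guard appears redundant here: assuming the usual declaration order of the IndexOptions enum (NONE first, then DOCS, DOCS_AND_FREQS, DOCS_AND_FREQS_AND_POSITIONS, ...), NONE already compares below DOCS_AND_FREQS_AND_POSITIONS, so the single compareTo check covers it. A quick sketch of that assumption (IndexOptionsOrderCheck is a hypothetical class name; requires lucene-core on the classpath):

// Sketch only: verifies the enum-ordering assumption behind the simplified condition.
import org.apache.lucene.index.IndexOptions;

public class IndexOptionsOrderCheck {
  public static void main(String[] args) {
    boolean noneIsBelowPositions =
        IndexOptions.NONE.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0;
    // If this prints true, the explicit "!= IndexOptions.NONE" check was subsumed
    // by "compareTo(DOCS_AND_FREQS_AND_POSITIONS) >= 0".
    System.out.println("NONE sorts below DOCS_AND_FREQS_AND_POSITIONS: " + noneIsBelowPositions);
  }
}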

lucene/core/src/java/org/apache/lucene/index/FieldInfos.java

Lines changed: 3 additions & 0 deletions
@@ -654,6 +654,9 @@ FieldInfo add(FieldInfo fi, long dvGen) {
       if (fi.attributes() != null) {
         fi.attributes().forEach((k, v) -> curFi.putAttribute(k, v));
       }
+      if (fi.hasPayloads()) {
+        curFi.setStorePayloads();
+      }
       return curFi;
     }
     // This field wasn't yet added to this in-RAM segment's FieldInfo,

lucene/core/src/test/org/apache/lucene/index/TestTermVectors.java

Lines changed: 98 additions & 52 deletions
@@ -16,70 +16,22 @@
  */
 package org.apache.lucene.index;
 
+import static com.carrotsearch.randomizedtesting.RandomizedTest.randomIntBetween;
+
 import java.io.IOException;
 import org.apache.lucene.analysis.MockAnalyzer;
-import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.English;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.TestUtil;
-import org.junit.AfterClass;
-import org.junit.BeforeClass;
 
 public class TestTermVectors extends LuceneTestCase {
-  private static IndexReader reader;
-  private static Directory directory;
-
-  @BeforeClass
-  public static void beforeClass() throws Exception {
-    directory = newDirectory();
-    RandomIndexWriter writer =
-        new RandomIndexWriter(
-            random(),
-            directory,
-            newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true))
-                .setMergePolicy(newLogMergePolicy()));
-    // writer.setNoCFSRatio(1.0);
-    // writer.infoStream = System.out;
-    for (int i = 0; i < 1000; i++) {
-      Document doc = new Document();
-      FieldType ft = new FieldType(TextField.TYPE_STORED);
-      int mod3 = i % 3;
-      int mod2 = i % 2;
-      if (mod2 == 0 && mod3 == 0) {
-        ft.setStoreTermVectors(true);
-        ft.setStoreTermVectorOffsets(true);
-        ft.setStoreTermVectorPositions(true);
-      } else if (mod2 == 0) {
-        ft.setStoreTermVectors(true);
-        ft.setStoreTermVectorPositions(true);
-      } else if (mod3 == 0) {
-        ft.setStoreTermVectors(true);
-        ft.setStoreTermVectorOffsets(true);
-      } else {
-        ft.setStoreTermVectors(true);
-      }
-      doc.add(new Field("field", English.intToEnglish(i), ft));
-      // test no term vectors too
-      doc.add(new TextField("noTV", English.intToEnglish(i), Field.Store.YES));
-      writer.addDocument(doc);
-    }
-    reader = writer.getReader();
-    writer.close();
-  }
-
-  @AfterClass
-  public static void afterClass() throws Exception {
-    reader.close();
-    directory.close();
-    reader = null;
-    directory = null;
-  }
 
   private IndexWriter createWriter(Directory dir) throws IOException {
     return new IndexWriter(
@@ -166,4 +118,98 @@ public void testFullMergeAddIndexesReader() throws Exception {
     verifyIndex(target);
     IOUtils.close(target, input[0], input[1]);
   }
+
+  /**
+   * Assert that a merged segment has payloads set up in fieldInfo, if at least 1 segment has
+   * payloads for this field.
+   */
+  public void testMergeWithPayloads() throws Exception {
+    final FieldType ft1 = new FieldType(TextField.TYPE_NOT_STORED);
+    ft1.setStoreTermVectors(true);
+    ft1.setStoreTermVectorOffsets(true);
+    ft1.setStoreTermVectorPositions(true);
+    ft1.setStoreTermVectorPayloads(true);
+    ft1.freeze();
+
+    final int numDocsInSegment = 10;
+    for (boolean hasPayloads : new boolean[] {false, true}) {
+      Directory dir = newDirectory();
+      IndexWriterConfig indexWriterConfig =
+          new IndexWriterConfig(new MockAnalyzer(random())).setMaxBufferedDocs(numDocsInSegment);
+      IndexWriter writer = new IndexWriter(dir, indexWriterConfig);
+      TokenStreamGenerator tkg1 = new TokenStreamGenerator(hasPayloads);
+      TokenStreamGenerator tkg2 = new TokenStreamGenerator(!hasPayloads);
+
+      // create one segment with payloads, and another without payloads
+      for (int i = 0; i < numDocsInSegment; i++) {
+        Document doc = new Document();
+        doc.add(new Field("c", tkg1.newTokenStream(), ft1));
+        writer.addDocument(doc);
+      }
+      for (int i = 0; i < numDocsInSegment; i++) {
+        Document doc = new Document();
+        doc.add(new Field("c", tkg2.newTokenStream(), ft1));
+        writer.addDocument(doc);
+      }
+
+      IndexReader reader1 = writer.getReader();
+      assertEquals(2, reader1.leaves().size());
+      assertEquals(
+          hasPayloads,
+          reader1.leaves().get(0).reader().getFieldInfos().fieldInfo("c").hasPayloads());
+      assertNotEquals(
+          hasPayloads,
+          reader1.leaves().get(1).reader().getFieldInfos().fieldInfo("c").hasPayloads());
+
+      writer.forceMerge(1);
+      IndexReader reader2 = writer.getReader();
+      assertEquals(1, reader2.leaves().size());
+      // assert that in the merged segments payloads set up for the field
+      assertTrue(reader2.leaves().get(0).reader().getFieldInfos().fieldInfo("c").hasPayloads());
+
+      IOUtils.close(writer, reader1, reader2, dir);
+    }
+  }
+
+  /** A generator for token streams with optional null payloads */
+  private static class TokenStreamGenerator {
+    private final String[] terms;
+    private final BytesRef[] termBytes;
+    private final boolean hasPayloads;
+
+    public TokenStreamGenerator(boolean hasPayloads) {
+      this.hasPayloads = hasPayloads;
+      final int termsCount = 10;
+      terms = new String[termsCount];
+      termBytes = new BytesRef[termsCount];
+      for (int i = 0; i < termsCount; ++i) {
+        terms[i] = TestUtil.randomRealisticUnicodeString(random());
+        termBytes[i] = new BytesRef(terms[i]);
+      }
+    }
+
+    public TokenStream newTokenStream() {
+      return new OptionalNullPayloadTokenStream(TestUtil.nextInt(random(), 1, 5), terms, termBytes);
+    }
+
+    private class OptionalNullPayloadTokenStream
+        extends BaseTermVectorsFormatTestCase.RandomTokenStream {
+      public OptionalNullPayloadTokenStream(
+          int len, String[] sampleTerms, BytesRef[] sampleTermBytes) {
+        super(len, sampleTerms, sampleTermBytes);
+      }
+
+      @Override
+      protected BytesRef randomPayload() {
+        if (hasPayloads == false) {
+          return null;
+        }
+        final int len = randomIntBetween(1, 5);
+        final BytesRef payload = new BytesRef(len);
+        random().nextBytes(payload.bytes);
+        payload.length = len;
+        return payload;
+      }
+    }
+  }
 }

lucene/test-framework/src/java/org/apache/lucene/index/BaseTermVectorsFormatTestCase.java

Lines changed: 0 additions & 1 deletion
@@ -667,7 +667,6 @@ public void testRandom() throws IOException {
     dir.close();
   }
 
-  @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-9334")
   public void testMerge() throws IOException {
     final RandomDocumentFactory docFactory = new RandomDocumentFactory(5, 20);
     final int numDocs = atLeast(100);
