Skip to content
Merged
Show file tree
Hide file tree
Changes from 62 commits
Commits
Show all changes
138 commits
Select commit Hold shift + click to select a range
34f2f34
Add codecs for lucene 8x
drempapis Jan 2, 2025
a867385
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 2, 2025
69b6b6e
Add Lucene80 Codec support
drempapis Jan 2, 2025
81c9fce
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 2, 2025
30e2cb4
Add wrapped lucene 84 codec
drempapis Jan 2, 2025
000a7aa
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 3, 2025
2c53ff6
apply spot
drempapis Jan 3, 2025
d3f18a5
Add comment
drempapis Jan 3, 2025
b3afec7
Update doc
drempapis Jan 3, 2025
5fef3eb
Add doc
drempapis Jan 3, 2025
1596495
update doc
drempapis Jan 3, 2025
1ec0501
update doc
drempapis Jan 3, 2025
394dae3
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 3, 2025
a279780
Update docs/changelog/119503.yaml
drempapis Jan 3, 2025
ecb162d
update doc
drempapis Jan 3, 2025
2448af7
Merge branch 'fix/117042_Support_7x_segments_as_archive_in_8x' of git…
drempapis Jan 3, 2025
9d9b6af
unmute tests
drempapis Jan 3, 2025
02ddf82
revert
drempapis Jan 3, 2025
bf79ef3
Merge remote-tracking branch 'upstream/main'
drempapis Jan 3, 2025
590091c
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 7, 2025
528f220
Update docs/changelog/119503.yaml
drempapis Jan 7, 2025
f1c91bd
Merge remote-tracking branch 'upstream/main'
drempapis Jan 7, 2025
ba67bff
Merge remote-tracking branch 'upstream/main'
drempapis Jan 7, 2025
6d1c65a
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 8, 2025
2c3654a
Merge remote-tracking branch 'upstream/main'
drempapis Jan 8, 2025
58d4762
Merge remote-tracking branch 'upstream/main'
drempapis Jan 8, 2025
bc38090
Merge remote-tracking branch 'upstream/main'
drempapis Jan 9, 2025
87f790f
Update after review
drempapis Jan 9, 2025
4334bc4
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 9, 2025
ef0447b
Merge remote-tracking branch 'upstream/main'
drempapis Jan 9, 2025
a7a8a84
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 9, 2025
d91844d
Add ITests for LuceceCodecs8.x
drempapis Jan 9, 2025
fe009d7
Merge remote-tracking branch 'upstream/main'
drempapis Jan 10, 2025
a747a40
Merge remote-tracking branch 'upstream/main'
drempapis Jan 10, 2025
f3e47ae
Merge remote-tracking branch 'upstream/main'
drempapis Jan 10, 2025
7fdc027
Update and extend testing
drempapis Jan 13, 2025
5b831ec
update tests
drempapis Jan 13, 2025
338dbb1
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 13, 2025
25b216e
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 13, 2025
50530a2
Update library
drempapis Jan 13, 2025
7033ed0
Merge branch 'fix/117042_Support_7x_segments_as_archive_in_8x' of git…
drempapis Jan 13, 2025
9be8cea
update forbidden API
drempapis Jan 13, 2025
420e646
Add smoke tests for codecs
drempapis Jan 13, 2025
48ab5b4
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 13, 2025
ad70e72
fix test
drempapis Jan 13, 2025
6b479e2
Merge branch 'fix/117042_Support_7x_segments_as_archive_in_8x' of git…
drempapis Jan 13, 2025
2bc0107
Merge remote-tracking branch 'upstream/main'
drempapis Jan 13, 2025
99e816f
Delete internal javadoc
drempapis Jan 13, 2025
3196523
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 13, 2025
f3b3d00
Merge remote-tracking branch 'upstream/main'
drempapis Jan 13, 2025
f52789e
Merge remote-tracking branch 'upstream/main'
drempapis Jan 14, 2025
ec243b7
Merge remote-tracking branch 'upstream/main'
drempapis Jan 14, 2025
900fcb8
Modify MetadataOnly PointsReader to adjust Lucene86 changes
drempapis Jan 15, 2025
46d408f
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 15, 2025
2abea64
[CI] Auto commit changes from spotless
Jan 15, 2025
f93eb9b
Merge remote-tracking branch 'upstream/main'
drempapis Jan 16, 2025
f7e0012
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 16, 2025
2777916
Merge remote-tracking branch 'upstream/main'
drempapis Jan 16, 2025
7d71f93
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 16, 2025
623bd7b
Merge remote-tracking branch 'upstream/main'
drempapis Jan 16, 2025
13806d1
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 16, 2025
3fee6af
Merge remote-tracking branch 'upstream/main'
drempapis Jan 16, 2025
008d767
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 17, 2025
af3fff9
Merge remote-tracking branch 'upstream/main'
drempapis Jan 17, 2025
a41bbad
Merge remote-tracking branch 'upstream/main'
drempapis Jan 17, 2025
6010cc9
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 18, 2025
023e8ea
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 20, 2025
8115a61
Merge remote-tracking branch 'upstream/main'
drempapis Jan 20, 2025
0609411
update after review
drempapis Jan 20, 2025
11ab217
Merge branch 'fix/117042_Support_7x_segments_as_archive_in_8x' of git…
drempapis Jan 20, 2025
d9dedc7
revert code after review
drempapis Jan 20, 2025
4f85349
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 20, 2025
6b2361e
Merge remote-tracking branch 'upstream/main'
drempapis Jan 20, 2025
41dcc1c
Merge remote-tracking branch 'upstream/main'
drempapis Jan 21, 2025
2343576
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 21, 2025
2df7f62
Merge remote-tracking branch 'upstream/main'
drempapis Jan 21, 2025
de31af3
update after review
drempapis Jan 22, 2025
4c3f29a
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 22, 2025
6b95b0c
Merge remote-tracking branch 'upstream/main'
drempapis Jan 22, 2025
74cb102
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 22, 2025
6f8ff50
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 22, 2025
3ebc9f4
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 22, 2025
eadf8cf
Merge remote-tracking branch 'upstream/main'
drempapis Jan 23, 2025
41dc557
Merge remote-tracking branch 'upstream/main'
drempapis Jan 24, 2025
2ca8a9f
Update after review
drempapis Jan 24, 2025
e0e1740
Merge remote-tracking branch 'upstream/main'
drempapis Jan 24, 2025
1ba2eaa
Merge remote-tracking branch 'upstream/main'
drempapis Jan 24, 2025
6820c35
Merge remote-tracking branch 'upstream/main'
drempapis Jan 24, 2025
8280559
Merge remote-tracking branch 'upstream/main'
drempapis Jan 24, 2025
806d442
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 24, 2025
3e0de6d
Update after review
drempapis Jan 27, 2025
a57c0d1
Merge branch 'fix/117042_Support_7x_segments_as_archive_in_8x' of git…
drempapis Jan 27, 2025
bb561e2
[CI] Auto commit changes from spotless
Jan 27, 2025
03bbc25
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 27, 2025
e82375e
Merge remote-tracking branch 'upstream/main'
drempapis Jan 27, 2025
e98aac7
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 27, 2025
b39ef53
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 27, 2025
ce4c1c9
Merge remote-tracking branch 'upstream/main'
drempapis Jan 27, 2025
69fc6f6
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 27, 2025
8a66843
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 28, 2025
ff92f92
Merge remote-tracking branch 'upstream/main'
drempapis Jan 28, 2025
c0f4d18
Merge remote-tracking branch 'upstream/main'
drempapis Jan 28, 2025
02ac377
Merge remote-tracking branch 'upstream/main'
drempapis Jan 29, 2025
7941cbc
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 29, 2025
bb93eac
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 30, 2025
42aa647
Merge remote-tracking branch 'upstream/main'
drempapis Jan 30, 2025
9c9dc66
Update after review
drempapis Jan 30, 2025
b93bf88
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 30, 2025
1cf1afc
Merge branch 'fix/117042_Support_7x_segments_as_archive_in_8x' of git…
drempapis Jan 30, 2025
6e31623
update after review
drempapis Jan 30, 2025
f155836
[CI] Auto commit changes from spotless
Jan 30, 2025
c668a3a
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 30, 2025
bd258ab
Merge remote-tracking branch 'upstream/main'
drempapis Jan 30, 2025
c96cd31
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 30, 2025
6b26b66
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Jan 30, 2025
b20d791
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Mar 5, 2025
58be89d
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Mar 10, 2025
1539844
update after review
drempapis Mar 10, 2025
80919b6
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Mar 10, 2025
5a4ef59
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Mar 11, 2025
d42aa51
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Mar 11, 2025
668ccb3
update after review
drempapis Mar 11, 2025
952ec01
update after review
drempapis Mar 11, 2025
b636061
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Mar 11, 2025
ca586f9
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Mar 13, 2025
6783285
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Mar 13, 2025
e911ed2
update after review
drempapis Mar 14, 2025
7770291
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Mar 14, 2025
44cd893
minor naming update after review
drempapis Mar 14, 2025
b9eb516
[CI] Auto commit changes from spotless
Mar 14, 2025
bb4458d
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Mar 17, 2025
6fec899
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Mar 18, 2025
ca0f256
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Mar 18, 2025
b3bbc6a
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Mar 19, 2025
36f62ca
update after review
drempapis Mar 20, 2025
36edf83
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Mar 20, 2025
2952e62
[CI] Auto commit changes from spotless
Mar 20, 2025
43465fc
Merge branch 'main' into fix/117042_Support_7x_segments_as_archive_in_8x
drempapis Mar 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/changelog/119503.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 119503
summary: Support 7x segments as archive in 8x / 9x
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could probably need some more explanation on the cases that this fixes (i.e. archive indices that were mounted in 7x)

Copy link
Contributor Author

@drempapis drempapis Jan 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you, @cbuescher, for the feedback. I added a more explanatory summary description. This is still short, but it better describes what the PR does.

area: Search
type: bug
issues:
- 117042
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
Expand All @@ -26,6 +27,11 @@
import org.apache.lucene.index.Terms;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.Version;
import org.elasticsearch.xpack.lucene.bwc.codecs.lucene80.BWCLucene80Codec;
import org.elasticsearch.xpack.lucene.bwc.codecs.lucene84.BWCLucene84Codec;
import org.elasticsearch.xpack.lucene.bwc.codecs.lucene86.BWCLucene86Codec;
import org.elasticsearch.xpack.lucene.bwc.codecs.lucene87.BWCLucene87Codec;

import java.io.IOException;
import java.util.ArrayList;
Expand All @@ -41,18 +47,25 @@ protected BWCCodec(String name) {
super(name);
}

protected final PostingsFormat postingsFormat = new PerFieldPostingsFormat() {
@Override
public PostingsFormat getPostingsFormatForField(String field) {
throw new UnsupportedOperationException("Old codecs can't be used for writing");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

postings format are not only for writing, why do we return this version in some cases and another one without the exception in some other scenario?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I followed the original LuceneXXCodec pattern.

In class Lucene70Codec,

private final PostingsFormat postingsFormat =
      new PerFieldPostingsFormat() {
        @Override
        public PostingsFormat getPostingsFormatForField(String field) {
          throw new IllegalStateException(
              "This codec should only be used for reading, not writing");
        }
      };

and in class Lucene80Codec, no implementation is provided, and the exception is different.

  private final PostingsFormat postingsFormat =
      new PerFieldPostingsFormat() {
        @Override
        public PostingsFormat getPostingsFormatForField(String field) {
          throw new UnsupportedOperationException("Old codecs can't be used for writing");
        }
      };

where for classes Lucene84Codec ++, an implementation is provided

  private final PostingsFormat postingsFormat =
      new PerFieldPostingsFormat() {
        @Override
        public PostingsFormat getPostingsFormatForField(String field) {
          return Lucene84Codec.this.getPostingsFormatForField(field);
        }
      };

public PostingsFormat getPostingsFormatForField(String field) {
    return defaultFormat;
  }

this.defaultFormat = new Lucene84PostingsFormat();

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok. getPostingsFormatForField only gets called at write time, so it is correct that we throw error like we do there in BWCLucene70Codec. The important bit though is that we rely on plain PerFieldPostingsFormat from Lucene in BWCLucene70Codec while in previous codecs we use the legacy adapting postings format to override the postings format. That explains the existing difference.

I don't understand though the difference you have e.g. between BWCLucene80Codec and BWCLucene86Codec . Shouldn't we use PerFieldPostingsFormat consistently in all the more recent formats? I admit that I need to do a bit more digging before I have a definitive answer on this. I am also not clear if we should have made adjustments around postings format moving to Lucene 10, and if we have any test coverage for them.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I followed the paradigm in the org.apache.lucene.backward_codecs package for the provided Lucene**Codecs.
All the codecs after version 84 have the format using the PerFieldPostingsFormat class. For example, the Lucene912Codec

private final DocValuesFormat docValuesFormat =
      new PerFieldDocValuesFormat() {
        @Override
        public DocValuesFormat getDocValuesFormatForField(String field) {
          return Lucene912Codec.this.getDocValuesFormatForField(field);
        }
      };

I understand that we should use this format for all the more recent formats.

I am also not clear if we should have made adjustments around postings format moving to Lucene 10, and if we have any test coverage for them.

@javanna, Do you have any idea how to test this?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, took me forever to get back to this, I apologize. I think I better understand now.

My doubts came from this comment at https://github.com/elastic/elasticsearch/blob/main/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/LegacyAdaptingPerFieldPostingsFormat.java#L55 , and the presence of Lucene50PostingsFormat in backwards-covecs still in Lucene 10. That is correct though, because that was still used until Lucene 8.4, which introduced Lucene84PostingsFormat (see apache/lucene@7755cdf). Lucene 10 supports reading from Lucene 8.0 segments, which is why the 50 postings format still needs to be present.
I will open a PR to adjust that comment and clarify what needs to be done and when. That's good news though, nothing different should have happened in Lucene 10, and LegacyAdaptingPerFieldPostingsFormat sounds like something very specific for ancient versions, that is not needed in Lucene 7.0 codecs or above.

I think that we can unify here postingsFormat() in BWCCodec directly to return that instance of PerFieldPostingsFormat that throws exception in its getPostingsFormatPerField. That method is only called for writing, yet using PerFieldPostingsFormat ensures that we rely on SPI to load the postings format by name, and there is nothing else to do. Lucene60Codec is the only one that needs to override that postingsFormat method.

I will add a comment below also on other adjustments to make in the 84 codec etc.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The way to test this is to run some full text search on top of archive indices on the different versions.This could be done at the Lucene level, but I see that OldRepositoriesIT below is also a good fit. I think that we currently run a _count call, we should probably try and execute a match query, and perhaps a phrase query as well as it relies on positions. That may require adjusting a bit the data that is in the archived index.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a match query applied to Codec Tests

GET index/_search
{
  "query": {
    "match": {
      "content": "..."
    }
  }
}

}
};

@Override
public NormsFormat normsFormat() {
public final NormsFormat normsFormat() {
throw new UnsupportedOperationException();
}

@Override
public TermVectorsFormat termVectorsFormat() {
public final TermVectorsFormat termVectorsFormat() {
throw new UnsupportedOperationException();
}

@Override
public KnnVectorsFormat knnVectorsFormat() {
public final KnnVectorsFormat knnVectorsFormat() {
throw new UnsupportedOperationException();
}

Expand Down Expand Up @@ -118,13 +131,14 @@ private static FieldInfos filterFields(FieldInfos fieldInfos) {
}

public static SegmentInfo wrap(SegmentInfo segmentInfo) {
final Codec codec = segmentInfo.getCodec();
Codec codec = getBackwardCompatibleCodec(segmentInfo.getCodec());

final SegmentInfo segmentInfo1 = new SegmentInfo(
segmentInfo.dir,
// Use Version.LATEST instead of original version, otherwise SegmentCommitInfo will bark when processing (N-1 limitation)
// TODO: perhaps store the original version information in attributes so that we can retrieve it later when needed?
org.apache.lucene.util.Version.LATEST,
org.apache.lucene.util.Version.LATEST,
Version.LATEST,
Version.LATEST,
segmentInfo.name,
segmentInfo.maxDoc(),
segmentInfo.getUseCompoundFile(),
Expand All @@ -139,6 +153,20 @@ public static SegmentInfo wrap(SegmentInfo segmentInfo) {
return segmentInfo1;
}

// Special handling for Lucene8xCodecs (which are currently bundled with Lucene)
// Use BWCLucene8xCodec instead as that one extends BWCCodec (similar to all other older codecs)
private static Codec getBackwardCompatibleCodec(Codec codec) {
if (codec == null) return null;

return switch (codec.getClass().getSimpleName()) {
case "Lucene80Codec" -> new BWCLucene80Codec();
case "Lucene84Codec" -> new BWCLucene84Codec();
case "Lucene86Codec" -> new BWCLucene86Codec();
case "Lucene87Codec" -> new BWCLucene87Codec();
default -> codec;
};
}

/**
* In-memory postings format that shows no postings available.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@ public class MetadataOnlyBKDReader extends PointValues {
final int docCount;
final int version;

final int numIndexBytes;
final long minLeafBlockFP;
private final long indexStartPointer;

public MetadataOnlyBKDReader(IndexInput metaIn) throws IOException {
version = CodecUtil.checkHeader(metaIn, "BKD", VERSION_START, VERSION_CURRENT);
final int numDims = metaIn.readVInt();
Expand Down Expand Up @@ -85,6 +89,16 @@ public MetadataOnlyBKDReader(IndexInput metaIn) throws IOException {

pointCount = metaIn.readVLong();
docCount = metaIn.readVInt();

numIndexBytes = metaIn.readVInt();
if (version >= VERSION_META_FILE) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let's add a comment to explain that instead of forking the entire reader based on version, we went for adding this conditional and maintaining a single class. Perhaps document here also which version of lucene introduced this change, for clarity?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you, @javanna. Is there a specific Javadoc annotation in this class that is a partial copy of one in Lucene?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reiterating this a bit, for Lucene86Codec ++, we need only the following reads

numIndexBytes = metaIn.readVInt();
minLeafBlockFP = metaIn.readLong();
indexStartPointer = metaIn.readLong();

Otherwise, we do get the exception.

Caused by: org.apache.lucene.index.CorruptIndexException: codec header mismatch: actual header=0 vs expected header=1071082519 (resource=Endianness reverser Checksum Index Input wrapper)

Suppressed: org.apache.lucene.index.CorruptIndexException: checksum passed (8e136756). possibly transient resource issue, or a Lucene or JVM bug (resource=Endianness reverser Checksum Index Input wrapper)
		at [email protected]/org.apache.lucene.codecs.CodecUtil.checkFooter(CodecUtil.java:501)
		at org.elasticsearch.xpack.lucene.bwc.codecs.lucene86.Lucene86MetadataOnlyPointsReader.<init>(Lucene86MetadataOnlyPointsReader.java:117)
		... 36 more

The class Lucene86PointsReader has introduced additional IndexInputs and checkings, e.g., checksum, Footer, which fails if the data stream is not consumed (or something like that; I should dig more into this to provide the exact details)

I will investigate the case of maintaining only one MetadataOnlyPointsReader applied to all versions with conditionals. The MetadataOnlyBKDReader code can be simplified.

minLeafBlockFP = metaIn.readLong();
indexStartPointer = metaIn.readLong();
} else {
indexStartPointer = metaIn.getFilePointer();
minLeafBlockFP = metaIn.readVLong();
metaIn.seek(indexStartPointer);
}
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
import org.elasticsearch.xpack.lucene.bwc.codecs.BWCCodec;
import org.elasticsearch.xpack.lucene.bwc.codecs.lucene60.Lucene60MetadataOnlyPointsFormat;

Expand All @@ -45,12 +44,7 @@ public DocValuesFormat getDocValuesFormatForField(String field) {
return defaultDVFormat;
}
};
private final PostingsFormat postingsFormat = new PerFieldPostingsFormat() {
@Override
public PostingsFormat getPostingsFormatForField(String field) {
throw new IllegalStateException("This codec should only be used for reading, not writing");
}
};
private final PointsFormat pointsFormat = new Lucene60MetadataOnlyPointsFormat();

// Needed for SPI loading
@SuppressWarnings("unused")
Expand Down Expand Up @@ -100,6 +94,6 @@ public PostingsFormat postingsFormat() {

@Override
public PointsFormat pointsFormat() {
return new Lucene60MetadataOnlyPointsFormat();
return pointsFormat;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

package org.elasticsearch.xpack.lucene.bwc.codecs.lucene80;

import org.apache.lucene.backward_codecs.lucene50.Lucene50CompoundFormat;
import org.apache.lucene.backward_codecs.lucene50.Lucene50LiveDocsFormat;
import org.apache.lucene.backward_codecs.lucene50.Lucene50StoredFieldsFormat;
import org.apache.lucene.backward_codecs.lucene60.Lucene60FieldInfosFormat;
import org.apache.lucene.backward_codecs.lucene70.Lucene70SegmentInfoFormat;
import org.apache.lucene.backward_codecs.lucene80.Lucene80DocValuesFormat;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.codecs.PointsFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.elasticsearch.xpack.lucene.bwc.codecs.BWCCodec;
import org.elasticsearch.xpack.lucene.bwc.codecs.lucene60.Lucene60MetadataOnlyPointsFormat;

/**
* Implements the Lucene 8.0 index format. Loaded via SPI for indices created/written with Lucene 8.0.0-8.3.0
* (Elasticsearch [7.0.0-7.5.2]), mounted as archive indices in Elasticsearch 8.x / 9.x.
*/
public class BWCLucene80Codec extends BWCCodec {

private final FieldInfosFormat fieldInfosFormat = wrap(new Lucene60FieldInfosFormat());
private final SegmentInfoFormat segmentInfosFormat = wrap(new Lucene70SegmentInfoFormat());
private final LiveDocsFormat liveDocsFormat = new Lucene50LiveDocsFormat();
private final CompoundFormat compoundFormat = new Lucene50CompoundFormat();

private final DocValuesFormat docValuesFormat = new PerFieldDocValuesFormat() {
@Override
public DocValuesFormat getDocValuesFormatForField(String field) {
return defaultDVFormat;
}
};
private final DocValuesFormat defaultDVFormat = new Lucene80DocValuesFormat();

private final StoredFieldsFormat storedFieldsFormat;
private final PointsFormat pointsFormat = new Lucene60MetadataOnlyPointsFormat();

// Needed for SPI loading
@SuppressWarnings("unused")
public BWCLucene80Codec() {
this("BWCLucene80Codec");
}

public BWCLucene80Codec(String name) {
super(name);
this.storedFieldsFormat = new Lucene50StoredFieldsFormat(Lucene50StoredFieldsFormat.Mode.BEST_SPEED);
}

@Override
public final StoredFieldsFormat storedFieldsFormat() {
return storedFieldsFormat;
}

@Override
public final PostingsFormat postingsFormat() {
return postingsFormat;
}

@Override
public final FieldInfosFormat fieldInfosFormat() {
return fieldInfosFormat;
}

@Override
public final SegmentInfoFormat segmentInfoFormat() {
return segmentInfosFormat;
}

@Override
public final LiveDocsFormat liveDocsFormat() {
return liveDocsFormat;
}

@Override
public final CompoundFormat compoundFormat() {
return compoundFormat;
}

@Override
public final PointsFormat pointsFormat() {
return pointsFormat;
}

@Override
public final DocValuesFormat docValuesFormat() {
return docValuesFormat;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

package org.elasticsearch.xpack.lucene.bwc.codecs.lucene84;

import org.apache.lucene.backward_codecs.lucene50.Lucene50CompoundFormat;
import org.apache.lucene.backward_codecs.lucene50.Lucene50LiveDocsFormat;
import org.apache.lucene.backward_codecs.lucene50.Lucene50StoredFieldsFormat;
import org.apache.lucene.backward_codecs.lucene60.Lucene60FieldInfosFormat;
import org.apache.lucene.backward_codecs.lucene70.Lucene70SegmentInfoFormat;
import org.apache.lucene.backward_codecs.lucene80.Lucene80DocValuesFormat;
import org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.codecs.PointsFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
import org.elasticsearch.xpack.lucene.bwc.codecs.BWCCodec;
import org.elasticsearch.xpack.lucene.bwc.codecs.lucene60.Lucene60MetadataOnlyPointsFormat;

/**
* Implements the Lucene 8.4 index format. Loaded via SPI for indices created/written with Lucene 8.4.0-8.5.1
* (Elasticsearch [7.6.0-7.8.1]), mounted as archive indices in Elasticsearch 8.x / 9.x.
*/
public class BWCLucene84Codec extends BWCCodec {

private final FieldInfosFormat fieldInfosFormat = wrap(new Lucene60FieldInfosFormat());
private final SegmentInfoFormat segmentInfosFormat = wrap(new Lucene70SegmentInfoFormat());
private final LiveDocsFormat liveDocsFormat = new Lucene50LiveDocsFormat();
private final CompoundFormat compoundFormat = new Lucene50CompoundFormat();
private final PostingsFormat defaultFormat;

private final PostingsFormat postingsFormat = new PerFieldPostingsFormat() {
@Override
public PostingsFormat getPostingsFormatForField(String field) {
return BWCLucene84Codec.this.getPostingsFormatForField(field);
}
};
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that this is not needed, we can use the ordinary PerFieldPostingsFormat defined in BWCCodec and rely on SPI to load the format by name. This method should throw exception as it's called only for writing.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From the previous comments, I understand that the method is enough on the BWCCodec because it is called for writing and can be applied to all BWCLucene8*Codecs

  protected final PostingsFormat postingsFormat = new PerFieldPostingsFormat() {
        @Override
        public PostingsFormat getPostingsFormatForField(String field) {
            throw new UnsupportedOperationException("Old codecs can't be used for writing");
        }
    };

However, I don't understand the comment, letting the SPI to load the defaultFormat by name. If the previous assumption is correct, do we need to read this?

I think that this is not needed, we can use the ordinary PerFieldPostingsFormat defined in BWCCodec and rely on SPI to load the format by name. This method should throw exception as it's called only for writing.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, so I think that all the BWC8* can return the same postings format that you are defining in BWCCodec now. getPostingsFormatForField can safely throw an exception at all times because it's used for writing. The reading side is in PerFieldPostingsFormat, and makes sure that we load postings format by name, according to their name written in the indices that we are going to read. Nothing special needed, as long as all the supported postings formats are wired into SPI and loadable, which is the case in plain Lucene already for all the BWC8*codecs

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done!


private final DocValuesFormat docValuesFormat = new PerFieldDocValuesFormat() {
@Override
public DocValuesFormat getDocValuesFormatForField(String field) {
return BWCLucene84Codec.this.getDocValuesFormatForField(field);
}
};

private final StoredFieldsFormat storedFieldsFormat;
private final PointsFormat pointsFormat = new Lucene60MetadataOnlyPointsFormat();

// Needed for SPI loading
@SuppressWarnings("unused")
public BWCLucene84Codec() {
this("BWCLucene84Codec");
}

public BWCLucene84Codec(String name) {
super(name);
this.storedFieldsFormat = new Lucene50StoredFieldsFormat(Lucene50StoredFieldsFormat.Mode.BEST_SPEED);
this.defaultFormat = new Lucene84PostingsFormat();
}

@Override
public StoredFieldsFormat storedFieldsFormat() {
return storedFieldsFormat;
}

@Override
public PostingsFormat postingsFormat() {
return postingsFormat;
}

@Override
public final FieldInfosFormat fieldInfosFormat() {
return fieldInfosFormat;
}

@Override
public SegmentInfoFormat segmentInfoFormat() {
return segmentInfosFormat;
}

@Override
public final LiveDocsFormat liveDocsFormat() {
return liveDocsFormat;
}

@Override
public CompoundFormat compoundFormat() {
return compoundFormat;
}

@Override
public PointsFormat pointsFormat() {
return pointsFormat;
}

/**
* Returns the postings format that should be used for writing new segments of <code>field</code>.
*
* <p>The default implementation always returns "Lucene84".
*
* <p><b>WARNING:</b> if you subclass, you are responsible for index backwards compatibility:
* future version of Lucene are only guaranteed to be able to read the default implementation.
*/
public PostingsFormat getPostingsFormatForField(String field) {
return defaultFormat;
}

/**
* Returns the docvalues format that should be used for writing new segments of <code>field</code>
* .
*
* <p>The default implementation always returns "Lucene80".
*
* <p><b>WARNING:</b> if you subclass, you are responsible for index backwards compatibility:
* future version of Lucene are only guaranteed to be able to read the default implementation.
*/
public DocValuesFormat getDocValuesFormatForField(String field) {
return defaultDVFormat;
}

@Override
public final DocValuesFormat docValuesFormat() {
return docValuesFormat;
}

private final DocValuesFormat defaultDVFormat = new Lucene80DocValuesFormat();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same question here for "normsFormat()" and "termVectorsFormat()" methods that are implemented in Lucene84Codec.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the formats that we provide are those that we provide bwc support for, the others we don't want to maintain and support.

}
Loading