diff --git a/.pre-commit-config.yml b/.pre-commit-config.yml index d5028cc05fb1..09f71e69e2dd 100644 --- a/.pre-commit-config.yml +++ b/.pre-commit-config.yml @@ -162,28 +162,12 @@ repos: name: Fix Markdown language: system entry: uv - args: [ 'run', 'rumdl', 'fmt' ] + args: [ 'run', 'rumdl', 'check', '--fix' ] env: UV_PROJECT: dev-tools UV_FROZEN: "1" types: [ 'markdown'] require_serial: true - exclude: - glob: - # TODO: fix formatting of these files separately - - .github/PULL_REQUEST_TEMPLATE.md - - CONTRIBUTING.md - - dev-docs/file-formats.md - - dev-docs/github-issues-howto.md - - dev-tools/aws-jmh/README.md - - dev-tools/scripts/README.md - - lucene/backward-codecs/README.md - - lucene/distribution/src/binary-release/README.md - - lucene/luke/README.md - - lucene/luke/src/distribution/README.md - - lucene/MIGRATE.md - - lucene/SYSTEM_REQUIREMENTS.md - - README.md - id: ruff-check name: Fix Python diff --git a/.rumdl.toml b/.rumdl.toml index 3bd7cb1cd62b..757f4445b99c 100644 --- a/.rumdl.toml +++ b/.rumdl.toml @@ -3,3 +3,11 @@ line-length = 0 # not really a markdown file, but a template exclude = [ "lucene/documentation/src/markdown/index.template.md" ] + +[MD007] +# match indentation set in .editorconfig for least friction +indent = 4 + +[per-file-ignores] +# doesn't start with level 1 heading on purpose +".github/PULL_REQUEST_TEMPLATE.md" = [ "MD041" ] diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 262c9900f02e..8fab94464492 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -59,9 +59,9 @@ In case your contribution fixes a bug, please create a new test case that fails ### IDE support - *IntelliJ* - IntelliJ idea can import and build gradle-based projects out of the box. It will default to running tests by calling the gradle wrapper, and while this works, it is can be a bit slow. 
If instead you configure IntelliJ to use its own built-in test runner by (in 2024 version) navigating to settings for Build Execution & Deployment/Build Tools/Gradle (under File/Settings menu on some platforms) and selecting "Build and Run using: IntelliJ IDEA" and "Run Tests using: IntelliJ IDEA", then some tests will run faster. However some other tests will not run using this configuration. -- *Eclipse* - Basic support ([help/IDEs.txt](https://github.com/apache/lucene/blob/main/help/IDEs.txt#L7)). -- *VSCode* - Basic support ([help/IDEs.txt](https://github.com/apache/lucene/blob/main/help/IDEs.txt#L23)). -- *Neovim* - Basic support ([help/IDEs.txt](https://github.com/apache/lucene/blob/main/help/IDEs.txt#L32)). +- *Eclipse* - Basic support ([help/IDEs.txt](https://github.com/apache/lucene/blob/main/help/IDEs.txt#L7)). +- *VSCode* - Basic support ([help/IDEs.txt](https://github.com/apache/lucene/blob/main/help/IDEs.txt#L23)). +- *Neovim* - Basic support ([help/IDEs.txt](https://github.com/apache/lucene/blob/main/help/IDEs.txt#L32)). - *Netbeans* - Not tested. ## Benchmarking @@ -78,7 +78,7 @@ Feel free to share your findings (especially if your implementation performs bet ## Contributing your work -You can open a pull request at https://github.com/apache/lucene. +You can open a pull request at <https://github.com/apache/lucene>. Please be patient. Committers are busy people too. If no one responds to your patch after a few days, please make friendly reminders. Please incorporate others' suggestions into your patch if you think they're reasonable. Finally, remember that even a patch that is not committed is useful to the community. diff --git a/README.md b/README.md index 665f00366378..d1511ba3ef4d 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ written in Java. ## Online Documentation -This README file only contains basic setup instructions. For more +This README file only contains basic setup instructions. 
For more comprehensive documentation, visit: - Latest Releases: @@ -38,7 +38,7 @@ comprehensive documentation, visit: ## Building -### Basic steps: +### Basic steps 1. Install JDK 25 using your package manager or download manually from [OpenJDK](https://jdk.java.net/), @@ -48,7 +48,7 @@ comprehensive documentation, visit: 2. Clone Lucene's git repository (or download the source distribution). 3. Run gradle launcher script (`gradlew`). -We'll assume that you know how to get and set up the JDK - if you don't, then we suggest starting at https://jdk.java.net/ and learning more about Java, before returning to this README. +We'll assume that you know how to get and set up the JDK - if you don't, then we suggest starting at <https://jdk.java.net/> and learning more about Java, before returning to this README. ## Contributing diff --git a/dev-docs/file-formats.md b/dev-docs/file-formats.md index 23d2942eb410..16f08ffca3d6 100644 --- a/dev-docs/file-formats.md +++ b/dev-docs/file-formats.md @@ -36,12 +36,14 @@ on their own. ## How to split the data into files? Most file formats split the data into 3 files: + - metadata, - index data, - raw data. The metadata file contains all the data that is read once at open time. This helps on several fronts: + - One can validate the checksums of this data at open time without significant overhead since all data needs to be read anyway, this helps detect corruptions early. @@ -124,4 +126,4 @@ by merges. All default implementations do this. ## How to make backward-compatible changes to file formats? -See [here](../lucene/backward-codecs/README.md). +See [Index Backwards Compatibility](../lucene/backward-codecs/README.md). 
diff --git a/dev-docs/github-issues-howto.md b/dev-docs/github-issues-howto.md index 49a6aaa9abd8..8c9c59fb4736 100644 --- a/dev-docs/github-issues-howto.md +++ b/dev-docs/github-issues-howto.md @@ -29,7 +29,7 @@ All issues/PRs associated with a milestone must be resolved before the release, Once the release is done, the Milestone should be closed then a new Milestone for the next release should be created. -You can see the list of current active (opened) Milestones here. https://github.com/apache/lucene/milestones +You can see the list of current active (opened) Milestones here. <https://github.com/apache/lucene/milestones> See [GitHub documentation](https://docs.github.com/en/issues/using-labels-and-milestones-to-track-work/about-milestones) for more details. diff --git a/dev-tools/aws-jmh/README.md b/dev-tools/aws-jmh/README.md index 2d384f047d55..99c68f88ba47 100644 --- a/dev-tools/aws-jmh/README.md +++ b/dev-tools/aws-jmh/README.md @@ -15,23 +15,21 @@ limitations under the License. --> +# EC2 Microbenchmarks + Runs lucene microbenchmarks across a variety of CPUs in EC2. 
Example: -```console -export AWS_ACCESS_KEY_ID=xxxxx -export AWS_SECRET_ACCESS_KEY=yyyy -make PATCH_BRANCH=rmuir:some-speedup -``` + export AWS_ACCESS_KEY_ID=xxxxx + export AWS_SECRET_ACCESS_KEY=yyyy + make PATCH_BRANCH=rmuir:some-speedup Results file will be in build/report.txt You can also pass additional JMH args if you want: -```console -make PATCH_BRANCH=rmuir:some-speedup JMH_ARGS='float -p size=756' -``` + make PATCH_BRANCH=rmuir:some-speedup JMH_ARGS='float -p size=756' Prerequisites: diff --git a/dev-tools/scripts/README.md b/dev-tools/scripts/README.md index 4b8441870496..d1e0ed6b838c 100644 --- a/dev-tools/scripts/README.md +++ b/dev-tools/scripts/README.md @@ -194,4 +194,3 @@ and prints a regular expression that will match all of them ### gitignore-gen.sh TBD - diff --git a/lucene/MIGRATE.md b/lucene/MIGRATE.md index b427d336fccb..f567862a2f0c 100644 --- a/lucene/MIGRATE.md +++ b/lucene/MIGRATE.md @@ -36,18 +36,21 @@ Starting with Lucene 11.0.0, the index upgrade policy has been relaxed to allow #### Upgrade Scenarios -**Scenario 1: No format breaks (wider upgrade span)** +##### Scenario 1: No format breaks (wider upgrade span) + - Index created with Lucene 10.x can be opened directly in Lucene 11.x, 12.x, 13.x, 14.x (as long as MIN_SUPPORTED_MAJOR stays ≤ 10) - Simply open the index with the new version; segments will be upgraded gradually through normal merging - Optional: Call `forceMerge()` or use `UpgradeIndexMergePolicy` to upgrade segment formats immediately - **Important**: You still only get one upgrade per index lifetime. Once MIN_SUPPORTED_MAJOR is bumped above 10, the index becomes unopenable and must be reindexed. 
-**Scenario 2: Format breaks occur** +##### Scenario 2: Format breaks occur + - If a major version introduces incompatible format changes, `MIN_SUPPORTED_MAJOR` will be bumped - Indexes created before the new minimum will throw `IndexFormatTooOldException` - Full reindexing is required for such indexes -**Scenario 3: After using your upgrade** +##### Scenario 3: After using your upgrade + - Index created with Lucene 10.x, successfully opened with Lucene 14.x - The index's creation version is still 10 (this never changes) - When Lucene 15+ bumps MIN_SUPPORTED_MAJOR above 10, this index becomes unopenable @@ -72,6 +75,7 @@ try (Directory dir = FSDirectory.open(indexPath)) { #### Error Handling Enhanced error messages will clearly indicate: + - Whether the index creation version is below `MIN_SUPPORTED_MAJOR` (reindex required) - Whether segments are too old to read directly (sequential upgrade required) @@ -85,7 +89,7 @@ number of segments that may be merged together. Query caching is now disabled by default. To enable caching back, do something like below in a static initialization block: -``` +```java int maxCachedQueries = 1_000; long maxRamBytesUsed = 50 * 1024 * 1024; // 50MB IndexSearcher.setDefaultQueryCache(new LRUQueryCache(maxCachedQueries, maxRamBytesUsed)); @@ -124,11 +128,11 @@ DataInput.readGroupVInt method: subclasses should delegate or reimplement it ent ### OpenNLP dependency upgrade -[Apache OpenNLP](https://opennlp.apache.org) 2.x opens the door to accessing various models via the ONNX runtime. To migrate you will need to update any deprecated OpenNLP methods that you may be using. +[Apache OpenNLP](https://opennlp.apache.org) 2.x opens the door to accessing various models via the ONNX runtime. To migrate you will need to update any deprecated OpenNLP methods that you may be using. ### Snowball dependency upgrade -Snowball has folded the "German2" stemmer into their "German" stemmer, so there's no "German2" anymore. 
For Lucene APIs (TokenFilter, TokenFilterFactory) that accept String, "German2" will be mapped to "German" to avoid breaking users. If you were previously creating German2Stemmer instances, you'll need to change your code to create GermanStemmer instances instead. For more information see https://snowballstem.org/algorithms/german2/stemmer.html +Snowball has folded the "German2" stemmer into their "German" stemmer, so there's no "German2" anymore. For Lucene APIs (TokenFilter, TokenFilterFactory) that accept String, "German2" will be mapped to "German" to avoid breaking users. If you were previously creating German2Stemmer instances, you'll need to change your code to create GermanStemmer instances instead. For more information see <https://snowballstem.org/algorithms/german2/stemmer.html> ### Romanian analysis @@ -155,6 +159,7 @@ Instead, call storedFields()/termVectors() to return an instance which can fetch and will be garbage-collected as usual. For example: + ```java TopDocs hits = searcher.search(query, 10); StoredFields storedFields = reader.storedFields(); @@ -230,7 +235,6 @@ for the currently-positioned document (doing so will result in undefined behavio `IOContext.READONCE` for opening internally, as that's the only valid usage pattern for checksum input. Callers should remove the parameter when calling this method. - ### DaciukMihovAutomatonBuilder is renamed to StringsToAutomaton and made package-private The former `DaciukMihovAutomatonBuilder#build` functionality is exposed through `Automata#makeStringUnion`. @@ -300,7 +304,7 @@ access the members using method calls instead of field accesses. Affected classe - `TermAndVector` (GITHUB#13772) - Many basic Lucene classes, including `CollectionStatistics`, `TermStatistics` and `LeafMetadata` (GITHUB#13328) -### Boolean flags on IOContext replaced with a new ReadAdvice enum. +### Boolean flags on IOContext replaced with a new ReadAdvice enum The `readOnce`, `load` and `random` flags on `IOContext` have been replaced with a new `ReadAdvice` enum. 
@@ -324,6 +328,7 @@ To migrate, use a provided `CollectorManager` implementation that suits your use to follow the new API pattern. The straight forward approach would be to instantiate the single-threaded `Collector` in a wrapper `CollectorManager`. For example + ```java public class CustomCollectorManager implements CollectorManager> { @Override @@ -354,12 +359,12 @@ List results = searcher.search(query, new CustomCollectorManager()); 1. `IntField(String name, int value)`. Use `IntField(String, int, Field.Store)` with `Field.Store#NO` instead. 2. `DoubleField(String name, double value)`. Use `DoubleField(String, double, Field.Store)` with `Field.Store#NO` instead. -2. `FloatField(String name, float value)`. Use `FloatField(String, float, Field.Store)` with `Field.Store#NO` instead. -3. `LongField(String name, long value)`. Use `LongField(String, long, Field.Store)` with `Field.Store#NO` instead. -4. `LongPoint#newDistanceFeatureQuery(String field, float weight, long origin, long pivotDistance)`. Use `LongField#newDistanceFeatureQuery` instead -5. `BooleanQuery#TooManyClauses`, `BooleanQuery#getMaxClauseCount()`, `BooleanQuery#setMaxClauseCount()`. Use `IndexSearcher#TooManyClauses`, `IndexSearcher#getMaxClauseCount()`, `IndexSearcher#setMaxClauseCount()` instead -6. `ByteBuffersDataInput#size()`. Use `ByteBuffersDataInput#length()` instead -7. `SortedSetDocValuesFacetField#label`. `FacetsConfig#pathToString(String[])` can be applied to path as a replacement if string path is desired. +3. `FloatField(String name, float value)`. Use `FloatField(String, float, Field.Store)` with `Field.Store#NO` instead. +4. `LongField(String name, long value)`. Use `LongField(String, long, Field.Store)` with `Field.Store#NO` instead. +5. `LongPoint#newDistanceFeatureQuery(String field, float weight, long origin, long pivotDistance)`. Use `LongField#newDistanceFeatureQuery` instead +6. 
`BooleanQuery#TooManyClauses`, `BooleanQuery#getMaxClauseCount()`, `BooleanQuery#setMaxClauseCount()`. Use `IndexSearcher#TooManyClauses`, `IndexSearcher#getMaxClauseCount()`, `IndexSearcher#setMaxClauseCount()` instead +7. `ByteBuffersDataInput#size()`. Use `ByteBuffersDataInput#length()` instead +8. `SortedSetDocValuesFacetField#label`. `FacetsConfig#pathToString(String[])` can be applied to path as a replacement if string path is desired. ### Auto I/O throttling disabled by default in ConcurrentMergeScheduler (GITHUB#13293) @@ -439,7 +444,6 @@ to the new coordinates: |org.apache.lucene:lucene-analyzers-smartcn |org.apache.lucene:lucene-analysis-smartcn | |org.apache.lucene:lucene-analyzers-stempel |org.apache.lucene:lucene-analysis-stempel | - ### LucenePackage class removed (LUCENE-10260) `LucenePackage` class has been removed. The implementation string can be @@ -563,7 +567,7 @@ User dictionary now strictly validates if the (concatenated) segment is the same unexpected runtime exceptions or behaviours. For example, these entries are not allowed at all and an exception is thrown when loading the dictionary file. -``` +```text # concatenated "日本経済新聞" does not match the surface form "日経新聞" 日経新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞 @@ -631,7 +635,7 @@ is discouraged in favor of the default `MMapDirectory`. ### Similarity.SimScorer.computeXXXFactor methods removed (LUCENE-8014) `SpanQuery` and `PhraseQuery` now always calculate their slops as -`(1.0 / (1.0 + distance))`. Payload factor calculation is performed by +`(1.0 / (1.0 + distance))`. Payload factor calculation is performed by `PayloadDecoder` in the `lucene-queries` module. ### Scorer must produce positive scores (LUCENE-7996) @@ -645,9 +649,9 @@ As a side-effect of this change, negative boosts are now rejected and ### CustomScoreQuery, BoostedQuery and BoostingQuery removed (LUCENE-8099) -Instead use `FunctionScoreQuery` and a `DoubleValuesSource` implementation. 
`BoostedQuery` +Instead use `FunctionScoreQuery` and a `DoubleValuesSource` implementation. `BoostedQuery` and `BoostingQuery` may be replaced by calls to `FunctionScoreQuery.boostByValue()` and -`FunctionScoreQuery.boostByQuery()`. To replace more complex calculations in +`FunctionScoreQuery.boostByQuery()`. To replace more complex calculations in `CustomScoreQuery`, use the `lucene-expressions` module: ```java @@ -666,7 +670,6 @@ Changing `IndexOptions` for a field on the fly will now result into an (`FieldType.indexOptions() != IndexOptions.NONE`) then all documents must have the same index options for that field. - ### IndexSearcher.createNormalizedWeight() removed (LUCENE-8242) Instead use `IndexSearcher.createWeight()`, rewriting the query first, and using @@ -744,7 +747,7 @@ Lucene. ### LeafCollector.setScorer() now takes a Scorable rather than a Scorer (LUCENE-6228) `Scorer` has a number of methods that should never be called from `Collector`s, for example -those that advance the underlying iterators. To hide these, `LeafCollector.setScorer()` +those that advance the underlying iterators. To hide these, `LeafCollector.setScorer()` now takes a `Scorable`, an abstract class that scorers can extend, with methods `docId()` and `score()`. @@ -981,10 +984,10 @@ removed in favour of the newly introduced `search(LeafReaderContextPartition[] p ### Indexing vectors with 8 bit scalar quantization is no longer supported but 7 and 4 bit quantization still work (GITHUB#13519) 8 bit scalar vector quantization is no longer supported: it was buggy -starting in 9.11 (GITHUB#13197). 4 and 7 bit quantization are still -supported. Existing (9.11) Lucene indices that previously used 8 bit +starting in 9.11 (GITHUB#13197). 4 and 7 bit quantization are still +supported. Existing (9.11) Lucene indices that previously used 8 bit quantization can still be read/searched but the results from -`KNN*VectorQuery` are silently buggy. 
Further 8 bit quantized vector +`KNN*VectorQuery` are silently buggy. Further 8 bit quantized vector indexing into such (9.11) indices is not permitted, so your path forward if you wish to continue using the same 9.11 index is to index additional vectors into the same field with either 4 or 7 bit diff --git a/lucene/SYSTEM_REQUIREMENTS.md b/lucene/SYSTEM_REQUIREMENTS.md index ec9025a4dd40..8638d1f9e9f1 100644 --- a/lucene/SYSTEM_REQUIREMENTS.md +++ b/lucene/SYSTEM_REQUIREMENTS.md @@ -21,7 +21,7 @@ Apache Lucene runs on Java 25 or greater. It is also recommended to always use the latest update version of your Java VM, because bugs may affect Lucene. An overview of known JVM bugs -can be found on https://cwiki.apache.org/confluence/display/LUCENE/JavaBugs +can be found on <https://cwiki.apache.org/confluence/display/LUCENE/JavaBugs> With all Java versions it is strongly recommended to not use experimental `-XX` JVM options. diff --git a/lucene/backward-codecs/README.md b/lucene/backward-codecs/README.md index 8e4b2c613070..701906a3f2b7 100644 --- a/lucene/backward-codecs/README.md +++ b/lucene/backward-codecs/README.md @@ -34,6 +34,7 @@ we create fresh copies of the codec and format, and move the existing ones into backwards-codecs. Older codecs are tested in two ways: + * Through unit tests like TestLucene80NormsFormat, which checks we can write then read data using each old format * Through TestBackwardsCompatibility, which loads indices created in previous diff --git a/lucene/distribution/src/binary-release/README.md b/lucene/distribution/src/binary-release/README.md index 24f6a04ab237..ab16d58ed353 100644 --- a/lucene/distribution/src/binary-release/README.md +++ b/lucene/distribution/src/binary-release/README.md @@ -23,9 +23,9 @@ This is a binary distribution of Lucene. Lucene is a Java full-text search engine. Lucene is not a complete application, but rather a code library and an API that can easily be used to add search capabilities to applications. 
-* The Lucene web site is at: https://lucene.apache.org/ +* The Lucene web site is at: <https://lucene.apache.org/> * Please join the Lucene-User mailing list by sending a message to: -java-user-subscribe@lucene.apache.org +<java-user-subscribe@lucene.apache.org> ## Files in this binary distribution @@ -42,8 +42,7 @@ Third-party licenses and notice files. Please note that this package does not include all the binary dependencies of all Lucene modules. Up-to-date dependency information for each Lucene -module is published to Maven central (as Maven POMs). +module is published to Maven central (as Maven POMs). To review the documentation, read the main documentation page, located at: `docs/index.html` - diff --git a/lucene/luke/README.md b/lucene/luke/README.md index b2e6a88369ed..64e839e9a349 100644 --- a/lucene/luke/README.md +++ b/lucene/luke/README.md @@ -21,4 +21,4 @@ Integrated desktop GUI tool: a utility for browsing, searching and maintaining i ## Older releases -Older releases of Luke (prior to 8.1) can be found at https://github.com/DmitryKey/luke +Older releases of Luke (prior to 8.1) can be found at <https://github.com/DmitryKey/luke> diff --git a/lucene/luke/src/distribution/README.md b/lucene/luke/src/distribution/README.md index e3ae680c3bce..81400a220487 100644 --- a/lucene/luke/src/distribution/README.md +++ b/lucene/luke/src/distribution/README.md @@ -20,14 +20,11 @@ This is Luke, Apache Lucene low-level index inspection and repair utility. Luke requires Java ${required.java.version}. You can start it with: -``` -java -jar ${luke.cmd} -``` + + java -jar ${luke.cmd} or, using Java modules: -``` -java --module-path . --add-modules jdk.unsupported --module org.apache.lucene.luke -``` + java --module-path . --add-modules jdk.unsupported --module org.apache.lucene.luke Happy index hacking!