Replace IllegalStateException with IllegalArgumentException in NoisyChannelSpellChecker, so we return a bad request instead of an internal server error

carlosdelest · carlosdelest · commit 65ac6075846a · 2025-08-01T12:34:57.000+02:00
diff --git a/server/src/internalClusterTest/java/org/elasticsearch/search/suggest/phrase/PhraseSuggesterIT.java b/server/src/internalClusterTest/java/org/elasticsearch/search/suggest/phrase/PhraseSuggesterIT.java
@@ -0,0 +1,146 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.search.suggest.phrase;
+
+import org.elasticsearch.ExceptionsHelper;
+import org.elasticsearch.action.search.SearchPhaseExecutionException;
+import org.elasticsearch.action.search.SearchRequestBuilder;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.rest.RestStatus;
+import org.elasticsearch.search.suggest.SuggestBuilder;
+import org.elasticsearch.test.ESIntegTestCase;
+import org.elasticsearch.xcontent.XContentFactory;
+
+import java.io.IOException;
+
+import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_SHARDS;
+import static org.elasticsearch.search.suggest.SuggestBuilders.phraseSuggestion;
+import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
+import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNoFailuresAndResponse;
+import static org.hamcrest.Matchers.equalTo;
+
+public class PhraseSuggesterIT extends ESIntegTestCase {
+
+    /**
+     * Reproduces the IllegalArgumentException: "At least one unigram is required but all tokens were ngrams"
+     *
+     * This happens when:
+     * 1. A phrase suggester is configured to use an analyzer that only produces n-grams (no unigrams)
+     * 2. The NoisyChannelSpellChecker is created with requireUnigram=true (which is the default)
+     * 3. The input text is analyzed and all resulting tokens are marked as n-grams
+     * 4. The NoisyChannelSpellChecker throws an IllegalArgumentException because it expects at least one unigram
+     */
+    public void testPhraseSuggestionWithNgramOnlyAnalyzerThrowsException() throws IOException {
+        createIndexAndDocs(false);
+
+        // Create a phrase suggestion that uses the ngram-only field
+        // This should trigger the IllegalArgumentException because:
+        // 1. The "text.ngrams" field uses an analyzer that only produces n-grams
+        // 2. When "hello world" is analyzed, it produces only n-grams, but no unigrams
+        // 3. The DirectCandidateGenerator.analyze() method sets anyTokens=true but anyUnigram=false
+        // 4. NoisyChannelSpellChecker.end() throws IllegalArgumentException
+        SearchRequestBuilder searchBuilder = createSuggesterSearch("text.ngrams");
+
+        // This should throw SearchPhaseExecutionException wrapping IllegalArgumentException
+        SearchPhaseExecutionException exception = expectThrows(SearchPhaseExecutionException.class, searchBuilder);
+        assertNotNull(exception.getCause());
+        assertThat(exception.status(), equalTo(RestStatus.BAD_REQUEST));
+        Throwable rootCause = ExceptionsHelper.unwrap(exception, IllegalArgumentException.class);
+
+        // ExceptionsHelper.unwrap returns null when no IllegalArgumentException is in the cause chain, so
+        // check for null explicitly: building a failure message from rootCause.getClass() would throw a
+        // NullPointerException and hide the real failure.
+        assertNotNull("Expected an IllegalArgumentException in the cause chain but got: " + exception, rootCause);
+        assertEquals("At least one unigram is required but all tokens were ngrams", rootCause.getMessage());
+    }
+
+    /** Builds a size-0 search on "test" carrying a "hello world" phrase suggestion on {@code fieldName}, with candidates from "text". */
+    private static SearchRequestBuilder createSuggesterSearch(String fieldName) {
+        DirectCandidateGeneratorBuilder generator = new DirectCandidateGeneratorBuilder("text").suggestMode("always")
+            .minWordLength(1)
+            .maxEdits(2);
+        PhraseSuggestionBuilder suggestion = phraseSuggestion(fieldName).text("hello world").addCandidateGenerator(generator);
+        return prepareSearch("test").setSize(0).suggest(new SuggestBuilder().addSuggestion("test_suggestion", suggestion));
+    }
+
+    /**
+     * Demonstrates that the same configuration works fine when using a different field that produces unigrams
+     */
+    public void testPhraseSuggestionWithUnigramFieldWorks() throws IOException {
+        createIndexAndDocs(false);
+
+        // Target the parent "text" field rather than "text.ngrams": it is mapped with the
+        // standard analyzer, which produces unigrams
+        SearchRequestBuilder unigramFieldSearch = createSuggesterSearch("text");
+
+        // The search is expected to complete without any failure
+        assertNoFailuresAndResponse(unigramFieldSearch, response -> {
+            // Only the presence of a suggest section is checked, not the suggestions themselves
+            assertNotNull(response.getSuggest());
+        });
+    }
+
+    /**
+     * Shows that the "text.ngrams" field works once its shingle filter is configured with output_unigrams=true
+     */
+    public void testPhraseSuggestionWithNgramsAndUnigramsWorks() throws IOException {
+        // Passing true sets output_unigrams=true on the shingle filter behind "text.ngrams"
+        createIndexAndDocs(true);
+
+        // Same field as the failing case, but its analyzer now produces unigrams alongside the shingles
+        SearchRequestBuilder ngramFieldSearch = createSuggesterSearch("text.ngrams");
+
+        assertNoFailuresAndResponse(ngramFieldSearch, response -> assertNotNull(response.getSuggest()));
+    }
+
+    private void createIndexAndDocs(boolean allowUnigrams) throws IOException {
+        // Create an index whose "ngram_only" shingle analyzer emits unigrams only when allowUnigrams is true
+        assertAcked(
+            prepareCreate("test").setSettings(
+                Settings.builder()
+                    .put(SETTING_NUMBER_OF_SHARDS, randomIntBetween(1, 5))
+                    .put("index.analysis.analyzer.ngram_only.tokenizer", "standard")
+                    .putList("index.analysis.analyzer.ngram_only.filter", "my_shingle", "lowercase")
+                    .put("index.analysis.filter.my_shingle.type", "shingle")
+                    .put("index.analysis.filter.my_shingle.output_unigrams", allowUnigrams)
+                    .put("index.analysis.filter.my_shingle.min_shingle_size", 2)
+                    .put("index.analysis.filter.my_shingle.max_shingle_size", 3)
+            )
+                .setMapping(
+                    XContentFactory.jsonBuilder()
+                        .startObject()
+                        .startObject("_doc")
+                        .startObject("properties")
+                        .startObject("text")
+                        .field("type", "text")
+                        .field("analyzer", "standard")
+                        .startObject("fields")
+                        .startObject("ngrams")
+                        .field("type", "text")
+                        .field("analyzer", "ngram_only") // "text.ngrams" sub-field analyzed with the shingle analyzer defined above
+                        .endObject()
+                        .endObject()
+                        .endObject()
+                        .endObject()
+                        .endObject()
+                        .endObject()
+                )
+        );
+
+        ensureGreen();
+
+        // Index three small documents into the "text" field, then refresh
+        indexDoc("test", "1", "text", "hello world test");
+        indexDoc("test", "2", "text", "another test phrase");
+        indexDoc("test", "3", "text", "some more content");
+        refresh();
+    }
+
+}
diff --git a/server/src/main/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellChecker.java b/server/src/main/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellChecker.java
@@ -87,7 +87,7 @@ public void end() {
                     candidateSetsList.add(currentSet);
                 }
                 if (requireUnigram && anyUnigram == false && anyTokens) {
-                    throw new IllegalStateException("At least one unigram is required but all tokens were ngrams");
+                    throw new IllegalArgumentException("At least one unigram is required but all tokens were ngrams");
                 }
             }
         });

Original file line number	Diff line number	Diff line change
`@@ -87,7 +87,7 @@ public void end() {`
`87`	`87`	`candidateSetsList.add(currentSet);`
`88`	`88`	`}`
`89`	`89`	`if (requireUnigram && anyUnigram == false && anyTokens) {`
`90`		`- throw new IllegalStateException("At least one unigram is required but all tokens were ngrams");`
	`90`	`+ throw new IllegalArgumentException("At least one unigram is required but all tokens were ngrams");`
`91`	`91`	`}`
`92`	`92`	`}`
`93`	`93`	`});`