diff --git a/CHANGELOG.md b/CHANGELOG.md index fb6dd293b9f36..276ba96567449 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - WLM group custom search settings - groundwork and timeout ([#20536](https://github.com/opensearch-project/OpenSearch/issues/20536)) - Expose JVM runtime metrics via telemetry framework ([#20844](https://github.com/opensearch-project/OpenSearch/pull/20844)) - Add intra segment support for single-value metric aggregations ([#20503](https://github.com/opensearch-project/OpenSearch/pull/20503)) +- Add ref_path support for package-based hunspell dictionary loading ([#20840](https://github.com/opensearch-project/OpenSearch/pull/20840)) - Add support for enabling pluggable data formats, starting with phase-1 of decoupling shard from engine, and introducing basic abstractions ([#20675](https://github.com/opensearch-project/OpenSearch/pull/20675)) ### Changed diff --git a/server/src/main/java/org/opensearch/index/analysis/HunspellTokenFilterFactory.java b/server/src/main/java/org/opensearch/index/analysis/HunspellTokenFilterFactory.java index dcfb77a90481d..3834c99886a86 100644 --- a/server/src/main/java/org/opensearch/index/analysis/HunspellTokenFilterFactory.java +++ b/server/src/main/java/org/opensearch/index/analysis/HunspellTokenFilterFactory.java @@ -39,10 +39,34 @@ import org.opensearch.indices.analysis.HunspellService; import java.util.Locale; +import java.util.regex.Pattern; /** * The token filter factory for the hunspell analyzer * + * The dictionary is loaded from either: + * + * + *

Usage Examples:

+ * <pre>
+ * // Traditional locale-based (loads from config/hunspell/en_US/)
+ * {
+ *   "type": "hunspell",
+ *   "locale": "en_US"
+ * }
+ *
+ * // Package-based (loads from config/analyzers/pkg-1234/hunspell/en_US/)
+ * {
+ *   "type": "hunspell",
+ *   "ref_path": "pkg-1234",
+ *   "locale": "en_US"
+ * }
+ * </pre>
+ * + * * @opensearch.internal */ public class HunspellTokenFilterFactory extends AbstractTokenFilterFactory { @@ -54,14 +78,32 @@ public class HunspellTokenFilterFactory extends AbstractTokenFilterFactory { public HunspellTokenFilterFactory(IndexSettings indexSettings, String name, Settings settings, HunspellService hunspellService) { super(indexSettings, name, settings); + // Get both ref_path and locale parameters + String refPath = settings.get("ref_path"); // Package ID only (optional) String locale = settings.get("locale", settings.get("language", settings.get("lang", null))); - if (locale == null) { - throw new IllegalArgumentException("missing [locale | language | lang] configuration for hunspell token filter"); - } - dictionary = hunspellService.getDictionary(locale); - if (dictionary == null) { - throw new IllegalArgumentException(String.format(Locale.ROOT, "Unknown hunspell dictionary for locale [%s]", locale)); + if (refPath != null) { + // Package-based loading: ref_path (package ID) + locale (required) + if (locale == null) { + throw new IllegalArgumentException("When using ref_path, the 'locale' parameter is required for hunspell token filter"); + } + + // Validate ref_path and locale are safe package/locale identifiers + validatePackageIdentifier(refPath, "ref_path"); + validatePackageIdentifier(locale, "locale"); + + // Load from package directory: config/analyzers/{ref_path}/hunspell/{locale}/ + dictionary = hunspellService.getDictionaryFromPackage(refPath, locale); + } else if (locale != null) { + // Traditional locale-based loading (backward compatible) + // Loads from config/hunspell/{locale}/ + // Validate locale to prevent path traversal and cache key ambiguity + validatePackageIdentifier(locale, "locale"); + dictionary = hunspellService.getDictionary(locale); + } else { + throw new IllegalArgumentException( + "The 'locale' parameter is required for hunspell token filter. Set it to the hunspell dictionary locale (e.g., 'en_US')." 
+ ); } dedup = settings.getAsBoolean("dedup", true); @@ -81,4 +123,38 @@ public boolean longestOnly() { return longestOnly; } + /** + * Allowlist pattern for safe package identifiers and locales. + * Permits only alphanumeric characters, hyphens, and underscores. + * Examples: "pkg-1234", "en_US", "my-package-v2", "en_US_custom" + */ + private static final Pattern SAFE_IDENTIFIER_PATTERN = Pattern.compile("^[a-zA-Z0-9][a-zA-Z0-9_-]*$|^[a-zA-Z0-9]$"); + + /** + * Validates that a package identifier or locale contains only safe characters. + * Uses an allowlist approach: only alphanumeric characters, hyphens, and underscores are permitted. + * This prevents path traversal, cache key injection, and other security issues. + * + * @param value The value to validate (package ID or locale) + * @param paramName The parameter name for error messages + * @throws IllegalArgumentException if validation fails + */ + static void validatePackageIdentifier(String value, String paramName) { + if (value == null || value.isEmpty()) { + throw new IllegalArgumentException(String.format(Locale.ROOT, "Invalid %s: value cannot be null or empty.", paramName)); + } + + if (!SAFE_IDENTIFIER_PATTERN.matcher(value).matches()) { + throw new IllegalArgumentException( + String.format( + Locale.ROOT, + "Invalid %s: [%s]. 
Only alphanumeric characters, hyphens, and underscores are allowed.", + paramName, + value + ) + ); + } + + } + } diff --git a/server/src/main/java/org/opensearch/indices/analysis/HunspellService.java b/server/src/main/java/org/opensearch/indices/analysis/HunspellService.java index 027cd502da1fb..cafb03767f3be 100644 --- a/server/src/main/java/org/opensearch/indices/analysis/HunspellService.java +++ b/server/src/main/java/org/opensearch/indices/analysis/HunspellService.java @@ -42,6 +42,7 @@ import org.opensearch.common.settings.Setting.Property; import org.opensearch.common.settings.Settings; import org.opensearch.common.util.io.IOUtils; +import org.opensearch.core.common.Strings; import org.opensearch.core.util.FileSystemUtils; import org.opensearch.env.Environment; @@ -59,32 +60,32 @@ import java.util.function.Function; /** - * Serves as a node level registry for hunspell dictionaries. This services expects all dictionaries to be located under - * the {@code /hunspell} directory, where each locale has its dedicated sub-directory which holds the dictionary - * files. For example, the dictionary files for {@code en_US} locale must be placed under {@code /hunspell/en_US} - * directory. - *

- * The following settings can be set for each dictionary: + * Serves as a node level registry for hunspell dictionaries. This service supports loading dictionaries from: + *

+ * + *

Cache Key Strategy:

+ * + * + *

The following settings can be set for each dictionary: *

- *

- * These settings can either be configured as node level configuration, such as: - *

+ * + *

These settings can either be configured as node level configuration, such as: *


  *     indices.analysis.hunspell.dictionary.en_US.ignore_case: true
  *     indices.analysis.hunspell.dictionary.en_US.strict_affix_parsing: false
  * 
- *

- * or, as dedicated configuration per dictionary, placed in a {@code settings.yml} file under the dictionary directory. For - * example, the following can be the content of the {@code <path.conf>/hunspell/en_US/settings.yml} file: - * 

- *


- *     ignore_case: true
- *     strict_affix_parsing: false
- * 
+ * + *

or, as dedicated configuration per dictionary, placed in a {@code settings.yml} file under the dictionary directory. * * @see org.opensearch.index.analysis.HunspellTokenFilterFactory * @@ -112,16 +113,18 @@ public class HunspellService { private final Map knownDictionaries; private final boolean defaultIgnoreCase; private final Path hunspellDir; + private final Environment env; private final Function loadingFunction; public HunspellService(final Settings settings, final Environment env, final Map knownDictionaries) throws IOException { this.knownDictionaries = Collections.unmodifiableMap(knownDictionaries); + this.env = env; this.hunspellDir = resolveHunspellDirectory(env); this.defaultIgnoreCase = HUNSPELL_IGNORE_CASE.get(settings); this.loadingFunction = (locale) -> { try { - return loadDictionary(locale, settings, env); + return loadDictionary(locale, settings, env, hunspellDir); } catch (Exception e) { logger.error("Failed to load hunspell dictionary for locale: " + locale, e); throw new IllegalStateException("Failed to load hunspell dictionary for locale: " + locale); @@ -135,8 +138,10 @@ public HunspellService(final Settings settings, final Environment env, final Map /** * Returns the hunspell dictionary for the given locale. + * Loads from traditional location: config/hunspell/{locale}/ * - * @param locale The name of the locale + * @param locale The name of the locale (e.g., "en_US") + * @return The loaded Dictionary */ public Dictionary getDictionary(String locale) { Dictionary dictionary = knownDictionaries.get(locale); @@ -146,6 +151,141 @@ public Dictionary getDictionary(String locale) { return dictionary; } + /** + * Returns the hunspell dictionary from a package directory. + * Loads from package location: config/analyzers/{packageId}/hunspell/{locale}/ + * + *

Cache key format: "{packageId}:{locale}" (e.g., "pkg-1234:en_US") + * + * @param packageId The package ID (e.g., "pkg-1234") + * @param locale The locale (e.g., "en_US") + * @return The loaded Dictionary + * @throws IllegalArgumentException if packageId or locale is null + * @throws IllegalStateException if hunspell directory not found or dictionary cannot be loaded + */ + public Dictionary getDictionaryFromPackage(String packageId, String locale) { + if (Strings.isNullOrEmpty(packageId)) { + throw new IllegalArgumentException("packageId cannot be null or empty"); + } + if (Strings.isNullOrEmpty(locale)) { + throw new IllegalArgumentException("locale cannot be null or empty"); + } + + String cacheKey = buildPackageCacheKey(packageId, locale); + + return dictionaries.computeIfAbsent(cacheKey, (key) -> { + try { + return loadDictionaryFromPackage(packageId, locale); + } catch (Exception e) { + + throw new IllegalStateException( + String.format(Locale.ROOT, "Failed to load hunspell dictionary for package [%s] locale [%s]", packageId, locale), + e + ); + } + }); + } + + /** + * Loads a hunspell dictionary from a package directory. + * Expects hunspell files at: config/analyzers/{packageId}/hunspell/{locale}/ + * + * @param packageId The package identifier + * @param locale The locale (e.g., "en_US") + * @return The loaded Dictionary + * @throws Exception if loading fails + */ + private Dictionary loadDictionaryFromPackage(String packageId, String locale) throws Exception { + // Validate raw inputs before path resolution (defense-in-depth, caller should also validate) + if (packageId.contains("/") || packageId.contains("\\") || packageId.contains("..")) { + throw new IllegalArgumentException( + String.format(Locale.ROOT, "Invalid package ID: [%s]. Must not contain path separators or '..' 
sequences.", packageId) + ); + } + if (locale.contains("/") || locale.contains("\\") || locale.contains("..")) { + throw new IllegalArgumentException( + String.format(Locale.ROOT, "Invalid locale: [%s]. Must not contain path separators or '..' sequences.", locale) + ); + } + + // Resolve analyzers base directory: config/analyzers/ + Path analyzersBaseDir = env.configDir().resolve("analyzers"); + + // Resolve package directory: config/analyzers/{packageId}/ + Path packageDir = analyzersBaseDir.resolve(packageId); + + // Security check: ensure path stays under config/analyzers/ (prevent path traversal attacks) + // Both paths must be converted to absolute and normalized before comparison + // Defense-in-depth: raw input validation above should prevent this, but we verify + // the resolved path as a secondary safeguard against any future code path changes + Path analyzersBaseDirAbsolute = analyzersBaseDir.toAbsolutePath().normalize(); + Path packageDirAbsolute = packageDir.toAbsolutePath().normalize(); + if (!packageDirAbsolute.startsWith(analyzersBaseDirAbsolute)) { + throw new IllegalArgumentException( + String.format(Locale.ROOT, "Package path must be under config/analyzers directory. Package: [%s]", packageId) + ); + } + + // Additional check: ensure the resolved package directory is exactly one level under analyzers/ + // This prevents packageId=".." or "foo/../bar" from escaping + if (!packageDirAbsolute.getParent().equals(analyzersBaseDirAbsolute)) { + throw new IllegalArgumentException( + String.format(Locale.ROOT, "Invalid package ID: [%s]. Package ID cannot contain path traversal sequences.", packageId) + ); + } + + // Check if package directory exists + if (!Files.isDirectory(packageDir)) { + throw new OpenSearchException( + String.format(Locale.ROOT, "Package directory not found: [%s]. 
Expected at: %s", packageId, packageDir) + ); + } + + // Auto-detect hunspell directory within package + Path packageHunspellDir = packageDir.resolve("hunspell"); + if (!Files.isDirectory(packageHunspellDir)) { + throw new OpenSearchException( + String.format( + Locale.ROOT, + "Hunspell directory not found in package [%s]. " + "Expected 'hunspell' subdirectory at: %s", + packageId, + packageHunspellDir + ) + ); + } + + // Resolve locale directory within hunspell + Path dicDir = packageHunspellDir.resolve(locale); + + // Security check: ensure locale path doesn't escape hunspell directory (prevent path traversal) + Path hunspellDirAbsolute = packageHunspellDir.toAbsolutePath().normalize(); + Path dicDirAbsolute = dicDir.toAbsolutePath().normalize(); + if (!dicDirAbsolute.startsWith(hunspellDirAbsolute)) { + throw new IllegalArgumentException( + String.format(Locale.ROOT, "Locale path must be under hunspell directory. Locale: [%s]", locale) + ); + } + + if (logger.isDebugEnabled()) { + logger.debug("Loading hunspell dictionary from package [{}] locale [{}] at [{}]...", packageId, locale, dicDirAbsolute); + } + + if (!FileSystemUtils.isAccessibleDirectory(dicDir, logger)) { + throw new OpenSearchException( + String.format( + Locale.ROOT, + "Locale [%s] not found in package [%s]. " + "Expected directory at: %s", + locale, + packageId, + dicDirAbsolute + ) + ); + } + + // Delegate to loadDictionary with the package's hunspell directory as base + return loadDictionary(locale, Settings.EMPTY, env, packageHunspellDir); + } + private Path resolveHunspellDirectory(Environment env) { return env.configDir().resolve("hunspell"); } @@ -179,29 +319,33 @@ private void scanAndLoadDictionaries() throws IOException { } /** - * Loads the hunspell dictionary for the given local. + * Loads a hunspell dictionary from a base directory by resolving the locale subdirectory, + * finding .aff and .dic files, and creating the Dictionary object. 
+ * Used by both traditional locale-based loading (baseDir=hunspellDir) and + * package-based loading (baseDir=packageHunspellDir). * - * @param locale The locale of the hunspell dictionary to be loaded. - * @param nodeSettings The node level settings - * @param env The node environment (from which the conf path will be resolved) + * @param locale The locale of the hunspell dictionary to be loaded + * @param nodeSettings The node level settings (pass Settings.EMPTY for package-based loading) + * @param env The node environment + * @param baseDir The base directory containing locale subdirectories with .aff/.dic files * @return The loaded Hunspell dictionary - * @throws Exception when loading fails (due to IO errors or malformed dictionary files) + * @throws Exception when loading fails */ - private Dictionary loadDictionary(String locale, Settings nodeSettings, Environment env) throws Exception { + private Dictionary loadDictionary(String locale, Settings nodeSettings, Environment env, Path baseDir) throws Exception { if (logger.isDebugEnabled()) { - logger.debug("Loading hunspell dictionary [{}]...", locale); + logger.debug("Loading hunspell dictionary [{}] from [{}]...", locale, baseDir); } - Path dicDir = hunspellDir.resolve(locale); + Path dicDir = baseDir.resolve(locale); if (FileSystemUtils.isAccessibleDirectory(dicDir, logger) == false) { throw new OpenSearchException(String.format(Locale.ROOT, "Could not find hunspell dictionary [%s]", locale)); } - // merging node settings with hunspell dictionary specific settings + // Merge node settings with hunspell dictionary specific settings Settings dictSettings = HUNSPELL_DICTIONARY_OPTIONS.get(nodeSettings); nodeSettings = loadDictionarySettings(dicDir, dictSettings.getByPrefix(locale + ".")); - boolean ignoreCase = nodeSettings.getAsBoolean("ignore_case", defaultIgnoreCase); + // Find and validate affix files Path[] affixFiles = FileSystemUtils.files(dicDir, "*.aff"); if (affixFiles.length == 0) { throw new 
OpenSearchException(String.format(Locale.ROOT, "Missing affix file for hunspell dictionary [%s]", locale)); @@ -209,22 +353,20 @@ private Dictionary loadDictionary(String locale, Settings nodeSettings, Environm if (affixFiles.length != 1) { throw new OpenSearchException(String.format(Locale.ROOT, "Too many affix files exist for hunspell dictionary [%s]", locale)); } - InputStream affixStream = null; + // Load dictionary files and create Dictionary object Path[] dicFiles = FileSystemUtils.files(dicDir, "*.dic"); List dicStreams = new ArrayList<>(dicFiles.length); + InputStream affixStream = null; try { - - for (int i = 0; i < dicFiles.length; i++) { - dicStreams.add(Files.newInputStream(dicFiles[i])); + for (Path dicFile : dicFiles) { + dicStreams.add(Files.newInputStream(dicFile)); } - affixStream = Files.newInputStream(affixFiles[0]); try (Directory tmp = new NIOFSDirectory(env.tmpDir())) { return new Dictionary(tmp, "hunspell", affixStream, dicStreams, ignoreCase); } - } catch (Exception e) { logger.error(() -> new ParameterizedMessage("Could not load hunspell dictionary [{}]", locale), e); throw e; @@ -255,4 +397,17 @@ private static Settings loadDictionarySettings(Path dir, Settings defaults) thro return defaults; } + + /** + * Builds the cache key for a package-based dictionary. 
+ * Format: "{packageId}:{locale}" (e.g., "pkg-1234:en_US") + * + * @param packageId The package ID + * @param locale The locale + * @return The cache key + */ + public static String buildPackageCacheKey(String packageId, String locale) { + return packageId + ":" + locale; + } + } diff --git a/server/src/test/java/org/opensearch/index/analysis/HunspellTokenFilterFactoryTests.java b/server/src/test/java/org/opensearch/index/analysis/HunspellTokenFilterFactoryTests.java index 665235b01b88f..7878bc72b6d2a 100644 --- a/server/src/test/java/org/opensearch/index/analysis/HunspellTokenFilterFactoryTests.java +++ b/server/src/test/java/org/opensearch/index/analysis/HunspellTokenFilterFactoryTests.java @@ -37,10 +37,12 @@ import java.io.IOException; +import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.instanceOf; import static org.hamcrest.Matchers.is; public class HunspellTokenFilterFactoryTests extends OpenSearchTestCase { + public void testDedup() throws IOException { Settings settings = Settings.builder() .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) @@ -67,4 +69,233 @@ public void testDedup() throws IOException { hunspellTokenFilter = (HunspellTokenFilterFactory) tokenFilter; assertThat(hunspellTokenFilter.dedup(), is(false)); } + + /** + * Test dedup and longestOnly settings work with ref_path. 
+ */ + public void testRefPathWithDedupAndLongestOnly() throws IOException { + Settings settings = Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_hunspell.type", "hunspell") + .put("index.analysis.filter.my_hunspell.ref_path", "test-pkg") + .put("index.analysis.filter.my_hunspell.locale", "en_US") + .put("index.analysis.filter.my_hunspell.dedup", false) + .put("index.analysis.filter.my_hunspell.longest_only", true) + .build(); + + TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, getDataPath("/indices/analyze/conf_dir")); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_hunspell"); + assertThat(tokenFilter, instanceOf(HunspellTokenFilterFactory.class)); + HunspellTokenFilterFactory hunspellTokenFilter = (HunspellTokenFilterFactory) tokenFilter; + + assertThat(hunspellTokenFilter.dedup(), is(false)); + assertThat(hunspellTokenFilter.longestOnly(), is(true)); + } + + /** + * Test traditional locale-only loading still works (backward compatibility). + */ + public void testTraditionalLocaleOnlyLoadingStillWorks() throws IOException { + Settings settings = Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_hunspell.type", "hunspell") + .put("index.analysis.filter.my_hunspell.locale", "en_US") + // No ref_path - should load from config/hunspell/en_US/ + .build(); + + TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, getDataPath("/indices/analyze/conf_dir")); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_hunspell"); + assertThat(tokenFilter, instanceOf(HunspellTokenFilterFactory.class)); + } + + /** + * Test that missing both ref_path and locale throws exception. 
+ */ + public void testMissingBothRefPathAndLocaleThrowsException() throws IOException { + Settings settings = Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_hunspell.type", "hunspell") + .build(); + + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings, getDataPath("/indices/analyze/conf_dir")) + ); + assertThat(e.getMessage(), containsString("locale")); + } + + /** + * Test validatePackageIdentifier accepts valid identifiers. + */ + public void testValidatePackageIdentifierAcceptsValid() { + // These should not throw + HunspellTokenFilterFactory.validatePackageIdentifier("pkg-1234", "ref_path"); + HunspellTokenFilterFactory.validatePackageIdentifier("en_US", "locale"); + HunspellTokenFilterFactory.validatePackageIdentifier("my-package-v2", "ref_path"); + HunspellTokenFilterFactory.validatePackageIdentifier("en_US_custom", "locale"); + HunspellTokenFilterFactory.validatePackageIdentifier("a", "ref_path"); // single char + HunspellTokenFilterFactory.validatePackageIdentifier("AB", "ref_path"); // two chars + } + + /** + * Test validatePackageIdentifier rejects null. + */ + public void testValidatePackageIdentifierRejectsNull() { + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> HunspellTokenFilterFactory.validatePackageIdentifier(null, "ref_path") + ); + assertThat(e.getMessage(), containsString("null or empty")); + } + + /** + * Test validatePackageIdentifier rejects empty string. + */ + public void testValidatePackageIdentifierRejectsEmpty() { + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> HunspellTokenFilterFactory.validatePackageIdentifier("", "ref_path") + ); + assertThat(e.getMessage(), containsString("null or empty")); + } + + /** + * Test validatePackageIdentifier rejects slash. 
+ */ + public void testValidatePackageIdentifierRejectsSlash() { + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> HunspellTokenFilterFactory.validatePackageIdentifier("foo/bar", "ref_path") + ); + assertThat(e.getMessage(), containsString("Only alphanumeric")); + } + + /** + * Test validatePackageIdentifier rejects backslash. + */ + public void testValidatePackageIdentifierRejectsBackslash() { + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> HunspellTokenFilterFactory.validatePackageIdentifier("foo\\bar", "ref_path") + ); + assertThat(e.getMessage(), containsString("Only alphanumeric")); + } + + /** + * Test validatePackageIdentifier rejects colon (cache key separator). + */ + public void testValidatePackageIdentifierRejectsColon() { + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> HunspellTokenFilterFactory.validatePackageIdentifier("pkg:inject", "ref_path") + ); + assertThat(e.getMessage(), containsString("Only alphanumeric")); + } + + /** + * Test validatePackageIdentifier rejects dots. + */ + public void testValidatePackageIdentifierRejectsDots() { + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> HunspellTokenFilterFactory.validatePackageIdentifier("pkg.v1", "ref_path") + ); + assertThat(e.getMessage(), containsString("Only alphanumeric")); + } + + /** + * Test validatePackageIdentifier rejects double dots (path traversal). + */ + public void testValidatePackageIdentifierRejectsDoubleDots() { + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> HunspellTokenFilterFactory.validatePackageIdentifier("foo..bar", "ref_path") + ); + assertThat(e.getMessage(), containsString("Only alphanumeric")); + } + + /** + * Test validatePackageIdentifier rejects ".." (pure path traversal). 
+ */ + public void testValidatePackageIdentifierRejectsPureDotDot() { + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> HunspellTokenFilterFactory.validatePackageIdentifier("..", "ref_path") + ); + assertThat(e.getMessage(), containsString("Only alphanumeric")); + } + + /** + * Test validatePackageIdentifier rejects spaces. + */ + public void testValidatePackageIdentifierRejectsSpaces() { + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> HunspellTokenFilterFactory.validatePackageIdentifier("my package", "ref_path") + ); + assertThat(e.getMessage(), containsString("Only alphanumeric")); + } + + /** + * Test validatePackageIdentifier rejects special characters. + */ + public void testValidatePackageIdentifierRejectsSpecialChars() { + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> HunspellTokenFilterFactory.validatePackageIdentifier("pkg@v1", "ref_path") + ); + assertThat(e.getMessage(), containsString("Only alphanumeric")); + } + + /** + * Test that create() method produces a valid HunspellStemFilter token stream. 
+ */ + public void testCreateProducesTokenStream() throws IOException { + Settings settings = Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_hunspell.type", "hunspell") + .put("index.analysis.filter.my_hunspell.ref_path", "test-pkg") + .put("index.analysis.filter.my_hunspell.locale", "en_US") + .build(); + + TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, getDataPath("/indices/analyze/conf_dir")); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_hunspell"); + assertThat(tokenFilter, instanceOf(HunspellTokenFilterFactory.class)); + + // Call create() to cover the HunspellStemFilter creation line + org.apache.lucene.analysis.TokenStream ts = tokenFilter.create(new org.apache.lucene.tests.analysis.CannedTokenStream()); + assertNotNull(ts); + } + + /** + * Test that traditional locale create() method also works. + */ + public void testCreateWithTraditionalLocale() throws IOException { + Settings settings = Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_hunspell.type", "hunspell") + .put("index.analysis.filter.my_hunspell.locale", "en_US") + .build(); + + TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, getDataPath("/indices/analyze/conf_dir")); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_hunspell"); + + org.apache.lucene.analysis.TokenStream ts = tokenFilter.create(new org.apache.lucene.tests.analysis.CannedTokenStream()); + assertNotNull(ts); + } + + /** + * Test that 'language' alias works for locale parameter (backward compatibility). 
+ */ + public void testLanguageAliasForLocale() throws IOException { + Settings settings = Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_hunspell.type", "hunspell") + .put("index.analysis.filter.my_hunspell.language", "en_US") + .build(); + + TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, getDataPath("/indices/analyze/conf_dir")); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_hunspell"); + assertThat(tokenFilter, instanceOf(HunspellTokenFilterFactory.class)); + } } diff --git a/server/src/test/java/org/opensearch/indices/analyze/HunspellServiceTests.java b/server/src/test/java/org/opensearch/indices/analyze/HunspellServiceTests.java index f66045898f4a3..12149661b278f 100644 --- a/server/src/test/java/org/opensearch/indices/analyze/HunspellServiceTests.java +++ b/server/src/test/java/org/opensearch/indices/analyze/HunspellServiceTests.java @@ -106,4 +106,285 @@ public void testDicWithTwoAffs() { assertEquals("Failed to load hunspell dictionary for locale: en_US", e.getMessage()); assertNull(e.getCause()); } + + // ========== REF_PATH (Package-based Dictionary) TESTS ========== + + public void testGetDictionaryFromPackage() throws Exception { + Path tempDir = createTempDir(); + // Create package directory structure: config/analyzers/pkg-1234/hunspell/en_US/ + Path packageDir = tempDir.resolve("config").resolve("analyzers").resolve("pkg-1234").resolve("hunspell").resolve("en_US"); + java.nio.file.Files.createDirectories(packageDir); + + // Create minimal hunspell files + createHunspellFiles(packageDir, "en_US"); + + Settings settings = Settings.builder() + .put(HUNSPELL_LAZY_LOAD.getKey(), randomBoolean()) + .put(Environment.PATH_HOME_SETTING.getKey(), tempDir) + .build(); + + Environment environment = new Environment(settings, tempDir.resolve("config")); + HunspellService hunspellService = new HunspellService(settings, environment, 
emptyMap()); + + // Test getDictionaryFromPackage + Dictionary dictionary = hunspellService.getDictionaryFromPackage("pkg-1234", "en_US"); + assertThat(dictionary, notNullValue()); + } + + public void testGetDictionaryFromPackageCaching() throws Exception { + Path tempDir = createTempDir(); + Path packageDir = tempDir.resolve("config").resolve("analyzers").resolve("pkg-1234").resolve("hunspell").resolve("en_US"); + java.nio.file.Files.createDirectories(packageDir); + createHunspellFiles(packageDir, "en_US"); + + Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), tempDir).build(); + + Environment environment = new Environment(settings, tempDir.resolve("config")); + HunspellService hunspellService = new HunspellService(settings, environment, emptyMap()); + + // First call - loads from disk + Dictionary dict1 = hunspellService.getDictionaryFromPackage("pkg-1234", "en_US"); + assertThat(dict1, notNullValue()); + + // Second call - should return cached instance + Dictionary dict2 = hunspellService.getDictionaryFromPackage("pkg-1234", "en_US"); + assertSame("Should return same cached instance", dict1, dict2); + } + + public void testMultiplePackagesCaching() throws Exception { + Path tempDir = createTempDir(); + + // Create two different package directories + Path pkg1Dir = tempDir.resolve("config").resolve("analyzers").resolve("pkg-1234").resolve("hunspell").resolve("en_US"); + Path pkg2Dir = tempDir.resolve("config").resolve("analyzers").resolve("pkg-5678").resolve("hunspell").resolve("en_US"); + java.nio.file.Files.createDirectories(pkg1Dir); + java.nio.file.Files.createDirectories(pkg2Dir); + createHunspellFiles(pkg1Dir, "en_US"); + createHunspellFiles(pkg2Dir, "en_US"); + + Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), tempDir).build(); + + Environment environment = new Environment(settings, tempDir.resolve("config")); + HunspellService hunspellService = new HunspellService(settings, environment, 
emptyMap()); + + // Load both package dictionaries + Dictionary dict1 = hunspellService.getDictionaryFromPackage("pkg-1234", "en_US"); + Dictionary dict2 = hunspellService.getDictionaryFromPackage("pkg-5678", "en_US"); + + assertThat(dict1, notNullValue()); + assertThat(dict2, notNullValue()); + assertNotSame("Different package directories should have different Dictionary instances", dict1, dict2); + + } + + public void testBuildPackageCacheKey() { + assertEquals("pkg-1234:en_US", HunspellService.buildPackageCacheKey("pkg-1234", "en_US")); + assertEquals("my-package:fr_FR", HunspellService.buildPackageCacheKey("my-package", "fr_FR")); + } + + public void testGetDictionaryFromPackageNotFound() throws Exception { + Path tempDir = createTempDir(); + // Don't create the package directory - it doesn't exist + + Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), tempDir).build(); + + Environment environment = new Environment(settings, tempDir.resolve("config")); + HunspellService hunspellService = new HunspellService(settings, environment, emptyMap()); + + IllegalStateException e = expectThrows(IllegalStateException.class, () -> { + hunspellService.getDictionaryFromPackage("nonexistent-pkg", "en_US"); + }); + assertTrue(e.getMessage().contains("Failed to load hunspell dictionary for package")); + } + + public void testMixedCacheKeysTraditionalAndPackage() throws Exception { + Path tempDir = createTempDir(); + + // Create traditional hunspell directory + Path traditionalDir = tempDir.resolve("config").resolve("hunspell").resolve("en_US"); + java.nio.file.Files.createDirectories(traditionalDir); + createHunspellFiles(traditionalDir, "en_US"); + + // Create package directory + Path packageDir = tempDir.resolve("config").resolve("analyzers").resolve("pkg-1234").resolve("hunspell").resolve("en_US"); + java.nio.file.Files.createDirectories(packageDir); + createHunspellFiles(packageDir, "en_US"); + + Settings settings = 
Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), tempDir).build(); + + Environment environment = new Environment(settings, tempDir.resolve("config")); + HunspellService hunspellService = new HunspellService(settings, environment, emptyMap()); + + // Load traditional dictionary + Dictionary traditionalDict = hunspellService.getDictionary("en_US"); + // Load package-based dictionary + Dictionary packageDict = hunspellService.getDictionaryFromPackage("pkg-1234", "en_US"); + + assertThat(traditionalDict, notNullValue()); + assertThat(packageDict, notNullValue()); + assertNotSame("Traditional and package dictionaries should be different instances", traditionalDict, packageDict); + + } + + public void testGetDictionaryFromPackageWithNullPackageId() throws Exception { + Settings settings = Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir()) + .put(HUNSPELL_LAZY_LOAD.getKey(), true) + .build(); + Environment environment = new Environment(settings, getDataPath("/indices/analyze/conf_dir")); + HunspellService hunspellService = new HunspellService(settings, environment, emptyMap()); + + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> hunspellService.getDictionaryFromPackage(null, "en_US") + ); + assertThat(e.getMessage(), org.hamcrest.Matchers.containsString("packageId")); + } + + public void testGetDictionaryFromPackageWithEmptyPackageId() throws Exception { + Settings settings = Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir()) + .put(HUNSPELL_LAZY_LOAD.getKey(), true) + .build(); + Environment environment = new Environment(settings, getDataPath("/indices/analyze/conf_dir")); + HunspellService hunspellService = new HunspellService(settings, environment, emptyMap()); + + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> hunspellService.getDictionaryFromPackage("", "en_US") + ); + assertThat(e.getMessage(), 
org.hamcrest.Matchers.containsString("packageId")); + } + + public void testGetDictionaryFromPackageWithNullLocale() throws Exception { + Settings settings = Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir()) + .put(HUNSPELL_LAZY_LOAD.getKey(), true) + .build(); + Environment environment = new Environment(settings, getDataPath("/indices/analyze/conf_dir")); + HunspellService hunspellService = new HunspellService(settings, environment, emptyMap()); + + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> hunspellService.getDictionaryFromPackage("test-pkg", null) + ); + assertThat(e.getMessage(), org.hamcrest.Matchers.containsString("locale")); + } + + public void testGetDictionaryFromPackageWithEmptyLocale() throws Exception { + Settings settings = Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir()) + .put(HUNSPELL_LAZY_LOAD.getKey(), true) + .build(); + Environment environment = new Environment(settings, getDataPath("/indices/analyze/conf_dir")); + HunspellService hunspellService = new HunspellService(settings, environment, emptyMap()); + + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> hunspellService.getDictionaryFromPackage("test-pkg", "") + ); + assertThat(e.getMessage(), org.hamcrest.Matchers.containsString("locale")); + } + + public void testPackageWithMissingHunspellSubdir() throws Exception { + Path tempDir = createTempDir(); + // Create package dir WITHOUT hunspell subdirectory + Path packageDir = tempDir.resolve("config").resolve("analyzers").resolve("bad-pkg"); + java.nio.file.Files.createDirectories(packageDir); + + Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), tempDir).build(); + Environment environment = new Environment(settings, tempDir.resolve("config")); + HunspellService hunspellService = new HunspellService(settings, environment, emptyMap()); + + Exception e = 
expectThrows(Exception.class, () -> hunspellService.getDictionaryFromPackage("bad-pkg", "en_US")); + assertTrue(e.getMessage().contains("bad-pkg")); + } + + public void testPackageMissingLocaleDir() throws Exception { + Path tempDir = createTempDir(); + // Create package + hunspell dir but no locale subdir + Path hunspellDir = tempDir.resolve("config").resolve("analyzers").resolve("pkg-empty").resolve("hunspell"); + java.nio.file.Files.createDirectories(hunspellDir); + + Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), tempDir).build(); + Environment environment = new Environment(settings, tempDir.resolve("config")); + HunspellService hunspellService = new HunspellService(settings, environment, emptyMap()); + + Exception e = expectThrows(Exception.class, () -> hunspellService.getDictionaryFromPackage("pkg-empty", "en_US")); + assertTrue(e.getMessage().contains("en_US") || e.getMessage().contains("pkg-empty")); + } + + public void testPackageMissingAffFile() throws Exception { + Path tempDir = createTempDir(); + Path localeDir = tempDir.resolve("config").resolve("analyzers").resolve("pkg-noaff").resolve("hunspell").resolve("en_US"); + java.nio.file.Files.createDirectories(localeDir); + // Only create .dic, no .aff + java.nio.file.Files.write(localeDir.resolve("en_US.dic"), java.util.Arrays.asList("1", "test")); + + Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), tempDir).build(); + Environment environment = new Environment(settings, tempDir.resolve("config")); + HunspellService hunspellService = new HunspellService(settings, environment, emptyMap()); + + Exception e = expectThrows(Exception.class, () -> hunspellService.getDictionaryFromPackage("pkg-noaff", "en_US")); + assertTrue(e.getMessage().contains("affix") || e.getMessage().contains("pkg-noaff")); + } + + public void testPathTraversalInPackageId() throws Exception { + Settings settings = Settings.builder() + 
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir()) + .put(HUNSPELL_LAZY_LOAD.getKey(), true) + .build(); + Environment environment = new Environment(settings, getDataPath("/indices/analyze/conf_dir")); + HunspellService hunspellService = new HunspellService(settings, environment, emptyMap()); + + Exception e = expectThrows(Exception.class, () -> hunspellService.getDictionaryFromPackage("..", "en_US")); + assertNotNull(e); + } + + public void testPathTraversalInLocale() throws Exception { + Settings settings = Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir()) + .put(HUNSPELL_LAZY_LOAD.getKey(), true) + .build(); + Environment environment = new Environment(settings, getDataPath("/indices/analyze/conf_dir")); + HunspellService hunspellService = new HunspellService(settings, environment, emptyMap()); + + Exception e = expectThrows(Exception.class, () -> hunspellService.getDictionaryFromPackage("test-pkg", "../en_US")); + assertNotNull(e); + } + + public void testSlashInPackageId() throws Exception { + Settings settings = Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir()) + .put(HUNSPELL_LAZY_LOAD.getKey(), true) + .build(); + Environment environment = new Environment(settings, getDataPath("/indices/analyze/conf_dir")); + HunspellService hunspellService = new HunspellService(settings, environment, emptyMap()); + + Exception e = expectThrows(Exception.class, () -> hunspellService.getDictionaryFromPackage("foo/bar", "en_US")); + assertNotNull(e); + } + + public void testBackslashInLocale() throws Exception { + Settings settings = Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir()) + .put(HUNSPELL_LAZY_LOAD.getKey(), true) + .build(); + Environment environment = new Environment(settings, getDataPath("/indices/analyze/conf_dir")); + HunspellService hunspellService = new HunspellService(settings, environment, emptyMap()); + + Exception e = expectThrows(Exception.class, 
() -> hunspellService.getDictionaryFromPackage("test-pkg", "en\\US")); + assertNotNull(e); + } + + // Helper method to create minimal hunspell files for testing + private void createHunspellFiles(Path directory, String locale) throws java.io.IOException { + // Create .aff file + Path affFile = directory.resolve(locale + ".aff"); + java.nio.file.Files.write(affFile, java.util.Arrays.asList("SET UTF-8", "SFX S Y 1", "SFX S 0 s .")); + + // Create .dic file + Path dicFile = directory.resolve(locale + ".dic"); + java.nio.file.Files.write(dicFile, java.util.Arrays.asList("3", "test/S", "word/S", "hello")); + } } diff --git a/server/src/test/resources/indices/analyze/conf_dir/analyzers/test-pkg/hunspell/en_US/en_US.aff b/server/src/test/resources/indices/analyze/conf_dir/analyzers/test-pkg/hunspell/en_US/en_US.aff new file mode 100644 index 0000000000000..2ddd985437187 --- /dev/null +++ b/server/src/test/resources/indices/analyze/conf_dir/analyzers/test-pkg/hunspell/en_US/en_US.aff @@ -0,0 +1,201 @@ +SET ISO8859-1 +TRY esianrtolcdugmphbyfvkwzESIANRTOLCDUGMPHBYFVKWZ' +NOSUGGEST ! + +# ordinal numbers +COMPOUNDMIN 1 +# only in compounds: 1th, 2th, 3th +ONLYINCOMPOUND c +# compound rules: +# 1. [0-9]*1[0-9]th (10th, 11th, 12th, 56714th, etc.) +# 2. [0-9]*[02-9](1st|2nd|3rd|[4-9]th) (21st, 22nd, 123rd, 1234th, etc.) +COMPOUNDRULE 2 +COMPOUNDRULE n*1t +COMPOUNDRULE n*mp +WORDCHARS 0123456789 + +PFX A Y 1 +PFX A 0 re . + +PFX I Y 1 +PFX I 0 in . + +PFX U Y 1 +PFX U 0 un . + +PFX C Y 1 +PFX C 0 de . + +PFX E Y 1 +PFX E 0 dis . + +PFX F Y 1 +PFX F 0 con . + +PFX K Y 1 +PFX K 0 pro . + +SFX V N 2 +SFX V e ive e +SFX V 0 ive [^e] + +SFX N Y 3 +SFX N e ion e +SFX N y ication y +SFX N 0 en [^ey] + +SFX X Y 3 +SFX X e ions e +SFX X y ications y +SFX X 0 ens [^ey] + +SFX H N 2 +SFX H y ieth y +SFX H 0 th [^y] + +SFX Y Y 1 +SFX Y 0 ly . 
+ +SFX G Y 2 +SFX G e ing e +SFX G 0 ing [^e] + +SFX J Y 2 +SFX J e ings e +SFX J 0 ings [^e] + +SFX D Y 4 +SFX D 0 d e +SFX D y ied [^aeiou]y +SFX D 0 ed [^ey] +SFX D 0 ed [aeiou]y + +SFX T N 4 +SFX T 0 st e +SFX T y iest [^aeiou]y +SFX T 0 est [aeiou]y +SFX T 0 est [^ey] + +SFX R Y 4 +SFX R 0 r e +SFX R y ier [^aeiou]y +SFX R 0 er [aeiou]y +SFX R 0 er [^ey] + +SFX Z Y 4 +SFX Z 0 rs e +SFX Z y iers [^aeiou]y +SFX Z 0 ers [aeiou]y +SFX Z 0 ers [^ey] + +SFX S Y 4 +SFX S y ies [^aeiou]y +SFX S 0 s [aeiou]y +SFX S 0 es [sxzh] +SFX S 0 s [^sxzhy] + +SFX P Y 3 +SFX P y iness [^aeiou]y +SFX P 0 ness [aeiou]y +SFX P 0 ness [^y] + +SFX M Y 1 +SFX M 0 's . + +SFX B Y 3 +SFX B 0 able [^aeiou] +SFX B 0 able ee +SFX B e able [^aeiou]e + +SFX L Y 1 +SFX L 0 ment . + +REP 88 +REP a ei +REP ei a +REP a ey +REP ey a +REP ai ie +REP ie ai +REP are air +REP are ear +REP are eir +REP air are +REP air ere +REP ere air +REP ere ear +REP ere eir +REP ear are +REP ear air +REP ear ere +REP eir are +REP eir ere +REP ch te +REP te ch +REP ch ti +REP ti ch +REP ch tu +REP tu ch +REP ch s +REP s ch +REP ch k +REP k ch +REP f ph +REP ph f +REP gh f +REP f gh +REP i igh +REP igh i +REP i uy +REP uy i +REP i ee +REP ee i +REP j di +REP di j +REP j gg +REP gg j +REP j ge +REP ge j +REP s ti +REP ti s +REP s ci +REP ci s +REP k cc +REP cc k +REP k qu +REP qu k +REP kw qu +REP o eau +REP eau o +REP o ew +REP ew o +REP oo ew +REP ew oo +REP ew ui +REP ui ew +REP oo ui +REP ui oo +REP ew u +REP u ew +REP oo u +REP u oo +REP u oe +REP oe u +REP u ieu +REP ieu u +REP ue ew +REP ew ue +REP uff ough +REP oo ieu +REP ieu oo +REP ier ear +REP ear ier +REP ear air +REP air ear +REP w qu +REP qu w +REP z ss +REP ss z +REP shun tion +REP shun sion +REP shun cion diff --git a/server/src/test/resources/indices/analyze/conf_dir/analyzers/test-pkg/hunspell/en_US/en_US.dic b/server/src/test/resources/indices/analyze/conf_dir/analyzers/test-pkg/hunspell/en_US/en_US.dic new file mode 100644 index 
0000000000000..d278da593c573 --- /dev/null +++ b/server/src/test/resources/indices/analyze/conf_dir/analyzers/test-pkg/hunspell/en_US/en_US.dic @@ -0,0 +1,106 @@ +100 +test/S +word/S +hello +world/S +example/S +package/S +dictionary/S +hunspell +analysis +search/S +index/S +document/S +cluster/S +node/S +shard/S +replica/S +query/S +filter/S +token/S +analyzer/S +mapping/S +setting/S +request/S +response/S +action/S +cache/S +locale +config +plugin/S +module/S +server/S +client/S +service/S +manager/S +factory/S +handler/S +transport/S +network/S +thread/S +pool/S +memory +storage +engine/S +snapshot/S +restore +backup/S +monitor/S +metric/S +health +status +version/S +update/S +delete +create +read +write +merge +refresh +flush +commit +recover +replicate +allocate +balance +route +forward +ingest +process +transform +validate +authenticate +authorize +encrypt +decrypt +compress +decompress +serialize +deserialize +compute +execute +invoke +dispatch +publish +subscribe +notify +broadcast +stream +buffer/S +pipeline/S +workflow/S +template/S +pattern/S +schema/S +format/S +protocol/S +endpoint/S +interface/S +abstract +concrete +virtual +static +dynamic +public +private +secure \ No newline at end of file