Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- WLM group custom search settings - groundwork and timeout ([#20536](https://github.com/opensearch-project/OpenSearch/issues/20536))
- Expose JVM runtime metrics via telemetry framework ([#20844](https://github.com/opensearch-project/OpenSearch/pull/20844))
- Add intra segment support for single-value metric aggregations ([#20503](https://github.com/opensearch-project/OpenSearch/pull/20503))
- Add ref_path support for package-based hunspell dictionary loading ([#20840](https://github.com/opensearch-project/OpenSearch/pull/20840))
- Add support for enabling pluggable data formats, starting with phase-1 of decoupling shard from engine, and introducing basic abstractions ([#20675](https://github.com/opensearch-project/OpenSearch/pull/20675))

### Changed
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,34 @@
import org.opensearch.indices.analysis.HunspellService;

import java.util.Locale;
import java.util.regex.Pattern;

/**
* The token filter factory for the hunspell analyzer
*
* The dictionary is loaded from either:
* <ul>
* <li>A ref_path (package ID, e.g., "pkg-1234") combined with locale for package-based dictionaries</li>
* <li>A locale (e.g., "en_US") for traditional hunspell dictionaries from config/hunspell/</li>
* </ul>
*
* <h2>Usage Examples:</h2>
* <pre>
* // Traditional locale-based (loads from config/hunspell/en_US/)
* {
* "type": "hunspell",
* "locale": "en_US"
* }
*
* // Package-based (loads from config/analyzers/pkg-1234/hunspell/en_US/)
* {
* "type": "hunspell",
* "ref_path": "pkg-1234",
* "locale": "en_US"
* }
* </pre>
*
*
* @opensearch.internal
*/
public class HunspellTokenFilterFactory extends AbstractTokenFilterFactory {
Expand All @@ -54,14 +78,32 @@ public class HunspellTokenFilterFactory extends AbstractTokenFilterFactory {
public HunspellTokenFilterFactory(IndexSettings indexSettings, String name, Settings settings, HunspellService hunspellService) {
super(indexSettings, name, settings);

// Get both ref_path and locale parameters
String refPath = settings.get("ref_path"); // Package ID only (optional)
String locale = settings.get("locale", settings.get("language", settings.get("lang", null)));
if (locale == null) {
throw new IllegalArgumentException("missing [locale | language | lang] configuration for hunspell token filter");
}

dictionary = hunspellService.getDictionary(locale);
if (dictionary == null) {
throw new IllegalArgumentException(String.format(Locale.ROOT, "Unknown hunspell dictionary for locale [%s]", locale));
if (refPath != null) {
// Package-based loading: ref_path (package ID) + locale (required)
if (locale == null) {
throw new IllegalArgumentException("When using ref_path, the 'locale' parameter is required for hunspell token filter");
}

// Validate ref_path and locale are safe package/locale identifiers
validatePackageIdentifier(refPath, "ref_path");
validatePackageIdentifier(locale, "locale");

// Load from package directory: config/analyzers/{ref_path}/hunspell/{locale}/
dictionary = hunspellService.getDictionaryFromPackage(refPath, locale);
} else if (locale != null) {
// Traditional locale-based loading (backward compatible)
// Loads from config/hunspell/{locale}/
// Validate locale to prevent path traversal and cache key ambiguity
validatePackageIdentifier(locale, "locale");
dictionary = hunspellService.getDictionary(locale);
} else {
throw new IllegalArgumentException(
"The 'locale' parameter is required for hunspell token filter. Set it to the hunspell dictionary locale (e.g., 'en_US')."
);
}

dedup = settings.getAsBoolean("dedup", true);
Expand All @@ -81,4 +123,38 @@ public boolean longestOnly() {
return longestOnly;
}

/**
* Allowlist pattern for safe package identifiers and locales.
* Permits only alphanumeric characters, hyphens, and underscores.
* Examples: "pkg-1234", "en_US", "my-package-v2", "en_US_custom"
*/
private static final Pattern SAFE_IDENTIFIER_PATTERN = Pattern.compile("^[a-zA-Z0-9][a-zA-Z0-9_-]*$|^[a-zA-Z0-9]$");

/**
* Validates that a package identifier or locale contains only safe characters.
* Uses an allowlist approach: only alphanumeric characters, hyphens, and underscores are permitted.
* This prevents path traversal, cache key injection, and other security issues.
*
* @param value The value to validate (package ID or locale)
* @param paramName The parameter name for error messages
* @throws IllegalArgumentException if validation fails
*/
static void validatePackageIdentifier(String value, String paramName) {
if (value == null || value.isEmpty()) {
throw new IllegalArgumentException(String.format(Locale.ROOT, "Invalid %s: value cannot be null or empty.", paramName));
}

if (!SAFE_IDENTIFIER_PATTERN.matcher(value).matches()) {
throw new IllegalArgumentException(
String.format(
Locale.ROOT,
"Invalid %s: [%s]. Only alphanumeric characters, hyphens, and underscores are allowed.",
paramName,
value
)
);
}

}

}
Loading
Loading