Skip to content

Commit 4630c23

Browse files
committed
feat(hunspell): Add ref_path support for package-based dictionary loading
- Add ref_path parameter to HunspellTokenFilterFactory for package-based dictionaries
- Load from config/analyzers/{packageId}/hunspell/{locale}/
- Node-level cache with {packageId}:{locale} cache keys for multi-tenant isolation
- Refactor loadDictionary to accept baseDir parameter for code reuse
- Add regex allowlist validation for ref_path and locale
- Shared loadDictionaryFromDirectory for .aff/.dic file loading
- Backward compatible: traditional config/hunspell/{locale}

Signed-off-by: shayush622 <ayush5267@gmail.com>
1 parent 55c022a commit 4630c23

File tree

7 files changed

+1096
-36
lines changed

7 files changed

+1096
-36
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
2121
- WLM group custom search settings - groundwork and timeout ([#20536](https://github.com/opensearch-project/OpenSearch/issues/20536))
2222
- Expose JVM runtime metrics via telemetry framework ([#20844](https://github.com/opensearch-project/OpenSearch/pull/20844))
2323
- Add intra segment support for single-value metric aggregations ([#20503](https://github.com/opensearch-project/OpenSearch/pull/20503))
24+
- Add ref_path support for package-based hunspell dictionary loading ([#20840](https://github.com/opensearch-project/OpenSearch/pull/20840))
2425

2526
### Changed
2627
- Make telemetry `Tags` immutable ([#20788](https://github.com/opensearch-project/OpenSearch/pull/20788))

server/src/main/java/org/opensearch/index/analysis/HunspellTokenFilterFactory.java

Lines changed: 82 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,34 @@
3939
import org.opensearch.indices.analysis.HunspellService;
4040

4141
import java.util.Locale;
42+
import java.util.regex.Pattern;
4243

4344
/**
4445
* The token filter factory for the hunspell analyzer
4546
*
47+
* The dictionary is loaded from either:
48+
* <ul>
49+
* <li>A ref_path (package ID, e.g., "pkg-1234") combined with locale for package-based dictionaries</li>
50+
* <li>A locale (e.g., "en_US") for traditional hunspell dictionaries from config/hunspell/</li>
51+
* </ul>
52+
*
53+
* <h2>Usage Examples:</h2>
54+
* <pre>
55+
* // Traditional locale-based (loads from config/hunspell/en_US/)
56+
* {
57+
* "type": "hunspell",
58+
* "locale": "en_US"
59+
* }
60+
*
61+
* // Package-based (loads from config/analyzers/pkg-1234/hunspell/en_US/)
62+
* {
63+
* "type": "hunspell",
64+
* "ref_path": "pkg-1234",
65+
* "locale": "en_US"
66+
* }
67+
* </pre>
68+
*
69+
*
4670
* @opensearch.internal
4771
*/
4872
public class HunspellTokenFilterFactory extends AbstractTokenFilterFactory {
@@ -54,14 +78,32 @@ public class HunspellTokenFilterFactory extends AbstractTokenFilterFactory {
5478
public HunspellTokenFilterFactory(IndexSettings indexSettings, String name, Settings settings, HunspellService hunspellService) {
5579
super(indexSettings, name, settings);
5680

81+
// Get both ref_path and locale parameters
82+
String refPath = settings.get("ref_path"); // Package ID only (optional)
5783
String locale = settings.get("locale", settings.get("language", settings.get("lang", null)));
58-
if (locale == null) {
59-
throw new IllegalArgumentException("missing [locale | language | lang] configuration for hunspell token filter");
60-
}
6184

62-
dictionary = hunspellService.getDictionary(locale);
63-
if (dictionary == null) {
64-
throw new IllegalArgumentException(String.format(Locale.ROOT, "Unknown hunspell dictionary for locale [%s]", locale));
85+
if (refPath != null) {
86+
// Package-based loading: ref_path (package ID) + locale (required)
87+
if (locale == null) {
88+
throw new IllegalArgumentException("When using ref_path, the 'locale' parameter is required for hunspell token filter");
89+
}
90+
91+
// Validate ref_path and locale are safe package/locale identifiers
92+
validatePackageIdentifier(refPath, "ref_path");
93+
validatePackageIdentifier(locale, "locale");
94+
95+
// Load from package directory: config/analyzers/{ref_path}/hunspell/{locale}/
96+
dictionary = hunspellService.getDictionaryFromPackage(refPath, locale);
97+
} else if (locale != null) {
98+
// Traditional locale-based loading (backward compatible)
99+
// Loads from config/hunspell/{locale}/
100+
// Validate locale to prevent path traversal and cache key ambiguity
101+
validatePackageIdentifier(locale, "locale");
102+
dictionary = hunspellService.getDictionary(locale);
103+
} else {
104+
throw new IllegalArgumentException(
105+
"The 'locale' parameter is required for hunspell token filter. Set it to the hunspell dictionary locale (e.g., 'en_US')."
106+
);
65107
}
66108

67109
dedup = settings.getAsBoolean("dedup", true);
@@ -81,4 +123,38 @@ public boolean longestOnly() {
81123
return longestOnly;
82124
}
83125

126+
/**
127+
* Allowlist pattern for safe package identifiers and locales.
128+
* Permits only alphanumeric characters, hyphens, and underscores.
129+
* Examples: "pkg-1234", "en_US", "my-package-v2", "en_US_custom"
130+
*/
131+
private static final Pattern SAFE_IDENTIFIER_PATTERN = Pattern.compile("^[a-zA-Z0-9][a-zA-Z0-9_-]*$|^[a-zA-Z0-9]$");
132+
133+
/**
134+
* Validates that a package identifier or locale contains only safe characters.
135+
* Uses an allowlist approach: only alphanumeric characters, hyphens, and underscores are permitted.
136+
* This prevents path traversal, cache key injection, and other security issues.
137+
*
138+
* @param value The value to validate (package ID or locale)
139+
* @param paramName The parameter name for error messages
140+
* @throws IllegalArgumentException if validation fails
141+
*/
142+
static void validatePackageIdentifier(String value, String paramName) {
143+
if (value == null || value.isEmpty()) {
144+
throw new IllegalArgumentException(String.format(Locale.ROOT, "Invalid %s: value cannot be null or empty.", paramName));
145+
}
146+
147+
if (!SAFE_IDENTIFIER_PATTERN.matcher(value).matches()) {
148+
throw new IllegalArgumentException(
149+
String.format(
150+
Locale.ROOT,
151+
"Invalid %s: [%s]. Only alphanumeric characters, hyphens, and underscores are allowed.",
152+
paramName,
153+
value
154+
)
155+
);
156+
}
157+
158+
}
159+
84160
}

0 commit comments

Comments
 (0)