Skip to content

Commit 1513ae8

Browse files
committed
feat(hunspell): Add ref_path support for package-based dictionary loading
- Add ref_path parameter to HunspellTokenFilterFactory for package-based dictionaries
- Load from config/analyzers/{packageId}/hunspell/{locale}/
- Node-level cache with {packageId}:{locale} cache keys for multi-tenant isolation
- Refactor loadDictionary to accept baseDir parameter for code reuse
- Add regex allowlist validation for ref_path and locale
- Shared loadDictionaryFromDirectory for .aff/.dic file loading
- Backward compatible: traditional config/hunspell/{locale}

Signed-off-by: shayush622 <ayush5267@gmail.com>
1 parent da1de97 commit 1513ae8

File tree

7 files changed

+1123
-36
lines changed

7 files changed

+1123
-36
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
2121
- WLM group custom search settings - groundwork and timeout ([#20536](https://github.com/opensearch-project/OpenSearch/issues/20536))
2222
- Expose JVM runtime metrics via telemetry framework ([#20844](https://github.com/opensearch-project/OpenSearch/pull/20844))
2323
- Add intra segment support for single-value metric aggregations ([#20503](https://github.com/opensearch-project/OpenSearch/pull/20503))
24+
- Add ref_path support for package-based hunspell dictionary loading ([#20840](https://github.com/opensearch-project/OpenSearch/pull/20840))
2425

2526
### Changed
2627
- Make telemetry `Tags` immutable ([#20788](https://github.com/opensearch-project/OpenSearch/pull/20788))

server/src/main/java/org/opensearch/index/analysis/HunspellTokenFilterFactory.java

Lines changed: 97 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,34 @@
3939
import org.opensearch.indices.analysis.HunspellService;
4040

4141
import java.util.Locale;
42+
import java.util.regex.Pattern;
4243

4344
/**
4445
* The token filter factory for the hunspell analyzer
4546
*
47+
* The dictionary is loaded from either:
48+
* <ul>
49+
* <li>A ref_path (package ID, e.g., "pkg-1234") combined with locale for package-based dictionaries</li>
50+
* <li>A locale (e.g., "en_US") for traditional hunspell dictionaries from config/hunspell/</li>
51+
* </ul>
52+
*
53+
* <h2>Usage Examples:</h2>
54+
* <pre>
55+
* // Traditional locale-based (loads from config/hunspell/en_US/)
56+
* {
57+
* "type": "hunspell",
58+
* "locale": "en_US"
59+
* }
60+
*
61+
* // Package-based (loads from config/analyzers/pkg-1234/hunspell/en_US/)
62+
* {
63+
* "type": "hunspell",
64+
* "ref_path": "pkg-1234",
65+
* "locale": "en_US"
66+
* }
67+
* </pre>
68+
*
69+
*
4670
* @opensearch.internal
4771
*/
4872
public class HunspellTokenFilterFactory extends AbstractTokenFilterFactory {
@@ -54,14 +78,40 @@ public class HunspellTokenFilterFactory extends AbstractTokenFilterFactory {
5478
public HunspellTokenFilterFactory(IndexSettings indexSettings, String name, Settings settings, HunspellService hunspellService) {
5579
super(indexSettings, name, settings);
5680

81+
// Get both ref_path and locale parameters
82+
String refPath = settings.get("ref_path"); // Package ID only (optional)
5783
String locale = settings.get("locale", settings.get("language", settings.get("lang", null)));
58-
if (locale == null) {
59-
throw new IllegalArgumentException("missing [locale | language | lang] configuration for hunspell token filter");
60-
}
6184

62-
dictionary = hunspellService.getDictionary(locale);
63-
if (dictionary == null) {
64-
throw new IllegalArgumentException(String.format(Locale.ROOT, "Unknown hunspell dictionary for locale [%s]", locale));
85+
if (refPath != null) {
86+
// Package-based loading: ref_path (package ID) + locale (required)
87+
if (locale == null) {
88+
throw new IllegalArgumentException("When using ref_path, the 'locale' parameter is required for hunspell token filter");
89+
}
90+
91+
// Validate ref_path and locale are safe package/locale identifiers
92+
validatePackageIdentifier(refPath, "ref_path");
93+
validatePackageIdentifier(locale, "locale");
94+
95+
// Load from package directory: config/analyzers/{ref_path}/hunspell/{locale}/
96+
dictionary = hunspellService.getDictionaryFromPackage(refPath, locale);
97+
if (dictionary == null) {
98+
throw new IllegalArgumentException(
99+
String.format(Locale.ROOT, "Could not find hunspell dictionary for locale [%s] in package [%s]", locale, refPath)
100+
);
101+
}
102+
} else if (locale != null) {
103+
// Traditional locale-based loading (backward compatible)
104+
// Loads from config/hunspell/{locale}/
105+
// Validate locale to prevent path traversal and cache key ambiguity
106+
validatePackageIdentifier(locale, "locale");
107+
dictionary = hunspellService.getDictionary(locale);
108+
if (dictionary == null) {
109+
throw new IllegalArgumentException(String.format(Locale.ROOT, "Unknown hunspell dictionary for locale [%s]", locale));
110+
}
111+
} else {
112+
throw new IllegalArgumentException(
113+
"The 'locale' parameter is required for hunspell token filter. Set it to the hunspell dictionary locale (e.g., 'en_US')."
114+
);
65115
}
66116

67117
dedup = settings.getAsBoolean("dedup", true);
@@ -81,4 +131,45 @@ public boolean longestOnly() {
81131
return longestOnly;
82132
}
83133

134+
/**
135+
* Allowlist pattern for safe package identifiers and locales.
136+
* Permits alphanumeric characters, hyphens, underscores, and dots (but not leading/trailing dots).
137+
* Examples: "pkg-1234", "en_US", "my-package-v2", "en_US_custom"
138+
*/
139+
private static final Pattern SAFE_IDENTIFIER_PATTERN = Pattern.compile("^[a-zA-Z0-9][a-zA-Z0-9._-]*[a-zA-Z0-9]$|^[a-zA-Z0-9]$");
140+
141+
/**
142+
* Validates that a package identifier or locale contains only safe characters.
143+
* Uses an allowlist approach: only alphanumeric, hyphen, underscore, and dot (not leading/trailing) are permitted.
144+
* This prevents path traversal, cache key injection, and other security issues.
145+
*
146+
* @param value The value to validate (package ID or locale)
147+
* @param paramName The parameter name for error messages
148+
* @throws IllegalArgumentException if validation fails
149+
*/
150+
static void validatePackageIdentifier(String value, String paramName) {
151+
if (value == null || value.isEmpty()) {
152+
throw new IllegalArgumentException(String.format(Locale.ROOT, "Invalid %s: value cannot be null or empty.", paramName));
153+
}
154+
155+
if (!SAFE_IDENTIFIER_PATTERN.matcher(value).matches()) {
156+
throw new IllegalArgumentException(
157+
String.format(
158+
Locale.ROOT,
159+
"Invalid %s: [%s]. Only alphanumeric characters, hyphens, underscores, "
160+
+ "and dots (not leading/trailing) are allowed.",
161+
paramName,
162+
value
163+
)
164+
);
165+
}
166+
167+
// Additional check: reject ".." sequences even within otherwise valid characters (e.g., "foo..bar")
168+
if (value.contains("..")) {
169+
throw new IllegalArgumentException(
170+
String.format(Locale.ROOT, "Invalid %s: [%s]. Consecutive dots ('..') are not allowed.", paramName, value)
171+
);
172+
}
173+
}
174+
84175
}

0 commit comments

Comments
 (0)