3939import org .opensearch .indices .analysis .HunspellService ;
4040
4141import java .util .Locale ;
42+ import java .util .regex .Pattern ;
4243
/**
 * The token filter factory for the hunspell analyzer.
 *
 * The dictionary is loaded from either:
 * <ul>
 * <li>A ref_path (package ID, e.g., "pkg-1234") combined with a locale, for package-based dictionaries</li>
 * <li>A locale alone (e.g., "en_US"), for traditional hunspell dictionaries from config/hunspell/</li>
 * </ul>
 *
 * <h2>Usage examples</h2>
 * <pre>
 * // Traditional locale-based (loads from config/hunspell/en_US/)
 * {
 *   "type": "hunspell",
 *   "locale": "en_US"
 * }
 *
 * // Package-based (loads from config/analyzers/pkg-1234/hunspell/en_US/)
 * {
 *   "type": "hunspell",
 *   "ref_path": "pkg-1234",
 *   "locale": "en_US"
 * }
 * </pre>
 *
 * @opensearch.internal
 */
4872public class HunspellTokenFilterFactory extends AbstractTokenFilterFactory {
@@ -54,14 +78,40 @@ public class HunspellTokenFilterFactory extends AbstractTokenFilterFactory {
5478 public HunspellTokenFilterFactory (IndexSettings indexSettings , String name , Settings settings , HunspellService hunspellService ) {
5579 super (indexSettings , name , settings );
5680
81+ // Get both ref_path and locale parameters
82+ String refPath = settings .get ("ref_path" ); // Package ID only (optional)
5783 String locale = settings .get ("locale" , settings .get ("language" , settings .get ("lang" , null )));
58- if (locale == null ) {
59- throw new IllegalArgumentException ("missing [locale | language | lang] configuration for hunspell token filter" );
60- }
6184
62- dictionary = hunspellService .getDictionary (locale );
63- if (dictionary == null ) {
64- throw new IllegalArgumentException (String .format (Locale .ROOT , "Unknown hunspell dictionary for locale [%s]" , locale ));
85+ if (refPath != null ) {
86+ // Package-based loading: ref_path (package ID) + locale (required)
87+ if (locale == null ) {
88+ throw new IllegalArgumentException ("When using ref_path, the 'locale' parameter is required for hunspell token filter" );
89+ }
90+
91+ // Validate ref_path and locale are safe package/locale identifiers
92+ validatePackageIdentifier (refPath , "ref_path" );
93+ validatePackageIdentifier (locale , "locale" );
94+
95+ // Load from package directory: config/analyzers/{ref_path}/hunspell/{locale}/
96+ dictionary = hunspellService .getDictionaryFromPackage (refPath , locale );
97+ if (dictionary == null ) {
98+ throw new IllegalArgumentException (
99+ String .format (Locale .ROOT , "Could not find hunspell dictionary for locale [%s] in package [%s]" , locale , refPath )
100+ );
101+ }
102+ } else if (locale != null ) {
103+ // Traditional locale-based loading (backward compatible)
104+ // Loads from config/hunspell/{locale}/
105+ // Validate locale to prevent path traversal and cache key ambiguity
106+ validatePackageIdentifier (locale , "locale" );
107+ dictionary = hunspellService .getDictionary (locale );
108+ if (dictionary == null ) {
109+ throw new IllegalArgumentException (String .format (Locale .ROOT , "Unknown hunspell dictionary for locale [%s]" , locale ));
110+ }
111+ } else {
112+ throw new IllegalArgumentException (
113+ "The 'locale' parameter is required for hunspell token filter. Set it to the hunspell dictionary locale (e.g., 'en_US')."
114+ );
65115 }
66116
67117 dedup = settings .getAsBoolean ("dedup" , true );
@@ -81,4 +131,45 @@ public boolean longestOnly() {
81131 return longestOnly ;
82132 }
83133
134+ /**
135+ * Allowlist pattern for safe package identifiers and locales.
136+ * Permits alphanumeric characters, hyphens, underscores, and dots (but not leading/trailing dots).
137+ * Examples: "pkg-1234", "en_US", "my-package-v2", "en_US_custom"
138+ */
139+ private static final Pattern SAFE_IDENTIFIER_PATTERN = Pattern .compile ("^[a-zA-Z0-9][a-zA-Z0-9._-]*[a-zA-Z0-9]$|^[a-zA-Z0-9]$" );
140+
141+ /**
142+ * Validates that a package identifier or locale contains only safe characters.
143+ * Uses an allowlist approach: only alphanumeric, hyphen, underscore, and dot (not leading/trailing) are permitted.
144+ * This prevents path traversal, cache key injection, and other security issues.
145+ *
146+ * @param value The value to validate (package ID or locale)
147+ * @param paramName The parameter name for error messages
148+ * @throws IllegalArgumentException if validation fails
149+ */
150+ static void validatePackageIdentifier (String value , String paramName ) {
151+ if (value == null || value .isEmpty ()) {
152+ throw new IllegalArgumentException (String .format (Locale .ROOT , "Invalid %s: value cannot be null or empty." , paramName ));
153+ }
154+
155+ if (!SAFE_IDENTIFIER_PATTERN .matcher (value ).matches ()) {
156+ throw new IllegalArgumentException (
157+ String .format (
158+ Locale .ROOT ,
159+ "Invalid %s: [%s]. Only alphanumeric characters, hyphens, underscores, "
160+ + "and dots (not leading/trailing) are allowed." ,
161+ paramName ,
162+ value
163+ )
164+ );
165+ }
166+
167+ // Additional check: reject ".." sequences even within otherwise valid characters (e.g., "foo..bar")
168+ if (value .contains (".." )) {
169+ throw new IllegalArgumentException (
170+ String .format (Locale .ROOT , "Invalid %s: [%s]. Consecutive dots ('..') are not allowed." , paramName , value )
171+ );
172+ }
173+ }
174+
84175}
0 commit comments