3939import org .opensearch .indices .analysis .HunspellService ;
4040
4141import java .util .Locale ;
42+ import java .util .regex .Pattern ;
4243
4344/**
4445 * The token filter factory for the hunspell analyzer
4546 *
47+ * The dictionary is loaded from either:
48+ * <ul>
49+ * <li>A ref_path (package ID, e.g., "pkg-1234") combined with locale for package-based dictionaries</li>
50+ * <li>A locale (e.g., "en_US") for traditional hunspell dictionaries from config/hunspell/</li>
51+ * </ul>
52+ *
53+ * <h2>Usage Examples:</h2>
54+ * <pre>
55+ * // Traditional locale-based (loads from config/hunspell/en_US/)
56+ * {
57+ * "type": "hunspell",
58+ * "locale": "en_US"
59+ * }
60+ *
61+ * // Package-based (loads from config/analyzers/pkg-1234/hunspell/en_US/)
62+ * {
63+ * "type": "hunspell",
64+ * "ref_path": "pkg-1234",
65+ * "locale": "en_US"
66+ * }
67+ * </pre>
68+ *
69+ *
4670 * @opensearch.internal
4771 */
4872public class HunspellTokenFilterFactory extends AbstractTokenFilterFactory {
@@ -54,14 +78,32 @@ public class HunspellTokenFilterFactory extends AbstractTokenFilterFactory {
5478 public HunspellTokenFilterFactory (IndexSettings indexSettings , String name , Settings settings , HunspellService hunspellService ) {
5579 super (indexSettings , name , settings );
5680
81+ // Get both ref_path and locale parameters
82+ String refPath = settings .get ("ref_path" ); // Package ID only (optional)
5783 String locale = settings .get ("locale" , settings .get ("language" , settings .get ("lang" , null )));
58- if (locale == null ) {
59- throw new IllegalArgumentException ("missing [locale | language | lang] configuration for hunspell token filter" );
60- }
6184
62- dictionary = hunspellService .getDictionary (locale );
63- if (dictionary == null ) {
64- throw new IllegalArgumentException (String .format (Locale .ROOT , "Unknown hunspell dictionary for locale [%s]" , locale ));
85+ if (refPath != null ) {
86+ // Package-based loading: ref_path (package ID) + locale (required)
87+ if (locale == null ) {
88+ throw new IllegalArgumentException ("When using ref_path, the 'locale' parameter is required for hunspell token filter" );
89+ }
90+
91+ // Validate ref_path and locale are safe package/locale identifiers
92+ validatePackageIdentifier (refPath , "ref_path" );
93+ validatePackageIdentifier (locale , "locale" );
94+
95+ // Load from package directory: config/analyzers/{ref_path}/hunspell/{locale}/
96+ dictionary = hunspellService .getDictionaryFromPackage (refPath , locale );
97+ } else if (locale != null ) {
98+ // Traditional locale-based loading (backward compatible)
99+ // Loads from config/hunspell/{locale}/
100+ // Validate locale to prevent path traversal and cache key ambiguity
101+ validatePackageIdentifier (locale , "locale" );
102+ dictionary = hunspellService .getDictionary (locale );
103+ } else {
104+ throw new IllegalArgumentException (
105+ "The 'locale' parameter is required for hunspell token filter. Set it to the hunspell dictionary locale (e.g., 'en_US')."
106+ );
65107 }
66108
67109 dedup = settings .getAsBoolean ("dedup" , true );
@@ -81,4 +123,38 @@ public boolean longestOnly() {
81123 return longestOnly ;
82124 }
83125
126+ /**
127+ * Allowlist pattern for safe package identifiers and locales.
128+ * Permits only alphanumeric characters, hyphens, and underscores.
129+ * Examples: "pkg-1234", "en_US", "my-package-v2", "en_US_custom"
130+ */
131+ private static final Pattern SAFE_IDENTIFIER_PATTERN = Pattern .compile ("^[a-zA-Z0-9][a-zA-Z0-9_-]*$|^[a-zA-Z0-9]$" );
132+
133+ /**
134+ * Validates that a package identifier or locale contains only safe characters.
135+ * Uses an allowlist approach: only alphanumeric characters, hyphens, and underscores are permitted.
136+ * This prevents path traversal, cache key injection, and other security issues.
137+ *
138+ * @param value The value to validate (package ID or locale)
139+ * @param paramName The parameter name for error messages
140+ * @throws IllegalArgumentException if validation fails
141+ */
142+ static void validatePackageIdentifier (String value , String paramName ) {
143+ if (value == null || value .isEmpty ()) {
144+ throw new IllegalArgumentException (String .format (Locale .ROOT , "Invalid %s: value cannot be null or empty." , paramName ));
145+ }
146+
147+ if (!SAFE_IDENTIFIER_PATTERN .matcher (value ).matches ()) {
148+ throw new IllegalArgumentException (
149+ String .format (
150+ Locale .ROOT ,
151+ "Invalid %s: [%s]. Only alphanumeric characters, hyphens, and underscores are allowed." ,
152+ paramName ,
153+ value
154+ )
155+ );
156+ }
157+
158+ }
159+
84160}
0 commit comments