[DOCS] Add docs for Lucene's new filters for Japanese text. (elastic#112356)

daixque · daixque · commit 4b166d94219a · 2024-09-04T14:53:43.000+02:00
(cherry picked from commit 2982fc6)
diff --git a/docs/plugins/analysis-kuromoji.asciidoc b/docs/plugins/analysis-kuromoji.asciidoc
@@ -624,3 +624,123 @@ Which results in:
   } ]
 }
 --------------------------------------------------
+
+[[analysis-kuromoji-hiragana-uppercase]]
+==== `hiragana_uppercase` token filter
+
+The `hiragana_uppercase` token filter normalizes small hiragana letters (捨て仮名), such as `ょ` and `っ`, into their standard-size equivalents (`よ` and `つ`).
+This filter is useful if you want to search old-style Japanese text, such as patents,
+legal documents, and contract policies, which often writes these letters at full size.
+
+For example:
+
+[source,console]
+--------------------------------------------------
+PUT kuromoji_sample
+{
+  "settings": {
+    "index": {
+      "analysis": {
+        "analyzer": {
+          "my_analyzer": {
+            "tokenizer": "kuromoji_tokenizer",
+            "filter": [
+              "hiragana_uppercase"
+            ]
+          }
+        }
+      }
+    }
+  }
+}
+
+GET kuromoji_sample/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "ちょっとまって"
+}
+--------------------------------------------------
+
+Which results in:
+
+[source,console-result]
+--------------------------------------------------
+{
+  "tokens": [
+    {
+      "token": "ちよつと",
+      "start_offset": 0,
+      "end_offset": 4,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "まつ",
+      "start_offset": 4,
+      "end_offset": 6,
+      "type": "word",
+      "position": 1
+    },
+    {
+      "token": "て",
+      "start_offset": 6,
+      "end_offset": 7,
+      "type": "word",
+      "position": 2
+    }
+  ]
+}
+--------------------------------------------------
+
+[[analysis-kuromoji-katakana-uppercase]]
+==== `katakana_uppercase` token filter
+
+The `katakana_uppercase` token filter normalizes small katakana letters (捨て仮名), such as `ッ` and `ォ`, into their standard-size equivalents (`ツ` and `オ`).
+This filter is useful if you want to search old-style Japanese text, such as patents,
+legal documents, and contract policies, which often writes these letters at full size.
+
+For example:
+
+[source,console]
+--------------------------------------------------
+PUT kuromoji_sample
+{
+  "settings": {
+    "index": {
+      "analysis": {
+        "analyzer": {
+          "my_analyzer": {
+            "tokenizer": "kuromoji_tokenizer",
+            "filter": [
+              "katakana_uppercase"
+            ]
+          }
+        }
+      }
+    }
+  }
+}
+
+GET kuromoji_sample/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "ストップウォッチ"
+}
+--------------------------------------------------
+
+Which results in:
+
+[source,console-result]
+--------------------------------------------------
+{
+  "tokens": [
+    {
+      "token": "ストツプウオツチ",
+      "start_offset": 0,
+      "end_offset": 8,
+      "type": "word",
+      "position": 0
+    }
+  ]
+}
+--------------------------------------------------