From 4b166d94219aa54bd5e2e96a35c0579ad207aa1a Mon Sep 17 00:00:00 2001
From: Dai Sugimori
Date: Wed, 4 Sep 2024 19:55:56 +0900
Subject: [PATCH] [DOCS] Add docs for new Lucene's filters for Japanese text. (#112356)

(cherry picked from commit 2982fc61e81fa23ed03f7b51854af9f5352666bb)
---
 docs/plugins/analysis-kuromoji.asciidoc | 120 ++++++++++++++++++++++++
 1 file changed, 120 insertions(+)

diff --git a/docs/plugins/analysis-kuromoji.asciidoc b/docs/plugins/analysis-kuromoji.asciidoc
index 1f114e9ad9ed6..b1d1d5a751057 100644
--- a/docs/plugins/analysis-kuromoji.asciidoc
+++ b/docs/plugins/analysis-kuromoji.asciidoc
@@ -624,3 +624,123 @@ Which results in:
   } ]
 }
 --------------------------------------------------
+
+[[analysis-kuromoji-hiragana-uppercase]]
+==== `hiragana_uppercase` token filter
+
+The `hiragana_uppercase` token filter normalizes small letters (捨て仮名) in hiragana into standard letters.
+This filter is useful if you want to search against old style Japanese text such as
+patents, legal documents, contract policies, etc.
+
+For example:
+
+[source,console]
+--------------------------------------------------
+PUT kuromoji_sample
+{
+  "settings": {
+    "index": {
+      "analysis": {
+        "analyzer": {
+          "my_analyzer": {
+            "tokenizer": "kuromoji_tokenizer",
+            "filter": [
+              "hiragana_uppercase"
+            ]
+          }
+        }
+      }
+    }
+  }
+}
+
+GET kuromoji_sample/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "ちょっとまって"
+}
+--------------------------------------------------
+
+Which results in:
+
+[source,console-result]
+--------------------------------------------------
+{
+  "tokens": [
+    {
+      "token": "ちよつと",
+      "start_offset": 0,
+      "end_offset": 4,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "まつ",
+      "start_offset": 4,
+      "end_offset": 6,
+      "type": "word",
+      "position": 1
+    },
+    {
+      "token": "て",
+      "start_offset": 6,
+      "end_offset": 7,
+      "type": "word",
+      "position": 2
+    }
+  ]
+}
+--------------------------------------------------
+
+[[analysis-kuromoji-katakana-uppercase]]
+==== `katakana_uppercase` token filter
+
+The `katakana_uppercase` token filter normalizes small letters (捨て仮名) in katakana into standard letters.
+This filter is useful if you want to search against old style Japanese text such as
+patents, legal documents, contract policies, etc.
+
+For example:
+
+[source,console]
+--------------------------------------------------
+PUT kuromoji_sample
+{
+  "settings": {
+    "index": {
+      "analysis": {
+        "analyzer": {
+          "my_analyzer": {
+            "tokenizer": "kuromoji_tokenizer",
+            "filter": [
+              "katakana_uppercase"
+            ]
+          }
+        }
+      }
+    }
+  }
+}
+
+GET kuromoji_sample/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "ストップウォッチ"
+}
+--------------------------------------------------
+
+Which results in:
+
+[source,console-result]
+--------------------------------------------------
+{
+  "tokens": [
+    {
+      "token": "ストツプウオツチ",
+      "start_offset": 0,
+      "end_offset": 8,
+      "type": "word",
+      "position": 0
+    }
+  ]
+}
+--------------------------------------------------
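
Not part of the patch itself, but as a usage sketch: since `hiragana_uppercase` and `katakana_uppercase` are ordinary token filters, they can be chained in a single analyzer so that small kana are normalized in both scripts. The snippet below reuses the `kuromoji_sample` index and `my_analyzer` names from the examples above; the mixed-script sample text is only illustrative.

[source,console]
--------------------------------------------------
PUT kuromoji_sample
{
  "settings": {
    "index": {
      "analysis": {
        "analyzer": {
          "my_analyzer": {
            "tokenizer": "kuromoji_tokenizer",
            "filter": [
              "hiragana_uppercase",
              "katakana_uppercase"
            ]
          }
        }
      }
    }
  }
}

GET kuromoji_sample/_analyze
{
  "analyzer": "my_analyzer",
  "text": "ちょっとストップ"
}
--------------------------------------------------

Going by the examples documented in the patch, the hiragana token should come back normalized as ちよつと and the katakana token as ストツプ.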