|
| 1 | += text_chunker |
| 2 | +// tag::single-source[] |
| 3 | +:type: processor |
| 4 | +:categories: ["AI"] |
| 5 | + |
| 6 | +component_type_dropdown::[] |
| 7 | + |
| 8 | +Breaks down text-based message content into manageable chunks using a configurable strategy. This processor is ideal for splitting large text documents into chunks before creating vector embeddings from them. |
| 9 | + |
| 10 | +ifndef::env-cloud[] |
| 11 | +Introduced in version 4.51.0. |
| 12 | +endif::[] |
| 13 | + |
| 14 | +[tabs] |
| 15 | +====== |
| 16 | +Common:: |
| 17 | ++ |
| 18 | +-- |
| 19 | +
|
| 20 | +```yml |
| 21 | +# Common configuration fields, showing default values |
| 22 | +label: "" |
| 23 | +text_chunker: |
| 24 | + strategy: "" # No default (required) |
| 25 | + chunk_size: 512 |
| 26 | + chunk_overlap: 100 |
| 27 | + separators: |
| 28 | + - "\n\n" |
| 29 | + - "\n" |
| 30 | + - " " |
| 31 | + - "" |
| 32 | + length_measure: runes |
| 33 | + include_code_blocks: false |
| 34 | + keep_reference_links: false |
| 35 | +``` |
| 36 | +
|
| 37 | +-- |
| 38 | +Advanced:: |
| 39 | ++ |
| 40 | +-- |
| 41 | +
|
| 42 | +```yml |
| 43 | +# All configuration fields, showing default values |
| 44 | +label: "" |
| 45 | +text_chunker: |
| 46 | + strategy: "" # No default (required) |
| 47 | + chunk_size: 512 |
| 48 | + chunk_overlap: 100 |
| 49 | + separators: |
| 50 | + - "\n\n" |
| 51 | + - "\n" |
| 52 | + - " " |
| 53 | + - "" |
| 54 | + length_measure: runes |
| 55 | + token_encoding: cl100k_base # No default (optional) |
| 56 | + allowed_special: [] |
| 57 | + disallowed_special: |
| 58 | + - all |
| 59 | + include_code_blocks: false |
| 60 | + keep_reference_links: false |
| 61 | +``` |
| 62 | +
|
| 63 | +-- |
| 64 | +====== |
| 65 | + |
| 66 | +== Fields |
| 67 | + |
| 68 | +=== `strategy` |
| 69 | + |
| 70 | +Choose a strategy for breaking content down into chunks. |
| 71 | + |
| 72 | +*Type*: `string` |
| 73 | + |
| 74 | +Options: |
| 75 | + |
| 76 | +|=== |
| 77 | +| Option | Description |
| 78 | + |
| 79 | +| `markdown` |
| 80 | +| Use Markdown headers as the separators between chunks. |
| 81 | + |
| 82 | +| `recursive_character` |
| 83 | +| <<separators, Specify character strings>> to use as separators between chunks. |
| 84 | + |
| 85 | +| `token` |
| 86 | +| Split text into tokens up to the maximum chunk size. |
| 87 | + |
| 88 | +|=== |
| 89 | + |
| 90 | + |
| 91 | +=== `chunk_size` |
| 92 | + |
| 93 | +The maximum size of each chunk, using the selected <<length_measure,`length_measure`>>. |
| 94 | + |
| 95 | +*Type*: `int` |
| 96 | + |
| 97 | +*Default*: `512` |
| 98 | + |
| 99 | +=== `chunk_overlap` |
| 100 | + |
| 101 | +The amount of content duplicated in adjacent chunks of text, measured using the selected <<length_measure,`length_measure`>>. |
| 102 | + |
| 103 | +*Type*: `int` |
| 104 | + |
| 105 | +*Default*: `100` |
| 106 | + |
| 107 | +=== `separators` |
| 108 | + |
| 109 | +A list of strings to use as separators between chunks when the <<strategy, `recursive_character` strategy option>> is specified. |
| 110 | + |
| 111 | +By default, the following separators are tried in turn until one is successful: |
| 112 | + |
| 113 | +- Double newlines (`\n\n`) |
| 114 | +- Single newlines (`\n`) |
| 115 | +- Spaces (`" "`) and empty strings (`""`) |
| 116 | + |
| 117 | +*Type*: `array` |
| 118 | + |
| 119 | +*Default*: `["\n\n", "\n", " ", ""]` |
| 120 | + |
| 121 | +=== `length_measure` |
| 122 | + |
| 123 | +Choose a method to measure the length of a string. |
| 124 | + |
| 125 | +*Type*: `string` |
| 126 | + |
| 127 | +*Default*: `runes` |
| 128 | + |
| 129 | +|=== |
| 130 | +| Option | Description |
| 131 | + |
| 132 | +| `graphemes` |
| 133 | +| Count the number of Unicode graphemes. |
| 134 | + |
| 135 | +| `runes` |
| 136 | +| Count the number of Unicode code points. |
| 137 | + |
| 138 | +| `token` |
| 139 | +| Count the number of tokens using the <<token_encoding,`token_encoding`>> tokenizer. |
| 140 | + |
| 141 | +| `utf8` |
| 142 | +| Count the number of UTF-8 bytes. |
| 143 | + |
| 144 | +|=== |
| 145 | + |
| 146 | + |
| 147 | +=== `token_encoding` |
| 148 | + |
| 149 | +The type of encoding to use for tokenization. |
| 150 | + |
| 151 | +*Type*: `string` |
| 152 | + |
| 153 | +```yml |
| 154 | +# Examples |
| 155 | + |
| 156 | +token_encoding: cl100k_base |
| 157 | + |
| 158 | +token_encoding: r50k_base |
| 159 | +``` |
| 160 | + |
| 161 | +=== `allowed_special` |
| 162 | + |
| 163 | +A list of special tokens to include in the output from this processor. |
| 164 | + |
| 165 | +*Type*: `array` |
| 166 | + |
| 167 | +*Default*: `[]` |
| 168 | + |
| 169 | +=== `disallowed_special` |
| 170 | + |
| 171 | +A list of special tokens to exclude from the output of this processor. |
| 172 | + |
| 173 | +*Type*: `array` |
| 174 | + |
| 175 | +*Default*: `["all"]` |
| 176 | + |
| 177 | +=== `include_code_blocks` |
| 178 | + |
| 179 | +When set to `true`, this processor includes code blocks in the output. |
| 180 | + |
| 181 | +*Type*: `bool` |
| 182 | + |
| 183 | +*Default*: `false` |
| 184 | + |
| 185 | +=== `keep_reference_links` |
| 186 | + |
| 187 | +When set to `true`, this processor includes reference links in the output. |
| 188 | + |
| 189 | +*Type*: `bool` |
| 190 | + |
| 191 | +*Default*: `false` |
| 192 | + |
| 193 | + |
| 194 | +// end::single-source[] |
0 commit comments