Skip to content

Commit e225ffd

Browse files
author
elasticsearchmachine
committed
Merge remote-tracking branch 'origin/main' into lucene_snapshot
2 parents 745acf9 + 9b951cd commit e225ffd

File tree

82 files changed

+3388
-1625
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

82 files changed

+3388
-1625
lines changed

build-tools-internal/src/main/groovy/elasticsearch.ide.gradle

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,36 @@ if (providers.systemProperty('idea.active').getOrNull() == 'true') {
122122
.findAll { it != null }
123123
}
124124

125+
// force IntelliJ to generate *.iml files for each imported module
126+
tasks.register("enableExternalConfiguration") {
127+
group = 'ide'
128+
description = 'Enable per-module *.iml files'
129+
130+
doLast {
131+
modifyXml('.idea/misc.xml') {xml ->
132+
def externalStorageConfig = xml.component.find { it.'@name' == 'ExternalStorageConfigurationManager' }
133+
if (externalStorageConfig) {
134+
xml.remove(externalStorageConfig)
135+
}
136+
}
137+
}
138+
}
139+
140+
// modifies the idea module config to enable preview features on 'elasticsearch-native' module
141+
tasks.register("enablePreviewFeatures") {
142+
group = 'ide'
143+
description = 'Enables preview features on native library module'
144+
dependsOn tasks.named("enableExternalConfiguration")
145+
146+
doLast {
147+
['main', 'test'].each { sourceSet ->
148+
modifyXml(".idea/modules/libs/native/elasticsearch.libs.elasticsearch-native.${sourceSet}.iml") { xml ->
149+
xml.component.find { it.'@name' == 'NewModuleRootManager' }?.'@LANGUAGE_LEVEL' = 'JDK_21_PREVIEW'
150+
}
151+
}
152+
}
153+
}
154+
125155
tasks.register('buildDependencyArtifacts') {
126156
group = 'ide'
127157
description = 'Builds artifacts needed as dependency for IDE modules'
@@ -149,7 +179,10 @@ if (providers.systemProperty('idea.active').getOrNull() == 'true') {
149179
testRunner = 'choose_per_test'
150180
}
151181
taskTriggers {
152-
afterSync tasks.named('configureIdeCheckstyle'), tasks.named('configureIdeaGradleJvm'), tasks.named('buildDependencyArtifacts')
182+
afterSync tasks.named('configureIdeCheckstyle'),
183+
tasks.named('configureIdeaGradleJvm'),
184+
tasks.named('buildDependencyArtifacts'),
185+
tasks.named('enablePreviewFeatures')
153186
}
154187
encodings {
155188
encoding = 'UTF-8'

build-tools-internal/src/main/java/org/elasticsearch/gradle/internal/InternalDistributionDownloadPlugin.java

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -172,9 +172,6 @@ private static String distributionProjectName(ElasticsearchDistribution distribu
172172
if (distribution.getType() == InternalElasticsearchDistributionTypes.DOCKER_IRONBANK) {
173173
return projectName + "ironbank-docker" + archString + "-export";
174174
}
175-
if (distribution.getType() == InternalElasticsearchDistributionTypes.DOCKER_CLOUD) {
176-
return projectName + "cloud-docker" + archString + "-export";
177-
}
178175
if (distribution.getType() == InternalElasticsearchDistributionTypes.DOCKER_CLOUD_ESS) {
179176
return projectName + "cloud-ess-docker" + archString + "-export";
180177
}

build-tools-internal/src/main/java/org/elasticsearch/gradle/internal/distribution/DockerCloudElasticsearchDistributionType.java

Lines changed: 0 additions & 27 deletions
This file was deleted.

build-tools-internal/src/main/java/org/elasticsearch/gradle/internal/distribution/InternalElasticsearchDistributionTypes.java

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ public class InternalElasticsearchDistributionTypes {
1919
public static ElasticsearchDistributionType DOCKER = new DockerElasticsearchDistributionType();
2020
public static ElasticsearchDistributionType DOCKER_UBI = new DockerUbiElasticsearchDistributionType();
2121
public static ElasticsearchDistributionType DOCKER_IRONBANK = new DockerIronBankElasticsearchDistributionType();
22-
public static ElasticsearchDistributionType DOCKER_CLOUD = new DockerCloudElasticsearchDistributionType();
2322
public static ElasticsearchDistributionType DOCKER_CLOUD_ESS = new DockerCloudEssElasticsearchDistributionType();
2423
public static ElasticsearchDistributionType DOCKER_WOLFI = new DockerWolfiElasticsearchDistributionType();
2524

@@ -29,7 +28,6 @@ public class InternalElasticsearchDistributionTypes {
2928
DOCKER,
3029
DOCKER_UBI,
3130
DOCKER_IRONBANK,
32-
DOCKER_CLOUD,
3331
DOCKER_CLOUD_ESS,
3432
DOCKER_WOLFI
3533
);

build-tools-internal/src/main/java/org/elasticsearch/gradle/internal/test/DistroTestPlugin.java

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,6 @@
4949
import static org.elasticsearch.gradle.internal.distribution.InternalElasticsearchDistributionTypes.ALL_INTERNAL;
5050
import static org.elasticsearch.gradle.internal.distribution.InternalElasticsearchDistributionTypes.DEB;
5151
import static org.elasticsearch.gradle.internal.distribution.InternalElasticsearchDistributionTypes.DOCKER;
52-
import static org.elasticsearch.gradle.internal.distribution.InternalElasticsearchDistributionTypes.DOCKER_CLOUD;
5352
import static org.elasticsearch.gradle.internal.distribution.InternalElasticsearchDistributionTypes.DOCKER_CLOUD_ESS;
5453
import static org.elasticsearch.gradle.internal.distribution.InternalElasticsearchDistributionTypes.DOCKER_IRONBANK;
5554
import static org.elasticsearch.gradle.internal.distribution.InternalElasticsearchDistributionTypes.DOCKER_UBI;
@@ -149,7 +148,6 @@ private static Map<ElasticsearchDistributionType, TaskProvider<?>> lifecycleTask
149148
lifecyleTasks.put(DOCKER, project.getTasks().register(taskPrefix + ".docker"));
150149
lifecyleTasks.put(DOCKER_UBI, project.getTasks().register(taskPrefix + ".docker-ubi"));
151150
lifecyleTasks.put(DOCKER_IRONBANK, project.getTasks().register(taskPrefix + ".docker-ironbank"));
152-
lifecyleTasks.put(DOCKER_CLOUD, project.getTasks().register(taskPrefix + ".docker-cloud"));
153151
lifecyleTasks.put(DOCKER_CLOUD_ESS, project.getTasks().register(taskPrefix + ".docker-cloud-ess"));
154152
lifecyleTasks.put(DOCKER_WOLFI, project.getTasks().register(taskPrefix + ".docker-wolfi"));
155153
lifecyleTasks.put(ARCHIVE, project.getTasks().register(taskPrefix + ".archives"));

docs/changelog/113563.yaml

Lines changed: 0 additions & 5 deletions
This file was deleted.

docs/reference/connector/docs/connectors-API-tutorial.asciidoc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -367,7 +367,7 @@ Refer to the individual connectors-references,connector references for these con
367367
====
368368
We're using a self-managed connector in this tutorial.
369369
To use these APIs with an Elastic managed connector, there's some extra setup for API keys.
370-
Refer to native-connectors-manage-API-keys for details.
370+
Refer to <<es-native-connectors-manage-API-keys>> for details.
371371
====
372372

373373
We're now ready to sync our PostgreSQL data to {es}.

docs/reference/inference/inference-apis.asciidoc

Lines changed: 61 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@ Elastic –, then create an {infer} endpoint by the <<put-inference-api>>.
3535
Now use <<semantic-search-semantic-text, semantic text>> to perform
3636
<<semantic-search, semantic search>> on your data.
3737

38-
3938
[discrete]
4039
[[default-enpoints]]
4140
=== Default {infer} endpoints
@@ -53,6 +52,67 @@ For these models, the minimum number of allocations is `0`.
5352
If there is no {infer} activity that uses the endpoint, the number of allocations will scale down to `0` automatically after 15 minutes.
5453

5554

55+
[discrete]
56+
[[infer-chunking-config]]
57+
=== Configuring chunking
58+
59+
{infer-cap} endpoints have a limit on the amount of text they can process at once, determined by the model's input capacity.
60+
Chunking is the process of splitting the input text into pieces that remain within these limits.
61+
It occurs when ingesting documents into <<semantic-text,`semantic_text` fields>>.
62+
Chunking also helps produce sections that are digestible for humans.
63+
Returning a long document in search results is less useful than providing the most relevant chunk of text.
64+
65+
Each chunk will include the text subpassage and the corresponding embedding generated from it.
66+
67+
By default, documents are split into sentences and grouped in sections up to 250 words with 1 sentence overlap so that each chunk shares a sentence with the previous chunk.
68+
Overlapping ensures continuity and prevents vital contextual information in the input text from being lost by a hard break.
69+
70+
{es} uses the https://unicode-org.github.io/icu-docs/[ICU4J] library to detect word and sentence boundaries for chunking.
71+
https://unicode-org.github.io/icu/userguide/boundaryanalysis/#word-boundary[Word boundaries] are identified by following a series of rules, not just the presence of a whitespace character.
72+
For written languages that do not use whitespace, such as Chinese or Japanese, dictionary lookups are used to detect word boundaries.
73+
74+
75+
[discrete]
76+
==== Chunking strategies
77+
78+
Two strategies are available for chunking: `sentence` and `word`.
79+
80+
The `sentence` strategy splits the input text at sentence boundaries.
81+
Each chunk contains one or more complete sentences, ensuring that the integrity of sentence-level context is preserved, except when a sentence causes a chunk to exceed a word count of `max_chunk_size`, in which case it will be split across chunks.
82+
The `sentence_overlap` option defines the number of sentences from the previous chunk to include in the current chunk, which is either `0` or `1`.
83+
84+
The `word` strategy splits the input text on individual words up to the `max_chunk_size` limit.
85+
The `overlap` option is the number of words from the previous chunk to include in the current chunk.
86+
87+
The default chunking strategy is `sentence`.
88+
89+
NOTE: The default chunking strategy for {infer} endpoints created before 8.16 is `word`.
90+
91+
92+
[discrete]
93+
==== Example of configuring the chunking behavior
94+
95+
The following example creates an {infer} endpoint with the `elasticsearch` service that deploys the ELSER model by default and configures the chunking behavior.
96+
97+
[source,console]
98+
------------------------------------------------------------
99+
PUT _inference/sparse_embedding/small_chunk_size
100+
{
101+
"service": "elasticsearch",
102+
"service_settings": {
103+
"num_allocations": 1,
104+
"num_threads": 1
105+
},
106+
"chunking_settings": {
107+
"strategy": "sentence",
108+
"max_chunk_size": 100,
109+
"sentence_overlap": 0
110+
}
111+
}
112+
------------------------------------------------------------
113+
// TEST[skip:TBD]
114+
115+
56116
include::delete-inference.asciidoc[]
57117
include::get-inference.asciidoc[]
58118
include::post-inference.asciidoc[]

docs/reference/inference/inference-shared.asciidoc

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,36 @@ end::task-settings[]
3131

3232
tag::task-type[]
3333
The type of the {infer} task that the model will perform.
34-
end::task-type[]
34+
end::task-type[]
35+
36+
tag::chunking-settings[]
37+
Chunking configuration object.
38+
Refer to <<infer-chunking-config>> to learn more about chunking.
39+
end::chunking-settings[]
40+
41+
tag::chunking-settings-max-chunking-size[]
42+
Specifies the maximum size of a chunk in words.
43+
Defaults to `250`.
44+
This value cannot be higher than `300` or lower than `20` (for `sentence` strategy) or `10` (for `word` strategy).
45+
end::chunking-settings-max-chunking-size[]
46+
47+
tag::chunking-settings-overlap[]
48+
Only for `word` chunking strategy.
49+
Specifies the number of overlapping words for chunks.
50+
Defaults to `100`.
51+
This value cannot be higher than half of `max_chunk_size`.
52+
end::chunking-settings-overlap[]
53+
54+
tag::chunking-settings-sentence-overlap[]
55+
Only for `sentence` chunking strategy.
56+
Specifies the number of overlapping sentences for chunks.
57+
It can be either `1` or `0`.
58+
Defaults to `1`.
59+
end::chunking-settings-sentence-overlap[]
60+
61+
tag::chunking-settings-strategy[]
62+
Specifies the chunking strategy.
63+
It can be either `sentence` or `word`.
64+
end::chunking-settings-strategy[]
65+
66+

docs/reference/inference/service-alibabacloud-ai-search.asciidoc

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,26 @@ Available task types:
3434
[[infer-service-alibabacloud-ai-search-api-request-body]]
3535
==== {api-request-body-title}
3636

37+
`chunking_settings`::
38+
(Optional, object)
39+
include::inference-shared.asciidoc[tag=chunking-settings]
40+
41+
`max_chunk_size`:::
42+
(Optional, integer)
43+
include::inference-shared.asciidoc[tag=chunking-settings-max-chunking-size]
44+
45+
`overlap`:::
46+
(Optional, integer)
47+
include::inference-shared.asciidoc[tag=chunking-settings-overlap]
48+
49+
`sentence_overlap`:::
50+
(Optional, integer)
51+
include::inference-shared.asciidoc[tag=chunking-settings-sentence-overlap]
52+
53+
`strategy`:::
54+
(Optional, string)
55+
include::inference-shared.asciidoc[tag=chunking-settings-strategy]
56+
3757
`service`::
3858
(Required, string) The type of service supported for the specified task type.
3959
In this case,
@@ -108,7 +128,6 @@ To modify this, set the `requests_per_minute` setting of this object in your ser
108128
include::inference-shared.asciidoc[tag=request-per-minute-example]
109129
--
110130

111-
112131
`task_settings`::
113132
(Optional, object)
114133
include::inference-shared.asciidoc[tag=task-settings]

0 commit comments

Comments
 (0)