From 4abef8ef129531f51f3ff9292899f99fc768e928 Mon Sep 17 00:00:00 2001 From: George Wallace Date: Fri, 8 Nov 2024 11:11:47 -0700 Subject: [PATCH 1/2] [DOCS] : swap allocation sections --- .../inference/service-elser.asciidoc | 55 +++++++++---------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/docs/reference/inference/service-elser.asciidoc b/docs/reference/inference/service-elser.asciidoc index 273d743e47a4b..fab4364ce026c 100644 --- a/docs/reference/inference/service-elser.asciidoc +++ b/docs/reference/inference/service-elser.asciidoc @@ -96,6 +96,33 @@ If `adaptive_allocations` is enabled, do not set this value, because it's automa Sets the number of threads used by each model allocation during inference. This generally increases the speed per inference request. The inference process is a compute-bound process; `threads_per_allocations` must not exceed the number of available allocated processors per node. Must be a power of 2. Max allowed value is 32. +[discrete] +[[inference-example-elser-adaptive-allocation]] +==== Setting adaptive allocations for the ELSER service + +NOTE: For more information on how to optimize your ELSER endpoints, refer to {ml-docs}/ml-nlp-elser.html#elser-recommendations[the ELSER recommendations] section in the model documentation. +To learn more about model autoscaling, refer to the {ml-docs}/ml-nlp-auto-scale.html[trained model autoscaling] page. + +The following example shows how to create an {infer} endpoint called `my-elser-model` to perform a `sparse_embedding` task type and configure adaptive allocations. + +The request below will automatically download the ELSER model if it isn't already downloaded and then deploy the model. 
+ +[source,console] +------------------------------------------------------------ +PUT _inference/sparse_embedding/my-elser-model +{ + "service": "elser", + "service_settings": { + "adaptive_allocations": { + "enabled": true, + "min_number_of_allocations": 3, + "max_number_of_allocations": 10 + }, + "num_threads": 1 + } +} +------------------------------------------------------------ +// TEST[skip:TBD] [discrete] [[inference-example-elser]] @@ -146,31 +173,3 @@ This error usually just reflects a timeout, while the model downloads in the bac You can check the download progress in the {ml-app} UI. If using the Python client, you can set the `timeout` parameter to a higher value. ==== - -[discrete] -[[inference-example-elser-adaptive-allocation]] -==== Setting adaptive allocations for the ELSER service - -NOTE: For more information on how to optimize your ELSER endpoints, refer to {ml-docs}/ml-nlp-elser.html#elser-recommendations[the ELSER recommendations] section in the model documentation. -To learn more about model autoscaling, refer to the {ml-docs}/ml-nlp-auto-scale.html[trained model autoscaling] page. - -The following example shows how to create an {infer} endpoint called `my-elser-model` to perform a `sparse_embedding` task type and configure adaptive allocations. - -The request below will automatically download the ELSER model if it isn't already downloaded and then deploy the model. 
- -[source,console] ------------------------------------------------------------- -PUT _inference/sparse_embedding/my-elser-model -{ - "service": "elser", - "service_settings": { - "adaptive_allocations": { - "enabled": true, - "min_number_of_allocations": 3, - "max_number_of_allocations": 10 - }, - "num_threads": 1 - } -} ------------------------------------------------------------- -// TEST[skip:TBD] \ No newline at end of file From 02466aade8e80bc585c3195338b0992488dd2d06 Mon Sep 17 00:00:00 2001 From: George Wallace Date: Tue, 12 Nov 2024 16:14:42 -0700 Subject: [PATCH 2/2] Update service-elser.asciidoc --- docs/reference/inference/service-elser.asciidoc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/reference/inference/service-elser.asciidoc b/docs/reference/inference/service-elser.asciidoc index fab4364ce026c..6d38ddd39504a 100644 --- a/docs/reference/inference/service-elser.asciidoc +++ b/docs/reference/inference/service-elser.asciidoc @@ -98,12 +98,12 @@ Must be a power of 2. Max allowed value is 32. [discrete] [[inference-example-elser-adaptive-allocation]] -==== Setting adaptive allocations for the ELSER service +==== ELSER service example NOTE: For more information on how to optimize your ELSER endpoints, refer to {ml-docs}/ml-nlp-elser.html#elser-recommendations[the ELSER recommendations] section in the model documentation. To learn more about model autoscaling, refer to the {ml-docs}/ml-nlp-auto-scale.html[trained model autoscaling] page. -The following example shows how to create an {infer} endpoint called `my-elser-model` to perform a `sparse_embedding` task type and configure adaptive allocations. +The following example shows how to create an {infer} endpoint called `my-elser-model` to perform a `sparse_embedding` task type and configure adaptive allocations (recommended). The request below will automatically download the ELSER model if it isn't already downloaded and then deploy the model. 
@@ -126,11 +126,13 @@ PUT _inference/sparse_embedding/my-elser-model [discrete] [[inference-example-elser]] -==== ELSER service example +==== Creating an ELSER service without adaptive allocations The following example shows how to create an {infer} endpoint called `my-elser-model` to perform a `sparse_embedding` task type. Refer to the {ml-docs}/ml-nlp-elser.html[ELSER model documentation] for more info. +The following example shows how to create an {infer} endpoint called `my-elser-model` to perform a `sparse_embedding` task type when adaptive allocations aren't required or {ml-docs}/ml-nlp-auto-scale.html[trained model autoscaling] isn't available. + NOTE: If you want to optimize your ELSER endpoint for ingest, set the number of threads to `1` (`"num_threads": 1`). If you want to optimize your ELSER endpoint for search, set the number of threads to greater than `1`.