Skip to content

Commit 8b89902

Browse files
Update inference API specification to include new Llama Service
1 parent 76537d6 commit 8b89902

File tree

15 files changed

+1123
-52
lines changed

15 files changed

+1123
-52
lines changed

output/openapi/elasticsearch-openapi.json

Lines changed: 195 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

output/openapi/elasticsearch-serverless-openapi.json

Lines changed: 195 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

output/schema/schema.json

Lines changed: 435 additions & 46 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

output/typescript/types.ts

Lines changed: 36 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

specification/_doc_ids/table.csv

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -368,6 +368,7 @@ inference-api-put-googleaistudio,https://www.elastic.co/docs/api/doc/elasticsear
368368
inference-api-put-googlevertexai,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-googlevertexai,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/infer-service-google-vertex-ai.html,
369369
inference-api-put-huggingface,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-hugging-face,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/infer-service-hugging-face.html,
370370
inference-api-put-jinaai,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-jinaai,,
371+
inference-api-put-llama,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-llama,,
371372
inference-api-put-mistral,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-mistral,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/infer-service-mistral.html,
372373
inference-api-put-openai,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-openai,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/infer-service-openai.html,
373374
inference-api-put-voyageai,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-voyageai,,
@@ -397,6 +398,7 @@ knn-inner-hits,https://www.elastic.co/docs/solutions/search/vector/knn#nested-kn
397398
license-management,https://www.elastic.co/docs/deploy-manage/license/manage-your-license-in-self-managed-cluster,,
398399
list-analytics-collection,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-search-application-get-behavioral-analytics,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/list-analytics-collection.html,
399400
list-synonyms-sets,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-synonyms-get-synonyms-sets,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/list-synonyms-sets.html,
401+
llama-api-models,https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html,,
400402
logstash-api-delete-pipeline,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-logstash-delete-pipeline,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/logstash-api-delete-pipeline.html,
401403
logstash-api-get-pipeline,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-logstash-get-pipeline,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/logstash-api-get-pipeline.html,
402404
logstash-api-put-pipeline,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-logstash-put-pipeline,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/logstash-api-put-pipeline.html,
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
{
2+
"inference.put_llama": {
3+
"documentation": {
4+
"url": "https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-llama.html",
5+
"description": "Configure a Llama inference endpoint"
6+
},
7+
"stability": "stable",
8+
"visibility": "public",
9+
"headers": {
10+
"accept": ["application/json"],
11+
"content_type": ["application/json"]
12+
},
13+
"url": {
14+
"paths": [
15+
{
16+
"path": "/_inference/{task_type}/{llama_inference_id}",
17+
"methods": ["PUT"],
18+
"parts": {
19+
"task_type": {
20+
"type": "string",
21+
"description": "The task type"
22+
},
23+
"llama_inference_id": {
24+
"type": "string",
25+
"description": "The inference ID"
26+
}
27+
}
28+
}
29+
]
30+
},
31+
"body": {
32+
"description": "The inference endpoint's task and service settings"
33+
}
34+
}
35+
}

specification/inference/_types/CommonTypes.ts

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1367,6 +1367,68 @@ export enum JinaAITextEmbeddingTask {
13671367
search
13681368
}
13691369

1370+
export class LlamaServiceSettings {
1371+
/**
1372+
* The URL of the Llama stack endpoint.
1373+
* URL must contain:
1374+
* * For `text_embedding` task - `/v1/inference/embeddings`.
1375+
* * For `completion` and `chat_completion` tasks - `/v1/openai/v1/chat/completions`.
1376+
*/
1377+
url: string
1378+
/**
1379+
* The name of the model to use for the inference task.
1380+
* Refer to the Llama downloading models documentation for different ways of getting a list of available models and downloading them.
1381+
* The service has been tested and confirmed to work with the following models:
1382+
* * For `text_embedding` task - `all-MiniLM-L6-v2`.
1383+
* * For `completion` and `chat_completion` tasks - `llama3.2:3b`.
1384+
* @ext_doc_id llama-api-models
1385+
*/
1386+
model_id: string
1387+
/**
1388+
* A valid API key for accessing the Llama stack endpoint. It is sent as part of the Bearer authentication header.
1389+
* This field is optional because Llama stack doesn't provide authentication by default.
1390+
*
1391+
* IMPORTANT: You need to provide the API key only once, during the inference model creation.
1392+
* The get inference endpoint API does not retrieve your API key.
1393+
* After creating the inference model, you cannot change the associated API key.
1394+
* If you want to use a different API key, delete the inference model and recreate it with the same name and the updated API key.
1395+
*/
1396+
api_key?: string
1397+
/**
1398+
* For a `text_embedding` task, the maximum number of tokens per input before chunking occurs.
1399+
*/
1400+
max_input_tokens?: integer
1401+
/**
1402+
* For a `text_embedding` task, the number of dimensions the resulting output embeddings should have.
1403+
*/
1404+
dimensions?: integer
1405+
/**
1406+
* For a `text_embedding` task, the similarity measure. One of `cosine`, `dot_product`, or `l2_norm`.
1407+
*/
1408+
similarity?: LlamaSimilarityType
1409+
/**
1410+
* This setting helps to minimize the number of rate limit errors returned from the Llama API.
1411+
* By default, the `llama` service sets the number of requests allowed per minute to 3000.
1412+
*/
1413+
rate_limit?: RateLimitSetting
1414+
}
1415+
1416+
export enum LlamaTaskType {
1417+
text_embedding,
1418+
completion,
1419+
chat_completion
1420+
}
1421+
1422+
export enum LlamaServiceType {
1423+
llama
1424+
}
1425+
1426+
export enum LlamaSimilarityType {
1427+
cosine,
1428+
dot_product,
1429+
l2_norm
1430+
}
1431+
13701432
export class MistralServiceSettings {
13711433
/**
13721434
* A valid API key of your Mistral account.

specification/inference/_types/Services.ts

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ import {
3535
TaskTypeGoogleVertexAI,
3636
TaskTypeHuggingFace,
3737
TaskTypeJinaAi,
38+
TaskTypeLlama,
3839
TaskTypeMistral,
3940
TaskTypeOpenAI,
4041
TaskTypeVoyageAI,
@@ -229,6 +230,17 @@ export class InferenceEndpointInfoJinaAi extends InferenceEndpoint {
229230
task_type: TaskTypeJinaAi
230231
}
231232

233+
export class InferenceEndpointInfoLlama extends InferenceEndpoint {
234+
/**
235+
* The inference Id
236+
*/
237+
inference_id: string
238+
/**
239+
* The task type
240+
*/
241+
task_type: TaskTypeLlama
242+
}
243+
232244
export class InferenceEndpointInfoMistral extends InferenceEndpoint {
233245
/**
234246
* The inference Id
@@ -354,6 +366,7 @@ export class RateLimitSetting {
354366
* * `googlevertexai` service: `30000`
355367
* * `hugging_face` service: `3000`
356368
* * `jinaai` service: `2000`
369+
* * `llama` service: `3000`
357370
* * `mistral` service: `240`
358371
* * `openai` service and task type `text_embedding`: `3000`
359372
* * `openai` service and task type `completion`: `500`

specification/inference/_types/TaskType.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,12 @@ export enum TaskTypeHuggingFace {
104104
text_embedding
105105
}
106106

107+
export enum TaskTypeLlama {
108+
text_embedding,
109+
chat_completion,
110+
completion
111+
}
112+
107113
export enum TaskTypeMistral {
108114
text_embedding,
109115
chat_completion,

specification/inference/put/PutRequest.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ import { TaskType } from '@inference/_types/TaskType'
4343
* * Google AI Studio (`completion`, `text_embedding`)
4444
* * Google Vertex AI (`rerank`, `text_embedding`)
4545
* * Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)
46+
* * Llama (`chat_completion`, `completion`, `text_embedding`)
4647
* * Mistral (`chat_completion`, `completion`, `text_embedding`)
4748
* * OpenAI (`chat_completion`, `completion`, `text_embedding`)
4849
* * VoyageAI (`text_embedding`, `rerank`)

0 commit comments

Comments
 (0)