From 1ed0023ea25017e38b3f5caf4091e0e233a90f96 Mon Sep 17 00:00:00 2001 From: Benedikt Rollik Date: Wed, 23 Apr 2025 15:24:05 +0200 Subject: [PATCH 1/4] docs(infr): remove dedicated model pages --- ...-added-new-model-preview-deepseek-r1-d.mdx | 2 +- ...inference-added-model-library-expanded.mdx | 2 +- menu/navigation.json | 72 -------- pages/generative-apis/faq.mdx | 2 +- .../bge-multilingual-gemma2.mdx | 70 -------- .../deepseek-r1-distill-llama-70b.mdx | 82 --------- .../deepseek-r1-distill-llama-8b.mdx | 83 --------- .../llama-3-70b-instruct.mdx | 86 --------- .../reference-content/llama-3-8b-instruct.mdx | 89 ---------- .../llama-3.1-70b-instruct.mdx | 84 --------- .../llama-3.1-8b-instruct.mdx | 86 --------- .../llama-3.1-nemotron-70b-instruct.mdx | 81 --------- .../llama-3.3-70b-instruct.mdx | 80 --------- .../mistral-7b-instruct-v0.3.mdx | 81 --------- .../mistral-nemo-instruct-2407.mdx | 86 --------- .../mistral-small-24b-instruct-2501.mdx | 77 -------- .../mixtral-8x7b-instruct-v0.1.mdx | 80 --------- .../reference-content/model-catalog.mdx | 12 -- .../reference-content/molmo-72b-0924.mdx | 164 ----------------- .../reference-content/moshika-0.1-8b.mdx | 87 --------- .../reference-content/moshiko-0.1-8b.mdx | 86 --------- .../reference-content/pixtral-12b-2409.mdx | 168 ------------------ .../qwen2.5-coder-32b-instruct.mdx | 78 -------- .../reference-content/sentence-t5-xxl.mdx | 70 -------- .../reference-content/supported-models.mdx | 46 ++--- .../reference-content/wizardlm-70b-v1.0.mdx | 78 -------- .../index.mdx | 2 +- 27 files changed, 27 insertions(+), 1907 deletions(-) delete mode 100644 pages/managed-inference/reference-content/bge-multilingual-gemma2.mdx delete mode 100644 pages/managed-inference/reference-content/deepseek-r1-distill-llama-70b.mdx delete mode 100644 pages/managed-inference/reference-content/deepseek-r1-distill-llama-8b.mdx delete mode 100644 pages/managed-inference/reference-content/llama-3-70b-instruct.mdx delete mode 100644 pages/managed-inference/reference-content/llama-3-8b-instruct.mdx delete mode 100644 pages/managed-inference/reference-content/llama-3.1-70b-instruct.mdx delete mode 100644 pages/managed-inference/reference-content/llama-3.1-8b-instruct.mdx delete mode 100644 pages/managed-inference/reference-content/llama-3.1-nemotron-70b-instruct.mdx delete mode 100644 pages/managed-inference/reference-content/llama-3.3-70b-instruct.mdx delete mode 100644 pages/managed-inference/reference-content/mistral-7b-instruct-v0.3.mdx delete mode 100644 pages/managed-inference/reference-content/mistral-nemo-instruct-2407.mdx delete mode 100644 pages/managed-inference/reference-content/mistral-small-24b-instruct-2501.mdx delete mode 100644 pages/managed-inference/reference-content/mixtral-8x7b-instruct-v0.1.mdx delete mode 100644 pages/managed-inference/reference-content/molmo-72b-0924.mdx delete mode 100644 pages/managed-inference/reference-content/moshika-0.1-8b.mdx delete mode 100644 pages/managed-inference/reference-content/moshiko-0.1-8b.mdx delete mode 100644 pages/managed-inference/reference-content/pixtral-12b-2409.mdx delete mode 100644 pages/managed-inference/reference-content/qwen2.5-coder-32b-instruct.mdx delete mode 100644 pages/managed-inference/reference-content/sentence-t5-xxl.mdx delete mode 100644 pages/managed-inference/reference-content/wizardlm-70b-v1.0.mdx diff --git a/changelog/february2025/2025-02-14-managed-inference-added-new-model-preview-deepseek-r1-d.mdx 
b/changelog/february2025/2025-02-14-managed-inference-added-new-model-preview-deepseek-r1-d.mdx index 7390982cd8..e913a6fd90 100644 --- a/changelog/february2025/2025-02-14-managed-inference-added-new-model-preview-deepseek-r1-d.mdx +++ b/changelog/february2025/2025-02-14-managed-inference-added-new-model-preview-deepseek-r1-d.mdx @@ -9,7 +9,7 @@ category: ai-data product: managed-inference --- -[DeepSeek R1 Distilled Llama 70B](/managed-inference/reference-content/deepseek-r1-distill-llama-70b/) is now available on Managed Inference. +[DeepSeek R1 Distilled Llama 70B](/managed-inference/reference-content/model-catalog/#deepseek-r1-distill-llama-70b) is now available on Managed Inference. DeepSeek R1 Distilled Llama improves Llama model performance on reasoning use cases like mathematics or code. diff --git a/changelog/september2024/2024-09-05-managed-inference-added-model-library-expanded.mdx b/changelog/september2024/2024-09-05-managed-inference-added-model-library-expanded.mdx index e146cc5210..53d1fa9f2b 100644 --- a/changelog/september2024/2024-09-05-managed-inference-added-model-library-expanded.mdx +++ b/changelog/september2024/2024-09-05-managed-inference-added-model-library-expanded.mdx @@ -9,7 +9,7 @@ category: ai-data product: managed-inference --- -[Meta Llama 3.1 8b](/managed-inference/reference-content/llama-3.1-8b-instruct/), [Meta Llama 3.1 70b](/managed-inference/reference-content/llama-3.1-70b-instruct/) and [Mistral Nemo](/managed-inference/reference-content/mistral-nemo-instruct-2407/) are available for deployment on Managed Inference. +[Meta Llama 3.1 8b](/managed-inference/reference-content/model-catalog/#llama-31-8b-instruct), [Meta Llama 3.1 70b](/managed-inference/reference-content/model-catalog/#llama-31-70b-instruct) and [Mistral Nemo](/managed-inference/reference-content/model-catalog/#mistral-nemo-instruct-2407) are available for deployment on Managed Inference. Released July 2024, these models all support a very large context window of up to 128k tokens, particularly useful for RAG applications. 
diff --git a/menu/navigation.json b/menu/navigation.json index 13faa8ef78..87d1e2f7b7 100644 --- a/menu/navigation.json +++ b/menu/navigation.json @@ -883,78 +883,6 @@ { "label": "Managed Inference model catalog", "slug": "model-catalog" - }, - { - "label": "BGE-Multilingual-Gemma2 model", - "slug": "bge-multilingual-gemma2" - }, - { - "label": "Llama-3-8b-instruct model", - "slug": "llama-3-8b-instruct" - }, - { - "label": "Llama-3-70b-instruct model", - "slug": "llama-3-70b-instruct" - }, - { - "label": "Llama-3.1-8b-instruct model", - "slug": "llama-3.1-8b-instruct" - }, - { - "label": "Llama-3.1-70b-instruct model", - "slug": "llama-3.1-70b-instruct" - }, - { - "label": "Llama-3.1-nemotron-70b-instruct model", - "slug": "llama-3.1-nemotron-70b-instruct" - }, - { - "label": "Llama-3.3-70b-instruct model", - "slug": "llama-3.3-70b-instruct" - }, - { - "label": "DeepSeek-R1-Distill-Llama-70B model", - "slug": "deepseek-r1-distill-llama-70b" - }, - { - "label": "DeepSeek-R1-Distill-Llama-8B model", - "slug": "deepseek-r1-distill-llama-8b" - }, - { - "label": "Mistral-7b-instruct-v0.3 model", - "slug": "mistral-7b-instruct-v0.3" - }, - { - "label": "Mistral-nemo-instruct-2407 model", - "slug": "mistral-nemo-instruct-2407" - }, - { - "label": "Mixtral-8x7b-instruct-v0.1 model", - "slug": "mixtral-8x7b-instruct-v0.1" - }, - { - "label": "Molmo-72b-0924 model", - "slug": "molmo-72b-0924" - }, - { - "label": "Moshika-0.1-8b model", - "slug": "moshika-0.1-8b" - }, - { - "label": "Moshiko-0.1-8b model", - "slug": "moshiko-0.1-8b" - }, - { - "label": "Pixtral-12b-2409 model", - "slug": "pixtral-12b-2409" - }, - { - "label": "Qwen2.5-coder-32b-instruct model", - "slug": "qwen2.5-coder-32b-instruct" - }, - { - "label": "Sentence-t5-xxl model", - "slug": "sentence-t5-xxl" - } ], "label": "Additional Content", diff --git a/pages/generative-apis/faq.mdx b/pages/generative-apis/faq.mdx index 75f50185b8..ca18fca4e4 100644 --- a/pages/generative-apis/faq.mdx +++ b/pages/generative-apis/faq.mdx @@ -55,7 +55,7 @@ Note that in this example, the first line where the free tier applies will not d ## What is a token and how are they counted? A token is the minimum unit of content that is seen and processed by a model. Hence, token definitions depend on input types: - For text, on average, `1` token corresponds to `~4` characters, and thus `0.75` words (as words are on average five characters long) -- For images, `1` token corresponds to a square of pixels. For example, [pixtral-12b-2409 model](https://www.scaleway.com/en/docs/managed-inference/reference-content/pixtral-12b-2409/#frequently-asked-questions) image tokens of `16x16` pixels (16-pixel height, and 16-pixel width, hence `256` pixels in total). +- For images, `1` token corresponds to a square of pixels. For example, the `pixtral-12b-2409` model uses image tokens of `16x16` pixels (16-pixel height, and 16-pixel width, hence `256` pixels in total). The exact token count and definition depend on [tokenizers](https://huggingface.co/learn/llm-course/en/chapter2/4) used by each model. When this difference is significant (such as for image processing), you can find detailed information in each model documentation (for instance in [`pixtral-12b-2409` size limit documentation](https://www.scaleway.com/en/docs/managed-inference/reference-content/pixtral-12b-2409/#frequently-asked-questions)). Otherwise, when the model is open, you can find this information in the model files on platforms such as Hugging Face, usually in the `tokenizer_config.json` file. 
diff --git a/pages/managed-inference/reference-content/bge-multilingual-gemma2.mdx b/pages/managed-inference/reference-content/bge-multilingual-gemma2.mdx deleted file mode 100644 index 885eaeef94..0000000000 --- a/pages/managed-inference/reference-content/bge-multilingual-gemma2.mdx +++ /dev/null @@ -1,70 +0,0 @@ ---- -meta: - title: Understanding the BGE-Multilingual-Gemma2 embedding model - description: Deploy your own secure BGE-Multilingual-Gemma2 embedding model with Scaleway Managed Inference. Privacy-focused, fully managed. -content: - h1: Understanding the BGE-Multilingual-Gemma2 embedding model - paragraph: This page provides information on the BGE-Multilingual-Gemma2 embedding model -tags: embedding -categories: - - ai-data -dates: - validation: 2024-10-30 - posted: 2024-10-30 ---- - -## Model overview - -| Attribute | Details | -|-----------------|------------------------------------| -| Provider | [baai](https://huggingface.co/BAAI) | -| Compatible Instances | L4, L40S (FP32) | -| Context size | 4096 tokens | - -## Model name - -```bash -baai/bge-multilingual-gemma2:fp32 -``` - -## Compatible Instances - -| Instance type | Max context length | -| ------------- |-------------| -| L4 | 4096 (FP32) | -| L40S | 4096 (FP32) | - -## Model introduction - -BGE is short for BAAI General Embedding. This particular model is an LLM-based embedding, trained on a diverse range of languages and tasks from the lightweight [google/gemma-2-9b](https://huggingface.co/google/gemma-2-9b). -As such, it is distributed under the [Gemma terms of use](https://ai.google.dev/gemma/terms). - -## Why is it useful? - -- BGE-Multilingual-Gemma2 tops the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard), scoring the number one spot in French and Polish, and number seven in English, at the time of writing this page (Q4 2024). -- As its name suggests, the model's training data spans a broad range of languages, including English, Chinese, Polish, French, and more. -- It encodes text into 3584-dimensional vectors, providing a very detailed representation of sentence semantics. -- BGE-Multilingual-Gemma2 in its L4/FP32 configuration boats a high context length of 4096 tokens, particularly useful for ingesting data and building RAG applications. - -## How to use it - -### Sending Managed Inference requests - -To perform inference tasks with your embedding model deployed at Scaleway, use the following command: - -```bash -curl https://.ifr.fr-par.scaleway.com/v1/embeddings \ - -H "Authorization: Bearer " \ - -H "Content-Type: application/json" \ - -d '{ - "input": "Embeddings can represent text in a numerical format.", - "model": "baai/bge-multilingual-gemma2:fp32" - }' -``` - -Make sure to replace `` and `` with your actual [IAM API key](/iam/how-to/create-api-keys/) and the Deployment UUID you are targeting. - -### Receiving Inference responses - -Upon sending the HTTP request to the public or private endpoints exposed by the server, you will receive inference responses from the managed Managed Inference server. -Process the output data according to your application's needs. The response will contain the output generated by the embedding model based on the input provided in the request. 
diff --git a/pages/managed-inference/reference-content/deepseek-r1-distill-llama-70b.mdx b/pages/managed-inference/reference-content/deepseek-r1-distill-llama-70b.mdx deleted file mode 100644 index d59a491267..0000000000 --- a/pages/managed-inference/reference-content/deepseek-r1-distill-llama-70b.mdx +++ /dev/null @@ -1,82 +0,0 @@ ---- -meta: - title: Understanding the DeepSeek-R1-Distill-Llama-70B model - description: Deploy your own secure DeepSeek-R1-Distill-Llama-70B model with Scaleway Managed Inference. Privacy-focused, fully managed. -content: - h1: Understanding the DeepSeek-R1-Distill-Llama-70B model - paragraph: This page provides information on the DeepSeek-R1-Distill-Llama-70B model -tags: -dates: - validation: 2025-02-06 - posted: 2025-02-06 -categories: - - ai-data ---- - -## Model overview - -| Attribute | Details | -|-----------------|------------------------------------| -| Provider | [Deepseek](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B) | -| License | [MIT](https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/mit.md) | -| Compatible Instances | H100 (FP8), H100-2 (FP8, BF16) | -| Context Length | up to 131k tokens | - -## Model names - -```bash -deepseek/deepseek-r1-distill-llama-70b:bf16 -``` - -## Compatible Instances - -| Instance type | Max context length | -| ------------- |-------------| -| H100 | 15k (FP8) | -| H100-2 | 131k (FP8), 56k (BF16) | - -## Model introduction - -Released January 21, 2025, Deepseek’s R1 Distilled Llama 70B is a distilled version of the Llama model family based on Deepseek R1. -DeepSeek R1 Distill Llama 70B is designed to improve the performance of Llama models on reasoning use case such as mathematics and coding tasks. - -## Why is it useful? - -It is great to see Deepseek improving open(weight) models, and we are excited to fully support their mission with integration in the Scaleway ecosystem. - -- DeepSeek-R1-Distill-Llama was optimized to reach accuracy close to Deepseek-R1 in tasks like mathematics and coding, while keeping inference costs limited and tokens speed efficient. -- DeepSeek-R1-Distill-Llama supports a context window of up to 56K tokens and tool calling, keeping interaction with other components possible. - -## How to use it - -### Sending Managed Inference requests - -To perform inference tasks with your DeepSeek R1 Distill Llama deployed at Scaleway, use the following command: - -```bash -curl -s \ --H "Authorization: Bearer " \ --H "Content-Type: application/json" \ ---request POST \ ---url "https://.ifr.fr-par.scaleway.com/v1/chat/completions" \ ---data '{"model":"deepseek/deepseek-r1-distill-llama-70b:fp8", "messages":[{"role": "user","content": "There is a llama in my garden, what should I do?"}], "max_tokens": 500, "temperature": 0.7, "stream": false}' -``` - -Make sure to replace `` and `` with your actual [IAM API key](/iam/how-to/create-api-keys/) and the Deployment UUID you are targeting. - - - Ensure that the `messages` array is properly formatted with roles (user, assistant) and content. - - - - This model is better used without `system prompt`, as suggested by the model provider. - - -### Receiving inference responses - -Upon sending the HTTP request to the public or private endpoints exposed by the server, you will receive inference responses from the Managed Inference server. -Process the output data according to your application's needs. The response will contain the output generated by the LLM model based on the input provided in the request. 
- - - Despite efforts for accuracy, the possibility of generated text containing inaccuracies or [hallucinations](/managed-inference/concepts/#hallucinations) exists. Always verify the content generated independently. - diff --git a/pages/managed-inference/reference-content/deepseek-r1-distill-llama-8b.mdx b/pages/managed-inference/reference-content/deepseek-r1-distill-llama-8b.mdx deleted file mode 100644 index 5c6821e94a..0000000000 --- a/pages/managed-inference/reference-content/deepseek-r1-distill-llama-8b.mdx +++ /dev/null @@ -1,83 +0,0 @@ ---- -meta: - title: Understanding the DeepSeek-R1-Distill-Llama-8B model - description: Deploy your own secure DeepSeek-R1-Distill-Llama-8B model with Scaleway Managed Inference. Privacy-focused, fully managed. -content: - h1: Understanding the DeepSeek-R1-Distill-Llama-8B model - paragraph: This page provides information on the DeepSeek-R1-Distill-Llama-8B model -tags: -dates: - validation: 2025-02-06 - posted: 2025-02-06 -categories: - - ai-data ---- - -## Model overview - -| Attribute | Details | -|-----------------|------------------------------------| -| Provider | [Deepseek](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B) | -| License | [MIT](https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/mit.md) | -| Compatible Instances | L4, L40S, H100 (FP8, BF16) | -| Context Length | up to 131k tokens | - -## Model names - -```bash -deepseek/deepseek-r1-distill-llama-8b:bf16 -``` - -## Compatible Instances - -| Instance type | Max context length | -| ------------- |-------------| -| L4 | 90k (FP8), 39k (BF16) | -| L40S | 131k (FP8, BF16) | -| H100 | 131k (FP8, BF16) | - -## Model introduction - -Released January 21, 2025, Deepseek’s R1 Distilled Llama 8B is a distilled version of the Llama model family based on Deepseek R1. -DeepSeek R1 Distill Llama 8B is designed to improve the performance of Llama models on reasoning use cases such as mathematics and coding tasks. - -## Why is it useful? - -It is great to see Deepseek improving open(weight) models, and we are excited to fully support their mission with integration in the Scaleway ecosystem. - -- DeepSeek-R1-Distill-Llama was optimized to reach accuracy close to Deepseek-R1 in tasks like mathematics and coding, while keeping inference costs limited and tokens speed efficient. -- DeepSeek-R1-Distill-Llama supports a context window of up to 131K tokens and tool calling, keeping interaction with other components possible. - -## How to use it - -### Sending Managed Inference requests - -To perform inference tasks with your DeepSeek R1 Distill Llama deployed at Scaleway, use the following command: - -```bash -curl -s \ --H "Authorization: Bearer " \ --H "Content-Type: application/json" \ ---request POST \ ---url "https://.ifr.fr-par.scaleway.com/v1/chat/completions" \ ---data '{"model":"deepseek/deepseek-r1-distill-llama-8b:fp8", "messages":[{"role": "user","content": "There is a llama in my garden, what should I do?"}], "max_tokens": 500, "temperature": 0.7, "stream": false}' -``` - -Make sure to replace `` and `` with your actual [IAM API key](/iam/how-to/create-api-keys/) and the Deployment UUID you are targeting. - - - Ensure that the `messages` array is properly formatted with roles (user, assistant) and content. - - - - This model is better used without `system prompt`, as suggested by the model provider. 
- - -### Receiving inference responses - -Upon sending the HTTP request to the public or private endpoints exposed by the server, you will receive inference responses from the managed Managed Inference server. -Process the output data according to your application's needs. The response will contain the output generated by the LLM model based on the input provided in the request. - - - Despite efforts for accuracy, the possibility of generated text containing inaccuracies or [hallucinations](/managed-inference/concepts/#hallucinations) exists. Always verify the content generated independently. - diff --git a/pages/managed-inference/reference-content/llama-3-70b-instruct.mdx b/pages/managed-inference/reference-content/llama-3-70b-instruct.mdx deleted file mode 100644 index a0c5c9ce68..0000000000 --- a/pages/managed-inference/reference-content/llama-3-70b-instruct.mdx +++ /dev/null @@ -1,86 +0,0 @@ ---- -meta: - title: Understanding the Llama-3-70b-instruct model - description: Deploy your own secure Llama-3-70b-instruct model with Scaleway Managed Inference. Privacy-focused, fully managed. -content: - h1: Understanding the Llama-3-70b-instruct model - paragraph: This page provides information on the Llama-3-70b-instruct model -tags: -dates: - validation: 2024-12-03 - posted: 2024-05-28 -categories: - - ai-data ---- - -## Model overview - -| Attribute | Details | -|-----------------|------------------------------------| -| Provider | [Meta](https://llama.meta.com/llama3/) | -| Compatible Instances | H100, H100-2 (FP8) | -| Context size | 8192 tokens | - -## Model names - -```bash -meta/llama-3-70b-instruct:fp8 -``` - -## Compatible Instances - -- [H100 (FP8)](https://www.scaleway.com/en/h100-pcie-try-it-now/) -- H100-2 (FP8) - -## Model introduction - -Meta’s Llama 3 is an iteration of the open-access Llama family. -Llama 3 was designed to match the best proprietary models, enhanced by community feedback for greater utility and responsibly spearheading the deployment of LLMs. -With a commitment to open-source principles, this release marks the beginning of a multilingual, multimodal future for Llama 3, pushing the boundaries in reasoning and coding capabilities. - -## Why is it useful? - -We are dedicated to supporting Meta's commitment to open(weight) AI and its mission, through integration into the Scaleway ecosystem. -Llama 3 marks a significant advancement over Llama 2 and other available models due to several enhancements: - -Llama-3-70b-instruct offers seamless integration with chat applications and customer service platforms, facilitating smooth communication between businesses and their customers. -Its robust performance in natural language understanding, enhanced by superior common sense reasoning, enriches user experience and boosts customer satisfaction. - -In particular, this model: -- Offers a doubled context length of 8,192 tokens, compared to its predecessor. -- Uses a more extensive token vocabulary, featuring 128,000 tokens, allowing for more efficient language encoding. -- Demonstrates a reduction in false "refusals" by less than one-third compared to Llama 2. 
- -## How to use it - -### Sending Managed Inference requests - -To perform inference tasks with your Llama-3 deployed at Scaleway, use the following command: - -```bash -curl -s \ --H "Authorization: Bearer " \ --H "Content-Type: application/json" \ ---request POST \ ---url "https://.ifr.fr-par.scaleway.com/v1/chat/completions" \ ---data '{"model":"meta/llama-3-70b-instruct:fp8", "messages":[{"role": "user","content": "Sing me a song about Xavier Niel"}], "max_tokens": 500, "top_p": 1, "temperature": 0.7, "stream": false}' -``` - -Make sure to replace `` and `` with your actual [IAM API key](/iam/how-to/create-api-keys/) and the Deployment UUID you are targeting. - - - The model name allows Scaleway to put your prompts in the expected format. - - - - Ensure that the `messages` array is properly formatted with roles (system, user, assistant) and content. - - -### Receiving Inference responses - -Upon sending the HTTP request to the public or private endpoints exposed by the server, you will receive inference responses from the managed Managed Inference server. -Process the output data according to your application's needs. The response will contain the output generated by the LLM model based on the input provided in the request. - - - Despite efforts for accuracy, the possibility of generated text containing inaccuracies or [hallucinations](/managed-inference/concepts/#hallucinations) exists. Always verify the content generated independently. - diff --git a/pages/managed-inference/reference-content/llama-3-8b-instruct.mdx b/pages/managed-inference/reference-content/llama-3-8b-instruct.mdx deleted file mode 100644 index 6d8a398cf9..0000000000 --- a/pages/managed-inference/reference-content/llama-3-8b-instruct.mdx +++ /dev/null @@ -1,89 +0,0 @@ ---- -meta: - title: Understanding the Llama-3-8b-instruct model - description: Deploy your own secure Llama-3-8b-instruct model with Scaleway Managed Inference. Privacy-focused, fully managed. -content: - h1: Understanding the Llama-3-8b-instruct model - paragraph: This page provides information on the Llama-3-8b-instruct model -tags: -dates: - validation: 2025-04-07 - posted: 2024-09-10 -categories: - - ai-data ---- - -## Model overview - -| Attribute | Details | -|-----------------|------------------------------------| -| Provider | [Meta](https://llama.meta.com/llama3/) | -| Compatible Instances | L4, L40S, H100, H100-2 (FP8, BF16) | -| Context size | 8192 tokens | - -## Model names - -```bash -meta/llama-3-8b-instruct:bf16 -meta/llama-3-8b-instruct:fp8 -``` - -## Compatible Instances - -| Instance type | Max context length | -| ------------- |-------------| -| L4 | 8192 (FP8, BF16) | -| L40S | 8192 (FP8, BF16) | -| H100 | 8192 (FP8, BF16) | -| H100-2 | 8192 (FP8, BF16) | - -## Model introduction - -Meta’s Llama 3 is an iteration of the open-access Llama family. -Llama 3 was designed to match the best proprietary models, enhanced by community feedback for greater utility and responsibly spearheading the deployment of LLMs. -With a commitment to open-source principles, this release marks the beginning of a multilingual, multimodal future for Llama 3, pushing the boundaries in reasoning and coding capabilities. - -## Why is it useful? - -It is great to see Meta continuing its commitment to open(weight) AI, and we are excited to fully support their mission with integration in the Scaleway ecosystem. 
- -Llama 3 marks a significant advancement over Llama 2 and other available models due to several enhancements: - -- It was trained on a dataset that is seven times larger than that used for Llama 2. -- It offers a doubled context length of 8,192 tokens, compared to its predecessor. -- The model uses a more extensive token vocabulary, featuring 128,000 tokens, allowing for more efficient language encoding. -- It demonstrates a reduction in false "refusals" by less than one-third compared to Llama 2. - -## How to use it - -### Sending Managed Inference requests - -To perform inference tasks with your Llama-3 deployed at Scaleway, use the following command: - -```bash -curl -s \ --H "Authorization: Bearer " \ --H "Content-Type: application/json" \ ---request POST \ ---url "https://.ifr.fr-par.scaleway.com/v1/chat/completions" \ ---data '{"model":"meta/llama-3-8b-instruct:fp8", "messages":[{"role": "user","content": "There is a llama in my garden, what should I do?"}], "max_tokens": 500, "top_p": 1, "temperature": 0.7, "stream": false}' -``` - -Make sure to replace `` and `` with your actual [IAM API key](/iam/how-to/create-api-keys/) and the Deployment UUID you are targeting. - - - The model name allows Scaleway to put your prompts in the expected format. - - - - Ensure that the `messages` array is properly formatted with roles (system, user, assistant) and content. - - -### Receiving Inference responses - -Upon sending the HTTP request to the public or private endpoints exposed by the server, you will receive inference responses from the managed Managed Inference server. -Process the output data according to your application's needs. The response will contain the output generated by the LLM model based on the input provided in the request. - - - Despite efforts for accuracy, the possibility of generated text containing inaccuracies or [hallucinations](/managed-inference/concepts/#hallucinations) exists. Always verify the content generated independently. - diff --git a/pages/managed-inference/reference-content/llama-3.1-70b-instruct.mdx b/pages/managed-inference/reference-content/llama-3.1-70b-instruct.mdx deleted file mode 100644 index abc1f7df7c..0000000000 --- a/pages/managed-inference/reference-content/llama-3.1-70b-instruct.mdx +++ /dev/null @@ -1,84 +0,0 @@ ---- -meta: - title: Understanding the Llama-3.1-70b-instruct model - description: Deploy your own secure Llama-3.1-70b-instruct model with Scaleway Managed Inference. Privacy-focused, fully managed. -content: - h1: Understanding the Llama-3.1-70b-instruct model - paragraph: This page provides information on the Llama-3.1-70b-instruct model -tags: -dates: - validation: 2025-03-03 - posted: 2024-08-31 -categories: - - ai-data ---- - -## Model overview - -| Attribute | Details | -|-----------------|------------------------------------| -| Provider | [Meta](https://llama.meta.com/llama3/) | -| License | [Llama 3.1 community](https://llama.meta.com/llama3_1/license/) | | -| Compatible Instances | H100 (FP8), H100-2 (FP8, BF16) | -| Context Length | up to 128k tokens | - -## Model names - -```bash -meta/llama-3.1-70b-instruct:fp8 -meta/llama-3.1-70b-instruct:bf16 -``` - -## Compatible Instances - -| Instance type | Max context length | -| ------------- |-------------| -| H100 | 17k (FP8) | -| H100-2 | 128k (FP8), 70k (BF16) | - -## Model introduction - -Released July 23, 2024, Meta’s Llama 3.1 is an iteration of the open-access Llama family. 
-Llama 3.1 was designed to match the best proprietary models, outperform many of the available open source on common industry benchmarks. - -## Why is it useful? - -It is great to see Meta continuing its commitment to open(weight) AI, and we are excited to fully support their mission with integration in the Scaleway ecosystem. - -- Llama 3.1 was optimized for multilingual dialogue use cases, with many supported languages: English, German, French, Italian, Portuguese, Hindi, Spanish, and Thai. -- Llama 3.1 brings a context window up to 128K tokens, a sharp increase compared to its predecessor (Llama 3 was 8192). -- Llama 3.1 supports tool calling, enabling the model to answer a given prompt using tool(s) it knows about, making it possible to interact with the outside world. - -## How to use it - -### Sending Managed Inference requests - -To perform inference tasks with your Llama-3.1 deployed at Scaleway, use the following command: - -```bash -curl -s \ --H "Authorization: Bearer " \ --H "Content-Type: application/json" \ ---request POST \ ---url "https://.ifr.fr-par.scaleway.com/v1/chat/completions" \ ---data '{"model":"meta/llama-3.1-70b-instruct:fp8", "messages":[{"role": "user","content": "There is a llama in my garden, what should I do?"}], "max_tokens": 500, "temperature": 0.7, "stream": false}' -``` - -Make sure to replace `` and `` with your actual [IAM API key](/iam/how-to/create-api-keys/) and the Deployment UUID you are targeting. - - - The model name allows Scaleway to put your prompts in the expected format. - - - - Ensure that the `messages` array is properly formatted with roles (system, user, assistant) and content. - - -### Receiving Inference responses - -Upon sending the HTTP request to the public or private endpoints exposed by the server, you will receive inference responses from the managed Managed Inference server. -Process the output data according to your application's needs. The response will contain the output generated by the LLM model based on the input provided in the request. - - - Despite efforts for accuracy, the possibility of generated text containing inaccuracies or [hallucinations](/managed-inference/concepts/#hallucinations) exists. Always verify the content generated independently. - \ No newline at end of file diff --git a/pages/managed-inference/reference-content/llama-3.1-8b-instruct.mdx b/pages/managed-inference/reference-content/llama-3.1-8b-instruct.mdx deleted file mode 100644 index 42894b1640..0000000000 --- a/pages/managed-inference/reference-content/llama-3.1-8b-instruct.mdx +++ /dev/null @@ -1,86 +0,0 @@ ---- -meta: - title: Understanding the Llama-3.1-8b-instruct model - description: Deploy your own secure Llama-3.1-8b-instruct model with Scaleway Managed Inference. Privacy-focused, fully managed. 
-content: - h1: Understanding the Llama-3.1-8b-instruct model - paragraph: This page provides information on the Llama-3.1-8b-instruct model -tags: -dates: - validation: 2025-04-01 - posted: 2024-08-31 -categories: - - ai-data ---- - -## Model overview - -| Attribute | Details | -|-----------------|------------------------------------| -| Provider | [Meta](https://llama.meta.com/llama3/) | -| License | [Llama 3.1 community](https://llama.meta.com/llama3_1/license/) | -| Compatible Instances | L4, L40S, H100, H100-2 (FP8, BF16) | -| Context Length | up to 128k tokens | - -## Model names - -```bash -meta/llama-3.1-8b-instruct:fp8 -meta/llama-3.1-8b-instruct:bf16 -``` - -## Compatible Instances - -| Instance type | Max context length | -| ------------- |-------------| -| L4 | 96k (FP8), 27k (BF16) | -| L40S | 128k (FP8, BF16) | -| H100 | 128k (FP8, BF16) | -| H100-2 | 128k (FP8, BF16) | - -## Model introduction - -Released July 23, 2024, Meta’s Llama 3.1 is an iteration of the open-access Llama family. -Llama 3.1 was designed to match the best proprietary models, outperform many of the available open source on common industry benchmarks. - -## Why is it useful? - -It is great to see Meta continuing its commitment to open(weight) AI, and we are excited to fully support their mission with integration in the Scaleway ecosystem. - -- Llama 3.1 was optimized for multilingual dialogue use cases, with many supported languages: English, German, French, Italian, Portuguese, Hindi, Spanish, and Thai. -- Llama 3.1 brings a context window up to 128K tokens, a sharp increase compared to its predecessor (Llama 3 was 8192). -- Llama 3.1 supports tool calling, enabling the model to answer a given prompt using tool(s) it knows about, making it possible to interact with the outside world. - -## How to use it - -### Sending Managed Inference requests - -To perform inference tasks with your Llama-3.1 deployed at Scaleway, use the following command: - -```bash -curl -s \ --H "Authorization: Bearer " \ --H "Content-Type: application/json" \ ---request POST \ ---url "https://.ifr.fr-par.scaleway.com/v1/chat/completions" \ ---data '{"model":"meta/llama-3.1-8b-instruct:fp8", "messages":[{"role": "user","content": "There is a llama in my garden, what should I do?"}], "max_tokens": 500, "temperature": 0.7, "stream": false}' -``` - -Make sure to replace `` and `` with your actual [IAM API key](/iam/how-to/create-api-keys/) and the Deployment UUID you are targeting. - - - The model name allows Scaleway to put your prompts in the expected format. - - - - Ensure that the `messages` array is properly formatted with roles (system, user, assistant) and content. - - -### Receiving Inference responses - -Upon sending the HTTP request to the public or private endpoints exposed by the server, you will receive inference responses from the managed Managed Inference server. -Process the output data according to your application's needs. The response will contain the output generated by the LLM model based on the input provided in the request. - - - Despite efforts for accuracy, the possibility of generated text containing inaccuracies or [hallucinations](/managed-inference/concepts/#hallucinations) exists. Always verify the content generated independently. 
- diff --git a/pages/managed-inference/reference-content/llama-3.1-nemotron-70b-instruct.mdx b/pages/managed-inference/reference-content/llama-3.1-nemotron-70b-instruct.mdx deleted file mode 100644 index c2528ef456..0000000000 --- a/pages/managed-inference/reference-content/llama-3.1-nemotron-70b-instruct.mdx +++ /dev/null @@ -1,81 +0,0 @@ ---- -meta: - title: Understanding the Llama-3.1-Nemotron-70b-instruct model - description: Deploy your own secure Llama-3.1-Nemotron-70b-instruct model with Scaleway Managed Inference. Privacy-focused, fully managed. -content: - h1: Understanding the Llama-3.1-Nemotron-70b-instruct model - paragraph: This page provides information on the Llama-3.1-Nemotron-70b-instruct model -tags: -dates: - validation: 2024-11-15 - posted: 2024-11-15 -categories: - - ai-data ---- - -## Model overview - -| Attribute | Details | -|-----------------|------------------------------------| -| Provider | [Nvidia](https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Instruct) | -| License | [Llama 3.1 community](https://llama.meta.com/llama3_1/license/) | | -| Compatible Instances | H100 (FP8), H100-2 (FP8) | -| Context Length | up to 128k tokens | - -## Model names - -```bash -meta/llama-3.1-nemotron-70b-instruct:fp8 -``` - -## Compatible Instances - -| Instance type | Max context length | -| ------------- |-------------| -| H100 | 16k (FP8) | -| H100-2 | 128k (FP8) | - -## Model introduction - -Introduced October 14, 2024, NVIDIA's Nemotron 70B Instruct is a specialized version of the Llama 3.1 model designed to follow complex instructions. -NVIDIA employed Reinforcement Learning from Human Feedback (RLHF) to fine-tune the model’s ability to generate relevant and informative responses. - -## Why is it useful? - -- As of October 2024, Llama 3.1 Nemotron 70B has achieved top rankings in multiple automatic alignment benchmarks. It boasts an overall score of 94.1 on RewardBench, with specific scores of 97.5 for chat performance and 98.1 in reasoning tasks. -- Just like with the original Llama 3.1, this model brings a context window up to 128K tokens and [supports tool calling](/managed-inference/reference-content/function-calling-support). -- With 70 billion parameters, this model is highly capable of generating sophisticated, human-like responses in a wide range of applications, from casual chatbots to complex technical systems. - -## How to use it - -### Sending Managed Inference requests - -To perform inference tasks with your Llama-3.1-Nemotron-70b-instruct deployed at Scaleway, use the following command: - -```bash -curl -s \ --H "Authorization: Bearer " \ --H "Content-Type: application/json" \ ---request POST \ ---url "https://.ifr.fr-par.scaleway.com/v1/chat/completions" \ ---data '{"model":"meta/llama-3.1-nemotron-70b-instruct:fp8", "messages":[{"role": "user","content": "There is a llama in my garden, what should I do?"}], "max_tokens": 500, "temperature": 0.7, "stream": false}' -``` - -Make sure to replace `` and `` with your actual [IAM API key](/iam/how-to/create-api-keys/) and the Deployment UUID you are targeting. - - - The model name allows Scaleway to put your prompts in the expected format. - - - - Ensure that the `messages` array is properly formatted with roles (system, user, assistant) and content. - - -### Receiving Inference responses - -Upon sending the HTTP request to the public or private endpoints exposed by the server, you will receive inference responses from the managed Managed Inference server. 
-Process the output data according to your application's needs. The response will contain the output generated by the LLM model based on the input provided in the request. - - - Despite efforts for accuracy, the possibility of generated text containing inaccuracies or [hallucinations](/managed-inference/concepts/#hallucinations) exists. Always verify the content generated independently. - \ No newline at end of file diff --git a/pages/managed-inference/reference-content/llama-3.3-70b-instruct.mdx b/pages/managed-inference/reference-content/llama-3.3-70b-instruct.mdx deleted file mode 100644 index d129843c39..0000000000 --- a/pages/managed-inference/reference-content/llama-3.3-70b-instruct.mdx +++ /dev/null @@ -1,80 +0,0 @@ ---- -meta: - title: Understanding the Llama-3.3-70b-instruct model - description: Deploy your own secure Llama-3.3-70b-instruct model with Scaleway Managed Inference. Privacy-focused, fully managed. -content: - h1: Understanding the Llama-3.3-70b-instruct model - paragraph: This page provides information on the Llama-3.3-70b-instruct model -tags: -dates: - validation: 2024-12-12 - posted: 2024-12-12 -categories: - - ai-data ---- - -## Model overview - -| Attribute | Details | -|-----------------|------------------------------------| -| Provider | [Meta](https://www.llama.com/) | -| License | [Llama 3.3 community](https://www.llama.com/llama3_3/license/) | -| Compatible Instances | H100 (FP8), H100-2 (FP8, BF16) | -| Context length | Up to 131k tokens | - -## Model names - -```bash -meta/llama-3.3-70b-instruct:bf16 -``` - -## Compatible Instances - -| Instance type | Max context length | -| ------------- |-------------| -| H100 | 15k (FP8) | -| H100-2 | 131k (FP8), 62k (BF16) | - -## Model introduction - -Released December 6, 2024, Meta’s Llama 3.3 70b is a fine-tune of the [Llama 3.1 70b](/managed-inference/reference-content/llama-3.1-70b-instruct/) model. -This model is still text-only (text in/text out). However, Llama 3.3 was designed to approach the performance of Llama 3.1 405B on some applications. - -## Why is it useful? - -- Llama 3.3 uses the same prompt format as Llama 3.1. Prompts written for Llama 3.1 work unchanged with Llama 3.3. -- Llama 3.3 supports 7 languages in addition to English: French, German, Hindi, Italian, Portuguese, Spanish, and Thai. - -## How to use it - -### Sending Managed Inference requests - -To perform inference tasks with your Llama-3.3 deployed at Scaleway, use the following command: - -```bash -curl -s \ --H "Authorization: Bearer " \ --H "Content-Type: application/json" \ ---request POST \ ---url "https://.ifr.fr-par.scaleway.com/v1/chat/completions" \ ---data '{"model":"meta/llama-3.3-70b-instruct:bf16", "messages":[{"role": "user","content": "There is a llama in my garden, what should I do?"}], "max_tokens": 500, "temperature": 0.7, "stream": false}' -``` - -Make sure to replace `` and `` with your actual [IAM API key](/iam/how-to/create-api-keys/) and the Deployment UUID you are targeting. - - - The model name allows Scaleway to put your prompts in the expected format. - - - - Ensure that the `messages` array is properly formatted with roles (system, user, assistant) and content. - - -### Receiving Inference responses - -Upon sending the HTTP request to the public or private endpoints exposed by the server, you will receive inference responses from the managed Managed Inference server. -Process the output data according to your application's needs. 
The response will contain the output generated by the LLM model based on the input provided in the request. - - - Despite efforts for accuracy, the possibility of generated text containing inaccuracies or [hallucinations](/managed-inference/concepts/#hallucinations) exists. Always verify the content generated independently. - diff --git a/pages/managed-inference/reference-content/mistral-7b-instruct-v0.3.mdx b/pages/managed-inference/reference-content/mistral-7b-instruct-v0.3.mdx deleted file mode 100644 index be1d2d4f2d..0000000000 --- a/pages/managed-inference/reference-content/mistral-7b-instruct-v0.3.mdx +++ /dev/null @@ -1,81 +0,0 @@ ---- -meta: - title: Understanding the Mistral-7b-instruct-v0.3 model - description: Deploy your own secure Mistral-7b-instruct-v0.3 model with Scaleway Managed Inference. Privacy-focused, fully managed. -content: - h1: Understanding the Mistral-7b-instruct-v0.3 model - paragraph: This page provides information on the Mistral-7b-instruct-v0.3 model -tags: -dates: - validation: 2025-01-02 - posted: 2024-06-26 -categories: - - ai-data ---- - -## Model overview - -| Attribute | Details | -|-----------------|------------------------------------| -| Provider | [Mistral](https://mistral.ai/technology/#models) | -| Compatible Instances | L4, L40S, H100, H100-2 (BF16) | -| Context size | 32K tokens | - -## Model name - -```bash -mistral/mistral-7b-instruct-v0.3:bf16 -``` - -## Compatible Instances - -| Instance type | Max context length | -| ------------- |-------------| -| L4 | 32k (BF16) | -| L40S | 32k (BF16) | -| H100 | 32k (BF16) | -| H100-2 | 32k (BF16) | - -## Model introduction - -The first dense model released by Mistral AI, perfect for experimentation, customization, and quick iteration. At the time of the release, it matched the capabilities of models up to 30B parameters. -This model is open-weight and distributed under the Apache 2.0 license. - -## Why is it useful? - -Mistral-7B-Instruct-v0.3 is the smallest and latest Large Language Model (LLM) from Mistral AI, providing a 32k context window and support for function calling. -It does not have any moderation mechanisms to finely respect guardrails. Use with caution for deployments in environments requiring moderated outputs. - -## How to use it - -### Sending Inference requests - -To perform inference tasks with your Mistral model deployed at Scaleway, use the following command: - -```bash -curl -s \ --H "Authorization: Bearer " \ --H "Content-Type: application/json" \ ---request POST \ ---url "https://.ifr.fr-par.scaleway.com/v1/chat/completions" \ ---data '{"model":"mistral/mistral-7b-instruct-v0.3:bf16", "messages":[{"role": "user","content": "Explain Public Cloud in a nutshell."}], "top_p": 1, "temperature": 0.7, "stream": false}' -``` - -Make sure to replace `` and `` with your actual [IAM API key](/iam/how-to/create-api-keys/) and the Deployment UUID you are targeting. - - - The model name allows Scaleway to put your prompts in the expected format. - - - - Ensure that the `messages` array is properly formatted with roles (system, user, assistant) and content. - - -### Receiving Managed Inference responses - -Upon sending the HTTP request to the public or private endpoints exposed by the server, you will receive inference responses from the managed Managed Inference server. -Process the output data according to your application's needs. The response will contain the output generated by the LLM model based on the input provided in the request. 
- - - Despite efforts for accuracy, the possibility of generated text containing inaccuracies or [hallucinations](/managed-inference/concepts/#hallucinations) exists. Always verify the content generated independently. - diff --git a/pages/managed-inference/reference-content/mistral-nemo-instruct-2407.mdx b/pages/managed-inference/reference-content/mistral-nemo-instruct-2407.mdx deleted file mode 100644 index 79859b4b07..0000000000 --- a/pages/managed-inference/reference-content/mistral-nemo-instruct-2407.mdx +++ /dev/null @@ -1,86 +0,0 @@ ---- -meta: - title: Understanding the Mistral-nemo-instruct-2407 model - description: Deploy your own secure Mistral-nemo-instruct-2407 model with Scaleway Managed Inference. Privacy-focused, fully managed. -content: - h1: Understanding the Mistral-nemo-instruct-2407 model - paragraph: This page provides information on the Mistral-nemo-instruct-2407 model -tags: -dates: - validation: 2025-02-24 - posted: 2024-08-20 -categories: - - ai-data ---- - -## Model overview - -| Attribute | Details | -|-----------------|------------------------------------| -| Provider | [Mistral](https://mistral.ai/technology/#models) | -| Compatible Instances | L40S, H100, H100-2 (FP8) | -| Context size | 128K tokens | - -## Model name - -```bash -mistral/mistral-nemo-instruct-2407:fp8 -``` - -## Compatible Instances - -| Instance type | Max context length | -| ------------- |-------------| -| L40 | 128k (FP8) | -| H100 | 128k (FP8) | -| H100-2 | 128k (FP8) | - -## Model introduction - -Mistral Nemo is a state-of-the-art transformer model of 12B parameters, built by Mistral in collaboration with NVIDIA. -This model is open-weight and distributed under the Apache 2.0 license. -It was trained on a large proportion of multilingual and code data. - -## Why is it useful? - -- Mistral Nemo offers a very large context window of up to 128k tokens, particularly useful for RAG applications. -- It is easy to use and a drop-in replacement in any system already using Mistral 7B. -- This model was designed for global, multilingual applications. It is particularly strong in English, French, German, Spanish, Italian, Portuguese, Chinese, Japanese, Korean, Arabic, and Hindi. - -## How to use it - -### Sending Inference requests - - - Unlike previous Mistral models, Mistral Nemo requires smaller temperatures. It is recommend to use a temperature of 0.35. - - -To perform inference tasks with your Mistral model deployed at Scaleway, use the following command: - -```bash -curl -s \ --H "Authorization: Bearer " \ --H "Content-Type: application/json" \ ---request POST \ ---url "https://.ifr.fr-par.scaleway.com/v1/chat/completions" \ ---data '{"model":"mistral/mistral-nemo-instruct-2407:fp8", "messages":[{"role": "user","content": "Sing me a song about Xavier Niel"}], "top_p": 1, "temperature": 0.35, "stream": false}' -``` - -Make sure to replace `` and `` with your actual [IAM API key](/iam/how-to/create-api-keys/) and the Deployment UUID you are targeting. - - - The model name allows Scaleway to put your prompts in the expected format. - - - - Ensure that the `messages` array is properly formatted with roles (system, user, assistant) and content. - - -### Receiving Managed Inference responses - -Upon sending the HTTP request to the public or private endpoints exposed by the server, you will receive inference responses from the managed Managed Inference server. -Process the output data according to your application's needs. 
The response will contain the output generated by the LLM model based on the input provided in the request. - - - Despite efforts for accuracy, the possibility of generated text containing inaccuracies or [hallucinations](/managed-inference/concepts/#hallucinations) exists. Always verify the content generated independently. - diff --git a/pages/managed-inference/reference-content/mistral-small-24b-instruct-2501.mdx b/pages/managed-inference/reference-content/mistral-small-24b-instruct-2501.mdx deleted file mode 100644 index 9b19c023d5..0000000000 --- a/pages/managed-inference/reference-content/mistral-small-24b-instruct-2501.mdx +++ /dev/null @@ -1,77 +0,0 @@ ---- -meta: - title: Understanding the Mistral-small-24b-base-2501 model - description: Deploy your own secure Mistral-small-24b-base-2501 model with Scaleway Managed Inference. Privacy-focused, fully managed. -content: - h1: Understanding the Mistral-small-24b-base-2501 model - paragraph: This page provides information on the Mistral-small-24b-base-2501 model -tags: -dates: - validation: 2025-03-04 - posted: 2025-03-04 -categories: - - ai-data ---- - -## Model overview - -| Attribute | Details | -|-----------------|------------------------------------| -| Provider | [Mistral](https://mistral.ai/technology/#models) | -| Compatible Instances | L40S, H100, H100-2 (FP8) | -| Context size | 32K tokens | - -## Model name - -```bash -mistral/mistral-small-24b-instruct-2501:fp8 -``` - -## Compatible Instances - -| Instance type | Max context length | -| ------------- |-------------| -| L40 | 20k (FP8) | -| H100 | 32k (FP8) | -| H100-2 | 32k (FP8) | - -## Model introduction - -Mistral Small 24B Instruct is a state-of-the-art transformer model of 24B parameters, built by Mistral. -This model is open-weight and distributed under the Apache 2.0 license. - -## Why is it useful? - -- Mistral Small 24B offers a large context window of up to 32k tokens and provide both conversational and reasoning capabilities. -- This model supports multiple languages, including English, French, German, Spanish, Italian, Chinese, Japanese, Korean, Portuguese, Dutch, and Polish. -- It supersedes Mistral Nemo Instruct, although its tokens throughput is slightly lower. - -## How to use it - -### Sending Inference requests - -To perform inference tasks with your Mistral model deployed at Scaleway, use the following command: - -```bash -curl -s \ --H "Authorization: Bearer " \ --H "Content-Type: application/json" \ ---request POST \ ---url "https://.ifr.fr-par.scaleway.com/v1/chat/completions" \ ---data '{"model":"mistral/mistral-small-24b-instruct-2501:fp8", "messages":[{"role": "user","content": "Tell me about Scaleway."}], "top_p": 1, "temperature": 0.7, "stream": false}' -``` - -Make sure to replace `` and `` with your actual [IAM API key](/iam/how-to/create-api-keys/) and the Deployment UUID you are targeting. - - - Ensure that the `messages` array is properly formatted with roles (system, user, assistant) and content. - - -### Receiving Managed Inference responses - -Upon sending the HTTP request to the public or private endpoints exposed by the server, you will receive inference responses from the managed Managed Inference server. -Process the output data according to your application's needs. The response will contain the output generated by the LLM model based on the input provided in the request. - - - Despite efforts for accuracy, the possibility of generated text containing inaccuracies or [hallucinations](/managed-inference/concepts/#hallucinations) exists. 
Always verify the content generated independently. - diff --git a/pages/managed-inference/reference-content/mixtral-8x7b-instruct-v0.1.mdx b/pages/managed-inference/reference-content/mixtral-8x7b-instruct-v0.1.mdx deleted file mode 100644 index 7f809e68f0..0000000000 --- a/pages/managed-inference/reference-content/mixtral-8x7b-instruct-v0.1.mdx +++ /dev/null @@ -1,80 +0,0 @@ ---- -meta: - title: Understanding the Mixtral-8x7b-instruct-v0.1 model - description: Deploy your own secure Mixtral-8x7b-Instruct model with Scaleway Managed Inference. Privacy-focused, fully managed. -content: - h1: Understanding the Mixtral-8x7b-instruct-v0.1 model - paragraph: This page provides information on the Mixtral-8x7b-instruct-v0.1 model -tags: -dates: - validation: 2025-03-19 - posted: 2024-05-28 -categories: - - ai-data ---- - -## Model overview - -| Attribute | Details | -|-----------------|------------------------------------| -| Provider | [Mistral](https://mistral.ai/technology/#models) | -| Compatible Instances | H100 (FP8) - H100-2 (BF16) | -| Context size | 32k tokens | - -## Model names - -```bash -mistral/mixtral-8x7b-instruct-v0.1:fp8 -mistral/mixtral-8x7b-instruct-v0.1:bf16 -``` - -## Compatible Instances - -| Instance type | Max context length | -| ------------- |-------------| -| H100 | 32k (FP8) -| H100-2 | 32k (BF16) - -## Model introduction - -Mixtral-8x7b-instruct-v0.1, developed by Mistral, is tailored for instructional platforms and virtual assistants. -Trained on vast instructional datasets, it provides clear and concise instructions across various domains, enhancing user learning experiences. - -## Why is it useful? - -Mixtral-8x7b-instruct-v0.1, trained on the [Nabuchodonosor supercomputer](https://www.scaleway.com/en/ai-supercomputers/), delivers high-quality instruction generation with exceptional performance. -This model excels in code generation and understanding multiple languages, making it an ideal choice for developing virtual assistants or educational platforms that require reliability and excellence. - -## How to use it - -### Sending Inference requests - -To perform inference tasks with your Mixtral model deployed at Scaleway, use the following command: - -```bash -curl -s \ --H "Authorization: Bearer " \ --H "Content-Type: application/json" \ ---request POST \ ---url "https://.ifr.fr-par.scaleway.com/v1/chat/completions" \ ---data '{"model":"mistral/mixtral-8x7b-instruct-v0.1:fp8", "messages":[{"role": "user","content": "Sing me a song about Scaleway"}], "max_tokens": 200, "top_p": 1, "temperature": 1, "stream": false}' -``` - -Make sure to replace `` and `` with your actual [IAM API key](/iam/how-to/create-api-keys/) and the Deployment UUID you are targeting. - - - The model name allows Scaleway to put your prompts in the expected format. - - - - Ensure that the `messages` array is properly formatted with roles (system, user, assistant) and content. - - -### Receiving Managed Inference responses - -Upon sending the HTTP request to the public or private endpoints exposed by the server, you will receive inference responses from the managed Managed Inference server. -Process the output data according to your application's needs. The response will contain the output generated by the LLM model based on the input provided in the request. - - - Despite efforts for accuracy, the possibility of generated text containing inaccuracies or [hallucinations](/managed-inference/concepts/#hallucinations) exists. Always verify the content generated independently. 
- diff --git a/pages/managed-inference/reference-content/model-catalog.mdx b/pages/managed-inference/reference-content/model-catalog.mdx index 3ea11e5d0e..c35c956098 100644 --- a/pages/managed-inference/reference-content/model-catalog.mdx +++ b/pages/managed-inference/reference-content/model-catalog.mdx @@ -33,7 +33,6 @@ A quick overview of available models in Scaleway's catalog and their core attrib | [`mixtral-8x7b-instruct-v0.1`](#mixtral-8x7b-instruct-v01) | Mistral | 32k | Text | H100-2 | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | | [`moshiko-0.1-8b`](#moshiko-01-8b) | Kyutai | 4k | Audio to Audio | L4, H100 | [CC-BY-4.0](https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/cc-by-4.0.md) | | [`moshika-0.1-8b`](#moshika-01-8b) | Kyutai | 4k | Audio to Audio| L4, H100 | [CC-BY-4.0](https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/cc-by-4.0.md) | -| [`wizardlm-70b-v1.0`](#wizardlm-70b-v10) | WizardLM | 4k | Text | H100, H100-2 | [Llama 2 community](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/main/LICENSE.txt) | | [`pixtral-12b-2409`](#pixtral-12b-2409) | Mistral | 128k | Text, Vision | L40S, H100, H100-2 | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | | [`molmo-72b-0924`](#molmo-72b-0924) | Allen AI | 50k | Text, Vision | H100-2 | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) and [Twonyi Qianwen license](https://huggingface.co/Qwen/Qwen2-72B/blob/main/LICENSE)| | [`qwen2.5-coder-32b-instruct`](#qwen25-coder-32b-instruct) | Qwen | 32k | Code | H100, H100-2 | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | @@ -58,7 +57,6 @@ A quick overview of available models in Scaleway's catalog and their core attrib | `mixtral-8x7b-instruct-v0.1` | Yes | Yes | English, French, German, Italian, Spanish | | `moshiko-0.1-8b` | No | No | English | | `moshika-0.1-8b` | No | No | English | -| `wizardLM-70b-v1.0` | Yes | No | English | | `pixtral-12b-2409` | Yes | Yes | English | | `molmo-72b-0924` | Yes | No | English | | `qwen2.5-coder-32b-instruct` | Yes | Yes | English, French, Spanish, Portuguese, German, Italian, Russian, Chinese, Japanese, Korean, Vietnamese, Thai, Arabic and 16 additional languages. | @@ -249,16 +247,6 @@ kyutai/moshika-0.1-8b:bf16 kyutai/moshika-0.1-8b:fp8 ``` -### WizardLM-70B-V1.0 -WizardLM-70B-V1.0, developed by WizardLM, is specifically designed for content creation platforms and writing assistants. -With its extensive training in diverse textual data, WizardLM-70B-V1.0 generates high-quality content and assists writers in various creative and professional endeavors. - -#### Model names -``` -wizardlm/wizardlm-70b-v1.0:fp8 -wizardlm/wizardlm-70b-v1.0:fp16 -``` - ## Code models ### Qwen2.5-coder-32b-instruct diff --git a/pages/managed-inference/reference-content/molmo-72b-0924.mdx b/pages/managed-inference/reference-content/molmo-72b-0924.mdx deleted file mode 100644 index b07a5c6283..0000000000 --- a/pages/managed-inference/reference-content/molmo-72b-0924.mdx +++ /dev/null @@ -1,164 +0,0 @@ ---- -meta: - title: Understanding the Molmo-72b-0924 model - description: Deploy your own secure Molmo-72b-0924 model with Scaleway Managed Inference. Privacy-focused, fully managed. 
-content: - h1: Understanding the Molmo-72b-0924 model - paragraph: This page provides information on the Molmo-72b-0924 model -tags: ai molmo inference -dates: - validation: 2024-11-27 - posted: 2024-11-27 -categories: - - ai-data ---- - -## Model overview - -| Attribute | Details | -|-----------------|------------------------------------| -| Provider | [Allen Institute for AI](https://molmo.allenai.org/blog) | -| License | Apache 2.0 | | -| Compatible Instances | H100-2 (FP8) | -| Context size | 50k tokens | - -## Model name - -```bash -allenai/molmo-72b-0924:fp8 -``` - -## Compatible Instances - -| Instance type | Max context length | -| ------------- |-------------| -| H100-2 | 50k (FP8) - -## Model introduction - -Molmo 72B is the powerhouse of the Molmo family, multimodal models developed by the renowned research lab Allen Institute for AI. -Vision-language models like Molmo can analyze an image and offer insights from visual content alongside text. This multimodal functionality creates new opportunities for applications that need both visual and textual comprehension. - -Molmo is open-weight and distributed under the Apache 2.0 license. All artifacts (code, data set, evaluations) are also expected to be fully open-source. -Its base model is Qwen2-72B ([Twonyi Qianwen license](https://huggingface.co/Qwen/Qwen2-72B/blob/main/LICENSE)). - -## Why is it useful? - -- Molmo-72b allows you to process real world and high resolution images, unlocking capacities such as transcribing handwritten files or payment receipts, extracting information from graphs, captioning images, etc. -- This model achieves the [highest academic benchmark scores and ranks second on human evaluation](https://huggingface.co/allenai/Molmo-72B-0924#evaluations) at the time of writing (September 2024) - - - Molmo-72b can understand and analyze images, not generate them. You will use it through the /v1/chat/completions endpoint. - - -## How to use it - -### Sending inference requests - - - Unlike regular chat models, Molmo-72b can take an `image_url` in the content array. - - -To perform inference tasks with your Molmo-72b model deployed at Scaleway, use the following command: - -```bash -curl -s \ --H "Authorization: Bearer " \ --H "Content-Type: application/json" \ ---request POST \ ---url "https://.ifr.fr-par.scw.cloud/v1/chat/completions" \ ---data '{ - "model": "allenai/molmo-72b-0924:fp8", - "messages": [ - { - "role": "user", - "content": [ - {"type" : "text", "text": "Describe this image in detail please."}, - {"type": "image_url", "image_url": {"url": "https://picsum.photos/id/32/512/512"}} - ] - } - ], - "top_p": 1, - "temperature": 0.7, - "stream": false -}' -``` - -Make sure to replace `` and `` with your actual [IAM API key](/iam/how-to/create-api-keys/) and the Deployment UUID you are targeting. - - - The model name allows Scaleway to put your prompts in the expected format. - - -### Known limitations - -- Molmo-72b was reported to struggle with transparent images. The official recommandation is to add white or dark background to images for the time being. -- Molmo-72b chat template does not support the system role. Ensure that the `messages` array is properly formatted with user role and assistant role only. -- Molmo-72b is not able to generate structured outputs (`response_format` parameter not supported). -- Molmo-72b cannot do function calling (`tools` parameter not supported). 
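Because the chat template accepts no system role, instructions that would normally go into a system prompt have to be folded into the user turn. The sketch below illustrates this, assuming the OpenAI-compatible `/v1/chat/completions` route used in the curl example above; the endpoint, key, and image URL are placeholders.

```python
# Sketch: Molmo-72b request that respects the user/assistant-only chat template.
# Assumes the OpenAI-compatible endpoint shown above; values are placeholders.
from openai import OpenAI

client = OpenAI(
    base_url="https://<Deployment UUID>.ifr.fr-par.scw.cloud/v1",
    api_key="<IAM API key>",
)

response = client.chat.completions.create(
    model="allenai/molmo-72b-0924:fp8",
    messages=[
        {
            # No "system" role: put any instructions directly in the user turn.
            "role": "user",
            "content": [
                {"type": "text", "text": "You are a careful assistant. Describe this image in detail."},
                {"type": "image_url", "image_url": {"url": "https://picsum.photos/id/32/512/512"}},
            ],
        }
    ],
    temperature=0.7,
    # Do not pass `response_format` or `tools`: they are not supported by this model.
)

print(response.choices[0].message.content)
```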
- -### Passing images to Molmo-72b - -#### Image URLs -If the image is available online, you can just include the image URL in your request as demonstrated above. This approach is simple and does not require any encoding. - -#### Base64 encoded image -Base64 encoding is a standard way to transform binary data, like images, into a text format, making it easier to transmit over the internet. - -The following Python code sample shows you how to encode an image in base64 format and pass it to your request payload. - -```python -import base64 -from io import BytesIO -from PIL import Image - -def encode_image(img): - buffered = BytesIO() - img.save(buffered, format="JPEG") - encoded_string = base64.b64encode(buffered.getvalue()).decode("utf-8") - return encoded_string - -img = Image.open("path_to_your_image.jpg") -base64_img = encode_image(img) - -payload = { - "messages": [ - { - "role": "user", - "content": [ - {"type": "text", "text": "What is this image?"}, - { - "type": "image_url", - "image_url": {"url": f"data:image/jpeg;base64,{base64_img}"}, - }, - ], - } - ], - ... # other parameters -} - -``` - -### Receiving Managed Inference responses - -Upon sending the HTTP request to the public or private endpoints exposed by the server, you will receive inference responses from the managed Managed Inference server. -Process the output data according to your application's needs. The response will contain the output generated by the visual language model based on the input provided in the request. - - - Despite efforts for accuracy, the possibility of generated text containing inaccuracies or [hallucinations](/managed-inference/concepts/#hallucinations) exists. Always verify the content generated independently. - - -## Frequently Asked Questions - -#### What types of images are supported by Molmo-72b? -- Bitmap (or raster) image formats, meaning storing images as grids of individual pixels, are supported: PNG, JPEG, WEBP, and non-animated GIFs in particular. -- Vector image formats (SVG, PSD) are not supported. - -#### Are other file types supported? -Only bitmaps can be analyzed by Molmo. PDFs and videos are not supported. - -#### Is there a limit to the size of each image? -The only limitation is the context window (1 token for each 16x16 pixel). - -#### What is the maximum amount of images per conversation? -One conversation can handle a maximum of 1 image (per request). Sending more than one image will return a 400 error. \ No newline at end of file diff --git a/pages/managed-inference/reference-content/moshika-0.1-8b.mdx b/pages/managed-inference/reference-content/moshika-0.1-8b.mdx deleted file mode 100644 index 7cb82cd1e8..0000000000 --- a/pages/managed-inference/reference-content/moshika-0.1-8b.mdx +++ /dev/null @@ -1,87 +0,0 @@ ---- -meta: - title: Understanding the Moshika-0.1-8b model - description: Deploy your own secure Moshika-0.1-8b model with Scaleway Managed Inference. Privacy-focused, fully managed. 
-content: - h1: Understanding the Moshika-0.1-8b model - paragraph: This page provides information on the Moshika-0.1-8b model -tags: -dates: - validation: 2024-10-30 - posted: 2024-10-30 -categories: - - ai-data ---- - -## Model overview - -| Attribute | Details | -|-----------------|------------------------------------| -| Provider | [Kyutai](https://github.com/kyutai-labs/moshi) | -| Compatible Instances | L4, H100 (FP8, BF16) | -| Context size | 4096 tokens | - -## Model names - -```bash -kyutai/moshika-0.1-8b:bf16 -kyutai/moshika-0.1-8b:fp8 -``` - -## Compatible Instances - -| Instance type | Max context length | -| ------------- |-------------| -| L4 | 4096 (FP8, BF16) | -| H100 | 4096 (FP8, BF16) | - -## Model introduction - -Kyutai's Moshi is a speech-text foundation model for real-time dialogue. -Moshi is an experimental next-generation conversational model, designed to understand and respond fluidly and naturally to complex conversations, while providing unprecedented expressiveness and spontaneity. -While current systems for spoken dialogue rely on a pipeline of separate components, Moshi is the first real-time full-duplex spoken large language model. -Moshika is the variant of Moshi with a female voice in English. - -## Why is it useful? - -Moshi offers seamless real-time dialogue capabilities, enabling users to engage in natural conversations with the model. -It allows the modeling of arbitrary conversational dynamics, including overlapping speech, interruptions, interjections, and more. -In particular, this model: -- Processes 24 kHz audio down to a 12.5 Hz representation with a bandwith of 1.1 kbps, performing better than existing non-streaming models. -- Achieves a theoretical latency of 160 ms, with a practical latency of 200 ms, making it suitable for real-time applications. - -## How to use it - -To perform inference tasks with your Moshi deployed at Scaleway, a WebSocket API is exposed for real-time dialogue and is accessible at the following endpoint: - -```bash -wss://.ifr.fr-par.scaleway.com/api/chat -``` - -### Testing the WebSocket endpoint - -To test the endpoint, use the following command: - -```bash -curl -i --http1.1 \ --H "Authorization: Bearer " \ --H "Connection: Upgrade" \ --H "Upgrade: websocket" \ --H "Sec-WebSocket-Key: SGVsbG8sIHdvcmxkIQ==" \ --H "Sec-WebSocket-Version: 13" \ ---url "https://.ifr.fr-par.scaleway.com/api/chat" -``` - -Make sure to replace `` and `` with your actual [IAM API key](/iam/how-to/create-api-keys/) and the Deployment UUID you are targeting. - - - Authentication can be done using the `token` query parameter, which should be set to your IAM API key, if headers are not supported (e.g., in a browser). - - -The server should respond with a `101 Switching Protocols` status code, indicating that the connection has been successfully upgraded to a WebSocket connection. - -### Interacting with the model - -We provide code samples in various programming languages (Python, Rust, typescript) to interact with the model using the WebSocket API as well as a simple web interface. -Those code samples can be found in our [GitHub repository](https://github.com/scaleway/moshi-client-examples). -This repository contains instructions on how to run the code samples and interact with the model. 
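For a first connectivity check from code rather than curl, a minimal Python sketch could look like the following. It assumes the third-party `websockets` package and uses the `token` query parameter mentioned above; the endpoint and key are placeholders, and the actual audio message framing is defined by the client examples in the repository linked above.

```python
# Sketch: opening the real-time dialogue WebSocket from Python.
# Assumes the `websockets` package (pip install websockets); <Deployment UUID> and
# <IAM API key> are placeholders. Audio framing follows the linked client examples.
import asyncio
import websockets

ENDPOINT = "wss://<Deployment UUID>.ifr.fr-par.scaleway.com/api/chat"

async def main():
    # The `token` query parameter carries the IAM API key when headers cannot be set.
    async with websockets.connect(f"{ENDPOINT}?token=<IAM API key>"):
        print("Connected: the server accepted the WebSocket upgrade.")
        # Exchange audio frames here, as shown in the moshi-client-examples repository.

asyncio.run(main())
```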
\ No newline at end of file diff --git a/pages/managed-inference/reference-content/moshiko-0.1-8b.mdx b/pages/managed-inference/reference-content/moshiko-0.1-8b.mdx deleted file mode 100644 index 2796ca4150..0000000000 --- a/pages/managed-inference/reference-content/moshiko-0.1-8b.mdx +++ /dev/null @@ -1,86 +0,0 @@ ---- -meta: - title: Understanding the Moshiko-0.1-8b model - description: Deploy your own secure Moshiko-0.1-8b model with Scaleway Managed Inference. Privacy-focused, fully managed. -content: - h1: Understanding the Moshiko-0.1-8b model - paragraph: This page provides information on the Moshiko-0.1-8b model -tags: -dates: - validation: 2024-10-30 -categories: - - ai-data ---- - -## Model overview - -| Attribute | Details | -|-----------------|------------------------------------| -| Provider | [Kyutai](https://github.com/kyutai-labs/moshi) | -| Compatible Instances | L4, H100 (FP8, BF16) | -| Context size | 4096 tokens | - -## Model names - -```bash -kyutai/moshiko-0.1-8b:bf16 -kyutai/moshiko-0.1-8b:fp8 -``` - -## Compatible Instances - -| Instance type | Max context length | -| ------------- |-------------| -| L4 | 4096 (FP8, BF16) | -| H100 | 4096 (FP8, BF16) | - -## Model introduction - -Kyutai's Moshi is a speech-text foundation model for real-time dialogue. -Moshi is an experimental next-generation conversational model, designed to understand and respond fluidly and naturally to complex conversations, while providing unprecedented expressiveness and spontaneity. -While current systems for spoken dialogue rely on a pipeline of separate components, Moshi is the first real-time full-duplex spoken large language model. -Moshiko is the variant of Moshi with a male voice in English. - -## Why is it useful? - -Moshi offers seamless real-time dialogue capabilities, enabling users to engage in natural conversations with the model. -It allows the modeling of arbitrary conversational dynamics, including overlapping speech, interruptions, interjections, and more. -In particular, this model: -- Processes 24 kHz audio down to a 12.5 Hz representation with a bandwith of 1.1 kbps, performing better than existing non-streaming models. -- Achieves a theoretical latency of 160 ms, with a practical latency of 200 ms, making it suitable for real-time applications. - -## How to use it - -To perform inference tasks with your Moshi deployed at Scaleway, a WebSocket API is exposed for real-time dialogue and is accessible at the following endpoint: - -```bash -wss://.ifr.fr-par.scaleway.com/api/chat -``` - -### Testing the WebSocket endpoint - -To test the endpoint, use the following command: - -```bash -curl -i --http1.1 \ --H "Authorization: Bearer " \ --H "Connection: Upgrade" \ --H "Upgrade: websocket" \ --H "Sec-WebSocket-Key: SGVsbG8sIHdvcmxkIQ==" \ --H "Sec-WebSocket-Version: 13" \ ---url "https://.ifr.fr-par.scaleway.com/api/chat" -``` - -Make sure to replace `` and `` with your actual [IAM API key](/iam/how-to/create-api-keys/) and the Deployment UUID you are targeting. - - - Authentication can be done using the `token` query parameter, which should be set to your IAM API key, if headers are not supported (e.g., in a browser). - - -The server should respond with a `101 Switching Protocols` status code, indicating that the connection has been successfully upgraded to a WebSocket connection. 
- -### Interacting with the model - -We provide code samples in various programming languages (Python, Rust, typescript) to interact with the model using the WebSocket API as well as a simple web interface. -Those code samples can be found in our [GitHub repository](https://github.com/scaleway/moshi-client-examples). -This repository contains instructions on how to run the code samples and interact with the model. \ No newline at end of file diff --git a/pages/managed-inference/reference-content/pixtral-12b-2409.mdx b/pages/managed-inference/reference-content/pixtral-12b-2409.mdx deleted file mode 100644 index 2c90bd1cc1..0000000000 --- a/pages/managed-inference/reference-content/pixtral-12b-2409.mdx +++ /dev/null @@ -1,168 +0,0 @@ ---- -meta: - title: Understanding the Pixtral-12b-2409 model - description: Deploy your own secure Pixtral-12b-2409 model with Scaleway Managed Inference. Privacy-focused, fully managed. -content: - h1: Understanding the Pixtral-12b-2409 model - paragraph: This page provides information on the Pixtral-12b-2409 model -tags: -dates: - validation: 2025-04-01 - posted: 2024-09-23 -categories: - - ai-data ---- - -## Model overview - -| Attribute | Details | -|-----------------|------------------------------------| -| Provider | [Mistral](https://mistral.ai/technology/#models) | -| Compatible Instances | L40S, H100, H100-2 (bf16) | -| Context size | 128k tokens | - -## Model name - -```bash -mistral/pixtral-12b-2409:bf16 -``` - -## Compatible Instances - -| Instance type | Max context length | -| ------------- |-------------| -| L40S | 50k (BF16) -| H100 | 128k (BF16) -| H100-2 | 128k (BF16) - -## Model introduction - -Pixtral is a vision language model introducing a novel architecture: 12B parameter multimodal decoder plus 400M parameter vision encoder. -It can analyze images and offer insights from visual content alongside text. -This multimodal functionality creates new opportunities for applications that need both visual and textual comprehension. - -Pixtral is open-weight and distributed under the Apache 2.0 license. - -## Why is it useful? - -- Pixtral allows you to process real world and high resolution images, unlocking capacities such as transcribing handwritten files or payment receipts, extracting information from graphs, captioning images, etc. -- It offers large context window of up to 128k tokens, particularly useful for RAG applications -- Pixtral supports variable image sizes and types: PNG (.png), JPEG (.jpeg and .jpg), WEBP (.webp), as well as non-animated GIF with only one frame (.gif) - - - Pixtral 12B can understand and analyze images, not generate them. You will use it through the /v1/chat/completions endpoint. - - -## How to use it - -### Sending Inference requests - - - Unlike previous Mistral models, Pixtral can take an `image_url` in the content array. 
- - -To perform inference tasks with your Pixtral model deployed at Scaleway, use the following command: - -```bash -curl -s \ --H "Authorization: Bearer " \ --H "Content-Type: application/json" \ ---request POST \ ---url "https://.ifr.fr-par.scw.cloud/v1/chat/completions" \ ---data '{ - "model": "mistral/pixtral-12b-2409:bf16", - "messages": [ - { - "role": "user", - "content": [ - {"type" : "text", "text": "Describe this image in detail please."}, - {"type": "image_url", "image_url": {"url": "https://picsum.photos/id/32/512/512"}}, - {"type" : "text", "text": "and this one as well."}, - {"type": "image_url", "image_url": {"url": "https://www.wolframcloud.com/obj/resourcesystem/images/a0e/a0ee3983-46c6-4c92-b85d-059044639928/6af8cfb971db031b.png"}} - ] - } - ], - "top_p": 1, - "temperature": 0.7, - "stream": false -}' -``` - -Make sure to replace `` and `` with your actual [IAM API key](/iam/how-to/create-api-keys/) and the Deployment UUID you are targeting. - - - The model name allows Scaleway to put your prompts in the expected format. - - - - Ensure that the `messages` array is properly formatted with roles (system, user, assistant) and content. - - -### Passing images to Pixtral - -1. Image URLs -If the image is available online, you can just include the image URL in your request as demonstrated above. This approach is simple and does not require any encoding. - -2. Base64 encoded image -Base64 encoding is a standard way to transform binary data, like images, into a text format, making it easier to transmit over the internet. - -The following Python code sample shows you how to encode an image in base64 format and pass it to your request payload. - - -```python -import base64 -from io import BytesIO -from PIL import Image - -def encode_image(img): - buffered = BytesIO() - img.save(buffered, format="JPEG") - encoded_string = base64.b64encode(buffered.getvalue()).decode("utf-8") - return encoded_string - -img = Image.open("path_to_your_image.jpg") -base64_img = encode_image(img) - -payload = { - "messages": [ - { - "role": "user", - "content": [ - {"type": "text", "text": "What is this image?"}, - { - "type": "image_url", - "image_url": {"url": f"data:image/jpeg;base64,{base64_img}"}, - }, - ], - } - ], - ... # other parameters -} - -``` - -### Receiving Managed Inference responses - -Upon sending the HTTP request to the public or private endpoints exposed by the server, you will receive inference responses from the managed Managed Inference server. -Process the output data according to your application's needs. The response will contain the output generated by the visual language model based on the input provided in the request. - - - Despite efforts for accuracy, the possibility of generated text containing inaccuracies or [hallucinations](/managed-inference/concepts/#hallucinations) exists. Always verify the content generated independently. - - -## Frequently Asked Questions - -#### What types of images are supported by Pixtral? -- Bitmap (or raster) image formats, meaning storing images as grids of individual pixels, are supported: PNG, JPEG, WEBP, and non-animated GIFs in particular. -- Vector image formats (SVG, PSD) are not supported. - -#### Are other files supported? -Only bitmaps can be analyzed by Pixtral, PDFs and videos are not supported. - -#### Is there a limit to the size of each image? -Images size are limited: -- Directly by the maximum context window. 
As an example, since tokens are squares of 16x16 pixels, the maximum context window taken by a single image is `4096` tokens (ie. `(1024*1024)/(16*16)`) -- Indirectly by the model accuracy: resolution above 1024x1024 will not increase model output accuracy. Indeed, images above 1024 pixels width or height will be automatically downscaled to fit within 1024x1024 dimension. Note that image ratio and overall aspect is preserved (images are not cropped, only additionaly compressed). - -#### What is the maximum amount of images per conversation? -One conversation can handle up to 12 images (per request). The 13rd will return a 413 error. diff --git a/pages/managed-inference/reference-content/qwen2.5-coder-32b-instruct.mdx b/pages/managed-inference/reference-content/qwen2.5-coder-32b-instruct.mdx deleted file mode 100644 index 64e943e1cc..0000000000 --- a/pages/managed-inference/reference-content/qwen2.5-coder-32b-instruct.mdx +++ /dev/null @@ -1,78 +0,0 @@ ---- -meta: - title: Understanding the Qwen2.5-Coder-32B-Instruct model - description: Deploy your own secure Qwen2.5-Coder-32B-Instruct model with Scaleway Managed Inference. Privacy-focused, fully managed. -content: - h1: Understanding the Qwen2.5-Coder-32B-Instruct model - paragraph: This page provides information on the Qwen2.5-Coder-32B-Instruct model -tags: -dates: - validation: 2024-12-08 - posted: 2024-12-08 -categories: - - ai-data ---- - -## Model overview - -| Attribute | Details | -|-----------------|------------------------------------| -| Provider | [Qwen](https://qwenlm.github.io/) | -| License | [Apache 2.0](https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct/blob/main/LICENSE) | -| Compatible Instances | H100, H100-2 (INT8) | -| Context Length | up to 32k tokens | - -## Model names - -```bash -qwen/qwen2.5-coder-32b-instruct:int8 -``` - -## Compatible Instances - -| Instance type | Max context length | -| ------------- |-------------| -| H100 | 32k (INT8) -| H100-2 | 32k (INT8) - -## Model introduction - -Qwen2.5-coder is your intelligent programming assistant familiar with more than 40 programming languages. -With Qwen2.5-coder deployed at Scaleway, your company can benefit from code generation, AI-assisted code repair, and code reasoning. - -## Why is it useful? - -- Qwen2.5-coder achieved the best performance on multiple popular code generation benchmarks (EvalPlus, LiveCodeBench, BigCodeBench), outranking many open-source models and providing competitive performance with GPT-4o. -- This model is versatile. While demonstrating strong and comprehensive coding abilities, it also possesses good general and mathematical skills. - -## How to use it - -### Sending Managed Inference requests - -To perform inference tasks with your Qwen2.5-coder deployed at Scaleway, use the following command: - -```bash -curl -s \ --H "Authorization: Bearer " \ --H "Content-Type: application/json" \ ---request POST \ ---url "https://.ifr.fr-par.scaleway.com/v1/chat/completions" \ ---data '{"model":"qwen/qwen2.5-coder-32b-instruct:int8", "messages":[{"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful code assistant."},{"role": "user","content": "Write a quick sort algorithm."}], "max_tokens": 1000, "temperature": 0.8, "stream": false}' -``` - - - The model name allows Scaleway to put your prompts in the expected format. - - - - Ensure that the `messages` array is properly formatted with roles (system, user, assistant) and content. 
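For interactive coding assistants it is often nicer to stream tokens as they are produced rather than wait for the full completion. The sketch below assumes the endpoint is OpenAI-compatible and honours the `stream` parameter (the curl example above sets it to `false`); the endpoint and key are placeholders.

```python
# Sketch: streaming tokens from the Qwen2.5-coder deployment.
# Assumes the OpenAI-compatible endpoint above supports `stream`; values are placeholders.
from openai import OpenAI

client = OpenAI(
    base_url="https://<Deployment UUID>.ifr.fr-par.scaleway.com/v1",
    api_key="<IAM API key>",
)

stream = client.chat.completions.create(
    model="qwen/qwen2.5-coder-32b-instruct:int8",
    messages=[
        {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful code assistant."},
        {"role": "user", "content": "Write a quick sort algorithm."},
    ],
    max_tokens=1000,
    temperature=0.8,
    stream=True,  # deltas arrive as they are generated
)

for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
```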
- - -### Receiving Inference responses - -Upon sending the HTTP request to the public or private endpoints exposed by the server, you will receive inference responses from the managed Managed Inference server. -Process the output data according to your application's needs. The response will contain the output generated by the LLM model based on the input provided in the request. - - - Despite efforts for accuracy, the possibility of generated text containing inaccuracies or [hallucinations](/managed-inference/concepts/#hallucinations) exists. Always verify the content generated independently. - diff --git a/pages/managed-inference/reference-content/sentence-t5-xxl.mdx b/pages/managed-inference/reference-content/sentence-t5-xxl.mdx deleted file mode 100644 index bdb962da0e..0000000000 --- a/pages/managed-inference/reference-content/sentence-t5-xxl.mdx +++ /dev/null @@ -1,70 +0,0 @@ ---- -meta: - title: Understanding the Sentence-t5-xxl embedding model - description: Deploy your own secure Sentence-t5-xxl embedding model with Scaleway Managed Inference. Privacy-focused, fully managed. -content: - h1: Understanding the Sentence-t5-xxl embedding model - paragraph: This page provides information on the Sentence-t5-xxl embedding model -tags: embedding -dates: - validation: 2024-12-03 - posted: 2024-05-22 -categories: - - ai-data ---- - -## Model overview - -| Attribute | Details | -|-----------------|------------------------------------| -| Provider | [sentence-transformers](https://www.sbert.net/) | -| Compatible Instances | L4 (FP32) | -| Context size | 512 tokens | - -## Model name - -```bash -sentence-transformers/sentence-t5-xxl:fp32 -``` - -## Compatible Instances - -| Instance type | Max context length | -| ------------- |-------------| -| L4 | 512 (FP32) | - -## Model introduction - -The Sentence-T5-XXL model represents a significant evolution in sentence embeddings, building on the robust foundation of the Text-To-Text Transfer Transformer (T5) architecture. -Designed for performance in various language processing tasks, Sentence-T5-XXL leverages the strengths of T5's encoder-decoder structure to generate high-dimensional vectors that encapsulate rich semantic information. -This model has been meticulously tuned for tasks such as text classification, semantic similarity, and clustering, making it a useful tool in the RAG (Retrieval-Augmented Generation) framework. It excels in sentence similarity tasks, but its performance in semantic search tasks is less optimal. - -## Why is it useful? - -The Sentence-T5-XXL model is highly ranked on the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard) for open models under Apache-2 license: - -- Sentence-T5-XXL encodes text into 768-dimensional vectors, providing a detailed and nuanced representation of sentence semantics. -- This model was trained on a diverse dataset of 2 billion question-answer pairs from various online communities, ensuring broad applicability and robustness. 
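Because the model is tuned for sentence similarity, a typical pattern is to embed a query and a set of candidate sentences, then rank the candidates by cosine similarity. The request to the `/v1/embeddings` endpoint is shown in the next section; the sketch below only illustrates the comparison step, with placeholder vectors standing in for the `embedding` fields of real responses.

```python
# Sketch: comparing two sentence embeddings by cosine similarity.
# The placeholder vectors stand in for `embedding` fields returned by the
# /v1/embeddings endpoint shown below (real vectors are 768-dimensional).
import numpy as np

def cosine_similarity(a, b) -> float:
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))

query_vec = [0.12, 0.33, 0.54]      # placeholder
candidate_vec = [0.10, 0.29, 0.61]  # placeholder

print(f"similarity: {cosine_similarity(query_vec, candidate_vec):.3f}")
```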
- -## How to use it - -### Sending Managed Inference requests - -To perform inference tasks with your Embedding model deployed at Scaleway, use the following command: - -```bash -curl https://.ifr.fr-par.scaleway.com/v1/embeddings \ - -H "Authorization: Bearer " \ - -H "Content-Type: application/json" \ - -d '{ - "input": "Embeddings can represent text in a numerical format.", - "model": "sentence-transformers/sentence-t5-xxl:fp32" - }' -``` - -Make sure to replace `` and `` with your actual [IAM API key](/iam/how-to/create-api-keys/) and the Deployment UUID you are targeting. - -### Receiving Inference responses - -Upon sending the HTTP request to the public or private endpoints exposed by the server, you will receive inference responses from the managed Managed Inference server. -Process the output data according to your application's needs. The response will contain the output generated by the embedding model based on the input provided in the request. diff --git a/pages/managed-inference/reference-content/supported-models.mdx b/pages/managed-inference/reference-content/supported-models.mdx index 845be58327..3ab403bbd4 100644 --- a/pages/managed-inference/reference-content/supported-models.mdx +++ b/pages/managed-inference/reference-content/supported-models.mdx @@ -15,7 +15,7 @@ categories: Scaleway Managed Inference allows you to deploy various AI models, either from: - * [Scaleway catalog](#scaleway-catalog): A curated set of ready-to-deploy models available through the [Scaleway console](https://console.scaleway.com/inference/deployments/) or the [Managed Inference models API](https://www.scaleway.com/en/developers/api/inference/#path-models-list-models) + * [Scaleway catalog](#scaleway-catalog): A curated set of ready-to-deploy models available through the [Scaleway console](https:/console.scaleway.com/inference/deployments/) or the [Managed Inference models API](https:/www.scaleway.com/en/developers/api/inference/#path-models-list-models) * [Custom models](#custom-models): Models that you import, typically from sources like Hugging Face. 
## Scaleway catalog @@ -28,21 +28,21 @@ _More details to be added._ | Provider | Model identifier | Documentation | License | |------------|-----------------------------------|--------------------------------------------------------------------------|-------------------------------------------------------| -| Allen AI | `molmo-72b-0924` | [View Details](/managed-inference/reference-content/molmo-72b-0924/) | [Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0) | -| Deepseek | `deepseek-r1-distill-llama-70b` | [View Details](/managed-inference/reference-content/deepseek-r1-distill-llama-70b/) | [MIT license](https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/mit.md) | -| Deepseek | `deepseek-r1-distill-llama-8b` | [View Details](/managed-inference/reference-content/deepseek-r1-distill-llama-8b/) | [MIT license](https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/mit.md) | -| Meta | `llama-3-70b-instruct` | [View Details](/managed-inference/reference-content/llama-3-70b-instruct/) | [Llama 3 license](https://www.llama.com/llama3/license/) | -| Meta | `llama-3-8b-instruct` | [View Details](/managed-inference/reference-content/llama-3-8b-instruct/) | [Llama 3 license](https://www.llama.com/llama3/license/) | -| Meta | `llama-3.1-70b-instruct` | [View Details](/managed-inference/reference-content/llama-3.1-70b-instruct/) | [Llama 3.1 community license](https://www.llama.com/llama3_1/license/) | -| Meta | `llama-3.1-8b-instruct` | [View Details](/managed-inference/reference-content/llama-3.1-8b-instruct/) | [Llama 3.1 license](https://www.llama.com/llama3_1/license/) | -| Meta | `llama-3.3-70b-instruct` | [View Details](/managed-inference/reference-content/llama-3.3-70b-instruct/) | [Llama 3.3 license](https://www.llama.com/llama3_3/license/) | -| Nvidia | `llama-3.1-nemotron-70b-instruct` | [View Details](/managed-inference/reference-content/llama-3.1-nemotron-70b-instruct/)| [Llama 3.1 community license](https://www.llama.com/llama3_1/license/) | -| Mistral | `mixtral-8x7b-instruct-v0.1` | [View Details](/managed-inference/reference-content/mixtral-8x7b-instruct-v0.1/) | [Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0) | -| Mistral | `mistral-7b-instruct-v0.3` | [View Details](/managed-inference/reference-content/mistral-7b-instruct-v0.3/) | [Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0) | -| Mistral | `mistral-nemo-instruct-2407` | [View Details](/managed-inference/reference-content/mistral-nemo-instruct-2407/) | [Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0) | -| Mistral | `mistral-small-24b-instruct-2501` | [View Details](/managed-inference/reference-content/mistral-small-24b-instruct-2501/)| [Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0) | -| Mistral | `pixtral-12b-2409` | [View Details](/managed-inference/reference-content/pixtral-12b-2409/) | [Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0) | -| Qwen | `qwen2.5-coder-32b-instruct` | [View Details](/managed-inference/reference-content/qwen2.5-coder-32b-instruct/) | [Apache 2.0 license](https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct/blob/main/LICENSE) | +| Allen AI | `molmo-72b-0924` | [View Details](/managed-inference/reference-content/model-catalog/#molmo-72b-0924) | [Apache 2.0 license](https:/www.apache.org/licenses/LICENSE-2.0) | +| Deepseek | `deepseek-r1-distill-llama-70b` | [View Details](/managed-inference/reference-content/model-catalog/#deepseek-r1-distill-llama-70b) | 
[MIT license](https:/huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/mit.md) | +| Deepseek | `deepseek-r1-distill-llama-8b` | [View Details](/managed-inference/reference-content/model-catalog/#deepseek-r1-distill-llama-8b) | [MIT license](https:/huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/mit.md) | +| Meta | `llama-3-70b-instruct` | [View Details](/managed-inference/reference-content/model-catalog/#llama-3-70b-instruct) | [Llama 3 license](https:/www.llama.com/llama3/license/) | +| Meta | `llama-3-8b-instruct` | [View Details](/managed-inference/reference-content/model-catalog/#llama-3-8b-instruct) | [Llama 3 license](https:/www.llama.com/llama3/license/) | +| Meta | `llama-3.1-70b-instruct` | [View Details](/managed-inference/reference-content/model-catalog/#llama-31-70b-instruct) | [Llama 3.1 community license](https:/www.llama.com/llama3_1/license/) | +| Meta | `llama-3.1-8b-instruct` | [View Details](/managed-inference/reference-content/model-catalog/#llama-31-8b-instruct) | [Llama 3.1 license](https:/www.llama.com/llama3_1/license/) | +| Meta | `llama-3.3-70b-instruct` | [View Details](/managed-inference/reference-content/model-catalog/#llama-33-70b-instruct) | [Llama 3.3 license](https:/www.llama.com/llama3_3/license/) | +| Nvidia | `llama-3.1-nemotron-70b-instruct` | [View Details](/managed-inference/reference-content/model-catalog/#llama-31-nemotron-70b-instruct)| [Llama 3.1 community license](https:/www.llama.com/llama3_1/license/) | +| Mistral | `mixtral-8x7b-instruct-v0.1` | [View Details](/managed-inference/reference-content/model-catalog/#mixtral-8x7b-instruct-v01/) | [Apache 2.0 license](https:/www.apache.org/licenses/LICENSE-2.0) | +| Mistral | `mistral-7b-instruct-v0.3` | [View Details](/managed-inference/reference-content/model-catalog/#mistral-7b-instruct-v03) | [Apache 2.0 license](https:/www.apache.org/licenses/LICENSE-2.0) | +| Mistral | `mistral-nemo-instruct-2407` | [View Details](/managed-inference/reference-content/model-catalog/#mistral-nemo-instruct-2407) | [Apache 2.0 license](https:/www.apache.org/licenses/LICENSE-2.0) | +| Mistral | `mistral-small-24b-instruct-2501` | [View Details](/managed-inference/reference-content/model-catalog/#mistral-small-24b-instruct-2501)| [Apache 2.0 license](https:/www.apache.org/licenses/LICENSE-2.0) | +| Mistral | `pixtral-12b-2409` | [View Details](/managed-inference/reference-content/model-catalog/#pixtral-12b-2409) | [Apache 2.0 license](https:/www.apache.org/licenses/LICENSE-2.0) | +| Qwen | `qwen2.5-coder-32b-instruct` | [View Details](/managed-inference/reference-content/model-catalog/#qwen2.5-coder-32b-instruct) | [Apache 2.0 license](https:/huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct/blob/main/LICENSE) | ### Vision models @@ -52,21 +52,21 @@ _More details to be added._ | Provider | Model identifier | Documentation | License | |----------|------------------|----------------|---------| -| BAAI | `bge-multilingual-gemma2` | [View Details](/managed-inference/reference-content/bge-multilingual-gemma2/) | [Gemma Terms of Use](https://ai.google.dev/gemma/terms) | -| Sentence Transformers | `sentence-t5-xxl` | [View Details](/managed-inference/reference-content/sentence-t5-xxl/) | [Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0) | +| BAAI | `bge-multilingual-gemma2` | [View Details](/managed-inference/reference-content/model-catalog/bge-multilingual-gemma2/) | [Gemma Terms of Use](https:/ai.google.dev/gemma/terms) | +| Sentence Transformers | `sentence-t5-xxl` | [View 
Details](/managed-inference/reference-content/model-catalog/sentence-t5-xxl/) | [Apache 2.0 license](https:/www.apache.org/licenses/LICENSE-2.0) | ## Custom models - Custom model support is currently in **beta**. If you encounter issues or limitations, please report them via our [Slack community channel](https://scaleway-community.slack.com/archives/C01SGLGRLEA) or [customer support](https://console.scaleway.com/support/tickets/create?for=product&productName=inference). + Custom model support is currently in **beta**. If you encounter issues or limitations, please report them via our [Slack community channel](https:/scaleway-community.slack.com/archives/C01SGLGRLEA) or [customer support](https:/console.scaleway.com/support/tickets/create?for=product&productName=inference). ### Prerequisites We recommend starting with a variation of a supported model from the Scaleway catalog. - For example, you can deploy a [quantized (4-bit) version of Llama 3.3](https://huggingface.co/unsloth/Llama-3.3-70B-Instruct-bnb-4bit). + For example, you can deploy a [quantized (4-bit) version of Llama 3.3](https:/huggingface.co/unsloth/Llama-3.3-70B-Instruct-bnb-4bit). If deploying a fine-tuned version of Llama 3.3, make sure your file structure matches the example linked above. @@ -76,7 +76,7 @@ To deploy a custom model via Hugging Face, ensure the following: * You must have access to the model using your Hugging Face credentials. * For gated models, request access through your Hugging Face account. - * Credentials are not stored, but we recommend using [read or fine-grained access tokens](https://huggingface.co/docs/hub/security-tokens). + * Credentials are not stored, but we recommend using [read or fine-grained access tokens](https:/huggingface.co/docs/hub/security-tokens). #### Required files @@ -85,7 +85,7 @@ Your model repository must include: * A `config.json` file containig: * An `architectures` array (see [supported architectures](#supported-models-architecture) for the exact list of supported values). * `max_position_embeddings` - * Model weights in the [`.safetensors`](https://huggingface.co/docs/safetensors/index) format + * Model weights in the [`.safetensors`](https:/huggingface.co/docs/safetensors/index) format * A chat template included in either: * `tokenizer_config.json` as a `chat_template` field, or * `chat_template.json` as a `chat_template` field @@ -101,7 +101,7 @@ Your model must be one of the following types: **Security Notice**
- Models using formats that allow arbitrary code execution, such as Python [`pickle`](https://docs.python.org/3/library/pickle.html), are **not supported**. + Models using formats that allow arbitrary code execution, such as Python [`pickle`](https:/docs.python.org/3/library/pickle.html), are **not supported**.
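To make the checklist above concrete, the rough local check below can be run on a downloaded model folder before attempting an import. It is only a sketch (the folder path is a placeholder and this is not an official validation tool); the file and field names come from the requirements listed on this page.

```python
# Sketch: rough pre-import check of a local model folder against the checklist above.
# The folder path is a placeholder; this is not an official validation tool.
import json
from pathlib import Path

model_dir = Path("path/to/your/model")  # placeholder

config = json.loads((model_dir / "config.json").read_text())
assert "architectures" in config, "config.json must contain an `architectures` array"
assert "max_position_embeddings" in config, "config.json must define `max_position_embeddings`"

assert any(model_dir.glob("*.safetensors")), "weights must be provided in .safetensors format"

# The chat template may live in tokenizer_config.json or chat_template.json.
has_template = any(
    (model_dir / name).exists()
    and "chat_template" in json.loads((model_dir / name).read_text())
    for name in ("tokenizer_config.json", "chat_template.json")
)
print("chat template found:", has_template)
```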
## API support diff --git a/pages/managed-inference/reference-content/wizardlm-70b-v1.0.mdx b/pages/managed-inference/reference-content/wizardlm-70b-v1.0.mdx deleted file mode 100644 index bb3423d284..0000000000 --- a/pages/managed-inference/reference-content/wizardlm-70b-v1.0.mdx +++ /dev/null @@ -1,78 +0,0 @@ ---- -meta: - title: Understanding the WizardLM-70B-V1.0 model - description: Deploy your own secure WizardLM-70B model with Scaleway Managed Inference. Privacy-focused, fully managed. -content: - h1: Understanding the WizardLM-70B-V1.0 model - paragraph: This page provides information on the WizardLM-70B-V1.0 model -tags: model-library ai-data wizardLM -dates: - validation: 2025-03-19 - posted: 2024-03-26 -categories: - - ai-data ---- - -## Model overview - -| Attribute | Details | -|-----------------|------------------------------------| -| Provider | [WizardLM](https://wizardlm.github.io/WizardLM2/) | -| Compatible Instances | H100 (FP8) - H100-2 (FP16) | -| Context size | 4,096 tokens | - -## Model names - -```bash -wizardlm/wizardlm-70b-v1.0:fp8 -wizardlm/wizardlm-70b-v1.0:fp16 -``` - -## Compatible Instances - -- [H100-1 (INT8)](https://www.scaleway.com/en/h100-pcie-try-it-now/) -- [H100-2 (FP16)](https://www.scaleway.com/en/h100-pcie-try-it-now/) - -## Model introduction - -WizardLM-70B-V1.0, developed by WizardLM, is specifically designed for content creation platforms and writing assistants. -With its extensive training in diverse textual data, WizardLM-70B-V1.0 generates high-quality content and assists writers in various creative and professional endeavors. - -## Why is it useful? - -WizardLM-70B-V1.0 offers unparalleled versatility and creativity in content generation. Whether you are a writer seeking inspiration or a content platform looking to automate content creation, this model delivers exceptional performance. -Its adaptability and natural language fluency make it valuable for enhancing productivity and creativity. - -## How to use it - -### Sending Inference requests - -To perform inference tasks with your WizardLM model deployed at Scaleway, use the following command: - -```bash -curl -s \ --H "Authorization: Bearer " \ --H "Content-Type: application/json" \ ---request POST \ ---url "https://.ifr.fr-par.scaleway.com/v1/chat/completions" \ ---data '{"model":"wizardlm/wizardlm-70b-v1.0:fp8", "messages":[{"role": "user","content": "Say hello to Scaleway's Inference"}], "max_tokens": 200, "top_p": 1, "temperature": 1, "stream": false}' -``` - -Make sure to replace `` and `` with your actual [IAM API key](/iam/how-to/create-api-keys/) and the Deployment UUID you are targeting. - - - The model name allows Scaleway to put your prompts in the expected format. - - - - Ensure that the `messages` array is properly formatted with roles (system, user, assistant) and content. - - -### Receiving Managed Inference responses - -Upon sending the HTTP request to the public or private endpoints exposed by the server, you will receive inference responses from the Managed Inference server. -Process the output data according to your application's needs. The response will contain the output generated by the LLM model based on the input provided in the request. - - - Despite efforts for accuracy, the possibility of generated text containing inaccuracies or [hallucinations](/managed-inference/concepts/#hallucinations) exists. Always verify the content generated independently. 
- \ No newline at end of file diff --git a/tutorials/processing-images-structured-outputs-pixtral/index.mdx b/tutorials/processing-images-structured-outputs-pixtral/index.mdx index 3d6442cc88..a00c2c78ae 100644 --- a/tutorials/processing-images-structured-outputs-pixtral/index.mdx +++ b/tutorials/processing-images-structured-outputs-pixtral/index.mdx @@ -26,7 +26,7 @@ This tutorial will guide you through the process of using the Pixtral vision mod - A Scaleway account logged into the [console](https://console.scaleway.com) - A Python environment (version 3.7 or higher) - An API key from Scaleway [Identity and Access Management](/iam/) -- Access to a Scaleway [Managed Inference](/managed-inference/reference-content/pixtral-12b-2409/) endpoint with Pixtral deployed or to Scaleway [Generative APIs](/generative-apis/quickstart/) service +- Access to a Scaleway [Managed Inference](/managed-inference/reference-content/model-catalog/#pixtral-12b-2409) endpoint with Pixtral deployed or to Scaleway [Generative APIs](/generative-apis/quickstart/) service - The `openai` and `pydantic` Python libraries installed ## Setting up the environment From 87e56e0421e98014e481f48c11075604dcb9655c Mon Sep 17 00:00:00 2001 From: Benedikt Rollik Date: Wed, 23 Apr 2025 15:25:38 +0200 Subject: [PATCH 2/4] Apply suggestions from code review --- pages/generative-apis/faq.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pages/generative-apis/faq.mdx b/pages/generative-apis/faq.mdx index ca18fca4e4..b6539101c5 100644 --- a/pages/generative-apis/faq.mdx +++ b/pages/generative-apis/faq.mdx @@ -55,7 +55,7 @@ Note that in this example, the first line where the free tier applies will not d ## What is a token and how are they counted? A token is the minimum unit of content that is seen and processed by a model. Hence, token definitions depend on input types: - For text, on average, `1` token corresponds to `~4` characters, and thus `0.75` words (as words are on average five characters long) -- For images, `1` token corresponds to a square of pixels. For example, `pixtral-12b-2409 `model image tokens of `16x16` pixels (16-pixel height, and 16-pixel width, hence `256` pixels in total). +- For images, `1` token corresponds to a square of pixels. For example, `pixtral-12b-2409` model image tokens of `16x16` pixels (16-pixel height, and 16-pixel width, hence `256` pixels in total). The exact token count and definition depend on [tokenizers](https://huggingface.co/learn/llm-course/en/chapter2/4) used by each model. When this difference is significant (such as for image processing), you can find detailed information in each model documentation (for instance in [`pixtral-12b-2409` size limit documentation](https://www.scaleway.com/en/docs/managed-inference/reference-content/pixtral-12b-2409/#frequently-asked-questions)). Otherwise, when the model is open, you can find this information in the model files on platforms such as Hugging Face, usually in the `tokenizer_config.json` file. 
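As a quick sanity check of these rules of thumb, the estimates below follow directly from the figures given here; they are approximations only, since exact counts depend on each model's tokenizer.

```python
# Back-of-the-envelope token estimates based on the rules of thumb above.
# Approximations only: exact counts depend on the model's tokenizer.

text = "A token is the minimum unit of content that is seen and processed by a model."
approx_text_tokens = len(text) / 4          # ~1 token per 4 characters (~0.75 words)
print(round(approx_text_tokens), "text tokens (approx.)")

# pixtral-12b-2409 image tokens each cover a 16x16-pixel square, so a 512x512
# image costs roughly (512 / 16) * (512 / 16) tokens.
width = height = 512
print((width // 16) * (height // 16), "image tokens")  # 1024
```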
From 0ebc64b41199fc51d356d89e1d2fc129435ba9fc Mon Sep 17 00:00:00 2001 From: Benedikt Rollik Date: Wed, 23 Apr 2025 15:36:04 +0200 Subject: [PATCH 3/4] docs(infr): fix broken links --- pages/managed-inference/how-to/import-custom-model.mdx | 2 +- pages/managed-inference/reference-content/model-catalog.mdx | 2 +- .../managed-inference/reference-content/supported-models.mdx | 4 ++-- tutorials/how-to-implement-rag/index.mdx | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pages/managed-inference/how-to/import-custom-model.mdx b/pages/managed-inference/how-to/import-custom-model.mdx index e3f503be2b..c66744716f 100644 --- a/pages/managed-inference/how-to/import-custom-model.mdx +++ b/pages/managed-inference/how-to/import-custom-model.mdx @@ -48,4 +48,4 @@ Scaleway provides a selection of common models for deployment from the Scaleway - Estimated cost. Once checked, click **Begin import** to finalize the process. -Your imported model will now appear in the model library. You can proceed to [deploy your model on Managed Inference](/ai-data/managed-inference/how-to/create-deployment/). \ No newline at end of file +Your imported model will now appear in the model library. You can proceed to [deploy your model on Managed Inference](/managed-inference/how-to/create-deployment/). \ No newline at end of file diff --git a/pages/managed-inference/reference-content/model-catalog.mdx b/pages/managed-inference/reference-content/model-catalog.mdx index c35c956098..5e3fd58cdf 100644 --- a/pages/managed-inference/reference-content/model-catalog.mdx +++ b/pages/managed-inference/reference-content/model-catalog.mdx @@ -116,7 +116,7 @@ allenai/molmo-72b-0924:fp8 ## Text models ### Llama-3.3-70b-instruct -Released December 6, 2024, Meta’s Llama 3.3 70b is a fine-tune of the [Llama 3.1 70b](/managed-inference/reference-content/llama-3.1-70b-instruct/) model. +Released December 6, 2024, Meta’s Llama 3.3 70b is a fine-tune of the [Llama 3.1 70b](/managed-inference/managed-inference/reference-content/model-catalog/#llama-31-70b-instruct) model. This model is still text-only (text in/text out). However, Llama 3.3 was designed to approach the performance of Llama 3.1 405B on some applications. 
#### Model name diff --git a/pages/managed-inference/reference-content/supported-models.mdx b/pages/managed-inference/reference-content/supported-models.mdx index 3ab403bbd4..7cd1cfcb9e 100644 --- a/pages/managed-inference/reference-content/supported-models.mdx +++ b/pages/managed-inference/reference-content/supported-models.mdx @@ -52,8 +52,8 @@ _More details to be added._ | Provider | Model identifier | Documentation | License | |----------|------------------|----------------|---------| -| BAAI | `bge-multilingual-gemma2` | [View Details](/managed-inference/reference-content/model-catalog/bge-multilingual-gemma2/) | [Gemma Terms of Use](https:/ai.google.dev/gemma/terms) | -| Sentence Transformers | `sentence-t5-xxl` | [View Details](/managed-inference/reference-content/model-catalog/sentence-t5-xxl/) | [Apache 2.0 license](https:/www.apache.org/licenses/LICENSE-2.0) | +| BAAI | `bge-multilingual-gemma2` | [View Details](/managed-inference/reference-content/model-catalog/#bge-multilingual-gemma2) | [Gemma Terms of Use](https:/ai.google.dev/gemma/terms) | +| Sentence Transformers | `sentence-t5-xxl` | [View Details](/managed-inference/reference-content/model-catalog/#sentence-t5-xxl) | [Apache 2.0 license](https:/www.apache.org/licenses/LICENSE-2.0) | ## Custom models diff --git a/tutorials/how-to-implement-rag/index.mdx b/tutorials/how-to-implement-rag/index.mdx index b806a4b3a2..a7aa528ecb 100644 --- a/tutorials/how-to-implement-rag/index.mdx +++ b/tutorials/how-to-implement-rag/index.mdx @@ -26,7 +26,7 @@ LangChain simplifies the process of enhancing language models with retrieval cap - A Scaleway account logged into the [console](https://console.scaleway.com) - [Owner](/iam/concepts/#owner) status or [IAM permissions](/iam/concepts/#permission) allowing you to perform actions in the intended Organization - A valid [API key](/iam/how-to/create-api-keys/) -- An [Inference Deployment](/managed-inference/how-to/create-deployment/): set it up using [sentence-transformers/sentence-t5-xxl](/managed-inference/reference-content/sentence-t5-xxl/) on an L4 instance to efficiently process embeddings. +- An [Inference Deployment](/managed-inference/how-to/create-deployment/): set it up using [sentence-transformers/sentence-t5-xxl](/managed-inference/reference-content/model-catalog/#sentence-t5-xxl) on an L4 instance to efficiently process embeddings. - An [Inference Deployment](/managed-inference/how-to/create-deployment/) with the large language model of your choice. - An [Object Storage Bucket](/object-storage/how-to/create-a-bucket/) to store all the data you want to inject into your LLM model. - A [Managed Database](/managed-databases-for-postgresql-and-mysql/how-to/create-a-database/) to securely store all your embeddings. 
From c8e7681e6af2e0408564018aedcdde18a49b8e68 Mon Sep 17 00:00:00 2001 From: Benedikt Rollik Date: Wed, 23 Apr 2025 15:46:11 +0200 Subject: [PATCH 4/4] docs(infr): fix 404 --- pages/managed-inference/reference-content/model-catalog.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pages/managed-inference/reference-content/model-catalog.mdx b/pages/managed-inference/reference-content/model-catalog.mdx index 5e3fd58cdf..0860e0c377 100644 --- a/pages/managed-inference/reference-content/model-catalog.mdx +++ b/pages/managed-inference/reference-content/model-catalog.mdx @@ -116,7 +116,7 @@ allenai/molmo-72b-0924:fp8 ## Text models ### Llama-3.3-70b-instruct -Released December 6, 2024, Meta’s Llama 3.3 70b is a fine-tune of the [Llama 3.1 70b](/managed-inference/managed-inference/reference-content/model-catalog/#llama-31-70b-instruct) model. +Released December 6, 2024, Meta’s Llama 3.3 70b is a fine-tune of the [Llama 3.1 70b](/managed-inference/reference-content/model-catalog/#llama-31-70b-instruct) model. This model is still text-only (text in/text out). However, Llama 3.3 was designed to approach the performance of Llama 3.1 405B on some applications. #### Model name