
Commit b7a521d

Merge pull request #2764 from MicrosoftDocs/main
2/6/2025 11:00 AM IST Publish
2 parents ea9b000 + 02fac48 commit b7a521d

File tree

15 files changed: +173 additions, -39 deletions


articles/ai-foundry/model-inference/concepts/models.md

Lines changed: 1 addition & 1 deletion
@@ -97,7 +97,7 @@ DeepSeek family of models include DeepSeek-R1, which excels at reasoning tasks u
 | Model | Type | Tier | Capabilities |
 | ------ | ---- | --- | ------------ |
-| [DeekSeek-R1](https://ai.azure.com/explore/models/deepseek-r1/version/1/registry/azureml-deepseek) | chat-completion | Global standard | - **Input:** text (16,384 tokens) <br /> - **Output:** (163,840 tokens) <br /> - **Languages:** `en` and `zh` <br /> - **Tool calling:** No <br /> - **Response formats:** Text (with reasoning content). |
+| [DeekSeek-R1](https://ai.azure.com/explore/models/deepseek-r1/version/1/registry/azureml-deepseek) | chat-completion <br /> [(with reasoning content)](../how-to/use-chat-reasoning.md) | Global standard | - **Input:** text (16,384 tokens) <br /> - **Output:** (163,840 tokens) <br /> - **Languages:** `en` and `zh` <br /> - **Tool calling:** No <br /> - **Response formats:** Text. |

 See [this model collection in Azure AI Foundry portal](https://ai.azure.com/explore/models?&selectedCollection=deepseek).

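Taken together with the SDK changes below, this table edit highlights that DeepSeek-R1 returns reasoning content. For orientation, a minimal wire-level sketch of such a call, assuming the Azure AI model inference route and the `2024-05-01-preview` API version used elsewhere in these docs (the prompt is illustrative):

```http
POST https://<resource>.services.ai.azure.com/models/chat/completions?api-version=2024-05-01-preview
Content-Type: application/json
api-key: <key>

{
    "model": "DeepSeek-R1",
    "messages": [
        { "role": "user", "content": "How many languages are in the world?" }
    ]
}
```

DeepSeek-R1 typically interleaves its reasoning in the response `content`, wrapped in `<think>...</think>` tags ahead of the final answer; the linked [use-chat-reasoning](../how-to/use-chat-reasoning.md) article covers how to separate the two.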
articles/ai-foundry/model-inference/includes/use-chat-reasoning/java.md

Lines changed: 4 additions & 4 deletions
@@ -27,7 +27,7 @@ To complete this tutorial, you need:
 <dependency>
     <groupId>com.azure</groupId>
     <artifactId>azure-ai-inference</artifactId>
-    <version>1.0.0-beta.1</version>
+    <version>1.0.0-beta.2</version>
 </dependency>
 ```

@@ -65,7 +65,6 @@ First, create the client to consume the model. The following code uses an endpoi
 ChatCompletionsClient client = new ChatCompletionsClient(
     new URI("https://<resource>.services.ai.azure.com/models"),
     new AzureKeyCredential(System.getProperty("AZURE_INFERENCE_CREDENTIAL")),
-    "${variants-sample}"
 ```

 > [!TIP]

@@ -76,8 +75,7 @@ If you have configured the resource to with **Microsoft Entra ID** support, you
 ```java
 client = new ChatCompletionsClient(
     new URI("https://<resource>.services.ai.azure.com/models"),
-    new DefaultAzureCredentialBuilder().build(),
-    "${variants-sample}"
+    new DefaultAzureCredentialBuilder().build()
 );
 ```

@@ -87,6 +85,7 @@ The following example shows how you can create a basic chat request to the model

 ```java
 ChatCompletionsOptions requestOptions = new ChatCompletionsOptions()
+    .setModel("DeepSeek-R1")
     .setMessages(Arrays.asList(
         new ChatRequestUserMessage("How many languages are in the world?")
     ));

@@ -167,6 +166,7 @@ You can _stream_ the content to get it as it's being generated. Streaming conten

 ```java
 ChatCompletionsOptions requestOptions = new ChatCompletionsOptions()
+    .setModel("DeepSeek-R1")
     .setMessages(Arrays.asList(
         new ChatRequestUserMessage("How many languages are in the world? Write an essay about it.")
     ))

articles/ai-foundry/model-inference/includes/use-chat-reasoning/javascript.md

Lines changed: 4 additions & 3 deletions
@@ -38,8 +38,7 @@ import { AzureKeyCredential } from "@azure/core-auth";

 const client = new ModelClient(
     process.env.AZURE_INFERENCE_ENDPOINT,
-    new AzureKeyCredential(process.env.AZURE_INFERENCE_CREDENTIAL),
-    "deepseek-r1"
+    new AzureKeyCredential(process.env.AZURE_INFERENCE_CREDENTIAL)
 );
 ```

@@ -58,7 +57,6 @@ const clientOptions = { credentials: { "https://cognitiveservices.azure.com" } }
 const client = new ModelClient(
     "https://<resource>.services.ai.azure.com/models",
     new DefaultAzureCredential(),
-    "deepseek-r1",
     clientOptions,
 );
 ```

@@ -74,6 +72,7 @@ var messages = [

 var response = await client.path("/chat/completions").post({
     body: {
+        model: "DeepSeek-R1",
         messages: messages,
     }
 });

@@ -163,6 +162,7 @@ var messages = [

 var response = await client.path("/chat/completions").post({
     body: {
+        model: "DeepSeek-R1",
         messages: messages,
     }
 }).asNodeStream();

@@ -229,6 +229,7 @@ try {
 ];

 var response = await client.path("/chat/completions").post({
+    model: "DeepSeek-R1",
     body: {
         messages: messages,
     }

articles/ai-foundry/model-inference/includes/use-embeddings/rest.md

Lines changed: 7 additions & 2 deletions
@@ -50,6 +50,7 @@ Create an embedding request to see the output of the model.

 ```json
 {
+    "model": "text-embedding-3-small",
     "input": [
         "The ultimate answer to the question of life"
     ]

@@ -93,6 +94,7 @@ It can be useful to compute embeddings in input batches. The parameter `inputs`

 ```json
 {
+    "model": "text-embedding-3-small",
     "input": [
         "The ultimate answer to the question of life",
         "The largest planet in our solar system is Jupiter"

@@ -150,6 +152,7 @@ You can specify the number of dimensions for the embeddings. The following examp

 ```json
 {
+    "model": "text-embedding-3-small",
     "input": [
         "The ultimate answer to the question of life"
     ],

@@ -161,23 +164,25 @@ You can specify the number of dimensions for the embeddings. The following examp

 Some models can generate multiple embeddings for the same input depending on how you plan to use them. This capability allows you to retrieve more accurate embeddings for RAG patterns.

-The following example shows how to create embeddings that are used to create an embedding for a document that will be stored in a vector database:
+The following example shows how to create embeddings that are used to create an embedding for a document that will be stored in a vector database. Since `text-embedding-3-small` doesn't support this capability, we are using an embedding model from Cohere in the following example:


 ```json
 {
+    "model": "cohere-embed-v3-english",
     "input": [
         "The answer to the ultimate question of life, the universe, and everything is 42"
     ],
     "input_type": "document"
 }
 ```

-When you work on a query to retrieve such a document, you can use the following code snippet to create the embeddings for the query and maximize the retrieval performance.
+When you work on a query to retrieve such a document, you can use the following code snippet to create the embeddings for the query and maximize the retrieval performance. Since `text-embedding-3-small` doesn't support this capability, we are using an embedding model from Cohere in the following example:


 ```json
 {
+    "model": "cohere-embed-v3-english",
     "input": [
         "What's the ultimate meaning of life?"
     ],

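The hunks above show only request bodies. For context, a hedged sketch of the response envelope such a request returns, assuming the standard Azure AI model inference embeddings shape (the `id` and vector values here are invented placeholders):

```json
{
    "id": "0ab1c2d3-...",
    "object": "list",
    "model": "text-embedding-3-small",
    "data": [
        {
            "index": 0,
            "object": "embedding",
            "embedding": [0.0172, -0.0007, -0.0251]
        }
    ],
    "usage": {
        "prompt_tokens": 9,
        "total_tokens": 9
    }
}
```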
articles/ai-foundry/model-inference/includes/use-image-embeddings/rest.md

Lines changed: 6 additions & 6 deletions
@@ -33,15 +33,15 @@ To use embedding models in your application, you need:
 To use the text embeddings, use the route `/images/embeddings` appended to your base URL along with your credential indicated in `api-key`. `Authorization` header is also supported with the format `Bearer <key>`.

 ```http
-POST https://<resource>.services.ai.azure.com/models/images/embeddings
+POST https://<resource>.services.ai.azure.com/models/images/embeddings?api-version=2024-05-01-preview
 Content-Type: application/json
 api-key: <key>
 ```

 If you configured the resource with **Microsoft Entra ID** support, pass you token in the `Authorization` header:

 ```http
-POST https://<resource>.services.ai.azure.com/models/images/embeddings
+POST https://<resource>.services.ai.azure.com/models/images/embeddings?api-version=2024-05-01-preview
 Content-Type: application/json
 Authorization: Bearer <token>
 ```

@@ -52,7 +52,7 @@ To create image embeddings, you need to pass the image data as part of your requ

 ```json
 {
-    "model": "${varients-sample}",
+    "model": "Cohere-embed-v3-english",
     "input": [
         {
             "image": "data:image/png;base64,iVBORw0KGgoAAAANSUh..."

@@ -103,7 +103,7 @@ Some models can generate embeddings from images and text pairs. In this case, yo

 ```json
 {
-    "model": "${varients-sample}",
+    "model": "Cohere-embed-v3-english",
     "input": [
         {
             "image": "data:image/png;base64,iVBORw0KGgoAAAANSUh...",

@@ -122,7 +122,7 @@ The following example shows how to create embeddings that are used to create an

 ```json
 {
-    "model": "${varients-sample}",
+    "model": "Cohere-embed-v3-english",
     "input": [
         {
             "image": "data:image/png;base64,iVBORw0KGgoAAAANSUh..."

@@ -137,7 +137,7 @@ When you work on a query to retrieve such a document, you can use the following

 ```json
 {
-    "model": "${varients-sample}",
+    "model": "Cohere-embed-v3-english",
     "input": [
         {
             "image": "data:image/png;base64,iVBORw0KGgoAAAANSUh..."
Lines changed: 123 additions & 0 deletions
@@ -0,0 +1,123 @@
---
title: Troubleshoot latency issues with Document Intelligence API
titleSuffix: Azure AI services
description: Learn troubleshooting tips, remedial solutions, and best practices for addressing Document Intelligence latency issues.
author: laujan
manager: nitinme
ms.service: azure-ai-document-intelligence
ms.topic: troubleshooting
ms.date: 02/05/2025
ms.author: lajanuar
---

# Troubleshooting latency issues in Azure AI Document Intelligence

This article presents troubleshooting tips, remedial solutions, and best practices to address Document Intelligence latency issues. Latency refers to the duration an API server takes to handle and process an incoming request before delivering the response to the client. The time required to analyze a document varies based on its size (such as the number of pages) and the content on each page.

Document Intelligence operates as a multitenant service, ensuring that latency for similar documents is comparable, though not always identical. Variability in latency and performance is an inherent characteristic of any microservice-based, stateless, asynchronous service, especially when processing images and large documents at scale. Despite continuous efforts to increase hardware capacity and enhance scalability, some latency issues can still arise during runtime.

> [!NOTE]
>
> * Azure AI services don't provide a Service Level Agreement (SLA) for latency.
> * The Document Intelligence API offers asynchronous functionality, allowing you to access results up to 24 hours after sending your request to our backend.
> * Use the request ID provided by the POST operation to retrieve these results. If you encounter issues during your standard polling sequence, save the request ID and try again later before considering a retry. For further assistance, refer to our [service page](../service-limits.md#detailed-description-quota-adjustment-and-best-practices).

## Set your latency baseline
To evaluate latency, you should first establish baseline metrics for your specific scenario. These metrics give you the expected end-to-end and server latency within the context of your application environment. Once you have these baseline metrics, it becomes easier to distinguish between expected and unexpected conditions.
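
One way to capture baseline numbers is to timestamp the asynchronous exchange yourself. A minimal sketch, assuming the `prebuilt-read` model and the `2024-11-30` API version (adjust both for your scenario; the storage URL is a placeholder):

```http
POST https://<resource>.cognitiveservices.azure.com/documentintelligence/documentModels/prebuilt-read:analyze?api-version=2024-11-30
Content-Type: application/json
Ocp-Apim-Subscription-Key: <key>

{
    "urlSource": "https://<storage-account>.blob.core.windows.net/<container>/sample.pdf"
}
```

Measure the wall-clock time from this POST until a polling GET on the returned `Operation-Location` reports `succeeded` (your end-to-end latency), and compare it against the `createdDateTime` and `lastUpdatedDateTime` fields in the polled result, which approximate server-side processing time.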

## Check Azure region status

When you're experiencing latency issues, the first step is to check [Azure status](https://azure.status.microsoft/status) for any current outages or issues that might impact your services.

* All active events are listed under the `Current Impact` tab.

* You can also check your resource in the host region. Go to Geography → Products And Services → AI + Machine Learning → Azure AI Document Intelligence and check the status for your region:

:::image type="content" source="../media/latency/azure-status.png" alt-text="Screenshot of the Microsoft Azure status page." lightbox="../media/latency/azure-status.png":::

## Check file size
Monitor the size of files you send via the request API. Processing larger files in parallel can result in increased processing times. Normalize your metric by measuring latency per page. If you observe sustained periods (exceeding one hour) where latency per page consistently surpasses 15 seconds, consider addressing the issue.
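
For example, a 30-page file that completes in 120 seconds averages 4 seconds per page, comfortably under that threshold; the same file taking 10 minutes averages 20 seconds per page and warrants investigation.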

## Check Azure Blob storage latency

The size of a request affects latency in Azure Storage operations. Larger operations take more time to complete due to the increased volume of data transferred over the network and processed by Azure Storage.

Azure Storage provides two latency metrics for block blobs in the Azure portal:

* End-to-end (E2E) latency measures the interval from when Azure Storage receives the first packet of the request until Azure Storage receives a client acknowledgment on the last packet of the response.

* Server latency measures the interval from when Azure Storage receives the last packet of the request until the first packet of the response is returned from Azure Storage.

To view latency metrics, navigate to your storage resource in the Azure portal:

* On the left navigation window, select **Insights** from the **Monitoring** drop-down menu.

* The insights tab opens a window that includes a chart showing both `E2E` and `Server` latency metrics:

:::image type="content" source="../media/latency/azure-storage.png" alt-text="Screenshot of Azure Storage latency metrics in the Azure portal.":::

For more information, *see* [Latency in Blob storage](/azure/storage/blobs/storage-blobs-latency).
## Check monitoring metrics for your resource

Azure portal monitors offer insights into your applications to enhance their performance and availability. There are several tools that you can use to monitor your app's performance in the Azure portal:

1. On the **Overview** page, select **Monitoring**, select the time period, and review the **Request latency** metrics on the page.

:::image type="content" source="../media/latency/azure-portal-monitoring.png" alt-text="Screenshot of Azure usage monitoring metrics in the Azure portal.":::

1. On the left navigation window, select **Metrics** from the **Monitoring** drop-down menu.

* In the main window, select ➕**Add metric**.

* Keep the **Scope** and **Metric Namespace** fields unchanged. Add the **Latency** parameter to the **Metric** field and adjust the **Aggregation** field as needed.
:::image type="content" source="../media/latency/azure-portal-monitoring-metrics.png" alt-text="Screenshot of add your own metrics setting in the Azure portal.":::
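
The same `Latency` metric is also available programmatically through the Azure Monitor metrics REST API. A hedged sketch (the resource path is a placeholder, and the metrics `api-version` may differ in your environment):

```http
GET https://management.azure.com/subscriptions/<subscription-id>/resourceGroups/<resource-group>/providers/Microsoft.CognitiveServices/accounts/<resource>/providers/microsoft.insights/metrics?metricnames=Latency&api-version=2018-01-01
Authorization: Bearer <token>
```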

## Set a latency alert in the Azure portal

Alerts assist you in identifying and resolving issues by providing proactive notifications when Azure Monitor data suggests a potential issue. An alert rule keeps an eye on your data and notifies you when set criteria are met on your specified resource. You can set up an alert in the Azure portal as follows:

1. On the left navigation window, select **Alerts** from the **Monitoring** drop-down menu.

1. Select the **Create alert rule** button.

1. In the new window that opens, select **Latency** from the **Select a signal** drop-down menu.

:::image type="content" source="../media/latency/azure-portal-create-alert.png" alt-text="Screenshot of the create an alert rule page in the Azure portal.":::

1. Configure the alert by completing the fields on the page.

1. After you complete the configuration, select **Review ➕ create**.

### Contact us

If you're unable to resolve a long latency issue, [email us](mailto:[email protected]) with the following information:

* Model Name

* Version

* Subscription ID

* Resource ID

* Timestamp and issue description

* Request IDs of the concerning operations (if possible)

* Logs

* Sample files

* JSON file (output/analyze results)

* Training set (if it's a training issue related to custom neural models)

For more assistance, you can also use the feedback widget at the bottom of any Microsoft Learn page.
[Four image files changed: 126 KB, 127 KB, 166 KB, and 91.4 KB; binary screenshot diffs not rendered in this view.]
