
Commit bb327e8

[FEAT] Support for thinking/reasoning capabilities for LLMs in Bedrock and AzureOpenAI (#188)
* Updated llama-index packages to support reasoning models
* Added temperature in the Azure JSON and added thinking support for Bedrock Claude models
* Migrated to BedrockConverse
* Updated the version of the SDK
* Added reasoning effort parameter for Azure OpenAI
* Update src/unstract/sdk/adapters/llm/azure_open_ai/src/azure_open_ai.py
* Updated SDK version from 0.72.1 to 0.73.0
* Fixed issue with additional_kwargs
* Updated anthropic llama-index version to support Claude Sonnet 4

Signed-off-by: Praveen Kumar <[email protected]>
Co-authored-by: Ritwik G <[email protected]>
1 parent 45cefdc commit bb327e8

File tree

9 files changed: +210 −93 lines

pyproject.toml

Lines changed: 6 additions & 6 deletions
@@ -26,7 +26,7 @@ dependencies = [
     "python-magic~=0.4.27",
     "python-dotenv==1.0.0",
     # Adapter changes
-    "llama-index==0.12.37",
+    "llama-index==0.12.39",
     "tiktoken~=0.9.0",
     "transformers==4.37.0",
     "llama-index-embeddings-google==0.3.0",
@@ -48,17 +48,17 @@ dependencies = [
     "llama-index-vector-stores-weaviate==1.3.1",
     "llama-index-vector-stores-pinecone==0.4.2",
     "llama-index-vector-stores-qdrant==0.4.2",
-    "llama-index-llms-openai==0.3.42",
+    "llama-index-llms-openai==0.4.1",
     "llama-index-llms-palm==0.3.0",
     "llama-index-llms-mistralai==0.3.1",
     "mistralai==1.2.5",
-    "llama-index-llms-anyscale==0.3.0",
-    "llama-index-llms-anthropic==0.6.14",
-    "llama-index-llms-azure-openai==0.3.2",
+    "llama-index-llms-anyscale==0.3.1",
+    "llama-index-llms-anthropic==0.7.2",
+    "llama-index-llms-azure-openai==0.3.3",
     "llama-index-llms-vertex==0.4.6",
     "llama-index-llms-replicate==0.4.0",
     "llama-index-llms-ollama==0.5.0",
-    "llama-index-llms-bedrock==0.3.3",
+    "llama-index-llms-bedrock-converse==0.7.1",
     # For Llama Parse X2Text
     "llama-parse==0.5.19",
     # OCR

src/unstract/sdk/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-__version__ = "v0.72.0"
+__version__ = "v0.73.0"
 
 
 def get_sdk_version() -> str:

src/unstract/sdk/adapters/llm/azure_open_ai/src/azure_open_ai.py

Lines changed: 29 additions & 14 deletions
@@ -3,6 +3,7 @@
 
 from llama_index.core.llms import LLM
 from llama_index.llms.azure_openai import AzureOpenAI
+from llama_index.llms.openai.utils import O1_MODELS
 
 from unstract.sdk.adapters.exceptions import AdapterError
 from unstract.sdk.adapters.llm.constants import LLMKeys
@@ -20,6 +21,8 @@ class Constants:
     API_TYPE = "azure"
     TIMEOUT = "timeout"
     DEFAULT_MODEL = "gpt-35-turbo"
+    ENABLE_REASONING = "enable_reasoning"
+    REASONING_EFFORT = "reasoning_effort"
 
 
 class AzureOpenAILLM(LLMAdapter):
@@ -55,21 +58,33 @@ def get_llm_instance(self) -> LLM:
         )
         max_tokens = self.config.get(Constants.MAX_TOKENS)
         max_tokens = int(max_tokens) if max_tokens else None
+        enable_reasoning = self.config.get(Constants.ENABLE_REASONING)
+        model = self.config.get(Constants.MODEL, Constants.DEFAULT_MODEL)
+
+        llm_kwargs = {
+            "model": model,
+            "deployment_name": str(self.config.get(Constants.DEPLOYMENT_NAME)),
+            "api_key": str(self.config.get(Constants.API_KEY)),
+            "api_version": str(self.config.get(Constants.API_VERSION)),
+            "azure_endpoint": str(self.config.get(Constants.AZURE_ENDPONT)),
+            "api_type": Constants.API_TYPE,
+            "temperature": 0,
+            "timeout": float(self.config.get(Constants.TIMEOUT, LLMKeys.DEFAULT_TIMEOUT)),
+            "max_retries": max_retries,
+        }
+
+        if enable_reasoning:
+            llm_kwargs["reasoning_effort"] = self.config.get(
+                Constants.REASONING_EFFORT
+            )
+
+        if model not in O1_MODELS:
+            llm_kwargs["max_completion_tokens"] = max_tokens
+        else:
+            llm_kwargs["max_tokens"] = max_tokens
+
         try:
-            llm: LLM = AzureOpenAI(
-                model=self.config.get(Constants.MODEL, Constants.DEFAULT_MODEL),
-                deployment_name=str(self.config.get(Constants.DEPLOYMENT_NAME)),
-                api_key=str(self.config.get(Constants.API_KEY)),
-                api_version=str(self.config.get(Constants.API_VERSION)),
-                azure_endpoint=str(self.config.get(Constants.AZURE_ENDPONT)),
-                api_type=Constants.API_TYPE,
-                temperature=0,
-                timeout=float(
-                    self.config.get(Constants.TIMEOUT, LLMKeys.DEFAULT_TIMEOUT)
-                ),
-                max_retries=max_retries,
-                max_tokens=max_tokens,
-            )
+            llm: LLM = AzureOpenAI(**llm_kwargs)
             return llm
         except Exception as e:
             raise AdapterError(str(e))
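For illustration, a minimal sketch of what get_llm_instance() assembles when enable_reasoning is set. All values below (deployment name, endpoint, key, API version, model) are placeholders, not part of this commit; only the reasoning_effort handling mirrors the diff above.

    from llama_index.llms.azure_openai import AzureOpenAI

    # Placeholder values throughout; sketch of the kwargs the adapter builds.
    llm = AzureOpenAI(
        model="o3-mini",                     # an O-series reasoning model
        deployment_name="my-o3-mini",        # hypothetical deployment name
        api_key="<azure-openai-api-key>",
        api_version="2024-12-01-preview",
        azure_endpoint="https://example.openai.azure.com/",
        api_type="azure",
        temperature=0,
        timeout=900.0,
        max_retries=5,
        reasoning_effort="high",             # forwarded only when enable_reasoning is true
    )

Building a kwargs dict first, as the diff does, keeps the optional reasoning_effort and the max_tokens/max_completion_tokens split out of a single sprawling constructor call.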

src/unstract/sdk/adapters/llm/azure_open_ai/src/static/json_schema.json

Lines changed: 48 additions & 1 deletion
@@ -67,6 +67,53 @@
       "title": "Timeout",
       "default": 900,
       "description": "Timeout in seconds"
+    },
+    "enable_reasoning": {
+      "type": "boolean",
+      "title": "Enable Reasoning",
+      "default": false,
+      "description": "Allow the model to apply extra reasoning for complex tasks. May slightly increase latency and cost, typically within 20–50% depending on the level selected. Only applicable for [O series models](https://platform.openai.com/docs/models#reasoning)."
+    }
+  },
+  "allOf": [
+    {
+      "if": {
+        "properties": {
+          "enable_reasoning": {
+            "const": true
+          }
+        }
+      },
+      "then": {
+        "properties": {
+          "reasoning_effort": {
+            "type": "string",
+            "enum": [
+              "low",
+              "medium",
+              "high"
+            ],
+            "default": "medium",
+            "title": "Reasoning Effort",
+            "description": "Sets the Reasoning Strength when Reasoning Effort is enabled"
+          }
+        },
+        "required": [
+          "reasoning_effort"
+        ]
+      }
+    },
+    {
+      "if": {
+        "properties": {
+          "enable_reasoning": {
+            "const": false
+          }
+        }
+      },
+      "then": {
+        "properties": {}
+      }
     }
-  }
+  ]
 }
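Under this conditional schema, reasoning_effort is required only when enable_reasoning is true. A sketch of the two valid config shapes, written as Python dicts with the connection fields omitted for brevity:

    # Valid: reasoning enabled, so reasoning_effort is required.
    reasoning_on = {"enable_reasoning": True, "reasoning_effort": "medium"}

    # Valid: reasoning disabled (or omitted), so no extra field is needed.
    reasoning_off = {"enable_reasoning": False}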

src/unstract/sdk/adapters/llm/bedrock/src/bedrock.py

Lines changed: 23 additions & 9 deletions
@@ -2,7 +2,7 @@
 from typing import Any
 
 from llama_index.core.llms import LLM
-from llama_index.llms.bedrock import Bedrock
+from llama_index.llms.bedrock_converse import BedrockConverse
 from unstract.sdk.adapters.exceptions import AdapterError
 from unstract.sdk.adapters.llm.constants import LLMKeys
 from unstract.sdk.adapters.llm.llm_adapter import LLMAdapter
@@ -19,6 +19,8 @@ class Constants:
     CONTEXT_SIZE = "context_size"
     MAX_TOKENS = "max_tokens"
     DEFAULT_MAX_TOKENS = 512  # Default at llama-index
+    ENABLE_THINKING = "enable_thinking"
+    BUDGET_TOKENS = "budget_tokens"
 
 
 class BedrockLLM(LLMAdapter):
@@ -49,16 +51,28 @@ def get_icon() -> str:
         return "/icons/adapter-icons/Bedrock.png"
 
     def get_llm_instance(self) -> LLM:
+
+        thinking = self.config.get(Constants.ENABLE_THINKING)
+        thinking_dict = None
+        temperature = 0
+        additional_kwargs = None
+
+        if thinking:
+            additional_kwargs = {
+                "additionalModelRequestFields": {
+                    "thinking": {
+                        "type": "enabled",
+                        "budget_tokens": self.config.get(Constants.BUDGET_TOKENS)
+                    }
+                }
+            }
+            temperature = 1
+
         try:
-            context_size: int | None = (
-                int(self.config.get(Constants.CONTEXT_SIZE, 0))
-                if self.config.get(Constants.CONTEXT_SIZE)
-                else None
-            )
             max_tokens = int(
                 self.config.get(Constants.MAX_TOKENS, Constants.DEFAULT_MAX_TOKENS)
             )
-            llm: LLM = Bedrock(
+            llm: LLM = BedrockConverse(
                 model=self.config.get(Constants.MODEL),
                 aws_access_key_id=self.config.get(Constants.ACCESS_KEY_ID),
                 aws_secret_access_key=self.config.get(Constants.SECRET_ACCESS_KEY),
@@ -69,9 +83,9 @@ def get_llm_instance(self) -> LLM:
                 max_retries=int(
                     self.config.get(Constants.MAX_RETRIES, LLMKeys.DEFAULT_MAX_RETRIES)
                 ),
-                temperature=0,
-                context_size=context_size,
+                temperature=temperature,
                 max_tokens=max_tokens,
+                additional_kwargs=additional_kwargs,
             )
             return llm
         except Exception as e:
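A hedged sketch of the equivalent direct construction when extended thinking is enabled. The model ID and credentials are placeholders; the additionalModelRequestFields payload mirrors the diff, and Anthropic's extended thinking expects max_tokens to exceed budget_tokens.

    from llama_index.llms.bedrock_converse import BedrockConverse

    # Placeholder model ID and credentials; sketch only.
    llm = BedrockConverse(
        model="anthropic.claude-3-7-sonnet-20250219-v1:0",  # hypothetical model ID
        aws_access_key_id="<access-key-id>",
        aws_secret_access_key="<secret-access-key>",
        region_name="us-east-1",
        temperature=1,       # extended thinking requires temperature 1
        max_tokens=4096,     # should exceed budget_tokens below
        additional_kwargs={
            "additionalModelRequestFields": {
                "thinking": {"type": "enabled", "budget_tokens": 1024}
            }
        },
    )

Routing the thinking block through additional_kwargs is what lets the Converse API receive it untouched, which is also why the connectivity test in llm_adapter.py (below) inspects additional_kwargs.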

src/unstract/sdk/adapters/llm/bedrock/src/static/json_schema.json

Lines changed: 44 additions & 8 deletions
@@ -38,13 +38,6 @@
       "title": "AWS Region name",
       "description": "Provide the AWS Region name where the service is running. Eg. us-east-1"
     },
-    "context_size": {
-      "type": "number",
-      "minimum": 0,
-      "multipleOf": 1,
-      "title": "Context Size",
-      "description": "The maximum number of context (input) tokens for the model. For setting default in supported models, leave this empty."
-    },
     "max_tokens": {
       "type": "number",
       "minimum": 0,
@@ -68,6 +61,49 @@
       "title": "Timeout",
       "default": 900,
       "description": "Timeout in seconds"
+    },
+    "enable_thinking": {
+      "type": "boolean",
+      "title": "Enable Extended Thinking",
+      "default": false,
+      "description": "Enhance reasoning for complex tasks with step-by-step transparency. Available only for Claude 3.7 Sonnet."
+    }
+  },
+  "allOf": [
+    {
+      "if": {
+        "properties": {
+          "enable_thinking": {
+            "const": true
+          }
+        }
+      },
+      "then": {
+        "properties": {
+          "budget_tokens": {
+            "type": "number",
+            "minimum": 1024,
+            "default": 1024,
+            "title": "Thinking Budget Tokens",
+            "description": "Sets the max tokens for Claude's internal reasoning when thinking is enabled"
+          }
+        },
+        "required": [
+          "budget_tokens"
+        ]
+      }
+    },
+    {
+      "if": {
+        "properties": {
+          "enable_thinking": {
+            "const": false
+          }
+        }
+      },
+      "then": {
+        "properties": {}
+      }
     }
-  }
+  ]
 }
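This follows the same conditional pattern as the Azure schema: budget_tokens appears, and is required (minimum 1024), only when enable_thinking is true. A sketch of the two config shapes, connection fields omitted:

    # Valid: thinking enabled, so budget_tokens (>= 1024) is required.
    thinking_on = {"enable_thinking": True, "budget_tokens": 2048}

    # Valid: thinking disabled; budget_tokens is not needed.
    thinking_off = {"enable_thinking": False}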

src/unstract/sdk/adapters/llm/llm_adapter.py

Lines changed: 6 additions & 1 deletion
@@ -76,9 +76,14 @@ def _test_llm_instance(llm: LLM | None) -> bool:
         if hasattr(llm, "model") and llm.model not in O1_MODELS:
             completion_kwargs["temperature"] = 0.003
 
-        if hasattr(llm, "thinking_dict") and llm.thinking_dict is not None:
+        if (
+            hasattr(llm, "thinking_dict") and llm.thinking_dict is not None
+        ) or (
+            hasattr(llm, "additional_kwargs") and llm.additional_kwargs is not None
+        ):
             completion_kwargs["temperature"] = 1
 
+
         response = llm.complete("The capital of Tamilnadu is ", **completion_kwargs)
         response_lower_case: str = response.text.lower()
         find_match = re.search("chennai", response_lower_case)
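The widened check exists because thinking/reasoning-enabled models reject the near-zero test temperature; Anthropic's extended thinking in particular only accepts temperature 1. A minimal, self-contained sketch of the same gating logic; the FakeLLM class and pick_test_temperature helper are hypothetical, for illustration only.

    class FakeLLM:
        """Hypothetical stand-in exposing the attributes the test inspects."""

        def __init__(self, thinking_dict=None, additional_kwargs=None):
            self.thinking_dict = thinking_dict
            self.additional_kwargs = additional_kwargs


    def pick_test_temperature(llm) -> float:
        # Mirrors _test_llm_instance: force temperature 1 for thinking/reasoning
        # configurations, otherwise use a near-zero temperature for determinism.
        if (
            getattr(llm, "thinking_dict", None) is not None
            or getattr(llm, "additional_kwargs", None) is not None
        ):
            return 1
        return 0.003


    assert pick_test_temperature(FakeLLM()) == 0.003
    assert pick_test_temperature(
        FakeLLM(additional_kwargs={"additionalModelRequestFields": {}})
    ) == 1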

src/unstract/sdk/adapters/llm/open_ai/src/open_ai.py

Lines changed: 2 additions & 2 deletions
@@ -20,7 +20,7 @@ class Constants:
     API_BASE = "api_base"
     API_VERSION = "api_version"
     MAX_TOKENS = "max_tokens"
-    RESONING_EFFORT = "reasoning_effort"
+    REASONING_EFFORT = "reasoning_effort"
     ENABLE_REASONING = "enable_reasoning"
 
 
@@ -79,7 +79,7 @@ def get_llm_instance(self) -> LLM:
 
         if enable_reasoning:
             llm_kwargs["reasoning_effort"] = self.config.get(
-                Constants.RESONING_EFFORT
+                Constants.REASONING_EFFORT
             )
 
         llm = OpenAI(**llm_kwargs)
