Commit 130672d

Get structured outputs using LLM native APIs (#418)

1 parent df6fa89 commit 130672d

10 files changed with 393 additions and 340 deletions

pyproject.toml

Lines changed: 0 additions & 1 deletion

@@ -16,7 +16,6 @@ requires-python = ">=3.10"
 dependencies = [
     "aiohttp>=3.11.13",
     "fastapi>=0.115.6",
-    "instructor>=1.7.9",
     "jsonref>=1.1.0",
     "mcp>=1.10.1",
     "numpy>=2.1.3",

src/mcp_agent/config.py

Lines changed: 2 additions & 0 deletions

@@ -10,6 +10,7 @@
 import threading
 import warnings

+from httpx import URL
 from pydantic import AliasChoices, BaseModel, ConfigDict, Field, field_validator
 from pydantic_settings import BaseSettings, SettingsConfigDict

@@ -205,6 +206,7 @@ class AnthropicSettings(BaseSettings, VertexAIMixin, BedrockMixin):
             "provider", "ANTHROPIC_PROVIDER", "anthropic__provider"
         ),
     )
+    base_url: str | URL | None = Field(default=None)

     model_config = SettingsConfigDict(
         env_prefix="ANTHROPIC_",
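
The new base_url field means the Anthropic client can be pointed at a proxy or an API-compatible gateway instead of the default endpoint. A minimal sketch of how the setting flows into client construction, mirroring the change in augmented_llm_anthropic.py below (the gateway URL is hypothetical; values can also come from ANTHROPIC_-prefixed environment variables per the env_prefix above):

    # Sketch only: construct an Anthropic client from AnthropicSettings,
    # honoring the new optional base_url.
    from anthropic import AsyncAnthropic
    from mcp_agent.config import AnthropicSettings

    settings = AnthropicSettings(
        api_key="sk-ant-...",  # placeholder key
        base_url="https://llm-gateway.example.com",  # hypothetical proxy/gateway
    )
    client = AsyncAnthropic(api_key=settings.api_key, base_url=settings.base_url)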

src/mcp_agent/workflows/llm/augmented_llm.py

Lines changed: 5 additions & 0 deletions

@@ -168,6 +168,11 @@ class RequestParams(CreateMessageRequestParams):
     This is used to stably identify the user in the LLM provider's logs.
     """

+    strict: bool = False
+    """
+    Whether models that support strict mode should strictly enforce the response schema.
+    """
+

 class AugmentedLLMProtocol(Protocol, Generic[MessageParamT, MessageT]):
     """Protocol defining the interface for augmented LLMs"""

src/mcp_agent/workflows/llm/augmented_llm_anthropic.py

Lines changed: 66 additions & 95 deletions

@@ -48,7 +48,7 @@
 from mcp_agent.tracing.telemetry import get_tracer, is_otel_serializable, telemetry
 from mcp_agent.tracing.token_tracking_decorator import track_tokens
 from mcp_agent.utils.common import ensure_serializable, typed_dict_extras, to_string
-from mcp_agent.utils.pydantic_type_serializer import serialize_model, deserialize_model
+
 from mcp_agent.workflows.llm.augmented_llm import (
     AugmentedLLM,
     ModelT,

@@ -83,15 +83,6 @@ class RequestCompletionRequest(BaseModel):
     payload: dict


-class RequestStructuredCompletionRequest(BaseModel):
-    config: AnthropicSettings
-    params: RequestParams
-    response_model: Type[ModelT] | None = None
-    serialized_response_model: str | None = None
-    response_str: str
-    model: str
-
-
 def create_anthropic_instance(settings: AnthropicSettings):
     """Select and initialise the appropriate anthropic client instance based on settings"""
     if settings.provider == "bedrock":

@@ -419,68 +410,86 @@ async def generate_structured(
         response_model: Type[ModelT],
         request_params: RequestParams | None = None,
     ) -> ModelT:
-        # First we invoke the LLM to generate a string response
-        # We need to do this in a two-step process because Instructor doesn't
-        # know how to invoke MCP tools via call_tool, so we'll handle all the
-        # processing first and then pass the final response through Instructor
+        # Use Anthropic's native structured output via a forced tool call carrying JSON input
+        import json
+
         tracer = get_tracer(self.context)
         with tracer.start_as_current_span(
             f"{self.__class__.__name__}.{self.name}.generate_structured"
         ) as span:
             span.set_attribute(GEN_AI_AGENT_NAME, self.agent.name)
             self._annotate_span_for_generation_message(span, message)

-            response = await self.generate_str(
-                message=message,
-                request_params=request_params,
-            )
-
             params = self.get_request_params(request_params)
-
             if self.context.tracing_enabled:
                 AugmentedLLM.annotate_span_with_request_params(span, params)

-            model = await self.select_model(params)
-            span.set_attribute(GEN_AI_REQUEST_MODEL, model)
-
-            span.set_attribute("response_model", response_model.__name__)
-
-            serialized_response_model: str | None = None
-
-            if self.executor and self.executor.execution_engine == "temporal":
-                # Serialize the response model to a string
-                serialized_response_model = serialize_model(response_model)
-
-            structured_response = await self.executor.execute(
-                AnthropicCompletionTasks.request_structured_completion_task,
-                RequestStructuredCompletionRequest(
-                    config=self.context.config.anthropic,
-                    params=params,
-                    response_model=response_model
-                    if not serialized_response_model
-                    else None,
-                    serialized_response_model=serialized_response_model,
-                    response_str=response,
-                    model=model,
-                ),
+            model_name = (
+                await self.select_model(params) or self.default_request_params.model
             )
+            span.set_attribute(GEN_AI_REQUEST_MODEL, model_name)

-            # TODO: saqadri (MAC) - fix request_structured_completion_task to return ensure_serializable
-            # Convert dict back to the proper model instance if needed
-            if isinstance(structured_response, dict):
-                structured_response = response_model.model_validate(structured_response)
+            # Convert message(s) to Anthropic format
+            messages: List[MessageParam] = []
+            if params.use_history:
+                messages.extend(self.history.get())
+            messages.extend(
+                AnthropicConverter.convert_mixed_messages_to_anthropic(message)
+            )

-            if self.context.tracing_enabled:
-                try:
-                    span.set_attribute(
-                        "structured_response_json",
-                        structured_response.model_dump_json(),
-                    )
-                # pylint: disable=broad-exception-caught
-                except Exception:
-                    span.set_attribute("unstructured_response", response)
+            # Define a single tool that matches the Pydantic schema
+            schema = response_model.model_json_schema()
+            tools: List[ToolParam] = [
+                {
+                    "name": "return_structured_output",
+                    "description": "Return the response in the required JSON format",
+                    "input_schema": schema,
+                }
+            ]
+
+            args = {
+                "model": model_name,
+                "messages": messages,
+                "system": self.instruction or params.systemPrompt,
+                "tools": tools,
+                "tool_choice": {"type": "tool", "name": "return_structured_output"},
+            }
+            if params.maxTokens is not None:
+                args["max_tokens"] = params.maxTokens
+            if params.stopSequences:
+                args["stop_sequences"] = params.stopSequences
+
+            # Call Anthropic directly (one-turn streaming for consistency)
+            base_url = None
+            if self.context and self.context.config and self.context.config.anthropic:
+                base_url = self.context.config.anthropic.base_url
+                api_key = self.context.config.anthropic.api_key
+                client = AsyncAnthropic(api_key=api_key, base_url=base_url)
+            else:
+                client = AsyncAnthropic()
+
+            async with client:
+                async with client.messages.stream(**args) as stream:
+                    final = await stream.get_final_message()
+
+            # Extract tool_use input and validate
+            for block in final.content:
+                if (
+                    getattr(block, "type", None) == "tool_use"
+                    and getattr(block, "name", "") == "return_structured_output"
+                ):
+                    data = getattr(block, "input", None)
+                    try:
+                        if isinstance(data, str):
+                            return response_model.model_validate(json.loads(data))
+                        return response_model.model_validate(data)
+                    except Exception:
+                        # Fallthrough to error
+                        break

-            return structured_response
+            raise ValueError(
+                "Failed to obtain structured output from Anthropic response"
+            )

     @classmethod
     def convert_message_to_message_param(

@@ -770,44 +779,6 @@ async def request_completion_task(
         response = ensure_serializable(response)
         return response

-    @staticmethod
-    @workflow_task
-    @telemetry.traced()
-    async def request_structured_completion_task(
-        request: RequestStructuredCompletionRequest,
-    ):
-        """
-        Request a structured completion using Instructor's Anthropic API.
-        """
-        import instructor
-
-        if request.response_model:
-            response_model = request.response_model
-        elif request.serialized_response_model:
-            response_model = deserialize_model(request.serialized_response_model)
-        else:
-            raise ValueError(
-                "Either response_model or serialized_response_model must be provided for structured completion."
-            )
-
-        # We pass the text through instructor to extract structured data
-        client = instructor.from_anthropic(create_anthropic_instance(request.config))
-
-        # Extract structured data from natural language without blocking the loop
-        loop = asyncio.get_running_loop()
-        structured_response = await loop.run_in_executor(
-            None,
-            functools.partial(
-                client.chat.completions.create,
-                model=request.model,
-                response_model=response_model,
-                messages=[{"role": "user", "content": request.response_str}],
-                max_tokens=request.params.maxTokens,
-            ),
-        )
-
-        return structured_response
-

 class AnthropicMCPTypeConverter(ProviderToMCPConverter[MessageParam, Message]):
     """

src/mcp_agent/workflows/llm/augmented_llm_azure.py

Lines changed: 33 additions & 6 deletions

@@ -2,6 +2,7 @@
 import functools
 import json
 from typing import Any, Iterable, Optional, Type, Union
+from azure.core.exceptions import HttpResponseError
 from azure.ai.inference import ChatCompletionsClient
 from azure.ai.inference.models import (
     ChatCompletions,

@@ -351,6 +352,7 @@ async def generate_structured(
                 name=response_model.__name__,
                 description=response_model.__doc__,
                 schema=json_schema,
+                strict=request_params.strict,
             )
             request_params.metadata = metadata

@@ -362,7 +364,7 @@

     @classmethod
     def convert_message_to_message_param(
-        cls, message: ResponseMessage, **kwargs
+        cls, message: ResponseMessage
     ) -> AssistantMessage:
         """Convert a response object to an input parameter object to allow LLM calls to be chained."""
         assistant_message = AssistantMessage(

@@ -539,12 +541,37 @@ async def request_completion_task(
             ),
         )

-        payload = request.payload
-        # Offload sync SDK call to a thread to avoid blocking the event loop
+        payload = request.payload.copy()
         loop = asyncio.get_running_loop()
-        response = await loop.run_in_executor(
-            None, functools.partial(azure_client.complete, **payload)
-        )
+
+        try:
+            response = await loop.run_in_executor(
+                None, functools.partial(azure_client.complete, **payload)
+            )
+        except HttpResponseError as e:
+            logger = get_logger(__name__)
+
+            if e.status_code != 400:
+                logger.error(f"Azure API call failed: {e}")
+                raise
+
+            logger.warning(
+                f"Initial Azure API call failed: {e}. Retrying with fallback parameters."
+            )
+
+            # Create a new payload with fallback values for commonly problematic parameters
+            fallback_payload = {**payload, "max_tokens": None, "temperature": 1}
+
+            try:
+                response = await loop.run_in_executor(
+                    None, functools.partial(azure_client.complete, **fallback_payload)
+                )
+            except Exception as retry_error:
+                # If retry also fails, raise a more informative error
+                raise RuntimeError(
+                    f"Azure API call failed even with fallback parameters. "
+                    f"Original error: {e}. Retry error: {retry_error}"
+                ) from retry_error
         return response
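
The retry above fires only on HTTP 400, on the theory that a bad request usually means a parameter the targeted deployment rejects (commonly max_tokens or temperature), so the call is repeated once with permissive defaults. A condensed sketch of the same pattern, assuming a synchronous azure-ai-inference ChatCompletionsClient:

    import asyncio
    import functools

    from azure.core.exceptions import HttpResponseError


    async def complete_with_fallback(azure_client, payload: dict):
        """Run the blocking SDK call off the event loop; on HTTP 400,
        retry once with fallback values for commonly rejected parameters."""
        loop = asyncio.get_running_loop()
        try:
            return await loop.run_in_executor(
                None, functools.partial(azure_client.complete, **payload)
            )
        except HttpResponseError as e:
            if e.status_code != 400:
                raise  # not a parameter problem; propagate as-is
            fallback_payload = {**payload, "max_tokens": None, "temperature": 1}
            return await loop.run_in_executor(
                None, functools.partial(azure_client.complete, **fallback_payload)
            )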