
from .monitoring import TokenUsage
from .tools import Tool
-from .utils import RateLimiter, _is_package_available, encode_image_base64, make_image_url, parse_json_blob
+from .utils import RateLimiter, Retrying, _is_package_available, encode_image_base64, make_image_url, parse_json_blob


if TYPE_CHECKING:

logger = logging.getLogger(__name__)

+RETRY_WAIT = 120
+RETRY_MAX_ATTEMPTS = 3
STRUCTURED_GENERATION_PROVIDERS = ["cerebras", "fireworks-ai"]
CODEAGENT_RESPONSE_FORMAT = {
    "type": "json_schema",
@@ -1078,6 +1080,8 @@ class ApiModel(Model):
            Pre-configured API client instance. If not provided, a default client will be created. Defaults to None.
        requests_per_minute (`float`, **optional**):
            Rate limit in requests per minute.
+        retry (`bool`, **optional**):
+            Whether to retry on rate limit errors, up to RETRY_MAX_ATTEMPTS times. Defaults to True.
        **kwargs:
            Additional keyword arguments to forward to the underlying model completion call.
    """
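The new `retry` flag is surfaced on every `ApiModel` subclass. A minimal usage sketch, assuming the constructor change below and that `OpenAIModel` (the renamed class later in this diff) is exported from the package (the model id is illustrative):

```python
from smolagents import OpenAIModel

# Default: rate-limited calls are retried up to RETRY_MAX_ATTEMPTS (3) times,
# waiting RETRY_WAIT (120 s) between attempts.
model = OpenAIModel(model_id="gpt-4o-mini")

# Opt out: a single attempt, so the first 429 propagates to the caller.
model_no_retry = OpenAIModel(model_id="gpt-4o-mini", retry=False)
```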
@@ -1088,12 +1092,21 @@ def __init__(
        custom_role_conversions: dict[str, str] | None = None,
        client: Any | None = None,
        requests_per_minute: float | None = None,
+        retry: bool = True,
        **kwargs,
    ):
        super().__init__(model_id=model_id, **kwargs)
        self.custom_role_conversions = custom_role_conversions or {}
        self.client = client or self.create_client()
        self.rate_limiter = RateLimiter(requests_per_minute)
+        self.retryer = Retrying(
+            max_attempts=RETRY_MAX_ATTEMPTS if retry else 1,
+            wait_seconds=RETRY_WAIT,
+            retry_predicate=is_rate_limit_error,
+            reraise=True,
+            before_sleep_logger=(logger, logging.INFO),
+            after_logger=(logger, logging.INFO),
+        )

    def create_client(self):
        """Create the API client for the specific service."""
@@ -1104,6 +1117,17 @@ def _apply_rate_limit(self):
        self.rate_limiter.throttle()


+def is_rate_limit_error(exception: BaseException) -> bool:
+    """Check if the exception is a rate limit error."""
+    error_str = str(exception).lower()
+    return (
+        "429" in error_str
+        or "rate limit" in error_str
+        or "too many requests" in error_str
+        or "rate_limit" in error_str
+    )
+
+
class LiteLLMModel(ApiModel):
    """Model to use [LiteLLM Python SDK](https://docs.litellm.ai/docs/#litellm-python-sdk) to access hundreds of LLMs.

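`Retrying` is imported from `.utils` and its implementation is not part of this diff. Below is a minimal sketch of a compatible helper, inferred purely from the call sites above; the class body, and everything about it beyond the constructor keywords and `__call__(fn, **kwargs)` shape used in this diff, is an assumption, not the actual smolagents implementation:

```python
import logging
import time
from collections.abc import Callable
from typing import Any


class Retrying:
    """Call a function, retrying while retry_predicate(exception) holds (assumed interface)."""

    def __init__(
        self,
        max_attempts: int = 3,
        wait_seconds: float = 120.0,
        retry_predicate: Callable[[BaseException], bool] = lambda _exc: True,
        reraise: bool = True,
        before_sleep_logger: tuple[logging.Logger, int] | None = None,
        after_logger: tuple[logging.Logger, int] | None = None,
    ):
        self.max_attempts = max_attempts
        self.wait_seconds = wait_seconds
        self.retry_predicate = retry_predicate
        self.reraise = reraise  # kept for signature parity; this sketch always re-raises the original error
        self.before_sleep_logger = before_sleep_logger
        self.after_logger = after_logger

    def __call__(self, fn: Callable[..., Any], *args: Any, **kwargs: Any) -> Any:
        for attempt in range(1, self.max_attempts + 1):
            try:
                result = fn(*args, **kwargs)
            except BaseException as exc:
                # Give up on non-retryable errors, or once all attempts are used.
                if not self.retry_predicate(exc) or attempt == self.max_attempts:
                    raise
                if self.before_sleep_logger is not None:
                    log, level = self.before_sleep_logger
                    log.log(level, "Attempt %d/%d failed (%s); sleeping %ss", attempt, self.max_attempts, exc, self.wait_seconds)
                time.sleep(self.wait_seconds)
            else:
                if self.after_logger is not None and attempt > 1:
                    log, level = self.after_logger
                    log.log(level, "Call succeeded on attempt %d/%d", attempt, self.max_attempts)
                return result
```

With `max_attempts=1` (the `retry=False` path) the loop runs once and any failure propagates immediately. The predicate keeps retries scoped to rate-limit failures: `is_rate_limit_error(RuntimeError("429 Too Many Requests"))` is `True`, while, say, a connection error is re-raised on the first attempt.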
@@ -1186,7 +1210,8 @@ def generate(
            **kwargs,
        )
        self._apply_rate_limit()
-        response = self.client.completion(**completion_kwargs)
+        response = self.retryer(self.client.completion, **completion_kwargs)
+
        if not response.choices:
            raise RuntimeError(
                f"Unexpected API response: model '{self.model_id}' returned no choices. "
@@ -1228,7 +1253,9 @@ def generate_stream(
            **kwargs,
        )
        self._apply_rate_limit()
-        for event in self.client.completion(**completion_kwargs, stream=True, stream_options={"include_usage": True}):
+        for event in self.retryer(
+            self.client.completion, **completion_kwargs, stream=True, stream_options={"include_usage": True}
+        ):
            if getattr(event, "usage", None):
                yield ChatMessageStreamDelta(
                    content="",
@@ -1398,8 +1425,8 @@ class InferenceClientModel(ApiModel):
    Example:
    ```python
    >>> engine = InferenceClientModel(
-    ...     model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
-    ...     provider="nebius",
+    ...     model_id="Qwen/Qwen3-Next-80B-A3B-Thinking",
+    ...     provider="hyperbolic",
    ...     token="your_hf_token_here",
    ...     max_tokens=5000,
    ... )
@@ -1412,7 +1439,7 @@ class InferenceClientModel(ApiModel):

    def __init__(
        self,
-        model_id: str = "Qwen/Qwen2.5-Coder-32B-Instruct",
+        model_id: str = "Qwen/Qwen3-Next-80B-A3B-Instruct",
        provider: str | None = None,
        token: str | None = None,
        timeout: int = 120,
@@ -1472,7 +1499,7 @@ def generate(
            **kwargs,
        )
        self._apply_rate_limit()
-        response = self.client.chat_completion(**completion_kwargs)
+        response = self.retryer(self.client.chat_completion, **completion_kwargs)
        content = response.choices[0].message.content
        if stop_sequences is not None and not self.supports_stop_parameter:
            content = remove_content_after_stop_sequences(content, stop_sequences)
@@ -1506,8 +1533,11 @@ def generate_stream(
            **kwargs,
        )
        self._apply_rate_limit()
-        for event in self.client.chat.completions.create(
-            **completion_kwargs, stream=True, stream_options={"include_usage": True}
+        for event in self.retryer(
+            self.client.chat.completions.create,
+            **completion_kwargs,
+            stream=True,
+            stream_options={"include_usage": True},
        ):
            if getattr(event, "usage", None):
                yield ChatMessageStreamDelta(
@@ -1539,12 +1569,12 @@ def generate_stream(
                raise ValueError(f"No content or tool calls in event: {event}")


-class OpenAIServerModel(ApiModel):
+class OpenAIModel(ApiModel):
    """This model connects to an OpenAI-compatible API server.

    Parameters:
        model_id (`str`):
-            The model identifier to use on the server (e.g. "gpt-3.5-turbo").
+            The model identifier to use on the server (e.g. "gpt-5").
        api_base (`str`, *optional*):
            The base URL of the OpenAI-compatible API server.
        api_key (`str`, *optional*):
@@ -1595,7 +1625,7 @@ def create_client(self):
            import openai
        except ModuleNotFoundError as e:
            raise ModuleNotFoundError(
-                "Please install 'openai' extra to use OpenAIServerModel: `pip install 'smolagents[openai]'`"
+                "Please install 'openai' extra to use OpenAIModel: `pip install 'smolagents[openai]'`"
            ) from e

        return openai.OpenAI(**self.client_kwargs)
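The rename does not change the class's reach: it still targets any OpenAI-compatible endpoint via `api_base`, per the docstring above. A sketch pointing it at a self-hosted server (URL, model name, and key are illustrative):

```python
from smolagents import OpenAIModel

model = OpenAIModel(
    model_id="my-local-model",
    api_base="http://localhost:8000/v1",  # any OpenAI-compatible server
    api_key="not-needed-locally",
)
```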
@@ -1619,8 +1649,11 @@ def generate_stream(
            **kwargs,
        )
        self._apply_rate_limit()
-        for event in self.client.chat.completions.create(
-            **completion_kwargs, stream=True, stream_options={"include_usage": True}
+        for event in self.retryer(
+            self.client.chat.completions.create,
+            **completion_kwargs,
+            stream=True,
+            stream_options={"include_usage": True},
        ):
            if event.usage:
                yield ChatMessageStreamDelta(
@@ -1670,7 +1703,7 @@ def generate(
            **kwargs,
        )
        self._apply_rate_limit()
-        response = self.client.chat.completions.create(**completion_kwargs)
+        response = self.retryer(self.client.chat.completions.create, **completion_kwargs)
        content = response.choices[0].message.content
        if stop_sequences is not None and not self.supports_stop_parameter:
            content = remove_content_after_stop_sequences(content, stop_sequences)
@@ -1686,10 +1719,10 @@ def generate(
        )


-OpenAIModel = OpenAIServerModel
+OpenAIServerModel = OpenAIModel


-class AzureOpenAIServerModel(OpenAIServerModel):
+class AzureOpenAIModel(OpenAIModel):
    """This model connects to an Azure OpenAI deployment.

    Parameters:
@@ -1740,16 +1773,16 @@ def create_client(self):
            import openai
        except ModuleNotFoundError as e:
            raise ModuleNotFoundError(
-                "Please install 'openai' extra to use AzureOpenAIServerModel: `pip install 'smolagents[openai]'`"
+                "Please install 'openai' extra to use AzureOpenAIModel: `pip install 'smolagents[openai]'`"
            ) from e

        return openai.AzureOpenAI(**self.client_kwargs)


-AzureOpenAIModel = AzureOpenAIServerModel
+AzureOpenAIServerModel = AzureOpenAIModel


-class AmazonBedrockServerModel(ApiModel):
+class AmazonBedrockModel(ApiModel):
    """
    A model class for interacting with Amazon Bedrock Server models through the Bedrock API.

@@ -1789,7 +1822,7 @@ class AmazonBedrockServerModel(ApiModel):
    Examples:
        Creating a model instance with default settings:
        ```python
-        >>> bedrock_model = AmazonBedrockServerModel(
+        >>> bedrock_model = AmazonBedrockModel(
        ...     model_id='us.amazon.nova-pro-v1:0'
        ... )
        ```
@@ -1798,15 +1831,15 @@ class AmazonBedrockServerModel(ApiModel):
        ```python
        >>> import boto3
        >>> client = boto3.client('bedrock-runtime', region_name='us-west-2')
-        >>> bedrock_model = AmazonBedrockServerModel(
+        >>> bedrock_model = AmazonBedrockModel(
        ...     model_id='us.amazon.nova-pro-v1:0',
        ...     client=client
        ... )
        ```

        Creating a model instance with client_kwargs for internal client creation:
        ```python
-        >>> bedrock_model = AmazonBedrockServerModel(
+        >>> bedrock_model = AmazonBedrockModel(
        ...     model_id='us.amazon.nova-pro-v1:0',
        ...     client_kwargs={'region_name': 'us-west-2', 'endpoint_url': 'https://custom-endpoint.com'}
        ... )
@@ -1823,7 +1856,7 @@ class AmazonBedrockServerModel(ApiModel):
        ...         "guardrailVersion": 'v1'
        ...     },
        ... }
-        >>> bedrock_model = AmazonBedrockServerModel(
+        >>> bedrock_model = AmazonBedrockModel(
        ...     model_id='anthropic.claude-3-haiku-20240307-v1:0',
        ...     **additional_api_config
        ... )
@@ -1929,7 +1962,7 @@ def generate(
        )
        self._apply_rate_limit()
        # self.client is created in ApiModel class
-        response = self.client.converse(**completion_kwargs)
+        response = self.retryer(self.client.converse, **completion_kwargs)

        # Get content blocks with "text" key: in case thinking blocks are present, discard them
        message_content_blocks_with_text = [
@@ -1953,7 +1986,7 @@ def generate(
        )


-AmazonBedrockModel = AmazonBedrockServerModel
+AmazonBedrockServerModel = AmazonBedrockModel

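Each rename in this diff leaves a module-level alias binding the old name to the new class, so existing imports keep working unchanged. Assuming both names remain exported, the aliases are plain identity:

```python
from smolagents import (
    AmazonBedrockModel,
    AmazonBedrockServerModel,
    AzureOpenAIModel,
    AzureOpenAIServerModel,
    OpenAIModel,
    OpenAIServerModel,
)

# Old names are aliases, not subclasses: each pair refers to the same class object.
assert OpenAIServerModel is OpenAIModel
assert AzureOpenAIServerModel is AzureOpenAIModel
assert AmazonBedrockServerModel is AmazonBedrockModel
```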
__all__ = [
    "REMOVE_PARAMETER",