Skip to content

Commit 1934d56

Browse files
authored
feat: add fallback mechanism for specific error codes in ai-proxy-multi (apache#12571)
1 parent ae19642 commit 1934d56

File tree

8 files changed

+772
-55
lines changed

8 files changed

+772
-55
lines changed

apisix/plugins/ai-drivers/openai-base.lua

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,11 @@ function _M.request(self, ctx, conf, request_table, extra_opts)
272272
return handle_error(err)
273273
end
274274

275+
-- handling this error separately is needed for retries
276+
if res.status == 429 or (res.status >= 500 and res.status < 600 )then
277+
return res.status
278+
end
279+
275280
local code, body = read_response(ctx, res)
276281

277282
if conf.keepalive then

apisix/plugins/ai-proxy-multi.lua

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,26 @@ local _M = {
4545
schema = schema.ai_proxy_multi_schema,
4646
}
4747

48+
-- Check whether the configured fallback strategy enables the behaviour
-- named `name`.  `strategy` may be a single string, a list of strings,
-- or nil (no fallback configured).
local function fallback_strategy_has(strategy, name)
    local kind = type(strategy)
    if kind == "string" then
        return strategy == name
    end
    if kind == "table" then
        for _, item in ipairs(strategy) do
            if item == name then
                return true
            end
        end
    end
    -- nil or any other type: the behaviour is not enabled
    return false
end
67+
4868

4969
local function get_chash_key_schema(hash_on)
5070
if hash_on == "vars" then
@@ -306,7 +326,8 @@ local function pick_target(ctx, conf, ups_tab)
306326
return nil, nil, err
307327
end
308328
ctx.balancer_server = instance_name
309-
if conf.fallback_strategy == "instance_health_and_rate_limiting" then
329+
if conf.fallback_strategy == "instance_health_and_rate_limiting" or -- for backwards compatible
330+
fallback_strategy_has(conf.fallback_strategy, "rate_limiting") then
310331
local ai_rate_limiting = require("apisix.plugins.ai-rate-limiting")
311332
for _ = 1, #conf.instances do
312333
if ai_rate_limiting.check_instance_status(nil, ctx, instance_name) then
@@ -363,7 +384,32 @@ function _M.access(conf, ctx)
363384
end
364385

365386

366-
_M.before_proxy = base.before_proxy
387+
-- Handle an error status `code` returned by the AI driver for the
-- currently picked instance.  Returns the status code to abort the
-- request with, or nil after a replacement instance has been picked
-- (nil signals the caller to retry with the new instance).
local function retry_on_error(ctx, conf, code)
    if not ctx.server_picker then
        -- no balancer state: nothing to penalise and no alternative to pick
        return code
    end
    -- report the failure so the picker can penalise this instance
    ctx.server_picker.after_balance(ctx, true)
    if (code == 429 and fallback_strategy_has(conf.fallback_strategy, "http_429")) or
        (code >= 500 and code < 600 and
        fallback_strategy_has(conf.fallback_strategy, "http_5xx")) then
        local name, ai_instance, err = pick_ai_instance(ctx, conf)
        if err then
            core.log.error("failed to pick new AI instance: ", err)
            return 502
        end
        -- NOTE(review): retries are unbounded here; presumably
        -- pick_ai_instance eventually errors out once instances are
        -- penalised -- confirm there is no infinite-retry path.
        ctx.balancer_ip = name
        ctx.picked_ai_instance_name = name
        ctx.picked_ai_instance = ai_instance
        -- nil return: caller retries with the newly picked instance
        return
    end
    return code
end
407+
408+
-- Proxy-phase entry point: run the shared before_proxy logic and, on
-- eligible error statuses, let retry_on_error fall back to another
-- instance according to conf.fallback_strategy.
function _M.before_proxy(conf, ctx)
    -- retry_on_error already matches the on_error(ctx, conf, code)
    -- callback signature, so it is passed directly without a wrapper.
    return base.before_proxy(conf, ctx, retry_on_error)
end
367413

368414
function _M.log(conf, ctx)
369415
if conf.logging then

apisix/plugins/ai-proxy/base.lua

Lines changed: 52 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ local core = require("apisix.core")
2020
local require = require
2121
local pcall = pcall
2222
local exporter = require("apisix.plugins.prometheus.exporter")
23-
local bad_request = ngx.HTTP_BAD_REQUEST
2423

2524
local _M = {}
2625

@@ -47,51 +46,63 @@ function _M.set_logging(ctx, summaries, payloads)
4746
end
4847

4948

50-
function _M.before_proxy(conf, ctx)
51-
local ai_instance = ctx.picked_ai_instance
52-
local ai_driver = require("apisix.plugins.ai-drivers." .. ai_instance.provider)
49+
-- Shared proxy-phase handler for the ai-proxy family of plugins.
-- When an on_error callback is passed, before_proxy keeps retrying
-- (re-entering the loop with whatever instance on_error picked) until
-- on_error returns an abort code; without a callback the first driver
-- result is returned as-is.
function _M.before_proxy(conf, ctx, on_error)
    while true do
        -- ctx.picked_ai_instance may be replaced by on_error between
        -- iterations, so re-resolve the instance/driver on every pass
        local ai_instance = ctx.picked_ai_instance
        local ai_driver = require("apisix.plugins.ai-drivers." .. ai_instance.provider)

        local request_body, err = ai_driver.validate_request(ctx)
        if not request_body then
            return 400, err
        end

        local extra_opts = {
            endpoint = core.table.try_read_attr(ai_instance, "override", "endpoint"),
            query_params = ai_instance.auth.query or {},
            headers = (ai_instance.auth.header or {}),
            model_options = ai_instance.options,
        }

        if request_body.stream then
            -- ask the upstream to append token usage to the stream
            request_body.stream_options = {
                include_usage = true
            }
            ctx.var.request_type = "ai_stream"
        else
            ctx.var.request_type = "ai_chat"
        end
        if request_body.model then
            ctx.var.request_llm_model = request_body.model
        end
        -- instance-configured model takes precedence over the request body
        local model = ai_instance.options and ai_instance.options.model or request_body.model
        if model then
            ctx.var.llm_model = model
        end

        local do_request = function()
            ctx.llm_request_start_time = ngx.now()
            ctx.var.llm_request_body = request_body
            return ai_driver:request(ctx, conf, request_body, extra_opts)
        end

        exporter.inc_llm_active_connections(ctx)
        local ok, code_or_err, body = pcall(do_request)
        exporter.dec_llm_active_connections(ctx)
        if not ok then
            core.log.error("failed to send request to AI service: ", code_or_err)
            return 500
        end
        -- NOTE(review): on_error is invoked for any truthy code_or_err,
        -- not only error statuses -- presumably drivers return a code
        -- here only on failure; confirm against the driver contract.
        if code_or_err and on_error then
            local abort_code = on_error(ctx, conf, code_or_err)
            if abort_code then
                return abort_code, body
            end
            -- nil abort_code: a new instance was picked, loop and retry
        else
            return code_or_err, body
        end
    end
end
96107

97108

apisix/plugins/ai-proxy/schema.lua

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -196,9 +196,19 @@ _M.ai_proxy_multi_schema = {
196196
instances = ai_instance_schema,
197197
logging_schema = logging_schema,
198198
fallback_strategy = {
199-
type = "string",
200-
enum = { "instance_health_and_rate_limiting" },
201-
default = "instance_health_and_rate_limiting",
199+
anyOf = {
200+
{
201+
type = "string",
202+
enum = {"instance_health_and_rate_limiting", "http_429", "http_5xx"}
203+
},
204+
{
205+
type = "array",
206+
items = {
207+
type = "string",
208+
enum = {"rate_limiting", "http_429", "http_5xx"}
209+
}
210+
}
211+
}
202212
},
203213
timeout = {
204214
type = "integer",

apisix/plugins/ai-request-rewrite.lua

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -213,11 +213,15 @@ function _M.access(conf, ctx)
213213
}
214214

215215
-- Send request to LLM service
216-
local _, _, err = request_to_llm(conf, ai_request_table, ctx)
216+
local code, _, err = request_to_llm(conf, ai_request_table, ctx)
217217
if err then
218218
core.log.error("failed to request LLM: ", err)
219219
return HTTP_INTERNAL_SERVER_ERROR
220220
end
221+
if code == 429 or (code >= 500 and code < 600 ) then
222+
core.log.error("LLM service returned error status: ", code)
223+
return HTTP_INTERNAL_SERVER_ERROR
224+
end
221225
end
222226

223227
return _M

docs/en/latest/plugins/ai-proxy-multi.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ In addition, the Plugin also supports logging LLM request information in the acc
5151

5252
| Name | Type | Required | Default | Valid Values | Description |
5353
|------------------------------------|----------------|----------|-----------------------------------|--------------|-------------|
54-
| fallback_strategy | string | False | instance_health_and_rate_limiting | instance_health_and_rate_limiting | Fallback strategy. When set, the Plugin will check whether the specified instance’s token has been exhausted when a request is forwarded. If so, forward the request to the next instance regardless of the instance priority. When not set, the Plugin will not forward the request to low priority instances when token of the high priority instance is exhausted. |
54+
| fallback_strategy | string or array | False | | string: "instance_health_and_rate_limiting", "http_429", "http_5xx"<br>array: ["rate_limiting", "http_429", "http_5xx"] | Fallback strategies. With `rate_limiting` (or the legacy string `instance_health_and_rate_limiting`), the Plugin checks on each request whether the picked instance's rate-limiting quota is exhausted and, if so, forwards the request to the next instance regardless of instance priority. With `http_429` and/or `http_5xx`, the Plugin retries the request on another instance when the current instance responds with a 429 or a 5xx status respectively. When not set, the Plugin will not forward requests to lower priority instances when the higher priority instance fails or its quota is exhausted. |
5555
| balancer | object | False | | | Load balancing configurations. |
5656
| balancer.algorithm | string | False | roundrobin | [roundrobin, chash] | Load balancing algorithm. When set to `roundrobin`, weighted round robin algorithm is used. When set to `chash`, consistent hashing algorithm is used. |
5757
| balancer.hash_on | string | False | | [vars, headers, cookie, consumer, vars_combinations] | Used when `type` is `chash`. Support hashing on [NGINX variables](https://nginx.org/en/docs/varindex.html), headers, cookie, consumer, or a combination of [NGINX variables](https://nginx.org/en/docs/varindex.html). |
@@ -186,7 +186,7 @@ DeepSeek responses: 2
186186

187187
### Configure Instance Priority and Rate Limiting
188188

189-
The following example demonstrates how you can configure two models with different priorities and apply rate limiting on the instance with a higher priority. In the case where `fallback_strategy` is set to `instance_health_and_rate_limiting`, the Plugin should continue to forward requests to the low priority instance once the high priority instance's rate limiting quota is fully consumed.
189+
The following example demonstrates how you can configure two models with different priorities and apply rate limiting on the instance with a higher priority. In the case where `fallback_strategy` is set to `["rate_limiting"]`, the Plugin should continue to forward requests to the low priority instance once the high priority instance's rate limiting quota is fully consumed.
190190

191191
Create a Route as such and update with your LLM providers, models, API keys, and endpoints if applicable:
192192

@@ -199,7 +199,7 @@ curl "http://127.0.0.1:9180/apisix/admin/routes" -X PUT \
199199
"methods": ["POST"],
200200
"plugins": {
201201
"ai-proxy-multi": {
202-
"fallback_strategy: "instance_health_and_rate_limiting",
202+
"fallback_strategy: ["rate_limiting"],
203203
"instances": [
204204
{
205205
"name": "openai-instance",
@@ -423,7 +423,7 @@ curl "http://127.0.0.1:9180/apisix/admin/routes" -X PUT \
423423
"plugins": {
424424
"key-auth": {},
425425
"ai-proxy-multi": {
426-
"fallback_strategy: "instance_health_and_rate_limiting",
426+
"fallback_strategy: ["rate_limiting"],
427427
"instances": [
428428
{
429429
"name": "openai-instance",

docs/en/latest/plugins/ai-rate-limiting.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -413,9 +413,9 @@ X-AI-RateLimit-Reset-deepseek-instance: 0
413413

414414
### Configure Instance Priority and Rate Limiting
415415

416-
The following example demonstrates how you can configure two models with different priorities and apply rate limiting on the instance with a higher priority. In the case where `fallback_strategy` is set to `instance_health_and_rate_limiting`, the Plugin should continue to forward requests to the low priority instance once the high priority instance's rate limiting quota is fully consumed.
416+
The following example demonstrates how you can configure two models with different priorities and apply rate limiting on the instance with a higher priority. In the case where `fallback_strategy` is set to `["rate_limiting"]`, the Plugin should continue to forward requests to the low priority instance once the high priority instance's rate limiting quota is fully consumed.
417417

418-
Create a Route as such to set rate limiting and a higher priority on `openai-instance` instance and set the `fallback_strategy` to `instance_health_and_rate_limiting`. Update with your LLM providers, models, API keys, and endpoints, if applicable:
418+
Create a Route as such to set rate limiting and a higher priority on `openai-instance` instance and set the `fallback_strategy` to `["rate_limiting"]`. Update with your LLM providers, models, API keys, and endpoints, if applicable:
419419

420420
```shell
421421
curl "http://127.0.0.1:9180/apisix/admin/routes" -X PUT \
@@ -426,7 +426,7 @@ curl "http://127.0.0.1:9180/apisix/admin/routes" -X PUT \
426426
"methods": ["POST"],
427427
"plugins": {
428428
"ai-proxy-multi": {
429-
"fallback_strategy: "instance_health_and_rate_limiting",
429+
"fallback_strategy: ["rate_limiting"],
430430
"instances": [
431431
{
432432
"name": "openai-instance",
@@ -650,7 +650,7 @@ curl "http://127.0.0.1:9180/apisix/admin/routes" -X PUT \
650650
"plugins": {
651651
"key-auth": {},
652652
"ai-proxy-multi": {
653-
"fallback_strategy: "instance_health_and_rate_limiting",
653+
"fallback_strategy: ["rate_limiting"],
654654
"instances": [
655655
{
656656
"name": "openai-instance",

0 commit comments

Comments
 (0)