Skip to content

Commit 1934d56

Browse files
authored
feat: add fallback mechanism for specific error codes in ai-proxy-multi (apache#12571)
1 parent ae19642 commit 1934d56

File tree

8 files changed

+772
-55
lines changed

8 files changed

+772
-55
lines changed

apisix/plugins/ai-drivers/openai-base.lua

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,11 @@ function _M.request(self, ctx, conf, request_table, extra_opts)
272272
return handle_error(err)
273273
end
274274

275+
-- handling this error separately is needed for retries
276+
if res.status == 429 or (res.status >= 500 and res.status < 600 )then
277+
return res.status
278+
end
279+
275280
local code, body = read_response(ctx, res)
276281

277282
if conf.keepalive then

apisix/plugins/ai-proxy-multi.lua

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,26 @@ local _M = {
4545
schema = schema.ai_proxy_multi_schema,
4646
}
4747

48+
-- Check whether the configured fallback strategy enables the behaviour
-- named `name`.  `strategy` may be a single string, a list of strings,
-- or nil (no fallback configured).
local function fallback_strategy_has(strategy, name)
    local kind = type(strategy)
    if kind == "string" then
        return strategy == name
    end
    if kind == "table" then
        for _, item in ipairs(strategy) do
            if item == name then
                return true
            end
        end
    end
    -- nil or any other type: the behaviour is not enabled
    return false
end
67+
4868

4969
local function get_chash_key_schema(hash_on)
5070
if hash_on == "vars" then
@@ -306,7 +326,8 @@ local function pick_target(ctx, conf, ups_tab)
306326
return nil, nil, err
307327
end
308328
ctx.balancer_server = instance_name
309-
if conf.fallback_strategy == "instance_health_and_rate_limiting" then
329+
if conf.fallback_strategy == "instance_health_and_rate_limiting" or -- for backwards compatible
330+
fallback_strategy_has(conf.fallback_strategy, "rate_limiting") then
310331
local ai_rate_limiting = require("apisix.plugins.ai-rate-limiting")
311332
for _ = 1, #conf.instances do
312333
if ai_rate_limiting.check_instance_status(nil, ctx, instance_name) then
@@ -363,7 +384,32 @@ function _M.access(conf, ctx)
363384
end
364385

365386

366-
_M.before_proxy = base.before_proxy
387+
-- Handle an error status `code` returned by the AI driver for the
-- currently picked instance.  Returns the status code to abort the
-- request with, or nil after a replacement instance has been picked
-- (nil signals the caller to retry with the new instance).
local function retry_on_error(ctx, conf, code)
    if not ctx.server_picker then
        -- no balancer state: nothing to penalise and no alternative to pick
        return code
    end
    -- report the failure so the picker can penalise this instance
    ctx.server_picker.after_balance(ctx, true)
    if (code == 429 and fallback_strategy_has(conf.fallback_strategy, "http_429")) or
        (code >= 500 and code < 600 and
        fallback_strategy_has(conf.fallback_strategy, "http_5xx")) then
        local name, ai_instance, err = pick_ai_instance(ctx, conf)
        if err then
            core.log.error("failed to pick new AI instance: ", err)
            return 502
        end
        -- NOTE(review): retries are unbounded here; presumably
        -- pick_ai_instance eventually errors out once instances are
        -- penalised -- confirm there is no infinite-retry path.
        ctx.balancer_ip = name
        ctx.picked_ai_instance_name = name
        ctx.picked_ai_instance = ai_instance
        -- nil return: caller retries with the newly picked instance
        return
    end
    return code
end
407+
408+
-- Proxy-phase entry point: run the shared before_proxy logic and, on
-- eligible error statuses, let retry_on_error fall back to another
-- instance according to conf.fallback_strategy.
function _M.before_proxy(conf, ctx)
    -- retry_on_error already matches the on_error(ctx, conf, code)
    -- callback signature, so it is passed directly without a wrapper.
    return base.before_proxy(conf, ctx, retry_on_error)
end
367413

368414
function _M.log(conf, ctx)
369415
if conf.logging then

apisix/plugins/ai-proxy/base.lua

Lines changed: 52 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ local core = require("apisix.core")
2020
local require = require
2121
local pcall = pcall
2222
local exporter = require("apisix.plugins.prometheus.exporter")
23-
local bad_request = ngx.HTTP_BAD_REQUEST
2423

2524
local _M = {}
2625

@@ -47,51 +46,63 @@ function _M.set_logging(ctx, summaries, payloads)
4746
end
4847

4948

50-
function _M.before_proxy(conf, ctx)
51-
local ai_instance = ctx.picked_ai_instance
52-
local ai_driver = require("apisix.plugins.ai-drivers." .. ai_instance.provider)
49+
-- Shared proxy-phase handler for the ai-proxy family of plugins.
-- When an on_error callback is passed, before_proxy keeps retrying
-- (re-entering the loop with whatever instance on_error picked) until
-- on_error returns an abort code; without a callback the first driver
-- result is returned as-is.
function _M.before_proxy(conf, ctx, on_error)
    while true do
        -- ctx.picked_ai_instance may be replaced by on_error between
        -- iterations, so re-resolve the instance/driver on every pass
        local ai_instance = ctx.picked_ai_instance
        local ai_driver = require("apisix.plugins.ai-drivers." .. ai_instance.provider)

        local request_body, err = ai_driver.validate_request(ctx)
        if not request_body then
            return 400, err
        end

        local extra_opts = {
            endpoint = core.table.try_read_attr(ai_instance, "override", "endpoint"),
            query_params = ai_instance.auth.query or {},
            headers = (ai_instance.auth.header or {}),
            model_options = ai_instance.options,
        }

        if request_body.stream then
            -- ask the upstream to append token usage to the stream
            request_body.stream_options = {
                include_usage = true
            }
            ctx.var.request_type = "ai_stream"
        else
            ctx.var.request_type = "ai_chat"
        end
        if request_body.model then
            ctx.var.request_llm_model = request_body.model
        end
        -- instance-configured model takes precedence over the request body
        local model = ai_instance.options and ai_instance.options.model or request_body.model
        if model then
            ctx.var.llm_model = model
        end

        local do_request = function()
            ctx.llm_request_start_time = ngx.now()
            ctx.var.llm_request_body = request_body
            return ai_driver:request(ctx, conf, request_body, extra_opts)
        end

        exporter.inc_llm_active_connections(ctx)
        local ok, code_or_err, body = pcall(do_request)
        exporter.dec_llm_active_connections(ctx)
        if not ok then
            core.log.error("failed to send request to AI service: ", code_or_err)
            return 500
        end
        -- NOTE(review): on_error is invoked for any truthy code_or_err,
        -- not only error statuses -- presumably drivers return a code
        -- here only on failure; confirm against the driver contract.
        if code_or_err and on_error then
            local abort_code = on_error(ctx, conf, code_or_err)
            if abort_code then
                return abort_code, body
            end
            -- nil abort_code: a new instance was picked, loop and retry
        else
            return code_or_err, body
        end
    end
end
96107

97108

apisix/plugins/ai-proxy/schema.lua

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -196,9 +196,19 @@ _M.ai_proxy_multi_schema = {
196196
instances = ai_instance_schema,
197197
logging_schema = logging_schema,
198198
fallback_strategy = {
199-
type = "string",
200-
enum = { "instance_health_and_rate_limiting" },
201-
default = "instance_health_and_rate_limiting",
199+
anyOf = {
200+
{
201+
type = "string",
202+
enum = {"instance_health_and_rate_limiting", "http_429", "http_5xx"}
203+
},
204+
{
205+
type = "array",
206+
items = {
207+
type = "string",
208+
enum = {"rate_limiting", "http_429", "http_5xx"}
209+
}
210+
}
211+
}
202212
},
203213
timeout = {
204214
type = "integer",

apisix/plugins/ai-request-rewrite.lua

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -213,11 +213,15 @@ function _M.access(conf, ctx)
213213
}
214214

215215
-- Send request to LLM service
216-
local _, _, err = request_to_llm(conf, ai_request_table, ctx)
216+
local code, _, err = request_to_llm(conf, ai_request_table, ctx)
217217
if err then
218218
core.log.error("failed to request LLM: ", err)
219219
return HTTP_INTERNAL_SERVER_ERROR
220220
end
221+
if code == 429 or (code >= 500 and code < 600 ) then
222+
core.log.error("LLM service returned error status: ", code)
223+
return HTTP_INTERNAL_SERVER_ERROR
224+
end
221225
end
222226

223227
return _M

docs/en/latest/plugins/ai-proxy-multi.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ In addition, the Plugin also supports logging LLM request information in the acc
5151

5252
| Name | Type | Required | Default | Valid Values | Description |
5353
|------------------------------------|----------------|----------|-----------------------------------|--------------|-------------|
54-
| fallback_strategy | string | False | instance_health_and_rate_limiting | instance_health_and_rate_limiting | Fallback strategy. When set, the Plugin will check whether the specified instance’s token has been exhausted when a request is forwarded. If so, forward the request to the next instance regardless of the instance priority. When not set, the Plugin will not forward the request to low priority instances when token of the high priority instance is exhausted. |
54+
| fallback_strategy | string or array | False | | string: "instance_health_and_rate_limiting", "http_429", "http_5xx"<br>array: ["rate_limiting", "http_429", "http_5xx"] | Fallback strategies. With `rate_limiting` (or the legacy string `instance_health_and_rate_limiting`), the Plugin checks on each request whether the picked instance's rate-limiting quota is exhausted and, if so, forwards the request to the next instance regardless of instance priority. With `http_429` and/or `http_5xx`, the Plugin retries the request on another instance when the current instance responds with a 429 or a 5xx status respectively. When not set, the Plugin will not forward requests to lower priority instances when the higher priority instance fails or its quota is exhausted. |
5555
| balancer | object | False | | | Load balancing configurations. |
5656
| balancer.algorithm | string | False | roundrobin | [roundrobin, chash] | Load balancing algorithm. When set to `roundrobin`, weighted round robin algorithm is used. When set to `chash`, consistent hashing algorithm is used. |
5757
| balancer.hash_on | string | False | | [vars, headers, cookie, consumer, vars_combinations] | Used when `type` is `chash`. Support hashing on [NGINX variables](https://nginx.org/en/docs/varindex.html), headers, cookie, consumer, or a combination of [NGINX variables](https://nginx.org/en/docs/varindex.html). |
@@ -186,7 +186,7 @@ DeepSeek responses: 2
186186

187187
### Configure Instance Priority and Rate Limiting
188188

189-
The following example demonstrates how you can configure two models with different priorities and apply rate limiting on the instance with a higher priority. In the case where `fallback_strategy` is set to `instance_health_and_rate_limiting`, the Plugin should continue to forward requests to the low priority instance once the high priority instance's rate limiting quota is fully consumed.
189+
The following example demonstrates how you can configure two models with different priorities and apply rate limiting on the instance with a higher priority. In the case where `fallback_strategy` is set to `["rate_limiting"]`, the Plugin should continue to forward requests to the low priority instance once the high priority instance's rate limiting quota is fully consumed.
190190

191191
Create a Route as such and update with your LLM providers, models, API keys, and endpoints if applicable:
192192

@@ -199,7 +199,7 @@ curl "http://127.0.0.1:9180/apisix/admin/routes" -X PUT \
199199
"methods": ["POST"],
200200
"plugins": {
201201
"ai-proxy-multi": {
202-
"fallback_strategy: "instance_health_and_rate_limiting",
202+
"fallback_strategy: ["rate_limiting"],
203203
"instances": [
204204
{
205205
"name": "openai-instance",
@@ -423,7 +423,7 @@ curl "http://127.0.0.1:9180/apisix/admin/routes" -X PUT \
423423
"plugins": {
424424
"key-auth": {},
425425
"ai-proxy-multi": {
426-
"fallback_strategy: "instance_health_and_rate_limiting",
426+
"fallback_strategy: ["rate_limiting"],
427427
"instances": [
428428
{
429429
"name": "openai-instance",

docs/en/latest/plugins/ai-rate-limiting.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -413,9 +413,9 @@ X-AI-RateLimit-Reset-deepseek-instance: 0
413413

414414
### Configure Instance Priority and Rate Limiting
415415

416-
The following example demonstrates how you can configure two models with different priorities and apply rate limiting on the instance with a higher priority. In the case where `fallback_strategy` is set to `instance_health_and_rate_limiting`, the Plugin should continue to forward requests to the low priority instance once the high priority instance's rate limiting quota is fully consumed.
416+
The following example demonstrates how you can configure two models with different priorities and apply rate limiting on the instance with a higher priority. In the case where `fallback_strategy` is set to `["rate_limiting"]`, the Plugin should continue to forward requests to the low priority instance once the high priority instance's rate limiting quota is fully consumed.
417417

418-
Create a Route as such to set rate limiting and a higher priority on `openai-instance` instance and set the `fallback_strategy` to `instance_health_and_rate_limiting`. Update with your LLM providers, models, API keys, and endpoints, if applicable:
418+
Create a Route as such to set rate limiting and a higher priority on `openai-instance` instance and set the `fallback_strategy` to `["rate_limiting"]`. Update with your LLM providers, models, API keys, and endpoints, if applicable:
419419

420420
```shell
421421
curl "http://127.0.0.1:9180/apisix/admin/routes" -X PUT \
@@ -426,7 +426,7 @@ curl "http://127.0.0.1:9180/apisix/admin/routes" -X PUT \
426426
"methods": ["POST"],
427427
"plugins": {
428428
"ai-proxy-multi": {
429-
"fallback_strategy: "instance_health_and_rate_limiting",
429+
"fallback_strategy: ["rate_limiting"],
430430
"instances": [
431431
{
432432
"name": "openai-instance",
@@ -650,7 +650,7 @@ curl "http://127.0.0.1:9180/apisix/admin/routes" -X PUT \
650650
"plugins": {
651651
"key-auth": {},
652652
"ai-proxy-multi": {
653-
"fallback_strategy: "instance_health_and_rate_limiting",
653+
"fallback_strategy: ["rate_limiting"],
654654
"instances": [
655655
{
656656
"name": "openai-instance",

0 commit comments

Comments
 (0)