Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 117 additions & 0 deletions apisix/plugins/ai-proxy-multi.lua
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
-- limitations under the License.
--

local ngx = ngx
local core = require("apisix.core")
local schema = require("apisix.plugins.ai-proxy.schema")
local base = require("apisix.plugins.ai-proxy.base")
Expand Down Expand Up @@ -299,6 +300,76 @@ local function get_instance_conf(instances, name)
end


-- Build an ordered list of instance names honoring the client's model
-- preference. `models` is the client-supplied preference list; each element
-- is either a model-name string or a table with `model` and optional
-- `provider` fields. Instances matching earlier preferences come first.
-- Instances the client did not mention are appended afterwards ordered by
-- their configured priority (descending, stable within equal priority) so
-- the fallback order stays consistent with the server-side priority
-- balancer. Malformed or unrecognized entries are silently ignored.
-- Returns an array covering every configured instance name exactly once.
local function match_client_models(instances, models)
    local ordered_names = {}
    local matched = {}

    -- Harden against a non-array `models` value (e.g. "models": "gpt-4"):
    -- fall back to the default ordering instead of raising in ipairs().
    if type(models) ~= "table" then
        models = {}
    end

    for _, model_pref in ipairs(models) do
        local target_model, target_provider
        if type(model_pref) == "string" then
            target_model = model_pref
        elseif type(model_pref) == "table" then
            target_model = model_pref.model
            target_provider = model_pref.provider
        end

        -- skip entries that are neither a string nor a table with a model
        if target_model then
            for _, instance in ipairs(instances) do
                if not matched[instance.name] then
                    local inst_model = instance.options and instance.options.model
                    local matches
                    if target_provider then
                        matches = (instance.provider == target_provider
                                   and inst_model == target_model)
                    else
                        matches = (inst_model == target_model)
                    end
                    if matches then
                        matched[instance.name] = true
                        table.insert(ordered_names, instance.name)
                        break
                    end
                end
            end
        end
    end

    -- Append instances the client did not list, ordered consistently with
    -- the server-driven priority balancer: higher priority first, original
    -- configuration order within the same priority.
    local unmatched = {}
    for idx, instance in ipairs(instances) do
        if not matched[instance.name] then
            table.insert(unmatched, {
                name = instance.name,
                priority = instance.priority or 0,
                index = idx,
            })
        end
    end
    table.sort(unmatched, function(a, b)
        if a.priority == b.priority then
            return a.index < b.index
        end
        return a.priority > b.priority
    end)
    for _, item in ipairs(unmatched) do
        table.insert(ordered_names, item.name)
    end

    return ordered_names
end


-- Walk the client's preferred instance ordering and return the first usable
-- instance that has not been tried yet. When the "rate_limiting" fallback
-- strategy is enabled, instances currently rate limited are marked as tried
-- and skipped. Returns (name, instance_conf) on success, or
-- (nil, nil, err) once every preferred instance has been exhausted.
local function pick_preferred_instance(ctx, conf)
    local preference = ctx.client_model_preference
    local tried = ctx.client_model_tried or {}

    for _, name in ipairs(preference) do
        if not tried[name] then
            local instance_conf = get_instance_conf(conf.instances, name)
            if instance_conf then
                local skip = false
                if fallback_strategy_has(conf.fallback_strategy, "rate_limiting") then
                    -- required lazily so the dependency is only loaded when
                    -- the strategy is actually enabled
                    local ai_rate_limiting = require("apisix.plugins.ai-rate-limiting")
                    if not ai_rate_limiting.check_instance_status(nil, ctx, name) then
                        core.log.info("preferred instance ", name,
                                      " rate limited, trying next")
                        tried[name] = true
                        skip = true
                    end
                end
                if not skip then
                    ctx.client_model_tried = tried
                    return name, instance_conf
                end
            end
        end
    end

    return nil, nil, "all preferred instances exhausted"
end


function _M.construct_upstream(instance)
local upstream = {}
local node = instance._dns_value
Expand Down Expand Up @@ -409,6 +480,33 @@ end


function _M.access(conf, ctx)
if conf.allow_client_model_preference then
local body, err = core.request.get_body()
if body then
local request_body, decode_err = core.json.decode(body)
if request_body and not decode_err and request_body.models then
ctx.client_model_preference = match_client_models(
conf.instances, request_body.models)
core.log.info("client model preference: ",
core.json.delay_encode(ctx.client_model_preference))
request_body.models = nil
ngx.req.set_body_data(core.json.encode(request_body))
end
end
end
Comment on lines +483 to +496
Copy link

Copilot AI Mar 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The models field is only stripped when allow_client_model_preference is enabled. This contradicts the PR description/docs (and the new tests) which state that models is always stripped before forwarding upstream. As-is, requests sent with models while the feature is disabled will forward an extra models field to upstream providers.

Suggestion: always remove models from the JSON request body when present, but only apply client-driven reordering when allow_client_model_preference is true (i.e., strip unconditionally; reorder conditionally).

Copilot uses AI. Check for mistakes.

if ctx.client_model_preference then
local name, ai_instance, err = pick_preferred_instance(ctx, conf)
if err then
return 503, err
end
ctx.picked_ai_instance_name = name
ctx.picked_ai_instance = ai_instance
ctx.balancer_ip = name
ctx.bypass_nginx_upstream = true
return
Comment on lines +499 to +507
Copy link

Copilot AI Mar 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When client preference is used, instance selection bypasses pick_target() and therefore bypasses the existing active health-check filtering done via fetch_health_instances()/healthcheck_manager. This means a client can force the plugin to try an instance that health checks currently mark as down, changing behavior compared to the server-driven picker.

Suggestion: apply the same health-check availability filtering to the preferred list (e.g. build an allowlist of healthy instance names before iterating, or reuse the existing picker/checker status logic) so client-driven ordering doesn't ignore configured health checks.

Suggested change
local name, ai_instance, err = pick_preferred_instance(ctx, conf)
if err then
return 503, err
end
ctx.picked_ai_instance_name = name
ctx.picked_ai_instance = ai_instance
ctx.balancer_ip = name
ctx.bypass_nginx_upstream = true
return
-- apply health-check availability filtering to the preferred list
local healthy_instances, health_err = fetch_health_instances(conf, ctx)
local use_client_preference = true
if not healthy_instances then
core.log.warn("failed to fetch healthy instances for client model preference: ",
health_err, ", falling back to balancer selection")
use_client_preference = false
else
local healthy_set = {}
for _, inst in ipairs(healthy_instances) do
if inst.name then
healthy_set[inst.name] = true
end
end
local filtered_preference = {}
for _, pref in ipairs(ctx.client_model_preference) do
if pref.instance_name and healthy_set[pref.instance_name] then
filtered_preference[#filtered_preference + 1] = pref
end
end
if #filtered_preference == 0 then
core.log.warn("no healthy instances match client model preference; ",
"falling back to balancer selection")
use_client_preference = false
else
ctx.client_model_preference = filtered_preference
end
end
if use_client_preference then
local name, ai_instance, err = pick_preferred_instance(ctx, conf)
if err then
return 503, err
end
ctx.picked_ai_instance_name = name
ctx.picked_ai_instance = ai_instance
ctx.balancer_ip = name
ctx.bypass_nginx_upstream = true
return
end

Copilot uses AI. Check for mistakes.
end

local ups_tab = {}
local algo = core.table.try_read_attr(conf, "balancer", "algorithm")
if algo == "chash" then
Expand All @@ -430,6 +528,25 @@ end


local function retry_on_error(ctx, conf, code)
if ctx.client_model_preference then
ctx.client_model_tried = ctx.client_model_tried or {}
ctx.client_model_tried[ctx.picked_ai_instance_name] = true
if (code == 429 and fallback_strategy_has(conf.fallback_strategy, "http_429")) or
(code >= 500 and code < 600 and
fallback_strategy_has(conf.fallback_strategy, "http_5xx")) then
local name, ai_instance, err = pick_preferred_instance(ctx, conf)
if err then
core.log.error("all preferred instances failed: ", err)
return 502
end
ctx.balancer_ip = name
ctx.picked_ai_instance_name = name
ctx.picked_ai_instance = ai_instance
return
end
return code
end
Comment on lines 530 to +548
Copy link

Copilot AI Mar 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new client-preference retry path in retry_on_error() (falling back through ctx.client_model_preference on 429/5xx) isn’t covered by the added tests. Current test cases validate selection/reordering and stripping, but not that a 429/5xx from the preferred instance causes the plugin to retry the next preferred instance (and that it stops retrying on non-matching status codes).

Suggestion: add a test that makes the first preferred instance return 429/5xx and asserts the response comes from the next preferred instance (similar to existing fallback tests in ai-proxy-multi.balancer.t).

Copilot uses AI. Check for mistakes.

if not ctx.server_picker then
return code
end
Expand Down
6 changes: 6 additions & 0 deletions apisix/plugins/ai-proxy/schema.lua
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,12 @@ _M.ai_proxy_multi_schema = {
},
instances = ai_instance_schema,
logging = logging_schema,
allow_client_model_preference = {
type = "boolean",
default = false,
description = "When enabled, clients can specify preferred model ordering "
.. "via a `models` array in the request body.",
},
fallback_strategy = {
anyOf = {
{
Expand Down
107 changes: 107 additions & 0 deletions docs/en/latest/plugins/ai-proxy-multi.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,13 @@ In addition, the Plugin also supports logging LLM request information in the acc
| `messages` | Array | True | An array of message objects. |
| `messages.role` | String | True | Role of the message (`system`, `user`, `assistant`).|
| `messages.content` | String | True | Content of the message. |
| `models` | Array | False | Preferred model ordering. Only used when `allow_client_model_preference` is `true`. Each element can be a string (model name) or an object with `provider` and `model` fields. |

## Attributes

| Name | Type | Required | Default | Valid Values | Description |
|------------------------------------|----------------|----------|-----------------------------------|--------------|-------------|
| allow_client_model_preference | boolean | False | false | | When enabled, clients can include a `models` array in the request body to specify preferred model ordering. Each element can be a model name string or an object with `provider` and `model` fields. The plugin matches entries against configured instances and reorders them accordingly. Unrecognized entries are ignored. When disabled, the `models` field has no effect on instance selection. |
| fallback_strategy | string or array | False | | string: "instance_health_and_rate_limiting", "http_429", "http_5xx"<br />array: ["rate_limiting", "http_429", "http_5xx"] | Fallback strategy. When set, the Plugin will check whether the specified instance’s token has been exhausted when a request is forwarded. If so, forward the request to the next instance regardless of the instance priority. When not set, the Plugin will not forward the request to low priority instances when token of the high priority instance is exhausted. |
| balancer | object | False | | | Load balancing configurations. |
| balancer.algorithm | string | False | roundrobin | [roundrobin, chash] | Load balancing algorithm. When set to `roundrobin`, weighted round robin algorithm is used. When set to `chash`, consistent hashing algorithm is used. |
Expand Down Expand Up @@ -1006,3 +1008,108 @@ In the gateway's access log, you should see a log entry similar to the following
```

The access log entry shows the request type is `ai_chat`, Apisix upstream response time is `5765` milliseconds, time to first token is `2858` milliseconds, Requested LLM model is `gpt-4`. LLM model is `gpt-4`, prompt token usage is `23`, and completion token usage is `8`.

### Client-Driven Model Selection

The following example demonstrates how you can allow clients to specify their preferred model ordering in the request body. This is useful when multiple teams share a single gateway but have different model preferences.

Create a Route with `allow_client_model_preference` enabled and multiple model instances:

```shell
curl "http://127.0.0.1:9180/apisix/admin/routes" -X PUT \
-H "X-API-KEY: ${admin_key}" \
-d '{
"id": "ai-proxy-multi-route",
"uri": "/anything",
"methods": ["POST"],
"plugins": {
"ai-proxy-multi": {
"allow_client_model_preference": true,
"fallback_strategy": ["http_429", "http_5xx"],
"instances": [
{
"name": "openai-instance",
"provider": "openai",
"priority": 1,
"weight": 0,
"auth": {
"header": {
"Authorization": "Bearer '"$OPENAI_API_KEY"'"
}
},
"options": {
"model": "gpt-4"
}
},
{
"name": "deepseek-instance",
"provider": "deepseek",
"priority": 0,
"weight": 0,
"auth": {
"header": {
"Authorization": "Bearer '"$DEEPSEEK_API_KEY"'"
}
},
"options": {
"model": "deepseek-chat"
}
}
]
}
}
}'
```

The server-configured priority makes OpenAI the default. Clients that prefer DeepSeek can override the ordering by including a `models` array in the request body.

Send a request with model preference using string shorthand:

```shell
curl "http://127.0.0.1:9080/anything" -X POST \
-H "Content-Type: application/json" \
-d '{
"messages": [
{ "role": "user", "content": "What is 1+1?" }
],
"models": ["deepseek-chat", "gpt-4"]
}'
```

The request is forwarded to DeepSeek first. If DeepSeek returns an error matching the `fallback_strategy`, the request falls back to OpenAI.

You can also use the object form for disambiguation when multiple instances share the same model name:

```shell
curl "http://127.0.0.1:9080/anything" -X POST \
-H "Content-Type: application/json" \
-d '{
"messages": [
{ "role": "user", "content": "What is 1+1?" }
],
"models": [
{ "provider": "deepseek", "model": "deepseek-chat" },
{ "provider": "openai", "model": "gpt-4" }
]
}'
```

Requests without a `models` field use the server-configured priority as usual:

```shell
curl "http://127.0.0.1:9080/anything" -X POST \
-H "Content-Type: application/json" \
-d '{
"messages": [
{ "role": "user", "content": "What is 1+1?" }
]
}'
```

This request is forwarded to OpenAI (priority 1) by default.

:::note

The `models` field is always stripped from the request body before forwarding to the upstream provider. Clients cannot introduce models or providers that are not configured on the route — unrecognized entries are silently ignored.

:::
Loading
Loading