**Example: least-connections load balancing**

```yaml
description: 'Configure the plugin to use two OpenAI models and route requests based on in-flight connection counts and spare capacity.'

extended_description: |
  {% new_in 3.13 %} Configure the plugin to use two OpenAI models and route requests to the backend with the highest spare capacity based on in-flight connection counts.

  In this example, both models have equal weight (2), so requests are distributed based on which backend has fewer active connections. The algorithm automatically routes new requests to backends with more spare capacity, making it particularly effective when backends have varying response times.

weight: 111

requirements:
  - An OpenAI account

config:
  balancer:
    algorithm: least-connections
    retries: 3
    failover_criteria:
      - error
      - timeout
      - http_429
      - non_idempotent
  targets:
    - model:
        name: gpt-4o
        provider: openai
        options:
          max_tokens: 1024
          temperature: 1.0
      route_type: llm/v1/chat
      weight: 2
      auth:
        header_name: Authorization
        header_value: Bearer ${key}
      logging:
        log_statistics: true
        log_payloads: true
    - model:
        name: gpt-4o-mini
        provider: openai
        options:
          max_tokens: 1024
          temperature: 1.0
      route_type: llm/v1/chat
      weight: 2
      auth:
        header_name: Authorization
        header_value: Bearer ${key}
      logging:
        log_statistics: true
        log_payloads: true

variables:
  key:
    value: $OPENAI_API_KEY
    description: The API key to use to connect to OpenAI.
```
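The selection logic behind `least-connections` is easiest to see in miniature. The sketch below is a simplified illustration, not the plugin's actual Lua implementation; the `Backend` class and `pick_backend` helper are hypothetical:

```python
from dataclasses import dataclass

@dataclass
class Backend:
    name: str
    weight: int      # configured capacity, like `weight: 2` in the config above
    in_flight: int   # requests currently open against this backend

def pick_backend(backends):
    # Spare capacity: the share of a backend's weighted capacity that is
    # not already consumed by in-flight requests.
    def spare(b):
        return 1.0 - b.in_flight / b.weight if b.weight else 0.0
    return max(backends, key=spare)

backends = [Backend("gpt-4o", weight=2, in_flight=2),
            Backend("gpt-4o-mini", weight=2, in_flight=1)]
print(pick_backend(backends).name)  # -> gpt-4o-mini (more spare capacity)
```

With equal weights, as in this example, the comparison reduces to picking the backend with the fewest active connections; raising one target's `weight` gives it proportionally more connection capacity, so it keeps receiving new requests until that larger capacity is used up.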
**Example: semantic routing with fallback**

```yaml
description: 'Configure the plugin to route requests based on semantic similarity between prompts and model descriptions, with automatic fallback among models sharing identical descriptions.'

extended_description: |
  {% new_in 3.13 %} Configure the plugin to use three OpenAI models and route requests based on semantic similarity between the prompt and model descriptions.

  In this example, two targets share the same description ("Specialist in programming problems"). When a prompt matches this description, the plugin first routes to the target with weight 75 (gpt-4o). If that target fails, it falls back to the target with weight 25 (gpt-4o-mini) using round-robin. The third target, with a different description ("Specialist in real life topics"), handles prompts about non-technical topics.

weight: 111

min_version:
  gateway: '3.13'

requirements:
  - An OpenAI account
  - A Redis instance for vector storage

config:
  balancer:
    algorithm: semantic
    retries: 3
    failover_criteria:
      - error
      - timeout
      - http_429
      - http_503
      - non_idempotent
  embeddings:
    auth:
      header_name: Authorization
      header_value: Bearer ${key}
    model:
      name: text-embedding-3-small
      provider: openai
  vectordb:
    strategy: redis
    distance_metric: cosine
    threshold: 0.7
    dimensions: 1024
    redis:
      host: localhost
      port: 6379
  targets:
    - model:
        name: gpt-4o
        provider: openai
        options:
          max_tokens: 1024
          temperature: 1.0
      route_type: llm/v1/chat
      weight: 2
      description: Specialist in real life topics
      auth:
        header_name: Authorization
        header_value: Bearer ${key}
      logging:
        log_statistics: true
        log_payloads: true
    - model:
        name: gpt-4o
        provider: openai
        options:
          max_tokens: 1024
          temperature: 1.0
      route_type: llm/v1/chat
      weight: 75
      description: Specialist in programming problems
      auth:
        header_name: Authorization
        header_value: Bearer ${key}
      logging:
        log_statistics: true
        log_payloads: true
    - model:
        name: gpt-4o-mini
        provider: openai
        options:
          max_tokens: 1024
          temperature: 1.0
      route_type: llm/v1/chat
      weight: 25
      description: Specialist in programming problems
      auth:
        header_name: Authorization
        header_value: Bearer ${key}
      logging:
        log_statistics: true
        log_payloads: true

variables:
  key:
    value: $OPENAI_API_KEY
    description: The API key to use to connect to OpenAI.
```
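To illustrate how the `semantic` algorithm and its fallback groups behave, here is a minimal sketch. It is an approximation under stated assumptions: plain in-process cosine similarity against precomputed description embeddings, with hypothetical `route` and `cosine_similarity` helpers; the real plugin uses the configured vector database (Redis here) rather than this direct comparison:

```python
import math

def cosine_similarity(a, b):
    dot = sum(x * y for x, y in zip(a, b))
    norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b))
    return dot / norm if norm else 0.0

def route(prompt_vec, targets, threshold=0.7):
    """Return the ordered fallback group for a prompt embedding.

    Each target is a dict with "name", "weight", "description", and
    "desc_vec" (a precomputed embedding of its description).
    """
    best = max(targets, key=lambda t: cosine_similarity(prompt_vec, t["desc_vec"]))
    if cosine_similarity(prompt_vec, best["desc_vec"]) < threshold:
        return []  # nothing matches closely enough (cf. `threshold: 0.7`)
    # Targets sharing the winning description form one fallback group;
    # higher weight is tried first (gpt-4o at 75 before gpt-4o-mini at 25).
    group = [t for t in targets if t["description"] == best["description"]]
    return sorted(group, key=lambda t: t["weight"], reverse=True)
```

Applied to the configuration above, a programming prompt would return the two "Specialist in programming problems" targets in weight order, so gpt-4o handles the request and gpt-4o-mini only sees traffic if gpt-4o fails one of the `failover_criteria`.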
**app/_kong_plugins/ai-proxy-advanced/index.md** (6 additions, 1 deletion)
```diff
@@ -171,6 +171,9 @@ rows:
   - algorithm: "[Consistent-hashing (sticky-session on given header value)](/plugins/ai-proxy-advanced/examples/consistent-hashing/)"
     description: |
       The consistent-hashing algorithm routes requests based on a specified header value (`X-Hashing-Header`). Requests with the same header are repeatedly routed to the same model, enabling sticky sessions for maintaining context or affinity across user interactions.
+      {% new_in 3.13 %} The least-connections algorithm tracks the number of in-flight requests for each backend. Weights are used to calculate the connection capacity of a backend. Requests are routed to the backend with the highest spare capacity. This option is more dynamic, automatically routing new requests to other backends when slower backends accumulate more open connections.
       The lowest-latency algorithm is based on the response time for each model. It distributes requests to models with the lowest response time.
@@ -189,10 +192,12 @@ rows:
       The priority algorithm routes requests to groups of models based on assigned weights. Higher-weighted groups are preferred, and if all models in a group fail, the plugin falls back to the next group. This allows for reliable failover and cost-aware routing across multiple AI models.
-      The round-robin algorithm distributes requests across models based on their respective weights. For example, if your models `gpt-4`, `gpt-4o-mini`, and `gpt-3` have weights of `70`, `25`, and `5` respectively, they’ll receive approximately 70%, 25%, and 5% of the traffic in turn. Requests are distributed proportionally, independent of usage or latency metrics.
+      The round-robin algorithm distributes requests across models based on their respective weights. For example, if your models `gpt-4`, `gpt-4o-mini`, and `gpt-3` have weights of `70`, `25`, and `5` respectively, they'll receive approximately 70%, 25%, and 5% of the traffic in turn. Requests are distributed proportionally, independent of usage or latency metrics.
       The semantic algorithm distributes requests to different models based on the similarity between the prompt in the request and the description provided in the model configuration. This allows Kong to automatically select the model that is best suited for the given domain or use case.
+
+      {% new_in 3.13 %} Multiple targets can be [configured with identical descriptions](/plugins/ai-proxy-advanced/examples/semantic-with-fallback/). When multiple targets share the same description, the AI balancer performs round-robin fallback among these targets if the primary target fails. Weights affect the order in which fallback targets are selected.
```
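The weighted round-robin behavior described above (weights of 70, 25, and 5 yielding roughly 70%, 25%, and 5% of traffic) can be simulated in a few lines. This is an illustrative sketch using the smooth weighted round-robin technique, not necessarily the scheduler Kong uses internally:

```python
from collections import Counter

weights = {"gpt-4": 70, "gpt-4o-mini": 25, "gpt-3": 5}

def weighted_round_robin(weights):
    # Smooth weighted round-robin: each step, bump every model's credit by
    # its weight, pick the model with the highest credit, then subtract the
    # total weight from the winner so it waits its turn again.
    credits = {m: 0 for m in weights}
    total = sum(weights.values())
    while True:
        for m, w in weights.items():
            credits[m] += w
        best = max(credits, key=credits.get)
        credits[best] -= total
        yield best

gen = weighted_round_robin(weights)
print(Counter(next(gen) for _ in range(1000)))  # ~700 / 250 / 50
```

The smooth variant interleaves picks rather than sending 70 requests in a burst, which keeps short-term load close to the configured proportions.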
**app/ai-gateway/load-balancing.md** (16 additions, 1 deletion)
```diff
@@ -53,6 +53,12 @@ Kong AI Gateway supports multiple load balancing strategies to optimize traffic
 
 The table below provides a detailed overview of the available algorithms, along with considerations to keep in mind when selecting the best option for your use case.
 
+### Load balancing algorithms
+
+Kong AI Gateway supports multiple load balancing strategies to optimize traffic distribution across AI models. Each algorithm is suited for different performance goals such as balancing load, improving cache-hit ratios, reducing latency, or ensuring [failover reliability](#retry-and-fallback).
+
+The table below provides a detailed overview of the available algorithms, along with considerations to keep in mind when selecting the best option for your use case.
+
 <!--vale off-->
 {% table %}
 columns:
@@ -77,6 +83,13 @@ rows:
       * Especially effective with consistent keys like user IDs.
       * Requires diverse hash inputs for balanced distribution.
+      {% new_in 3.13 %} Routes requests to backends with the highest spare capacity based on in-flight request counts. In the configuration, the [`weight`](/plugins/ai-proxy-advanced/reference/#schema--config-targets-weight) parameter calculates the connection capacity of each backend.
+    considerations: |
+      * Provides good distribution of traffic.
+      * More dynamic, automatically routing new requests to other backends when slower backends accumulate more open connections.
       Routes requests to the least-utilized models based on resource usage metrics. In the configuration, the [`tokens_count_strategy`](/plugins/ai-proxy-advanced/reference/#schema--config-balancer-tokens-count-strategy) (for example, `prompt-tokens`) defines how usage is measured, focusing on prompt tokens or other resource indicators.
@@ -88,14 +101,16 @@ rows:
     description: |
       Routes requests to the models with the lowest observed latency. In the configuration, the [`latency_strategy`](/plugins/ai-proxy-advanced/reference/#schema--config-balancer-latency-strategy) parameter (for example, `latency_strategy: e2e`) defines how latency is measured, typically based on end-to-end response times. By default, the latency is calculated based on the time the model takes to generate each token (`tpot`).
 
-      The latency algorithm is based on peak EWMA (Exponentially Weighted Moving Average), which ensures that the balancer selects the backend by the lowest latency. The latency metric used is the full request cycle, from TCP connect to body response time. Since it’s a moving average, the metrics will decay over time.
+      The latency algorithm is based on peak EWMA (Exponentially Weighted Moving Average), which ensures that the balancer selects the backend with the lowest latency. The latency metric used is the full request cycle, from TCP connect to body response time. Since it's a moving average, the metrics will decay over time.
     considerations: |
       * Prioritizes models with the fastest response times.
       * Optimizes for real-time performance in time-sensitive applications.
       * Less suitable for long-lived or persistent connections (e.g., WebSockets).
       Routes requests based on semantic similarity between the prompt and model descriptions. In the configuration, embeddings are generated using a specified model (e.g., `text-embedding-3-small`), and similarity is calculated using vector search.
+
+      {% new_in 3.13 %} Multiple targets can be configured with [identical descriptions](/plugins/ai-proxy-advanced/examples/semantic-with-fallback/). When multiple targets share the same description, the AI balancer performs round-robin fallback among these targets if the primary target fails. Weights affect the order in which fallback targets are selected.
     considerations: |
       * Uses vector search (for example, Redis) to find the best match based on prompt embeddings.
       * `distance_metric` and `threshold` settings fine-tune matching sensitivity.
```
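The peak-EWMA idea behind the lowest-latency algorithm can be sketched briefly. The class below is a hand-rolled approximation; the decay constant and the jump-on-peak rule are assumptions for illustration, not Kong's implementation:

```python
import math
import time

class EwmaLatency:
    DECAY_S = 10.0  # assumed: how quickly old observations fade

    def __init__(self):
        self.value = 0.0
        self.last = time.monotonic()

    def observe(self, latency_s):
        now = time.monotonic()
        # The old average's weight decays exponentially with elapsed time,
        # which is why stale measurements "decay over time".
        w = math.exp(-(now - self.last) / self.DECAY_S)
        if latency_s > self.value:
            # "Peak" behavior: react immediately to a slow response so a
            # backend that just stalled stops winning the comparison.
            self.value = latency_s
        else:
            self.value = self.value * w + latency_s * (1.0 - w)
        self.last = now
        return self.value

# A balancer would keep one tracker per backend and route each request to
# the backend whose tracker currently reports the lowest value.
```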