Skip to content

Commit 30655ed

Browse files
authored
fix: AI Rate Limiting Bug Causing Tenant 429 Errors on /v1/ API Path (#132)
1 parent c71f6c7 commit 30655ed

File tree

3 files changed

+215
-2
lines changed

3 files changed

+215
-2
lines changed

infra-ai-hub/params/apim/api_policy.xml.tftpl

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,15 @@
2424
var match = System.Text.RegularExpressions.Regex.Match(path, @"/deployments/([^/]+)/");
2525
if (match.Success) { return match.Groups[1].Value; }
2626
// For /v1/ format: model field is the deployment name lookup key on Azure OpenAI
27-
// Client sends e.g. "gpt-4.1-mini"; tenant-prefix to match deployment name
27+
// Client sends e.g. "gpt-4.1-mini"; use bare model name to match deployment names
28+
// NOTE: do NOT prepend tenant prefix here — rate-limit <when> conditions compare
29+
// against bare deployment names from tfvars (e.g. "gpt-4.1-mini", not "tenant-gpt-4.1-mini").
30+
// URL rewriting (further below) adds the tenant prefix independently for backend routing.
2831
if (path.ToLower().Contains(&quot;/v1/&quot;)) {
2932
try {
3033
var body = context.Request.Body.As&lt;JObject&gt;(preserveContent: true);
3134
var model = body?[&quot;model&quot;]?.ToString();
32-
if (!string.IsNullOrEmpty(model)) { return &quot;${tenant_name}-&quot; + model; }
35+
if (!string.IsNullOrEmpty(model)) { return model; }
3336
} catch { }
3437
}
3538
return &quot;unknown&quot;;

tests/integration/test-helper.bash

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,106 @@ parse_response() {
212212
export RESPONSE_STATUS RESPONSE_BODY
213213
}
214214

215+
# Parse a header-inclusive response (from apim_request_with_headers) into three variables:
216+
# RESPONSE_STATUS — HTTP status code (last line of output)
217+
# RESPONSE_HEADERS — raw response headers, one per line, CR stripped
218+
# RESPONSE_BODY — response body (everything after the header blank line)
219+
#
220+
# curl -i output structure:
221+
# HTTP/1.1 <status> <reason>\r\n
222+
# Header: value\r\n
223+
# ...\r\n
224+
# \r\n
225+
# <body>
226+
# <status_code> ← appended by -w "\n%{http_code}"
227+
parse_response_with_headers() {
228+
local response="${1}"
229+
230+
# Status code is always the last line (appended by -w "\n%{http_code}")
231+
RESPONSE_STATUS=$(echo "${response}" | tail -n1)
232+
233+
# Strip the trailing status line
234+
local without_status
235+
without_status=$(echo "${response}" | sed '$d')
236+
237+
# Headers: everything up to (but not including) the first blank line, CR stripped
238+
RESPONSE_HEADERS=$(echo "${without_status}" | sed -n '1,/^[[:space:]]*$/{ /^[[:space:]]*$/d; p }' | tr -d '\r')
239+
240+
# Body: everything after the first blank line
241+
RESPONSE_BODY=$(echo "${without_status}" | awk 'found{print} /^[[:space:]]*$/{found=1}')
242+
243+
export RESPONSE_STATUS RESPONSE_HEADERS RESPONSE_BODY
244+
}
245+
246+
# Extract a single header value from RESPONSE_HEADERS (set by parse_response_with_headers).
247+
# Matching is case-insensitive. Returns empty string if not found.
248+
# Usage: get_response_header <header-name>
249+
get_response_header() {
250+
local name="${1}"
251+
echo "${RESPONSE_HEADERS}" | grep -i "^${name}:" | head -1 | sed 's/^[^:]*:[[:space:]]*//' | tr -d '\r'
252+
}
253+
254+
# HTTP request wrapper that includes response headers in output.
255+
# Identical to apim_request but adds curl -i so headers are captured.
256+
# Use parse_response_with_headers to split the output.
257+
# Usage: apim_request_with_headers <method> <tenant> <path> [body]
258+
apim_request_with_headers() {
259+
local method="${1}"
260+
local tenant="${2}"
261+
local path="${3}"
262+
local body="${4:-}"
263+
264+
local subscription_key
265+
subscription_key=$(get_subscription_key "${tenant}")
266+
267+
if [[ -z "${subscription_key}" ]]; then
268+
echo "Error: No subscription key for tenant ${tenant}" >&2
269+
return 1
270+
fi
271+
272+
local url="${APIM_GATEWAY_URL}/${tenant}${path}"
273+
274+
local curl_opts=(
275+
-s # Silent
276+
-i # Include response headers in output
277+
-w "\n%{http_code}" # Append HTTP status code as last line
278+
-H "api-key: ${subscription_key}"
279+
-H "Content-Type: application/json"
280+
-H "Accept: application/json"
281+
--max-time 60
282+
)
283+
284+
if [[ -n "${body}" ]]; then
285+
curl_opts+=(-d "${body}")
286+
fi
287+
288+
local response
289+
response=$(curl -X "${method}" "${curl_opts[@]}" "${url}")
290+
291+
local status
292+
status=$(echo "${response}" | tail -n1)
293+
294+
# 401 fallback: key may be stale after rotation; refresh from KV and retry once
295+
if [[ "${status}" == "401" ]] && refresh_tenant_key_from_vault "${tenant}"; then
296+
subscription_key=$(get_subscription_key "${tenant}")
297+
curl_opts=(
298+
-s
299+
-i
300+
-w "\n%{http_code}"
301+
-H "api-key: ${subscription_key}"
302+
-H "Content-Type: application/json"
303+
-H "Accept: application/json"
304+
--max-time 60
305+
)
306+
if [[ -n "${body}" ]]; then
307+
curl_opts+=(-d "${body}")
308+
fi
309+
response=$(curl -X "${method}" "${curl_opts[@]}" "${url}")
310+
fi
311+
312+
echo "${response}"
313+
}
314+
215315
# Retry configuration for rate limiting and transient failures
216316
# With low quota allocations, rate limits are tighter — use more retries + backoff
217317
MAX_RETRIES="${MAX_RETRIES:-5}"

tests/integration/v1-chat-completions.bats

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,116 @@ skip_if_no_key() {
312312
[[ -n "${content}" ]]
313313
}
314314

315+
# =============================================================================
316+
# Rate Limit Header Validation — token budget correctness
317+
# -----------------------------------------------------------------------------
318+
# Regression for the tenant-prefix bug:
319+
# APIM policy constructed deploymentName = "${tenant}-${model}" for /v1/ paths.
320+
# This never matched any <when> condition → fell through to the 1,000 TPM fallback.
321+
# x-ratelimit-limit-tokens: 1000 instead of 1,500,000 for gpt-4.1-mini.
322+
#
323+
# These tests assert that:
324+
# 1. /v1/ format returns a token limit >> 1000 (not the fallback)
325+
# 2. /deployments/ format returns the same token limit as /v1/
326+
# (both hit the same <when> block in the APIM policy)
327+
# =============================================================================
328+
329+
@test "V1-RateLimit: /v1/ format token limit is not the 1000 TPM fallback" {
330+
# Why this catches the bug:
331+
# Before fix: deploymentName = "wlrs-water-form-assistant-gpt-4.1-mini" → no <when> match
332+
# → <otherwise> fires → tokens-per-minute=1000 → x-ratelimit-limit-tokens: 1000
333+
# After fix: deploymentName = "gpt-4.1-mini" → matches <when> → capacity=1500
334+
# → tokens-per-minute=1500000 → x-ratelimit-limit-tokens: 1500000
335+
skip_if_no_key "wlrs-water-form-assistant"
336+
337+
local path="/openai/v1/chat/completions"
338+
local body='{"model":"gpt-4.1-mini","messages":[{"role":"user","content":"Say hello"}],"max_tokens":5}'
339+
340+
local raw_response
341+
raw_response=$(apim_request_with_headers "POST" "wlrs-water-form-assistant" "${path}" "${body}")
342+
parse_response_with_headers "${raw_response}"
343+
344+
echo "# /v1/ status: ${RESPONSE_STATUS}" >&3
345+
assert_status "200" "${RESPONSE_STATUS}"
346+
347+
local limit
348+
limit=$(get_response_header "x-ratelimit-limit-tokens")
349+
echo "# /v1/ x-ratelimit-limit-tokens: ${limit}" >&3
350+
351+
[[ -n "${limit}" ]] || { echo "x-ratelimit-limit-tokens header is missing" >&2; return 1; }
352+
353+
# The 1,000 TPM fallback would produce limit=1000.
354+
# Every real model has capacity >= 50 → limit >= 50,000.
355+
# Asserting > 1000 is sufficient to detect the fallback without hard-coding model capacity.
356+
[[ "${limit}" -gt 1000 ]] || {
357+
echo "x-ratelimit-limit-tokens is ${limit} — looks like the 1000 TPM fallback is active." >&2
358+
echo "deploymentName variable in APIM policy may still be including tenant prefix for /v1/ paths." >&2
359+
return 1
360+
}
361+
}
362+
363+
@test "V1-RateLimit: /deployments/ format token limit is not the 1000 TPM fallback" {
364+
skip_if_no_key "wlrs-water-form-assistant"
365+
366+
local path="/openai/deployments/gpt-4.1-mini/chat/completions?api-version=${OPENAI_API_VERSION}"
367+
local body='{"messages":[{"role":"user","content":"Say hello"}],"max_tokens":5}'
368+
369+
local raw_response
370+
raw_response=$(apim_request_with_headers "POST" "wlrs-water-form-assistant" "${path}" "${body}")
371+
parse_response_with_headers "${raw_response}"
372+
373+
echo "# /deployments/ status: ${RESPONSE_STATUS}" >&3
374+
assert_status "200" "${RESPONSE_STATUS}"
375+
376+
local limit
377+
limit=$(get_response_header "x-ratelimit-limit-tokens")
378+
echo "# /deployments/ x-ratelimit-limit-tokens: ${limit}" >&3
379+
380+
[[ -n "${limit}" ]] || { echo "x-ratelimit-limit-tokens header is missing" >&2; return 1; }
381+
[[ "${limit}" -gt 1000 ]] || {
382+
echo "x-ratelimit-limit-tokens is ${limit} — fallback active on /deployments/ path." >&2
383+
return 1
384+
}
385+
}
386+
387+
@test "V1-RateLimit: /v1/ and /deployments/ formats report identical token limit for same model" {
388+
# Both paths must resolve deploymentName to the bare model name and hit the same
389+
# <when> block in the APIM policy → identical llm-token-limit counter and capacity.
390+
# A mismatch means one path is hitting the fallback while the other is not.
391+
skip_if_no_key "wlrs-water-form-assistant"
392+
393+
local v1_path="/openai/v1/chat/completions"
394+
local v1_body='{"model":"gpt-4.1-mini","messages":[{"role":"user","content":"Say hello"}],"max_tokens":5}'
395+
396+
local dep_path="/openai/deployments/gpt-4.1-mini/chat/completions?api-version=${OPENAI_API_VERSION}"
397+
local dep_body='{"messages":[{"role":"user","content":"Say hello"}],"max_tokens":5}'
398+
399+
local v1_raw dep_raw
400+
v1_raw=$(apim_request_with_headers "POST" "wlrs-water-form-assistant" "${v1_path}" "${v1_body}")
401+
parse_response_with_headers "${v1_raw}"
402+
local v1_limit
403+
v1_limit=$(get_response_header "x-ratelimit-limit-tokens")
404+
echo "# /v1/ x-ratelimit-limit-tokens: ${v1_limit}" >&3
405+
406+
# Brief pause to avoid token counter spillover between the two requests
407+
sleep 2
408+
409+
dep_raw=$(apim_request_with_headers "POST" "wlrs-water-form-assistant" "${dep_path}" "${dep_body}")
410+
parse_response_with_headers "${dep_raw}"
411+
local dep_limit
412+
dep_limit=$(get_response_header "x-ratelimit-limit-tokens")
413+
echo "# /deployments/ x-ratelimit-limit-tokens: ${dep_limit}" >&3
414+
415+
[[ -n "${v1_limit}" ]] || { echo "/v1/ x-ratelimit-limit-tokens header missing" >&2; return 1; }
416+
[[ -n "${dep_limit}" ]] || { echo "/deployments/ x-ratelimit-limit-tokens header missing" >&2; return 1; }
417+
418+
[[ "${v1_limit}" == "${dep_limit}" ]] || {
419+
echo "/v1/ limit (${v1_limit}) != /deployments/ limit (${dep_limit})" >&2
420+
echo "One path is hitting the fallback rate-limit block — check deploymentName construction in api_policy.xml.tftpl" >&2
421+
return 1
422+
}
423+
}
424+
315425
# =============================================================================
316426
# Cross-tenant Isolation
317427
# =============================================================================

0 commit comments

Comments
 (0)