fix: AI Rate Limiting Bug Causing Tenant 429 Errors on /v1/ API Path (#132)

mishraomp · web-flow · commit 30655edbdeec · 2026-03-03T22:01:27.000-08:00
diff --git a/infra-ai-hub/params/apim/api_policy.xml.tftpl b/infra-ai-hub/params/apim/api_policy.xml.tftpl
@@ -24,12 +24,15 @@
                     var match = System.Text.RegularExpressions.Regex.Match(path, @&quot;/deployments/([^/]+)/&quot;);
                     if (match.Success) { return match.Groups[1].Value; }
                     // For /v1/ format: model field is the deployment name lookup key on Azure OpenAI
-                    // Client sends e.g. "gpt-4.1-mini"; tenant-prefix to match deployment name
+                    // Client sends e.g. "gpt-4.1-mini"; use bare model name to match deployment names
+                    // NOTE: do NOT prepend tenant prefix here — rate-limit <when> conditions compare
+                    // against bare deployment names from tfvars (e.g. "gpt-4.1-mini", not "tenant-gpt-4.1-mini").
+                    // URL rewriting (further below) adds the tenant prefix independently for backend routing.
                     if (path.ToLower().Contains(&quot;/v1/&quot;)) {
                         try {
                             var body = context.Request.Body.As&lt;JObject&gt;(preserveContent: true);
                             var model = body?[&quot;model&quot;]?.ToString();
-                            if (!string.IsNullOrEmpty(model)) { return &quot;${tenant_name}-&quot; + model; }
+                            if (!string.IsNullOrEmpty(model)) { return model; }
                         } catch { }
                     }
                     return &quot;unknown&quot;;
diff --git a/tests/integration/test-helper.bash b/tests/integration/test-helper.bash
@@ -212,6 +212,106 @@ parse_response() {
     export RESPONSE_STATUS RESPONSE_BODY
 }
 
+# Parse a header-inclusive response (from apim_request_with_headers) into three variables:
+#   RESPONSE_STATUS  — HTTP status code (last line of output)
+#   RESPONSE_HEADERS — status line + headers of the final response, one per line, CR stripped
+#   RESPONSE_BODY    — response body (everything after the header blank line)
+#
+# Arguments: $1 — raw output of `curl -i -w "\n%{http_code}"`
+# Globals:   sets and exports RESPONSE_STATUS, RESPONSE_HEADERS, RESPONSE_BODY
+#
+# curl -i output structure:
+#   HTTP/1.1 100 Continue\r\n   ← optional interim 1xx block(s), each followed by a
+#   \r\n                          blank line; these are skipped
+#   HTTP/1.1 <status> <reason>\r\n
+#   Header: value\r\n
+#   ...\r\n
+#   \r\n
+#   <body>
+#   <status_code>          ← appended by -w "\n%{http_code}"
+parse_response_with_headers() {
+    local response="${1}"
+
+    # Status code is always the last line (appended by -w "\n%{http_code}")
+    RESPONSE_STATUS=$(printf '%s\n' "${response}" | tail -n1)
+
+    # Strip the trailing status line
+    local without_status
+    without_status=$(printf '%s\n' "${response}" | sed '$d')
+
+    # Drop interim 1xx header blocks (e.g. "100 Continue") so the headers parsed below
+    # belong to the final response. Each pass removes one block up to and including its
+    # blank line; if no blank line exists the text becomes empty and the loop ends.
+    local interim_re='^HTTP/[0-9.]+[[:space:]]+1[0-9][0-9]'
+    while [[ "${without_status}" =~ ${interim_re} ]]; do
+        without_status=$(printf '%s\n' "${without_status}" | awk 'found{print} /^[[:space:]]*$/{found=1}')
+    done
+
+    # Headers: everything up to (but not including) the first blank line, CR stripped.
+    # awk is used instead of a braced sed program, which BSD sed rejects without a ';'
+    # before the closing brace.
+    RESPONSE_HEADERS=$(printf '%s\n' "${without_status}" | awk '/^[[:space:]]*$/{seen=1} !seen{print}' | tr -d '\r')
+
+    # Body: everything after the first blank line
+    RESPONSE_BODY=$(printf '%s\n' "${without_status}" | awk 'found{print} /^[[:space:]]*$/{found=1}')
+
+    export RESPONSE_STATUS RESPONSE_HEADERS RESPONSE_BODY
+}
+
+# Extract a single header value from RESPONSE_HEADERS (set by parse_response_with_headers).
+# The name is matched case-insensitively as a literal string (not a regex); the first
+# matching header wins. Prints the value with leading whitespace and CRs removed, or
+# nothing if the header is not present.
+# Usage: get_response_header <header-name>
+get_response_header() {
+    local name="${1}"
+    # ${RESPONSE_HEADERS:-} keeps the lookup safe under `set -u` when nothing has been
+    # parsed yet. The prefix comparison avoids interpolating the name into a pattern,
+    # so characters such as "." cannot match unrelated headers.
+    printf '%s\n' "${RESPONSE_HEADERS:-}" | awk -v want="${name}" '
+        BEGIN { prefix = tolower(want) ":" }
+        !found && tolower(substr($0, 1, length(prefix))) == prefix {
+            sub(/^[^:]*:[[:space:]]*/, "")
+            gsub(/\r/, "")
+            print
+            found = 1
+        }'
+}
+
+# Send a request through the APIM gateway and print the raw `curl -i` output:
+# response headers, blank line, body, then the HTTP status code on its own last line.
+# Counterpart of apim_request that keeps the response headers in the output;
+# feed the result to parse_response_with_headers to split it.
+# Usage: apim_request_with_headers <method> <tenant> <path> [body]
+# Returns 1 (with a message on stderr) when the tenant has no subscription key.
+apim_request_with_headers() {
+    local method="${1}"
+    local tenant="${2}"
+    local path="${3}"
+    local body="${4:-}"
+
+    local subscription_key
+    subscription_key=$(get_subscription_key "${tenant}")
+
+    if [[ -z "${subscription_key}" ]]; then
+        echo "Error: No subscription key for tenant ${tenant}" >&2
+        return 1
+    fi
+
+    local url="${APIM_GATEWAY_URL}/${tenant}${path}"
+
+    local response status attempt
+    local -a curl_opts
+
+    # At most two attempts: the second happens only after a 401 followed by a
+    # successful key refresh, and its status is not inspected again.
+    for attempt in 1 2; do
+        curl_opts=(
+            -s                                      # Silent
+            -i                                      # Include response headers in output
+            -w "\n%{http_code}"                     # Append HTTP status code as last line
+            -H "api-key: ${subscription_key}"
+            -H "Content-Type: application/json"
+            -H "Accept: application/json"
+            --max-time 60
+        )
+        [[ -z "${body}" ]] || curl_opts+=(-d "${body}")
+
+        response=$(curl -X "${method}" "${curl_opts[@]}" "${url}")
+
+        (( attempt == 1 )) || break
+
+        status=$(echo "${response}" | tail -n1)
+
+        # 401 fallback: key may be stale after rotation; refresh from KV and retry once
+        [[ "${status}" == "401" ]] || break
+        refresh_tenant_key_from_vault "${tenant}" || break
+        subscription_key=$(get_subscription_key "${tenant}")
+    done
+
+    echo "${response}"
+}
+
 # Retry configuration for rate limiting and transient failures
 # With low quota allocations, rate limits are tighter — use more retries + backoff
 MAX_RETRIES="${MAX_RETRIES:-5}"
diff --git a/tests/integration/v1-chat-completions.bats b/tests/integration/v1-chat-completions.bats
@@ -312,6 +312,116 @@ skip_if_no_key() {
     [[ -n "${content}" ]]
 }
 
+# =============================================================================
+# Rate Limit Header Validation — token budget correctness
+# -----------------------------------------------------------------------------
+# Regression for the tenant-prefix bug:
+#   APIM policy constructed deploymentName = "${tenant}-${model}" for /v1/ paths.
+#   This never matched any <when> condition → fell through to the 1,000 TPM fallback.
+#   x-ratelimit-limit-tokens: 1000 instead of 1,500,000 for gpt-4.1-mini.
+#
+# These tests assert that:
+#   1. /v1/ format returns a token limit > 1000 (i.e. not the 1,000 TPM fallback)
+#   2. /deployments/ format also returns a token limit > 1000
+#   3. /deployments/ format returns the same token limit as /v1/
+#      (both hit the same <when> block in the APIM policy)
+# =============================================================================
+
+@test "V1-RateLimit: /v1/ format token limit is not the 1000 TPM fallback" {
+    # Why this catches the bug:
+    # Before fix: deploymentName = "wlrs-water-form-assistant-gpt-4.1-mini" → no <when> match
+    #             → <otherwise> fires → tokens-per-minute=1000 → x-ratelimit-limit-tokens: 1000
+    # After fix:  deploymentName = "gpt-4.1-mini" → matches <when> → capacity=1500
+    #             → tokens-per-minute=1500000 → x-ratelimit-limit-tokens: 1500000
+    skip_if_no_key "wlrs-water-form-assistant"
+
+    local path="/openai/v1/chat/completions"
+    local body='{"model":"gpt-4.1-mini","messages":[{"role":"user","content":"Say hello"}],"max_tokens":5}'
+
+    local raw_response
+    raw_response=$(apim_request_with_headers "POST" "wlrs-water-form-assistant" "${path}" "${body}")
+    parse_response_with_headers "${raw_response}"
+
+    echo "# /v1/ status: ${RESPONSE_STATUS}" >&3
+    assert_status "200" "${RESPONSE_STATUS}"
+
+    local limit
+    limit=$(get_response_header "x-ratelimit-limit-tokens")
+    echo "# /v1/ x-ratelimit-limit-tokens: ${limit}" >&3
+
+    [[ -n "${limit}" ]] || { echo "x-ratelimit-limit-tokens header is missing" >&2; return 1; }
+
+    # The 1,000 TPM fallback would produce limit=1000.
+    # Every real model has capacity >= 50 → limit >= 50,000.
+    # Asserting > 1000 is sufficient to detect the fallback without hard-coding model capacity.
+    [[ "${limit}" -gt 1000 ]] || {
+        echo "x-ratelimit-limit-tokens is ${limit} — looks like the 1000 TPM fallback is active." >&2
+        echo "deploymentName variable in APIM policy may still be including tenant prefix for /v1/ paths." >&2
+        return 1
+    }
+}
+
+@test "V1-RateLimit: /deployments/ format token limit is not the 1000 TPM fallback" {
+    skip_if_no_key "wlrs-water-form-assistant"
+
+    local path="/openai/deployments/gpt-4.1-mini/chat/completions?api-version=${OPENAI_API_VERSION}"
+    local body='{"messages":[{"role":"user","content":"Say hello"}],"max_tokens":5}'
+
+    local raw_response
+    raw_response=$(apim_request_with_headers "POST" "wlrs-water-form-assistant" "${path}" "${body}")
+    parse_response_with_headers "${raw_response}"
+
+    echo "# /deployments/ status: ${RESPONSE_STATUS}" >&3
+    assert_status "200" "${RESPONSE_STATUS}"
+
+    local limit
+    limit=$(get_response_header "x-ratelimit-limit-tokens")
+    echo "# /deployments/ x-ratelimit-limit-tokens: ${limit}" >&3
+
+    [[ -n "${limit}" ]] || { echo "x-ratelimit-limit-tokens header is missing" >&2; return 1; }
+    [[ "${limit}" -gt 1000 ]] || {
+        echo "x-ratelimit-limit-tokens is ${limit} — fallback active on /deployments/ path." >&2
+        return 1
+    }
+}
+
+@test "V1-RateLimit: /v1/ and /deployments/ formats report identical token limit for same model" {
+    # Both paths must resolve deploymentName to the bare model name and hit the same
+    # <when> block in the APIM policy → identical llm-token-limit counter and capacity.
+    # A mismatch means one path is hitting the fallback while the other is not.
+    skip_if_no_key "wlrs-water-form-assistant"
+
+    local v1_path="/openai/v1/chat/completions"
+    local v1_body='{"model":"gpt-4.1-mini","messages":[{"role":"user","content":"Say hello"}],"max_tokens":5}'
+
+    local dep_path="/openai/deployments/gpt-4.1-mini/chat/completions?api-version=${OPENAI_API_VERSION}"
+    local dep_body='{"messages":[{"role":"user","content":"Say hello"}],"max_tokens":5}'
+
+    local v1_raw dep_raw
+    v1_raw=$(apim_request_with_headers "POST" "wlrs-water-form-assistant" "${v1_path}" "${v1_body}")
+    parse_response_with_headers "${v1_raw}"
+
+    # Check the status before reading headers, as the sibling tests do, so a failed
+    # request is reported as such rather than as a missing or mismatched header.
+    echo "# /v1/ status: ${RESPONSE_STATUS}" >&3
+    assert_status "200" "${RESPONSE_STATUS}"
+
+    local v1_limit
+    v1_limit=$(get_response_header "x-ratelimit-limit-tokens")
+    echo "# /v1/ x-ratelimit-limit-tokens:          ${v1_limit}" >&3
+
+    # Brief pause to avoid token counter spillover between the two requests
+    sleep 2
+
+    dep_raw=$(apim_request_with_headers "POST" "wlrs-water-form-assistant" "${dep_path}" "${dep_body}")
+    parse_response_with_headers "${dep_raw}"
+
+    echo "# /deployments/ status: ${RESPONSE_STATUS}" >&3
+    assert_status "200" "${RESPONSE_STATUS}"
+
+    local dep_limit
+    dep_limit=$(get_response_header "x-ratelimit-limit-tokens")
+    echo "# /deployments/ x-ratelimit-limit-tokens: ${dep_limit}" >&3
+
+    [[ -n "${v1_limit}" ]]  || { echo "/v1/ x-ratelimit-limit-tokens header missing" >&2; return 1; }
+    [[ -n "${dep_limit}" ]] || { echo "/deployments/ x-ratelimit-limit-tokens header missing" >&2; return 1; }
+
+    [[ "${v1_limit}" == "${dep_limit}" ]] || {
+        echo "/v1/ limit (${v1_limit}) != /deployments/ limit (${dep_limit})" >&2
+        echo "One path is hitting the fallback rate-limit block — check deploymentName construction in api_policy.xml.tftpl" >&2
+        return 1
+    }
+}
+
 # =============================================================================
 # Cross-tenant Isolation
 # =============================================================================