@@ -312,6 +312,116 @@ skip_if_no_key() {
312312 [[ -n " ${content} " ]]
313313}
314314
# =============================================================================
# Rate Limit Header Validation — token budget correctness
# -----------------------------------------------------------------------------
# Regression tests for the tenant-prefix bug:
#   The APIM policy constructed deploymentName = "${tenant}-${model}" for /v1/ paths.
#   That value never matched any <when> condition, so requests fell through to the
#   1,000 TPM fallback: x-ratelimit-limit-tokens reported 1000 instead of
#   1,500,000 for gpt-4.1-mini.
#
# These tests assert that:
#   1. /v1/ format returns a token limit >> 1000 (not the fallback)
#   2. /deployments/ format returns the same token limit as /v1/
#      (both hit the same <when> block in the APIM policy)
# =============================================================================
328+
329+ @test " V1-RateLimit: /v1/ format token limit is not the 1000 TPM fallback" {
330+ # Why this catches the bug:
331+ # Before fix: deploymentName = "wlrs-water-form-assistant-gpt-4.1-mini" → no <when> match
332+ # → <otherwise> fires → tokens-per-minute=1000 → x-ratelimit-limit-tokens: 1000
333+ # After fix: deploymentName = "gpt-4.1-mini" → matches <when> → capacity=1500
334+ # → tokens-per-minute=1500000 → x-ratelimit-limit-tokens: 1500000
335+ skip_if_no_key " wlrs-water-form-assistant"
336+
337+ local path=" /openai/v1/chat/completions"
338+ local body=' {"model":"gpt-4.1-mini","messages":[{"role":"user","content":"Say hello"}],"max_tokens":5}'
339+
340+ local raw_response
341+ raw_response=$( apim_request_with_headers " POST" " wlrs-water-form-assistant" " ${path} " " ${body} " )
342+ parse_response_with_headers " ${raw_response} "
343+
344+ echo " # /v1/ status: ${RESPONSE_STATUS} " >&3
345+ assert_status " 200" " ${RESPONSE_STATUS} "
346+
347+ local limit
348+ limit=$( get_response_header " x-ratelimit-limit-tokens" )
349+ echo " # /v1/ x-ratelimit-limit-tokens: ${limit} " >&3
350+
351+ [[ -n " ${limit} " ]] || { echo " x-ratelimit-limit-tokens header is missing" >&2 ; return 1; }
352+
353+ # The 1,000 TPM fallback would produce limit=1000.
354+ # Every real model has capacity >= 50 → limit >= 50,000.
355+ # Asserting > 1000 is sufficient to detect the fallback without hard-coding model capacity.
356+ [[ " ${limit} " -gt 1000 ]] || {
357+ echo " x-ratelimit-limit-tokens is ${limit} — looks like the 1000 TPM fallback is active." >&2
358+ echo " deploymentName variable in APIM policy may still be including tenant prefix for /v1/ paths." >&2
359+ return 1
360+ }
361+ }
362+
363+ @test " V1-RateLimit: /deployments/ format token limit is not the 1000 TPM fallback" {
364+ skip_if_no_key " wlrs-water-form-assistant"
365+
366+ local path=" /openai/deployments/gpt-4.1-mini/chat/completions?api-version=${OPENAI_API_VERSION} "
367+ local body=' {"messages":[{"role":"user","content":"Say hello"}],"max_tokens":5}'
368+
369+ local raw_response
370+ raw_response=$( apim_request_with_headers " POST" " wlrs-water-form-assistant" " ${path} " " ${body} " )
371+ parse_response_with_headers " ${raw_response} "
372+
373+ echo " # /deployments/ status: ${RESPONSE_STATUS} " >&3
374+ assert_status " 200" " ${RESPONSE_STATUS} "
375+
376+ local limit
377+ limit=$( get_response_header " x-ratelimit-limit-tokens" )
378+ echo " # /deployments/ x-ratelimit-limit-tokens: ${limit} " >&3
379+
380+ [[ -n " ${limit} " ]] || { echo " x-ratelimit-limit-tokens header is missing" >&2 ; return 1; }
381+ [[ " ${limit} " -gt 1000 ]] || {
382+ echo " x-ratelimit-limit-tokens is ${limit} — fallback active on /deployments/ path." >&2
383+ return 1
384+ }
385+ }
386+
387+ @test " V1-RateLimit: /v1/ and /deployments/ formats report identical token limit for same model" {
388+ # Both paths must resolve deploymentName to the bare model name and hit the same
389+ # <when> block in the APIM policy → identical llm-token-limit counter and capacity.
390+ # A mismatch means one path is hitting the fallback while the other is not.
391+ skip_if_no_key " wlrs-water-form-assistant"
392+
393+ local v1_path=" /openai/v1/chat/completions"
394+ local v1_body=' {"model":"gpt-4.1-mini","messages":[{"role":"user","content":"Say hello"}],"max_tokens":5}'
395+
396+ local dep_path=" /openai/deployments/gpt-4.1-mini/chat/completions?api-version=${OPENAI_API_VERSION} "
397+ local dep_body=' {"messages":[{"role":"user","content":"Say hello"}],"max_tokens":5}'
398+
399+ local v1_raw dep_raw
400+ v1_raw=$( apim_request_with_headers " POST" " wlrs-water-form-assistant" " ${v1_path} " " ${v1_body} " )
401+ parse_response_with_headers " ${v1_raw} "
402+ local v1_limit
403+ v1_limit=$( get_response_header " x-ratelimit-limit-tokens" )
404+ echo " # /v1/ x-ratelimit-limit-tokens: ${v1_limit} " >&3
405+
406+ # Brief pause to avoid token counter spillover between the two requests
407+ sleep 2
408+
409+ dep_raw=$( apim_request_with_headers " POST" " wlrs-water-form-assistant" " ${dep_path} " " ${dep_body} " )
410+ parse_response_with_headers " ${dep_raw} "
411+ local dep_limit
412+ dep_limit=$( get_response_header " x-ratelimit-limit-tokens" )
413+ echo " # /deployments/ x-ratelimit-limit-tokens: ${dep_limit} " >&3
414+
415+ [[ -n " ${v1_limit} " ]] || { echo " /v1/ x-ratelimit-limit-tokens header missing" >&2 ; return 1; }
416+ [[ -n " ${dep_limit} " ]] || { echo " /deployments/ x-ratelimit-limit-tokens header missing" >&2 ; return 1; }
417+
418+ [[ " ${v1_limit} " == " ${dep_limit} " ]] || {
419+ echo " /v1/ limit (${v1_limit} ) != /deployments/ limit (${dep_limit} )" >&2
420+ echo " One path is hitting the fallback rate-limit block — check deploymentName construction in api_policy.xml.tftpl" >&2
421+ return 1
422+ }
423+ }
424+
315425# =============================================================================
316426# Cross-tenant Isolation
317427# =============================================================================
0 commit comments