@@ -3718,6 +3718,7 @@ pub async fn test_streaming_include_original_response_with_provider(provider: E2
     if provider.variant_name == "aws-sagemaker-tgi" {
         return;
     }
+
     let episode_id = Uuid::now_v7();
     let tag_value = Uuid::now_v7().to_string();
     // Generate random u32
@@ -3727,6 +3728,7 @@ pub async fn test_streaming_include_original_response_with_provider(provider: E2
         &provider, episode_id, seed, &tag_value, false, true,
     )
     .await;
+
    tokio::time::sleep(std::time::Duration::from_millis(200)).await;
    let cached_content = test_simple_streaming_inference_request_with_provider_cache(
        &provider, episode_id, seed, &tag_value, true, true,
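For context, the flow these blank lines separate is a warm-then-read cache check: the first request runs with cache assertions off and populates the cache, the test sleeps 200ms so the asynchronous cache write can land, and the second request reads the entry back. A minimal self-contained sketch of that pattern (assuming tokio; `run_streaming_request` is a stand-in, not the real helper):

```rust
use std::time::Duration;

// Stand-in for the real streaming test helper; returns the concatenated stream content.
async fn run_streaming_request(seed: u32, check_cache: bool) -> String {
    // The real test calls the gateway here; stubbed out for illustration.
    let _ = (seed, check_cache);
    "tokyo".to_string()
}

#[tokio::test]
async fn cache_roundtrip_sketch() {
    let seed = 42;
    // First request populates the cache (check_cache = false).
    let content = run_streaming_request(seed, false).await;
    // The cache write is asynchronous, so give it a moment before reading back.
    tokio::time::sleep(Duration::from_millis(200)).await;
    // Second request should replay the cached stream (check_cache = true).
    let cached_content = run_streaming_request(seed, true).await;
    assert_eq!(content, cached_content);
}
```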
@@ -3858,16 +3860,20 @@ pub async fn test_simple_streaming_inference_request_with_provider_cache(
     let inference_id = inference_id.unwrap();
     assert!(full_content.to_lowercase().contains("tokyo"));

-    // NB: Azure OpenAI Service doesn't support input/output tokens during streaming, but Azure AI Foundry does
-    if (provider.variant_name.contains("azure")
-        && !provider.variant_name.contains("azure-ai-foundry"))
-        || check_cache
-    {
-        assert_eq!(input_tokens, 0);
-        assert_eq!(output_tokens, 0);
-    } else {
-        assert!(input_tokens > 0);
-        assert!(output_tokens > 0);
+    // This is flaky on Fireworks - it seems like they sometimes don't send us usage information,
+    // so TensorZero reports 0 for input/output token usage.
+    if provider.model_provider_name != "fireworks" {
+        // NB: Azure OpenAI Service doesn't support input/output tokens during streaming, but Azure AI Foundry does
+        if (provider.variant_name.contains("azure")
+            && !provider.variant_name.contains("azure-ai-foundry"))
+            || check_cache
+        {
+            assert_eq!(input_tokens, 0);
+            assert_eq!(output_tokens, 0);
+        } else {
+            assert!(input_tokens > 0);
+            assert!(output_tokens > 0);
+        }
     }

     assert!(finish_reason.is_some());
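The Fireworks guard wraps the same Azure/cache branching in two places (here and in the ClickHouse assertions below). A hypothetical way to keep the two call sites in sync would be a small table-driven helper; `UsageExpectation` and `expected_usage` are illustrative names, not part of this change:

```rust
// Sketch only: classify what the test should expect for token usage.
enum UsageExpectation {
    Skip,     // Fireworks: usage info is flaky, so don't assert on it
    Zero,     // Azure OpenAI Service streaming (not AI Foundry) and cached responses
    Positive, // everything else should report real token counts
}

fn expected_usage(variant_name: &str, model_provider_name: &str, check_cache: bool) -> UsageExpectation {
    if model_provider_name == "fireworks" {
        UsageExpectation::Skip
    } else if (variant_name.contains("azure") && !variant_name.contains("azure-ai-foundry"))
        || check_cache
    {
        UsageExpectation::Zero
    } else {
        UsageExpectation::Positive
    }
}
```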
@@ -3970,15 +3976,19 @@ pub async fn test_simple_streaming_inference_request_with_provider_cache(
     let input_tokens = result.get("input_tokens").unwrap();
     let output_tokens = result.get("output_tokens").unwrap();

-    // NB: Azure OpenAI Service doesn't support input/output tokens during streaming, but Azure AI Foundry does
-    if provider.variant_name.contains("azure")
-        && !provider.variant_name.contains("azure-ai-foundry")
-    {
-        assert!(input_tokens.is_null());
-        assert!(output_tokens.is_null());
-    } else {
-        assert!(input_tokens.as_u64().unwrap() > 0);
-        assert!(output_tokens.as_u64().unwrap() > 0);
+    // This is flaky on Fireworks - it seems like they sometimes don't send us usage information,
+    // so TensorZero reports 0 for input/output token usage.
+    if provider.model_provider_name != "fireworks" {
+        // NB: Azure OpenAI Service doesn't support input/output tokens during streaming, but Azure AI Foundry does
+        if provider.variant_name.contains("azure")
+            && !provider.variant_name.contains("azure-ai-foundry")
+        {
+            assert!(input_tokens.is_null());
+            assert!(output_tokens.is_null());
+        } else {
+            assert!(input_tokens.as_u64().unwrap() > 0);
+            assert!(output_tokens.as_u64().unwrap() > 0);
+        }
     }

     let response_time_ms = result.get("response_time_ms").unwrap().as_u64().unwrap();
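Note the asymmetry between the two sites: the streaming response reports missing usage as 0, while the ClickHouse columns are nullable, so the database check uses `is_null()` rather than `== 0`. A minimal sketch of that null-vs-positive check, assuming `result` is a `serde_json::Value` as in the test (`assert_db_usage` is a hypothetical helper):

```rust
use serde_json::{json, Value};

// Sketch: providers that never report streaming usage end up as NULL in ClickHouse,
// whereas everyone else should have a positive token count.
fn assert_db_usage(result: &Value, expect_missing: bool) {
    let input_tokens = result.get("input_tokens").unwrap();
    if expect_missing {
        assert!(input_tokens.is_null());
    } else {
        assert!(input_tokens.as_u64().unwrap() > 0);
    }
}

fn main() {
    assert_db_usage(&json!({ "input_tokens": null }), true);
    assert_db_usage(&json!({ "input_tokens": 123 }), false);
}
```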