Commit d4dbabb

Merge pull request #1373 from miuosz/feature/add-support-vertex-ai-caching
improvement: add cache details to usage for (vertex ai) gemini and anthropic models
2 parents 47f7152 + c085626

File tree

5 files changed: +168 -12 lines changed
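
Taken together, the changes below surface provider cache usage in the OpenAI-compatible usage object. A minimal sketch of the shape the transforms now emit — all numbers are invented for illustration, and the top-level cache_read_input_tokens / cache_creation_input_tokens fields only appear when strictOpenAiCompliance is off:

// Hypothetical usage payload after this PR (values invented).
const usage = {
  prompt_tokens: 120,
  completion_tokens: 40,
  // total_tokens now folds in cache creation and cache read tokens
  total_tokens: 120 + 40 + 30 + 50, // 240
  prompt_tokens_details: {
    cached_tokens: 50, // tokens served from the provider's prompt cache
  },
  // present only when strictOpenAiCompliance is disabled:
  cache_read_input_tokens: 50,
  cache_creation_input_tokens: 30,
};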

src/providers/anthropic/chatComplete.ts

Lines changed: 7 additions & 1 deletion

@@ -587,6 +587,9 @@ export const AnthropicChatCompleteResponseTransform: (
           output_tokens +
           (cache_creation_input_tokens ?? 0) +
           (cache_read_input_tokens ?? 0),
+        prompt_tokens_details: {
+          cached_tokens: cache_read_input_tokens ?? 0,
+        },
         ...(shouldSendCacheUsage && {
           cache_read_input_tokens: cache_read_input_tokens,
           cache_creation_input_tokens: cache_creation_input_tokens,
@@ -718,9 +721,12 @@ export const AnthropicChatCompleteStreamChunkTransform: (
           },
         ],
         usage: {
-          completion_tokens: parsedChunk.usage?.output_tokens,
           ...streamState.usage,
+          completion_tokens: parsedChunk.usage?.output_tokens,
           total_tokens: totalTokens,
+          prompt_tokens_details: {
+            cached_tokens: streamState.usage?.cache_read_input_tokens ?? 0,
+          },
        },
      })}` + '\n\n'
    );
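
The response transform above totals Anthropic usage across all four counters; a standalone sketch of that arithmetic (the helper name and type are illustrative, not part of the PR):

// Illustrative helper mirroring the transform's total_tokens arithmetic.
type AnthropicUsage = {
  input_tokens?: number;
  output_tokens?: number;
  cache_creation_input_tokens?: number;
  cache_read_input_tokens?: number;
};

const totalAnthropicTokens = (u: AnthropicUsage): number =>
  (u.input_tokens ?? 0) +
  (u.output_tokens ?? 0) +
  (u.cache_creation_input_tokens ?? 0) +
  (u.cache_read_input_tokens ?? 0);

// totalAnthropicTokens({ input_tokens: 10, output_tokens: 5,
//   cache_read_input_tokens: 100 }) === 115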

src/providers/anthropic/types.ts

Lines changed: 3 additions & 0 deletions

@@ -2,6 +2,9 @@ export type AnthropicStreamState = {
   toolIndex?: number;
   usage?: {
     prompt_tokens?: number;
+    prompt_tokens_details?: {
+      cached_tokens?: number;
+    };
     completion_tokens?: number;
     cache_read_input_tokens?: number;
     cache_creation_input_tokens?: number;
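
These optional fields let the counts captured at message_start be re-spread into later usage emissions. A hypothetical two-event walkthrough (all values invented):

// Assumes: import { AnthropicStreamState } from 'src/providers/anthropic/types';
// message_start: prompt and cache counts are stashed on the stream state.
const streamState: AnthropicStreamState = {
  usage: {
    prompt_tokens: 12,
    cache_read_input_tokens: 100,
    cache_creation_input_tokens: 0,
  },
};

// message_delta: output tokens arrive; the emitted total folds in the cache.
const outputTokens = 8;
const totalTokens =
  (streamState.usage?.prompt_tokens ?? 0) +
  (streamState.usage?.cache_creation_input_tokens ?? 0) +
  (streamState.usage?.cache_read_input_tokens ?? 0) +
  outputTokens; // 120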

src/providers/google-vertex-ai/chatComplete.ts

Lines changed: 84 additions & 7 deletions

@@ -40,11 +40,13 @@ import {
   transformFinishReason,
 } from '../utils';
 import { transformGenerationConfig } from './transformGenerationConfig';
-import type {
+import {
   GoogleErrorResponse,
   GoogleGenerateContentResponse,
   VertexLlamaChatCompleteStreamChunk,
   VertexLLamaChatCompleteResponse,
+  GoogleSearchRetrievalTool,
+  VERTEX_MODALITY,
 } from './types';
 import {
   getMimeType,
@@ -431,7 +433,18 @@ export const GoogleChatCompleteResponseTransform: (
     candidatesTokenCount = 0,
     totalTokenCount = 0,
     thoughtsTokenCount = 0,
+    cachedContentTokenCount = 0,
+    promptTokensDetails = [],
+    candidatesTokensDetails = [],
   } = response.usageMetadata;
+  const inputAudioTokens = promptTokensDetails.reduce((acc, curr) => {
+    if (curr.modality === VERTEX_MODALITY.AUDIO) return acc + curr.tokenCount;
+    return acc;
+  }, 0);
+  const outputAudioTokens = candidatesTokensDetails.reduce((acc, curr) => {
+    if (curr.modality === VERTEX_MODALITY.AUDIO) return acc + curr.tokenCount;
+    return acc;
+  }, 0);

   return {
     id: 'portkey-' + crypto.randomUUID(),
@@ -510,6 +523,11 @@
       total_tokens: totalTokenCount,
       completion_tokens_details: {
         reasoning_tokens: thoughtsTokenCount,
+        audio_tokens: outputAudioTokens,
+      },
+      prompt_tokens_details: {
+        cached_tokens: cachedContentTokenCount,
+        audio_tokens: inputAudioTokens,
       },
     },
   };
@@ -603,6 +621,26 @@ export const GoogleChatCompleteStreamChunkTransform: (
       total_tokens: parsedChunk.usageMetadata.totalTokenCount,
       completion_tokens_details: {
         reasoning_tokens: parsedChunk.usageMetadata.thoughtsTokenCount ?? 0,
+        audio_tokens:
+          parsedChunk.usageMetadata?.candidatesTokensDetails?.reduce(
+            (acc, curr) => {
+              if (curr.modality === VERTEX_MODALITY.AUDIO)
+                return acc + curr.tokenCount;
+              return acc;
+            },
+            0
+          ),
+      },
+      prompt_tokens_details: {
+        cached_tokens: parsedChunk.usageMetadata.cachedContentTokenCount,
+        audio_tokens: parsedChunk.usageMetadata?.promptTokensDetails?.reduce(
+          (acc, curr) => {
+            if (curr.modality === VERTEX_MODALITY.AUDIO)
+              return acc + curr.tokenCount;
+            return acc;
+          },
+          0
+        ),
       },
     };
   }
@@ -739,7 +777,22 @@ export const VertexAnthropicChatCompleteResponseTransform: (
   }

   if ('content' in response) {
-    const { input_tokens = 0, output_tokens = 0 } = response?.usage ?? {};
+    const {
+      input_tokens = 0,
+      output_tokens = 0,
+      cache_creation_input_tokens = 0,
+      cache_read_input_tokens = 0,
+    } = response?.usage ?? {};
+
+    const totalTokens =
+      input_tokens +
+      output_tokens +
+      cache_creation_input_tokens +
+      cache_read_input_tokens;
+
+    const shouldSendCacheUsage =
+      !strictOpenAiCompliance &&
+      (cache_creation_input_tokens || cache_read_input_tokens);

     let content: AnthropicContentItem[] | string = strictOpenAiCompliance
       ? ''
@@ -794,7 +847,14 @@
     usage: {
       prompt_tokens: input_tokens,
       completion_tokens: output_tokens,
-      total_tokens: input_tokens + output_tokens,
+      total_tokens: totalTokens,
+      prompt_tokens_details: {
+        cached_tokens: cache_read_input_tokens,
+      },
+      ...(shouldSendCacheUsage && {
+        cache_read_input_tokens: cache_read_input_tokens,
+        cache_creation_input_tokens: cache_creation_input_tokens,
+      }),
     },
   };
 }
@@ -863,10 +923,20 @@ export const VertexAnthropicChatCompleteStreamChunkTransform: (
   }

   if (parsedChunk.type === 'message_start' && parsedChunk.message?.usage) {
+    const shouldSendCacheUsage =
+      parsedChunk.message?.usage?.cache_read_input_tokens ||
+      parsedChunk.message?.usage?.cache_creation_input_tokens;
+
     streamState.model = parsedChunk?.message?.model ?? '';

     streamState.usage = {
       prompt_tokens: parsedChunk.message.usage?.input_tokens,
+      ...(shouldSendCacheUsage && {
+        cache_read_input_tokens:
+          parsedChunk.message?.usage?.cache_read_input_tokens,
+        cache_creation_input_tokens:
+          parsedChunk.message?.usage?.cache_creation_input_tokens,
+      }),
     };
     return (
       `data: ${JSON.stringify({
@@ -893,6 +963,12 @@
   }

   if (parsedChunk.type === 'message_delta' && parsedChunk.usage) {
+    const totalTokens =
+      (streamState?.usage?.prompt_tokens ?? 0) +
+      (streamState?.usage?.cache_creation_input_tokens ?? 0) +
+      (streamState?.usage?.cache_read_input_tokens ?? 0) +
+      (parsedChunk.usage.output_tokens ?? 0);
+
     return (
       `data: ${JSON.stringify({
         id: fallbackId,
@@ -911,11 +987,12 @@
           },
         ],
         usage: {
+          ...streamState.usage,
           completion_tokens: parsedChunk.usage?.output_tokens,
-          prompt_tokens: streamState.usage?.prompt_tokens,
-          total_tokens:
-            (streamState.usage?.prompt_tokens || 0) +
-            (parsedChunk.usage?.output_tokens || 0),
+          total_tokens: totalTokens,
+          prompt_tokens_details: {
+            cached_tokens: streamState.usage?.cache_read_input_tokens ?? 0,
+          },
        },
      })}` + '\n\n'
    );
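
The same audio-modality fold is inlined four times across the Gemini transforms, here and in src/providers/google/chatComplete.ts below; one possible extraction, purely as a sketch (the helper name is ours, not the PR's):

// Hypothetical helper; the PR inlines this reduce at each call site.
// Assumes: import { VERTEX_MODALITY } from './types';
type ModalityTokenCount = { modality: VERTEX_MODALITY; tokenCount: number };

const sumAudioTokens = (details: ModalityTokenCount[] = []): number =>
  details.reduce(
    (acc, curr) =>
      curr.modality === VERTEX_MODALITY.AUDIO ? acc + curr.tokenCount : acc,
    0
  );

// sumAudioTokens([
//   { modality: VERTEX_MODALITY.AUDIO, tokenCount: 7 },
//   { modality: VERTEX_MODALITY.TEXT, tokenCount: 90 },
// ]) === 7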

src/providers/google-vertex-ai/types.ts

Lines changed: 16 additions & 0 deletions

@@ -70,6 +70,15 @@ export interface GoogleGenerateContentResponse {
     candidatesTokenCount: number;
     totalTokenCount: number;
     thoughtsTokenCount?: number;
+    cachedContentTokenCount?: number;
+    promptTokensDetails: {
+      modality: VERTEX_MODALITY;
+      tokenCount: number;
+    }[];
+    candidatesTokensDetails: {
+      modality: VERTEX_MODALITY;
+      tokenCount: number;
+    }[];
   };
 }

@@ -259,3 +268,10 @@ export enum VERTEX_GEMINI_GENERATE_CONTENT_FINISH_REASON {
   PROHIBITED_CONTENT = 'PROHIBITED_CONTENT',
   SPII = 'SPII',
 }
+
+export enum VERTEX_MODALITY {
+  MODALITY_UNSPECIFIED = 'MODALITY_UNSPECIFIED',
+  TEXT = 'TEXT',
+  IMAGE = 'IMAGE',
+  AUDIO = 'AUDIO',
+}

src/providers/google/chatComplete.ts

Lines changed: 58 additions & 4 deletions

@@ -9,6 +9,8 @@ import {
   SYSTEM_MESSAGE_ROLES,
   MESSAGE_ROLES,
 } from '../../types/requestBody';
+import { buildGoogleSearchRetrievalTool } from '../google-vertex-ai/chatComplete';
+import { VERTEX_MODALITY } from '../google-vertex-ai/types';
 import {
   getMimeType,
   googleTools,
@@ -496,6 +498,15 @@ interface GoogleGenerateContentResponse {
     candidatesTokenCount: number;
     totalTokenCount: number;
     thoughtsTokenCount?: number;
+    cachedContentTokenCount?: number;
+    promptTokensDetails: {
+      modality: VERTEX_MODALITY;
+      tokenCount: number;
+    }[];
+    candidatesTokensDetails: {
+      modality: VERTEX_MODALITY;
+      tokenCount: number;
+    }[];
   };
 }

@@ -537,6 +548,24 @@ export const GoogleChatCompleteResponseTransform: (
   }

   if ('candidates' in response) {
+    const {
+      promptTokenCount = 0,
+      candidatesTokenCount = 0,
+      totalTokenCount = 0,
+      thoughtsTokenCount = 0,
+      cachedContentTokenCount = 0,
+      promptTokensDetails = [],
+      candidatesTokensDetails = [],
+    } = response.usageMetadata;
+    const inputAudioTokens = promptTokensDetails.reduce((acc, curr) => {
+      if (curr.modality === VERTEX_MODALITY.AUDIO) return acc + curr.tokenCount;
+      return acc;
+    }, 0);
+    const outputAudioTokens = candidatesTokensDetails.reduce((acc, curr) => {
+      if (curr.modality === VERTEX_MODALITY.AUDIO) return acc + curr.tokenCount;
+      return acc;
+    }, 0);
+
     return {
       id: 'portkey-' + crypto.randomUUID(),
       object: 'chat.completion',
@@ -605,11 +634,16 @@
       };
     }) ?? [],
     usage: {
-      prompt_tokens: response.usageMetadata.promptTokenCount,
-      completion_tokens: response.usageMetadata.candidatesTokenCount,
-      total_tokens: response.usageMetadata.totalTokenCount,
+      prompt_tokens: promptTokenCount,
+      completion_tokens: candidatesTokenCount,
+      total_tokens: totalTokenCount,
       completion_tokens_details: {
-        reasoning_tokens: response.usageMetadata.thoughtsTokenCount ?? 0,
+        reasoning_tokens: thoughtsTokenCount,
+        audio_tokens: outputAudioTokens,
+      },
+      prompt_tokens_details: {
+        cached_tokens: cachedContentTokenCount,
+        audio_tokens: inputAudioTokens,
       },
     },
   };
@@ -658,6 +692,26 @@ export const GoogleChatCompleteStreamChunkTransform: (
       total_tokens: parsedChunk.usageMetadata.totalTokenCount,
       completion_tokens_details: {
         reasoning_tokens: parsedChunk.usageMetadata.thoughtsTokenCount ?? 0,
+        audio_tokens:
+          parsedChunk.usageMetadata?.candidatesTokensDetails?.reduce(
+            (acc, curr) => {
+              if (curr.modality === VERTEX_MODALITY.AUDIO)
+                return acc + curr.tokenCount;
+              return acc;
+            },
+            0
+          ),
+      },
+      prompt_tokens_details: {
+        cached_tokens: parsedChunk.usageMetadata.cachedContentTokenCount,
+        audio_tokens: parsedChunk.usageMetadata?.promptTokensDetails?.reduce(
+          (acc, curr) => {
+            if (curr.modality === VERTEX_MODALITY.AUDIO)
+              return acc + curr.tokenCount;
+            return acc;
+          },
+          0
+        ),
       },
     };
   }
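
From a caller's point of view, the cached, audio, and reasoning counts now land in standard OpenAI usage fields. A hedged client-side sketch, assuming `response` is an already-parsed OpenAI-compatible chat completion body:

// `response` is assumed to be a parsed chat-completion JSON body.
const cachedTokens = response.usage?.prompt_tokens_details?.cached_tokens ?? 0;
const inputAudio = response.usage?.prompt_tokens_details?.audio_tokens ?? 0;
const reasoning = response.usage?.completion_tokens_details?.reasoning_tokens ?? 0;

if (cachedTokens > 0) {
  console.log(`${cachedTokens} prompt tokens were served from cache`);
}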
