Skip to content

Commit 4d18b26

Browse files
committed
Merge branch 'gemini-audio-billing' into alpha
2 parents 0d724af + 9496dac commit 4d18b26

File tree

8 files changed

+480
-287
lines changed

8 files changed

+480
-287
lines changed

relay/channel/gemini/dto.go

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -112,10 +112,16 @@ type GeminiChatResponse struct {
112112
}
113113

114114
// GeminiUsageMetadata holds the token counters decoded from the usage metadata
// of a Gemini response (JSON keys are given by the field tags).
type GeminiUsageMetadata struct {
115-
PromptTokenCount int `json:"promptTokenCount"`
116-
CandidatesTokenCount int `json:"candidatesTokenCount"`
117-
TotalTokenCount int `json:"totalTokenCount"`
118-
ThoughtsTokenCount int `json:"thoughtsTokenCount"`
115+
PromptTokenCount int `json:"promptTokenCount"`
116+
CandidatesTokenCount int `json:"candidatesTokenCount"`
117+
TotalTokenCount int `json:"totalTokenCount"`
118+
ThoughtsTokenCount int `json:"thoughtsTokenCount"`
119+
// PromptTokensDetails lists prompt token counts per modality; the relay
// handlers read the "AUDIO" and "TEXT" entries from it.
PromptTokensDetails []GeminiPromptTokensDetails `json:"promptTokensDetails"`
120+
}
121+
122+
// GeminiPromptTokensDetails is one element of the promptTokensDetails list in
// the usage metadata: a token count attributed to a single modality.
type GeminiPromptTokensDetails struct {
123+
// Modality names the input kind; callers compare it against "AUDIO" and "TEXT".
Modality string `json:"modality"`
124+
// TokenCount is the number of prompt tokens reported for that modality.
TokenCount int `json:"tokenCount"`
119125
}
120126

121127
// Imagen related structs

relay/channel/gemini/relay-gemini-native.go

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,16 @@ func GeminiTextGenerationHandler(c *gin.Context, resp *http.Response, info *rela
5555
TotalTokens: geminiResponse.UsageMetadata.TotalTokenCount,
5656
}
5757

58+
usage.CompletionTokenDetails.ReasoningTokens = geminiResponse.UsageMetadata.ThoughtsTokenCount
59+
60+
for _, detail := range geminiResponse.UsageMetadata.PromptTokensDetails {
61+
if detail.Modality == "AUDIO" {
62+
usage.PromptTokensDetails.AudioTokens = detail.TokenCount
63+
} else if detail.Modality == "TEXT" {
64+
usage.PromptTokensDetails.TextTokens = detail.TokenCount
65+
}
66+
}
67+
5868
// 直接返回 Gemini 原生格式的 JSON 响应
5969
jsonResponse, err := json.Marshal(geminiResponse)
6070
if err != nil {
@@ -100,6 +110,14 @@ func GeminiTextGenerationStreamHandler(c *gin.Context, resp *http.Response, info
100110
usage.PromptTokens = geminiResponse.UsageMetadata.PromptTokenCount
101111
usage.CompletionTokens = geminiResponse.UsageMetadata.CandidatesTokenCount
102112
usage.TotalTokens = geminiResponse.UsageMetadata.TotalTokenCount
113+
usage.CompletionTokenDetails.ReasoningTokens = geminiResponse.UsageMetadata.ThoughtsTokenCount
114+
for _, detail := range geminiResponse.UsageMetadata.PromptTokensDetails {
115+
if detail.Modality == "AUDIO" {
116+
usage.PromptTokensDetails.AudioTokens = detail.TokenCount
117+
} else if detail.Modality == "TEXT" {
118+
usage.PromptTokensDetails.TextTokens = detail.TokenCount
119+
}
120+
}
103121
}
104122

105123
// 直接发送 GeminiChatResponse 响应
@@ -118,7 +136,6 @@ func GeminiTextGenerationStreamHandler(c *gin.Context, resp *http.Response, info
118136
}
119137

120138
// 计算最终使用量
121-
usage.PromptTokensDetails.TextTokens = usage.PromptTokens
122139
usage.CompletionTokens = usage.TotalTokens - usage.PromptTokens
123140

124141
// 移除流式响应结尾的[Done],因为Gemini API没有发送Done的行为

relay/channel/gemini/relay-gemini.go

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -313,13 +313,13 @@ func CovertGemini2OpenAI(textRequest dto.GeneralOpenAIRequest, info *relaycommon
313313
if part.GetInputAudio().Data == "" {
314314
return nil, fmt.Errorf("only base64 audio is supported in gemini")
315315
}
316-
format, base64String, err := service.DecodeBase64FileData(part.GetInputAudio().Data)
316+
base64String, err := service.DecodeBase64AudioData(part.GetInputAudio().Data)
317317
if err != nil {
318318
return nil, fmt.Errorf("decode base64 audio data failed: %s", err.Error())
319319
}
320320
parts = append(parts, GeminiPart{
321321
InlineData: &GeminiInlineData{
322-
MimeType: format,
322+
MimeType: "audio/" + part.GetInputAudio().Format,
323323
Data: base64String,
324324
},
325325
})
@@ -771,6 +771,13 @@ func GeminiChatStreamHandler(c *gin.Context, resp *http.Response, info *relaycom
771771
usage.CompletionTokens = geminiResponse.UsageMetadata.CandidatesTokenCount
772772
usage.CompletionTokenDetails.ReasoningTokens = geminiResponse.UsageMetadata.ThoughtsTokenCount
773773
usage.TotalTokens = geminiResponse.UsageMetadata.TotalTokenCount
774+
for _, detail := range geminiResponse.UsageMetadata.PromptTokensDetails {
775+
if detail.Modality == "AUDIO" {
776+
usage.PromptTokensDetails.AudioTokens = detail.TokenCount
777+
} else if detail.Modality == "TEXT" {
778+
usage.PromptTokensDetails.TextTokens = detail.TokenCount
779+
}
780+
}
774781
}
775782
err = helper.ObjectData(c, response)
776783
if err != nil {
@@ -845,6 +852,14 @@ func GeminiChatHandler(c *gin.Context, resp *http.Response, info *relaycommon.Re
845852
usage.CompletionTokenDetails.ReasoningTokens = geminiResponse.UsageMetadata.ThoughtsTokenCount
846853
usage.CompletionTokens = usage.TotalTokens - usage.PromptTokens
847854

855+
for _, detail := range geminiResponse.UsageMetadata.PromptTokensDetails {
856+
if detail.Modality == "AUDIO" {
857+
usage.PromptTokensDetails.AudioTokens = detail.TokenCount
858+
} else if detail.Modality == "TEXT" {
859+
usage.PromptTokensDetails.TextTokens = detail.TokenCount
860+
}
861+
}
862+
848863
fullTextResponse.Usage = usage
849864
jsonResponse, err := json.Marshal(fullTextResponse)
850865
if err != nil {

relay/relay-text.go

Lines changed: 38 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,7 @@ func postConsumeQuota(ctx *gin.Context, relayInfo *relaycommon.RelayInfo,
352352
promptTokens := usage.PromptTokens
353353
cacheTokens := usage.PromptTokensDetails.CachedTokens
354354
imageTokens := usage.PromptTokensDetails.ImageTokens
355+
audioTokens := usage.PromptTokensDetails.AudioTokens
355356
completionTokens := usage.CompletionTokens
356357
modelName := relayInfo.OriginModelName
357358

@@ -367,6 +368,7 @@ func postConsumeQuota(ctx *gin.Context, relayInfo *relaycommon.RelayInfo,
367368
dPromptTokens := decimal.NewFromInt(int64(promptTokens))
368369
dCacheTokens := decimal.NewFromInt(int64(cacheTokens))
369370
dImageTokens := decimal.NewFromInt(int64(imageTokens))
371+
dAudioTokens := decimal.NewFromInt(int64(audioTokens))
370372
dCompletionTokens := decimal.NewFromInt(int64(completionTokens))
371373
dCompletionRatio := decimal.NewFromFloat(completionRatio)
372374
dCacheRatio := decimal.NewFromFloat(cacheRatio)
@@ -412,23 +414,43 @@ func postConsumeQuota(ctx *gin.Context, relayInfo *relaycommon.RelayInfo,
412414
dFileSearchQuota = decimal.NewFromFloat(fileSearchPrice).
413415
Mul(decimal.NewFromInt(int64(fileSearchTool.CallCount))).
414416
Div(decimal.NewFromInt(1000)).Mul(dGroupRatio).Mul(dQuotaPerUnit)
415-
extraContent += fmt.Sprintf("File Search 调用 %d 次,调用花费 $%s",
417+
extraContent += fmt.Sprintf("File Search 调用 %d 次,调用花费 %s",
416418
fileSearchTool.CallCount, dFileSearchQuota.String())
417419
}
418420
}
419421

420422
var quotaCalculateDecimal decimal.Decimal
423+
424+
var audioInputQuota decimal.Decimal
425+
var audioInputPrice float64
421426
if !priceData.UsePrice {
422-
nonCachedTokens := dPromptTokens.Sub(dCacheTokens)
423-
cachedTokensWithRatio := dCacheTokens.Mul(dCacheRatio)
424-
425-
promptQuota := nonCachedTokens.Add(cachedTokensWithRatio)
426-
if imageTokens > 0 {
427-
nonImageTokens := dPromptTokens.Sub(dImageTokens)
428-
imageTokensWithRatio := dImageTokens.Mul(dImageRatio)
429-
promptQuota = nonImageTokens.Add(imageTokensWithRatio)
427+
baseTokens := dPromptTokens
428+
// 减去 cached tokens
429+
var cachedTokensWithRatio decimal.Decimal
430+
if !dCacheTokens.IsZero() {
431+
baseTokens = baseTokens.Sub(dCacheTokens)
432+
cachedTokensWithRatio = dCacheTokens.Mul(dCacheRatio)
433+
}
434+
435+
// 减去 image tokens
436+
var imageTokensWithRatio decimal.Decimal
437+
if !dImageTokens.IsZero() {
438+
baseTokens = baseTokens.Sub(dImageTokens)
439+
imageTokensWithRatio = dImageTokens.Mul(dImageRatio)
430440
}
431441

442+
// 减去 Gemini audio tokens
443+
if !dAudioTokens.IsZero() {
444+
audioInputPrice = operation_setting.GetGeminiInputAudioPricePerMillionTokens(modelName)
445+
if audioInputPrice > 0 {
446+
// 重新计算 base tokens
447+
baseTokens = baseTokens.Sub(dAudioTokens)
448+
audioInputQuota = decimal.NewFromFloat(audioInputPrice).Div(decimal.NewFromInt(1000000)).Mul(dAudioTokens).Mul(dGroupRatio).Mul(dQuotaPerUnit)
449+
extraContent += fmt.Sprintf("Audio Input 花费 %s", audioInputQuota.String())
450+
}
451+
}
452+
promptQuota := baseTokens.Add(cachedTokensWithRatio).Add(imageTokensWithRatio)
453+
432454
completionQuota := dCompletionTokens.Mul(dCompletionRatio)
433455

434456
quotaCalculateDecimal = promptQuota.Add(completionQuota).Mul(ratio)
@@ -442,6 +464,8 @@ func postConsumeQuota(ctx *gin.Context, relayInfo *relaycommon.RelayInfo,
442464
// 添加 responses tools call 调用的配额
443465
quotaCalculateDecimal = quotaCalculateDecimal.Add(dWebSearchQuota)
444466
quotaCalculateDecimal = quotaCalculateDecimal.Add(dFileSearchQuota)
467+
// 添加 audio input 独立计费
468+
quotaCalculateDecimal = quotaCalculateDecimal.Add(audioInputQuota)
445469

446470
quota := int(quotaCalculateDecimal.Round(0).IntPart())
447471
totalTokens := promptTokens + completionTokens
@@ -512,6 +536,11 @@ func postConsumeQuota(ctx *gin.Context, relayInfo *relaycommon.RelayInfo,
512536
other["file_search_price"] = fileSearchPrice
513537
}
514538
}
539+
if !audioInputQuota.IsZero() {
540+
other["audio_input_seperate_price"] = true
541+
other["audio_input_token_count"] = audioTokens
542+
other["audio_input_price"] = audioInputPrice
543+
}
515544
model.RecordConsumeLog(ctx, relayInfo.UserId, relayInfo.ChannelId, promptTokens, completionTokens, logModel,
516545
tokenName, quota, logContent, relayInfo.TokenId, userQuota, int(useTimeSeconds), relayInfo.IsStream, relayInfo.Group, other)
517546
}

service/audio.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package service
33
import (
44
"encoding/base64"
55
"fmt"
6+
"strings"
67
)
78

89
func parseAudio(audioBase64 string, format string) (duration float64, err error) {
@@ -29,3 +30,19 @@ func parseAudio(audioBase64 string, format string) (duration float64, err error)
2930
duration = float64(samplesCount) / float64(sampleRate)
3031
return duration, nil
3132
}
33+
34+
// DecodeBase64AudioData validates a base64-encoded audio payload and returns
// it with any data-URL style prefix removed.
//
// If audioBase64 contains a comma, everything up to and including the first
// comma (e.g. "data:audio/wav;base64,") is stripped. The remainder is decoded
// with the standard base64 alphabet purely to verify that it is well formed;
// the returned string is still base64 text, not the decoded bytes.
//
// On malformed input it returns an empty string and an error that wraps the
// underlying decode error, so callers can inspect it with errors.Is/errors.As.
func DecodeBase64AudioData(audioBase64 string) (string, error) {
	// Drop a "data:audio/xxx;base64," prefix when one is present. The base64
	// alphabet has no comma, so a comma can only belong to such a prefix.
	if idx := strings.Index(audioBase64, ","); idx != -1 {
		audioBase64 = audioBase64[idx+1:]
	}

	// Decode only to validate the payload; the decoded bytes are not needed.
	if _, err := base64.StdEncoding.DecodeString(audioBase64); err != nil {
		return "", fmt.Errorf("base64 decode error: %w", err)
	}

	return audioBase64, nil
}

setting/operation_setting/tools.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,13 @@ const (
1414
FileSearchPrice = 2.5
1515
)
1616

17+
const (
18+
// Gemini audio input prices, returned by
// GetGeminiInputAudioPricePerMillionTokens and therefore expressed per one
// million input audio tokens.
// NOTE(review): currency is presumably USD — confirm against the provider's price list.
19+
Gemini25FlashPreviewInputAudioPrice = 1.00
20+
Gemini25FlashNativeAudioInputAudioPrice = 3.00
21+
Gemini20FlashInputAudioPrice = 0.70
22+
)
23+
1724
func GetWebSearchPricePerThousand(modelName string, contextSize string) float64 {
1825
// 确定模型类型
1926
// https://platform.openai.com/docs/pricing Web search 价格按模型类型和 search context size 收费
@@ -55,3 +62,14 @@ func GetWebSearchPricePerThousand(modelName string, contextSize string) float64
5562
func GetFileSearchPricePerThousand() float64 {
5663
return FileSearchPrice
5764
}
65+
66+
func GetGeminiInputAudioPricePerMillionTokens(modelName string) float64 {
67+
if strings.HasPrefix(modelName, "gemini-2.5-flash-preview") {
68+
return Gemini25FlashPreviewInputAudioPrice
69+
} else if strings.HasPrefix(modelName, "gemini-2.5-flash-preview-native-audio") {
70+
return Gemini25FlashNativeAudioInputAudioPrice
71+
} else if strings.HasPrefix(modelName, "gemini-2.0-flash") {
72+
return Gemini20FlashInputAudioPrice
73+
}
74+
return 0
75+
}

0 commit comments

Comments
 (0)