Skip to content

Commit 74ec3a7

Browse files
committed
fix: 修复多个代码质量和稳定性问题
本次提交根据AI代码审核意见,修复了多个潜在的bug和改进了代码质量: 1. 缓冲池内存管理优化 - 修复 buffer_pool.go 中缓冲池重置问题 - 在 Grow 前先 Reset,避免非空缓冲池导致的过度分配 - 确保返回的缓冲区处于干净状态 2. 状态更新通道监控增强 - 为 keypool/provider.go 的 UpdateStatus 方法添加详细的监控日志 - 记录通道溢出事件,包含 key_id、group_id 等关键信息 - 添加注释说明同步回退可能增加客户端延迟的影响 3. 请求日志计数器一致性修复 - 修复 request_log_service.go 中 pendingCount 递减逻辑不一致问题 - 确保无论 Del 操作是否成功,都递减计数器 - 防止计数器漂移导致的内存压力检测失效 4. 数据库迁移健壮性提升 - 为 v1_22_0_UpdatePrioritySemantics 迁移添加表存在性检查 - 避免在部分安装环境中迁移失败 5. 接口文档完善 - 为 store.Store 接口的 SCard 方法添加语义文档 - 明确缺失键返回 0 的行为,保持实现一致性 6. 通道兼容性注释优化 - 为 channel_compatibility.go 的 fallback 逻辑添加详细注释 - 说明未知格式的处理策略 7. Hub服务批量更新设计说明 - 为 BatchUpdateModelGroupPriorities 添加设计注释 - 说明部分成功策略和未来改进方向 8. 测试覆盖率提升 - 为 group_service_test.go 添加默认优先级和无效优先级测试用例 - 验证 Sort 字段的默认值和范围验证逻辑 所有修改已通过单元测试验证,无编译错误和 lint 警告。
1 parent 75d344b commit 74ec3a7

File tree

8 files changed

+73
-15
lines changed

8 files changed

+73
-15
lines changed

internal/centralizedmgmt/channel_compatibility.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,9 @@ var channelCompatibilityMap = map[types.RelayFormat]ChannelCompatibility{
9191
func GetCompatibleChannels(format types.RelayFormat) []string {
9292
compat, exists := channelCompatibilityMap[format]
9393
if !exists {
94-
// Fallback to OpenAI for truly unknown formats not in the map
94+
// Fallback to OpenAI for formats not defined in the map
95+
// This handles edge cases like typos or future formats not yet added to the map
96+
// The defensive fallback ensures requests with unknown formats can still be routed
9597
return []string{"openai"}
9698
}
9799

internal/centralizedmgmt/hub_service.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1104,6 +1104,12 @@ func (s *HubService) UpdateModelGroupPriority(ctx context.Context, modelName str
11041104
// allowing the batch operation to partially succeed rather than failing entirely.
11051105
// This design choice enables resilient batch operations where some updates may have
11061106
// validation issues while others can proceed successfully.
1107+
//
1108+
// Design Note: Callers receive no indication of which updates were skipped.
1109+
// This is intentional to maintain API simplicity and backward compatibility.
1110+
// Skipped updates are logged with logrus.Warn for operational monitoring.
1111+
// If detailed feedback is needed in the future, consider returning a summary
1112+
// struct (e.g., {updated: N, skipped: M, skippedItems: []...}) instead of error.
11071113
func (s *HubService) BatchUpdateModelGroupPriorities(ctx context.Context, updates []UpdateModelGroupPriorityParams) error {
11081114
return s.db.WithContext(ctx).Transaction(func(tx *gorm.DB) error {
11091115
for _, update := range updates {

internal/db/migrations/v1_22_0_UpdatePrioritySemantics.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@ import (
1717
func V1_22_0_UpdatePrioritySemantics(db *gorm.DB) error {
1818
logrus.Info("Starting migration: Update priority semantics (0→1000 for disabled)")
1919

20+
// Check if table exists to avoid migration failures on partial installs
21+
if !db.Migrator().HasTable("hub_model_group_priorities") {
22+
logrus.Info("Table hub_model_group_priorities does not exist, skipping priority semantics update")
23+
return nil
24+
}
25+
2026
// Update hub_model_group_priorities table
2127
// Change all priority=0 to priority=1000
2228
result := db.Exec(`

internal/keypool/provider.go

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,10 @@ func (p *KeyProvider) SelectKey(groupID uint) (*models.APIKey, error) {
205205

206206
// UpdateStatus submits a key status update task to the worker pool.
207207
// Uses bounded concurrency to prevent resource exhaustion.
208+
// Logs a warning when channel is full to enable monitoring of backpressure.
209+
// Note: Synchronous fallback blocks callers (proxy error handlers) on store operations,
210+
// which may increase client response latency under sustained load. This is acceptable
211+
// as it provides backpressure to prevent unbounded goroutine creation.
208212
func (p *KeyProvider) UpdateStatus(apiKey *models.APIKey, group *models.Group, isSuccess bool, errorMessage string) {
209213
task := statusUpdateTask{
210214
apiKey: apiKey,
@@ -220,7 +224,12 @@ func (p *KeyProvider) UpdateStatus(apiKey *models.APIKey, group *models.Group, i
220224
// Channel full, process synchronously to avoid data loss
221225
// Note: Using sync processing instead of spawning goroutine to prevent
222226
// unbounded goroutine creation when channel is persistently full
223-
logrus.Warn("Status update channel full, processing synchronously")
227+
// Log warning to enable monitoring of channel overflow events
228+
logrus.WithFields(logrus.Fields{
229+
"key_id": apiKey.ID,
230+
"group_id": group.ID,
231+
"is_success": isSuccess,
232+
}).Warn("Status update channel full (1000 capacity), processing synchronously - may increase client latency")
224233
p.processStatusUpdate(task)
225234
}
226235
}

internal/services/group_service_test.go

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,32 @@ func TestCreateGroup(t *testing.T) {
157157
},
158158
expectError: false,
159159
},
160+
{
161+
name: "default sort when omitted",
162+
params: GroupCreateParams{
163+
Name: "default-sort-group",
164+
GroupType: "standard",
165+
Upstreams: json.RawMessage(`[{"url":"https://api.openai.com","weight":100}]`),
166+
ChannelType: "openai",
167+
TestModel: "gpt-3.5-turbo",
168+
ValidationEndpoint: "/v1/chat/completions",
169+
// Sort omitted -> expect default 100
170+
},
171+
expectError: false,
172+
},
173+
{
174+
name: "invalid sort range",
175+
params: GroupCreateParams{
176+
Name: "invalid-sort",
177+
GroupType: "standard",
178+
Upstreams: json.RawMessage(`[{"url":"https://api.openai.com","weight":100}]`),
179+
ChannelType: "openai",
180+
Sort: 1000,
181+
TestModel: "gpt-3.5-turbo",
182+
ValidationEndpoint: "/v1/chat/completions",
183+
},
184+
expectError: true,
185+
},
160186
{
161187
name: "invalid group name",
162188
params: GroupCreateParams{
@@ -216,6 +242,10 @@ func TestCreateGroup(t *testing.T) {
216242
assert.NotNil(t, group)
217243
assert.NotZero(t, group.ID)
218244
assert.Equal(t, tt.params.Name, group.Name)
245+
// Verify default sort value when omitted
246+
if tt.params.Sort == 0 {
247+
assert.Equal(t, 100, group.Sort, "Expected default sort value of 100 when omitted")
248+
}
219249
}
220250
})
221251
}

internal/services/request_log_service.go

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -270,14 +270,14 @@ func (s *RequestLogService) flush() {
270270
}
271271

272272
if len(logs) == 0 {
273-
// Delete corrupted keys only if deletion succeeds, then decrement counter
273+
// Decrement pendingCount regardless of Del success since keys are already popped from set
274+
// This prevents counter drift when Del fails but keys are already removed from tracking set
274275
if len(badKeys) > 0 {
275276
if err := s.store.Del(badKeys...); err != nil {
276277
logrus.WithError(err).Error("Failed to delete corrupted log bodies from store")
277-
} else {
278-
// Only decrement counter after successful deletion to maintain accuracy
279-
atomic.AddInt64(&s.pendingCount, -int64(len(badKeys)))
280278
}
279+
// Decrement regardless of Del success since keys are already popped from set
280+
atomic.AddInt64(&s.pendingCount, -int64(len(badKeys)))
281281
}
282282
if len(retryKeys) > 0 {
283283
args := make([]any, len(retryKeys))
@@ -311,14 +311,14 @@ func (s *RequestLogService) flush() {
311311
logrus.Errorf("CRITICAL: Failed to re-add failed log keys to set: %v", saddErr)
312312
}
313313
}
314-
// Delete corrupted keys only if deletion succeeds, then decrement counter
314+
// Decrement pendingCount regardless of Del success since keys are already popped from set
315+
// This prevents counter drift when Del fails but keys are already removed from tracking set
315316
if len(badKeys) > 0 {
316317
if delErr := s.store.Del(badKeys...); delErr != nil {
317318
logrus.WithError(delErr).Error("Failed to delete corrupted log bodies from store")
318-
} else {
319-
// Only decrement counter after successful deletion to maintain accuracy
320-
atomic.AddInt64(&s.pendingCount, -int64(len(badKeys)))
321319
}
320+
// Decrement regardless of Del success since keys are already popped from set
321+
atomic.AddInt64(&s.pendingCount, -int64(len(badKeys)))
322322
}
323323
// Decrement pendingCount for missing keys to prevent counter drift
324324
if missingCount > 0 {

internal/store/store.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ type Store interface {
5454
// SET operations
5555
SAdd(key string, members ...any) error
5656
SPopN(key string, count int64) ([]string, error)
57+
// SCard returns the set cardinality (number of members).
58+
// Returns 0 with nil error for missing keys to maintain consistency across implementations.
5759
SCard(key string) (int64, error)
5860

5961
// Close closes the store and releases any underlying resources.

internal/utils/buffer_pool.go

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -91,12 +91,15 @@ func GetBufferWithCapacity(capacity int) *bytes.Buffer {
9191
return buf
9292
}
9393

94+
// Reset buffer to ensure clean state and correct length for Grow calculation
95+
// This prevents over-allocation when non-empty buffers slip into the pool
96+
buf.Reset()
97+
9498
// Ensure the buffer has at least the requested capacity to avoid reallocation
95-
// This is critical for performance when the caller knows the required size
96-
// bytes.Buffer.Grow(n) ensures space for n more bytes relative to current length
97-
// So we need to grow by (capacity - current_length) when capacity > current_capacity
98-
if buf.Cap() < capacity {
99-
buf.Grow(capacity)
99+
// bytes.Buffer.Grow(n) reserves space for n more bytes relative to current length
100+
// After Reset(), length is 0, so we grow by the delta from current capacity
101+
if capacity > 0 && buf.Cap() < capacity {
102+
buf.Grow(capacity - buf.Len())
100103
}
101104

102105
return buf

0 commit comments

Comments
 (0)