fix: handle array-of-objects in FlexibleStringSlice for tripleshot combine evaluations (#666)

Iron-Ham · web-flow · commit 23592e21c607 · 2026-02-16T10:48:43.000-05:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -19,6 +19,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
+- **TripleShot Combine Evaluation Parse Failure** - `FlexibleStringSlice` now handles LLM judge output that writes an array of objects (e.g., `[{"description":"...","source":"attempt_1"}]`) where flat strings were expected; also improved the judge prompt to show a populated `suggested_changes` example and explicitly require plain strings (#666)
+
 - **Agent Teams tmux mode** - Prevent Claude Code Agent Teams from starting in tmux mode inside Claudio by setting `teammateMode: "in-process"` in worktree settings (#664)
 
 - **Agent Teams tmux mode (CLI flag)** - Pass `--teammate-mode in-process` directly on the CLI command for both start and resume, ensuring CC cannot override the setting via user-level settings or `$TMUX` auto-detection
diff --git a/internal/orchestrator/workflows/tripleshot/AGENTS.md b/internal/orchestrator/workflows/tripleshot/AGENTS.md
@@ -5,5 +5,5 @@
 
 ## Pitfalls
 
-- **LLM output type mismatches in sentinel files** — LLMs frequently write a plain string where the JSON schema expects `[]string` (e.g., `"suggested_changes": "fix the bug"` instead of `"suggested_changes": ["fix the bug"]`). The `Evaluation`, `AttemptEvaluationItem`, and `AdversarialReviewFile` structs use `FlexibleStringSlice` for all `[]string` fields and `FlexibleString` for `Reasoning` to tolerate this. When adding new LLM-parsed fields of type `string` or `[]string`, use these flexible types instead of bare Go types. Without this, `json.Unmarshal` fails, `VerifyWork` returns false, and the bridge retries the task — spawning a duplicate instance.
+- **LLM output type mismatches in sentinel files** — LLMs frequently deviate from the expected JSON types. `FlexibleStringSlice` handles three cases: a plain string (`"fix the bug"`), an array of strings (`["fix A", "fix B"]`), and an array of objects (`[{"description":"fix A","source":"attempt_1"}]`). For objects, it extracts a well-known text key (`description`, `text`, `change`, `message`, `content`, `value`) or falls back to JSON-encoding the whole object. `FlexibleString` similarly handles string-or-array. When adding new LLM-parsed fields of type `string` or `[]string`, use these flexible types instead of bare Go types. Without this, `json.Unmarshal` fails, `VerifyWork` returns false, and the bridge retries the task — spawning a duplicate instance.
 - **Sentinel file search in subdirectories** — `FindCompletionFile`, `FindEvaluationFile`, and `FindAdversarialReviewFile` all search the worktree root *and* immediate subdirectories. LLM instances sometimes write files relative to their CWD rather than the worktree root. Don't bypass `Find*File` with a direct `filepath.Join(worktree, filename)`.
diff --git a/internal/orchestrator/workflows/tripleshot/session_test.go b/internal/orchestrator/workflows/tripleshot/session_test.go
@@ -400,6 +400,39 @@ func TestParseEvaluationFile_FlexibleFields(t *testing.T) {
 			wantWeaknessesLen: 1,
 			wantFirstWeakness: "No tests",
 		},
+		{
+			name: "combine with suggested_changes as array of objects",
+			json: `{
+				"winner_index": -1,
+				"merge_strategy": "combine",
+				"reasoning": "Cherry-pick from multiple attempts",
+				"attempt_evaluations": [],
+				"suggested_changes": [
+					{"description": "Use Attempt 1 as the base branch", "source": "attempt_1"},
+					{"description": "Cherry-pick ContentEquatable from Attempt 3", "source": "attempt_3"}
+				]
+			}`,
+			wantStrategy:    MergeStrategyCombine,
+			wantReasoning:   "Cherry-pick from multiple attempts",
+			wantChangesLen:  2,
+			wantFirstChange: "Use Attempt 1 as the base branch",
+		},
+		{
+			name: "suggested_changes as objects with text key",
+			json: `{
+				"winner_index": -1,
+				"merge_strategy": "merge",
+				"reasoning": "Merged",
+				"attempt_evaluations": [],
+				"suggested_changes": [
+					{"text": "Apply error handling from Attempt 2"}
+				]
+			}`,
+			wantStrategy:    MergeStrategyMerge,
+			wantReasoning:   "Merged",
+			wantChangesLen:  1,
+			wantFirstChange: "Apply error handling from Attempt 2",
+		},
 	}
 
 	for _, tt := range tests {
diff --git a/internal/orchestrator/workflows/tripleshot/types.go b/internal/orchestrator/workflows/tripleshot/types.go
@@ -179,9 +179,47 @@ func (f *FlexibleStringSlice) UnmarshalJSON(data []byte) error {
 		return nil
 	}
 
+	// Try to unmarshal as an array of mixed elements (handles LLM output
+	// that writes objects like [{"description":"...","source":"attempt_1"}]
+	// where flat strings were expected)
+	var mixed []any
+	if err := json.Unmarshal(data, &mixed); err == nil && len(mixed) > 0 {
+		result := make([]string, 0, len(mixed))
+		for _, item := range mixed {
+			result = append(result, stringifyItem(item))
+		}
+		*f = result
+		return nil
+	}
+
 	return fmt.Errorf("FlexibleStringSlice: expected string or []string, got %s", string(data))
 }
 
+// textLikeKeys are the object keys stringifyItem checks, in priority order, when extracting a string from a map.
+var textLikeKeys = []string{"description", "text", "change", "message", "content", "value"}
+
+// stringifyItem converts an arbitrary JSON-decoded value into a string. Strings are returned unchanged.
+// Maps yield the first textLikeKeys entry that holds a string, else the JSON-encoded map; other values are formatted with %v.
+func stringifyItem(v any) string {
+	switch val := v.(type) {
+	case string:
+		return val
+	case map[string]any:
+		for _, key := range textLikeKeys {
+			if text, ok := val[key]; ok {
+				if s, ok := text.(string); ok {
+					return s
+				}
+			}
+		}
+		// No known text key held a string value — marshal the whole object (marshal error ignored)
+		b, _ := json.Marshal(val)
+		return string(b)
+	default:
+		return fmt.Sprintf("%v", val)
+	}
+}
+
 // CompletionFile represents the completion report written by an attempt
 type CompletionFile struct {
 	AttemptIndex  int            `json:"attempt_index"`
@@ -699,7 +737,7 @@ Write your evaluation to ` + "`" + EvaluationFileName + "`" + ` using this struc
       "weaknesses": ["No tests", "Missing error handling"]
     }
   ],
-  "suggested_changes": []
+  "suggested_changes": ["Use Attempt 1 as the base — it has the cleanest API surface", "Cherry-pick the retry logic from Attempt 3: src/retry.go"]
 }
 ` + "```" + `
 
@@ -708,7 +746,7 @@ Write your evaluation to ` + "`" + EvaluationFileName + "`" + ` using this struc
 - **merge_strategy**: "select" (use one as-is), "merge" (combine changes), or "combine" (cherry-pick specific changes)
 - **reasoning**: Detailed explanation of your evaluation and decision
 - **attempt_evaluations**: Score and analysis for each attempt (1-10 scale)
-- **suggested_changes**: If merge_strategy is "merge" or "combine", list the specific changes to make
+- **suggested_changes**: If merge_strategy is "merge" or "combine", list the specific changes to make. Each entry must be a plain string (not an object).
 
 ## Helpful Commands