Skip to content

Commit 68f9016

Browse files
authored
fix(eval): repair eval-conflicts CLI and expand dataset to 100 pairs (#377)
The eval-conflicts CLI never worked end-to-end because all three HTTP response decoders (auth token, validator eval, scorer eval) failed to unwrap the server's {"data":...} envelope. Fix all three.

Expand the validator eval dataset from 63 to 100 labeled pairs by adding 37 production-labeled conflicts (17 genuine, 16 related_not_contradicting, 4 unrelated_false_positive) from a judge + meta-judge audit of real conflict data.

Baseline metrics on the expanded dataset:
- Scorer precision: 45.9% (embedding-only, no threshold separates classes)
- Validator (gpt-4o-mini) precision: 91.5%, recall: 95.6%, F1: 93.5%
- Relationship accuracy: 77.4% (5-class exact match)

Refs: #376
1 parent 07ba1a6 commit 68f9016

File tree

2 files changed

+501
-8
lines changed

2 files changed

+501
-8
lines changed

cmd/eval-conflicts/main.go

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -186,12 +186,17 @@ func authenticate(baseURL, agentID, apiKey string) (string, error) {
186186
}
187187

188188
var result struct {
189-
Token string `json:"token"`
189+
Data struct {
190+
Token string `json:"token"`
191+
} `json:"data"`
190192
}
191193
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
192194
return "", fmt.Errorf("decode: %w", err)
193195
}
194-
return result.Token, nil
196+
if result.Data.Token == "" {
197+
return "", fmt.Errorf("empty token in response")
198+
}
199+
return result.Data.Token, nil
195200
}
196201

197202
type validatorEvalResponse struct {
@@ -223,11 +228,13 @@ func callValidatorEval(baseURL, token string) (conflicts.EvalMetrics, []conflict
223228
return conflicts.EvalMetrics{}, nil, fmt.Errorf("status %d: %s", resp.StatusCode, string(respBody))
224229
}
225230

226-
var result validatorEvalResponse
227-
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
231+
var envelope struct {
232+
Data validatorEvalResponse `json:"data"`
233+
}
234+
if err := json.NewDecoder(resp.Body).Decode(&envelope); err != nil {
228235
return conflicts.EvalMetrics{}, nil, fmt.Errorf("decode: %w", err)
229236
}
230-
return result.Metrics, result.Results, nil
237+
return envelope.Data.Metrics, envelope.Data.Results, nil
231238
}
232239

233240
func callScorerEval(baseURL, token string) (scorerEvalResult, error) {
@@ -254,9 +261,11 @@ func callScorerEval(baseURL, token string) (scorerEvalResult, error) {
254261
return scorerEvalResult{}, fmt.Errorf("status %d: %s", resp.StatusCode, string(respBody))
255262
}
256263

257-
var result scorerEvalResult
258-
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
264+
var envelope struct {
265+
Data scorerEvalResult `json:"data"`
266+
}
267+
if err := json.NewDecoder(resp.Body).Decode(&envelope); err != nil {
259268
return scorerEvalResult{}, fmt.Errorf("decode: %w", err)
260269
}
261-
return result, nil
270+
return envelope.Data, nil
262271
}

0 commit comments

Comments
 (0)