Skip to content

Commit 12b82ce

Browse files
authored
Merge pull request #425 from symflower/fetch-costs
Fetch total costs from OpenRouter after query
2 parents d9b0914 + 8187e1d commit 12b82ce

File tree

12 files changed

+238
-148
lines changed

12 files changed

+238
-148
lines changed

cmd/eval-dev-quality/cmd/evaluate_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ func validateMetrics(t *testing.T, csvData string, expectedAssessments []metrics
3434
actualAssessmentTuples := reporttesting.ParseMetrics(t, csvData)
3535
actual = make([]metrics.Assessments, len(actualAssessmentTuples))
3636
for i, tuple := range actualAssessmentTuples {
37-
assert.Greater(t, tuple.Assessment[metrics.AssessmentKeyProcessingTime], uint64(0))
37+
assert.Greater(t, tuple.Assessment[metrics.AssessmentKeyProcessingTime], float64(0))
3838
actual[i] = tuple.Assessment
3939
}
4040

evaluate/evaluate_test.go

Lines changed: 108 additions & 108 deletions
Large diffs are not rendered by default.

evaluate/metrics/assessment.go

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,14 +59,20 @@ var (
5959
AssessmentKeyTokenInput = RegisterAssessmentKey("token-input")
6060
// AssessmentKeyTokenOutput collects the number of output token.
6161
AssessmentKeyTokenOutput = RegisterAssessmentKey("token-output")
62+
// AssessmentKeyNativeTokenInput collects the number of native input tokens.
63+
AssessmentKeyNativeTokenInput = RegisterAssessmentKey("native-token-input")
64+
// AssessmentKeyNativeTokenOutput collects the number of native output tokens.
65+
AssessmentKeyNativeTokenOutput = RegisterAssessmentKey("native-token-output")
66+
// AssessmentKeyCostsTokenActual collects the actual total costs of a query.
67+
AssessmentKeyCostsTokenActual = RegisterAssessmentKey("costs-total-actual")
6268
)
6369

6470
// Assessments holds a collection of numerical assessment metrics.
65-
type Assessments map[AssessmentKey]uint64
71+
type Assessments map[AssessmentKey]float64
6672

6773
// NewAssessments creates a new assessment collection.
6874
func NewAssessments() Assessments {
69-
return map[AssessmentKey]uint64{}
75+
return map[AssessmentKey]float64{}
7076
}
7177

7278
// Add adds the given assessment collection to the current one.
@@ -98,7 +104,7 @@ func (a Assessments) Award(key AssessmentKey) {
98104

99105
// AwardMultiple yields multiple score points.
100106
func (a Assessments) AwardMultiple(key AssessmentKey, count uint64) {
101-
a[key] += count
107+
a[key] += float64(count)
102108
}
103109

104110
// String returns a string representation of the metrics.
@@ -109,7 +115,7 @@ func (a Assessments) String() string {
109115
entries := make([]string, len(AllAssessmentKeys))
110116

111117
for i, key := range AllAssessmentKeys {
112-
entries[i] = fmt.Sprintf("%s=%d", key, a[key])
118+
entries[i] = fmt.Sprintf("%s=%v", key, a[key])
113119
}
114120

115121
return strings.Join(entries, ", ")
@@ -123,7 +129,7 @@ func (a Assessments) StringCSV() (row []string) {
123129

124130
row = make([]string, len(AllAssessmentKeys))
125131
for i, key := range AllAssessmentKeys {
126-
row[i] = fmt.Sprintf("%d", a[key])
132+
row[i] = fmt.Sprintf("%v", a[key])
127133
}
128134

129135
return row

evaluate/metrics/assessment_test.go

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -37,26 +37,26 @@ func TestAssessmentsAdd(t *testing.T) {
3737
Name: "Non existing key",
3838

3939
Assessments: NewAssessments(),
40-
X: map[AssessmentKey]uint64{
40+
X: map[AssessmentKey]float64{
4141
AssessmentKeyResponseNoExcess: 1,
4242
},
4343

44-
ExpectedAssessments: map[AssessmentKey]uint64{
44+
ExpectedAssessments: map[AssessmentKey]float64{
4545
AssessmentKeyResponseNoExcess: 1,
4646
},
4747
})
4848

4949
validate(t, &testCase{
5050
Name: "Existing key",
5151

52-
Assessments: map[AssessmentKey]uint64{
52+
Assessments: map[AssessmentKey]float64{
5353
AssessmentKeyResponseNoExcess: 1,
5454
},
55-
X: map[AssessmentKey]uint64{
55+
X: map[AssessmentKey]float64{
5656
AssessmentKeyResponseNoExcess: 1,
5757
},
5858

59-
ExpectedAssessments: map[AssessmentKey]uint64{
59+
ExpectedAssessments: map[AssessmentKey]float64{
6060
AssessmentKeyResponseNoExcess: 2,
6161
},
6262
})
@@ -84,7 +84,7 @@ func TestAssessmentString(t *testing.T) {
8484

8585
Assessment: NewAssessments(),
8686

87-
ExpectedString: "coverage=0, files-executed=0, files-executed-maximum-reachable=0, generate-tests-for-file-character-count=0, processing-time=0, response-character-count=0, response-no-error=0, response-no-excess=0, response-with-code=0, tests-passing=0, token-input=0, token-output=0",
87+
ExpectedString: "costs-total-actual=0, coverage=0, files-executed=0, files-executed-maximum-reachable=0, generate-tests-for-file-character-count=0, native-token-input=0, native-token-output=0, processing-time=0, response-character-count=0, response-no-error=0, response-no-excess=0, response-with-code=0, tests-passing=0, token-input=0, token-output=0",
8888
})
8989

9090
validate(t, &testCase{
@@ -105,7 +105,7 @@ func TestAssessmentString(t *testing.T) {
105105
AssessmentKeyTokenOutput: 456,
106106
},
107107

108-
ExpectedString: "coverage=1, files-executed=2, files-executed-maximum-reachable=2, generate-tests-for-file-character-count=50, processing-time=200, response-character-count=100, response-no-error=3, response-no-excess=4, response-with-code=5, tests-passing=7, token-input=123, token-output=456",
108+
ExpectedString: "costs-total-actual=0, coverage=1, files-executed=2, files-executed-maximum-reachable=2, generate-tests-for-file-character-count=50, native-token-input=0, native-token-output=0, processing-time=200, response-character-count=100, response-no-error=3, response-no-excess=4, response-with-code=5, tests-passing=7, token-input=123, token-output=456",
109109
})
110110
}
111111

@@ -209,7 +209,7 @@ func TestCombineModelAndSymflowerFixAssessments(t *testing.T) {
209209

210210
ModelAssessment: Assessments{
211211
AssessmentKeyFilesExecuted: 1,
212-
AssessmentKeyProcessingTime: uint64(200),
212+
AssessmentKeyProcessingTime: float64(200),
213213
AssessmentKeyCoverage: 0,
214214
AssessmentKeyResponseCharacterCount: 100,
215215
AssessmentKeyGenerateTestsForFileCharacterCount: 50,
@@ -219,15 +219,15 @@ func TestCombineModelAndSymflowerFixAssessments(t *testing.T) {
219219
},
220220
SymflowerFixAssessments: Assessments{
221221
AssessmentKeyFilesExecuted: 1,
222-
AssessmentKeyProcessingTime: uint64(100),
222+
AssessmentKeyProcessingTime: float64(100),
223223
AssessmentKeyCoverage: 1,
224224
AssessmentKeyResponseNoError: 1,
225225
AssessmentKeyTestsPassing: 10,
226226
},
227227

228228
ExpectedAssessments: Assessments{
229229
AssessmentKeyFilesExecuted: 1,
230-
AssessmentKeyProcessingTime: uint64(300),
230+
AssessmentKeyProcessingTime: float64(300),
231231
AssessmentKeyCoverage: 1,
232232
AssessmentKeyResponseCharacterCount: 100,
233233
AssessmentKeyGenerateTestsForFileCharacterCount: 50,

evaluate/report/csv_test.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ func TestNewEvaluationFile(t *testing.T) {
2424
require.NoError(t, err)
2525

2626
expectedEvaluationFileContent := bytesutil.StringTrimIndentations(`
27-
model-id,language,repository,case,task,run,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code,tests-passing,token-input,token-output
27+
model-id,language,repository,case,task,run,costs-total-actual,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,native-token-input,native-token-output,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code,tests-passing,token-input,token-output
2828
`)
2929

3030
assert.Equal(t, expectedEvaluationFileContent, string(actualEvaluationFileContent))
@@ -65,8 +65,8 @@ func TestWriteEvaluationRecord(t *testing.T) {
6565
},
6666

6767
ExpectedCSV: `
68-
model-id,language,repository,case,task,run,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code,tests-passing,token-input,token-output
69-
mocked-model,golang,golang/plain,plain.go,write-tests,1,0,0,0,0,0,0,0,0,0,0,0,0
68+
model-id,language,repository,case,task,run,costs-total-actual,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,native-token-input,native-token-output,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code,tests-passing,token-input,token-output
69+
mocked-model,golang,golang/plain,plain.go,write-tests,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7070
`,
7171
})
7272
validate(t, &testCase{
@@ -90,9 +90,9 @@ func TestWriteEvaluationRecord(t *testing.T) {
9090
},
9191

9292
ExpectedCSV: `
93-
model-id,language,repository,case,task,run,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code,tests-passing,token-input,token-output
94-
mocked-model,golang,golang/plain,plain.go,write-tests,1,0,1,1,0,0,0,1,0,0,0,0,0
95-
mocked-model,golang,golang/plain,plain.go,write-tests-symflower-fix,1,10,1,1,0,0,0,1,0,0,0,0,0
93+
model-id,language,repository,case,task,run,costs-total-actual,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,native-token-input,native-token-output,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code,tests-passing,token-input,token-output
94+
mocked-model,golang,golang/plain,plain.go,write-tests,1,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0
95+
mocked-model,golang,golang/plain,plain.go,write-tests-symflower-fix,1,0,10,1,1,0,0,0,0,0,1,0,0,0,0,0
9696
`,
9797
})
9898
}

evaluate/report/testing/csv.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,11 @@ import (
1111
"github.com/symflower/eval-dev-quality/task"
1212
)
1313

14-
func atoiUint64(t *testing.T, s string) uint64 {
15-
value, err := strconv.ParseUint(s, 10, 64)
14+
func parseFloat64(t *testing.T, s string) float64 {
15+
value, err := strconv.ParseFloat(s, 64)
1616
assert.NoErrorf(t, err, "parsing unsigned integer from: %q", s)
1717

18-
return uint64(value)
18+
return value
1919
}
2020

2121
// ParseMetrics extracts multiple assessment metrics from the given string.
@@ -37,7 +37,7 @@ func ParseMetrics(t *testing.T, data string) (assessments metricstesting.Assessm
3737
Assessment: metrics.Assessments{},
3838
}
3939
for i, key := range metrics.AllAssessmentKeys {
40-
tuple.Assessment[key] = atoiUint64(t, cells[i+6])
40+
tuple.Assessment[key] = parseFloat64(t, cells[i+6])
4141
}
4242

4343
assessments = append(assessments, tuple)

evaluate/task/symflower.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ func runModelAndSymflowerFix(ctx evaltask.Context, modelCtx model.Context, runMo
109109

110110
// Symflower was able to fix a failure so now update the assessment with the improved results.
111111
withSymflowerFix := metrics.NewAssessments()
112-
withSymflowerFix[metrics.AssessmentKeyProcessingTime] = processingTime
112+
withSymflowerFix[metrics.AssessmentKeyProcessingTime] = float64(processingTime)
113113
withSymflowerFix.Award(metrics.AssessmentKeyFilesExecuted)
114114
withSymflowerFix.AwardMultiple(metrics.AssessmentKeyCoverage, withSymflowerFixTestResult.Coverage)
115115

evaluate/task/transpile.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,8 @@ func (t *Transpile) Run(ctx evaltask.Context) (repositoryAssessment map[string]m
6969
modelAssessments := metrics.NewAssessments()
7070
withSymflowerAssessments := metrics.NewAssessments()
7171
maximumReachableFiles := uint64(len(language.Languages) - 1) // Transpile repositories contain sub-tasks to transpile from every other supported language minus the one we are transpiling to.
72-
modelAssessments[metrics.AssessmentKeyFilesExecutedMaximumReachable] = maximumReachableFiles
73-
withSymflowerAssessments[metrics.AssessmentKeyFilesExecutedMaximumReachable] = maximumReachableFiles
72+
modelAssessments[metrics.AssessmentKeyFilesExecutedMaximumReachable] = float64(maximumReachableFiles)
73+
withSymflowerAssessments[metrics.AssessmentKeyFilesExecutedMaximumReachable] = float64(maximumReachableFiles)
7474
repositoryAssessment[packagePath] = map[evaltask.Identifier]metrics.Assessments{
7575
IdentifierTranspile: modelAssessments,
7676
IdentifierTranspileSymflowerFix: withSymflowerAssessments,
@@ -131,7 +131,7 @@ func (t *Transpile) Run(ctx evaltask.Context) (repositoryAssessment map[string]m
131131

132132
// Symflower was able to fix a failure so now update the assessment with the improved results.
133133
withSymflowerFixAssessments := metrics.NewAssessments()
134-
withSymflowerFixAssessments[metrics.AssessmentKeyProcessingTime] = processingTime
134+
withSymflowerFixAssessments[metrics.AssessmentKeyProcessingTime] = float64(processingTime)
135135
withSymflowerFixAssessments.Award(metrics.AssessmentKeyFilesExecuted)
136136
withSymflowerFixAssessments.AwardMultiple(metrics.AssessmentKeyTestsPassing, uint64(testsPassing))
137137

model/llm/llm.go

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -339,7 +339,11 @@ func (m *Model) query(logger *log.Logger, request string) (queryResult *provider
339339
return err
340340
}
341341
duration = time.Since(start)
342-
logger.Info("model responded", "model", m.ID(), "id", id, "duration", duration.Milliseconds(), "response-id", queryResult.ResponseID, "token-input", queryResult.Usage.PromptTokens, "token-output", queryResult.Usage.CompletionTokens, "response", string(bytesutil.PrefixLines([]byte(queryResult.Message), []byte("\t"))))
342+
totalCosts := float64(-1)
343+
if queryResult.GenerationInfo != nil {
344+
totalCosts = queryResult.GenerationInfo.TotalCost
345+
}
346+
logger.Info("model responded", "model", m.ID(), "id", id, "duration", duration.Milliseconds(), "response-id", queryResult.ResponseID, "costs-total", totalCosts, "token-input", queryResult.Usage.PromptTokens, "token-output", queryResult.Usage.CompletionTokens, "response", string(bytesutil.PrefixLines([]byte(queryResult.Message), []byte("\t"))))
343347

344348
return nil
345349
},
@@ -491,11 +495,16 @@ func handleQueryResult(queryResult *provider.QueryResult, filePathAbsolute strin
491495
if err != nil {
492496
return nil, pkgerrors.WithStack(err)
493497
}
494-
assessment[metrics.AssessmentKeyProcessingTime] = uint64(queryResult.Duration.Milliseconds())
495-
assessment[metrics.AssessmentKeyResponseCharacterCount] = uint64(len(queryResult.Message))
496-
assessment[metrics.AssessmentKeyGenerateTestsForFileCharacterCount] = uint64(len(sourceFileContent))
497-
assessment[metrics.AssessmentKeyTokenInput] = uint64(queryResult.Usage.PromptTokens)
498-
assessment[metrics.AssessmentKeyTokenOutput] = uint64(queryResult.Usage.CompletionTokens)
498+
assessment[metrics.AssessmentKeyProcessingTime] = float64(queryResult.Duration.Milliseconds())
499+
assessment[metrics.AssessmentKeyResponseCharacterCount] = float64(len(queryResult.Message))
500+
assessment[metrics.AssessmentKeyGenerateTestsForFileCharacterCount] = float64(len(sourceFileContent))
501+
assessment[metrics.AssessmentKeyTokenInput] = float64(queryResult.Usage.PromptTokens)
502+
assessment[metrics.AssessmentKeyTokenOutput] = float64(queryResult.Usage.CompletionTokens)
503+
if queryResult.GenerationInfo != nil {
504+
assessment[metrics.AssessmentKeyNativeTokenInput] = float64(queryResult.GenerationInfo.NativeTokensPrompt)
505+
assessment[metrics.AssessmentKeyNativeTokenOutput] = float64(queryResult.GenerationInfo.NativeTokensCompletion)
506+
assessment[metrics.AssessmentKeyCostsTokenActual] = queryResult.GenerationInfo.TotalCost
507+
}
499508

500509
if err := os.MkdirAll(filepath.Dir(filePathAbsolute), 0755); err != nil {
501510
return nil, pkgerrors.WithStack(err)

model/symflower/symflower.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ func (m *Model) WriteTests(ctx model.Context) (assessment metrics.Assessments, e
121121
return nil, pkgerrors.WithStack(err)
122122
}
123123

124-
processingTime := uint64(time.Since(start).Milliseconds())
124+
processingTime := float64(time.Since(start).Milliseconds())
125125

126126
characterCount, err := countCharactersOfGeneratedFiles(ctx.RepositoryPath, extractGeneratedFilePaths(output))
127127
if err != nil {
@@ -130,8 +130,8 @@ func (m *Model) WriteTests(ctx model.Context) (assessment metrics.Assessments, e
130130

131131
return metrics.Assessments{ // Symflower always generates just source code when it does not fail, so no need to check the assessment properties.
132132
metrics.AssessmentKeyProcessingTime: processingTime,
133-
metrics.AssessmentKeyGenerateTestsForFileCharacterCount: characterCount,
134-
metrics.AssessmentKeyResponseCharacterCount: characterCount,
133+
metrics.AssessmentKeyGenerateTestsForFileCharacterCount: float64(characterCount),
134+
metrics.AssessmentKeyResponseCharacterCount: float64(characterCount),
135135
metrics.AssessmentKeyResponseNoExcess: 1,
136136
metrics.AssessmentKeyResponseWithCode: 1,
137137
}, nil

0 commit comments

Comments
 (0)