Skip to content

Commit c62ea12

Browse files
ahumenbergerbauersimon
authored andcommitted
fix, Collect assessments if a model responds with an empty message
Closes #427
1 parent f17be9f commit c62ea12

File tree

5 files changed

+101
-19
lines changed

5 files changed

+101
-19
lines changed

evaluate/evaluate_test.go

Lines changed: 58 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -161,15 +161,39 @@ func TestEvaluate(t *testing.T) {
161161

162162
{
163163
languageGolang := &golang.Language{}
164-
mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, "empty-response-model")
164+
mockedModelID := "testing-provider/empty-response-model"
165+
mockedQuery := providertesting.NewMockQuery(t)
166+
mockedModel := llm.NewModel(mockedQuery, mockedModelID)
165167
repositoryPath := filepath.Join("golang", "plain")
166168

167169
validate(t, &testCase{
168-
Name: "Empty model responses are errors",
170+
Name: "Empty model response",
169171

170172
Before: func(t *testing.T, logger *log.Logger, resultPath string) {
173+
queryResult1 := &provider.QueryResult{
174+
Message: "",
175+
GenerationInfo: &provider.GenerationInfo{
176+
TotalCost: 0.111111111,
177+
NativeTokensPrompt: 111,
178+
NativeTokensCompletion: 222,
179+
},
180+
}
181+
// Set up mocks, when test is running.
182+
mockedQuery.On("Query", mock.Anything, mock.Anything, mock.Anything).Return(queryResult1, nil).Once().After(10 * time.Millisecond) // Simulate a model response delay because our internal safety measures trigger when a query is done in 0 milliseconds.
183+
184+
queryResult2 := &provider.QueryResult{
185+
Message: "",
186+
GenerationInfo: &provider.GenerationInfo{
187+
TotalCost: 0.222222222,
188+
NativeTokensPrompt: 333,
189+
NativeTokensCompletion: 444,
190+
},
191+
}
171192
// Set up mocks, when test is running.
172-
mockedModel.MockCapabilityWriteTests.On("WriteTests", mock.Anything).Return(nil, ErrEmptyResponseFromModel)
193+
mockedQuery.On("Query", mock.Anything, mock.Anything, mock.Anything).Return(queryResult2, nil).Once().After(10 * time.Millisecond) // Simulate a model response delay because our internal safety measures trigger when a query is done in 0 milliseconds.
194+
},
195+
After: func(t *testing.T, logger *log.Logger, resultPath string) {
196+
mockedQuery.AssertNumberOfCalls(t, "Query", 2)
173197
},
174198

175199
Context: &Context{
@@ -180,6 +204,11 @@ func TestEvaluate(t *testing.T) {
180204
Models: []evalmodel.Model{
181205
mockedModel,
182206
},
207+
QueryAttempts: 3,
208+
209+
RepositoryPaths: []string{
210+
repositoryPath,
211+
},
183212
},
184213

185214
ExpectedAssessments: []*metricstesting.AssessmentTuple{
@@ -189,8 +218,12 @@ func TestEvaluate(t *testing.T) {
189218
RepositoryPath: repositoryPath,
190219
Case: "plain.go",
191220
Task: evaluatetask.IdentifierWriteTests,
192-
Assessment: metrics.Assessments{
221+
Assessment: map[metrics.AssessmentKey]float64{
193222
metrics.AssessmentKeyFilesExecutedMaximumReachable: 1,
223+
metrics.AssessmentKeyResponseNoError: 1,
224+
metrics.AssessmentKeyCostsTokenActual: 0.111111111,
225+
metrics.AssessmentKeyNativeTokenInput: 111,
226+
metrics.AssessmentKeyNativeTokenOutput: 222,
194227
},
195228
},
196229
&metricstesting.AssessmentTuple{
@@ -199,8 +232,12 @@ func TestEvaluate(t *testing.T) {
199232
RepositoryPath: repositoryPath,
200233
Case: "plain.go",
201234
Task: evaluatetask.IdentifierWriteTestsSymflowerFix,
202-
Assessment: metrics.Assessments{
235+
Assessment: map[metrics.AssessmentKey]float64{
203236
metrics.AssessmentKeyFilesExecutedMaximumReachable: 1,
237+
metrics.AssessmentKeyResponseNoError: 1,
238+
metrics.AssessmentKeyCostsTokenActual: 0.111111111,
239+
metrics.AssessmentKeyNativeTokenInput: 111,
240+
metrics.AssessmentKeyNativeTokenOutput: 222,
204241
},
205242
},
206243
&metricstesting.AssessmentTuple{
@@ -209,8 +246,12 @@ func TestEvaluate(t *testing.T) {
209246
RepositoryPath: repositoryPath,
210247
Case: "plain.go",
211248
Task: evaluatetask.IdentifierWriteTestsSymflowerTemplate,
212-
Assessment: metrics.Assessments{
249+
Assessment: map[metrics.AssessmentKey]float64{
213250
metrics.AssessmentKeyFilesExecutedMaximumReachable: 1,
251+
metrics.AssessmentKeyResponseNoError: 1,
252+
metrics.AssessmentKeyCostsTokenActual: 0.222222222,
253+
metrics.AssessmentKeyNativeTokenInput: 333,
254+
metrics.AssessmentKeyNativeTokenOutput: 444,
214255
},
215256
},
216257
&metricstesting.AssessmentTuple{
@@ -219,15 +260,23 @@ func TestEvaluate(t *testing.T) {
219260
RepositoryPath: repositoryPath,
220261
Case: "plain.go",
221262
Task: evaluatetask.IdentifierWriteTestsSymflowerTemplateSymflowerFix,
222-
Assessment: metrics.Assessments{
263+
Assessment: map[metrics.AssessmentKey]float64{
223264
metrics.AssessmentKeyFilesExecutedMaximumReachable: 1,
265+
metrics.AssessmentKeyResponseNoError: 1,
266+
metrics.AssessmentKeyCostsTokenActual: 0.222222222,
267+
metrics.AssessmentKeyNativeTokenInput: 333,
268+
metrics.AssessmentKeyNativeTokenOutput: 444,
224269
},
225270
},
226271
},
227272
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
228273
"evaluation.log": nil,
229-
filepath.Join(string(evaluatetask.IdentifierWriteTests), mockedModel.ID(), "golang", "golang", "plain", "evaluation.log"): nil,
230-
"evaluation.csv": nil,
274+
filepath.Join(string(evaluatetask.IdentifierWriteTests), log.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain", "evaluation.log"): func(t *testing.T, filePath, data string) {
275+
assert.Equal(t, 4, strings.Count(data, "no test files found"), "number of ocurrences of \"no test files found\" not matched")
276+
},
277+
"evaluation.csv": func(t *testing.T, filePath, data string) {
278+
assert.Lenf(t, strings.Split(data, "\n"), 6, "expected 6 lines: header, 4x entries and final new line:\n%s", data)
279+
},
231280
},
232281
})
233282
}

model/llm/llm.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -506,6 +506,10 @@ func handleQueryResult(queryResult *provider.QueryResult, filePathAbsolute strin
506506
assessment[metrics.AssessmentKeyCostsTokenActual] = queryResult.GenerationInfo.TotalCost
507507
}
508508

509+
if sourceFileContent == "" {
510+
return assessment, nil
511+
}
512+
509513
if err := os.MkdirAll(filepath.Dir(filePathAbsolute), 0755); err != nil {
510514
return nil, pkgerrors.WithStack(err)
511515
}

model/llm/llm_test.go

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"path/filepath"
66
"strings"
77
"testing"
8+
"time"
89

910
"github.com/stretchr/testify/assert"
1011
"github.com/stretchr/testify/mock"
@@ -35,9 +36,10 @@ func TestModelGenerateTestsForFile(t *testing.T) {
3536
SourceFileContent string
3637
SourceFilePath string
3738

38-
ExpectedAssessment metrics.Assessments
39-
ExpectedTestFileContent string
40-
ExpectedTestFilePath string
39+
ExpectedAssessment metrics.Assessments
40+
ExpectedTestFileContent string
41+
ExpectedTestFilePath string
42+
ExpectedTestFilePathNotExists string
4143
}
4244

4345
validate := func(t *testing.T, tc *testCase) {
@@ -74,10 +76,16 @@ func TestModelGenerateTestsForFile(t *testing.T) {
7476

7577
assert.Equal(t, metricstesting.Clean(tc.ExpectedAssessment), metricstesting.Clean(actualAssessment))
7678

77-
actualTestFileContent, err := os.ReadFile(filepath.Join(temporaryPath, tc.ExpectedTestFilePath))
78-
assert.NoError(t, err)
79+
if tc.ExpectedTestFilePath != "" {
80+
actualTestFileContent, err := os.ReadFile(filepath.Join(temporaryPath, tc.ExpectedTestFilePath))
81+
assert.NoError(t, err)
82+
83+
assert.Equal(t, strings.TrimSpace(bytesutil.StringTrimIndentations(tc.ExpectedTestFileContent)), string(actualTestFileContent))
84+
}
7985

80-
assert.Equal(t, strings.TrimSpace(bytesutil.StringTrimIndentations(tc.ExpectedTestFileContent)), string(actualTestFileContent))
86+
if tc.ExpectedTestFilePathNotExists != "" {
87+
assert.NoFileExists(t, filepath.Join(temporaryPath, tc.ExpectedTestFilePathNotExists))
88+
}
8189
})
8290
}
8391

@@ -131,6 +139,29 @@ func TestModelGenerateTestsForFile(t *testing.T) {
131139
`,
132140
ExpectedTestFilePath: "simple_test.go",
133141
})
142+
validate(t, &testCase{
143+
Name: "Empty response",
144+
145+
SetupMock: func(mockedProvider *providertesting.MockQuery) {
146+
queryResult := &provider.QueryResult{
147+
Duration: time.Millisecond * 123,
148+
GenerationInfo: &provider.GenerationInfo{
149+
TotalCost: 0.123456789,
150+
},
151+
}
152+
mockedProvider.On("Query", mock.Anything, mock.Anything, promptMessage).Return(queryResult, nil)
153+
},
154+
155+
Language: &golang.Language{},
156+
ModelID: "model-id",
157+
SourceFileContent: sourceFileContent,
158+
SourceFilePath: sourceFilePath,
159+
160+
ExpectedAssessment: metrics.Assessments{
161+
metrics.AssessmentKeyCostsTokenActual: 0.123456789,
162+
},
163+
ExpectedTestFilePathNotExists: "simple_test.go",
164+
})
134165
}
135166

136167
func TestModelRepairSourceCodeFile(t *testing.T) {

model/llm/prompt/parse.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ import (
44
"regexp"
55
"strings"
66

7-
pkgerrors "github.com/pkg/errors"
87
"github.com/zimmski/osutil/bytesutil"
98

109
"github.com/symflower/eval-dev-quality/evaluate/metrics"
@@ -21,7 +20,7 @@ func ParseResponse(response string) (assessment metrics.Assessments, code string
2120

2221
// Check for empty responses.
2322
if strings.TrimSpace(response) == "" {
24-
return assessment, "", pkgerrors.New("empty response from model")
23+
return assessment, "", nil
2524
}
2625

2726
// Some models produce duplicated code tags, so unify them if needed.

model/llm/prompt/parse_test.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,13 +73,12 @@ func TestParseResponse(t *testing.T) {
7373
})
7474

7575
validate(t, &testCase{
76-
Name: "Expected error on empty response",
76+
Name: "No error on empty response",
7777

7878
Response: "",
7979

8080
ExpectedAssessment: metrics.Assessments{},
8181
ExpectedCode: "",
82-
ExpectedError: true,
8382
})
8483

8584
t.Run("Formatted Code", func(t *testing.T) {

0 commit comments

Comments
 (0)