Skip to content

Commit 26bf756

Browse files
bauersimon authored and zimmski committed
Report CSV data per case instead of per repository for more granular information
Closes #390
1 parent 2fc125f commit 26bf756

File tree

18 files changed

+774
-464
lines changed

18 files changed

+774
-464
lines changed

cmd/eval-dev-quality/cmd/evaluate_test.go

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -597,12 +597,12 @@ func TestEvaluateExecute(t *testing.T) {
597597
filepath.Join("result-directory", "config.json"): nil,
598598
filepath.Join("result-directory", "evaluation.csv"): func(t *testing.T, filePath, data string) {
599599
// Check if the runs are written to the CSV file.
600-
assert.Contains(t, data, "golang,"+filepath.Join("golang", "plain")+",write-tests,1")
601-
assert.Contains(t, data, "golang,"+filepath.Join("golang", "plain")+",write-tests,2")
602-
assert.Contains(t, data, "golang,"+filepath.Join("golang", "plain")+",write-tests,3")
603-
assert.Contains(t, data, "golang,"+filepath.Join("golang", "plain")+",write-tests-symflower-fix,1")
604-
assert.Contains(t, data, "golang,"+filepath.Join("golang", "plain")+",write-tests-symflower-fix,2")
605-
assert.Contains(t, data, "golang,"+filepath.Join("golang", "plain")+",write-tests-symflower-fix,3")
600+
assert.Contains(t, data, "golang,"+filepath.Join("golang", "plain")+",plain.go,write-tests,1")
601+
assert.Contains(t, data, "golang,"+filepath.Join("golang", "plain")+",plain.go,write-tests,2")
602+
assert.Contains(t, data, "golang,"+filepath.Join("golang", "plain")+",plain.go,write-tests,3")
603+
assert.Contains(t, data, "golang,"+filepath.Join("golang", "plain")+",plain.go,write-tests-symflower-fix,1")
604+
assert.Contains(t, data, "golang,"+filepath.Join("golang", "plain")+",plain.go,write-tests-symflower-fix,2")
605+
assert.Contains(t, data, "golang,"+filepath.Join("golang", "plain")+",plain.go,write-tests-symflower-fix,3")
606606

607607
_ = validateMetrics(t, data, []metrics.Assessments{
608608
metrics.Assessments{

evaluate/evaluate.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
"github.com/symflower/eval-dev-quality/provider"
1414
evaltask "github.com/symflower/eval-dev-quality/task"
1515
"github.com/symflower/eval-dev-quality/util"
16+
"golang.org/x/exp/maps"
1617
)
1718

1819
// Context holds an evaluation context.
@@ -322,7 +323,12 @@ func withLoadedModel(logger *log.Logger, model evalmodel.Model, modelProvider pr
322323
}
323324

324325
// succeededPlain checks if the assessments attest that the "plain" repository was successfully solved.
325-
func succeededPlain(assessment map[evaltask.Identifier]metrics.Assessments) bool {
326+
func succeededPlain(assessmentPerCase map[string]map[evaltask.Identifier]metrics.Assessments) bool {
327+
if len(assessmentPerCase) != 1 { // The "plain" repository only has one case.
328+
return false
329+
}
330+
assessment := assessmentPerCase[maps.Keys(assessmentPerCase)[0]]
331+
326332
if withoutTemplate, ok := assessment[evaluatetask.IdentifierWriteTests]; ok && withoutTemplate[metrics.AssessmentKeyFilesExecuted] > 0 {
327333
return true
328334
} else if withTemplate, ok := assessment[evaluatetask.IdentifierWriteTestsSymflowerTemplate]; ok && withTemplate[metrics.AssessmentKeyFilesExecuted] > 0 {

evaluate/evaluate_test.go

Lines changed: 116 additions & 0 deletions
Large diffs are not rendered by default.

evaluate/metrics/testing/assessments.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ type AssessmentTuple struct {
5555
Model string
5656
Language string
5757
RepositoryPath string
58+
Case string
5859
Task task.Identifier
5960
Assessment metrics.Assessments
6061
}

evaluate/report/csv.go

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"encoding/csv"
66
"io"
77
"slices"
8+
"sort"
89
"strconv"
910

1011
pkgerrors "github.com/pkg/errors"
@@ -39,17 +40,24 @@ func NewEvaluationFile(writer io.Writer) (evaluationFile *EvaluationFile, err er
3940
}
4041

4142
// WriteEvaluationRecord writes the assessments of a task into the evaluation CSV.
42-
func (e *EvaluationFile) WriteEvaluationRecord(model model.Model, language language.Language, repositoryName string, run uint, assessmentsPerTask map[task.Identifier]metrics.Assessments) (err error) {
43-
tasks := maps.Keys(assessmentsPerTask)
44-
slices.SortStableFunc(tasks, func(a, b task.Identifier) int {
45-
return cmp.Compare(a, b)
46-
})
47-
43+
func (e *EvaluationFile) WriteEvaluationRecord(model model.Model, language language.Language, repositoryName string, run uint, assessmentsPerCasePerTask map[string]map[task.Identifier]metrics.Assessments) (err error) {
4844
allRecords := [][]string{}
49-
for _, task := range tasks {
50-
assessment := assessmentsPerTask[task]
51-
row := append([]string{model.ID(), language.ID(), repositoryName, string(task), strconv.FormatUint(uint64(run), 10)}, assessment.StringCSV()...)
52-
allRecords = append(allRecords, row)
45+
46+
cases := maps.Keys(assessmentsPerCasePerTask)
47+
sort.Strings(cases)
48+
for _, caseName := range cases {
49+
assessmentsPerTask := assessmentsPerCasePerTask[caseName]
50+
51+
tasks := maps.Keys(assessmentsPerTask)
52+
slices.SortStableFunc(tasks, func(a, b task.Identifier) int {
53+
return cmp.Compare(a, b)
54+
})
55+
56+
for _, task := range tasks {
57+
assessment := assessmentsPerTask[task]
58+
row := append([]string{model.ID(), language.ID(), repositoryName, caseName, string(task), strconv.FormatUint(uint64(run), 10)}, assessment.StringCSV()...)
59+
allRecords = append(allRecords, row)
60+
}
5361
}
5462

5563
return e.WriteLines(allRecords)
@@ -72,5 +80,5 @@ func (e *EvaluationFile) WriteLines(records [][]string) (err error) {
7280

7381
// EvaluationHeader returns the CSV header for the evaluation CSV.
7482
func EvaluationHeader() (header []string) {
75-
return append([]string{"model-id", "language", "repository", "task", "run"}, metrics.AllAssessmentKeysStrings...)
83+
return append([]string{"model-id", "language", "repository", "case", "task", "run"}, metrics.AllAssessmentKeysStrings...)
7684
}

evaluate/report/csv_test.go

Lines changed: 25 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ func TestNewEvaluationFile(t *testing.T) {
2424
require.NoError(t, err)
2525

2626
expectedEvaluationFileContent := bytesutil.StringTrimIndentations(`
27-
model-id,language,repository,task,run,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code,tests-passing
27+
model-id,language,repository,case,task,run,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code,tests-passing
2828
`)
2929

3030
assert.Equal(t, expectedEvaluationFileContent, string(actualEvaluationFileContent))
@@ -34,7 +34,7 @@ func TestWriteEvaluationRecord(t *testing.T) {
3434
type testCase struct {
3535
Name string
3636

37-
Assessments map[task.Identifier]metrics.Assessments
37+
Assessments map[string]map[task.Identifier]metrics.Assessments
3838

3939
ExpectedCSV string
4040
}
@@ -58,37 +58,41 @@ func TestWriteEvaluationRecord(t *testing.T) {
5858
validate(t, &testCase{
5959
Name: "Single task with empty assessments",
6060

61-
Assessments: map[task.Identifier]metrics.Assessments{
62-
evaluatetask.IdentifierWriteTests: metrics.NewAssessments(),
61+
Assessments: map[string]map[task.Identifier]metrics.Assessments{
62+
"plain.go": {
63+
evaluatetask.IdentifierWriteTests: metrics.NewAssessments(),
64+
},
6365
},
6466

6567
ExpectedCSV: `
66-
model-id,language,repository,task,run,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code,tests-passing
67-
mocked-model,golang,golang/plain,write-tests,1,0,0,0,0,0,0,0,0,0,0
68+
model-id,language,repository,case,task,run,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code,tests-passing
69+
mocked-model,golang,golang/plain,plain.go,write-tests,1,0,0,0,0,0,0,0,0,0,0
6870
`,
6971
})
7072
validate(t, &testCase{
7173
Name: "Multiple tasks with assessments",
7274

73-
Assessments: map[task.Identifier]metrics.Assessments{
74-
evaluatetask.IdentifierWriteTests: metrics.Assessments{
75-
metrics.AssessmentKeyFilesExecuted: 1,
76-
metrics.AssessmentKeyFilesExecutedMaximumReachable: 1,
77-
metrics.AssessmentKeyResponseNoError: 1,
78-
metrics.AssessmentKeyCoverage: 0,
79-
},
80-
evaluatetask.IdentifierWriteTestsSymflowerFix: metrics.Assessments{
81-
metrics.AssessmentKeyFilesExecuted: 1,
82-
metrics.AssessmentKeyFilesExecutedMaximumReachable: 1,
83-
metrics.AssessmentKeyResponseNoError: 1,
84-
metrics.AssessmentKeyCoverage: 10,
75+
Assessments: map[string]map[task.Identifier]metrics.Assessments{
76+
"plain.go": {
77+
evaluatetask.IdentifierWriteTests: metrics.Assessments{
78+
metrics.AssessmentKeyFilesExecuted: 1,
79+
metrics.AssessmentKeyFilesExecutedMaximumReachable: 1,
80+
metrics.AssessmentKeyResponseNoError: 1,
81+
metrics.AssessmentKeyCoverage: 0,
82+
},
83+
evaluatetask.IdentifierWriteTestsSymflowerFix: metrics.Assessments{
84+
metrics.AssessmentKeyFilesExecuted: 1,
85+
metrics.AssessmentKeyFilesExecutedMaximumReachable: 1,
86+
metrics.AssessmentKeyResponseNoError: 1,
87+
metrics.AssessmentKeyCoverage: 10,
88+
},
8589
},
8690
},
8791

8892
ExpectedCSV: `
89-
model-id,language,repository,task,run,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code,tests-passing
90-
mocked-model,golang,golang/plain,write-tests,1,0,1,1,0,0,0,1,0,0,0
91-
mocked-model,golang,golang/plain,write-tests-symflower-fix,1,10,1,1,0,0,0,1,0,0,0
93+
model-id,language,repository,case,task,run,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code,tests-passing
94+
mocked-model,golang,golang/plain,plain.go,write-tests,1,0,1,1,0,0,0,1,0,0,0
95+
mocked-model,golang,golang/plain,plain.go,write-tests-symflower-fix,1,10,1,1,0,0,0,1,0,0,0
9296
`,
9397
})
9498
}

evaluate/report/testing/csv.go

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ func atoiUint64(t *testing.T, s string) uint64 {
1919
}
2020

2121
// extractMetricsCSVMatch is a regular expression to extract metrics from CSV rows.
22-
var extractMetricsCSVMatch = regexp.MustCompile(`(\S+),(\S+),(\S+),(\S+),\d+,(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+)`)
22+
var extractMetricsCSVMatch = regexp.MustCompile(`(\S+),(\S+),(\S+),(\S+),(\S+),\d+,(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+)`)
2323

2424
// ParseMetrics extracts multiple assessment metrics from the given string.
2525
func ParseMetrics(t *testing.T, data string) (assessments metricstesting.AssessmentTuples) {
@@ -30,17 +30,18 @@ func ParseMetrics(t *testing.T, data string) (assessments metricstesting.Assessm
3030
Model: match[1],
3131
Language: match[2],
3232
RepositoryPath: match[3],
33-
Task: task.Identifier(match[4]),
33+
Case: match[4],
34+
Task: task.Identifier(match[5]),
3435
Assessment: metrics.Assessments{
35-
metrics.AssessmentKeyCoverage: atoiUint64(t, match[5]),
36-
metrics.AssessmentKeyFilesExecuted: atoiUint64(t, match[6]),
37-
metrics.AssessmentKeyFilesExecutedMaximumReachable: atoiUint64(t, match[7]),
38-
metrics.AssessmentKeyGenerateTestsForFileCharacterCount: atoiUint64(t, match[8]),
39-
metrics.AssessmentKeyProcessingTime: atoiUint64(t, match[9]),
40-
metrics.AssessmentKeyResponseCharacterCount: atoiUint64(t, match[10]),
41-
metrics.AssessmentKeyResponseNoError: atoiUint64(t, match[11]),
42-
metrics.AssessmentKeyResponseNoExcess: atoiUint64(t, match[12]),
43-
metrics.AssessmentKeyResponseWithCode: atoiUint64(t, match[13]),
36+
metrics.AssessmentKeyCoverage: atoiUint64(t, match[6]),
37+
metrics.AssessmentKeyFilesExecuted: atoiUint64(t, match[7]),
38+
metrics.AssessmentKeyFilesExecutedMaximumReachable: atoiUint64(t, match[8]),
39+
metrics.AssessmentKeyGenerateTestsForFileCharacterCount: atoiUint64(t, match[9]),
40+
metrics.AssessmentKeyProcessingTime: atoiUint64(t, match[10]),
41+
metrics.AssessmentKeyResponseCharacterCount: atoiUint64(t, match[11]),
42+
metrics.AssessmentKeyResponseNoError: atoiUint64(t, match[12]),
43+
metrics.AssessmentKeyResponseNoExcess: atoiUint64(t, match[13]),
44+
metrics.AssessmentKeyResponseWithCode: atoiUint64(t, match[14]),
4445
},
4546
})
4647
}

evaluate/task/code-repair.go

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ func (t *CodeRepair) Identifier() evaltask.Identifier {
3333

3434
// Run performs source code repairing in a repository with compilation errors.
3535
// This task requires the repository to consist of multiple packages, with each containing one faulty implementation file and a corresponding test file.
36-
func (t *CodeRepair) Run(ctx evaltask.Context) (repositoryAssessment map[evaltask.Identifier]metrics.Assessments, problems []error, err error) {
36+
func (t *CodeRepair) Run(ctx evaltask.Context) (repositoryAssessment map[string]map[evaltask.Identifier]metrics.Assessments, problems []error, err error) {
3737
modelCapability, ok := ctx.Model.(model.CapabilityRepairCode)
3838
if !ok {
3939
return nil, nil, pkgerrors.Wrap(evaltask.ErrTaskUnsupportedByModel, fmt.Sprintf("%q does not support %q", ctx.Model.ID(), string(t.Identifier())))
@@ -54,26 +54,31 @@ func (t *CodeRepair) Run(ctx evaltask.Context) (repositoryAssessment map[evaltas
5454
}
5555
for _, file := range files {
5656
if file.IsDir() && !strings.HasPrefix(file.Name(), ".") { // Ignore hidden directories.
57-
packagePaths = append(packagePaths, filepath.Join(ctx.Repository.DataPath(), file.Name()))
57+
packagePaths = append(packagePaths, file.Name())
5858
}
5959
}
6060

61-
modelAssessment := metrics.NewAssessments()
62-
modelAssessment[metrics.AssessmentKeyFilesExecutedMaximumReachable] = uint64(len(packagePaths))
61+
repositoryAssessment = map[string]map[evaltask.Identifier]metrics.Assessments{}
6362
for _, packagePath := range packagePaths {
63+
modelAssessment := metrics.NewAssessments()
64+
modelAssessment[metrics.AssessmentKeyFilesExecutedMaximumReachable] = 1
65+
repositoryAssessment[packagePath] = map[evaltask.Identifier]metrics.Assessments{
66+
IdentifierCodeRepair: modelAssessment,
67+
}
68+
6469
if err := ctx.Repository.Reset(ctx.Logger); err != nil {
6570
ctx.Logger.Panicf("ERROR: unable to reset temporary repository path: %s", err)
6671
}
6772

68-
sourceFile, mistakes, err := t.unpackCodeRepairPackage(ctx, taskLogger.Logger, packagePath)
73+
sourceFile, mistakes, err := t.unpackCodeRepairPackage(ctx, taskLogger.Logger, filepath.Join(ctx.Repository.DataPath(), packagePath))
6974
if err != nil {
7075
return nil, nil, err
7176
}
7277

7378
modelContext := model.Context{
7479
Language: ctx.Language,
7580

76-
RepositoryPath: packagePath,
81+
RepositoryPath: filepath.Join(ctx.Repository.DataPath(), packagePath),
7782
FilePath: sourceFile,
7883

7984
Arguments: &ArgumentsCodeRepair{
@@ -94,7 +99,7 @@ func (t *CodeRepair) Run(ctx evaltask.Context) (repositoryAssessment map[evaltas
9499
modelAssessment.Add(assessments)
95100
modelAssessment.Award(metrics.AssessmentKeyResponseNoError)
96101

97-
testResult, ps, err := ctx.Language.ExecuteTests(taskLogger.Logger, packagePath)
102+
testResult, ps, err := ctx.Language.ExecuteTests(taskLogger.Logger, filepath.Join(ctx.Repository.DataPath(), packagePath))
98103
problems = append(problems, ps...)
99104
if err != nil {
100105
problems = append(problems, pkgerrors.WithMessage(err, sourceFile))
@@ -107,10 +112,6 @@ func (t *CodeRepair) Run(ctx evaltask.Context) (repositoryAssessment map[evaltas
107112
modelAssessment.AwardMultiple(metrics.AssessmentKeyTestsPassing, uint64(testsPassing))
108113
}
109114

110-
repositoryAssessment = map[evaltask.Identifier]metrics.Assessments{
111-
IdentifierCodeRepair: modelAssessment,
112-
}
113-
114115
return repositoryAssessment, problems, nil
115116
}
116117

0 commit comments

Comments
 (0)