Skip to content

Commit f99c625

Browse files
authored
Implement BLEU score evaluation for NLP tests (#6537)
* Implement BLEU score evaluation for NLP tests * Fix style warnings * Support multiple references for a single evaluator * Make some suggested updates. * More review updates * Feedback updates. * Update READMEs * Make word tokenizer internal. * Feedback updates. * More tweaks based on feedback * Remove version from NLP library
1 parent 0652a04 commit f99c625

File tree

27 files changed

+1642
-0
lines changed

27 files changed

+1642
-0
lines changed

eng/MSBuild/LegacySupport.props

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,4 +74,8 @@
7474
<ItemGroup Condition="'$(InjectPlatformAttributesOnLegacy)' == 'true' AND ('$(TargetFramework)' == 'net462' or '$(TargetFramework)' == 'netstandard2.0' or '$(TargetFramework)' == 'netcoreapp3.1')">
7575
<Compile Include="$(MSBuildThisFileDirectory)\..\..\src\LegacySupport\PlatformAttributes\*.cs" LinkBase="LegacySupport\PlatformAttributes" />
7676
</ItemGroup>
77+
78+
<ItemGroup Condition="'$(InjectCollectionBuilderAttributesOnLegacy)' == 'true' AND ('$(TargetFramework)' == 'net462' or '$(TargetFramework)' == 'netstandard2.0' or '$(TargetFramework)' == 'netcoreapp3.1')">
79+
<Compile Include="$(MSBuildThisFileDirectory)\..\..\src\LegacySupport\CollectionBuilder\*.cs" LinkBase="LegacySupport\CollectionBuilder" />
80+
</ItemGroup>
7781
</Project>
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
4+
namespace System.Runtime.CompilerServices;
5+
6+
/// <summary>
/// Source-injected stand-in for the framework's <c>CollectionBuilderAttribute</c>, compiled into projects that
/// target frameworks where the attribute is not available.
/// </summary>
/// <remarks>
/// The attribute usage mirrors the framework declaration, including <c>Inherited = false</c>, so that
/// reflection over derived types behaves the same whether the real attribute or this copy is in use.
/// </remarks>
[AttributeUsage(AttributeTargets.Class | AttributeTargets.Struct | AttributeTargets.Interface, Inherited = false)]
internal sealed class CollectionBuilderAttribute : Attribute
{
    /// <summary>
    /// Initializes a new instance of the <see cref="CollectionBuilderAttribute"/> class.
    /// </summary>
    /// <param name="builderType">The type that declares the builder method.</param>
    /// <param name="methodName">The name of the builder method on <paramref name="builderType"/>.</param>
    public CollectionBuilderAttribute(Type builderType, string methodName)
    {
        BuilderType = builderType;
        MethodName = methodName;
    }

    /// <summary>Gets the type that declares the builder method.</summary>
    public Type BuilderType { get; }

    /// <summary>Gets the name of the builder method.</summary>
    public string MethodName { get; }
}
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
To use this source in your project, add the following to your `.csproj` file:
2+
3+
```xml
4+
<PropertyGroup>
5+
<InjectCollectionBuilderAttributesOnLegacy>true</InjectCollectionBuilderAttributesOnLegacy>
6+
</PropertyGroup>
7+
```

src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
* [`Microsoft.Extensions.AI.Evaluation`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation) - Defines core abstractions and types for supporting evaluation.
66
* [`Microsoft.Extensions.AI.Evaluation.Quality`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Quality) - Contains evaluators that can be used to evaluate the quality of AI responses in your projects including Relevance, Truth, Completeness, Fluency, Coherence, Retrieval, Equivalence and Groundedness.
77
* [`Microsoft.Extensions.AI.Evaluation.Safety`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Safety) - Contains a set of evaluators that are built atop the Azure AI Foundry Evaluation service that can be used to evaluate the content safety of AI responses in your projects including Protected Material, Groundedness Pro, Ungrounded Attributes, Hate and Unfairness, Self Harm, Violence, Sexual, Code Vulnerability and Indirect Attack.
8+
* [`Microsoft.Extensions.AI.Evaluation.NLP`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.NLP) - Contains a set of evaluators that implement common algorithms for evaluating machine translation and natural
9+
language processing tasks. Evaluators currently include BLEU score, with more planned.
810
* [`Microsoft.Extensions.AI.Evaluation.Reporting`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting) - Contains support for caching LLM responses, storing the results of evaluations and generating reports from that data.
911
* [`Microsoft.Extensions.AI.Evaluation.Reporting.Azure`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting.Azure) - Supports the `Microsoft.Extensions.AI.Evaluation.Reporting` library with an implementation for caching LLM responses and storing the evaluation results in an Azure Storage container.
1012
* [`Microsoft.Extensions.AI.Evaluation.Console`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Console) - A command line dotnet tool for generating reports and managing evaluation data.
@@ -18,6 +20,7 @@ dotnet add package Microsoft.Extensions.AI.Evaluation
1820
dotnet add package Microsoft.Extensions.AI.Evaluation.Quality
1921
dotnet add package Microsoft.Extensions.AI.Evaluation.Safety
2022
dotnet add package Microsoft.Extensions.AI.Evaluation.Reporting
23+
dotnet add package Microsoft.Extensions.AI.Evaluation.NLP
2124
```
2225

2326
Or directly in the C# project file:
@@ -28,6 +31,7 @@ Or directly in the C# project file:
2831
<PackageReference Include="Microsoft.Extensions.AI.Evaluation.Quality" Version="[CURRENTVERSION]" />
2932
<PackageReference Include="Microsoft.Extensions.AI.Evaluation.Safety" Version="[CURRENTVERSION]" />
3033
<PackageReference Include="Microsoft.Extensions.AI.Evaluation.Reporting" Version="[CURRENTVERSION]" />
34+
<PackageReference Include="Microsoft.Extensions.AI.Evaluation.NLP" Version="[CURRENTVERSION]" />
3135
</ItemGroup>
3236
```
3337

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
4+
using System.Collections.Generic;
5+
using System.Globalization;
6+
using System.Linq;
7+
using System.Threading;
8+
using System.Threading.Tasks;
9+
using Microsoft.Extensions.AI.Evaluation.NLP.Common;
10+
using Microsoft.Extensions.AI.Evaluation.Utilities;
11+
using Microsoft.Shared.Diagnostics;
12+
13+
namespace Microsoft.Extensions.AI.Evaluation.NLP;
14+
15+
/// <summary>
/// An <see cref="IEvaluator"/> that scores a model response against one or more reference responses using the
/// BLEU (Bilingual Evaluation Understudy) algorithm, a metric commonly applied to machine translation and
/// text generation output.
/// </summary>
/// <remarks>
/// <para>
/// The response being evaluated (the "hypothesis") is compared with the references supplied via
/// <see cref="BLEUEvaluatorContext.References"/>. The result is reported as a <see cref="NumericMetric"/> whose
/// value lies between 0.0 (no match at all) and 1.0 (perfect match). By default the score is interpreted with a
/// pass/fail cutoff of 0.5: a score of 0.5 or higher passes and anything lower fails.
/// </para>
/// </remarks>
public sealed class BLEUEvaluator : IEvaluator
{
    /// <summary>
    /// Gets the <see cref="EvaluationMetric.Name"/> of the <see cref="NumericMetric"/> that
    /// <see cref="BLEUEvaluator"/> produces.
    /// </summary>
    public static string BLEUMetricName => "BLEU";

    /// <inheritdoc/>
    public IReadOnlyCollection<string> EvaluationMetricNames { get; } = [BLEUMetricName];

    /// <inheritdoc/>
    public ValueTask<EvaluationResult> EvaluateAsync(
        IEnumerable<ChatMessage> messages,
        ChatResponse modelResponse,
        ChatConfiguration? chatConfiguration = null,
        IEnumerable<EvaluationContext>? additionalContext = null,
        CancellationToken cancellationToken = default)
    {
        _ = Throw.IfNull(modelResponse);

        var bleuMetric = new NumericMetric(BLEUMetricName);
        var evaluationResult = new EvaluationResult(bleuMetric);

        // Attaches an error diagnostic to the metric and completes with the (value-less) result.
        ValueTask<EvaluationResult> CompleteWithError(string message)
        {
            bleuMetric.AddDiagnostics(EvaluationDiagnostic.Error(message));
            return new ValueTask<EvaluationResult>(evaluationResult);
        }

        if (string.IsNullOrWhiteSpace(modelResponse.Text))
        {
            return CompleteWithError(
                $"The {nameof(modelResponse)} supplied for evaluation was null or empty.");
        }

        // Only the first BLEUEvaluatorContext found in the additional context is used.
        BLEUEvaluatorContext? bleuContext = additionalContext?.OfType<BLEUEvaluatorContext>().FirstOrDefault();
        if (bleuContext is null)
        {
            return CompleteWithError(
                $"A value of type '{nameof(BLEUEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection.");
        }

        if (bleuContext.References.Count == 0)
        {
            return CompleteWithError(
                $"Supplied '{nameof(BLEUEvaluatorContext)}' did not contain any '{nameof(BLEUEvaluatorContext.References)}'.");
        }

        var (bleuScore, elapsed) = TimingHelper.ExecuteWithTiming(() =>
        {
            // NOTE(review): this is a deferred query; if SentenceBLEU enumerates the references more than once,
            // each reference is re-tokenized on every pass - confirm and consider materializing.
            var tokenizedReferences = bleuContext.References.Select(reference => SimpleWordTokenizer.WordTokenize(reference));
            var tokenizedHypothesis = SimpleWordTokenizer.WordTokenize(modelResponse.Text);

            return BLEUAlgorithm.SentenceBLEU(
                tokenizedReferences,
                tokenizedHypothesis,
                BLEUAlgorithm.DefaultBLEUWeights,
                SmoothingFunction.Method4);
        });

        // The value must be assigned before the interpretation is computed from the metric below.
        bleuMetric.Value = bleuScore;

        string elapsedSeconds = elapsed.TotalSeconds.ToString("F2", CultureInfo.InvariantCulture);
        bleuMetric.AddOrUpdateMetadata(name: "evaluation-duration", value: elapsedSeconds + " s");
        bleuMetric.AddOrUpdateContext(bleuContext);
        bleuMetric.Interpretation = NLPScoreInterpretation.Interpret(bleuMetric);

        return new ValueTask<EvaluationResult>(evaluationResult);
    }
}
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
4+
#pragma warning disable S3604
5+
// S3604: Member initializer values should not be redundant.
6+
// We disable this warning because it is a false positive arising from the analyzer's lack of support for C#'s primary
7+
// constructor syntax.
8+
9+
using System.Collections.Generic;
10+
using System.Linq;
11+
12+
namespace Microsoft.Extensions.AI.Evaluation.NLP;
13+
14+
/// <summary>
/// Contextual information that the <see cref="BLEUEvaluator"/> uses to compute the BLEU score for a response.
/// </summary>
/// <remarks>
/// <see cref="BLEUEvaluator"/> measures the BLEU score of a response compared to one or more references. BLEU
/// (Bilingual Evaluation Understudy) is a metric used to evaluate the quality of machine-generated text.
/// </remarks>
public sealed class BLEUEvaluatorContext : EvaluationContext
{
    /// <summary>
    /// Gets the unique <see cref="EvaluationContext.Name"/> that is used for
    /// <see cref="BLEUEvaluatorContext"/>.
    /// </summary>
    public static string BLEUContextName => "BLEU Context";

    /// <summary>
    /// Gets the reference responses against which the provided model response will be scored.
    /// </summary>
    /// <remarks>
    /// The <see cref="BLEUEvaluator"/> measures the degree to which the response being evaluated is similar to
    /// the responses supplied via <see cref="References"/>. The metric will be reported as a BLEU score.
    /// </remarks>
    public IReadOnlyList<string> References { get; }

    /// <summary>
    /// Initializes a new instance of the <see cref="BLEUEvaluatorContext"/> class.
    /// </summary>
    /// <param name="references">
    /// The reference responses against which the response that is being evaluated is compared.
    /// </param>
    /// <exception cref="System.ArgumentNullException"><paramref name="references"/> is null.</exception>
    public BLEUEvaluatorContext(params string[] references)
        : base(
            name: BLEUContextName,
            contents: [.. references.Select(reference => new TextContent(reference))])
    {
        // Store a copy so that later changes to the caller's array are not visible through References.
        References = [.. references];
    }

    /// <summary>
    /// Initializes a new instance of the <see cref="BLEUEvaluatorContext"/> class.
    /// </summary>
    /// <param name="references">
    /// The reference responses against which the response that is being evaluated is compared.
    /// </param>
    /// <exception cref="System.ArgumentNullException"><paramref name="references"/> is null.</exception>
    public BLEUEvaluatorContext(IEnumerable<string> references)
        // Materialize exactly once: the context contents and References are then built from the same snapshot,
        // even when the supplied sequence is lazy, one-shot, or expensive to enumerate. The string[] argument
        // binds to the array constructor above, so this does not chain back to itself.
        : this(references.ToArray())
    {
    }
}

0 commit comments

Comments
 (0)