Skip to content

Commit f5919f2

Browse files
authored
Separate key terms from other training data on S3 (#846)
1 parent 564f686 commit f5919f2

File tree

11 files changed

+106
-19
lines changed

11 files changed

+106
-19
lines changed

src/Echo/src/EchoEngine/TranslationEngineServiceV1.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ await client.BuildStartedAsync(
124124
List<InsertPretranslationsRequest> pretranslationsRequests = [];
125125
await _parallelCorpusPreprocessingService.PreprocessAsync(
126126
request.Corpora.Select(Map).ToList(),
127-
row => Task.CompletedTask,
127+
(_, _) => Task.CompletedTask,
128128
(row, _, corpus) =>
129129
{
130130
string[] tokens = row.SourceSegment.Split();

src/Echo/src/EchoEngine/WordAlignmentEngineServiceV1.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ await client.BuildStartedAsync(
7979
List<InsertWordAlignmentsRequest> wordAlignmentsRequests = [];
8080
await _parallelCorpusPreprocessingService.PreprocessAsync(
8181
request.Corpora.Select(Map).ToList(),
82-
row => Task.CompletedTask,
82+
(_, _) => Task.CompletedTask,
8383
(row, _, corpus) =>
8484
{
8585
wordAlignmentsRequests.Add(

src/Machine/src/Serval.Machine.Shared/Services/SmtTransferTrainBuildJob.cs

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,14 @@ CancellationToken cancellationToken
3838
await DownloadDataAsync(buildId, corpusDir, cancellationToken);
3939

4040
// assemble corpus
41-
ITextCorpus sourceCorpus = new TextFileTextCorpus(Path.Combine(corpusDir, "train.src.txt"));
42-
ITextCorpus targetCorpus = new TextFileTextCorpus(Path.Combine(corpusDir, "train.trg.txt"));
41+
ITextCorpus sourceCorpus = new TextFileTextCorpus(
42+
Path.Combine(corpusDir, "train.src.txt"),
43+
Path.Combine(corpusDir, "train.key-terms.src.txt")
44+
);
45+
ITextCorpus targetCorpus = new TextFileTextCorpus(
46+
Path.Combine(corpusDir, "train.trg.txt"),
47+
Path.Combine(corpusDir, "train.key-terms.trg.txt")
48+
);
4349
IParallelTextCorpus parallelCorpus = sourceCorpus.AlignRows(targetCorpus);
4450

4551
// train SMT model
@@ -106,6 +112,20 @@ private async Task DownloadDataAsync(string buildId, string corpusDir, Cancellat
106112
);
107113
await using FileStream tgtFileStream = File.Create(Path.Combine(corpusDir, "train.trg.txt"));
108114
await tgtText.CopyToAsync(tgtFileStream, cancellationToken);
115+
116+
await using Stream srcKeyTermsText = await _sharedFileService.OpenReadAsync(
117+
$"builds/{buildId}/train.key-terms.src.txt",
118+
cancellationToken
119+
);
120+
await using FileStream srcKeyTermsFileStream = File.Create(Path.Combine(corpusDir, "train.key-terms.src.txt"));
121+
await srcKeyTermsText.CopyToAsync(srcKeyTermsFileStream, cancellationToken);
122+
123+
await using Stream tgtKeyTermsText = await _sharedFileService.OpenReadAsync(
124+
$"builds/{buildId}/train.key-terms.trg.txt",
125+
cancellationToken
126+
);
127+
await using FileStream tgtKeyTermsFileStream = File.Create(Path.Combine(corpusDir, "train.key-terms.trg.txt"));
128+
// Copy the target key terms read from the shared file service into the newly created local file
// (source stream -> destination file), mirroring the source-side key-terms copy above.
await tgtKeyTermsText.CopyToAsync(tgtKeyTermsFileStream, cancellationToken);
109129
}
110130

111131
private async Task<(int TrainCorpusSize, double Confidence)> TrainAsync(

src/Machine/src/Serval.Machine.Shared/Services/StatisticalTrainBuildJob.cs

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,14 @@ CancellationToken cancellationToken
3838
await DownloadDataAsync(buildId, corpusDir, cancellationToken);
3939

4040
// assemble corpus
41-
ITextCorpus sourceCorpus = new TextFileTextCorpus(Path.Combine(corpusDir, "train.src.txt"));
42-
ITextCorpus targetCorpus = new TextFileTextCorpus(Path.Combine(corpusDir, "train.trg.txt"));
41+
ITextCorpus sourceCorpus = new TextFileTextCorpus(
42+
Path.Combine(corpusDir, "train.src.txt"),
43+
Path.Combine(corpusDir, "train.key-terms.src.txt")
44+
);
45+
ITextCorpus targetCorpus = new TextFileTextCorpus(
46+
Path.Combine(corpusDir, "train.trg.txt"),
47+
Path.Combine(corpusDir, "train.key-terms.trg.txt")
48+
);
4349
IParallelTextCorpus parallelCorpus = sourceCorpus.AlignRows(targetCorpus);
4450

4551
// train word alignment model
@@ -100,6 +106,20 @@ private async Task DownloadDataAsync(string buildId, string corpusDir, Cancellat
100106
);
101107
await using FileStream tgtFileStream = File.Create(Path.Combine(corpusDir, "train.trg.txt"));
102108
await tgtText.CopyToAsync(tgtFileStream, cancellationToken);
109+
110+
await using Stream srcKeyTermsText = await _sharedFileService.OpenReadAsync(
111+
$"builds/{buildId}/train.key-terms.src.txt",
112+
cancellationToken
113+
);
114+
await using FileStream srcKeyTermsFileStream = File.Create(Path.Combine(corpusDir, "train.key-terms.src.txt"));
115+
await srcKeyTermsText.CopyToAsync(srcKeyTermsFileStream, cancellationToken);
116+
117+
await using Stream tgtKeyTermsText = await _sharedFileService.OpenReadAsync(
118+
$"builds/{buildId}/train.key-terms.trg.txt",
119+
cancellationToken
120+
);
121+
await using FileStream tgtKeyTermsFileStream = File.Create(Path.Combine(corpusDir, "train.key-terms.trg.txt"));
122+
// Copy the target key terms read from the shared file service into the newly created local file
// (source stream -> destination file), mirroring the source-side key-terms copy above.
await tgtKeyTermsText.CopyToAsync(tgtKeyTermsFileStream, cancellationToken);
103123
}
104124

105125
private async Task<int> TrainAsync(

src/Machine/src/Serval.Machine.Shared/Services/TranslationPreprocessBuildJob.cs

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,11 @@ CancellationToken cancellationToken
3535
await using StreamWriter targetTrainWriter =
3636
new(await SharedFileService.OpenWriteAsync($"builds/{buildId}/train.trg.txt", cancellationToken));
3737

38+
await using StreamWriter sourceKeyTermsTrainWriter =
39+
new(await SharedFileService.OpenWriteAsync($"builds/{buildId}/train.key-terms.src.txt", cancellationToken));
40+
await using StreamWriter targetKeyTermsTrainWriter =
41+
new(await SharedFileService.OpenWriteAsync($"builds/{buildId}/train.key-terms.trg.txt", cancellationToken));
42+
3843
await using Stream pretranslateStream = await SharedFileService.OpenWriteAsync(
3944
$"builds/{buildId}/pretranslate.src.json",
4045
cancellationToken
@@ -46,12 +51,20 @@ CancellationToken cancellationToken
4651
pretranslateWriter.WriteStartArray();
4752
await ParallelCorpusPreprocessingService.PreprocessAsync(
4853
corpora,
49-
async row =>
54+
async (row, trainingDataType) =>
5055
{
5156
if (row.SourceSegment.Length > 0 || row.TargetSegment.Length > 0)
5257
{
53-
await sourceTrainWriter.WriteAsync($"{row.SourceSegment}\n");
54-
await targetTrainWriter.WriteAsync($"{row.TargetSegment}\n");
58+
if (trainingDataType == TrainingDataType.KeyTerms)
59+
{
60+
await sourceKeyTermsTrainWriter.WriteAsync($"{row.SourceSegment}\n");
61+
await targetKeyTermsTrainWriter.WriteAsync($"{row.TargetSegment}\n");
62+
}
63+
else
64+
{
65+
await sourceTrainWriter.WriteAsync($"{row.SourceSegment}\n");
66+
await targetTrainWriter.WriteAsync($"{row.TargetSegment}\n");
67+
}
5568
}
5669
if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0)
5770
trainCount++;

src/Machine/src/Serval.Machine.Shared/Services/WordAlignmentPreprocessBuildJob.cs

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,11 @@ CancellationToken cancellationToken
3535
await using StreamWriter targetTrainWriter =
3636
new(await SharedFileService.OpenWriteAsync($"builds/{buildId}/train.trg.txt", cancellationToken));
3737

38+
await using StreamWriter sourceKeyTermsTrainWriter =
39+
new(await SharedFileService.OpenWriteAsync($"builds/{buildId}/train.key-terms.src.txt", cancellationToken));
40+
await using StreamWriter targetKeyTermsTrainWriter =
41+
new(await SharedFileService.OpenWriteAsync($"builds/{buildId}/train.key-terms.trg.txt", cancellationToken));
42+
3843
await using Stream wordAlignmentStream = await SharedFileService.OpenWriteAsync(
3944
$"builds/{buildId}/word_alignments.inputs.json",
4045
cancellationToken
@@ -46,12 +51,21 @@ CancellationToken cancellationToken
4651
wordAlignmentWriter.WriteStartArray();
4752
await ParallelCorpusPreprocessingService.PreprocessAsync(
4853
corpora,
49-
async row =>
54+
async (row, trainingDataType) =>
5055
{
5156
if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0)
5257
{
53-
await sourceTrainWriter.WriteAsync($"{row.SourceSegment}\n");
54-
await targetTrainWriter.WriteAsync($"{row.TargetSegment}\n");
58+
if (trainingDataType == TrainingDataType.KeyTerms)
59+
{
60+
await sourceKeyTermsTrainWriter.WriteAsync($"{row.SourceSegment}\n");
61+
await targetKeyTermsTrainWriter.WriteAsync($"{row.TargetSegment}\n");
62+
}
63+
else
64+
{
65+
await sourceTrainWriter.WriteAsync($"{row.SourceSegment}\n");
66+
await targetTrainWriter.WriteAsync($"{row.TargetSegment}\n");
67+
}
68+
5569
trainCount++;
5670
}
5771
},

src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -978,6 +978,10 @@ public async Task<string> GetTargetExtractAsync()
978978
{
979979
using StreamReader srcReader = new(await SharedFileService.OpenReadAsync("builds/build1/train.src.txt"));
980980
using StreamReader trgReader = new(await SharedFileService.OpenReadAsync("builds/build1/train.trg.txt"));
981+
using StreamReader srcTermReader =
982+
new(await SharedFileService.OpenReadAsync("builds/build1/train.key-terms.src.txt"));
983+
using StreamReader trgTermReader =
984+
new(await SharedFileService.OpenReadAsync("builds/build1/train.key-terms.trg.txt"));
981985
int src1Count = 0;
982986
int src2Count = 0;
983987
int trgCount = 0;
@@ -998,8 +1002,17 @@ public async Task<string> GetTargetExtractAsync()
9981002
else if (srcLine.Length == 0)
9991003
trgCount++;
10001004
else
1001-
termCount++;
1005+
throw new ArgumentException("Unexpected line in test output");
10021006
}
1007+
1008+
while (
1009+
(srcLine = await srcTermReader.ReadLineAsync()) is not null
1010+
&& (trgLine = await trgTermReader.ReadLineAsync()) is not null
1011+
)
1012+
{
1013+
termCount++;
1014+
}
1015+
10031016
return (src1Count, src2Count, trgCount, termCount);
10041017
}
10051018

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
namespace SIL.ServiceToolkit.Models;
2+
3+
/// <summary>
/// Tags a training row handed to the <c>train</c> callback of <c>PreprocessAsync</c>
/// so the callback can tell ordinary text rows apart from key-term rows.
/// </summary>
public enum TrainingDataType
4+
{
5+
/// <summary>Row taken from the regular training rows of the parallel corpora.</summary>
Text = 0,
6+
/// <summary>Row taken from the key-term training data.</summary>
KeyTerms = 1,
7+
}

src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusPreprocessingService.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ ParallelCorpus corpus
99

1010
Task PreprocessAsync(
1111
IReadOnlyList<ParallelCorpus> corpora,
12-
Func<Row, Task> train,
12+
Func<Row, TrainingDataType, Task> train,
1313
Func<Row, bool, ParallelCorpus, Task> inference,
1414
bool useKeyTerms = false,
1515
HashSet<string>? ignoreUsfmMarkers = null

src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ MonolingualCorpus monolingualCorpus in parallelCorpus.SourceCorpora.Concat(paral
6868

6969
public async Task PreprocessAsync(
7070
IReadOnlyList<ParallelCorpus> corpora,
71-
Func<Row, Task> train,
71+
Func<Row, TrainingDataType, Task> train,
7272
Func<Row, bool, ParallelCorpus, Task> inference,
7373
bool useKeyTerms = false,
7474
HashSet<string>? ignoreUsfmMarkers = null
@@ -128,7 +128,7 @@ public async Task PreprocessAsync(
128128

129129
foreach (Row row in CollapseRanges(trainingRows))
130130
{
131-
await train(row);
131+
await train(row, TrainingDataType.Text);
132132
if (!parallelTrainingDataPresent && row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0)
133133
{
134134
parallelTrainingDataPresent = true;
@@ -177,7 +177,7 @@ ParallelTextRow row in parallelKeyTermsCorpus.DistinctBy(row =>
177177
{
178178
foreach (Row row in keyTermTrainingData)
179179
{
180-
await train(row);
180+
await train(row, TrainingDataType.KeyTerms);
181181
}
182182
}
183183
}

0 commit comments

Comments
 (0)