Skip to content

Commit 20d0768

Browse files
committed
Preserve figs, notes and cross references
Switch Notes to "Embedded" to include figures and cross references. Configure saving of embedded and style markers. USFM update tests work - now on to the others.
1 parent 9433640 commit 20d0768

17 files changed

+447
-168
lines changed

src/SIL.Machine/Corpora/IUsfmParserHandler.cs

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,14 +69,24 @@ IReadOnlyList<UsfmAttribute> attributes
6969
void EndChar(UsfmParserState state, string marker, IReadOnlyList<UsfmAttribute> attributes, bool closed);
7070

7171
/// <summary>
72-
/// Start of a note
72+
/// Start of an embedded element - a note, figure or cross reference
7373
/// </summary>
74-
void StartNote(UsfmParserState state, string marker, string caller, string category);
74+
void StartEmbedded(UsfmParserState state, string marker, string caller, string category);
7575

7676
/// <summary>
77-
/// End of a note
77+
/// End of an embedded element - a note, figure or cross reference
7878
/// </summary>
79-
void EndNote(UsfmParserState state, string marker, bool closed);
79+
void EndEmbedded(UsfmParserState state, string marker, IReadOnlyList<UsfmAttribute> attributes, bool closed);
80+
81+
/// <summary>
82+
/// Start of embedded text
83+
/// </summary>
84+
void StartEmbeddedText(UsfmParserState state);
85+
86+
/// <summary>
87+
/// End of embedded text
88+
/// </summary>
89+
void EndEmbeddedText(UsfmParserState state);
8090

8191
/// <summary>
8292
/// Start of a table

src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,9 @@ public string UpdateUsfm(
2323
string bookId,
2424
IReadOnlyList<(IReadOnlyList<ScriptureRef>, string)> rows,
2525
string fullName = null,
26-
UpdateUsfmBehavior behavior = UpdateUsfmBehavior.PreferExisting
26+
UpdateUsfmTextBehavior textBehavior = UpdateUsfmTextBehavior.PreferExisting,
27+
UpdateUsfmIntraVerseMarkerBehavior embeddedBehavior = UpdateUsfmIntraVerseMarkerBehavior.Preserve,
28+
UpdateUsfmIntraVerseMarkerBehavior styleBehavior = UpdateUsfmIntraVerseMarkerBehavior.Strip
2729
)
2830
{
2931
string fileName = _settings.GetBookFileName(bookId);
@@ -36,7 +38,13 @@ public string UpdateUsfm(
3638
usfm = reader.ReadToEnd();
3739
}
3840

39-
var handler = new UpdateUsfmParserHandler(rows, fullName is null ? null : $"- {fullName}", behavior);
41+
var handler = new UpdateUsfmParserHandler(
42+
rows,
43+
fullName is null ? null : $"- {fullName}",
44+
textBehavior,
45+
embeddedBehavior,
46+
styleBehavior
47+
);
4048
try
4149
{
4250
UsfmParser.Parse(usfm, handler, _settings.Stylesheet, _settings.Versification);

src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs

Lines changed: 18 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@ public enum ScriptureTextType
99
None,
1010
NonVerse,
1111
Verse,
12-
Note
12+
Embedded,
13+
EmbeddedText
1314
}
1415

1516
public abstract class ScriptureRefUsfmParserHandlerBase : UsfmParserHandlerBase
@@ -151,21 +152,29 @@ public override void EndSidebar(UsfmParserState state, string marker, bool close
151152
EndParentElement();
152153
}
153154

154-
public override void StartNote(UsfmParserState state, string marker, string caller, string category)
155+
public override void StartEmbedded(UsfmParserState state, string marker, string caller, string category)
155156
{
156-
if (CurrentTextType != ScriptureTextType.None && !_duplicateVerse)
157+
if (_curVerseRef.IsDefault)
158+
UpdateVerseRef(state.VerseRef, marker);
159+
160+
if (!_duplicateVerse)
157161
{
158162
// if we hit a note in a verse paragraph and we aren't in a verse, then start a non-verse segment
159163
CheckConvertVerseParaToNonVerse(state);
160164
NextElement(marker);
161-
StartNoteText(state);
162165
}
163166
}
164167

165-
public override void EndNote(UsfmParserState state, string marker, bool closed)
168+
public override void StartEmbeddedText(UsfmParserState state)
166169
{
167-
if (CurrentTextType == ScriptureTextType.Note && !_duplicateVerse)
168-
EndNoteText(state);
170+
_curTextType.Push(ScriptureTextType.EmbeddedText);
171+
StartEmbeddedText(state, CreateNonVerseRef());
172+
}
173+
174+
public override void EndEmbeddedText(UsfmParserState state)
175+
{
176+
EndEmbeddedText(state, CreateNonVerseRef());
177+
_curTextType.Pop();
169178
}
170179

171180
public override void Text(UsfmParserState state, string text)
@@ -200,9 +209,9 @@ protected virtual void StartNonVerseText(UsfmParserState state, ScriptureRef scr
200209

201210
protected virtual void EndNonVerseText(UsfmParserState state, ScriptureRef scriptureRef) { }
202211

203-
protected virtual void StartNoteText(UsfmParserState state, ScriptureRef scriptureRef) { }
212+
protected virtual void StartEmbeddedText(UsfmParserState state, ScriptureRef scriptureRef) { }
204213

205-
protected virtual void EndNoteText(UsfmParserState state, ScriptureRef scriptureRef) { }
214+
protected virtual void EndEmbeddedText(UsfmParserState state, ScriptureRef scriptureRef) { }
206215

207216
private void StartVerseText(UsfmParserState state)
208217
{
@@ -231,18 +240,6 @@ private void EndNonVerseText(UsfmParserState state)
231240
_curTextType.Pop();
232241
}
233242

234-
private void StartNoteText(UsfmParserState state)
235-
{
236-
_curTextType.Push(ScriptureTextType.Note);
237-
StartNoteText(state, CreateNonVerseRef());
238-
}
239-
240-
private void EndNoteText(UsfmParserState state)
241-
{
242-
EndNoteText(state, CreateNonVerseRef());
243-
_curTextType.Pop();
244-
}
245-
246243
private void UpdateVerseRef(VerseRef verseRef, string marker)
247244
{
248245
if (!VerseRef.AreOverlappingVersesRanges(verseRef, _curVerseRef))

src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs

Lines changed: 90 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,19 @@
44

55
namespace SIL.Machine.Corpora
66
{
7-
public enum UpdateUsfmBehavior
7+
public enum UpdateUsfmTextBehavior
88
{
99
PreferExisting,
1010
PreferNew,
1111
StripExisting
1212
}
1313

14+
public enum UpdateUsfmIntraVerseMarkerBehavior
15+
{
16+
Preserve,
17+
Strip,
18+
}
19+
1420
/***
1521
* This is a USFM parser handler that can be used to replace the existing text in a USFM file with the specified
1622
* text.
@@ -21,23 +27,29 @@ public class UpdateUsfmParserHandler : ScriptureRefUsfmParserHandlerBase
2127
private readonly List<UsfmToken> _tokens;
2228
private readonly List<UsfmToken> _newTokens;
2329
private readonly string _idText;
24-
private readonly UpdateUsfmBehavior _behavior;
30+
private readonly UpdateUsfmTextBehavior _textBehavior;
31+
private readonly UpdateUsfmIntraVerseMarkerBehavior _embeddedBehavior;
32+
private readonly UpdateUsfmIntraVerseMarkerBehavior _styleBehavior;
2533
private readonly Stack<bool> _replace;
2634
private int _rowIndex;
2735
private int _tokenIndex;
2836

2937
public UpdateUsfmParserHandler(
3038
IReadOnlyList<(IReadOnlyList<ScriptureRef>, string)> rows = null,
3139
string idText = null,
32-
UpdateUsfmBehavior behavior = UpdateUsfmBehavior.PreferExisting
40+
UpdateUsfmTextBehavior textBehavior = UpdateUsfmTextBehavior.PreferExisting,
41+
UpdateUsfmIntraVerseMarkerBehavior embeddedBehavior = UpdateUsfmIntraVerseMarkerBehavior.Preserve,
42+
UpdateUsfmIntraVerseMarkerBehavior styleBehavior = UpdateUsfmIntraVerseMarkerBehavior.Strip
3343
)
3444
{
3545
_rows = rows ?? Array.Empty<(IReadOnlyList<ScriptureRef>, string)>();
3646
_tokens = new List<UsfmToken>();
3747
_newTokens = new List<UsfmToken>();
3848
_idText = idText;
3949
_replace = new Stack<bool>();
40-
_behavior = behavior;
50+
_textBehavior = textBehavior;
51+
_embeddedBehavior = embeddedBehavior;
52+
_styleBehavior = styleBehavior;
4153
}
4254

4355
public IReadOnlyList<UsfmToken> Tokens => _tokens;
@@ -176,30 +188,39 @@ bool closed
176188
)
177189
{
178190
// strip out char-style markers in verses that are being replaced
179-
if (closed && ReplaceWithNewTokens(state))
191+
if (ReplaceWithNewTokens(state, closed: closed))
180192
SkipTokens(state);
193+
else
194+
CollectTokens(state);
181195

182196
base.EndChar(state, marker, attributes, closed);
183197
}
184198

185-
public override void StartNote(UsfmParserState state, string marker, string caller, string category)
199+
public override void StartEmbedded(UsfmParserState state, string marker, string caller, string category)
186200
{
187201
// strip out notes in verses that are being replaced
188202
if (ReplaceWithNewTokens(state))
189203
SkipTokens(state);
190204
else
191205
CollectTokens(state);
192206

193-
base.StartNote(state, marker, caller, category);
207+
base.StartEmbedded(state, marker, caller, category);
194208
}
195209

196-
public override void EndNote(UsfmParserState state, string marker, bool closed)
210+
public override void EndEmbedded(
211+
UsfmParserState state,
212+
string marker,
213+
IReadOnlyList<UsfmAttribute> attributes,
214+
bool closed
215+
)
197216
{
198217
// strip out notes in verses that are being replaced
199-
if (closed && ReplaceWithNewTokens(state))
218+
if (ReplaceWithNewTokens(state, closed: closed, endEmbedded: true))
200219
SkipTokens(state);
220+
else
221+
CollectTokens(state);
201222

202-
base.EndNote(state, marker, closed);
223+
base.EndEmbedded(state, marker, attributes, closed);
203224
}
204225

205226
public override void Ref(UsfmParserState state, string marker, string display, string target)
@@ -268,31 +289,13 @@ protected override void EndNonVerseText(UsfmParserState state, ScriptureRef scri
268289
PopNewTokens();
269290
}
270291

271-
protected override void StartNoteText(UsfmParserState state, ScriptureRef scriptureRef)
292+
protected override void StartEmbeddedText(UsfmParserState state, ScriptureRef scriptureRef)
272293
{
273294
IReadOnlyList<string> rowTexts = AdvanceRows(new[] { scriptureRef });
274-
var newTokens = new List<UsfmToken>();
275-
if (rowTexts.Count > 0)
276-
{
277-
newTokens.Add(state.Token);
278-
newTokens.Add(new UsfmToken(UsfmTokenType.Character, "ft", null, "ft*"));
279-
for (int i = 0; i < rowTexts.Count; i++)
280-
{
281-
string text = rowTexts[i];
282-
if (i < rowTexts.Count - 1)
283-
text += " ";
284-
newTokens.Add(new UsfmToken(text));
285-
}
286-
newTokens.Add(new UsfmToken(UsfmTokenType.End, state.Token.EndMarker, null, null));
287-
PushNewTokens(newTokens);
288-
}
289-
else
290-
{
291-
PushTokensAsPrevious();
292-
}
295+
PushNewTokens(rowTexts.Select(t => new UsfmToken(t + " ")));
293296
}
294297

295-
protected override void EndNoteText(UsfmParserState state, ScriptureRef scriptureRef)
298+
protected override void EndEmbeddedText(UsfmParserState state, ScriptureRef scriptureRef)
296299
{
297300
PopNewTokens();
298301
}
@@ -362,29 +365,60 @@ private void SkipTokens(UsfmParserState state)
362365
_tokenIndex = state.Index + 1 + state.SpecialTokenCount;
363366
}
364367

365-
private bool ReplaceWithNewTokens(UsfmParserState state)
368+
private bool ReplaceWithNewTokens(UsfmParserState state, bool closed = true, bool endEmbedded = false)
366369
{
367-
bool newText = _replace.Count > 0 && _replace.Peek();
368-
int tokenEnd = state.Index + state.SpecialTokenCount;
369-
bool existingText = false;
370-
for (int index = _tokenIndex; index <= tokenEnd; index++)
370+
bool untranslatableParagraph =
371+
state.ParaTag?.Marker != null && UsfmStylesheet.IsUntranslatedParagraph(state.ParaTag.Marker);
372+
if (_textBehavior == UpdateUsfmTextBehavior.StripExisting)
371373
{
372-
if (state.Tokens[index].Type == UsfmTokenType.Text && state.Tokens[index].Text.Length > 0)
373-
{
374-
existingText = true;
375-
break;
376-
}
374+
if (untranslatableParagraph)
375+
ClearNewTokens();
376+
else
377+
AddNewTokens();
378+
return true;
377379
}
380+
381+
bool newText = _replace.Count > 0 && _replace.Peek();
382+
bool inEmbedded = state.EmbeddedTag != null || endEmbedded;
383+
bool inEmbeddedText =
384+
CurrentTextType == ScriptureTextType.EmbeddedText && !UsfmStylesheet.IsEmbeddedText(state.Token.Marker);
385+
bool isStyleTag = state.Token.Marker != null && !UsfmStylesheet.IsEmbeddedPart(state.Token.Marker);
386+
387+
bool existingText = state
388+
.Tokens.Skip(_tokenIndex)
389+
.Take(state.Index + 1 + state.SpecialTokenCount - _tokenIndex)
390+
.Any(t => t.Type == UsfmTokenType.Text && t.Text.Length > 0);
391+
378392
bool useNewTokens =
379-
_behavior == UpdateUsfmBehavior.StripExisting
380-
|| (newText && !existingText)
381-
|| (newText && _behavior == UpdateUsfmBehavior.PreferNew);
393+
!untranslatableParagraph
394+
&& newText
395+
&& (!existingText || _textBehavior == UpdateUsfmTextBehavior.PreferNew)
396+
&& (!inEmbedded || inEmbeddedText);
382397

383398
if (useNewTokens)
384-
_tokens.AddRange(_newTokens);
399+
AddNewTokens();
385400

386-
_newTokens.Clear();
387-
return useNewTokens;
401+
if (untranslatableParagraph || (existingText && _textBehavior == UpdateUsfmTextBehavior.PreferExisting))
402+
ClearNewTokens();
403+
404+
// figure out when to skip the existing text
405+
bool withinNewText = _replace.Any(r => r);
406+
if (withinNewText && inEmbedded)
407+
{
408+
if (_embeddedBehavior == UpdateUsfmIntraVerseMarkerBehavior.Strip)
409+
return true;
410+
411+
if (!inEmbeddedText)
412+
return false;
413+
}
414+
415+
bool skipTokens = useNewTokens && closed;
416+
417+
if (newText && isStyleTag)
418+
{
419+
skipTokens = _styleBehavior == UpdateUsfmIntraVerseMarkerBehavior.Strip;
420+
}
421+
return skipTokens;
388422
}
389423

390424
private void PushNewTokens(IEnumerable<UsfmToken> tokens)
@@ -393,9 +427,16 @@ private void PushNewTokens(IEnumerable<UsfmToken> tokens)
393427
_newTokens.AddRange(tokens);
394428
}
395429

396-
private void PushTokensAsPrevious()
430+
private void AddNewTokens()
431+
{
432+
if (_newTokens.Count > 0)
433+
_tokens.AddRange(_newTokens);
434+
_newTokens.Clear();
435+
}
436+
437+
private void ClearNewTokens()
397438
{
398-
_replace.Push(_replace.Peek());
439+
_newTokens.Clear();
399440
}
400441

401442
private void PopNewTokens()

0 commit comments

Comments
 (0)