Skip to content

Commit 03adec2

Browse files
committed
Preserve figs, notes and cross references
Switch Notes to SubComponents, including figures and cross references. Configure saving of subcomponents and formatting. USFM update tests work - now on to the others.
1 parent c304a75 commit 03adec2

17 files changed

+451
-157
lines changed

src/SIL.Machine/Corpora/IUsfmParserHandler.cs

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,14 +69,29 @@ IReadOnlyList<UsfmAttribute> attributes
6969
void EndChar(UsfmParserState state, string marker, IReadOnlyList<UsfmAttribute> attributes, bool closed);
7070

7171
/// <summary>
72-
/// Start of a note
72+
/// Start of a sub component - a note, figure or cross reference
7373
/// </summary>
74-
void StartNote(UsfmParserState state, string marker, string caller, string category);
74+
void StartSubComponent(UsfmParserState state, string marker, string caller, string category);
7575

7676
/// <summary>
77-
/// End of a note
77+
/// End of a sub component
7878
/// </summary>
79-
void EndNote(UsfmParserState state, string marker, bool closed);
79+
void EndSubComponent(
80+
UsfmParserState state,
81+
string marker,
82+
IReadOnlyList<UsfmAttribute> attributes,
83+
bool closed
84+
);
85+
86+
/// <summary>
87+
/// Start of a sub component text
88+
/// </summary>
89+
void StartSubComponentText(UsfmParserState state);
90+
91+
/// <summary>
92+
/// End of a sub component text
93+
/// </summary>
94+
void EndSubComponentText(UsfmParserState state);
8095

8196
/// <summary>
8297
/// Start of a table

src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,9 @@ public string UpdateUsfm(
2323
string bookId,
2424
IReadOnlyList<(IReadOnlyList<ScriptureRef>, string)> rows,
2525
string fullName = null,
26-
UpdateUsfmBehavior behavior = UpdateUsfmBehavior.PreferExisting
26+
UpdateUsfmTextBehavior textBehavior = UpdateUsfmTextBehavior.PreferExisting,
27+
UpdateUsfmIntraVerseMarkerBehavior subComponentBehavior = UpdateUsfmIntraVerseMarkerBehavior.Preserve,
28+
UpdateUsfmIntraVerseMarkerBehavior formattingBehavior = UpdateUsfmIntraVerseMarkerBehavior.Strip
2729
)
2830
{
2931
string fileName = _settings.GetBookFileName(bookId);
@@ -36,7 +38,13 @@ public string UpdateUsfm(
3638
usfm = reader.ReadToEnd();
3739
}
3840

39-
var handler = new UpdateUsfmParserHandler(rows, fullName is null ? null : $"- {fullName}", behavior);
41+
var handler = new UpdateUsfmParserHandler(
42+
rows,
43+
fullName is null ? null : $"- {fullName}",
44+
textBehavior,
45+
subComponentBehavior,
46+
formattingBehavior
47+
);
4048
try
4149
{
4250
UsfmParser.Parse(usfm, handler, _settings.Stylesheet, _settings.Versification);

src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs

Lines changed: 20 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@ public enum ScriptureTextType
99
None,
1010
NonVerse,
1111
Verse,
12-
Note
12+
SubComponent,
13+
SubComponentText
1314
}
1415

1516
public abstract class ScriptureRefUsfmParserHandlerBase : UsfmParserHandlerBase
@@ -151,21 +152,29 @@ public override void EndSidebar(UsfmParserState state, string marker, bool close
151152
EndParentElement();
152153
}
153154

154-
public override void StartNote(UsfmParserState state, string marker, string caller, string category)
155+
public override void StartSubComponent(UsfmParserState state, string marker, string caller, string category)
155156
{
157+
if (_curVerseRef.IsDefault)
158+
UpdateVerseRef(state.VerseRef, marker, startAsChildElement: true);
159+
156160
if (CurrentTextType != ScriptureTextType.None && !_duplicateVerse)
157161
{
158162
// if we hit a note in a verse paragraph and we aren't in a verse, then start a non-verse segment
159163
CheckConvertVerseParaToNonVerse(state);
160164
NextElement(marker);
161-
StartNoteText(state);
162165
}
163166
}
164167

165-
public override void EndNote(UsfmParserState state, string marker, bool closed)
168+
public override void StartSubComponentText(UsfmParserState state)
166169
{
167-
if (CurrentTextType == ScriptureTextType.Note && !_duplicateVerse)
168-
EndNoteText(state);
170+
_curTextType.Push(ScriptureTextType.SubComponentText);
171+
StartSubComponentText(state, CreateNonVerseRef());
172+
}
173+
174+
public override void EndSubComponentText(UsfmParserState state)
175+
{
176+
EndSubComponentText(state, CreateNonVerseRef());
177+
_curTextType.Pop();
169178
}
170179

171180
public override void Text(UsfmParserState state, string text)
@@ -200,9 +209,9 @@ protected virtual void StartNonVerseText(UsfmParserState state, ScriptureRef scr
200209

201210
protected virtual void EndNonVerseText(UsfmParserState state, ScriptureRef scriptureRef) { }
202211

203-
protected virtual void StartNoteText(UsfmParserState state, ScriptureRef scriptureRef) { }
212+
protected virtual void StartSubComponentText(UsfmParserState state, ScriptureRef scriptureRef) { }
204213

205-
protected virtual void EndNoteText(UsfmParserState state, ScriptureRef scriptureRef) { }
214+
protected virtual void EndSubComponentText(UsfmParserState state, ScriptureRef scriptureRef) { }
206215

207216
private void StartVerseText(UsfmParserState state)
208217
{
@@ -231,24 +240,13 @@ private void EndNonVerseText(UsfmParserState state)
231240
_curTextType.Pop();
232241
}
233242

234-
private void StartNoteText(UsfmParserState state)
235-
{
236-
_curTextType.Push(ScriptureTextType.Note);
237-
StartNoteText(state, CreateNonVerseRef());
238-
}
239-
240-
private void EndNoteText(UsfmParserState state)
241-
{
242-
EndNoteText(state, CreateNonVerseRef());
243-
_curTextType.Pop();
244-
}
245-
246-
private void UpdateVerseRef(VerseRef verseRef, string marker)
243+
private void UpdateVerseRef(VerseRef verseRef, string marker, bool startAsChildElement = false)
247244
{
248245
if (!VerseRef.AreOverlappingVersesRanges(verseRef, _curVerseRef))
249246
{
250247
_curElements.Clear();
251-
_curElements.Push(new ScriptureElement(0, marker));
248+
int position = startAsChildElement ? 1 : 0;
249+
_curElements.Push(new ScriptureElement(position, marker));
252250
}
253251
_curVerseRef = verseRef;
254252
}

src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs

Lines changed: 91 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,19 @@
44

55
namespace SIL.Machine.Corpora
66
{
7-
public enum UpdateUsfmBehavior
7+
public enum UpdateUsfmTextBehavior
88
{
99
PreferExisting,
1010
PreferNew,
1111
StripExisting
1212
}
1313

14+
public enum UpdateUsfmIntraVerseMarkerBehavior
15+
{
16+
Preserve,
17+
Strip,
18+
}
19+
1420
/***
1521
* This is a USFM parser handler that can be used to replace the existing text in a USFM file with the specified
1622
* text.
@@ -21,23 +27,29 @@ public class UpdateUsfmParserHandler : ScriptureRefUsfmParserHandlerBase
2127
private readonly List<UsfmToken> _tokens;
2228
private readonly List<UsfmToken> _newTokens;
2329
private readonly string _idText;
24-
private readonly UpdateUsfmBehavior _behavior;
30+
private readonly UpdateUsfmTextBehavior _textBehavior;
31+
private readonly UpdateUsfmIntraVerseMarkerBehavior _subComponentBehavior;
32+
private readonly UpdateUsfmIntraVerseMarkerBehavior _formattingBehavior;
2533
private readonly Stack<bool> _replace;
2634
private int _rowIndex;
2735
private int _tokenIndex;
2836

2937
public UpdateUsfmParserHandler(
3038
IReadOnlyList<(IReadOnlyList<ScriptureRef>, string)> rows = null,
3139
string idText = null,
32-
UpdateUsfmBehavior behavior = UpdateUsfmBehavior.PreferExisting
40+
UpdateUsfmTextBehavior textBehavior = UpdateUsfmTextBehavior.PreferExisting,
41+
UpdateUsfmIntraVerseMarkerBehavior subComponentBehavior = UpdateUsfmIntraVerseMarkerBehavior.Preserve,
42+
UpdateUsfmIntraVerseMarkerBehavior formattingBehavior = UpdateUsfmIntraVerseMarkerBehavior.Strip
3343
)
3444
{
3545
_rows = rows ?? Array.Empty<(IReadOnlyList<ScriptureRef>, string)>();
3646
_tokens = new List<UsfmToken>();
3747
_newTokens = new List<UsfmToken>();
3848
_idText = idText;
3949
_replace = new Stack<bool>();
40-
_behavior = behavior;
50+
_textBehavior = textBehavior;
51+
_subComponentBehavior = subComponentBehavior;
52+
_formattingBehavior = formattingBehavior;
4153
}
4254

4355
public IReadOnlyList<UsfmToken> Tokens => _tokens;
@@ -176,30 +188,39 @@ bool closed
176188
)
177189
{
178190
// strip out char-style markers in verses that are being replaced
179-
if (closed && ReplaceWithNewTokens(state))
191+
if (ReplaceWithNewTokens(state, closed: closed))
180192
SkipTokens(state);
193+
else
194+
CollectTokens(state);
181195

182196
base.EndChar(state, marker, attributes, closed);
183197
}
184198

185-
public override void StartNote(UsfmParserState state, string marker, string caller, string category)
199+
public override void StartSubComponent(UsfmParserState state, string marker, string caller, string category)
186200
{
187201
// strip out notes in verses that are being replaced
188202
if (ReplaceWithNewTokens(state))
189203
SkipTokens(state);
190204
else
191205
CollectTokens(state);
192206

193-
base.StartNote(state, marker, caller, category);
207+
base.StartSubComponent(state, marker, caller, category);
194208
}
195209

196-
public override void EndNote(UsfmParserState state, string marker, bool closed)
210+
public override void EndSubComponent(
211+
UsfmParserState state,
212+
string marker,
213+
IReadOnlyList<UsfmAttribute> attributes,
214+
bool closed
215+
)
197216
{
198217
// strip out notes in verses that are being replaced
199-
if (closed && ReplaceWithNewTokens(state))
218+
if (ReplaceWithNewTokens(state, closed: closed, endSubComponent: true))
200219
SkipTokens(state);
220+
else
221+
CollectTokens(state);
201222

202-
base.EndNote(state, marker, closed);
223+
base.EndSubComponent(state, marker, attributes, closed);
203224
}
204225

205226
public override void Ref(UsfmParserState state, string marker, string display, string target)
@@ -268,31 +289,13 @@ protected override void EndNonVerseText(UsfmParserState state, ScriptureRef scri
268289
PopNewTokens();
269290
}
270291

271-
protected override void StartNoteText(UsfmParserState state, ScriptureRef scriptureRef)
292+
protected override void StartSubComponentText(UsfmParserState state, ScriptureRef scriptureRef)
272293
{
273294
IReadOnlyList<string> rowTexts = AdvanceRows(new[] { scriptureRef });
274-
var newTokens = new List<UsfmToken>();
275-
if (rowTexts.Count > 0)
276-
{
277-
newTokens.Add(state.Token);
278-
newTokens.Add(new UsfmToken(UsfmTokenType.Character, "ft", null, "ft*"));
279-
for (int i = 0; i < rowTexts.Count; i++)
280-
{
281-
string text = rowTexts[i];
282-
if (i < rowTexts.Count - 1)
283-
text += " ";
284-
newTokens.Add(new UsfmToken(text));
285-
}
286-
newTokens.Add(new UsfmToken(UsfmTokenType.End, state.Token.EndMarker, null, null));
287-
PushNewTokens(newTokens);
288-
}
289-
else
290-
{
291-
PushTokensAsPrevious();
292-
}
295+
PushNewTokens(rowTexts.Select(t => new UsfmToken(t + " ")));
293296
}
294297

295-
protected override void EndNoteText(UsfmParserState state, ScriptureRef scriptureRef)
298+
protected override void EndSubComponentText(UsfmParserState state, ScriptureRef scriptureRef)
296299
{
297300
PopNewTokens();
298301
}
@@ -362,29 +365,61 @@ private void SkipTokens(UsfmParserState state)
362365
_tokenIndex = state.Index + 1 + state.SpecialTokenCount;
363366
}
364367

365-
private bool ReplaceWithNewTokens(UsfmParserState state)
368+
private bool ReplaceWithNewTokens(UsfmParserState state, bool closed = true, bool endSubComponent = false)
366369
{
367-
bool newText = _replace.Count > 0 && _replace.Peek();
368-
int tokenEnd = state.Index + state.SpecialTokenCount;
369-
bool existingText = false;
370-
for (int index = _tokenIndex; index <= tokenEnd; index++)
370+
bool untranslatableParagraph =
371+
state.ParaTag?.Marker != null && UsfmStylesheet.IsUntranslatedParagraph(state.ParaTag.Marker);
372+
if (_textBehavior == UpdateUsfmTextBehavior.StripExisting)
371373
{
372-
if (state.Tokens[index].Type == UsfmTokenType.Text && state.Tokens[index].Text.Length > 0)
373-
{
374-
existingText = true;
375-
break;
376-
}
374+
if (untranslatableParagraph)
375+
ClearNewTokens();
376+
else
377+
AddNewTokens();
378+
return true;
377379
}
380+
381+
bool newText = _replace.Count > 0 && _replace.Peek();
382+
bool inSubComponent = state.SubComponentTag != null || endSubComponent;
383+
bool inSubComponentText =
384+
CurrentTextType == ScriptureTextType.SubComponentText
385+
&& !UsfmStylesheet.IsSubComponentText(state.Token.Marker);
386+
bool isFormattingTag = state.Token.Marker != null && !UsfmStylesheet.IsSubComponentPart(state.Token.Marker);
387+
388+
bool existingText = state
389+
.Tokens.Skip(_tokenIndex)
390+
.Take(state.Index + 1 + state.SpecialTokenCount - _tokenIndex)
391+
.Any(t => t.Type == UsfmTokenType.Text && t.Text.Length > 0);
392+
378393
bool useNewTokens =
379-
_behavior == UpdateUsfmBehavior.StripExisting
380-
|| (newText && !existingText)
381-
|| (newText && _behavior == UpdateUsfmBehavior.PreferNew);
394+
!untranslatableParagraph
395+
&& newText
396+
&& (!existingText || _textBehavior == UpdateUsfmTextBehavior.PreferNew)
397+
&& (!inSubComponent || inSubComponentText);
382398

383399
if (useNewTokens)
384-
_tokens.AddRange(_newTokens);
400+
AddNewTokens();
385401

386-
_newTokens.Clear();
387-
return useNewTokens;
402+
if (untranslatableParagraph || (existingText && _textBehavior == UpdateUsfmTextBehavior.PreferExisting))
403+
ClearNewTokens();
404+
405+
// figure out when to skip the existing text
406+
bool withinNewText = _replace.Any(r => r);
407+
if (withinNewText && inSubComponent)
408+
{
409+
if (_subComponentBehavior == UpdateUsfmIntraVerseMarkerBehavior.Strip)
410+
return true;
411+
412+
if (!inSubComponentText)
413+
return false;
414+
}
415+
416+
bool skipTokens = useNewTokens && closed;
417+
418+
if (newText && isFormattingTag)
419+
{
420+
skipTokens = _formattingBehavior == UpdateUsfmIntraVerseMarkerBehavior.Strip;
421+
}
422+
return skipTokens;
388423
}
389424

390425
private void PushNewTokens(IEnumerable<UsfmToken> tokens)
@@ -393,9 +428,16 @@ private void PushNewTokens(IEnumerable<UsfmToken> tokens)
393428
_newTokens.AddRange(tokens);
394429
}
395430

396-
private void PushTokensAsPrevious()
431+
private void AddNewTokens()
432+
{
433+
if (_newTokens.Count > 0)
434+
_tokens.AddRange(_newTokens);
435+
_newTokens.Clear();
436+
}
437+
438+
private void ClearNewTokens()
397439
{
398-
_replace.Push(_replace.Peek());
440+
_newTokens.Clear();
399441
}
400442

401443
private void PopNewTokens()

0 commit comments

Comments
 (0)