44
55namespace SIL . Machine . Corpora
66{
// Policy for choosing between text already present in the USFM and the replacement row text.
// This change renames the enum from UpdateUsfmBehavior to UpdateUsfmTextBehavior.
7- public enum UpdateUsfmBehavior
7+ public enum UpdateUsfmTextBehavior
88 {
// Queued replacement text is discarded when the scanned tokens already contain non-empty text.
99 PreferExisting ,
// Replacement text may be used even when the scanned tokens already contain non-empty text.
1010 PreferNew ,
// ReplaceWithNewTokens always reports "skip the existing tokens"; queued replacement text is
// emitted unless the current paragraph marker is an untranslated paragraph.
1111 StripExisting
1212 }
1313
// Policy for markers found inside text that is being replaced. The handler stores one value
// for embedded markers and a separate one for style markers.
14+ public enum UpdateUsfmIntraVerseMarkerBehavior
15+ {
// The existing marker tokens are kept (ReplaceWithNewTokens reports "do not skip" for them).
16+ Preserve ,
// The existing marker tokens are skipped (ReplaceWithNewTokens reports "skip" for them).
17+ Strip ,
18+ }
19+
1420 /***
1521 * This is a USFM parser handler that can be used to replace the existing text in a USFM file with the specified
1622 * text.
@@ -21,23 +27,29 @@ public class UpdateUsfmParserHandler : ScriptureRefUsfmParserHandlerBase
// Output token list; exposed read-only through the Tokens property.
2127 private readonly List < UsfmToken > _tokens ;
// Replacement tokens queued by PushNewTokens until they are appended to _tokens (AddNewTokens)
// or discarded (ClearNewTokens).
2228 private readonly List < UsfmToken > _newTokens ;
// Copied from the constructor's idText argument; where it is consumed is outside this view.
2329 private readonly string _idText ;
24- private readonly UpdateUsfmBehavior _behavior ;
// Existing-vs-replacement text policy read by ReplaceWithNewTokens.
30+ private readonly UpdateUsfmTextBehavior _textBehavior ;
// Keep/strip policy applied by ReplaceWithNewTokens while inside an embedded marker.
31+ private readonly UpdateUsfmIntraVerseMarkerBehavior _embeddedBehavior ;
// Keep/strip policy applied by ReplaceWithNewTokens to style (non-embedded-part) markers.
32+ private readonly UpdateUsfmIntraVerseMarkerBehavior _styleBehavior ;
// Stack of flags; ReplaceWithNewTokens treats a true top entry as "replacement text is available
// for the current segment". Presumably one entry is pushed per PushNewTokens call - the push
// itself is outside this view, TODO confirm.
2533 private readonly Stack < bool > _replace ;
// Presumably the cursor into _rows used by AdvanceRows - not shown here, verify.
2634 private int _rowIndex ;
// Start of the token range that ReplaceWithNewTokens scans for existing text; SkipTokens sets it
// to just past the current token (including its special tokens).
2735 private int _tokenIndex ;
2836
// Creates the handler.
//   rows             - (scripture references, text) pairs supplying replacement text; null is
//                      treated as an empty list.
//   idText           - stored in _idText; its use is outside this view.
//   textBehavior     - existing-vs-replacement text policy (default PreferExisting).
//   embeddedBehavior - keep/strip policy for embedded markers (default Preserve).
//   styleBehavior    - keep/strip policy for style markers (default Strip).
// NOTE(review): the third parameter is renamed from "behavior" to "textBehavior" and its type is
// renamed too, so callers passing it as a named argument or naming the old enum must be updated.
2937 public UpdateUsfmParserHandler (
3038 IReadOnlyList < ( IReadOnlyList < ScriptureRef > , string ) > rows = null ,
3139 string idText = null ,
32- UpdateUsfmBehavior behavior = UpdateUsfmBehavior . PreferExisting
40+ UpdateUsfmTextBehavior textBehavior = UpdateUsfmTextBehavior . PreferExisting ,
41+ UpdateUsfmIntraVerseMarkerBehavior embeddedBehavior = UpdateUsfmIntraVerseMarkerBehavior . Preserve ,
42+ UpdateUsfmIntraVerseMarkerBehavior styleBehavior = UpdateUsfmIntraVerseMarkerBehavior . Strip
3343 )
3444 {
// Fall back to an empty row list so later row handling never sees null.
3545 _rows = rows ?? Array . Empty < ( IReadOnlyList < ScriptureRef > , string ) > ( ) ;
3646 _tokens = new List < UsfmToken > ( ) ;
3747 _newTokens = new List < UsfmToken > ( ) ;
3848 _idText = idText ;
3949 _replace = new Stack < bool > ( ) ;
40- _behavior = behavior ;
50+ _textBehavior = textBehavior ;
51+ _embeddedBehavior = embeddedBehavior ;
52+ _styleBehavior = styleBehavior ;
4153 }
4254
4355 public IReadOnlyList < UsfmToken > Tokens => _tokens ;
@@ -176,30 +188,39 @@ bool closed
176188 )
177189 {
178190 // strip out char-style markers in verses that are being replaced
179- if ( closed && ReplaceWithNewTokens ( state ) )
191+ if ( ReplaceWithNewTokens ( state , closed : closed ) )
180192 SkipTokens ( state ) ;
193+ else
194+ CollectTokens ( state ) ;
181195
182196 base . EndChar ( state , marker , attributes , closed ) ;
183197 }
184198
// Parser callback for the start of an embedded marker (renamed from StartNote in this change).
// Asks ReplaceWithNewTokens whether the tokens up to and including the current one should be
// dropped (SkipTokens) or kept (CollectTokens), then forwards to the base handler.
185- public override void StartNote ( UsfmParserState state , string marker , string caller , string category )
199+ public override void StartEmbedded ( UsfmParserState state , string marker , string caller , string category )
186200 {
// NOTE(review): the comment below still says "notes"; the method is now named for embedded
// markers in general - consider rewording.
187201 // strip out notes in verses that are being replaced
// Defaults apply here: closed = true, endEmbedded = false.
188202 if ( ReplaceWithNewTokens ( state ) )
189203 SkipTokens ( state ) ;
190204 else
191205 CollectTokens ( state ) ;
192206
193- base . StartNote ( state , marker , caller , category ) ;
207+ base . StartEmbedded ( state , marker , caller , category ) ;
194208 }
195209
// Parser callback for the end of an embedded marker (renamed from EndNote; it now also receives
// the marker's attributes and passes them to the base handler).
// Behavior change visible in this diff: the removed code only called ReplaceWithNewTokens when
// "closed" was true and never collected tokens here. The new code always calls it, so its
// side effects on the queued replacement tokens always run, and tokens that are not skipped
// are now explicitly collected.
196- public override void EndNote ( UsfmParserState state , string marker , bool closed )
210+ public override void EndEmbedded (
211+ UsfmParserState state ,
212+ string marker ,
213+ IReadOnlyList < UsfmAttribute > attributes ,
214+ bool closed
215+ )
197216 {
198217 // strip out notes in verses that are being replaced
199- if ( closed && ReplaceWithNewTokens ( state ) )
// endEmbedded : true makes ReplaceWithNewTokens treat this token as inside the embedded marker
// even if state.EmbeddedTag is already null at this point.
218+ if ( ReplaceWithNewTokens ( state , closed : closed , endEmbedded : true ) )
200219 SkipTokens ( state ) ;
220+ else
221+ CollectTokens ( state ) ;
201222
202- base . EndNote ( state , marker , closed ) ;
223+ base . EndEmbedded ( state , marker , attributes , closed ) ;
203224 }
204225
205226 public override void Ref ( UsfmParserState state , string marker , string display , string target )
@@ -268,31 +289,13 @@ protected override void EndNonVerseText(UsfmParserState state, ScriptureRef scri
268289 PopNewTokens ( ) ;
269290 }
270291
// Called when embedded text begins for scriptureRef (renamed from StartNoteText).
// New behavior: fetch the row texts for this reference and queue one text token per row.
// Removed behavior: rebuild the note as <current token> \ft <texts> <end marker>, or re-push the
// previous replace flag when there were no rows.
// NOTE(review): the new code no longer synthesizes the "ft" character token or the end marker,
// and it appends a space to every row text (the removed code only put spaces between rows).
// NOTE(review): Select needs System.Linq in scope; the using directives are outside this view.
271- protected override void StartNoteText ( UsfmParserState state , ScriptureRef scriptureRef )
292+ protected override void StartEmbeddedText ( UsfmParserState state , ScriptureRef scriptureRef )
272293 {
273294 IReadOnlyList < string > rowTexts = AdvanceRows ( new [ ] { scriptureRef } ) ;
274- var newTokens = new List < UsfmToken > ( ) ;
275- if ( rowTexts . Count > 0 )
276- {
277- newTokens . Add ( state . Token ) ;
278- newTokens . Add ( new UsfmToken ( UsfmTokenType . Character , "ft" , null , "ft*" ) ) ;
279- for ( int i = 0 ; i < rowTexts . Count ; i ++ )
280- {
281- string text = rowTexts [ i ] ;
282- if ( i < rowTexts . Count - 1 )
283- text += " " ;
284- newTokens . Add ( new UsfmToken ( text ) ) ;
285- }
286- newTokens . Add ( new UsfmToken ( UsfmTokenType . End , state . Token . EndMarker , null , null ) ) ;
287- PushNewTokens ( newTokens ) ;
288- }
289- else
290- {
291- PushTokensAsPrevious ( ) ;
292- }
// PushNewTokens is called even when rowTexts is empty (it then receives an empty sequence).
295+ PushNewTokens ( rowTexts . Select ( t => new UsfmToken ( t + " " ) ) ) ;
293296 }
294297
// Called when embedded text ends for scriptureRef (renamed from EndNoteText); undoes the push
// made by StartEmbeddedText by calling PopNewTokens.
295- protected override void EndNoteText ( UsfmParserState state , ScriptureRef scriptureRef )
298+ protected override void EndEmbeddedText ( UsfmParserState state , ScriptureRef scriptureRef )
296299 {
297300 PopNewTokens ( ) ;
298301 }
@@ -362,29 +365,60 @@ private void SkipTokens(UsfmParserState state)
362365 _tokenIndex = state . Index + 1 + state . SpecialTokenCount ;
363366 }
364367
// Decides what happens at the current parser token and returns true when the caller should
// drop the existing tokens (callers then use SkipTokens) or false when it should keep them
// (callers then use CollectTokens).
// Side effects: may append the queued replacement tokens to _tokens (AddNewTokens) or discard
// them (ClearNewTokens). These side effects run regardless of "closed".
//   state       - current parser state (token list, index, paragraph/embedded tags).
//   closed      - whether the marker being ended was explicitly closed; only affects the
//                 default return value (useNewTokens && closed).
//   endEmbedded - true when called from EndEmbedded, so the token counts as inside an embedded
//                 marker.
365- private bool ReplaceWithNewTokens ( UsfmParserState state )
368+ private bool ReplaceWithNewTokens ( UsfmParserState state , bool closed = true , bool endEmbedded = false )
366369 {
367- bool newText = _replace . Count > 0 && _replace . Peek ( ) ;
368- int tokenEnd = state . Index + state . SpecialTokenCount ;
369- bool existingText = false ;
370- for ( int index = _tokenIndex ; index <= tokenEnd ; index ++ )
// True when the current paragraph has a marker that the stylesheet classifies as untranslated;
// replacement text is never emitted into such a paragraph.
370+ bool untranslatableParagraph =
371+ state . ParaTag ? . Marker != null && UsfmStylesheet . IsUntranslatedParagraph ( state . ParaTag . Marker ) ;
// StripExisting short-circuits everything else: existing tokens are always skipped, and the
// queued replacement text is emitted unless the paragraph is untranslated.
372+ if ( _textBehavior == UpdateUsfmTextBehavior . StripExisting )
371373 {
372- if ( state . Tokens [ index ] . Type == UsfmTokenType . Text && state . Tokens [ index ] . Text . Length > 0 )
373- {
374- existingText = true ;
375- break ;
376- }
374+ if ( untranslatableParagraph )
375+ ClearNewTokens ( ) ;
376+ else
377+ AddNewTokens ( ) ;
378+ return true ;
377379 }
380+
// Top of the _replace stack: replacement text is available for the innermost open segment.
381+ bool newText = _replace . Count > 0 && _replace . Peek ( ) ;
382+ bool inEmbedded = state . EmbeddedTag != null || endEmbedded ;
// NOTE(review): state.Token is dereferenced without a null check here and on the next
// statement - presumably the parser always sets it before these callbacks; confirm.
383+ bool inEmbeddedText =
384+ CurrentTextType == ScriptureTextType . EmbeddedText && ! UsfmStylesheet . IsEmbeddedText ( state . Token . Marker ) ;
385+ bool isStyleTag = state . Token . Marker != null && ! UsfmStylesheet . IsEmbeddedPart ( state . Token . Marker ) ;
386+
// Same range as the removed for-loop: tokens from _tokenIndex through
// state.Index + state.SpecialTokenCount inclusive; true if any is a non-empty text token.
387+ bool existingText = state
388+ . Tokens . Skip ( _tokenIndex )
389+ . Take ( state . Index + 1 + state . SpecialTokenCount - _tokenIndex )
390+ . Any ( t => t . Type == UsfmTokenType . Text && t . Text . Length > 0 ) ;
391+
// Emit the replacement text only when: the paragraph is translatable, replacement text exists,
// existing text does not take priority, and we are not inside an embedded marker (unless we
// are in its text).
378392 bool useNewTokens =
379- _behavior == UpdateUsfmBehavior . StripExisting
380- || ( newText && ! existingText )
381- || ( newText && _behavior == UpdateUsfmBehavior . PreferNew ) ;
393+ ! untranslatableParagraph
394+ && newText
395+ && ( ! existingText || _textBehavior == UpdateUsfmTextBehavior . PreferNew )
396+ && ( ! inEmbedded || inEmbeddedText ) ;
382397
383398 if ( useNewTokens )
384- _tokens . AddRange ( _newTokens ) ;
399+ AddNewTokens ( ) ;
385400
386- _newTokens . Clear ( ) ;
387- return useNewTokens ;
// Unlike the removed code, the queue is no longer cleared unconditionally: it is dropped only
// when it can never be used here, otherwise it stays queued for a later call.
401+ if ( untranslatableParagraph || ( existingText && _textBehavior == UpdateUsfmTextBehavior . PreferExisting ) )
402+ ClearNewTokens ( ) ;
403+
404+ // figure out when to skip the existing text
// True if any enclosing segment on the stack (not just the innermost) has replacement text.
405+ bool withinNewText = _replace . Any ( r => r ) ;
406+ if ( withinNewText && inEmbedded )
407+ {
// Strip: every existing token inside the embedded marker is dropped.
408+ if ( _embeddedBehavior == UpdateUsfmIntraVerseMarkerBehavior . Strip )
409+ return true ;
410+
// Preserve: tokens outside the embedded text are kept as they are.
411+ if ( ! inEmbeddedText )
412+ return false ;
413+ }
414+
415+ bool skipTokens = useNewTokens && closed ;
416+
// For style markers in a segment with replacement text, the style policy overrides the default
// above, regardless of useNewTokens and closed.
417+ if ( newText && isStyleTag )
418+ {
419+ skipTokens = _styleBehavior == UpdateUsfmIntraVerseMarkerBehavior . Strip ;
420+ }
421+ return skipTokens ;
388422 }
389423
390424 private void PushNewTokens ( IEnumerable < UsfmToken > tokens )
@@ -393,9 +427,16 @@ private void PushNewTokens(IEnumerable<UsfmToken> tokens)
393427 _newTokens . AddRange ( tokens ) ;
394428 }
395429
// Flushes the queued replacement tokens: appends them to the output list, then empties the queue
// so they cannot be emitted twice. (This change removes the PushTokensAsPrevious helper.)
396- private void PushTokensAsPrevious ( )
430+ private void AddNewTokens ( )
431+ {
// The Count check only avoids a no-op AddRange call; the result is the same without it.
432+ if ( _newTokens . Count > 0 )
433+ _tokens . AddRange ( _newTokens ) ;
434+ _newTokens . Clear ( ) ;
435+ }
436+
// Discards the queued replacement tokens without emitting them; _tokens and the _replace stack
// are left untouched.
437+ private void ClearNewTokens ( )
397438 {
398- _replace . Push ( _replace . Peek ( ) ) ;
439+ _newTokens . Clear ( ) ;
399440 }
400441
401442 private void PopNewTokens ( )
0 commit comments