1313package segmenter
1414
1515import (
16+ "unicode/utf8"
17+
1618 ucd "github.com/go-text/typesetting/internal/unicodedata"
1719)
1820
@@ -225,12 +227,20 @@ type Segmenter struct {
225227 // text : [b, u, l, l]
226228 // attributes : [<start> b, b u, u l, l l, l <end>]
227229 attributes []breakAttr
230+
231+ invalidUTF8Indices map [int ]struct {}
228232}
229233
230234// Init resets the segmenter storage with the given input,
231235// and computes the attributes required to segment the text.
236+ //
237+ // If paragraph includes an invalid rune, some outputs like
238+ // [Line.OffsetInBytes] and [Line.LengthInBytes] are undefined.
232239func (seg * Segmenter ) Init (paragraph []rune ) {
	// Copy the input into the existing buffer so its backing array is reused when large enough.
233240 seg .text = append (seg .text [:0 ], paragraph ... )
	// Drop the byte offsets of invalid UTF-8 sequences recorded by an earlier
	// InitWithString or InitWithBytes call: nothing is recorded for rune input.
241+ for k := range seg .invalidUTF8Indices {
242+ delete (seg .invalidUTF8Indices , k )
243+ }
234244 seg .initAttributes ()
235245}
236246
@@ -241,9 +251,25 @@ func (seg *Segmenter) Init(paragraph []rune) {
241251//
242252// InitWithString is more efficient than [Init] if the input is a string.
243253// No allocation for the text is made if its internal buffer capacity is already large enough.
254+ //
255+ // Invalid UTF-8 sequences in paragraph are replaced with U+FFFD.
244256func (seg * Segmenter ) InitWithString (paragraph string ) {
245257 seg .text = seg .text [:0 ]
246- for _ , r := range paragraph {
258+ for k := range seg .invalidUTF8Indices {
259+ delete (seg .invalidUTF8Indices , k )
260+ }
261+ for i , r := range paragraph {
262+ if r == utf8 .RuneError {
263+ // Check whether the rune is actually U+FFFD, or an invalid UTF-8 sequence.
264+ if r , l := utf8 .DecodeRuneInString (paragraph [i :]); r == utf8 .RuneError && l == 1 {
265+ // The current rune is an invalid UTF-8 sequence.
266+ // Record the index.
267+ if seg .invalidUTF8Indices == nil {
268+ seg .invalidUTF8Indices = make (map [int ]struct {})
269+ }
270+ seg .invalidUTF8Indices [i ] = struct {}{}
271+ }
272+ }
247273 seg .text = append (seg .text , r )
248274 }
249275 seg .initAttributes ()
@@ -256,10 +282,26 @@ func (seg *Segmenter) InitWithString(paragraph string) {
256282//
257283// InitWithBytes is more efficient than [Init] if the input is a byte slice.
258284// No allocation for the text is made if its internal buffer capacity is already large enough.
285+ //
286+ // Invalid UTF-8 sequences in paragraph are replaced with U+FFFD.
259287func (seg * Segmenter ) InitWithBytes (paragraph []byte ) {
260288 seg .text = seg .text [:0 ]
289+ for k := range seg .invalidUTF8Indices {
290+ delete (seg .invalidUTF8Indices , k )
291+ }
261292 // The Go compiler should optimize this without allocating a string.
262- for _ , r := range string (paragraph ) {
293+ for i , r := range string (paragraph ) {
294+ if r == utf8 .RuneError {
295+ // Check whether the rune is actually U+FFFD, or an invalid UTF-8 sequence.
296+ if r , l := utf8 .DecodeRune (paragraph [i :]); r == utf8 .RuneError && l == 1 {
297+ // The current rune is an invalid UTF-8 sequence.
298+ // Record the index.
299+ if seg .invalidUTF8Indices == nil {
300+ seg .invalidUTF8Indices = make (map [int ]struct {})
301+ }
302+ seg .invalidUTF8Indices [i ] = struct {}{}
303+ }
304+ }
263305 seg .text = append (seg .text , r )
264306 }
265307 seg .initAttributes ()
@@ -273,34 +315,62 @@ func (seg *Segmenter) initAttributes() {
273315// attributeIterator is a helper type used to
274316// handle iterating over the break attributes of a Segmenter.
275317type attributeIterator struct {
276- src * Segmenter
277- pos int // the current position in the input slice
278- lastBreak int // the start of the current segment
279- flag breakAttr // break where this flag is on
318+ src * Segmenter // the segmenter whose text and attributes are walked
319+ pos int // the current position in the input slice (in runes)
320+ lastBreak int // the start of the current segment (in runes)
321+ posInBytes int // the current position in the input (in UTF-8 bytes)
322+ lastBreakInBytes int // the start of the current segment (in UTF-8 bytes)
323+ flag breakAttr // a segment ends where this flag is set in src.attributes
280324}
281325
282326// next advances the iterator to the next position whose attribute has iter.flag set.
283327// It returns true if such a position exists, and false once the end of the text is passed.
284328// When it returns true, the segment is at [iter.lastBreak:iter.pos]
// in runes, and at [iter.lastBreakInBytes:iter.posInBytes] in UTF-8 bytes.
285329func (iter * attributeIterator ) next () bool {
286330 iter .lastBreak = iter .pos // remember the start of the next segment
287- iter .pos ++
331+ iter .lastBreakInBytes = iter .posInBytes
332+ iter .incrementPos ()
	// pos == len(text) is a valid index here: it is the end-of-text entry of src.attributes.
288333 for iter .pos <= len (iter .src .text ) {
289334 // can we break before iter.pos ?
290335 if iter .src .attributes [iter .pos ]& iter .flag != 0 {
291336 return true
292337 }
293- iter .pos ++
338+ iter .incrementPos ()
294339 }
295340 return false
296341}
297342
// incrementPos advances the iterator by one rune and moves posInBytes
// by the number of input bytes that rune occupied.
343+ func (iter * attributeIterator ) incrementPos () {
344+ // If the current byte offset was recorded as an invalid UTF-8 sequence, a single input byte
	// was stored as U+FFFD in src.text; utf8.RuneLen would count it as 3 bytes.
345+ // Advance the position by 1 byte and 1 rune.
346+ if _ , ok := iter .src .invalidUTF8Indices [iter .posInBytes ]; ok {
347+ iter .posInBytes ++
348+ iter .pos ++
349+ return
350+ }
351+
	// At the end of the text there is no rune left to measure, so only pos moves.
352+ if iter .pos < len (iter .src .text ) {
353+ r := iter .src .text [iter .pos ]
354+ if l := utf8 .RuneLen (r ); l > 0 {
355+ iter .posInBytes += l
356+ }
357+ // If l <= 0, the rune cannot be encoded as UTF-8.
358+ // There is no correct way to update the byte position.
359+ // This case is treated as undefined behavior: the byte position is left unchanged.
360+ }
361+ iter .pos ++
362+ }
363+
298364// Line is the content of a line delimited by the segmenter.
299365type Line struct {
300366 // Text is a subslice of the original input slice, containing the delimited line
301367 Text []rune
302368 // Offset is the start of the line in the input rune slice
303369 Offset int
370+ // OffsetInBytes is the start of the line in the input, in UTF-8 bytes
371+ OffsetInBytes int
372+ // LengthInBytes is the length of the line in the input, in UTF-8 bytes
373+ LengthInBytes int
304374 // IsMandatoryBreak is true if breaking (at the end of the line)
305375 // is mandatory
306376 IsMandatoryBreak bool
@@ -320,6 +390,8 @@ func (li *LineIterator) Next() bool { return li.next() }
320390func (li * LineIterator ) Line () Line {
321391 return Line {
322392 Offset : li .lastBreak ,
393+ OffsetInBytes : li .lastBreakInBytes ,
394+ LengthInBytes : li .posInBytes - li .lastBreakInBytes ,
323395 Text : li .src .text [li .lastBreak :li .pos ], // pos is not included since we break right before
324396 IsMandatoryBreak : li .src .attributes [li .pos ]& mandatoryLineBoundary != 0 ,
325397 }
@@ -337,6 +409,10 @@ type Grapheme struct {
337409 Text []rune
338410 // Offset is the start of the grapheme in the input rune slice
339411 Offset int
412+ // OffsetInBytes is the start of the grapheme in the input, in UTF-8 bytes
413+ OffsetInBytes int
414+ // LengthInBytes is the length of the grapheme in the input, in UTF-8 bytes
415+ LengthInBytes int
340416}
341417
342418// GraphemeIterator provides a convenient way of
@@ -352,8 +428,10 @@ func (gr *GraphemeIterator) Next() bool { return gr.next() }
352428// Grapheme returns the current `Grapheme`:
// its rune offset, its offset and length in UTF-8 bytes, and its text.
// Text is a subslice of the segmenter's internal buffer, not a copy.
353429func (gr * GraphemeIterator ) Grapheme () Grapheme {
354430 return Grapheme {
355- Offset : gr .lastBreak ,
356- Text : gr .src .text [gr .lastBreak :gr .pos ],
431+ Offset : gr .lastBreak ,
432+ OffsetInBytes : gr .lastBreakInBytes ,
433+ LengthInBytes : gr .posInBytes - gr .lastBreakInBytes ,
434+ Text : gr .src .text [gr .lastBreak :gr .pos ],
357435 }
358436}
359437
@@ -377,6 +455,10 @@ type Word struct {
377455 Text []rune
378456 // Offset is the start of the word in the input rune slice
379457 Offset int
458+ // OffsetInBytes is the start of the word in the input, in UTF-8 bytes
459+ OffsetInBytes int
460+ // LengthInBytes is the length of the word in the input, in UTF-8 bytes
461+ LengthInBytes int
380462}
381463
382464type WordIterator struct {
@@ -409,8 +491,10 @@ func (gr *WordIterator) Next() bool {
409491// Word returns the current `Word`:
// its rune offset, its offset and length in UTF-8 bytes, and its text.
// Text is a subslice of the segmenter's internal buffer, not a copy.
410492func (gr * WordIterator ) Word () Word {
411493 return Word {
412- Offset : gr .lastBreak ,
413- Text : gr .src .text [gr .lastBreak :gr .pos ],
494+ Offset : gr .lastBreak ,
495+ OffsetInBytes : gr .lastBreakInBytes ,
496+ LengthInBytes : gr .posInBytes - gr .lastBreakInBytes ,
497+ Text : gr .src .text [gr .lastBreak :gr .pos ],
414498 }
415499}
416500
0 commit comments