go-text · benoitkugler · Mar 9, 2026 · Mar 4, 2026
@@ -13,6 +13,9 @@
 package segmenter
 
 import (
+	"fmt"
+	"unicode/utf8"
+
 	ucd "github.com/go-text/typesetting/internal/unicodedata"
 )
 
@@ -229,6 +232,9 @@ type Segmenter struct {
 
 // Init resets the segmenter storage with the given input,
 // and computes the attributes required to segment the text.
+//
+// If paragraph contains an invalid rune (for example, a value that is out of range),
+// some outputs, such as [Line.OffsetInBytes] and [Line.LengthInBytes], are undefined.
 func (seg *Segmenter) Init(paragraph []rune) {
 	seg.text = append(seg.text[:0], paragraph...)
 	seg.initAttributes()
@@ -237,32 +243,60 @@ func (seg *Segmenter) Init(paragraph []rune) {
 // InitWithString resets the segmenter storage with the given string input,
 // and computes the attributes required to segment the text.
 //
-// If paragraph includes an invalid UTF-8 sequence, these are replaced with U+FFFD.
+// InitWithString returns an error if paragraph includes an invalid UTF-8 sequence.
 //
 // InitWithString is more efficient than [Init] if the input is a string.
 // No allocation for the text is made if its internal buffer capacity is already large enough.
-func (seg *Segmenter) InitWithString(paragraph string) {
+func (seg *Segmenter) InitWithString(paragraph string) (err error) {
+	defer func() {
+		if err != nil {
+			seg.text = seg.text[:0]
+			seg.attributes = seg.attributes[:0]
+		}
+	}()
+
 	seg.text = seg.text[:0]
-	for _, r := range paragraph {
+	for i, r := range paragraph {
+		if r == utf8.RuneError {
+			// Check whether the rune is actually U+FFFD, or an invalid UTF-8 sequence.
+			if r, l := utf8.DecodeRuneInString(paragraph[i:]); r == utf8.RuneError && l == 1 {
+				return fmt.Errorf("invalid UTF-8 sequence at index %d", i)
+			}
+		}
 		seg.text = append(seg.text, r)
 	}
 	seg.initAttributes()
+	return nil
 }
 
 // InitWithBytes resets the segmenter storage with the given byte slice input,
 // and computes the attributes required to segment the text.
 //
-// If paragraph includes an invalid UTF-8 sequence, these are replaced with U+FFFD.
+// InitWithBytes returns an error if paragraph includes an invalid UTF-8 sequence.
 //
 // InitWithBytes is more efficient than [Init] if the input is a byte slice.
 // No allocation for the text is made if its internal buffer capacity is already large enough.
-func (seg *Segmenter) InitWithBytes(paragraph []byte) {
+func (seg *Segmenter) InitWithBytes(paragraph []byte) (err error) {
+	defer func() {
+		if err != nil {
+			seg.text = seg.text[:0]
+			seg.attributes = seg.attributes[:0]
+		}
+	}()
+
 	seg.text = seg.text[:0]
 	// The Go compiler should optimize this without allocating a string.
-	for _, r := range string(paragraph) {
+	for i, r := range string(paragraph) {
+		if r == utf8.RuneError {
+			// Check whether the rune is actually U+FFFD, or an invalid UTF-8 sequence.
+			if r, l := utf8.DecodeRune(paragraph[i:]); r == utf8.RuneError && l == 1 {
+				return fmt.Errorf("invalid UTF-8 sequence at index %d", i)
+			}
+		}
 		seg.text = append(seg.text, r)
 	}
 	seg.initAttributes()
+	return nil
 }
 
 func (seg *Segmenter) initAttributes() {
@@ -273,34 +307,54 @@ func (seg *Segmenter) initAttributes() {
 // attributeIterator is an helper type used to
 // handle iterating over a slice of runeAttr
 type attributeIterator struct {
-	src       *Segmenter
-	pos       int       // the current position in the input slice
-	lastBreak int       // the start of the current segment
-	flag      breakAttr // break where this flag is on
+	src              *Segmenter
+	pos              int       // the current position in the input slice (in runes)
+	lastBreak        int       // the start of the current segment (in runes)
+	posInBytes       int       // the current position in the input (in UTF-8 bytes)
+	lastBreakInBytes int       // the start of the current segment (in UTF-8 bytes)
+	flag             breakAttr // break where this flag is on
 }
 
 // next returns true if there is still a segment to process,
 // and advances the iterator; or return false.
 // if returning true, the segment is at [iter.lastBreak:iter.pos]
 func (iter *attributeIterator) next() bool {
 	iter.lastBreak = iter.pos // remember the start of the next segment
-	iter.pos++
+	iter.lastBreakInBytes = iter.posInBytes
+	iter.incrementPos()
 	for iter.pos <= len(iter.src.text) {
 		// can we break before i ?
 		if iter.src.attributes[iter.pos]&iter.flag != 0 {
 			return true
 		}
-		iter.pos++
+		iter.incrementPos()
 	}
 	return false
 }
 
+func (iter *attributeIterator) incrementPos() {
+	if iter.pos < len(iter.src.text) {
+		r := iter.src.text[iter.pos]
+		if l := utf8.RuneLen(r); l > 0 {
+			iter.posInBytes += l
+		}
+		// If l <= 0, the rune is an invalid code point (for example, out of range).
+		// There is no correct way to update the byte position.
+		// This case is treated as undefined behavior, so the byte position is left unchanged.
+	}
+	iter.pos++
+}
+
 // Line is the content of a line delimited by the segmenter.
 type Line struct {
 	// Text is a subslice of the original input slice, containing the delimited line
 	Text []rune
 	// Offset is the start of the line in the input rune slice
 	Offset int
+	// OffsetInBytes is the start of the line in the input, in UTF-8 bytes
+	OffsetInBytes int
+	// LengthInBytes is the length of the line in the input, in UTF-8 bytes
+	LengthInBytes int
 	// IsMandatoryBreak is true if breaking (at the end of the line)
 	// is mandatory
 	IsMandatoryBreak bool
@@ -320,6 +374,8 @@ func (li *LineIterator) Next() bool { return li.next() }
 func (li *LineIterator) Line() Line {
 	return Line{
 		Offset:           li.lastBreak,
+		OffsetInBytes:    li.lastBreakInBytes,
+		LengthInBytes:    li.posInBytes - li.lastBreakInBytes,
 		Text:             li.src.text[li.lastBreak:li.pos], // pos is not included since we break right before
 		IsMandatoryBreak: li.src.attributes[li.pos]&mandatoryLineBoundary != 0,
 	}
@@ -337,6 +393,10 @@ type Grapheme struct {
 	Text []rune
 	// Offset is the start of the grapheme in the input rune slice
 	Offset int
+	// OffsetInBytes is the start of the grapheme in the input, in UTF-8 bytes
+	OffsetInBytes int
+	// LengthInBytes is the length of the grapheme in the input, in UTF-8 bytes
+	LengthInBytes int
 }
 
 // GraphemeIterator provides a convenient way of
@@ -352,8 +412,10 @@ func (gr *GraphemeIterator) Next() bool { return gr.next() }
 // Grapheme returns the current `Grapheme`
 func (gr *GraphemeIterator) Grapheme() Grapheme {
 	return Grapheme{
-		Offset: gr.lastBreak,
-		Text:   gr.src.text[gr.lastBreak:gr.pos],
+		Offset:        gr.lastBreak,
+		OffsetInBytes: gr.lastBreakInBytes,
+		LengthInBytes: gr.posInBytes - gr.lastBreakInBytes,
+		Text:          gr.src.text[gr.lastBreak:gr.pos],
 	}
 }
 
@@ -377,6 +439,10 @@ type Word struct {
 	Text []rune
 	// Offset is the start of the word in the input rune slice
 	Offset int
+	// OffsetInBytes is the start of the word in the input, in UTF-8 bytes
+	OffsetInBytes int
+	// LengthInBytes is the length of the word in the input, in UTF-8 bytes
+	LengthInBytes int
 }
 
 type WordIterator struct {
@@ -409,8 +475,10 @@ func (gr *WordIterator) Next() bool {
 // Word returns the current `Word`
 func (gr *WordIterator) Word() Word {
 	return Word{
-		Offset: gr.lastBreak,
-		Text:   gr.src.text[gr.lastBreak:gr.pos],
+		Offset:        gr.lastBreak,
+		OffsetInBytes: gr.lastBreakInBytes,
+		LengthInBytes: gr.posInBytes - gr.lastBreakInBytes,
+		Text:          gr.src.text[gr.lastBreak:gr.pos],
 	}
 }
 

@@ -87,9 +87,13 @@ func TestLineBreakUnicodeReference(t *testing.T) {
 			case initModeRunes:
 				seg1.Init(text)
 			case initModeString:
-				seg1.InitWithString(s)
+				if err := seg1.InitWithString(s); err != nil {
+					t.Error(err)
+				}
 			case initModeBytes:
-				seg1.InitWithBytes([]byte(s))
+				if err := seg1.InitWithBytes([]byte(s)); err != nil {
+					t.Error(err)
+				}
 			}
 			actualSegments := collectLineBreaks(&seg1)
 			if !reflect.DeepEqual(expectedSegments, actualSegments) {
@@ -158,9 +162,13 @@ func TestGraphemeBreakUnicodeReference(t *testing.T) {
 			case initModeRunes:
 				seg1.Init(text)
 			case initModeString:
-				seg1.InitWithString(s)
+				if err := seg1.InitWithString(s); err != nil {
+					t.Error(err)
+				}
 			case initModeBytes:
-				seg1.InitWithBytes([]byte(s))
+				if err := seg1.InitWithBytes([]byte(s)); err != nil {
+					t.Error(err)
+				}
 			}
 			actualSegments := collectGraphemes(&seg1)
 			if !reflect.DeepEqual(expectedSegments, actualSegments) {
@@ -190,9 +198,13 @@ func TestWordBreakUnicodeReference(t *testing.T) {
 			case initModeRunes:
 				seg1.Init(text)
 			case initModeString:
-				seg1.InitWithString(string(s))
+				if err := seg1.InitWithString(string(s)); err != nil {
+					t.Error(err)
+				}
 			case initModeBytes:
-				seg1.InitWithBytes([]byte(string(s)))
+				if err := seg1.InitWithBytes([]byte(string(s))); err != nil {
+					t.Error(err)
+				}
 			}
 			actualBoundaries := collectWordBoundaries(&seg1)
 			if !reflect.DeepEqual(expectedBoundaries, actualBoundaries) {
@@ -217,9 +229,13 @@ func TestWordSegmenter(t *testing.T) {
 			case initModeRunes:
 				seg.Init([]rune(test.input))
 			case initModeString:
-				seg.InitWithString(test.input)
+				if err := seg.InitWithString(test.input); err != nil {
+					t.Error(err)
+				}
 			case initModeBytes:
-				seg.InitWithBytes([]byte(test.input))
+				if err := seg.InitWithBytes([]byte(test.input)); err != nil {
+					t.Error(err)
+				}
 			}
 			got := collectWords(&seg)
 			if !reflect.DeepEqual(test.words, got) {
@@ -229,6 +245,78 @@ func TestWordSegmenter(t *testing.T) {
 	}
 }
 
+func TestBytePositions(t *testing.T) {
+	tests := []string{
+		"",
+		"a",
+		"Hello World",
+		"café latte",
+		"🍣寿司🍣",
+		"Hi 🧑‍🧒‍🧒 there", // Emoji with zero-width joiner
+		"This is a test.\ncafé\n🍣寿司🍣",
+		"aaa\ufffdbbb", // U+FFFD (Replacement Character)
+	}
+
+	var seg Segmenter
+	initSeg := func(seg *Segmenter, mode initMode, input string) {
+		switch mode {
+		case initModeRunes:
+			seg.Init([]rune(input))
+		case initModeString:
+			if err := seg.InitWithString(input); err != nil {
+				t.Error(err)
+			}
+		case initModeBytes:
+			if err := seg.InitWithBytes([]byte(input)); err != nil {
+				t.Error(err)
+			}
+		}
+	}
+
+	for mode := initMode(0); mode < initModeMax; mode++ {
+		for _, input := range tests {
+			// Test GraphemeIterator byte positions.
+			initSeg(&seg, mode, input)
+			iter := seg.GraphemeIterator()
+			for iter.Next() {
+				g := iter.Grapheme()
+				got := []rune(input[g.OffsetInBytes : g.OffsetInBytes+g.LengthInBytes])
+				expected := g.Text
+				if !reflect.DeepEqual(got, expected) {
+					t.Errorf("grapheme: input=%q mode=%d: byte slice %q != rune text %q (offset=%d, offsetInBytes=%d, lengthInBytes=%d)",
+						input, mode, got, expected, g.Offset, g.OffsetInBytes, g.LengthInBytes)
+				}
+			}
+
+			// Test LineIterator byte positions.
+			initSeg(&seg, mode, input)
+			lineIter := seg.LineIterator()
+			for lineIter.Next() {
+				l := lineIter.Line()
+				got := []rune(input[l.OffsetInBytes : l.OffsetInBytes+l.LengthInBytes])
+				expected := l.Text
+				if !reflect.DeepEqual(got, expected) {
+					t.Errorf("line: input=%q mode=%d: byte slice %q != rune text %q (offset=%d, offsetInBytes=%d, lengthInBytes=%d)",
+						input, mode, got, expected, l.Offset, l.OffsetInBytes, l.LengthInBytes)
+				}
+			}
+
+			// Test WordIterator byte positions.
+			initSeg(&seg, mode, input)
+			wordIter := seg.WordIterator()
+			for wordIter.Next() {
+				w := wordIter.Word()
+				got := []rune(input[w.OffsetInBytes : w.OffsetInBytes+w.LengthInBytes])
+				expected := w.Text
+				if !reflect.DeepEqual(got, expected) {
+					t.Errorf("word: input=%q mode=%d: byte slice %q != rune text %q (offset=%d, offsetInBytes=%d, lengthInBytes=%d)",
+						input, mode, got, expected, w.Offset, w.OffsetInBytes, w.LengthInBytes)
+				}
+			}
+		}
+	}
+}
+
 func lineSegmentCount(s *Segmenter, input []rune) int {
 	s.Init(input)
 	iter := s.LineIterator()