segmenter: add OffsetInBytes and LengthInBytes to Grapheme, Line, and Word

hajimehoshi · hajimehoshi · commit 5229631f7d83 · 2026-03-05T02:53:50.000+09:00
Track UTF-8 byte positions alongside rune positions in the attribute iterator, and expose them as OffsetInBytes and LengthInBytes fields on Grapheme, Line, and Word structs. This allows users to efficiently extract segments from byte slices or strings without O(n) conversion. Fixes #240
diff --git a/segmenter/segmenter.go b/segmenter/segmenter.go
@@ -13,6 +13,8 @@
 package segmenter
 
 import (
+	"unicode/utf8"
+
 	ucd "github.com/go-text/typesetting/internal/unicodedata"
 )
 
@@ -225,12 +227,20 @@ type Segmenter struct {
 	// 	text : 			[b, 		u, 	l, 	l]
 	// 	attributes :	[<start> b, b u, u l, l l, l <end>]
 	attributes []breakAttr
+
+	invalidUTF8Indices map[int]struct{}
 }
 
 // Init resets the segmenter storage with the given input,
 // and computes the attributes required to segment the text.
+//
+// If paragraph includes an invalid rune, some outputs like
+// [Line.OffsetInBytes] and [Line.LengthInBytes] are undefined.
 func (seg *Segmenter) Init(paragraph []rune) {
 	seg.text = append(seg.text[:0], paragraph...)
+	for k := range seg.invalidUTF8Indices {
+		delete(seg.invalidUTF8Indices, k)
+	}
 	seg.initAttributes()
 }
 
@@ -241,9 +251,25 @@ func (seg *Segmenter) Init(paragraph []rune) {
 //
 // InitWithString is more efficient than [Init] if the input is a string.
 // No allocation for the text is made if its internal buffer capacity is already large enough.
+//
+// Invalid UTF-8 sequences in paragraph are replaced with U+FFFD.
 func (seg *Segmenter) InitWithString(paragraph string) {
 	seg.text = seg.text[:0]
-	for _, r := range paragraph {
+	for k := range seg.invalidUTF8Indices {
+		delete(seg.invalidUTF8Indices, k)
+	}
+	for i, r := range paragraph {
+		if r == utf8.RuneError {
+			// Check whether the rune is actually U+FFFD, or an invalid UTF-8 sequence.
+			if r, l := utf8.DecodeRuneInString(paragraph[i:]); r == utf8.RuneError && l == 1 {
+				// The current rune is an invalid UTF-8 sequence.
+				// Record the index.
+				if seg.invalidUTF8Indices == nil {
+					seg.invalidUTF8Indices = make(map[int]struct{})
+				}
+				seg.invalidUTF8Indices[i] = struct{}{}
+			}
+		}
 		seg.text = append(seg.text, r)
 	}
 	seg.initAttributes()
@@ -256,10 +282,26 @@ func (seg *Segmenter) InitWithString(paragraph string) {
 //
 // InitWithBytes is more efficient than [Init] if the input is a byte slice.
 // No allocation for the text is made if its internal buffer capacity is already large enough.
+//
+// Invalid UTF-8 sequences in paragraph are replaced with U+FFFD.
 func (seg *Segmenter) InitWithBytes(paragraph []byte) {
 	seg.text = seg.text[:0]
+	for k := range seg.invalidUTF8Indices {
+		delete(seg.invalidUTF8Indices, k)
+	}
 	// The Go compiler should optimize this without allocating a string.
-	for _, r := range string(paragraph) {
+	for i, r := range string(paragraph) {
+		if r == utf8.RuneError {
+			// Check whether the rune is actually U+FFFD, or an invalid UTF-8 sequence.
+			if r, l := utf8.DecodeRune(paragraph[i:]); r == utf8.RuneError && l == 1 {
+				// The current rune is an invalid UTF-8 sequence.
+				// Record the index.
+				if seg.invalidUTF8Indices == nil {
+					seg.invalidUTF8Indices = make(map[int]struct{})
+				}
+				seg.invalidUTF8Indices[i] = struct{}{}
+			}
+		}
 		seg.text = append(seg.text, r)
 	}
 	seg.initAttributes()
@@ -273,34 +315,62 @@ func (seg *Segmenter) initAttributes() {
 // attributeIterator is an helper type used to
 // handle iterating over a slice of runeAttr
 type attributeIterator struct {
-	src       *Segmenter
-	pos       int       // the current position in the input slice
-	lastBreak int       // the start of the current segment
-	flag      breakAttr // break where this flag is on
+	src              *Segmenter
+	pos              int       // the current position in the input slice (in runes)
+	lastBreak        int       // the start of the current segment (in runes)
+	posInBytes       int       // the current position in the input (in UTF-8 bytes)
+	lastBreakInBytes int       // the start of the current segment (in UTF-8 bytes)
+	flag             breakAttr // break where this flag is on
 }
 
 // next returns true if there is still a segment to process,
 // and advances the iterator; or return false.
 // if returning true, the segment is at [iter.lastBreak:iter.pos]
 func (iter *attributeIterator) next() bool {
 	iter.lastBreak = iter.pos // remember the start of the next segment
-	iter.pos++
+	iter.lastBreakInBytes = iter.posInBytes
+	iter.incrementPos()
 	for iter.pos <= len(iter.src.text) {
 		// can we break before i ?
 		if iter.src.attributes[iter.pos]&iter.flag != 0 {
 			return true
 		}
-		iter.pos++
+		iter.incrementPos()
 	}
 	return false
 }
 
+func (iter *attributeIterator) incrementPos() {
+	// If the current byte position was recorded as an invalid UTF-8 sequence, that single byte was replaced with U+FFFD in the text.
+	// Advance the position by 1 byte (not the encoded length of U+FFFD) and 1 rune.
+	if _, ok := iter.src.invalidUTF8Indices[iter.posInBytes]; ok {
+		iter.posInBytes++
+		iter.pos++
+		return
+	}
+
+	if iter.pos < len(iter.src.text) {
+		r := iter.src.text[iter.pos]
+		if l := utf8.RuneLen(r); l > 0 {
+			iter.posInBytes += l
+		}
+		// If l <= 0, the rune is not a valid value to encode in UTF-8.
+		// There is no correct way to update the byte position.
+		// This case is treated as undefined behavior. Just skip it.
+	}
+	iter.pos++
+}
+
 // Line is the content of a line delimited by the segmenter.
 type Line struct {
 	// Text is a subslice of the original input slice, containing the delimited line
 	Text []rune
 	// Offset is the start of the line in the input rune slice
 	Offset int
+	// OffsetInBytes is the start of the line in the input, in UTF-8 bytes
+	OffsetInBytes int
+	// LengthInBytes is the length of the line in the input, in UTF-8 bytes
+	LengthInBytes int
 	// IsMandatoryBreak is true if breaking (at the end of the line)
 	// is mandatory
 	IsMandatoryBreak bool
@@ -320,6 +390,8 @@ func (li *LineIterator) Next() bool { return li.next() }
 func (li *LineIterator) Line() Line {
 	return Line{
 		Offset:           li.lastBreak,
+		OffsetInBytes:    li.lastBreakInBytes,
+		LengthInBytes:    li.posInBytes - li.lastBreakInBytes,
 		Text:             li.src.text[li.lastBreak:li.pos], // pos is not included since we break right before
 		IsMandatoryBreak: li.src.attributes[li.pos]&mandatoryLineBoundary != 0,
 	}
@@ -337,6 +409,10 @@ type Grapheme struct {
 	Text []rune
 	// Offset is the start of the grapheme in the input rune slice
 	Offset int
+	// OffsetInBytes is the start of the grapheme in the input, in UTF-8 bytes
+	OffsetInBytes int
+	// LengthInBytes is the length of the grapheme in the input, in UTF-8 bytes
+	LengthInBytes int
 }
 
 // GraphemeIterator provides a convenient way of
@@ -352,8 +428,10 @@ func (gr *GraphemeIterator) Next() bool { return gr.next() }
 // Grapheme returns the current `Grapheme`
 func (gr *GraphemeIterator) Grapheme() Grapheme {
 	return Grapheme{
-		Offset: gr.lastBreak,
-		Text:   gr.src.text[gr.lastBreak:gr.pos],
+		Offset:        gr.lastBreak,
+		OffsetInBytes: gr.lastBreakInBytes,
+		LengthInBytes: gr.posInBytes - gr.lastBreakInBytes,
+		Text:          gr.src.text[gr.lastBreak:gr.pos],
 	}
 }
 
@@ -377,6 +455,10 @@ type Word struct {
 	Text []rune
 	// Offset is the start of the word in the input rune slice
 	Offset int
+	// OffsetInBytes is the start of the word in the input, in UTF-8 bytes
+	OffsetInBytes int
+	// LengthInBytes is the length of the word in the input, in UTF-8 bytes
+	LengthInBytes int
 }
 
 type WordIterator struct {
@@ -409,8 +491,10 @@ func (gr *WordIterator) Next() bool {
 // Word returns the current `Word`
 func (gr *WordIterator) Word() Word {
 	return Word{
-		Offset: gr.lastBreak,
-		Text:   gr.src.text[gr.lastBreak:gr.pos],
+		Offset:        gr.lastBreak,
+		OffsetInBytes: gr.lastBreakInBytes,
+		LengthInBytes: gr.posInBytes - gr.lastBreakInBytes,
+		Text:          gr.src.text[gr.lastBreak:gr.pos],
 	}
 }
 
diff --git a/segmenter/segmenter_test.go b/segmenter/segmenter_test.go
@@ -9,6 +9,7 @@ import (
 	"strconv"
 	"strings"
 	"testing"
+	"unicode/utf8"
 
 	tu "github.com/go-text/typesetting/testutils"
 )
@@ -229,6 +230,83 @@ func TestWordSegmenter(t *testing.T) {
 	}
 }
 
+func TestBytePositions(t *testing.T) {
+	tests := []string{
+		"",
+		"a",
+		"Hello World",
+		"café latte",
+		"🍣寿司🍣",
+		"Hi 🧑‍🧒‍🧒 there", // Emoji with zero-width joiner
+		"This is a test.\ncafé\n🍣寿司🍣",
+		"aaa\xffbbb",     // Invalid UTF-8
+		"aaa\xff\xffbbb", // Invalid UTF-8
+		"aaa\ufffdbbb",   // U+FFFD (Replacement Character)
+	}
+
+	var seg Segmenter
+	initSeg := func(seg *Segmenter, mode initMode, input string) {
+		switch mode {
+		case initModeRunes:
+			seg.Init([]rune(input))
+		case initModeString:
+			seg.InitWithString(input)
+		case initModeBytes:
+			seg.InitWithBytes([]byte(input))
+		}
+	}
+
+	for mode := initMode(0); mode < initModeMax; mode++ {
+		for _, input := range tests {
+			if mode == initModeRunes && !utf8.ValidString(input) {
+				// If the input is not valid UTF-8, converting it to []rune
+				// replaces invalid UTF-8 sequences with the replacement character.
+				// There is no correct way to handle this case.
+				continue
+			}
+
+			// Test GraphemeIterator byte positions.
+			initSeg(&seg, mode, input)
+			iter := seg.GraphemeIterator()
+			for iter.Next() {
+				g := iter.Grapheme()
+				got := []rune(input[g.OffsetInBytes : g.OffsetInBytes+g.LengthInBytes])
+				expected := g.Text
+				if !reflect.DeepEqual(got, expected) {
+					t.Errorf("grapheme: input=%q mode=%d: byte slice %q != rune text %q (offset=%d, offsetInBytes=%d, lengthInBytes=%d)",
+						input, mode, got, expected, g.Offset, g.OffsetInBytes, g.LengthInBytes)
+				}
+			}
+
+			// Test LineIterator byte positions.
+			initSeg(&seg, mode, input)
+			lineIter := seg.LineIterator()
+			for lineIter.Next() {
+				l := lineIter.Line()
+				got := []rune(input[l.OffsetInBytes : l.OffsetInBytes+l.LengthInBytes])
+				expected := l.Text
+				if !reflect.DeepEqual(got, expected) {
+					t.Errorf("line: input=%q mode=%d: byte slice %q != rune text %q (offset=%d, offsetInBytes=%d, lengthInBytes=%d)",
+						input, mode, got, expected, l.Offset, l.OffsetInBytes, l.LengthInBytes)
+				}
+			}
+
+			// Test WordIterator byte positions.
+			initSeg(&seg, mode, input)
+			wordIter := seg.WordIterator()
+			for wordIter.Next() {
+				w := wordIter.Word()
+				got := []rune(input[w.OffsetInBytes : w.OffsetInBytes+w.LengthInBytes])
+				expected := w.Text
+				if !reflect.DeepEqual(got, expected) {
+					t.Errorf("word: input=%q mode=%d: byte slice %q != rune text %q (offset=%d, offsetInBytes=%d, lengthInBytes=%d)",
+						input, mode, got, expected, w.Offset, w.OffsetInBytes, w.LengthInBytes)
+				}
+			}
+		}
+	}
+}
+
 func lineSegmentCount(s *Segmenter, input []rune) int {
 	s.Init(input)
 	iter := s.LineIterator()