Skip to content

Commit 5229631

Browse files
committed
segmenter: add OffsetInBytes and LengthInBytes to Grapheme, Line, and Word
Track UTF-8 byte positions alongside rune positions in the attribute iterator, and expose them as OffsetInBytes and LengthInBytes fields on Grapheme, Line, and Word structs. This allows users to efficiently extract segments from byte slices or strings without O(n) conversion. Fixes #240
1 parent 40da633 commit 5229631

File tree

2 files changed

+174
-12
lines changed

2 files changed

+174
-12
lines changed

segmenter/segmenter.go

Lines changed: 96 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
package segmenter
1414

1515
import (
16+
"unicode/utf8"
17+
1618
ucd "github.com/go-text/typesetting/internal/unicodedata"
1719
)
1820

@@ -225,12 +227,20 @@ type Segmenter struct {
225227
// text : [b, u, l, l]
226228
// attributes : [<start> b, b u, u l, l l, l <end>]
227229
attributes []breakAttr
230+
231+
invalidUTF8Indices map[int]struct{}
228232
}
229233

230234
// Init resets the segmenter storage with the given input,
231235
// and computes the attributes required to segment the text.
236+
//
237+
// If paragraph includes an invalid rune, some outputs like
238+
// [Line.OffsetInBytes] and [Line.LengthInBytes] are undefined.
232239
func (seg *Segmenter) Init(paragraph []rune) {
233240
seg.text = append(seg.text[:0], paragraph...)
241+
for k := range seg.invalidUTF8Indices {
242+
delete(seg.invalidUTF8Indices, k)
243+
}
234244
seg.initAttributes()
235245
}
236246

@@ -241,9 +251,25 @@ func (seg *Segmenter) Init(paragraph []rune) {
241251
//
242252
// InitWithString is more efficient than [Init] if the input is a string.
243253
// No allocation for the text is made if its internal buffer capacity is already large enough.
254+
//
255+
// Invalid UTF-8 sequences in paragraph are replaced with U+FFFD.
244256
func (seg *Segmenter) InitWithString(paragraph string) {
245257
seg.text = seg.text[:0]
246-
for _, r := range paragraph {
258+
for k := range seg.invalidUTF8Indices {
259+
delete(seg.invalidUTF8Indices, k)
260+
}
261+
for i, r := range paragraph {
262+
if r == utf8.RuneError {
263+
// Check whether the rune is actually U+FFFD, or an invalid UTF-8 sequence.
264+
if r, l := utf8.DecodeRuneInString(paragraph[i:]); r == utf8.RuneError && l == 1 {
265+
// The current rune is an invalid UTF-8 sequence.
266+
// Record the index.
267+
if seg.invalidUTF8Indices == nil {
268+
seg.invalidUTF8Indices = make(map[int]struct{})
269+
}
270+
seg.invalidUTF8Indices[i] = struct{}{}
271+
}
272+
}
247273
seg.text = append(seg.text, r)
248274
}
249275
seg.initAttributes()
@@ -256,10 +282,26 @@ func (seg *Segmenter) InitWithString(paragraph string) {
256282
//
257283
// InitWithBytes is more efficient than [Init] if the input is a byte slice.
258284
// No allocation for the text is made if its internal buffer capacity is already large enough.
285+
//
286+
// Invalid UTF-8 sequences in paragraph are replaced with U+FFFD.
259287
func (seg *Segmenter) InitWithBytes(paragraph []byte) {
260288
seg.text = seg.text[:0]
289+
for k := range seg.invalidUTF8Indices {
290+
delete(seg.invalidUTF8Indices, k)
291+
}
261292
// The Go compiler should optimize this without allocating a string.
262-
for _, r := range string(paragraph) {
293+
for i, r := range string(paragraph) {
294+
if r == utf8.RuneError {
295+
// Check whether the rune is actually U+FFFD, or an invalid UTF-8 sequence.
296+
if r, l := utf8.DecodeRune(paragraph[i:]); r == utf8.RuneError && l == 1 {
297+
// The current rune is an invalid UTF-8 sequence.
298+
// Record the index.
299+
if seg.invalidUTF8Indices == nil {
300+
seg.invalidUTF8Indices = make(map[int]struct{})
301+
}
302+
seg.invalidUTF8Indices[i] = struct{}{}
303+
}
304+
}
263305
seg.text = append(seg.text, r)
264306
}
265307
seg.initAttributes()
@@ -273,34 +315,62 @@ func (seg *Segmenter) initAttributes() {
273315
// attributeIterator is a helper type used to
274316
// handle iterating over a slice of runeAttr
275317
type attributeIterator struct {
276-
src *Segmenter
277-
pos int // the current position in the input slice
278-
lastBreak int // the start of the current segment
279-
flag breakAttr // break where this flag is on
318+
src *Segmenter
319+
pos int // the current position in the input slice (in runes)
320+
lastBreak int // the start of the current segment (in runes)
321+
posInBytes int // the current position in the input (in UTF-8 bytes)
322+
lastBreakInBytes int // the start of the current segment (in UTF-8 bytes)
323+
flag breakAttr // break where this flag is on
280324
}
281325

282326
// next returns true if there is still a segment to process,
283327
// and advances the iterator; or return false.
284328
// if returning true, the segment is at [iter.lastBreak:iter.pos]
285329
func (iter *attributeIterator) next() bool {
286330
iter.lastBreak = iter.pos // remember the start of the next segment
287-
iter.pos++
331+
iter.lastBreakInBytes = iter.posInBytes
332+
iter.incrementPos()
288333
for iter.pos <= len(iter.src.text) {
289334
// can we break before i ?
290335
if iter.src.attributes[iter.pos]&iter.flag != 0 {
291336
return true
292337
}
293-
iter.pos++
338+
iter.incrementPos()
294339
}
295340
return false
296341
}
297342

343+
func (iter *attributeIterator) incrementPos() {
344+
// If the current position is an invalid UTF-8 sequence, the byte is likely replaced with U+FFFD.
345+
// Advance the position by 1 byte and 1 rune.
346+
if _, ok := iter.src.invalidUTF8Indices[iter.posInBytes]; ok {
347+
iter.posInBytes++
348+
iter.pos++
349+
return
350+
}
351+
352+
if iter.pos < len(iter.src.text) {
353+
r := iter.src.text[iter.pos]
354+
if l := utf8.RuneLen(r); l > 0 {
355+
iter.posInBytes += l
356+
}
357+
// If l <= 0, it means that the rune is invalid UTF-8.
358+
// There is no correct way to update the byte position.
359+
// This case is treated as an undefined behavior. Just skip it.
360+
}
361+
iter.pos++
362+
}
363+
298364
// Line is the content of a line delimited by the segmenter.
299365
type Line struct {
300366
// Text is a subslice of the original input slice, containing the delimited line
301367
Text []rune
302368
// Offset is the start of the line in the input rune slice
303369
Offset int
370+
// OffsetInBytes is the start of the line in the input, in UTF-8 bytes
371+
OffsetInBytes int
372+
// LengthInBytes is the length of the line in the input, in UTF-8 bytes
373+
LengthInBytes int
304374
// IsMandatoryBreak is true if breaking (at the end of the line)
305375
// is mandatory
306376
IsMandatoryBreak bool
@@ -320,6 +390,8 @@ func (li *LineIterator) Next() bool { return li.next() }
320390
func (li *LineIterator) Line() Line {
321391
return Line{
322392
Offset: li.lastBreak,
393+
OffsetInBytes: li.lastBreakInBytes,
394+
LengthInBytes: li.posInBytes - li.lastBreakInBytes,
323395
Text: li.src.text[li.lastBreak:li.pos], // pos is not included since we break right before
324396
IsMandatoryBreak: li.src.attributes[li.pos]&mandatoryLineBoundary != 0,
325397
}
@@ -337,6 +409,10 @@ type Grapheme struct {
337409
Text []rune
338410
// Offset is the start of the grapheme in the input rune slice
339411
Offset int
412+
// OffsetInBytes is the start of the grapheme in the input, in UTF-8 bytes
413+
OffsetInBytes int
414+
// LengthInBytes is the length of the grapheme in the input, in UTF-8 bytes
415+
LengthInBytes int
340416
}
341417

342418
// GraphemeIterator provides a convenient way of
@@ -352,8 +428,10 @@ func (gr *GraphemeIterator) Next() bool { return gr.next() }
352428
// Grapheme returns the current `Grapheme`
353429
func (gr *GraphemeIterator) Grapheme() Grapheme {
354430
return Grapheme{
355-
Offset: gr.lastBreak,
356-
Text: gr.src.text[gr.lastBreak:gr.pos],
431+
Offset: gr.lastBreak,
432+
OffsetInBytes: gr.lastBreakInBytes,
433+
LengthInBytes: gr.posInBytes - gr.lastBreakInBytes,
434+
Text: gr.src.text[gr.lastBreak:gr.pos],
357435
}
358436
}
359437

@@ -377,6 +455,10 @@ type Word struct {
377455
Text []rune
378456
// Offset is the start of the word in the input rune slice
379457
Offset int
458+
// OffsetInBytes is the start of the word in the input, in UTF-8 bytes
459+
OffsetInBytes int
460+
// LengthInBytes is the length of the word in the input, in UTF-8 bytes
461+
LengthInBytes int
380462
}
381463

382464
type WordIterator struct {
@@ -409,8 +491,10 @@ func (gr *WordIterator) Next() bool {
409491
// Word returns the current `Word`
410492
func (gr *WordIterator) Word() Word {
411493
return Word{
412-
Offset: gr.lastBreak,
413-
Text: gr.src.text[gr.lastBreak:gr.pos],
494+
Offset: gr.lastBreak,
495+
OffsetInBytes: gr.lastBreakInBytes,
496+
LengthInBytes: gr.posInBytes - gr.lastBreakInBytes,
497+
Text: gr.src.text[gr.lastBreak:gr.pos],
414498
}
415499
}
416500

segmenter/segmenter_test.go

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"strconv"
1010
"strings"
1111
"testing"
12+
"unicode/utf8"
1213

1314
tu "github.com/go-text/typesetting/testutils"
1415
)
@@ -229,6 +230,83 @@ func TestWordSegmenter(t *testing.T) {
229230
}
230231
}
231232

233+
func TestBytePositions(t *testing.T) {
234+
tests := []string{
235+
"",
236+
"a",
237+
"Hello World",
238+
"café latte",
239+
"🍣寿司🍣",
240+
"Hi 🧑‍🧒‍🧒 there", // Emoji with zero-width joiner
241+
"This is a test.\ncafé\n🍣寿司🍣",
242+
"aaa\xffbbb", // Invalid UTF-8
243+
"aaa\xff\xffbbb", // Invalid UTF-8
244+
"aaa\ufffdbbb", // U+FFFD (Replacement Character)
245+
}
246+
247+
var seg Segmenter
248+
initSeg := func(seg *Segmenter, mode initMode, input string) {
249+
switch mode {
250+
case initModeRunes:
251+
seg.Init([]rune(input))
252+
case initModeString:
253+
seg.InitWithString(input)
254+
case initModeBytes:
255+
seg.InitWithBytes([]byte(input))
256+
}
257+
}
258+
259+
for mode := initMode(0); mode < initModeMax; mode++ {
260+
for _, input := range tests {
261+
if mode == initModeRunes && !utf8.ValidString(input) {
262+
// If the input is not valid UTF-8, converting it to []rune
263+
// replaces invalid UTF-8 sequences with the replacement character.
264+
// There is no correct way to handle this case.
265+
continue
266+
}
267+
268+
// Test GraphemeIterator byte positions.
269+
initSeg(&seg, mode, input)
270+
iter := seg.GraphemeIterator()
271+
for iter.Next() {
272+
g := iter.Grapheme()
273+
got := []rune(input[g.OffsetInBytes : g.OffsetInBytes+g.LengthInBytes])
274+
expected := g.Text
275+
if !reflect.DeepEqual(got, expected) {
276+
t.Errorf("grapheme: input=%q mode=%d: byte slice %q != rune text %q (offset=%d, offsetInBytes=%d, lengthInBytes=%d)",
277+
input, mode, got, expected, g.Offset, g.OffsetInBytes, g.LengthInBytes)
278+
}
279+
}
280+
281+
// Test LineIterator byte positions.
282+
initSeg(&seg, mode, input)
283+
lineIter := seg.LineIterator()
284+
for lineIter.Next() {
285+
l := lineIter.Line()
286+
got := []rune(input[l.OffsetInBytes : l.OffsetInBytes+l.LengthInBytes])
287+
expected := l.Text
288+
if !reflect.DeepEqual(got, expected) {
289+
t.Errorf("line: input=%q mode=%d: byte slice %q != rune text %q (offset=%d, offsetInBytes=%d, lengthInBytes=%d)",
290+
input, mode, got, expected, l.Offset, l.OffsetInBytes, l.LengthInBytes)
291+
}
292+
}
293+
294+
// Test WordIterator byte positions.
295+
initSeg(&seg, mode, input)
296+
wordIter := seg.WordIterator()
297+
for wordIter.Next() {
298+
w := wordIter.Word()
299+
got := []rune(input[w.OffsetInBytes : w.OffsetInBytes+w.LengthInBytes])
300+
expected := w.Text
301+
if !reflect.DeepEqual(got, expected) {
302+
t.Errorf("word: input=%q mode=%d: byte slice %q != rune text %q (offset=%d, offsetInBytes=%d, lengthInBytes=%d)",
303+
input, mode, got, expected, w.Offset, w.OffsetInBytes, w.LengthInBytes)
304+
}
305+
}
306+
}
307+
}
308+
}
309+
232310
func lineSegmentCount(s *Segmenter, input []rune) int {
233311
s.Init(input)
234312
iter := s.LineIterator()

0 commit comments

Comments
 (0)