@@ -454,8 +454,7 @@ Package llama3 implements the Llama 3 tokenizer in Go. This file contains the pu
454454 - [ func \( t \* Tokenizer\) EncodeBPE\( pretoken string\) \[\] int] ( < #Tokenizer.EncodeBPE > )
455455 - [ func \( t \* Tokenizer\) EncodeBytes\( data \[\] byte, opts \* EncodeOptions\) \[\] int] ( < #Tokenizer.EncodeBytes > )
456456 - [ func \( t \* Tokenizer\) GetSpecialTokenID\( token string\) \( int, error\) ] ( < #Tokenizer.GetSpecialTokenID > )
457- - [ func \( t \* Tokenizer\) NewScanner\( r io.Reader\) Scanner] ( < #Tokenizer.NewScanner > )
458- - [ func \( t \* Tokenizer\) NewScannerOptions\( r io.Reader, opts ...ScannerOption\) Scanner] ( < #Tokenizer.NewScannerOptions > )
457+ - [ func \( t \* Tokenizer\) NewScanner\( r io.Reader, opts ...ScannerOption\) Scanner] ( < #Tokenizer.NewScanner > )
459458 - [ func \( t \* Tokenizer\) OptimisticCount\( text string\) int] ( < #Tokenizer.OptimisticCount > )
460459 - [ func \( t \* Tokenizer\) PreTokenize\( text string\) \[\] string] ( < #Tokenizer.PreTokenize > )
461460 - [ func \( t \* Tokenizer\) Process\( r io.Reader, w io.Writer\) \( int64, error\) ] ( < #Tokenizer.Process > )
@@ -469,25 +468,25 @@ Package llama3 implements the Llama 3 tokenizer in Go. This file contains the pu
469468
470469## Variables
471470
472- <a name =" ErrDataNotFound " ></a >Common errors
471+ <a name =" ErrDataNotFound " ></a >Common errors.
473472
474473``` go
475474var (
476- // ErrDataNotFound indicates that the tokenizer data files could not be found
475+ // ErrDataNotFound indicates that the tokenizer data files could not be found.
477476 ErrDataNotFound = errors.New (" tokenizer data not found" )
478477
479- // ErrInvalidToken indicates an invalid token was provided
478+ // ErrInvalidToken indicates an invalid token was provided.
480479 ErrInvalidToken = errors.New (" invalid token" )
481480
482- // ErrTokenNotFound indicates a token was not found in the vocabulary
481+ // ErrTokenNotFound indicates a token was not found in the vocabulary.
483482 ErrTokenNotFound = errors.New (" token not found" )
484483
485- // ErrInvalidTokenID indicates an invalid token ID was provided
484+ // ErrInvalidTokenID indicates an invalid token ID was provided.
486485 ErrInvalidTokenID = errors.New (" invalid token ID" )
487486)
488487```
489488
490- <a name =" WithBufferSize " ></a >Scanner option functions \- these are re\- exported from the scanner package
489+ <a name =" WithBufferSize " ></a >Scanner option functions \- these are re\- exported from the scanner package.
491490
492491``` go
493492var (
@@ -517,7 +516,7 @@ var (
517516func NewConfigError (field string , value any , err error ) error
518517```
519518
520- NewConfigError creates a new ConfigError
519+ NewConfigError creates a new ConfigError.
521520
522521<a name="NewDataError"></a>
523522## func [NewDataError](<https://github.com/agentstation/tokenizer/blob/master/llama3/errors.go#L81>)
@@ -526,7 +525,7 @@ NewConfigError creates a new ConfigError
526525func NewDataError(op, path string, err error) error
527526```
528527
529- NewDataError creates a new DataError
528+ NewDataError creates a new DataError.
530529
531530<a name="NewTokenError"></a>
532531## func [NewTokenError](<https://github.com/agentstation/tokenizer/blob/master/llama3/errors.go#L86>)
@@ -535,7 +534,7 @@ NewDataError creates a new DataError
535534func NewTokenError(op, token string, err error) error
536535```
537536
538- NewTokenError creates a new TokenError
537+ NewTokenError creates a new TokenError.
539538
540539<a name="NewTokenIDError"></a>
541540## func [NewTokenIDError](<https://github.com/agentstation/tokenizer/blob/master/llama3/errors.go#L91>)
@@ -544,7 +543,7 @@ NewTokenError creates a new TokenError
544543func NewTokenIDError(op string, tokenID int, err error) error
545544```
546545
547- NewTokenIDError creates a new TokenError with a token ID
546+ NewTokenIDError creates a new TokenError with a token ID.
548547
549548<a name="BPE"></a>
550549## type [BPE](<https://github.com/agentstation/tokenizer/blob/master/llama3/tokenizer.go#L80-L84>)
@@ -583,7 +582,7 @@ type Cache interface {
583582<a name =" ConfigError " ></a >
584583## type [ ConfigError] ( < https://github.com/agentstation/tokenizer/blob/master/llama3/errors.go#L64-L68 > )
585584
586- ConfigError represents an error in tokenizer configuration
585+ ConfigError represents an error in tokenizer configuration.
587586
588587``` go
589588type ConfigError struct {
@@ -614,7 +613,7 @@ func (e *ConfigError) Unwrap() error
614613<a name="DataError"></a>
615614## type [DataError](<https://github.com/agentstation/tokenizer/blob/master/llama3/errors.go#L24-L28>)
616615
617- DataError represents an error related to tokenizer data loading or processing
616+ DataError represents an error related to tokenizer data loading or processing.
618617
619618```go
620619type DataError struct {
@@ -809,7 +808,7 @@ type ScannerOption = scanner.Option
809808<a name =" TokenError " ></a >
810809## type [ TokenError] ( < https://github.com/agentstation/tokenizer/blob/master/llama3/errors.go#L42-L47 > )
811810
812- TokenError represents an error related to token operations
811+ TokenError represents an error related to token operations.
813812
814813``` go
815814type TokenError struct {
@@ -1082,22 +1081,13 @@ func main() {
10821081</details >
10831082
10841083<a name =" Tokenizer.NewScanner " ></a >
1085- ### func \(\* Tokenizer\) [ NewScanner] ( < https://github.com/agentstation/tokenizer/blob/master/llama3/scanner.go#L65 > )
1084+ ### func \(\* Tokenizer\) [ NewScanner] ( < https://github.com/agentstation/tokenizer/blob/master/llama3/scanner.go#L67 > )
10861085
10871086``` go
1088- func (t *Tokenizer ) NewScanner (r io .Reader ) Scanner
1087+ func (t *Tokenizer ) NewScanner (r io .Reader , opts ... ScannerOption ) Scanner
10891088```
10901089
1091- NewScanner creates a scanner for streaming tokenization with default options.
1092-
1093- <a name="Tokenizer.NewScannerOptions"></a>
1094- ### func \(\*Tokenizer\) [NewScannerOptions](<https://github.com/agentstation/tokenizer/blob/master/llama3/scanner.go#L70>)
1095-
1096- ```go
1097- func (t *Tokenizer) NewScannerOptions(r io.Reader, opts ...ScannerOption) Scanner
1098- ```
1099-
1100- NewScannerOptions creates a scanner with custom options.
1090+ NewScanner creates a scanner for streaming tokenization. The scanner processes input with bounded memory usage, making it suitable for large files or continuous streams.
11011091
11021092<a name="Tokenizer.OptimisticCount"></a>
11031093### func \(\*Tokenizer\) [OptimisticCount](<https://github.com/agentstation/tokenizer/blob/master/llama3/tokenizer.go#L373>)
0 commit comments