
Commit e9d530c

Merge pull request #4 from agentstation/consolidate-scanner-implementation
refactor: consolidate to single memory-efficient scanner implementation
2 parents: 0dd15a6 + 501e035

File tree

13 files changed (+250, -197 lines)


README.md

Lines changed: 4 additions & 2 deletions
````diff
@@ -72,7 +72,7 @@ tokenizer llama3 "Hello, world!"
 # Decode tokens
 tokenizer llama3 decode 128000 9906 11 1917 0 128001
 
-# Stream large files (automatic pipe detection)
+# Process large files (automatic pipe detection)
 cat document.txt | tokenizer llama3
 
 # Get tokenizer information
@@ -117,7 +117,7 @@ tokenizer llama3 "Hello, world!"
 tokenizer llama3 decode 128000 9906 11 1917 0 128001
 # Output: <|begin_of_text|>Hello, world!<|end_of_text|>
 
-# Stream from files (automatic)
+# Process from files (automatic)
 cat document.txt | tokenizer llama3
 
 # Get help
@@ -224,6 +224,8 @@ MIT
 import "github.com/agentstation/tokenizer"
 ```
 
+Package tokenizer provides a collection of high\-performance tokenizer implementations.
+
 ## Index
 
 
````

cmd/tokenizer/README.md

Lines changed: 2 additions & 0 deletions
````diff
@@ -168,6 +168,8 @@ tokenizer [tokenizer-name] [command] [options]
 import "github.com/agentstation/tokenizer/cmd/tokenizer"
 ```
 
+Package main provides the tokenizer CLI tool.
+
 ## Index
 
 
````

cmd/tokenizer/completion.go

Lines changed: 6 additions & 5 deletions
```diff
@@ -41,17 +41,18 @@ PowerShell:
     DisableFlagsInUseLine: true,
     ValidArgs:             []string{"bash", "zsh", "fish", "powershell"},
     Args:                  cobra.MatchAll(cobra.ExactArgs(1), cobra.OnlyValidArgs),
-    Run: func(cmd *cobra.Command, args []string) {
+    RunE: func(cmd *cobra.Command, args []string) error {
         switch args[0] {
         case "bash":
-            cmd.Root().GenBashCompletion(os.Stdout)
+            return cmd.Root().GenBashCompletion(os.Stdout)
         case "zsh":
-            cmd.Root().GenZshCompletion(os.Stdout)
+            return cmd.Root().GenZshCompletion(os.Stdout)
         case "fish":
-            cmd.Root().GenFishCompletion(os.Stdout, true)
+            return cmd.Root().GenFishCompletion(os.Stdout, true)
         case "powershell":
-            cmd.Root().GenPowerShellCompletionWithDesc(os.Stdout)
+            return cmd.Root().GenPowerShellCompletionWithDesc(os.Stdout)
         }
+        return nil
     },
 }
 
```
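Switching from `Run` to `RunE` lets completion-generation errors propagate instead of being silently dropped: each `Gen*Completion` call returns an error, and cobra hands whatever `RunE` returns back to `Execute()`. A minimal sketch of how such an error then surfaces at the entry point; the `rootCmd` name and exit handling below are illustrative, not code from this repository:

```go
package main

import (
	"os"

	"github.com/spf13/cobra"
)

// rootCmd stands in for the tokenizer CLI's real root command.
var rootCmd = &cobra.Command{Use: "tokenizer"}

func main() {
	// With RunE, an error from a subcommand (e.g. a failed write inside
	// GenBashCompletion) comes back from Execute; cobra prints it and the
	// process can exit nonzero instead of reporting success.
	if err := rootCmd.Execute(); err != nil {
		os.Exit(1)
	}
}
```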

docs/demo.gif

Binary file (10.2 KB)

docs/demo.svg

Lines changed: 1 addition & 1 deletion

llama3/README.md

Lines changed: 17 additions & 27 deletions
````diff
@@ -454,8 +454,7 @@ Package llama3 implements the Llama 3 tokenizer in Go. This file contains the pu
 - [func \(t \*Tokenizer\) EncodeBPE\(pretoken string\) \[\]int](<#Tokenizer.EncodeBPE>)
 - [func \(t \*Tokenizer\) EncodeBytes\(data \[\]byte, opts \*EncodeOptions\) \[\]int](<#Tokenizer.EncodeBytes>)
 - [func \(t \*Tokenizer\) GetSpecialTokenID\(token string\) \(int, error\)](<#Tokenizer.GetSpecialTokenID>)
-- [func \(t \*Tokenizer\) NewScanner\(r io.Reader\) Scanner](<#Tokenizer.NewScanner>)
-- [func \(t \*Tokenizer\) NewScannerOptions\(r io.Reader, opts ...ScannerOption\) Scanner](<#Tokenizer.NewScannerOptions>)
+- [func \(t \*Tokenizer\) NewScanner\(r io.Reader, opts ...ScannerOption\) Scanner](<#Tokenizer.NewScanner>)
 - [func \(t \*Tokenizer\) OptimisticCount\(text string\) int](<#Tokenizer.OptimisticCount>)
 - [func \(t \*Tokenizer\) PreTokenize\(text string\) \[\]string](<#Tokenizer.PreTokenize>)
 - [func \(t \*Tokenizer\) Process\(r io.Reader, w io.Writer\) \(int64, error\)](<#Tokenizer.Process>)
@@ -469,25 +468,25 @@ Package llama3 implements the Llama 3 tokenizer in Go. This file contains the pu
 
 ## Variables
 
-<a name="ErrDataNotFound"></a>Common errors
+<a name="ErrDataNotFound"></a>Common errors.
 
 ```go
 var (
-    // ErrDataNotFound indicates that the tokenizer data files could not be found
+    // ErrDataNotFound indicates that the tokenizer data files could not be found.
     ErrDataNotFound = errors.New("tokenizer data not found")
 
-    // ErrInvalidToken indicates an invalid token was provided
+    // ErrInvalidToken indicates an invalid token was provided.
     ErrInvalidToken = errors.New("invalid token")
 
-    // ErrTokenNotFound indicates a token was not found in the vocabulary
+    // ErrTokenNotFound indicates a token was not found in the vocabulary.
     ErrTokenNotFound = errors.New("token not found")
 
-    // ErrInvalidTokenID indicates an invalid token ID was provided
+    // ErrInvalidTokenID indicates an invalid token ID was provided.
     ErrInvalidTokenID = errors.New("invalid token ID")
 )
 ```
 
-<a name="WithBufferSize"></a>Scanner option functions \- these are re\-exported from the scanner package
+<a name="WithBufferSize"></a>Scanner option functions \- these are re\-exported from the scanner package.
 
 ```go
 var (
@@ -517,7 +516,7 @@ var (
 func NewConfigError(field string, value any, err error) error
 ```
 
-NewConfigError creates a new ConfigError
+NewConfigError creates a new ConfigError.
 
 <a name="NewDataError"></a>
 ## func [NewDataError](<https://github.com/agentstation/tokenizer/blob/master/llama3/errors.go#L81>)
@@ -526,7 +525,7 @@ NewConfigError creates a new ConfigError
 func NewDataError(op, path string, err error) error
 ```
 
-NewDataError creates a new DataError
+NewDataError creates a new DataError.
 
 <a name="NewTokenError"></a>
 ## func [NewTokenError](<https://github.com/agentstation/tokenizer/blob/master/llama3/errors.go#L86>)
@@ -535,7 +534,7 @@ NewDataError creates a new DataError
 func NewTokenError(op, token string, err error) error
 ```
 
-NewTokenError creates a new TokenError
+NewTokenError creates a new TokenError.
 
 <a name="NewTokenIDError"></a>
 ## func [NewTokenIDError](<https://github.com/agentstation/tokenizer/blob/master/llama3/errors.go#L91>)
@@ -544,7 +543,7 @@ NewTokenError creates a new TokenError
 func NewTokenIDError(op string, tokenID int, err error) error
 ```
 
-NewTokenIDError creates a new TokenError with a token ID
+NewTokenIDError creates a new TokenError with a token ID.
 
 <a name="BPE"></a>
 ## type [BPE](<https://github.com/agentstation/tokenizer/blob/master/llama3/tokenizer.go#L80-L84>)
@@ -583,7 +582,7 @@ type Cache interface {
 <a name="ConfigError"></a>
 ## type [ConfigError](<https://github.com/agentstation/tokenizer/blob/master/llama3/errors.go#L64-L68>)
 
-ConfigError represents an error in tokenizer configuration
+ConfigError represents an error in tokenizer configuration.
 
 ```go
 type ConfigError struct {
@@ -614,7 +613,7 @@ func (e *ConfigError) Unwrap() error
 <a name="DataError"></a>
 ## type [DataError](<https://github.com/agentstation/tokenizer/blob/master/llama3/errors.go#L24-L28>)
 
-DataError represents an error related to tokenizer data loading or processing
+DataError represents an error related to tokenizer data loading or processing.
 
 ```go
 type DataError struct {
@@ -809,7 +808,7 @@ type ScannerOption = scanner.Option
 <a name="TokenError"></a>
 ## type [TokenError](<https://github.com/agentstation/tokenizer/blob/master/llama3/errors.go#L42-L47>)
 
-TokenError represents an error related to token operations
+TokenError represents an error related to token operations.
 
 ```go
 type TokenError struct {
@@ -1082,22 +1081,13 @@ func main() {
 </details>
 
 <a name="Tokenizer.NewScanner"></a>
-### func \(\*Tokenizer\) [NewScanner](<https://github.com/agentstation/tokenizer/blob/master/llama3/scanner.go#L65>)
+### func \(\*Tokenizer\) [NewScanner](<https://github.com/agentstation/tokenizer/blob/master/llama3/scanner.go#L67>)
 
 ```go
-func (t *Tokenizer) NewScanner(r io.Reader) Scanner
+func (t *Tokenizer) NewScanner(r io.Reader, opts ...ScannerOption) Scanner
 ```
 
-NewScanner creates a scanner for streaming tokenization with default options.
-
-<a name="Tokenizer.NewScannerOptions"></a>
-### func \(\*Tokenizer\) [NewScannerOptions](<https://github.com/agentstation/tokenizer/blob/master/llama3/scanner.go#L70>)
-
-```go
-func (t *Tokenizer) NewScannerOptions(r io.Reader, opts ...ScannerOption) Scanner
-```
-
-NewScannerOptions creates a scanner with custom options.
+NewScanner creates a scanner for streaming tokenization. The scanner processes input with bounded memory usage, making it suitable for large files or continuous streams.
 
 <a name="Tokenizer.OptimisticCount"></a>
 ### func \(\*Tokenizer\) [OptimisticCount](<https://github.com/agentstation/tokenizer/blob/master/llama3/tokenizer.go#L373>)
````
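The consolidation collapses the old two-constructor split (`NewScanner` for defaults, `NewScannerOptions` for custom options) into one variadic `NewScanner`. A minimal sketch of the new call shape: the diff confirms the `NewScanner(r io.Reader, opts ...ScannerOption) Scanner` signature and a re-exported `WithBufferSize` option, but the constructor name (`llama3.New`), `WithBufferSize` taking a byte count, and the bufio-style `Scan`/`Token` iteration are assumptions for illustration:

```go
package main

import (
	"fmt"
	"os"

	"github.com/agentstation/tokenizer/llama3"
)

func main() {
	// Assumption: llama3.New stands in for the package's actual constructor.
	tok, err := llama3.New()
	if err != nil {
		panic(err)
	}

	// Default options: the zero-option call replaces the old NewScanner(r).
	_ = tok.NewScanner(os.Stdin)

	// Custom options: variadic options replace NewScannerOptions(r, opts...).
	// Assumption: WithBufferSize takes a buffer size in bytes.
	s := tok.NewScanner(os.Stdin, llama3.WithBufferSize(64*1024))

	// Assumption: bufio-style iteration; the Scanner interface's methods
	// are not shown in this diff.
	for s.Scan() {
		fmt.Println(s.Token())
	}
}
```

One constructor with functional options keeps a single scanner code path, which is the consolidation the commit title describes.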

llama3/cmd/llama3/command.go

Lines changed: 54 additions & 9 deletions
```diff
@@ -11,6 +11,16 @@ import (
 // This command provides encode, decode, stream, and info subcommands
 // for working with the Llama 3 tokenizer.
 func Command() *cobra.Command {
+    // Define shared flags that can be used with implicit encoding/streaming
+    var (
+        output    string
+        count     bool
+        countOnly bool
+        bos       bool
+        eos       bool
+        metrics   bool
+    )
+
     cmd := &cobra.Command{
         Use:   "llama3",
         Short: "Llama 3 tokenizer operations",
@@ -22,23 +32,26 @@ vocabulary of 128,256 tokens (128,000 regular tokens + 256 special tokens).
 Available commands:
   encode - Encode text to token IDs (default when text is provided)
   decode - Decode token IDs to text
-  stream - Process text in streaming mode
   info   - Display tokenizer information`,
         Example: `  # Encode text (explicit)
   tokenizer llama3 encode "Hello, world!"
 
   # Encode text (implicit - default action)
   tokenizer llama3 "Hello, world!"
 
+  # Encode with flags (implicit)
+  tokenizer llama3 "Hello, world!" --count
+  tokenizer llama3 "Hello, world!" --output json
+
   # Decode tokens
   tokenizer llama3 decode 128000 9906 11 1917 0 128001
 
-  # Stream from stdin (explicit)
-  cat large_file.txt | tokenizer llama3 stream
-
-  # Stream from stdin (implicit - automatic)
+  # Encode from stdin (implicit - automatic)
   cat large_file.txt | tokenizer llama3
 
+  # Encode with flags (implicit)
+  cat large_file.txt | tokenizer llama3 --count-only
+
   # Show tokenizer info
   tokenizer llama3 info`,
         Args: cobra.ArbitraryArgs,
@@ -63,27 +76,59 @@ Available commands:
             // Not a subcommand, treat as text to encode
             encodeCmd := newEncodeCmd()
             encodeCmd.SetArgs(args)
+            // Copy over parent command flags for encode
+            encodeCmd.SetOut(cmd.OutOrStdout())
+            encodeCmd.SetErr(cmd.ErrOrStderr())
+            encodeCmd.SetIn(cmd.InOrStdin())
+
+            // Set flags from parent command
+            encAddBOS = bos
+            encAddEOS = eos
+            encOutput = output
+            encCount = count
+            encCountOnly = countOnly
+            encMetrics = metrics
+
             return encodeCmd.Execute()
         }
 
         // No args provided - check if stdin is piped
         stat, _ := os.Stdin.Stat()
         if (stat.Mode() & os.ModeCharDevice) == 0 {
-            // Data is being piped to stdin, use streaming mode
-            streamCmd := newStreamCmd()
-            return streamCmd.Execute()
+            // Data is being piped to stdin, use encode
+            encodeCmd := newEncodeCmd()
+            encodeCmd.SetOut(cmd.OutOrStdout())
+            encodeCmd.SetErr(cmd.ErrOrStderr())
+            encodeCmd.SetIn(cmd.InOrStdin())
+
+            // Set flags from parent command
+            encAddBOS = bos
+            encAddEOS = eos
+            encOutput = output
+            encCount = count
+            encCountOnly = countOnly
+            encMetrics = metrics
+
+            return encodeCmd.RunE(encodeCmd, []string{})
         }
 
         // No args and no piped input, show help
         return cmd.Help()
     },
 }
 
+// Add flags that work with implicit encoding/streaming
+cmd.PersistentFlags().StringVarP(&output, "output", "o", "space", "Output format: space, newline, json")
+cmd.PersistentFlags().BoolVar(&count, "count", false, "Show token count with output")
+cmd.PersistentFlags().BoolVar(&countOnly, "count-only", false, "Show only token count (no tokens)")
+cmd.PersistentFlags().BoolVar(&bos, "bos", true, "Add beginning of sequence token")
+cmd.PersistentFlags().BoolVar(&eos, "eos", true, "Add end of sequence token")
+cmd.PersistentFlags().BoolVar(&metrics, "metrics", false, "Show performance metrics")
+
 // Add subcommands
 cmd.AddCommand(
     newEncodeCmd(),
     newDecodeCmd(),
-    newStreamCmd(),
     newInfoCmd(),
 )
 
```
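With the `stream` subcommand gone, piped input is routed through `encode`, so the shared persistent flags (`--count`, `--count-only`, `--output`, `--bos`, `--eos`, `--metrics`) behave the same whether text arrives as an argument or on stdin. The routing hinges on the stdin check shown in the diff above; here is a standalone sketch of that standard-library idiom, with the surrounding cobra wiring omitted:

```go
package main

import (
	"fmt"
	"os"
)

func main() {
	// Stat stdin: an interactive terminal has os.ModeCharDevice set,
	// while a pipe or redirected file does not.
	stat, err := os.Stdin.Stat()
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	if (stat.Mode() & os.ModeCharDevice) == 0 {
		fmt.Println("stdin is piped: encode it")
	} else {
		fmt.Println("stdin is a terminal: show help")
	}
}
```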
