
Commit fbfd55d

jackspirou and claude committed
fix: correct .gitignore to only exclude root tokenizer binary
- Change `tokenizer` to `/tokenizer` in .gitignore
- This prevents the cmd/tokenizer directory from being ignored
- Add previously ignored cmd/tokenizer files to git
- Fixes GoReleaser "couldn't find main file" error

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 34cc921 commit fbfd55d

File tree

7 files changed: +267 −2 lines changed


.github/workflows/release.yaml

Lines changed: 11 additions & 1 deletion

```diff
@@ -36,12 +36,22 @@ jobs:
         with:
           go-version: '1.24'

+      - name: Debug - List directory structure
+        run: |
+          echo "Current directory: $(pwd)"
+          echo "Directory contents:"
+          ls -la
+          echo "cmd directory:"
+          ls -la cmd/
+          echo "cmd/tokenizer directory:"
+          ls -la cmd/tokenizer/
+
       - name: Run GoReleaser
         uses: goreleaser/goreleaser-action@v6
         with:
           distribution: goreleaser
           version: '~> v2'
-          args: release --clean
+          args: release --clean --verbose
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
```

.gitignore

Lines changed: 1 addition & 1 deletion

```diff
@@ -143,7 +143,7 @@ build/
 dist/
 bin/
 cmd/example/example
-tokenizer
+/tokenizer

 # GoReleaser
 dist/
```
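The one-character fix works because a leading slash anchors a .gitignore pattern to the repository root, while an unanchored pattern matches a file or directory of that name at any depth (so bare `tokenizer` also ignored the `cmd/tokenizer` directory and everything under it). A quick sketch in a throwaway repo (directory names here are made up) shows the difference with `git check-ignore`, which prints a path and exits 0 when the path is ignored:

```shell
# Sketch: anchored vs. unanchored ignore patterns in a throwaway repo.
cd "$(mktemp -d)"
git init -q ignore-demo && cd ignore-demo
mkdir -p cmd/tokenizer
touch tokenizer cmd/tokenizer/main.go

# Unanchored pattern: also matches the cmd/tokenizer directory.
echo 'tokenizer' > .gitignore
git check-ignore cmd/tokenizer/main.go && echo "-> source ignored (the bug)"

# Anchored pattern: matches only the root-level binary.
echo '/tokenizer' > .gitignore
git check-ignore tokenizer && echo "-> root binary ignored"
git check-ignore cmd/tokenizer/main.go || echo "-> source tracked (the fix)"
```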

cmd/tokenizer/README.md

Lines changed: 162 additions & 0 deletions

# Tokenizer CLI

A command-line interface for tokenizing text using various language model tokenizers.

## Installation

```bash
go install github.com/agentstation/tokenizer/cmd/tokenizer@latest
```

Or build from source:

```bash
go build -o tokenizer ./cmd/tokenizer
```

## Usage

The tokenizer CLI uses a subcommand structure where each tokenizer implementation is a subcommand.

### Basic Commands

```bash
# Encode text to token IDs
tokenizer llama3 encode "Hello, world!"
# Output: 128000 9906 11 1917 0 128001

# Decode token IDs back to text
tokenizer llama3 decode 128000 9906 11 1917 0 128001
# Output: <|begin_of_text|>Hello, world!<|end_of_text|>

# Get tokenizer information
tokenizer llama3 info
```

### Encoding Options

```bash
# Encode without special tokens
tokenizer llama3 encode --bos=false --eos=false "Hello, world!"
# Output: 9906 11 1917 0

# Different output formats
tokenizer llama3 encode -o json "Hello, world!"
# Output: [128000,9906,11,1917,0,128001]

tokenizer llama3 encode -o newline "Hello, world!"
# Output: (one token per line)
# 128000
# 9906
# 11
# 1917
# 0
# 128001
```

### Piping and Streaming

```bash
# Pipe text to encode
echo "Hello, world!" | tokenizer llama3 encode

# Pipe tokens to decode
echo "128000 9906 11 1917 0 128001" | tokenizer llama3 decode

# Round-trip encoding and decoding
tokenizer llama3 encode "test" | tokenizer llama3 decode

# Stream large files
cat large_file.txt | tokenizer llama3 stream
```

### Streaming Mode

For processing large files or real-time input:

```bash
# Basic streaming
tokenizer llama3 stream < input.txt

# Custom buffer settings
tokenizer llama3 stream --buffer-size=8192 --max-buffer=2097152 < large_file.txt

# Stream without special tokens
tokenizer llama3 stream --bos=false --eos=false < input.txt
```

## Available Tokenizers

### llama3

Meta's Llama 3 tokenizer with 128,256 tokens (128,000 regular + 256 special tokens).

**Commands:**
- `encode` - Convert text to token IDs
- `decode` - Convert token IDs to text
- `stream` - Process text in streaming mode
- `info` - Display tokenizer information

## Examples

### Tokenize a file

```bash
# Tokenize entire file
tokenizer llama3 encode < document.txt > tokens.txt

# Count tokens in a file
tokenizer llama3 encode < document.txt | wc -w
```

### Batch processing

```bash
# Process multiple files
for file in *.txt; do
  echo "Tokenizing $file..."
  tokenizer llama3 encode < "$file" > "${file%.txt}.tokens"
done
```

### Integration with other tools

```bash
# Use with jq for JSON processing
tokenizer llama3 encode -o json "Hello" | jq length

# Extract specific tokens
tokenizer llama3 encode "Hello, world!" | awk '{print $2}'
```

## Future Tokenizers

The CLI is designed to support multiple tokenizers. Future additions may include:
- GPT-2/GPT-3 tokenizers
- BERT tokenizer
- SentencePiece tokenizers
- Custom tokenizers

Each tokenizer will follow the same subcommand pattern:

```bash
tokenizer [tokenizer-name] [command] [options]
```

<!-- gomarkdoc:embed:start -->

<!-- Code generated by gomarkdoc. DO NOT EDIT -->

# tokenizer

```go
import "github.com/agentstation/tokenizer/cmd/tokenizer"
```

## Index

Generated by [gomarkdoc](<https://github.com/princjef/gomarkdoc>)

<!-- gomarkdoc:embed:end -->

cmd/tokenizer/generate.go

Lines changed: 3 additions & 0 deletions

```go
package main

//go:generate gomarkdoc -o README.md -e . --embed --repository.url https://github.com/agentstation/tokenizer --repository.default-branch master --repository.path /cmd/tokenizer
```

cmd/tokenizer/main.go

Lines changed: 21 additions & 0 deletions

```go
package main

import (
	"fmt"
	"os"
)

var (
	// Version information (set by build flags).
	version   = "dev"
	commit    = "none"
	buildDate = "unknown"
	goVersion = "unknown"
)

func main() {
	if err := rootCmd.Execute(); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}
```

cmd/tokenizer/root.go

Lines changed: 69 additions & 0 deletions

```go
package main

import (
	"fmt"

	"github.com/spf13/cobra"

	llama3cmd "github.com/agentstation/tokenizer/llama3/cmd/llama3"
)

// rootCmd represents the base command when called without any subcommands.
var rootCmd = &cobra.Command{
	Use:   "tokenizer",
	Short: "A multi-model tokenizer CLI tool",
	Long: `Tokenizer is a CLI tool for tokenizing text using various language models.

This tool provides a unified interface for working with different tokenizer
implementations. Each tokenizer is available as a subcommand with its own
set of operations.

Currently supported tokenizers:
  - llama3: Meta's Llama 3 tokenizer (128k vocabulary, byte-level BPE)

Common operations available for tokenizers:
  - encode: Convert text to token IDs
  - decode: Convert token IDs back to text
  - stream: Process large files in streaming mode
  - info: Display tokenizer information`,
	Example: `  # Encode text with Llama 3
  tokenizer llama3 encode "Hello, world!"

  # Decode tokens
  tokenizer llama3 decode 1234 5678

  # Stream a large file
  cat large_file.txt | tokenizer llama3 stream

  # Get tokenizer info
  tokenizer llama3 info`,
	SilenceUsage: true,
}

// versionCmd represents the version command.
var versionCmd = &cobra.Command{
	Use:   "version",
	Short: "Print version information",
	Run: func(cmd *cobra.Command, args []string) {
		fmt.Printf("tokenizer version %s\n", version)
		if commit != "none" {
			fmt.Printf("  commit: %s\n", commit)
		}
		if buildDate != "unknown" {
			fmt.Printf("  built: %s\n", buildDate)
		}
		if goVersion != "unknown" {
			fmt.Printf("  go version: %s\n", goVersion)
		}
	},
}

func init() {
	// Register commands
	rootCmd.AddCommand(versionCmd)
	rootCmd.AddCommand(llama3cmd.Command())

	// Future tokenizers can be added here:
	// rootCmd.AddCommand(gpt2cmd.Command())
	// rootCmd.AddCommand(bertcmd.Command())
}
```

test-tokenizer

-6.77 MB (binary file not shown)
