|
4 | 4 | [CI](https://github.com/gomantics/chunkx/actions/workflows/ci.yml) |
5 | 5 | [Go Report Card](https://goreportcard.com/report/github.com/gomantics/chunkx) |
6 | 6 |
|
7 | | -A Go library for AST-based code chunking implementing the CAST (Chunking via Abstract Syntax Trees) algorithm from the paper ["cAST: Enhancing Code Retrieval-Augmented Generation with Structural Chunking via Abstract Syntax Tree"](https://arxiv.org/pdf/2506.15655). |
8 | | - |
9 | | -## Features |
10 | | - |
11 | | -- **Syntax-aware chunking**: Respects code structure (functions, classes, methods) instead of arbitrarily splitting at line boundaries |
12 | | -- **Multi-language support**: Works with 30+ languages via tree-sitter parsers |
13 | | -- **Generic fallback**: Automatically falls back to line-based chunking for unsupported file types |
14 | | -- **Configurable chunk sizes**: Set maximum chunk size in tokens, bytes, or lines |
15 | | -- **Custom token counters**: Pluggable interface for custom tokenization strategies |
16 | | -- **Overlap support**: Optional chunk overlapping for better context preservation |
| 7 | +A Go library for AST-based code chunking implementing the CAST (Chunking via Abstract Syntax Trees) algorithm. |
17 | 8 |
|
18 | 9 | ## Installation |
19 | 10 |
|
20 | 11 | ```bash |
21 | 12 | go get github.com/gomantics/chunkx |
22 | 13 | ``` |
23 | 14 |
|
24 | | -## Quick Start |
| 15 | +## Documentation |
25 | 16 |
|
26 | | -```go |
27 | | -package main |
28 | | - |
29 | | -import ( |
30 | | - "fmt" |
31 | | - "github.com/gomantics/chunkx" |
32 | | - "github.com/gomantics/chunkx/languages" |
33 | | -) |
34 | | - |
35 | | -func main() { |
36 | | - chunker := chunkx.NewChunker() |
| 17 | +For complete documentation, usage examples, and API reference, visit: |
37 | 18 |
|
38 | | - code := `package main |
39 | | -
|
40 | | -func hello() { |
41 | | - fmt.Println("Hello, World!") |
42 | | -} |
43 | | -
|
44 | | -func goodbye() { |
45 | | - fmt.Println("Goodbye!") |
46 | | -}` |
47 | | - |
48 | | - chunks, err := chunker.Chunk(code, |
49 | | - chunkx.WithLanguage(languages.Go), |
50 | | - chunkx.WithMaxSize(50)) |
51 | | - if err != nil { |
52 | | - panic(err) |
53 | | - } |
54 | | - |
55 | | - for i, chunk := range chunks { |
56 | | - fmt.Printf("Chunk %d (lines %d-%d):\n%s\n\n", |
57 | | - i+1, chunk.StartLine, chunk.EndLine, chunk.Content) |
58 | | - } |
59 | | -} |
60 | | -``` |
61 | | - |
62 | | -## Usage |
63 | | - |
64 | | -### Basic Chunking |
65 | | - |
66 | | -```go |
67 | | -chunker := chunkx.NewChunker() |
| 19 | +**https://gomantics.dev/chunkx** |
68 | 20 |
|
69 | | -// Chunk code with language specified |
70 | | -chunks, err := chunker.Chunk(code, chunkx.WithLanguage(languages.Python)) |
71 | | -``` |
72 | | - |
73 | | -### File-based Chunking |
74 | | - |
75 | | -```go |
76 | | -// Auto-detects language from file extension |
77 | | -chunks, err := chunker.ChunkFile("main.go") |
78 | | -``` |
79 | | - |
80 | | -### Custom Configuration |
| 21 | +## Features |
81 | 22 |
|
82 | | -```go |
83 | | -chunks, err := chunker.Chunk(code, |
84 | | - chunkx.WithLanguage(languages.Go), |
85 | | - chunkx.WithMaxSize(1500), // Max 1500 tokens per chunk |
86 | | - chunkx.WithOverlap(10), // 10% overlap between chunks |
87 | | -) |
88 | | -``` |
| 23 | +- Syntax-aware chunking that respects code structure |
| 24 | +- Support for 30+ programming languages via tree-sitter |
| 25 | +- Configurable chunk sizes (tokens, bytes, or lines) |
| 26 | +- Custom token counters (including OpenAI tiktoken) |
| 27 | +- Optional chunk overlapping for better context |
89 | 28 |
|
90 | | -### Custom Token Counter |
| 29 | +## Quick Example |
91 | 30 |
|
92 | 31 | ```go |
93 | | -type MyTokenCounter struct{} |
94 | | - |
95 | | -func (m *MyTokenCounter) CountTokens(text string) (int, error) { |
96 | | - // Your custom tokenization logic |
97 | | - return len(strings.Fields(text)), nil |
98 | | -} |
99 | | - |
100 | | -chunks, err := chunker.Chunk(code, |
101 | | - chunkx.WithLanguage(languages.Go), |
102 | | - chunkx.WithTokenCounter(&MyTokenCounter{})) |
103 | | -``` |
104 | | - |
105 | | -### OpenAI-Compatible Token Counting |
106 | | - |
107 | | -For production use with OpenAI models, you can integrate [tiktoken-go](https://github.com/pkoukk/tiktoken-go) for accurate token counting: |
| 32 | +package main |
108 | 33 |
|
109 | | -```go |
110 | 34 | import ( |
111 | | - "github.com/pkoukk/tiktoken-go" |
112 | | - |
113 | 35 | "github.com/gomantics/chunkx" |
114 | 36 | "github.com/gomantics/chunkx/languages" |
115 | 37 | ) |
116 | 38 |
|
117 | | -// TikTokenCounter uses OpenAI's tiktoken for accurate token counting |
118 | | -type TikTokenCounter struct { |
119 | | - encoding *tiktoken.Tiktoken |
120 | | -} |
121 | | - |
122 | | -// NewTikTokenCounter creates a counter for a specific OpenAI model |
123 | | -func NewTikTokenCounter(model string) (*TikTokenCounter, error) { |
124 | | - encoding, err := tiktoken.EncodingForModel(model) |
125 | | - if err != nil { |
126 | | - return nil, err |
127 | | - } |
128 | | - return &TikTokenCounter{encoding: encoding}, nil |
129 | | -} |
130 | | - |
131 | | -func (t *TikTokenCounter) CountTokens(text string) (int, error) { |
132 | | - tokens := t.encoding.Encode(text, nil, nil) |
133 | | - return len(tokens), nil |
134 | | -} |
135 | | - |
136 | | -// Usage example |
137 | 39 | func main() { |
138 | | - tokenCounter, err := NewTikTokenCounter("gpt-4") |
139 | | - if err != nil { |
140 | | - panic(err) |
141 | | - } |
142 | | - |
143 | 40 | chunker := chunkx.NewChunker() |
144 | | - chunks, err := chunker.Chunk(code, |
145 | | - chunkx.WithLanguage(languages.Python), |
146 | | - chunkx.WithMaxSize(8000), |
147 | | - chunkx.WithTokenCounter(tokenCounter)) |
148 | | - if err != nil { |
149 | | - panic(err) |
150 | | - } |
151 | | - |
152 | | - // Your chunks are now sized according to GPT-4's tokenization |
153 | | - for _, chunk := range chunks { |
154 | | - // Process chunks... |
155 | | - } |
156 | | -} |
157 | | -``` |
158 | | - |
159 | | -This ensures your chunks respect the exact token limits of OpenAI models like GPT-3.5, GPT-4, and GPT-4o. |
160 | | - |
161 | | -### Built-in Token Counters |
162 | | - |
163 | | -- `SimpleTokenCounter`: Whitespace-based word counting (default) |
164 | | -- `ByteCounter`: Counts bytes instead of tokens |
165 | | -- `LineCounter`: Counts lines instead of tokens |
166 | | - |
167 | | -```go |
168 | | -// Use byte-based chunking |
169 | | -chunks, err := chunker.Chunk(code, |
170 | | - chunkx.WithLanguage(languages.Python), |
171 | | - chunkx.WithMaxSize(4096), |
172 | | - chunkx.WithTokenCounter(&chunkx.ByteCounter{})) |
173 | | -``` |
174 | | - |
175 | | -## Supported Languages |
176 | | - |
177 | | -ChunkX supports 30+ programming languages via tree-sitter. Use the exported language constants from the `languages` package (e.g., `languages.Go`, `languages.Python`, `languages.JavaScript`, etc.). See the [languages package](languages/registry.go) for the complete list of supported languages and file extensions. |
178 | | - |
179 | | -For files with unrecognized extensions or explicitly using `languages.Generic`, ChunkX automatically falls back to a line-based chunking algorithm that maintains the chunking semantics without requiring AST parsing. |
180 | | - |
181 | | -## How It Works |
182 | | - |
183 | | -ChunkX implements the CAST algorithm which: |
184 | | - |
185 | | -1. Parses source code into an Abstract Syntax Tree (AST) |
186 | | -2. Recursively traverses the AST to identify semantic units |
187 | | -3. Groups nodes while respecting the maximum chunk size |
188 | | -4. Splits large nodes that exceed the size limit |
189 | | -5. Merges smaller sibling nodes to maximize chunk density |
190 | | - |
191 | | -This approach ensures that chunks: |
192 | | - |
193 | | -- Preserve syntactic integrity (no mid-function splits) |
194 | | -- Maintain semantic coherence |
195 | | -- Are self-contained and meaningful |
196 | | -- Respect language-specific structures |
197 | | - |
198 | | -## Chunk Structure |
199 | | - |
200 | | -```go |
201 | | -type Chunk struct { |
202 | | - Content string // The actual code content |
203 | | - StartLine int // Starting line number (1-based) |
204 | | - EndLine int // Ending line number (1-based) |
205 | | - StartByte int // Starting byte offset |
206 | | - EndByte int // Ending byte offset |
207 | | - NodeTypes []string // AST node types included |
208 | | - Language languages.LanguageName // Programming language |
| 41 | + chunks, _ := chunker.Chunk(code, |
| 42 | + chunkx.WithLanguage(languages.Go), |
| 43 | + chunkx.WithMaxSize(1500)) |
209 | 44 | } |
210 | 45 | ``` |
211 | 46 |
|
212 | | -## Performance |
213 | | - |
214 | | -Benchmarks on Apple M4 Max (3s run): |
215 | | - |
216 | | -``` |
217 | | -BenchmarkASTChunking-14 41301 85932 ns/op 19520 B/op 170 allocs/op |
218 | | -BenchmarkLineBasedChunking-14 4392780 831.6 ns/op 1904 B/op 10 allocs/op |
219 | | -BenchmarkASTChunkingLarge-14 4681 769800 ns/op 110464 B/op 794 allocs/op |
220 | | -BenchmarkLineBasedChunkingLarge-14 437184 8273 ns/op 16880 B/op 27 allocs/op |
221 | | -BenchmarkASTChunkingMultipleLanguages-14 22951 156257 ns/op 42336 B/op 336 allocs/op |
222 | | -BenchmarkTokenCounters/SimpleTokenCounter-14 51332 70434 ns/op 4760 B/op 20 allocs/op |
223 | | -BenchmarkTokenCounters/ByteCounter-14 40485 88952 ns/op 21504 B/op 227 allocs/op |
224 | | -BenchmarkTokenCounters/LineCounter-14 51607 70349 ns/op 3224 B/op 19 allocs/op |
225 | | -BenchmarkOverlapChunking/Overlap0-14 42333 85163 ns/op 19544 B/op 172 allocs/op |
226 | | -BenchmarkOverlapChunking/Overlap10-14 41676 85761 ns/op 21832 B/op 187 allocs/op |
227 | | -BenchmarkOverlapChunking/Overlap25-14 42122 85715 ns/op 22032 B/op 187 allocs/op |
228 | | -BenchmarkOverlapChunking/Overlap50-14 41696 85976 ns/op 22360 B/op 187 allocs/op |
229 | | -``` |
230 | | - |
231 | | -AST-based chunking is ~100x slower than naive line-based chunking but produces semantically superior chunks that improve RAG performance. The SimpleTokenCounter and LineCounter provide the best performance, while ByteCounter has slightly higher overhead due to more allocations. Chunk overlap has minimal performance impact (~0.5% overhead). |
232 | | - |
233 | | -## Examples |
234 | | - |
235 | | -The `testdata/` directory contains real-world code examples in multiple languages, along with their chunked outputs in JSON format. These examples serve as both documentation and regression tests: |
236 | | - |
237 | | -- **`testdata/sources/`**: Example source files in Go, Python, JavaScript, TypeScript, Java, Rust, and C++ |
238 | | -- **`testdata/*.approved.json`**: Snapshot test outputs showing how each example is chunked |
239 | | - |
240 | | -To see how chunkx handles different languages and chunk sizes, browse the approved JSON files. They show: |
241 | | - |
242 | | -- Complete chunk content |
243 | | -- Line and byte ranges |
244 | | -- AST node types included in each chunk |
245 | | -- How semantic boundaries are preserved |
246 | | - |
247 | | -The snapshots are automatically verified using [go-approval-tests](https://github.com/approvals/go-approval-tests) to ensure chunking behavior remains consistent across changes. |
248 | | - |
249 | | -## Testing |
250 | | - |
251 | | -```bash |
252 | | -# Run tests |
253 | | -go test ./... |
254 | | - |
255 | | -# Run benchmarks |
256 | | -go test -bench=. -benchtime=10s |
257 | | - |
258 | | -# Run with coverage |
259 | | -go test -cover ./... |
260 | | - |
261 | | -# Run approval tests (regenerate snapshots on first failure) |
262 | | -go test -run TestChunkingExamples |
263 | | -``` |
264 | | - |
265 | | -## Use Cases |
266 | | - |
267 | | -- **RAG Systems**: Improve retrieval quality by providing semantically coherent code chunks |
268 | | -- **Code Search**: Index code at meaningful boundaries |
269 | | -- **Documentation**: Generate documentation from logical code units |
270 | | -- **Code Analysis**: Process code in structured segments |
271 | | -- **LLM Context Windows**: Fit code into token limits while preserving structure |
272 | | - |
273 | | -## Design Principles |
274 | | - |
275 | | -1. **Minimalist**: Clean, focused codebase with no unnecessary abstractions |
276 | | -2. **Well-tested**: Comprehensive unit, integration, and benchmark tests |
277 | | -3. **Pluggable**: Interface-based design for extensibility |
278 | | -4. **Language-agnostic**: Works consistently across programming languages |
279 | | - |
280 | | -## References |
281 | | - |
282 | | -- [cAST Paper (arXiv:2506.15655)](https://arxiv.org/pdf/2506.15655) |
283 | | -- [Tree-sitter](https://tree-sitter.github.io/tree-sitter/) |
284 | | -- [go-tree-sitter](https://github.com/smacker/go-tree-sitter) |
285 | | - |
286 | 47 | ## License |
287 | 48 |
|
288 | 49 | [MIT](./LICENSE) |
0 commit comments