Skip to content

Commit ba79a9e

Browse files
authored
Merge pull request #47 from planetf1/chunkfixcli
2 parents 0304e3a + e7b7627 commit ba79a9e

File tree

6 files changed

+120
-3
lines changed

6 files changed

+120
-3
lines changed

cli/README.md

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,26 @@ Semantic chunking uses sentence transformers to identify natural break points in
344344

345345
**Note**: Semantic chunking uses sentence-transformers for chunking decisions, but the resulting chunks are embedded using your collection's embedding model (e.g., nomic-embed-text) for search operations.
346346

347+
Additional notes:
348+
349+
- In addition to the common chunking flags, the CLI exposes the following semantic-specific flags:
350+
- `--semantic-model` (model identifier, e.g., all-MiniLM-L6-v2)
351+
- `--semantic-window-size` (integer context window)
352+
- `--semantic-threshold-percentile` (split-sensitivity percentile, 0–100)
353+
- Plus common: `--chunk-size`, `--chunk-overlap`
354+
- Completion: the CLI completes `--chunking-strategy` values (including `Semantic`). The `--semantic-model` value is free-form, so no suggestions are offered for it; the semantic-specific and numeric flags also suppress file-name completion.
355+
356+
Example with semantic-specific flags:
357+
358+
```bash
359+
./maestro-k collection create --vdb=my-database --name=my-collection \
360+
--chunking-strategy=Semantic \
361+
--chunk-size=768 \
362+
--semantic-model=all-MiniLM-L6-v2 \
363+
--semantic-window-size=1 \
364+
--semantic-threshold-percentile=95
365+
```
366+
347367
### Environment Variable Substitution in YAML Files
348368

349369
The CLI supports environment variable substitution in YAML files using the `{{ENV_VAR_NAME}}` syntax. This allows you to use environment variables directly in your configuration files:
@@ -368,6 +388,7 @@ When you run `./maestro-k create vector-db config.yaml`, the CLI will:
368388
3. Process the YAML file with the substituted values
369389

370390
**Features**:
391+
371392
- **Automatic substitution**: All `{{ENV_VAR_NAME}}` placeholders are replaced before YAML parsing
372393
- **Error handling**: Clear error messages if required environment variables are missing
373394
- **Verbose logging**: Shows which environment variables are being substituted (when using `--verbose`)
@@ -541,6 +562,7 @@ The CLI provides resource-based create commands for vector databases, collection
541562
```
542563

543564
**Supported Override Flags**:
565+
544566
- `--type`: Override database type (milvus, weaviate)
545567
- `--uri`: Override connection URI
546568
- `--collection-name`: Override collection name
@@ -558,6 +580,13 @@ The CLI provides resource-based create commands for vector databases, collection
558580
559581
# Create collection with dry-run mode
560582
./maestro-k collection create --name=my-collection --vdb=my-database --dry-run
583+
584+
# Create collection with chunking configuration
585+
./maestro-k collection create --name=my-collection --vdb=my-database \
586+
--embedding=text-embedding-3-small \
587+
--chunking-strategy=Sentence \
588+
--chunk-size=512 \
589+
--chunk-overlap=32
561590
```
562591

563592
#### Create Document Command

cli/src/commands.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -380,4 +380,12 @@ func init() {
380380

381381
// Add flags to collection create command
382382
collectionCreateCmd.Flags().StringVar(&collectionEmbedding, "embedding", "default", "Embedding model to use for the collection")
383+
// Expose chunking flags here as well (kept in sync with create_collection.go)
384+
collectionCreateCmd.Flags().StringVar(&collectionChunkStrategy, "chunking-strategy", "None", "Chunking strategy to use for the collection (None, Fixed, Sentence, Semantic)")
385+
collectionCreateCmd.Flags().IntVar(&collectionChunkSize, "chunk-size", 0, "Chunk size in characters (optional; defaults may vary by strategy)")
386+
collectionCreateCmd.Flags().IntVar(&collectionChunkOverlap, "chunk-overlap", 0, "Chunk overlap in characters (optional)")
387+
// Semantic-specific parameters
388+
collectionCreateCmd.Flags().StringVar(&semanticModel, "semantic-model", "", "Semantic chunking model identifier (e.g., all-MiniLM-L6-v2)")
389+
collectionCreateCmd.Flags().IntVar(&semanticWindowSize, "semantic-window-size", 0, "Semantic chunking window size (optional)")
390+
collectionCreateCmd.Flags().Float64Var(&semanticThresholdPercentile, "semantic-threshold-percentile", 0, "Semantic chunking threshold percentile (0-100, optional)")
383391
}

cli/src/completion.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@ func (cp *CompletionProvider) CompleteEmbeddings(partial string) ([]CompletionIt
177177

178178
// CompleteChunkingStrategies provides completion for chunking strategy values
179179
func (cp *CompletionProvider) CompleteChunkingStrategies(partial string) ([]CompletionItem, error) {
180-
strategies := []string{"None", "Fixed", "Sentence"}
180+
strategies := []string{"None", "Fixed", "Sentence", "Semantic"}
181181

182182
var completions []CompletionItem
183183
for _, s := range strategies {
@@ -259,6 +259,7 @@ func (cp *CompletionProvider) CompleteFlags(partial string) ([]CompletionItem, e
259259
"--verbose", "--silent", "--dry-run", "--force",
260260
"--mcp-server-uri", "--doc-limit",
261261
"--chunking-strategy", "--chunk-size", "--chunk-overlap",
262+
"--semantic-model", "--semantic-window-size", "--semantic-threshold-percentile",
262263
"-h", "--help", "--version",
263264
}
264265

cli/src/completion_cmd.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,17 @@ func SetupCustomCompletions() {
154154
collectionCreateCmd.RegisterFlagCompletionFunc("chunk-overlap", func(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) {
155155
return nil, cobra.ShellCompDirectiveNoFileComp
156156
})
157+
158+
// Semantic-specific flags
159+
collectionCreateCmd.RegisterFlagCompletionFunc("semantic-model", func(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) {
160+
return nil, cobra.ShellCompDirectiveNoFileComp
161+
})
162+
collectionCreateCmd.RegisterFlagCompletionFunc("semantic-window-size", func(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) {
163+
return nil, cobra.ShellCompDirectiveNoFileComp
164+
})
165+
collectionCreateCmd.RegisterFlagCompletionFunc("semantic-threshold-percentile", func(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) {
166+
return nil, cobra.ShellCompDirectiveNoFileComp
167+
})
157168
}
158169

159170
// Status --vdb completion using vector database names

cli/src/create_collection.go

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,15 +78,19 @@ var (
7878
collectionChunkStrategy string
7979
collectionChunkSize int
8080
collectionChunkOverlap int
81+
// Semantic-specific params
82+
semanticModel string
83+
semanticWindowSize int
84+
semanticThresholdPercentile float64
8185
)
8286

8387
func init() {
8488
// Add flags for collection creation to all collection creation commands
8589
commands := []*cobra.Command{createCollectionCmd, createVdbColCmd, createColCmd}
8690
for _, cmd := range commands {
8791
cmd.Flags().StringVar(&collectionEmbedding, "embedding", "default", "Embedding model to use for the collection")
88-
cmd.Flags().StringVar(&collectionChunkStrategy, "chunking-strategy", "None", "Chunking strategy to use for the collection (None, Fixed, Sentence)")
89-
cmd.Flags().IntVar(&collectionChunkSize, "chunk-size", 0, "Chunk size in characters (optional, defaults to 512 when strategy != None)")
92+
cmd.Flags().StringVar(&collectionChunkStrategy, "chunking-strategy", "None", "Chunking strategy to use for the collection (None, Fixed, Sentence, Semantic)")
93+
cmd.Flags().IntVar(&collectionChunkSize, "chunk-size", 0, "Chunk size in characters (optional; defaults may vary by strategy)")
9094
cmd.Flags().IntVar(&collectionChunkOverlap, "chunk-overlap", 0, "Chunk overlap in characters (optional)")
9195
}
9296
}
@@ -149,6 +153,18 @@ func createCollection(vdbName, collectionName string) error {
149153
if collectionChunkOverlap > 0 {
150154
params["overlap"] = collectionChunkOverlap
151155
}
156+
if collectionChunkStrategy == "Semantic" {
157+
if semanticWindowSize > 0 {
158+
params["window_size"] = semanticWindowSize
159+
}
160+
if semanticThresholdPercentile > 0 {
161+
params["threshold_percentile"] = semanticThresholdPercentile
162+
}
163+
if semanticModel != "" {
164+
// Use 'model_name' to align with server and semantic chunking API
165+
params["model_name"] = semanticModel
166+
}
167+
}
152168
chunkCfg = map[string]interface{}{
153169
"strategy": collectionChunkStrategy,
154170
"parameters": params,

cli/tests/create_collection_test.go

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,4 +133,56 @@ func TestCreateCollectionHelp(t *testing.T) {
133133
if !contains(outputStr, "--name string") {
134134
t.Errorf("Should show name flag in help, got: %s", outputStr)
135135
}
136+
137+
// Ensure chunking flags are shown in help
138+
if !contains(outputStr, "--chunking-strategy") {
139+
t.Errorf("Help should include --chunking-strategy flag, got: %s", outputStr)
140+
}
141+
if !contains(outputStr, "--chunk-size") {
142+
t.Errorf("Help should include --chunk-size flag, got: %s", outputStr)
143+
}
144+
if !contains(outputStr, "--chunk-overlap") {
145+
t.Errorf("Help should include --chunk-overlap flag, got: %s", outputStr)
146+
}
147+
148+
// Ensure semantic-specific flags are shown in help
149+
if !contains(outputStr, "--semantic-model") {
150+
t.Errorf("Help should include --semantic-model flag, got: %s", outputStr)
151+
}
152+
if !contains(outputStr, "--semantic-window-size") {
153+
t.Errorf("Help should include --semantic-window-size flag, got: %s", outputStr)
154+
}
155+
if !contains(outputStr, "--semantic-threshold-percentile") {
156+
t.Errorf("Help should include --semantic-threshold-percentile flag, got: %s", outputStr)
157+
}
158+
}
159+
160+
// TestCreateCollectionWithChunkingFlagsDryRun ensures chunking flags are accepted (dry-run)
161+
func TestCreateCollectionWithChunkingFlagsDryRun(t *testing.T) {
162+
cmd := exec.Command("../maestro-k", "collection", "create",
163+
"--name=test-collection", "--vdb=test-db",
164+
"--chunking-strategy=Sentence", "--chunk-size=512", "--chunk-overlap=64",
165+
"--dry-run")
166+
output, err := cmd.CombinedOutput()
167+
168+
if err != nil {
169+
t.Fatalf("Create collection with chunking flags failed: %v, output: %s", err, string(output))
170+
}
171+
}
172+
173+
// TestCreateCollectionWithSemanticFlagsDryRun ensures semantic-specific flags are accepted (dry-run)
174+
func TestCreateCollectionWithSemanticFlagsDryRun(t *testing.T) {
175+
cmd := exec.Command("../maestro-k", "collection", "create",
176+
"--name=test-collection", "--vdb=test-db",
177+
"--chunking-strategy=Semantic",
178+
"--chunk-size=768",
179+
"--semantic-model=all-MiniLM-L6-v2",
180+
"--semantic-window-size=1",
181+
"--semantic-threshold-percentile=95",
182+
"--dry-run")
183+
output, err := cmd.CombinedOutput()
184+
185+
if err != nil {
186+
t.Fatalf("Create collection with semantic flags failed: %v, output: %s", err, string(output))
187+
}
136188
}

0 commit comments

Comments
 (0)