Skip to content

Commit 81f4b3c

Browse files
committed
fixing underlying embed queue issue and adding tests
1 parent ef222e9 commit 81f4b3c

File tree

6 files changed

+462
-27
lines changed

6 files changed

+462
-27
lines changed

pkg/nornicdb/embed_queue.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ func DefaultEmbedWorkerConfig() *EmbedWorkerConfig {
7979
ChunkSize: 512,
8080
ChunkOverlap: 50,
8181
ClusterDebounceDelay: 30 * time.Second, // Wait 30s after last embedding before k-means
82-
ClusterMinBatchSize: 10, // Need at least 10 embeddings to trigger k-means
82+
ClusterMinBatchSize: 10, // Need at least 10 embeddings to trigger k-means
8383
}
8484
}
8585

@@ -319,8 +319,11 @@ func (ew *EmbedWorker) worker() {
319319
// Short initial delay to let server start
320320
time.Sleep(500 * time.Millisecond)
321321

322-
// Immediate scan on startup for any existing nodes needing embedding
322+
// Refresh the pending embeddings index on startup to catch any nodes
323+
// that need embedding (e.g., after restart, bulk import, or cleared embeddings)
323324
fmt.Println("🔍 Initial scan for nodes needing embeddings...")
325+
ew.refreshEmbeddingIndex()
326+
324327
ew.processUntilEmpty()
325328

326329
ticker := time.NewTicker(ew.config.ScanInterval)

pkg/storage/badger.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2818,6 +2818,13 @@ func (b *BadgerEngine) ClearAllEmbeddings() (int, error) {
28182818
}
28192819

28202820
log.Printf("✓ Cleared embeddings from %d nodes", cleared)
2821+
2822+
// Refresh the pending embeddings index so nodes get re-processed
2823+
added := b.RefreshPendingEmbeddingsIndex()
2824+
if added > 0 {
2825+
log.Printf("📊 Added %d nodes to pending embeddings queue", added)
2826+
}
2827+
28212828
return cleared, nil
28222829
}
28232830

Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
package storage
2+
3+
import (
4+
"testing"
5+
6+
"github.com/stretchr/testify/assert"
7+
)
8+
9+
func TestNodeNeedsEmbedding(t *testing.T) {
10+
t.Run("nil node returns false", func(t *testing.T) {
11+
assert.False(t, NodeNeedsEmbedding(nil))
12+
})
13+
14+
t.Run("node with embedding returns false", func(t *testing.T) {
15+
node := &Node{
16+
ID: "test-1",
17+
Labels: []string{"Person"},
18+
Properties: map[string]interface{}{"name": "Alice"},
19+
Embedding: []float32{0.1, 0.2, 0.3},
20+
}
21+
assert.False(t, NodeNeedsEmbedding(node))
22+
})
23+
24+
t.Run("node without embedding returns true", func(t *testing.T) {
25+
node := &Node{
26+
ID: "test-2",
27+
Labels: []string{"Person"},
28+
Properties: map[string]interface{}{"name": "Bob"},
29+
Embedding: nil,
30+
}
31+
assert.True(t, NodeNeedsEmbedding(node))
32+
})
33+
34+
t.Run("node with empty embedding returns true", func(t *testing.T) {
35+
node := &Node{
36+
ID: "test-3",
37+
Labels: []string{"Person"},
38+
Properties: map[string]interface{}{"name": "Charlie"},
39+
Embedding: []float32{},
40+
}
41+
assert.True(t, NodeNeedsEmbedding(node))
42+
})
43+
44+
t.Run("internal node (underscore label) returns false", func(t *testing.T) {
45+
node := &Node{
46+
ID: "test-4",
47+
Labels: []string{"_Internal"},
48+
Properties: map[string]interface{}{"data": "internal"},
49+
Embedding: nil,
50+
}
51+
assert.False(t, NodeNeedsEmbedding(node))
52+
})
53+
54+
t.Run("node with embedding_skipped returns false", func(t *testing.T) {
55+
node := &Node{
56+
ID: "test-5",
57+
Labels: []string{"Empty"},
58+
Properties: map[string]interface{}{
59+
"embedding_skipped": "no content",
60+
},
61+
Embedding: nil,
62+
}
63+
assert.False(t, NodeNeedsEmbedding(node))
64+
})
65+
66+
// REGRESSION TEST: has_embedding property should NOT affect the result
67+
// Only the actual Embedding array matters
68+
t.Run("REGRESSION: has_embedding=true but no embedding array returns true", func(t *testing.T) {
69+
// This was the bug - nodes with has_embedding=true but no actual embedding
70+
// were being skipped, preventing regeneration
71+
node := &Node{
72+
ID: "test-6",
73+
Labels: []string{"File"},
74+
Properties: map[string]interface{}{
75+
"name": "test.txt",
76+
"has_embedding": true, // Property says true, but no actual embedding
77+
},
78+
Embedding: nil, // No actual embedding array
79+
}
80+
// Should return true because we need to generate the embedding
81+
assert.True(t, NodeNeedsEmbedding(node), "node with has_embedding=true but no embedding array should need embedding")
82+
})
83+
84+
t.Run("REGRESSION: has_embedding=false but no embedding array returns true", func(t *testing.T) {
85+
// Another regression case - has_embedding=false should not prevent processing
86+
// unless embedding_skipped is also set
87+
node := &Node{
88+
ID: "test-7",
89+
Labels: []string{"Document"},
90+
Properties: map[string]interface{}{
91+
"content": "Some content to embed",
92+
"has_embedding": false,
93+
},
94+
Embedding: nil,
95+
}
96+
// Should return true - has_embedding property is ignored
97+
assert.True(t, NodeNeedsEmbedding(node), "node with has_embedding=false should still need embedding if no embedding_skipped")
98+
})
99+
100+
t.Run("REGRESSION: has_chunks=true but no embedding array returns true", func(t *testing.T) {
101+
// File nodes that were partially processed (has_chunks but no has_embedding)
102+
// should be re-processed
103+
node := &Node{
104+
ID: "test-8",
105+
Labels: []string{"File"},
106+
Properties: map[string]interface{}{
107+
"name": "large_file.txt",
108+
"has_chunks": true,
109+
// Note: no has_embedding property - chunking was interrupted
110+
},
111+
Embedding: nil,
112+
}
113+
// Should return true - needs to complete the chunking process
114+
assert.True(t, NodeNeedsEmbedding(node), "node with has_chunks=true but incomplete processing should need embedding")
115+
})
116+
117+
t.Run("node with content and no embedding returns true", func(t *testing.T) {
118+
node := &Node{
119+
ID: "test-9",
120+
Labels: []string{"Memory"},
121+
Properties: map[string]interface{}{
122+
"content": "This is a memory that needs embedding",
123+
"createdAt": "2024-01-01",
124+
},
125+
Embedding: nil,
126+
}
127+
assert.True(t, NodeNeedsEmbedding(node))
128+
})
129+
130+
t.Run("FileChunk node without embedding returns true", func(t *testing.T) {
131+
node := &Node{
132+
ID: "chunk-1",
133+
Labels: []string{"FileChunk"},
134+
Properties: map[string]interface{}{
135+
"content": "Chunk content",
136+
"chunk_index": 0,
137+
"parent_file": "file-123",
138+
},
139+
Embedding: nil,
140+
}
141+
assert.True(t, NodeNeedsEmbedding(node))
142+
})
143+
}
144+
145+
func TestNodeNeedsEmbedding_ClearedEmbeddings(t *testing.T) {
146+
// Simulate the regeneration scenario where embeddings are cleared
147+
t.Run("after clearing embeddings, node should need embedding", func(t *testing.T) {
148+
// Start with a node that has an embedding
149+
node := &Node{
150+
ID: "regen-1",
151+
Labels: []string{"Person"},
152+
Properties: map[string]interface{}{
153+
"name": "Alice",
154+
"has_embedding": true,
155+
"embedding_model": "test-model",
156+
"embedding_dimensions": 512,
157+
"embedded_at": "2024-01-01",
158+
},
159+
Embedding: []float32{0.1, 0.2, 0.3},
160+
}
161+
assert.False(t, NodeNeedsEmbedding(node), "node with embedding should not need embedding")
162+
163+
// Clear the embedding (simulating ClearAllEmbeddings)
164+
node.Embedding = nil
165+
166+
// Now it should need embedding again, regardless of has_embedding property
167+
assert.True(t, NodeNeedsEmbedding(node), "node with cleared embedding should need embedding")
168+
})
169+
170+
t.Run("File node after clearing should need embedding", func(t *testing.T) {
171+
// File node that was chunked
172+
node := &Node{
173+
ID: "file-regen",
174+
Labels: []string{"File"},
175+
Properties: map[string]interface{}{
176+
"name": "document.md",
177+
"has_chunks": true,
178+
"chunk_count": 5,
179+
"has_embedding": true,
180+
"embedded_at": "2024-01-01",
181+
},
182+
Embedding: nil, // File nodes don't have embeddings, their chunks do
183+
}
184+
185+
// This is tricky - File nodes with has_chunks=true don't have their own embedding
186+
// The chunks have embeddings. So this node should NOT need embedding itself.
187+
// But if we're regenerating, we need to process it to recreate the chunks.
188+
// For now, we return true so it gets processed and chunks get recreated.
189+
assert.True(t, NodeNeedsEmbedding(node), "File node should be processed to recreate chunks")
190+
})
191+
}

0 commit comments

Comments
 (0)