|
| 1 | +package storage |
| 2 | + |
| 3 | +import ( |
| 4 | + "testing" |
| 5 | + |
| 6 | + "github.com/stretchr/testify/assert" |
| 7 | +) |
| 8 | + |
| 9 | +func TestNodeNeedsEmbedding(t *testing.T) { |
| 10 | + t.Run("nil node returns false", func(t *testing.T) { |
| 11 | + assert.False(t, NodeNeedsEmbedding(nil)) |
| 12 | + }) |
| 13 | + |
| 14 | + t.Run("node with embedding returns false", func(t *testing.T) { |
| 15 | + node := &Node{ |
| 16 | + ID: "test-1", |
| 17 | + Labels: []string{"Person"}, |
| 18 | + Properties: map[string]interface{}{"name": "Alice"}, |
| 19 | + Embedding: []float32{0.1, 0.2, 0.3}, |
| 20 | + } |
| 21 | + assert.False(t, NodeNeedsEmbedding(node)) |
| 22 | + }) |
| 23 | + |
| 24 | + t.Run("node without embedding returns true", func(t *testing.T) { |
| 25 | + node := &Node{ |
| 26 | + ID: "test-2", |
| 27 | + Labels: []string{"Person"}, |
| 28 | + Properties: map[string]interface{}{"name": "Bob"}, |
| 29 | + Embedding: nil, |
| 30 | + } |
| 31 | + assert.True(t, NodeNeedsEmbedding(node)) |
| 32 | + }) |
| 33 | + |
| 34 | + t.Run("node with empty embedding returns true", func(t *testing.T) { |
| 35 | + node := &Node{ |
| 36 | + ID: "test-3", |
| 37 | + Labels: []string{"Person"}, |
| 38 | + Properties: map[string]interface{}{"name": "Charlie"}, |
| 39 | + Embedding: []float32{}, |
| 40 | + } |
| 41 | + assert.True(t, NodeNeedsEmbedding(node)) |
| 42 | + }) |
| 43 | + |
| 44 | + t.Run("internal node (underscore label) returns false", func(t *testing.T) { |
| 45 | + node := &Node{ |
| 46 | + ID: "test-4", |
| 47 | + Labels: []string{"_Internal"}, |
| 48 | + Properties: map[string]interface{}{"data": "internal"}, |
| 49 | + Embedding: nil, |
| 50 | + } |
| 51 | + assert.False(t, NodeNeedsEmbedding(node)) |
| 52 | + }) |
| 53 | + |
| 54 | + t.Run("node with embedding_skipped returns false", func(t *testing.T) { |
| 55 | + node := &Node{ |
| 56 | + ID: "test-5", |
| 57 | + Labels: []string{"Empty"}, |
| 58 | + Properties: map[string]interface{}{ |
| 59 | + "embedding_skipped": "no content", |
| 60 | + }, |
| 61 | + Embedding: nil, |
| 62 | + } |
| 63 | + assert.False(t, NodeNeedsEmbedding(node)) |
| 64 | + }) |
| 65 | + |
| 66 | + // REGRESSION TEST: has_embedding property should NOT affect the result |
| 67 | + // Only the actual Embedding array matters |
| 68 | + t.Run("REGRESSION: has_embedding=true but no embedding array returns true", func(t *testing.T) { |
| 69 | + // This was the bug - nodes with has_embedding=true but no actual embedding |
| 70 | + // were being skipped, preventing regeneration |
| 71 | + node := &Node{ |
| 72 | + ID: "test-6", |
| 73 | + Labels: []string{"File"}, |
| 74 | + Properties: map[string]interface{}{ |
| 75 | + "name": "test.txt", |
| 76 | + "has_embedding": true, // Property says true, but no actual embedding |
| 77 | + }, |
| 78 | + Embedding: nil, // No actual embedding array |
| 79 | + } |
| 80 | + // Should return true because we need to generate the embedding |
| 81 | + assert.True(t, NodeNeedsEmbedding(node), "node with has_embedding=true but no embedding array should need embedding") |
| 82 | + }) |
| 83 | + |
| 84 | + t.Run("REGRESSION: has_embedding=false but no embedding array returns true", func(t *testing.T) { |
| 85 | + // Another regression case - has_embedding=false should not prevent processing |
| 86 | + // unless embedding_skipped is also set |
| 87 | + node := &Node{ |
| 88 | + ID: "test-7", |
| 89 | + Labels: []string{"Document"}, |
| 90 | + Properties: map[string]interface{}{ |
| 91 | + "content": "Some content to embed", |
| 92 | + "has_embedding": false, |
| 93 | + }, |
| 94 | + Embedding: nil, |
| 95 | + } |
| 96 | + // Should return true - has_embedding property is ignored |
| 97 | + assert.True(t, NodeNeedsEmbedding(node), "node with has_embedding=false should still need embedding if no embedding_skipped") |
| 98 | + }) |
| 99 | + |
| 100 | + t.Run("REGRESSION: has_chunks=true but no embedding array returns true", func(t *testing.T) { |
| 101 | + // File nodes that were partially processed (has_chunks but no has_embedding) |
| 102 | + // should be re-processed |
| 103 | + node := &Node{ |
| 104 | + ID: "test-8", |
| 105 | + Labels: []string{"File"}, |
| 106 | + Properties: map[string]interface{}{ |
| 107 | + "name": "large_file.txt", |
| 108 | + "has_chunks": true, |
| 109 | + // Note: no has_embedding property - chunking was interrupted |
| 110 | + }, |
| 111 | + Embedding: nil, |
| 112 | + } |
| 113 | + // Should return true - needs to complete the chunking process |
| 114 | + assert.True(t, NodeNeedsEmbedding(node), "node with has_chunks=true but incomplete processing should need embedding") |
| 115 | + }) |
| 116 | + |
| 117 | + t.Run("node with content and no embedding returns true", func(t *testing.T) { |
| 118 | + node := &Node{ |
| 119 | + ID: "test-9", |
| 120 | + Labels: []string{"Memory"}, |
| 121 | + Properties: map[string]interface{}{ |
| 122 | + "content": "This is a memory that needs embedding", |
| 123 | + "createdAt": "2024-01-01", |
| 124 | + }, |
| 125 | + Embedding: nil, |
| 126 | + } |
| 127 | + assert.True(t, NodeNeedsEmbedding(node)) |
| 128 | + }) |
| 129 | + |
| 130 | + t.Run("FileChunk node without embedding returns true", func(t *testing.T) { |
| 131 | + node := &Node{ |
| 132 | + ID: "chunk-1", |
| 133 | + Labels: []string{"FileChunk"}, |
| 134 | + Properties: map[string]interface{}{ |
| 135 | + "content": "Chunk content", |
| 136 | + "chunk_index": 0, |
| 137 | + "parent_file": "file-123", |
| 138 | + }, |
| 139 | + Embedding: nil, |
| 140 | + } |
| 141 | + assert.True(t, NodeNeedsEmbedding(node)) |
| 142 | + }) |
| 143 | +} |
| 144 | + |
| 145 | +func TestNodeNeedsEmbedding_ClearedEmbeddings(t *testing.T) { |
| 146 | + // Simulate the regeneration scenario where embeddings are cleared |
| 147 | + t.Run("after clearing embeddings, node should need embedding", func(t *testing.T) { |
| 148 | + // Start with a node that has an embedding |
| 149 | + node := &Node{ |
| 150 | + ID: "regen-1", |
| 151 | + Labels: []string{"Person"}, |
| 152 | + Properties: map[string]interface{}{ |
| 153 | + "name": "Alice", |
| 154 | + "has_embedding": true, |
| 155 | + "embedding_model": "test-model", |
| 156 | + "embedding_dimensions": 512, |
| 157 | + "embedded_at": "2024-01-01", |
| 158 | + }, |
| 159 | + Embedding: []float32{0.1, 0.2, 0.3}, |
| 160 | + } |
| 161 | + assert.False(t, NodeNeedsEmbedding(node), "node with embedding should not need embedding") |
| 162 | + |
| 163 | + // Clear the embedding (simulating ClearAllEmbeddings) |
| 164 | + node.Embedding = nil |
| 165 | + |
| 166 | + // Now it should need embedding again, regardless of has_embedding property |
| 167 | + assert.True(t, NodeNeedsEmbedding(node), "node with cleared embedding should need embedding") |
| 168 | + }) |
| 169 | + |
| 170 | + t.Run("File node after clearing should need embedding", func(t *testing.T) { |
| 171 | + // File node that was chunked |
| 172 | + node := &Node{ |
| 173 | + ID: "file-regen", |
| 174 | + Labels: []string{"File"}, |
| 175 | + Properties: map[string]interface{}{ |
| 176 | + "name": "document.md", |
| 177 | + "has_chunks": true, |
| 178 | + "chunk_count": 5, |
| 179 | + "has_embedding": true, |
| 180 | + "embedded_at": "2024-01-01", |
| 181 | + }, |
| 182 | + Embedding: nil, // File nodes don't have embeddings, their chunks do |
| 183 | + } |
| 184 | + |
| 185 | + // This is tricky - File nodes with has_chunks=true don't have their own embedding |
| 186 | + // The chunks have embeddings. So this node should NOT need embedding itself. |
| 187 | + // But if we're regenerating, we need to process it to recreate the chunks. |
| 188 | + // For now, we return true so it gets processed and chunks get recreated. |
| 189 | + assert.True(t, NodeNeedsEmbedding(node), "File node should be processed to recreate chunks") |
| 190 | + }) |
| 191 | +} |
0 commit comments