Skip to content

Commit 2c16c64

Browse files
authored
Merge pull request #69 from datum-cloud/test/e2e
fix: resolve stale document bug on policy update and add comprehensive e2e coverage
2 parents 34644fd + c60993e commit 2c16c64

File tree

15 files changed

+2951
-61
lines changed

15 files changed

+2951
-61
lines changed

.github/workflows/test-environment-validation.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
name: Test Environment Setup and Validation
22

33
on:
4+
push:
5+
branches:
6+
- main
47
workflow_dispatch:
58
inputs:
69
test_suite:

config/overlays/ci/kustomization.yaml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,30 @@ images:
2929

3030
patches:
3131
- path: patches/vector-sidecar-patch.yaml
32+
# In kind, images are loaded directly via `kind load` — there is no registry
33+
# to pull from. Setting imagePullPolicy: Never ensures the kubelet always uses
34+
# the image that was loaded, and never falls back to a stale cached layer.
35+
- target:
36+
kind: Deployment
37+
name: search-apiserver
38+
patch: |-
39+
- op: replace
40+
path: /spec/template/spec/containers/0/imagePullPolicy
41+
value: Never
42+
- target:
43+
kind: Deployment
44+
name: search-controller-manager
45+
patch: |-
46+
- op: replace
47+
path: /spec/template/spec/containers/0/imagePullPolicy
48+
value: Never
49+
- target:
50+
kind: Deployment
51+
name: resource-indexer
52+
patch: |-
53+
- op: add
54+
path: /spec/template/spec/containers/0/imagePullPolicy
55+
value: Never
3256
- target:
3357
kind: APIService
3458
name: v1alpha1.search.miloapis.com

config/overlays/dev/kustomization.yaml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,30 @@ images:
2828

2929
patches:
3030
- path: patches/vector-sidecar-patch.yaml
31+
# In kind, images are loaded directly via `kind load` — there is no registry
32+
# to pull from. Setting imagePullPolicy: Never ensures the kubelet always uses
33+
# the image that was loaded, and never falls back to a stale cached layer.
34+
- target:
35+
kind: Deployment
36+
name: search-apiserver
37+
patch: |-
38+
- op: replace
39+
path: /spec/template/spec/containers/0/imagePullPolicy
40+
value: Never
41+
- target:
42+
kind: Deployment
43+
name: search-controller-manager
44+
patch: |-
45+
- op: replace
46+
path: /spec/template/spec/containers/0/imagePullPolicy
47+
value: Never
48+
- target:
49+
kind: Deployment
50+
name: resource-indexer
51+
patch: |-
52+
- op: add
53+
path: /spec/template/spec/containers/0/imagePullPolicy
54+
value: Never
3155
- target:
3256
kind: APIService
3357
name: v1alpha1.search.miloapis.com

internal/controllers/policy/policy_controller.go

Lines changed: 9 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,6 @@ package policy
22

33
import (
44
"context"
5-
"crypto/sha256"
6-
"encoding/hex"
7-
"encoding/json"
85
"fmt"
96
"sort"
107
"strings"
@@ -35,7 +32,9 @@ import (
3532
// single Kubernetes resource for background re-indexing (e.g. the NATS-backed
3633
type ResourceReindexPublisher interface {
3734
// PublishResource publishes a single resource object for re-indexing.
38-
PublishResource(ctx context.Context, resource map[string]any, resourceID string) error
35+
// policyName, indexName, and specHash identify the policy version that triggered the re-index
36+
// so the consumer can evaluate against the correct policy conditions.
37+
PublishResource(ctx context.Context, resource map[string]any, resourceID, policyName, indexName, specHash string) error
3938
}
4039

4140
// ResourceIndexPolicyReconciler reconciles a ResourceIndexPolicy object
@@ -437,17 +436,9 @@ func (r *ResourceIndexPolicyReconciler) Reconcile(ctx context.Context, req ctrl.
437436

438437
}
439438

440-
// computeSpecHash returns a short SHA-256 hex digest of the policy spec.
441-
// It is stored in an annotation to detect spec changes on aggregated API
442-
// servers that do not manage metadata.generation automatically.
439+
// computeSpecHash delegates to the shared utility for computing a policy spec hash.
443440
func computeSpecHash(spec *searchv1alpha1.ResourceIndexPolicySpec) string {
444-
b, err := json.Marshal(spec)
445-
if err != nil {
446-
// Extremely unlikely; fall back to a zero string so we always re-index.
447-
return ""
448-
}
449-
sum := sha256.Sum256(b)
450-
return hex.EncodeToString(sum[:])
441+
return utils.ComputeSpecHash(spec)
451442
}
452443

453444
// publishReindexMessages lists all resources matching the policy's TargetResource
@@ -498,13 +489,16 @@ func (r *ResourceIndexPolicyReconciler) publishReindexMessages(
498489
return fmt.Errorf("failed to list %v resources: %w", gvk, err)
499490
}
500491

492+
indexName := policy.Status.IndexName
493+
specHash := computeSpecHash(&policy.Spec)
494+
501495
for i := range list.Items {
502496
obj := &list.Items[i]
503497
logger.Info("Publishing re-index message", "resource", obj.GetName(), "namespace", obj.GetNamespace())
504498
// Use "reindex/<policyName>/<uid>" as the resourceID so that duplicate
505499
// messages from rapid policy updates are deduplicated by NATS.
506500
resourceID := fmt.Sprintf("reindex/%s/%s", policy.Name, obj.GetUID())
507-
if err := r.ReindexPublisher.PublishResource(ctx, obj.Object, resourceID); err != nil {
501+
if err := r.ReindexPublisher.PublishResource(ctx, obj.Object, resourceID, policy.Name, indexName, specHash); err != nil {
508502
logger.Error(err, "Failed to publish re-index message",
509503
"resource", obj.GetName(), "namespace", obj.GetNamespace())
510504
continue

internal/indexer/policy_cache.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,13 @@ func (c *PolicyCache) WaitForCacheSync(ctx context.Context) bool {
175175
return c.cache.WaitForCacheSync(ctx)
176176
}
177177

178+
// GetPolicy returns a single cached policy by name, or nil if not found.
179+
func (c *PolicyCache) GetPolicy(name string) *policyevaluation.CachedPolicy {
180+
c.mu.RLock()
181+
defer c.mu.RUnlock()
182+
return c.policies[name]
183+
}
184+
178185
// GetPolicies returns a snapshot of all cached policies.
179186
func (c *PolicyCache) GetPolicies() []*policyevaluation.CachedPolicy {
180187
c.mu.RLock()

internal/indexer/reindex_consumer.go

Lines changed: 57 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"fmt"
77

88
"github.com/nats-io/nats.go/jetstream"
9+
"go.miloapis.net/search/internal/utils"
910
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
1011
"k8s.io/klog/v2"
1112
)
@@ -58,49 +59,7 @@ func (r *ReindexConsumer) Start(ctx context.Context) error {
5859
return
5960
}
6061

61-
queued := false
62-
policies := r.policyCache.GetPolicies()
63-
64-
for _, cp := range policies {
65-
// Skip if index name is not set yet
66-
if cp.Policy.Status.IndexName == "" {
67-
klog.V(2).Infof("ReindexConsumer: policy %s has no IndexName in status, skipping", cp.Policy.Name)
68-
continue
69-
}
70-
71-
evalResult, err := cp.Evaluate(obj)
72-
if err != nil {
73-
klog.Errorf("ReindexConsumer: policy %s evaluation error: %v", cp.Policy.Name, err)
74-
continue
75-
}
76-
77-
if evalResult.Matched {
78-
klog.V(4).Infof("ReindexConsumer: match policy=%s resource=%s/%s (id=%s)",
79-
cp.Policy.Name, obj.GetNamespace(), obj.GetName(), event.ID)
80-
81-
// Transform into indexable document
82-
doc := evalResult.Transform()
83-
84-
// Ensure UID is set as primary key
85-
ensureUID(doc, resourceUID)
86-
87-
r.batcher.QueueUpsert(cp.Policy.Status.IndexName, doc, &msg)
88-
queued = true
89-
} else {
90-
klog.V(4).Infof("ReindexConsumer: policy %s did not match resource %s/%s (id=%s), ensuring deletion", cp.Policy.Name, obj.GetNamespace(), obj.GetName(), event.ID)
91-
92-
// If it doesn't match this policy, we should ensure it's removed from the index
93-
// in case it was previously indexed there.
94-
r.batcher.QueueDelete(cp.Policy.Status.IndexName, resourceUID, &msg)
95-
queued = true
96-
}
97-
}
98-
99-
// If the message wasn't queued for any operation (e.g. no policies), acknowledge it
100-
if !queued {
101-
klog.Warningf("ReindexConsumer: event (id=%s) matched no active policies in cache, skipping", event.ID)
102-
msg.Ack()
103-
}
62+
r.processTargetedEvent(msg, event, obj, resourceUID)
10463
})
10564

10665
if err != nil {
@@ -113,3 +72,58 @@ func (r *ReindexConsumer) Start(ctx context.Context) error {
11372
klog.Info("Shutting down ReindexConsumer...")
11473
return nil
11574
}
75+
76+
// processTargetedEvent evaluates a resource against the specific policy identified
77+
// in the event. It verifies the cached policy's spec hash matches the event's
78+
// spec hash to ensure evaluation uses the correct (updated) conditions.
79+
func (r *ReindexConsumer) processTargetedEvent(msg jetstream.Msg, event ReindexEvent, obj *unstructured.Unstructured, resourceUID string) {
80+
if event.PolicyName == "" || event.IndexName == "" {
81+
klog.Warningf("ReindexConsumer: event (id=%s) missing policyName or indexName, dropping", event.ID)
82+
msg.Ack()
83+
return
84+
}
85+
86+
cp := r.policyCache.GetPolicy(event.PolicyName)
87+
if cp == nil {
88+
// Policy not in cache yet — NAK so the message is redelivered
89+
// after the informer propagates the policy to the cache.
90+
klog.V(2).Infof("ReindexConsumer: policy %s not in cache yet, NAK for redelivery (id=%s)",
91+
event.PolicyName, event.ID)
92+
msg.Nak()
93+
return
94+
}
95+
96+
// Verify the cached policy spec matches the version that triggered re-indexing.
97+
if event.SpecHash != "" {
98+
cachedHash := utils.ComputeSpecHash(&cp.Policy.Spec)
99+
if cachedHash != event.SpecHash {
100+
// Cache is stale — NAK so the message is redelivered after
101+
// the informer propagates the updated policy spec.
102+
klog.V(2).Infof("ReindexConsumer: policy %s cache stale (cached=%s, event=%s), NAK for redelivery (id=%s)",
103+
event.PolicyName, cachedHash[:8], event.SpecHash[:8], event.ID)
104+
msg.Nak()
105+
return
106+
}
107+
}
108+
109+
evalResult, err := cp.Evaluate(obj)
110+
if err != nil {
111+
klog.Errorf("ReindexConsumer: policy %s evaluation error: %v", event.PolicyName, err)
112+
msg.Ack()
113+
return
114+
}
115+
116+
if evalResult.Matched {
117+
klog.V(4).Infof("ReindexConsumer: match policy=%s resource=%s/%s (id=%s)",
118+
event.PolicyName, obj.GetNamespace(), obj.GetName(), event.ID)
119+
120+
doc := evalResult.Transform()
121+
ensureUID(doc, resourceUID)
122+
r.batcher.QueueUpsert(event.IndexName, doc, &msg)
123+
} else {
124+
klog.V(4).Infof("ReindexConsumer: policy %s did not match resource %s/%s (id=%s), deleting from index",
125+
event.PolicyName, obj.GetNamespace(), obj.GetName(), event.ID)
126+
127+
r.batcher.QueueDelete(event.IndexName, resourceUID, &msg)
128+
}
129+
}

internal/indexer/reindex_publisher.go

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,13 @@ type ReindexEvent struct {
1515
ID string `json:"id"`
1616
// Resource is the full Kubernetes resource object to be re-indexed.
1717
Resource map[string]any `json:"resource"`
18+
// PolicyName identifies the policy that triggered this re-index.
19+
PolicyName string `json:"policyName"`
20+
// IndexName is the Meilisearch index name from the policy status.
21+
IndexName string `json:"indexName"`
22+
// SpecHash is the SHA-256 hash of the policy spec at the time of publishing.
23+
// The consumer uses this to ensure it evaluates against the correct policy version.
24+
SpecHash string `json:"specHash"`
1825
}
1926

2027
// ReindexPublisher publishes ReindexEvents to the REINDEX_EVENTS JetStream stream.
@@ -29,10 +36,13 @@ func NewReindexPublisher(js jetstream.JetStream, subject string) *ReindexPublish
2936
}
3037

3138
// PublishResource publishes a single Kubernetes resource for re-indexing.
32-
func (p *ReindexPublisher) PublishResource(ctx context.Context, resource map[string]any, id string) error {
39+
func (p *ReindexPublisher) PublishResource(ctx context.Context, resource map[string]any, id, policyName, indexName, specHash string) error {
3340
evt := ReindexEvent{
34-
ID: id,
35-
Resource: resource,
41+
ID: id,
42+
Resource: resource,
43+
PolicyName: policyName,
44+
IndexName: indexName,
45+
SpecHash: specHash,
3646
}
3747

3848
data, err := json.Marshal(evt)

internal/utils/search.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
package utils
22

33
import (
4+
"crypto/sha256"
5+
"encoding/hex"
6+
"encoding/json"
47
"strings"
58

69
searchv1alpha1 "go.miloapis.net/search/pkg/apis/search/v1alpha1"
@@ -10,3 +13,15 @@ func GetSearchIndex(tg searchv1alpha1.TargetResource) string {
1013
res := tg.Group + "_" + tg.Version + "_" + tg.Kind
1114
return strings.ReplaceAll(res, ".", "-")
1215
}
16+
17+
// ComputeSpecHash returns a SHA-256 hex digest of the policy spec.
18+
// Used by both the controller (to detect spec changes) and the re-index
19+
// consumer (to verify cache freshness before evaluating).
20+
func ComputeSpecHash(spec *searchv1alpha1.ResourceIndexPolicySpec) string {
21+
b, err := json.Marshal(spec)
22+
if err != nil {
23+
return ""
24+
}
25+
sum := sha256.Sum256(b)
26+
return hex.EncodeToString(sum[:])
27+
}

0 commit comments

Comments
 (0)