Skip to content

Commit 72a5c2b

Browse files
test(simulation): expose worker domain readiness errors
Signed-off-by: Diana Zawadzki <dzawa@live.de> Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent 5d16724 commit 72a5c2b

File tree

2 files changed

+49
-2
lines changed

2 files changed

+49
-2
lines changed

simulation/replication/replication_simulation_test.go

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ import (
3636
"encoding/json"
3737
"flag"
3838
"fmt"
39+
"io"
3940
"net"
4041
"net/http"
4142
"os"
@@ -680,6 +681,7 @@ func waitUntilWorkersReady(t *testing.T) {
680681
for {
681682
allHealthy := true
682683
lastStatus := make(map[string]int, len(workerEndpoints))
684+
lastBody := make(map[string]string, len(workerEndpoints))
683685
lastErr := make(map[string]error, len(workerEndpoints))
684686
for _, endpoint := range workerEndpoints {
685687
resp, err := client.Get(endpoint)
@@ -688,8 +690,12 @@ func waitUntilWorkersReady(t *testing.T) {
688690
allHealthy = false
689691
break
690692
}
693+
body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
691694
_ = resp.Body.Close()
692695
lastStatus[endpoint] = resp.StatusCode
696+
if len(body) > 0 {
697+
lastBody[endpoint] = strings.TrimSpace(string(body))
698+
}
693699
if resp.StatusCode != http.StatusOK {
694700
allHealthy = false
695701
break
@@ -705,7 +711,11 @@ func waitUntilWorkersReady(t *testing.T) {
705711
if err := lastErr[endpoint]; err != nil {
706712
simTypes.Logf(t, "Workers are not reporting healthy yet (%s error: %v). Sleep for 2s and try again", endpoint, err)
707713
} else if status, ok := lastStatus[endpoint]; ok {
708-
simTypes.Logf(t, "Workers are not reporting healthy yet (%s status: %d). Sleep for 2s and try again", endpoint, status)
714+
if body, ok := lastBody[endpoint]; ok && body != "" {
715+
simTypes.Logf(t, "Workers are not reporting healthy yet (%s status: %d body: %s). Sleep for 2s and try again", endpoint, status, body)
716+
} else {
717+
simTypes.Logf(t, "Workers are not reporting healthy yet (%s status: %d). Sleep for 2s and try again", endpoint, status)
718+
}
709719
}
710720
}
711721
time.Sleep(2 * time.Second)

simulation/replication/worker/cmd/main.go

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ package main
2222

2323
import (
2424
"context"
25+
"encoding/json"
2526
"flag"
2627
"fmt"
2728
"net/http"
@@ -54,6 +55,9 @@ var (
5455
clusterName = flag.String("cluster", "", "cluster name")
5556

5657
ready int32
58+
59+
domainErrMu sync.Mutex
60+
domainLastErr = map[string]string{}
5761
)
5862

5963
func main() {
@@ -101,12 +105,41 @@ func main() {
101105
apiv1.NewVisibilityAPIYARPCClient(clientConfig),
102106
)
103107

108+
type healthStatus struct {
109+
Cluster string `json:"cluster"`
110+
GRPCEndpoint string `json:"grpcEndpoint"`
111+
ReadyDomains int32 `json:"readyDomains"`
112+
TotalDomains int `json:"totalDomains"`
113+
LastErrors map[string]string `json:"lastErrors,omitempty"`
114+
}
115+
104116
http.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
105-
if atomic.LoadInt32(&ready) == int32(len(simCfg.Domains)) {
117+
readyDomains := atomic.LoadInt32(&ready)
118+
isReady := readyDomains == int32(len(simCfg.Domains))
119+
120+
domainErrMu.Lock()
121+
lastErrors := make(map[string]string, len(domainLastErr))
122+
for k, v := range domainLastErr {
123+
lastErrors[k] = v
124+
}
125+
domainErrMu.Unlock()
126+
127+
w.Header().Set("Content-Type", "application/json")
128+
status := healthStatus{
129+
Cluster: *clusterName,
130+
GRPCEndpoint: cluster.GRPCEndpoint,
131+
ReadyDomains: readyDomains,
132+
TotalDomains: len(simCfg.Domains),
133+
LastErrors: lastErrors,
134+
}
135+
b, _ := json.Marshal(status) // best-effort diagnostics
136+
137+
if isReady {
106138
w.WriteHeader(http.StatusOK)
107139
} else {
108140
w.WriteHeader(http.StatusServiceUnavailable)
109141
}
142+
_, _ = w.Write(b)
110143
})
111144
go http.ListenAndServe(":6060", nil)
112145

@@ -186,6 +219,10 @@ func waitUntilDomainReady(logger *zap.Logger, client workflowserviceclient.Inter
186219
return
187220
}
188221

222+
domainErrMu.Lock()
223+
domainLastErr[domainName] = err.Error()
224+
domainErrMu.Unlock()
225+
189226
logger.Info("Domains not ready", zap.String("domain", domainName), zap.Error(err))
190227
time.Sleep(2 * time.Second)
191228
}

0 commit comments

Comments
 (0)