Skip to content

Commit d0e8dde

Browse files
authored
build/devenv/tests/e2e: add indexer down chaos test (#503)
* build/devenv/tests/e2e: add indexer down chaos test

  Add a chaos test that brings the indexer down during a period when it is needed to index verifier results, and then brings it back up. The indexer is expected to recover these verifier results, and the executor is expected to execute the message. While writing the test, a nil pointer dereference was found; it was fixed by checking whether the response is nil.
* run indexer chaos in a separate job
* timeouts per test in matrix
* add indexer restart policy
* try the always restart policy
* ping indexer db on startup
* try on-failure
* retry with sleeps in main.go
* combine indexer chaos with other tests to reduce ci time
* dump logs for timed out cl test
1 parent 24cfa32 commit d0e8dde

File tree

7 files changed

+204
-75
lines changed

7 files changed

+204
-75
lines changed

.github/workflows/test-cl-smoke.yaml

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,14 @@ jobs:
2020
runs-on: runs-on=${{ github.run_id }}-${{ strategy.job-index }}/family=c6i/cpu=32+48/ram=64+96/spot=false/image=ubuntu24-full-x64/extras=s3-cache+tmpfs
2121
strategy:
2222
matrix:
23-
test: [
24-
{ name: "TestE2ESmoke", config: "env.toml,env-cl.toml,env-cl-ci.toml" },
23+
test:
24+
- name: TestE2ESmoke
25+
config: "env.toml,env-cl.toml,env-cl-ci.toml"
26+
timeout: 15m
2527
# We need to configure HeadTracker for the CL tests to have finality depth. Otherwise, it does instant finality.
26-
# { name: "TestE2EReorg", config: "env.toml,env-src-auto-mine.toml,env-cl.toml,env-cl-ci.toml" },
27-
]
28+
# - name: TestE2EReorg
29+
# config: "env.toml,env-src-auto-mine.toml,env-cl.toml,env-cl-ci.toml"
30+
# timeout: 5m
2831
fail-fast: false
2932
steps:
3033
- name: Enable S3 Cache for Self-Hosted Runners
@@ -92,16 +95,44 @@ jobs:
9295
ccv u ${{ matrix.test.config }}
9396
9497
- name: Run Smoke Test
98+
id: test_run
9599
working-directory: build/devenv/tests/e2e
96100
env:
97101
LOKI_URL: http://localhost:3030/loki/api/v1/push
98102
run: |
99-
go test -v -timeout 15m -count=1 -run ${{ matrix.test.name }}
103+
set -o pipefail
104+
go test -v -timeout ${{ matrix.test.timeout }} -count=1 -run ${{ matrix.test.name }}
105+
continue-on-error: true
100106

107+
- name: Dump logs if they're not already dumped
108+
if: always()
109+
working-directory: build/devenv/tests/e2e
110+
run: |
111+
if [ ! -d "logs" ] || [ -z "$(ls -A logs)" ]; then
112+
echo "Logs not found or empty. Dumping logs manually..."
113+
ccv dl --dir-suffix "${{ matrix.test.name }}"
114+
else
115+
echo "Logs already exist. Skipping manual dump."
116+
fi
117+
118+
- name: Sanitize test name for log artifact name
119+
if: always()
120+
id: sanitize_name
121+
run: |
122+
# The vertical bar is invalid in uploaded artifact names, so we replace it (along with any parentheses) with an underscore.
123+
SANITIZED_NAME=$(echo "${{ matrix.test.name }}" | sed 's/[()|]/_/g')
124+
echo "name=$SANITIZED_NAME" >> $GITHUB_OUTPUT
125+
101126
- name: Upload Logs
102127
if: always()
103128
uses: actions/upload-artifact@v4
104129
with:
105-
name: container-logs-cl-${{ matrix.test.name }}
130+
name: container-logs-cl-${{ steps.sanitize_name.outputs.name }}
106131
path: build/devenv/tests/e2e/logs
107132
retention-days: 1
133+
134+
- name: Check test results
135+
if: always() && steps.test_run.outcome == 'failure'
136+
run: |
137+
echo "${{ matrix.test.name }} tests failed."
138+
exit 1

.github/workflows/test-smoke.yaml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,12 +25,16 @@ jobs:
2525
test:
2626
- name: TestE2ESmoke
2727
config: env.toml
28+
timeout: 15m
2829
- name: TestE2EReorg
2930
config: env.toml,env-src-auto-mine.toml
31+
timeout: 5m
3032
- name: TestChaos_AggregatorOutageRecovery
3133
config: env.toml
32-
- name: (TestChaos_VerifierFaultToleranceThresholdViolated|TestChaos_AllButOneExecutorDown)
34+
timeout: 5m
35+
- name: (TestChaos_VerifierFaultToleranceThresholdViolated|TestChaos_AllExecutorsDown|TestChaos_IndexerDown)
3336
config: env.toml
37+
timeout: 5m
3438
steps:
3539
- name: Enable S3 Cache for Self-Hosted Runners
3640
uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
@@ -91,7 +95,7 @@ jobs:
9195
working-directory: build/devenv/tests/e2e
9296
run: |
9397
set -o pipefail
94-
go test -v -timeout 15m -count=1 -run '${{ matrix.test.name }}'
98+
go test -v -timeout ${{ matrix.test.timeout }} -count=1 -run '${{ matrix.test.name }}'
9599
continue-on-error: true
96100

97101
- name: Dump logs if they're not already dumped

build/devenv/environment.go

Lines changed: 86 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,6 @@ const (
7878
)
7979

8080
type Cfg struct {
81-
Mode services.Mode `toml:"mode"`
8281
CLDF CLDF `toml:"cldf" validate:"required"`
8382
JD *jd.Input `toml:"jd" validate:"required"`
8483
Fake *services.FakeInput `toml:"fake" validate:"required"`
@@ -111,6 +110,8 @@ func (c *Cfg) NewAggregatorClientForCommittee(logger zerolog.Logger, committeeNa
111110
return NewAggregatorClient(logger, endpoint, caCertFile)
112111
}
113112

113+
// checkKeys performs basic sanity checks on the private key being used depending on which chain is in
114+
// the provided configuration.
114115
func checkKeys(in *Cfg) error {
115116
if getNetworkPrivateKey() != DefaultAnvilKey && in.Blockchains[0].ChainID == "1337" && in.Blockchains[1].ChainID == "2337" {
116117
return errors.New("you are trying to run simulated chains with a key that do not belong to Anvil, please run 'unset PRIVATE_KEY'")
@@ -133,7 +134,7 @@ func NewProductConfigurationFromNetwork(typ string) (cciptestinterfaces.CCIP17Pr
133134
}
134135
}
135136

136-
// NewEnvironment creates a new CCIP CCV environment either locally in Docker or remotely in K8s.
137+
// NewEnvironment creates a new CCIP CCV environment locally in Docker.
137138
func NewEnvironment() (in *Cfg, err error) {
138139
ctx := context.Background()
139140
timeTrack := NewTimeTracker(Plog)
@@ -149,6 +150,10 @@ func NewEnvironment() (in *Cfg, err error) {
149150
return nil, err
150151
}
151152

153+
/////////////////////////////
154+
// START: Read Config toml //
155+
/////////////////////////////
156+
152157
configs := strings.Split(os.Getenv(EnvVarTestConfigs), ",")
153158
if len(configs) > 1 {
154159
L.Warn().Msg("Multiple configuration files detected, this feature may be unsupported in the future.")
@@ -158,15 +163,6 @@ func NewEnvironment() (in *Cfg, err error) {
158163
return nil, fmt.Errorf("failed to load configuration: %w", err)
159164
}
160165

161-
///////////////////////////////
162-
// Start: Initialize Configs //
163-
///////////////////////////////
164-
165-
// Override the default config to "cl"...
166-
if in.Mode == "" {
167-
in.Mode = services.Standalone
168-
}
169-
170166
// Executor config...
171167
if in.Executor != nil {
172168
for _, exec := range in.Executor {
@@ -175,20 +171,23 @@ func NewEnvironment() (in *Cfg, err error) {
175171
}
176172

177173
/////////////////////////////
178-
// End: Initialize Configs //
174+
// END: Read Config toml //
179175
/////////////////////////////
180176

181-
if err = checkKeys(in); err != nil {
182-
return nil, err
183-
}
184-
185177
// Start fake data provider. This isn't really used, but may be useful in the future.
186178
_, err = services.NewFake(in.Fake)
187179
if err != nil {
188180
return nil, fmt.Errorf("failed to create fake data provider: %w", err)
189181
}
190182

191-
// Start blockchains, the services crash if the RPC is not available.
183+
///////////////////////////////
184+
// START: Deploy blockchains //
185+
// The services crash if the RPC is not available.
186+
///////////////////////////////
187+
if err = checkKeys(in); err != nil {
188+
return nil, err
189+
}
190+
192191
impls := make([]cciptestinterfaces.CCIP17ProductConfiguration, 0)
193192
for _, bc := range in.Blockchains {
194193
var impl cciptestinterfaces.CCIP17ProductConfiguration
@@ -204,9 +203,12 @@ func NewEnvironment() (in *Cfg, err error) {
204203
return nil, fmt.Errorf("failed to deploy local networks: %w", err)
205204
}
206205
}
206+
/////////////////////////////
207+
// END: Deploy blockchains //
208+
/////////////////////////////
207209

208210
////////////////////////////
209-
// Start: Launch CL Nodes //
211+
// START: Launch CL Nodes //
210212
// We launch the CL nodes first because they don't require any configuration from
211213
// the rest of the system to be up and running.
212214
// In addition, if we need to launch the nodes (i.e., if some services are not standalone),
@@ -222,10 +224,12 @@ func NewEnvironment() (in *Cfg, err error) {
222224
timeTrack.Record("[infra] deployed CL nodes")
223225

224226
//////////////////////////
225-
// End: Launch CL Nodes //
227+
// END: Launch CL Nodes //
226228
//////////////////////////
227229

228-
// Verifier configs...
230+
/////////////////////////////////////////////
231+
// START: Assign signing keys to verifiers //
232+
/////////////////////////////////////////////
229233
roundRobin := NewRoundRobinAssignment(onchainPublicKeys[signingKeyChainType])
230234
for i := range in.Verifier {
231235
ver := services.ApplyVerifierDefaults(*in.Verifier[i])
@@ -263,34 +267,13 @@ func NewEnvironment() (in *Cfg, err error) {
263267
// Apply changes back to input.
264268
in.Verifier[i] = &ver
265269
}
266-
267-
// JD is not currently used.
268-
/*
269-
prodJDImage := os.Getenv("JD_IMAGE")
270-
271-
if in.JD != nil {
272-
if prodJDImage != "" {
273-
in.JD.Image = prodJDImage
274-
}
275-
if len(in.JD.Image) == 0 {
276-
Plog.Warn().Msg("No JD image provided, skipping JD service startup")
277-
} else {
278-
_, err = jd.NewJD(in.JD)
279-
if err != nil {
280-
return nil, fmt.Errorf("failed to create JD service: %w", err)
281-
}
282-
}
283-
} else {
284-
Plog.Warn().Msg("No JD configuration provided, skipping JD service startup")
285-
}
286-
*/
287-
288-
timeTrack.Record("[infra] deploying blockchains")
270+
/////////////////////////////////////////////
271+
// END: Assign signing keys to verifiers //
272+
/////////////////////////////////////////////
289273

290274
/////////////////////////////
291-
// Start: Deploy contracts //
275+
// START: Deploy contracts //
292276
/////////////////////////////
293-
294277
var committees []cciptestinterfaces.OnChainCommittees
295278
{
296279
addrs := make(map[string][][]byte)
@@ -322,6 +305,7 @@ func NewEnvironment() (in *Cfg, err error) {
322305
}
323306
L.Info().Any("Selectors", selectors).Msg("Deploying for chain selectors")
324307

308+
timeTrack.Record("[infra] deploying blockchains")
325309
ds := datastore.NewMemoryDataStore()
326310
for i, impl := range impls {
327311
var networkInfo chainsel.ChainDetails
@@ -351,6 +335,13 @@ func NewEnvironment() (in *Cfg, err error) {
351335
}
352336
}
353337
e.DataStore = ds.Seal()
338+
///////////////////////////
339+
// END: Deploy contracts //
340+
///////////////////////////
341+
342+
/////////////////////////////////////////
343+
// START: Connect chains to each other //
344+
/////////////////////////////////////////
354345

355346
for i, impl := range impls {
356347
var networkInfo chainsel.ChainDetails
@@ -369,15 +360,15 @@ func NewEnvironment() (in *Cfg, err error) {
369360
return nil, err
370361
}
371362
}
372-
///////////////////////////
373-
// END: Deploy contracts //
374-
///////////////////////////
375363

376-
///////////////////////////////////////
377-
// Start: Launch standalone services //
378-
///////////////////////////////////////
364+
/////////////////////////////////////////
365+
// END: Connect chains to each other //
366+
/////////////////////////////////////////
367+
368+
///////////////////////////////
369+
// START: Launch aggregators //
370+
///////////////////////////////
379371

380-
// Start aggregators.
381372
in.AggregatorEndpoints = make(map[string]string)
382373
in.AggregatorCACertFiles = make(map[string]string)
383374

@@ -430,9 +421,15 @@ func NewEnvironment() (in *Cfg, err error) {
430421
}
431422
}
432423

433-
// Start indexer.
424+
///////////////////////////////
425+
// END: Launch aggregators //
426+
///////////////////////////////
427+
428+
///////////////////////////
429+
// START: Launch indexer //
434430
// start up the indexer after the aggregators are up to avoid spamming of errors
435431
// in the logs when it starts before the aggregators are up.
432+
///////////////////////////
436433
// Need to update the addresses in the indexer config due to contract deployment nondeterminism.
437434
for _, agg := range in.Aggregator {
438435
// XXX: in theory addresses should be matching across chains
@@ -497,6 +494,14 @@ func NewEnvironment() (in *Cfg, err error) {
497494

498495
in.IndexerEndpoint = indexerOut.ExternalHTTPURL
499496

497+
/////////////////////////
498+
// END: Launch indexer //
499+
/////////////////////////
500+
501+
/////////////////////////////
502+
// START: Launch executors //
503+
/////////////////////////////
504+
500505
if len(in.Executor) > 0 {
501506
execs, err := services.ResolveContractsForExecutor(e.DataStore, in.Blockchains, in.Executor)
502507
if err != nil {
@@ -533,6 +538,13 @@ func NewEnvironment() (in *Cfg, err error) {
533538
return nil, fmt.Errorf("failed to create standalone executor: %w", err)
534539
}
535540

541+
///////////////////////////
542+
// END: Launch executors //
543+
///////////////////////////
544+
545+
/////////////////////////////
546+
// START: Launch verifiers //
547+
/////////////////////////////
536548
// Populate verifier input with contract addresses from the CLDF datastore.
537549
for i := range in.Verifier {
538550
ver, err := services.ResolveContractsForVerifier(e.DataStore, in.Blockchains, *in.Verifier[i])
@@ -574,6 +586,14 @@ func NewEnvironment() (in *Cfg, err error) {
574586
return nil, fmt.Errorf("failed to create standalone verifiers: %w", err)
575587
}
576588

589+
/////////////////////////////
590+
// END: Launch verifiers //
591+
/////////////////////////////
592+
593+
///////////////////////////////////
594+
// START: Launch token verifiers //
595+
///////////////////////////////////
596+
577597
for i := range in.TokenVerifier {
578598
ver, err := services.ResolveContractsForTokenVerifier(e.DataStore, in.Blockchains, *in.TokenVerifier[i])
579599
if err != nil {
@@ -588,15 +608,25 @@ func NewEnvironment() (in *Cfg, err error) {
588608
return nil, fmt.Errorf("failed to create standalone token verifiers: %w", err)
589609
}
590610

591-
/////////////////////////////////////
592-
// End: Launch standalone services //
593-
/////////////////////////////////////
611+
///////////////////////////////////
612+
// END: Launch token verifiers //
613+
///////////////////////////////////
614+
615+
////////////////////////////////////////////////////
616+
// START: Create jobs for verifiers and executors //
617+
// Note that if they are started in standalone mode,
618+
// there would be no CL nodes and this would be a no-op.
619+
////////////////////////////////////////////////////
594620

595621
err = createJobs(in, in.Verifier, in.Executor)
596622
if err != nil {
597623
return nil, fmt.Errorf("failed to create jobs: %w", err)
598624
}
599625

626+
//////////////////////////////////////////////////
627+
// END: Create jobs for verifiers and executors //
628+
//////////////////////////////////////////////////
629+
600630
timeTrack.Print()
601631
if err = PrintCLDFAddresses(in); err != nil {
602632
return nil, err

build/devenv/services/indexer.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -340,10 +340,12 @@ func NewIndexer(in *IndexerInput) (*IndexerOutput, error) {
340340
return nil, fmt.Errorf("failed to get container host: %w", err)
341341
}
342342

343-
return &IndexerOutput{
343+
out := &IndexerOutput{
344344
ContainerName: in.ContainerName,
345345
ExternalHTTPURL: fmt.Sprintf("http://%s:%d", host, in.Port),
346346
InternalHTTPURL: fmt.Sprintf("http://%s:%d", in.ContainerName, in.Port),
347347
DBConnectionString: DefaultIndexerDBConnectionString,
348-
}, nil
348+
}
349+
in.Out = out
350+
return out, nil
349351
}

0 commit comments

Comments
 (0)