Skip to content

Commit 40a0095

Browse files
authored
feat: Add basic testing large syncs support (#1862)
#### Summary A very crude and simple way to address cloudquery/cloudquery-issues#1846. Adds a new hidden ~~fuzz~~ test scheduler that only multiplies the clients (at the moment). The code is based on the shuffle scheduler, and then duplicates each client based on the multiplier. See example: ``` cloudquery sync examples/pagerduty-postgres.yml Loading spec(s) from examples/pagerduty-postgres.yml Starting sync for: pagerduty (local@/Users/erezrokah/code/github/cloudquery/cloudquery-private/plugins/source/pagerduty/pagerduty) -> [postgresql (cloudquery/postgresql@v8.2.7)] Sync completed successfully. Resources: 525, Errors: 0, Warnings: 0, Time: 7s ``` ``` CQ_DEBUG_SYNC_MULTIPLIER=50 cloudquery sync examples/pagerduty-postgres.yml Loading spec(s) from examples/pagerduty-postgres.yml Starting sync for: pagerduty (local@/Users/erezrokah/code/github/cloudquery/cloudquery-private/plugins/source/pagerduty/pagerduty) -> [postgresql (cloudquery/postgresql@v8.2.7)] Sync completed successfully. Resources: 26385, Errors: 0, Warnings: 0, Time: 2m12s ``` This has a couple of downsides/tradeoffs: 1. There will be clients with duplicate IDs, which breaks the metrics counts https://github.com/cloudquery/plugin-sdk/blob/25ed3d25a529a22f351ab92e22fb03a19c9557d4/scheduler/metrics.go#L144 2. If a plugin uses the client ID to ensure uniqueness for state client keys, that logic will break too 3. If a table doesn't have any resources, the impact of the multiplier will be lower However I think this is still useful if we want to artificially make a sync large (e.g. simulate a sync on many AWS accounts) ---
1 parent 25ed3d2 commit 40a0095

File tree

2 files changed

+102
-0
lines changed

2 files changed

+102
-0
lines changed

scheduler/scheduler.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,14 @@ func (s *Scheduler) Sync(ctx context.Context, client schema.ClientMeta, tables s
217217
resources := make(chan *schema.Resource)
218218
go func() {
219219
defer close(resources)
220+
testMultiplier, err := getTestMultiplier()
221+
if err != nil {
222+
panic(err)
223+
}
224+
if testMultiplier > 0 {
225+
syncClient.syncTest(ctx, testMultiplier, resources)
226+
return
227+
}
220228
switch s.strategy {
221229
case StrategyDFS:
222230
syncClient.syncDfs(ctx, resources)

scheduler/scheduler_tezt.go

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
package scheduler
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"os"
7+
"strconv"
8+
"sync"
9+
10+
"github.com/cloudquery/plugin-sdk/v4/schema"
11+
)
12+
13+
const (
	// cqDebugSyncMultiplier is the environment variable that enables the hidden
	// "test" scheduler. This is an environment variable and not a spec option in
	// each plugin to make it easier to enable.
	cqDebugSyncMultiplier = "CQ_DEBUG_SYNC_MULTIPLIER"
)

// getTestMultiplier reads the sync multiplier from the CQ_DEBUG_SYNC_MULTIPLIER
// environment variable. It returns 0 when the variable is unset or empty
// (i.e. the test scheduler is disabled), and a non-nil error when the variable
// is set to a value that is not a valid integer.
func getTestMultiplier() (int, error) {
	strValue, ok := os.LookupEnv(cqDebugSyncMultiplier)
	if !ok || strValue == "" {
		// Treat an empty value (e.g. `CQ_DEBUG_SYNC_MULTIPLIER= cloudquery sync ...`)
		// the same as unset. os.LookupEnv reports ok=true for a set-but-empty
		// variable, and strconv.Atoi("") would error, making the caller panic for
		// a variable that was never meaningfully configured.
		return 0, nil
	}
	intValue, err := strconv.Atoi(strValue)
	if err != nil {
		return 0, fmt.Errorf("failed to parse %s=%s as integer: %w", cqDebugSyncMultiplier, strValue, err)
	}
	return intValue, nil
}
29+
30+
func (s *syncClient) syncTest(ctx context.Context, syncMultiplier int, resolvedResources chan<- *schema.Resource) {
31+
// we have this because plugins can return sometimes clients in a random way which will cause
32+
// differences between this run and the next one.
33+
preInitialisedClients := make([][]schema.ClientMeta, len(s.tables))
34+
tableNames := make([]string, len(s.tables))
35+
for i, table := range s.tables {
36+
tableNames[i] = table.Name
37+
clients := []schema.ClientMeta{s.client}
38+
if table.Multiplex != nil {
39+
clients = table.Multiplex(s.client)
40+
}
41+
// Detect duplicate clients while multiplexing
42+
seenClients := make(map[string]bool)
43+
for _, c := range clients {
44+
if _, ok := seenClients[c.ID()]; !ok {
45+
seenClients[c.ID()] = true
46+
} else {
47+
s.logger.Warn().Str("client", c.ID()).Str("table", table.Name).Msg("multiplex returned duplicate client")
48+
}
49+
}
50+
preInitialisedClients[i] = clients
51+
// we do this here to avoid locks so we initialize the metrics structure once in the main goroutine
52+
// and then we can just read from it in the other goroutines concurrently given we are not writing to it.
53+
s.metrics.initWithClients(table, clients)
54+
}
55+
56+
// First interleave the tables like in round-robin
57+
tableClients := roundRobinInterleave(s.tables, preInitialisedClients)
58+
// Then shuffle the tableClients to randomize the order in which they are retrieved.
59+
// We use a fixed seed so that runs with the same tables and clients perform similarly across syncs
60+
// however, if the table order changes, the seed will change and the shuffle order will be different,
61+
// so users have a little bit of control over the randomization.
62+
seed := hashTableNames(tableNames)
63+
allClients := make([]tableClient, 0, len(tableClients)*syncMultiplier)
64+
for _, tc := range tableClients {
65+
for i := 0; i < syncMultiplier; i++ {
66+
allClients = append(allClients, tc)
67+
}
68+
}
69+
shuffle(allClients, seed)
70+
71+
var wg sync.WaitGroup
72+
for _, tc := range allClients {
73+
table := tc.table
74+
cl := tc.client
75+
if err := s.scheduler.tableSems[0].Acquire(ctx, 1); err != nil {
76+
// This means context was cancelled
77+
wg.Wait()
78+
return
79+
}
80+
wg.Add(1)
81+
go func() {
82+
defer wg.Done()
83+
defer s.scheduler.tableSems[0].Release(1)
84+
// Not checking for error here as nothing much to do.
85+
// the error is logged and this happens when context is cancelled.
86+
// This currently uses the DFS algorithm to resolve the tables, but this
87+
// may change in the future.
88+
s.resolveTableDfs(ctx, table, cl, nil, resolvedResources, 1)
89+
}()
90+
}
91+
92+
// Wait for all the worker goroutines to finish
93+
wg.Wait()
94+
}

0 commit comments

Comments
 (0)