reverted unwanted changes

Hongyu Zhou · Hongyu Zhou · commit 68f556fb34a4 · 2023-08-09T15:27:17.000-04:00
diff --git a/.github/workflows/build-ctlstore.yml b/.github/workflows/build-ctlstore.yml
diff --git a/Dockerfile b/Dockerfile
@@ -22,4 +22,4 @@ RUN apk --no-cache add sqlite
 
 COPY --from=0 /bin/chamber /bin/chamber
 COPY --from=0 /usr/local/bin/ctlstore /usr/local/bin/
-COPY --from=0 /usr/local/bin/ctlstore-cli /usr/local/bin/
+COPY --from=0 /usr/local/bin/ctlstore-cli /usr/local/bin/
diff --git a/pkg/cmd/ctlstore/main.go b/pkg/cmd/ctlstore/main.go
@@ -23,6 +23,7 @@ import (
 	"github.com/segmentio/ctlstore/pkg/errs"
 	executivepkg "github.com/segmentio/ctlstore/pkg/executive"
 	heartbeatpkg "github.com/segmentio/ctlstore/pkg/heartbeat"
+	"github.com/segmentio/ctlstore/pkg/ldbwriter"
 	"github.com/segmentio/ctlstore/pkg/ledger"
 	reflectorpkg "github.com/segmentio/ctlstore/pkg/reflector"
 	sidecarpkg "github.com/segmentio/ctlstore/pkg/sidecar"
@@ -46,22 +47,25 @@ type sidecarConfig struct {
 }
 
 type reflectorCliConfig struct {
-	LDBPath               string             `conf:"ldb-path" help:"Path to LDB file" validate:"nonzero"`
-	ChangelogPath         string             `conf:"changelog-path" help:"Path to changelog file"`
-	ChangelogSize         int                `conf:"changelog-size" help:"Maximum size of the changelog file"`
-	UpstreamDriver        string             `conf:"upstream-driver" help:"Upstream driver name (e.g. sqlite3)" validate:"nonzero"`
-	UpstreamDSN           string             `conf:"upstream-dsn" help:"Upstream DSN (e.g. path to file if sqlite3)" validate:"nonzero"`
-	UpstreamLedgerTable   string             `conf:"upstream-ledger-table" help:"Table on the upstream to look for statement ledger"`
-	BootstrapURL          string             `conf:"bootstrap-url" help:"Bootstraps LDB from an S3 URL"`
-	BootstrapRegion       string             `conf:"bootstrap-region" help:"If specified, indicates which region in which the S3 bucket lives"`
-	PollInterval          time.Duration      `conf:"poll-interval" help:"How often to pull the upstream" validate:"nonzero"`
-	PollJitterCoefficient float64            `conf:"poll-jitter-coefficient" help:"Coefficient for poll jittering"`
-	QueryBlockSize        int                `conf:"query-block-size" help:"Number of ledger entries to get at once"`
-	Debug                 bool               `conf:"debug" help:"Turns on debug logging"`
-	LedgerHealth          ledgerHealthConfig `conf:"ledger-latency" help:"Configure ledger latency behavior"`
-	Dogstatsd             dogstatsdConfig    `conf:"dogstatsd" help:"dogstatsd Configuration"`
-	MetricsBind           string             `conf:"metrics-bind" help:"address to serve Prometheus metircs"`
-	WALPollInterval       time.Duration      `conf:"wal-poll-interval" help:"How often to pull the sqlite's wal size and status. 0 indicates disabled monitoring'"`
+	LDBPath                    string                   `conf:"ldb-path" help:"Path to LDB file" validate:"nonzero"`
+	ChangelogPath              string                   `conf:"changelog-path" help:"Path to changelog file"`
+	ChangelogSize              int                      `conf:"changelog-size" help:"Maximum size of the changelog file"`
+	UpstreamDriver             string                   `conf:"upstream-driver" help:"Upstream driver name (e.g. sqlite3)" validate:"nonzero"`
+	UpstreamDSN                string                   `conf:"upstream-dsn" help:"Upstream DSN (e.g. path to file if sqlite3)" validate:"nonzero"`
+	UpstreamLedgerTable        string                   `conf:"upstream-ledger-table" help:"Table on the upstream to look for statement ledger"`
+	BootstrapURL               string                   `conf:"bootstrap-url" help:"Bootstraps LDB from an S3 URL"`
+	BootstrapRegion            string                   `conf:"bootstrap-region" help:"If specified, indicates which region in which the S3 bucket lives"`
+	PollInterval               time.Duration            `conf:"poll-interval" help:"How often to pull the upstream" validate:"nonzero"`
+	PollJitterCoefficient      float64                  `conf:"poll-jitter-coefficient" help:"Coefficient for poll jittering"`
+	QueryBlockSize             int                      `conf:"query-block-size" help:"Number of ledger entries to get at once"`
+	Debug                      bool                     `conf:"debug" help:"Turns on debug logging"`
+	LedgerHealth               ledgerHealthConfig       `conf:"ledger-latency" help:"Configure ledger latency behavior"`
+	Dogstatsd                  dogstatsdConfig          `conf:"dogstatsd" help:"dogstatsd Configuration"`
+	MetricsBind                string                   `conf:"metrics-bind" help:"address to serve Prometheus metircs"`
+	WALPollInterval            time.Duration            `conf:"wal-poll-interval" help:"How often to pull the sqlite's wal size and status. 0 indicates disabled monitoring'"`
+	WALCheckpointThresholdSize int                      `conf:"wal-checkpoint-threshold-size" help:"Performs a checkpoint after the WAL file exceeds this size in bytes"`
+	WALCheckpointType          ldbwriter.CheckpointType `conf:"wal-checkpoint-type" help:"what type of checkpoint to manually perform once the wal size is exceeded"`
+	BusyTimeoutMS              int                      `conf:"busy-timeout-ms" help:"Set a busy timeout on the connection string for sqlite in milliseconds"`
 }
 
 type executiveCliConfig struct {
@@ -490,6 +494,9 @@ func defaultReflectorCLIConfig(isSupervisor bool) reflectorCliConfig {
 		},
 		// disabled by default
 		WALPollInterval: 0,
+		// 8 MB, double what a "healthy" WAL file should be https://www.sqlite.org/compile.html#default_wal_autocheckpoint
+		WALCheckpointThresholdSize: 8 * 1024 * 1024,
+		WALCheckpointType:          ldbwriter.Passive,
 	}
 	if isSupervisor {
 		// the supervisor runs as an ECS task, so it cannot yet set
@@ -547,7 +554,10 @@ func newReflector(cliCfg reflectorCliConfig, isSupervisor bool) (*reflectorpkg.R
 			QueryBlockSize:        cliCfg.QueryBlockSize,
 			PollTimeout:           5 * time.Second,
 		},
-		WALPollInterval: cliCfg.WALPollInterval,
-		DoMonitorWAL:    cliCfg.WALPollInterval > 0,
+		WALPollInterval:            cliCfg.WALPollInterval,
+		DoMonitorWAL:               cliCfg.WALPollInterval > 0,
+		WALCheckpointThresholdSize: cliCfg.WALCheckpointThresholdSize,
+		WALCheckpointType:          cliCfg.WALCheckpointType,
+		BusyTimeoutMS:              cliCfg.BusyTimeoutMS,
 	})
 }
diff --git a/pkg/reflector/wal_monitor_test.go b/pkg/reflector/wal_monitor_test.go
@@ -5,25 +5,26 @@ import (
 	"fmt"
 	"os"
 	"sync"
+	"sync/atomic"
 	"testing"
 	"time"
 
 	"github.com/segmentio/ctlstore/pkg/ldbwriter"
 )
 
 type fake struct {
-	size           int64
-	err            error
-	wg             sync.WaitGroup
-	statCallCount  int
-	checkCallCount int
+	size          int64
+	err           error
+	wg            sync.WaitGroup
+	statCallCount atomic.Int64
+	cpCallCount   atomic.Int64
 }
 
 func (f *fake) Stat() func(m *WALMonitor) {
 	return func(m *WALMonitor) {
 		m.walSizeFunc = func(p string) (int64, error) {
 			defer f.wg.Done()
-			f.statCallCount++
+			f.statCallCount.Add(1)
 			v, err := m.getWALSize(p)
 			f.size = v
 			f.err = err
@@ -43,13 +44,14 @@ func (f *fake) Ticker() func(m *WALMonitor) {
 func (f *fake) Checkpointer() func(m *WALMonitor) {
 	return func(m *WALMonitor) {
 		m.cpTesterFunc = func() (*ldbwriter.PragmaWALResult, error) {
-			f.checkCallCount++
+			defer f.wg.Done()
+			f.cpCallCount.Add(1)
 			return nil, fmt.Errorf("fail")
 		}
 	}
 }
 
-func TestWALMonitorSize(t *testing.T) {
+func TestWALMonitorTooSmall(t *testing.T) {
 	tmpdir := t.TempDir()
 	f, err := os.CreateTemp(tmpdir, "*.ldb-wal")
 	if err != nil {
@@ -66,10 +68,57 @@ func TestWALMonitorSize(t *testing.T) {
 	}
 
 	var fake fake
-	fake.wg.Add(1)
+	fake.wg.Add(2)
 	mon := NewMonitor(MonitorConfig{
-		PollInterval: time.Millisecond,
-		Path:         f.Name(),
+		PollInterval:               time.Millisecond,
+		Path:                       f.Name(),
+		WALCheckpointThresholdSize: int64(n + 1),
+	}, nil, fake.Stat(), fake.Ticker(), fake.Checkpointer())
+
+	ctx, cancel := context.WithCancel(context.Background())
+	go mon.Start(ctx)
+	// wait for fake stat call
+	fake.wg.Wait()
+	cancel()
+
+	if fake.statCallCount.Load() == 0 {
+		t.Errorf("Stat should have been called at least once")
+	}
+
+	if fake.cpCallCount.Load() != 0 {
+		t.Errorf("Checkpoint should not have been called since the file wasn't large enough")
+	}
+	if fake.err != nil {
+		t.Errorf("unexpected error on stat: %v", fake.err)
+	}
+
+	if int64(n) != fake.size {
+		t.Errorf("expected file size of %d, got %d", n, fake.size)
+	}
+}
+
+func TestWALMonitorBigEnough(t *testing.T) {
+	tmpdir := t.TempDir()
+	f, err := os.CreateTemp(tmpdir, "*.ldb-wal")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	n, err := f.WriteString("some random bytes!")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if f.Sync() != nil {
+		t.Fatal(err)
+	}
+
+	var fake fake
+	fake.wg.Add(2)
+	mon := NewMonitor(MonitorConfig{
+		PollInterval:               time.Millisecond,
+		Path:                       f.Name(),
+		WALCheckpointThresholdSize: int64(n - 1),
 	}, nil, fake.Stat(), fake.Ticker(), fake.Checkpointer())
 
 	ctx, cancel := context.WithCancel(context.Background())
@@ -78,11 +127,11 @@ func TestWALMonitorSize(t *testing.T) {
 	fake.wg.Wait()
 	cancel()
 
-	if fake.statCallCount == 0 {
+	if fake.statCallCount.Load() == 0 {
 		t.Errorf("Stat should have been called at least once")
 	}
 
-	if fake.checkCallCount == 0 {
+	if fake.cpCallCount.Load() == 0 {
 		t.Errorf("Checkpoint should have been called at least once")
 	}
 	if fake.err != nil {
@@ -103,16 +152,16 @@ func TestNoWALPath(t *testing.T) {
 
 	mon.Start(context.Background())
 
-	if fake.statCallCount != 0 {
+	if fake.statCallCount.Load() != 0 {
 		t.Errorf("Stat should not have been called")
 	}
 
-	if fake.checkCallCount != 0 {
+	if fake.cpCallCount.Load() != 0 {
 		t.Errorf("Checkpoint should not have been called")
 	}
 }
 
-func TestWALMonitorStopsOnError(t *testing.T) {
+func TestWALMonitorStopsOnStatError(t *testing.T) {
 	var fake fake
 	fake.wg.Add(5)
 	mon := NewMonitor(MonitorConfig{
@@ -122,11 +171,41 @@ func TestWALMonitorStopsOnError(t *testing.T) {
 
 	mon.Start(context.Background())
 	fake.wg.Wait()
-	if fake.statCallCount != 5 {
-		t.Errorf("Stat should have been called 5 times, got %d", fake.statCallCount)
+	if fake.statCallCount.Load() != 5 {
+		t.Errorf("Stat should have been called 5 times, got %d", fake.statCallCount.Load())
 	}
 
-	if fake.checkCallCount != 5 {
-		t.Errorf("Checkpoint should have have been called 5 times, got %d", fake.checkCallCount)
+	if fake.cpCallCount.Load() != 0 {
+		t.Errorf("Checkpoint should not have been called")
+	}
+}
+
+func TestWALMonitorStopsOnCheckpointError(t *testing.T) {
+	tmpdir := t.TempDir()
+	f, err := os.CreateTemp(tmpdir, "*.ldb-wal")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	_, err = f.WriteString("some random bytes!") // non-empty WAL file for the monitor to stat
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	var fake fake
+	fake.wg.Add(10) // the Stat and Checkpointer fakes each call wg.Done once per invocation: 5 + 5 expected
+	mon := NewMonitor(MonitorConfig{
+		PollInterval: 50 * time.Microsecond,
+		Path:         f.Name(), // WALCheckpointThresholdSize is left at its zero value; presumably any non-empty WAL then triggers a checkpoint - TODO confirm
+	}, nil, fake.Stat(), fake.Checkpointer())
+
+	mon.Start(context.Background()) // called synchronously; presumably returns once the monitor gives up after repeated checkpoint errors - TODO confirm
+	fake.wg.Wait()
+	if fake.statCallCount.Load() != 5 {
+		t.Errorf("Stat should have been called 5 times, got %d", fake.statCallCount.Load())
+	}
+
+	if fake.cpCallCount.Load() != 5 {
+		t.Errorf("Checkpoint should have been called 5 times, got %d", fake.cpCallCount.Load())
+	}
 }