diff --git a/app/app.go b/app/app.go index 922ae3179..0f51e6f10 100644 --- a/app/app.go +++ b/app/app.go @@ -155,7 +155,7 @@ func Run(ctx context.Context, conf Config) (err error) { life := new(lifecycle.Manager) if conf.PrivKeyLocking { - lockSvc, err := privkeylock.New(conf.PrivKeyFile+".lock", "charon run") + lockSvc, err := privkeylock.New(conf.PrivKeyFile, conf.LockFile, "charon run") if err != nil { return err } diff --git a/app/privkeylock/privkeylock.go b/app/privkeylock/privkeylock.go index c462a0231..8362ca2d2 100644 --- a/app/privkeylock/privkeylock.go +++ b/app/privkeylock/privkeylock.go @@ -4,6 +4,7 @@ package privkeylock import ( "encoding/json" + "fmt" "os" "time" @@ -17,50 +18,81 @@ var ( // updatePeriod is the duration after which the private key lock file is updated. updatePeriod = 1 * time.Second + + // gracePeriod is the duration after which a new cluster (new lock hash) can be run after an edit command. + // (we can't use chain spec at this time, so we use fixed duration of 768 seconds) + gracePeriod = 2 * 32 * 12 * time.Second ) // New returns new private key locking service. It errors if a recently-updated private key lock file exists. -func New(path, command string) (Service, error) { - content, err := os.ReadFile(path) +func New(privKeyFilePath, clusterLockFilePath, command string) (Service, error) { + clusterLockHash, err := readClusterLockHash(clusterLockFilePath) + if err != nil { + return Service{}, err + } + + privKeyFilePath += ".lock" + + content, err := os.ReadFile(privKeyFilePath) if errors.Is(err, os.ErrNotExist) { //nolint:revive // Empty block is fine. // No file, we will create it in run } else if err != nil { - return Service{}, errors.Wrap(err, "read private key lock file", z.Str("path", path)) + return Service{}, errors.Wrap(err, "read private key lock file", z.Str("path", privKeyFilePath)) } else { var meta metadata if err := json.Unmarshal(content, &meta); err != nil { - return Service{}, errors.Wrap(err, "decode private key lock file", z.Str("path", path)) + return Service{}, errors.Wrap(err, "decode private key lock file", z.Str("path", privKeyFilePath)) } if time.Since(meta.Timestamp) <= staleDuration { return Service{}, errors.New( "existing private key lock file found, another charon instance may be running on your machine", - z.Str("path", path), + z.Str("path", privKeyFilePath), z.Str("command", meta.Command), + z.Str("cluster_lock_hash", meta.ClusterLockHash), ) } + + if meta.ClusterLockHash != "" && clusterLockHash != meta.ClusterLockHash { + elapsedPeriod := time.Since(meta.Timestamp) + if elapsedPeriod < gracePeriod { + waitTime := gracePeriod - elapsedPeriod + errText := fmt.Sprintf("existing private key lock file found with different cluster lock hash, you must wait for %v before starting charon with the new cluster hash", waitTime) + + return Service{}, errors.New( + errText, + z.Str("path", privKeyFilePath), + z.Str("command", meta.Command), + z.Str("existing_cluster_lock_hash", meta.ClusterLockHash), + z.Str("current_cluster_lock_hash", clusterLockHash), + z.Str("grace_period", gracePeriod.String()), + ) + } + } } - if err := writeFile(path, command, time.Now()); err != nil { + if err := writeFile(privKeyFilePath, clusterLockHash, command, time.Now()); err != nil { return Service{}, err } return Service{ - command: command, - path: path, - updatePeriod: updatePeriod, - quit: make(chan struct{}), - done: make(chan struct{}), + clusterLockHash: clusterLockHash, + command: command, + path: privKeyFilePath, + updatePeriod: updatePeriod, + quit: make(chan struct{}), + done: make(chan struct{}), }, nil } // Service is a private key locking service. type Service struct { - command string - path string - updatePeriod time.Duration - quit chan struct{} // Quit exits the Run goroutine if closed. - done chan struct{} // Done is closed when Run exits, which exits the Close goroutine. + clusterLockHash string + command string + path string + updatePeriod time.Duration + quit chan struct{} // Quit exits the Run goroutine if closed. + done chan struct{} // Done is closed when Run exits, which exits the Close goroutine. } // Run runs the service, updating the lock file every second and deleting it on context cancellation. @@ -80,7 +112,7 @@ func (s Service) Run() error { return nil case <-tick.C: // Overwrite lockfile with new metadata - if err := writeFile(s.path, s.command, time.Now()); err != nil { + if err := writeFile(s.path, s.clusterLockHash, s.command, time.Now()); err != nil { return err } } @@ -96,13 +128,14 @@ func (s Service) Close() { // metadata is the metadata stored in the lock file. type metadata struct { - Command string `json:"command"` - Timestamp time.Time `json:"timestamp"` + Command string `json:"command"` + Timestamp time.Time `json:"timestamp"` + ClusterLockHash string `json:"cluster_lock_hash,omitempty"` } // writeFile creates or updates the file with the latest metadata. -func writeFile(path, command string, now time.Time) error { - b, err := json.Marshal(metadata{Command: command, Timestamp: now}) +func writeFile(path, clusterLockHash, command string, now time.Time) error { + b, err := json.Marshal(metadata{Command: command, Timestamp: now, ClusterLockHash: clusterLockHash}) if err != nil { return errors.Wrap(err, "marshal private key lock file") } @@ -114,3 +147,25 @@ func writeFile(path, command string, now time.Time) error { return nil } + +type clusterLockHash struct { + LockHash string `json:"lock_hash"` +} + +// readClusterLockHash reads only lock_hash field from the cluster lock file. +// Returns empty string if file doesn't exist (e.g., during DKG before cluster-lock.json is created). +func readClusterLockHash(clusterLockFilePath string) (string, error) { + content, err := os.ReadFile(clusterLockFilePath) + if errors.Is(err, os.ErrNotExist) { + return "", nil // File doesn't exist yet, return empty hash + } else if err != nil { + return "", errors.Wrap(err, "read cluster lock file", z.Str("path", clusterLockFilePath)) + } + + var hash clusterLockHash + if err := json.Unmarshal(content, &hash); err != nil { + return "", errors.Wrap(err, "decode cluster lock hash file", z.Str("path", clusterLockFilePath)) + } + + return hash.LockHash, nil +} diff --git a/app/privkeylock/privkeylock_internal_test.go b/app/privkeylock/privkeylock_internal_test.go index a6593ed71..50f15457b 100644 --- a/app/privkeylock/privkeylock_internal_test.go +++ b/app/privkeylock/privkeylock_internal_test.go @@ -3,6 +3,7 @@ package privkeylock import ( + "encoding/json" "os" "path/filepath" "testing" @@ -14,32 +15,38 @@ import ( ) func TestService(t *testing.T) { - path := filepath.Join(t.TempDir(), "privkeylocktest") + tmpDir := t.TempDir() + privKeyPath := filepath.Join(tmpDir, "privkey") + lockFilePath := filepath.Join(tmpDir, "cluster-lock.json") + lockPath := privKeyPath + ".lock" + + // Create cluster lock file. + writeClusterLockFile(t, lockFilePath, "hash123") // Create a stale file that is ignored. - err := writeFile(path, "test", time.Now().Add(-staleDuration)) + err := writeFile(lockPath, "hash123", "test", time.Now().Add(-staleDuration)) require.NoError(t, err) // Create a new service. - svc, err := New(path, "test") + svc, err := New(privKeyPath, lockFilePath, "test") require.NoError(t, err) // Increase the update period to make the test faster. svc.updatePeriod = time.Millisecond - assertFileExists(t, path) + assertFileExists(t, lockPath) // Assert a new service can't be created. - _, err = New(path, "test") + _, err = New(privKeyPath, lockFilePath, "test") require.ErrorContains(t, err, "existing private key lock file found") // Delete the file so Run will create it again. - require.NoError(t, os.Remove(path)) + require.NoError(t, os.Remove(lockPath)) var eg errgroup.Group eg.Go(svc.Run) // Run will create the file. eg.Go(func() error { - assertFileExists(t, path) + assertFileExists(t, lockPath) svc.Close() return nil @@ -48,10 +55,113 @@ func TestService(t *testing.T) { require.NoError(t, eg.Wait()) // Assert the file is deleted. - _, openErr := os.Open(path) + _, openErr := os.Open(lockPath) require.ErrorIs(t, openErr, os.ErrNotExist) } +func TestClusterHashMismatchWithinGracePeriod(t *testing.T) { + tmpDir := t.TempDir() + privKeyPath := filepath.Join(tmpDir, "privkey") + lockFilePath := filepath.Join(tmpDir, "cluster-lock.json") + lockPath := privKeyPath + ".lock" + + // Create cluster lock file with hash1. + writeClusterLockFile(t, lockFilePath, "hash1") + + // Create a stale but within grace period lock file with hash1. + err := writeFile(lockPath, "hash1", "test", time.Now().Add(-staleDuration-time.Second)) + require.NoError(t, err) + + // Update cluster lock file to hash2. + writeClusterLockFile(t, lockFilePath, "hash2") + + // Try to create service with new hash within grace period - should fail. + _, err = New(privKeyPath, lockFilePath, "test") + require.Error(t, err) + require.ErrorContains(t, err, "different cluster lock hash") + require.ErrorContains(t, err, "you must wait") +} + +func TestClusterHashMismatchAfterGracePeriod(t *testing.T) { + tmpDir := t.TempDir() + privKeyPath := filepath.Join(tmpDir, "privkey") + lockFilePath := filepath.Join(tmpDir, "cluster-lock.json") + lockPath := privKeyPath + ".lock" + + // Create cluster lock file with hash1. + writeClusterLockFile(t, lockFilePath, "hash1") + + // Create an old lock file with hash1 (beyond grace period). + err := writeFile(lockPath, "hash1", "test", time.Now().Add(-gracePeriod-time.Second)) + require.NoError(t, err) + + // Update cluster lock file to hash2. + writeClusterLockFile(t, lockFilePath, "hash2") + + // Try to create service with new hash after grace period - should succeed. + _, err = New(privKeyPath, lockFilePath, "test") + require.NoError(t, err) + + // Verify the new hash is written. + content, err := os.ReadFile(lockPath) + require.NoError(t, err) + + var meta metadata + + err = json.Unmarshal(content, &meta) + require.NoError(t, err) + require.Equal(t, "hash2", meta.ClusterLockHash) +} + +func TestClusterHashMatchWithinGracePeriod(t *testing.T) { + tmpDir := t.TempDir() + privKeyPath := filepath.Join(tmpDir, "privkey") + lockFilePath := filepath.Join(tmpDir, "cluster-lock.json") + lockPath := privKeyPath + ".lock" + + // Create cluster lock file with hash1. + writeClusterLockFile(t, lockFilePath, "hash1") + + // Create a recent lock file with hash1 (within stale duration). + err := writeFile(lockPath, "hash1", "test", time.Now().Add(-time.Second)) + require.NoError(t, err) + + // Try to create service with same hash - should fail due to staleness check. + _, err = New(privKeyPath, lockFilePath, "test") + require.Error(t, err) + require.ErrorContains(t, err, "another charon instance may be running") + + // Now create a stale lock file with same hash (beyond stale duration but within grace period). + err = writeFile(lockPath, "hash1", "test", time.Now().Add(-staleDuration-time.Second)) + require.NoError(t, err) + + // Should succeed since hash matches and file is stale. + _, err = New(privKeyPath, lockFilePath, "test") + require.NoError(t, err) +} + +func TestClusterLockFileNotFound(t *testing.T) { + tmpDir := t.TempDir() + privKeyPath := filepath.Join(tmpDir, "privkey") + lockFilePath := filepath.Join(tmpDir, "nonexistent.json") + + // Should succeed when cluster lock file doesn't exist (e.g., during DKG). + // The cluster lock hash will be empty. + _, err := New(privKeyPath, lockFilePath, "test") + require.NoError(t, err) + + // Verify empty cluster hash is written. + lockPath := privKeyPath + ".lock" + content, err := os.ReadFile(lockPath) + require.NoError(t, err) + + var meta metadata + + err = json.Unmarshal(content, &meta) + require.NoError(t, err) + require.Empty(t, meta.ClusterLockHash) +} + func assertFileExists(t *testing.T, path string) { t.Helper() @@ -60,3 +170,45 @@ func assertFileExists(t *testing.T, path string) { return openErr == nil }, time.Second, time.Millisecond) } + +// writeClusterLockFile creates a cluster lock file with the given hash. +func writeClusterLockFile(t *testing.T, path, lockHash string) { + t.Helper() + + content := map[string]any{ + "lock_hash": lockHash, + "name": "test-cluster", + } + b, err := json.Marshal(content) + require.NoError(t, err) + err = os.WriteFile(path, b, 0o644) + require.NoError(t, err) +} + +func TestEmptyHashToHashMigration(t *testing.T) { + tmpDir := t.TempDir() + privKeyPath := filepath.Join(tmpDir, "privkey") + lockFilePath := filepath.Join(tmpDir, "cluster-lock.json") + lockPath := privKeyPath + ".lock" + + // Create cluster lock file. + writeClusterLockFile(t, lockFilePath, "newhash") + + // Create a stale lock file with empty cluster hash (migration scenario). + err := writeFile(lockPath, "", "test", time.Now().Add(-staleDuration*2)) + require.NoError(t, err) + + // Should succeed - empty hash shouldn't trigger grace period. + _, err = New(privKeyPath, lockFilePath, "test") + require.NoError(t, err) + + // Verify the new hash is written. + content, err := os.ReadFile(lockPath) + require.NoError(t, err) + + var meta metadata + + err = json.Unmarshal(content, &meta) + require.NoError(t, err) + require.Equal(t, "newhash", meta.ClusterLockHash) +} diff --git a/app/sse/listener.go b/app/sse/listener.go index 89f7e65b8..b7aa477cc 100644 --- a/app/sse/listener.go +++ b/app/sse/listener.go @@ -291,6 +291,7 @@ func (p *listener) storeBlockGossipTime(slot uint64, addr string, timestamp time if p.blockGossipTimes[slot] == nil { p.blockGossipTimes[slot] = make(map[string]time.Time) } + p.blockGossipTimes[slot][addr] = timestamp } @@ -318,6 +319,7 @@ func (p *listener) recordBlockProcessingTime(slot uint64, addr string, headTimes // Clean up this entry as it's no longer needed delete(addrMap, addr) + if len(addrMap) == 0 { delete(p.blockGossipTimes, slot) } diff --git a/app/sse/listener_internal_test.go b/app/sse/listener_internal_test.go index 8ca0bd590..b6366c888 100644 --- a/app/sse/listener_internal_test.go +++ b/app/sse/listener_internal_test.go @@ -238,7 +238,7 @@ func TestBlockProcessingTimeCleanup(t *testing.T) { // After processing slot 150, entries older than (150 - 32) = 118 are removed // Remaining entries: odd slots from 119-149 (never processed) = 16 entries // Even slots are immediately deleted after processing - require.Equal(t, 16, len(l.blockGossipTimes)) + require.Len(t, l.blockGossipTimes, 16) // Verify recent unprocessed entries are still there (odd slots from end) addrMap, found := l.blockGossipTimes[149] diff --git a/dkg/dkg.go b/dkg/dkg.go index 3a60ef59a..2a16a0f4d 100644 --- a/dkg/dkg.go +++ b/dkg/dkg.go @@ -8,6 +8,7 @@ import ( "encoding/hex" "fmt" "net/url" + "path" "slices" "time" @@ -115,7 +116,7 @@ func Run(ctx context.Context, conf Config) (err error) { { // Setup private key locking. - lockSvc, err := privkeylock.New(p2p.KeyPath(conf.DataDir)+".lock", "charon dkg") + lockSvc, err := privkeylock.New(p2p.KeyPath(conf.DataDir), path.Join(conf.DataDir, "cluster-lock.json"), "charon dkg") if err != nil { return err } diff --git a/dkg/protocol.go b/dkg/protocol.go index 4455ae0ed..423a68c82 100644 --- a/dkg/protocol.go +++ b/dkg/protocol.go @@ -26,9 +26,10 @@ import ( ) const ( - enrPrivateKeyFile = "charon-enr-private-key" - validatorKeysSubDir = "validator_keys" - clusterLockFile = "cluster-lock.json" + enrPrivateKeyFile = "charon-enr-private-key" + enrPrivateKeyLockFile = "charon-enr-private-key.lock" + validatorKeysSubDir = "validator_keys" + clusterLockFile = "cluster-lock.json" ) // Protocol is a generic interface for DKG protocols such as add/remove operators. diff --git a/dkg/protocolsteps.go b/dkg/protocolsteps.go index 0dd55db46..45a3a837b 100644 --- a/dkg/protocolsteps.go +++ b/dkg/protocolsteps.go @@ -177,6 +177,13 @@ func (s *writeArtifactsProtocolStep) Run(ctx context.Context, pctx *ProtocolCont return err } + maybeLockFilePath := pctx.PrivateKeyPath + ".lock" + if app.FileExists(maybeLockFilePath) { + if err := app.CopyFile(maybeLockFilePath, filepath.Join(s.outputDir, enrPrivateKeyLockFile)); err != nil { + return err + } + } + if err := storeKeys(s.outputDir, pctx.Shares); err != nil { return err } diff --git a/dkg/protocolsteps_internal_test.go b/dkg/protocolsteps_internal_test.go index bfe47c4af..e632be444 100644 --- a/dkg/protocolsteps_internal_test.go +++ b/dkg/protocolsteps_internal_test.go @@ -211,6 +211,9 @@ func TestWriteArtifactsProtocolStep(t *testing.T) { err := k1util.Save(nodeKeys[0], p2p.KeyPath(dataDir)) require.NoError(t, err) + err = os.WriteFile(p2p.KeyPath(dataDir)+".lock", []byte("{}"), 0o600) + require.NoError(t, err) + shares := valKeysToSharesNode0(t, valKeys, lock.Validators) pctx := &ProtocolContext{ @@ -226,6 +229,7 @@ func TestWriteArtifactsProtocolStep(t *testing.T) { require.FileExists(t, filepath.Join(step.outputDir, clusterLockFile)) require.DirExists(t, filepath.Join(step.outputDir, validatorKeysSubDir)) require.FileExists(t, p2p.KeyPath(step.outputDir)) + require.FileExists(t, p2p.KeyPath(step.outputDir)+".lock") enrPrivKey, err := os.ReadFile(p2p.KeyPath(step.outputDir)) require.NoError(t, err)